From 47f06678366d9cb25a377964c0e6fd57e407770c Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Tue, 17 Mar 2026 16:37:19 +0000 Subject: [PATCH 1/2] Add adversarial weight regularisation pipeline Introduces a diagnostics package that detects high-influence survey records, generates synthetic offspring via TVAE, and recalibrates with entropy regularisation and weight capping to reduce output noise in population subgroup statistics. Components: - influence.py: reporting surface definition, per-record influence computation, Kish effective sample size, random reform sampling - generative_model.py: TVAE training on FRS input attributes, conditional sampling with varied conditioning fractions - offspring.py: adversarial detect-spawn-recalibrate loop - recalibrate.py: entropy-regularised weight optimisation with optional hard weight cap and zero-weight pruning - __main__.py: CLI with diagnose/train/regularise commands --- .../adversarial-weight-regularisation.added | 1 + policyengine_uk_data/diagnostics/__init__.py | 38 ++ policyengine_uk_data/diagnostics/__main__.py | 270 +++++++++++ .../diagnostics/generative_model.py | 386 +++++++++++++++ policyengine_uk_data/diagnostics/influence.py | 447 ++++++++++++++++++ policyengine_uk_data/diagnostics/offspring.py | 329 +++++++++++++ .../diagnostics/recalibrate.py | 165 +++++++ pyproject.toml | 2 + uv.lock | 367 +++++++++++++- 9 files changed, 1980 insertions(+), 25 deletions(-) create mode 100644 changelog.d/adversarial-weight-regularisation.added create mode 100644 policyengine_uk_data/diagnostics/__init__.py create mode 100644 policyengine_uk_data/diagnostics/__main__.py create mode 100644 policyengine_uk_data/diagnostics/generative_model.py create mode 100644 policyengine_uk_data/diagnostics/influence.py create mode 100644 policyengine_uk_data/diagnostics/offspring.py create mode 100644 policyengine_uk_data/diagnostics/recalibrate.py diff --git a/changelog.d/adversarial-weight-regularisation.added 
b/changelog.d/adversarial-weight-regularisation.added new file mode 100644 index 00000000..68088959 --- /dev/null +++ b/changelog.d/adversarial-weight-regularisation.added @@ -0,0 +1 @@ +Added adversarial weight regularisation pipeline: detects high-influence survey records, generates synthetic offspring via TVAE, and recalibrates with entropy regularisation and weight capping to reduce output noise in population subgroup statistics. diff --git a/policyengine_uk_data/diagnostics/__init__.py b/policyengine_uk_data/diagnostics/__init__.py new file mode 100644 index 00000000..06e85fa8 --- /dev/null +++ b/policyengine_uk_data/diagnostics/__init__.py @@ -0,0 +1,38 @@ +"""Adversarial weight regularisation for PolicyEngine UK. + +Detects high-influence survey records, generates synthetic offspring +to diffuse their weight, and recalibrates to population targets with +entropy regularisation. +""" + +from policyengine_uk_data.diagnostics.influence import ( + compute_influence_matrix, + find_high_influence_records, + compute_kish_effective_sample_size, + run_diagnostics, +) +from policyengine_uk_data.diagnostics.generative_model import ( + train_generative_model, + extract_household_features, + validate_generative_model, +) +from policyengine_uk_data.diagnostics.offspring import ( + run_adversarial_loop, +) +from policyengine_uk_data.diagnostics.recalibrate import ( + recalibrate_with_regularisation, + prune_zero_weight_records, +) + +__all__ = [ + "compute_influence_matrix", + "find_high_influence_records", + "compute_kish_effective_sample_size", + "run_diagnostics", + "train_generative_model", + "extract_household_features", + "validate_generative_model", + "run_adversarial_loop", + "recalibrate_with_regularisation", + "prune_zero_weight_records", +] diff --git a/policyengine_uk_data/diagnostics/__main__.py b/policyengine_uk_data/diagnostics/__main__.py new file mode 100644 index 00000000..ed895719 --- /dev/null +++ b/policyengine_uk_data/diagnostics/__main__.py @@ -0,0 
+1,270 @@ +"""CLI entry point for adversarial weight regularisation. + +Usage: + uv run python -m policyengine_uk_data.diagnostics [command] [options] + +Commands: + diagnose Run Phase 1 influence diagnostics (read-only) + train Train the generative model on FRS attributes + regularise Run the full adversarial loop (detect + spawn + recalibrate) +""" + +import argparse +import json +import logging +import sys + +import numpy as np + + +def cmd_diagnose(args): + """Run influence diagnostics on a dataset.""" + from policyengine_uk.data import UKSingleYearDataset + from policyengine_uk_data.diagnostics.influence import run_diagnostics + + dataset = UKSingleYearDataset(file_path=args.dataset) + results = run_diagnostics( + dataset, + time_period=args.year, + n_reforms=args.n_reforms, + threshold=args.threshold, + seed=args.seed, + ) + + print("\n=== Weight distribution ===") + for k, v in results["weight_stats"].items(): + print(f" {k}: {v:,.1f}") + + print(f"\n=== Flagged records (threshold={args.threshold}) ===") + flagged = results["flagged_records"] + if flagged.empty: + print(" No records exceed the influence threshold.") + else: + print(f" {len(flagged)} records flagged") + print(flagged.head(20).to_string(index=False)) + + print("\n=== Kish effective sample size (top 20 worst) ===") + kish = results["kish_by_slice"] + sorted_kish = sorted(kish.items(), key=lambda x: x[1]) + for name, val in sorted_kish[:20]: + print(f" {name}: {val:,.0f}") + + if args.output: + # Save full results to JSON + serialisable = { + "weight_stats": results["weight_stats"], + "kish_by_slice": {k: float(v) for k, v in kish.items()}, + "flagged_records": flagged.to_dict(orient="records"), + } + with open(args.output, "w") as f: + json.dump(serialisable, f, indent=2) + print(f"\nFull results saved to {args.output}") + + +def cmd_train(args): + """Train the generative model.""" + from policyengine_uk.data import UKSingleYearDataset + from policyengine_uk_data.diagnostics.generative_model 
import ( + train_generative_model, + extract_household_features, + validate_generative_model, + ) + import pickle + + dataset = UKSingleYearDataset(file_path=args.dataset) + model = train_generative_model( + dataset, + epochs=args.epochs, + seed=args.seed, + ) + + # Validate + features = extract_household_features(dataset) + validation = validate_generative_model(model, features) + + print("\n=== Generative model validation ===") + print("Marginal KS statistics (lower is better):") + for col, ks in sorted(validation["marginal_ks"].items(), key=lambda x: -x[1])[:10]: + print(f" {col}: {ks:.3f}") + + if validation["correlation_diff"] is not None: + print(f"Max correlation difference: {validation['correlation_diff']:.3f}") + + # Save model + with open(args.output, "wb") as f: + pickle.dump(model, f) + print(f"\nModel saved to {args.output}") + + +def cmd_regularise(args): + """Run the full adversarial weight regularisation pipeline.""" + from policyengine_uk.data import UKSingleYearDataset + from policyengine_uk_data.diagnostics.offspring import ( + run_adversarial_loop, + ) + from policyengine_uk_data.diagnostics.recalibrate import ( + prune_zero_weight_records, + ) + import pickle + + dataset = UKSingleYearDataset(file_path=args.dataset) + + # Load or train generative model + if args.model: + with open(args.model, "rb") as f: + model = pickle.load(f) + else: + from policyengine_uk_data.diagnostics.generative_model import ( + train_generative_model, + ) + + print("No model provided, training generative model...") + model = train_generative_model(dataset, epochs=args.train_epochs) + + result = run_adversarial_loop( + dataset, + model, + time_period=args.year, + threshold=args.threshold, + max_rounds=args.max_rounds, + n_offspring=args.n_offspring, + weight_target=args.weight_target, + seed=args.seed, + ) + + expanded = result["expanded_dataset"] + + # Prune zero-weight records + pruned = prune_zero_weight_records(expanded, epsilon=1.0) + + # Save + 
pruned.save(args.output) + + weights = pruned.household.household_weight.values + print("\n=== Results ===") + print(f" Rounds completed: {result['rounds_completed']}") + print(f" Records added: {result['records_expanded']}") + print(f" Final dataset size: {len(pruned.household)} households") + print(f" Max weight: {weights.max():,.0f}") + print(f" Median weight: {np.median(weights):,.0f}") + print(f" Influence history: {[f'{x:.3f}' for x in result['influence_history']]}") + print(f"\nExpanded dataset saved to {args.output}") + + +def main(): + parser = argparse.ArgumentParser( + description="Adversarial weight regularisation for PolicyEngine UK", + ) + subparsers = parser.add_subparsers(dest="command") + + # diagnose + diag = subparsers.add_parser( + "diagnose", + help="Run influence diagnostics", + ) + diag.add_argument("dataset", help="Path to .h5 dataset") + diag.add_argument("--year", default="2025", help="Time period") + diag.add_argument( + "--n-reforms", + type=int, + default=50, + help="Number of random reforms for influence sampling", + ) + diag.add_argument( + "--threshold", + type=float, + default=0.05, + help="Influence threshold", + ) + diag.add_argument("--seed", type=int, default=42) + diag.add_argument("--output", "-o", help="Output JSON file for full results") + + # train + tr = subparsers.add_parser( + "train", + help="Train generative model", + ) + tr.add_argument("dataset", help="Path to .h5 dataset") + tr.add_argument( + "--epochs", + type=int, + default=300, + help="TVAE training epochs", + ) + tr.add_argument("--seed", type=int, default=42) + tr.add_argument( + "--output", + "-o", + default="generative_model.pkl", + help="Output pickle file", + ) + + # regularise + reg = subparsers.add_parser( + "regularise", + help="Run full adversarial loop", + ) + reg.add_argument("dataset", help="Path to .h5 dataset") + reg.add_argument( + "--model", + help="Path to trained generative model (.pkl)", + ) + reg.add_argument("--year", default="2025", 
help="Time period") + reg.add_argument( + "--threshold", + type=float, + default=0.05, + help="Influence threshold", + ) + reg.add_argument( + "--max-rounds", + type=int, + default=10, + help="Max adversarial rounds", + ) + reg.add_argument( + "--n-offspring", + type=int, + default=50, + help="Offspring per flagged record", + ) + reg.add_argument( + "--weight-target", + type=float, + default=None, + help="Target max weight for offspring splitting", + ) + reg.add_argument( + "--train-epochs", + type=int, + default=300, + help="TVAE epochs if training from scratch", + ) + reg.add_argument("--seed", type=int, default=42) + reg.add_argument( + "--output", + "-o", + default="regularised_dataset.h5", + help="Output .h5 file", + ) + + args = parser.parse_args() + if args.command is None: + parser.print_help() + sys.exit(1) + + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(name)s %(levelname)s %(message)s", + ) + + commands = { + "diagnose": cmd_diagnose, + "train": cmd_train, + "regularise": cmd_regularise, + } + commands[args.command](args) + + +if __name__ == "__main__": + main() diff --git a/policyengine_uk_data/diagnostics/generative_model.py b/policyengine_uk_data/diagnostics/generative_model.py new file mode 100644 index 00000000..2d70f11b --- /dev/null +++ b/policyengine_uk_data/diagnostics/generative_model.py @@ -0,0 +1,386 @@ +"""Generative model for household attributes. + +Trains a TVAE (Tabular Variational Autoencoder) on FRS input +attributes and provides conditional sampling for offspring +generation. The model learns the joint distribution of household +demographics, income, housing, and geographic variables so that +synthetic records are plausible completions of partial attribute +sets. + +Only *input* attributes are modelled. Tax-benefit *outputs* (tax +liability, benefit entitlement, net income) are recomputed by +running offspring through PolicyEngine's calculator. 
+""" + +import logging + +import numpy as np +import pandas as pd + +logger = logging.getLogger(__name__) + +# Input attributes to model. These are the FRS variables that +# define a household before PolicyEngine calculates anything. +PERSON_INPUT_ATTRS = [ + "age", + "gender", + "employment_income", + "self_employment_income", + "private_pension_income", + "savings_interest_income", + "dividend_income", + "property_income", + "hours_worked", + "employment_status", + "is_disabled_for_benefits", + "marital_status", +] + +HOUSEHOLD_INPUT_ATTRS = [ + "region", + "tenure_type", + "rent", + "council_tax", + "council_tax_band", + "accommodation_type", + "household_weight", +] + +BENUNIT_INPUT_ATTRS = [ + "would_claim_uc", + "is_married", +] + + +def extract_household_features( + dataset, + use_design_weights: bool = True, +) -> pd.DataFrame: + """Extract a flat feature table from a UKSingleYearDataset. + + Each row is one household. Person-level variables are aggregated + to household level (head's values for demographics, sums for + incomes). + + Args: + dataset: UKSingleYearDataset instance. + use_design_weights: if True, use the original grossing + weights (not calibrated) for training the generative + model. + + Returns: + DataFrame with one row per household. 
+ """ + person = dataset.person + household = dataset.household + benunit = dataset.benunit + + hh_ids = household.household_id.values + features = pd.DataFrame({"household_id": hh_ids}) + + # Household-level attributes + for attr in HOUSEHOLD_INPUT_ATTRS: + if attr in household.columns: + features[attr] = household[attr].values + + # Person-level: head's demographics + income sums + head_mask = person.is_household_head.astype(bool) + heads = person[head_mask].set_index("person_household_id") + + for attr in ["age", "gender", "employment_status", "marital_status"]: + if attr in heads.columns: + features[f"head_{attr}"] = heads[attr].reindex(hh_ids).values + + # Income sums across all persons in household + income_attrs = [ + "employment_income", + "self_employment_income", + "private_pension_income", + "savings_interest_income", + "dividend_income", + "property_income", + ] + for attr in income_attrs: + if attr in person.columns: + summed = ( + person.groupby("person_household_id")[attr] + .sum() + .reindex(hh_ids) + .fillna(0) + ) + features[f"hh_{attr}"] = summed.values + + # Household size + features["n_persons"] = ( + person.groupby("person_household_id") + .size() + .reindex(hh_ids) + .fillna(1) + .astype(int) + .values + ) + + # Number of children (age < 18) + if "age" in person.columns: + features["n_children"] = ( + person[person.age < 18] + .groupby("person_household_id") + .size() + .reindex(hh_ids) + .fillna(0) + .astype(int) + .values + ) + + # Hours worked (head) + if "hours_worked" in heads.columns: + features["head_hours_worked"] = ( + heads["hours_worked"].reindex(hh_ids).fillna(0).values + ) + + # Disability flag (any person in household) + if "is_disabled_for_benefits" in person.columns: + features["has_disabled_member"] = ( + person.groupby("person_household_id")["is_disabled_for_benefits"] + .max() + .reindex(hh_ids) + .fillna(0) + .astype(int) + .values + ) + + # Benunit: UC claim status (any benunit in household) + if "would_claim_uc" in 
benunit.columns: + person_bu = person[["person_household_id", "person_benunit_id"]] + bu_hh = person_bu.drop_duplicates("person_benunit_id").set_index( + "person_benunit_id" + )["person_household_id"] + uc_by_hh = benunit.set_index("benunit_id")["would_claim_uc"].reindex( + bu_hh.index + ) + uc_by_hh.index = bu_hh.values + features["any_uc_claim"] = ( + uc_by_hh.groupby(level=0).max().reindex(hh_ids).fillna(0).astype(int).values + ) + + return features + + +def identify_column_types( + df: pd.DataFrame, +) -> tuple[list[str], list[str]]: + """Split columns into categorical and continuous. + + Returns: + (categorical_columns, continuous_columns) + """ + categorical = [] + continuous = [] + skip = {"household_id", "household_weight"} + + for col in df.columns: + if col in skip: + continue + if df[col].dtype == object or df[col].nunique() < 20: + categorical.append(col) + else: + continuous.append(col) + + return categorical, continuous + + +def train_generative_model( + dataset, + epochs: int = 300, + seed: int = 42, +): + """Train a TVAE on household features. + + Args: + dataset: UKSingleYearDataset instance. + epochs: training epochs for the TVAE. + seed: random seed. + + Returns: + Trained TVAE model (sdv SingleTableSynthesizer). 
+ """ + from sdv.single_table import TVAESynthesizer + from sdv.metadata import Metadata + + features = extract_household_features(dataset) + categorical_cols, continuous_cols = identify_column_types(features) + + # Drop household_id for training + train_df = features.drop(columns=["household_id"]) + if "household_weight" in train_df.columns: + sample_weights = train_df["household_weight"].values.copy() + train_df = train_df.drop(columns=["household_weight"]) + else: + sample_weights = None + + # Build metadata + metadata = Metadata.detect_from_dataframe(data=train_df) + + model = TVAESynthesizer( + metadata=metadata, + epochs=epochs, + verbose=True, + ) + + # Weight the training data by design weights if available + if sample_weights is not None: + # Resample proportional to weights for training + rng = np.random.default_rng(seed) + probs = sample_weights / sample_weights.sum() + n_train = len(train_df) + indices = rng.choice(len(train_df), size=n_train, replace=True, p=probs) + train_df = train_df.iloc[indices].reset_index(drop=True) + + model.fit(train_df) + logger.info("TVAE trained on %d records", len(train_df)) + + return model + + +def sample_offspring( + model, + source_record: pd.Series, + n_samples: int = 50, + conditioning_fractions: list[float] | None = None, + seed: int = 42, +) -> pd.DataFrame: + """Generate synthetic offspring conditioned on a source record. + + For each sample, a random subset of the source record's + attributes are fixed, and the rest are sampled from the model. + The conditioning fraction varies across samples to explore both + close variants and broader alternatives. + + Args: + model: trained TVAE model. + source_record: Series of attribute values for the source + household. + n_samples: number of offspring to generate. + conditioning_fractions: list of fractions of attributes to + condition on. Defaults to a spread from 0.2 to 0.8. + seed: random seed. + + Returns: + DataFrame of synthetic offspring (n_samples rows). 
+ """ + rng = np.random.default_rng(seed) + + if conditioning_fractions is None: + conditioning_fractions = [0.2, 0.4, 0.5, 0.6, 0.8] + + all_cols = list(source_record.index) + skip_cols = {"household_id", "household_weight"} + usable_cols = [c for c in all_cols if c not in skip_cols] + + offspring = [] + samples_per_fraction = max(1, n_samples // len(conditioning_fractions)) + + for frac in conditioning_fractions: + n_cond = max(1, int(len(usable_cols) * frac)) + for _ in range(samples_per_fraction): + cond_cols = rng.choice(usable_cols, size=n_cond, replace=False).tolist() + conditions = {col: source_record[col] for col in cond_cols} + try: + from sdv.sampling import Condition + + condition = Condition( + num_rows=1, + column_values=conditions, + ) + sample = model.sample_from_conditions(conditions=[condition]) + offspring.append(sample) + except Exception: + # Fall back to unconditional sampling and manually + # override conditioned columns + sample = model.sample(num_rows=1) + for col, val in conditions.items(): + if col in sample.columns: + sample[col] = val + offspring.append(sample) + + if not offspring: + return pd.DataFrame() + + result = pd.concat(offspring, ignore_index=True) + + # Top up if we're short + if len(result) < n_samples: + extra = model.sample(num_rows=n_samples - len(result)) + result = pd.concat([result, extra], ignore_index=True) + + return result.head(n_samples) + + +def validate_generative_model( + model, + original_features: pd.DataFrame, + n_samples: int = 10_000, +) -> dict: + """Compare synthetic samples against original data. + + Args: + model: trained TVAE model. + original_features: the training data. + n_samples: number of synthetic samples to generate. 
+ + Returns: + Dict with validation metrics: + - marginal_ks: Kolmogorov-Smirnov stats for continuous cols + - categorical_tvd: total variation distance for cat cols + - correlation_diff: max absolute difference in correlation + matrix + """ + from scipy import stats + + synthetic = model.sample(num_rows=n_samples) + orig = original_features.drop( + columns=["household_id", "household_weight"], + errors="ignore", + ) + + categorical_cols, continuous_cols = identify_column_types(orig) + + # KS test for continuous columns + ks_stats = {} + for col in continuous_cols: + if col in synthetic.columns and col in orig.columns: + stat, _ = stats.ks_2samp( + orig[col].dropna().values, + synthetic[col].dropna().values, + ) + ks_stats[col] = float(stat) + + # Total variation distance for categorical columns + tvd = {} + for col in categorical_cols: + if col in synthetic.columns and col in orig.columns: + orig_dist = orig[col].value_counts(normalize=True) + synth_dist = synthetic[col].value_counts(normalize=True) + all_vals = set(orig_dist.index) | set(synth_dist.index) + tv = ( + sum(abs(orig_dist.get(v, 0) - synth_dist.get(v, 0)) for v in all_vals) + / 2 + ) + tvd[col] = float(tv) + + # Correlation matrix difference (continuous only) + shared_cont = [ + c for c in continuous_cols if c in synthetic.columns and c in orig.columns + ] + if len(shared_cont) >= 2: + orig_corr = orig[shared_cont].corr().values + synth_corr = synthetic[shared_cont].corr().values + corr_diff = float(np.nanmax(np.abs(orig_corr - synth_corr))) + else: + corr_diff = None + + return { + "marginal_ks": ks_stats, + "categorical_tvd": tvd, + "correlation_diff": corr_diff, + } diff --git a/policyengine_uk_data/diagnostics/influence.py b/policyengine_uk_data/diagnostics/influence.py new file mode 100644 index 00000000..e2672820 --- /dev/null +++ b/policyengine_uk_data/diagnostics/influence.py @@ -0,0 +1,447 @@ +"""Influence detector for survey record weights. 
+ +Computes per-record influence across a reporting surface of +(metric x slice) statistics. A record has high influence when it +contributes a large fraction of a slice-level aggregate, meaning +small perturbations to that record propagate into published outputs. + +The reporting surface is built from: + - metrics: net income, income tax, NI, universal credit, child + benefit, pension credit, council tax, housing benefit + - slices: income decile, region, age band, family type, tenure + +Influence is computed under a sample of policy reforms (random +parameter perturbations) so that structurally high-influence records +are identified regardless of which reform is being analysed. +""" + +import logging + +import numpy as np +import pandas as pd + +logger = logging.getLogger(__name__) + +# ── Reporting surface definition ──────────────────────────────────── + +METRICS = [ + "household_net_income", + "income_tax", + "national_insurance", + "universal_credit", + "child_benefit", + "pension_credit", + "council_tax", + "housing_benefit_reported", + "employment_income", + "self_employment_income", +] + +SLICE_DEFINITIONS = { + "income_decile": { + "variable": "household_net_income", + "bins": 10, + "labels": [f"decile_{i}" for i in range(1, 11)], + }, + "region": { + "variable": "region", + "categorical": True, + }, + "age_band": { + "variable": "age", + "bins": [0, 16, 25, 35, 45, 55, 65, 75, 100], + "labels": [ + "0-15", + "16-24", + "25-34", + "35-44", + "45-54", + "55-64", + "65-74", + "75+", + ], + }, + "tenure": { + "variable": "tenure_type", + "categorical": True, + }, +} + + +def _build_slice_assignments( + sim, + time_period: str, +) -> dict[str, np.ndarray]: + """Compute household-level slice assignments. + + Returns a dict mapping slice_name -> array of labels, one per + household. 
+ """ + slices = {} + + for name, defn in SLICE_DEFINITIONS.items(): + variable = defn["variable"] + + if defn.get("categorical"): + values = sim.calculate(variable, map_to="household") + slices[name] = np.asarray(values) + continue + + values = sim.calculate(variable, map_to="household").astype(float) + weights = sim.calculate("household_weight", map_to="household").astype(float) + + if "bins" in defn and isinstance(defn["bins"], int): + # Weighted quantile bins + n_bins = defn["bins"] + sorted_idx = np.argsort(values) + cum_weight = np.cumsum(weights[sorted_idx]) + total_weight = cum_weight[-1] + labels = np.empty(len(values), dtype=object) + for b in range(n_bins): + lo = b / n_bins * total_weight + hi = (b + 1) / n_bins * total_weight + mask_sorted = (cum_weight > lo) & (cum_weight <= hi) + if b == 0: + mask_sorted[0] = True + labels[sorted_idx[mask_sorted]] = defn["labels"][b] + slices[name] = labels + else: + bins = defn["bins"] + label_list = defn["labels"] + digitised = np.digitize(values, bins) - 1 + digitised = np.clip(digitised, 0, len(label_list) - 1) + slices[name] = np.array([label_list[d] for d in digitised]) + + return slices + + +def _compute_metric_values( + sim, + time_period: str, +) -> dict[str, np.ndarray]: + """Compute household-level metric values. + + Returns a dict mapping metric_name -> array of values, one per + household. + """ + result = {} + for metric in METRICS: + try: + result[metric] = np.asarray( + sim.calculate(metric, map_to="household"), + dtype=float, + ) + except Exception: + logger.debug("Metric %s not available, skipping", metric) + return result + + +def compute_influence_matrix( + sim, + time_period: str, + reform_sim=None, +) -> pd.DataFrame: + """Compute per-record influence across the reporting surface. + + Args: + sim: a policyengine_uk Microsimulation (baseline). + time_period: the period string (e.g. "2025"). + reform_sim: optional reform Microsimulation. 
When provided + the metric is the *change* between baseline and reform. + + Returns: + DataFrame with shape (n_households, n_statistics) where each + cell I[i,s] is the fractional influence of household i on + statistic s. + """ + weights = np.asarray( + sim.calculate("household_weight", map_to="household"), + dtype=float, + ) + slices = _build_slice_assignments(sim, time_period) + + if reform_sim is not None: + baseline_vals = _compute_metric_values(sim, time_period) + reform_vals = _compute_metric_values(reform_sim, time_period) + metric_values = { + m: reform_vals[m] - baseline_vals[m] + for m in baseline_vals + if m in reform_vals + } + else: + metric_values = _compute_metric_values(sim, time_period) + + records = [] + stat_names = [] + + for metric_name, values in metric_values.items(): + for slice_name, labels in slices.items(): + unique_labels = np.unique(labels) + for label in unique_labels: + if label is None or (isinstance(label, float) and np.isnan(label)): + continue + mask = labels == label + weighted_total = np.sum(weights[mask] * values[mask]) + denom = max(abs(weighted_total), 1e-10) + influence = np.abs(weights * values * mask) / denom + records.append(influence) + stat_names.append(f"{metric_name}/{slice_name}={label}") + + if not records: + return pd.DataFrame() + + matrix = np.column_stack(records) + return pd.DataFrame(matrix, columns=stat_names) + + +def find_high_influence_records( + influence_matrix: pd.DataFrame, + threshold: float = 0.05, +) -> pd.DataFrame: + """Identify records exceeding the influence threshold. + + Args: + influence_matrix: output of compute_influence_matrix. + threshold: max allowable influence fraction (default 5%). 
+ + Returns: + DataFrame with columns: + - record_idx: household index + - max_influence: maximum influence across all statistics + - worst_statistic: the statistic where influence is highest + - n_violations: number of statistics exceeding threshold + """ + if influence_matrix.empty: + return pd.DataFrame( + columns=[ + "record_idx", + "max_influence", + "worst_statistic", + "n_violations", + ] + ) + + max_influence = influence_matrix.max(axis=1) + worst_stat_idx = influence_matrix.values.argmax(axis=1) + worst_stat = influence_matrix.columns[worst_stat_idx] + n_violations = (influence_matrix > threshold).sum(axis=1) + + flagged_mask = max_influence > threshold + result = pd.DataFrame( + { + "record_idx": np.where(flagged_mask)[0], + "max_influence": max_influence[flagged_mask].values, + "worst_statistic": worst_stat[flagged_mask], + "n_violations": n_violations[flagged_mask].values, + } + ) + return result.sort_values("max_influence", ascending=False).reset_index(drop=True) + + +def compute_kish_effective_sample_size( + weights: np.ndarray, + slice_mask: np.ndarray | None = None, +) -> float: + """Compute Kish's effective sample size. + + n_eff = (sum w_i)^2 / sum(w_i^2) + + Args: + weights: array of household weights. + slice_mask: optional boolean mask to restrict to a subgroup. + + Returns: + Effective sample size. + """ + if slice_mask is not None: + w = weights[slice_mask] + else: + w = weights + w = w[w > 0] + if len(w) == 0: + return 0.0 + return float(np.sum(w) ** 2 / np.sum(w**2)) + + +def generate_random_reforms( + n_reforms: int = 50, + seed: int = 42, +) -> list[dict]: + """Generate random parameter perturbations for influence sampling. + + Each reform is a dict of parameter_path -> multiplier pairs. + The reforms perturb tax rates and benefit amounts by +-20%. + + Args: + n_reforms: number of reforms to generate. + seed: random seed. + + Returns: + List of reform specification dicts. 
+ """ + rng = np.random.default_rng(seed) + + # Parameters amenable to perturbation + rate_params = [ + "gov.hmrc.income_tax.rates.uk[0].rate", + "gov.hmrc.income_tax.rates.uk[1].rate", + "gov.hmrc.income_tax.rates.uk[2].rate", + "gov.hmrc.national_insurance.class_1.rates.employee.main.rate", + ] + amount_params = [ + "gov.hmrc.income_tax.allowances.personal_allowance.amount", + "gov.dwp.universal_credit.elements.standard_allowance.amount.single.over_25", + "gov.dwp.universal_credit.elements.child.amount.first", + ] + + reforms = [] + for _ in range(n_reforms): + reform = {} + # Perturb 2-4 parameters per reform + n_params = rng.integers(2, 5) + all_params = rate_params + amount_params + chosen = rng.choice( + len(all_params), + size=min(n_params, len(all_params)), + replace=False, + ) + for idx in chosen: + param = all_params[idx] + if param in rate_params: + # Rates: multiply by 0.8-1.2 + reform[param] = float(rng.uniform(0.8, 1.2)) + else: + # Amounts: multiply by 0.8-1.2 + reform[param] = float(rng.uniform(0.8, 1.2)) + reforms.append(reform) + + return reforms + + +def run_diagnostics( + dataset, + time_period: str = "2025", + n_reforms: int = 50, + threshold: float = 0.05, + seed: int = 42, +) -> dict: + """Run the full Phase 1 influence diagnostics. + + Args: + dataset: a UKSingleYearDataset. + time_period: calendar year as string. + n_reforms: number of random reforms for influence sampling. + threshold: max allowable influence fraction. + seed: random seed. 
+ + Returns: + Dict with keys: + - baseline_influence: DataFrame of influence matrix under + current law + - flagged_records: DataFrame of high-influence records + - weight_stats: dict of weight distribution statistics + - kish_by_slice: dict of Kish effective sample sizes + - reform_influence_summary: DataFrame summarising influence + across reforms + """ + from policyengine_uk import Microsimulation + + sim = Microsimulation(dataset=dataset) + sim.default_calculation_period = time_period + + weights = np.asarray( + sim.calculate("household_weight", map_to="household"), + dtype=float, + ) + + # Weight distribution statistics + weight_stats = { + "n_households": len(weights), + "mean": float(np.mean(weights)), + "median": float(np.median(weights)), + "p90": float(np.percentile(weights, 90)), + "p99": float(np.percentile(weights, 99)), + "max": float(np.max(weights)), + "min": float(np.min(weights[weights > 0])), + "skewness": float( + np.mean(((weights - np.mean(weights)) / np.std(weights)) ** 3) + ), + } + + # Baseline influence + logger.info("Computing baseline influence matrix...") + baseline_influence = compute_influence_matrix(sim, time_period) + flagged = find_high_influence_records(baseline_influence, threshold) + + # Kish effective sample size by slice + slices = _build_slice_assignments(sim, time_period) + kish_by_slice = {"overall": compute_kish_effective_sample_size(weights)} + for slice_name, labels in slices.items(): + for label in np.unique(labels): + if label is None: + continue + mask = labels == label + kish_by_slice[f"{slice_name}={label}"] = compute_kish_effective_sample_size( + weights, mask + ) + + # Reform-level influence sampling + reforms = generate_random_reforms(n_reforms, seed) + reform_max_influences = [] + + for i, reform_spec in enumerate(reforms): + logger.info( + "Computing influence for reform %d/%d...", + i + 1, + len(reforms), + ) + try: + reform_sim = _create_reform_sim(dataset, time_period, reform_spec) + infl = 
compute_influence_matrix(sim, time_period, reform_sim=reform_sim) + if not infl.empty: + max_per_record = infl.max(axis=1) + reform_max_influences.append(max_per_record) + except Exception as e: + logger.warning("Reform %d failed: %s", i, e) + + if reform_max_influences: + reform_matrix = pd.concat(reform_max_influences, axis=1).fillna(0) + reform_summary = pd.DataFrame( + { + "mean_max_influence": reform_matrix.mean(axis=1), + "max_max_influence": reform_matrix.max(axis=1), + "n_reforms_above_threshold": (reform_matrix > threshold).sum(axis=1), + } + ) + else: + reform_summary = pd.DataFrame() + + return { + "baseline_influence": baseline_influence, + "flagged_records": flagged, + "weight_stats": weight_stats, + "kish_by_slice": kish_by_slice, + "reform_influence_summary": reform_summary, + } + + +def _create_reform_sim(dataset, time_period, reform_spec): + """Create a Microsimulation with parameter perturbations applied.""" + from policyengine_uk import Microsimulation + + sim = Microsimulation(dataset=dataset) + sim.default_calculation_period = time_period + + for param_path, multiplier in reform_spec.items(): + try: + param = sim.tax_benefit_system.parameters.get_child(param_path) + current = param(time_period) + param.update( + period=f"year:{time_period}:1", + value=current * multiplier, + ) + except Exception: + pass + + sim.tax_benefit_system.reset_parameter_caches() + return sim diff --git a/policyengine_uk_data/diagnostics/offspring.py b/policyengine_uk_data/diagnostics/offspring.py new file mode 100644 index 00000000..3d6df43f --- /dev/null +++ b/policyengine_uk_data/diagnostics/offspring.py @@ -0,0 +1,329 @@ +"""Adversarial offspring generation. + +For each high-influence household record, generates synthetic +offspring via the generative model, runs them through PolicyEngine +to compute tax-benefit outputs, and assembles an expanded dataset +ready for recalibration. 
+""" + +import logging + +import numpy as np +import pandas as pd + +from policyengine_uk_data.diagnostics.influence import ( + compute_influence_matrix, + find_high_influence_records, +) +from policyengine_uk_data.diagnostics.generative_model import ( + extract_household_features, + sample_offspring, +) + +logger = logging.getLogger(__name__) + + +def _expand_household_to_dataset_records( + synthetic_hh: pd.Series, + source_dataset, + source_hh_idx: int, + new_hh_id_start: int, + weight_per_offspring: float, +) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + """Create person/benunit/household rows for one synthetic household. + + The synthetic household inherits the *structure* (number of + persons, benefit units, relationships) from the source household, + with attribute values replaced by the synthetic record where + applicable. + + Args: + synthetic_hh: Series of synthetic household-level features. + source_dataset: the original UKSingleYearDataset. + source_hh_idx: index of the source household in the dataset. + new_hh_id_start: starting household_id for the new record. + weight_per_offspring: weight to assign. + + Returns: + (person_df, benunit_df, household_df) for the new household. 
+ """ + orig_hh = source_dataset.household + orig_person = source_dataset.person + orig_benunit = source_dataset.benunit + + source_hh_id = orig_hh.household_id.iloc[source_hh_idx] + new_hh_id = new_hh_id_start + + # Copy the source household's structure + hh_row = orig_hh.iloc[[source_hh_idx]].copy() + hh_row["household_id"] = new_hh_id + hh_row["household_weight"] = weight_per_offspring + + # Override household-level attributes from the synthetic record + hh_attr_map = { + "region": "region", + "tenure_type": "tenure_type", + "rent": "rent", + "council_tax": "council_tax", + "council_tax_band": "council_tax_band", + "accommodation_type": "accommodation_type", + } + for synth_col, hh_col in hh_attr_map.items(): + if synth_col in synthetic_hh.index and hh_col in hh_row.columns: + hh_row[hh_col] = synthetic_hh[synth_col] + + # Copy persons, remapping IDs + person_mask = orig_person.person_household_id == source_hh_id + new_persons = orig_person[person_mask].copy() + new_persons["person_household_id"] = new_hh_id + + # Remap person IDs to avoid collisions + person_id_offset = new_hh_id * 1000 + new_persons["person_id"] = np.arange( + person_id_offset, + person_id_offset + len(new_persons), + ) + + # Override head's income attributes from the synthetic record + head_mask = new_persons.is_household_head.astype(bool) + income_map = { + "hh_employment_income": "employment_income", + "hh_self_employment_income": "self_employment_income", + "hh_private_pension_income": "private_pension_income", + "hh_savings_interest_income": "savings_interest_income", + "hh_dividend_income": "dividend_income", + "hh_property_income": "property_income", + } + for synth_col, person_col in income_map.items(): + if synth_col in synthetic_hh.index and person_col in new_persons.columns: + # Assign all household income to head (simplified) + new_persons.loc[head_mask, person_col] = max( + 0, float(synthetic_hh[synth_col]) + ) + + # Copy benefit units, remapping IDs + old_bu_ids = 
new_persons.person_benunit_id.unique() + bu_id_offset = new_hh_id * 100 + bu_id_map = {old: bu_id_offset + i for i, old in enumerate(old_bu_ids)} + new_persons["person_benunit_id"] = new_persons["person_benunit_id"].map(bu_id_map) + + bu_mask = orig_benunit.benunit_id.isin(old_bu_ids) + new_beunits = orig_benunit[bu_mask].copy() + new_beunits["benunit_id"] = new_beunits["benunit_id"].map(bu_id_map) + + return new_persons, new_beunits, hh_row + + +def generate_offspring_for_record( + dataset, + record_idx: int, + model, + features: pd.DataFrame, + n_offspring: int = 50, + weight_target: float | None = None, + seed: int = 42, +) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + """Generate synthetic offspring for a single household. + + Args: + dataset: UKSingleYearDataset. + record_idx: index of the household to split. + model: trained generative model (TVAE). + features: household features DataFrame (from + extract_household_features). + n_offspring: number of candidate offspring. + weight_target: desired max weight per offspring. If None, + uses p90 of the current weight distribution. + seed: random seed. + + Returns: + (person_df, benunit_df, household_df) for all offspring + combined. 
+ """ + weights = dataset.household.household_weight.values + source_weight = weights[record_idx] + + if weight_target is None: + weight_target = float(np.percentile(weights[weights > 0], 90)) + + k = max(2, int(np.ceil(source_weight / weight_target))) + n_candidates = max(n_offspring, k * 3) + + source_features = features.iloc[record_idx] + synthetic = sample_offspring( + model, + source_features, + n_samples=n_candidates, + seed=seed, + ) + + weight_per = source_weight / n_candidates + max_hh_id = dataset.household.household_id.max() + + all_persons = [] + all_beunits = [] + all_households = [] + + for i in range(len(synthetic)): + new_hh_id = int(max_hh_id + record_idx * 10_000 + i + 1) + try: + p, b, h = _expand_household_to_dataset_records( + synthetic.iloc[i], + dataset, + record_idx, + new_hh_id, + weight_per, + ) + all_persons.append(p) + all_beunits.append(b) + all_households.append(h) + except Exception as e: + logger.debug("Offspring %d failed: %s", i, e) + + if not all_persons: + return pd.DataFrame(), pd.DataFrame(), pd.DataFrame() + + return ( + pd.concat(all_persons, ignore_index=True), + pd.concat(all_beunits, ignore_index=True), + pd.concat(all_households, ignore_index=True), + ) + + +def run_adversarial_loop( + dataset, + model, + time_period: str = "2025", + threshold: float = 0.05, + max_rounds: int = 10, + n_offspring: int = 50, + weight_target: float | None = None, + seed: int = 42, +) -> dict: + """Run the full adversarial detect-spawn-recalibrate loop. + + Args: + dataset: UKSingleYearDataset to expand. + model: trained generative model (TVAE). + time_period: calendar year as string. + threshold: max allowable influence fraction. + max_rounds: maximum number of adversarial rounds. + n_offspring: offspring per flagged record. + weight_target: desired max weight. + seed: random seed. 
+ + Returns: + Dict with: + - expanded_dataset: the expanded UKSingleYearDataset + - rounds_completed: number of rounds run + - influence_history: list of max-influence per round + - records_expanded: total number of records added + """ + from policyengine_uk import Microsimulation + from policyengine_uk.data import UKSingleYearDataset + from policyengine_uk_data.diagnostics.recalibrate import ( + recalibrate_with_regularisation, + ) + + working = dataset.copy() + features = extract_household_features(working) + influence_history = [] + total_added = 0 + + for round_num in range(max_rounds): + logger.info("Adversarial round %d/%d", round_num + 1, max_rounds) + + # Detect + sim = Microsimulation(dataset=working) + sim.default_calculation_period = time_period + infl = compute_influence_matrix(sim, time_period) + flagged = find_high_influence_records(infl, threshold) + + if flagged.empty: + logger.info( + "No records above threshold, stopping at round %d", + round_num + 1, + ) + break + + max_infl = flagged.max_influence.iloc[0] + influence_history.append(float(max_infl)) + logger.info( + "Round %d: %d flagged records, max influence %.3f", + round_num + 1, + len(flagged), + max_infl, + ) + + # Spawn offspring for the worst offender + worst_idx = int(flagged.record_idx.iloc[0]) + persons_new, beunits_new, hh_new = generate_offspring_for_record( + working, + worst_idx, + model, + features, + n_offspring=n_offspring, + weight_target=weight_target, + seed=seed + round_num, + ) + + if hh_new.empty: + logger.warning( + "No offspring generated for record %d, skipping", + worst_idx, + ) + continue + + # Remove source record and add offspring + orig_hh_id = working.household.household_id.iloc[worst_idx] + + new_person = pd.concat( + [ + working.person[working.person.person_household_id != orig_hh_id], + persons_new, + ], + ignore_index=True, + ) + new_benunit = pd.concat( + [ + working.benunit[ + ~working.benunit.benunit_id.isin( + working.person[ + 
working.person.person_household_id == orig_hh_id + ].person_benunit_id + ) + ], + beunits_new, + ], + ignore_index=True, + ) + new_household = pd.concat( + [ + working.household[working.household.household_id != orig_hh_id], + hh_new, + ], + ignore_index=True, + ) + + working = UKSingleYearDataset( + person=new_person, + benunit=new_benunit, + household=new_household, + fiscal_year=int(time_period), + ) + features = extract_household_features(working) + total_added += len(hh_new) + + # Recalibrate + logger.info("Recalibrating expanded dataset...") + working = recalibrate_with_regularisation( + working, + time_period=time_period, + ) + + return { + "expanded_dataset": working, + "rounds_completed": min(round_num + 1, max_rounds), + "influence_history": influence_history, + "records_expanded": total_added, + } diff --git a/policyengine_uk_data/diagnostics/recalibrate.py b/policyengine_uk_data/diagnostics/recalibrate.py new file mode 100644 index 00000000..20bf0c41 --- /dev/null +++ b/policyengine_uk_data/diagnostics/recalibrate.py @@ -0,0 +1,165 @@ +"""Recalibration with weight regularisation. + +Extends the existing calibration pipeline to add entropy +regularisation (penalising weight distributions that diverge from +a prior) and optional hard weight capping. This prevents the +calibration from concentrating weight on a few records, even when +the expanded dataset provides alternatives. +""" + +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING + +import numpy as np +import torch + +if TYPE_CHECKING: + from policyengine_uk.data import UKSingleYearDataset + +logger = logging.getLogger(__name__) + + +def recalibrate_with_regularisation( + dataset, + time_period: str = "2025", + entropy_lambda: float = 0.01, + weight_cap: float | None = 5_000.0, + epochs: int = 256, + lr: float = 0.05, +) -> UKSingleYearDataset: + """Recalibrate dataset weights with entropy regularisation. 
+
+    Minimises:
+        sum_t (hat_T_t(w) - T_t)^2 + lambda * sum_i w_i * log(w_i / w0_i)
+
+    where T_t are population targets, hat_T_t are weighted estimates,
+    w0_i are prior weights (uniform for offspring, design weights for
+    originals), and lambda controls regularisation strength.
+
+    Args:
+        dataset: UKSingleYearDataset to recalibrate.
+        time_period: calendar year as string.
+        entropy_lambda: entropy regularisation strength.
+        weight_cap: optional hard upper bound on any weight.
+        epochs: optimisation epochs.
+        lr: learning rate.
+
+    Returns:
+        Recalibrated UKSingleYearDataset (copy with updated weights).
+    """
+    from policyengine_uk_data.targets.build_loss_matrix import (
+        create_target_matrix,
+    )
+
+    dataset = dataset.copy()
+
+    matrix, targets = create_target_matrix(dataset, time_period=time_period)
+
+    if matrix.empty:
+        logger.warning("No targets available, returning unmodified dataset")
+        return dataset
+
+    initial_weights = dataset.household.household_weight.values.astype(float)
+    # Prior for the KL term: current weights floored at 1.0.  (Not
+    # normalised here -- both prior and candidate weights are turned
+    # into probability distributions inside the loss.)
+    w0 = np.maximum(initial_weights, 1.0)
+
+    # Optimise in log space so weights stay positive by construction.
+    log_w = torch.tensor(
+        np.log(np.maximum(initial_weights, 1e-6)),
+        dtype=torch.float32,
+        requires_grad=True,
+    )
+    M = torch.tensor(matrix.values, dtype=torch.float32)  # (n_households, n_targets)
+    T = torch.tensor(targets.values, dtype=torch.float32)  # (n_targets,)
+    w0_t = torch.tensor(w0, dtype=torch.float32)
+
+    optimizer = torch.optim.Adam([log_w], lr=lr)
+
+    def loss_fn():
+        w = torch.exp(log_w)
+
+        # Hard clamp at the cap (despite its position inside the loss,
+        # this is not a smooth cap).  NOTE(review): torch.clamp has zero
+        # gradient above weight_cap, so a weight pushed past the cap
+        # stops receiving target-loss gradient and can stall there --
+        # consider a smooth cap if capped weights fail to move.
+        if weight_cap is not None:
+            w = torch.clamp(w, max=weight_cap)
+
+        # Target matching: symmetric relative error -- takes the smaller
+        # of the two ratio errors, making the penalty symmetric under
+        # swapping prediction and target; the +1 shift keeps it finite
+        # for zero-valued targets.
+        pred = (w.unsqueeze(1) * M).sum(dim=0)
+        sre = torch.min(
+            ((1 + pred) / (1 + T) - 1) ** 2,
+            ((1 + T) / (1 + pred) - 1) ** 2,
+        )
+        target_loss = sre.mean()
+
+        # Entropy regularisation: KL divergence from prior
+        w_normed = w / w.sum()
+        w0_normed = w0_t / w0_t.sum()
+        # Avoid log(0) with small epsilon
+        kl = (w_normed * torch.log((w_normed + 1e-10) / (w0_normed + 1e-10))).sum()
+
+        return target_loss + entropy_lambda * kl
+
+    for epoch in range(epochs):
+        optimizer.zero_grad()
+        loss = loss_fn()
+        loss.backward()
+        optimizer.step()
+
+        if epoch % 50 == 0:
+            w_current = torch.exp(log_w).detach().numpy()
+            if weight_cap is not None:
+                w_current = np.clip(w_current, 0, weight_cap)
+            logger.info(
+                "Epoch %d: loss=%.6f, max_weight=%.0f, n_nonzero=%d",
+                epoch,
+                loss.item(),
+                w_current.max(),
+                (w_current > 1).sum(),
+            )
+
+    # Final weights: clip exactly as the loss did, so the stored weights
+    # match what the optimiser actually evaluated.
+    final_weights = torch.exp(log_w).detach().numpy()
+    if weight_cap is not None:
+        final_weights = np.clip(final_weights, 0, weight_cap)
+
+    dataset.household["household_weight"] = final_weights
+    return dataset
+
+
+def prune_zero_weight_records(
+    dataset,
+    epsilon: float = 1.0,
+) -> UKSingleYearDataset:
+    """Remove records with near-zero weight after recalibration.
+
+    Args:
+        dataset: UKSingleYearDataset with calibrated weights.
+        epsilon: weight threshold below which records are removed.
+
+    Returns:
+        Pruned UKSingleYearDataset.
+ """ + from policyengine_uk.data import UKSingleYearDataset + + keep_mask = dataset.household.household_weight > epsilon + keep_hh_ids = dataset.household.household_id[keep_mask].values + + person_keep = dataset.person.person_household_id.isin(keep_hh_ids) + keep_bu_ids = dataset.person[person_keep].person_benunit_id.unique() + benunit_keep = dataset.benunit.benunit_id.isin(keep_bu_ids) + + n_removed = (~keep_mask).sum() + logger.info( + "Pruned %d zero-weight records (%.1f%%), %d remain", + n_removed, + 100 * n_removed / len(keep_mask), + keep_mask.sum(), + ) + + return UKSingleYearDataset( + person=dataset.person[person_keep].reset_index(drop=True), + benunit=dataset.benunit[benunit_keep].reset_index(drop=True), + household=dataset.household[keep_mask].reset_index(drop=True), + fiscal_year=int(dataset.time_period), + ) diff --git a/pyproject.toml b/pyproject.toml index 7bad94c4..8f8c2a5f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,8 @@ dependencies = [ "openpyxl", "pydantic>=2.0", "pyyaml", + "sdv>=1.0.0", + "scipy", ] [project.optional-dependencies] diff --git a/uv.lock b/uv.lock index fd7ec078..14d34e40 100644 --- a/uv.lock +++ b/uv.lock @@ -1,6 +1,10 @@ version = 1 revision = 3 requires-python = ">=3.13" +resolution-markers = [ + "python_full_version >= '3.14'", + "python_full_version < '3.14'", +] [[package]] name = "accessible-pygments" @@ -107,7 +111,8 @@ dependencies = [ { name = "msgpack" }, { name = "ndindex" }, { name = "numexpr", marker = "platform_machine != 'wasm32'" }, - { name = "numpy" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "platformdirs" }, { name = "py-cpuinfo", marker = "platform_machine != 'wasm32'" }, { name = "requests" }, @@ -131,6 +136,34 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/24/b5/05dd7a720d8cd5f523146a4e2ff5d051125b23c2395c5423a9b1c42a3889/blosc2-3.11.1-cp314-cp314t-win_amd64.whl", hash = "sha256:cd276ab00b9b6ea2810091879e4665150738b6d323e1f1970ccd62b58df7b9b6", size = 2355394, upload-time = "2025-11-16T16:02:11.061Z" }, ] +[[package]] +name = "boto3" +version = "1.42.69" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "botocore" }, + { name = "jmespath" }, + { name = "s3transfer" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1f/f3/26d800e4efe85e7d59c63ac11d02ab2fafed371bede567af7258eb7e4c1c/boto3-1.42.69.tar.gz", hash = "sha256:e59846f4ff467b23bae4751948298db554dbdda0d72b09028d2cacbeff27e1ad", size = 112777, upload-time = "2026-03-16T20:35:30.77Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f3/39/54ad87436c637de9f7bf83ba2a28cf3b15409cbb849401837fcc37fbd794/boto3-1.42.69-py3-none-any.whl", hash = "sha256:6823a4b59aa578c7d98124280a9b6d83cea04bdb02525cbaa79370e5b6f7f631", size = 140556, upload-time = "2026-03-16T20:35:28.754Z" }, +] + +[[package]] +name = "botocore" +version = "1.42.69" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "jmespath" }, + { name = "python-dateutil" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/42/d1/81a6e39c7d5419ba34bad8a1ac2c5360c26f21af698a481a8397d79134d1/botocore-1.42.69.tar.gz", hash = "sha256:0934f2d90403c5c8c2cba83e754a39d77edcad5885d04a79363edff3e814f55e", size = 14997632, upload-time = "2026-03-16T20:35:18.533Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f6/13/779f3427e17f9989fd0fa6651817c5f13b63e574f3541e460b8238883290/botocore-1.42.69-py3-none-any.whl", hash = "sha256:ef0e3d860a5d7bffc0ccb4911781c4c27d538557ed9a616ba1926c762d72e5f6", size = 14670334, upload-time = "2026-03-16T20:35:14.543Z" }, +] + [[package]] name = "build" version = "1.3.0" @@ -216,6 +249,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/98/78/01c019cdb5d6498122777c1a43056ebb3ebfeef2076d9d026bfe15583b2b/click-8.3.1-py3-none-any.whl", hash = "sha256:981153a64e25f12d547d3426c367a4857371575ee7ad18df2a6183ab0545b2a6", size = 108274, upload-time = "2025-11-15T20:45:41.139Z" }, ] +[[package]] +name = "cloudpickle" +version = "3.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/27/fb/576f067976d320f5f0114a8d9fa1215425441bb35627b1993e5afd8111e5/cloudpickle-3.1.2.tar.gz", hash = "sha256:7fda9eb655c9c230dab534f1983763de5835249750e85fbcef43aaa30a9a2414", size = 22330, upload-time = "2025-11-03T09:25:26.604Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/39/799be3f2f0f38cc727ee3b4f1445fe6d5e4133064ec2e4115069418a5bb6/cloudpickle-3.1.2-py3-none-any.whl", hash = "sha256:9acb47f6afd73f60dc1df93bb801b472f05ff42fa6c84167d25cb206be1fbf4a", size = 22228, upload-time = "2025-11-03T09:25:25.534Z" }, +] + [[package]] name = "colorama" version = "0.4.6" @@ -237,6 +279,39 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6d/c1/e419ef3723a074172b68aaa89c9f3de486ed4c2399e2dbd8113a4fdcaf9e/colorlog-6.10.1-py3-none-any.whl", hash = "sha256:2d7e8348291948af66122cff006c9f8da6255d224e7cf8e37d8de2df3bad8c9c", size = 11743, upload-time = "2025-10-16T16:14:10.512Z" }, ] +[[package]] +name = "copulas" +version = "0.14.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, + { name = "pandas" }, + { name = "plotly" }, + { name = "scipy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3d/a8/f15432ee5a691eafb7ecc5637c01843df92c0801aa83151c35d0fb190c92/copulas-0.14.1.tar.gz", hash = 
"sha256:adec8f65c98f16816bde5a03e9e7e7b3df91f3fb22ef9fd5023ebba8dd1628c8", size = 45007, upload-time = "2026-02-05T18:52:41.019Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/50/2c/7984ead5c59c7d3066d6c7b6a7839991a317ecfb3ab1f963900e14f3c339/copulas-0.14.1-py3-none-any.whl", hash = "sha256:6f010444385a304274e7587b45f56b860ca38b621529ad0f9bbd4024de91dd1d", size = 52663, upload-time = "2026-02-05T18:52:39.263Z" }, +] + +[[package]] +name = "ctgan" +version = "0.12.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, + { name = "pandas" }, + { name = "rdt" }, + { name = "torch" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f4/0e/50724b2e49ad83c2ebd00d8b57753dc07f256b27e64aca1306de5f4666b8/ctgan-0.12.1.tar.gz", hash = "sha256:e545c2b1a752affba3de2933a5f8037228e837f7a73f5593399b52cfe9611bc7", size = 27072, upload-time = "2026-02-13T03:22:40.909Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/c2/e4761d20a9704ba7595ea7d14dc4880c3cd0bd81b8ae588435536b7d8f19/ctgan-0.12.1-py3-none-any.whl", hash = "sha256:38a3b83432643caa8381c74c49e6a079166efa40f8f6c3b7204db44d6d2c8f18", size = 25490, upload-time = "2026-02-13T03:22:39.48Z" }, +] + [[package]] name = "datetime" version = "6.0" @@ -259,6 +334,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a", size = 9190, upload-time = "2025-02-24T04:41:32.565Z" }, ] +[[package]] +name = "deepecho" +version = "0.8.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + 
{ name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, + { name = "pandas" }, + { name = "torch" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f4/d7/68d071d98a2a921121f4e2f2a78ece38ce83dcdeb5dadad42d207b153e07/deepecho-0.8.1.tar.gz", hash = "sha256:7589d9b1be1a482a879caca7f674acf1195441de0c8ae020dd1c17a726472f86", size = 30733, upload-time = "2026-02-12T21:16:35.964Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/dd/43e447dbac86b38e7ac4afc38f24efc396b0bd380d172edf3aa2635e1364/deepecho-0.8.1-py3-none-any.whl", hash = "sha256:1706f85e479b8be5cedfbb14d9823eee5fddff9f3d13e73691241af7bd874e84", size = 28070, upload-time = "2026-02-12T21:16:34.582Z" }, +] + [[package]] name = "defusedxml" version = "0.7.1" @@ -304,6 +395,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c1/ea/53f2148663b321f21b5a606bd5f191517cf40b7072c0497d3c92c4a13b1e/executing-2.2.1-py2.py3-none-any.whl", hash = "sha256:760643d3452b4d777d295bb167ccc74c64a81df23fb5e08eff250c425a4b2017", size = 28317, upload-time = "2025-09-01T09:48:08.5Z" }, ] +[[package]] +name = "faker" +version = "40.11.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "tzdata", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/94/dc/b68e5378e5a7db0ab776efcdd53b6fe374b29d703e156fd5bb4c5437069e/faker-40.11.0.tar.gz", hash = "sha256:7c419299103b13126bd02ec14bd2b47b946edb5a5eedf305e66a193b25f9a734", size = 1957570, upload-time = "2026-03-13T14:36:11.844Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b1/fa/a86c6ba66f0308c95b9288b1e3eaccd934b545646f63494a86f1ec2f8c8e/faker-40.11.0-py3-none-any.whl", hash = 
"sha256:0e9816c950528d2a37d74863f3ef389ea9a3a936cbcde0b11b8499942e25bf90", size = 1989457, upload-time = "2026-03-13T14:36:09.792Z" }, +] + [[package]] name = "filelock" version = "3.20.0" @@ -437,6 +540,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c4/ab/09169d5a4612a5f92490806649ac8d41e3ec9129c636754575b3553f4ea4/googleapis_common_protos-1.72.0-py3-none-any.whl", hash = "sha256:4299c5a82d5ae1a9702ada957347726b167f9f8d1fc352477702a1e851ff4038", size = 297515, upload-time = "2025-11-06T18:29:13.14Z" }, ] +[[package]] +name = "graphviz" +version = "0.21" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f8/b3/3ac91e9be6b761a4b30d66ff165e54439dcd48b83f4e20d644867215f6ca/graphviz-0.21.tar.gz", hash = "sha256:20743e7183be82aaaa8ad6c93f8893c923bd6658a04c32ee115edb3c8a835f78", size = 200434, upload-time = "2025-06-15T09:35:05.824Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/91/4c/e0ce1ef95d4000ebc1c11801f9b944fa5910ecc15b5e351865763d8657f8/graphviz-0.21-py3-none-any.whl", hash = "sha256:54f33de9f4f911d7e84e4191749cac8cc5653f815b06738c54db9a15ab8b1e42", size = 47300, upload-time = "2025-06-15T09:35:04.433Z" }, +] + [[package]] name = "greenlet" version = "3.2.4" @@ -446,7 +558,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" }, { url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" }, { url = 
"https://files.pythonhosted.org/packages/f7/0b/bc13f787394920b23073ca3b6c4a7a21396301ed75a655bcb47196b50e6e/greenlet-3.2.4-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:710638eb93b1fa52823aa91bf75326f9ecdfd5e0466f00789246a5280f4ba0fc", size = 655191, upload-time = "2025-08-07T13:45:29.752Z" }, - { url = "https://files.pythonhosted.org/packages/f2/d6/6adde57d1345a8d0f14d31e4ab9c23cfe8e2cd39c3baf7674b4b0338d266/greenlet-3.2.4-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:c5111ccdc9c88f423426df3fd1811bfc40ed66264d35aa373420a34377efc98a", size = 649516, upload-time = "2025-08-07T13:53:16.314Z" }, { url = "https://files.pythonhosted.org/packages/7f/3b/3a3328a788d4a473889a2d403199932be55b1b0060f4ddd96ee7cdfcad10/greenlet-3.2.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d76383238584e9711e20ebe14db6c88ddcedc1829a9ad31a584389463b5aa504", size = 652169, upload-time = "2025-08-07T13:18:32.861Z" }, { url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" }, { url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" }, @@ -457,7 +568,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" }, { url = 
"https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" }, { url = "https://files.pythonhosted.org/packages/c0/aa/687d6b12ffb505a4447567d1f3abea23bd20e73a5bed63871178e0831b7a/greenlet-3.2.4-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:c17b6b34111ea72fc5a4e4beec9711d2226285f0386ea83477cbb97c30a3f3a5", size = 699218, upload-time = "2025-08-07T13:45:30.969Z" }, - { url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" }, { url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" }, { url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" }, { url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = "2025-11-04T12:42:23.427Z" }, @@ -479,7 +589,8 @@ name 
= "h5py" version = "3.15.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/4d/6a/0d79de0b025aa85dc8864de8e97659c94cf3d23148394a954dc5ca52f8c8/h5py-3.15.1.tar.gz", hash = "sha256:c86e3ed45c4473564de55aa83b6fc9e5ead86578773dfbd93047380042e26b69", size = 426236, upload-time = "2025-10-16T10:35:27.404Z" } wheels = [ @@ -632,7 +743,8 @@ version = "2.5.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "ipython" }, - { name = "numpy" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "pandas" }, ] sdist = { url = "https://files.pythonhosted.org/packages/02/a2/4652db589b5767ead6d1dd8016e94e6adc5ec9e9552ccd17cf1886900b04/itables-2.5.2.tar.gz", hash = "sha256:ec34bbacfbf4305570ea75b36970de442f924126f3701c323a5a46018de84c8a", size = 2356416, upload-time = "2025-09-02T20:14:41.722Z" } @@ -664,6 +776,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, ] +[[package]] +name = "jmespath" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d3/59/322338183ecda247fb5d1763a6cbe46eff7222eaeebafd9fa65d4bf5cb11/jmespath-1.1.0.tar.gz", hash = 
"sha256:472c87d80f36026ae83c6ddd0f1d05d4e510134ed462851fd5f754c8c3cbb88d", size = 27377, upload-time = "2026-01-22T16:35:26.279Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/14/2f/967ba146e6d58cf6a652da73885f52fc68001525b4197effc174321d70b4/jmespath-1.1.0-py3-none-any.whl", hash = "sha256:a5663118de4908c91729bea0acadca56526eb2698e83de10cd116ae0f4e97c64", size = 20419, upload-time = "2026-01-22T16:35:24.919Z" }, +] + [[package]] name = "joblib" version = "1.5.2" @@ -687,7 +808,8 @@ name = "l0-python" version = "0.4.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "scipy" }, { name = "torch" }, ] @@ -799,7 +921,8 @@ version = "0.21.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "l0-python" }, - { name = "numpy" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "optuna" }, { name = "pandas" }, { name = "torch" }, @@ -815,7 +938,8 @@ name = "microdf-python" version = "1.2.2" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "pandas" }, ] sdist = { url = 
"https://files.pythonhosted.org/packages/91/f0/9689f33e2524b0c0d1cdf0d556ad196bfbb2ec0292f4545f467a37b27773/microdf_python-1.2.2.tar.gz", hash = "sha256:7e5f6adc10b0469de0e6549789ede0a2e6c600d0f5c83eafffc009d1495a7933", size = 20395, upload-time = "2026-02-24T10:47:16.438Z" } @@ -829,7 +953,8 @@ version = "1.8.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "joblib" }, - { name = "numpy" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "optuna" }, { name = "pandas" }, { name = "plotly" }, @@ -837,7 +962,8 @@ dependencies = [ { name = "pydantic" }, { name = "quantile-forest" }, { name = "requests" }, - { name = "scikit-learn" }, + { name = "scikit-learn", version = "1.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "scikit-learn", version = "1.8.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "scipy" }, { name = "statsmodels" }, { name = "tqdm" }, @@ -945,7 +1071,8 @@ name = "numexpr" version = "2.14.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/cb/2f/fdba158c9dbe5caca9c3eca3eaffffb251f2fb8674bf8e2d0aed5f38d319/numexpr-2.14.1.tar.gz", hash = "sha256:4be00b1086c7b7a5c32e31558122b7b80243fe098579b170967da83f3152b48b", size = 119400, upload-time = "2025-10-13T16:17:27.351Z" } wheels = [ @@ -987,6 +1114,9 @@ wheels = [ name = 
"numpy" version = "2.1.3" source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.14'", +] sdist = { url = "https://files.pythonhosted.org/packages/25/ca/1166b75c21abd1da445b97bf1fa2f14f423c6cfb4fc7c4ef31dccf9f6a94/numpy-2.1.3.tar.gz", hash = "sha256:aa08e04e08aaf974d4458def539dece0d28146d866a39da5639596f4921fd761", size = 20166090, upload-time = "2024-11-02T17:48:55.832Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/4d/0b/620591441457e25f3404c8057eb924d04f161244cb8a3680d529419aa86e/numpy-2.1.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:96fe52fcdb9345b7cd82ecd34547fca4321f7656d500eca497eb7ea5a926692f", size = 20836263, upload-time = "2024-11-02T17:40:39.528Z" }, @@ -1011,6 +1141,59 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/86/09/a5ab407bd7f5f5599e6a9261f964ace03a73e7c6928de906981c31c38082/numpy-2.1.3-cp313-cp313t-win_amd64.whl", hash = "sha256:2564fbdf2b99b3f815f2107c1bbc93e2de8ee655a69c261363a1172a79a257d4", size = 12644098, upload-time = "2024-11-02T17:46:07.941Z" }, ] +[[package]] +name = "numpy" +version = "2.4.3" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.14'", +] +sdist = { url = "https://files.pythonhosted.org/packages/10/8b/c265f4823726ab832de836cdd184d0986dcf94480f81e8739692a7ac7af2/numpy-2.4.3.tar.gz", hash = "sha256:483a201202b73495f00dbc83796c6ae63137a9bdade074f7648b3e32613412dd", size = 20727743, upload-time = "2026-03-09T07:58:53.426Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b6/d0/1fe47a98ce0df229238b77611340aff92d52691bcbc10583303181abf7fc/numpy-2.4.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b346845443716c8e542d54112966383b448f4a3ba5c66409771b8c0889485dd3", size = 16665297, upload-time = "2026-03-09T07:56:52.296Z" }, + { url = 
"https://files.pythonhosted.org/packages/27/d9/4e7c3f0e68dfa91f21c6fb6cf839bc829ec920688b1ce7ec722b1a6202fb/numpy-2.4.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2629289168f4897a3c4e23dc98d6f1731f0fc0fe52fb9db19f974041e4cc12b9", size = 14691853, upload-time = "2026-03-09T07:56:54.992Z" }, + { url = "https://files.pythonhosted.org/packages/3a/66/bd096b13a87549683812b53ab211e6d413497f84e794fb3c39191948da97/numpy-2.4.3-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:bb2e3cf95854233799013779216c57e153c1ee67a0bf92138acca0e429aefaee", size = 5198435, upload-time = "2026-03-09T07:56:57.184Z" }, + { url = "https://files.pythonhosted.org/packages/a2/2f/687722910b5a5601de2135c891108f51dfc873d8e43c8ed9f4ebb440b4a2/numpy-2.4.3-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:7f3408ff897f8ab07a07fbe2823d7aee6ff644c097cc1f90382511fe982f647f", size = 6546347, upload-time = "2026-03-09T07:56:59.531Z" }, + { url = "https://files.pythonhosted.org/packages/bf/ec/7971c4e98d86c564750393fab8d7d83d0a9432a9d78bb8a163a6dc59967a/numpy-2.4.3-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:decb0eb8a53c3b009b0962378065589685d66b23467ef5dac16cbe818afde27f", size = 15664626, upload-time = "2026-03-09T07:57:01.385Z" }, + { url = "https://files.pythonhosted.org/packages/7e/eb/7daecbea84ec935b7fc732e18f532073064a3816f0932a40a17f3349185f/numpy-2.4.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d5f51900414fc9204a0e0da158ba2ac52b75656e7dce7e77fb9f84bfa343b4cc", size = 16608916, upload-time = "2026-03-09T07:57:04.008Z" }, + { url = "https://files.pythonhosted.org/packages/df/58/2a2b4a817ffd7472dca4421d9f0776898b364154e30c95f42195041dc03b/numpy-2.4.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6bd06731541f89cdc01b261ba2c9e037f1543df7472517836b78dfb15bd6e476", size = 17015824, upload-time = "2026-03-09T07:57:06.347Z" }, + { url = 
"https://files.pythonhosted.org/packages/4a/ca/627a828d44e78a418c55f82dd4caea8ea4a8ef24e5144d9e71016e52fb40/numpy-2.4.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:22654fe6be0e5206f553a9250762c653d3698e46686eee53b399ab90da59bd92", size = 18334581, upload-time = "2026-03-09T07:57:09.114Z" }, + { url = "https://files.pythonhosted.org/packages/cd/c0/76f93962fc79955fcba30a429b62304332345f22d4daec1cb33653425643/numpy-2.4.3-cp313-cp313-win32.whl", hash = "sha256:d71e379452a2f670ccb689ec801b1218cd3983e253105d6e83780967e899d687", size = 5958618, upload-time = "2026-03-09T07:57:11.432Z" }, + { url = "https://files.pythonhosted.org/packages/b1/3c/88af0040119209b9b5cb59485fa48b76f372c73068dbf9254784b975ac53/numpy-2.4.3-cp313-cp313-win_amd64.whl", hash = "sha256:0a60e17a14d640f49146cb38e3f105f571318db7826d9b6fef7e4dce758faecd", size = 12312824, upload-time = "2026-03-09T07:57:13.586Z" }, + { url = "https://files.pythonhosted.org/packages/58/ce/3d07743aced3d173f877c3ef6a454c2174ba42b584ab0b7e6d99374f51ed/numpy-2.4.3-cp313-cp313-win_arm64.whl", hash = "sha256:c9619741e9da2059cd9c3f206110b97583c7152c1dc9f8aafd4beb450ac1c89d", size = 10221218, upload-time = "2026-03-09T07:57:16.183Z" }, + { url = "https://files.pythonhosted.org/packages/62/09/d96b02a91d09e9d97862f4fc8bfebf5400f567d8eb1fe4b0cc4795679c15/numpy-2.4.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:7aa4e54f6469300ebca1d9eb80acd5253cdfa36f2c03d79a35883687da430875", size = 14819570, upload-time = "2026-03-09T07:57:18.564Z" }, + { url = "https://files.pythonhosted.org/packages/b5/ca/0b1aba3905fdfa3373d523b2b15b19029f4f3031c87f4066bd9d20ef6c6b/numpy-2.4.3-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:d1b90d840b25874cf5cd20c219af10bac3667db3876d9a495609273ebe679070", size = 5326113, upload-time = "2026-03-09T07:57:21.052Z" }, + { url = "https://files.pythonhosted.org/packages/c0/63/406e0fd32fcaeb94180fd6a4c41e55736d676c54346b7efbce548b94a914/numpy-2.4.3-cp313-cp313t-macosx_14_0_x86_64.whl", hash = 
"sha256:a749547700de0a20a6718293396ec237bb38218049cfce788e08fcb716e8cf73", size = 6646370, upload-time = "2026-03-09T07:57:22.804Z" }, + { url = "https://files.pythonhosted.org/packages/b6/d0/10f7dc157d4b37af92720a196be6f54f889e90dcd30dce9dc657ed92c257/numpy-2.4.3-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:94f3c4a151a2e529adf49c1d54f0f57ff8f9b233ee4d44af623a81553ab86368", size = 15723499, upload-time = "2026-03-09T07:57:24.693Z" }, + { url = "https://files.pythonhosted.org/packages/66/f1/d1c2bf1161396629701bc284d958dc1efa3a5a542aab83cf11ee6eb4cba5/numpy-2.4.3-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:22c31dc07025123aedf7f2db9e91783df13f1776dc52c6b22c620870dc0fab22", size = 16657164, upload-time = "2026-03-09T07:57:27.676Z" }, + { url = "https://files.pythonhosted.org/packages/1a/be/cca19230b740af199ac47331a21c71e7a3d0ba59661350483c1600d28c37/numpy-2.4.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:148d59127ac95979d6f07e4d460f934ebdd6eed641db9c0db6c73026f2b2101a", size = 17081544, upload-time = "2026-03-09T07:57:30.664Z" }, + { url = "https://files.pythonhosted.org/packages/b9/c5/9602b0cbb703a0936fb40f8a95407e8171935b15846de2f0776e08af04c7/numpy-2.4.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:a97cbf7e905c435865c2d939af3d93f99d18eaaa3cabe4256f4304fb51604349", size = 18380290, upload-time = "2026-03-09T07:57:33.763Z" }, + { url = "https://files.pythonhosted.org/packages/ed/81/9f24708953cd30be9ee36ec4778f4b112b45165812f2ada4cc5ea1c1f254/numpy-2.4.3-cp313-cp313t-win32.whl", hash = "sha256:be3b8487d725a77acccc9924f65fd8bce9af7fac8c9820df1049424a2115af6c", size = 6082814, upload-time = "2026-03-09T07:57:36.491Z" }, + { url = "https://files.pythonhosted.org/packages/e2/9e/52f6eaa13e1a799f0ab79066c17f7016a4a8ae0c1aefa58c82b4dab690b4/numpy-2.4.3-cp313-cp313t-win_amd64.whl", hash = "sha256:1ec84fd7c8e652b0f4aaaf2e6e9cc8eaa9b1b80a537e06b2e3a2fb176eedcb26", size = 12452673, 
upload-time = "2026-03-09T07:57:38.281Z" }, + { url = "https://files.pythonhosted.org/packages/c4/04/b8cece6ead0b30c9fbd99bb835ad7ea0112ac5f39f069788c5558e3b1ab2/numpy-2.4.3-cp313-cp313t-win_arm64.whl", hash = "sha256:120df8c0a81ebbf5b9020c91439fccd85f5e018a927a39f624845be194a2be02", size = 10290907, upload-time = "2026-03-09T07:57:40.747Z" }, + { url = "https://files.pythonhosted.org/packages/70/ae/3936f79adebf8caf81bd7a599b90a561334a658be4dcc7b6329ebf4ee8de/numpy-2.4.3-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:5884ce5c7acfae1e4e1b6fde43797d10aa506074d25b531b4f54bde33c0c31d4", size = 16664563, upload-time = "2026-03-09T07:57:43.817Z" }, + { url = "https://files.pythonhosted.org/packages/9b/62/760f2b55866b496bb1fa7da2a6db076bef908110e568b02fcfc1422e2a3a/numpy-2.4.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:297837823f5bc572c5f9379b0c9f3a3365f08492cbdc33bcc3af174372ebb168", size = 14702161, upload-time = "2026-03-09T07:57:46.169Z" }, + { url = "https://files.pythonhosted.org/packages/32/af/a7a39464e2c0a21526fb4fb76e346fb172ebc92f6d1c7a07c2c139cc17b1/numpy-2.4.3-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:a111698b4a3f8dcbe54c64a7708f049355abd603e619013c346553c1fd4ca90b", size = 5208738, upload-time = "2026-03-09T07:57:48.506Z" }, + { url = "https://files.pythonhosted.org/packages/29/8c/2a0cf86a59558fa078d83805589c2de490f29ed4fb336c14313a161d358a/numpy-2.4.3-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:4bd4741a6a676770e0e97fe9ab2e51de01183df3dcbcec591d26d331a40de950", size = 6543618, upload-time = "2026-03-09T07:57:50.591Z" }, + { url = "https://files.pythonhosted.org/packages/aa/b8/612ce010c0728b1c363fa4ea3aa4c22fe1c5da1de008486f8c2f5cb92fae/numpy-2.4.3-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:54f29b877279d51e210e0c80709ee14ccbbad647810e8f3d375561c45ef613dd", size = 15680676, upload-time = "2026-03-09T07:57:52.34Z" }, + { url = 
"https://files.pythonhosted.org/packages/a9/7e/4f120ecc54ba26ddf3dc348eeb9eb063f421de65c05fc961941798feea18/numpy-2.4.3-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:679f2a834bae9020f81534671c56fd0cc76dd7e5182f57131478e23d0dc59e24", size = 16613492, upload-time = "2026-03-09T07:57:54.91Z" }, + { url = "https://files.pythonhosted.org/packages/2c/86/1b6020db73be330c4b45d5c6ee4295d59cfeef0e3ea323959d053e5a6909/numpy-2.4.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d84f0f881cb2225c2dfd7f78a10a5645d487a496c6668d6cc39f0f114164f3d0", size = 17031789, upload-time = "2026-03-09T07:57:57.641Z" }, + { url = "https://files.pythonhosted.org/packages/07/3a/3b90463bf41ebc21d1b7e06079f03070334374208c0f9a1f05e4ae8455e7/numpy-2.4.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d213c7e6e8d211888cc359bab7199670a00f5b82c0978b9d1c75baf1eddbeac0", size = 18339941, upload-time = "2026-03-09T07:58:00.577Z" }, + { url = "https://files.pythonhosted.org/packages/a8/74/6d736c4cd962259fd8bae9be27363eb4883a2f9069763747347544c2a487/numpy-2.4.3-cp314-cp314-win32.whl", hash = "sha256:52077feedeff7c76ed7c9f1a0428558e50825347b7545bbb8523da2cd55c547a", size = 6007503, upload-time = "2026-03-09T07:58:03.331Z" }, + { url = "https://files.pythonhosted.org/packages/48/39/c56ef87af669364356bb011922ef0734fc49dad51964568634c72a009488/numpy-2.4.3-cp314-cp314-win_amd64.whl", hash = "sha256:0448e7f9caefb34b4b7dd2b77f21e8906e5d6f0365ad525f9f4f530b13df2afc", size = 12444915, upload-time = "2026-03-09T07:58:06.353Z" }, + { url = "https://files.pythonhosted.org/packages/9d/1f/ab8528e38d295fd349310807496fabb7cf9fe2e1f70b97bc20a483ea9d4a/numpy-2.4.3-cp314-cp314-win_arm64.whl", hash = "sha256:b44fd60341c4d9783039598efadd03617fa28d041fc37d22b62d08f2027fa0e7", size = 10494875, upload-time = "2026-03-09T07:58:08.734Z" }, + { url = 
"https://files.pythonhosted.org/packages/e6/ef/b7c35e4d5ef141b836658ab21a66d1a573e15b335b1d111d31f26c8ef80f/numpy-2.4.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:0a195f4216be9305a73c0e91c9b026a35f2161237cf1c6de9b681637772ea657", size = 14822225, upload-time = "2026-03-09T07:58:11.034Z" }, + { url = "https://files.pythonhosted.org/packages/cd/8d/7730fa9278cf6648639946cc816e7cc89f0d891602584697923375f801ed/numpy-2.4.3-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:cd32fbacb9fd1bf041bf8e89e4576b6f00b895f06d00914820ae06a616bdfef7", size = 5328769, upload-time = "2026-03-09T07:58:13.67Z" }, + { url = "https://files.pythonhosted.org/packages/47/01/d2a137317c958b074d338807c1b6a383406cdf8b8e53b075d804cc3d211d/numpy-2.4.3-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:2e03c05abaee1f672e9d67bc858f300b5ccba1c21397211e8d77d98350972093", size = 6649461, upload-time = "2026-03-09T07:58:15.912Z" }, + { url = "https://files.pythonhosted.org/packages/5c/34/812ce12bc0f00272a4b0ec0d713cd237cb390666eb6206323d1cc9cedbb2/numpy-2.4.3-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7d1ce23cce91fcea443320a9d0ece9b9305d4368875bab09538f7a5b4131938a", size = 15725809, upload-time = "2026-03-09T07:58:17.787Z" }, + { url = "https://files.pythonhosted.org/packages/25/c0/2aed473a4823e905e765fee3dc2cbf504bd3e68ccb1150fbdabd5c39f527/numpy-2.4.3-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c59020932feb24ed49ffd03704fbab89f22aa9c0d4b180ff45542fe8918f5611", size = 16655242, upload-time = "2026-03-09T07:58:20.476Z" }, + { url = "https://files.pythonhosted.org/packages/f2/c8/7e052b2fc87aa0e86de23f20e2c42bd261c624748aa8efd2c78f7bb8d8c6/numpy-2.4.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:9684823a78a6cd6ad7511fc5e25b07947d1d5b5e2812c93fe99d7d4195130720", size = 17080660, upload-time = "2026-03-09T07:58:23.067Z" }, + { url = 
"https://files.pythonhosted.org/packages/f3/3d/0876746044db2adcb11549f214d104f2e1be00f07a67edbb4e2812094847/numpy-2.4.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:0200b25c687033316fb39f0ff4e3e690e8957a2c3c8d22499891ec58c37a3eb5", size = 18380384, upload-time = "2026-03-09T07:58:25.839Z" }, + { url = "https://files.pythonhosted.org/packages/07/12/8160bea39da3335737b10308df4f484235fd297f556745f13092aa039d3b/numpy-2.4.3-cp314-cp314t-win32.whl", hash = "sha256:5e10da9e93247e554bb1d22f8edc51847ddd7dde52d85ce31024c1b4312bfba0", size = 6154547, upload-time = "2026-03-09T07:58:28.289Z" }, + { url = "https://files.pythonhosted.org/packages/42/f3/76534f61f80d74cc9cdf2e570d3d4eeb92c2280a27c39b0aaf471eda7b48/numpy-2.4.3-cp314-cp314t-win_amd64.whl", hash = "sha256:45f003dbdffb997a03da2d1d0cb41fbd24a87507fb41605c0420a3db5bd4667b", size = 12633645, upload-time = "2026-03-09T07:58:30.384Z" }, + { url = "https://files.pythonhosted.org/packages/1f/b6/7c0d4334c15983cec7f92a69e8ce9b1e6f31857e5ee3a413ac424e6bd63d/numpy-2.4.3-cp314-cp314t-win_arm64.whl", hash = "sha256:4d382735cecd7bcf090172489a525cd7d4087bc331f7df9f60ddc9a296cf208e", size = 10565454, upload-time = "2026-03-09T07:58:33.031Z" }, +] + [[package]] name = "nvidia-cublas-cu12" version = "12.8.4.1" @@ -1173,7 +1356,8 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "alembic" }, { name = "colorlog" }, - { name = "numpy" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "packaging" }, { name = "pyyaml" }, { name = "sqlalchemy" }, @@ -1198,7 +1382,8 @@ name = "pandas" version = "2.3.3" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, 
marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "python-dateutil" }, { name = "pytz" }, { name = "tzdata" }, @@ -1256,7 +1441,8 @@ name = "patsy" version = "1.0.2" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/be/44/ed13eccdd0519eff265f44b670d46fbb0ec813e2274932dc1c0e48520f7d/patsy-1.0.2.tar.gz", hash = "sha256:cdc995455f6233e90e22de72c37fcadb344e7586fb83f06696f54d92f8ce74c0", size = 399942, upload-time = "2025-10-20T16:17:37.535Z" } wheels = [ @@ -1333,7 +1519,8 @@ dependencies = [ { name = "ipython" }, { name = "microdf-python" }, { name = "numexpr" }, - { name = "numpy" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "pandas" }, { name = "plotly" }, { name = "psutil" }, @@ -1366,7 +1553,7 @@ wheels = [ [[package]] name = "policyengine-uk-data" -version = "1.40.3" +version = "1.44.0" source = { editable = "." 
} dependencies = [ { name = "google-auth" }, @@ -1385,6 +1572,8 @@ dependencies = [ { name = "requests" }, { name = "rich" }, { name = "ruff" }, + { name = "scipy" }, + { name = "sdv" }, { name = "tabulate" }, { name = "tqdm" }, ] @@ -1427,6 +1616,8 @@ requires-dist = [ { name = "rich", specifier = ">=13.0.0" }, { name = "ruff", specifier = ">=0.9.0" }, { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.9.0" }, + { name = "scipy" }, + { name = "sdv", specifier = ">=1.0.0" }, { name = "tables", marker = "extra == 'dev'" }, { name = "tabulate" }, { name = "torch", marker = "extra == 'dev'" }, @@ -1716,8 +1907,10 @@ name = "quantile-forest" version = "1.4.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy" }, - { name = "scikit-learn" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, + { name = "scikit-learn", version = "1.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "scikit-learn", version = "1.8.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "scipy" }, ] sdist = { url = "https://files.pythonhosted.org/packages/62/6e/3f1493d4abcce71fdc82ed575475d3e02da7b03375129e84be2622e1532f/quantile_forest-1.4.1.tar.gz", hash = "sha256:713a23c69562b7551ba4a05c22ce9d0e90db6a73d043e760b29c331cb19dc552", size = 486249, upload-time = "2025-09-10T12:48:04.578Z" } @@ -1729,6 +1922,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f2/be/f77c6705e974b23353c43da1cd93e11fe0afc7e859c2d14f748d25cc0376/quantile_forest-1.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:fe33f6a8b63b3617568cc1254e1802a70ce3ac23897790f3be10f8db5257fe83", size = 685417, upload-time = "2025-09-10T12:47:57.346Z" }, ] 
+[[package]] +name = "rdt" +version = "1.20.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "faker" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, + { name = "pandas" }, + { name = "python-dateutil" }, + { name = "scikit-learn", version = "1.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "scikit-learn", version = "1.8.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, + { name = "scipy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6b/2d/0218de90d3f995ecc2c86a1bd5c6f6a6c1a4109979389e4dbecd308d1b90/rdt-1.20.0.tar.gz", hash = "sha256:2f68e62f1a722cccea8b2ac44cfb2b1ade6e632f54684d84cf6e86c2c5fce773", size = 65388, upload-time = "2026-01-23T20:07:36.578Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9a/f2/0a95df18b5f549932228a196cace4571253b088f30430c0eec8662056256/rdt-1.20.0-py3-none-any.whl", hash = "sha256:e239edde36fd2bc1de51c119d93105a074adc8fdd6aa1941fa50af61ccb65dac", size = 74468, upload-time = "2026-01-23T20:07:34.107Z" }, +] + [[package]] name = "requests" version = "2.32.5" @@ -1803,15 +2015,30 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fe/4e/cd76eca6db6115604b7626668e891c9dd03330384082e33662fb0f113614/ruff-0.15.5-py3-none-win_arm64.whl", hash = "sha256:b498d1c60d2fe5c10c45ec3f698901065772730b411f164ae270bb6bfcc4740b", size = 10965572, upload-time = "2026-03-05T20:06:16.984Z" }, ] +[[package]] +name = "s3transfer" +version = "0.16.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "botocore" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/05/04/74127fc843314818edfa81b5540e26dd537353b123a4edc563109d8f17dd/s3transfer-0.16.0.tar.gz", hash = "sha256:8e990f13268025792229cd52fa10cb7163744bf56e719e0b9cb925ab79abf920", size = 153827, upload-time = "2025-12-01T02:30:59.114Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fc/51/727abb13f44c1fcf6d145979e1535a35794db0f6e450a0cb46aa24732fe2/s3transfer-0.16.0-py3-none-any.whl", hash = "sha256:18e25d66fed509e3868dc1572b3f427ff947dd2c56f844a5bf09481ad3f3b2fe", size = 86830, upload-time = "2025-12-01T02:30:57.729Z" }, +] + [[package]] name = "scikit-learn" version = "1.7.2" source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.14'", +] dependencies = [ - { name = "joblib" }, - { name = "numpy" }, - { name = "scipy" }, - { name = "threadpoolctl" }, + { name = "joblib", marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "scipy", marker = "python_full_version < '3.14'" }, + { name = "threadpoolctl", marker = "python_full_version < '3.14'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/98/c2/a7855e41c9d285dfe86dc50b250978105dce513d6e459ea66a6aeb0e1e0c/scikit_learn-1.7.2.tar.gz", hash = "sha256:20e9e49ecd130598f1ca38a1d85090e1a600147b9c02fa6f15d69cb53d968fda", size = 7193136, upload-time = "2025-09-09T08:21:29.075Z" } wheels = [ @@ -1832,12 +2059,54 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8e/87/24f541b6d62b1794939ae6422f8023703bbf6900378b2b34e0b4384dfefd/scikit_learn-1.7.2-cp314-cp314-win_amd64.whl", hash = "sha256:bb24510ed3f9f61476181e4db51ce801e2ba37541def12dc9333b946fc7a9cf8", size = 8820007, upload-time = "2025-09-09T08:21:26.713Z" }, ] +[[package]] +name = "scikit-learn" +version = "1.8.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= 
'3.14'", +] +dependencies = [ + { name = "joblib", marker = "python_full_version >= '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, + { name = "scipy", marker = "python_full_version >= '3.14'" }, + { name = "threadpoolctl", marker = "python_full_version >= '3.14'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0e/d4/40988bf3b8e34feec1d0e6a051446b1f66225f8529b9309becaeef62b6c4/scikit_learn-1.8.0.tar.gz", hash = "sha256:9bccbb3b40e3de10351f8f5068e105d0f4083b1a65fa07b6634fbc401a6287fd", size = 7335585, upload-time = "2025-12-10T07:08:53.618Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/03/aa/e22e0768512ce9255eba34775be2e85c2048da73da1193e841707f8f039c/scikit_learn-1.8.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0d6ae97234d5d7079dc0040990a6f7aeb97cb7fa7e8945f1999a429b23569e0a", size = 8513770, upload-time = "2025-12-10T07:08:03.251Z" }, + { url = "https://files.pythonhosted.org/packages/58/37/31b83b2594105f61a381fc74ca19e8780ee923be2d496fcd8d2e1147bd99/scikit_learn-1.8.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:edec98c5e7c128328124a029bceb09eda2d526997780fef8d65e9a69eead963e", size = 8044458, upload-time = "2025-12-10T07:08:05.336Z" }, + { url = "https://files.pythonhosted.org/packages/2d/5a/3f1caed8765f33eabb723596666da4ebbf43d11e96550fb18bdec42b467b/scikit_learn-1.8.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:74b66d8689d52ed04c271e1329f0c61635bcaf5b926db9b12d58914cdc01fe57", size = 8610341, upload-time = "2025-12-10T07:08:07.732Z" }, + { url = "https://files.pythonhosted.org/packages/38/cf/06896db3f71c75902a8e9943b444a56e727418f6b4b4a90c98c934f51ed4/scikit_learn-1.8.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8fdf95767f989b0cfedb85f7ed8ca215d4be728031f56ff5a519ee1e3276dc2e", size = 8900022, upload-time = "2025-12-10T07:08:09.862Z" }, + { url = 
"https://files.pythonhosted.org/packages/1c/f9/9b7563caf3ec8873e17a31401858efab6b39a882daf6c1bfa88879c0aa11/scikit_learn-1.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:2de443b9373b3b615aec1bb57f9baa6bb3a9bd093f1269ba95c17d870422b271", size = 7989409, upload-time = "2025-12-10T07:08:12.028Z" }, + { url = "https://files.pythonhosted.org/packages/49/bd/1f4001503650e72c4f6009ac0c4413cb17d2d601cef6f71c0453da2732fc/scikit_learn-1.8.0-cp313-cp313-win_arm64.whl", hash = "sha256:eddde82a035681427cbedded4e6eff5e57fa59216c2e3e90b10b19ab1d0a65c3", size = 7619760, upload-time = "2025-12-10T07:08:13.688Z" }, + { url = "https://files.pythonhosted.org/packages/d2/7d/a630359fc9dcc95496588c8d8e3245cc8fd81980251079bc09c70d41d951/scikit_learn-1.8.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:7cc267b6108f0a1499a734167282c00c4ebf61328566b55ef262d48e9849c735", size = 8826045, upload-time = "2025-12-10T07:08:15.215Z" }, + { url = "https://files.pythonhosted.org/packages/cc/56/a0c86f6930cfcd1c7054a2bc417e26960bb88d32444fe7f71d5c2cfae891/scikit_learn-1.8.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:fe1c011a640a9f0791146011dfd3c7d9669785f9fed2b2a5f9e207536cf5c2fd", size = 8420324, upload-time = "2025-12-10T07:08:17.561Z" }, + { url = "https://files.pythonhosted.org/packages/46/1e/05962ea1cebc1cf3876667ecb14c283ef755bf409993c5946ade3b77e303/scikit_learn-1.8.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:72358cce49465d140cc4e7792015bb1f0296a9742d5622c67e31399b75468b9e", size = 8680651, upload-time = "2025-12-10T07:08:19.952Z" }, + { url = "https://files.pythonhosted.org/packages/fe/56/a85473cd75f200c9759e3a5f0bcab2d116c92a8a02ee08ccd73b870f8bb4/scikit_learn-1.8.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:80832434a6cc114f5219211eec13dcbc16c2bac0e31ef64c6d346cde3cf054cb", size = 8925045, upload-time = "2025-12-10T07:08:22.11Z" }, + { url = 
"https://files.pythonhosted.org/packages/cc/b7/64d8cfa896c64435ae57f4917a548d7ac7a44762ff9802f75a79b77cb633/scikit_learn-1.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:ee787491dbfe082d9c3013f01f5991658b0f38aa8177e4cd4bf434c58f551702", size = 8507994, upload-time = "2025-12-10T07:08:23.943Z" }, + { url = "https://files.pythonhosted.org/packages/5e/37/e192ea709551799379958b4c4771ec507347027bb7c942662c7fbeba31cb/scikit_learn-1.8.0-cp313-cp313t-win_arm64.whl", hash = "sha256:bf97c10a3f5a7543f9b88cbf488d33d175e9146115a451ae34568597ba33dcde", size = 7869518, upload-time = "2025-12-10T07:08:25.71Z" }, + { url = "https://files.pythonhosted.org/packages/24/05/1af2c186174cc92dcab2233f327336058c077d38f6fe2aceb08e6ab4d509/scikit_learn-1.8.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:c22a2da7a198c28dd1a6e1136f19c830beab7fdca5b3e5c8bba8394f8a5c45b3", size = 8528667, upload-time = "2025-12-10T07:08:27.541Z" }, + { url = "https://files.pythonhosted.org/packages/a8/25/01c0af38fe969473fb292bba9dc2b8f9b451f3112ff242c647fee3d0dfe7/scikit_learn-1.8.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:6b595b07a03069a2b1740dc08c2299993850ea81cce4fe19b2421e0c970de6b7", size = 8066524, upload-time = "2025-12-10T07:08:29.822Z" }, + { url = "https://files.pythonhosted.org/packages/be/ce/a0623350aa0b68647333940ee46fe45086c6060ec604874e38e9ab7d8e6c/scikit_learn-1.8.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:29ffc74089f3d5e87dfca4c2c8450f88bdc61b0fc6ed5d267f3988f19a1309f6", size = 8657133, upload-time = "2025-12-10T07:08:31.865Z" }, + { url = "https://files.pythonhosted.org/packages/b8/cb/861b41341d6f1245e6ca80b1c1a8c4dfce43255b03df034429089ca2a2c5/scikit_learn-1.8.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fb65db5d7531bccf3a4f6bec3462223bea71384e2cda41da0f10b7c292b9e7c4", size = 8923223, upload-time = "2025-12-10T07:08:34.166Z" }, + { url = 
"https://files.pythonhosted.org/packages/76/18/a8def8f91b18cd1ba6e05dbe02540168cb24d47e8dcf69e8d00b7da42a08/scikit_learn-1.8.0-cp314-cp314-win_amd64.whl", hash = "sha256:56079a99c20d230e873ea40753102102734c5953366972a71d5cb39a32bc40c6", size = 8096518, upload-time = "2025-12-10T07:08:36.339Z" }, + { url = "https://files.pythonhosted.org/packages/d1/77/482076a678458307f0deb44e29891d6022617b2a64c840c725495bee343f/scikit_learn-1.8.0-cp314-cp314-win_arm64.whl", hash = "sha256:3bad7565bc9cf37ce19a7c0d107742b320c1285df7aab1a6e2d28780df167242", size = 7754546, upload-time = "2025-12-10T07:08:38.128Z" }, + { url = "https://files.pythonhosted.org/packages/2d/d1/ef294ca754826daa043b2a104e59960abfab4cf653891037d19dd5b6f3cf/scikit_learn-1.8.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:4511be56637e46c25721e83d1a9cea9614e7badc7040c4d573d75fbe257d6fd7", size = 8848305, upload-time = "2025-12-10T07:08:41.013Z" }, + { url = "https://files.pythonhosted.org/packages/5b/e2/b1f8b05138ee813b8e1a4149f2f0d289547e60851fd1bb268886915adbda/scikit_learn-1.8.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:a69525355a641bf8ef136a7fa447672fb54fe8d60cab5538d9eb7c6438543fb9", size = 8432257, upload-time = "2025-12-10T07:08:42.873Z" }, + { url = "https://files.pythonhosted.org/packages/26/11/c32b2138a85dcb0c99f6afd13a70a951bfdff8a6ab42d8160522542fb647/scikit_learn-1.8.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c2656924ec73e5939c76ac4c8b026fc203b83d8900362eb2599d8aee80e4880f", size = 8678673, upload-time = "2025-12-10T07:08:45.362Z" }, + { url = "https://files.pythonhosted.org/packages/c7/57/51f2384575bdec454f4fe4e7a919d696c9ebce914590abf3e52d47607ab8/scikit_learn-1.8.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:15fc3b5d19cc2be65404786857f2e13c70c83dd4782676dd6814e3b89dc8f5b9", size = 8922467, upload-time = "2025-12-10T07:08:47.408Z" }, + { url = 
"https://files.pythonhosted.org/packages/35/4d/748c9e2872637a57981a04adc038dacaa16ba8ca887b23e34953f0b3f742/scikit_learn-1.8.0-cp314-cp314t-win_amd64.whl", hash = "sha256:00d6f1d66fbcf4eba6e356e1420d33cc06c70a45bb1363cd6f6a8e4ebbbdece2", size = 8774395, upload-time = "2025-12-10T07:08:49.337Z" }, + { url = "https://files.pythonhosted.org/packages/60/22/d7b2ebe4704a5e50790ba089d5c2ae308ab6bb852719e6c3bd4f04c3a363/scikit_learn-1.8.0-cp314-cp314t-win_arm64.whl", hash = "sha256:f28dd15c6bb0b66ba09728cf09fd8736c304be29409bd8445a080c1280619e8c", size = 8002647, upload-time = "2025-12-10T07:08:51.601Z" }, +] + [[package]] name = "scipy" version = "1.16.3" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/0a/ca/d8ace4f98322d01abcd52d381134344bf7b431eba7ed8b42bdea5a3c2ac9/scipy-1.16.3.tar.gz", hash = "sha256:01e87659402762f43bd2fee13370553a17ada367d42e7487800bf2916535aecb", size = 30597883, upload-time = "2025-10-28T17:38:54.068Z" } wheels = [ @@ -1883,6 +2152,52 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/64/47/a494741db7280eae6dc033510c319e34d42dd41b7ac0c7ead39354d1a2b5/scipy-1.16.3-cp314-cp314t-win_arm64.whl", hash = "sha256:21d9d6b197227a12dcbf9633320a4e34c6b0e51c57268df255a0942983bac562", size = 26464127, upload-time = "2025-10-28T17:38:11.34Z" }, ] +[[package]] +name = "sdmetrics" +version = "0.27.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "copulas" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = 
"https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, + { name = "pandas" }, + { name = "plotly" }, + { name = "scikit-learn", version = "1.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "scikit-learn", version = "1.8.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, + { name = "scipy" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1a/94/8b09a3df8d3572104528a3a52b7f7ea6e2b304495ef3e34ce570e4b653c1/sdmetrics-0.27.2.tar.gz", hash = "sha256:afb5b5b7084b62dbcecb3fdaeeac67e1c4ba4ed05f760342336bf67ea4e5cc52", size = 137564, upload-time = "2026-02-26T22:14:54.696Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c5/eb/c764f388b4b86b2c48e4c7af5cceba577d57dfc35e100e5ad59b8b3e5174/sdmetrics-0.27.2-py3-none-any.whl", hash = "sha256:c6ebb17850716bd290e7fb39c6bc520b2752c85961e5bafd63ca1fb55aa0adc1", size = 201455, upload-time = "2026-02-26T22:14:52.621Z" }, +] + +[[package]] +name = "sdv" +version = "1.34.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "boto3" }, + { name = "botocore" }, + { name = "cloudpickle" }, + { name = "copulas" }, + { name = "ctgan" }, + { name = "deepecho" }, + { name = "graphviz" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, + { name = "pandas" }, + { name = "platformdirs" }, + { name = "pyyaml" }, + { name = "rdt" }, + { name = "sdmetrics" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c6/8f/1f31e3ec68ed9ab28a7882416c627ecd3c51b37b6852c250afb45f82606a/sdv-1.34.3.tar.gz", hash = "sha256:e85238b9bbde276a9386c9221b7749f35a4e02ba53467674ee8123afb85be474", size = 174054, 
upload-time = "2026-03-06T20:34:15.073Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/aa/8a/4991c5351a384c9f8f405e9483f045b01243c9c947374fcc9d77b97d6299/sdv-1.34.3-py3-none-any.whl", hash = "sha256:19e89eb0f48baf971a30b2dd5fcb9272d9939a7206813277542da4c50d5d238e", size = 200876, upload-time = "2026-03-06T20:34:13.385Z" }, +] + [[package]] name = "setuptools" version = "80.9.0" @@ -2089,7 +2404,8 @@ name = "statsmodels" version = "0.14.5" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "packaging" }, { name = "pandas" }, { name = "patsy" }, @@ -2130,7 +2446,8 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "blosc2" }, { name = "numexpr" }, - { name = "numpy" }, + { name = "numpy", version = "2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, { name = "packaging" }, { name = "py-cpuinfo" }, { name = "typing-extensions" }, From db6a4b71d6267be486b581c7d42053297e89181b Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Tue, 17 Mar 2026 16:39:32 +0000 Subject: [PATCH 2/2] Add weight diagnostics visualisation script Produces charts showing weight distribution, Kish effective sample sizes by population slice, high-influence records table, influence heatmap, and weight-vs-influence scatter plot. 
--- analysis/visualise_weight_diagnostics.py | 243 +++++++++++++++++++++++ 1 file changed, 243 insertions(+) create mode 100644 analysis/visualise_weight_diagnostics.py diff --git a/analysis/visualise_weight_diagnostics.py b/analysis/visualise_weight_diagnostics.py new file mode 100644 index 00000000..0aa556ed --- /dev/null +++ b/analysis/visualise_weight_diagnostics.py @@ -0,0 +1,243 @@ +"""Visualise weight distribution diagnostics for the enhanced FRS. + +Produces a set of charts showing: +1. Weight distribution histogram (before regularisation) +2. Per-slice Kish effective sample sizes +3. Top high-influence records +4. Influence heatmap (top records x statistics) +""" + +import json +import logging +import sys + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +DATASET_PATH = "policyengine_uk_data/storage/enhanced_frs_2023_24.h5" +OUTPUT_PREFIX = "analysis/weight_diagnostics" +TIME_PERIOD = "2025" +# Use fewer reforms for speed; increase for production +N_REFORMS = 10 +THRESHOLD = 0.05 + + +def main(): + from policyengine_uk.data import UKSingleYearDataset + from policyengine_uk_data.diagnostics.influence import ( + compute_influence_matrix, + compute_kish_effective_sample_size, + find_high_influence_records, + _build_slice_assignments, + ) + from policyengine_uk import Microsimulation + + logger.info("Loading dataset from %s", DATASET_PATH) + dataset = UKSingleYearDataset(file_path=DATASET_PATH) + + sim = Microsimulation(dataset=dataset) + sim.default_calculation_period = TIME_PERIOD + + weights = np.asarray( + sim.calculate("household_weight", map_to="household"), + dtype=float, + ) + + # ── 1. 
Weight distribution ────────────────────────────────────── + fig, axes = plt.subplots(1, 2, figsize=(14, 5)) + + ax = axes[0] + ax.hist(weights, bins=100, edgecolor="white", alpha=0.8, color="#2563eb") + ax.set_xlabel("Household weight") + ax.set_ylabel("Count") + ax.set_title("Weight distribution (all households)") + ax.axvline( + np.median(weights), + color="red", + linestyle="--", + label=f"Median: {np.median(weights):,.0f}", + ) + ax.axvline( + np.percentile(weights, 90), + color="orange", + linestyle="--", + label=f"P90: {np.percentile(weights, 90):,.0f}", + ) + ax.axvline( + np.percentile(weights, 99), + color="darkred", + linestyle="--", + label=f"P99: {np.percentile(weights, 99):,.0f}", + ) + ax.legend() + + ax = axes[1] + log_weights = np.log10(np.maximum(weights, 1)) + ax.hist(log_weights, bins=80, edgecolor="white", alpha=0.8, color="#7c3aed") + ax.set_xlabel("log₁₀(weight)") + ax.set_ylabel("Count") + ax.set_title("Weight distribution (log scale)") + + plt.tight_layout() + fig.savefig(f"{OUTPUT_PREFIX}_weight_dist.png", dpi=150, bbox_inches="tight") + logger.info("Saved weight distribution plot") + plt.close() + + # ── 2. 
Kish effective sample size by slice ────────────────────── + slices = _build_slice_assignments(sim, TIME_PERIOD) + kish_data = {"overall": compute_kish_effective_sample_size(weights)} + for slice_name, labels in slices.items(): + for label in np.unique(labels): + if label is None: + continue + mask = labels == label + n_actual = mask.sum() + n_eff = compute_kish_effective_sample_size(weights, mask) + kish_data[f"{slice_name}={label}"] = n_eff + + kish_df = pd.DataFrame( + {"slice": list(kish_data.keys()), "kish_n_eff": list(kish_data.values())} + ).sort_values("kish_n_eff") + + fig, ax = plt.subplots(figsize=(10, max(6, len(kish_df) * 0.3))) + colors = [ + "#ef4444" if v < 100 else "#f59e0b" if v < 500 else "#22c55e" + for v in kish_df.kish_n_eff + ] + ax.barh(kish_df.slice, kish_df.kish_n_eff, color=colors, edgecolor="white") + ax.set_xlabel("Kish effective sample size") + ax.set_title("Effective sample size by population slice") + ax.axvline(100, color="red", linestyle=":", alpha=0.5, label="n_eff = 100") + ax.axvline(500, color="orange", linestyle=":", alpha=0.5, label="n_eff = 500") + ax.legend() + plt.tight_layout() + fig.savefig(f"{OUTPUT_PREFIX}_kish.png", dpi=150, bbox_inches="tight") + logger.info("Saved Kish ESS plot") + plt.close() + + # ── 3. 
Influence matrix ───────────────────────────────────────── + logger.info("Computing baseline influence matrix...") + infl = compute_influence_matrix(sim, TIME_PERIOD) + flagged = find_high_influence_records(infl, THRESHOLD) + + if not flagged.empty: + # Top flagged records table + fig, ax = plt.subplots(figsize=(12, max(4, len(flagged.head(20)) * 0.4))) + ax.axis("off") + table_data = flagged.head(20).copy() + table_data["max_influence"] = table_data["max_influence"].map( + lambda x: f"{x:.3f}" + ) + table = ax.table( + cellText=table_data.values, + colLabels=table_data.columns, + cellLoc="center", + loc="center", + ) + table.auto_set_font_size(False) + table.set_fontsize(8) + table.auto_set_column_width(col=list(range(len(table_data.columns)))) + ax.set_title( + f"Top {min(20, len(flagged))} high-influence records " + f"(threshold={THRESHOLD})", + fontsize=12, + pad=20, + ) + plt.tight_layout() + fig.savefig( + f"{OUTPUT_PREFIX}_flagged_records.png", dpi=150, bbox_inches="tight" + ) + logger.info("Saved flagged records table") + plt.close() + + # Influence heatmap for top records + top_n = min(15, len(flagged)) + top_indices = flagged.record_idx.iloc[:top_n].values + + # Select columns with highest max influence + col_maxes = infl.max(axis=0).sort_values(ascending=False) + top_cols = col_maxes.head(30).index + heatmap_data = infl.iloc[top_indices][top_cols] + + fig, ax = plt.subplots(figsize=(16, max(4, top_n * 0.5))) + im = ax.imshow(heatmap_data.values, aspect="auto", cmap="YlOrRd") + ax.set_yticks(range(top_n)) + ax.set_yticklabels([f"HH #{idx}" for idx in top_indices], fontsize=7) + ax.set_xticks(range(len(top_cols))) + ax.set_xticklabels( + [c.split("/")[-1][:25] for c in top_cols], + rotation=90, + fontsize=6, + ) + ax.set_title("Influence heatmap: top records × top statistics") + plt.colorbar(im, ax=ax, label="Influence fraction") + plt.tight_layout() + fig.savefig(f"{OUTPUT_PREFIX}_heatmap.png", dpi=150, bbox_inches="tight") + logger.info("Saved influence 
heatmap") + plt.close() + else: + logger.info("No records exceed influence threshold — no flagged records plot") + + # ── 4. Weight vs influence scatter ────────────────────────────── + max_infl_per_record = infl.max(axis=1) if not infl.empty else pd.Series(dtype=float) + + if not max_infl_per_record.empty: + fig, ax = plt.subplots(figsize=(10, 6)) + sc = ax.scatter( + weights, + max_infl_per_record.values, + alpha=0.3, + s=5, + c=np.log10(np.maximum(weights, 1)), + cmap="viridis", + ) + ax.set_xlabel("Household weight") + ax.set_ylabel("Max influence across all statistics") + ax.set_title("Weight vs maximum influence") + ax.axhline(THRESHOLD, color="red", linestyle="--", label=f"Threshold={THRESHOLD}") + ax.set_xscale("log") + ax.legend() + plt.colorbar(sc, ax=ax, label="log₁₀(weight)") + plt.tight_layout() + fig.savefig(f"{OUTPUT_PREFIX}_scatter.png", dpi=150, bbox_inches="tight") + logger.info("Saved weight vs influence scatter") + plt.close() + + # ── 5. Summary statistics ─────────────────────────────────────── + summary = { + "n_households": int(len(weights)), + "weight_mean": float(np.mean(weights)), + "weight_median": float(np.median(weights)), + "weight_p90": float(np.percentile(weights, 90)), + "weight_p99": float(np.percentile(weights, 99)), + "weight_max": float(np.max(weights)), + "weight_skewness": float( + np.mean(((weights - np.mean(weights)) / np.std(weights)) ** 3) + ), + "kish_overall": float(kish_data["overall"]), + "n_flagged_records": int(len(flagged)) if not flagged.empty else 0, + "threshold": THRESHOLD, + } + + with open(f"{OUTPUT_PREFIX}_summary.json", "w") as f: + json.dump(summary, f, indent=2) + logger.info("Saved summary to %s_summary.json", OUTPUT_PREFIX) + + # Print summary + print("\n" + "=" * 60) + print("WEIGHT DIAGNOSTICS SUMMARY") + print("=" * 60) + for k, v in summary.items(): + if isinstance(v, float): + print(f" {k}: {v:,.2f}") + else: + print(f" {k}: {v}") + print("=" * 60) + + +if __name__ == "__main__": + main()