Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/281.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Impute below-threshold England student loan holders into the FRS base dataset and add SLC liable-to-repay calibration targets for Plans 2 and 5.
182 changes: 132 additions & 50 deletions policyengine_uk_data/datasets/imputations/student_loans.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,133 @@
"""
Student loan plan imputation.

This module imputes `student_loan_plan` in two steps:
- assign plans to people with reported PAYE student loan repayments
- assign missing below-threshold holders to match SLC liable-to-repay totals

Plan cohorts are inferred from the estimated university start year
(simulation year - age + 18), following when the loan system changed:
- NONE: No reported repayments and not imputed as a below-threshold holder
- PLAN_1: Started university before September 2012
- PLAN_2: Started September 2012 - August 2023
- PLAN_5: Started September 2023 onwards

The FRS only observes active repayment through PAYE, so many England borrowers
who hold a loan but earn below the repayment threshold are missing from the
base dataset. We fill that stock using the checked-in SLC snapshot, restricting
the new assignments to plausible England tertiary-education cohorts.

This enables policyengine-uk's student_loan_repayment variable to calculate
repayments using official threshold parameters.
"""

import numpy as np
from policyengine_uk.data import UKSingleYearDataset
from policyengine_uk import Microsimulation
from policyengine_uk.data import UKSingleYearDataset

from policyengine_uk_data.targets.sources.slc import get_snapshot_data

_ENGLAND = "ENGLAND"
_PLAN_2_MIN_AGE = 21
_PLAN_2_MAX_AGE = 55
_PLAN_5_MAX_AGE = 25


def _weighted_count(mask: np.ndarray, weights: np.ndarray) -> float:
    """Return the total weight of the records selected by ``mask``.

    Args:
        mask: Index (in this module, a boolean mask) applied to ``weights``.
        weights: Per-record weights.

    Returns:
        Sum of the selected weights as a plain Python float.
    """
    selected = weights[mask]
    return float(selected.sum())


def _assign_probabilistically(
    plan: np.ndarray,
    eligible: np.ndarray,
    weights: np.ndarray,
    target_count: float,
    plan_name: str,
    rng: np.random.Generator,
) -> None:
    """Randomly move eligible records onto ``plan_name``, modifying ``plan``.

    Every eligible record is selected independently with probability
    ``target_count / eligible weight`` (capped at 1), so the weighted number
    of records assigned equals ``target_count`` in expectation rather than
    exactly.

    Args:
        plan: Array of plan labels; updated in place.
        eligible: Boolean mask of records that may receive ``plan_name``.
        weights: Per-record weights used to size the eligible pool.
        target_count: Weighted number of records to assign.
        plan_name: Label written to the selected records.
        rng: Random generator supplying the uniform draws.

    Returns:
        None. Nothing is assigned (and no random numbers are drawn) when the
        target or the eligible weight is not positive.
    """
    pool_weight = _weighted_count(eligible, weights)
    if target_count <= 0:
        return
    if pool_weight <= 0:
        return

    ratio = target_count / pool_weight
    share = ratio if ratio < 1.0 else 1.0

    # One uniform draw per record (not only per eligible record), so the
    # random stream consumed depends on the array length alone.
    uniform = rng.random(len(plan))
    chosen = eligible & (uniform < share)
    plan[chosen] = plan_name


def _impute_student_loan_plan_values(
    age: np.ndarray,
    student_loan_repayments: np.ndarray,
    country: np.ndarray,
    highest_education: np.ndarray,
    person_weight: np.ndarray,
    *,
    year: int,
    seed: int = 42,
    slc_data: dict | None = None,
) -> np.ndarray:
    """Build the per-person ``student_loan_plan`` labels from raw arrays.

    Step 1 labels everyone with positive repayments by estimated university
    start year (``year - age + 18``): before 2012 is ``PLAN_1``, 2023 or later
    is ``PLAN_5``, and any remaining repayer is ``PLAN_2``.

    Step 2 tops up England holders who are not observed repaying. For Plan 5
    and then Plan 2, the shortfall between the SLC "liable" figure for
    ``year`` and the weighted England stock already on that plan is spread
    at random over unassigned England people with tertiary education in the
    matching cohort and age band.

    Args:
        age: Person ages.
        student_loan_repayments: Reported repayments; values above zero mark
            an observed repayer.
        country: Person-level country labels, compared with ``"ENGLAND"``.
        highest_education: Education labels, compared with ``"TERTIARY"``.
        person_weight: Person weights (converted to float).
        year: Simulation year used for cohorts and the SLC lookup.
        seed: Seed for the random generator driving step 2.
        slc_data: Snapshot with ``plan_2`` / ``plan_5`` entries, each holding
            a ``liable`` mapping keyed by year. Loaded with
            ``get_snapshot_data()`` when omitted. A year missing from the
            mapping is treated as a target of 0, i.e. no top-up.

    Returns:
        Object array of plan labels (``"NONE"``, ``"PLAN_1"``, ``"PLAN_2"``
        or ``"PLAN_5"``), one per person.
    """
    age = np.asarray(age)
    repayments = np.asarray(student_loan_repayments)
    country = np.asarray(country)
    highest_education = np.asarray(highest_education)
    person_weight = np.asarray(person_weight, dtype=float)
    if slc_data is None:
        slc_data = get_snapshot_data()

    rng = np.random.default_rng(seed)
    plan = np.full(len(age), "NONE", dtype=object)

    repays = repayments > 0
    in_england = country == _ENGLAND
    england_graduate = in_england & (highest_education == "TERTIARY")
    start_year = year - age + 18

    started_pre_2012 = start_year < 2012
    started_2012_to_2022 = (start_year >= 2012) & (start_year < 2023)
    started_2023_on = start_year >= 2023

    # Step 1: observed PAYE repayers. Whoever is still "NONE" after the
    # Plan 1 and Plan 5 cohorts have been labelled falls back to Plan 2.
    plan[repays & started_pre_2012] = "PLAN_1"
    plan[repays & started_2023_on] = "PLAN_5"
    plan[repays & (plan == "NONE")] = "PLAN_2"

    # Step 2: below-threshold holders. Order matters: Plan 5 is filled first,
    # and both the "still unassigned" mask and the random stream carry over
    # into the Plan 2 pass.
    top_ups = (
        (
            "plan_5",
            "PLAN_5",
            started_2023_on & (age >= 18) & (age <= _PLAN_5_MAX_AGE),
        ),
        (
            "plan_2",
            "PLAN_2",
            started_2012_to_2022
            & (age >= _PLAN_2_MIN_AGE)
            & (age <= _PLAN_2_MAX_AGE),
        ),
    )
    for snapshot_key, label, cohort_pool in top_ups:
        target = slc_data[snapshot_key]["liable"].get(year, 0)
        observed = _weighted_count((plan == label) & in_england, person_weight)
        shortfall = max(0.0, target - observed)
        unassigned = plan == "NONE"
        _assign_probabilistically(
            plan,
            unassigned & england_graduate & cohort_pool,
            person_weight,
            shortfall,
            label,
            rng,
        )

    return plan


def impute_student_loan_plan(
    dataset: UKSingleYearDataset,
    year: int = 2025,
    seed: int = 42,
    slc_data: dict | None = None,
) -> UKSingleYearDataset:
    """Impute ``student_loan_plan`` for every person in a dataset.

    People with reported student loan repayments are given a plan based on
    their estimated university start cohort, and further below-threshold
    holders are imputed against the SLC snapshot. The person-level work is
    delegated to ``_impute_student_loan_plan_values``; this function only
    gathers the inputs from a simulation and stores the result.

    Threshold note carried over from the earlier docstring:
    - PLAN_5: £25,000 (2025), Sept 2023 onwards

    Args:
        dataset: PolicyEngine UK dataset with student loan inputs.
        year: Simulation year, used to estimate university start cohorts.
        seed: Random seed for reproducible below-threshold assignment.
        slc_data: Optional override for the SLC borrower snapshot.

    Returns:
        The object returned by ``dataset.copy()``, with
        ``person["student_loan_plan"]`` set to the imputed labels. The
        argument itself is never assigned to.
    """
    dataset = dataset.copy()
    sim = Microsimulation(dataset=dataset)

    # The plan column is written exactly once, from the shared helper; the
    # earlier repayer-only assignment must not run afterwards or it would
    # overwrite the below-threshold imputation.
    dataset.person["student_loan_plan"] = _impute_student_loan_plan_values(
        age=sim.calculate("age").values,
        student_loan_repayments=sim.calculate("student_loan_repayments").values,
        country=sim.calculate("country", map_to="person").values,
        highest_education=sim.calculate("highest_education").values,
        person_weight=sim.calculate("person_weight").values,
        year=year,
        seed=seed,
        slc_data=slc_data,
    )

    return dataset
5 changes: 4 additions & 1 deletion policyengine_uk_data/targets/build_loss_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
compute_scotland_uc_child,
compute_scottish_child_payment,
compute_student_loan_plan,
compute_student_loan_plan_liable,
compute_ss_contributions,
compute_ss_headcount,
compute_ss_it_relief,
Expand Down Expand Up @@ -316,8 +317,10 @@ def _compute_column(target: Target, ctx: _SimContext, year: int) -> np.ndarray |
return compute_scottish_child_payment(target, ctx)

# Student loan plan borrower counts (SLC)
if name.startswith("slc/plan_"):
if name.startswith("slc/plan_") and "above_threshold" in name:
return compute_student_loan_plan(target, ctx)
if name.startswith("slc/plan_") and "liable" in name:
return compute_student_loan_plan_liable(target, ctx)

# PIP claimants
if name in (
Expand Down
2 changes: 2 additions & 0 deletions policyengine_uk_data/targets/compute/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
compute_savings_interest,
compute_scottish_child_payment,
compute_student_loan_plan,
compute_student_loan_plan_liable,
compute_vehicles,
)

Expand All @@ -61,6 +62,7 @@
"compute_scotland_uc_child",
"compute_scottish_child_payment",
"compute_student_loan_plan",
"compute_student_loan_plan_liable",
"compute_ss_contributions",
"compute_ss_headcount",
"compute_ss_it_relief",
Expand Down
40 changes: 22 additions & 18 deletions policyengine_uk_data/targets/compute/other.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,7 @@
"""Miscellaneous compute functions (vehicles, housing, savings, SCP,
student loans)."""
"""Miscellaneous compute functions (vehicles, housing, savings, SCP, student loans)."""

import numpy as np

_ENGLAND_REGIONS = {
"NORTH_EAST",
"NORTH_WEST",
"YORKSHIRE",
"EAST_MIDLANDS",
"WEST_MIDLANDS",
"EAST_OF_ENGLAND",
"LONDON",
"SOUTH_EAST",
"SOUTH_WEST",
}


def compute_vehicles(target, ctx) -> np.ndarray:
"""Compute vehicle ownership targets."""
Expand Down Expand Up @@ -78,9 +65,26 @@ def compute_student_loan_plan(target, ctx) -> np.ndarray:
else:
return None

plan = ctx.sim.calculate("student_loan_plan").values
region = ctx.sim.calculate("region", map_to="person").values
is_england = np.isin(region, list(_ENGLAND_REGIONS))
on_plan = (plan == plan_value) & is_england
plan = ctx.pe_person("student_loan_plan")
repayments = ctx.pe_person("student_loan_repayments")
person_country = ctx.sim.calculate("country", map_to="person").values
on_plan = (plan == plan_value) & (person_country == "ENGLAND") & (repayments > 0)

return ctx.household_from_person(on_plan.astype(float))


def compute_student_loan_plan_liable(target, ctx) -> np.ndarray:
    """Count England borrowers on the target's plan, repaying or not.

    The plan is read from ``target.name`` (e.g.
    ``"slc/plan_2_borrowers_liable"``): a name containing ``plan_2`` selects
    ``PLAN_2``, otherwise one containing ``plan_5`` selects ``PLAN_5``.

    Args:
        target: Target whose ``name`` identifies the plan.
        ctx: Simulation context providing ``pe_person``, ``sim.calculate``
            and ``household_from_person``.

    Returns:
        The result of ``ctx.household_from_person`` applied to a per-person
        1.0/0.0 indicator of "on the plan and in England", or ``None`` when
        the name mentions neither plan.
    """
    target_name = target.name
    plan_by_token = (("plan_2", "PLAN_2"), ("plan_5", "PLAN_5"))
    wanted = next(
        (label for token, label in plan_by_token if token in target_name),
        None,
    )
    if wanted is None:
        return None

    holds_plan = ctx.pe_person("student_loan_plan") == wanted
    in_england = (
        ctx.sim.calculate("country", map_to="person").values == "ENGLAND"
    )
    indicator = (holds_plan & in_england).astype(float)
    return ctx.household_from_person(indicator)
Loading
Loading