PolicyEngine · MaxGhenis · Apr 13, 2026 · Apr 13, 2026 · Apr 13, 2026
diff --git a/changelog.d/281.md b/changelog.d/281.md
@@ -0,0 +1 @@
+Impute below-threshold England student loan holders into the FRS base dataset and add SLC liable-to-repay calibration targets for Plans 2 and 5.
diff --git a/policyengine_uk_data/datasets/imputations/student_loans.py b/policyengine_uk_data/datasets/imputations/student_loans.py
@@ -1,28 +1,133 @@
-"""
-Student loan plan imputation.
-
-This module imputes the student_loan_plan variable based on:
-- Whether the person has reported student loan repayments
-- Their estimated university attendance year (inferred from age)
+"""Student loan plan imputation.
 
-The imputation assigns plan types according to when the loan system changed:
-- NONE: No reported repayments
-- PLAN_1: Started university before September 2012
-- PLAN_2: Started September 2012 - August 2023
-- PLAN_5: Started September 2023 onwards
+This module imputes `student_loan_plan` in two steps:
+- assign plans to people with reported PAYE student loan repayments
+- assign missing below-threshold holders to match SLC liable-to-repay totals
 
-This enables policyengine-uk's student_loan_repayment variable to calculate
-repayments using official threshold parameters.
+The FRS only observes active repayment through PAYE, so many England borrowers
+who hold a loan but earn below the repayment threshold are missing from the
+base dataset. We fill that stock using the checked-in SLC snapshot, restricting
+the new assignments to plausible England tertiary-education cohorts.
 """
 
 import numpy as np
-from policyengine_uk.data import UKSingleYearDataset
 from policyengine_uk import Microsimulation
+from policyengine_uk.data import UKSingleYearDataset
+
+from policyengine_uk_data.targets.sources.slc import get_snapshot_data
+
+_ENGLAND = "ENGLAND"  # `country` value used to select England rows
+_PLAN_2_MIN_AGE = 21  # inclusive lower age bound for imputed Plan 2 holders
+_PLAN_2_MAX_AGE = 55  # inclusive upper age bound for imputed Plan 2 holders
+_PLAN_5_MAX_AGE = 25  # inclusive upper age bound for imputed Plan 5 holders
+
+
+def _weighted_count(mask: np.ndarray, weights: np.ndarray) -> float:
+    return float(np.sum(weights[mask]))  # total weight of entries selected by mask
+
+
+def _assign_probabilistically(
+    plan: np.ndarray,
+    eligible: np.ndarray,
+    weights: np.ndarray,
+    target_count: float,
+    plan_name: str,
+    rng: np.random.Generator,
+) -> None:
+    """Randomly set plan_name in place on eligible rows to approximate target_count."""
+    eligible_weight = _weighted_count(eligible, weights)
+    if target_count <= 0 or eligible_weight <= 0:
+        return  # nothing to assign, or no eligible weight to assign it to
+    assignment_probability = min(1.0, target_count / eligible_weight)  # capped at 1
+    draws = rng.random(len(plan))  # one uniform draw per element of plan
+    plan[eligible & (draws < assignment_probability)] = plan_name  # in-place update
+
+
+def _impute_student_loan_plan_values(
+    age: np.ndarray,
+    student_loan_repayments: np.ndarray,
+    country: np.ndarray,
+    highest_education: np.ndarray,
+    person_weight: np.ndarray,
+    *,
+    year: int,
+    seed: int = 42,
+    slc_data: dict | None = None,
+) -> np.ndarray:
+    """Return one plan label per person: "NONE", "PLAN_1", "PLAN_2" or "PLAN_5"."""
+    age = np.asarray(age)
+    repayments = np.asarray(student_loan_repayments)
+    country = np.asarray(country)
+    highest_education = np.asarray(highest_education)
+    person_weight = np.asarray(person_weight, dtype=float)
+    slc_data = get_snapshot_data() if slc_data is None else slc_data
+
+    rng = np.random.default_rng(seed)  # seeded so assignments are reproducible
+    plan = np.full(len(age), "NONE", dtype=object)  # default: no plan assigned
+
+    has_repayments = repayments > 0
+    is_england = country == _ENGLAND
+    is_tertiary = highest_education == "TERTIARY"
+    estimated_uni_start_year = year - age + 18  # assumes entry at age 18
+
+    plan_1_cohort = estimated_uni_start_year < 2012
+    plan_2_cohort = (estimated_uni_start_year >= 2012) & (
+        estimated_uni_start_year < 2023
+    )
+    plan_5_cohort = estimated_uni_start_year >= 2023
+    plan_2_age_band = (age >= _PLAN_2_MIN_AGE) & (age <= _PLAN_2_MAX_AGE)
+    plan_5_age_band = (age >= 18) & (age <= _PLAN_5_MAX_AGE)
+
+    # Reported PAYE repayers identify the active stock directly.
+    plan[has_repayments & plan_1_cohort] = "PLAN_1"
+    plan[has_repayments & plan_5_cohort] = "PLAN_5"
+    plan[has_repayments & (plan == "NONE")] = "PLAN_2"  # all remaining repayers
+
+    # Impute missing below-threshold holders so the total England stock matches
+    # the SLC liable-to-repay series, using the observed repayer stock as the
+    # starting point rather than the official above-threshold count.
+    plan_5_target = slc_data["plan_5"]["liable"].get(year, 0)  # 0 if year absent
+    plan_5_shortfall = max(
+        0.0,
+        plan_5_target - _weighted_count((plan == "PLAN_5") & is_england, person_weight),
+    )
+    plan_5_eligible = (
+        (plan == "NONE") & is_england & is_tertiary & plan_5_age_band & plan_5_cohort
+    )
+    _assign_probabilistically(
+        plan,
+        plan_5_eligible,
+        person_weight,
+        plan_5_shortfall,
+        "PLAN_5",
+        rng,
+    )
+
+    plan_2_target = slc_data["plan_2"]["liable"].get(year, 0)  # 0 if year absent
+    plan_2_shortfall = max(
+        0.0,
+        plan_2_target - _weighted_count((plan == "PLAN_2") & is_england, person_weight),
+    )
+    plan_2_eligible = (
+        (plan == "NONE") & is_england & is_tertiary & plan_2_age_band & plan_2_cohort
+    )
+    _assign_probabilistically(
+        plan,
+        plan_2_eligible,
+        person_weight,
+        plan_2_shortfall,
+        "PLAN_2",
+        rng,
+    )
+
+    return plan
 
 
 def impute_student_loan_plan(
     dataset: UKSingleYearDataset,
     year: int = 2025,
+    seed: int = 42,
+    slc_data: dict | None = None,
 ) -> UKSingleYearDataset:
     """
     Impute student loan plan type based on age and reported repayments.
@@ -34,45 +139,22 @@ def impute_student_loan_plan(
     - PLAN_5: £25,000 (2025), Sept 2023 onwards
 
     Args:
-        dataset: PolicyEngine UK dataset with student_loan_repayments.
-        year: The simulation year, used to estimate university attendance.
-
-    Returns:
-        Dataset with imputed student_loan_plan values.
+        dataset: PolicyEngine UK dataset with student loan inputs.
+        year: Simulation year, used to estimate university start cohorts.
+        seed: Random seed for reproducible below-threshold assignment.
+        slc_data: Optional override for the SLC borrower snapshot.
     """
     dataset = dataset.copy()
     sim = Microsimulation(dataset=dataset)
-
-    # Get required variables
-    age = sim.calculate("age").values
-    student_loan_repayments = sim.calculate("student_loan_repayments").values
-
-    # Determine if person has a student loan based on reported repayments
-    has_student_loan = student_loan_repayments > 0
-
-    # Estimate when they started university (assume age 18)
-    # For simulation year Y and age A, university start year = Y - A + 18
-    estimated_uni_start_year = year - age + 18
-
-    # Assign plan types based on when loan system changed
-    # StudentLoanPlan is a string enum: "NONE", "PLAN_1", "PLAN_2", "PLAN_4", "PLAN_5"
-    plan = np.full(len(age), "NONE", dtype=object)
-
-    # Plan 1: Started before September 2012
-    plan_1_mask = has_student_loan & (estimated_uni_start_year < 2012)
-    plan[plan_1_mask] = "PLAN_1"
-
-    # Plan 2: Started September 2012 - August 2023
-    plan_2_mask = has_student_loan & (
-        (estimated_uni_start_year >= 2012) & (estimated_uni_start_year < 2023)
+    dataset.person["student_loan_plan"] = _impute_student_loan_plan_values(
+        age=sim.calculate("age").values,
+        student_loan_repayments=sim.calculate("student_loan_repayments").values,
+        country=sim.calculate("country", map_to="person").values,
+        highest_education=sim.calculate("highest_education").values,
+        person_weight=sim.calculate("person_weight").values,
+        year=year,
+        seed=seed,
+        slc_data=slc_data,
     )
-    plan[plan_2_mask] = "PLAN_2"
-
-    # Plan 5: Started September 2023 onwards
-    plan_5_mask = has_student_loan & (estimated_uni_start_year >= 2023)
-    plan[plan_5_mask] = "PLAN_5"
-
-    # Store as the plan type
-    dataset.person["student_loan_plan"] = plan
 
     return dataset
diff --git a/policyengine_uk_data/targets/build_loss_matrix.py b/policyengine_uk_data/targets/build_loss_matrix.py
@@ -40,6 +40,7 @@
     compute_scotland_uc_child,
     compute_scottish_child_payment,
     compute_student_loan_plan,
+    compute_student_loan_plan_liable,
     compute_ss_contributions,
     compute_ss_headcount,
     compute_ss_it_relief,
@@ -316,8 +317,10 @@ def _compute_column(target: Target, ctx: _SimContext, year: int) -> np.ndarray |
         return compute_scottish_child_payment(target, ctx)
 
     # Student loan plan borrower counts (SLC)
-    if name.startswith("slc/plan_"):
+    if name.startswith("slc/plan_") and "above_threshold" in name:
         return compute_student_loan_plan(target, ctx)
+    if name.startswith("slc/plan_") and "liable" in name:
+        return compute_student_loan_plan_liable(target, ctx)
 
     # PIP claimants
     if name in (

diff --git a/policyengine_uk_data/targets/compute/__init__.py b/policyengine_uk_data/targets/compute/__init__.py
@@ -40,6 +40,7 @@
     compute_savings_interest,
     compute_scottish_child_payment,
     compute_student_loan_plan,
+    compute_student_loan_plan_liable,
     compute_vehicles,
 )
 
@@ -61,6 +62,7 @@
     "compute_scotland_uc_child",
     "compute_scottish_child_payment",
     "compute_student_loan_plan",
+    "compute_student_loan_plan_liable",
     "compute_ss_contributions",
     "compute_ss_headcount",
     "compute_ss_it_relief",

diff --git a/policyengine_uk_data/targets/compute/other.py b/policyengine_uk_data/targets/compute/other.py
@@ -1,20 +1,7 @@
-"""Miscellaneous compute functions (vehicles, housing, savings, SCP,
-student loans)."""
+"""Miscellaneous compute functions (vehicles, housing, savings, SCP, student loans)."""
 
 import numpy as np
 
-_ENGLAND_REGIONS = {
-    "NORTH_EAST",
-    "NORTH_WEST",
-    "YORKSHIRE",
-    "EAST_MIDLANDS",
-    "WEST_MIDLANDS",
-    "EAST_OF_ENGLAND",
-    "LONDON",
-    "SOUTH_EAST",
-    "SOUTH_WEST",
-}
-
 
 def compute_vehicles(target, ctx) -> np.ndarray:
     """Compute vehicle ownership targets."""
@@ -78,9 +65,26 @@ def compute_student_loan_plan(target, ctx) -> np.ndarray:
     else:
         return None
 
-    plan = ctx.sim.calculate("student_loan_plan").values
-    region = ctx.sim.calculate("region", map_to="person").values
-    is_england = np.isin(region, list(_ENGLAND_REGIONS))
-    on_plan = (plan == plan_value) & is_england
+    plan = ctx.pe_person("student_loan_plan")
+    repayments = ctx.pe_person("student_loan_repayments")
+    person_country = ctx.sim.calculate("country", map_to="person").values
+    on_plan = (plan == plan_value) & (person_country == "ENGLAND") & (repayments > 0)
+
+    return ctx.household_from_person(on_plan.astype(float))
+
+
+def compute_student_loan_plan_liable(target, ctx) -> np.ndarray:
+    """Count all England borrowers on a given plan, including non-repayers."""
+    plan_name = target.name  # e.g. "slc/plan_2_borrowers_liable"
+    if "plan_2" in plan_name:
+        plan_value = "PLAN_2"
+    elif "plan_5" in plan_name:
+        plan_value = "PLAN_5"
+    else:
+        return None  # target name mentions neither plan_2 nor plan_5
+
+    plan = ctx.pe_person("student_loan_plan")
+    person_country = ctx.sim.calculate("country", map_to="person").values
+    on_plan = (plan == plan_value) & (person_country == "ENGLAND")
 
     return ctx.household_from_person(on_plan.astype(float))
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Impute below-threshold England student loan holders into the FRS base dataset and add SLC liable-to-repay calibration targets for Plans 2 and 5.