From 2e7684372ecdd88327d391ed2793610eaf04aa4b Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Sat, 11 Apr 2026 10:52:01 -0400
Subject: [PATCH 1/6] Add legacy ESA and JSA proxy fields

---
 policyengine_uk_data/datasets/frs.py          | 130 ++++++++++++++++++
 .../tests/test_legacy_benefit_proxies.py      | 108 +++++++++++++++
 2 files changed, 238 insertions(+)
 create mode 100644 policyengine_uk_data/tests/test_legacy_benefit_proxies.py

diff --git a/policyengine_uk_data/datasets/frs.py b/policyengine_uk_data/datasets/frs.py
index 7cc05cd3..0f75612e 100644
--- a/policyengine_uk_data/datasets/frs.py
+++ b/policyengine_uk_data/datasets/frs.py
@@ -22,6 +22,130 @@
 from policyengine_uk_data.parameters import load_take_up_rate, load_parameter
 
 
+LEGACY_JOBSEEKER_MIN_AGE = 18
+ESA_MIN_AGE = 16
+ESA_HEALTH_EMPLOYMENT_STATUSES = (
+    "LONG_TERM_DISABLED",
+    "SHORT_TERM_DISABLED",
+)
+
+
+def derive_legacy_jobseeker_proxy(
+    age,
+    employment_status,
+    hours_worked,
+    state_pension_age,
+) -> np.ndarray:
+    """Approximate legacy JSA claimant-state from observed survey data.
+
+    This is intentionally a proxy, not a legislative determination. It
+    identifies person-level working-age adults who report being unemployed
+    and not currently working any hours.
+    """
+
+    age = np.asarray(age)
+    employment_status = np.asarray(employment_status)
+    hours_worked = np.asarray(hours_worked)
+    state_pension_age = np.asarray(state_pension_age)
+
+    return (
+        (age >= LEGACY_JOBSEEKER_MIN_AGE)
+        & (age < state_pension_age)
+        & (employment_status == "UNEMPLOYED")
+        & (hours_worked <= 0)
+    )
+
+
+def derive_esa_health_condition_proxy(
+    age,
+    employment_status,
+    state_pension_age,
+) -> np.ndarray:
+    """Approximate working-age ESA health-related claimant-state.
+
+    This proxy relies only on person-level labour market status, not on
+    current disability or incapacity benefit receipt. It is a dataset-side
+    approximation for future modelling, not a direct observation of ESA
+    legal entitlement or LCW/LCWRA status.
+    """
+
+    age = np.asarray(age)
+    employment_status = np.asarray(employment_status)
+    state_pension_age = np.asarray(state_pension_age)
+    disability_labour_market_state = np.isin(
+        employment_status, ESA_HEALTH_EMPLOYMENT_STATUSES
+    )
+
+    return (
+        (age >= ESA_MIN_AGE)
+        & (age < state_pension_age)
+        & disability_labour_market_state
+    )
+
+
+def derive_esa_support_group_proxy(
+    age,
+    employment_status,
+    hours_worked,
+    esa_health_condition_proxy,
+    state_pension_age,
+) -> np.ndarray:
+    """Approximate a severe-health ESA subgroup akin to support group.
+
+    This is a stricter subset of ``esa_health_condition_proxy`` intended
+    for future legacy ESA approximation work. It uses only non-receipt
+    labour market signals already available in the survey.
+    """
+
+    age = np.asarray(age)
+    employment_status = np.asarray(employment_status)
+    hours_worked = np.asarray(hours_worked)
+    esa_health_condition_proxy = np.asarray(esa_health_condition_proxy)
+    state_pension_age = np.asarray(state_pension_age)
+    severe_health_evidence = (employment_status == "LONG_TERM_DISABLED") & (
+        hours_worked <= 0
+    )
+
+    return (
+        (age >= ESA_MIN_AGE)
+        & (age < state_pension_age)
+        & esa_health_condition_proxy
+        & severe_health_evidence
+    )
+
+
+def add_legacy_benefit_proxies(
+    pe_person: pd.DataFrame, state_pension_age
+) -> pd.DataFrame:
+    """Populate person-scoped ESA/JSA proxy columns on the person frame.
+
+    These remain person-level by design because the claimant-state inputs
+    they approximate attach to individuals. Downstream benunit-level legacy
+    benefit models should aggregate them explicitly rather than assuming the
+    raw survey contains a benunit claimant-state field.
+    """
+
+    pe_person["legacy_jobseeker_proxy"] = derive_legacy_jobseeker_proxy(
+        age=pe_person.age,
+        employment_status=pe_person.employment_status,
+        hours_worked=pe_person.hours_worked,
+        state_pension_age=state_pension_age,
+    )
+    pe_person["esa_health_condition_proxy"] = derive_esa_health_condition_proxy(
+        age=pe_person.age,
+        employment_status=pe_person.employment_status,
+        state_pension_age=state_pension_age,
+    )
+    pe_person["esa_support_group_proxy"] = derive_esa_support_group_proxy(
+        age=pe_person.age,
+        employment_status=pe_person.employment_status,
+        hours_worked=pe_person.hours_worked,
+        esa_health_condition_proxy=pe_person.esa_health_condition_proxy,
+        state_pension_age=state_pension_age,
+    )
+    return pe_person
+
+
 def create_frs(
     raw_frs_folder: str,
     year: int,
@@ -744,6 +868,7 @@ def determine_education_level(fted_val, typeed2_val, age_val):
     sim = Microsimulation(dataset=dataset)
     region = sim.populations["benunit"].household("region", dataset.time_period)
     lha_category = sim.calculate("LHA_category", year)
+    state_pension_age = sim.calculate("state_pension_age", year).values
 
     brma = np.empty(len(region), dtype=object)
 
@@ -808,6 +933,11 @@ def determine_education_level(fted_val, typeed2_val, age_val):
         paragraph_3 | paragraph_4 | paragraph_5
     )
 
+    # Dataset-side claimant-state approximations for future legacy ESA/JSA
+    # modelling. These are explicit proxies based on observed survey
+    # conditions, not legislative determinations.
+    pe_person = add_legacy_benefit_proxies(pe_person, state_pension_age)
+
     # Generate stochastic take-up decisions
     # All randomness is generated here in the data package using take-up rates
     # stored in YAML parameter files. This keeps the country package purely
diff --git a/policyengine_uk_data/tests/test_legacy_benefit_proxies.py b/policyengine_uk_data/tests/test_legacy_benefit_proxies.py
new file mode 100644
index 00000000..bd77c818
--- /dev/null
+++ b/policyengine_uk_data/tests/test_legacy_benefit_proxies.py
@@ -0,0 +1,108 @@
+import inspect
+
+import numpy as np
+import pandas as pd
+
+from policyengine_uk_data.datasets.frs import (
+    add_legacy_benefit_proxies,
+    create_frs,
+    derive_esa_health_condition_proxy,
+    derive_esa_support_group_proxy,
+    derive_legacy_jobseeker_proxy,
+)
+
+
+def test_legacy_jobseeker_proxy_tracks_unemployed_working_age_non_workers():
+    result = derive_legacy_jobseeker_proxy(
+        age=np.array([18, 30, 66, 17, 25, 25, 66]),
+        employment_status=np.array(
+            [
+                "UNEMPLOYED",
+                "UNEMPLOYED",
+                "UNEMPLOYED",
+                "UNEMPLOYED",
+                "STUDENT",
+                "CARER",
+                "UNEMPLOYED",
+            ]
+        ),
+        hours_worked=np.array([0, 12, 0, 0, 0, 0, 0]),
+        state_pension_age=np.array([66, 66, 66, 66, 66, 66, 67]),
+    )
+
+    assert result.tolist() == [True, False, False, False, False, False, True]
+
+
+def test_esa_health_condition_proxy_uses_disabled_employment_states():
+    result = derive_esa_health_condition_proxy(
+        age=np.array([16, 45, 45, 66]),
+        employment_status=np.array(
+            [
+                "LONG_TERM_DISABLED",
+                "SHORT_TERM_DISABLED",
+                "FT_EMPLOYED",
+                "LONG_TERM_DISABLED",
+            ]
+        ),
+        state_pension_age=np.array([66, 66, 66, 66]),
+    )
+
+    assert result.tolist() == [True, True, False, False]
+
+
+def test_esa_support_group_proxy_is_stricter_subset_of_health_proxy():
+    health_proxy = np.array([True, True, True, False])
+    result = derive_esa_support_group_proxy(
+        age=np.array([16, 45, 45, 66]),
+        employment_status=np.array(
+            [
+                "LONG_TERM_DISABLED",
+                "SHORT_TERM_DISABLED",
+                "LONG_TERM_DISABLED",
+                "FT_EMPLOYED",
+            ]
+        ),
+        hours_worked=np.array([0, 0, 12, 0]),
+        esa_health_condition_proxy=health_proxy,
+        state_pension_age=np.array([66, 66, 66, 66]),
+    )
+
+    assert result.tolist() == [True, False, False, False]
+
+
+def test_add_legacy_benefit_proxies_wires_all_three_columns():
+    pe_person = pd.DataFrame(
+        {
+            "age": [18, 45, 45, 66],
+            "employment_status": [
+                "UNEMPLOYED",
+                "LONG_TERM_DISABLED",
+                "SHORT_TERM_DISABLED",
+                "LONG_TERM_DISABLED",
+            ],
+            "hours_worked": [0, 0, 12, 0],
+            "is_disabled_for_benefits": [False, True, False, True],
+            "is_severely_disabled_for_benefits": [False, False, True, True],
+            "esa_income_reported": [0.0, 0.0, 100.0, 0.0],
+            "esa_contrib_reported": [0.0, 0.0, 0.0, 0.0],
+            "incapacity_benefit_reported": [0.0, 0.0, 0.0, 0.0],
+            "sda_reported": [0.0, 0.0, 0.0, 0.0],
+        }
+    )
+
+    result = add_legacy_benefit_proxies(
+        pe_person.copy(), state_pension_age=np.array([66, 66, 66, 66])
+    )
+
+    assert result["legacy_jobseeker_proxy"].tolist() == [True, False, False, False]
+    assert result["esa_health_condition_proxy"].tolist() == [False, True, True, False]
+    assert result["esa_support_group_proxy"].tolist() == [False, True, False, False]
+
+
+def test_create_frs_calls_add_legacy_benefit_proxies():
+    source = inspect.getsource(create_frs)
+
+    assert (
+        'state_pension_age = sim.calculate("state_pension_age", year).values' in source
+    )
+    assert "add_legacy_benefit_proxies(pe_person, state_pension_age)" in source

From e568e787f3955a6f14467766ecf32fdb9c16c023 Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Sat, 11 Apr 2026 11:31:26 -0400
Subject: [PATCH 2/6] Harden legacy ESA and JSA proxy coverage

---
 policyengine_uk_data/datasets/frs.py          |  62 ++-
 .../tests/test_legacy_benefit_proxies.py      | 373 +++++++++++++++++-
 2 files changed, 405 insertions(+), 30 deletions(-)

diff --git a/policyengine_uk_data/datasets/frs.py b/policyengine_uk_data/datasets/frs.py
index 0f75612e..8cebdd7f 100644
--- a/policyengine_uk_data/datasets/frs.py
+++ b/policyengine_uk_data/datasets/frs.py
@@ -23,6 +23,7 @@
 
 
 LEGACY_JOBSEEKER_MIN_AGE = 18
+LEGACY_JOBSEEKER_MAX_HOURS = 16
 ESA_MIN_AGE = 16
 ESA_HEALTH_EMPLOYMENT_STATUSES = (
     "LONG_TERM_DISABLED",
@@ -34,31 +35,38 @@ def derive_legacy_jobseeker_proxy(
     age,
     employment_status,
     hours_worked,
+    current_education,
+    employment_status_reported,
     state_pension_age,
 ) -> np.ndarray:
     """Approximate legacy JSA claimant-state from observed survey data.
 
     This is intentionally a proxy, not a legislative determination. It
     identifies person-level working-age adults who report being unemployed
-    and not currently working any hours.
+    and working less than the legacy JSA 16-hour limit.
     """
 
     age = np.asarray(age)
     employment_status = np.asarray(employment_status)
     hours_worked = np.asarray(hours_worked)
+    current_education = np.asarray(current_education)
+    employment_status_reported = np.asarray(employment_status_reported)
     state_pension_age = np.asarray(state_pension_age)
 
     return (
-        (age >= LEGACY_JOBSEEKER_MIN_AGE)
+        employment_status_reported
+        & (age >= LEGACY_JOBSEEKER_MIN_AGE)
         & (age < state_pension_age)
         & (employment_status == "UNEMPLOYED")
-        & (hours_worked <= 0)
+        & (hours_worked < LEGACY_JOBSEEKER_MAX_HOURS)
+        & (current_education == "NOT_IN_EDUCATION")
     )
 
 
 def derive_esa_health_condition_proxy(
     age,
     employment_status,
+    employment_status_reported,
     state_pension_age,
 ) -> np.ndarray:
     """Approximate working-age ESA health-related claimant-state.
@@ -71,13 +79,15 @@ def derive_esa_health_condition_proxy(
 
     age = np.asarray(age)
     employment_status = np.asarray(employment_status)
+    employment_status_reported = np.asarray(employment_status_reported)
     state_pension_age = np.asarray(state_pension_age)
     disability_labour_market_state = np.isin(
         employment_status, ESA_HEALTH_EMPLOYMENT_STATUSES
     )
 
     return (
-        (age >= ESA_MIN_AGE)
+        employment_status_reported
+        & (age >= ESA_MIN_AGE)
         & (age < state_pension_age)
         & disability_labour_market_state
     )
@@ -88,6 +98,7 @@ def derive_esa_support_group_proxy(
     employment_status,
     hours_worked,
     esa_health_condition_proxy,
+    employment_status_reported,
     state_pension_age,
 ) -> np.ndarray:
     """Approximate a severe-health ESA subgroup akin to support group.
@@ -101,13 +112,15 @@ def derive_esa_support_group_proxy(
     employment_status = np.asarray(employment_status)
     hours_worked = np.asarray(hours_worked)
     esa_health_condition_proxy = np.asarray(esa_health_condition_proxy)
+    employment_status_reported = np.asarray(employment_status_reported)
     state_pension_age = np.asarray(state_pension_age)
     severe_health_evidence = (employment_status == "LONG_TERM_DISABLED") & (
         hours_worked <= 0
     )
 
     return (
-        (age >= ESA_MIN_AGE)
+        employment_status_reported
+        & (age >= ESA_MIN_AGE)
         & (age < state_pension_age)
         & esa_health_condition_proxy
         & severe_health_evidence
@@ -115,7 +128,7 @@ def derive_esa_support_group_proxy(
 
 
 def add_legacy_benefit_proxies(
-    pe_person: pd.DataFrame, state_pension_age
+    pe_person: pd.DataFrame, employment_status_reported, state_pension_age
 ) -> pd.DataFrame:
     """Populate person-scoped ESA/JSA proxy columns on the person frame.
 
@@ -129,11 +142,14 @@ def add_legacy_benefit_proxies(
         age=pe_person.age,
         employment_status=pe_person.employment_status,
         hours_worked=pe_person.hours_worked,
+        current_education=pe_person.current_education,
+        employment_status_reported=employment_status_reported,
         state_pension_age=state_pension_age,
     )
     pe_person["esa_health_condition_proxy"] = derive_esa_health_condition_proxy(
         age=pe_person.age,
         employment_status=pe_person.employment_status,
+        employment_status_reported=employment_status_reported,
         state_pension_age=state_pension_age,
     )
     pe_person["esa_support_group_proxy"] = derive_esa_support_group_proxy(
@@ -141,11 +157,39 @@ def add_legacy_benefit_proxies(
         employment_status=pe_person.employment_status,
         hours_worked=pe_person.hours_worked,
         esa_health_condition_proxy=pe_person.esa_health_condition_proxy,
+        employment_status_reported=employment_status_reported,
         state_pension_age=state_pension_age,
     )
     return pe_person
 
 
+def apply_legacy_benefit_proxies(
+    pe_person: pd.DataFrame, sim, year: int, employment_status_reported
+) -> pd.DataFrame:
+    """Attach legacy ESA/JSA proxies using post-build simulation context."""
+
+    state_pension_age = sim.calculate("state_pension_age", year).values
+    return add_legacy_benefit_proxies(
+        pe_person,
+        employment_status_reported=employment_status_reported,
+        state_pension_age=state_pension_age,
+    )
+
+
+def attach_legacy_benefit_proxies_from_frs_person(
+    pe_person: pd.DataFrame, person: pd.DataFrame, sim, year: int
+) -> pd.DataFrame:
+    """Bridge raw FRS person fields into the proxy derivation hook."""
+
+    employment_status_reported = person.empstati.fillna(0).to_numpy() > 0
+    return apply_legacy_benefit_proxies(
+        pe_person,
+        sim,
+        year,
+        employment_status_reported=employment_status_reported,
+    )
+
+
 def create_frs(
     raw_frs_folder: str,
     year: int,
@@ -868,8 +912,6 @@ def determine_education_level(fted_val, typeed2_val, age_val):
     sim = Microsimulation(dataset=dataset)
     region = sim.populations["benunit"].household("region", dataset.time_period)
     lha_category = sim.calculate("LHA_category", year)
-    state_pension_age = sim.calculate("state_pension_age", year).values
-
     brma = np.empty(len(region), dtype=object)
 
     # Sample from a random BRMA in the region, weighted by the number of observations in each BRMA
@@ -936,7 +978,9 @@ def determine_education_level(fted_val, typeed2_val, age_val):
     # Dataset-side claimant-state approximations for future legacy ESA/JSA
     # modelling. These are explicit proxies based on observed survey
     # conditions, not legislative determinations.
-    pe_person = add_legacy_benefit_proxies(pe_person, state_pension_age)
+    pe_person = attach_legacy_benefit_proxies_from_frs_person(
+        pe_person, person, sim, year
+    )
 
     # Generate stochastic take-up decisions
     # All randomness is generated here in the data package using take-up rates
diff --git a/policyengine_uk_data/tests/test_legacy_benefit_proxies.py b/policyengine_uk_data/tests/test_legacy_benefit_proxies.py
index bd77c818..f8d6a146 100644
--- a/policyengine_uk_data/tests/test_legacy_benefit_proxies.py
+++ b/policyengine_uk_data/tests/test_legacy_benefit_proxies.py
@@ -1,10 +1,12 @@
-import inspect
-
 import numpy as np
 import pandas as pd
+import policyengine_uk
+import policyengine_uk_data.datasets.frs as frs_module
 
 from policyengine_uk_data.datasets.frs import (
     add_legacy_benefit_proxies,
+    attach_legacy_benefit_proxies_from_frs_person,
+    apply_legacy_benefit_proxies,
     create_frs,
     derive_esa_health_condition_proxy,
     derive_esa_support_group_proxy,
@@ -12,62 +14,106 @@
 )
 
 
-def test_legacy_jobseeker_proxy_tracks_unemployed_working_age_non_workers():
+class FakeSim:
+    def __init__(self, state_pension_age):
+        self._state_pension_age = np.asarray(state_pension_age)
+
+    def calculate(self, variable, period):
+        assert variable == "state_pension_age"
+        return pd.Series(self._state_pension_age)
+
+
+def test_legacy_jobseeker_proxy_tracks_unemployed_working_age_low_hours():
     result = derive_legacy_jobseeker_proxy(
-        age=np.array([18, 30, 66, 17, 25, 25, 66]),
+        age=np.array([18, 30, 30, 66, 17, 25, 25, 66, 30, 30]),
         employment_status=np.array(
             [
                 "UNEMPLOYED",
                 "UNEMPLOYED",
                 "UNEMPLOYED",
                 "UNEMPLOYED",
+                "UNEMPLOYED",
                 "STUDENT",
                 "CARER",
                 "UNEMPLOYED",
+                "UNEMPLOYED",
+                "UNEMPLOYED",
             ]
         ),
-        hours_worked=np.array([0, 12, 0, 0, 0, 0, 0]),
-        state_pension_age=np.array([66, 66, 66, 66, 66, 66, 67]),
+        hours_worked=np.array([0, 12, 16, 0, 0, 0, 0, 0, 0, 0]),
+        current_education=np.array(
+            [
+                "NOT_IN_EDUCATION",
+                "NOT_IN_EDUCATION",
+                "NOT_IN_EDUCATION",
+                "NOT_IN_EDUCATION",
+                "NOT_IN_EDUCATION",
+                "TERTIARY",
+                "NOT_IN_EDUCATION",
+                "NOT_IN_EDUCATION",
+                "UPPER_SECONDARY",
+                "NOT_IN_EDUCATION",
+            ]
+        ),
+        employment_status_reported=np.array(
+            [True, True, True, True, True, True, True, True, True, False]
+        ),
+        state_pension_age=np.array([66, 66, 66, 66, 66, 66, 66, 67, 66, 66]),
     )
 
-    assert result.tolist() == [True, False, False, False, False, False, True]
+    assert result.tolist() == [
+        True,
+        True,
+        False,
+        False,
+        False,
+        False,
+        False,
+        True,
+        False,
+        False,
+    ]
 
 
 def test_esa_health_condition_proxy_uses_disabled_employment_states():
     result = derive_esa_health_condition_proxy(
-        age=np.array([16, 45, 45, 66]),
+        age=np.array([16, 45, 45, 66, 45]),
         employment_status=np.array(
             [
                 "LONG_TERM_DISABLED",
                 "SHORT_TERM_DISABLED",
                 "FT_EMPLOYED",
                 "LONG_TERM_DISABLED",
+                "LONG_TERM_DISABLED",
             ]
         ),
-        state_pension_age=np.array([66, 66, 66, 66]),
+        employment_status_reported=np.array([True, True, True, True, False]),
+        state_pension_age=np.array([66, 66, 66, 66, 66]),
     )
 
-    assert result.tolist() == [True, True, False, False]
+    assert result.tolist() == [True, True, False, False, False]
 
 
 def test_esa_support_group_proxy_is_stricter_subset_of_health_proxy():
-    health_proxy = np.array([True, True, True, False])
+    health_proxy = np.array([True, True, True, False, True])
     result = derive_esa_support_group_proxy(
-        age=np.array([16, 45, 45, 66]),
+        age=np.array([16, 45, 45, 66, 45]),
         employment_status=np.array(
             [
                 "LONG_TERM_DISABLED",
                 "SHORT_TERM_DISABLED",
                 "LONG_TERM_DISABLED",
                 "FT_EMPLOYED",
+                "LONG_TERM_DISABLED",
             ]
         ),
-        hours_worked=np.array([0, 0, 12, 0]),
+        hours_worked=np.array([0, 0, 12, 0, 0]),
         esa_health_condition_proxy=health_proxy,
-        state_pension_age=np.array([66, 66, 66, 66]),
+        employment_status_reported=np.array([True, True, True, True, False]),
+        state_pension_age=np.array([66, 66, 66, 66, 66]),
     )
 
-    assert result.tolist() == [True, False, False, False]
+    assert result.tolist() == [True, False, False, False, False]
 
 
 def test_add_legacy_benefit_proxies_wires_all_three_columns():
@@ -81,6 +127,12 @@ def test_add_legacy_benefit_proxies_wires_all_three_columns():
                 "LONG_TERM_DISABLED",
             ],
             "hours_worked": [0, 0, 12, 0],
+            "current_education": [
+                "NOT_IN_EDUCATION",
+                "NOT_IN_EDUCATION",
+                "NOT_IN_EDUCATION",
+                "NOT_IN_EDUCATION",
+            ],
             "is_disabled_for_benefits": [False, True, False, True],
             "is_severely_disabled_for_benefits": [False, False, True, True],
             "esa_income_reported": [0.0, 0.0, 100.0, 0.0],
@@ -91,7 +143,9 @@ def test_add_legacy_benefit_proxies_wires_all_three_columns():
     )
 
     result = add_legacy_benefit_proxies(
-        pe_person.copy(), state_pension_age=np.array([66, 66, 66, 66])
+        pe_person.copy(),
+        employment_status_reported=np.array([True, True, True, False]),
+        state_pension_age=np.array([66, 66, 66, 66]),
     )
 
     assert result["legacy_jobseeker_proxy"].tolist() == [True, False, False, False]
@@ -99,10 +153,287 @@ def test_add_legacy_benefit_proxies_wires_all_three_columns():
     assert result["esa_support_group_proxy"].tolist() == [False, True, False, False]
 
 
-def test_create_frs_calls_add_legacy_benefit_proxies():
-    source = inspect.getsource(create_frs)
+def test_apply_legacy_benefit_proxies_uses_sim_state_pension_age():
+    pe_person = pd.DataFrame(
+        {
+            "age": [66, 66],
+            "employment_status": ["UNEMPLOYED", "UNEMPLOYED"],
+            "hours_worked": [0, 0],
+            "current_education": ["NOT_IN_EDUCATION", "NOT_IN_EDUCATION"],
+        }
+    )
+
+    result = apply_legacy_benefit_proxies(
+        pe_person.copy(),
+        FakeSim([66, 67]),
+        2025,
+        employment_status_reported=np.array([True, True]),
+    )
+
+    assert result["legacy_jobseeker_proxy"].tolist() == [False, True]
+
+
+def test_attach_legacy_benefit_proxies_from_frs_person_uses_empstati_mask():
+    pe_person = pd.DataFrame(
+        {
+            "age": [30, 30],
+            "employment_status": ["UNEMPLOYED", "LONG_TERM_DISABLED"],
+            "hours_worked": [12, 0],
+            "current_education": ["NOT_IN_EDUCATION", "NOT_IN_EDUCATION"],
+        }
+    )
+    person = pd.DataFrame({"empstati": [1, np.nan]})
+
+    result = attach_legacy_benefit_proxies_from_frs_person(
+        pe_person.copy(),
+        person,
+        FakeSim([66, 66]),
+        2025,
+    )
+
+    assert result["legacy_jobseeker_proxy"].tolist() == [True, False]
+    assert result["esa_health_condition_proxy"].tolist() == [False, False]
+    assert result["esa_support_group_proxy"].tolist() == [False, False]
+
+
+class FakeBenunitPopulation:
+    def __init__(self, dataset):
+        self.dataset = dataset
+
+    def household(self, variable, period):
+        if variable == "region":
+            return np.array(["LONDON"])
+        if variable == "household_id":
+            return np.array([100])
+        raise KeyError(variable)
+
 
-    assert (
-        'state_pension_age = sim.calculate("state_pension_age", year).values' in source
+class FakeMicrosimulation:
+    def __init__(self, dataset):
+        self.dataset = dataset
+        self.populations = {"benunit": FakeBenunitPopulation(dataset)}
+        self.tax_benefit_system = type(
+            "FakeTaxBenefitSystem",
+            (),
+            {
+                "parameters": lambda self, year: type(
+                    "FakeParametersRoot",
+                    (),
+                    {
+                        "gov": type(
+                            "FakeGov",
+                            (),
+                            {
+                                "dwp": type(
+                                    "FakeDwp",
+                                    (),
+                                    {
+                                        "dla": type(
+                                            "FakeDla",
+                                            (),
+                                            {
+                                                "self_care": type(
+                                                    "FakeSelfCare",
+                                                    (),
+                                                    {"higher": 1},
+                                                )()
+                                            },
+                                        )(),
+                                        "pip": type(
+                                            "FakePip",
+                                            (),
+                                            {
+                                                "daily_living": type(
+                                                    "FakeDailyLiving",
+                                                    (),
+                                                    {"enhanced": 1},
+                                                )()
+                                            },
+                                        )(),
+                                    },
+                                )()
+                            },
+                        )()
+                    },
+                )()
+            },
+        )()
+
+    def calculate(self, variable, year=None):
+        if variable == "LHA_category":
+            return np.array(["A"])
+        if variable == "household_id":
+            return np.array([100])
+        if variable == "state_pension_age":
+            return pd.Series([66])
+        raise KeyError(variable)
+
+
+def test_create_frs_smoke_includes_legacy_proxy_columns(tmp_path, monkeypatch):
+    original_read_csv = frs_module.pd.read_csv
+
+    def fake_read_csv(path, *args, **kwargs):
+        if str(path).endswith("lha_list_of_rents.csv.gz"):
+            return pd.DataFrame(
+                {"region": ["LONDON"], "lha_category": ["A"], "brma": ["BRMA1"]}
+            )
+        return original_read_csv(path, *args, **kwargs)
+
+    monkeypatch.setattr(policyengine_uk, "Microsimulation", FakeMicrosimulation)
+    monkeypatch.setattr(frs_module.pd, "read_csv", fake_read_csv)
+    monkeypatch.setattr(frs_module, "load_take_up_rate", lambda *args, **kwargs: 0.0)
+    monkeypatch.setattr(frs_module, "load_parameter", lambda *args, **kwargs: 0.0)
+    monkeypatch.setattr(
+        frs_module, "sum_to_entity", lambda values, ids, index: np.zeros(len(index))
+    )
+    monkeypatch.setattr(
+        frs_module,
+        "sum_from_positive_fields",
+        lambda table, fields: np.zeros(len(table)),
     )
-    assert "add_legacy_benefit_proxies(pe_person, state_pension_age)" in source
+    monkeypatch.setattr(
+        frs_module,
+        "sum_positive_variables",
+        lambda variables: (
+            np.sum(np.vstack([np.asarray(v) for v in variables]), axis=0)
+            if variables
+            else 0
+        ),
+    )
+    monkeypatch.setattr(
+        frs_module,
+        "fill_with_mean",
+        lambda table, indicator, amount: np.zeros(len(table)),
+    )
+
+    adult = pd.DataFrame(
+        [
+            {
+                "sernum": 100,
+                "benunit": 1,
+                "person": 1,
+                "accssamt": 0,
+                "adema": 0,
+                "ademaamt": 0,
+                "age": 30,
+                "age80": 30,
+                "cvpay": 0,
+                "educft": 0,
+                "educqual": 0,
+                "eduma": 0,
+                "edumaamt": 0,
+                "empstati": 8,
+                "fsbval": 0,
+                "fsfvval": 0,
+                "fsmval": 0,
+                "fted": 0,
+                "heartval": 0,
+                "hrpid": 1,
+                "inearns": 0,
+                "marital": 0,
+                "mntamt1": 0,
+                "mntamt2": 0,
+                "mntus1": 0,
+                "mntusam1": 0,
+                "redamt": 0,
+                "royyr1": 0,
+                "seincam2": 0,
+                "sex": 1,
+                "slrepamt": 0,
+                "smpadj": 0,
+                "sspadj": 0,
+                "tothours": 0,
+                "tuborr": 0,
+                "typeed2": 0,
+                "uperson": 1,
+                "allpay2": 0,
+                "royyr2": 0,
+                "royyr3": 0,
+                "royyr4": 0,
+                "chamtern": 0,
+                "chamttst": 0,
+                "apamt": 0,
+                "apdamt": 0,
+                "pareamt": 0,
+                "allpay3": 0,
+                "allpay4": 0,
+                "grtdir1": 0,
+                "grtdir2": 0,
+            }
+        ]
+    )
+    child = pd.DataFrame(columns=adult.columns)
+    benunit = pd.DataFrame([{"sernum": 100, "benunit": 1, "famtypb2": 1}])
+    househol = pd.DataFrame(
+        [
+            {
+                "sernum": 100,
+                "adulth": 1,
+                "bedroom6": 1,
+                "csewamt": 0,
+                "ctannual": 0,
+                "ctband": 1,
+                "ctrebamt": 0,
+                "cwatamtd": 0,
+                "gross4": 0,
+                "gvtregno": 1,
+                "hhrent": 0,
+                "mortint": 0,
+                "ptentyp2": 0,
+                "rt2rebam": 0,
+                "struins": 0,
+                "subrent": 0,
+                "tentyp2": 0,
+                "typeacc": 0,
+                "watsewrt": 0,
+                "niratlia": 0,
+                **{f"chrgamt{i}": 0 for i in range(1, 10)},
+            }
+        ]
+    )
+    raw_tables = {
+        "adult": adult,
+        "child": child,
+        "benunit": benunit,
+        "househol": househol,
+        "pension": pd.DataFrame(
+            columns=[
+                "person",
+                "sernum",
+                "penoth",
+                "penpay",
+                "poamt",
+                "poinc",
+                "ptamt",
+                "ptinc",
+            ]
+        ),
+        "oddjob": pd.DataFrame(columns=["person", "sernum", "ojamt", "ojnow"]),
+        "accounts": pd.DataFrame(
+            columns=["person", "sernum", "accint", "acctax", "invtax", "account"]
+        ),
+        "job": pd.DataFrame(columns=["person", "sernum", "deduc1", "spnamt", "salsac"]),
+        "benefits": pd.DataFrame(
+            columns=["person", "sernum", "benamt", "benefit", "var2"]
+        ),
+        "maint": pd.DataFrame(columns=["person", "sernum", "mramt", "mruamt", "mrus"]),
+        "penprov": pd.DataFrame(columns=["person", "sernum", "penamt", "stemppen"]),
+        "chldcare": pd.DataFrame(
+            columns=["person", "sernum", "chamt", "cost", "registrd"]
+        ),
+        "extchild": pd.DataFrame(columns=["sernum", "nhhamt"]),
+        "mortgage": pd.DataFrame(
+            columns=["sernum", "borramt", "mortend", "rmamt", "rmort"]
+        ),
+    }
+
+    for name, table in raw_tables.items():
+        table.to_csv(tmp_path / f"{name}.tab", sep="\t", index=False)
+
+    dataset = create_frs(tmp_path, 2025)
+
+    assert {
+        "legacy_jobseeker_proxy",
+        "esa_health_condition_proxy",
+        "esa_support_group_proxy",
+    }.issubset(dataset.person.columns)

From 3a709663b64751f995d6692c6c828c259a46c0ee Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Sat, 11 Apr 2026 11:34:11 -0400
Subject: [PATCH 3/6] Match JSA proxy hours to annualized units

---
 policyengine_uk_data/datasets/frs.py                 | 12 +++++++++---
 .../tests/test_legacy_benefit_proxies.py             |  8 ++++----
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/policyengine_uk_data/datasets/frs.py b/policyengine_uk_data/datasets/frs.py
index 8cebdd7f..0b2ef6ff 100644
--- a/policyengine_uk_data/datasets/frs.py
+++ b/policyengine_uk_data/datasets/frs.py
@@ -23,7 +23,11 @@
 
 
 LEGACY_JOBSEEKER_MIN_AGE = 18
-LEGACY_JOBSEEKER_MAX_HOURS = 16
+HOURS_WORKED_WEEKS_PER_YEAR = 52
+LEGACY_JOBSEEKER_MAX_WEEKLY_HOURS = 16
+LEGACY_JOBSEEKER_MAX_ANNUAL_HOURS = (
+    LEGACY_JOBSEEKER_MAX_WEEKLY_HOURS * HOURS_WORKED_WEEKS_PER_YEAR
+)
 ESA_MIN_AGE = 16
 ESA_HEALTH_EMPLOYMENT_STATUSES = (
     "LONG_TERM_DISABLED",
@@ -43,7 +47,9 @@ def derive_legacy_jobseeker_proxy(
 
     This is intentionally a proxy, not a legislative determination. It
     identifies person-level working-age adults who report being unemployed
-    and working less than the legacy JSA 16-hour limit.
+    and working less than the legacy JSA 16-hour weekly limit. The
+    ``hours_worked`` input is the annualised FRS-derived measure used in the
+    dataset, so the threshold it is compared against is in annual hours.
     """
 
     age = np.asarray(age)
@@ -58,7 +64,7 @@ def derive_legacy_jobseeker_proxy(
         & (age >= LEGACY_JOBSEEKER_MIN_AGE)
         & (age < state_pension_age)
         & (employment_status == "UNEMPLOYED")
-        & (hours_worked < LEGACY_JOBSEEKER_MAX_HOURS)
+        & (hours_worked < LEGACY_JOBSEEKER_MAX_ANNUAL_HOURS)
         & (current_education == "NOT_IN_EDUCATION")
     )
 
diff --git a/policyengine_uk_data/tests/test_legacy_benefit_proxies.py b/policyengine_uk_data/tests/test_legacy_benefit_proxies.py
index f8d6a146..29846f61 100644
--- a/policyengine_uk_data/tests/test_legacy_benefit_proxies.py
+++ b/policyengine_uk_data/tests/test_legacy_benefit_proxies.py
@@ -40,7 +40,7 @@ def test_legacy_jobseeker_proxy_tracks_unemployed_working_age_low_hours():
                 "UNEMPLOYED",
             ]
         ),
-        hours_worked=np.array([0, 12, 16, 0, 0, 0, 0, 0, 0, 0]),
+        hours_worked=np.array([0, 12 * 52, 16 * 52, 0, 0, 0, 0, 0, 0, 0]),
         current_education=np.array(
             [
                 "NOT_IN_EDUCATION",
@@ -107,7 +107,7 @@ def test_esa_support_group_proxy_is_stricter_subset_of_health_proxy():
                 "LONG_TERM_DISABLED",
             ]
         ),
-        hours_worked=np.array([0, 0, 12, 0, 0]),
+        hours_worked=np.array([0, 0, 12 * 52, 0, 0]),
         esa_health_condition_proxy=health_proxy,
         employment_status_reported=np.array([True, True, True, True, False]),
         state_pension_age=np.array([66, 66, 66, 66, 66]),
@@ -126,7 +126,7 @@ def test_add_legacy_benefit_proxies_wires_all_three_columns():
                 "SHORT_TERM_DISABLED",
                 "LONG_TERM_DISABLED",
             ],
-            "hours_worked": [0, 0, 12, 0],
+            "hours_worked": [0, 0, 12 * 52, 0],
             "current_education": [
                 "NOT_IN_EDUCATION",
                 "NOT_IN_EDUCATION",
@@ -178,7 +178,7 @@ def test_attach_legacy_benefit_proxies_from_frs_person_uses_empstati_mask():
         {
             "age": [30, 30],
             "employment_status": ["UNEMPLOYED", "LONG_TERM_DISABLED"],
-            "hours_worked": [12, 0],
+            "hours_worked": [12 * 52, 0],
             "current_education": ["NOT_IN_EDUCATION", "NOT_IN_EDUCATION"],
         }
     )

From e13fcd0ff90a70c2176cb0b73f9632f83342e74c Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Sat, 11 Apr 2026 22:35:17 -0400
Subject: [PATCH 4/6] Source JSA proxy hours from policyengine-uk

---
 policyengine_uk_data/datasets/frs.py          | 31 ++++++++++++++-----
 .../tests/test_legacy_benefit_proxies.py      | 10 ++++++
 2 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/policyengine_uk_data/datasets/frs.py b/policyengine_uk_data/datasets/frs.py
index 0b2ef6ff..8f20aa84 100644
--- a/policyengine_uk_data/datasets/frs.py
+++ b/policyengine_uk_data/datasets/frs.py
@@ -7,10 +7,13 @@
 modelling and policy analysis.
 """
 
-from policyengine_uk.data import UKSingleYearDataset
+from functools import lru_cache
 from pathlib import Path
-import pandas as pd
+
 import numpy as np
+import pandas as pd
+from policyengine_uk import CountryTaxBenefitSystem
+from policyengine_uk.data import UKSingleYearDataset
 from policyengine_uk_data.utils.datasets import (
     sum_to_entity,
     categorical,
@@ -24,10 +27,6 @@
 
 LEGACY_JOBSEEKER_MIN_AGE = 18
 HOURS_WORKED_WEEKS_PER_YEAR = 52
-LEGACY_JOBSEEKER_MAX_WEEKLY_HOURS = 16
-LEGACY_JOBSEEKER_MAX_ANNUAL_HOURS = (
-    LEGACY_JOBSEEKER_MAX_WEEKLY_HOURS * HOURS_WORKED_WEEKS_PER_YEAR
-)
 ESA_MIN_AGE = 16
 ESA_HEALTH_EMPLOYMENT_STATUSES = (
     "LONG_TERM_DISABLED",
@@ -35,6 +34,15 @@
 )
 
 
+@lru_cache(maxsize=None)
+def load_legacy_jobseeker_max_annual_hours(year: int) -> int:
+    """Return the policyengine-uk JSA single-claimant hours rule in annual hours."""
+
+    system = CountryTaxBenefitSystem()
+    max_weekly_hours = int(system.parameters.gov.dwp.JSA.hours.single(str(year)))
+    return max_weekly_hours * HOURS_WORKED_WEEKS_PER_YEAR
+
+
 def derive_legacy_jobseeker_proxy(
     age,
     employment_status,
@@ -42,6 +50,7 @@ def derive_legacy_jobseeker_proxy(
     current_education,
     employment_status_reported,
     state_pension_age,
+    max_annual_hours,
 ) -> np.ndarray:
     """Approximate legacy JSA claimant-state from observed survey data.
 
@@ -64,7 +73,7 @@ def derive_legacy_jobseeker_proxy(
         & (age >= LEGACY_JOBSEEKER_MIN_AGE)
         & (age < state_pension_age)
         & (employment_status == "UNEMPLOYED")
-        & (hours_worked < LEGACY_JOBSEEKER_MAX_ANNUAL_HOURS)
+        & (hours_worked < max_annual_hours)
         & (current_education == "NOT_IN_EDUCATION")
     )
 
@@ -134,7 +143,10 @@ def derive_esa_support_group_proxy(
 
 
 def add_legacy_benefit_proxies(
-    pe_person: pd.DataFrame, employment_status_reported, state_pension_age
+    pe_person: pd.DataFrame,
+    employment_status_reported,
+    state_pension_age,
+    legacy_jobseeker_max_annual_hours,
 ) -> pd.DataFrame:
     """Populate person-scoped ESA/JSA proxy columns on the person frame.
 
@@ -151,6 +163,7 @@ def add_legacy_benefit_proxies(
         current_education=pe_person.current_education,
         employment_status_reported=employment_status_reported,
         state_pension_age=state_pension_age,
+        max_annual_hours=legacy_jobseeker_max_annual_hours,
     )
     pe_person["esa_health_condition_proxy"] = derive_esa_health_condition_proxy(
         age=pe_person.age,
@@ -175,10 +188,12 @@ def apply_legacy_benefit_proxies(
     """Attach legacy ESA/JSA proxies using post-build simulation context."""
 
     state_pension_age = sim.calculate("state_pension_age", year).values
+    legacy_jobseeker_max_annual_hours = load_legacy_jobseeker_max_annual_hours(year)
     return add_legacy_benefit_proxies(
         pe_person,
         employment_status_reported=employment_status_reported,
         state_pension_age=state_pension_age,
+        legacy_jobseeker_max_annual_hours=legacy_jobseeker_max_annual_hours,
     )
 
 
diff --git a/policyengine_uk_data/tests/test_legacy_benefit_proxies.py b/policyengine_uk_data/tests/test_legacy_benefit_proxies.py
index 29846f61..4f754d54 100644
--- a/policyengine_uk_data/tests/test_legacy_benefit_proxies.py
+++ b/policyengine_uk_data/tests/test_legacy_benefit_proxies.py
@@ -11,6 +11,7 @@
     derive_esa_health_condition_proxy,
     derive_esa_support_group_proxy,
     derive_legacy_jobseeker_proxy,
+    load_legacy_jobseeker_max_annual_hours,
 )
 
 
@@ -24,6 +25,7 @@ def calculate(self, variable, period):
 
 
 def test_legacy_jobseeker_proxy_tracks_unemployed_working_age_low_hours():
+    max_annual_hours = load_legacy_jobseeker_max_annual_hours(2025)
     result = derive_legacy_jobseeker_proxy(
         age=np.array([18, 30, 30, 66, 17, 25, 25, 66, 30, 30]),
         employment_status=np.array(
@@ -59,6 +61,7 @@ def test_legacy_jobseeker_proxy_tracks_unemployed_working_age_low_hours():
             [True, True, True, True, True, True, True, True, True, False]
         ),
         state_pension_age=np.array([66, 66, 66, 66, 66, 66, 66, 67, 66, 66]),
+        max_annual_hours=max_annual_hours,
     )
 
     assert result.tolist() == [
@@ -146,6 +149,9 @@ def test_add_legacy_benefit_proxies_wires_all_three_columns():
         pe_person.copy(),
         employment_status_reported=np.array([True, True, True, False]),
         state_pension_age=np.array([66, 66, 66, 66]),
+        legacy_jobseeker_max_annual_hours=load_legacy_jobseeker_max_annual_hours(
+            2025
+        ),
     )
 
     assert result["legacy_jobseeker_proxy"].tolist() == [True, False, False, False]
@@ -153,6 +159,10 @@ def test_add_legacy_benefit_proxies_wires_all_three_columns():
     assert result["esa_support_group_proxy"].tolist() == [False, True, False, False]
 
 
+def test_legacy_jobseeker_hours_limit_matches_policyengine_uk_parameter():
+    assert load_legacy_jobseeker_max_annual_hours(2025) == 16 * 52
+
+
 def test_apply_legacy_benefit_proxies_uses_sim_state_pension_age():
     pe_person = pd.DataFrame(
         {

From 29d3b1b1017d0796b3bfc11d783079e849e37790 Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Sat, 11 Apr 2026 22:51:49 -0400
Subject: [PATCH 5/6] Use policyengine-uk employment status enum

---
 policyengine_uk_data/datasets/frs.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/policyengine_uk_data/datasets/frs.py b/policyengine_uk_data/datasets/frs.py
index 8f20aa84..c1692475 100644
--- a/policyengine_uk_data/datasets/frs.py
+++ b/policyengine_uk_data/datasets/frs.py
@@ -14,6 +14,9 @@
 import pandas as pd
 from policyengine_uk import CountryTaxBenefitSystem
 from policyengine_uk.data import UKSingleYearDataset
+from policyengine_uk.variables.household.income.employment_status import (
+    EmploymentStatus,
+)
 from policyengine_uk_data.utils.datasets import (
     sum_to_entity,
     categorical,
@@ -29,8 +32,8 @@
 HOURS_WORKED_WEEKS_PER_YEAR = 52
 ESA_MIN_AGE = 16
 ESA_HEALTH_EMPLOYMENT_STATUSES = (
-    "LONG_TERM_DISABLED",
-    "SHORT_TERM_DISABLED",
+    EmploymentStatus.LONG_TERM_DISABLED.name,
+    EmploymentStatus.SHORT_TERM_DISABLED.name,
 )
 
 

From 49cb77a74cbac7d6b502adc51a1e3644df83bb00 Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Sat, 11 Apr 2026 22:56:22 -0400
Subject: [PATCH 6/6] Format legacy benefit proxy files

---
 policyengine_uk_data/tests/test_legacy_benefit_proxies.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/policyengine_uk_data/tests/test_legacy_benefit_proxies.py b/policyengine_uk_data/tests/test_legacy_benefit_proxies.py
index 4f754d54..07479277 100644
--- a/policyengine_uk_data/tests/test_legacy_benefit_proxies.py
+++ b/policyengine_uk_data/tests/test_legacy_benefit_proxies.py
@@ -149,9 +149,7 @@ def test_add_legacy_benefit_proxies_wires_all_three_columns():
         pe_person.copy(),
         employment_status_reported=np.array([True, True, True, False]),
         state_pension_age=np.array([66, 66, 66, 66]),
-        legacy_jobseeker_max_annual_hours=load_legacy_jobseeker_max_annual_hours(
-            2025
-        ),
+        legacy_jobseeker_max_annual_hours=load_legacy_jobseeker_max_annual_hours(2025),
     )
 
     assert result["legacy_jobseeker_proxy"].tolist() == [True, False, False, False]