From 299e3d1d03de9b8bc30016ddb9214f9e2df04d34 Mon Sep 17 00:00:00 2001
From: hass-nation <he3190@gmail.com>
Date: Sun, 28 Jun 2026 14:04:34 +0100
Subject: [PATCH] ENH: Add Hansen-Lee misspecification-robust J-test to IVGMM

Adds `robust_j_stat` property to `IVGMMResults` (and `IVGMMCUE`),
implementing the misspecification-robust J-test from Hansen & Lee
(2021, Econometrica, 89(3), 1419-1447).

The standard J-statistic uses the uncentered moment covariance as its
weight matrix, which under model misspecification (E[g(z,theta)] != 0)
leads to a test that saturates at n rather than diverging. The
Hansen-Lee statistic replaces the uncentered covariance with the
*centered* covariance S_c = (1/n) sum (g_i - g_bar)(g_i - g_bar)',
giving J* = n * g_bar' S_c^{-1} g_bar ~ chi2(q) under correct
specification and diverging at rate n under misspecification.

Changes:
- `_IVGMMBase._hansen_lee_j_statistic`: computes J* for any cov_type
  (robust/heteroskedastic, homoskedastic, kernel, clustered)
- `_IVGMMBase._gmm_post_estimation` + `IVGMM._gmm_post_estimation`:
  accept cov_type/cov_config and include `robust_j_stat` in results
- `IVGMM.fit` + `IVGMMCUE.fit`: pass cov_type/cov_config through
- `IVGMMResults.robust_j_stat` property with full docstring
- `IVGMMResults._top_right`: summary now shows both J-stats side-by-side
- 14 new tests in `linearmodels/tests/iv/test_hansen_lee_j_stat.py`
  covering type checks, formula verification, all cov_types, CUE,
  summary display, and the algebraic identity J* = J*n/(n-J) for
  iterated GMM

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 linearmodels/iv/model.py                      |  94 +++++++-
 linearmodels/iv/results.py                    |  67 ++++++
 .../tests/iv/test_hansen_lee_j_stat.py        | 212 ++++++++++++++++++
 3 files changed, 371 insertions(+), 2 deletions(-)
 create mode 100644 linearmodels/tests/iv/test_hansen_lee_j_stat.py

diff --git a/linearmodels/iv/model.py b/linearmodels/iv/model.py
index 41cd06d049..4b044206a4 100644
--- a/linearmodels/iv/model.py
+++ b/linearmodels/iv/model.py
@@ -1031,6 +1031,8 @@ def _gmm_post_estimation(
         params: linearmodels.typing.data.Float64Array,
         weight_mat: linearmodels.typing.data.Float64Array,
         iters: int,
+        cov_type: str = "robust",
+        cov_config: dict[str, Any] | None = None,
     ) -> dict[str, Any]:
         """GMM-specific post-estimation results"""
         instr = self._instr_columns
@@ -1040,10 +1042,93 @@ def _gmm_post_estimation(
             "weight_config": self._weight_type,
             "iterations": iters,
             "j_stat": self._j_statistic(params, weight_mat),
+            "robust_j_stat": self._hansen_lee_j_statistic(
+                params, cov_type, cov_config or {}
+            ),
         }
 
         return gmm_specific
 
+    def _hansen_lee_j_statistic(
+        self,
+        params: linearmodels.typing.data.Float64Array,
+        cov_type: str,
+        cov_config: dict[str, Any],
+    ) -> WaldTestStatistic:
+        r"""
+        Hansen-Lee (2021) misspecification-robust J-test of overidentifying
+        restrictions.
+
+        Unlike the standard J-test, which uses the uncentered moment covariance
+        as the weight matrix, this test uses the *centered* moment covariance
+
+        .. math::
+
+            \hat{S}_c = n^{-1}\sum_{i=1}^{n}
+                (g_i - \bar{g})(g_i - \bar{g})'
+
+        which consistently estimates :math:`\text{Var}(g_i)` even when the
+        model is misspecified (i.e. :math:`E[g_i] \neq 0`).  The statistic is
+
+        .. math::
+
+            J^* = n\,\bar{g}'\hat{S}_c^{-1}\bar{g} \sim \chi^2_q
+
+        where :math:`q = n_{\text{instr}} - n_{\text{var}}`.
+
+        Parameters
+        ----------
+        params : ndarray
+            Estimated model parameters.
+        cov_type : str
+            Covariance type used for the main estimation.  The same type is
+            used to compute :math:`\hat{S}_c` (heteroskedastic, clustered,
+            kernel, or homoskedastic).
+        cov_config : dict
+            Configuration for the covariance estimator (e.g. ``clusters``,
+            ``bandwidth``, ``kernel``).
+
+        Returns
+        -------
+        WaldTestStatistic
+
+        References
+        ----------
+        Hansen, B. E. & Lee, S. (2021). Inference for iterated GMM under
+        misspecification. *Econometrica*, 89(3), 1419–1447.
+        """
+        y, x, z = self._wy, self._wx, self._wz
+        nobs, nvar, ninstr = y.shape[0], x.shape[1], z.shape[1]
+        eps = y - x @ params
+        g_bar = (z * eps).mean(0)
+
+        debiased = bool(cov_config.get("debiased", False))
+        if cov_type in ("robust", "heteroskedastic"):
+            score_cov: HomoskedasticWeightMatrix = HeteroskedasticWeightMatrix(
+                center=True, debiased=debiased
+            )
+        elif cov_type in ("unadjusted", "homoskedastic"):
+            score_cov = HomoskedasticWeightMatrix(center=True, debiased=debiased)
+        elif cov_type == "clustered":
+            score_cov = OneWayClusteredWeightMatrix(
+                clusters=cov_config["clusters"], center=True, debiased=debiased
+            )
+        elif cov_type == "kernel":
+            score_cov = KernelWeightMatrix(
+                kernel=str(cov_config.get("kernel", "bartlett")),
+                bandwidth=cov_config.get("bandwidth", None),
+                center=True,
+                debiased=debiased,
+            )
+        else:
+            score_cov = HeteroskedasticWeightMatrix(center=True, debiased=debiased)
+
+        s_c = score_cov.weight_matrix(x, z, eps)
+        stat = float(nobs * g_bar @ pinv(s_c) @ g_bar)
+        null = "Expected moment conditions are equal to 0"
+        name = "Hansen-Lee misspecification-robust J-test"
+        return WaldTestStatistic(stat, null, ninstr - nvar, name=name)
+
     def _j_statistic(
         self,
         params: linearmodels.typing.data.Float64Array,
@@ -1310,7 +1395,7 @@ def fit(
         )
 
         results = self._post_estimation(params, cov_estimator, cov_type)
-        gmm_pe = self._gmm_post_estimation(params, wmat, iters)
+        gmm_pe = self._gmm_post_estimation(params, wmat, iters, cov_type, cov_config)
 
         results.update(gmm_pe)
 
@@ -1321,6 +1406,8 @@ def _gmm_post_estimation(
         params: linearmodels.typing.data.Float64Array,
         weight_mat: linearmodels.typing.data.Float64Array,
         iters: int,
+        cov_type: str = "robust",
+        cov_config: dict[str, Any] | None = None,
     ) -> dict[str, Any]:
         """GMM-specific post-estimation results"""
         instr = self._instr_columns
@@ -1330,6 +1417,9 @@ def _gmm_post_estimation(
             "weight_config": self._weight_type,
             "iterations": iters,
             "j_stat": self._j_statistic(params, weight_mat),
+            "robust_j_stat": self._hansen_lee_j_statistic(
+                params, cov_type, cov_config or {}
+            ),
         }
 
         return gmm_specific
@@ -1673,7 +1763,7 @@ def fit(
             wx, wy, wz, params, wmat, cov_type, **cov_config
         )
         results = self._post_estimation(params, cov_estimator, cov_type)
-        gmm_pe = self._gmm_post_estimation(params, wmat, iters)
+        gmm_pe = self._gmm_post_estimation(params, wmat, iters, cov_type, cov_config)
         results.update(gmm_pe)
 
         return IVGMMResults(results, self)
diff --git a/linearmodels/iv/results.py b/linearmodels/iv/results.py
index 967c7cb957..8115a95324 100644
--- a/linearmodels/iv/results.py
+++ b/linearmodels/iv/results.py
@@ -1411,6 +1411,7 @@ def __init__(
         self._weight_config = results["weight_config"]
         self._iterations = results["iterations"]
         self._j_stat = results["j_stat"]
+        self._robust_j_stat = results["robust_j_stat"]
 
     @property
     def weight_matrix(self) -> linearmodels.typing.data.Float64Array:
@@ -1460,6 +1461,72 @@ def j_stat(self) -> InvalidTestStatistic | WaldTestStatistic:
         """
         return self._j_stat
 
+    @property
+    def robust_j_stat(self) -> WaldTestStatistic:
+        r"""
+        Hansen-Lee (2021) misspecification-robust J-test of overidentifying
+        restrictions
+
+        Returns
+        -------
+        WaldTestStatistic
+            Robust J statistic test of overidentifying restrictions
+
+        Notes
+        -----
+        Unlike the standard J-statistic, which uses the uncentered moment
+        covariance as the weighting matrix, this test uses the *centered*
+        moment covariance
+
+        .. math ::
+
+          \hat{S}_c = n^{-1}\sum_{i=1}^{n}(g_i - \bar{g})(g_i - \bar{g})'
+
+        which is consistent even when the model is misspecified
+        (:math:`E[g_i] \neq 0`).  The statistic is
+
+        .. math ::
+
+          J^* = n\,\bar{g}'\hat{S}_c^{-1}\bar{g} \sim \chi^2_q
+
+        where :math:`q = n_{\text{instr}} - n_{\text{var}}` is the degree of
+        overidentification.
+
+        Under correct specification :math:`J^*` has the same :math:`\chi^2_q`
+        null distribution as the standard J-test.  Under misspecification
+        :math:`J^*` diverges at rate :math:`n`, making it a consistent test
+        for model misspecification.
+
+        References
+        ----------
+        Hansen, B. E. & Lee, S. (2021). Inference for iterated GMM under
+        misspecification. *Econometrica*, 89(3), 1419–1447.
+        """
+        return self._robust_j_stat
+
+    def _top_right(self) -> list[tuple[str, str]]:
+        j = self.j_stat
+        rj = self.robust_j_stat
+        return [
+            ("J-statistic:", _str(j.stat)),
+            ("P-value (J-stat):", pval_format(j.pval)),
+            ("Distribution (J-stat):", str(j.dist_name)),
+            ("HL J-statistic:", _str(rj.stat)),
+            ("P-value (HL J-stat):", pval_format(rj.pval)),
+            ("Distribution (HL J-stat):", str(rj.dist_name)),
+            ("Iterations:", str(self.iterations)),
+        ]
+
+    def _update_extra_text(self, extra_text: list[str]) -> list[str]:
+        instruments = self.model.instruments
+        if instruments.shape[1] > 0:
+            endog = self.model.endog
+            extra_text.append("Endogenous: " + ", ".join(endog.cols))
+            extra_text.append("Instruments: " + ", ".join(instruments.cols))
+            cov_descr = str(self._cov_estimator)
+            extra_text.extend(list(cov_descr.split("\n")))
+        return extra_text
+
     def c_stat(self, variables: list[str] | str | None = None) -> WaldTestStatistic:
         r"""
         C-test of endogeneity
diff --git a/linearmodels/tests/iv/test_hansen_lee_j_stat.py b/linearmodels/tests/iv/test_hansen_lee_j_stat.py
new file mode 100644
index 0000000000..9a41371b2a
--- /dev/null
+++ b/linearmodels/tests/iv/test_hansen_lee_j_stat.py
@@ -0,0 +1,212 @@
+"""
+Tests for the Hansen-Lee (2021) misspecification-robust J-test.
+
+References
+----------
+Hansen, B. E. & Lee, S. (2021). Inference for iterated GMM under
+misspecification. Econometrica, 89(3), 1419-1447.
+"""
+import numpy as np
+from numpy.linalg import pinv
+from numpy.testing import assert_allclose
+import pytest
+
+from linearmodels.iv.gmm import HeteroskedasticWeightMatrix
+from linearmodels.iv.model import IVGMM, IVGMMCUE
+from linearmodels.shared.hypotheses import WaldTestStatistic
+from linearmodels.tests.iv._utility import generate_data
+
+
+@pytest.fixture(scope="module")
+def data():
+    return generate_data()
+
+
+@pytest.fixture(scope="module")
+def res_robust(data):
+    return IVGMM(data.dep, data.exog, data.endog, data.instr).fit(
+        cov_type="robust"
+    )
+
+
+@pytest.fixture(scope="module")
+def res_unadjusted(data):
+    return IVGMM(data.dep, data.exog, data.endog, data.instr).fit(
+        cov_type="unadjusted"
+    )
+
+
+@pytest.fixture(scope="module")
+def res_clustered(data):
+    return IVGMM(
+        data.dep, data.exog, data.endog, data.instr
+    ).fit(cov_type="clustered", clusters=data.clusters)
+
+
+@pytest.fixture(scope="module")
+def res_kernel(data):
+    return IVGMM(data.dep, data.exog, data.endog, data.instr).fit(
+        cov_type="kernel", kernel="bartlett", bandwidth=4
+    )
+
+
+# ---------------------------------------------------------------------------
+# Attribute existence and types
+# ---------------------------------------------------------------------------
+
+
+def test_robust_j_stat_is_wald_test_statistic(res_robust):
+    assert isinstance(res_robust.robust_j_stat, WaldTestStatistic)
+
+
+def test_standard_j_stat_still_present(res_robust):
+    assert isinstance(res_robust.j_stat, WaldTestStatistic)
+
+
+def test_robust_j_stat_stat_is_finite(res_robust):
+    assert np.isfinite(res_robust.robust_j_stat.stat)
+
+
+def test_robust_j_stat_pval_in_unit_interval(res_robust):
+    pval = res_robust.robust_j_stat.pval
+    assert 0.0 <= pval <= 1.0
+
+
+def test_robust_j_stat_df_equals_overidentification_degree(res_robust, data):
+    ninstr = data.instr.shape[1] + data.exog.shape[1]
+    nendog = data.endog.shape[1]
+    expected_df = ninstr - nendog - data.exog.shape[1]
+    # df = total instruments - total params = (nexog+ninstr) - (nexog+nendog)
+    #    = ninstr - nendog
+    actual_df = res_robust.robust_j_stat.df
+    assert actual_df == data.instr.shape[1] - data.endog.shape[1]
+
+
+def test_robust_j_stat_stat_positive(res_robust):
+    assert res_robust.robust_j_stat.stat >= 0.0
+
+
+# ---------------------------------------------------------------------------
+# Formula verification: J* = n * g_bar' @ S_c^{-1} @ g_bar
+# ---------------------------------------------------------------------------
+
+
+def test_robust_j_stat_formula_matches_manual_calculation(res_robust, data):
+    """Verify the statistic equals the explicit closed-form expression."""
+    mod = IVGMM(data.dep, data.exog, data.endog, data.instr)
+    res = mod.fit(cov_type="robust")
+
+    nobs = data.dep.shape[0]
+    params = np.asarray(res.params)[:, None]
+    x = mod._wx
+    z = mod._wz
+    y = mod._wy
+
+    eps = y - x @ params
+    g_bar = (z * eps).mean(0)
+
+    sc_est = HeteroskedasticWeightMatrix(center=True)
+    s_c = sc_est.weight_matrix(x, z, eps)
+    expected = float(nobs * g_bar @ pinv(s_c) @ g_bar)
+
+    assert_allclose(res.robust_j_stat.stat, expected, rtol=1e-6)
+
+
+# ---------------------------------------------------------------------------
+# Under correct specification: J* ≈ J (both are chi2_q consistent)
+# ---------------------------------------------------------------------------
+
+
+def test_robust_and_standard_j_have_same_df(res_robust):
+    """Both tests have the same degrees of freedom under correct specification."""
+    assert res_robust.j_stat.df == res_robust.robust_j_stat.df
+
+
+def test_robust_j_stat_nonnegative_all_cov_types(
+    res_robust, res_unadjusted, res_clustered, res_kernel
+):
+    for res in [res_robust, res_unadjusted, res_clustered, res_kernel]:
+        assert res.robust_j_stat.stat >= 0.0
+
+
+def test_robust_j_pval_all_cov_types(
+    res_robust, res_unadjusted, res_clustered, res_kernel
+):
+    for res in [res_robust, res_unadjusted, res_clustered, res_kernel]:
+        pval = res.robust_j_stat.pval
+        assert 0.0 <= pval <= 1.0
+
+
+# ---------------------------------------------------------------------------
+# Under misspecification: J* diverges, standard J saturates
+# ---------------------------------------------------------------------------
+
+
+def test_robust_j_algebraic_identity_iterated_gmm():
+    """
+    For iterated GMM at convergence the identity
+
+        J* = J_std * n / (n - J_std)
+
+    follows from Ŝ = Ŝ_c + g_bar * g_bar' (Sherman-Morrison inversion).
+
+    DGP: y = 2*x + z_invalid + noise where z_invalid enters the outcome
+    directly, making the IV moment condition violated — both test statistics
+    are large enough to make the identity observable.
+    """
+    rng = np.random.default_rng(42)
+    nobs = 50_000
+
+    z_valid = rng.standard_normal(nobs)    # valid instrument
+    z_invalid = rng.standard_normal(nobs)  # invalid: correlated with eps
+    z_extra = rng.standard_normal(nobs)    # extra valid instrument
+    noise_x = rng.standard_normal(nobs)
+    noise_y = rng.standard_normal(nobs)
+
+    x = 0.8 * z_valid + 0.6 * noise_x     # endog, identified by z_valid
+    y = 2.0 * x + 1.0 * z_invalid + noise_y  # z_invalid in outcome -> misspecified
+
+    exog = np.ones((nobs, 1))
+    endog = x[:, None]
+    instr = np.column_stack([z_invalid, z_valid, z_extra])
+
+    res = IVGMM(y[:, None], exog, endog, instr).fit(cov_type="robust", iter_limit=50)
+
+    j_std = res.j_stat.stat
+    j_star = res.robust_j_stat.stat
+
+    expected_j_star = j_std * nobs / (nobs - j_std)
+    assert_allclose(j_star, expected_j_star, rtol=1e-4)
+
+    # Under misspecification both are large; J_std < n (bounded), J* >= J_std
+    assert j_std < nobs
+    assert j_star >= j_std
+
+
+# ---------------------------------------------------------------------------
+# IVGMMCUE also exposes robust_j_stat
+# ---------------------------------------------------------------------------
+
+
+def test_ivgmmcue_has_robust_j_stat(data):
+    res = IVGMMCUE(data.dep, data.exog, data.endog, data.instr).fit(
+        cov_type="robust"
+    )
+    assert isinstance(res.robust_j_stat, WaldTestStatistic)
+    assert np.isfinite(res.robust_j_stat.stat)
+    assert res.robust_j_stat.stat >= 0.0
+
+
+# ---------------------------------------------------------------------------
+# Summary display includes HL J-statistic rows
+# ---------------------------------------------------------------------------
+
+
+def test_summary_contains_hl_j_statistic(res_robust):
+    smry_str = str(res_robust.summary)
+    assert "HL J-statistic" in smry_str
+
+
+def test_summary_contains_j_statistic(res_robust):
+    smry_str = str(res_robust.summary)
+    assert "J-statistic" in smry_str