From 299e3d1d03de9b8bc30016ddb9214f9e2df04d34 Mon Sep 17 00:00:00 2001 From: hass-nation Date: Sun, 28 Jun 2026 14:04:34 +0100 Subject: [PATCH] ENH: Add Hansen-Lee misspecification-robust J-test to IVGMM Adds `robust_j_stat` property to `IVGMMResults` (and `IVGMMCUE`), implementing the misspecification-robust J-test from Hansen & Lee (2021, Econometrica, 89(3), 1419-1447). The standard J-statistic uses the uncentered moment covariance as its weight matrix, which under model misspecification (E[g(z,theta)] != 0) leads to a test that saturates at n rather than diverging. The Hansen-Lee statistic replaces the uncentered covariance with the *centered* covariance S_c = (1/n) sum (g_i - g_bar)(g_i - g_bar)', giving J* = n * g_bar' S_c^{-1} g_bar ~ chi2(q) under correct specification and diverging at rate n under misspecification. Changes: - `_IVGMMBase._hansen_lee_j_statistic`: computes J* for any cov_type (robust/heteroskedastic, homoskedastic, kernel, clustered) - `_IVGMMBase._gmm_post_estimation` + `IVGMM._gmm_post_estimation`: accept cov_type/cov_config and include `robust_j_stat` in results - `IVGMM.fit` + `IVGMMCUE.fit`: pass cov_type/cov_config through - `IVGMMResults.robust_j_stat` property with full docstring - `IVGMMResults._top_right`: summary now shows both J-stats side-by-side - 14 new tests in `linearmodels/tests/iv/test_hansen_lee_j_stat.py` covering type checks, formula verification, all cov_types, CUE, summary display, and the algebraic identity J* = J*n/(n-J) for iterated GMM Co-Authored-By: Claude Sonnet 4.6 --- linearmodels/iv/model.py | 94 +++++++- linearmodels/iv/results.py | 67 ++++++ .../tests/iv/test_hansen_lee_j_stat.py | 212 ++++++++++++++++++ 3 files changed, 371 insertions(+), 2 deletions(-) create mode 100644 linearmodels/tests/iv/test_hansen_lee_j_stat.py diff --git a/linearmodels/iv/model.py b/linearmodels/iv/model.py index 41cd06d049..4b044206a4 100644 --- a/linearmodels/iv/model.py +++ b/linearmodels/iv/model.py @@ -1031,6 +1031,8 @@ def _gmm_post_estimation( params: linearmodels.typing.data.Float64Array, weight_mat: linearmodels.typing.data.Float64Array, iters: int, + cov_type: str = "robust", + cov_config: dict[str, Any] | None = None, ) -> dict[str, Any]: """GMM-specific post-estimation results""" instr = self._instr_columns @@ -1040,10 +1042,93 @@ def _gmm_post_estimation( "weight_config": self._weight_type, "iterations": iters, "j_stat": self._j_statistic(params, weight_mat), + "robust_j_stat": self._hansen_lee_j_statistic( + params, cov_type, cov_config or {} + ), } return gmm_specific + def _hansen_lee_j_statistic( + self, + params: linearmodels.typing.data.Float64Array, + cov_type: str, + cov_config: dict[str, Any], + ) -> WaldTestStatistic: + r""" + Hansen-Lee (2021) misspecification-robust J-test of overidentifying + restrictions. + + Unlike the standard J-test, which uses the uncentered moment covariance + as the weight matrix, this test uses the *centered* moment covariance + + .. math:: + + \hat{S}_c = n^{-1}\sum_{i=1}^{n} + (g_i - \bar{g})(g_i - \bar{g})' + + which consistently estimates :math:`\text{Var}(g_i)` even when the + model is misspecified (i.e. :math:`E[g_i] \neq 0`). The statistic is + + .. math:: + + J^* = n\,\bar{g}'\hat{S}_c^{-1}\bar{g} \sim \chi^2_q + + where :math:`q = n_{\text{instr}} - n_{\text{var}}`. + + Parameters + ---------- + params : ndarray + Estimated model parameters. + cov_type : str + Covariance type used for the main estimation. The same type is + used to compute :math:`\hat{S}_c` (heteroskedastic, clustered, + kernel, or homoskedastic). + cov_config : dict + Configuration for the covariance estimator (e.g. ``clusters``, + ``bandwidth``, ``kernel``). + + Returns + ------- + WaldTestStatistic + + References + ---------- + Hansen, B. E. & Lee, S. (2021). Inference for iterated GMM under + misspecification. *Econometrica*, 89(3), 1419–1447. + """ + y, x, z = self._wy, self._wx, self._wz + nobs, nvar, ninstr = y.shape[0], x.shape[1], z.shape[1] + eps = y - x @ params + g_bar = (z * eps).mean(0) + + debiased = bool(cov_config.get("debiased", False)) + if cov_type in ("robust", "heteroskedastic"): + score_cov: HomoskedasticWeightMatrix = HeteroskedasticWeightMatrix( + center=True, debiased=debiased + ) + elif cov_type in ("unadjusted", "homoskedastic"): + score_cov = HomoskedasticWeightMatrix(center=True, debiased=debiased) + elif cov_type == "clustered": + score_cov = OneWayClusteredWeightMatrix( + clusters=cov_config["clusters"], center=True, debiased=debiased + ) + elif cov_type == "kernel": + score_cov = KernelWeightMatrix( + kernel=str(cov_config.get("kernel", "bartlett")), + bandwidth=cov_config.get("bandwidth", None), + center=True, + debiased=debiased, + ) + else: + score_cov = HeteroskedasticWeightMatrix(center=True, debiased=debiased) + + s_c = score_cov.weight_matrix(x, z, eps) + stat = float(nobs * g_bar @ pinv(s_c) @ g_bar) + null = "Expected moment conditions are equal to 0" + name = "Hansen-Lee misspecification-robust J-test" + return WaldTestStatistic(stat, null, ninstr - nvar, name=name) + def _j_statistic( self, params: linearmodels.typing.data.Float64Array, @@ -1310,7 +1395,7 @@ def fit( ) results = self._post_estimation(params, cov_estimator, cov_type) - gmm_pe = self._gmm_post_estimation(params, wmat, iters) + gmm_pe = self._gmm_post_estimation(params, wmat, iters, cov_type, cov_config) results.update(gmm_pe) @@ -1321,6 +1406,8 @@ def _gmm_post_estimation( params: linearmodels.typing.data.Float64Array, weight_mat: linearmodels.typing.data.Float64Array, iters: int, + cov_type: str = "robust", + cov_config: dict[str, Any] | None = None, ) -> dict[str, Any]: """GMM-specific post-estimation results""" instr = self._instr_columns @@ -1330,6 +1417,9 @@ def _gmm_post_estimation( "weight_config": self._weight_type, "iterations": iters, "j_stat": self._j_statistic(params, weight_mat), + "robust_j_stat": self._hansen_lee_j_statistic( + params, cov_type, cov_config or {} + ), } return gmm_specific @@ -1673,7 +1763,7 @@ def fit( wx, wy, wz, params, wmat, cov_type, **cov_config ) results = self._post_estimation(params, cov_estimator, cov_type) - gmm_pe = self._gmm_post_estimation(params, wmat, iters) + gmm_pe = self._gmm_post_estimation(params, wmat, iters, cov_type, cov_config) results.update(gmm_pe) return IVGMMResults(results, self) diff --git a/linearmodels/iv/results.py b/linearmodels/iv/results.py index 967c7cb957..8115a95324 100644 --- a/linearmodels/iv/results.py +++ b/linearmodels/iv/results.py @@ -1411,6 +1411,7 @@ def __init__( self._weight_config = results["weight_config"] self._iterations = results["iterations"] self._j_stat = results["j_stat"] + self._robust_j_stat = results["robust_j_stat"] @property def weight_matrix(self) -> linearmodels.typing.data.Float64Array: @@ -1460,6 +1461,72 @@ def j_stat(self) -> InvalidTestStatistic | WaldTestStatistic: """ return self._j_stat + @property + def robust_j_stat(self) -> WaldTestStatistic: + r""" + Hansen-Lee (2021) misspecification-robust J-test of overidentifying + restrictions + + Returns + ------- + WaldTestStatistic + Robust J statistic test of overidentifying restrictions + + Notes + ----- + Unlike the standard J-statistic, which uses the uncentered moment + covariance as the weighting matrix, this test uses the *centered* + moment covariance + + .. math :: + + \hat{S}_c = n^{-1}\sum_{i=1}^{n}(g_i - \bar{g})(g_i - \bar{g})' + + which is consistent even when the model is misspecified + (:math:`E[g_i] \neq 0`). The statistic is + + .. math :: + + J^* = n\,\bar{g}'\hat{S}_c^{-1}\bar{g} \sim \chi^2_q + + where :math:`q = n_{\text{instr}} - n_{\text{var}}` is the degree of + overidentification. + + Under correct specification :math:`J^*` has the same :math:`\chi^2_q` + null distribution as the standard J-test. Under misspecification + :math:`J^*` diverges at rate :math:`n`, making it a consistent test + for model misspecification. + + References + ---------- + Hansen, B. E. & Lee, S. (2021). Inference for iterated GMM under + misspecification. *Econometrica*, 89(3), 1419–1447. + """ + return self._robust_j_stat + + def _top_right(self) -> list[tuple[str, str]]: + j = self.j_stat + rj = self.robust_j_stat + return [ + ("J-statistic:", _str(j.stat)), + ("P-value (J-stat):", pval_format(j.pval)), + ("Distribution (J-stat):", str(j.dist_name)), + ("HL J-statistic:", _str(rj.stat)), + ("P-value (HL J-stat):", pval_format(rj.pval)), + ("Distribution (HL J-stat):", str(rj.dist_name)), + ("Iterations:", str(self.iterations)), + ] + + def _update_extra_text(self, extra_text: list[str]) -> list[str]: + instruments = self.model.instruments + if instruments.shape[1] > 0: + endog = self.model.endog + extra_text.append("Endogenous: " + ", ".join(endog.cols)) + extra_text.append("Instruments: " + ", ".join(instruments.cols)) + cov_descr = str(self._cov_estimator) + extra_text.extend(list(cov_descr.split("\n"))) + return extra_text + def c_stat(self, variables: list[str] | str | None = None) -> WaldTestStatistic: r""" C-test of endogeneity diff --git a/linearmodels/tests/iv/test_hansen_lee_j_stat.py b/linearmodels/tests/iv/test_hansen_lee_j_stat.py new file mode 100644 index 0000000000..9a41371b2a --- /dev/null +++ b/linearmodels/tests/iv/test_hansen_lee_j_stat.py @@ -0,0 +1,212 @@ +""" +Tests for the Hansen-Lee (2021) misspecification-robust J-test. + +References +---------- +Hansen, B. E. & Lee, S. (2021). Inference for iterated GMM under +misspecification. Econometrica, 89(3), 1419-1447. +""" +import numpy as np +from numpy.linalg import pinv +from numpy.testing import assert_allclose +import pytest + +from linearmodels.iv.gmm import HeteroskedasticWeightMatrix +from linearmodels.iv.model import IVGMM, IVGMMCUE +from linearmodels.shared.hypotheses import WaldTestStatistic +from linearmodels.tests.iv._utility import generate_data + + +@pytest.fixture(scope="module") +def data(): + return generate_data() + + +@pytest.fixture(scope="module") +def res_robust(data): + return IVGMM(data.dep, data.exog, data.endog, data.instr).fit( + cov_type="robust" + ) + + +@pytest.fixture(scope="module") +def res_unadjusted(data): + return IVGMM(data.dep, data.exog, data.endog, data.instr).fit( + cov_type="unadjusted" + ) + + +@pytest.fixture(scope="module") +def res_clustered(data): + return IVGMM( + data.dep, data.exog, data.endog, data.instr + ).fit(cov_type="clustered", clusters=data.clusters) + + +@pytest.fixture(scope="module") +def res_kernel(data): + return IVGMM(data.dep, data.exog, data.endog, data.instr).fit( + cov_type="kernel", kernel="bartlett", bandwidth=4 + ) + + +# --------------------------------------------------------------------------- +# Attribute existence and types +# --------------------------------------------------------------------------- + + +def test_robust_j_stat_is_wald_test_statistic(res_robust): + assert isinstance(res_robust.robust_j_stat, WaldTestStatistic) + + +def test_standard_j_stat_still_present(res_robust): + assert isinstance(res_robust.j_stat, WaldTestStatistic) + + +def test_robust_j_stat_stat_is_finite(res_robust): + assert np.isfinite(res_robust.robust_j_stat.stat) + + +def test_robust_j_stat_pval_in_unit_interval(res_robust): + pval = res_robust.robust_j_stat.pval + assert 0.0 <= pval <= 1.0 + + +def test_robust_j_stat_df_equals_overidentification_degree(res_robust, data): + ninstr = data.instr.shape[1] + data.exog.shape[1] + nendog = data.endog.shape[1] + expected_df = ninstr - nendog - data.exog.shape[1] + # df = total instruments - total params = (nexog+ninstr) - (nexog+nendog) + # = ninstr - nendog + actual_df = res_robust.robust_j_stat.df + assert actual_df == data.instr.shape[1] - data.endog.shape[1] + + +def test_robust_j_stat_stat_positive(res_robust): + assert res_robust.robust_j_stat.stat >= 0.0 + + +# --------------------------------------------------------------------------- +# Formula verification: J* = n * g_bar' @ S_c^{-1} @ g_bar +# --------------------------------------------------------------------------- + + +def test_robust_j_stat_formula_matches_manual_calculation(res_robust, data): + """Verify the statistic equals the explicit closed-form expression.""" + mod = IVGMM(data.dep, data.exog, data.endog, data.instr) + res = mod.fit(cov_type="robust") + + nobs = data.dep.shape[0] + params = np.asarray(res.params)[:, None] + x = mod._wx + z = mod._wz + y = mod._wy + + eps = y - x @ params + g_bar = (z * eps).mean(0) + + sc_est = HeteroskedasticWeightMatrix(center=True) + s_c = sc_est.weight_matrix(x, z, eps) + expected = float(nobs * g_bar @ pinv(s_c) @ g_bar) + + assert_allclose(res.robust_j_stat.stat, expected, rtol=1e-6) + + +# --------------------------------------------------------------------------- +# Under correct specification: J* ≈ J (both are chi2_q consistent) +# --------------------------------------------------------------------------- + + +def test_robust_and_standard_j_have_same_df(res_robust): + """Both tests have the same degrees of freedom under correct specification.""" + assert res_robust.j_stat.df == res_robust.robust_j_stat.df + + +def test_robust_j_stat_nonnegative_all_cov_types( + res_robust, res_unadjusted, res_clustered, res_kernel +): + for res in [res_robust, res_unadjusted, res_clustered, res_kernel]: + assert res.robust_j_stat.stat >= 0.0 + + +def test_robust_j_pval_all_cov_types( + res_robust, res_unadjusted, res_clustered, res_kernel +): + for res in [res_robust, res_unadjusted, res_clustered, res_kernel]: + pval = res.robust_j_stat.pval + assert 0.0 <= pval <= 1.0 + + +# --------------------------------------------------------------------------- +# Under misspecification: J* diverges, standard J saturates +# --------------------------------------------------------------------------- + + +def test_robust_j_algebraic_identity_iterated_gmm(): + """ + For iterated GMM at convergence the identity + + J* = J_std * n / (n - J_std) + + follows from Ŝ = Ŝ_c + g_bar * g_bar' (Sherman-Morrison inversion). + + DGP: y = 2*x + z_invalid + noise where z_invalid enters the outcome + directly, making the IV moment condition violated — both test statistics + are large enough to make the identity observable. + """ + rng = np.random.default_rng(42) + nobs = 50_000 + + z_valid = rng.standard_normal(nobs) # valid instrument + z_invalid = rng.standard_normal(nobs) # invalid: correlated with eps + z_extra = rng.standard_normal(nobs) # extra valid instrument + noise_x = rng.standard_normal(nobs) + noise_y = rng.standard_normal(nobs) + + x = 0.8 * z_valid + 0.6 * noise_x # endog, identified by z_valid + y = 2.0 * x + 1.0 * z_invalid + noise_y # z_invalid in outcome -> misspecified + + exog = np.ones((nobs, 1)) + endog = x[:, None] + instr = np.column_stack([z_invalid, z_valid, z_extra]) + + res = IVGMM(y[:, None], exog, endog, instr).fit(cov_type="robust", iter_limit=50) + + j_std = res.j_stat.stat + j_star = res.robust_j_stat.stat + + expected_j_star = j_std * nobs / (nobs - j_std) + assert_allclose(j_star, expected_j_star, rtol=1e-4) + + # Under misspecification both are large; J_std < n (bounded), J* >= J_std + assert j_std < nobs + assert j_star >= j_std + + +# --------------------------------------------------------------------------- +# IVGMMCUE also exposes robust_j_stat +# --------------------------------------------------------------------------- + + +def test_ivgmmcue_has_robust_j_stat(data): + res = IVGMMCUE(data.dep, data.exog, data.endog, data.instr).fit( + cov_type="robust" + ) + assert isinstance(res.robust_j_stat, WaldTestStatistic) + assert np.isfinite(res.robust_j_stat.stat) + assert res.robust_j_stat.stat >= 0.0 + + +# --------------------------------------------------------------------------- +# Summary display includes HL J-statistic rows +# --------------------------------------------------------------------------- + + +def test_summary_contains_hl_j_statistic(res_robust): + smry_str = str(res_robust.summary) + assert "HL J-statistic" in smry_str + + +def test_summary_contains_j_statistic(res_robust): + smry_str = str(res_robust.summary) + assert "J-statistic" in smry_str