From 59ae2e07612d1d46463166a4327d1c41de2e31d8 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com>
Date: Wed, 27 May 2026 15:53:12 -0400
Subject: [PATCH 1/3] Add weight-coverage walker to converter test suite
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For each test fixture with a checkpoint format, materialise the Fast-LLM
base model (CPU, ParameterMeta — no distributed setup) and assert every
parameter is consumed by some leaf WeightConverter emitted by
base_model_converter_class.get_converters(config). Runtime-tied parameters
count as covered when any group member has a converter, matching export
behaviour. The Gemma4 fixture is marked as a strict xfail because of
pre-existing coverage gaps in its converter declarations.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 tests/models/test_converters.py | 71 +++++++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)

diff --git a/tests/models/test_converters.py b/tests/models/test_converters.py
index fcc9adf9f..53c7e41e2 100644
--- a/tests/models/test_converters.py
+++ b/tests/models/test_converters.py
@@ -7,6 +7,11 @@
 * Architecture-hint fields on ``cls.fast_llm_config_class`` are all consumed by some declaration.
 * OptionalConfigConverter sentinels match the resolved field default. Otherwise an exported value equal
   to the sentinel becomes absent on disk and re-imports as a different default, silently breaking round-trip.
+
+Plus an end-to-end weight-coverage walker (:func:`test_format_weight_coverage`) — for each test
+fixture with a checkpoint format, materialise the Fast-LLM model and assert every parameter is consumed
+by some leaf :class:`WeightConverter`. Catches the "silent drop" failure mode where a model param has
+no converter and ``_convert_state_dict`` skips it on export.
 """
 
 import typing
@@ -24,9 +29,12 @@
     _safe_set_nested_dict_value,
 )
 from fast_llm.engine.checkpoint.huggingface import HuggingfaceStateDictCheckpointHandler
+from fast_llm.engine.distributed.config import DistributedConfig
 from fast_llm.layers.attention.config import AttentionConfig
 from fast_llm.layers.block.config import PatternBlockSequenceConfig
 from fast_llm.layers.decoder.config import DecoderBlockConfig, StochasticMixerConfig
+from fast_llm.models.gpt.conversion.config import Gemma4CheckpointFormat
+from tests.utils.model_configs import MODEL_CONFIGS
 
 # Configs that don't default-construct cleanly need a minimal-valid factory.
 _DEFAULT_FACTORIES: dict[type, typing.Callable[[], typing.Any]] = {
@@ -156,6 +164,69 @@ def test_safe_set_nested_dict_value_collision() -> None:
         _safe_set_nested_dict_value(out, ("nested", "key"), "other")
 
 
+_FIXTURES_WITH_FORMAT = [name for name, cfg in MODEL_CONFIGS.items() if cfg.checkpoint_format is not None]
+
+
+def _weight_coverage_param(fixture_name: str) -> typing.Any:
+    handler = MODEL_CONFIGS[fixture_name].checkpoint_format.get_handler_class()
+    if handler.format is Gemma4CheckpointFormat:
+        return pytest.param(
+            fixture_name,
+            marks=pytest.mark.xfail(
+                strict=True,
+                reason=(
+                    "Gemma4 converters drop LayerNorm biases and non-MoE norm_2 on the full_attention "
+                    "branch of the test fixture, and declare ``output_scale`` unconditionally even when "
+                    "the block disables it."
+                ),
+            ),
+        )
+    return pytest.param(fixture_name)
+
+
+@pytest.mark.parametrize("fixture_name", [_weight_coverage_param(n) for n in _FIXTURES_WITH_FORMAT])
+def test_format_weight_coverage(fixture_name: str) -> None:
+    """Every Fast-LLM parameter must be consumed by some :class:`WeightConverter`.
+
+    Materialises the fixture's base model (CPU, meta tensors via ``ParameterMeta`` — no distributed
+    setup) and compares ``named_parameters()`` against the set of ``fast_llm_name`` entries emitted by
+    ``base_model_converter_class.get_converters(config)``. Runtime-tied parameters
+    (``BaseModel.get_tied_parameters``) count as covered if any member of their group has a converter,
+    matching the export-time behaviour where a single shared weight is serialised once.
+    """
+    model_testing_config = MODEL_CONFIGS[fixture_name]
+    handler = model_testing_config.checkpoint_format.get_handler_class()
+    base_model_config = model_testing_config.base_model_config_class.from_dict(
+        model_testing_config.config_dict["model"]["base_model"]
+    )
+    base_model = base_model_config.base_model_class(base_model_config, DistributedConfig())
+
+    param_id_to_name = {id(parameter): name for name, parameter in base_model.named_parameters()}
+    model_names = set(param_id_to_name.values())
+    tied_groups = [
+        frozenset(param_id_to_name[id(parameter)] for parameter in parameters)
+        for parameters in base_model.get_tied_parameters().values()
+    ]
+
+    consumed: set[str] = set()
+    for leaf in handler.base_model_converter_class.get_converters(base_model_config):
+        consumed.update(leaf.fast_llm_name)
+
+    # Tied closure: any group with at least one explicit consumer is covered in full.
+    covered = set(consumed)
+    for group in tied_groups:
+        if group & consumed:
+            covered |= group
+
+    missing = sorted(model_names - covered)
+    phantom = sorted(consumed - model_names)
+    assert not missing and not phantom, (
+        f"{handler.__name__}: weight coverage mismatch — "
+        f"Fast-LLM params with no converter: {missing}; "
+        f"converters with no matching param: {phantom}"
+    )
+
+
 def test_llama_export_rejects_mismatched_block_and_head_norm_epsilon() -> None:
     """End-to-end regression: a Llama config with mismatched block/head normalization epsilon must fail to
     export. Both the decoder Custom and the head Nested write ``rms_norm_eps`` into the same HF dict; a

From 49aab4b6e4b67bf1982d7a1182c005075e6f1972 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com>
Date: Thu, 28 May 2026 11:49:23 -0400
Subject: [PATCH 2/3] Inline weight-coverage param helper, drop redundant
 inline import

Fold the xfail-or-bare decision into the parametrize comprehension so the
gemma4 case is visible in one place, and drop the now-redundant inline
``MODEL_CONFIGS`` import from the existing llama export test.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 tests/models/test_converters.py | 40 ++++++++++++++++-----------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/tests/models/test_converters.py b/tests/models/test_converters.py
index 53c7e41e2..0d0b40481 100644
--- a/tests/models/test_converters.py
+++ b/tests/models/test_converters.py
@@ -164,27 +164,28 @@ def test_safe_set_nested_dict_value_collision() -> None:
         _safe_set_nested_dict_value(out, ("nested", "key"), "other")
 
 
-_FIXTURES_WITH_FORMAT = [name for name, cfg in MODEL_CONFIGS.items() if cfg.checkpoint_format is not None]
-
-
-def _weight_coverage_param(fixture_name: str) -> typing.Any:
-    handler = MODEL_CONFIGS[fixture_name].checkpoint_format.get_handler_class()
-    if handler.format is Gemma4CheckpointFormat:
-        return pytest.param(
-            fixture_name,
-            marks=pytest.mark.xfail(
-                strict=True,
-                reason=(
-                    "Gemma4 converters drop LayerNorm biases and non-MoE norm_2 on the full_attention "
-                    "branch of the test fixture, and declare ``output_scale`` unconditionally even when "
-                    "the block disables it."
+@pytest.mark.parametrize(
+    "fixture_name",
+    [
+        (
+            pytest.param(
+                name,
+                marks=pytest.mark.xfail(
+                    strict=True,
+                    reason=(
+                        "Gemma4 converters drop LayerNorm biases and non-MoE norm_2 on the full_attention "
+                        "branch of the test fixture, and declare ``output_scale`` unconditionally even when "
+                        "the block disables it."
+                    ),
                 ),
-            ),
+            )
+            if cfg.checkpoint_format is Gemma4CheckpointFormat
+            else name
         )
-    return pytest.param(fixture_name)
-
-
-@pytest.mark.parametrize("fixture_name", [_weight_coverage_param(n) for n in _FIXTURES_WITH_FORMAT])
+        for name, cfg in MODEL_CONFIGS.items()
+        if cfg.checkpoint_format is not None
+    ],
+)
 def test_format_weight_coverage(fixture_name: str) -> None:
     """Every Fast-LLM parameter must be consumed by some :class:`WeightConverter`.
 
@@ -235,7 +236,6 @@ def test_llama_export_rejects_mismatched_block_and_head_norm_epsilon() -> None:
 
     from fast_llm.models.gpt.config import GPTBaseModelConfig
     from fast_llm.models.gpt.conversion.llama import LlamaBaseModelConverter
-    from tests.utils.model_configs import MODEL_CONFIGS
 
     cfg = copy.deepcopy(MODEL_CONFIGS["llama"].config_dict["model"]["base_model"])
     # Default head normalization inherits the block default (1e-5); pin head to a different value.

From ac04292e3dcb6978dfbcb8edd8d8a4370015a613 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com>
Date: Thu, 28 May 2026 15:03:39 -0400
Subject: [PATCH 3/3] Deepcopy per-block override dicts in the gemma4 fixture,
 drop walker xfail

The gemma4 fixture spread ``_gemma4_block_overrides`` and ``_gemma4_mixer_overrides``
into both ``sliding_attention`` and ``full_attention`` blocks without per-use
deepcopy. Dict-spread is a shallow copy: top-level keys are copied but the
values are shared references, so the two blocks' nested override dicts were
aliased. ``Config._from_dict`` then mutates its input via ``pop`` when
extracting fields, emptying the shared sub-dicts after the first block's
resolution. The second block silently fell back to type defaults:
LayerNormalizationConfig where the fixture said RMSNormalizationConfig, and
``output_scale.enabled=None`` where it said ``True``.

The walker xfail on ``test_format_weight_coverage[gemma4]`` had been blamed
(in PR description and the prior xfail reason) on three converter declaration
gaps; investigation showed those were all symptoms of the fixture aliasing.
With the per-spread deepcopy, both blocks resolve as intended (RMSNorm
everywhere, no LayerNorm biases, ``output_scale.enabled=True``), and the
walker passes for gemma4 without any converter changes.

The underlying ``Config._from_dict`` mutate-input behaviour is a footgun
beyond this fixture and is worth addressing separately.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 tests/models/test_converters.py | 22 +---------------------
 tests/utils/model_configs.py    | 13 +++++++++----
 2 files changed, 10 insertions(+), 25 deletions(-)

diff --git a/tests/models/test_converters.py b/tests/models/test_converters.py
index 0d0b40481..8b23ab222 100644
--- a/tests/models/test_converters.py
+++ b/tests/models/test_converters.py
@@ -33,7 +33,6 @@
 from fast_llm.layers.attention.config import AttentionConfig
 from fast_llm.layers.block.config import PatternBlockSequenceConfig
 from fast_llm.layers.decoder.config import DecoderBlockConfig, StochasticMixerConfig
-from fast_llm.models.gpt.conversion.config import Gemma4CheckpointFormat
 from tests.utils.model_configs import MODEL_CONFIGS
 
 # Configs that don't default-construct cleanly need a minimal-valid factory.
@@ -165,26 +164,7 @@ def test_safe_set_nested_dict_value_collision() -> None:
 
 
 @pytest.mark.parametrize(
-    "fixture_name",
-    [
-        (
-            pytest.param(
-                name,
-                marks=pytest.mark.xfail(
-                    strict=True,
-                    reason=(
-                        "Gemma4 converters drop LayerNorm biases and non-MoE norm_2 on the full_attention "
-                        "branch of the test fixture, and declare ``output_scale`` unconditionally even when "
-                        "the block disables it."
-                    ),
-                ),
-            )
-            if cfg.checkpoint_format is Gemma4CheckpointFormat
-            else name
-        )
-        for name, cfg in MODEL_CONFIGS.items()
-        if cfg.checkpoint_format is not None
-    ],
+    "fixture_name", [name for name, cfg in MODEL_CONFIGS.items() if cfg.checkpoint_format is not None]
 )
 def test_format_weight_coverage(fixture_name: str) -> None:
     """Every Fast-LLM parameter must be consumed by some :class:`WeightConverter`.
diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py
index a7ff05478..77828e0bd 100644
--- a/tests/utils/model_configs.py
+++ b/tests/utils/model_configs.py
@@ -1051,22 +1051,27 @@ def update_and_add_testing_config(
         ("model", "base_model", "decoder"): {
             "type": "pattern",
             "blocks": {
+                # Sub-dicts in ``_gemma4_block_overrides`` / ``_gemma4_mixer_overrides`` are deepcopied
+                # per block. Without this the two blocks' nested override dicts would alias, and
+                # ``Config._from_dict`` (which mutates its input via ``pop``) would consume the
+                # shared sub-dicts when processing the first block, leaving the second to silently
+                # fall back to type defaults (LayerNorm / output_scale.enabled=None).
                 "sliding_attention": {
                     **copy.deepcopy(_llama_block),
-                    **_gemma4_block_overrides,
+                    **copy.deepcopy(_gemma4_block_overrides),
                     "mixer": {
                         **copy.deepcopy(_llama_block["mixer"]),
-                        **_gemma4_mixer_overrides,
+                        **copy.deepcopy(_gemma4_mixer_overrides),
                         "window_size": 128,
                     },
                     "mlp": copy.deepcopy(_gemma4_moe_mlp),
                 },
                 "full_attention": {
                     **copy.deepcopy(_llama_block),
-                    **_gemma4_block_overrides,
+                    **copy.deepcopy(_gemma4_block_overrides),
                     "mixer": {
                         **copy.deepcopy(_llama_block["mixer"]),
-                        **_gemma4_mixer_overrides,
+                        **copy.deepcopy(_gemma4_mixer_overrides),
                         "rotary": {"type": "proportional", "partial_rotary_factor": 0.25},
                     },
                     "mlp": copy.deepcopy(_gemma4_moe_mlp),