From 59ae2e07612d1d46463166a4327d1c41de2e31d8 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Wed, 27 May 2026 15:53:12 -0400 Subject: [PATCH 1/3] Add weight-coverage walker to converter test suite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For each test fixture with a checkpoint format, materialise the Fast-LLM base model (CPU, ParameterMeta — no distributed setup) and assert every parameter is consumed by some leaf WeightConverter emitted by base_model_converter_class.get_converters(config). Runtime-tied parameters count as covered when any group member has a converter, matching export behaviour. Gemma4 is xfailed against pre-existing coverage gaps in its declarations. Co-Authored-By: Claude Opus 4.7 --- tests/models/test_converters.py | 71 +++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/tests/models/test_converters.py b/tests/models/test_converters.py index fcc9adf9f..53c7e41e2 100644 --- a/tests/models/test_converters.py +++ b/tests/models/test_converters.py @@ -7,6 +7,11 @@ * Architecture-hint fields on ``cls.fast_llm_config_class`` are all consumed by some declaration. * OptionalConfigConverter sentinels match the resolved field default. Otherwise an exported value equal to the sentinel becomes absent on disk and re-imports as a different default, silently breaking round-trip. + +Plus an end-to-end weight-coverage walker (:func:`test_format_weight_coverage`) — for each test +fixture with a checkpoint format, materialise the Fast-LLM model and assert every parameter is consumed +by some leaf :class:`WeightConverter`. Catches the "silent drop" failure mode where a model param has +no converter and ``_convert_state_dict`` skips it on export. """ import typing @@ -24,9 +29,12 @@ _safe_set_nested_dict_value, ) from fast_llm.engine.checkpoint.huggingface import HuggingfaceStateDictCheckpointHandler +from fast_llm.engine.distributed.config import DistributedConfig from fast_llm.layers.attention.config import AttentionConfig from fast_llm.layers.block.config import PatternBlockSequenceConfig from fast_llm.layers.decoder.config import DecoderBlockConfig, StochasticMixerConfig +from fast_llm.models.gpt.conversion.config import Gemma4CheckpointFormat +from tests.utils.model_configs import MODEL_CONFIGS # Configs that don't default-construct cleanly need a minimal-valid factory. _DEFAULT_FACTORIES: dict[type, typing.Callable[[], typing.Any]] = { @@ -156,6 +164,69 @@ def test_safe_set_nested_dict_value_collision() -> None: _safe_set_nested_dict_value(out, ("nested", "key"), "other") +_FIXTURES_WITH_FORMAT = [name for name, cfg in MODEL_CONFIGS.items() if cfg.checkpoint_format is not None] + + +def _weight_coverage_param(fixture_name: str) -> typing.Any: + handler = MODEL_CONFIGS[fixture_name].checkpoint_format.get_handler_class() + if handler.format is Gemma4CheckpointFormat: + return pytest.param( + fixture_name, + marks=pytest.mark.xfail( + strict=True, + reason=( + "Gemma4 converters drop LayerNorm biases and non-MoE norm_2 on the full_attention " + "branch of the test fixture, and declare ``output_scale`` unconditionally even when " + "the block disables it." + ), + ), + ) + return pytest.param(fixture_name) + + +@pytest.mark.parametrize("fixture_name", [_weight_coverage_param(n) for n in _FIXTURES_WITH_FORMAT]) +def test_format_weight_coverage(fixture_name: str) -> None: + """Every Fast-LLM parameter must be consumed by some :class:`WeightConverter`. + + Materialises the fixture's base model (CPU, meta tensors via ``ParameterMeta`` — no distributed + setup) and compares ``named_parameters()`` against the set of ``fast_llm_name`` entries emitted by + ``base_model_converter_class.get_converters(config)``. Runtime-tied parameters + (``BaseModel.get_tied_parameters``) count as covered if any member of their group has a converter, + matching the export-time behaviour where a single shared weight is serialised once. + """ + model_testing_config = MODEL_CONFIGS[fixture_name] + handler = model_testing_config.checkpoint_format.get_handler_class() + base_model_config = model_testing_config.base_model_config_class.from_dict( + model_testing_config.config_dict["model"]["base_model"] + ) + base_model = base_model_config.base_model_class(base_model_config, DistributedConfig()) + + param_id_to_name = {id(parameter): name for name, parameter in base_model.named_parameters()} + model_names = set(param_id_to_name.values()) + tied_groups = [ + frozenset(param_id_to_name[id(parameter)] for parameter in parameters) + for parameters in base_model.get_tied_parameters().values() + ] + + consumed: set[str] = set() + for leaf in handler.base_model_converter_class.get_converters(base_model_config): + consumed.update(leaf.fast_llm_name) + + # Tied closure: any group with at least one explicit consumer is covered in full. + covered = set(consumed) + for group in tied_groups: + if group & consumed: + covered |= group + + missing = sorted(model_names - covered) + phantom = sorted(consumed - model_names) + assert not missing and not phantom, ( + f"{handler.__name__}: weight coverage mismatch — " + f"Fast-LLM params with no converter: {missing}; " + f"converters with no matching param: {phantom}" + ) + + def test_llama_export_rejects_mismatched_block_and_head_norm_epsilon() -> None: """End-to-end regression: a Llama config with mismatched block/head normalization epsilon must fail to export. Both the decoder Custom and the head Nested write ``rms_norm_eps`` into the same HF dict; a From 49aab4b6e4b67bf1982d7a1182c005075e6f1972 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 28 May 2026 11:49:23 -0400 Subject: [PATCH 2/3] Inline weight-coverage param helper, drop redundant inline import Fold the xfail-or-bare decision into the parametrize comprehension so the gemma4 case is visible in one place, and drop the now-redundant inline ``MODEL_CONFIGS`` import from the existing llama export test. Co-Authored-By: Claude Opus 4.7 --- tests/models/test_converters.py | 40 ++++++++++++++++----------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/tests/models/test_converters.py b/tests/models/test_converters.py index 53c7e41e2..0d0b40481 100644 --- a/tests/models/test_converters.py +++ b/tests/models/test_converters.py @@ -164,27 +164,28 @@ def test_safe_set_nested_dict_value_collision() -> None: _safe_set_nested_dict_value(out, ("nested", "key"), "other") -_FIXTURES_WITH_FORMAT = [name for name, cfg in MODEL_CONFIGS.items() if cfg.checkpoint_format is not None] - - -def _weight_coverage_param(fixture_name: str) -> typing.Any: - handler = MODEL_CONFIGS[fixture_name].checkpoint_format.get_handler_class() - if handler.format is Gemma4CheckpointFormat: - return pytest.param( - fixture_name, - marks=pytest.mark.xfail( - strict=True, - reason=( - "Gemma4 converters drop LayerNorm biases and non-MoE norm_2 on the full_attention " - "branch of the test fixture, and declare ``output_scale`` unconditionally even when " - "the block disables it." +@pytest.mark.parametrize( + "fixture_name", + [ + ( + pytest.param( + name, + marks=pytest.mark.xfail( + strict=True, + reason=( + "Gemma4 converters drop LayerNorm biases and non-MoE norm_2 on the full_attention " + "branch of the test fixture, and declare ``output_scale`` unconditionally even when " + "the block disables it." + ), ), - ), + ) + if cfg.checkpoint_format is Gemma4CheckpointFormat + else name ) - return pytest.param(fixture_name) - - -@pytest.mark.parametrize("fixture_name", [_weight_coverage_param(n) for n in _FIXTURES_WITH_FORMAT]) + for name, cfg in MODEL_CONFIGS.items() + if cfg.checkpoint_format is not None + ], +) def test_format_weight_coverage(fixture_name: str) -> None: """Every Fast-LLM parameter must be consumed by some :class:`WeightConverter`. @@ -235,7 +236,6 @@ def test_llama_export_rejects_mismatched_block_and_head_norm_epsilon() -> None: from fast_llm.models.gpt.config import GPTBaseModelConfig from fast_llm.models.gpt.conversion.llama import LlamaBaseModelConverter - from tests.utils.model_configs import MODEL_CONFIGS cfg = copy.deepcopy(MODEL_CONFIGS["llama"].config_dict["model"]["base_model"]) # Default head normalization inherits the block default (1e-5); pin head to a different value. From ac04292e3dcb6978dfbcb8edd8d8a4370015a613 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 28 May 2026 15:03:39 -0400 Subject: [PATCH 3/3] Deepcopy per-block override dicts in the gemma4 fixture, drop walker xfail The gemma4 fixture spread ``_gemma4_block_overrides`` and ``_gemma4_mixer_overrides`` into both ``sliding_attention`` and ``full_attention`` blocks without per-use deepcopy. Dict-spread copies top-level keys but values are shared references, so the two blocks' nested override dicts aliased. ``Config._from_dict`` then mutates its input via ``pop`` when extracting fields, emptying the shared sub-dicts after the first block's resolution. The second block silently fell back to type defaults: LayerNormalizationConfig where the fixture said RMSNormalizationConfig, ``output_scale.enabled=None`` where it said ``True``. The walker xfail on ``test_format_weight_coverage[gemma4]`` had been blamed (in PR description and the prior xfail reason) on three converter declaration gaps; investigation showed those were all symptoms of the fixture aliasing. With the per-spread deepcopy, both blocks resolve as intended (RMSNorm everywhere, no LayerNorm biases, ``output_scale.enabled=True``), and the walker passes for gemma4 without any converter changes. The underlying ``Config._from_dict`` mutate-input behaviour is a footgun beyond this fixture and is worth addressing separately. Co-Authored-By: Claude Opus 4.7 --- tests/models/test_converters.py | 22 +--------------------- tests/utils/model_configs.py | 13 +++++++++---- 2 files changed, 10 insertions(+), 25 deletions(-) diff --git a/tests/models/test_converters.py b/tests/models/test_converters.py index 0d0b40481..8b23ab222 100644 --- a/tests/models/test_converters.py +++ b/tests/models/test_converters.py @@ -33,7 +33,6 @@ from fast_llm.layers.attention.config import AttentionConfig from fast_llm.layers.block.config import PatternBlockSequenceConfig from fast_llm.layers.decoder.config import DecoderBlockConfig, StochasticMixerConfig -from fast_llm.models.gpt.conversion.config import Gemma4CheckpointFormat from tests.utils.model_configs import MODEL_CONFIGS # Configs that don't default-construct cleanly need a minimal-valid factory. @@ -165,26 +164,7 @@ def test_safe_set_nested_dict_value_collision() -> None: @pytest.mark.parametrize( - "fixture_name", - [ - ( - pytest.param( - name, - marks=pytest.mark.xfail( - strict=True, - reason=( - "Gemma4 converters drop LayerNorm biases and non-MoE norm_2 on the full_attention " - "branch of the test fixture, and declare ``output_scale`` unconditionally even when " - "the block disables it." - ), - ), - ) - if cfg.checkpoint_format is Gemma4CheckpointFormat - else name - ) - for name, cfg in MODEL_CONFIGS.items() - if cfg.checkpoint_format is not None - ], + "fixture_name", [name for name, cfg in MODEL_CONFIGS.items() if cfg.checkpoint_format is not None] ) def test_format_weight_coverage(fixture_name: str) -> None: """Every Fast-LLM parameter must be consumed by some :class:`WeightConverter`. diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py index a7ff05478..77828e0bd 100644 --- a/tests/utils/model_configs.py +++ b/tests/utils/model_configs.py @@ -1051,22 +1051,27 @@ def update_and_add_testing_config( ("model", "base_model", "decoder"): { "type": "pattern", "blocks": { + # Sub-dicts in ``_gemma4_block_overrides`` / ``_gemma4_mixer_overrides`` are deepcopied + # per block. Without this the two blocks' nested override dicts would alias, and + # ``Config._from_dict`` (which mutates its input via ``pop``) would consume the + # shared sub-dicts when processing the first block, leaving the second to silently + # fall back to type defaults (LayerNorm / output_scale.enabled=None). "sliding_attention": { **copy.deepcopy(_llama_block), - **_gemma4_block_overrides, + **copy.deepcopy(_gemma4_block_overrides), "mixer": { **copy.deepcopy(_llama_block["mixer"]), - **_gemma4_mixer_overrides, + **copy.deepcopy(_gemma4_mixer_overrides), "window_size": 128, }, "mlp": copy.deepcopy(_gemma4_moe_mlp), }, "full_attention": { **copy.deepcopy(_llama_block), - **_gemma4_block_overrides, + **copy.deepcopy(_gemma4_block_overrides), "mixer": { **copy.deepcopy(_llama_block["mixer"]), - **_gemma4_mixer_overrides, + **copy.deepcopy(_gemma4_mixer_overrides), "rotary": {"type": "proportional", "partial_rotary_factor": 0.25}, }, "mlp": copy.deepcopy(_gemma4_moe_mlp),