NVIDIA · juhi10071998 · May 28, 2026 · May 28, 2026 · May 28, 2026 · May 29, 2026
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -8,6 +8,7 @@ Changelog
 
 - Add the ``day0-release`` agent skill (``.agents/skills/day0-release/``), a deterministic end-to-end driver that chains the PTQ → evaluation → comparison skills (the evaluation stage deploys the checkpoint itself) with an enforced gate after each stage and returns a publish decision (ACCEPT / REGRESSION / ANOMALOUS / INFEASIBLE). Ships three GPU-free, unit-tested gate scripts (``gate_ptq.py``, ``gate_run.py``, ``gate_compare.py``) that validate checkpoint coverage, evaluation-run completeness, and baseline-vs-candidate accuracy threshold. v1 reports and stops on regression; the recipe-search loop is deferred.
 - Add **streaming** speculative-decoding training (EAGLE3 / DFlash): the draft trains on base-model hidden states produced on the fly by a co-located ``vllm serve`` (no disk dump), moved trainer-side over NIXL RDMA, scaling to multi-node (dedicated serve replicas + DDP trainers). New launcher examples for NVFP4 Kimi-K2.5 / K2.6 on GB200/aarch64 under ``tools/launcher/examples/moonshotai/``.
+- Add tied-weight PTQ and HF-checkpoint export support for block-diffusion encoder-decoder LLMs (e.g. DiffusionGemma) whose encoder/decoder stacks share parameters via HF ``_tied_weights_keys``. ``_export_quantized_weight`` and ``_export_fused_experts`` now alias bit-identical packed ``weight`` / ``weight_scale`` / ``weight_scale_2`` buffers across modules sharing a source weight ``data_ptr()`` so the downstream ``postprocess_state_dict`` dedup catches them (~42% storage reduction on ``nvfp4_experts_only`` for tied 26B MoE checkpoints). New ``sync_tied_input_amax`` helper max-merges per-side ``input_quantizer.amax`` across tied modules before export so single-backbone consumers that load one ``input_scale`` per parameter don't clip either side. Opt-in ``--canonical_tied_naming`` flag (default off) reorders the state_dict so canonical-side keys per HF's ``_tied_weights_keys`` declaration win the data_ptr dedup. ``default_disabled_quantizers`` gains a ``*self_conditioning*`` wildcard companion to the upstream vision excludes (PR #1691). ``hf_ptq.py`` also unwraps ``ModelOutput`` dataclasses from ``.generate()`` so the preview decode works on diffusion models. Non-tied models see no behavioral change.
 
 0.45 (2026-06-xx)
 ^^^^^^^^^^^^^^^^^

@@ -806,7 +806,13 @@ def is_model_on_gpu(model) -> bool:
 
 
 def is_enc_dec(model_type) -> bool:
-    """Return if the model is a encoder-decoder model."""
+    """Return whether the model_type uses encoder-decoder-style preview decode.
+
+    Controls whether ``hf_ptq.py`` slices off the prompt prefix from
+    ``.generate()`` output. ``diffusion_gemma`` is structurally encoder-decoder
+    but returns prompt+canvas concatenated, so it stays OFF this list (AR-style
+    decode applies).
+    """
     return model_type in ["t5", "bart", "whisper"]
 
 

@@ -774,6 +774,7 @@ def export_quantized(
                     full_model,
                     export_dir=export_path,
                     extra_state_dict=mtp_state_dict,
+                    canonical_tied_naming=args.canonical_tied_naming,
                 )
 
                 if args.qformat == "w4a16_nvfp4":
@@ -941,6 +942,11 @@ def input_decode(input_ids):
             raise ValueError("The processor or tokenizer must be set")
 
     def output_decode(generated_ids, input_shape):
+        # Some `.generate()` returns a ModelOutput dataclass (e.g. DiffusionGemma);
+        # unwrap to the token tensor so downstream slicing works uniformly.
+        if hasattr(generated_ids, "sequences"):
+            generated_ids = generated_ids.sequences
+
         if is_enc_dec(model_type):
             if processor is not None and isinstance(processor, WhisperProcessor):
                 return processor.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
@@ -1252,6 +1258,19 @@ def parse_args() -> argparse.Namespace:
         default=512,
     )
     parser.add_argument("--export_path", default="exported_model")
+    parser.add_argument(
+        "--canonical_tied_naming",
+        type=lambda s: s.lower() in ("1", "true", "yes"),
+        default=False,
+        help=(
+            "If True, reorder the exported state_dict so tied-weight aliases "
+            "dedup to the canonical side declared in the model's HF "
+            "_tied_weights_keys (e.g. decoder-side for DiffusionGemma4). Off "
+            "by default to avoid renaming exported keys for models whose "
+            "downstream consumers expect the legacy (registration-order) "
+            "winner."
+        ),
+    )
     parser.add_argument(
         "--dataset",
         help=(

@@ -33,6 +33,9 @@
     "Qwen3Next": "qwen3next",
     "QWen": "qwen",
     "RecurrentGemma": "recurrentgemma",
+    # DiffusionGemma must come before "Gemma" — get_model_type substring-matches
+    # in order, and "gemma" is a substring of "diffusiongemma".
+    "DiffusionGemma": "diffusion_gemma",
     "Gemma3": "gemma3",
     "Gemma2": "gemma2",
     "Gemma": "gemma",

@@ -42,13 +42,24 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None:
            {E}.gate_proj.weight, {E}.gate_proj.weight_scale, ...
            {E}.up_proj.weight, {E}.up_proj.weight_scale, ...
            {E}.down_proj.weight, {E}.down_proj.weight_scale, ...
+
+    Tied-experts dedup: when multiple fused-expert modules share their 3-D
+    source params via HF ``_tied_weights_keys``, the unpacking creates fresh
+    per-expert tensors that break the tie. We cache the source ``data_ptr()``
+    at entry and on a later cache hit alias the per-expert ``weight`` /
+    ``weight_scale`` / ``weight_scale_2`` back to the prior module so
+    downstream dedup catches them. ``input_scale`` is aliased too (step 5).
     """
     from modelopt.torch.export.unified_export_hf import _export_quantized_weight
     from modelopt.torch.quantization.plugins.huggingface import _get_fused_expert_intermediate_dim
 
     n = module.num_experts
     expert_dim = _get_fused_expert_intermediate_dim(module)
 
+    # Capture source tensor identities BEFORE unpacking (the source
+    # attrs are deleted at the end of this function).
+    _source_key = (module.gate_up_proj.data_ptr(), module.down_proj.data_ptr())
+
     # 1. Shared input quantizers — one per projection type, shared across all experts.
     gate_up_input_q = module.gate_up_proj_input_quantizer
     down_input_q = module.down_proj_input_quantizer
@@ -178,6 +189,46 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None:
         if hasattr(module, attr):
             delattr(module, attr)
 
+    # 5. Tied-experts dedup: if this module's source params have been seen
+    # before, alias the bit-identical per-expert buffers (weight,
+    # weight_scale, weight_scale_2, input_scale) to the previously-unpacked
+    # module. input_scale is safe to alias because sync_tied_input_amax
+    # runs earlier in _export_transformers_checkpoint and max-merges the
+    # shared input_quantizer amaxes across tied fused-experts modules, so
+    # both sides now derive bit-identical input_scale values.
+    _cache = _export_fused_experts.__dict__.setdefault("_tied_unpacked_cache", {})
+    _prior = _cache.get(_source_key)
+    if _prior is not None and _prior is not module:
+        for _idx in range(n):
+            _cur_expert = getattr(module, str(_idx), None)
+            _prior_expert = getattr(_prior, str(_idx), None)
+            if _cur_expert is None or _prior_expert is None:
+                continue
+            for _proj_name in ("gate_proj", "up_proj", "down_proj"):
+                _cur_proj = getattr(_cur_expert, _proj_name, None)
+                _prior_proj = getattr(_prior_expert, _proj_name, None)
+                if _cur_proj is None or _prior_proj is None:
+                    continue
+                # Alias the weight (Parameter) so both sides reference the
+                # same nn.Parameter → same data_ptr() → existing dedup
+                # in postprocess_state_dict will drop the duplicate.
+                if hasattr(_prior_proj, "weight"):
+                    _cur_proj.weight = _prior_proj.weight
+                # Alias the bit-identical scale buffers (including
+                # input_scale, made safe by sync_tied_input_amax pre-export
+                # merging). Re-register to ensure data_ptr() matches the
+                # prior side's tensor.
+                for _attr in ("weight_scale", "weight_scale_2", "input_scale"):
+                    if not hasattr(_prior_proj, _attr):
+                        continue
+                    if _attr in _cur_proj._buffers:
+                        del _cur_proj._buffers[_attr]
+                    elif hasattr(_cur_proj, _attr):
+                        delattr(_cur_proj, _attr)
+                    _cur_proj.register_buffer(_attr, getattr(_prior_proj, _attr))
+    else:
+        _cache[_source_key] = module
+
 
 def save_expert_token_count_table(model: nn.Module, output_dir: str | Path | None = None):
     """Collect expert_token_count from all quantized MoE layers and save as an HTML table.