Skip to content
Open
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Changelog

- Add the ``day0-release`` agent skill (``.agents/skills/day0-release/``), a deterministic end-to-end driver that chains the PTQ → evaluation → comparison skills (the evaluation stage deploys the checkpoint itself) with an enforced gate after each stage and returns a publish decision (ACCEPT / REGRESSION / ANOMALOUS / INFEASIBLE). Ships three GPU-free, unit-tested gate scripts (``gate_ptq.py``, ``gate_run.py``, ``gate_compare.py``) that validate checkpoint coverage, evaluation-run completeness, and baseline-vs-candidate accuracy threshold. v1 reports and stops on regression; the recipe-search loop is deferred.
- Add **streaming** speculative-decoding training (EAGLE3 / DFlash): the draft trains on base-model hidden states produced on the fly by a co-located ``vllm serve`` (no disk dump), moved trainer-side over NIXL RDMA, scaling to multi-node (dedicated serve replicas + DDP trainers). New launcher examples for NVFP4 Kimi-K2.5 / K2.6 on GB200/aarch64 under ``tools/launcher/examples/moonshotai/``.
- Add tied-weight PTQ and HF-checkpoint export support for block-diffusion encoder-decoder LLMs (e.g. DiffusionGemma) whose encoder/decoder stacks share parameters via HF ``_tied_weights_keys``. ``_export_quantized_weight`` and ``_export_fused_experts`` now alias bit-identical packed ``weight`` / ``weight_scale`` / ``weight_scale_2`` buffers across modules sharing a source weight ``data_ptr()`` so the downstream ``postprocess_state_dict`` dedup catches them (~42% storage reduction on ``nvfp4_experts_only`` for tied 26B MoE checkpoints). New ``sync_tied_input_amax`` helper max-merges per-side ``input_quantizer.amax`` across tied modules before export so single-backbone consumers that load one ``input_scale`` per parameter don't clip either side. Opt-in ``--canonical_tied_naming`` flag (default off) reorders the state_dict so canonical-side keys per HF's ``_tied_weights_keys`` declaration win the data_ptr dedup. ``default_disabled_quantizers`` gains a ``*self_conditioning*`` wildcard companion to the upstream vision excludes (PR #1691). ``hf_ptq.py`` also unwraps ``ModelOutput`` dataclasses from ``.generate()`` so the preview decode works on diffusion models. Non-tied models see no behavioral change.

0.45 (2026-06-xx)
^^^^^^^^^^^^^^^^^
Expand Down
8 changes: 7 additions & 1 deletion examples/llm_ptq/example_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -806,7 +806,13 @@ def is_model_on_gpu(model) -> bool:


def is_enc_dec(model_type) -> bool:
    """Return whether the model_type uses encoder-decoder-style preview decode.

    Controls whether ``hf_ptq.py`` slices off the prompt prefix from
    ``.generate()`` output. ``diffusion_gemma`` is structurally encoder-decoder
    but returns prompt+canvas concatenated, so it stays OFF this list (AR-style
    decode applies).

    Args:
        model_type: The model-type identifier to test (e.g. ``"t5"``).

    Returns:
        True only for ``"t5"``, ``"bart"`` and ``"whisper"``; False for every
        other value.
    """
    # A tuple is enough for a fixed, read-only membership set.
    return model_type in ("t5", "bart", "whisper")


Expand Down
19 changes: 19 additions & 0 deletions examples/llm_ptq/hf_ptq.py
Original file line number Diff line number Diff line change
Expand Up @@ -774,6 +774,7 @@ def export_quantized(
full_model,
export_dir=export_path,
extra_state_dict=mtp_state_dict,
canonical_tied_naming=args.canonical_tied_naming,
)

if args.qformat == "w4a16_nvfp4":
Expand Down Expand Up @@ -941,6 +942,11 @@ def input_decode(input_ids):
raise ValueError("The processor or tokenizer must be set")

def output_decode(generated_ids, input_shape):
# Some `.generate()` implementations return a ModelOutput dataclass (e.g. DiffusionGemma);
# unwrap it to the token tensor so downstream slicing works uniformly.
if hasattr(generated_ids, "sequences"):
generated_ids = generated_ids.sequences

if is_enc_dec(model_type):
if processor is not None and isinstance(processor, WhisperProcessor):
return processor.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
Expand Down Expand Up @@ -1252,6 +1258,19 @@ def parse_args() -> argparse.Namespace:
default=512,
)
parser.add_argument("--export_path", default="exported_model")
parser.add_argument(
"--canonical_tied_naming",
type=lambda s: s.lower() in ("1", "true", "yes"),
default=False,
help=(
"If True, reorder the exported state_dict so tied-weight aliases "
"dedup to the canonical side declared in the model's HF "
"_tied_weights_keys (e.g. decoder-side for DiffusionGemma4). Off "
"by default to avoid renaming exported keys for models whose "
"downstream consumers expect the legacy (registration-order) "
"winner."
),
)
parser.add_argument(
"--dataset",
help=(
Expand Down
3 changes: 3 additions & 0 deletions modelopt/torch/export/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@
"Qwen3Next": "qwen3next",
"QWen": "qwen",
"RecurrentGemma": "recurrentgemma",
# DiffusionGemma must come before "Gemma" — get_model_type substring-matches
# in order, and "gemma" is a substring of "diffusiongemma".
"DiffusionGemma": "diffusion_gemma",
"Gemma3": "gemma3",
"Gemma2": "gemma2",
"Gemma": "gemma",
Expand Down
51 changes: 51 additions & 0 deletions modelopt/torch/export/moe_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,24 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None:
{E}.gate_proj.weight, {E}.gate_proj.weight_scale, ...
{E}.up_proj.weight, {E}.up_proj.weight_scale, ...
{E}.down_proj.weight, {E}.down_proj.weight_scale, ...

Tied-experts dedup: when multiple fused-expert modules share their 3-D
source params via HF ``_tied_weights_keys``, the unpacking creates fresh
per-expert tensors that break the tie. We cache the source ``data_ptr()``
at entry and on a later cache hit alias the per-expert ``weight`` /
``weight_scale`` / ``weight_scale_2`` back to the prior module so
downstream dedup catches them. ``input_scale`` is left per-side.
Comment on lines +45 to +51

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

Docstring contradicts implementation for input_scale aliasing.

Line 51 states input_scale is left per-side, but line 221 explicitly aliases input_scale along with weight_scale and weight_scale_2. The implementation comment at lines 195-198 correctly explains that input_scale IS aliased because sync_tied_input_amax runs earlier.

📝 Suggested docstring fix
     Tied-experts dedup: when multiple fused-expert modules share their 3-D
     source params via HF ``_tied_weights_keys``, the unpacking creates fresh
     per-expert tensors that break the tie. We cache the source ``data_ptr()``
     at entry and on a later cache hit alias the per-expert ``weight`` /
-    ``weight_scale`` / ``weight_scale_2`` back to the prior module so
-    downstream dedup catches them. ``input_scale`` is left per-side.
+    ``weight_scale`` / ``weight_scale_2`` / ``input_scale`` back to the prior
+    module so downstream dedup catches them. ``input_scale`` aliasing is safe
+    because ``sync_tied_input_amax`` runs earlier and max-merges the shared
+    input_quantizer amaxes, so both sides derive bit-identical values.
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@modelopt/torch/export/moe_utils.py` around lines 45 - 51, The docstring for
the tied-experts dedup block contradicts the implementation by saying
"input_scale is left per-side" while the code in moe_utils.py aliases
input_scale alongside weight_scale and weight_scale_2 (due to
sync_tied_input_amax running earlier); update the docstring to state that
input_scale is aliased too and mention the reason (sync_tied_input_amax runs
prior), keeping the rest of the explanation about caching data_ptr() and
aliasing behavior intact so docstring matches the implementation.

"""
from modelopt.torch.export.unified_export_hf import _export_quantized_weight
from modelopt.torch.quantization.plugins.huggingface import _get_fused_expert_intermediate_dim

n = module.num_experts
expert_dim = _get_fused_expert_intermediate_dim(module)

# Capture source tensor identities BEFORE unpacking (the source
# attrs are deleted at the end of this function).
_source_key = (module.gate_up_proj.data_ptr(), module.down_proj.data_ptr())

# 1. Shared input quantizers — one per projection type, shared across all experts.
gate_up_input_q = module.gate_up_proj_input_quantizer
down_input_q = module.down_proj_input_quantizer
Expand Down Expand Up @@ -178,6 +189,46 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None:
if hasattr(module, attr):
delattr(module, attr)

# 5. Tied-experts dedup: if this module's source params have been seen
# before, alias the bit-identical per-expert buffers (weight,
# weight_scale, weight_scale_2, input_scale) to the previously-unpacked
# module. input_scale is safe to alias because sync_tied_input_amax
# runs earlier in _export_transformers_checkpoint and max-merges the
# shared input_quantizer amaxes across tied fused-experts modules, so
# both sides now derive bit-identical input_scale values.
_cache = _export_fused_experts.__dict__.setdefault("_tied_unpacked_cache", {})
_prior = _cache.get(_source_key)
if _prior is not None and _prior is not module:
for _idx in range(n):
_cur_expert = getattr(module, str(_idx), None)
_prior_expert = getattr(_prior, str(_idx), None)
if _cur_expert is None or _prior_expert is None:
continue
for _proj_name in ("gate_proj", "up_proj", "down_proj"):
_cur_proj = getattr(_cur_expert, _proj_name, None)
_prior_proj = getattr(_prior_expert, _proj_name, None)
if _cur_proj is None or _prior_proj is None:
continue
# Alias the weight (Parameter) so both sides reference the
# same nn.Parameter → same data_ptr() → existing dedup
# in postprocess_state_dict will drop the duplicate.
if hasattr(_prior_proj, "weight"):
_cur_proj.weight = _prior_proj.weight
# Alias the bit-identical scale buffers (including
# input_scale, made safe by sync_tied_input_amax pre-export
# merging). Re-register to ensure data_ptr() matches the
# prior side's tensor.
for _attr in ("weight_scale", "weight_scale_2", "input_scale"):
if not hasattr(_prior_proj, _attr):
continue
if _attr in _cur_proj._buffers:
del _cur_proj._buffers[_attr]
elif hasattr(_cur_proj, _attr):
delattr(_cur_proj, _attr)
_cur_proj.register_buffer(_attr, getattr(_prior_proj, _attr))
else:
_cache[_source_key] = module


def save_expert_token_count_table(model: nn.Module, output_dir: str | Path | None = None):
"""Collect expert_token_count from all quantized MoE layers and save as an HTML table.
Expand Down
Loading
Loading