From 9e2acad0a2bdc2de27c6122b0f6eb793b7a7ff73 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Thu, 11 Jun 2026 14:26:36 -0700 Subject: [PATCH 1/2] Fix gemma-4 fp8_default / nvfp4_mlp_only recipes quantizing vision branch in sglang (NVBug 6293731, 6293762) The general PTQ presets `fp8_default-kv_fp8` and `nvfp4_mlp_only-kv_fp8` (and their `_cast` KV siblings) enable quantization with broad wildcards that, on multimodal Gemma checkpoints (e.g. gemma-4-31B-it), also match the SigLIP vision tower (`model.vision_tower.*`), the vision embedding projection (`model.embed_vision.*`), and the vision block MLPs: - `fp8_default`: the `w8a8_fp8_fp8` unit enables bare `*weight_quantizer` / `*input_quantizer`, FP8-quantizing the whole vision branch. The exported checkpoint then deploys but emits garbled text in sglang (NVBug 6293731). - `nvfp4_mlp_only`: the `*mlp*` enables match `vision_tower.encoder.layers.*.mlp`, so the FP4 kernel crashes at decode with `ValueError: too many values to unpack (expected 2)` in sglang's modelopt_quant apply path (NVBug 6293762). Add trailing `*visual*` / `*vision_tower*` / `*embed_vision*` disable rules (placed after the enables and `default_disabled_quantizers` so the disable wins), keeping the vision branch in BF16. Mirrors the vision exclusions already shipped in the gemma w4a8_awq / qwen3_5 / nemotron_vl recipes. The rules are no-ops on text-only models. 
Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: Zhiyu Cheng --- .../general/ptq/fp8_default-kv_fp8.yaml | 15 +++++++++++++++ .../general/ptq/fp8_default-kv_fp8_cast.yaml | 15 +++++++++++++++ .../general/ptq/nvfp4_mlp_only-kv_fp8.yaml | 14 ++++++++++++++ .../general/ptq/nvfp4_mlp_only-kv_fp8_cast.yaml | 14 ++++++++++++++ 4 files changed, 58 insertions(+) diff --git a/modelopt_recipes/general/ptq/fp8_default-kv_fp8.yaml b/modelopt_recipes/general/ptq/fp8_default-kv_fp8.yaml index ea2ac567290..430a09628d8 100644 --- a/modelopt_recipes/general/ptq/fp8_default-kv_fp8.yaml +++ b/modelopt_recipes/general/ptq/fp8_default-kv_fp8.yaml @@ -32,3 +32,18 @@ quantize: - $import: w8a8_fp8_fp8 - $import: kv_fp8 - $import: default_disabled_quantizers + # Multimodal vision-branch exclusion: keep the vision encoder (SigLIP / ViT) + # and any multimodal embedding projection in BF16. The `w8a8_fp8_fp8` unit + # enables bare `*weight_quantizer` / `*input_quantizer`, which on VL + # checkpoints also match the vision tower (`model.vision_tower.*`, + # `model.visual.*`) and the embedding projection (`model.embed_vision.*`). + # FP8-quantizing the vision branch yields garbage output on gemma-4 + # (NVBug 6293731) and is accuracy-harmful generally. Must come after the + # enables so the disable wins (later entries override earlier). No-op on + # text-only models. 
+ - quantizer_name: '*visual*' + enable: false + - quantizer_name: '*vision_tower*' + enable: false + - quantizer_name: '*embed_vision*' + enable: false diff --git a/modelopt_recipes/general/ptq/fp8_default-kv_fp8_cast.yaml b/modelopt_recipes/general/ptq/fp8_default-kv_fp8_cast.yaml index 4e24bf53274..9fb6d5c98cb 100644 --- a/modelopt_recipes/general/ptq/fp8_default-kv_fp8_cast.yaml +++ b/modelopt_recipes/general/ptq/fp8_default-kv_fp8_cast.yaml @@ -33,3 +33,18 @@ quantize: - $import: w8a8_fp8_fp8 - $import: kv_fp8_cast - $import: default_disabled_quantizers + # Multimodal vision-branch exclusion: keep the vision encoder (SigLIP / ViT) + # and any multimodal embedding projection in BF16. The `w8a8_fp8_fp8` unit + # enables bare `*weight_quantizer` / `*input_quantizer`, which on VL + # checkpoints also match the vision tower (`model.vision_tower.*`, + # `model.visual.*`) and the embedding projection (`model.embed_vision.*`). + # FP8-quantizing the vision branch yields garbage output on gemma-4 + # (NVBug 6293731) and is accuracy-harmful generally. Must come after the + # enables so the disable wins (later entries override earlier). No-op on + # text-only models. + - quantizer_name: '*visual*' + enable: false + - quantizer_name: '*vision_tower*' + enable: false + - quantizer_name: '*embed_vision*' + enable: false diff --git a/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8.yaml b/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8.yaml index a4cf71a1dbd..4d3242c6bf6 100644 --- a/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8.yaml +++ b/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8.yaml @@ -50,3 +50,17 @@ quantize: $import: nvfp4 - $import: kv_fp8 - $import: default_disabled_quantizers + # Multimodal vision-branch exclusion: keep the vision encoder (SigLIP / ViT) + # and any multimodal embedding projection in BF16. 
The `*mlp*` enables above + # match not only the language-model MLPs but also the vision tower's block + # MLPs (`model.vision_tower.encoder.layers.*.mlp`, `model.visual.blocks.*.mlp`). + # NVFP4-quantizing the vision branch crashes the FP4 kernel on gemma-4 + # (NVBug 6293762) and produces garbage image embeddings on Qwen3.5-VL. Must + # come after the enables so the disable wins (later entries override earlier). + # No-op on text-only models. + - quantizer_name: '*visual*' + enable: false + - quantizer_name: '*vision_tower*' + enable: false + - quantizer_name: '*embed_vision*' + enable: false diff --git a/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8_cast.yaml b/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8_cast.yaml index 225ecf7f086..4c224ef8835 100644 --- a/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8_cast.yaml +++ b/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8_cast.yaml @@ -50,3 +50,17 @@ quantize: $import: nvfp4 - $import: kv_fp8_cast - $import: default_disabled_quantizers + # Multimodal vision-branch exclusion: keep the vision encoder (SigLIP / ViT) + # and any multimodal embedding projection in BF16. The `*mlp*` enables above + # match not only the language-model MLPs but also the vision tower's block + # MLPs (`model.vision_tower.encoder.layers.*.mlp`, `model.visual.blocks.*.mlp`). + # NVFP4-quantizing the vision branch crashes the FP4 kernel on gemma-4 + # (NVBug 6293762) and produces garbage image embeddings on Qwen3.5-VL. Must + # come after the enables so the disable wins (later entries override earlier). + # No-op on text-only models. 
+ - quantizer_name: '*visual*' + enable: false + - quantizer_name: '*vision_tower*' + enable: false + - quantizer_name: '*embed_vision*' + enable: false From 513862e5d5e6d199592cb4a94718546b7f9c8b9d Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Thu, 11 Jun 2026 17:54:27 -0700 Subject: [PATCH 2/2] Exclude multimodal vision branch from quantization by default (NVBug 6293731, 6293762) The general PTQ presets quantize via broad wildcards: `fp8_default` enables bare `*weight_quantizer` / `*input_quantizer` (the `w8a8_fp8_fp8` unit) and `nvfp4_mlp_only` enables `*mlp*`. On multimodal checkpoints (e.g. gemma-4-31B-it) these also match the SigLIP vision tower (`model.vision_tower.*`, `model.visual.*`) and the vision embedding projection (`model.embed_vision.*`): - fp8_default-kv_fp8: FP8-quantizes the vision branch; the checkpoint deploys but emits garbled text in sglang (NVBug 6293731). - nvfp4_mlp_only-kv_fp8: NVFP4-quantizes the vision block MLPs; the FP4 kernel crashes at decode with `too many values to unpack (expected 2)` (NVBug 6293762). Add `*embed_vision*` / `*vision_tower*` / `*visual*` disable rules to the shared `configs/ptq/units/default_disabled_quantizers` unit, alongside the existing `*router*` / `*lm_head*` entries. Because both the composed `general/ptq/*` recipes and the `configs/ptq/presets/model/*` presets import this unit, every general recipe keeps the vision branch in BF16 by default and the YAML<->preset parity test stays satisfied. No-op on text-only models; a recipe that intentionally quantizes vision can re-enable after importing this unit. 
Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: Zhiyu Cheng --- .../ptq/units/default_disabled_quantizers.yaml | 14 ++++++++++++++ .../general/ptq/fp8_default-kv_fp8.yaml | 15 --------------- .../general/ptq/fp8_default-kv_fp8_cast.yaml | 15 --------------- .../general/ptq/nvfp4_mlp_only-kv_fp8.yaml | 14 -------------- .../general/ptq/nvfp4_mlp_only-kv_fp8_cast.yaml | 14 -------------- 5 files changed, 14 insertions(+), 58 deletions(-) diff --git a/modelopt_recipes/configs/ptq/units/default_disabled_quantizers.yaml b/modelopt_recipes/configs/ptq/units/default_disabled_quantizers.yaml index 2adcf1f60f0..057ed77409f 100644 --- a/modelopt_recipes/configs/ptq/units/default_disabled_quantizers.yaml +++ b/modelopt_recipes/configs/ptq/units/default_disabled_quantizers.yaml @@ -36,6 +36,20 @@ enable: false - quantizer_name: 'output.*' enable: false + # Multimodal vision branch: keep the vision encoder (SigLIP / ViT) and any + # multimodal embedding projection in BF16 by default. Recipes that enable bare + # `*weight_quantizer` / `*input_quantizer` or `*mlp*` wildcards otherwise also + # match the vision tower (`model.vision_tower.*`, `model.visual.*`) and the + # embedding projection (`model.embed_vision.*`); quantizing the vision branch + # crashes export/decode or garbles text / image embeddings on VLMs (gemma-4, + # Qwen3.5-VL — NVBugs 6293731, 6293762, 6294017). A recipe that intentionally + # quantizes vision must re-enable these after importing this unit. 
+ - quantizer_name: '*embed_vision*' + enable: false + - quantizer_name: '*vision_tower*' + enable: false + - quantizer_name: '*visual*' + enable: false - parent_class: 'nn.BatchNorm1d' quantizer_name: '*' enable: false diff --git a/modelopt_recipes/general/ptq/fp8_default-kv_fp8.yaml b/modelopt_recipes/general/ptq/fp8_default-kv_fp8.yaml index 430a09628d8..ea2ac567290 100644 --- a/modelopt_recipes/general/ptq/fp8_default-kv_fp8.yaml +++ b/modelopt_recipes/general/ptq/fp8_default-kv_fp8.yaml @@ -32,18 +32,3 @@ quantize: - $import: w8a8_fp8_fp8 - $import: kv_fp8 - $import: default_disabled_quantizers - # Multimodal vision-branch exclusion: keep the vision encoder (SigLIP / ViT) - # and any multimodal embedding projection in BF16. The `w8a8_fp8_fp8` unit - # enables bare `*weight_quantizer` / `*input_quantizer`, which on VL - # checkpoints also match the vision tower (`model.vision_tower.*`, - # `model.visual.*`) and the embedding projection (`model.embed_vision.*`). - # FP8-quantizing the vision branch yields garbage output on gemma-4 - # (NVBug 6293731) and is accuracy-harmful generally. Must come after the - # enables so the disable wins (later entries override earlier). No-op on - # text-only models. - - quantizer_name: '*visual*' - enable: false - - quantizer_name: '*vision_tower*' - enable: false - - quantizer_name: '*embed_vision*' - enable: false diff --git a/modelopt_recipes/general/ptq/fp8_default-kv_fp8_cast.yaml b/modelopt_recipes/general/ptq/fp8_default-kv_fp8_cast.yaml index 9fb6d5c98cb..4e24bf53274 100644 --- a/modelopt_recipes/general/ptq/fp8_default-kv_fp8_cast.yaml +++ b/modelopt_recipes/general/ptq/fp8_default-kv_fp8_cast.yaml @@ -33,18 +33,3 @@ quantize: - $import: w8a8_fp8_fp8 - $import: kv_fp8_cast - $import: default_disabled_quantizers - # Multimodal vision-branch exclusion: keep the vision encoder (SigLIP / ViT) - # and any multimodal embedding projection in BF16. 
The `w8a8_fp8_fp8` unit - # enables bare `*weight_quantizer` / `*input_quantizer`, which on VL - # checkpoints also match the vision tower (`model.vision_tower.*`, - # `model.visual.*`) and the embedding projection (`model.embed_vision.*`). - # FP8-quantizing the vision branch yields garbage output on gemma-4 - # (NVBug 6293731) and is accuracy-harmful generally. Must come after the - # enables so the disable wins (later entries override earlier). No-op on - # text-only models. - - quantizer_name: '*visual*' - enable: false - - quantizer_name: '*vision_tower*' - enable: false - - quantizer_name: '*embed_vision*' - enable: false diff --git a/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8.yaml b/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8.yaml index 4d3242c6bf6..a4cf71a1dbd 100644 --- a/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8.yaml +++ b/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8.yaml @@ -50,17 +50,3 @@ quantize: $import: nvfp4 - $import: kv_fp8 - $import: default_disabled_quantizers - # Multimodal vision-branch exclusion: keep the vision encoder (SigLIP / ViT) - # and any multimodal embedding projection in BF16. The `*mlp*` enables above - # match not only the language-model MLPs but also the vision tower's block - # MLPs (`model.vision_tower.encoder.layers.*.mlp`, `model.visual.blocks.*.mlp`). - # NVFP4-quantizing the vision branch crashes the FP4 kernel on gemma-4 - # (NVBug 6293762) and produces garbage image embeddings on Qwen3.5-VL. Must - # come after the enables so the disable wins (later entries override earlier). - # No-op on text-only models. 
- - quantizer_name: '*visual*' - enable: false - - quantizer_name: '*vision_tower*' - enable: false - - quantizer_name: '*embed_vision*' - enable: false diff --git a/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8_cast.yaml b/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8_cast.yaml index 4c224ef8835..225ecf7f086 100644 --- a/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8_cast.yaml +++ b/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8_cast.yaml @@ -50,17 +50,3 @@ quantize: $import: nvfp4 - $import: kv_fp8_cast - $import: default_disabled_quantizers - # Multimodal vision-branch exclusion: keep the vision encoder (SigLIP / ViT) - # and any multimodal embedding projection in BF16. The `*mlp*` enables above - # match not only the language-model MLPs but also the vision tower's block - # MLPs (`model.vision_tower.encoder.layers.*.mlp`, `model.visual.blocks.*.mlp`). - # NVFP4-quantizing the vision branch crashes the FP4 kernel on gemma-4 - # (NVBug 6293762) and produces garbage image embeddings on Qwen3.5-VL. Must - # come after the enables so the disable wins (later entries override earlier). - # No-op on text-only models. - - quantizer_name: '*visual*' - enable: false - - quantizer_name: '*vision_tower*' - enable: false - - quantizer_name: '*embed_vision*' - enable: false