From 9e2acad0a2bdc2de27c6122b0f6eb793b7a7ff73 Mon Sep 17 00:00:00 2001
From: Zhiyu Cheng <zhiyuc@nvidia.com>
Date: Thu, 11 Jun 2026 14:26:36 -0700
Subject: [PATCH 1/2] Fix gemma-4 fp8_default / nvfp4_mlp_only recipes
 quantizing vision branch in sglang (NVBug 6293731, 6293762)

The general PTQ presets `fp8_default-kv_fp8` and `nvfp4_mlp_only-kv_fp8`
(and their `_cast` KV siblings) enable quantization with broad wildcards
that, on multimodal Gemma checkpoints (e.g. gemma-4-31B-it), also match the
SigLIP vision tower (`model.vision_tower.*`), the vision embedding projection
(`model.embed_vision.*`), and the vision block MLPs:

  - `fp8_default`: the `w8a8_fp8_fp8` unit enables bare `*weight_quantizer` /
    `*input_quantizer`, FP8-quantizing the whole vision branch. The exported
    checkpoint then deploys but emits garbled text in sglang (NVBug 6293731).
  - `nvfp4_mlp_only`: the `*mlp*` enables also match
    `vision_tower.encoder.layers.*.mlp`, so the vision block MLPs are
    NVFP4-quantized and the FP4 kernel crashes at decode with
    `ValueError: too many values to unpack (expected 2)` in sglang's
    modelopt_quant apply path (NVBug 6293762).

Add trailing `*visual*` / `*vision_tower*` / `*embed_vision*` disable rules to
all four recipes, placed after the enables and the
`default_disabled_quantizers` import so that the disables win (later entries
override earlier ones). This keeps the vision branch in BF16 and mirrors the
vision exclusions already shipped in the gemma w4a8_awq / qwen3_5 /
nemotron_vl recipes. The rules are no-ops on text-only models.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
---
 .../general/ptq/fp8_default-kv_fp8.yaml           | 15 +++++++++++++++
 .../general/ptq/fp8_default-kv_fp8_cast.yaml      | 15 +++++++++++++++
 .../general/ptq/nvfp4_mlp_only-kv_fp8.yaml        | 14 ++++++++++++++
 .../general/ptq/nvfp4_mlp_only-kv_fp8_cast.yaml   | 14 ++++++++++++++
 4 files changed, 58 insertions(+)

diff --git a/modelopt_recipes/general/ptq/fp8_default-kv_fp8.yaml b/modelopt_recipes/general/ptq/fp8_default-kv_fp8.yaml
index ea2ac567290..430a09628d8 100644
--- a/modelopt_recipes/general/ptq/fp8_default-kv_fp8.yaml
+++ b/modelopt_recipes/general/ptq/fp8_default-kv_fp8.yaml
@@ -32,3 +32,18 @@ quantize:
     - $import: w8a8_fp8_fp8
     - $import: kv_fp8
     - $import: default_disabled_quantizers
+    # Multimodal vision-branch exclusion: keep the vision encoder (SigLIP / ViT)
+    # and any multimodal embedding projection in BF16. The `w8a8_fp8_fp8` unit
+    # enables bare `*weight_quantizer` / `*input_quantizer`, which on VL
+    # checkpoints also match the vision tower (`model.vision_tower.*`,
+    # `model.visual.*`) and the embedding projection (`model.embed_vision.*`).
+    # FP8-quantizing the vision branch yields garbage output on gemma-4
+    # (NVBug 6293731) and is accuracy-harmful generally. Must come after the
+    # enables so the disable wins (later entries override earlier). No-op on
+    # text-only models.
+    - quantizer_name: '*visual*'
+      enable: false
+    - quantizer_name: '*vision_tower*'
+      enable: false
+    - quantizer_name: '*embed_vision*'
+      enable: false
diff --git a/modelopt_recipes/general/ptq/fp8_default-kv_fp8_cast.yaml b/modelopt_recipes/general/ptq/fp8_default-kv_fp8_cast.yaml
index 4e24bf53274..9fb6d5c98cb 100644
--- a/modelopt_recipes/general/ptq/fp8_default-kv_fp8_cast.yaml
+++ b/modelopt_recipes/general/ptq/fp8_default-kv_fp8_cast.yaml
@@ -33,3 +33,18 @@ quantize:
     - $import: w8a8_fp8_fp8
     - $import: kv_fp8_cast
     - $import: default_disabled_quantizers
+    # Multimodal vision-branch exclusion: keep the vision encoder (SigLIP / ViT)
+    # and any multimodal embedding projection in BF16. The `w8a8_fp8_fp8` unit
+    # enables bare `*weight_quantizer` / `*input_quantizer`, which on VL
+    # checkpoints also match the vision tower (`model.vision_tower.*`,
+    # `model.visual.*`) and the embedding projection (`model.embed_vision.*`).
+    # FP8-quantizing the vision branch yields garbage output on gemma-4
+    # (NVBug 6293731) and is accuracy-harmful generally. Must come after the
+    # enables so the disable wins (later entries override earlier). No-op on
+    # text-only models.
+    - quantizer_name: '*visual*'
+      enable: false
+    - quantizer_name: '*vision_tower*'
+      enable: false
+    - quantizer_name: '*embed_vision*'
+      enable: false
diff --git a/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8.yaml b/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8.yaml
index a4cf71a1dbd..4d3242c6bf6 100644
--- a/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8.yaml
+++ b/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8.yaml
@@ -50,3 +50,17 @@ quantize:
         $import: nvfp4
     - $import: kv_fp8
     - $import: default_disabled_quantizers
+    # Multimodal vision-branch exclusion: keep the vision encoder (SigLIP / ViT)
+    # and any multimodal embedding projection in BF16. The `*mlp*` enables above
+    # match not only the language-model MLPs but also the vision tower's block
+    # MLPs (`model.vision_tower.encoder.layers.*.mlp`, `model.visual.blocks.*.mlp`).
+    # NVFP4-quantizing the vision branch crashes the FP4 kernel on gemma-4
+    # (NVBug 6293762) and produces garbage image embeddings on Qwen3.5-VL. Must
+    # come after the enables so the disable wins (later entries override earlier).
+    # No-op on text-only models.
+    - quantizer_name: '*visual*'
+      enable: false
+    - quantizer_name: '*vision_tower*'
+      enable: false
+    - quantizer_name: '*embed_vision*'
+      enable: false
diff --git a/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8_cast.yaml b/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8_cast.yaml
index 225ecf7f086..4c224ef8835 100644
--- a/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8_cast.yaml
+++ b/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8_cast.yaml
@@ -50,3 +50,17 @@ quantize:
         $import: nvfp4
     - $import: kv_fp8_cast
     - $import: default_disabled_quantizers
+    # Multimodal vision-branch exclusion: keep the vision encoder (SigLIP / ViT)
+    # and any multimodal embedding projection in BF16. The `*mlp*` enables above
+    # match not only the language-model MLPs but also the vision tower's block
+    # MLPs (`model.vision_tower.encoder.layers.*.mlp`, `model.visual.blocks.*.mlp`).
+    # NVFP4-quantizing the vision branch crashes the FP4 kernel on gemma-4
+    # (NVBug 6293762) and produces garbage image embeddings on Qwen3.5-VL. Must
+    # come after the enables so the disable wins (later entries override earlier).
+    # No-op on text-only models.
+    - quantizer_name: '*visual*'
+      enable: false
+    - quantizer_name: '*vision_tower*'
+      enable: false
+    - quantizer_name: '*embed_vision*'
+      enable: false

From 513862e5d5e6d199592cb4a94718546b7f9c8b9d Mon Sep 17 00:00:00 2001
From: Zhiyu Cheng <zhiyuc@nvidia.com>
Date: Thu, 11 Jun 2026 17:54:27 -0700
Subject: [PATCH 2/2] Exclude multimodal vision branch from quantization by
 default (NVBug 6293731, 6293762)

The general PTQ presets quantize via broad wildcards: `fp8_default` enables
bare `*weight_quantizer` / `*input_quantizer` (the `w8a8_fp8_fp8` unit) and
`nvfp4_mlp_only` enables `*mlp*`. On multimodal checkpoints (e.g. gemma-4-31B-it)
these also match the SigLIP vision tower (`model.vision_tower.*`,
`model.visual.*`) and the vision embedding projection (`model.embed_vision.*`):

  - fp8_default-kv_fp8: FP8-quantizes the vision branch; the checkpoint deploys
    but emits garbled text in sglang (NVBug 6293731).
  - nvfp4_mlp_only-kv_fp8: NVFP4-quantizes the vision block MLPs; the FP4 kernel
    crashes at decode with `too many values to unpack (expected 2)` (NVBug 6293762).

Move the `*embed_vision*` / `*vision_tower*` / `*visual*` disable rules into the
shared `configs/ptq/units/default_disabled_quantizers` unit, alongside the
existing `*router*` / `*lm_head*` entries, and drop the per-recipe copies that
the previous commit added to the four `general/ptq` recipes. Because both the
composed `general/ptq/*` recipes and the `configs/ptq/presets/model/*` presets
import this unit, every general recipe keeps the vision branch in BF16 by
default and the YAML<->preset parity test stays satisfied. The rules are no-ops
on text-only models; a recipe that intentionally quantizes vision can re-enable
the vision quantizers after importing this unit.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
---
 .../ptq/units/default_disabled_quantizers.yaml    | 14 ++++++++++++++
 .../general/ptq/fp8_default-kv_fp8.yaml           | 15 ---------------
 .../general/ptq/fp8_default-kv_fp8_cast.yaml      | 15 ---------------
 .../general/ptq/nvfp4_mlp_only-kv_fp8.yaml        | 14 --------------
 .../general/ptq/nvfp4_mlp_only-kv_fp8_cast.yaml   | 14 --------------
 5 files changed, 14 insertions(+), 58 deletions(-)

diff --git a/modelopt_recipes/configs/ptq/units/default_disabled_quantizers.yaml b/modelopt_recipes/configs/ptq/units/default_disabled_quantizers.yaml
index 2adcf1f60f0..057ed77409f 100644
--- a/modelopt_recipes/configs/ptq/units/default_disabled_quantizers.yaml
+++ b/modelopt_recipes/configs/ptq/units/default_disabled_quantizers.yaml
@@ -36,6 +36,20 @@
     enable: false
   - quantizer_name: 'output.*'
     enable: false
+  # Multimodal vision branch: keep the vision encoder (SigLIP / ViT) and any
+  # multimodal embedding projection in BF16 by default. Recipes that enable bare
+  # `*weight_quantizer` / `*input_quantizer` or `*mlp*` wildcards otherwise also
+  # match the vision tower (`model.vision_tower.*`, `model.visual.*`) and the
+  # embedding projection (`model.embed_vision.*`). Quantizing the vision branch
+  # garbles output, crashes export or the FP4 decode kernel, or corrupts image
+  # embeddings on VL models (gemma-4, Qwen3.5-VL; NVBugs 6293731, 6293762,
+  # 6294017). To quantize vision on purpose, re-enable after importing this unit.
+  - quantizer_name: '*embed_vision*'
+    enable: false
+  - quantizer_name: '*vision_tower*'
+    enable: false
+  - quantizer_name: '*visual*'
+    enable: false
   - parent_class: 'nn.BatchNorm1d'
     quantizer_name: '*'
     enable: false
diff --git a/modelopt_recipes/general/ptq/fp8_default-kv_fp8.yaml b/modelopt_recipes/general/ptq/fp8_default-kv_fp8.yaml
index 430a09628d8..ea2ac567290 100644
--- a/modelopt_recipes/general/ptq/fp8_default-kv_fp8.yaml
+++ b/modelopt_recipes/general/ptq/fp8_default-kv_fp8.yaml
@@ -32,18 +32,3 @@ quantize:
     - $import: w8a8_fp8_fp8
     - $import: kv_fp8
     - $import: default_disabled_quantizers
-    # Multimodal vision-branch exclusion: keep the vision encoder (SigLIP / ViT)
-    # and any multimodal embedding projection in BF16. The `w8a8_fp8_fp8` unit
-    # enables bare `*weight_quantizer` / `*input_quantizer`, which on VL
-    # checkpoints also match the vision tower (`model.vision_tower.*`,
-    # `model.visual.*`) and the embedding projection (`model.embed_vision.*`).
-    # FP8-quantizing the vision branch yields garbage output on gemma-4
-    # (NVBug 6293731) and is accuracy-harmful generally. Must come after the
-    # enables so the disable wins (later entries override earlier). No-op on
-    # text-only models.
-    - quantizer_name: '*visual*'
-      enable: false
-    - quantizer_name: '*vision_tower*'
-      enable: false
-    - quantizer_name: '*embed_vision*'
-      enable: false
diff --git a/modelopt_recipes/general/ptq/fp8_default-kv_fp8_cast.yaml b/modelopt_recipes/general/ptq/fp8_default-kv_fp8_cast.yaml
index 9fb6d5c98cb..4e24bf53274 100644
--- a/modelopt_recipes/general/ptq/fp8_default-kv_fp8_cast.yaml
+++ b/modelopt_recipes/general/ptq/fp8_default-kv_fp8_cast.yaml
@@ -33,18 +33,3 @@ quantize:
     - $import: w8a8_fp8_fp8
     - $import: kv_fp8_cast
     - $import: default_disabled_quantizers
-    # Multimodal vision-branch exclusion: keep the vision encoder (SigLIP / ViT)
-    # and any multimodal embedding projection in BF16. The `w8a8_fp8_fp8` unit
-    # enables bare `*weight_quantizer` / `*input_quantizer`, which on VL
-    # checkpoints also match the vision tower (`model.vision_tower.*`,
-    # `model.visual.*`) and the embedding projection (`model.embed_vision.*`).
-    # FP8-quantizing the vision branch yields garbage output on gemma-4
-    # (NVBug 6293731) and is accuracy-harmful generally. Must come after the
-    # enables so the disable wins (later entries override earlier). No-op on
-    # text-only models.
-    - quantizer_name: '*visual*'
-      enable: false
-    - quantizer_name: '*vision_tower*'
-      enable: false
-    - quantizer_name: '*embed_vision*'
-      enable: false
diff --git a/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8.yaml b/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8.yaml
index 4d3242c6bf6..a4cf71a1dbd 100644
--- a/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8.yaml
+++ b/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8.yaml
@@ -50,17 +50,3 @@ quantize:
         $import: nvfp4
     - $import: kv_fp8
     - $import: default_disabled_quantizers
-    # Multimodal vision-branch exclusion: keep the vision encoder (SigLIP / ViT)
-    # and any multimodal embedding projection in BF16. The `*mlp*` enables above
-    # match not only the language-model MLPs but also the vision tower's block
-    # MLPs (`model.vision_tower.encoder.layers.*.mlp`, `model.visual.blocks.*.mlp`).
-    # NVFP4-quantizing the vision branch crashes the FP4 kernel on gemma-4
-    # (NVBug 6293762) and produces garbage image embeddings on Qwen3.5-VL. Must
-    # come after the enables so the disable wins (later entries override earlier).
-    # No-op on text-only models.
-    - quantizer_name: '*visual*'
-      enable: false
-    - quantizer_name: '*vision_tower*'
-      enable: false
-    - quantizer_name: '*embed_vision*'
-      enable: false
diff --git a/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8_cast.yaml b/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8_cast.yaml
index 4c224ef8835..225ecf7f086 100644
--- a/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8_cast.yaml
+++ b/modelopt_recipes/general/ptq/nvfp4_mlp_only-kv_fp8_cast.yaml
@@ -50,17 +50,3 @@ quantize:
         $import: nvfp4
     - $import: kv_fp8_cast
     - $import: default_disabled_quantizers
-    # Multimodal vision-branch exclusion: keep the vision encoder (SigLIP / ViT)
-    # and any multimodal embedding projection in BF16. The `*mlp*` enables above
-    # match not only the language-model MLPs but also the vision tower's block
-    # MLPs (`model.vision_tower.encoder.layers.*.mlp`, `model.visual.blocks.*.mlp`).
-    # NVFP4-quantizing the vision branch crashes the FP4 kernel on gemma-4
-    # (NVBug 6293762) and produces garbage image embeddings on Qwen3.5-VL. Must
-    # come after the enables so the disable wins (later entries override earlier).
-    # No-op on text-only models.
-    - quantizer_name: '*visual*'
-      enable: false
-    - quantizer_name: '*vision_tower*'
-      enable: false
-    - quantizer_name: '*embed_vision*'
-      enable: false