From 62a093a1f344f9e0123007eaaa006eb1f24b5edd Mon Sep 17 00:00:00 2001
From: Zhiyu Cheng <zhiyuc@nvidia.com>
Date: Thu, 11 Jun 2026 13:50:37 -0700
Subject: [PATCH 1/3] Fix gemma w4a8_awq recipe crashing export on multimodal
 checkpoints (NVBug 6294017)

The gemma `w4a8_awq-kv_fp8_cast` recipe enables quantization with bare
`*weight_quantizer` / `*input_quantizer` wildcards. On multimodal Gemma
checkpoints (e.g. gemma-4-31B-it) these also match the SigLIP vision tower
(`model.vision_tower.*`) and the vision embedding projection
(`model.embed_vision.*`). The vision tower's MLP in-features (4304) are not a
multiple of the INT4 block size (128), so INT4 weight packing at export hits a
device-side `index out of bounds` assert in `pack_int4_in_uint8`. Quantizing
the vision branch also hurts accuracy.

Add trailing `*vision_tower*` / `*embed_vision*` disable rules (placed after the
enables so the disable wins), keeping the vision branch in BF16. Mirrors the
vision exclusions already shipped in the qwen3_5 / nemotron_vl / phi4mm recipes.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
---
 .../gemma/ptq/w4a8_awq-kv_fp8_cast.yaml       | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/modelopt_recipes/huggingface/gemma/ptq/w4a8_awq-kv_fp8_cast.yaml b/modelopt_recipes/huggingface/gemma/ptq/w4a8_awq-kv_fp8_cast.yaml
index c649234088e..d7fdf592fd5 100644
--- a/modelopt_recipes/huggingface/gemma/ptq/w4a8_awq-kv_fp8_cast.yaml
+++ b/modelopt_recipes/huggingface/gemma/ptq/w4a8_awq-kv_fp8_cast.yaml
@@ -16,6 +16,16 @@
 # Gemma-specific W4A8 AWQ PTQ recipe with FP8 KV-cache cast. Uses a coarser
 # optimal-scale search (awq_lite with alpha_step=1) to avoid overflow observed
 # in TRT-LLM kernels when using the default AWQ search on Gemma.
+#
+# On multimodal Gemma checkpoints (e.g. gemma-4-31B-it), the bare
+# `*weight_quantizer` / `*input_quantizer` enables below also match the SigLIP
+# vision tower (`model.vision_tower.*`) and the multimodal embedding projection
+# (`model.embed_vision.*`). The vision tower's MLP in-features (4304) are not a
+# multiple of the INT4 block size (128), so INT4 weight packing at export hits a
+# device-side "index out of bounds" assert in pack_int4_in_uint8 (NVBug
+# 6294017). It is also accuracy-harmful to W4A8 the vision branch. The trailing
+# `*vision_tower*` / `*embed_vision*` disable rules keep that branch in BF16,
+# mirroring the vision exclusions in the qwen3_5 / nemotron_vl recipes.
 
 imports:
   base_disable_all: configs/ptq/units/base_disable_all
@@ -45,3 +55,12 @@ quantize:
         $import: fp8
     - $import: kv_fp8_cast
     - $import: default_disabled_quantizers
+    # Multimodal vision-branch exclusion: keep the SigLIP vision tower and the
+    # vision embedding projection in BF16. Must come after the `*weight_quantizer`
+    # / `*input_quantizer` enables above so the disable wins (later entries
+    # override earlier ones). Fixes NVBug 6294017 (INT4 packing index-out-of-bounds
+    # on the vision MLP whose in-features are not a multiple of the 128 block size).
+    - quantizer_name: '*vision_tower*'
+      enable: false
+    - quantizer_name: '*embed_vision*'
+      enable: false

From e6049d0a57caa1b121a24f7432d292a52a9bafd5 Mon Sep 17 00:00:00 2001
From: Zhiyu Cheng <zhiyuc@nvidia.com>
Date: Thu, 11 Jun 2026 17:40:24 -0700
Subject: [PATCH 2/3] Move gemma-4 W4A8-AWQ vision exclusion to a dedicated
 gemma4 recipe

Per review (shengliangxu): gemma-4-31B-it is model_type=gemma4, a multimodal
architecture distinct from the text-only `gemma` model type. The vision-branch
exclusion belongs in a gemma4-specific recipe, not the gemma one (which is
text-only and has no vision tower).

- Revert modelopt_recipes/huggingface/gemma/ptq/w4a8_awq-kv_fp8_cast.yaml to its
  original (text-only) form.
- Add modelopt_recipes/huggingface/gemma4/ptq/w4a8_awq-kv_fp8_cast.yaml with the
  same awq_lite alpha_step=1 numerics plus `*vision_tower*` / `*embed_vision*`
  enable:false rules to keep the SigLIP vision branch in BF16 (fixes NVBug
  6294017: INT4 pack_int4_in_uint8 index-out-of-bounds on the vision MLP whose
  in-features (4304) are not a multiple of the 128 block size).
- Add modelopt_recipes/huggingface/gemma4/ptq/README.md.

Verified: load_recipe resolves the new recipe (<builtin>/huggingface/gemma4/...)
with the vision excludes present; the quantize block is identical to the
previously hardware-verified fix (full calib_size 512 export on gemma-4-31B-it:
no index-out-of-bounds, exclude_modules = lm_head, model.embed_vision*,
model.vision_tower*; vision 0/353 quantized, LM 410/772 quantized W4A8_AWQ).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
---
 .../gemma/ptq/w4a8_awq-kv_fp8_cast.yaml       | 19 ------
 .../huggingface/gemma4/ptq/README.md          | 17 +++++
 .../gemma4/ptq/w4a8_awq-kv_fp8_cast.yaml      | 68 +++++++++++++++++++
 3 files changed, 85 insertions(+), 19 deletions(-)
 create mode 100644 modelopt_recipes/huggingface/gemma4/ptq/README.md
 create mode 100644 modelopt_recipes/huggingface/gemma4/ptq/w4a8_awq-kv_fp8_cast.yaml

diff --git a/modelopt_recipes/huggingface/gemma/ptq/w4a8_awq-kv_fp8_cast.yaml b/modelopt_recipes/huggingface/gemma/ptq/w4a8_awq-kv_fp8_cast.yaml
index d7fdf592fd5..c649234088e 100644
--- a/modelopt_recipes/huggingface/gemma/ptq/w4a8_awq-kv_fp8_cast.yaml
+++ b/modelopt_recipes/huggingface/gemma/ptq/w4a8_awq-kv_fp8_cast.yaml
@@ -16,16 +16,6 @@
 # Gemma-specific W4A8 AWQ PTQ recipe with FP8 KV-cache cast. Uses a coarser
 # optimal-scale search (awq_lite with alpha_step=1) to avoid overflow observed
 # in TRT-LLM kernels when using the default AWQ search on Gemma.
-#
-# On multimodal Gemma checkpoints (e.g. gemma-4-31B-it), the bare
-# `*weight_quantizer` / `*input_quantizer` enables below also match the SigLIP
-# vision tower (`model.vision_tower.*`) and the multimodal embedding projection
-# (`model.embed_vision.*`). The vision tower's MLP in-features (4304) are not a
-# multiple of the INT4 block size (128), so INT4 weight packing at export hits a
-# device-side "index out of bounds" assert in pack_int4_in_uint8 (NVBug
-# 6294017). It is also accuracy-harmful to W4A8 the vision branch. The trailing
-# `*vision_tower*` / `*embed_vision*` disable rules keep that branch in BF16,
-# mirroring the vision exclusions in the qwen3_5 / nemotron_vl recipes.
 
 imports:
   base_disable_all: configs/ptq/units/base_disable_all
@@ -55,12 +45,3 @@ quantize:
         $import: fp8
     - $import: kv_fp8_cast
     - $import: default_disabled_quantizers
-    # Multimodal vision-branch exclusion: keep the SigLIP vision tower and the
-    # vision embedding projection in BF16. Must come after the `*weight_quantizer`
-    # / `*input_quantizer` enables above so the disable wins (later entries
-    # override earlier ones). Fixes NVBug 6294017 (INT4 packing index-out-of-bounds
-    # on the vision MLP whose in-features are not a multiple of the 128 block size).
-    - quantizer_name: '*vision_tower*'
-      enable: false
-    - quantizer_name: '*embed_vision*'
-      enable: false
diff --git a/modelopt_recipes/huggingface/gemma4/ptq/README.md b/modelopt_recipes/huggingface/gemma4/ptq/README.md
new file mode 100644
index 00000000000..849891f444a
--- /dev/null
+++ b/modelopt_recipes/huggingface/gemma4/ptq/README.md
@@ -0,0 +1,17 @@
+# Gemma 4 PTQ recipes
+
+Recipes for the **`gemma4`** model type (multimodal, e.g.
+[`google/gemma-4-31B-it`](https://huggingface.co/google/gemma-4-31B-it)). This is
+a distinct architecture from the text-only `gemma` model type — see
+[`../../gemma/ptq/`](../../gemma/ptq/) for that one. These recipes override the
+algorithm defaults that ship in the general PTQ presets because Gemma needs
+different settings to converge / stay accurate, and additionally exclude the
+multimodal vision branch from quantization.
+
+| Recipe | What's model-specific |
+|--------|-----------------------|
+| `w4a8_awq-kv_fp8_cast.yaml` | Uses `awq_lite` with `alpha_step: 1` instead of the default AWQ search (the default search overflows in TRT-LLM kernels on Gemma; the coarser sweep avoids it without measurably hurting accuracy). Excludes the SigLIP vision tower (`model.vision_tower.*`) and the vision embedding projection (`model.embed_vision.*`), keeping them in BF16 — quantizing them to INT4 crashes export (`pack_int4_in_uint8` index-out-of-bounds, NVBug 6294017) and is accuracy-harmful. Numerics: INT4 block weights + FP8 inputs + FP8 KV-cache cast (constant amax, no KV calibration). |
+
+The base numerics units and the standard disabled-quantizer list are inherited
+from the shared `configs/`; only the algorithm fields and the vision-branch
+exclusions are model-specific.
diff --git a/modelopt_recipes/huggingface/gemma4/ptq/w4a8_awq-kv_fp8_cast.yaml b/modelopt_recipes/huggingface/gemma4/ptq/w4a8_awq-kv_fp8_cast.yaml
new file mode 100644
index 00000000000..cb3bb719a92
--- /dev/null
+++ b/modelopt_recipes/huggingface/gemma4/ptq/w4a8_awq-kv_fp8_cast.yaml
@@ -0,0 +1,68 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Gemma 4 (model_type=gemma4, e.g. gemma-4-31B-it) W4A8 AWQ PTQ recipe with FP8
+# KV-cache cast. gemma4 is a multimodal architecture distinct from the text-only
+# `gemma` model type (see ../../gemma/ptq/). Uses a coarser optimal-scale search
+# (awq_lite with alpha_step=1) to avoid the overflow observed in TRT-LLM kernels
+# when using the default AWQ search on Gemma.
+#
+# The bare `*weight_quantizer` / `*input_quantizer` enables below also match the
+# SigLIP vision tower (`model.vision_tower.*`) and the multimodal embedding
+# projection (`model.embed_vision.*`). The vision tower's MLP in-features (4304)
+# are not a multiple of the INT4 block size (128), so INT4 weight packing at
+# export hits a device-side "index out of bounds" assert in pack_int4_in_uint8
+# (NVBug 6294017). It is also accuracy-harmful to W4A8 the vision branch. The
+# trailing `*vision_tower*` / `*embed_vision*` disable rules keep that branch in
+# BF16, mirroring the vision exclusions in the qwen3_5 / nemotron_vl recipes.
+
+imports:
+  base_disable_all: configs/ptq/units/base_disable_all
+  default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers
+  fp8: configs/numerics/fp8
+  int4_per_block: configs/numerics/int4_per_block
+  kv_fp8_cast: configs/ptq/units/kv_fp8_cast
+
+metadata:
+  recipe_type: ptq
+  description: >-
+    Gemma 4 (multimodal) W4A8 AWQ recipe with FP8 KV-cache cast: INT4 block
+    weights + FP8 inputs, awq_lite with alpha_step=1 (coarser search) to avoid
+    TRT-LLM overflow, plus FP8 KV-cache using constant amax (no KV calibration).
+    The SigLIP vision tower and vision embedding projection are kept in BF16.
+quantize:
+  algorithm:
+    method: awq_lite
+    alpha_step: 1
+  quant_cfg:
+    - $import: base_disable_all
+    - quantizer_name: '*weight_quantizer'
+      cfg:
+        - $import: int4_per_block
+        - $import: fp8
+    - quantizer_name: '*input_quantizer'
+      cfg:
+        $import: fp8
+    - $import: kv_fp8_cast
+    - $import: default_disabled_quantizers
+    # Multimodal vision-branch exclusion: keep the SigLIP vision tower and the
+    # vision embedding projection in BF16. Must come after the `*weight_quantizer`
+    # / `*input_quantizer` enables above so the disable wins (later entries
+    # override earlier ones). Fixes NVBug 6294017 (INT4 packing index-out-of-bounds
+    # on the vision MLP whose in-features are not a multiple of the 128 block size).
+    - quantizer_name: '*vision_tower*'
+      enable: false
+    - quantizer_name: '*embed_vision*'
+      enable: false

From 0cf494bfd1d0316f445e921cbba6c4ed52afa531 Mon Sep 17 00:00:00 2001
From: Zhiyu Cheng <zhiyuc@nvidia.com>
Date: Fri, 12 Jun 2026 10:28:31 -0700
Subject: [PATCH 3/3] gemma4: rely on shared vision exclusion; resolve
 default_disabled overlap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Follow-up to #1691 (merged) and meenchen's qwen3.6 vision-exclusion addition,
both of which landed `*vision_tower*` / `*visual*` in default_disabled_quantizers.

- default_disabled_quantizers.yaml: remove the duplicate bare `*visual*` /
  `*vision_tower*` entries (qwen3.6) now that the documented block already
  disables `*vision_tower*` / `*visual*` / `*embed_vision*`. One source of truth.
- gemma4 w4a8_awq recipe: drop the now-redundant explicit `*vision_tower*` /
  `*embed_vision*` excludes — they are inherited from the shared
  default_disabled_quantizers unit (imported last so its disables win). The
  recipe is now just the gemma-specific awq_lite alpha_step=1 numerics.
- Update the gemma4 recipe comment / README to reflect the shared-unit source.

Verified: load_recipe on the gemma4 recipe resolves `*vision_tower*` /
`*visual*` / `*embed_vision*` as disabled (via the shared unit) with
`*weight_quantizer` still enabled (INT4). Fixes NVBug 6294017.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
---
 .../units/default_disabled_quantizers.yaml    |  4 ---
 .../huggingface/gemma4/ptq/README.md          | 12 +++++----
 .../gemma4/ptq/w4a8_awq-kv_fp8_cast.yaml      | 26 +++++++------------
 3 files changed, 17 insertions(+), 25 deletions(-)

diff --git a/modelopt_recipes/configs/ptq/units/default_disabled_quantizers.yaml b/modelopt_recipes/configs/ptq/units/default_disabled_quantizers.yaml
index e2efcb5142d..87fd67300fa 100644
--- a/modelopt_recipes/configs/ptq/units/default_disabled_quantizers.yaml
+++ b/modelopt_recipes/configs/ptq/units/default_disabled_quantizers.yaml
@@ -38,10 +38,6 @@
     enable: false
   - quantizer_name: '*router*'
     enable: false
-  - quantizer_name: '*visual*'
-    enable: false
-  - quantizer_name: '*vision_tower*'
-    enable: false
   - quantizer_name: 'output.*'
     enable: false
   # Multimodal vision branch: keep the vision encoder (SigLIP / ViT) and any
diff --git a/modelopt_recipes/huggingface/gemma4/ptq/README.md b/modelopt_recipes/huggingface/gemma4/ptq/README.md
index 849891f444a..34c10a6ac1b 100644
--- a/modelopt_recipes/huggingface/gemma4/ptq/README.md
+++ b/modelopt_recipes/huggingface/gemma4/ptq/README.md
@@ -5,13 +5,15 @@ Recipes for the **`gemma4`** model type (multimodal, e.g.
 a distinct architecture from the text-only `gemma` model type — see
 [`../../gemma/ptq/`](../../gemma/ptq/) for that one. These recipes override the
 algorithm defaults that ship in the general PTQ presets because Gemma needs
-different settings to converge / stay accurate, and additionally exclude the
-multimodal vision branch from quantization.
+different settings to converge / stay accurate.
 
 | Recipe | What's model-specific |
 |--------|-----------------------|
-| `w4a8_awq-kv_fp8_cast.yaml` | Uses `awq_lite` with `alpha_step: 1` instead of the default AWQ search (the default search overflows in TRT-LLM kernels on Gemma; the coarser sweep avoids it without measurably hurting accuracy). Excludes the SigLIP vision tower (`model.vision_tower.*`) and the vision embedding projection (`model.embed_vision.*`), keeping them in BF16 — quantizing them to INT4 crashes export (`pack_int4_in_uint8` index-out-of-bounds, NVBug 6294017) and is accuracy-harmful. Numerics: INT4 block weights + FP8 inputs + FP8 KV-cache cast (constant amax, no KV calibration). |
+| `w4a8_awq-kv_fp8_cast.yaml` | Uses `awq_lite` with `alpha_step: 1` instead of the default AWQ search (the default search overflows in TRT-LLM kernels on Gemma; the coarser sweep avoids it without measurably hurting accuracy). Numerics: INT4 block weights + FP8 inputs + FP8 KV-cache cast (constant amax, no KV calibration). |
 
 The base numerics units and the standard disabled-quantizer list are inherited
-from the shared `configs/`; only the algorithm fields and the vision-branch
-exclusions are model-specific.
+from the shared `configs/`; only the algorithm fields are model-specific. The
+multimodal vision branch (`*vision_tower*` / `*visual*` / `*embed_vision*`) is
+kept in BF16 by the shared `default_disabled_quantizers` unit — quantizing it to
+INT4 crashes export (`pack_int4_in_uint8` index-out-of-bounds, NVBug 6294017) and
+hurts accuracy.
diff --git a/modelopt_recipes/huggingface/gemma4/ptq/w4a8_awq-kv_fp8_cast.yaml b/modelopt_recipes/huggingface/gemma4/ptq/w4a8_awq-kv_fp8_cast.yaml
index cb3bb719a92..28410cc2565 100644
--- a/modelopt_recipes/huggingface/gemma4/ptq/w4a8_awq-kv_fp8_cast.yaml
+++ b/modelopt_recipes/huggingface/gemma4/ptq/w4a8_awq-kv_fp8_cast.yaml
@@ -19,14 +19,14 @@
 # (awq_lite with alpha_step=1) to avoid the overflow observed in TRT-LLM kernels
 # when using the default AWQ search on Gemma.
 #
-# The bare `*weight_quantizer` / `*input_quantizer` enables below also match the
-# SigLIP vision tower (`model.vision_tower.*`) and the multimodal embedding
+# The bare `*weight_quantizer` / `*input_quantizer` enables below would also match
+# the SigLIP vision tower (`model.vision_tower.*`) and the multimodal embedding
 # projection (`model.embed_vision.*`). The vision tower's MLP in-features (4304)
-# are not a multiple of the INT4 block size (128), so INT4 weight packing at
-# export hits a device-side "index out of bounds" assert in pack_int4_in_uint8
-# (NVBug 6294017). It is also accuracy-harmful to W4A8 the vision branch. The
-# trailing `*vision_tower*` / `*embed_vision*` disable rules keep that branch in
-# BF16, mirroring the vision exclusions in the qwen3_5 / nemotron_vl recipes.
+# are not a multiple of the INT4 block size (128), so INT4-packing them at export
+# hits a device-side "index out of bounds" assert in pack_int4_in_uint8 (NVBug
+# 6294017); quantizing the vision branch to W4A8 also hurts accuracy. The vision
+# branch is kept in BF16 by the shared `default_disabled_quantizers` unit imported
+# below, which globally disables `*vision_tower*` / `*visual*` / `*embed_vision*`.
 
 imports:
   base_disable_all: configs/ptq/units/base_disable_all
@@ -56,13 +56,7 @@ quantize:
       cfg:
         $import: fp8
     - $import: kv_fp8_cast
+    # default_disabled_quantizers (imported last so its disables win) keeps the
+    # multimodal vision branch — `*vision_tower*` / `*visual*` / `*embed_vision*` —
+    # in BF16, preventing the INT4 pack_int4_in_uint8 crash (NVBug 6294017).
     - $import: default_disabled_quantizers
-    # Multimodal vision-branch exclusion: keep the SigLIP vision tower and the
-    # vision embedding projection in BF16. Must come after the `*weight_quantizer`
-    # / `*input_quantizer` enables above so the disable wins (later entries
-    # override earlier ones). Fixes NVBug 6294017 (INT4 packing index-out-of-bounds
-    # on the vision MLP whose in-features are not a multiple of the 128 block size).
-    - quantizer_name: '*vision_tower*'
-      enable: false
-    - quantizer_name: '*embed_vision*'
-      enable: false