NVIDIA · Edwardf0t1 · Jun 11, 2026 · Jun 12, 2026 · Jun 12, 2026
@@ -38,10 +38,6 @@
     enable: false
   - quantizer_name: '*router*'
     enable: false
-  - quantizer_name: '*visual*'
-    enable: false
-  - quantizer_name: '*vision_tower*'
-    enable: false
   - quantizer_name: 'output.*'
     enable: false
   # Multimodal vision branch: keep the vision encoder (SigLIP / ViT) and any

@@ -0,0 +1,19 @@
+# Gemma 4 PTQ recipes
+
+Recipes for the **`gemma4`** model type (multimodal, e.g.
+[`google/gemma-4-31B-it`](https://huggingface.co/google/gemma-4-31B-it)). This is
+a distinct architecture from the text-only `gemma` model type — see
+[`../../gemma/ptq/`](../../gemma/ptq/) for that one. These recipes override the
+algorithm defaults that ship in the general PTQ presets because Gemma 4 needs
+different settings to avoid kernel overflow and stay accurate.
+
+| Recipe | What's model-specific |
+|--------|-----------------------|
+| `w4a8_awq-kv_fp8_cast.yaml` | Uses `awq_lite` with `alpha_step: 1` instead of the default AWQ search (the default search overflows in TRT-LLM kernels on Gemma; the coarser sweep avoids it without measurably hurting accuracy). Numerics: INT4 block weights + FP8 inputs + FP8 KV-cache cast (constant amax, no KV calibration). |
+
+The base numerics units and the standard disabled-quantizer list are inherited
+from the shared `configs/`; only the algorithm fields are model-specific. The
+multimodal vision branch (`*vision_tower*` / `*visual*` / `*embed_vision*`) is
+kept in BF16 by the shared `default_disabled_quantizers` unit — quantizing it to
+INT4 crashes export (`pack_int4_in_uint8` index-out-of-bounds, NVBug 6294017) and
+also hurts accuracy.
@@ -0,0 +1,62 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Gemma 4 (model_type=gemma4, e.g. gemma-4-31B-it) W4A8 AWQ PTQ recipe with FP8
+# KV-cache cast. gemma4 is a multimodal architecture distinct from the text-only
+# `gemma` model type (see ../../gemma/ptq/). Uses a coarser optimal-scale search
+# (awq_lite with alpha_step=1) to avoid the overflow observed in TRT-LLM kernels
+# when using the default AWQ search on Gemma.
+#
+# The bare `*weight_quantizer` / `*input_quantizer` enables below would also match
+# the SigLIP vision tower (`model.vision_tower.*`) and the multimodal embedding
+# projection (`model.embed_vision.*`). The vision tower's MLP in-features (4304)
+# are not a multiple of the INT4 block size (128), so INT4-packing them at export
+# hits a device-side "index out of bounds" assert in pack_int4_in_uint8 (NVBug
+# 6294017); quantizing the vision branch to W4A8 also hurts accuracy. The vision
+# branch is kept in BF16 by the shared `default_disabled_quantizers` unit imported
+# below, which globally disables `*vision_tower*` / `*visual*` / `*embed_vision*`.
+
+imports:  # alias -> shared config path; each alias is referenced below via $import
+  base_disable_all: configs/ptq/units/base_disable_all
+  default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers
+  fp8: configs/numerics/fp8
+  int4_per_block: configs/numerics/int4_per_block
+  kv_fp8_cast: configs/ptq/units/kv_fp8_cast
+
+metadata:
+  recipe_type: ptq
+  description: >-
+    Gemma 4 (multimodal) W4A8 AWQ recipe with FP8 KV-cache cast: INT4 block
+    weights + FP8 inputs, awq_lite with alpha_step=1 (coarser search) to avoid
+    TRT-LLM overflow, plus FP8 KV-cache using constant amax (no KV calibration).
+    The SigLIP vision tower and vision embedding projection are kept in BF16.
+quantize:
+  algorithm:
+    method: awq_lite
+    alpha_step: 1  # coarser than the default AWQ search; avoids TRT-LLM overflow
+  quant_cfg:
+    - $import: base_disable_all
+    - quantizer_name: '*weight_quantizer'  # bare glob: also matches the vision branch
+      cfg:
+        - $import: int4_per_block  # INT4 block weights
+        - $import: fp8  # NOTE(review): presumably the FP8 stage of W4A8 — confirm
+    - quantizer_name: '*input_quantizer'  # bare glob: also matches the vision branch
+      cfg:
+        $import: fp8  # FP8 inputs
+    - $import: kv_fp8_cast  # FP8 KV-cache using constant amax (no KV calibration)
+    # default_disabled_quantizers (imported last so its disables win) keeps the
+    # multimodal vision branch — `*vision_tower*` / `*visual*` / `*embed_vision*` —
+    # in BF16, preventing the INT4 pack_int4_in_uint8 crash (NVBug 6294017).
+    - $import: default_disabled_quantizers