Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 21 additions & 26 deletions examples/llm_ptq/scripts/huggingface_example.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,18 +49,7 @@ dense | sparsegpt) ;;
;;
esac

#Iterate over list of qformats provided and check if they are valid
IFS=","
for qformat in $QFORMAT; do
case $qformat in
fp8 | fp8_pc_pt | fp8_pb_wo | int8_wo | int8_sq | int4_awq | w4a8_awq | fp16 | bf16 | nvfp4 | nvfp4_awq | nvfp4_mse | w4a8_nvfp4_fp8 | w4a8_mxfp4_fp8 | nvfp4_experts_only | nvfp4_mlp_only | nvfp4_omlp_only | nvfp4_svdquant | mxfp8 | nvfp4_local_hessian) ;;
*)
echo "Unknown quant argument: Expected one of: [fp8, fp8_pc_pt, fp8_pb_wo, int8_wo, int8_sq, int4_awq, w4a8_awq, fp16, bf16, nvfp4, nvfp4_awq, nvfp4_mse, w4a8_nvfp4_fp8, w4a8_mxfp4_fp8, nvfp4_experts_only, nvfp4_mlp_only, nvfp4_omlp_only, nvfp4_svdquant, mxfp8, nvfp4_local_hessian]" >&2
exit 1
;;
esac
done
IFS=" "
# Quant format / recipe validation is delegated to hf_ptq.py.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bot comment.

Regression: deleting the for qformat in $QFORMAT; do … done loop also drops the implicit binding of the lowercase loop variable $qformat, which is still used below at if [ "$qformat" == "bf16" ] || [ "$qformat" == "fp16" ]. With the loop removed, $qformat is empty and that bf16/fp16 shortcut (which symlinks the source model into $SAVE_PATH and marks MODEL_CONFIG_EXIST=true) will never trigger — users running --quant=bf16 or --quant=fp16 will now fall through to python hf_ptq.py --qformat=bf16 instead. Either replace $qformat with $QFORMAT in that check, or add a dedicated qformat="$QFORMAT" assignment here.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we still need bf16/fp16 path anyway? Maybe we can deprecate them

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure if we still have any use cases where we quantize FP32 to FP16.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah I think we can delete. Let me add this to the PR


script_dir="$(dirname "$(readlink -f "$0")")"

Expand All @@ -72,7 +61,14 @@ fi

QFORMAT_MODIFIED="${QFORMAT//,/_}"

MODEL_NAME=$(basename $MODEL_PATH | sed 's/[^0-9a-zA-Z\-]/_/g')_${QFORMAT_MODIFIED}${KV_CACHE_QUANT:+_kv_${KV_CACHE_QUANT}}
# When using --recipe, build the model name from the recipe basename (without
# directory or .yaml suffix) so each recipe gets its own SAVE_PATH.
if [ -n "$RECIPE" ]; then
    # Sanitize: every character outside [0-9a-zA-Z-] becomes "_" so the tag is
    # filesystem-safe. NOTE(review): only a ".yaml" suffix is stripped by
    # basename here; a ".yml" recipe file would keep "_yml" in its tag.
    RECIPE_TAG=$(basename "$RECIPE" .yaml | sed 's/[^0-9a-zA-Z\-]/_/g')
    MODEL_NAME=$(basename "$MODEL_PATH" | sed 's/[^0-9a-zA-Z\-]/_/g')_recipe_${RECIPE_TAG}
else
    # Default naming: sanitized model basename + qformat tag, plus an optional
    # "_kv_<fmt>" suffix when KV-cache quantization is requested.
    MODEL_NAME=$(basename "$MODEL_PATH" | sed 's/[^0-9a-zA-Z\-]/_/g')_${QFORMAT_MODIFIED}${KV_CACHE_QUANT:+_kv_${KV_CACHE_QUANT}}
fi

SAVE_PATH=${ROOT_SAVE_PATH}/saved_models_${MODEL_NAME}

Expand Down Expand Up @@ -164,24 +160,18 @@ fi

if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH) ]]; then

if [ "$qformat" == "bf16" ] || [ "$qformat" == "fp16" ]; then
if [ -d "$MODEL_PATH" ]; then
MODEL_CONFIG_EXIST=true
MODEL_CONFIG=$MODEL_PATH/config.json
for file in $MODEL_PATH/*; do ln -sf "$file" $SAVE_PATH/; done
else
echo "Please use the model directory where the config.json file is present."
exit 1
fi
fi

if [[ "$MODEL_CONFIG_EXIST" == false ]]; then
echo "Quantizing original model..."
if [ -n "$RECIPE" ]; then
QUANT_SPEC_ARGS="--recipe=$RECIPE"
else
QUANT_SPEC_ARGS="--qformat=${QFORMAT// /,}"
fi
python hf_ptq.py \
--pyt_ckpt_path=$MODEL_PATH \
--export_path=$SAVE_PATH \
--sparsity_fmt=$SPARSITY_FMT \
--qformat="${QFORMAT// /,}" \
$QUANT_SPEC_ARGS \
--calib_size=$CALIB_SIZE \
--batch_size=$CALIB_BATCH_SIZE \
--inference_tensor_parallel=$TP \
Expand All @@ -203,7 +193,7 @@ if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH
exit 0
fi

if [[ "$QFORMAT" == *"nvfp4"* ]] || [[ "$KV_CACHE_QUANT" == *"nvfp4"* ]]; then
if [[ "$QFORMAT" == *"nvfp4"* ]] || [[ "$KV_CACHE_QUANT" == *"nvfp4"* ]] || [[ "$RECIPE" == *"nvfp4"* ]]; then
cuda_major=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader -i 0 | cut -d. -f1)

if [ "$cuda_major" -lt 10 ]; then
Expand All @@ -212,6 +202,11 @@ if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH
fi
fi

# Recipe-driven runs stop after the quantized checkpoint is exported; the
# remaining steps in this script are skipped (deploy with TensorRT-LLM instead).
if [ -n "$RECIPE" ]; then
    echo "Recipe $RECIPE used. Please deploy with TensorRT-LLM directly. Checkpoint export_path: $SAVE_PATH"
    exit 0
fi

if [[ ! " fp8 nvfp4 bf16 fp16 " =~ " ${QFORMAT} " ]]; then
echo "Quant $QFORMAT specified. Please read TensorRT-LLM quantization support matrix https://nvidia.github.io/TensorRT-LLM/features/quantization.html#quantization-in-tensorrt-llm and use TensorRT-LLM for deployment. Checkpoint export_path: $SAVE_PATH"
exit 0
Expand Down
16 changes: 13 additions & 3 deletions examples/llm_ptq/scripts/parser.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ parse_options() {
# Default values
MODEL_PATH=""
QFORMAT=""
RECIPE=""
KV_CACHE_QUANT=""
TP=1
PP=1
Expand All @@ -37,13 +38,14 @@ parse_options() {
CAST_MXFP4_TO_NVFP4=false

# Parse command-line options
ARGS=$(getopt -o "" -l "model:,quant:,kv_cache_quant:,tp:,pp:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,output:,batch:,tasks:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:,calib_seq:,auto_quantize_method:,auto_quantize_score_size:,auto_quantize_checkpoint:,moe_calib_experts_ratio:,cast_mxfp4_to_nvfp4" -n "$0" -- "$@")
ARGS=$(getopt -o "" -l "model:,quant:,recipe:,kv_cache_quant:,tp:,pp:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,output:,batch:,tasks:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:,calib_seq:,auto_quantize_method:,auto_quantize_score_size:,auto_quantize_checkpoint:,moe_calib_experts_ratio:,cast_mxfp4_to_nvfp4" -n "$0" -- "$@")

eval set -- "$ARGS"
while true; do
case "$1" in
--model ) MODEL_PATH="$2"; shift 2;;
--quant ) QFORMAT="$2"; shift 2;;
--recipe ) RECIPE="$2"; shift 2;;
--kv_cache_quant ) KV_CACHE_QUANT="$2"; shift 2;;
--tp ) TP="$2"; shift 2;;
--pp ) PP="$2"; shift 2;;
Expand Down Expand Up @@ -99,12 +101,19 @@ parse_options() {
fi

# Verify required options are provided
if [ -z "$MODEL_PATH" ] || [ -z "$QFORMAT" ] || [ -z "$TASKS" ]; then
echo "Usage: $0 --model=<MODEL_PATH> --quant=<QFORMAT> --tasks=<TASK,...>"
if [ -z "$MODEL_PATH" ] || [ -z "$TASKS" ] || ([ -z "$QFORMAT" ] && [ -z "$RECIPE" ]); then
echo "Usage: $0 --model=<MODEL_PATH> (--quant=<QFORMAT> | --recipe=<RECIPE>) --tasks=<TASK,...>"
echo "Optional args: --sparsity=<SPARSITY_FMT> --awq_block_size=<AWQ_BLOCK_SIZE> --calib=<CALIB_SIZE>"
exit 1
fi

# --quant and --recipe are mutually exclusive: --recipe is a full PTQ spec, while
# --quant selects a built-in qformat preset. Pick exactly one.
if [ -n "$QFORMAT" ] && [ -n "$RECIPE" ]; then
echo "Cannot specify both --quant and --recipe; pick one." >&2
exit 1
fi

VALID_TASKS=("quant" "mmlu" "lm_eval" "livecodebench" "simple_eval")

for task in $(echo "$TASKS" | tr ',' ' '); do
Expand Down Expand Up @@ -135,6 +144,7 @@ parse_options() {
echo "================="
echo "model: $MODEL_PATH"
echo "quant: $QFORMAT"
echo "recipe: $RECIPE"
echo "tp (TensorRT-LLM Checkpoint only): $TP"
echo "pp (TensorRT-LLM Checkpoint only): $PP"
echo "sparsity: $SPARSITY_FMT"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bot comment.

Stylistic nit: sibling recipes in this directory follow the <numerics>-kv_<kv_fmt> naming pattern (e.g. nvfp4_experts_only-kv_fp8.yaml, nvfp4_default-kv_fp8_cast.yaml). This file uses _mse-fp8_cast_kv which flips the KV descriptor order. Consider renaming to something like nvfp4_experts_only_mse-kv_fp8_cast.yaml for consistency with the existing convention. Same for nvfp4_mlp_only_mse-fp8_cast_kv.yaml.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agree.

# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Reusable config fragments referenced below via `$import`.
imports:
  base_disable_all: configs/ptq/units/base_disable_all
  default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers
  nvfp4: configs/numerics/nvfp4
  nvfp4_static: configs/numerics/nvfp4_static
  kv_fp8_cast: configs/ptq/units/kv_fp8_cast

metadata:
  recipe_type: ptq
  description: NVFP4 static weight (MSE FP8-scale sweep) and dynamic activation for expert layers only (W4A4), FP8 KV cache with constant amax.
quantize:
  algorithm:
    method: mse
    fp8_scale_sweep: true
    # layerwise=false required for VLMs where the decoder layers are nested under
    # `model.language_model.layers` (layerwise_calibrate can't find them otherwise).
    layerwise: false
  # NOTE(review): entry order appears significant — disable everything first,
  # then enable NVFP4 on expert modules only; confirm against the recipe loader.
  quant_cfg:
    - $import: base_disable_all
    # Expert weights use static NVFP4; expert activations use dynamic NVFP4.
    - quantizer_name: '*mlp.experts*weight_quantizer'
      cfg:
        $import: nvfp4_static
    - quantizer_name: '*mlp.experts*input_quantizer'
      cfg:
        $import: nvfp4
    # Same treatment for MoE implementations named `block_sparse_moe`.
    - quantizer_name: '*block_sparse_moe*weight_quantizer'
      cfg:
        $import: nvfp4_static
    - quantizer_name: '*block_sparse_moe*input_quantizer'
      cfg:
        $import: nvfp4
    - $import: kv_fp8_cast
    - $import: default_disabled_quantizers
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Reusable config fragments referenced below via `$import`.
imports:
  base_disable_all: configs/ptq/units/base_disable_all
  default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers
  nvfp4: configs/numerics/nvfp4
  nvfp4_static: configs/numerics/nvfp4_static
  kv_fp8_cast: configs/ptq/units/kv_fp8_cast

metadata:
  recipe_type: ptq
  description: NVFP4 static weight (MSE FP8-scale sweep) and dynamic activation for MLP/MoE linear layers (W4A4), FP8 KV cache with constant amax.
quantize:
  algorithm:
    method: mse
    fp8_scale_sweep: true
    # layerwise=false required for VLMs where the decoder layers are nested under
    # `model.language_model.layers` (layerwise_calibrate can't find them otherwise).
    layerwise: false
  # NOTE(review): entry order appears significant — disable everything first,
  # then enable NVFP4 on MLP/MoE modules; confirm against the recipe loader.
  quant_cfg:
    - $import: base_disable_all
    # Dense MLP linears: static NVFP4 weights, dynamic NVFP4 activations.
    - quantizer_name: '*mlp*weight_quantizer'
      cfg:
        $import: nvfp4_static
    - quantizer_name: '*mlp*input_quantizer'
      cfg:
        $import: nvfp4
    # MoE implementations named `block_sparse_moe` get the same treatment.
    - quantizer_name: '*block_sparse_moe*weight_quantizer'
      cfg:
        $import: nvfp4_static
    - quantizer_name: '*block_sparse_moe*input_quantizer'
      cfg:
        $import: nvfp4
    # Expert submodules (`.experts.`) that the broader patterns may not match.
    - quantizer_name: '*.experts.*weight_quantizer'
      cfg:
        $import: nvfp4_static
    - quantizer_name: '*.experts.*input_quantizer'
      cfg:
        $import: nvfp4
    - $import: kv_fp8_cast
    - $import: default_disabled_quantizers
Loading