29 changes: 21 additions & 8 deletions examples/llm_ptq/hf_ptq.py
@@ -51,6 +51,7 @@
 import modelopt.torch.sparsity as mts
 from modelopt.torch.export import (
     export_hf_checkpoint,
+    export_hf_vllm_fq_checkpoint,
     export_speculative_decoding,
     export_tensorrt_llm_checkpoint,
     get_model_type,
@@ -650,16 +651,21 @@ def export_quantized(
 
     # Load any missing weights from non-standard safetensors (handled in get_model for non-low-memory mode)
     # Store the MTP layer prefixes on the model for later exclusion from quantization
-    mtp_layer_prefixes, mtp_state_dict = load_mtp_weights(full_model, args.pyt_ckpt_path)
+    if args.vllm_fakequant_export:
+        export_hf_vllm_fq_checkpoint(full_model, export_dir=export_path)
+    else:
+        mtp_layer_prefixes, mtp_state_dict = load_mtp_weights(
+            full_model, args.pyt_ckpt_path
+        )
 
-    if mtp_layer_prefixes:
-        full_model._mtp_layer_prefixes = mtp_layer_prefixes
+        if mtp_layer_prefixes:
+            full_model._mtp_layer_prefixes = mtp_layer_prefixes
 
-    export_hf_checkpoint(
-        full_model,
-        export_dir=export_path,
-        extra_state_dict=mtp_state_dict,
-    )
+        export_hf_checkpoint(
+            full_model,
+            export_dir=export_path,
+            extra_state_dict=mtp_state_dict,
+        )
 
     # Restore default padding and export the tokenizer as well.
     if tokenizer is not None:
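The branching added in the hunk above can be sketched as a standalone snippet. The export functions below are placeholders for the real ones in `modelopt.torch.export`, and `args` is flattened into an explicit parameter; the point is that the vLLM fake-quant path bypasses the MTP weight handling entirely:

```python
from types import SimpleNamespace

# Record which exporter runs; the real functions live in
# modelopt.torch.export and are replaced by placeholders here.
calls = []

def export_hf_vllm_fq_checkpoint(model, export_dir):  # placeholder
    calls.append(("vllm_fq", export_dir))

def export_hf_checkpoint(model, export_dir, extra_state_dict=None):  # placeholder
    calls.append(("hf", export_dir))

def load_mtp_weights(model, ckpt_path):  # placeholder
    return [], {}  # (mtp_layer_prefixes, mtp_state_dict)

def export_quantized(full_model, export_path, vllm_fakequant_export):
    if vllm_fakequant_export:
        # vLLM fake-quant path: MTP weight handling is skipped entirely.
        export_hf_vllm_fq_checkpoint(full_model, export_dir=export_path)
    else:
        mtp_layer_prefixes, mtp_state_dict = load_mtp_weights(full_model, "ckpt")
        if mtp_layer_prefixes:
            full_model._mtp_layer_prefixes = mtp_layer_prefixes
        export_hf_checkpoint(
            full_model, export_dir=export_path, extra_state_dict=mtp_state_dict
        )

export_quantized(SimpleNamespace(), "out", vllm_fakequant_export=True)
export_quantized(SimpleNamespace(), "out", vllm_fakequant_export=False)
```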
@@ -1016,6 +1022,13 @@ def parse_args() -> argparse.Namespace:
         default=512,
     )
     parser.add_argument("--export_path", default="exported_model")
+    parser.add_argument(
+        "--vllm_fakequant_export",
+        default=False,
+        action="store_true",
+        help="Export as vLLM fake-quant checkpoint (produces vllm_fq_modelopt_state.pth "
+        "for use with vllm_serve_fakequant.py).",
+    )
Comment on lines +1025 to +1031 (Contributor): move this to the end of arguments
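The new flag is a plain `store_true` argument, so its behavior can be verified in isolation (the surrounding parser setup from the diff is elided here):

```python
import argparse

# Standalone sketch of the flag added in the diff above: with
# action="store_true" it defaults to False and flips to True only
# when passed on the command line.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--vllm_fakequant_export",
    default=False,
    action="store_true",
    help="Export as vLLM fake-quant checkpoint.",
)

assert parser.parse_args([]).vllm_fakequant_export is False
assert parser.parse_args(["--vllm_fakequant_export"]).vllm_fakequant_export is True
```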

     parser.add_argument(
         "--dataset",
         help=(
19 changes: 16 additions & 3 deletions examples/vllm_serve/README.md
@@ -60,17 +60,30 @@ lm_eval --model local-completions --tasks gsm8k --model_args model=<model_name>,
 
 Step 1: export the model with bf16 weights and quantizer state. To export the model:
 
-- For **HF** models, use `hf_ptq_export.py`:
+- For **HF** models, use `examples/llm_ptq/hf_ptq.py` with `--vllm_fakequant_export`:
 
   ```bash
-  python hf_ptq_export.py \
+  python ../llm_ptq/hf_ptq.py \
       --pyt_ckpt_path <MODEL_PATH> \
-      --quant_cfg NVFP4_DEFAULT_CFG \
+      --qformat nvfp4 \
       --calib_size 512 \
       --export_path <EXPORT_DIR> \
+      --vllm_fakequant_export \
       --trust_remote_code
   ```
 
+  This creates `<EXPORT_DIR>/vllm_fq_modelopt_state.pth` (the ModelOpt quantizer state for vLLM fake-quant reload) and saves the HF-exported model (config, tokenizer, and weights) under `<EXPORT_DIR>`.
+
+  Alternatively, the **deprecated** `hf_ptq_export.py` script offers a simpler interface (prefer `hf_ptq.py` with `--vllm_fakequant_export`):
+
+  ```bash
+  python hf_ptq_export.py \
+      --pyt_ckpt_path <MODEL_PATH> \
+      --quant_cfg NVFP4_DEFAULT_CFG \
+      --export_path <EXPORT_DIR> \
+      --trust_remote_code
+  ```
 
Comment on lines +76 to +86 (Contributor): We should remove hf_ptq_export.py -> in my understanding examples/llm_ptq/hf_ptq.py should be sufficient. Is that correct @kinjalpatel27 ?

Comment on lines +76 to +86 (Contributor, suggested change): remove the `hf_ptq_export.py` alternative above (the "Alternatively, …" paragraph and its code block).

Note: `--pyt_ckpt_path` can point to either an HF checkpoint or a ModelOpt-saved checkpoint (e.g., a QAT/QAD checkpoint produced by `examples/llm_qat/main.py`). If the input checkpoint is already quantized, the script will **skip re-quantization** and only export artifacts for vLLM fakequant reload.
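As a quick post-export sanity check, the directory can be scanned for the expected artifacts. This is a hedged sketch: `vllm_fq_modelopt_state.pth` comes from the README above, while `config.json` is the usual HF config filename and is an assumption here:

```python
from pathlib import Path

def missing_vllm_fq_artifacts(export_dir):
    """Return the expected export artifacts missing from export_dir."""
    expected = [
        "vllm_fq_modelopt_state.pth",  # ModelOpt quantizer state (per this README)
        "config.json",                 # HF model config (assumed standard HF layout)
    ]
    root = Path(export_dir)
    return [name for name in expected if not (root / name).is_file()]
```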

- For **MCore** models, use `modelopt.torch.export.export_mcore_gpt_to_hf_vllm_fq`: