Qualcomm AI Engine Direct - Support for LPAI in cli.py (pytorch#18995)

shewu-quic · web-flow · commit cd811565c1e0 · 2026-04-20T09:40:32.000-07:00
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -8746,6 +8746,9 @@ def required_envs(self, conditions=None) -> bool:
         )
 
     def test_cli(self):
+        # TODO: Add GPU support to cli.py
+        if get_backend_type(self.backend) == QnnExecuTorchBackendType.kGpuBackend:
+            self.skipTest("Currently, the GPU does not support CLI.")
         with tempfile.TemporaryDirectory() as tmp_dir:
             sample_input = torch.randn(1, 2, 3, 4)
             ep = torch.export.export(Relu(), (sample_input,))  # noqa: F405
@@ -8768,6 +8771,8 @@ def test_cli(self):
                 f"{tmp_dir}/input_list",
                 "--soc_model",
                 self.soc_model,
+                "--backend",
+                self.backend,
             ]
             subprocess.run(cmds, stdout=subprocess.DEVNULL)
             self.assertTrue(os.path.isfile(f"{tmp_dir}/q_out/relu_quantized.pt2"))
@@ -8783,6 +8788,8 @@ def test_cli(self):
                 f"{tmp_dir}/c_out",
                 "--soc_model",
                 self.soc_model,
+                "--backend",
+                self.backend,
             ]
             subprocess.run(cmds, stdout=subprocess.DEVNULL)
             self.assertTrue(os.path.isfile(f"{tmp_dir}/c_out/relu_quantized.pte"))
@@ -8807,13 +8814,18 @@ def test_cli(self):
                 self.target,
                 "--device",
                 self.device,
+                "--backend",
+                self.backend,
             ]
             if self.host:
                 cmds.extend(["--host", self.host])
             subprocess.run(cmds, stdout=subprocess.DEVNULL)
             self.assertTrue(os.path.isfile(f"{tmp_dir}/e_out/Result_0/output_0.pt"))
 
     def test_cli_with_input_list_assignment(self):
+        # TODO: Add GPU support to cli.py
+        if get_backend_type(self.backend) == QnnExecuTorchBackendType.kGpuBackend:
+            self.skipTest("Currently, the GPU does not support CLI.")
         with tempfile.TemporaryDirectory() as tmp_dir:
             sample_input = torch.randn(1, 2, 3, 4)
             sample_input2 = torch.randn(1, 2, 3, 4)
@@ -8840,6 +8852,8 @@ def test_cli_with_input_list_assignment(self):
                 f"{tmp_dir}/input_list",
                 "--soc_model",
                 self.soc_model,
+                "--backend",
+                self.backend,
             ]
             subprocess.run(cmds, stdout=subprocess.DEVNULL)
             self.assertTrue(os.path.isfile(f"{tmp_dir}/q_out/sub_quantized.pt2"))
@@ -8855,6 +8869,8 @@ def test_cli_with_input_list_assignment(self):
                 f"{tmp_dir}/c_out",
                 "--soc_model",
                 self.soc_model,
+                "--backend",
+                self.backend,
             ]
             subprocess.run(cmds, stdout=subprocess.DEVNULL)
             self.assertTrue(os.path.isfile(f"{tmp_dir}/c_out/sub_quantized.pte"))
@@ -8879,6 +8895,8 @@ def test_cli_with_input_list_assignment(self):
                 self.build_folder,
                 "--input_list",
                 f"{tmp_dir}/input_list",
+                "--backend",
+                self.backend,
             ]
             if self.host:
                 cmds.extend(["--host", self.host])
diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py
@@ -1116,7 +1116,7 @@ def generate_lpai_compiler_spec(
     )
 
 
-def generate_qnn_executorch_compiler_spec(
+def generate_qnn_executorch_compiler_spec(  # noqa: C901
     soc_model: QcomChipset,
     backend_options: QnnExecuTorchBackendOptions,
     debug: bool = False,
@@ -1224,6 +1224,21 @@ def generate_qnn_executorch_compiler_spec(
     ):
         raise ValueError("LPAI does not support online prepare.")
 
+    if backend_options.backend_type == QnnExecuTorchBackendType.kLpaiBackend:
+        if soc_model.name not in get_soc_to_lpai_hw_ver_map():
+            raise ValueError(
+                f"Target soc_model({soc_model.name}) doesn't support LPAI backend. \n"
+                "Please choose the following SOC: "
+                f"{list(get_soc_to_lpai_hw_ver_map().keys())}"
+            )
+        elif get_soc_to_lpai_hw_ver_map()[
+            soc_model.name
+        ] == LpaiHardwareVersion.V6 and is_qnn_sdk_version_less_than("2.39"):
+            raise ValueError(
+                f"Target soc_model({soc_model.name}) with LPAI backend v6 requires QNN SDK version >= 2.39. \n"
+                f"Current QNN SDK version: {get_sdk_build_id()}"
+            )
+
     qnn_executorch_options.shared_buffer = shared_buffer
     qnn_executorch_options.online_prepare = online_prepare
     qnn_executorch_options.is_from_context_binary = is_from_context_binary
diff --git a/examples/qualcomm/util_scripts/README.md b/examples/qualcomm/util_scripts/README.md
@@ -36,14 +36,14 @@ This tool aims for users who want to deploy models with ExecuTorch runtime. It's
           current_input += f"{file_name} "
       input_list += f"{current_input.strip()}\n"
 
-  with open(f"{ws}/input_list", 'w') as f:
+  with open(f"{ws}/input_list.txt", 'w') as f:
       f.write(input_list)
   ```
 
 * Quantize
   ```bash 
   # user could get more information via: PYTHONPATH=.. python -m examples.qualcomm.util_scripts.cli quantize -h
-  PYTHONPATH=.. python -m examples.qualcomm.util_scripts.cli quantize -a cli_example/simple_model.pt2 -o cli_example/quantize_output -c use_8a8w -i cli_example/input_list.txt --per_channel
+  PYTHONPATH=.. python -m examples.qualcomm.util_scripts.cli quantize -a cli_example/simple_model.pt2 -o cli_example/quantize_output -c use_8a8w -i cli_example/input_list.txt --per_channel -m SM8750 --backend htp
   ```
 * Artifacts for quantized .pt2 file
   - `cli_example/quantize_output/simple_model_quantized.pt2`
@@ -61,7 +61,7 @@ This tool aims for users who want to deploy models with ExecuTorch runtime. It's
   ```bash
   # `pip install pydot` if package is missing
   # user could get more information via: PYTHONPATH=.. python -m examples.qualcomm.util_scripts.cli compile -h
-  PYTHONPATH=.. python -m examples.qualcomm.util_scripts.cli compile -a model.bin -o path/to/model/output -m SM8750
+  PYTHONPATH=.. python -m examples.qualcomm.util_scripts.cli compile -a model.bin -o path/to/model/output -m SM8750 --backend htp
   ```
 * Artifacts for .pte file and figure of graph information
   - `cli_example/compile_output/simple_model_quantized.pte`
@@ -72,7 +72,7 @@ This tool aims for users who want to deploy models with ExecuTorch runtime. It's
 * Execute .pte program
   ```bash
   # user could get more information via: PYTHONPATH=.. python -m examples.qualcomm.util_scripts.cli execute -h
-  PYTHONPATH=.. python -m examples.qualcomm.util_scripts.cli execute -a cli_example/compile_output/simple_model_quantized.pte -o cli_example/execute_output -i cli_example/input_list.txt -H $HOST_NAME -s $DEVICE_SERIAL -b build-android -m SM8750
+  PYTHONPATH=.. python -m examples.qualcomm.util_scripts.cli execute -a cli_example/compile_output/simple_model_quantized.pte -o cli_example/execute_output -i cli_example/input_list.txt -H $HOST_NAME -s $DEVICE_SERIAL -b build-android -m SM8750 --backend htp
   ```
 * Artifacts for .pte file and figure of graph information
   - `cli_example/execute_output/output_{data_index}_{output_index}.pt`.<br/>
diff --git a/examples/qualcomm/util_scripts/cli.py b/examples/qualcomm/util_scripts/cli.py
@@ -30,13 +30,18 @@
     SimpleADB,
 )
 from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype
-from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset
+from executorch.backends.qualcomm.serialization.qc_schema import (
+    QcomChipset,
+    QnnExecuTorchBackendType,
+    QnnExecuTorchLpaiTargetEnv,
+)
 from executorch.backends.qualcomm.utils.constants import QCOM_PASS_ACTIVATE_KEY
 from executorch.backends.qualcomm.utils.utils import (
     draw_graph,
     dump_context_from_pte,
     from_context_binary,
     generate_htp_compiler_spec,
+    generate_lpai_compiler_spec,
     generate_qnn_executorch_compiler_spec,
     generate_qnn_executorch_option,
     QNN_QUANT_TYPE_MAP,
@@ -104,7 +109,7 @@ def fill_tensor_info(info, qnn_tensors, category):
         qnn_mgr = PyQnnManagerAdaptor.QnnManager(
             generate_qnn_executorch_option(compiler_specs), ctx_bin
         )
-        assert qnn_mgr.Init().value == 0, "failed to load context binary"
+        assert qnn_mgr.Init().value == 0, "failed to initialize backend"
         graph_name = qnn_mgr.GetGraphNames()[0]
         qnn_mgr.AllocateTensor(graph_name)
         fill_tensor_info(tensor_info, qnn_mgr.GetGraphInputs(graph_name), in_key)
@@ -206,8 +211,17 @@ def compile(args):
 
     file_name, extension = Path(args.artifact).stem, Path(args.artifact).suffix
     os.makedirs(args.output_folder, exist_ok=True)
-    # setup compiler spec dedicated to QNN HTP backend
-    backend_options = generate_htp_compiler_spec(use_fp16=True)
+    # set up the backend-specific compiler spec (HTP or LPAI)
+    backend_type = get_backend_type(args.backend)
+    match backend_type:
+        case QnnExecuTorchBackendType.kHtpBackend:
+            backend_options = generate_htp_compiler_spec(use_fp16=True)
+        case QnnExecuTorchBackendType.kLpaiBackend:
+            backend_options = generate_lpai_compiler_spec(
+                target_env=QnnExecuTorchLpaiTargetEnv.kArm
+            )
+        case _:
+            raise ValueError("Backend is not implemented yet")
     # setup general compiler spec for QNN
     compiler_specs = generate_qnn_executorch_compiler_spec(
         soc_model=getattr(QcomChipset, args.soc_model),
@@ -305,8 +319,17 @@ def execute(args):
             user_inputs.append(inputs)
 
     logger.info("retrieving graph I/O")
-    # setup compiler spec dedicated to QNN HTP backend
-    backend_options = generate_htp_compiler_spec(use_fp16=True)
+    # set up the backend-specific compiler spec (HTP or LPAI)
+    backend_type = get_backend_type(args.backend)
+    match backend_type:
+        case QnnExecuTorchBackendType.kHtpBackend:
+            backend_options = generate_htp_compiler_spec(use_fp16=True)
+        case QnnExecuTorchBackendType.kLpaiBackend:
+            backend_options = generate_lpai_compiler_spec(
+                target_env=QnnExecuTorchLpaiTargetEnv.kArm
+            )
+        case _:
+            raise ValueError("Backend is not implemented yet")
     # setup general compiler spec for QNN
     compiler_specs = generate_qnn_executorch_compiler_spec(
         soc_model=getattr(QcomChipset, args.soc_model),
@@ -332,7 +355,7 @@ def execute(args):
 
     logger.info("pushing QNN libraries & other artifacts")
 
-    adb.push(inputs=user_inputs)
+    adb.push(inputs=user_inputs, backends=[backend_type])
 
     logger.info("starting inference")
     adb.execute()
@@ -364,10 +387,16 @@ def post_process():
 
             output_result_folder = f"{args.output_folder}/Result_{data_index}"
             os.makedirs(output_result_folder, exist_ok=True)
+            # For the LPAI backend, a dequantize node is retained at the graph output,
+            # so the raw output file is always read as float32.
+            # TODO: add support for other output dtypes for the LPAI backend
             output = np.fromfile(
                 filename,
-                dtype=eval(
-                    f"np.{torch_to_numpy_dtype_dict[output_info[output_index]['dtype']]}"
+                dtype=(
+                    eval(
+                        f"np.{torch_to_numpy_dtype_dict[output_info[output_index]['dtype']]}"
+                    )
+                    if backend_type != QnnExecuTorchBackendType.kLpaiBackend
+                    else np.float32
                 ),
             )
             output = torch.from_numpy(
@@ -460,9 +489,9 @@ def main():
     sub_quantize.add_argument(
         "--backend",
         type=str,
-        choices=["htp", "gpu"],
+        choices=["htp", "lpai"],
         default="htp",
-        help="Backend to be deployed ('htp'/'gpu' are currently supported).",
+        help="Backend to be deployed ('htp'/'lpai' are currently supported).",
     )
     sub_quantize.add_argument(
         "--eps",
@@ -514,6 +543,13 @@ def main():
         ),
         action="store_true",
     )
+    sub_compile.add_argument(
+        "--backend",
+        type=str,
+        choices=["htp", "lpai"],
+        default="htp",
+        help="Backend to be deployed ('htp'/'lpai' are currently supported).",
+    )
     sub_compile.set_defaults(callback=compile)
 
     sub_execute = subparsers.add_parser(
@@ -590,6 +626,13 @@ def main():
         ),
         action="store_true",
     )
+    sub_execute.add_argument(
+        "--backend",
+        type=str,
+        choices=["htp", "lpai"],
+        default="htp",
+        help="Backend to be deployed ('htp'/'lpai' are currently supported).",
+    )
     sub_execute.set_defaults(callback=execute)
 
     args = parser.parse_args()