Qualcomm AI Engine Direct - fix attention sink feature (pytorch#19203)

DannyYuyang-quic · web-flow · commit e270e8fac7f0 · 2026-04-29T09:43:27.000-07:00
diff --git a/examples/qualcomm/oss_scripts/llama/wrappers/llm_wrappers.py b/examples/qualcomm/oss_scripts/llama/wrappers/llm_wrappers.py
@@ -365,22 +365,22 @@ def _save_logits_quant_attrs(self):
                             break
 
     def _save_output_kv_cache_quant_attrs(self):
-        k_idx = 0
-        v_idx = 0
+        kv_idx = 0
         for node in self.decoder.graph.nodes:
             if not is_graph_output(node):
                 continue
             cache_output_node = node.args[0].args[0]
             if cache_output_node.meta["val"].size()[-2:] in self.kv_cache_shape:
                 # [QCOM_SCALE, QCOM_ZERO_POINT, QCOM_QUANT_MIN, QCOM_QUANT_MAX, QCOM_DTYPE]
                 # This meta is for attention sink feature
-                self.meta[f"get_kv_output_{k_idx+v_idx}_quant_attr"] = [
+                self.meta[f"get_kv_output_{kv_idx}_quant_attr"] = [
                     node.args[1],
                     node.args[2],
                     node.args[3],
                     node.args[4],
                     str(node.args[5]),
                 ]
+                kv_idx += 1
 
     def _tag_ios(self, node, fixed_point_type):
         atten_mask_shape = {