Skip to content

Commit e270e8f

Browse files
Qualcomm AI Engine Direct - fix attention sink feature (pytorch#19203)
1 parent ddd8ac6 commit e270e8f

1 file changed

Lines changed: 3 additions & 3 deletions

File tree

examples/qualcomm/oss_scripts/llama/wrappers/llm_wrappers.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -365,22 +365,22 @@ def _save_logits_quant_attrs(self):
365365
break
366366

367367
def _save_output_kv_cache_quant_attrs(self):
368-
k_idx = 0
369-
v_idx = 0
368+
kv_idx = 0
370369
for node in self.decoder.graph.nodes:
371370
if not is_graph_output(node):
372371
continue
373372
cache_output_node = node.args[0].args[0]
374373
if cache_output_node.meta["val"].size()[-2:] in self.kv_cache_shape:
375374
# [QCOM_SCALE, QCOM_ZERO_POINT, QCOM_QUANT_MIN, QCOM_QUANT_MAX, QCOM_DTYPE]
376375
# This meta is for attention sink feature
377-
self.meta[f"get_kv_output_{k_idx+v_idx}_quant_attr"] = [
376+
self.meta[f"get_kv_output_{kv_idx}_quant_attr"] = [
378377
node.args[1],
379378
node.args[2],
380379
node.args[3],
381380
node.args[4],
382381
str(node.args[5]),
383382
]
383+
kv_idx += 1
384384

385385
def _tag_ios(self, node, fixed_point_type):
386386
atten_mask_shape = {

0 commit comments

Comments
 (0)