Skip to content

Commit 3ddb86c

Browse files
Qualcomm AI Engine Direct - Support multimodal (VLM) runner (pytorch#16536)
### Summary: - Runtime support for models - SmolVLM 500M - InternVL3 1B - add hybrid mode runtime requantization in multimodal runner - Background: In LLMs, `annotate_prefill_kv_output` effectively narrows the output gap between `hybrid` mode and `KV` mode. However, applying the same method to multimodal models does not work (bad results). To achieve a decent result in hybrid mode, we dequantize the KV cache right after prefilling and re-quantize it based on the decoder input cache at runtime. - CI - refactor VLM test script - add VLM acc/perf runtime tests - Refactor (VLM) - rename embedding forward input for CPU quantization - Update VLM vision encoder architecture to align with transformers 5.0 changes - Documentation - add a README for multimodal (VLM) ### Test plan #### SmolVLM Perf: ~63 TPS in SM8750 ``` bash python -m backends.qualcomm.tests.test_qnn_delegate TestExampleMultimodalityScript.test_static_vlm --model_name smolvlm_500m_instruct -b build-android --executorch_root . -a . -m SM8750 -s ${SERIAL_NUM} ``` #### InternVL3 Perf: ~17 TPS in SM8750 ``` bash python -m backends.qualcomm.tests.test_qnn_delegate TestExampleMultimodalityScript.test_static_vlm --model_name internvl3_1b -b build-android --executorch_root . -a . -m SM8750 -s ${SERIAL_NUM} ``` ### Script #### SmolVLM ``` bash python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model smolvlm_500m_instruct --model_mode kv --max_seq_len 1024 --prompt "Can you describe this image?" --image_path "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" ``` #### InternVL3 ```bash python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model internvl3_1b --model_mode kv --max_seq_len 1024 --prompt "Can you describe this image?" --image_path "http://images.cocodataset.org/val2017/000000039769.jpg" ```
1 parent ac0a201 commit 3ddb86c

35 files changed

Lines changed: 3846 additions & 241 deletions

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 74 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -6529,70 +6529,55 @@ def test_qwen2_5(self):
65296529

65306530

65316531
class TestExampleMultimodalityScript(TestQNN):
6532-
def test_smolvlm_500m_instruct(self):
6533-
if not self.required_envs():
6534-
self.skipTest("missing required envs")
65356532

6536-
prompt = "Can you describe this image?"
6537-
cmds = [
6538-
"python",
6539-
f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
6540-
"--artifact",
6541-
self.artifact_dir,
6542-
"--build_folder",
6543-
self.build_folder,
6544-
"--model",
6545-
self.model,
6546-
"--ip",
6547-
self.ip,
6548-
"--port",
6549-
str(self.port),
6550-
"--prompt",
6551-
prompt,
6552-
"--temperature",
6553-
"0",
6554-
"--decoder_model",
6555-
"smolvlm_500m_instruct",
6556-
"--model_mode",
6557-
"kv",
6558-
"--max_seq_len",
6559-
"128",
6560-
]
6561-
if self.compile_only:
6562-
cmds.extend(["--compile_only"])
6563-
elif self.device:
6564-
cmds.extend(["--device", self.device])
6565-
if self.host:
6566-
cmds.extend(["--host", self.host])
6567-
elif self.enable_x86_64:
6568-
cmds.extend(["--enable_x86_64"])
6569-
if self.pre_gen_pte:
6570-
cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
6533+
@dataclass(frozen=True)
6534+
class MLLMSpecs:
6535+
max_seq_len: int
6536+
sm8650_token_rate: float
6537+
sm8750_token_rate: float
6538+
encoder_pte_size: float
6539+
text_embedding_pte_size: float
6540+
decoder_pte_size: float
65716541

6572-
p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
6573-
with Listener((self.ip, self.port)) as listener:
6574-
conn = listener.accept()
6575-
p.communicate()
6576-
msg = json.loads(conn.recv())
6577-
if "Error" in msg:
6578-
self.fail(msg["Error"])
6579-
else:
6580-
if not self.enable_x86_64:
6581-
encoder_pte_size = msg["encoder_pte_size"]
6582-
text_embedding_pte_size = msg["text_embedding_pte_size"]
6583-
decoder_pte_size = msg["pte_size"]
6584-
self.assertLessEqual(encoder_pte_size, 110_000_000) # 110MB
6585-
self.assertLessEqual(text_embedding_pte_size, 100_000_000) # 100MB
6586-
self.assertLessEqual(decoder_pte_size, 400_000_000) # 400MB
6587-
print(f"Encoder PTE Size: {encoder_pte_size} bytes")
6588-
print(f"Text Embedding PTE Size: {text_embedding_pte_size} bytes")
6589-
print(f"Decoder PTE Size: {decoder_pte_size} bytes")
6542+
@dataclass(frozen=True)
6543+
class VLMSpecs(MLLMSpecs):
6544+
image_path: str
6545+
golden_image_feature: str
65906546

6591-
def test_internvl3_1b(self):
6592-
if not self.required_envs():
6547+
# TODO: refactor to support different backends
6548+
def setUp(self):
6549+
self.vlm_specs = {
6550+
"smolvlm_500m_instruct": TestExampleMultimodalityScript.VLMSpecs(
6551+
max_seq_len=128,
6552+
sm8650_token_rate=50,
6553+
sm8750_token_rate=55,
6554+
encoder_pte_size=110_000_000, # 110MB
6555+
text_embedding_pte_size=100_000_000, # 100MB
6556+
decoder_pte_size=400_000_000, # 400MB
6557+
image_path="https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg", # New York Bay
6558+
golden_image_feature="city",
6559+
),
6560+
"internvl3_1b": TestExampleMultimodalityScript.VLMSpecs(
6561+
max_seq_len=320,
6562+
sm8650_token_rate=11,
6563+
sm8750_token_rate=13,
6564+
encoder_pte_size=425_000_000, # 425MB
6565+
text_embedding_pte_size=300_000_000, # 300MB
6566+
decoder_pte_size=550_000_000, # 550 MB
6567+
image_path="http://images.cocodataset.org/val2017/000000039769.jpg", # Two cats lying on a blanket
6568+
golden_image_feature="cats",
6569+
),
6570+
}
6571+
6572+
def test_static_vlm(self):
6573+
if not self.required_envs([self.model_name]):
65936574
self.skipTest("missing required envs")
65946575

6576+
vlm_specs: TestExampleMultimodalityScript.VLMSpecs = self.vlm_specs[
6577+
self.model_name
6578+
]
65956579
prompt = "Can you describe this image?"
6580+
image_path = vlm_specs.image_path
65966581
cmds = [
65976582
"python",
65986583
f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
@@ -6608,14 +6593,16 @@ def test_internvl3_1b(self):
66086593
str(self.port),
66096594
"--prompt",
66106595
prompt,
6596+
"--image_path",
6597+
image_path,
66116598
"--temperature",
66126599
"0",
66136600
"--decoder_model",
6614-
"internvl3_1b",
6601+
f"{self.model_name}",
66156602
"--model_mode",
66166603
"kv",
66176604
"--max_seq_len",
6618-
"320",
6605+
f"{vlm_specs.max_seq_len}",
66196606
]
66206607
if self.compile_only:
66216608
cmds.extend(["--compile_only"])
@@ -6636,17 +6623,41 @@ def test_internvl3_1b(self):
66366623
if "Error" in msg:
66376624
self.fail(msg["Error"])
66386625
else:
6626+
if not self.compile_only:
6627+
model_out = msg["result"][0]
6628+
self.assertTrue(
6629+
vlm_specs.golden_image_feature in model_out,
6630+
f"Expected Output contains feature: '{vlm_specs.golden_image_feature}' Actual Output: '{model_out}'",
6631+
)
6632+
print(f"Image Path: {image_path}")
6633+
print(f"Query: {prompt}")
6634+
print(f"Answer: {model_out}")
66396635
if not self.enable_x86_64:
66406636
encoder_pte_size = msg["encoder_pte_size"]
66416637
text_embedding_pte_size = msg["text_embedding_pte_size"]
66426638
decoder_pte_size = msg["pte_size"]
6643-
self.assertLessEqual(encoder_pte_size, 425_000_000) # 425MB
6644-
self.assertLessEqual(text_embedding_pte_size, 300_000_000) # 300MB
6645-
self.assertLessEqual(decoder_pte_size, 550_000_000) # 550MB
6639+
self.assertLessEqual(encoder_pte_size, vlm_specs.encoder_pte_size)
6640+
self.assertLessEqual(
6641+
text_embedding_pte_size, vlm_specs.text_embedding_pte_size
6642+
)
6643+
self.assertLessEqual(decoder_pte_size, vlm_specs.decoder_pte_size)
66466644
print(f"Encoder PTE Size: {encoder_pte_size} bytes")
66476645
print(f"Text Embedding PTE Size: {text_embedding_pte_size} bytes")
66486646
print(f"Decoder PTE Size: {decoder_pte_size} bytes")
66496647

6648+
attr_name = f"{self.model.lower()}_token_rate"
6649+
if (
6650+
not self.compile_only
6651+
and not self.enable_x86_64
6652+
and hasattr(vlm_specs, attr_name)
6653+
):
6654+
device_inference_speed = msg["inference_speed"]
6655+
expected_inference_speed = getattr(vlm_specs, attr_name)
6656+
print(f"Prompt Evaluation: {device_inference_speed} tokens/second")
6657+
self.assertGreaterEqual(
6658+
device_inference_speed, expected_inference_speed
6659+
)
6660+
66506661

66516662
class TestExampleOssScript(TestQNN):
66526663
def test_albert(self):

examples/qualcomm/oss_scripts/llama/CMakeLists.txt

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,3 +81,60 @@ target_compile_options(qnn_llama_runner PUBLIC ${_common_compile_options})
8181
set_target_properties(
8282
qnn_llama_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'"
8383
)
84+
85+
# Build the QNN multimodal runner: reuse the llama runner source list, swapping in the multimodal sources.
86+
set(_multimodal_runner__srcs ${_llama_runner__srcs})
87+
list(FILTER _multimodal_runner__srcs EXCLUDE REGEX ".*qnn_llama_runner.*")
88+
list(FILTER _multimodal_runner__srcs EXCLUDE REGEX ".*runner/runner\.(cpp|h)")
89+
list(
90+
PREPEND
91+
_multimodal_runner__srcs
92+
${CMAKE_CURRENT_LIST_DIR}/qnn_multimodal_runner.cpp
93+
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_runner.cpp
94+
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_runner.h
95+
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/encoder.cpp
96+
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/encoder.h
97+
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_runner.cpp
98+
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_runner.h
99+
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_processor.cpp
100+
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_processor.h
101+
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_prompt_processor.cpp
102+
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_prompt_processor.h
103+
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_token_generator.cpp
104+
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_token_generator.h
105+
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_lhd_token_generator.cpp
106+
${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_lhd_token_generator.h
107+
)
108+
109+
list(APPEND _multimodal_runner__srcs)
110+
111+
# build qnn multimodal runner
112+
add_executable(qnn_multimodal_runner ${_multimodal_runner__srcs})
113+
target_include_directories(
114+
qnn_multimodal_runner PUBLIC ${_common_include_directories}
115+
)
116+
target_include_directories(
117+
qnn_multimodal_runner
118+
PUBLIC ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include
119+
)
120+
target_compile_options(qnn_multimodal_runner PUBLIC ${_common_compile_options})
121+
122+
target_link_libraries(
123+
qnn_multimodal_runner
124+
qnn_executorch_backend
125+
executorch_core
126+
extension_data_loader
127+
extension_flat_tensor
128+
extension_llm_runner
129+
extension_module
130+
extension_tensor
131+
gflags
132+
custom_ops
133+
quantized_ops_lib
134+
quantized_kernels
135+
tokenizers::tokenizers
136+
)
137+
138+
set_target_properties(
139+
qnn_multimodal_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'"
140+
)

0 commit comments

Comments
 (0)