Skip to content

Commit 6c9e7bf

Browse files
committed
feat(chat_handler): update multimodal handlers for Qwen2.5-VL, Qwen3-VL, and PaddleOCR
- Update PaddleOCRChatHandler to support version 1.6
- Add token configuration and stop sequences for Qwen2.5-VL and Qwen3-VL
- Standardize input_ids initialization in __call__ methods for Qwen2.5-VL, Qwen3-ASR, and Qwen3-VL handlers

Signed-off-by: JamePeng <jame_peng@sina.com>
1 parent 103639c commit 6c9e7bf

1 file changed

Lines changed: 20 additions & 1 deletion

File tree

llama_cpp/llama_chat_format.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5324,7 +5324,7 @@ def __call__(self, **kwargs):
53245324

53255325
class PaddleOCRChatHandler(MTMDChatHandler):
53265326
"""
5327-
Handler for PaddleOCR 1.5 multimodal models.
5327+
Handler for PaddleOCR 1.5/1.6 multimodal models.
53285328
"""
53295329

53305330
PADDLEOCR_CLS_TOKEN = "<|begin_of_sentence|>"
@@ -5431,6 +5431,11 @@ def __call__(self, **kwargs):
54315431

54325432

54335433
class Qwen25VLChatHandler(MTMDChatHandler):
5434+
5435+
QWEN25_VL_BOS_TOKEN = "<|endoftext|>"
5436+
QWEN25_VL_PAD_TOKEN = "<|endoftext|>"
5437+
QWEN25_VL_EOS_TOKEN = "<|im_end|>"
5438+
54345439
CHAT_FORMAT = (
54355440
"{% set image_count = namespace(value=0) %}"
54365441
"{% for message in messages %}"
@@ -5462,6 +5467,8 @@ class Qwen25VLChatHandler(MTMDChatHandler):
54625467
)
54635468

54645469
def __call__(self, **kwargs):
5470+
kwargs['stop'] = [self.QWEN25_VL_EOS_TOKEN, self.QWEN25_VL_PAD_TOKEN]
5471+
54655472
llama = kwargs['llama']
54665473

54675474
if hasattr(llama, 'input_ids'):
@@ -5547,12 +5554,22 @@ def __call__(self, **kwargs):
55475554
# Qwen3 models universally use `<|endoftext|>` and `<|im_end|>` as stop tokens
55485555
kwargs['stop'] = [self.QWEN3_ASR_AUDIO_PAD_TOKEN, self.QWEN3_ASR_AUDIO_EOS_TOKEN]
55495556

5557+
llama = kwargs['llama']
5558+
5559+
if hasattr(llama, 'input_ids'):
5560+
llama.input_ids.fill(0)
5561+
55505562
if self.verbose:
55515563
print(f"{self.log_prefix} - Start processing Qwen3-ASR (Audio Only)")
55525564

55535565
return super().__call__(**kwargs)
55545566

55555567
class Qwen3VLChatHandler(MTMDChatHandler):
5568+
5569+
QWEN3_VL_BOS_TOKEN = "<|endoftext|>"
5570+
QWEN3_VL_PAD_TOKEN = "<|endoftext|>"
5571+
QWEN3_VL_EOS_TOKEN = "<|im_end|>"
5572+
55565573
CHAT_FORMAT = (
55575574
"{{- '<|im_start|>system\n' -}}"
55585575
"{%- if messages[0].content is string and messages[0].role == 'system' -%}"
@@ -5661,6 +5678,8 @@ def __init__(
56615678
self.extra_template_arguments["add_vision_id"] = add_vision_id
56625679

56635680
def __call__(self, **kwargs):
5681+
kwargs['stop'] = [self.QWEN3_VL_EOS_TOKEN, self.QWEN3_VL_PAD_TOKEN]
5682+
56645683
llama = kwargs['llama']
56655684

56665685
if hasattr(llama, 'input_ids'):

0 commit comments

Comments
 (0)