Skip to content

Commit abe789f

Browse files
committed
Implement Step3VLChatHandler for Step3-VL-10B
Signed-off-by: JamePeng <jame_peng@sina.com>
1 parent 9241b0f commit abe789f

2 files changed

Lines changed: 138 additions & 0 deletions

File tree

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -744,6 +744,7 @@ Below are the supported multi-modal models and their respective chat handlers (P
744744
| [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` |
745745
| [qwen3-vl](https://huggingface.co/unsloth/Qwen3-VL-8B-Thinking-GGUF) | `Qwen3VLChatHandler` | `qwen3-vl` |
746746
| [qwen3.5](https://huggingface.co/unsloth/Qwen3.5-27B-GGUF) | `Qwen35ChatHandler` | `qwen3.5` |
747+
| [step3-vl](https://huggingface.co/JamePeng2023/Step3-VL-10B-GGUF) | `Step3VLChatHandler` | `step3-vl` |
747748
748749
Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images.
749750

llama_cpp/llama_chat_format.py

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5512,6 +5512,143 @@ def __call__(self, **kwargs):
55125512
return super().__call__(**kwargs)
55135513

55145514

5515+
class Step3VLChatHandler(MTMDChatHandler):
    """
    Handler for Step3-VL models.

    The Jinja template in ``CHAT_FORMAT`` renders ChatML-style turns
    delimited by ``<|im_start|>`` / ``<|im_end|>``:

    * When ``tools`` is set, a system turn is emitted that lists every tool
      as JSON inside ``<tools></tools>`` (preceded by the first system
      message, if any).
    * Image items are rendered as ``<im_patch>`` followed by the image URL.
    * Assistant turns after the last real user query keep their reasoning in
      a ``<think>`` block; earlier assistant turns are rendered without it.
    * Consecutive ``tool`` messages are grouped into a single
      ``tool_response`` turn.
    """

    # Special tokens of the Step3-VL prompt format. Only the EOS and PAD
    # tokens are referenced by the methods of this class; the template below
    # spells the literals out itself.
    STEP3VL_BOS_TOKEN = "<|im_start|>"
    STEP3VL_EOS_TOKEN = "<|im_end|>"
    STEP3VL_PAD_TOKEN = "<|endoftext|>"
    STEP3VL_IMAGE_TOKEN = "<im_patch>"

    CHAT_FORMAT = (
        "{%- macro render_content(content) -%}\n"
        "    {%- if content is none -%}{{- '' -}}\n"
        "    {%- elif content is string -%}{{- content -}}\n"
        "    {%- elif content is mapping -%}{{- content['value'] if 'value' in content else content['text'] -}}\n"
        "    {%- elif content is iterable -%}\n"
        "        {%- for item in content -%}\n"
        "            {%- if item.type == 'text' -%}\n"
        "                {{- item['value'] if 'value' in item else item['text'] -}}\n"
        "            {%- elif item.type in ['image', 'image_url'] -%}\n"
        "                {%- set url_val = '' -%}\n"
        "                {%- if item.image_url -%}\n"
        "                    {%- set url_val = item.image_url if item.image_url is string else item.image_url.url -%}\n"
        "                {%- endif -%}\n"
        "                {{- '<im_patch>' + url_val -}}\n"
        "            {%- endif -%}\n"
        "        {%- endfor -%}\n"
        "    {%- endif -%}\n"
        "{%- endmacro -%}\n"
        "\n"
        "{%- if tools -%}\n"
        "    {{- '<|im_start|>system\\n' -}}\n"
        "    {%- if messages[0].role == 'system' -%}\n"
        "        {{- render_content(messages[0].content) + '\\n\\n' -}}\n"
        "    {%- endif -%}\n"
        "    {{- '# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>' -}}\n"
        "    {%- for tool in tools -%}\n"
        "        {{- '\\n' -}}\n"
        "        {{- tool | tojson -}}\n"
        "    {%- endfor -%}\n"
        "    {{- '\\n</tools>\\n\\nAlways adhere to this exact format for tool use:\\n<tool_calls>\\n<tool_call>\\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\\n</tool_call>\\n{additional_tool_calls}</tool_calls>\\n\\nNote:\\n- For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags.\\n- `<function-name>` must be an exact match to one of the available tools.\\n- `<args-json-object>` must be valid JSON that strictly follows the tool\\'s parameters schema.<|im_end|>\\n' -}}\n"
        "{%- else -%}\n"
        "    {%- if messages[0].role == 'system' -%}\n"
        "        {{- '<|im_start|>system\\n' + render_content(messages[0].content) + '<|im_end|>\\n' -}}\n"
        "    {%- endif -%}\n"
        "{%- endif -%}\n"
        "\n"
        "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) -%}\n"
        "{%- for message in messages[::-1] -%}\n"
        "    {%- set index = (messages|length - 1) - loop.index0 -%}\n"
        "    {%- if ns.multi_step_tool and message.role == 'user' and render_content(message.content) is string and not(render_content(message.content).startswith('<tool_response>') and render_content(message.content).endswith('</tool_response>')) -%}\n"
        "        {%- set ns.multi_step_tool = false -%}\n"
        "        {%- set ns.last_query_index = index -%}\n"
        "    {%- endif -%}\n"
        "{%- endfor -%}\n"
        "\n"
        "{%- for message in messages -%}\n"
        "    {%- set content = render_content(message.content) -%}\n"
        "    {%- if (message.role == 'user') or (message.role == 'system' and not loop.first) -%}\n"
        "        {%- set role_name = 'observation' if (message.role == 'system' and not loop.first and message.name == 'observation') else message.role -%}\n"
        "        {{- '<|im_start|>' + role_name + '\\n' + content + '<|im_end|>' + '\\n' -}}\n"
        "    {%- elif message.role == 'assistant' -%}\n"
        "        {%- if message.reasoning_content is string -%}\n"
        "            {%- set reasoning_content = render_content(message.reasoning_content) -%}\n"
        "        {%- else -%}\n"
        "            {%- if '</think>' in content -%}\n"
        "                {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') -%}\n"
        "                {%- set content = content.split('</think>')[-1].lstrip('\\n') -%}\n"
        "            {%- else -%}\n"
        "                {%- set reasoning_content = '' -%}\n"
        "            {%- endif -%}\n"
        "        {%- endif -%}\n"
        "        {%- if loop.index0 > ns.last_query_index -%}\n"
        "            {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content + '\\n</think>\\n' + content -}}\n"
        "        {%- else -%}\n"
        "            {{- '<|im_start|>' + message.role + '\\n' + content -}}\n"
        "        {%- endif -%}\n"
        "        {%- if message.tool_calls -%}\n"
        "            {{- '\\n<tool_calls>' -}}\n"
        "            {%- for tool_call in message.tool_calls -%}\n"
        "                {{- '\\n' -}}\n"
        "                {%- if tool_call.function -%}\n"
        "                    {%- set tool_call = tool_call.function -%}\n"
        "                {%- endif -%}\n"
        "                {{- '<tool_call>\\n{\"name\": \"' -}}\n"
        "                {{- tool_call.name -}}\n"
        "                {{- '\", \"arguments\": ' -}}\n"
        "                {%- if tool_call.arguments is string -%}\n"
        "                    {{- tool_call.arguments -}}\n"
        "                {%- else -%}\n"
        "                    {{- tool_call.arguments | tojson -}}\n"
        "                {%- endif -%}\n"
        "                {{- '}\\n</tool_call>' -}}\n"
        "            {%- endfor -%}\n"
        "            {{- '\\n</tool_calls>' -}}\n"
        "        {%- endif -%}\n"
        "        {{- '<|im_end|>\\n' -}}\n"
        "    {%- elif message.role == 'tool' -%}\n"
        "        {%- if loop.first or (messages[loop.index0 - 1].role != 'tool') -%}\n"
        "            {{- '<|im_start|>tool_response' -}}\n"
        "        {%- endif -%}\n"
        "        {{- '\\n<tool_response>\\n' -}}\n"
        "        {{- content -}}\n"
        "        {{- '\\n</tool_response>' -}}\n"
        "        {%- if loop.last or (messages[loop.index0 + 1].role != 'tool') -%}\n"
        "            {{- '<|im_end|>\\n' -}}\n"
        "        {%- endif -%}\n"
        "    {%- endif -%}\n"
        "{%- endfor -%}\n"
        "{%- if add_generation_prompt -%}\n"
        "    {{- '<|im_start|>assistant\\n<think>\\n\\n</think>\\n' if (enable_thinking is defined and not enable_thinking) else '<|im_start|>assistant\\n<think>' -}}\n"
        "{%- endif -%}\n"
    )

    def __init__(self, enable_thinking: bool = True, **kwargs):
        """
        Initializes the Step3-VL Handler.

        Args:
            enable_thinking (bool): If False, the generation prompt is
                pre-filled with an empty <think> block to bypass reasoning;
                otherwise it ends with an open ``<think>`` tag.
            **kwargs: Forwarded unchanged to the base handler's constructor.
        """
        # Stored before delegating so the attribute exists even if the base
        # constructor consults it.
        self.enable_thinking = enable_thinking
        super().__init__(**kwargs)

    def __call__(self, **kwargs):
        """
        Run a chat completion with the Step3-VL template and stop tokens.

        The Step3 stop tokens are always enforced. Stop sequences supplied by
        the caller (a single string or an iterable of strings) are kept and
        appended after them instead of being discarded.

        Args:
            **kwargs: Chat-completion arguments, forwarded to the base
                handler after ``stop`` has been normalised.

        Returns:
            Whatever the base handler's ``__call__`` returns.
        """
        # Pass thinking toggle into Jinja
        # NOTE(review): extra_template_arguments, verbose and log_prefix are
        # presumably provided by MTMDChatHandler -- not visible in this class.
        self.extra_template_arguments["enable_thinking"] = self.enable_thinking

        # Step3 uses standard <|im_end|> ChatML stop formatting. The model's
        # own terminators come first (same order as before); caller-provided
        # sequences are merged in rather than overwritten.
        stop_sequences = [self.STEP3VL_PAD_TOKEN, self.STEP3VL_EOS_TOKEN]
        requested_stop = kwargs.get("stop")
        if isinstance(requested_stop, str):
            requested_stop = [requested_stop]
        for sequence in requested_stop or ():
            # Skip empty entries and duplicates of the built-in terminators.
            if sequence and sequence not in stop_sequences:
                stop_sequences.append(sequence)
        kwargs["stop"] = stop_sequences

        if self.verbose:
            print(f"{self.log_prefix}(enable_thinking={self.enable_thinking}) - Start processing")

        return super().__call__(**kwargs)
5650+
5651+
55155652
@register_chat_completion_handler("chatml-function-calling")
55165653
def chatml_function_calling(
55175654
llama: llama_core.Llama,

0 commit comments

Comments
 (0)