feat(mtmd): improve fallback chat template for multimodal models

JamePeng · JamePeng · commit e7976f42b23c · 2026-05-30T01:24:56.000+08:00
- Add BOS/EOS token handling to the default MTMD chat format.
- Use a clearer role-based template with explicit USER and ASSISTANT prefixes.
- Append a newline after each message to keep generated prompts readable.
- Emit the EOS token (when defined) once after the serialized conversation
history, marking its end before the optional generation prompt.
- Improve fallback behavior for multimodal GGUF models that do not provide a
chat template, such as the OCR-oriented DeepSeek-OCR 1/2 models.
- Make the default system prompt a single-line string (the newline embedded in
the old triple-quoted literal becomes a space) while preserving its meaning.
- Clean up minor formatting around MTMD context parameter initialization.

This improves prompt compatibility for multimodal models that either lack a
GGUF chat template or are not yet covered by a complete custom chat handler.

Signed-off-by: JamePeng <jame_peng@sina.com>
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
@@ -2811,21 +2811,20 @@ def generate_streaming(tools, functions, function_call, prompt):
 
 class MTMDChatHandler:
     DEFAULT_SYSTEM_MESSAGE: Optional[str] = (
-"""You are an exceptionally capable, precise, and helpful multimodal AI assistant that excels at deeply understanding and richly describing images, charts, diagrams, text in images, scenes, and any visual content,
-while also answering every question accurately, clearly, and step-by-step when appropriate — always responding in the same language as the user's question, remaining polite, professional, and maximally helpful."""
+"You are an exceptionally capable, precise, and helpful multimodal AI assistant that excels at deeply understanding and richly describing images, charts, diagrams, text in images, scenes, and any visual content, "
+"while also answering every question accurately, clearly, and step-by-step when appropriate — always responding in the same language as the user's question, remaining polite, professional, and maximally helpful."
     )
 
     CHAT_FORMAT = (
+        "{{ bos_token if bos_token is defined else '' }}"
         "{% for message in messages %}"
             "{% if message.role == 'system' %}"
                 "{{ message.content }}"
-            "{% endif %}"
-
-            "{% if message.role == 'user' %}"
+            "{% elif message.role == 'user' %}"
+                "USER: "
                 "{% if message.content is string %}"
-                    "\nUSER: {{ message.content }}"
+                    "{{ message.content }}"
                 "{% elif message.content is iterable %}"
-                    "\nUSER: "
                     "{% for content in message.content %}"
                         "{% if content.type == 'image_url' %}"
                             "{{ content.image_url if content.image_url is string else content.image_url.url }}"
@@ -2842,15 +2841,19 @@ class MTMDChatHandler:
                         "{% endif %}"
                     "{% endfor %}"
                 "{% endif %}"
-            "{% endif %}"
 
-            "{% if message.role == 'assistant' and message.content is not none %}"
-                "\nASSISTANT: {{ message.content }}"
+            "{% elif message.role == 'assistant' and message.content is not none %}"
+                "ASSISTANT: {{ message.content }}"
             "{% endif %}"
+            "{{ \"\n\" }}"
         "{% endfor %}"
 
+        "{% if eos_token is defined %}"
+            "{{ eos_token }}"
+        "{% endif %}"
+
         "{% if add_generation_prompt %}"
-            "\nASSISTANT: "
+            "ASSISTANT: "
         "{% endif %}"
     )
 
@@ -2906,7 +2909,7 @@ def _init_mtmd_context(self, llama_model: llama_core.Llama):
         self.mctx_params.use_gpu = self.use_gpu
         self.mctx_params.print_timings = self.verbose
         self.mctx_params.n_threads = llama_model.n_threads
-        self.mctx_params.flash_attn_type  = self._mtmd_cpp.clip_flash_attn_type.CLIP_FLASH_ATTN_TYPE_AUTO
+        self.mctx_params.flash_attn_type = self._mtmd_cpp.clip_flash_attn_type.CLIP_FLASH_ATTN_TYPE_AUTO
         self.mctx_params.warmup = True
         if self.image_min_tokens > 0:
             self.mctx_params.image_min_tokens = self.image_min_tokens