InternLM · dbsd11 · May 11, 2026
diff --git a/lmdeploy/serve/parsers/response_parser.py b/lmdeploy/serve/parsers/response_parser.py
@@ -333,6 +333,7 @@ def _consume_reasoning(self) -> tuple[str | None, bool]:
 
         Behavior:
         - Drops the explicit open tag if model emits it.
+        - Detects tool open tag within reasoning content and switches to MODE_TOOL.
         - If no close tag is present, emits only the safe reasoning-text prefix and
           preserves possible partial-tag suffix for the next chunk.
         - If a close tag is found, emits text before the close tag as reasoning content,
@@ -350,24 +351,51 @@ def _consume_reasoning(self) -> tuple[str | None, bool]:
             self._pending = self._pending[len(open_tag):]
             return None, True
 
+        # Locate both the reasoning close tag and the tool open tag. The tool tag
+        # is honored only when it precedes the close tag or no close tag exists.
         close_tag = self.profile.reasoning_close_tag
         if not close_tag:
             raise RuntimeError('Invariant violated: MODE_REASONING requires a reasoning_close_tag.')
 
-        idx = self._pending.find(close_tag)
-        # No close tag found, treat the whole pending text as reasoning content.
-        if idx < 0:
-            if not self._pending:
-                return None, False
-            out = self._pending
-            self._pending = ''
-            return out, True
+        close_idx = self._pending.find(close_tag)
+        tool_tag = self.profile.tool_open_tag
+        tool_idx = self._pending.find(tool_tag) if tool_tag else -1
+
+        # If close tag is found, process it first.
+        if close_idx >= 0:
+            # Check if tool tag appears before close tag.
+            if tool_idx >= 0 and tool_idx < close_idx:
+                # Tool tag is before reasoning close - emit reasoning before tool, switch to tool mode.
+                reasoning_chunk = self._pending[:tool_idx] if tool_idx > 0 else None
+                self._pending = self._pending[tool_idx + len(tool_tag):]
+                self._mode = self.MODE_TOOL
+                if self.tool_parser is not None:
+                    self.tool_parser.start_tool_call()
+                return (reasoning_chunk if reasoning_chunk else None), True
+
+            # Reasoning close comes first (or no tool tag).
+            reasoning_chunk = self._pending[:close_idx]
+            self._pending = self._pending[close_idx + len(close_tag):]
+            self._mode = self.MODE_PLAIN
+            return (reasoning_chunk if reasoning_chunk else None), True
 
-        reasoning_chunk = self._pending[:idx]
-        self._pending = self._pending[idx + len(close_tag):]
-        # reasoning part is done, switch to plain mode
-        self._mode = self.MODE_PLAIN
-        return (reasoning_chunk if reasoning_chunk else None), True
+        # No close tag found yet. Check for tool open tag.
+        if tool_idx >= 0:
+            # Tool tag found before reasoning close - emit reasoning before tool, switch to tool mode.
+            reasoning_chunk = self._pending[:tool_idx] if tool_idx > 0 else None
+            self._pending = self._pending[tool_idx + len(tool_tag):]
+            self._mode = self.MODE_TOOL
+            if self.tool_parser is not None:
+                self.tool_parser.start_tool_call()
+            return (reasoning_chunk if reasoning_chunk else None), True
+
+        # No close tag and no tool tag found - emit all pending text as reasoning
+        # content and clear the buffer (no partial-tag suffix is held back here).
+        if not self._pending:
+            return None, False
+        out = self._pending
+        self._pending = ''
+        return out, True
 
     def _consume_tool(self) -> tuple[list[DeltaToolCall], bool]:
         """Consume buffered text while in tool mode.

diff --git a/tests/test_lmdeploy/serve/parsers/test_qwen3_5_parser.py b/tests/test_lmdeploy/serve/parsers/test_qwen3_5_parser.py
@@ -90,7 +90,7 @@ class TestQwen3_5ResponseParserStreaming:
 
     @staticmethod
     def _encode_ids(tokenizer, text: str) -> list[int]:
-        return tokenizer.encode(text, add_bos=False, add_special_tokens=False)
+        return tokenizer.encode(text, add_special_tokens=False)
 
     def test_stream_chunk_matches_reference(self, tokenizer, response_parser):
         """Feed the real streaming sequence into ResponseParser.stream_chunk

diff --git a/tests/test_lmdeploy/serve/parsers/test_qwen3_parser.py b/tests/test_lmdeploy/serve/parsers/test_qwen3_parser.py
@@ -138,7 +138,7 @@ class TestQwenResponseParserStreaming:
 
     @staticmethod
     def _encode_ids(tokenizer, text: str) -> list[int]:
-        return tokenizer.encode(text, add_bos=False, add_special_tokens=False)
+        return tokenizer.encode(text, add_special_tokens=False)
 
     @pytest.mark.parametrize('reference_chunks', [REFERENCE_CHUNKS_0, REFERENCE_CHUNKS_1, REFERENCE_CHUNKS_2])
     def test_stream_chunk_matches_reference(self, tokenizer, response_parser, reference_chunks):