Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions strix/llm/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from strix.config import Config
from strix.llm.config import LLMConfig
from strix.llm.memory_compressor import MemoryCompressor
from strix.llm.memory_compressor import MemoryCompressor, get_message_tokens
from strix.llm.utils import (
_truncate_to_first_function,
fix_incomplete_tool_call,
Expand Down Expand Up @@ -210,7 +210,12 @@ def _prepare_messages(self, conversation_history: list[dict[str, Any]]) -> list[
}
)

compressed = list(self.memory_compressor.compress_history(conversation_history))
reserved_tokens = sum(
get_message_tokens(msg, self.config.litellm_model) for msg in messages
)
compressed = list(
self.memory_compressor.compress_history(conversation_history, reserved_tokens)
)
conversation_history.clear()
conversation_history.extend(compressed)
messages.extend(compressed)
Expand Down
13 changes: 10 additions & 3 deletions strix/llm/memory_compressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def _count_tokens(text: str, model: str) -> int:
return len(text) // 4 # Rough estimate


def _get_message_tokens(msg: dict[str, Any], model: str) -> int:
def get_message_tokens(msg: dict[str, Any], model: str) -> int:
content = msg.get("content", "")
if isinstance(content, str):
return _count_tokens(content, model)
Expand Down Expand Up @@ -166,9 +166,16 @@ def __init__(
def compress_history(
self,
messages: list[dict[str, Any]],
reserved_tokens: int = 0,
) -> list[dict[str, Any]]:
"""Compress conversation history to stay within token limits.

Args:
messages: Conversation history messages to compress.
reserved_tokens: Tokens already consumed by the system prompt and
other framing messages outside the conversation history. Added to
the history token total before comparing against the token limit.

Comment on lines +173 to +178
Copy link

Copilot AI Mar 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The docstring says reserved_tokens is “Subtracted from the budget before checking limits”, but the implementation adds it into total_tokens and compares against the fixed budget. Either update the wording to match the behavior (reserved tokens are included in the total prompt token count) or change the logic to compute an available_budget = MAX_TOTAL_TOKENS * 0.9 - reserved_tokens and compare history tokens against that.

Copilot uses AI. Check for mistakes.
Strategy:
1. Handle image limits first
2. Keep all system messages
Expand Down Expand Up @@ -201,8 +208,8 @@ def compress_history(
# Type assertion since we ensure model_name is not None in __init__
model_name: str = self.model_name # type: ignore[assignment]

total_tokens = sum(
_get_message_tokens(msg, model_name) for msg in system_msgs + regular_msgs
total_tokens = reserved_tokens + sum(
get_message_tokens(msg, model_name) for msg in system_msgs + regular_msgs
)

if total_tokens <= MAX_TOTAL_TOKENS * 0.9:
Expand Down