Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 13 additions & 9 deletions src/aci/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,13 @@
ScannedFile,
get_default_registry,
)
from aci.core.tokenizer import (
TiktokenTokenizer,
TokenizerInterface,
get_default_tokenizer,
)
from aci.core.tokenizer import (
CharacterTokenizer,
SimpleTokenizer,
TiktokenTokenizer,
TokenizerInterface,
get_default_tokenizer,
)
from aci.core.watch_config import WatchConfig

__all__ = [
Expand All @@ -70,10 +72,12 @@
"TreeSitterParser",
"SUPPORTED_LANGUAGES",
"check_tree_sitter_setup",
# Tokenizer
"TokenizerInterface",
"TiktokenTokenizer",
"get_default_tokenizer",
# Tokenizer
"TokenizerInterface",
"TiktokenTokenizer",
"CharacterTokenizer",
"SimpleTokenizer",
"get_default_tokenizer",
# Chunker
"CodeChunk",
"ChunkerConfig",
Expand Down
2 changes: 2 additions & 0 deletions src/aci/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ class IndexingConfig:
default_factory=lambda: _get_default("indexing", "chunk_overlap_lines", 2)
)
max_workers: int = field(default_factory=lambda: _get_default("indexing", "max_workers", 4))
tokenizer: str = field(default_factory=lambda: _get_default("indexing", "tokenizer", "tiktoken"))


@dataclass
Expand Down Expand Up @@ -226,6 +227,7 @@ def apply_env_overrides(self) -> "ACIConfig":
"ACI_INDEXING_MAX_CHUNK_TOKENS": ("indexing", "max_chunk_tokens", int),
"ACI_INDEXING_CHUNK_OVERLAP_LINES": ("indexing", "chunk_overlap_lines", int),
"ACI_INDEXING_MAX_WORKERS": ("indexing", "max_workers", int),
"ACI_TOKENIZER": ("indexing", "tokenizer", str),
"ACI_INDEXING_FILE_EXTENSIONS": (
"indexing",
"file_extensions",
Expand Down
98 changes: 89 additions & 9 deletions src/aci/core/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
Uses tiktoken library for accurate token counting compatible with OpenAI models.
"""

from abc import ABC, abstractmethod
from abc import ABC, abstractmethod
from math import ceil

import tiktoken

Expand Down Expand Up @@ -44,7 +45,7 @@ def truncate_to_tokens(self, text: str, max_tokens: int) -> str:
pass


class TiktokenTokenizer(TokenizerInterface):
class TiktokenTokenizer(TokenizerInterface):
"""
Tokenizer implementation using tiktoken library.

Expand Down Expand Up @@ -134,14 +135,93 @@ def truncate_to_tokens(self, text: str, max_tokens: int) -> str:
result_lines.append(line)
current_tokens += line_tokens

return "\n".join(result_lines)


def get_default_tokenizer() -> TokenizerInterface:
return "\n".join(result_lines)


class CharacterTokenizer(TokenizerInterface):
    """Estimate token counts from raw character length.

    Assumes roughly ``chars_per_token`` characters per token and rounds up,
    so the estimate is conservative (never under-counts relative to the
    assumed ratio). Useful when an exact BPE tokenizer is unavailable.
    """

    def __init__(self, chars_per_token: int = 4):
        """Create the estimator.

        Args:
            chars_per_token: Assumed characters per token; must be positive.

        Raises:
            ValueError: If ``chars_per_token`` is zero or negative.
        """
        if chars_per_token <= 0:
            raise ValueError("chars_per_token must be greater than 0")
        self._chars_per_token = chars_per_token

    def count_tokens(self, text: str) -> int:
        """Return the estimated token count for ``text`` (0 for empty input)."""
        return ceil(len(text) / self._chars_per_token) if text else 0

    def truncate_to_tokens(self, text: str, max_tokens: int) -> str:
        """Trim ``text`` at line boundaries so the estimate fits the budget.

        Whole lines are kept in order until adding the next line (including
        its joining newline) would exceed ``max_tokens``. Returns "" when the
        budget is non-positive or the input is empty.
        """
        if not text or max_tokens <= 0:
            return ""

        # Fast path: already within budget, return unchanged.
        if self.count_tokens(text) <= max_tokens:
            return text

        kept: list[str] = []
        used = 0
        for raw_line in text.split("\n"):
            # Lines after the first pay for the newline that rejoins them.
            piece = f"\n{raw_line}" if kept else raw_line
            cost = self.count_tokens(piece)
            if used + cost > max_tokens:
                break
            kept.append(raw_line)
            used += cost

        return "\n".join(kept)


class SimpleTokenizer(TokenizerInterface):
    """Whitespace-word tokenizer for generic, non-BPE models.

    Each whitespace-delimited word counts as one token; consecutive
    whitespace and leading/trailing whitespace contribute nothing.
    """

    def count_tokens(self, text: str) -> int:
        """Return the number of whitespace-delimited words in ``text``."""
        return len(text.split()) if text else 0

    def truncate_to_tokens(self, text: str, max_tokens: int) -> str:
        """Trim ``text`` at line boundaries to at most ``max_tokens`` words.

        Whole lines are kept in order; the first line whose word count would
        push the running total past the budget (and everything after it) is
        dropped. Returns "" for empty input or a non-positive budget.
        """
        if not text or max_tokens <= 0:
            return ""

        # Fast path: whole text already fits.
        if self.count_tokens(text) <= max_tokens:
            return text

        kept: list[str] = []
        used = 0
        for raw_line in text.split("\n"):
            # Mirror the join format; the leading newline adds no words.
            piece = f"\n{raw_line}" if kept else raw_line
            cost = self.count_tokens(piece)
            if used + cost > max_tokens:
                break
            kept.append(raw_line)
            used += cost

        return "\n".join(kept)


def get_default_tokenizer(strategy: str = "tiktoken") -> TokenizerInterface:
    """
    Get the default tokenizer instance.

    Args:
        strategy: Which implementation to build — "tiktoken", "character",
            or "simple". Matching is case-insensitive and ignores
            surrounding whitespace.

    Returns:
        A tokenizer implementation matching the configured strategy.

    Raises:
        ValueError: If ``strategy`` does not name a known tokenizer.
    """
    normalized = strategy.strip().lower()
    # Dispatch table: each entry lazily builds its tokenizer only when chosen.
    factories = {
        "tiktoken": lambda: TiktokenTokenizer(encoding_name="cl100k_base"),
        "character": lambda: CharacterTokenizer(chars_per_token=4),
        "simple": lambda: SimpleTokenizer(),
    }
    factory = factories.get(normalized)
    if factory is None:
        raise ValueError(
            f"Unsupported tokenizer strategy '{strategy}'. "
            "Expected one of: tiktoken, character, simple"
        )
    return factory()
4 changes: 2 additions & 2 deletions src/aci/infrastructure/embedding/response_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,9 @@ def is_token_limit_error(status_code: int, response_text: str) -> bool:
if status_code == 400:
response_lower = response_text.lower()
# Check for common token limit error patterns
if "token" in response_lower:
if any(pattern in response_lower for pattern in ["token", "input length", "context length"]):
if any(pattern in response_lower for pattern in [
"limit", "8192", "exceed", "maximum", "many"
"limit", "8192", "exceed", "maximum", "many", "context length"
]):
return True
# Check for SiliconFlow specific error code
Expand Down
7 changes: 5 additions & 2 deletions src/aci/services/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from aci.core.file_scanner import FileScanner
from aci.core.qdrant_launcher import ensure_qdrant_running
from aci.core.summary_generator import SummaryGenerator
from aci.core.tokenizer import get_default_tokenizer
from aci.infrastructure import (
EmbeddingClientInterface,
IndexMetadataStore,
Expand Down Expand Up @@ -120,11 +121,13 @@ def create_services(
ignore_patterns=config.indexing.ignore_patterns,
)

# Create summary generator for multi-granularity indexing
summary_generator = SummaryGenerator()
# Create tokenizer and summary generator for multi-granularity indexing
tokenizer = get_default_tokenizer(config.indexing.tokenizer)
summary_generator = SummaryGenerator(tokenizer=tokenizer)

# Create chunker with config-driven settings
chunker = create_chunker(
tokenizer=tokenizer,
max_tokens=config.indexing.max_chunk_tokens,
overlap_lines=config.indexing.chunk_overlap_lines,
summary_generator=summary_generator,
Expand Down
1 change: 1 addition & 0 deletions tests/property/test_config_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ def indexing_config_strategy(draw):
max_chunk_tokens=draw(st.integers(min_value=100, max_value=32000)),
chunk_overlap_lines=draw(st.integers(min_value=0, max_value=50)),
max_workers=draw(st.integers(min_value=1, max_value=32)),
tokenizer=draw(st.sampled_from(["tiktoken", "character", "simple"])),
)


Expand Down
1 change: 1 addition & 0 deletions tests/property/test_embedding_client_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ async def run_test():
"token limit exceeded",
"maximum token limit",
"too many tokens",
"the input length exceeds the context length",
'{"code":20042,"message":"input must have less than 8192 tokens"}',
]

Expand Down
Loading