Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 13 additions & 9 deletions src/aci/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,13 @@
ScannedFile,
get_default_registry,
)
from aci.core.tokenizer import (
TiktokenTokenizer,
TokenizerInterface,
get_default_tokenizer,
)
from aci.core.tokenizer import (
CharacterTokenizer,
SimpleTokenizer,
TiktokenTokenizer,
TokenizerInterface,
get_default_tokenizer,
)
from aci.core.watch_config import WatchConfig

__all__ = [
Expand All @@ -70,10 +72,12 @@
"TreeSitterParser",
"SUPPORTED_LANGUAGES",
"check_tree_sitter_setup",
# Tokenizer
"TokenizerInterface",
"TiktokenTokenizer",
"get_default_tokenizer",
# Tokenizer
"TokenizerInterface",
"TiktokenTokenizer",
"CharacterTokenizer",
"SimpleTokenizer",
"get_default_tokenizer",
# Chunker
"CodeChunk",
"ChunkerConfig",
Expand Down
2 changes: 2 additions & 0 deletions src/aci/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ class IndexingConfig:
default_factory=lambda: _get_default("indexing", "chunk_overlap_lines", 2)
)
max_workers: int = field(default_factory=lambda: _get_default("indexing", "max_workers", 4))
tokenizer: str = field(default_factory=lambda: _get_default("indexing", "tokenizer", "tiktoken"))


@dataclass
Expand Down Expand Up @@ -226,6 +227,7 @@ def apply_env_overrides(self) -> "ACIConfig":
"ACI_INDEXING_MAX_CHUNK_TOKENS": ("indexing", "max_chunk_tokens", int),
"ACI_INDEXING_CHUNK_OVERLAP_LINES": ("indexing", "chunk_overlap_lines", int),
"ACI_INDEXING_MAX_WORKERS": ("indexing", "max_workers", int),
"ACI_TOKENIZER": ("indexing", "tokenizer", str),
"ACI_INDEXING_FILE_EXTENSIONS": (
"indexing",
"file_extensions",
Expand Down
98 changes: 89 additions & 9 deletions src/aci/core/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
Uses tiktoken library for accurate token counting compatible with OpenAI models.
"""

from abc import ABC, abstractmethod
from abc import ABC, abstractmethod
from math import ceil

import tiktoken

Expand Down Expand Up @@ -44,7 +45,7 @@ def truncate_to_tokens(self, text: str, max_tokens: int) -> str:
pass


class TiktokenTokenizer(TokenizerInterface):
class TiktokenTokenizer(TokenizerInterface):
"""
Tokenizer implementation using tiktoken library.

Expand Down Expand Up @@ -134,14 +135,93 @@ def truncate_to_tokens(self, text: str, max_tokens: int) -> str:
result_lines.append(line)
current_tokens += line_tokens

return "\n".join(result_lines)


def get_default_tokenizer() -> TokenizerInterface:
return "\n".join(result_lines)


class CharacterTokenizer(TokenizerInterface):
    """Estimate token counts from raw character length.

    Assumes roughly ``chars_per_token`` characters per token and rounds up,
    so the estimate is conservative (never under-counts relative to the
    assumed ratio). Useful when an exact BPE tokenizer is unavailable.
    """

    def __init__(self, chars_per_token: int = 4):
        """Create the estimator.

        Args:
            chars_per_token: Assumed characters per token; must be positive.

        Raises:
            ValueError: If ``chars_per_token`` is zero or negative.
        """
        if chars_per_token <= 0:
            raise ValueError("chars_per_token must be greater than 0")
        self._chars_per_token = chars_per_token

    def count_tokens(self, text: str) -> int:
        """Return the estimated token count for ``text`` (0 for empty input)."""
        return ceil(len(text) / self._chars_per_token) if text else 0

    def truncate_to_tokens(self, text: str, max_tokens: int) -> str:
        """Trim ``text`` at line boundaries so the estimate fits the budget.

        Whole lines are kept in order until adding the next line (including
        its joining newline) would exceed ``max_tokens``. Returns "" when the
        budget is non-positive or the input is empty.
        """
        if not text or max_tokens <= 0:
            return ""

        # Fast path: already within budget, return unchanged.
        if self.count_tokens(text) <= max_tokens:
            return text

        kept: list[str] = []
        used = 0
        for raw_line in text.split("\n"):
            # Lines after the first pay for the newline that rejoins them.
            piece = f"\n{raw_line}" if kept else raw_line
            cost = self.count_tokens(piece)
            if used + cost > max_tokens:
                break
            kept.append(raw_line)
            used += cost

        return "\n".join(kept)


class SimpleTokenizer(TokenizerInterface):
    """Whitespace-word tokenizer for generic, non-BPE models.

    Each whitespace-delimited word counts as one token; consecutive
    whitespace and leading/trailing whitespace contribute nothing.
    """

    def count_tokens(self, text: str) -> int:
        """Return the number of whitespace-delimited words in ``text``."""
        return len(text.split()) if text else 0

    def truncate_to_tokens(self, text: str, max_tokens: int) -> str:
        """Trim ``text`` at line boundaries to at most ``max_tokens`` words.

        Whole lines are kept in order; the first line whose word count would
        push the running total past the budget (and everything after it) is
        dropped. Returns "" for empty input or a non-positive budget.
        """
        if not text or max_tokens <= 0:
            return ""

        # Fast path: whole text already fits.
        if self.count_tokens(text) <= max_tokens:
            return text

        kept: list[str] = []
        used = 0
        for raw_line in text.split("\n"):
            # Mirror the join format; the leading newline adds no words.
            piece = f"\n{raw_line}" if kept else raw_line
            cost = self.count_tokens(piece)
            if used + cost > max_tokens:
                break
            kept.append(raw_line)
            used += cost

        return "\n".join(kept)


def get_default_tokenizer(strategy: str = "tiktoken") -> TokenizerInterface:
    """
    Get the default tokenizer instance.

    Args:
        strategy: Which implementation to build — "tiktoken", "character",
            or "simple". Matching is case-insensitive and ignores
            surrounding whitespace.

    Returns:
        A tokenizer implementation matching the configured strategy.

    Raises:
        ValueError: If ``strategy`` does not name a known tokenizer.
    """
    normalized = strategy.strip().lower()
    # Dispatch table: each entry lazily builds its tokenizer only when chosen.
    factories = {
        "tiktoken": lambda: TiktokenTokenizer(encoding_name="cl100k_base"),
        "character": lambda: CharacterTokenizer(chars_per_token=4),
        "simple": lambda: SimpleTokenizer(),
    }
    factory = factories.get(normalized)
    if factory is None:
        raise ValueError(
            f"Unsupported tokenizer strategy '{strategy}'. "
            "Expected one of: tiktoken, character, simple"
        )
    return factory()
4 changes: 2 additions & 2 deletions src/aci/infrastructure/embedding/response_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,9 @@ def is_token_limit_error(status_code: int, response_text: str) -> bool:
if status_code == 400:
response_lower = response_text.lower()
# Check for common token limit error patterns
if "token" in response_lower:
if any(pattern in response_lower for pattern in ["token", "input length", "context length"]):
if any(pattern in response_lower for pattern in [
"limit", "8192", "exceed", "maximum", "many"
"limit", "8192", "exceed", "maximum", "many", "context length"
]):
return True
# Check for SiliconFlow specific error code
Expand Down
7 changes: 5 additions & 2 deletions src/aci/services/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from aci.core.file_scanner import FileScanner
from aci.core.qdrant_launcher import ensure_qdrant_running
from aci.core.summary_generator import SummaryGenerator
from aci.core.tokenizer import get_default_tokenizer
from aci.infrastructure import (
EmbeddingClientInterface,
IndexMetadataStore,
Expand Down Expand Up @@ -120,11 +121,13 @@ def create_services(
ignore_patterns=config.indexing.ignore_patterns,
)

# Create summary generator for multi-granularity indexing
summary_generator = SummaryGenerator()
# Create tokenizer and summary generator for multi-granularity indexing
tokenizer = get_default_tokenizer(config.indexing.tokenizer)
summary_generator = SummaryGenerator(tokenizer=tokenizer)

# Create chunker with config-driven settings
chunker = create_chunker(
tokenizer=tokenizer,
max_tokens=config.indexing.max_chunk_tokens,
overlap_lines=config.indexing.chunk_overlap_lines,
summary_generator=summary_generator,
Expand Down
1 change: 1 addition & 0 deletions tests/property/test_config_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ def indexing_config_strategy(draw):
max_chunk_tokens=draw(st.integers(min_value=100, max_value=32000)),
chunk_overlap_lines=draw(st.integers(min_value=0, max_value=50)),
max_workers=draw(st.integers(min_value=1, max_value=32)),
tokenizer=draw(st.sampled_from(["tiktoken", "character", "simple"])),
)


Expand Down
1 change: 1 addition & 0 deletions tests/property/test_embedding_client_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ async def run_test():
"token limit exceeded",
"maximum token limit",
"too many tokens",
"the input length exceeds the context length",
'{"code":20042,"message":"input must have less than 8192 tokens"}',
]

Expand Down
Loading