Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions docs/wiki/Configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,48 @@ Files matching `.gitignore` are also skipped automatically.

---

## Custom File Extensions

By default the indexer recognises common source file extensions (`.py`, `.ts`, `.go`, `.html`, `.css`, …) and routes each to the right tree-sitter parser when one is available. If your project uses an extension CCE doesn't know about — a template language, a rebranded JS extension, a config DSL — register it under `indexer.extensions`:

```yaml
indexer:
  extensions:
    .tpl: html         # alias to an existing parser
    .mjs: javascript
    .cts: typescript
    .liquid: ""        # index as plaintext (no AST chunking)
    .erb: ""
```

**Rules:**

- **Keys** must start with `.` and are matched case-insensitively against file suffixes (`.HTML` and `.html` resolve the same way).
- **Values** are language strings — anything in the built-in `_LANGUAGE_MAP` works (`html`, `javascript`, `typescript`, `python`, `go`, `rust`, `java`, `php`, etc.). Unknown values are accepted and fall back to plaintext at chunk time.
- **Empty string or `null`** indexes the file as a single plaintext chunk. Useful when you want the file searchable but know there's no parser for it.
- **User entries override built-ins.** For example, to have `.h` files labelled as C++ instead of C (neither language has an AST parser, so this changes only the language recorded for the file):

```yaml
indexer:
  extensions:
    .h: cpp
```

**Where to put it:**

- Global default: `~/.cce/config.yaml`
- Project-specific: `.context-engine.yaml` in the project root (overrides the global entry per-extension)

**After editing**, re-run indexing so existing files get re-chunked under the new mapping:

```bash
cce index --full
```

**Parsers with full AST chunking** (semantic chunks for functions, classes, blocks): Python, JavaScript, TypeScript/TSX, PHP, Go, Rust, Java. HTML is chunked at `<script>` and `<style>` elements only; pages that contain neither are indexed as a single whole-file chunk. Other languages (`css`, `markdown`, `json`, `yaml`, …) are mapped for metadata but indexed as a single plaintext chunk per file.

---

## Changing the Embedding Model

```yaml
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ dependencies = [
"tree-sitter-go>=0.23",
"tree-sitter-rust>=0.23",
"tree-sitter-java>=0.23",
"tree-sitter-html>=0.20",
"watchdog>=4.0",
"mcp>=1.0",
"httpx>=0.27",
Expand Down
27 changes: 27 additions & 0 deletions src/context_engine/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,14 @@ class Config:
# index. See indexer/secrets.py for the full pattern list. Default
# True; users on non-sensitive corpora can opt out.
indexer_redact_secrets: bool = True
# Extra extension → language mappings layered over the indexer's built-in
# _LANGUAGE_MAP. Keys must start with "." and are matched case-insensitively
# against file suffixes; values are language strings (e.g. "html",
# "javascript") that pick the tree-sitter parser, or "" for plaintext.
# Unknown language strings are accepted and fall back to plaintext at
# chunk time — same behavior as built-in entries without a parser (md,
# css, json, …).
indexer_extensions: dict[str, str] = field(default_factory=dict)
# When True, memory.db writes (decisions, code_areas, turn_summaries,
# session rollups) get PII scrubbed before storage: emails, IPs,
# credit cards (Luhn-validated), SSNs, phone numbers. Free-form
Expand Down Expand Up @@ -127,6 +135,7 @@ def _deep_merge(base: dict, override: dict) -> dict:
"indexer_debounce_ms": int,
"indexer_ignore": list,
"indexer_redact_secrets": bool,
"indexer_extensions": dict,
"memory_redact_pii": bool,
"audit_log_enabled": bool,
"storage_path": str,
Expand All @@ -148,6 +157,7 @@ def _apply_dict_to_config(config: Config, data: dict) -> None:
("indexer", "debounce_ms"): "indexer_debounce_ms",
("indexer", "ignore"): "indexer_ignore",
("indexer", "redact_secrets"): "indexer_redact_secrets",
("indexer", "extensions"): "indexer_extensions",
("memory", "redact_pii"): "memory_redact_pii",
("audit", "enabled"): "audit_log_enabled",
("storage", "path"): "storage_path",
Expand Down Expand Up @@ -176,6 +186,23 @@ def _apply_dict_to_config(config: Config, data: dict) -> None:
if item not in merged:
merged.append(item)
setattr(config, attr, merged)
elif attr == "indexer_extensions" and isinstance(value, dict):
normalized: dict[str, str] = {}
for ext, lang in value.items():
if not isinstance(ext, str) or not ext.startswith("."):
raise ValueError(
f"Config indexer.extensions: key {ext!r} must be a "
"string starting with '.' (e.g. '.tpl')"
)
if lang is None:
lang = ""
if not isinstance(lang, str):
raise ValueError(
f"Config indexer.extensions[{ext!r}]: language must "
f"be a string or null, got {type(lang).__name__}"
)
normalized[ext.lower()] = lang
setattr(config, attr, normalized)
else:
setattr(config, attr, value)

Expand Down
11 changes: 11 additions & 0 deletions src/context_engine/indexer/chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import tree_sitter_go as tsgo
import tree_sitter_rust as tsrust
import tree_sitter_java as tsjava
import tree_sitter_html as tshtml
from tree_sitter import Language, Parser

from context_engine.models import Chunk, ChunkType
Expand All @@ -28,6 +29,13 @@
"import_declaration", # TypeScript, Go, Java
"use_declaration", # PHP, Rust
}
# HTML chunks at <script> and <style> boundaries — the only nodes that
# reliably represent a self-contained unit worth retrieving on its own.
# Pages without either fall through to the whole-file plaintext chunk.
_HTML_BLOCK_TYPES = {
"script_element",
"style_element",
}

_LANGUAGES = {
"python": Language(tspython.language()),
Expand All @@ -38,6 +46,7 @@
"go": Language(tsgo.language()),
"rust": Language(tsrust.language()),
"java": Language(tsjava.language()),
"html": Language(tshtml.language()),
}


Expand Down Expand Up @@ -69,6 +78,8 @@ def _walk(self, node, source, file_path, language, chunks):
chunks.append(self._node_to_chunk(node, source, file_path, language, ChunkType.FUNCTION))
elif node.type in _CLASS_TYPES:
chunks.append(self._node_to_chunk(node, source, file_path, language, ChunkType.CLASS))
elif node.type in _HTML_BLOCK_TYPES:
chunks.append(self._node_to_chunk(node, source, file_path, language, ChunkType.MODULE))
for child in node.children:
self._walk(child, source, file_path, language, chunks)

Expand Down
16 changes: 15 additions & 1 deletion src/context_engine/indexer/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,19 @@ def _pipeline_lock(storage_key: str) -> asyncio.Lock:
}


def _resolve_language(suffix: str, custom: dict[str, str]) -> str:
"""Pick a language string for a file suffix.

User-supplied `custom` mappings (from config.indexer_extensions) override
`_LANGUAGE_MAP`. An empty value means plaintext indexing — useful when the
user wants the file included but knows there is no parser for it.
"""
key = suffix.lower()
if key in custom:
return custom[key] or "plaintext"
return _LANGUAGE_MAP.get(key, "plaintext")


@dataclass
class IndexResult:
indexed_files: list[str] = field(default_factory=list)
Expand Down Expand Up @@ -341,6 +354,7 @@ async def _run_indexing_locked(
chunker = Chunker()
manifest = Manifest(manifest_path=storage_base / "manifest.json")
ignore_set = set(config.indexer_ignore)
custom_extensions: dict[str, str] = dict(getattr(config, "indexer_extensions", {}) or {})
# Load .cceignore once per indexing run. Patterns are evaluated against
# paths relative to project_dir; see indexer/ignorefile.py.
from context_engine.indexer.ignorefile import load_ignore_patterns
Expand Down Expand Up @@ -498,7 +512,7 @@ async def _embed_and_ingest(
log_fn(f" [skip] {rel_path} (unchanged)")
continue

language = _LANGUAGE_MAP.get(file_path.suffix, "plaintext")
language = _resolve_language(file_path.suffix, custom_extensions)
to_chunk.append((file_path, rel_path, content, content_hash, language))

batch_chunks: list = []
Expand Down
38 changes: 38 additions & 0 deletions tests/indexer/test_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,3 +88,41 @@ def test_chunk_still_works_without_imports():
chunker = Chunker()
chunks = chunker.chunk(source, file_path="hello.py", language="python")
assert len(chunks) == 1


# Fixture: a page containing exactly one <script> and one <style> element.
HTML_WITH_BLOCKS = """<!DOCTYPE html>
<html>
<head><title>Page</title></head>
<body>
<h1>Hello</h1>
<script>
console.log("a");
</script>
<style>
body { color: red; }
</style>
</body>
</html>
"""

# Fixture: a page with neither <script> nor <style>.
HTML_PLAIN = """<!DOCTYPE html>
<html><body><h1>Just text</h1><p>no script or style here</p></body></html>
"""


def test_chunk_html_extracts_script_and_style(chunker):
    """<script> and <style> elements each surface as a MODULE chunk."""
    chunks = chunker.chunk(HTML_WITH_BLOCKS, file_path="page.html", language="html")
    module_bodies = []
    for chunk in chunks:
        if chunk.chunk_type == ChunkType.MODULE:
            module_bodies.append(chunk.content)
    # At least one MODULE chunk per block (script + style).
    assert len(module_bodies) >= 2
    assert any("console.log" in body for body in module_bodies)
    assert any("color: red" in body for body in module_bodies)


def test_chunk_html_without_blocks_falls_back_to_whole_file(chunker):
    """A page with no <script>/<style> yields a single MODULE chunk."""
    chunks = chunker.chunk(HTML_PLAIN, file_path="plain.html", language="html")
    # Nothing to split on, so the fallback path returns the file as one chunk.
    assert len(chunks) == 1
    whole_file = chunks[0]
    assert whole_file.chunk_type == ChunkType.MODULE
    assert "Just text" in whole_file.content
35 changes: 35 additions & 0 deletions tests/indexer/test_language_resolution.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""Unit tests for `_resolve_language` — the indexer hook that lets users add
custom file-extension → language mappings via `indexer.extensions` in
`.context-engine.yaml`.
"""
from context_engine.indexer.pipeline import _resolve_language


def test_builtin_extension_resolves_to_known_language():
    """With no custom mappings, `.py` resolves to "python"."""
    resolved = _resolve_language(".py", {})
    assert resolved == "python"


def test_unknown_extension_falls_back_to_plaintext():
    """A suffix with no custom entry and no built-in match is plaintext."""
    resolved = _resolve_language(".xyz", {})
    assert resolved == "plaintext"


def test_custom_alias_overrides_builtin():
    """A custom entry wins over the built-in mapping for the same suffix."""
    # .h normally maps to c; the custom mapping flips it to cpp.
    overrides = {".h": "cpp"}
    assert _resolve_language(".h", overrides) == "cpp"


def test_custom_alias_for_unknown_extension():
    """A custom entry can alias a new suffix to an existing language."""
    aliases = {".tpl": "html"}
    assert _resolve_language(".tpl", aliases) == "html"


def test_custom_empty_value_means_plaintext():
    """An empty custom value resolves to "plaintext"."""
    # The user wants the file indexed but knows there is no parser for it.
    no_parser = {".liquid": ""}
    assert _resolve_language(".liquid", no_parser) == "plaintext"


def test_lookup_is_case_insensitive():
    """Upper-case suffixes match both built-in and custom entries."""
    # Path.suffix preserves case (.HTML on Windows mounts, .R for R files),
    # while custom map keys are lowercased at config load time, so the
    # lookup has to lowercase the suffix as well.
    builtin_hit = _resolve_language(".HTML", {})
    custom_hit = _resolve_language(".TPL", {".tpl": "html"})
    assert builtin_hit == "html"
    assert custom_hit == "html"
assert _resolve_language(".TPL", {".tpl": "html"}) == "html"
52 changes: 52 additions & 0 deletions tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,3 +85,55 @@ def test_ollama_url_yaml_type_validation(tmp_path):
}))
with pytest.raises(ValueError, match="ollama_url"):
load_config(global_path=config_file)


def test_indexer_extensions_default_empty():
    """A freshly constructed Config has no custom extension mappings."""
    default_config = Config()
    assert default_config.indexer_extensions == {}


def test_indexer_extensions_loads_and_normalizes(tmp_path):
    """Keys are lowercased and a null value is stored as an empty string."""
    raw_extensions = {".tpl": "html", ".MJS": "javascript", ".liquid": "", ".erb": None}
    config_file = tmp_path / "config.yaml"
    config_file.write_text(yaml.dump({"indexer": {"extensions": raw_extensions}}))

    loaded = load_config(global_path=config_file)

    expected = {
        ".tpl": "html",
        ".mjs": "javascript",
        ".liquid": "",
        ".erb": "",
    }
    assert loaded.indexer_extensions == expected


def test_indexer_extensions_rejects_key_without_dot(tmp_path):
    """An extension key that lacks the leading '.' is a config error."""
    bad_config = {"indexer": {"extensions": {"tpl": "html"}}}
    config_file = tmp_path / "config.yaml"
    config_file.write_text(yaml.dump(bad_config))

    with pytest.raises(ValueError, match="must be a string starting with"):
        load_config(global_path=config_file)


def test_indexer_extensions_rejects_non_string_value(tmp_path):
    """A language value that is neither a string nor null is a config error."""
    bad_config = {"indexer": {"extensions": {".tpl": 123}}}
    config_file = tmp_path / "config.yaml"
    config_file.write_text(yaml.dump(bad_config))

    with pytest.raises(ValueError, match="must be a string or null"):
        load_config(global_path=config_file)


def test_indexer_extensions_project_overrides_global(tmp_path):
    """A project-level entry replaces the global entry for the same suffix."""
    global_settings = {"indexer": {"extensions": {".tpl": "html"}}}
    project_settings = {"indexer": {"extensions": {".tpl": "javascript", ".vue": "vue"}}}

    global_file = tmp_path / "config.yaml"
    project_file = tmp_path / ".context-engine.yaml"
    global_file.write_text(yaml.dump(global_settings))
    project_file.write_text(yaml.dump(project_settings))

    merged = load_config(global_path=global_file, project_path=project_file)

    # .tpl is defined in both files and the project value wins; .vue is
    # defined only in the project file and must be present as well.
    extensions = merged.indexer_extensions
    assert extensions[".tpl"] == "javascript"
    assert extensions[".vue"] == "vue"
Loading