Skip to content
Merged
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ result.chunk.content # "def save_pretrained(self, path: PathLike, ..."

## Main Features

- **Fast**: indexes a repo in ~250 ms and answers queries in ~1.5 ms, all on CPU.
- **Fast**: indexes an average repo in ~250 ms and answers queries in ~1.5 ms, all on CPU.
- **Accurate**: NDCG@10 of 0.854 on our [benchmarks](#benchmarks), on par with code-specialized transformer models, at a fraction of the size and cost.
- **Token-efficient**: returns only the relevant chunks, using ~98% fewer tokens than grep+read.
- **Zero setup**: runs on CPU with no API keys, GPU, or external services required.
Expand All @@ -68,7 +68,7 @@ result.chunk.content # "def save_pretrained(self, path: PathLike, ..."

## MCP Server

Semble can run as an MCP server so agents can search any codebase directly. Repos are cloned and indexed on demand, and indexes are cached for the lifetime of the session.
Semble can run as an MCP server so agents can search any codebase directly. Repos are cloned and indexed on demand, and indexes are cached for the lifetime of the session. Local paths are watched for file changes and re-indexed automatically.

### Setup

Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ dependencies = [

[project.optional-dependencies]
mcp = [
"mcp>=1.0",
"mcp>=1.0,<2.0",
"watchfiles>=0.21",
]
benchmark = [
"sentence-transformers>=3.0",
Expand Down
6 changes: 6 additions & 0 deletions src/semble/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@
import asyncio
import sys
from importlib.resources import files
from importlib.util import find_spec
from pathlib import Path

from model2vec.utils import get_package_extras

from semble.index import SembleIndex
from semble.utils import _format_results, _is_git_url, _resolve_chunk

Expand Down Expand Up @@ -32,6 +35,9 @@ def _mcp_main() -> None:
)
parser.add_argument("--ref", default=None, help="Branch or tag to check out (git URLs only).")
args = parser.parse_args()
if any(find_spec(dep) is None for dep in get_package_extras("semble", "mcp")):
print("MCP dependencies are not installed. Run: pip install 'semble[mcp]'", file=sys.stderr)
raise SystemExit(1)
from semble.mcp import serve

asyncio.run(serve(args.path, ref=args.ref))
Expand Down
43 changes: 37 additions & 6 deletions src/semble/mcp.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
from __future__ import annotations

import asyncio
import logging
from pathlib import Path
from typing import Annotated, Literal

import watchfiles
from mcp.server.fastmcp import FastMCP
from pydantic import Field

Expand All @@ -12,6 +14,8 @@
from semble.types import Encoder
from semble.utils import _format_results, _is_git_url, _resolve_chunk

logger = logging.getLogger(__name__)

_REPO_DESCRIPTION = (
"Git URL (e.g. https://github.com/org/repo) or local path to index and search. "
"Required when no default index was configured at startup. "
Expand Down Expand Up @@ -107,6 +111,8 @@ async def serve(path: str | None = None, ref: str | None = None) -> None:
cache = _IndexCache(model=model)
if path:
await cache.get(path, ref=ref)
if not _is_git_url(path):
await cache.start_watcher(path)

server = create_server(cache, default_source=path)
await server.run_stdio_async()
Expand All @@ -119,14 +125,38 @@ def __init__(self, model: Encoder) -> None:
"""Initialise an empty cache with a shared embedding model."""
self._model = model
self._tasks: dict[str, asyncio.Task[SembleIndex]] = {}
self._watcher_task: asyncio.Task[None] | None = None

def _compute_cache_key(self, source: str, ref: str | None = None) -> str:
    """Return the key under which the index for ``source`` is cached.

    Git URLs are keyed by the URL itself, suffixed with ``@<ref>`` when a
    truthy ``ref`` is given. Any other source is treated as a local path and
    keyed by its resolved form; ``ref`` plays no part in that case.
    """
    if not _is_git_url(source):
        return str(Path(source).resolve())
    if ref:
        return f"{source}@{ref}"
    return source

def evict(self, source: str) -> None:
self._tasks.pop(self._compute_cache_key(source), None)

async def start_watcher(self, path: str) -> None:
    """Start a background task that re-indexes ``path`` whenever files change.

    Only one watcher task is tracked at a time. If an earlier watcher is still
    running it is cancelled before being replaced, so overwriting the stored
    reference does not leave an untracked watch loop behind.

    Args:
        path: Local path handed to the watch loop.
    """
    previous = self._watcher_task
    if previous is not None and not previous.done():
        previous.cancel()
    # Keep the task on the instance: the event loop itself only holds weak
    # references to tasks, so an unreferenced task may be garbage-collected.
    self._watcher_task = asyncio.create_task(self._watch_loop(path))

async def _watch_loop(self, path: str) -> None:
    """Watch ``path`` and rebuild its cached index after every batch of changes.

    Each batch reported by the watcher evicts the cached entry and immediately
    requests a fresh index. A failed rebuild is logged and the loop keeps
    watching. If the watcher itself fails, the error is logged and the loop
    ends; the exception is not propagated to whoever awaits the task.
    """
    try:
        async for _ in watchfiles.awatch(path):
            self.evict(path)
            try:
                await self.get(path)
            except Exception:
                logger.warning("Failed to rebuild index for %r after file change", path, exc_info=True)
    except Exception:
        # Watching is best-effort, but once the watcher is gone the index is
        # no longer refreshed, so leave a trace instead of failing silently.
        logger.warning("File watcher for %r stopped unexpectedly", path, exc_info=True)

async def get(self, source: str, ref: str | None = None) -> SembleIndex:
"""Return an index for the requested source, building and caching it on first access."""
is_git = _is_git_url(source)
cache_key = (f"{source}@{ref}" if ref else source) if is_git else str(Path(source).resolve())
cache_key = self._compute_cache_key(source, ref)

if cache_key not in self._tasks:
if is_git:
if _is_git_url(source):
self._tasks[cache_key] = asyncio.create_task(
asyncio.to_thread(SembleIndex.from_git, source, ref=ref, model=self._model)
)
Expand All @@ -139,9 +169,10 @@ async def get(self, source: str, ref: str | None = None) -> SembleIndex:
return await asyncio.shield(task)
except asyncio.CancelledError: # pragma: no cover
if task.done():
self._tasks.pop(cache_key, None)
self.evict(source)
raise
except Exception:
# Build failed: evict so the next caller can retry.
self._tasks.pop(cache_key, None)
# Only evict if this task hasn't already been replaced by evict()+get().
if self._tasks.get(cache_key) is task:
self.evict(source)
raise
2 changes: 1 addition & 1 deletion src/semble/version.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
__version_triple__ = (0, 1, 1)
__version_triple__ = (0, 1, 2)
__version__ = ".".join(map(str, __version_triple__))
12 changes: 12 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,18 @@ def test_cli_entrypoint_works_without_mcp_installed(
assert expected_stdout in capsys.readouterr().out


def test_mcp_main_exits_with_message_when_extras_missing(
    monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]
) -> None:
    """With find_spec reporting every module as missing, the entry point prints the install hint and exits with 1."""
    monkeypatch.setattr(sys, "argv", ["semble"])
    with patch("semble.cli.find_spec", return_value=None), pytest.raises(SystemExit) as exc_info:
        main()
    captured = capsys.readouterr()
    assert exc_info.value.code == 1
    assert "pip install 'semble[mcp]'" in captured.err


def test_agent_file_tools_are_bash_only() -> None:
"""The agent file must list only Bash and Read — no MCP tools that require schema loading."""
frontmatter = _CLAUDE_AGENT_FILE.split("---")[1]
Expand Down
30 changes: 29 additions & 1 deletion tests/test_mcp.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from pathlib import Path
from typing import Any
from typing import Any, AsyncGenerator
from unittest.mock import AsyncMock, MagicMock, patch

import pytest
Expand Down Expand Up @@ -248,8 +248,36 @@ async def test_serve_runs_stdio(tmp_path: Path, with_path: bool) -> None:
with (
patch("semble.mcp.load_model", return_value=MagicMock(spec=Encoder)),
patch("semble.mcp.SembleIndex.from_path", return_value=MagicMock()),
patch.object(_IndexCache, "start_watcher", new_callable=AsyncMock),
patch("mcp.server.fastmcp.FastMCP.run_stdio_async", new_callable=AsyncMock) as mock_run,
):
await (serve(str(tmp_path)) if with_path else serve())

mock_run.assert_called_once()


def test_cache_evict(cache: _IndexCache, tmp_path: Path) -> None:
    """evict() drops the entry stored under the path's resolved form."""
    resolved = str(tmp_path.resolve())
    cache._tasks[resolved] = MagicMock()

    cache.evict(str(tmp_path))

    assert resolved not in cache._tasks


def test_cache_evict_missing(cache: _IndexCache, tmp_path: Path) -> None:
    """Evicting a path that was never cached completes without raising."""
    uncached = str(tmp_path)
    cache.evict(uncached)  # must be a silent no-op


@pytest.mark.anyio
async def test_watch_loop(cache: _IndexCache, tmp_path: Path) -> None:
    """The watch task finishes without raising when both the rebuild and the watcher itself fail."""

    async def one_change_then_crash(_path: str) -> AsyncGenerator:
        # Report a single (empty) change batch, then simulate the watcher dying.
        yield set()
        raise RuntimeError("watcher died")

    with (
        patch("semble.mcp.watchfiles.awatch", one_change_then_crash),
        patch("semble.mcp.SembleIndex.from_path", side_effect=RuntimeError("build failed")),
    ):
        await cache.start_watcher(str(tmp_path))
        # Awaiting the task would re-raise anything the loop failed to swallow.
        await cache._watcher_task
Loading
Loading