11 changes: 10 additions & 1 deletion docs/src/content/docs/mcp/kit-dev-mcp.mdx
@@ -82,18 +82,27 @@ The server provides many tools including:

- **open_repository** - Open local or remote Git repositories
- **search_code** - Pattern-based code search
- - **grep_code** - Fast literal string search
+ - **grep_code** - Fast literal string search (120s default timeout, configurable via `KIT_GREP_TIMEOUT`)
- **get_file_tree** - Repository file structure with pagination support (`limit`/`offset` params)
- **get_file_content** - Read file contents
- **extract_symbols** - Extract functions, classes, and symbols
- **find_symbol_usages** - Find where symbols are used
- **get_code_summary** - AI-powered code summaries
- **warm_cache** - Pre-warm caches for faster operations on large codebases (100K+ files)
- **review_diff** - AI-powered diff reviews
- **deep_research_package** - Comprehensive package documentation
- **semantic_search** - Vector-based code search
- **package_search_grep** - Search package source code with regex patterns
- **package_search_hybrid** - Semantic search in package source code
- **package_search_read_file** - Read specific files from packages

<Aside type="tip" title="Large Codebases">
For repositories with 50K+ files (Linux kernel, Kubernetes, etc.):
- Use `warm_cache` first to pre-load the file tree (~3-5s for 100K files)
- Use `get_file_tree` with `limit`/`offset` for pagination
- `grep_code` now defaults to 120s timeout (set `KIT_GREP_TIMEOUT` env var to adjust)
</Aside>
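
For example, a wrapper script that launches the MCP server could raise the timeout for every search in the session (a sketch; the 300-second value and the wrapper approach are illustrative, not prescribed):

```python
import os

# Must be set in the server process's environment before searches run;
# an explicit `timeout` argument to a single grep call still takes precedence.
os.environ["KIT_GREP_TIMEOUT"] = "300"  # seconds
```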

## Learn More

<LinkCard
91 changes: 88 additions & 3 deletions src/kit/mcp/dev_server.py
@@ -129,6 +129,22 @@ class GetFileTreeParams(BaseModel):
default=False,
description="Include directory entries (only relevant when compact=true).",
)
limit: int = Field(
default=10000,
description="Maximum number of files to return. Use with offset for pagination on very large repos.",
)
offset: int = Field(
default=0,
description="Number of files to skip. Use with limit for pagination.",
)


class WarmCacheParams(BaseModel):
"""Pre-warm caches for faster subsequent operations on large codebases."""

repo_id: str
warm_file_tree: bool = Field(default=True, description="Pre-cache file tree (fast, recommended)")
warm_symbols: bool = Field(default=False, description="Pre-cache symbol extraction (slower, scans all files)")


class GetSymbolCodeParams(BaseModel):
@@ -284,6 +300,44 @@ def get_file_tree(self, repo_id: str) -> List[Dict[str, Any]]:
repo = self.get_repo(repo_id)
return repo.get_file_tree()

def warm_cache(self, repo_id: str, warm_file_tree: bool = True, warm_symbols: bool = False) -> Dict[str, Any]:
"""Pre-warm caches for faster subsequent operations on large codebases.

This is useful for very large repos where the first file_tree or symbol
extraction can take 30+ seconds. Warming caches upfront avoids timeouts.

Args:
repo_id: Repository ID to warm caches for
warm_file_tree: Pre-cache file tree (fast, ~1-5s for 100K files)
warm_symbols: Pre-cache symbols (slower, ~30-60s for 100K files)

Returns:
Dict with timing stats for each warmed cache
"""
import time

repo = self.get_repo(repo_id)
stats: Dict[str, Any] = {"repo_id": repo_id}

if warm_file_tree:
start = time.time()
tree = repo.get_file_tree()
stats["file_tree"] = {
"elapsed_seconds": round(time.time() - start, 2),
"file_count": len(tree),
}

if warm_symbols:
start = time.time()
# Trigger full repo scan by calling extract_symbols with no file
symbols = repo.extract_symbols()
stats["symbols"] = {
"elapsed_seconds": round(time.time() - start, 2),
"symbol_count": len(symbols),
}

return stats
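
# Illustrative sketch (not part of the diff): with the defaults above,
#   logic.warm_cache("linux", warm_file_tree=True, warm_symbols=False)
# returns timing stats shaped like
#   {"repo_id": "linux", "file_tree": {"elapsed_seconds": 3.2, "file_count": 80000}}
# where the repo id and the numbers are hypothetical placeholders.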

def extract_symbols(self, repo_id: str, file_path: str, symbol_type: Optional[str] = None) -> List[Dict[str, Any]]:
"""Extract symbols from a file."""
repo = self.get_repo(repo_id)
@@ -545,6 +599,11 @@ def list_tools(self) -> List[Tool]:
description="Get source code of a specific symbol (lazy loading for context efficiency)",
inputSchema=GetSymbolCodeParams.model_json_schema(),
),
Tool(
name="warm_cache",
description="Pre-warm caches for faster operations on large codebases (call before get_file_tree on huge repos)",
inputSchema=WarmCacheParams.model_json_schema(),
),
]


@@ -1072,15 +1131,33 @@ async def call_tool(name: str, arguments: dict) -> List[TextContent]:
elif name == "get_file_tree":
tree_params = GetFileTreeParams(**arguments)
result = logic.get_file_tree(tree_params.repo_id)

# Apply pagination for large codebases
total_count = len(result)
start = tree_params.offset
end = start + tree_params.limit
paginated = result[start:end]
has_more = end < total_count

# Compact mode: newline-separated paths (saves ~75% context)
if tree_params.compact:
paths = []
- for item in result:
+ for item in paginated:
is_dir = item.get("is_dir", False)
if tree_params.include_dirs or not is_dir:
paths.append(item.get("path", ""))
- return [TextContent(type="text", text="\n".join(paths))]
- return [TextContent(type="text", text=json.dumps(result, indent=2))]
+ # Include pagination metadata as header for compact mode
+ header = f"# total={total_count} offset={start} limit={tree_params.limit} has_more={has_more}\n"
+ return [TextContent(type="text", text=header + "\n".join(paths))]
+ # JSON mode: include pagination in response
+ response = {
+     "files": paginated,
+     "total_count": total_count,
+     "offset": start,
+     "limit": tree_params.limit,
+     "has_more": has_more,
+ }
+ return [TextContent(type="text", text=json.dumps(response, indent=2))]
elif name == "get_code_summary":
summary_params = GetCodeSummaryParams(**arguments)
result = logic.get_code_summary(
@@ -1119,6 +1196,14 @@ async def call_tool(name: str, arguments: dict) -> List[TextContent]:
symbol_code_params.symbol_name,
)
return [TextContent(type="text", text=json.dumps(result, indent=2))]
elif name == "warm_cache":
cache_params = WarmCacheParams(**arguments)
result = logic.warm_cache(
cache_params.repo_id,
cache_params.warm_file_tree,
cache_params.warm_symbols,
)
return [TextContent(type="text", text=json.dumps(result, indent=2))]
else:
# Should not happen since we checked the name is in the list
return [TextContent(type="text", text=f"Tool {name} is recognized but not implemented")]
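
A client-side pagination loop over the new `get_file_tree` response might look like this (a sketch, not part of the diff; `call_tool` is the dispatcher above and the repo id "linux" is a hypothetical placeholder):

```python
import json

async def fetch_all_files(call_tool) -> list:
    """Page through a huge repo's file tree using the JSON response shape."""
    files, offset, limit = [], 0, 10_000
    while True:
        result = await call_tool("get_file_tree", {
            "repo_id": "linux",
            "compact": False,  # JSON mode, so pagination metadata is structured
            "limit": limit,
            "offset": offset,
        })
        page = json.loads(result[0].text)
        files.extend(page["files"])
        if not page["has_more"]:
            return files
        offset += limit
```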
27 changes: 25 additions & 2 deletions src/kit/repository.py
@@ -379,6 +379,7 @@ def grep(
max_results: int = 1000,
directory: Optional[str] = None,
include_hidden: bool = False,
timeout: Optional[int] = None,
) -> List[Dict[str, Any]]:
"""
Performs literal grep search on repository files using system grep.
@@ -389,8 +390,11 @@
include_pattern: Glob pattern for files to include (e.g. '*.py').
exclude_pattern: Glob pattern for files to exclude.
max_results: Maximum number of results to return. Defaults to 1000.
Uses grep's -m flag for early termination on large codebases.
directory: Limit search to specific directory within repository (e.g. 'src', 'lib/utils').
include_hidden: Whether to search hidden directories (starting with '.'). Defaults to False.
timeout: Search timeout in seconds. Defaults to 120s (or KIT_GREP_TIMEOUT env var).
For very large codebases (10M+ files), consider increasing this.

Returns:
List[Dict[str, Any]]: List of matches with file, line_number, line_content.
@@ -403,6 +407,17 @@

self._ensure_git_state_valid()

# Resolve timeout: parameter > env var > default (120s)
if timeout is None:
env_timeout = os.environ.get("KIT_GREP_TIMEOUT")
if env_timeout:
try:
timeout = int(env_timeout)
except ValueError:
timeout = 120
else:
timeout = 120

# Build grep command
cmd = ["grep", "-r", "-n", "-H"] # -r for recursive, -n for line numbers, -H for filenames

@@ -477,6 +492,11 @@ def grep(
raise ValueError(f"Directory not found in repository: {directory}")
search_path = directory

# Early termination: the -m flag stops grep's scan of each file once it has
# max_results matches, which can dramatically speed up searches on
# match-heavy files in large codebases
if max_results > 0:
cmd.extend(["-m", str(max_results)])

# Search recursively in specified directory
cmd.append(search_path)

@@ -487,10 +507,13 @@
capture_output=True,
text=True,
encoding="utf-8",
- timeout=30, # 30 second timeout
+ timeout=timeout,
)
except subprocess.TimeoutExpired:
- raise RuntimeError("Grep search timed out after 30 seconds")
+ raise RuntimeError(
+     f"Grep search timed out after {timeout} seconds. "
+     "Set KIT_GREP_TIMEOUT env var or timeout parameter for longer searches."
+ )
except FileNotFoundError:
raise RuntimeError("grep command not found. Please ensure grep is installed and in PATH.")
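
With this change the timeout resolves as: explicit `timeout` parameter, then `KIT_GREP_TIMEOUT`, then the 120s default. A usage sketch, assuming `Repository` is importable from the package root and that the search pattern is the first positional argument (both assumptions, not confirmed by this diff):

```python
from kit import Repository  # assumed import path

repo = Repository("/path/to/linux")  # hypothetical local checkout

# An explicit per-call timeout wins over KIT_GREP_TIMEOUT and the 120s default.
matches = repo.grep(
    "EXPORT_SYMBOL",
    include_pattern="*.c",
    max_results=500,   # also emitted as grep's `-m 500` for early termination
    timeout=600,
)
for m in matches[:5]:
    print(m["file"], m["line_number"], m["line_content"])
```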
