Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 25 additions & 96 deletions librarian/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@
from rich.panel import Panel
from rich.table import Table

from librarian.sources.ignore import GitignoreMatcher, should_skip_file

# Initialize Typer app
app = typer.Typer(
name="libr",
Expand Down Expand Up @@ -216,100 +218,13 @@ def _get_config() -> dict[str, Any]:
}


# Directory names whose contents are never indexed (VCS metadata, caches,
# virtual environments, OS artefacts). Matched against every ancestor of the
# candidate path, so the names apply at any depth.
_SKIP_DIRS = frozenset(
    {
        "__pycache__",
        ".git",
        ".svn",
        ".hg",
        "node_modules",
        ".venv",
        "venv",
        ".pytest_cache",
        ".mypy_cache",
        ".ruff_cache",
        "__MACOSX",
        ".DS_Store",
    }
)

# Lower-case extensions of binary/system files that are never indexed, even if
# the caller lists them as supported.
_SKIP_EXTENSIONS = frozenset(
    {
        # Executables and binaries
        ".exe",
        ".bin",
        ".dll",
        ".so",
        ".dylib",
        ".a",
        ".o",
        # Disk images and archives
        ".dmg",
        ".iso",
        ".img",
        ".app",
        ".pkg",
        # Compressed archives
        ".zip",
        ".tar",
        ".gz",
        ".bz2",
        ".xz",
        ".7z",
        ".rar",
        # Python compiled
        ".pyc",
        ".pyo",
        ".pyd",
        # System files
        ".lock",
        ".log",
        ".tmp",
        ".temp",
        ".cache",
        # Media files (large binaries)
        ".mp4",
        ".mp3",
        ".wav",
        ".avi",
        ".mov",
        ".flac",
        # Font files
        ".ttf",
        ".otf",
        ".woff",
        ".woff2",
    }
)


def _should_skip_file(file_path: Path, supported_extensions: set[str]) -> bool:
    """
    Check if a file should be skipped during indexing.

    A file is skipped when any of the following holds:

    * one of its ancestor directories is named in ``_SKIP_DIRS``;
    * its name starts with a dot (hidden file);
    * its extension is in ``_SKIP_EXTENSIONS``;
    * it has no extension at all;
    * its extension is not in ``supported_extensions``.

    Args:
        file_path: Path to the file.
        supported_extensions: Set of supported extensions. Entries are compared
            against the lower-cased ``file_path.suffix``, so they must be
            lower-case and include the leading dot (e.g. ``".md"``).

    Returns:
        True if the file should be skipped.
    """
    # Every ancestor is inspected, not just the immediate parent, so a file
    # nested anywhere below e.g. ``node_modules`` is rejected.
    if any(parent.name in _SKIP_DIRS for parent in file_path.parents):
        return True

    # Skip hidden files (starting with .)
    if file_path.name.startswith("."):
        return True

    suffix = file_path.suffix.lower()

    # The deny-list takes precedence over the caller's supported list.
    if suffix in _SKIP_EXTENSIONS:
        return True

    # Files without an extension are always skipped; the supported list is
    # not consulted for them (so a bare ``README`` is never indexed).
    if not suffix:
        return True

    # Skip if extension not in supported list
    return suffix not in supported_extensions
def _should_skip_file(
    file_path: Path,
    supported_extensions: set[str],
    gitignore_matcher: "GitignoreMatcher | None" = None,
) -> bool:
    """
    Decide whether a file is excluded from indexing.

    Thin CLI-side wrapper: the decision is delegated entirely to
    ``should_skip_file`` from ``librarian.sources.ignore``; all three
    arguments are forwarded positionally and unchanged.

    Args:
        file_path: Path to the candidate file.
        supported_extensions: Set of supported extensions.
        gitignore_matcher: Optional matcher forwarded to the shared helper;
            ``None`` (the default) is forwarded as-is.

    Returns:
        Whatever ``should_skip_file`` returns for these arguments.
    """
    skip = should_skip_file(file_path, supported_extensions, gitignore_matcher)
    return skip


def _find_source(name_or_path: str) -> dict | None:
Expand Down Expand Up @@ -506,6 +421,13 @@ def add_source(
verbose: Annotated[
bool, typer.Option("--verbose", "-v", help="Show files being indexed")
] = False,
include_ignored: Annotated[
bool,
typer.Option(
"--include-ignored",
help="Index files even when matched by a .gitignore in the source tree",
),
] = False,
) -> None:
"""Add a file or directory as a source and index it recursively."""
cfg = _get_config()
Expand Down Expand Up @@ -551,9 +473,13 @@ def add_source(
else:
files_to_index.extend(source_path.rglob(f"*{ext}"))

# Filter out system/binary files
gitignore_matcher = None if include_ignored else GitignoreMatcher(source_path)

# Filter out system/binary files and .gitignore matches
files_to_index = [
f for f in files_to_index if not _should_skip_file(f, supported_extensions)
f
for f in files_to_index
if not _should_skip_file(f, supported_extensions, gitignore_matcher)
]

# Apply pattern filter
Expand Down Expand Up @@ -598,6 +524,7 @@ def add_source(
"depth": depth,
"pattern": pattern,
"exclude": exclude,
"include_ignored": include_ignored,
"added_at": datetime.now().isoformat(),
}

Expand Down Expand Up @@ -629,6 +556,7 @@ def add_source(
server_ingest(
context=None, # type: ignore[arg-type]
directory=str(source_path),
include_ignored=include_ignored,
)
)

Expand Down Expand Up @@ -909,6 +837,7 @@ def index_build(
server_ingest(
context=None, # type: ignore[arg-type]
directory=str(src_path),
include_ignored=bool(src.get("include_ignored", False)),
)
)
total_indexed += result.get("indexed", 0) + result.get("updated", 0)
Expand Down
111 changes: 17 additions & 94 deletions librarian/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
from librarian.processing.embed import get_embedder
from librarian.processing.parsers.base import FileReadError, FileReadTimeoutError
from librarian.retrieval.search import HybridSearcher
from librarian.sources.ignore import GitignoreMatcher, should_skip_file
from librarian.storage.database import get_database
from librarian.tool_outputs import (
AddOutput,
Expand Down Expand Up @@ -96,99 +97,13 @@ def _process_and_index_file(file_path: Path) -> dict[str, Any]:
return get_indexing_service().index_file(file_path)


def _should_skip_file(file_path: Path, supported_extensions: set[str]) -> bool:
    """
    Decide whether a file is excluded from indexing.

    Args:
        file_path: Path to the candidate file.
        supported_extensions: Set of supported extensions, compared against
            the lower-cased suffix of ``file_path``.

    Returns:
        True if the file should be skipped, False if it should be indexed.
    """
    # Directory names that disqualify everything beneath them.
    ignored_dir_names = {
        "__pycache__",
        ".git",
        ".svn",
        ".hg",
        "node_modules",
        ".venv",
        "venv",
        ".pytest_cache",
        ".mypy_cache",
        ".ruff_cache",
        "__MACOSX",
        ".DS_Store",
    }

    # Binary/system extensions that are rejected regardless of the supported list.
    binary_extensions = {
        # Executables and binaries
        ".exe", ".bin", ".dll", ".so", ".dylib", ".a", ".o",
        # Disk images and archives
        ".dmg", ".iso", ".img", ".app", ".pkg",
        # Compressed archives
        ".zip", ".tar", ".gz", ".bz2", ".xz", ".7z", ".rar",
        # Python compiled
        ".pyc", ".pyo", ".pyd",
        # System files
        ".lock", ".log", ".tmp", ".temp", ".cache",
        # Media files (large binaries)
        ".mp4", ".mp3", ".wav", ".avi", ".mov", ".flac",
        # Font files
        ".ttf", ".otf", ".woff", ".woff2",
    }

    # Reject when any ancestor directory carries an ignored name.
    ancestor_names = {ancestor.name for ancestor in file_path.parents}
    if not ignored_dir_names.isdisjoint(ancestor_names):
        return True

    extension = file_path.suffix.lower()

    # Hidden files, extension-less files and deny-listed extensions are all rejected.
    if file_path.name.startswith(".") or not extension or extension in binary_extensions:
        return True

    # Anything left is kept only when its extension is supported.
    return extension not in supported_extensions
def _should_skip_file(
    file_path: Path,
    supported_extensions: set[str],
    gitignore_matcher: GitignoreMatcher | None = None,
) -> bool:
    """
    Decide whether a file is excluded from indexing.

    Thin server-side wrapper: the decision is delegated entirely to
    ``should_skip_file`` from ``librarian.sources.ignore``; all three
    arguments are forwarded positionally and unchanged.

    Args:
        file_path: Path to the candidate file.
        supported_extensions: Set of supported extensions.
        gitignore_matcher: Optional matcher forwarded to the shared helper;
            ``None`` (the default) is forwarded as-is.

    Returns:
        Whatever ``should_skip_file`` returns for these arguments.
    """
    skip = should_skip_file(file_path, supported_extensions, gitignore_matcher)
    return skip


def _resolve_path(raw_path: str, kind: str = "path") -> Path:
Expand Down Expand Up @@ -243,6 +158,10 @@ def _resolve_path(raw_path: str, kind: str = "path") -> Path:
async def index_directory_to_library(
context: Context,
directory: Annotated[str, "Absolute path to directory containing files to add to the library"],
include_ignored: Annotated[
bool,
"If True, index files even when matched by a .gitignore under the directory.",
] = False,
) -> Annotated[
IndexDirectoryOutput,
"Per-directory index summary with counts and a per-file status list.",
Expand Down Expand Up @@ -285,12 +204,16 @@ async def index_directory_to_library(
registry = get_registry()
supported_extensions = registry.get_supported_extensions()

gitignore_matcher = None if include_ignored else GitignoreMatcher(dir_path)

all_files: list[Path] = []
for ext in supported_extensions:
pattern = f"**/*{ext}"
all_files.extend(dir_path.glob(pattern))

all_files = [f for f in all_files if not _should_skip_file(f, supported_extensions)]
all_files = [
f for f in all_files if not _should_skip_file(f, supported_extensions, gitignore_matcher)
]

if not all_files:
return IndexDirectoryOutput(
Expand Down
1 change: 1 addition & 0 deletions librarian/sources/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Source management for the librarian."""
Loading
Loading