Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 20 additions & 3 deletions librarian/storage/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import threading
from collections.abc import Generator
from contextlib import contextmanager
from datetime import datetime
from datetime import date, datetime
from typing import Any

import sqlite_vec
Expand All @@ -30,6 +30,19 @@
logger = logging.getLogger(__name__)


def _json_default(value: Any) -> str:
    """Convert a value the stdlib JSON encoder cannot handle into a string.

    Meant to be passed as the ``default=`` hook of ``json.dumps``. Markdown
    frontmatter (Obsidian and similar) often carries ``YYYY-MM-DD`` values,
    which PyYAML parses into ``datetime.date``. Those are written out in ISO
    8601 form, so they read back as plain strings rather than date objects;
    that is acceptable because metadata is informational and is not queried
    as dates.

    Args:
        value: The object ``json.dumps`` could not serialize on its own.

    Returns:
        ``value.isoformat()`` for ``date``/``datetime`` instances, otherwise
        ``str(value)``.
    """
    # `datetime` is a subclass of `date`, so one check covers both types.
    if not isinstance(value, date):
        # Anything else is stringified as a best-effort fallback.
        return str(value)
    return value.isoformat()


def get_effective_embedding_dimension() -> int:
"""Get the embedding dimension based on configured provider."""
if EMBEDDING_PROVIDER == "openai":
Expand Down Expand Up @@ -286,7 +299,9 @@ def insert_document(self, document: Document) -> int:
document.path,
document.title,
document.content,
json.dumps(document.metadata) if document.metadata else None,
json.dumps(document.metadata, default=_json_default)
if document.metadata
else None,
document.file_mtime,
document.asset_type.value,
),
Expand All @@ -312,7 +327,9 @@ def update_document(self, document: Document) -> None:
document.path,
document.title,
document.content,
json.dumps(document.metadata) if document.metadata else None,
json.dumps(document.metadata, default=_json_default)
if document.metadata
else None,
document.file_mtime,
document.asset_type.value,
document.id,
Expand Down
69 changes: 69 additions & 0 deletions tests/test_database.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
"""Regression tests for librarian.storage.database."""

from datetime import date, datetime
from pathlib import Path

import pytest

from librarian.storage.database import Database
from librarian.types import AssetType, Document


@pytest.fixture
def db(tmp_path: Path) -> Database:
    """Build a new Database for each test, stored in a file under ``tmp_path``."""
    # A per-test temp file keeps tests from sharing any stored state.
    db_file = tmp_path / "test.db"
    return Database(db_path=str(db_file))


class TestMetadataSerialization:
    """Document metadata may hold types produced by YAML frontmatter that the
    stdlib json encoder rejects; storing such metadata must still work."""

    def test_insert_with_date_metadata(self, db: Database) -> None:
        """Inserting metadata that contains a `date` (for example from
        `last_push: 2026-05-19` frontmatter) must succeed. Regression: this
        used to raise `TypeError: Object of type date is not JSON serializable`."""
        note_path = "/note-with-date.md"
        frontmatter = {
            "last_push": date(2026, 5, 19),
            "updated_at": datetime(2026, 5, 19, 12, 30, 45),
            "tags": ["repo", "MOC"],
        }
        note = Document(
            id=None,
            path=note_path,
            title="Note",
            content="body",
            metadata=frontmatter,
            file_mtime=0.0,
            asset_type=AssetType.TEXT,
        )

        new_id = db.insert_document(note)
        assert new_id is not None

        stored = db.get_document_by_path(note_path)
        assert stored is not None
        # Date values come back as ISO strings, not date objects; metadata is
        # informational and is not queried as dates.
        assert stored.metadata["last_push"] == "2026-05-19"
        assert stored.metadata["updated_at"].startswith("2026-05-19T12:30:45")
        # JSON-native values are unaffected by the fallback serializer.
        assert stored.metadata["tags"] == ["repo", "MOC"]

    def test_update_with_date_metadata(self, db: Database) -> None:
        """update_document() is a second serialization call site and must
        accept date metadata as well."""
        note_path = "/n.md"
        # Seed a row whose metadata is plain JSON so the insert cannot fail.
        db.insert_document(
            Document(
                id=None,
                path=note_path,
                title="N",
                content="",
                metadata={"tag": "a"},
                file_mtime=0.0,
                asset_type=AssetType.TEXT,
            )
        )

        existing = db.get_document_by_path(note_path)
        assert existing is not None
        # Replace the metadata with a date value and write it via the update path.
        existing.metadata = {"last_push": date(2026, 5, 19)}
        db.update_document(existing)

        refreshed = db.get_document_by_path(note_path)
        assert refreshed is not None
        assert refreshed.metadata["last_push"] == "2026-05-19"
Loading