Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ async def finalize_entities(
) -> list[dict[str, Any]]:
"""Read entity rows, enrich with degree, and write back.

Streams through the entities table, deduplicates by title,
Streams through the entities table, deduplicates by title and type,
assigns degree from the pre-computed degree map, and writes
each finalized row back to the same table (safe when using
truncate=True, which reads from the original and writes to
Expand All @@ -36,14 +36,16 @@ async def finalize_entities(
Sample of up to 5 entity rows for logging.
"""
sample_rows: list[dict[str, Any]] = []
seen_titles: set[str] = set()
seen_entities: set[tuple[str, str]] = set()
human_readable_id = 0

async for row in entities_table:
title = row.get("title")
if not title or title in seen_titles:
entity_type = row.get("type", "")
entity_key = (title, entity_type)
if not title or entity_key in seen_entities:
continue
seen_titles.add(title)
seen_entities.add(entity_key)
row["degree"] = degree_map.get(title, 0)
row["human_readable_id"] = human_readable_id
row["id"] = str(uuid4())
Expand Down
26 changes: 21 additions & 5 deletions tests/unit/indexing/test_finalize_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,12 +191,12 @@ async def test_missing_degree_defaults_to_zero(self):
assert len(table.written) == 1
assert table.written[0]["degree"] == 0

async def test_deduplicates_by_title(self):
"""Duplicate titles should be skipped."""
async def test_deduplicates_by_title_and_type(self):
"""Duplicate title and type combinations should be skipped."""
table = FakeTable([
_make_entity_row("A"),
_make_entity_row("A"),
_make_entity_row("B"),
_make_entity_row("A", "PERSON"),
_make_entity_row("A", "PERSON"),
_make_entity_row("B", "ORGANIZATION"),
])
degree_map = {"A": 1, "B": 2}
await finalize_entities(table, degree_map)
Expand All @@ -205,6 +205,22 @@ async def test_deduplicates_by_title(self):
titles = [r["title"] for r in table.written]
assert titles == ["A", "B"]

async def test_preserves_entities_with_same_title_different_type(self):
    """Rows that share a title but differ in type must all be written.

    Two "PARENT AREA" rows with distinct types plus one unrelated row are
    fed through finalize_entities; every row is expected to appear in the
    written output, in input order.
    """
    rows = [
        _make_entity_row("PARENT AREA", "PERSON"),
        _make_entity_row("PARENT AREA", "ORGANIZATION"),
        _make_entity_row("B", "ENTITY"),
    ]
    table = FakeTable(rows)
    await finalize_entities(table, {"PARENT AREA": 1, "B": 2})

    written = table.written
    # Nothing was dropped: the key is the (title, type) pair, not the title.
    assert len(written) == 3
    assert [r["title"] for r in written] == ["PARENT AREA", "PARENT AREA", "B"]
    assert [r["type"] for r in written] == ["PERSON", "ORGANIZATION", "ENTITY"]

async def test_skips_empty_title(self):
"""Rows with empty or missing title should be skipped."""
table = FakeTable([
Expand Down