Skip to content

Commit fbdf7b0

Browse files
author
SentienceDev
committed
canonicalize diff status
1 parent 2f11d76 commit fbdf7b0

File tree

7 files changed

+296
-148
lines changed

7 files changed

+296
-148
lines changed

sentience/canonicalization.py

Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
"""
2+
Shared canonicalization utilities for snapshot comparison and indexing.
3+
4+
This module provides consistent normalization functions used by both:
5+
- trace_indexing/indexer.py (for computing stable digests)
6+
- snapshot_diff.py (for computing diff_status labels)
7+
8+
By sharing these helpers, we ensure consistent behavior:
9+
- Same text normalization (whitespace, case, length)
10+
- Same bbox rounding (2px precision)
11+
- Same change detection thresholds
12+
"""
13+
14+
from typing import Any
15+
16+
17+
def normalize_text(text: str | None, max_len: int = 80) -> str:
18+
"""
19+
Normalize text for canonical comparison.
20+
21+
Transforms:
22+
- Trims leading/trailing whitespace
23+
- Collapses internal whitespace to single spaces
24+
- Lowercases
25+
- Caps length
26+
27+
Args:
28+
text: Input text (may be None)
29+
max_len: Maximum length to retain (default: 80)
30+
31+
Returns:
32+
Normalized text string (empty string if input is None)
33+
34+
Examples:
35+
>>> normalize_text(" Hello World ")
36+
'hello world'
37+
>>> normalize_text(None)
38+
''
39+
"""
40+
if not text:
41+
return ""
42+
# Trim and collapse whitespace
43+
normalized = " ".join(text.split())
44+
# Lowercase
45+
normalized = normalized.lower()
46+
# Cap length
47+
if len(normalized) > max_len:
48+
normalized = normalized[:max_len]
49+
return normalized
50+
51+
52+
def round_bbox(bbox: dict[str, float], precision: int = 2) -> dict[str, int]:
53+
"""
54+
Round bbox coordinates to reduce noise.
55+
56+
Snaps coordinates to grid of `precision` pixels to ignore
57+
sub-pixel rendering differences.
58+
59+
Args:
60+
bbox: Bounding box with x, y, width, height
61+
precision: Grid size in pixels (default: 2)
62+
63+
Returns:
64+
Rounded bbox with integer coordinates
65+
66+
Examples:
67+
>>> round_bbox({"x": 101, "y": 203, "width": 50, "height": 25})
68+
{'x': 100, 'y': 202, 'width': 50, 'height': 24}
69+
"""
70+
return {
71+
"x": round(bbox.get("x", 0) / precision) * precision,
72+
"y": round(bbox.get("y", 0) / precision) * precision,
73+
"width": round(bbox.get("width", 0) / precision) * precision,
74+
"height": round(bbox.get("height", 0) / precision) * precision,
75+
}
76+
77+
78+
def bbox_equal(bbox1: dict[str, Any], bbox2: dict[str, Any], precision: int = 2) -> bool:
    """
    Tell whether two bboxes coincide once both are rounded.

    Both boxes are passed through round_bbox with the same `precision`
    and then compared on x, y, width and height.

    Args:
        bbox1: First bounding box
        bbox2: Second bounding box
        precision: Grid size for rounding (default: 2)

    Returns:
        True if all four rounded fields match

    Examples:
        >>> bbox_equal({"x": 100, "y": 200, "width": 50, "height": 25},
        ...            {"x": 101, "y": 200, "width": 50, "height": 25})
        True
    """
    snapped_a = round_bbox(bbox1, precision)
    snapped_b = round_bbox(bbox2, precision)
    return all(
        snapped_a[field] == snapped_b[field]
        for field in ("x", "y", "width", "height")
    )
103+
104+
105+
def bbox_changed(bbox1: dict[str, Any], bbox2: dict[str, Any], precision: int = 2) -> bool:
    """
    Tell whether two bboxes differ once both are rounded.

    Logical negation of bbox_equal; exists so diff-detection code can
    ask the question in its natural direction.

    Args:
        bbox1: First bounding box
        bbox2: Second bounding box
        precision: Grid size for rounding (default: 2)

    Returns:
        True if the rounded bboxes are not equal
    """
    unchanged = bbox_equal(bbox1, bbox2, precision)
    return not unchanged
121+
122+
123+
def canonicalize_element(elem: dict[str, Any]) -> dict[str, Any]:
    """
    Create canonical representation of an element for comparison/hashing.

    Extracts and normalizes the fields that matter for identity:
    - id, role, normalized text, rounded bbox
    - is_primary, is_clickable

    The flags are read from `visual_cues` when that dict carries them and
    otherwise from the element's top level, so both nested (raw) and flat
    (already canonical) elements are handled. Likewise, when `text` is
    absent or None and a string `text_norm` is present, that value is
    reused unchanged (it is assumed to be already normalized), so
    re-canonicalizing the output of this function keeps its text and
    flags instead of resetting them.

    Args:
        elem: Raw or already-canonical element dictionary

    Returns:
        Canonical element dictionary with keys id, role, text_norm, bbox,
        is_primary, is_clickable

    Examples:
        >>> canonicalize_element({
        ...     "id": 1,
        ...     "role": "button",
        ...     "text": "  Click Me  ",
        ...     "bbox": {"x": 101, "y": 200, "width": 50, "height": 25},
        ...     "visual_cues": {"is_primary": True, "is_clickable": True}
        ... })
        {'id': 1, 'role': 'button', 'text_norm': 'click me', 'bbox': {'x': 100, 'y': 200, 'width': 50, 'height': 24}, 'is_primary': True, 'is_clickable': True}
    """
    visual_cues = elem.get("visual_cues")
    if not isinstance(visual_cues, dict):
        # Absent, None, or malformed: only top-level flags can apply.
        visual_cues = {}

    def _flag(name: str) -> Any:
        # Nested visual_cues wins; the top-level key is the fallback.
        if name in visual_cues:
            return visual_cues[name]
        return elem.get(name, False)

    raw_text = elem.get("text")
    carried_norm = elem.get("text_norm")
    if raw_text is None and isinstance(carried_norm, str):
        text_norm = carried_norm
    else:
        text_norm = normalize_text(raw_text)

    return {
        "id": elem.get("id"),
        "role": elem.get("role", ""),
        "text_norm": text_norm,
        # `or {}` also covers an explicit `"bbox": None`; round_bbox
        # fills every missing coordinate with 0.
        "bbox": round_bbox(elem.get("bbox") or {}),
        "is_primary": _flag("is_primary"),
        "is_clickable": _flag("is_clickable"),
    }
168+
169+
170+
def content_equal(elem1: dict[str, Any], elem2: dict[str, Any]) -> bool:
    """
    Tell whether two elements carry the same content, ignoring position.

    Each element is first passed through canonicalize_element; the
    results are then compared on role, text_norm, is_primary and
    is_clickable. The id and bbox fields are not compared.

    Args:
        elem1: First element dictionary
        elem2: Second element dictionary

    Returns:
        True if all compared canonical fields match
    """
    left = canonicalize_element(elem1)
    right = canonicalize_element(elem2)

    compared_fields = ("role", "text_norm", "is_primary", "is_clickable")
    return all(left[name] == right[name] for name in compared_fields)
193+
194+
195+
def content_changed(elem1: dict[str, Any], elem2: dict[str, Any]) -> bool:
    """
    Tell whether two elements differ in content, ignoring position.

    Logical negation of content_equal; exists so diff-detection code can
    ask the question in its natural direction.

    Args:
        elem1: First element
        elem2: Second element

    Returns:
        True if content_equal reports the elements as not equal
    """
    same_content = content_equal(elem1, elem2)
    return not same_content

sentience/extension/background.js

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,14 @@ async function handleSnapshotProcessing(rawData, options = {}) {
2828
const startTime = performance.now();
2929
try {
3030
if (!Array.isArray(rawData)) throw new Error("rawData must be an array");
31-
if (rawData.length > 1e4 && (rawData = rawData.slice(0, 1e4)), await initWASM(),
31+
if (rawData.length > 1e4 && (rawData = rawData.slice(0, 1e4)), await initWASM(),
3232
!wasmReady) throw new Error("WASM module not initialized");
3333
let analyzedElements, prunedRawData;
3434
try {
3535
const wasmPromise = new Promise((resolve, reject) => {
3636
try {
3737
let result;
38-
result = options.limit || options.filter ? analyze_page_with_options(rawData, options) : analyze_page(rawData),
38+
result = options.limit || options.filter ? analyze_page_with_options(rawData, options) : analyze_page(rawData),
3939
resolve(result);
4040
} catch (e) {
4141
reject(e);
@@ -101,4 +101,4 @@ initWASM().catch(err => {}), chrome.runtime.onMessage.addListener((request, send
101101
event.preventDefault();
102102
}), self.addEventListener("unhandledrejection", event => {
103103
event.preventDefault();
104-
});
104+
});

sentience/extension/content.js

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@
8282
if (!elements || !Array.isArray(elements)) return;
8383
removeOverlay();
8484
const host = document.createElement("div");
85-
host.id = OVERLAY_HOST_ID, host.style.cssText = "\n position: fixed !important;\n top: 0 !important;\n left: 0 !important;\n width: 100vw !important;\n height: 100vh !important;\n pointer-events: none !important;\n z-index: 2147483647 !important;\n margin: 0 !important;\n padding: 0 !important;\n ",
85+
host.id = OVERLAY_HOST_ID, host.style.cssText = "\n position: fixed !important;\n top: 0 !important;\n left: 0 !important;\n width: 100vw !important;\n height: 100vh !important;\n pointer-events: none !important;\n z-index: 2147483647 !important;\n margin: 0 !important;\n padding: 0 !important;\n ",
8686
document.body.appendChild(host);
8787
const shadow = host.attachShadow({
8888
mode: "closed"
@@ -94,15 +94,15 @@
9494
let color;
9595
color = isTarget ? "#FF0000" : isPrimary ? "#0066FF" : "#00FF00";
9696
const importanceRatio = maxImportance > 0 ? importance / maxImportance : .5, borderOpacity = isTarget ? 1 : isPrimary ? .9 : Math.max(.4, .5 + .5 * importanceRatio), fillOpacity = .2 * borderOpacity, borderWidth = isTarget ? 2 : isPrimary ? 1.5 : Math.max(.5, Math.round(2 * importanceRatio)), hexOpacity = Math.round(255 * fillOpacity).toString(16).padStart(2, "0"), box = document.createElement("div");
97-
if (box.style.cssText = `\n position: absolute;\n left: ${bbox.x}px;\n top: ${bbox.y}px;\n width: ${bbox.width}px;\n height: ${bbox.height}px;\n border: ${borderWidth}px solid ${color};\n background-color: ${color}${hexOpacity};\n box-sizing: border-box;\n opacity: ${borderOpacity};\n pointer-events: none;\n `,
97+
if (box.style.cssText = `\n position: absolute;\n left: ${bbox.x}px;\n top: ${bbox.y}px;\n width: ${bbox.width}px;\n height: ${bbox.height}px;\n border: ${borderWidth}px solid ${color};\n background-color: ${color}${hexOpacity};\n box-sizing: border-box;\n opacity: ${borderOpacity};\n pointer-events: none;\n `,
9898
importance > 0 || isPrimary) {
9999
const badge = document.createElement("span");
100-
badge.textContent = isPrimary ? `⭐${importance}` : `${importance}`, badge.style.cssText = `\n position: absolute;\n top: -18px;\n left: 0;\n background: ${color};\n color: white;\n font-size: 11px;\n font-weight: bold;\n padding: 2px 6px;\n font-family: Arial, sans-serif;\n border-radius: 3px;\n opacity: 0.95;\n white-space: nowrap;\n pointer-events: none;\n `,
100+
badge.textContent = isPrimary ? `⭐${importance}` : `${importance}`, badge.style.cssText = `\n position: absolute;\n top: -18px;\n left: 0;\n background: ${color};\n color: white;\n font-size: 11px;\n font-weight: bold;\n padding: 2px 6px;\n font-family: Arial, sans-serif;\n border-radius: 3px;\n opacity: 0.95;\n white-space: nowrap;\n pointer-events: none;\n `,
101101
box.appendChild(badge);
102102
}
103103
if (isTarget) {
104104
const targetIndicator = document.createElement("span");
105-
targetIndicator.textContent = "🎯", targetIndicator.style.cssText = "\n position: absolute;\n top: -18px;\n right: 0;\n font-size: 16px;\n pointer-events: none;\n ",
105+
targetIndicator.textContent = "🎯", targetIndicator.style.cssText = "\n position: absolute;\n top: -18px;\n right: 0;\n font-size: 16px;\n pointer-events: none;\n ",
106106
box.appendChild(targetIndicator);
107107
}
108108
shadow.appendChild(box);
@@ -120,7 +120,7 @@
120120
let overlayTimeout = null;
121121
// Remove the overlay host element from the page, if one exists, and
// cancel any pending overlay timeout.
function removeOverlay() {
    const host = document.getElementById(OVERLAY_HOST_ID);
    if (host) {
        host.remove();
    }
    if (overlayTimeout) {
        clearTimeout(overlayTimeout);
        // Reset so a later call does not try to clear a stale handle.
        overlayTimeout = null;
    }
}
126-
}();
126+
}();

0 commit comments

Comments
 (0)