Skip to content

Commit f2e5525

Browse files
authored
Merge pull request #77 from SentienceAPI/text_coord
get text coordinates
2 parents 70af2f5 + b1a7d33 commit f2e5525

File tree

11 files changed

+467
-114
lines changed

11 files changed

+467
-114
lines changed

README.md

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -401,6 +401,74 @@ data_url = screenshot(browser, format="jpeg", quality=85)
401401

402402
</details>
403403

404+
<details>
405+
<summary><h3>🔎 Text Search - Find Elements by Visible Text</h3></summary>
406+
407+
**`find_text_rect(browser, text, case_sensitive=False, whole_word=False, max_results=10)`** - Find text on page and get exact pixel coordinates
408+
409+
Find buttons, links, or any UI elements by their visible text without needing element IDs or CSS selectors. Returns exact pixel coordinates for each match.
410+
411+
**Example:**
412+
```python
413+
from sentience import SentienceBrowser, find_text_rect, click_rect
414+
415+
with SentienceBrowser() as browser:
416+
browser.page.goto("https://example.com")
417+
418+
# Find "Sign In" button
419+
result = find_text_rect(browser, "Sign In")
420+
if result.status == "success" and result.results:
421+
first_match = result.results[0]
422+
print(f"Found at: ({first_match.rect.x}, {first_match.rect.y})")
423+
print(f"In viewport: {first_match.in_viewport}")
424+
425+
# Click on the found text
426+
if first_match.in_viewport:
427+
click_rect(browser, {
428+
"x": first_match.rect.x,
429+
"y": first_match.rect.y,
430+
"w": first_match.rect.width,
431+
"h": first_match.rect.height
432+
})
433+
```
434+
435+
**Advanced Options:**
436+
```python
437+
# Case-sensitive search
438+
result = find_text_rect(browser, "LOGIN", case_sensitive=True)
439+
440+
# Whole word only (won't match "log" as part of "login")
441+
result = find_text_rect(browser, "log", whole_word=True)
442+
443+
# Find multiple matches
444+
result = find_text_rect(browser, "Buy", max_results=10)
445+
for match in result.results:
446+
if match.in_viewport:
447+
print(f"Found '{match.text}' at ({match.rect.x}, {match.rect.y})")
448+
print(f"Context: ...{match.context.before}[{match.text}]{match.context.after}...")
449+
```
450+
451+
**Returns:** `TextRectSearchResult` with:
452+
- **`status`**: "success" or "error"
453+
- **`results`**: List of `TextMatch` objects with:
454+
- `text` - The matched text
455+
- `rect` - Absolute coordinates (with scroll offset)
456+
- `viewport_rect` - Viewport-relative coordinates
457+
- `context` - Surrounding text (before/after)
458+
- `in_viewport` - Whether visible in current viewport
459+
460+
**Use Cases:**
461+
- Find buttons/links by visible text without CSS selectors
462+
- Get exact pixel coordinates for click automation
463+
- Verify text visibility and position on page
464+
- Search dynamic content that changes frequently
465+
466+
**Note:** Does not consume API credits (runs locally in browser)
467+
468+
**See example:** `examples/find_text_demo.py`
469+
470+
</details>
471+
404472
---
405473

406474
## 📋 Reference

examples/find_text_demo.py

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
"""
2+
Text Search Demo - Using find_text_rect() to locate elements by visible text
3+
4+
This example demonstrates how to:
5+
1. Find text on a webpage and get exact pixel coordinates
6+
2. Use case-sensitive and whole-word matching options
7+
3. Click on found text using click_rect()
8+
4. Handle multiple matches and filter by viewport visibility
9+
"""
10+
11+
from sentience import SentienceBrowser, click_rect, find_text_rect
12+
13+
14+
def main():
    """Run the find_text_rect() demo against the Google home page.

    Opens a SentienceBrowser, loads https://www.google.com and walks through
    five examples, printing the outcome of each to stdout:

    1. A simple text search, listing up to three matches.
    2. Finding the first in-viewport match and clicking it with click_rect().
    3. Comparing case-insensitive and case-sensitive match counts.
    4. Comparing partial-word and whole-word match counts.
    5. Printing the viewport size reported with the search result.

    Takes no arguments and returns nothing.
    """
    with SentienceBrowser() as browser:
        # Navigate to a search page
        browser.page.goto("https://www.google.com")
        browser.page.wait_for_load_state("networkidle")

        print("\n" + "=" * 60)
        print("Text Search Demo")
        print("=" * 60 + "\n")

        # Example 1: Simple text search
        print("Example 1: Finding 'Google Search' button")
        print("-" * 60)
        result = find_text_rect(browser, "Google Search")

        # Keep "the search errored" apart from "the search ran but found
        # nothing"; only the former has a meaningful result.error to show.
        if result.status != "success":
            print(f"✗ Search failed: {result.error}")
        elif not result.results:
            print("✗ No matches found for 'Google Search'")
        else:
            print(f"✓ Found {result.matches} match(es) for '{result.query}'")
            for i, match in enumerate(result.results[:3]):  # Show first 3
                print(f"\nMatch {i + 1}:")
                print(f" Text: '{match.text}'")
                print(f" Position: ({match.rect.x:.1f}, {match.rect.y:.1f})")
                print(f" Size: {match.rect.width:.1f}x{match.rect.height:.1f} pixels")
                print(f" In viewport: {match.in_viewport}")
                print(f" Context: ...{match.context.before}[{match.text}]{match.context.after}...")

        # Example 2: Find and click search box
        print("\n\nExample 2: Finding and clicking the search box")
        print("-" * 60)
        result = find_text_rect(browser, "Search", max_results=5)

        if result.status == "success" and result.results:
            # Find the first visible match
            for match in result.results:
                if match.in_viewport:
                    print(f"✓ Found visible match: '{match.text}'")
                    print(f" Clicking at ({match.rect.x:.1f}, {match.rect.y:.1f})")

                    # Hand the whole rectangle to click_rect(); the printed
                    # coordinates above are the rectangle's top-left corner.
                    # NOTE(review): presumably click_rect() clicks the centre
                    # of the rectangle - confirm against its implementation.
                    click_result = click_rect(
                        browser,
                        {
                            "x": match.rect.x,
                            "y": match.rect.y,
                            "w": match.rect.width,
                            "h": match.rect.height,
                        },
                    )

                    if click_result.success:
                        print(" ✓ Click successful!")
                    # Only the first visible match is tried, whether or not
                    # the click succeeded.
                    break

        # Example 3: Case-sensitive search
        print("\n\nExample 3: Case-sensitive search for 'GOOGLE'")
        print("-" * 60)
        result_insensitive = find_text_rect(browser, "GOOGLE", case_sensitive=False)
        result_sensitive = find_text_rect(browser, "GOOGLE", case_sensitive=True)

        # `or 0` covers a falsy match count (e.g. None) so a number is printed.
        print(f"Case-insensitive search: {result_insensitive.matches or 0} matches")
        print(f"Case-sensitive search: {result_sensitive.matches or 0} matches")

        # Example 4: Whole word search
        print("\n\nExample 4: Whole word search")
        print("-" * 60)
        result_partial = find_text_rect(browser, "Search", whole_word=False)
        result_whole = find_text_rect(browser, "Search", whole_word=True)

        print(f"Partial word match: {result_partial.matches or 0} matches")
        print(f"Whole word only: {result_whole.matches or 0} matches")

        # Example 5: Get viewport information
        print("\n\nExample 5: Viewport and scroll information")
        print("-" * 60)
        result = find_text_rect(browser, "Google")
        if result.status == "success" and result.viewport:
            print(f"Viewport size: {result.viewport.width}x{result.viewport.height}")
            # Note: scroll position would be available if viewport had scroll_x/scroll_y fields

        print("\n" + "=" * 60)
        print("Demo complete!")
        print("=" * 60 + "\n")
97+
98+
99+
if __name__ == "__main__":
100+
main()

sentience/__init__.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,13 @@
4141
SnapshotFilter,
4242
SnapshotOptions,
4343
StorageState,
44+
TextContext,
45+
TextMatch,
46+
TextRect,
47+
TextRectSearchResult,
4448
TokenStats,
4549
Viewport,
50+
ViewportRect,
4651
WaitResult,
4752
)
4853
from .overlay import clear_overlay, show_overlay
@@ -51,6 +56,7 @@
5156
from .recorder import Recorder, Trace, TraceStep, record
5257
from .screenshot import screenshot
5358
from .snapshot import snapshot
59+
from .text_search import find_text_rect
5460
from .tracer_factory import SENTIENCE_API_URL, create_tracer
5561
from .tracing import JsonlTraceSink, TraceEvent, Tracer, TraceSink
5662

@@ -96,6 +102,13 @@
96102
"screenshot",
97103
"show_overlay",
98104
"clear_overlay",
105+
# Text Search
106+
"find_text_rect",
107+
"TextRectSearchResult",
108+
"TextMatch",
109+
"TextRect",
110+
"ViewportRect",
111+
"TextContext",
99112
# Agent Layer (Phase 1 & 2)
100113
"BaseAgent",
101114
"LLMProvider",

sentience/expect.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
"""
44

55
import time
6-
from typing import Optional, Union
76

87
from .browser import SentienceBrowser
98
from .models import Element

sentience/extension/background.js

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -144,13 +144,13 @@ async function handleScreenshotCapture(_tabId, options = {}) {
144144
async function handleSnapshotProcessing(rawData, options = {}) {
145145
const MAX_ELEMENTS = 10000; // Safety limit to prevent hangs
146146
const startTime = performance.now();
147-
147+
148148
try {
149149
// Safety check: limit element count to prevent hangs
150150
if (!Array.isArray(rawData)) {
151151
throw new Error('rawData must be an array');
152152
}
153-
153+
154154
if (rawData.length > MAX_ELEMENTS) {
155155
console.warn(`[Sentience Background] ⚠️ Large dataset: ${rawData.length} elements. Limiting to ${MAX_ELEMENTS} to prevent hangs.`);
156156
rawData = rawData.slice(0, MAX_ELEMENTS);
@@ -186,7 +186,7 @@ async function handleSnapshotProcessing(rawData, options = {}) {
186186
// Add timeout protection (18 seconds - less than content.js timeout)
187187
analyzedElements = await Promise.race([
188188
wasmPromise,
189-
new Promise((_, reject) =>
189+
new Promise((_, reject) =>
190190
setTimeout(() => reject(new Error('WASM processing timeout (>18s)')), 18000)
191191
)
192192
]);

sentience/extension/content.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ function handleSnapshotRequest(data) {
9292
if (responded) return; // Already responded via timeout
9393
responded = true;
9494
clearTimeout(timeoutId);
95-
95+
9696
const duration = performance.now() - startTime;
9797

9898
// Handle Chrome extension errors (e.g., background script crashed)

0 commit comments

Comments
 (0)