Skip to content

Commit f569ef3

Browse files
committed
updated release pipeline with new functions for read
1 parent 93aeb23 commit f569ef3

File tree

8 files changed

+374
-20
lines changed

8 files changed

+374
-20
lines changed

.github/workflows/sync-extension.yml

Lines changed: 71 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ jobs:
2525
uses: actions/checkout@v4
2626
with:
2727
token: ${{ secrets.GITHUB_TOKEN }}
28+
fetch-depth: 0 # Fetch all history for proper branching
2829

2930
- name: Set up Python
3031
uses: actions/setup-python@v5
@@ -62,34 +63,79 @@ jobs:
6263
mkdir -p extension-temp
6364
cd extension-temp
6465
65-
# Download each file from release
66-
curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" \
66+
# First, try to download the zip archive if available
67+
ZIP_URL=$(curl -s -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" \
6768
"https://api.github.com/repos/$REPO/releases/tags/$TAG" | \
68-
jq -r '.assets[] | select(.name | endswith(".js") or endswith(".wasm") or endswith(".json") or endswith(".d.ts")) | .browser_download_url' | \
69-
while read url; do
70-
filename=$(basename "$url")
71-
curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" "$url" -o "$filename"
72-
done
69+
jq -r '.assets[] | select(.name == "extension-package.zip") | .browser_download_url')
7370
74-
# Alternative: Download from release archive if available
75-
# Or use the extension-package artifact
71+
if [ -n "$ZIP_URL" ] && [ "$ZIP_URL" != "null" ]; then
72+
echo "📦 Downloading extension-package.zip..."
73+
curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" "$ZIP_URL" -o extension-package.zip
74+
unzip -q extension-package.zip -d .
75+
# Files should now be in extension-temp/extension-package/ or extension-temp/
76+
if [ -d "extension-package" ]; then
77+
mv extension-package/* . 2>/dev/null || true
78+
rmdir extension-package 2>/dev/null || true
79+
fi
80+
else
81+
echo "📁 Downloading individual files from release..."
82+
# Download each file from release
83+
curl -s -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" \
84+
"https://api.github.com/repos/$REPO/releases/tags/$TAG" | \
85+
jq -r '.assets[] | select(.name | endswith(".js") or endswith(".wasm") or endswith(".json") or endswith(".d.ts")) | .browser_download_url' | \
86+
while read url; do
87+
if [ -n "$url" ] && [ "$url" != "null" ]; then
88+
filename=$(basename "$url")
89+
echo " Downloading $filename..."
90+
curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" "$url" -o "$filename"
91+
fi
92+
done
93+
fi
94+
95+
# Verify files were downloaded
96+
echo "📋 Downloaded files:"
97+
ls -la
7698
7799
- name: Copy extension files
78100
if: steps.release.outputs.skip != 'true'
79101
run: |
80102
# Create extension directory structure
81103
mkdir -p sentience/extension/pkg
82104
83-
# Copy extension files
84-
cp extension-temp/manifest.json sentience/extension/ 2>/dev/null || echo "manifest.json not found in release"
85-
cp extension-temp/content.js sentience/extension/ 2>/dev/null || echo "content.js not found in release"
86-
cp extension-temp/background.js sentience/extension/ 2>/dev/null || echo "background.js not found in release"
87-
cp extension-temp/injected_api.js sentience/extension/ 2>/dev/null || echo "injected_api.js not found in release"
105+
# Copy extension files (check both root and pkg subdirectory)
106+
cp extension-temp/manifest.json sentience/extension/ 2>/dev/null || echo "⚠️ manifest.json not found in release"
107+
cp extension-temp/content.js sentience/extension/ 2>/dev/null || echo "⚠️ content.js not found in release"
108+
cp extension-temp/background.js sentience/extension/ 2>/dev/null || echo "⚠️ background.js not found in release"
109+
cp extension-temp/injected_api.js sentience/extension/ 2>/dev/null || echo "⚠️ injected_api.js not found in release"
110+
111+
# Copy WASM files (check both root and pkg subdirectory)
112+
if [ -f "extension-temp/pkg/sentience_core.js" ]; then
113+
cp extension-temp/pkg/sentience_core.js sentience/extension/pkg/
114+
elif [ -f "extension-temp/sentience_core.js" ]; then
115+
cp extension-temp/sentience_core.js sentience/extension/pkg/
116+
else
117+
echo "⚠️ sentience_core.js not found"
118+
fi
119+
120+
if [ -f "extension-temp/pkg/sentience_core_bg.wasm" ]; then
121+
cp extension-temp/pkg/sentience_core_bg.wasm sentience/extension/pkg/
122+
elif [ -f "extension-temp/sentience_core_bg.wasm" ]; then
123+
cp extension-temp/sentience_core_bg.wasm sentience/extension/pkg/
124+
else
125+
echo "⚠️ sentience_core_bg.wasm not found"
126+
fi
127+
128+
# Copy TypeScript definitions
129+
if [ -d "extension-temp/pkg" ]; then
130+
cp extension-temp/pkg/*.d.ts sentience/extension/pkg/ 2>/dev/null || echo "⚠️ Type definitions not found"
131+
elif [ -d "extension-temp" ]; then
132+
cp extension-temp/*.d.ts sentience/extension/pkg/ 2>/dev/null || echo "⚠️ Type definitions not found"
133+
fi
88134
89-
# Copy WASM files
90-
cp extension-temp/pkg/sentience_core.js sentience/extension/pkg/ 2>/dev/null || echo "sentience_core.js not found"
91-
cp extension-temp/pkg/sentience_core_bg.wasm sentience/extension/pkg/ 2>/dev/null || echo "sentience_core_bg.wasm not found"
92-
cp extension-temp/pkg/*.d.ts sentience/extension/pkg/ 2>/dev/null || echo "Type definitions not found"
135+
# Verify copied files
136+
echo "📋 Copied files:"
137+
ls -la sentience/extension/
138+
ls -la sentience/extension/pkg/ 2>/dev/null || echo "⚠️ pkg directory not created"
93139
94140
- name: Check for changes
95141
if: steps.release.outputs.skip != 'true'
@@ -110,7 +156,9 @@ jobs:
110156
if: steps.release.outputs.skip != 'true' && steps.changes.outputs.changed == 'true'
111157
uses: peter-evans/create-pull-request@v5
112158
with:
113-
token: ${{ secrets.GITHUB_TOKEN }}
159+
# Use GITHUB_TOKEN (built-in) if repository allows PR creation, otherwise use PR_TOKEN (PAT)
160+
# To use PAT: create secret named PR_TOKEN with a Personal Access Token that has 'repo' scope
161+
token: ${{ secrets.PR_TOKEN || secrets.GITHUB_TOKEN }}
114162
commit-message: "chore: sync extension files from sentience-chrome ${{ steps.release.outputs.tag }}"
115163
title: "Sync Extension: ${{ steps.release.outputs.tag }}"
116164
body: |
@@ -120,7 +168,10 @@ jobs:
120168
- Extension manifest and scripts
121169
- WASM binary and bindings
122170
123-
**Source:** [sentience-chrome release ${{ steps.release.outputs.tag }}](${{ secrets.SENTIENCE_CHROME_REPO }}/releases/tag/${{ steps.release.outputs.tag }})
171+
**Source:** [sentience-chrome release ${{ steps.release.outputs.tag }}](https://github.com/${{ secrets.SENTIENCE_CHROME_REPO }}/releases/tag/${{ steps.release.outputs.tag }})
124172
branch: sync-extension-${{ steps.release.outputs.tag }}
125173
delete-branch: true
174+
labels: |
175+
automated
176+
extension-sync
126177

README.md

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,14 @@ with SentienceBrowser(headless=False) as browser:
4646
- Pydantic models for type safety
4747
- `snapshot.save(filepath)` - Save to JSON
4848

49+
### Content Reading & Screenshots
50+
- `read(browser, format="text|markdown")` - Read page content as text or markdown
51+
- Enhanced markdown conversion using `markdownify` (better than extension's lightweight conversion)
52+
- Supports `enhance_markdown=True` (the default) to use the improved conversion; pass `enhance_markdown=False` to keep the extension's lightweight output
53+
- `screenshot(browser, format="png|jpeg", quality=80)` - Capture standalone screenshot
54+
- Returns base64-encoded data URL
55+
- Supports PNG and JPEG formats with quality control
56+
4957
### Day 4: Query Engine
5058
- `query(snapshot, selector)` - Find elements matching selector
5159
- `find(snapshot, selector)` - Find single best match
@@ -73,6 +81,39 @@ See `examples/` directory:
7381
- `query_demo.py` - Query engine
7482
- `wait_and_click.py` - Wait and actions
7583

84+
### Content Reading Example
85+
86+
```python
87+
from sentience import SentienceBrowser, read
88+
89+
with SentienceBrowser() as browser:
90+
browser.page.goto("https://example.com")
91+
browser.page.wait_for_load_state("networkidle")
92+
93+
# Read as enhanced markdown (better quality)
94+
result = read(browser, format="markdown", enhance_markdown=True)
95+
print(result["content"]) # High-quality markdown
96+
```
97+
98+
### Screenshot Example
99+
100+
```python
101+
from sentience import SentienceBrowser, screenshot
102+
import base64
103+
104+
with SentienceBrowser() as browser:
105+
browser.page.goto("https://example.com")
106+
browser.page.wait_for_load_state("networkidle")
107+
108+
# Capture PNG screenshot
109+
data_url = screenshot(browser, format="png")
110+
111+
# Save to file
112+
image_data = base64.b64decode(data_url.split(",")[1])
113+
with open("screenshot.png", "wb") as f:
114+
f.write(image_data)
115+
```
116+
76117
## Testing
77118

78119
```bash

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ dependencies = [
2626
"jsonschema>=4.0.0",
2727
"requests>=2.31.0", # For server-side API calls
2828
"playwright-stealth>=1.0.6", # Bot evasion and stealth mode
29+
"markdownify>=0.11.6", # Enhanced HTML to Markdown conversion
2930
]
3031

3132
[project.urls]

sentience/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
from .inspector import Inspector, inspect
1313
from .recorder import Recorder, Trace, TraceStep, record
1414
from .generator import ScriptGenerator, generate
15+
from .read import read
16+
from .screenshot import screenshot
1517

1618
__version__ = "0.1.0"
1719

@@ -39,5 +41,7 @@
3941
"record",
4042
"ScriptGenerator",
4143
"generate",
44+
"read",
45+
"screenshot",
4246
]
4347

sentience/read.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
"""
2+
Read page content - enhanced markdown conversion
3+
"""
4+
5+
from typing import Optional, Literal
6+
from .browser import SentienceBrowser
7+
8+
9+
def read(
    browser: SentienceBrowser,
    format: Literal["text", "markdown"] = "text",
    enhance_markdown: bool = True,
) -> dict:
    """
    Read page content as text or markdown

    The extension's ``window.sentience.read`` is always called first. When
    ``format="markdown"`` and ``enhance_markdown`` is true, the ``content`` and
    ``length`` fields of a successful result are replaced with a markdownify
    conversion of the page HTML. If markdownify is missing or the conversion
    fails, the extension's result is returned unchanged.

    Args:
        browser: SentienceBrowser instance
        format: Output format - "text" or "markdown"
        enhance_markdown: If True and format="markdown", use markdownify for better conversion

    Returns:
        dict as produced by the extension, expected to contain:
        - status: "success" or "error"
        - url: Current page URL
        - format: "text" or "markdown"
        - content: Page content as string
        - length: Content length in characters
        - error: Error message if status is "error"

    Raises:
        RuntimeError: If the browser has not been started (``browser.page`` is falsy)
    """
    if not browser.page:
        raise RuntimeError("Browser not started. Call browser.start() first.")

    # Get basic content from extension
    result = browser.page.evaluate(
        """
        (options) => {
            return window.sentience.read(options);
        }
        """,
        {"format": format},
    )

    # Enhance markdown if requested and format is markdown
    if format == "markdown" and enhance_markdown and result.get("status") == "success":
        import logging

        logger = logging.getLogger(__name__)
        try:
            # Import first so a missing dependency skips the extra page round-trip
            from markdownify import markdownify as md

            # markdownify's `strip` option only unwraps a tag and keeps its inner
            # text, so script/style/nav text would leak into the output. Remove the
            # unwanted elements from a *clone* of the DOM instead, leaving the live
            # page untouched.
            html_content = browser.page.evaluate(
                """
                () => {
                    const root = document.documentElement.cloneNode(true);
                    root.querySelectorAll('script, style, nav, footer, header, noscript')
                        .forEach((el) => el.remove());
                    return root.outerHTML;
                }
                """
            )

            enhanced_markdown = md(
                html_content,
                heading_style="ATX",  # Use # for headings
                bullets="-",  # Use - for lists
            )
            result["content"] = enhanced_markdown
            result["length"] = len(enhanced_markdown)
        except ImportError:
            # Fallback to extension's lightweight conversion if markdownify not installed
            logger.debug("markdownify is not installed; using the extension's markdown output")
        except Exception:
            # Enhancement is best-effort: keep the extension's result, but leave a
            # trace so the failure is diagnosable instead of silently ignored.
            logger.warning(
                "Markdown enhancement failed; using the extension's markdown output",
                exc_info=True,
            )

    return result
70+

sentience/screenshot.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
"""
2+
Screenshot functionality - standalone screenshot capture
3+
"""
4+
5+
from typing import Optional, Literal, Dict, Any
6+
from .browser import SentienceBrowser
7+
8+
9+
def screenshot(
    browser: SentienceBrowser,
    format: Literal["png", "jpeg"] = "png",
    quality: Optional[int] = None,
) -> str:
    """
    Take a standalone screenshot of the page currently open in the browser.

    Args:
        browser: SentienceBrowser instance
        format: Image format - "png" or "jpeg"
        quality: JPEG quality (1-100); ignored unless format is "jpeg"

    Returns:
        The image as a base64 data URL (e.g., "data:image/png;base64,...")

    Raises:
        RuntimeError: If browser not started
        ValueError: If a JPEG quality outside 1-100 is given
    """
    import base64

    page = browser.page
    if not page:
        raise RuntimeError("Browser not started. Call browser.start() first.")

    # Quality is only meaningful (and only validated/forwarded) for JPEG output
    apply_quality = format == "jpeg" and quality is not None
    if apply_quality and not (1 <= quality <= 100):
        raise ValueError("Quality must be between 1 and 100 for JPEG format")

    capture_kwargs: Dict[str, Any] = {"type": format}
    if apply_quality:
        capture_kwargs["quality"] = quality

    # page.screenshot() yields raw bytes here; base64-encode them ourselves
    raw_image = page.screenshot(**capture_kwargs)
    encoded = base64.b64encode(raw_image).decode('utf-8')

    # Anything that is not PNG is labelled as JPEG in the data URL
    if format == "png":
        mime_type = "image/png"
    else:
        mime_type = "image/jpeg"
    return f"data:{mime_type};base64,{encoded}"
53+

tests/test_read.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
"""
2+
Tests for read functionality
3+
"""
4+
5+
from sentience import SentienceBrowser, read
6+
7+
8+
def test_read_text():
    """Test reading page as text"""
    with SentienceBrowser(headless=True) as browser:
        browser.page.goto("https://example.com")
        browser.page.wait_for_load_state("networkidle")

        result = read(browser, format="text")

        assert result["status"] == "success"
        assert result["format"] == "text"
        assert "content" in result
        assert "length" in result
        assert len(result["content"]) > 0
        # Browsers serialize the origin URL with a trailing slash
        # ("https://example.com/"), so compare without it.
        assert result["url"].rstrip("/") == "https://example.com"
22+
23+
24+
def test_read_markdown():
    """Test reading page as markdown"""
    with SentienceBrowser(headless=True) as browser:
        browser.page.goto("https://example.com")
        browser.page.wait_for_load_state("networkidle")

        result = read(browser, format="markdown")

        assert result["status"] == "success"
        assert result["format"] == "markdown"
        assert "content" in result
        assert "length" in result
        assert len(result["content"]) > 0
        # Browsers serialize the origin URL with a trailing slash
        # ("https://example.com/"), so compare without it.
        assert result["url"].rstrip("/") == "https://example.com"
38+
39+
40+
def test_read_markdown_enhanced():
    """Markdown reads succeed both with and without markdownify enhancement"""
    with SentienceBrowser(headless=True) as browser:
        browser.page.goto("https://example.com")
        browser.page.wait_for_load_state("networkidle")

        # Enhanced first (the default mode), then the extension's basic output.
        # The two outputs are not compared: they may be similar for simple pages.
        for use_enhancement in (True, False):
            outcome = read(browser, format="markdown", enhance_markdown=use_enhancement)

            assert outcome["status"] == "success"
            assert outcome["format"] == "markdown"
            assert len(outcome["content"]) > 0
            assert isinstance(outcome["content"], str)
64+

0 commit comments

Comments
 (0)