Skip to content

Commit 220e47b

Browse files
authored
Merge pull request #8 from SentienceAPI/markdown
handle markdown read
2 parents fba9842 + 34fe408 commit 220e47b

File tree

4 files changed

+189
-76
lines changed

4 files changed

+189
-76
lines changed

.github/workflows/sync-extension.yml

Lines changed: 58 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -63,36 +63,25 @@ jobs:
6363
mkdir -p extension-temp
6464
cd extension-temp
6565
66-
# First, try to download the zip archive if available
67-
ZIP_URL=$(curl -s -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" \
66+
# Download individual files from release (reliable method - no zip)
67+
echo "📁 Downloading individual files from release..."
68+
curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" \
6869
"https://api.github.com/repos/$REPO/releases/tags/$TAG" | \
69-
jq -r '.assets[] | select(.name == "extension-package.zip") | .browser_download_url')
70-
71-
if [ -n "$ZIP_URL" ] && [ "$ZIP_URL" != "null" ]; then
72-
echo "📦 Downloading extension-package.zip..."
73-
curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" "$ZIP_URL" -o extension-package.zip
74-
unzip -q extension-package.zip -d .
75-
# Files should now be in extension-temp/extension-package/ or extension-temp/
76-
if [ -d "extension-package" ]; then
77-
mv extension-package/* . 2>/dev/null || true
78-
rmdir extension-package 2>/dev/null || true
79-
fi
80-
else
81-
echo "📁 Downloading individual files from release..."
82-
# Download each file from release
83-
curl -s -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" \
84-
"https://api.github.com/repos/$REPO/releases/tags/$TAG" | \
85-
jq -r '.assets[] | select(.name | endswith(".js") or endswith(".wasm") or endswith(".json") or endswith(".d.ts")) | .browser_download_url' | \
86-
while read url; do
87-
if [ -n "$url" ] && [ "$url" != "null" ]; then
88-
filename=$(basename "$url")
89-
echo " Downloading $filename..."
90-
curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" "$url" -o "$filename"
70+
jq -r '.assets[] | select(.name | endswith(".js") or endswith(".wasm") or endswith(".json") or endswith(".d.ts")) | "\(.browser_download_url)|\(.name)"' | \
71+
while IFS='|' read -r url name; do
72+
if [ -n "$url" ] && [ "$url" != "null" ] && [ -n "$name" ]; then
73+
# Preserve directory structure from asset name
74+
# If name contains '/', create directories
75+
dir=$(dirname "$name")
76+
if [ "$dir" != "." ]; then
77+
mkdir -p "$dir"
9178
fi
92-
done
93-
fi
79+
echo " Downloading $name..."
80+
curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" "$url" -o "$name"
81+
fi
82+
done
9483
95-
# Verify files were downloaded
84+
# Verify downloaded files
9685
echo "📋 Downloaded files:"
9786
ls -la
9887
@@ -102,34 +91,62 @@ jobs:
10291
# Create extension directory structure
10392
mkdir -p sentience/extension/pkg
10493
105-
# Copy extension files (check both root and pkg subdirectory)
106-
cp extension-temp/manifest.json sentience/extension/ 2>/dev/null || echo "⚠️ manifest.json not found in release"
107-
cp extension-temp/content.js sentience/extension/ 2>/dev/null || echo "⚠️ content.js not found in release"
108-
cp extension-temp/background.js sentience/extension/ 2>/dev/null || echo "⚠️ background.js not found in release"
109-
cp extension-temp/injected_api.js sentience/extension/ 2>/dev/null || echo "⚠️ injected_api.js not found in release"
94+
# Copy extension files (handle both root and extension-package/ subdirectory)
95+
# Check root first, then extension-package/ subdirectory
96+
if [ -f "extension-temp/manifest.json" ]; then
97+
cp extension-temp/manifest.json sentience/extension/
98+
elif [ -f "extension-temp/extension-package/manifest.json" ]; then
99+
cp extension-temp/extension-package/manifest.json sentience/extension/
100+
else
101+
echo "⚠️ manifest.json not found"
102+
fi
103+
104+
if [ -f "extension-temp/content.js" ]; then
105+
cp extension-temp/content.js sentience/extension/
106+
elif [ -f "extension-temp/extension-package/content.js" ]; then
107+
cp extension-temp/extension-package/content.js sentience/extension/
108+
else
109+
echo "⚠️ content.js not found"
110+
fi
111+
112+
if [ -f "extension-temp/background.js" ]; then
113+
cp extension-temp/background.js sentience/extension/
114+
elif [ -f "extension-temp/extension-package/background.js" ]; then
115+
cp extension-temp/extension-package/background.js sentience/extension/
116+
else
117+
echo "⚠️ background.js not found"
118+
fi
119+
120+
if [ -f "extension-temp/injected_api.js" ]; then
121+
cp extension-temp/injected_api.js sentience/extension/
122+
elif [ -f "extension-temp/extension-package/injected_api.js" ]; then
123+
cp extension-temp/extension-package/injected_api.js sentience/extension/
124+
else
125+
echo "⚠️ injected_api.js not found"
126+
fi
110127
111-
# Copy WASM files (check both root and pkg subdirectory)
128+
# Copy WASM files (check both locations)
112129
if [ -f "extension-temp/pkg/sentience_core.js" ]; then
113130
cp extension-temp/pkg/sentience_core.js sentience/extension/pkg/
114-
elif [ -f "extension-temp/sentience_core.js" ]; then
115-
cp extension-temp/sentience_core.js sentience/extension/pkg/
131+
elif [ -f "extension-temp/extension-package/pkg/sentience_core.js" ]; then
132+
cp extension-temp/extension-package/pkg/sentience_core.js sentience/extension/pkg/
116133
else
117134
echo "⚠️ sentience_core.js not found"
118135
fi
119136
120137
if [ -f "extension-temp/pkg/sentience_core_bg.wasm" ]; then
121138
cp extension-temp/pkg/sentience_core_bg.wasm sentience/extension/pkg/
122-
elif [ -f "extension-temp/sentience_core_bg.wasm" ]; then
123-
cp extension-temp/sentience_core_bg.wasm sentience/extension/pkg/
139+
elif [ -f "extension-temp/extension-package/pkg/sentience_core_bg.wasm" ]; then
140+
cp extension-temp/extension-package/pkg/sentience_core_bg.wasm sentience/extension/pkg/
124141
else
125142
echo "⚠️ sentience_core_bg.wasm not found"
126143
fi
127144
128145
# Copy TypeScript definitions
129146
if [ -d "extension-temp/pkg" ]; then
130147
cp extension-temp/pkg/*.d.ts sentience/extension/pkg/ 2>/dev/null || echo "⚠️ Type definitions not found"
131-
elif [ -d "extension-temp" ]; then
132-
cp extension-temp/*.d.ts sentience/extension/pkg/ 2>/dev/null || echo "⚠️ Type definitions not found"
148+
elif [ -d "extension-temp/extension-package/pkg" ]; then
149+
cp extension-temp/extension-package/pkg/*.d.ts sentience/extension/pkg/ 2>/dev/null || echo "⚠️ Type definitions not found"
133150
fi
134151
135152
# Verify copied files
@@ -156,9 +173,9 @@ jobs:
156173
if: steps.release.outputs.skip != 'true' && steps.changes.outputs.changed == 'true'
157174
uses: peter-evans/create-pull-request@v5
158175
with:
159-
# Use GITHUB_TOKEN (built-in) if repository allows PR creation, otherwise use PR_TOKEN (PAT)
176+
# Use PR_TOKEN (a Personal Access Token) to create the PR; there is no fallback to GITHUB_TOKEN, so this secret must be set
160177
# To use PAT: create secret named PR_TOKEN with a Personal Access Token that has 'repo' scope
161-
token: ${{ secrets.PR_TOKEN || secrets.GITHUB_TOKEN }}
178+
token: ${{ secrets.PR_TOKEN }}
162179
commit-message: "chore: sync extension files from sentience-chrome ${{ steps.release.outputs.tag }}"
163180
title: "Sync Extension: ${{ steps.release.outputs.tag }}"
164181
body: |

README.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,13 +73,36 @@ with SentienceBrowser(headless=False) as browser:
7373
- `.to_have_text(text)`
7474
- `.to_have_count(n)`
7575

76+
### Content Reading
77+
- `read(browser, output_format="raw|text|markdown")` - Read page content
78+
- **Default format: `"raw"`** - Returns HTML suitable for Turndown/markdownify
79+
- `output_format="raw"` - Get cleaned HTML
80+
- `output_format="markdown"` - Get high-quality markdown (uses markdownify internally)
81+
- `output_format="text"` - Get plain text
82+
83+
**Examples:**
84+
```python
85+
from sentience import read
86+
87+
# Get raw HTML (default)
88+
result = read(browser)
89+
html = result["content"]
90+
91+
# Get high-quality markdown (uses markdownify automatically)
92+
result = read(browser, output_format="markdown")
93+
markdown = result["content"]
94+
```
95+
96+
See `examples/read_markdown.py` for complete examples.
97+
7698
## Examples
7799

78100
See `examples/` directory:
79101
- `hello.py` - Extension bridge verification
80102
- `basic_agent.py` - Basic snapshot
81103
- `query_demo.py` - Query engine
82104
- `wait_and_click.py` - Wait and actions
105+
- `read_markdown.py` - Reading page content and converting to markdown
83106

84107
### Content Reading Example
85108

examples/read_markdown.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
"""
2+
Example: Reading page content and converting to markdown
3+
4+
This example shows how to use the read() function to get page content
5+
and convert it to high-quality markdown using markdownify.
6+
"""
7+
8+
from sentience import SentienceBrowser, read
9+
from markdownify import markdownify
10+
11+
12+
def main():
    """Demonstrate the three output formats of ``read()`` on example.com.

    Opens a headless ``SentienceBrowser``, loads the page, then prints the
    length and the first 500 characters of the content obtained via:

    1. raw HTML converted locally with ``markdownify``,
    2. ``output_format="markdown"``,
    3. ``output_format="text"``.
    """
    # Initialize browser
    with SentienceBrowser(headless=True) as browser:
        # Navigate to a page
        browser.page.goto("https://example.com")
        browser.page.wait_for_load_state("networkidle")

        # Method 1: Get raw HTML (default) and convert with markdownify
        print("=== Method 1: Raw HTML + markdownify (Recommended) ===")
        result = read(browser)  # output_format="raw" is the default
        html_content = result["content"]

        # Convert to markdown using markdownify (better quality)
        markdown = markdownify(
            html_content,
            heading_style="ATX",  # Use # for headings
            bullets="-",  # Use - for lists
            strip=['script', 'style', 'nav', 'footer', 'header'],  # Strip unwanted tags
        )
        print(f"Markdown length: {len(markdown)} characters")
        print(markdown[:500])  # Print first 500 chars
        print("\n")

        # Method 2: Get high-quality markdown directly (uses markdownify internally)
        # NOTE: the keyword is ``output_format``; read() has no ``format`` parameter.
        print("=== Method 2: Direct markdown (High-quality via markdownify) ===")
        result = read(browser, output_format="markdown")
        high_quality_markdown = result["content"]
        print(f"Markdown length: {len(high_quality_markdown)} characters")
        print(high_quality_markdown[:500])  # Print first 500 chars
        print("\n")

        # Method 3: Get plain text
        print("=== Method 3: Plain text ===")
        result = read(browser, output_format="text")
        text_content = result["content"]
        print(f"Text length: {len(text_content)} characters")
        print(text_content[:500])  # Print first 500 chars


if __name__ == "__main__":
    main()
53+

sentience/read.py

Lines changed: 55 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,70 +1,90 @@
11
"""
2-
Read page content - enhanced markdown conversion
2+
Read page content - supports raw HTML, text, and markdown formats
33
"""
44

5-
from typing import Optional, Literal
5+
from typing import Literal
66
from .browser import SentienceBrowser
77

88

99
def read(
    browser: SentienceBrowser,
    output_format: Literal["raw", "text", "markdown"] = "raw",
    enhance_markdown: bool = True,
) -> dict:
    """
    Read page content as raw HTML, text, or markdown

    Args:
        browser: SentienceBrowser instance
        output_format: Output format - "raw" (default, returns HTML for external processing),
            "text" (plain text), or "markdown" (lightweight or enhanced markdown).
        enhance_markdown: If True and output_format is "markdown", uses markdownify for better conversion.
            If False, uses the extension's lightweight markdown converter.

    Returns:
        dict with:
        - status: "success" or "error"
        - url: Current page URL
        - format: "raw", "text", or "markdown"
        - content: Page content as string
        - length: Content length in characters
        - error: Error message if status is "error"

    Raises:
        RuntimeError: If the browser has no active page.

    Examples:
        # Get raw HTML (default) - can be used with markdownify for better conversion
        result = read(browser)
        html_content = result["content"]

        # Get high-quality markdown (uses markdownify internally)
        result = read(browser, output_format="markdown")
        markdown = result["content"]

        # Get plain text
        result = read(browser, output_format="text")
        text = result["content"]
    """
    if not browser.page:
        raise RuntimeError("Browser not started. Call browser.start() first.")

    if output_format == "markdown" and enhance_markdown:
        # Get raw HTML from the extension first, then convert it locally.
        raw_html_result = _extension_read(browser, "raw")

        if raw_html_result.get("status") == "success":
            try:
                # Import only ``markdownify``: the package exports no
                # ``MarkdownifyError``, so importing that name would always
                # raise ImportError and permanently disable this path.
                from markdownify import markdownify
            except ImportError:
                print("Warning: 'markdownify' not installed. Install with 'pip install markdownify' for enhanced markdown. Falling back to extension's markdown.")
            else:
                try:
                    markdown_content = markdownify(
                        raw_html_result["content"], heading_style="ATX", wrap=True
                    )
                    return {
                        "status": "success",
                        "url": raw_html_result["url"],
                        "format": "markdown",
                        "content": markdown_content,
                        "length": len(markdown_content),
                    }
                except Exception as e:
                    # Deliberately broad: enhancement is best-effort, and any
                    # failure falls through to the extension's own converter.
                    print(f"Warning: markdownify failed ({e}), falling back to extension's markdown.")

    # If not enhanced markdown, or fallback, call extension with requested format
    return _extension_read(browser, output_format)


def _extension_read(browser, fmt):
    """Call ``window.sentience.read`` in the page with ``{"format": fmt}`` and return its result."""
    return browser.page.evaluate(
        """
        (options) => {
            return window.sentience.read(options);
        }
        """,
        {"format": fmt},
    )
70-

0 commit comments

Comments
 (0)