Merge pull request #8 from SentienceAPI/markdown

rcholic · web-flow · commit 220e47b47ffa · 2025-12-21T18:33:33.000-08:00
handle markdown read
diff --git a/.github/workflows/sync-extension.yml b/.github/workflows/sync-extension.yml
@@ -63,36 +63,25 @@ jobs:
         mkdir -p extension-temp
         cd extension-temp
         
-        # First, try to download the zip archive if available
-        ZIP_URL=$(curl -s -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" \
+        # Download individual files from release (reliable method - no zip)
+        echo "📁 Downloading individual files from release..."
+        curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" \
           "https://api.github.com/repos/$REPO/releases/tags/$TAG" | \
-          jq -r '.assets[] | select(.name == "extension-package.zip") | .browser_download_url')
-        
-        if [ -n "$ZIP_URL" ] && [ "$ZIP_URL" != "null" ]; then
-          echo "📦 Downloading extension-package.zip..."
-          curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" "$ZIP_URL" -o extension-package.zip
-          unzip -q extension-package.zip -d .
-          # Files should now be in extension-temp/extension-package/ or extension-temp/
-          if [ -d "extension-package" ]; then
-            mv extension-package/* . 2>/dev/null || true
-            rmdir extension-package 2>/dev/null || true
-          fi
-        else
-          echo "📁 Downloading individual files from release..."
-          # Download each file from release
-          curl -s -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" \
-            "https://api.github.com/repos/$REPO/releases/tags/$TAG" | \
-            jq -r '.assets[] | select(.name | endswith(".js") or endswith(".wasm") or endswith(".json") or endswith(".d.ts")) | .browser_download_url' | \
-            while read url; do
-              if [ -n "$url" ] && [ "$url" != "null" ]; then
-                filename=$(basename "$url")
-                echo "  Downloading $filename..."
-                curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" "$url" -o "$filename"
+          jq -r '.assets[] | select(.name | endswith(".js") or endswith(".wasm") or endswith(".json") or endswith(".d.ts")) | "\(.browser_download_url)|\(.name)"' | \
+          while IFS='|' read -r url name; do
+            if [ -n "$url" ] && [ "$url" != "null" ] && [ -n "$name" ]; then
+              # Preserve directory structure from asset name
+              # If name contains '/', create directories
+              dir=$(dirname "$name")
+              if [ "$dir" != "." ]; then
+                mkdir -p "$dir"
               fi
-            done
-        fi
+              echo "  Downloading $name..."
+              curl -L -H "Authorization: token ${{ secrets.SENTIENCE_CHROME_TOKEN }}" "$url" -o "$name"
+            fi
+          done
         
-        # Verify files were downloaded
+        # Verify downloaded files
         echo "📋 Downloaded files:"
         ls -la
         
@@ -102,34 +91,62 @@ jobs:
         # Create extension directory structure
         mkdir -p sentience/extension/pkg
         
-        # Copy extension files (check both root and pkg subdirectory)
-        cp extension-temp/manifest.json sentience/extension/ 2>/dev/null || echo "⚠️ manifest.json not found in release"
-        cp extension-temp/content.js sentience/extension/ 2>/dev/null || echo "⚠️ content.js not found in release"
-        cp extension-temp/background.js sentience/extension/ 2>/dev/null || echo "⚠️ background.js not found in release"
-        cp extension-temp/injected_api.js sentience/extension/ 2>/dev/null || echo "⚠️ injected_api.js not found in release"
+        # Copy extension files (handle both root and extension-package/ subdirectory)
+        # Check root first, then extension-package/ subdirectory
+        if [ -f "extension-temp/manifest.json" ]; then
+          cp extension-temp/manifest.json sentience/extension/
+        elif [ -f "extension-temp/extension-package/manifest.json" ]; then
+          cp extension-temp/extension-package/manifest.json sentience/extension/
+        else
+          echo "⚠️ manifest.json not found"
+        fi
+        
+        if [ -f "extension-temp/content.js" ]; then
+          cp extension-temp/content.js sentience/extension/
+        elif [ -f "extension-temp/extension-package/content.js" ]; then
+          cp extension-temp/extension-package/content.js sentience/extension/
+        else
+          echo "⚠️ content.js not found"
+        fi
+        
+        if [ -f "extension-temp/background.js" ]; then
+          cp extension-temp/background.js sentience/extension/
+        elif [ -f "extension-temp/extension-package/background.js" ]; then
+          cp extension-temp/extension-package/background.js sentience/extension/
+        else
+          echo "⚠️ background.js not found"
+        fi
+        
+        if [ -f "extension-temp/injected_api.js" ]; then
+          cp extension-temp/injected_api.js sentience/extension/
+        elif [ -f "extension-temp/extension-package/injected_api.js" ]; then
+          cp extension-temp/extension-package/injected_api.js sentience/extension/
+        else
+          echo "⚠️ injected_api.js not found"
+        fi
         
-        # Copy WASM files (check both root and pkg subdirectory)
+        # Copy WASM files (check both locations)
         if [ -f "extension-temp/pkg/sentience_core.js" ]; then
           cp extension-temp/pkg/sentience_core.js sentience/extension/pkg/
-        elif [ -f "extension-temp/sentience_core.js" ]; then
-          cp extension-temp/sentience_core.js sentience/extension/pkg/
+        elif [ -f "extension-temp/extension-package/pkg/sentience_core.js" ]; then
+          cp extension-temp/extension-package/pkg/sentience_core.js sentience/extension/pkg/
         else
           echo "⚠️ sentience_core.js not found"
         fi
         
         if [ -f "extension-temp/pkg/sentience_core_bg.wasm" ]; then
           cp extension-temp/pkg/sentience_core_bg.wasm sentience/extension/pkg/
-        elif [ -f "extension-temp/sentience_core_bg.wasm" ]; then
-          cp extension-temp/sentience_core_bg.wasm sentience/extension/pkg/
+        elif [ -f "extension-temp/extension-package/pkg/sentience_core_bg.wasm" ]; then
+          cp extension-temp/extension-package/pkg/sentience_core_bg.wasm sentience/extension/pkg/
         else
           echo "⚠️ sentience_core_bg.wasm not found"
         fi
         
         # Copy TypeScript definitions
         if [ -d "extension-temp/pkg" ]; then
           cp extension-temp/pkg/*.d.ts sentience/extension/pkg/ 2>/dev/null || echo "⚠️ Type definitions not found"
-        elif [ -d "extension-temp" ]; then
-          cp extension-temp/*.d.ts sentience/extension/pkg/ 2>/dev/null || echo "⚠️ Type definitions not found"
+        elif [ -d "extension-temp/extension-package/pkg" ]; then
+          cp extension-temp/extension-package/pkg/*.d.ts sentience/extension/pkg/ 2>/dev/null || echo "⚠️ Type definitions not found"
         fi
         
         # Verify copied files
@@ -156,9 +173,9 @@ jobs:
       if: steps.release.outputs.skip != 'true' && steps.changes.outputs.changed == 'true'
       uses: peter-evans/create-pull-request@v5
       with:
-        # Use GITHUB_TOKEN (built-in) if repository allows PR creation, otherwise use PR_TOKEN (PAT)
+        # Use PR_TOKEN (a PAT, needed for repos with org restrictions); there is no fallback to GITHUB_TOKEN
         # To use PAT: create secret named PR_TOKEN with a Personal Access Token that has 'repo' scope
-        token: ${{ secrets.PR_TOKEN || secrets.GITHUB_TOKEN }}
+        token: ${{ secrets.PR_TOKEN }}
         commit-message: "chore: sync extension files from sentience-chrome ${{ steps.release.outputs.tag }}"
         title: "Sync Extension: ${{ steps.release.outputs.tag }}"
         body: |
diff --git a/README.md b/README.md
@@ -73,13 +73,36 @@ with SentienceBrowser(headless=False) as browser:
   - `.to_have_text(text)`
   - `.to_have_count(n)`
 
+### Content Reading
+- `read(browser, output_format="raw|text|markdown")` - Read page content
+  - **Default format: `"raw"`** - Returns HTML suitable for Turndown/markdownify
+  - `output_format="raw"` - Get cleaned HTML
+  - `output_format="markdown"` - Get high-quality markdown (uses markdownify internally)
+  - `output_format="text"` - Get plain text
+  
+  **Examples:**
+  ```python
+  from sentience import read
+  
+  # Get raw HTML (default)
+  result = read(browser)
+  html = result["content"]
+  
+  # Get high-quality markdown (uses markdownify automatically)
+  result = read(browser, output_format="markdown")
+  markdown = result["content"]
+  ```
+  
+  See `examples/read_markdown.py` for complete examples.
+
 ## Examples
 
 See `examples/` directory:
 - `hello.py` - Extension bridge verification
 - `basic_agent.py` - Basic snapshot
 - `query_demo.py` - Query engine
 - `wait_and_click.py` - Wait and actions
+- `read_markdown.py` - Reading page content and converting to markdown
 
 ### Content Reading Example
 
diff --git a/examples/read_markdown.py b/examples/read_markdown.py
@@ -0,0 +1,53 @@
+"""
+Example: Reading page content and converting to markdown
+
+This example shows how to use the read() function to get page content
+and convert it to high-quality markdown using markdownify.
+"""
+
+from sentience import SentienceBrowser, read
+from markdownify import markdownify
+
+
+def main():
+    """Print the current page as raw-HTML-derived markdown, direct markdown, and plain text."""
+    with SentienceBrowser(headless=True) as browser:
+        # Navigate to a page
+        browser.page.goto("https://example.com")
+        browser.page.wait_for_load_state("networkidle")
+        
+        # Method 1: Get raw HTML (default) and convert with markdownify
+        print("=== Method 1: Raw HTML + markdownify (Recommended) ===")
+        result = read(browser)  # output_format="raw" is default
+        html_content = result["content"]
+        
+        # Convert to markdown using markdownify (better quality)
+        markdown = markdownify(
+            html_content,
+            heading_style="ATX",  # Use # for headings
+            bullets="-",  # Use - for lists
+            strip=['script', 'style', 'nav', 'footer', 'header'],  # Strip unwanted tags
+        )
+        print(f"Markdown length: {len(markdown)} characters")
+        print(markdown[:500])  # Print first 500 chars
+        print("\n")
+        
+        # Method 2: Get high-quality markdown directly (uses markdownify internally)
+        print("=== Method 2: Direct markdown (High-quality via markdownify) ===")
+        result = read(browser, output_format="markdown")  # keyword must match read()'s signature
+        high_quality_markdown = result["content"]
+        print(f"Markdown length: {len(high_quality_markdown)} characters")
+        print(high_quality_markdown[:500])  # Print first 500 chars
+        print("\n")
+        
+        # Method 3: Get plain text
+        print("=== Method 3: Plain text ===")
+        result = read(browser, output_format="text")  # keyword must match read()'s signature
+        text_content = result["content"]
+        print(f"Text length: {len(text_content)} characters")
+        print(text_content[:500])  # Print first 500 chars
+
+
+if __name__ == "__main__":
+    main()
+
diff --git a/sentience/read.py b/sentience/read.py
@@ -1,70 +1,90 @@
 """
-Read page content - enhanced markdown conversion
+Read page content - supports raw HTML, text, and markdown formats
 """
 
-from typing import Optional, Literal
+from typing import Literal
 from .browser import SentienceBrowser
 
 
 def read(
     browser: SentienceBrowser,
-    format: Literal["text", "markdown"] = "text",
+    output_format: Literal["raw", "text", "markdown"] = "raw",
     enhance_markdown: bool = True,
 ) -> dict:
     """
-    Read page content as text or markdown
+    Read page content as raw HTML, text, or markdown
     
     Args:
         browser: SentienceBrowser instance
-        format: Output format - "text" or "markdown"
-        enhance_markdown: If True and format="markdown", use markdownify for better conversion
+        output_format: Output format - "raw" (default, returns HTML for external processing),
+                        "text" (plain text), or "markdown" (lightweight or enhanced markdown).
+        enhance_markdown: If True and output_format is "markdown", uses markdownify for better conversion.
+                          If False, uses the extension's lightweight markdown converter.
     
     Returns:
         dict with:
             - status: "success" or "error"
             - url: Current page URL
-            - format: "text" or "markdown"
+            - format: "raw", "text", or "markdown"
             - content: Page content as string
             - length: Content length in characters
             - error: Error message if status is "error"
+    
+    Examples:
+        # Get raw HTML (default) - can be used with markdownify for better conversion
+        result = read(browser)
+        html_content = result["content"]
+        
+        # Get high-quality markdown (uses markdownify internally)
+        result = read(browser, output_format="markdown")
+        markdown = result["content"]
+        
+        # Get plain text
+        result = read(browser, output_format="text")
+        text = result["content"]
     """
     if not browser.page:
         raise RuntimeError("Browser not started. Call browser.start() first.")
     
-    # Get basic content from extension
+    if output_format == "markdown" and enhance_markdown:
+        # Get raw HTML from the extension first
+        raw_html_result = browser.page.evaluate(
+            """
+            (options) => {
+                return window.sentience.read(options);
+            }
+            """,
+            {"format": "raw"},
+        )
+        
+        if raw_html_result.get("status") == "success":
+            html_content = raw_html_result["content"]
+            try:
+                # Use markdownify for enhanced markdown conversion
+                from markdownify import markdownify
+                markdown_content = markdownify(html_content, heading_style="ATX", wrap=True)
+                return {
+                    "status": "success",
+                    "url": raw_html_result["url"],
+                    "format": "markdown",
+                    "content": markdown_content,
+                    "length": len(markdown_content),
+                }
+            except ImportError:
+                print("Warning: 'markdownify' not installed. Install with 'pip install markdownify' for enhanced markdown. Falling back to extension's markdown.")
+            except Exception as e:
+                # markdownify exposes no dedicated exception class to import, so any
+                # conversion failure is reported here before falling back to the extension.
+                print(f"Warning: An unexpected error occurred with markdownify ({e}), falling back to extension's markdown.")
+
+    # If not enhanced markdown, or fallback, call extension with requested format
     result = browser.page.evaluate(
         """
         (options) => {
             return window.sentience.read(options);
         }
         """,
-        {"format": format},
+        {"format": output_format},
     )
     
-    # Enhance markdown if requested and format is markdown
-    if format == "markdown" and enhance_markdown and result.get("status") == "success":
-        try:
-            # Get full HTML from page
-            html_content = browser.page.evaluate("() => document.documentElement.outerHTML")
-            
-            # Use markdownify for better conversion
-            from markdownify import markdownify as md
-            enhanced_markdown = md(
-                html_content,
-                heading_style="ATX",  # Use # for headings
-                bullets="-",  # Use - for lists
-                strip=['script', 'style', 'nav', 'footer', 'header', 'noscript'],  # Strip unwanted tags
-            )
-            result["content"] = enhanced_markdown
-            result["length"] = len(enhanced_markdown)
-        except ImportError:
-            # Fallback to extension's lightweight conversion if markdownify not installed
-            # This shouldn't happen if dependencies are installed, but handle gracefully
-            pass
-        except Exception as e:
-            # If enhancement fails, use extension's result
-            # Don't overwrite result["error"] - keep extension's result
-            pass
-    
     return result
-