Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 57 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,66 @@ pip install -e .

# Install Playwright browsers (required)
playwright install chromium

# For LLM Agent features (optional)
pip install openai # For OpenAI models
pip install anthropic # For Claude models
pip install transformers torch # For local LLMs
```

## Quick Start: Choose Your Abstraction Level

Sentience SDK offers **three abstraction levels** - use what fits your needs:

### 🎯 **Level 3: Natural Language (Easiest)** - For non-technical users

```python
from sentience import SentienceBrowser, ConversationalAgent
from sentience.llm_provider import OpenAIProvider

browser = SentienceBrowser()
llm = OpenAIProvider(api_key="your-key", model="gpt-4o")
agent = ConversationalAgent(browser, llm)

with browser:
response = agent.execute("Search for magic mouse on google.com")
print(response)
# → "I searched for 'magic mouse' and found several results.
# The top result is from amazon.com selling Magic Mouse 2 for $79."
```

**Best for:** End users, chatbots, no-code platforms
**Code required:** 3-5 lines
**Technical knowledge:** None

### ⚙️ **Level 2: Technical Commands (Recommended)** - For AI developers

```python
from sentience import SentienceBrowser, SentienceAgent
from sentience.llm_provider import OpenAIProvider

browser = SentienceBrowser()
llm = OpenAIProvider(api_key="your-key", model="gpt-4o")
agent = SentienceAgent(browser, llm)

with browser:
browser.page.goto("https://google.com")
agent.act("Click the search box")
agent.act("Type 'magic mouse' into the search field")
agent.act("Press Enter key")
```

**Best for:** Building AI agents, automation scripts
**Code required:** 10-15 lines
**Technical knowledge:** Medium (Python basics)

### 🔧 **Level 1: Direct SDK (Most Control)** - For production automation

```python
from sentience import SentienceBrowser, snapshot, find, click

# Start browser with extension
with SentienceBrowser(headless=False) as browser:
    browser.page.goto("https://example.com")

# Take snapshot - captures all interactive elements
snap = snapshot(browser)
Expand All @@ -31,6 +81,10 @@ with SentienceBrowser(headless=False) as browser:
print(f"Click success: {result.success}")
```

**Best for:** Maximum control, performance-critical apps
**Code required:** 20-50 lines
**Technical knowledge:** High (SDK API, selectors)

## Real-World Example: Amazon Shopping Bot

This example demonstrates navigating Amazon, finding products, and adding items to cart:
Expand Down
222 changes: 222 additions & 0 deletions examples/agent_layers_demo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
"""
Demonstration of all three abstraction layers in Sentience SDK

Layer 1: Direct SDK (Full Control)
Layer 2: SentienceAgent (Technical Commands)
Layer 3: ConversationalAgent (Natural Language)

This script shows how the same task can be accomplished at different abstraction levels.
"""

import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

def demo_layer1_direct_sdk():
    """
    Layer 1: Direct SDK Usage
    - Full control over every action
    - Requires knowing exact element selectors
    - 50+ lines of code for typical automation
    """
    print("\n" + "="*70)
    print("LAYER 1: Direct SDK Usage (Full Control)")
    print("="*70)

    from sentience import SentienceBrowser, snapshot, find, click, type_text, press

    with SentienceBrowser(headless=False) as browser:
        # Navigate
        browser.page.goto("https://google.com")

        # Get snapshot of all interactive elements on the page
        snap = snapshot(browser)

        # Find search box manually; fall back to a generic textbox
        search_box = find(snap, "role=searchbox")
        if not search_box:
            search_box = find(snap, "role=textbox")

        # FIX: if neither selector matched, the original code crashed with
        # AttributeError on `search_box.id`. Bail out with a message instead.
        if search_box is None:
            print("❌ Could not locate a search box on the page")
            return

        # Click search box
        click(browser, search_box.id)

        # Type query
        type_text(browser, search_box.id, "magic mouse")

        # Press Enter to submit the search
        press(browser, "Enter")

    print("\n✅ Layer 1 Demo Complete")
    print(" Code required: ~20 lines")
    print(" Technical knowledge: High")
    print(" Flexibility: Maximum")


def demo_layer2_sentience_agent():
    """
    Layer 2: SentienceAgent (Technical Commands)
    - High-level commands with LLM intelligence
    - No need to know selectors
    - 15 lines of code for typical automation
    """
    banner = "=" * 70
    print(f"\n{banner}")
    print("LAYER 2: SentienceAgent (Technical Commands)")
    print(banner)

    from sentience import SentienceBrowser, SentienceAgent
    from sentience.llm_provider import OpenAIProvider

    # Wire up the browser, the LLM backend, and the agent on top of both.
    sentience_browser = SentienceBrowser(headless=False)
    provider = OpenAIProvider(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o-mini")
    agent = SentienceAgent(sentience_browser, provider, verbose=True)

    with sentience_browser:
        sentience_browser.page.goto("https://google.com")

        # Drive the page with plain technical commands — no selectors needed.
        for command in (
            "Click the search box",
            "Type 'magic mouse' into the search field",
            "Press Enter key",
        ):
            agent.act(command)

    print("\n✅ Layer 2 Demo Complete")
    print(" Code required: ~10 lines")
    print(" Technical knowledge: Medium")
    print(" Flexibility: High")
    print(f" Tokens used: {agent.get_token_stats()['total_tokens']}")


def demo_layer3_conversational_agent():
    """
    Layer 3: ConversationalAgent (Natural Language)
    - Pure natural language interface
    - Automatic planning and execution
    - 3 lines of code for typical automation
    """
    banner = "=" * 70
    print(f"\n{banner}")
    print("LAYER 3: ConversationalAgent (Natural Language)")
    print(banner)

    from sentience import SentienceBrowser, ConversationalAgent
    from sentience.llm_provider import OpenAIProvider

    # Build the stack: browser + OpenAI backend + conversational wrapper.
    sentience_browser = SentienceBrowser(headless=False)
    provider = OpenAIProvider(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
    agent = ConversationalAgent(sentience_browser, provider, verbose=True)

    with sentience_browser:
        # One natural-language request; the agent plans and executes the steps.
        response = agent.execute("Search for magic mouse on google.com")

    print("\n✅ Layer 3 Demo Complete")
    print(" Code required: ~5 lines")
    print(" Technical knowledge: None")
    print(" Flexibility: Medium")
    print(f" Agent Response: {response}")


def demo_layer3_with_local_llm():
    """
    Layer 3 with Local LLM (Zero Cost)
    - Uses local Qwen 2.5 3B model
    - No API costs
    - Runs on your hardware
    """
    banner = "=" * 70
    print(f"\n{banner}")
    print("LAYER 3: ConversationalAgent with Local LLM (Zero Cost)")
    print(banner)

    from sentience import SentienceBrowser, ConversationalAgent
    from sentience.llm_provider import LocalLLMProvider

    # Same conversational stack, but backed by a model on this machine.
    sentience_browser = SentienceBrowser(headless=False)
    provider = LocalLLMProvider(
        model_name="Qwen/Qwen2.5-3B-Instruct",
        device="auto",        # Use CUDA if available
        load_in_4bit=True,    # Save memory with quantization
    )
    agent = ConversationalAgent(sentience_browser, provider, verbose=True)

    with sentience_browser:
        # Execute in natural language
        response = agent.execute("Go to google.com and search for python tutorials")

    print("\n✅ Layer 3 with Local LLM Demo Complete")
    print(" API Cost: $0 (runs locally)")
    print(" Privacy: 100% (no data sent to cloud)")
    print(f" Agent Response: {response}")


def demo_comparison():
    """Print a side-by-side feature comparison of all three abstraction layers."""
    banner = "=" * 70
    print(f"\n{banner}")
    print("COMPARISON: All Three Layers")
    print(banner)

    # Static markdown table; kept as one literal so it renders as written.
    table = """
| Feature | Layer 1 (SDK) | Layer 2 (Agent) | Layer 3 (Conversational) |
|--------------------------|------------------|------------------|--------------------------|
| Lines of code | 50+ | 15 | 3-5 |
| Technical knowledge | High | Medium | None |
| Requires selectors? | Yes | No | No |
| LLM required? | No | Yes | Yes |
| Cost per action | $0 | ~$0.005 | ~$0.010 |
| Speed | Fastest | Fast | Medium |
| Error handling | Manual | Auto-retry | Auto-recovery |
| Multi-step planning | Manual | Manual | Automatic |
| Natural language I/O | No | Commands only | Full conversation |
| Best for | Production | AI developers | End users |
"""

    print(table)


def main():
    """Interactive entry point: prompt for a demo number and run it."""
    banner = "=" * 70
    print(f"\n{banner}")
    print("SENTIENCE SDK: Multi-Layer Abstraction Demo")
    print(banner)
    print("\nThis demo shows how to use the SDK at different abstraction levels:")
    print(" 1. Layer 1: Direct SDK (maximum control)")
    print(" 2. Layer 2: SentienceAgent (technical commands)")
    print(" 3. Layer 3: ConversationalAgent (natural language)")
    print("\nChoose which demo to run:")
    print(" 1 - Layer 1: Direct SDK")
    print(" 2 - Layer 2: SentienceAgent")
    print(" 3 - Layer 3: ConversationalAgent (OpenAI)")
    print(" 4 - Layer 3: ConversationalAgent (Local LLM)")
    print(" 5 - Show comparison table")
    print(" 0 - Exit")

    choice = input("\nEnter your choice (0-5): ").strip()

    # Demos 2 and 3 call the OpenAI API, so refuse early without a key.
    if choice in ("2", "3") and not os.getenv("OPENAI_API_KEY"):
        print("\n❌ Error: OPENAI_API_KEY not set")
        return

    # Dispatch table replaces the original if/elif chain; behavior unchanged.
    demos = {
        "1": demo_layer1_direct_sdk,
        "2": demo_layer2_sentience_agent,
        "3": demo_layer3_conversational_agent,
        "4": demo_layer3_with_local_llm,
        "5": demo_comparison,
    }
    if choice == "0":
        print("Goodbye!")
    elif choice in demos:
        demos[choice]()
    else:
        print("Invalid choice")


if __name__ == "__main__":
    main()
86 changes: 86 additions & 0 deletions examples/test_local_llm_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
"""
Test script for LocalLLMProvider with Qwen2.5-3B-Instruct
Demonstrates using a local LLM with SentienceAgent
"""

from sentience.llm_provider import LocalLLMProvider

def test_local_llm_basic():
    """Smoke-test LocalLLMProvider with Qwen2.5-3B-Instruct.

    Runs two generations against the local model:
    1. A trivial Q&A to confirm the model loads and responds.
    2. An agent-style prompt to check the response parses as an action call.

    Prints responses and token counts; this is a manual/interactive check,
    not an automated assertion suite.
    """
    print("="*70)
    print("Testing LocalLLMProvider with Qwen2.5-3B-Instruct")
    print("="*70)

    # Initialize local LLM
    # Using the model from your local cache
    llm = LocalLLMProvider(
        model_name="Qwen/Qwen2.5-3B-Instruct",
        device="auto",        # Will use CUDA if available, else CPU
        load_in_4bit=False,   # Set to True to save memory
        torch_dtype="auto"
    )

    print("\n" + "="*70)
    print("Test 1: Simple question")
    print("="*70)

    response = llm.generate(
        system_prompt="You are a helpful web automation assistant.",
        user_prompt="What is 2+2?",
        max_new_tokens=50,
        temperature=0.1
    )

    print(f"Response: {response.content}")
    print(f"Tokens: {response.total_tokens} (prompt: {response.prompt_tokens}, completion: {response.completion_tokens})")

    print("\n" + "="*70)
    print("Test 2: Action parsing (for agent)")
    print("="*70)

    system_prompt = """You are an AI web automation agent.

GOAL: Click the search box

VISIBLE ELEMENTS (sorted by importance, max 50):
[1] <button> "Sign In" {PRIMARY,CLICKABLE,color:blue} @ (100,50) (Imp:900)
[2] <textbox> "" {CLICKABLE} @ (200,100) (Imp:850)
[3] <link> "Help" {} @ (50,150) (Imp:700)

VISUAL CUES:
- {PRIMARY}: Main call-to-action element
- {CLICKABLE}: Element is clickable
- {color:X}: Background color name

RESPONSE FORMAT (return ONLY the function call):
- CLICK(id) - Click element by ID
- TYPE(id, "text") - Type text into element
- PRESS("key") - Press keyboard key
- FINISH() - Task complete
"""

    user_prompt = "What is the next step to achieve the goal?"

    response = llm.generate(
        system_prompt=system_prompt,
        user_prompt=user_prompt,
        max_new_tokens=20,
        temperature=0.0
    )

    print(f"Agent Response: {response.content}")
    print(f"Tokens: {response.total_tokens}")

    # FIX: the original checked `"CLICK(2)" in content or "click(2)" in
    # content.lower()`; the first test is redundant because the
    # case-insensitive check already covers it.
    if "click(2)" in response.content.lower():
        print("\n✅ SUCCESS: LLM correctly identified textbox (element 2) as search box!")
    else:
        print(f"\n⚠️ Response may need adjustment: {response.content}")

    print("\n" + "="*70)
    print("LocalLLMProvider Test Complete!")
    print("="*70)


if __name__ == "__main__":
    test_local_llm_basic()
Loading