Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .python-version

This file was deleted.

44 changes: 42 additions & 2 deletions browser_use/agent/message_manager/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,17 @@ def create_state_messages(
) -> None:
"""Create single state message with all content"""

# Check if Sentience snapshot was injected BEFORE clearing context messages
# (Sentience message is added to context_messages, so we need to check before clearing)
has_sentience = any(
msg.content and isinstance(msg.content, str) and (
"Elements (ID|role|text|importance)" in msg.content or
"Elements: ID|role|text|imp|docYq|ord|DG|href" in msg.content or
"Rules: ordinal→DG=1 then ord asc" in msg.content
)
for msg in self.state.history.context_messages
)

# Clear contextual messages from previous steps to prevent accumulation
self.state.history.context_messages.clear()

Expand Down Expand Up @@ -343,8 +354,36 @@ def create_state_messages(
if include_screenshot and browser_state_summary.screenshot:
screenshots.append(browser_state_summary.screenshot)

# Use vision in the user message if screenshots are included
effective_use_vision = len(screenshots) > 0
# Use vision in the user message if screenshots are included OR if there are other images
# When use_vision=False, exclude ALL images (screenshots, sample_images, read_state_images)
has_other_images = bool(self.sample_images) or bool(self.state.read_state_images)
# Only use vision if: (1) we have screenshots, OR (2) use_vision is not False AND we have other images
effective_use_vision = len(screenshots) > 0 or (use_vision is not False and has_other_images)

# Debug logging for vision usage
if effective_use_vision:
logger.info(
'⚠️ Vision is ENABLED: use_vision=%s, screenshots=%d, sample_images=%d, read_state_images=%d',
use_vision, len(screenshots), len(self.sample_images) if self.sample_images else 0,
len(self.state.read_state_images) if self.state.read_state_images else 0
)
else:
logger.info(
'✅ Vision is DISABLED: use_vision=%s, screenshots=%d, sample_images=%d, read_state_images=%d',
use_vision, len(screenshots), len(self.sample_images) if self.sample_images else 0,
len(self.state.read_state_images) if self.state.read_state_images else 0
)

# Use the has_sentience flag we detected before clearing context_messages
# Log Sentience detection for debugging
if has_sentience:
logger.info('✅ Sentience detected - reducing DOM size to 5000 chars')
else:
logger.info('❌ Sentience NOT detected - using full DOM size (40000 chars)')

# Reduce DOM tree size when Sentience provides semantic geometry
# Default is 40,000 chars, reduce to 5,000 when Sentience is available
max_clickable_elements_length = 5000 if has_sentience else 40000

# Create single state message with all content
assert browser_state_summary
Expand All @@ -357,6 +396,7 @@ def create_state_messages(
include_attributes=self.include_attributes,
step_info=step_info,
page_filtered_actions=page_filtered_actions,
max_clickable_elements_length=max_clickable_elements_length,
sensitive_data=self.sensitive_data_description,
available_file_paths=available_file_paths,
screenshots=screenshots,
Expand Down
104 changes: 80 additions & 24 deletions browser_use/agent/prompts.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import importlib.resources
import logging
from datetime import datetime
from typing import TYPE_CHECKING, Literal, Optional

Expand All @@ -7,6 +8,8 @@
from browser_use.observability import observe_debug
from browser_use.utils import is_new_tab_page, sanitize_surrogates

logger = logging.getLogger(__name__)

if TYPE_CHECKING:
from browser_use.agent.views import AgentStepInfo
from browser_use.browser.views import BrowserStateSummary
Expand Down Expand Up @@ -214,12 +217,21 @@ def _get_browser_state_description(self) -> str:
stats_text += '</page_stats>\n'

elements_text = self.browser_state.dom_state.llm_representation(include_attributes=self.include_attributes)

# Log DOM size before truncation
original_dom_size = len(elements_text)
dom_tokens_estimate = original_dom_size // 4

if len(elements_text) > self.max_clickable_elements_length:
elements_text = elements_text[: self.max_clickable_elements_length]
truncated_text = f' (truncated to {self.max_clickable_elements_length} characters)'
logger.info(
'📊 DOM state: %d chars (~%d tokens) truncated to %d chars (~%d tokens)',
original_dom_size, dom_tokens_estimate, self.max_clickable_elements_length, self.max_clickable_elements_length // 4
)
else:
truncated_text = ''
logger.info('📊 DOM state: %d chars (~%d tokens)', original_dom_size, dom_tokens_estimate)

has_content_above = False
has_content_below = False
Expand Down Expand Up @@ -400,10 +412,54 @@ def get_user_message(self, use_vision: bool = True) -> UserMessage:
# Sanitize surrogates from all text content
state_description = sanitize_surrogates(state_description)

# Check if we have images to include (from read_file action)
has_images = bool(self.read_state_images)
# Log token usage breakdown for debugging
agent_history_len = len(self.agent_history_description) if self.agent_history_description else 0
browser_state_len = len(self._get_browser_state_description())
agent_state_len = len(self._get_agent_state_description())
read_state_len = len(self.read_state_description) if self.read_state_description else 0
total_len = len(state_description)

# Rough token estimate (1 token ≈ 4 characters)
logger.info(
'📊 Token breakdown (chars): agent_history=%d (~%d tokens), browser_state=%d (~%d tokens), '
'agent_state=%d (~%d tokens), read_state=%d (~%d tokens), total=%d (~%d tokens)',
agent_history_len, agent_history_len // 4,
browser_state_len, browser_state_len // 4,
agent_state_len, agent_state_len // 4,
read_state_len, read_state_len // 4,
total_len, total_len // 4
)

# Check if we have images to include
# When use_vision=False, exclude ALL images (screenshots, sample_images, read_state_images)
has_read_state_images = bool(self.read_state_images)
has_sample_images = bool(self.sample_images)
has_screenshots = bool(self.screenshots)

# Include images only if use_vision is not False and we have images
# When use_vision=False, never use vision (even for read_state_images from read_file)
should_use_vision = (
use_vision is not False and
(has_screenshots or has_sample_images or has_read_state_images)
)

# Debug logging
if should_use_vision:
logger.info(
'⚠️ AgentMessagePrompt: Vision ENABLED - use_vision=%s, screenshots=%d, sample_images=%d, read_state_images=%d',
use_vision, len(self.screenshots) if self.screenshots else 0,
len(self.sample_images) if self.sample_images else 0,
len(self.read_state_images) if self.read_state_images else 0
)
else:
logger.info(
'✅ AgentMessagePrompt: Vision DISABLED - use_vision=%s, screenshots=%d, sample_images=%d, read_state_images=%d',
use_vision, len(self.screenshots) if self.screenshots else 0,
len(self.sample_images) if self.sample_images else 0,
len(self.read_state_images) if self.read_state_images else 0
)

if (use_vision is True and self.screenshots) or has_images:
if should_use_vision:
# Start with text description
content_parts: list[ContentPartTextParam | ContentPartImageParam] = [ContentPartTextParam(text=state_description)]

Expand All @@ -412,28 +468,28 @@ def get_user_message(self, use_vision: bool = True) -> UserMessage:

# Add screenshots with labels
for i, screenshot in enumerate(self.screenshots):
if i == len(self.screenshots) - 1:
label = 'Current screenshot:'
else:
# Use simple, accurate labeling since we don't have actual step timing info
label = 'Previous screenshot:'

# Add label as text content
content_parts.append(ContentPartTextParam(text=label))

# Resize screenshot if llm_screenshot_size is configured
processed_screenshot = self._resize_screenshot(screenshot)

# Add the screenshot
content_parts.append(
ContentPartImageParam(
image_url=ImageURL(
url=f'data:image/png;base64,{processed_screenshot}',
media_type='image/png',
detail=self.vision_detail_level,
),
if i == len(self.screenshots) - 1:
label = 'Current screenshot:'
else:
# Use simple, accurate labeling since we don't have actual step timing info
label = 'Previous screenshot:'

# Add label as text content
content_parts.append(ContentPartTextParam(text=label))

# Resize screenshot if llm_screenshot_size is configured
processed_screenshot = self._resize_screenshot(screenshot)

# Add the screenshot
content_parts.append(
ContentPartImageParam(
image_url=ImageURL(
url=f'data:image/png;base64,{processed_screenshot}',
media_type='image/png',
detail=self.vision_detail_level,
),
)
)
)

# Add read_state images (from read_file action) before screenshots
for img_data in self.read_state_images:
Expand Down
43 changes: 40 additions & 3 deletions browser_use/agent/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -1011,10 +1011,15 @@ async def _prepare_context(self, step_info: AgentStepInfo | None = None) -> Brow
assert self.browser_session is not None, 'BrowserSession is not set up'

self.logger.debug(f'🌐 Step {self.state.n_steps}: Getting browser state...')
# Always take screenshots for all steps
self.logger.debug('📸 Requesting browser state with include_screenshot=True')
# Only capture screenshots if use_vision is not False
# When use_vision=False, skip screenshot capture entirely to save resources
include_screenshot = self.settings.use_vision is not False
if include_screenshot:
self.logger.debug('📸 Requesting browser state with include_screenshot=True')
else:
self.logger.debug('📸 Skipping screenshot capture (use_vision=False)')
browser_state_summary = await self.browser_session.get_browser_state_summary(
include_screenshot=True, # always capture even if use_vision=False so that cloud sync is useful (it's fast now anyway)
include_screenshot=include_screenshot,
include_recent_events=self.include_recent_events,
)
if browser_state_summary.screenshot:
Expand Down Expand Up @@ -1043,6 +1048,38 @@ async def _prepare_context(self, step_info: AgentStepInfo | None = None) -> Brow
if self.skill_service is not None:
unavailable_skills_info = await self._get_unavailable_skills_info()

# Inject Sentience semantic geometry inventory (if available)
# This gives the LLM access to semantic element IDs and bbox coordinates
try:
from browser_use.integrations.sentience.state_injector import build_sentience_state

sent_state = await build_sentience_state(self.browser_session)
if sent_state:
# Add Sentience element inventory to LLM context for this step
self._message_manager._add_context_message(UserMessage(content=sent_state.prompt_block))

# Log injection details
element_count = len(sent_state.snapshot.elements)
prompt_size = len(sent_state.prompt_block)
# Show sample of first few elements
lines = sent_state.prompt_block.split("\n")
sample_lines = lines[3:8] if len(lines) > 8 else lines[3:] # Skip header, show 5 elements
sample = "\n".join(sample_lines) if sample_lines else ""

self.logger.info(
f"🧠 Sentience: Injected {element_count} semantic elements ({prompt_size} chars) into LLM context"
)
if sample:
self.logger.debug(f" Sample elements:\n{sample}")
else:
self.logger.debug("Sentience: No snapshot available (extension may not be loaded)")
except ImportError:
# Sentience SDK not installed, skip silently
self.logger.debug("Sentience: SDK not installed, skipping")
except Exception as e:
# Extension not loaded or snapshot failed, log at debug level
self.logger.debug(f"Sentience: State injection skipped: {e}")

self._message_manager.create_state_messages(
browser_state_summary=browser_state_summary,
model_output=self.state.last_model_output,
Expand Down
9 changes: 9 additions & 0 deletions browser_use/browser/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -1315,6 +1315,15 @@ async def get_browser_state_summary(
if include_screenshot and not self._cached_browser_state_summary.screenshot:
self.logger.debug('⚠️ Cached browser state has no screenshot, fetching fresh state with screenshot')
# Fall through to fetch fresh state with screenshot
elif not include_screenshot and self._cached_browser_state_summary.screenshot:
# If we don't want a screenshot but cached state has one, create a copy without screenshot
from dataclasses import replace
cached_copy = replace(
self._cached_browser_state_summary,
screenshot=None, # Remove screenshot when not requested
)
self.logger.debug('🔄 Using pre-cached browser state summary (screenshot removed per request)')
return cached_copy
elif selector_map and len(selector_map) > 0:
self.logger.debug('🔄 Using pre-cached browser state summary for open tab')
return self._cached_browser_state_summary
Expand Down
5 changes: 5 additions & 0 deletions browser_use/integrations/sentience/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""Sentience integration for browser-use Agent.

Package entry point: re-exports ``build_sentience_state`` and
``format_snapshot_for_llm`` from the sibling ``state_injector`` module so
they can be imported from the package itself.
"""

from .state_injector import build_sentience_state, format_snapshot_for_llm

# Public API of the package; keep in sync with the re-export above.
__all__ = ["build_sentience_state", "format_snapshot_for_llm"]
Loading