SentienceAPI · rcholic · Jan 9, 2026 · Jan 9, 2026 · Jan 9, 2026 · Jan 10, 2026
diff --git a/.python-version b/.python-version
diff --git a/browser_use/agent/message_manager/service.py b/browser_use/agent/message_manager/service.py
@@ -302,6 +302,17 @@ def create_state_messages(
 	) -> None:
 		"""Create single state message with all content"""
 
+		# Check if Sentience snapshot was injected BEFORE clearing context messages
+		# (Sentience message is added to context_messages, so we need to check before clearing)
+		has_sentience = any(
+			msg.content and isinstance(msg.content, str) and (
+				"Elements (ID|role|text|importance)" in msg.content or
+				"Elements: ID|role|text|imp|docYq|ord|DG|href" in msg.content or
+				"Rules: ordinal→DG=1 then ord asc" in msg.content
+			)
+			for msg in self.state.history.context_messages
+		)
+
 		# Clear contextual messages from previous steps to prevent accumulation
 		self.state.history.context_messages.clear()
 
@@ -343,8 +354,36 @@ def create_state_messages(
 		if include_screenshot and browser_state_summary.screenshot:
 			screenshots.append(browser_state_summary.screenshot)
 
-		# Use vision in the user message if screenshots are included
-		effective_use_vision = len(screenshots) > 0
+		# Vision is used when a screenshot was collected above, or when use_vision is not False and
+		# sample_images / read_state_images exist. NOTE(review): the screenshot case does not re-check use_vision here.
+		has_other_images = bool(self.sample_images) or bool(self.state.read_state_images)
+		# Only use vision if: (1) we have screenshots, OR (2) use_vision is not False AND we have other images
+		effective_use_vision = len(screenshots) > 0 or (use_vision is not False and has_other_images)
+
+		# Diagnostic logging for vision usage (emitted at INFO level, not DEBUG)
+		if effective_use_vision:
+			logger.info(
+				'⚠️ Vision is ENABLED: use_vision=%s, screenshots=%d, sample_images=%d, read_state_images=%d',
+				use_vision, len(screenshots), len(self.sample_images) if self.sample_images else 0,
+				len(self.state.read_state_images) if self.state.read_state_images else 0
+			)
+		else:
+			logger.info(
+				'✅ Vision is DISABLED: use_vision=%s, screenshots=%d, sample_images=%d, read_state_images=%d',
+				use_vision, len(screenshots), len(self.sample_images) if self.sample_images else 0,
+				len(self.state.read_state_images) if self.state.read_state_images else 0
+			)
+
+		# Use the has_sentience flag we detected before clearing context_messages
+		# Log Sentience detection for debugging
+		if has_sentience:
+			logger.info('✅ Sentience detected - reducing DOM size to 5000 chars')
+		else:
+			logger.info('❌ Sentience NOT detected - using full DOM size (40000 chars)')
+
+		# Reduce DOM tree size when Sentience provides semantic geometry
+		# Default is 40,000 chars, reduce to 5,000 when Sentience is available
+		max_clickable_elements_length = 5000 if has_sentience else 40000
 
 		# Create single state message with all content
 		assert browser_state_summary
@@ -357,6 +396,7 @@ def create_state_messages(
 			include_attributes=self.include_attributes,
 			step_info=step_info,
 			page_filtered_actions=page_filtered_actions,
+			max_clickable_elements_length=max_clickable_elements_length,
 			sensitive_data=self.sensitive_data_description,
 			available_file_paths=available_file_paths,
 			screenshots=screenshots,

diff --git a/browser_use/agent/prompts.py b/browser_use/agent/prompts.py
@@ -1,4 +1,5 @@
 import importlib.resources
+import logging
 from datetime import datetime
 from typing import TYPE_CHECKING, Literal, Optional
 
@@ -7,6 +8,8 @@
 from browser_use.observability import observe_debug
 from browser_use.utils import is_new_tab_page, sanitize_surrogates
 
+logger = logging.getLogger(__name__)
+
 if TYPE_CHECKING:
 	from browser_use.agent.views import AgentStepInfo
 	from browser_use.browser.views import BrowserStateSummary
@@ -214,12 +217,21 @@ def _get_browser_state_description(self) -> str:
 		stats_text += '</page_stats>\n'
 
 		elements_text = self.browser_state.dom_state.llm_representation(include_attributes=self.include_attributes)
+
+		# Log DOM size before truncation
+		original_dom_size = len(elements_text)
+		dom_tokens_estimate = original_dom_size // 4
 
 		if len(elements_text) > self.max_clickable_elements_length:
 			elements_text = elements_text[: self.max_clickable_elements_length]
 			truncated_text = f' (truncated to {self.max_clickable_elements_length} characters)'
+			logger.info(
+				'📊 DOM state: %d chars (~%d tokens) truncated to %d chars (~%d tokens)',
+				original_dom_size, dom_tokens_estimate, self.max_clickable_elements_length, self.max_clickable_elements_length // 4
+			)
 		else:
 			truncated_text = ''
+			logger.info('📊 DOM state: %d chars (~%d tokens)', original_dom_size, dom_tokens_estimate)
 
 		has_content_above = False
 		has_content_below = False
@@ -400,10 +412,54 @@ def get_user_message(self, use_vision: bool = True) -> UserMessage:
 		# Sanitize surrogates from all text content
 		state_description = sanitize_surrogates(state_description)
 
-		# Check if we have images to include (from read_file action)
-		has_images = bool(self.read_state_images)
+		# Log a per-section size breakdown (chars and estimated tokens); the _get_*_description() calls below are invoked solely to take len()
+		agent_history_len = len(self.agent_history_description) if self.agent_history_description else 0
+		browser_state_len = len(self._get_browser_state_description())
+		agent_state_len = len(self._get_agent_state_description())
+		read_state_len = len(self.read_state_description) if self.read_state_description else 0
+		total_len = len(state_description)
+
+		# Rough token estimate (1 token ≈ 4 characters)
+		logger.info(
+			'📊 Token breakdown (chars): agent_history=%d (~%d tokens), browser_state=%d (~%d tokens), '
+			'agent_state=%d (~%d tokens), read_state=%d (~%d tokens), total=%d (~%d tokens)',
+			agent_history_len, agent_history_len // 4,
+			browser_state_len, browser_state_len // 4,
+			agent_state_len, agent_state_len // 4,
+			read_state_len, read_state_len // 4,
+			total_len, total_len // 4
+		)
+
+		# Check if we have images to include
+		# When use_vision=False, exclude ALL images (screenshots, sample_images, read_state_images)
+		has_read_state_images = bool(self.read_state_images)
+		has_sample_images = bool(self.sample_images)
+		has_screenshots = bool(self.screenshots)
+
+		# Include images only if use_vision is not False and we have images
+		# When use_vision=False, never use vision (even for read_state_images from read_file)
+		should_use_vision = (
+			use_vision is not False and
+			(has_screenshots or has_sample_images or has_read_state_images)
+		)
+
+		# Diagnostic logging of the vision decision (emitted at INFO level, not DEBUG)
+		if should_use_vision:
+			logger.info(
+				'⚠️ AgentMessagePrompt: Vision ENABLED - use_vision=%s, screenshots=%d, sample_images=%d, read_state_images=%d',
+				use_vision, len(self.screenshots) if self.screenshots else 0,
+				len(self.sample_images) if self.sample_images else 0,
+				len(self.read_state_images) if self.read_state_images else 0
+			)
+		else:
+			logger.info(
+				'✅ AgentMessagePrompt: Vision DISABLED - use_vision=%s, screenshots=%d, sample_images=%d, read_state_images=%d',
+				use_vision, len(self.screenshots) if self.screenshots else 0,
+				len(self.sample_images) if self.sample_images else 0,
+				len(self.read_state_images) if self.read_state_images else 0
+			)
 
-		if (use_vision is True and self.screenshots) or has_images:
+		if should_use_vision:
 			# Start with text description
 			content_parts: list[ContentPartTextParam | ContentPartImageParam] = [ContentPartTextParam(text=state_description)]
 
@@ -412,28 +468,28 @@ def get_user_message(self, use_vision: bool = True) -> UserMessage:
 
 			# Add screenshots with labels
 			for i, screenshot in enumerate(self.screenshots):
-				if i == len(self.screenshots) - 1:
-					label = 'Current screenshot:'
-				else:
-					# Use simple, accurate labeling since we don't have actual step timing info
-					label = 'Previous screenshot:'
-
-				# Add label as text content
-				content_parts.append(ContentPartTextParam(text=label))
-
-				# Resize screenshot if llm_screenshot_size is configured
-				processed_screenshot = self._resize_screenshot(screenshot)
-
-				# Add the screenshot
-				content_parts.append(
-					ContentPartImageParam(
-						image_url=ImageURL(
-							url=f'data:image/png;base64,{processed_screenshot}',
-							media_type='image/png',
-							detail=self.vision_detail_level,
-						),
+					if i == len(self.screenshots) - 1:
+						label = 'Current screenshot:'
+					else:
+						# Use simple, accurate labeling since we don't have actual step timing info
+						label = 'Previous screenshot:'
+
+					# Add label as text content
+					content_parts.append(ContentPartTextParam(text=label))
+
+					# Resize screenshot if llm_screenshot_size is configured
+					processed_screenshot = self._resize_screenshot(screenshot)
+
+					# Add the screenshot
+					content_parts.append(
+						ContentPartImageParam(
+							image_url=ImageURL(
+								url=f'data:image/png;base64,{processed_screenshot}',
+								media_type='image/png',
+								detail=self.vision_detail_level,
+							),
+						)
 					)
-				)
 
 			# Add read_state images (from read_file action) before screenshots
 			for img_data in self.read_state_images:

diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py
@@ -1011,10 +1011,15 @@ async def _prepare_context(self, step_info: AgentStepInfo | None = None) -> Brow
 		assert self.browser_session is not None, 'BrowserSession is not set up'
 
 		self.logger.debug(f'🌐 Step {self.state.n_steps}: Getting browser state...')
-		# Always take screenshots for all steps
-		self.logger.debug('📸 Requesting browser state with include_screenshot=True')
+		# Only capture screenshots if use_vision is not False
+		# When use_vision=False, skip screenshot capture entirely to save resources
+		include_screenshot = self.settings.use_vision is not False
+		if include_screenshot:
+			self.logger.debug('📸 Requesting browser state with include_screenshot=True')
+		else:
+			self.logger.debug('📸 Skipping screenshot capture (use_vision=False)')
 		browser_state_summary = await self.browser_session.get_browser_state_summary(
-			include_screenshot=True,  # always capture even if use_vision=False so that cloud sync is useful (it's fast now anyway)
+			include_screenshot=include_screenshot,
 			include_recent_events=self.include_recent_events,
 		)
 		if browser_state_summary.screenshot:
@@ -1043,6 +1048,38 @@ async def _prepare_context(self, step_info: AgentStepInfo | None = None) -> Brow
 		if self.skill_service is not None:
 			unavailable_skills_info = await self._get_unavailable_skills_info()
 
+		# Inject Sentience semantic geometry inventory (if available)
+		# This gives the LLM access to semantic element IDs and bbox coordinates
+		try:
+			from browser_use.integrations.sentience.state_injector import build_sentience_state
+
+			sent_state = await build_sentience_state(self.browser_session)
+			if sent_state:
+				# Add Sentience element inventory to LLM context for this step
+				self._message_manager._add_context_message(UserMessage(content=sent_state.prompt_block))
+
+				# Log injection details
+				element_count = len(sent_state.snapshot.elements)
+				prompt_size = len(sent_state.prompt_block)
+				# Show sample of first few elements
+				lines = sent_state.prompt_block.split("\n")
+				sample_lines = lines[3:8] if len(lines) > 8 else lines[3:]  # Skip header, show 5 elements
+				sample = "\n".join(sample_lines) if sample_lines else ""
+
+				self.logger.info(
+					f"🧠 Sentience: Injected {element_count} semantic elements ({prompt_size} chars) into LLM context"
+				)
+				if sample:
+					self.logger.debug(f"   Sample elements:\n{sample}")
+			else:
+				self.logger.debug("Sentience: No snapshot available (extension may not be loaded)")
+		except ImportError:
+			# Sentience SDK not installed, skip silently
+			self.logger.debug("Sentience: SDK not installed, skipping")
+		except Exception as e:
+			# Extension not loaded or snapshot failed, log at debug level
+			self.logger.debug(f"Sentience: State injection skipped: {e}")
+
 		self._message_manager.create_state_messages(
 			browser_state_summary=browser_state_summary,
 			model_output=self.state.last_model_output,

diff --git a/browser_use/browser/session.py b/browser_use/browser/session.py
@@ -1315,6 +1315,15 @@ async def get_browser_state_summary(
 			if include_screenshot and not self._cached_browser_state_summary.screenshot:
 				self.logger.debug('⚠️ Cached browser state has no screenshot, fetching fresh state with screenshot')
 				# Fall through to fetch fresh state with screenshot
+			elif not include_screenshot and self._cached_browser_state_summary.screenshot:
+				# Screenshot not requested but the cached state has one: return a copy with screenshot=None. NOTE(review): this returns before the selector_map check below — confirm intended.
+				from dataclasses import replace
+				cached_copy = replace(
+					self._cached_browser_state_summary,
+					screenshot=None,  # Remove screenshot when not requested
+				)
+				self.logger.debug('🔄 Using pre-cached browser state summary (screenshot removed per request)')
+				return cached_copy
 			elif selector_map and len(selector_map) > 0:
 				self.logger.debug('🔄 Using pre-cached browser state summary for open tab')
 				return self._cached_browser_state_summary

diff --git a/browser_use/integrations/sentience/__init__.py b/browser_use/integrations/sentience/__init__.py
@@ -0,0 +1,5 @@
+"""Sentience integration for browser-use Agent."""
+
+from .state_injector import build_sentience_state, format_snapshot_for_llm
+
+__all__ = ["build_sentience_state", "format_snapshot_for_llm"]