browser-use · Brenden2008 · Mar 28, 2026
diff --git a/.env.example b/.env.example
@@ -10,3 +10,4 @@ HYPERBROWSER_API_KEY=
 ONKERNEL_API_KEY=
 REBROWSER_API_KEY=
 STEEL_API_KEY=
+DRIVER_API_KEY=
diff --git a/browsers/__init__.py b/browsers/__init__.py
@@ -21,6 +21,7 @@ async def disconnect() -> None -- cleans up the session
     "anchor",
     "browserbase",
     "browserless",
+    "driver",
     "hyperbrowser",
     "local_headful",
     "local_headless",

diff --git a/browsers/driver.py b/browsers/driver.py
@@ -0,0 +1,85 @@
+"""Browser provider backed by the Driver CDP proxy API.
+
+connect() creates a remote session and returns its CDP URL; disconnect()
+deletes the most recently created session.
+"""
+
+import logging
+import os
+
+import httpx
+
+from browsers import retry_on_429
+
+logger = logging.getLogger(__name__)
+
+# IDs of sessions created by connect() and not yet deleted by disconnect().
+_sessions: list[str] = []
+
+# Base URL of the proxy API, overridable through the CDP_PROXY_URL env var.
+# Trailing slashes are stripped so request paths can be appended with "/".
+CDP_PROXY_URL = os.environ.get(
+    "CDP_PROXY_URL", "https://bu-compat.driver.dev"
+).rstrip("/")
+
+
+def _auth_headers() -> dict[str, str]:
+    """Return the bearer-token header; raises KeyError if DRIVER_API_KEY is unset."""
+    return {"Authorization": f"Bearer {os.environ['DRIVER_API_KEY']}"}
+
+
+async def connect() -> str:
+    """Create a hosted session and return its CDP URL.
+
+    The new session ID is recorded in ``_sessions`` so that disconnect() can
+    delete it later.
+
+    Raises:
+        KeyError: if DRIVER_API_KEY is unset or the response lacks the
+            expected ``data.sessionId`` / ``data.cdpUrl`` fields.
+    """
+
+    async def _create():
+        # A client is created per call, so each attempt made through
+        # retry_on_429 is independent of the previous one.
+        async with httpx.AsyncClient() as client:
+            resp = await client.post(
+                f"{CDP_PROXY_URL}/v1/proxy/session",
+                headers=_auth_headers(),
+                json={"captchaSolver": True, "type": "hosted", "country": "CA"},
+                timeout=60,
+            )
+            # Surface non-2xx responses as exceptions; presumably
+            # retry_on_429 inspects these to retry on HTTP 429 (the helper
+            # lives in the browsers package and is not shown here).
+            resp.raise_for_status()
+            return resp.json()
+
+    data = await retry_on_429(_create)
+    session = data["data"]
+    _sessions.append(session["sessionId"])
+    return session["cdpUrl"]
+
+
+async def disconnect() -> None:
+    """Delete the most recently created session, if any.
+
+    Cleanup is best effort: failures are logged, never raised.
+    """
+    if not _sessions:
+        return
+    # NOTE(review): pops the newest session (LIFO); if several sessions are
+    # open concurrently this may not be the caller's own one - confirm.
+    session_id = _sessions.pop()
+    try:
+        async with httpx.AsyncClient() as client:
+            resp = await client.delete(
+                f"{CDP_PROXY_URL}/v1/proxy/session/{session_id}",
+                headers=_auth_headers(),
+                timeout=30,
+            )
+            # Without this a 4xx/5xx reply would look like a clean delete.
+            resp.raise_for_status()
+    except Exception as exc:
+        # Keep cleanup non-fatal, but leave a trace of sessions that may
+        # still be alive on the provider side.
+        logger.warning("Failed to delete Driver session %s: %s", session_id, exc)
diff --git a/run_eval.py b/run_eval.py
@@ -4,11 +4,13 @@
     uv run python run_eval.py                              # defaults: browser-use-cloud + bu-2-0
     uv run python run_eval.py --browser anchor             # use Anchor Browser provider
     uv run python run_eval.py --browser local_headless     # use local headless Chromium
-    uv run python run_eval.py --tasks 5                    # run only 5 tasks
+    uv run python run_eval.py --tasks 5                    # run first 5 tasks
+    uv run python run_eval.py --task-ids 2 5 12 14         # rerun specific task IDs
+    uv run python run_eval.py --task-ids 29-35             # rerun a range of task IDs
 
 Available browsers: browser-use-cloud (default), anchor, browserbase,
-    browserless, hyperbrowser, local_headful, local_headless, onkernel,
-    rebrowser, steel
+    browserless, driver, hyperbrowser, local_headful, local_headless,
+    onkernel, rebrowser, steel
 """
 
 # Fix for MacOS users using uv without SSL certificate setup
@@ -41,8 +43,11 @@
 
 # Judge LLM - always use gemini-2.5-flash for consistent judging across all evaluations
 JUDGE_LLM = ChatGoogle(model="gemini-2.5-flash", api_key=os.getenv("GOOGLE_API_KEY"))
-TASKS_FILE = Path(__file__).parent / "BU_Bench_V1.enc"
-MAX_CONCURRENT = 3
+BENCH_NAMES = {
+    "bu": "BU_Bench_V1",
+    "stealth": "Stealth_Bench_V1",
+}
+MAX_CONCURRENT = 1
 TASK_TIMEOUT = 1800  # 30 minutes max per task
 
 AGENT_FRAMEWORK_NAME = "BrowserUse"
@@ -60,9 +65,10 @@ def encode_screenshots(paths: list[str]) -> list[str]:
     return result
 
 
-def load_tasks() -> list[dict]:
-    key = base64.urlsafe_b64encode(hashlib.sha256(b"BU_Bench_V1").digest())
-    encrypted = base64.b64decode(TASKS_FILE.read_text())
+def load_tasks(bench_name: str) -> list[dict]:  # decrypt and parse "<bench_name>.enc"
+    tasks_file = Path(__file__).parent / f"{bench_name}.enc"  # file sits next to this script
+    key = base64.urlsafe_b64encode(hashlib.sha256(bench_name.encode()).digest())  # Fernet key: url-safe base64 of SHA-256(bench_name)
+    encrypted = base64.b64decode(tasks_file.read_text())  # file content is base64 text
     return json.loads(Fernet(key).decrypt(encrypted))
 
 
@@ -211,7 +217,13 @@ async def run_task(
 
 
 async def main():
-    parser = argparse.ArgumentParser(description="Run BU_Bench_V1 evaluation")
+    parser = argparse.ArgumentParser(description="Run benchmark evaluation")
+    parser.add_argument(
+        "--bench",
+        default="bu",
+        choices=list(BENCH_NAMES.keys()),
+        help="Benchmark to run: bu (BU_Bench_V1) or stealth (Stealth_Bench_V1) (default: bu)",
+    )
     parser.add_argument(
         "--browser",
         default="browser-use-cloud",
@@ -222,10 +234,19 @@ async def main():
         "--tasks",
         type=int,
         default=None,
-        help="Number of tasks to run (default: all)",
+        help="Number of tasks to run from the start (default: all)",
+    )
+    parser.add_argument(
+        "--task-ids",
+        nargs="+",
+        default=None,
+        help="Specific task IDs to run (e.g. 2 5 12 or 29-35 for ranges)",
     )
     args = parser.parse_args()
 
+    # Resolve bench name
+    bench_name = BENCH_NAMES[args.bench]
+
     # Resolve browser provider (None = use native browser-use-cloud path)
     browser_name = args.browser
     if browser_name == "browser-use-cloud":
@@ -235,15 +256,44 @@ async def main():
 
     # Build run key and paths
     run_start = datetime.now().strftime("%Y%m%d_%H%M%S")
-    run_key = f"{AGENT_FRAMEWORK_NAME}_{AGENT_FRAMEWORK_VERSION}_browser_{browser_name}_model_{MODEL_NAME}"
+    run_key = f"{bench_name}_{AGENT_FRAMEWORK_NAME}_{AGENT_FRAMEWORK_VERSION}_browser_{browser_name}_model_{MODEL_NAME}"
     run_data_dir = (
         Path(__file__).parent / "run_data" / f"{run_key}_start_at_{run_start}"
     )
     results_file = Path(__file__).parent / "results" / f"{run_key}.json"
 
-    tasks = load_tasks()
-    if args.tasks:
-        tasks = tasks[: args.tasks]
+    all_tasks = load_tasks(bench_name)
+
+    # Filter tasks: explicit --task-ids take precedence over --tasks N
+    if args.task_ids:
+        # Each spec is either a single ID or an inclusive numeric range
+        # "A-B". Anything that is not a decimal range (for example an ID
+        # that itself contains "-") is kept as a literal ID instead of
+        # crashing on int().
+        selected_ids: set[str] = set()
+        for spec in args.task_ids:
+            start, sep, end = spec.partition("-")
+            if sep and start.isdecimal() and end.isdecimal():
+                selected_ids.update(str(i) for i in range(int(start), int(end) + 1))
+            else:
+                selected_ids.add(spec)
+        tasks = [t for t in all_tasks if str(t.get("task_id", "")) in selected_ids]
+        if not tasks:
+            print(f"No tasks matched IDs: {selected_ids}")
+            print(f"Available IDs: {[t.get('task_id') for t in all_tasks[:10]]}...")
+            return
+        # Numeric IDs sort numerically and come first; the tuple key keeps
+        # ints and strs from ever being compared with each other.
+        ordered_ids = sorted(
+            selected_ids,
+            key=lambda x: (0, int(x), "") if x.isdecimal() else (1, 0, x),
+        )
+        print(f"Running {len(tasks)} selected task(s): {ordered_ids}")
+    elif args.tasks:
+        tasks = all_tasks[: args.tasks]
+    else:
+        tasks = all_tasks
+
     sem = asyncio.Semaphore(MAX_CONCURRENT)
     results = await asyncio.gather(
         *[

diff --git a/stealth_bench/official_plots/accuracy_by_browser_dark.png b/stealth_bench/official_plots/accuracy_by_browser_dark.png
diff --git a/stealth_bench/official_plots/accuracy_by_browser_dark_old.png b/stealth_bench/official_plots/accuracy_by_browser_dark_old.png
diff --git a/stealth_bench/official_plots/accuracy_by_browser_light.png b/stealth_bench/official_plots/accuracy_by_browser_light.png
diff --git a/stealth_bench/official_plots/accuracy_by_browser_light_old.png b/stealth_bench/official_plots/accuracy_by_browser_light_old.png
diff --git a/stealth_bench/official_plots/category_heatmap_dark.png b/stealth_bench/official_plots/category_heatmap_dark.png
diff --git a/stealth_bench/official_plots/category_heatmap_dark_old.png b/stealth_bench/official_plots/category_heatmap_dark_old.png
diff --git a/stealth_bench/official_plots/category_heatmap_light.png b/stealth_bench/official_plots/category_heatmap_light.png
diff --git a/stealth_bench/official_plots/category_heatmap_light_old.png b/stealth_bench/official_plots/category_heatmap_light_old.png
diff --git a/stealth_bench/official_results/Stealth_Bench_V1_browser_driver_model_bu-2-0.json b/stealth_bench/official_results/Stealth_Bench_V1_browser_driver_model_bu-2-0.json
@@ -0,0 +1,35 @@
+[
+  {
+    "run_start": "20260326_160529",
+    "tasks_completed": 80,
+    "tasks_successful": 69,
+    "total_steps": 1436,
+    "total_duration": 29982.2,
+    "total_cost": 0.0,
+    "tasks_successful_by_category": {
+      "Cloudflare": 17,
+      "reCaptcha": 5,
+      "PerimeterX": 17,
+      "Datadome": 11,
+      "hCaptcha": 3,
+      "GeeTest": 3,
+      "Akamai": 6,
+      "Shape": 1,
+      "Kasada": 1,
+      "Custom Antibot": 5
+    },
+    "tasks_total_by_category": {
+      "Cloudflare": 22,
+      "reCaptcha": 6,
+      "PerimeterX": 18,
+      "Datadome": 13,
+      "hCaptcha": 3,
+      "GeeTest": 4,
+      "Akamai": 6,
+      "Shape": 1,
+      "Kasada": 1,
+      "Temu Slider": 1,
+      "Custom Antibot": 5
+    }
+  }
+]