Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ HYPERBROWSER_API_KEY=
ONKERNEL_API_KEY=
REBROWSER_API_KEY=
STEEL_API_KEY=
DRIVER_API_KEY=
1 change: 1 addition & 0 deletions browsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ async def disconnect() -> None -- cleans up the session
"anchor",
"browserbase",
"browserless",
"driver",
"hyperbrowser",
"local_headful",
"local_headless",
Expand Down
43 changes: 43 additions & 0 deletions browsers/driver.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import os

import httpx

from browsers import retry_on_429

# IDs of sessions opened by connect() that disconnect() has not yet released.
_sessions: list[str] = []

# Base URL of the session proxy. The CDP_PROXY_URL environment variable
# overrides the default; trailing slashes are stripped so endpoint paths can
# be appended with a single "/".
CDP_PROXY_URL = os.getenv("CDP_PROXY_URL", "https://bu-compat.driver.dev").rstrip("/")


async def connect() -> str:
    """Open a hosted Driver session and return its CDP URL.

    Sends a POST to ``{CDP_PROXY_URL}/v1/proxy/session`` authenticated with
    the ``DRIVER_API_KEY`` environment variable. The request is issued
    through ``retry_on_429``. The ``sessionId`` from the response is appended
    to ``_sessions`` so that ``disconnect()`` can delete the session later.

    Returns:
        The ``cdpUrl`` field of the response's ``data`` object.

    Raises:
        KeyError: if ``DRIVER_API_KEY`` is unset or the response lacks the
            expected ``data`` / ``sessionId`` / ``cdpUrl`` keys.
    """

    async def _request_session():
        endpoint = f"{CDP_PROXY_URL}/v1/proxy/session"
        async with httpx.AsyncClient() as http:
            response = await http.post(
                endpoint,
                headers={"Authorization": f"Bearer {os.environ['DRIVER_API_KEY']}"},
                json={"captchaSolver": True, "type": "hosted", "country": "CA"},
                timeout=60,
            )
            # Surface HTTP errors as exceptions so the retry wrapper sees them.
            response.raise_for_status()
            return response.json()

    payload = await retry_on_429(_request_session)
    session = payload["data"]
    # Record the ID before returning so the session can be cleaned up.
    _sessions.append(session["sessionId"])
    return session["cdpUrl"]


async def disconnect() -> None:
    """Delete the most recently created Driver session, if there is one.

    Pops the last session ID from ``_sessions`` and sends a DELETE to
    ``{CDP_PROXY_URL}/v1/proxy/session/{session_id}``. Cleanup is best
    effort: this function never raises on a failed delete, but the failure
    (including a non-success HTTP status) is logged as a warning so that a
    session left running on the provider side does not go unnoticed.
    """
    # Local import keeps this block self-contained; the module's top-level
    # imports do not include logging.
    import logging

    if not _sessions:
        return
    session_id = _sessions.pop()
    try:
        async with httpx.AsyncClient() as client:
            resp = await client.delete(
                f"{CDP_PROXY_URL}/v1/proxy/session/{session_id}",
                headers={"Authorization": f"Bearer {os.environ['DRIVER_API_KEY']}"},
                timeout=30,
            )
            # A 4xx/5xx means the session was not deleted; route it to the
            # same warning path as transport errors.
            resp.raise_for_status()
    except Exception as exc:
        # Best effort cleanup: report the failure, never propagate it.
        logging.getLogger(__name__).warning(
            "Failed to delete Driver session %s: %s", session_id, exc
        )
70 changes: 56 additions & 14 deletions run_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@
uv run python run_eval.py # defaults: browser-use-cloud + bu-2-0
uv run python run_eval.py --browser anchor # use Anchor Browser provider
uv run python run_eval.py --browser local_headless # use local headless Chromium
uv run python run_eval.py --tasks 5 # run only 5 tasks
uv run python run_eval.py --tasks 5 # run first 5 tasks
uv run python run_eval.py --task-ids 2 5 12 14 # rerun specific task IDs
uv run python run_eval.py --task-ids 29-35 # rerun a range of task IDs

Available browsers: browser-use-cloud (default), anchor, browserbase,
browserless, hyperbrowser, local_headful, local_headless, onkernel,
rebrowser, steel
browserless, driver, hyperbrowser, local_headful, local_headless,
onkernel, rebrowser, steel
"""

# Fix for MacOS users using uv without SSL certificate setup
Expand Down Expand Up @@ -41,8 +43,11 @@

# Judge LLM - always use gemini-2.5-flash for consistent judging across all evaluations
JUDGE_LLM = ChatGoogle(model="gemini-2.5-flash", api_key=os.getenv("GOOGLE_API_KEY"))
TASKS_FILE = Path(__file__).parent / "BU_Bench_V1.enc"
MAX_CONCURRENT = 3
BENCH_NAMES = {
"bu": "BU_Bench_V1",
"stealth": "Stealth_Bench_V1",
}
MAX_CONCURRENT = 1
TASK_TIMEOUT = 1800 # 30 minutes max per task

AGENT_FRAMEWORK_NAME = "BrowserUse"
Expand All @@ -60,9 +65,10 @@ def encode_screenshots(paths: list[str]) -> list[str]:
return result


def load_tasks() -> list[dict]:
key = base64.urlsafe_b64encode(hashlib.sha256(b"BU_Bench_V1").digest())
encrypted = base64.b64decode(TASKS_FILE.read_text())
def load_tasks(bench_name: str) -> list[dict]:
    """Decrypt and parse the task list for the benchmark ``bench_name``.

    Reads ``<bench_name>.enc`` from the directory containing this file,
    base64-decodes its text, decrypts it with a Fernet key derived from the
    SHA-256 digest of the benchmark name, and returns the parsed JSON.
    """
    enc_path = Path(__file__).parent / f"{bench_name}.enc"
    # The Fernet key is the URL-safe base64 of SHA-256(bench_name).
    name_digest = hashlib.sha256(bench_name.encode()).digest()
    fernet_key = base64.urlsafe_b64encode(name_digest)
    token = base64.b64decode(enc_path.read_text())
    plaintext = Fernet(fernet_key).decrypt(token)
    return json.loads(plaintext)


Expand Down Expand Up @@ -211,7 +217,13 @@ async def run_task(


async def main():
parser = argparse.ArgumentParser(description="Run BU_Bench_V1 evaluation")
parser = argparse.ArgumentParser(description="Run benchmark evaluation")
parser.add_argument(
"--bench",
default="bu",
choices=list(BENCH_NAMES.keys()),
help="Benchmark to run: bu (BU_Bench_V1) or stealth (Stealth_Bench_V1) (default: bu)",
)
parser.add_argument(
"--browser",
default="browser-use-cloud",
Expand All @@ -222,10 +234,19 @@ async def main():
"--tasks",
type=int,
default=None,
help="Number of tasks to run (default: all)",
help="Number of tasks to run from the start (default: all)",
)
parser.add_argument(
"--task-ids",
nargs="+",
default=None,
help="Specific task IDs to run (e.g. 2 5 12 or 29-35 for ranges)",
)
args = parser.parse_args()

# Resolve bench name
bench_name = BENCH_NAMES[args.bench]

# Resolve browser provider (None = use native browser-use-cloud path)
browser_name = args.browser
if browser_name == "browser-use-cloud":
Expand All @@ -235,15 +256,36 @@ async def main():

# Build run key and paths
run_start = datetime.now().strftime("%Y%m%d_%H%M%S")
run_key = f"{AGENT_FRAMEWORK_NAME}_{AGENT_FRAMEWORK_VERSION}_browser_{browser_name}_model_{MODEL_NAME}"
run_key = f"{bench_name}_{AGENT_FRAMEWORK_NAME}_{AGENT_FRAMEWORK_VERSION}_browser_{browser_name}_model_{MODEL_NAME}"
run_data_dir = (
Path(__file__).parent / "run_data" / f"{run_key}_start_at_{run_start}"
)
results_file = Path(__file__).parent / "results" / f"{run_key}.json"

tasks = load_tasks()
if args.tasks:
tasks = tasks[: args.tasks]
all_tasks = load_tasks(bench_name)

# Filter tasks
if args.task_ids:
# Parse task IDs: supports individual IDs and ranges (e.g. "29-35")
selected_ids: set[str] = set()
for spec in args.task_ids:
if "-" in spec and not spec.startswith("-"):
start, end = spec.split("-", 1)
for i in range(int(start), int(end) + 1):
selected_ids.add(str(i))
else:
selected_ids.add(spec)
tasks = [t for t in all_tasks if str(t.get("task_id", "")) in selected_ids]
if not tasks:
print(f"No tasks matched IDs: {selected_ids}")
print(f"Available IDs: {[t.get('task_id') for t in all_tasks[:10]]}...")
return
print(f"Running {len(tasks)} selected task(s): {sorted(selected_ids, key=lambda x: int(x) if x.isdigit() else x)}")
elif args.tasks:
tasks = all_tasks[: args.tasks]
else:
tasks = all_tasks

sem = asyncio.Semaphore(MAX_CONCURRENT)
results = await asyncio.gather(
*[
Expand Down
Binary file modified stealth_bench/official_plots/accuracy_by_browser_dark.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified stealth_bench/official_plots/accuracy_by_browser_light.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified stealth_bench/official_plots/category_heatmap_dark.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified stealth_bench/official_plots/category_heatmap_light.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
[
{
"run_start": "20260326_160529",
"tasks_completed": 80,
"tasks_successful": 69,
"total_steps": 1436,
"total_duration": 29982.2,
"total_cost": 0.0,
"tasks_successful_by_category": {
"Cloudflare": 17,
"reCaptcha": 5,
"PerimeterX": 17,
"Datadome": 11,
"hCaptcha": 3,
"GeeTest": 3,
"Akamai": 6,
"Shape": 1,
"Kasada": 1,
"Custom Antibot": 5
},
"tasks_total_by_category": {
"Cloudflare": 22,
"reCaptcha": 6,
"PerimeterX": 18,
"Datadome": 13,
"hCaptcha": 3,
"GeeTest": 4,
"Akamai": 6,
"Shape": 1,
"Kasada": 1,
"Temu Slider": 1,
"Custom Antibot": 5
}
}
]