Skip to content
Open
54 changes: 47 additions & 7 deletions src/crawlee/browsers/_playwright_browser_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,15 @@

from __future__ import annotations

import inspect
from asyncio import Lock
from datetime import datetime, timedelta, timezone
from functools import lru_cache
from typing import TYPE_CHECKING, Any, cast

from browserforge.injectors.playwright import AsyncNewContext
from playwright.async_api import Browser, BrowserContext, Page, ProxySettings
from playwright.async_api import BrowserType as PlaywrightBrowserType
from typing_extensions import override

from crawlee._utils.docs import docs_group
Expand All @@ -28,6 +31,18 @@
logger = getLogger(__name__)


# Cache Playwright signatures to avoid overhead in critical path
@lru_cache(maxsize=1)
def _get_context_params_cache() -> dict[str, set[str]]:
launch_persistent_params = set(inspect.signature(PlaywrightBrowserType.launch_persistent_context).parameters)
new_context_params = set(inspect.signature(Browser.new_context).parameters)
return {
'common': launch_persistent_params & new_context_params,
'persistent_unique': launch_persistent_params - new_context_params,
'incognito_unique': new_context_params - launch_persistent_params,
}


@docs_group('Browser management')
class PlaywrightBrowserController(BrowserController):
"""Controller for managing Playwright browser instances and their pages.
Expand Down Expand Up @@ -222,11 +237,38 @@ async def _create_browser_context(
`self._fingerprint_generator` is available.
"""
browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {}

params_cache = _get_context_params_cache()

filtered_options = {}
for key, value in browser_new_context_options.items():
if self._use_incognito_pages:
# Incognito mode (new_context)
if key in params_cache['common'] or key in params_cache['incognito_unique']:
filtered_options[key] = value
elif key in params_cache['persistent_unique']:
logger.warning(
f'Option "{key}" is only supported in persistent context mode '
'(use_incognito_pages=False) and will be ignored.'
)
else:
raise TypeError(f'"{key}" is not a valid Playwright context option.')
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do not think we need to raise here; it is better for the Playwright code to raise, so that anyone can see the code where the arguments are defined.

It will be sufficient to filter out the arguments valid in the other case and warn for those, while letting the completely wrong arguments go through, and let them fail in Playwright. So it can be simplified to something like

if self._use_incognito_pages and key in params_cache['persistent_unique']:
    logger.warning(
        f'Option "{key}" is only supported in persistent context mode '
        '(use_incognito_pages=False) and will be ignored.'
    )
elif not self._use_incognito_pages and key in params_cache['incognito_unique']:
    logger.warning(
        f'Option "{key}" is only supported in incognito context mode '
        '(use_incognito_pages=True) and will be ignored.'
    )
else:
    filtered_options[key] = value

elif key in params_cache['common'] or key in params_cache['persistent_unique']:
# Persistent mode (launch_persistent_context)
filtered_options[key] = value
elif key in params_cache['incognito_unique']:
logger.warning(
f'Option "{key}" is only supported in incognito context mode '
'(use_incognito_pages=True) and will be ignored.'
)
else:
raise TypeError(f'"{key}" is not a valid Playwright context option.')
Comment on lines +243 to +265
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please extract to a standalone private method with docstring explaining it.
filtered_options = self._filter_new_context_options(options=browser_new_context_options)


if proxy_info:
if browser_new_context_options.get('proxy'):
if filtered_options.get('proxy'):
logger.warning("browser_new_context_options['proxy'] overridden by explicit `proxy_info` argument.")

browser_new_context_options['proxy'] = ProxySettings(
filtered_options['proxy'] = ProxySettings(
server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',
username=proxy_info.username,
password=proxy_info.password,
Expand All @@ -236,7 +278,7 @@ async def _create_browser_context(
return await AsyncNewContext(
browser=self._browser,
fingerprint=self._fingerprint_generator.generate(),
**browser_new_context_options,
**filtered_options,
)

if self._header_generator:
Expand All @@ -256,7 +298,5 @@ async def _create_browser_context(
else:
extra_http_headers = None

browser_new_context_options['extra_http_headers'] = browser_new_context_options.get(
'extra_http_headers', extra_http_headers
)
return await self._browser.new_context(**browser_new_context_options)
filtered_options['extra_http_headers'] = filtered_options.get('extra_http_headers', extra_http_headers)
return await self._browser.new_context(**filtered_options)
62 changes: 62 additions & 0 deletions tests/unit/browsers/test_playwright_controller_validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from __future__ import annotations

import logging
from typing import TYPE_CHECKING

import pytest
from playwright.async_api import Browser, Playwright, async_playwright

from crawlee.browsers import PlaywrightBrowserController

if TYPE_CHECKING:
from collections.abc import AsyncGenerator


@pytest.fixture
async def playwright() -> AsyncGenerator[Playwright, None]:
async with async_playwright() as playwright:
yield playwright


@pytest.fixture
async def browser(playwright: Playwright) -> AsyncGenerator[Browser, None]:
browser = await playwright.chromium.launch()
yield browser
await browser.close()


async def test_controller_validation_typo(browser: Browser) -> None:
controller = PlaywrightBrowserController(browser)
with pytest.raises(TypeError, match=r'"headles" is not a valid Playwright context option.'):
await controller.new_page(browser_new_context_options={'headles': True})
await controller.close()


async def test_controller_validation_cross_mode_persistent(browser: Browser, caplog: pytest.LogCaptureFixture) -> None:
# Default is persistent mode (use_incognito_pages=False)
controller = PlaywrightBrowserController(browser, use_incognito_pages=False)
# storage_state is incognito-only
with caplog.at_level(logging.WARNING):
page = await controller.new_page(browser_new_context_options={'storage_state': {'cookies': [], 'origins': []}})
assert 'Option "storage_state" is only supported in incognito context mode' in caplog.text
await page.close()
await controller.close()


async def test_controller_validation_cross_mode_incognito(browser: Browser, caplog: pytest.LogCaptureFixture) -> None:
controller = PlaywrightBrowserController(browser, use_incognito_pages=True)
# env is persistent-only
with caplog.at_level(logging.WARNING):
page = await controller.new_page(browser_new_context_options={'env': {}})
assert 'Option "env" is only supported in persistent context mode' in caplog.text
await page.close()
await controller.close()


async def test_controller_validation_valid_common(browser: Browser) -> None:
controller = PlaywrightBrowserController(browser)
# viewport is common
page = await controller.new_page(browser_new_context_options={'viewport': {'width': 800, 'height': 600}})
assert page.viewport_size == {'width': 800, 'height': 600}
await page.close()
await controller.close()