Skip to content

Commit 880b144

Browse files
committed
refactor(browser fetchers): Make all the type hints dynamic + Faster validation
+ Also renamed `custom_config` to `selector_config` so it matches the session class.
1 parent 8940bde commit 880b144

File tree

2 files changed

+142
-358
lines changed

2 files changed

+142
-358
lines changed

scrapling/fetchers/chrome.py

Lines changed: 70 additions & 176 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,5 @@
1-
from scrapling.core._types import (
2-
Callable,
3-
List,
4-
Dict,
5-
Optional,
6-
SelectorWaitStates,
7-
)
1+
from scrapling.core._types import Unpack
2+
from scrapling.engines._browsers._types import PlaywrightSession
83
from scrapling.engines.toolbelt.custom import BaseFetcher, Response
94
from scrapling.engines._browsers._controllers import DynamicSession, AsyncDynamicSession
105

@@ -26,190 +21,89 @@ class DynamicFetcher(BaseFetcher):
2621
"""
2722

2823
@classmethod
29-
def fetch(
30-
cls,
31-
url: str,
32-
headless: bool = True,
33-
google_search: bool = True,
34-
hide_canvas: bool = False,
35-
disable_webgl: bool = False,
36-
real_chrome: bool = False,
37-
stealth: bool = False,
38-
wait: int | float = 0,
39-
page_action: Optional[Callable] = None,
40-
proxy: Optional[str | Dict[str, str]] = None,
41-
locale: str = "en-US",
42-
extra_headers: Optional[Dict[str, str]] = None,
43-
useragent: Optional[str] = None,
44-
cdp_url: Optional[str] = None,
45-
timeout: int | float = 30000,
46-
disable_resources: bool = False,
47-
wait_selector: Optional[str] = None,
48-
init_script: Optional[str] = None,
49-
cookies: Optional[List[Dict]] = None,
50-
network_idle: bool = False,
51-
load_dom: bool = True,
52-
wait_selector_state: SelectorWaitStates = "attached",
53-
extra_flags: Optional[List[str]] = None,
54-
additional_args: Optional[Dict] = None,
55-
custom_config: Optional[Dict] = None,
56-
) -> Response:
24+
def fetch(cls, url: str, **kwargs: Unpack[PlaywrightSession]) -> Response:
5725
"""Opens up a browser and does your request based on your chosen options below.
5826
5927
:param url: Target url.
60-
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
61-
:param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
62-
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
63-
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
64-
:param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
65-
:param cookies: Set cookies for the next request.
66-
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
67-
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
68-
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
69-
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
70-
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
71-
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
72-
:param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
73-
:param locale: Set the locale for the browser if wanted. The default value is `en-US`.
74-
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
75-
:param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
76-
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
77-
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
78-
:param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
79-
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
80-
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
81-
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
82-
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
83-
:param extra_flags: A list of additional browser flags to pass to the browser on launch.
84-
:param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
85-
:param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
28+
:param kwargs: Browser session configuration options including:
29+
- headless: Run the browser in headless/hidden (default), or headful/visible mode.
30+
- disable_resources: Drop requests of unnecessary resources for a speed boost.
31+
- useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
32+
- cookies: Set cookies for the next request.
33+
- network_idle: Wait for the page until there are no network connections for at least 500 ms.
34+
- load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
35+
- timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
36+
- wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the Response object.
37+
- page_action: Added for automation. A function that takes the `page` object and does the automation you need.
38+
- wait_selector: Wait for a specific CSS selector to be in a specific state.
39+
- init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
40+
- locale: Set the locale for the browser if wanted. The default value is `en-US`.
41+
- wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
42+
- stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
43+
- real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
44+
- hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
45+
- disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
46+
- cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
47+
- google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
48+
- extra_headers: A dictionary of extra headers to add to the request.
49+
- proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
50+
- extra_flags: A list of additional browser flags to pass to the browser on launch.
51+
- selector_config: The arguments that will be passed in the end while creating the final Selector's class.
52+
- additional_args: Additional arguments to be passed to Playwright's context as additional settings.
8653
:return: A `Response` object.
8754
"""
88-
if not custom_config:
89-
custom_config = {}
90-
elif not isinstance(custom_config, dict):
91-
raise ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
55+
# Get selector_config from kwargs if provided, otherwise use empty dict
56+
selector_config = kwargs.get("selector_config", {})
57+
if not isinstance(selector_config, dict):
58+
raise TypeError("Argument `selector_config` must be a dictionary.")
9259

93-
with DynamicSession(
94-
wait=wait,
95-
proxy=proxy,
96-
locale=locale,
97-
timeout=timeout,
98-
stealth=stealth,
99-
cdp_url=cdp_url,
100-
cookies=cookies,
101-
headless=headless,
102-
load_dom=load_dom,
103-
useragent=useragent,
104-
real_chrome=real_chrome,
105-
page_action=page_action,
106-
hide_canvas=hide_canvas,
107-
init_script=init_script,
108-
network_idle=network_idle,
109-
google_search=google_search,
110-
extra_headers=extra_headers,
111-
wait_selector=wait_selector,
112-
disable_webgl=disable_webgl,
113-
extra_flags=extra_flags,
114-
additional_args=additional_args,
115-
disable_resources=disable_resources,
116-
wait_selector_state=wait_selector_state,
117-
selector_config={**cls._generate_parser_arguments(), **custom_config},
118-
) as session:
60+
# Merge selector_config with class defaults
61+
kwargs["selector_config"] = {**cls._generate_parser_arguments(), **selector_config}
62+
63+
with DynamicSession(**kwargs) as session:
11964
return session.fetch(url)
12065

12166
@classmethod
122-
async def async_fetch(
123-
cls,
124-
url: str,
125-
headless: bool = True,
126-
google_search: bool = True,
127-
hide_canvas: bool = False,
128-
disable_webgl: bool = False,
129-
real_chrome: bool = False,
130-
stealth: bool = False,
131-
wait: int | float = 0,
132-
page_action: Optional[Callable] = None,
133-
proxy: Optional[str | Dict[str, str]] = None,
134-
locale: str = "en-US",
135-
extra_headers: Optional[Dict[str, str]] = None,
136-
useragent: Optional[str] = None,
137-
cdp_url: Optional[str] = None,
138-
timeout: int | float = 30000,
139-
disable_resources: bool = False,
140-
wait_selector: Optional[str] = None,
141-
init_script: Optional[str] = None,
142-
cookies: Optional[List[Dict]] = None,
143-
network_idle: bool = False,
144-
load_dom: bool = True,
145-
wait_selector_state: SelectorWaitStates = "attached",
146-
extra_flags: Optional[List[str]] = None,
147-
additional_args: Optional[Dict] = None,
148-
custom_config: Optional[Dict] = None,
149-
) -> Response:
67+
async def async_fetch(cls, url: str, **kwargs: Unpack[PlaywrightSession]) -> Response:
15068
"""Opens up a browser and does your request based on your chosen options below.
15169
15270
:param url: Target url.
153-
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
154-
:param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
155-
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
156-
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
157-
:param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
158-
:param cookies: Set cookies for the next request.
159-
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
160-
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
161-
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
162-
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
163-
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
164-
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
165-
:param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
166-
:param locale: Set the locale for the browser if wanted. The default value is `en-US`.
167-
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
168-
:param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
169-
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
170-
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
171-
:param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
172-
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
173-
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
174-
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
175-
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
176-
:param extra_flags: A list of additional browser flags to pass to the browser on launch.
177-
:param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
178-
:param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
71+
:param kwargs: Browser session configuration options including:
72+
- headless: Run the browser in headless/hidden (default), or headful/visible mode.
73+
- disable_resources: Drop requests of unnecessary resources for a speed boost.
74+
- useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
75+
- cookies: Set cookies for the next request.
76+
- network_idle: Wait for the page until there are no network connections for at least 500 ms.
77+
- load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
78+
- timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
79+
- wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the Response object.
80+
- page_action: Added for automation. A function that takes the `page` object and does the automation you need.
81+
- wait_selector: Wait for a specific CSS selector to be in a specific state.
82+
- init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
83+
- locale: Set the locale for the browser if wanted. The default value is `en-US`.
84+
- wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
85+
- stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
86+
- real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
87+
- hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
88+
- disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
89+
- cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
90+
- google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
91+
- extra_headers: A dictionary of extra headers to add to the request.
92+
- proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
93+
- extra_flags: A list of additional browser flags to pass to the browser on launch.
94+
- selector_config: The arguments that will be passed in the end while creating the final Selector's class.
95+
- additional_args: Additional arguments to be passed to Playwright's context as additional settings.
17996
:return: A `Response` object.
18097
"""
181-
if not custom_config:
182-
custom_config = {}
183-
elif not isinstance(custom_config, dict):
184-
raise ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
98+
# Get selector_config from kwargs if provided, otherwise use empty dict
99+
selector_config = kwargs.get("selector_config", {})
100+
if not isinstance(selector_config, dict):
101+
raise TypeError("Argument `selector_config` must be a dictionary.")
102+
103+
# Merge selector_config with class defaults
104+
kwargs["selector_config"] = {**cls._generate_parser_arguments(), **selector_config}
185105

186-
async with AsyncDynamicSession(
187-
wait=wait,
188-
max_pages=1,
189-
proxy=proxy,
190-
locale=locale,
191-
timeout=timeout,
192-
stealth=stealth,
193-
cdp_url=cdp_url,
194-
cookies=cookies,
195-
headless=headless,
196-
load_dom=load_dom,
197-
useragent=useragent,
198-
real_chrome=real_chrome,
199-
page_action=page_action,
200-
hide_canvas=hide_canvas,
201-
init_script=init_script,
202-
network_idle=network_idle,
203-
google_search=google_search,
204-
extra_headers=extra_headers,
205-
wait_selector=wait_selector,
206-
disable_webgl=disable_webgl,
207-
extra_flags=extra_flags,
208-
additional_args=additional_args,
209-
disable_resources=disable_resources,
210-
wait_selector_state=wait_selector_state,
211-
selector_config={**cls._generate_parser_arguments(), **custom_config},
212-
) as session:
106+
async with AsyncDynamicSession(**kwargs) as session:
213107
return await session.fetch(url)
214108

215109

0 commit comments

Comments
 (0)