9 changes: 9 additions & 0 deletions CHANGELOG.md
@@ -8,6 +8,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]

### Added
- **⏱️ Crawl-delay Directive Support**: New `respect_crawl_delay` configuration parameter (usage sketch below this changelog excerpt)
- Honors `Crawl-delay` directives from robots.txt files
- Automatically waits the specified delay between requests to the same domain
- Per-domain crawl-delay caching for efficiency
- Shared HTTP session with connection pooling for robots.txt fetching
- Race-condition safe domain initialization with asyncio locks
- Works with `arun_many()` for batch crawling scenarios
- Fully backward compatible with opt-in flag (default: `False`)

- **🔒 HTTPS Preservation for Internal Links**: New `preserve_https_for_internal_links` configuration flag
- Maintains HTTPS scheme for internal links even when servers redirect to HTTP
- Prevents security downgrades during deep crawling
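To illustrate the Crawl-delay support described in the changelog entry above, here is a minimal usage sketch. It assumes the public `AsyncWebCrawler`/`CrawlerRunConfig` API shown in the docs changes later in this diff; the URLs are placeholders.

```python
import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def main():
    config = CrawlerRunConfig(
        check_robots_txt=True,     # required alongside respect_crawl_delay (per the docstring)
        respect_crawl_delay=True,  # opt in to honoring Crawl-delay directives
    )
    async with AsyncWebCrawler() as crawler:
        # With arun_many(), requests to the same domain are spaced by its Crawl-delay.
        results = await crawler.arun_many(
            ["https://example.com/page-1", "https://example.com/page-2"],  # placeholders
            config=config,
        )
        for result in results:
            print(result.url, result.success)

asyncio.run(main())
```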
8 changes: 8 additions & 0 deletions crawl4ai/async_configs.py
@@ -1136,6 +1136,10 @@ class CrawlerRunConfig():

check_robots_txt (bool): Whether to check robots.txt rules before crawling.
Default: False.
respect_crawl_delay (bool): Whether to respect Crawl-delay directives from robots.txt.
When True, the crawler will wait the specified delay between
requests to the same domain. Requires check_robots_txt=True.
Default: False.
user_agent (str): Custom User-Agent string to use.
Default: None.
user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use the provided user_agent as-is.
@@ -1247,6 +1251,7 @@ def __init__(
stream: bool = False,
url: str = None,
check_robots_txt: bool = False,
respect_crawl_delay: bool = False,
user_agent: str = None,
user_agent_mode: str = None,
user_agent_generator_config: dict = {},
@@ -1375,6 +1380,7 @@ def __init__(

# Robots.txt Handling Parameters
self.check_robots_txt = check_robots_txt
self.respect_crawl_delay = respect_crawl_delay

# User Agent Parameters
self.user_agent = user_agent
@@ -1644,6 +1650,7 @@ def from_kwargs(kwargs: dict) -> "CrawlerRunConfig":
method=kwargs.get("method", "GET"),
stream=kwargs.get("stream", False),
check_robots_txt=kwargs.get("check_robots_txt", False),
respect_crawl_delay=kwargs.get("respect_crawl_delay", False),
user_agent=kwargs.get("user_agent"),
user_agent_mode=kwargs.get("user_agent_mode"),
user_agent_generator_config=kwargs.get("user_agent_generator_config", {}),
@@ -1748,6 +1755,7 @@ def to_dict(self):
"method": self.method,
"stream": self.stream,
"check_robots_txt": self.check_robots_txt,
"respect_crawl_delay": self.respect_crawl_delay,
"user_agent": self.user_agent,
"user_agent_mode": self.user_agent_mode,
"user_agent_generator_config": self.user_agent_generator_config,
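A small sketch of how the new field rides along with the serialization helpers touched above (`from_kwargs()` and `to_dict()`); the key names are taken directly from this diff:

```python
from crawl4ai import CrawlerRunConfig

# Build a config from a plain dict, mirroring from_kwargs() above.
cfg = CrawlerRunConfig.from_kwargs({
    "check_robots_txt": True,
    "respect_crawl_delay": True,
})

# to_dict() now includes the flag, so dumps and clones preserve it.
assert cfg.respect_crawl_delay is True
assert cfg.to_dict()["respect_crawl_delay"] is True
```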
71 changes: 61 additions & 10 deletions crawl4ai/async_dispatcher.py
@@ -1,4 +1,4 @@
from typing import Dict, Optional, List, Tuple, Union
from typing import Dict, Optional, List, Tuple, Union, TYPE_CHECKING
from .async_configs import CrawlerRunConfig
from .models import (
CrawlResult,
@@ -11,6 +11,9 @@

from .types import AsyncWebCrawler

if TYPE_CHECKING:
from .utils import RobotsParser

from collections.abc import AsyncGenerator

import time
@@ -32,39 +35,87 @@ def __init__(
max_delay: float = 60.0,
max_retries: int = 3,
rate_limit_codes: List[int] = None,
robots_parser: Optional["RobotsParser"] = None,
respect_crawl_delay: bool = False,
default_user_agent: str = "*",
):
self.base_delay = base_delay
self.max_delay = max_delay
self.max_retries = max_retries
self.rate_limit_codes = rate_limit_codes or [429, 503]
self.domains: Dict[str, DomainState] = {}
self.robots_parser = robots_parser
self.respect_crawl_delay = respect_crawl_delay
self.default_user_agent = default_user_agent
# Lock to prevent race conditions when initializing new domains
self._domain_init_lock = asyncio.Lock()

def get_domain(self, url: str) -> str:
return urlparse(url).netloc

async def _get_crawl_delay_for_domain(self, url: str) -> Optional[float]:
"""Fetch and cache crawl-delay for a domain from robots.txt."""
if not self.robots_parser or not self.respect_crawl_delay:
return None

domain = self.get_domain(url)
state = self.domains.get(domain)

# If we already have crawl_delay cached for this domain, return it
if state and state.crawl_delay is not None:
return state.crawl_delay

# Fetch crawl-delay from robots.txt
try:
delay = await self.robots_parser.get_crawl_delay(url, self.default_user_agent)
return delay
except Exception:
return None

async def wait_if_needed(self, url: str) -> None:
domain = self.get_domain(url)
state = self.domains.get(domain)

# Initialize new domain with lock to prevent race conditions
if not state:
self.domains[domain] = DomainState()
state = self.domains[domain]
async with self._domain_init_lock:
# Double-check after acquiring lock
state = self.domains.get(domain)
if not state:
state = DomainState()

# Fetch crawl-delay before adding to domains dict
if self.robots_parser and self.respect_crawl_delay:
crawl_delay = await self._get_crawl_delay_for_domain(url)
state.crawl_delay = crawl_delay

self.domains[domain] = state

now = time.time()

# Determine the effective delay - use crawl_delay if specified, otherwise current_delay
effective_delay = state.current_delay
if state.crawl_delay is not None and state.crawl_delay > 0:
# Use the larger of crawl_delay and current_delay (which may be increased due to rate limiting)
effective_delay = max(state.crawl_delay, state.current_delay)

if state.last_request_time:
wait_time = max(0, state.current_delay - (now - state.last_request_time))
wait_time = max(0, effective_delay - (now - state.last_request_time))
if wait_time > 0:
await asyncio.sleep(wait_time)

# Random delay within base range if no current delay
if state.current_delay == 0:
# Random delay within base range if no current delay and no crawl_delay
if state.current_delay == 0 and (state.crawl_delay is None or state.crawl_delay == 0):
state.current_delay = random.uniform(*self.base_delay)

state.last_request_time = time.time()

def update_delay(self, url: str, status_code: int) -> bool:
domain = self.get_domain(url)
state = self.domains[domain]

# Get minimum delay from crawl_delay if set
min_delay = state.crawl_delay if state.crawl_delay is not None else 0

if status_code in self.rate_limit_codes:
state.fail_count += 1
@@ -76,10 +127,10 @@ def update_delay(self, url: str, status_code: int) -> bool:
state.current_delay * 2 * random.uniform(0.75, 1.25), self.max_delay
)
else:
# Gradually reduce delay on success
state.current_delay = max(
random.uniform(*self.base_delay), state.current_delay * 0.75
)
# Gradually reduce delay on success, but never below crawl_delay
base = random.uniform(*self.base_delay)
reduced = state.current_delay * 0.75
state.current_delay = max(base, reduced, min_delay)
state.fail_count = 0

return True
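For callers that construct their own dispatcher rather than relying on `arun_many()`'s default, the extended `RateLimiter` can be wired up directly. A sketch under the assumption that `MemoryAdaptiveDispatcher` accepts a `rate_limiter` argument, as in the `async_webcrawler.py` change below; the user agent string is a placeholder.

```python
from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher, RateLimiter
from crawl4ai.utils import RobotsParser

robots_parser = RobotsParser()  # cached robots.txt lookups (see the utils.py changes)

rate_limiter = RateLimiter(
    base_delay=(1.0, 3.0),
    max_delay=60.0,
    max_retries=3,
    robots_parser=robots_parser,         # source of per-domain Crawl-delay values
    respect_crawl_delay=True,            # enable crawl-delay waits in wait_if_needed()
    default_user_agent="MyCrawler/1.0",  # agent matched against robots.txt rules
)

dispatcher = MemoryAdaptiveDispatcher(rate_limiter=rate_limiter)
# Pass dispatcher=dispatcher to crawler.arun_many(...) to use it.
```

Note that the effective wait is `max(Crawl-delay, current backoff delay)`, and successful responses never reduce the delay below the robots.txt minimum, so rate-limit backoff can only lengthen the spacing between same-domain requests.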
23 changes: 22 additions & 1 deletion crawl4ai/async_webcrawler.py
@@ -730,9 +730,30 @@ async def arun_many(
# )

if dispatcher is None:
# Determine if we need to respect crawl-delay
respect_crawl_delay = False
user_agent = self.browser_config.user_agent or "*"

if isinstance(config, list):
respect_crawl_delay = any(c.respect_crawl_delay for c in config)
# Use user_agent from first config that has one
for c in config:
if c.user_agent:
user_agent = c.user_agent
break
else:
respect_crawl_delay = config.respect_crawl_delay
if config.user_agent:
user_agent = config.user_agent

dispatcher = MemoryAdaptiveDispatcher(
rate_limiter=RateLimiter(
base_delay=(1.0, 3.0), max_delay=60.0, max_retries=3
base_delay=(1.0, 3.0),
max_delay=60.0,
max_retries=3,
robots_parser=self.robots_parser if respect_crawl_delay else None,
respect_crawl_delay=respect_crawl_delay,
default_user_agent=user_agent,
),
)

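One practical consequence of the block above: the user agent matched against robots.txt comes from the run config when set, otherwise from the browser config, otherwise `"*"`. A hedged sketch of pinning it explicitly (the agent string is a placeholder):

```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

config = CrawlerRunConfig(
    check_robots_txt=True,
    respect_crawl_delay=True,
    user_agent="MyCrawler/1.0",  # also becomes default_user_agent for Crawl-delay matching
)

async def crawl(urls):
    async with AsyncWebCrawler() as crawler:
        # No dispatcher is passed, so the default MemoryAdaptiveDispatcher built
        # above is created with default_user_agent="MyCrawler/1.0".
        return await crawler.arun_many(urls, config=config)
```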
1 change: 1 addition & 0 deletions crawl4ai/models.py
@@ -17,6 +17,7 @@ class DomainState:
last_request_time: float = 0
current_delay: float = 0
fail_count: int = 0
crawl_delay: Optional[float] = None # Crawl-delay from robots.txt


@dataclass
108 changes: 94 additions & 14 deletions crawl4ai/utils.py
@@ -259,6 +259,11 @@ def __init__(self, cache_dir=None, cache_ttl=None):
os.makedirs(self.cache_dir, exist_ok=True)
self.db_path = os.path.join(self.cache_dir, "robots_cache.db")
self._init_db()

# Shared session for efficient connection pooling
self._session: Optional[aiohttp.ClientSession] = None
# Track in-flight requests to prevent duplicate fetches
self._pending_fetches: Dict[str, asyncio.Future] = {}

def _init_db(self):
# Use WAL mode for better concurrency and performance
@@ -274,6 +279,54 @@ def _init_db(self):
""")
conn.execute("CREATE INDEX IF NOT EXISTS idx_domain ON robots_cache(domain)")

async def _get_session(self) -> aiohttp.ClientSession:
"""Get or create a shared aiohttp session for connection pooling."""
if self._session is None or self._session.closed:
timeout = aiohttp.ClientTimeout(total=2)
connector = aiohttp.TCPConnector(limit=100, limit_per_host=2)
self._session = aiohttp.ClientSession(timeout=timeout, connector=connector)
return self._session

async def _fetch_robots_txt(self, domain: str, scheme: str) -> Optional[str]:
"""
Fetch robots.txt with deduplication of in-flight requests.
Multiple concurrent requests for the same domain will share the result.
"""
# If there's already a pending fetch for this domain, wait for it
if domain in self._pending_fetches:
return await self._pending_fetches[domain]

# Create a future for this fetch so others can wait
loop = asyncio.get_event_loop()
future = loop.create_future()
self._pending_fetches[domain] = future

try:
robots_url = f"{scheme}://{domain}/robots.txt"
session = await self._get_session()

async with session.get(robots_url, ssl=False) as response:
if response.status == 200:
rules = await response.text()
self._cache_rules(domain, rules)
future.set_result(rules)
return rules
else:
future.set_result(None)
return None
except Exception as _ex:
future.set_result(None)
return None
finally:
# Clean up the pending fetch
self._pending_fetches.pop(domain, None)

async def close(self):
"""Close the shared session. Call this when done with the parser."""
if self._session and not self._session.closed:
await self._session.close()
self._session = None

def _get_cached_rules(self, domain: str) -> tuple[str, bool]:
"""Get cached rules. Returns (rules, is_fresh)"""
with sqlite3.connect(self.db_path) as conn:
@@ -335,20 +388,9 @@ async def can_fetch(self, url: str, user_agent: str = "*") -> bool:

# If rules not found or stale, fetch new ones
if not is_fresh:
try:
# Ensure we use the same scheme as the input URL
scheme = parsed.scheme or 'http'
robots_url = f"{scheme}://{domain}/robots.txt"

async with aiohttp.ClientSession() as session:
async with session.get(robots_url, timeout=2, ssl=False) as response:
if response.status == 200:
rules = await response.text()
self._cache_rules(domain, rules)
else:
return True
except Exception as _ex:
# On any error (timeout, connection failed, etc), allow access
scheme = parsed.scheme or 'http'
rules = await self._fetch_robots_txt(domain, scheme)
if rules is None:
return True

if not rules:
@@ -364,6 +406,44 @@ async def can_fetch(self, url: str, user_agent: str = "*") -> bool:

return parser.can_fetch(user_agent, url)

async def get_crawl_delay(self, url: str, user_agent: str = "*") -> Optional[float]:
"""
Get the Crawl-delay directive from robots.txt for a URL/user-agent.

Args:
url: The URL to check (used to determine the domain)
user_agent: User agent string to check against (default: "*")

Returns:
float: Crawl delay in seconds, or None if not specified
"""
# Handle empty/invalid URLs
try:
parsed = urlparse(url)
domain = parsed.netloc
if not domain:
return None
except Exception as _ex:
return None

# Check cache first
rules, is_fresh = self._get_cached_rules(domain)

# If rules not found or stale, fetch new ones
if not is_fresh:
scheme = parsed.scheme or 'http'
rules = await self._fetch_robots_txt(domain, scheme)

if not rules:
return None

# Create parser and extract crawl-delay
parser = RobotFileParser()
parser.parse(rules.splitlines())

delay = parser.crawl_delay(user_agent)
return float(delay) if delay is not None else None

def clear_cache(self):
"""Clear all cached robots.txt entries"""
with sqlite3.connect(self.db_path) as conn:
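The parser can also be used on its own; a minimal sketch of the new `get_crawl_delay()` next to the existing `can_fetch()` (the URL and agent are placeholders):

```python
import asyncio

from crawl4ai.utils import RobotsParser

async def main():
    parser = RobotsParser()  # SQLite-backed cache plus a shared aiohttp session
    url = "https://example.com/some/page"  # placeholder

    allowed = await parser.can_fetch(url, user_agent="MyCrawler/1.0")
    delay = await parser.get_crawl_delay(url, user_agent="MyCrawler/1.0")
    print(f"allowed={allowed}, crawl_delay={delay}")  # delay is None when unspecified

    await parser.close()  # release the pooled session when done

asyncio.run(main())
```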
6 changes: 4 additions & 2 deletions docs/md_v2/api/arun.md
@@ -23,7 +23,8 @@ async def main():
verbose=True, # Detailed logging
cache_mode=CacheMode.ENABLED, # Use normal read/write cache
check_robots_txt=True, # Respect robots.txt rules
# ... other parameters
respect_crawl_delay=True, # Honor Crawl-delay directives
# ... other parameters
)

async with AsyncWebCrawler() as crawler:
@@ -232,7 +233,8 @@ async def main():
# Core
verbose=True,
cache_mode=CacheMode.ENABLED,
check_robots_txt=True, # Respect robots.txt rules
check_robots_txt=True, # Respect robots.txt rules
respect_crawl_delay=True, # Honor Crawl-delay directives

# Content
word_count_threshold=10,