9 changes: 9 additions & 0 deletions CHANGELOG.md
@@ -8,6 +8,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]

### Added
- **⏱️ Crawl-delay Directive Support**: New `respect_crawl_delay` configuration parameter (usage sketch below this changelog excerpt)
- Honors `Crawl-delay` directives from robots.txt files
- Automatically waits the specified delay between requests to the same domain
- Per-domain crawl-delay caching for efficiency
- Shared HTTP session with connection pooling for robots.txt fetching
- Race-condition safe domain initialization with asyncio locks
- Works with `arun_many()` for batch crawling scenarios
- Fully backward compatible with opt-in flag (default: `False`)

- **🔒 HTTPS Preservation for Internal Links**: New `preserve_https_for_internal_links` configuration flag
- Maintains HTTPS scheme for internal links even when servers redirect to HTTP
- Prevents security downgrades during deep crawling
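To illustrate the Crawl-delay support described in the changelog entry above, here is a minimal usage sketch. It assumes the public `AsyncWebCrawler`/`CrawlerRunConfig` API shown in the docs changes later in this diff; the URLs are placeholders.

```python
import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def main():
    config = CrawlerRunConfig(
        check_robots_txt=True,     # required alongside respect_crawl_delay (per the docstring)
        respect_crawl_delay=True,  # opt in to honoring Crawl-delay directives
    )
    async with AsyncWebCrawler() as crawler:
        # With arun_many(), requests to the same domain are spaced by its Crawl-delay.
        results = await crawler.arun_many(
            ["https://example.com/page-1", "https://example.com/page-2"],  # placeholders
            config=config,
        )
        for result in results:
            print(result.url, result.success)

asyncio.run(main())
```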
8 changes: 8 additions & 0 deletions crawl4ai/async_configs.py
@@ -1136,6 +1136,10 @@ class CrawlerRunConfig():

check_robots_txt (bool): Whether to check robots.txt rules before crawling.
Default: False.
respect_crawl_delay (bool): Whether to respect Crawl-delay directives from robots.txt.
When True, the crawler will wait the specified delay between
requests to the same domain. Requires check_robots_txt=True.
Default: False.
user_agent (str): Custom User-Agent string to use.
Default: None.
user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use the provided user_agent as-is.
@@ -1247,6 +1251,7 @@ def __init__(
stream: bool = False,
url: str = None,
check_robots_txt: bool = False,
respect_crawl_delay: bool = False,
user_agent: str = None,
user_agent_mode: str = None,
user_agent_generator_config: dict = {},
@@ -1375,6 +1380,7 @@ def __init__(

# Robots.txt Handling Parameters
self.check_robots_txt = check_robots_txt
self.respect_crawl_delay = respect_crawl_delay

# User Agent Parameters
self.user_agent = user_agent
@@ -1644,6 +1650,7 @@ def from_kwargs(kwargs: dict) -> "CrawlerRunConfig":
method=kwargs.get("method", "GET"),
stream=kwargs.get("stream", False),
check_robots_txt=kwargs.get("check_robots_txt", False),
respect_crawl_delay=kwargs.get("respect_crawl_delay", False),
user_agent=kwargs.get("user_agent"),
user_agent_mode=kwargs.get("user_agent_mode"),
user_agent_generator_config=kwargs.get("user_agent_generator_config", {}),
@@ -1748,6 +1755,7 @@ def to_dict(self):
"method": self.method,
"stream": self.stream,
"check_robots_txt": self.check_robots_txt,
"respect_crawl_delay": self.respect_crawl_delay,
"user_agent": self.user_agent,
"user_agent_mode": self.user_agent_mode,
"user_agent_generator_config": self.user_agent_generator_config,
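A small sketch of how the new field rides along with the serialization helpers touched above (`from_kwargs()` and `to_dict()`); the key names are taken directly from this diff:

```python
from crawl4ai import CrawlerRunConfig

# Build a config from a plain dict, mirroring from_kwargs() above.
cfg = CrawlerRunConfig.from_kwargs({
    "check_robots_txt": True,
    "respect_crawl_delay": True,
})

# to_dict() now includes the flag, so dumps and clones preserve it.
assert cfg.respect_crawl_delay is True
assert cfg.to_dict()["respect_crawl_delay"] is True
```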
71 changes: 61 additions & 10 deletions crawl4ai/async_dispatcher.py
@@ -1,4 +1,4 @@
from typing import Dict, Optional, List, Tuple, Union
from typing import Dict, Optional, List, Tuple, Union, TYPE_CHECKING
from .async_configs import CrawlerRunConfig
from .models import (
CrawlResult,
@@ -11,6 +11,9 @@

from .types import AsyncWebCrawler

if TYPE_CHECKING:
from .utils import RobotsParser

from collections.abc import AsyncGenerator

import time
@@ -32,39 +35,87 @@ def __init__(
max_delay: float = 60.0,
max_retries: int = 3,
rate_limit_codes: List[int] = None,
robots_parser: Optional["RobotsParser"] = None,
respect_crawl_delay: bool = False,
default_user_agent: str = "*",
):
self.base_delay = base_delay
self.max_delay = max_delay
self.max_retries = max_retries
self.rate_limit_codes = rate_limit_codes or [429, 503]
self.domains: Dict[str, DomainState] = {}
self.robots_parser = robots_parser
self.respect_crawl_delay = respect_crawl_delay
self.default_user_agent = default_user_agent
# Lock to prevent race conditions when initializing new domains
self._domain_init_lock = asyncio.Lock()

def get_domain(self, url: str) -> str:
return urlparse(url).netloc

async def _get_crawl_delay_for_domain(self, url: str) -> Optional[float]:
"""Fetch and cache crawl-delay for a domain from robots.txt."""
if not self.robots_parser or not self.respect_crawl_delay:
return None

domain = self.get_domain(url)
state = self.domains.get(domain)

# If we already have crawl_delay cached for this domain, return it
if state and state.crawl_delay is not None:
return state.crawl_delay

# Fetch crawl-delay from robots.txt
try:
delay = await self.robots_parser.get_crawl_delay(url, self.default_user_agent)
return delay
except Exception:
return None

async def wait_if_needed(self, url: str) -> None:
domain = self.get_domain(url)
state = self.domains.get(domain)

# Initialize new domain with lock to prevent race conditions
if not state:
self.domains[domain] = DomainState()
state = self.domains[domain]
async with self._domain_init_lock:
# Double-check after acquiring lock
state = self.domains.get(domain)
if not state:
state = DomainState()

# Fetch crawl-delay before adding to domains dict
if self.robots_parser and self.respect_crawl_delay:
crawl_delay = await self._get_crawl_delay_for_domain(url)
state.crawl_delay = crawl_delay

self.domains[domain] = state

now = time.time()

# Determine the effective delay - use crawl_delay if specified, otherwise current_delay
effective_delay = state.current_delay
if state.crawl_delay is not None and state.crawl_delay > 0:
# Use the larger of crawl_delay and current_delay (which may be increased due to rate limiting)
effective_delay = max(state.crawl_delay, state.current_delay)

if state.last_request_time:
wait_time = max(0, state.current_delay - (now - state.last_request_time))
wait_time = max(0, effective_delay - (now - state.last_request_time))
if wait_time > 0:
await asyncio.sleep(wait_time)

# Random delay within base range if no current delay
if state.current_delay == 0:
# Random delay within base range if no current delay and no crawl_delay
if state.current_delay == 0 and (state.crawl_delay is None or state.crawl_delay == 0):
state.current_delay = random.uniform(*self.base_delay)

state.last_request_time = time.time()

def update_delay(self, url: str, status_code: int) -> bool:
domain = self.get_domain(url)
state = self.domains[domain]

# Get minimum delay from crawl_delay if set
min_delay = state.crawl_delay if state.crawl_delay is not None else 0

if status_code in self.rate_limit_codes:
state.fail_count += 1
@@ -76,10 +127,10 @@ def update_delay(self, url: str, status_code: int) -> bool:
state.current_delay * 2 * random.uniform(0.75, 1.25), self.max_delay
)
else:
# Gradually reduce delay on success
state.current_delay = max(
random.uniform(*self.base_delay), state.current_delay * 0.75
)
# Gradually reduce delay on success, but never below crawl_delay
base = random.uniform(*self.base_delay)
reduced = state.current_delay * 0.75
state.current_delay = max(base, reduced, min_delay)
state.fail_count = 0

return True
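For callers that construct their own dispatcher rather than relying on `arun_many()`'s default, the extended `RateLimiter` can be wired up directly. A sketch under the assumption that `MemoryAdaptiveDispatcher` accepts a `rate_limiter` argument, as in the `async_webcrawler.py` change below; the user agent string is a placeholder.

```python
from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher, RateLimiter
from crawl4ai.utils import RobotsParser

robots_parser = RobotsParser()  # cached robots.txt lookups (see the utils.py changes)

rate_limiter = RateLimiter(
    base_delay=(1.0, 3.0),
    max_delay=60.0,
    max_retries=3,
    robots_parser=robots_parser,         # source of per-domain Crawl-delay values
    respect_crawl_delay=True,            # enable crawl-delay waits in wait_if_needed()
    default_user_agent="MyCrawler/1.0",  # agent matched against robots.txt rules
)

dispatcher = MemoryAdaptiveDispatcher(rate_limiter=rate_limiter)
# Pass dispatcher=dispatcher to crawler.arun_many(...) to use it.
```

Note that the effective wait is `max(Crawl-delay, current backoff delay)`, and successful responses never reduce the delay below the robots.txt minimum, so rate-limit backoff can only lengthen the spacing between same-domain requests.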
23 changes: 22 additions & 1 deletion crawl4ai/async_webcrawler.py
@@ -730,9 +730,30 @@ async def arun_many(
# )

if dispatcher is None:
# Determine if we need to respect crawl-delay
respect_crawl_delay = False
user_agent = self.browser_config.user_agent or "*"

if isinstance(config, list):
respect_crawl_delay = any(c.respect_crawl_delay for c in config)
# Use user_agent from first config that has one
for c in config:
if c.user_agent:
user_agent = c.user_agent
break
else:
respect_crawl_delay = config.respect_crawl_delay
if config.user_agent:
user_agent = config.user_agent

dispatcher = MemoryAdaptiveDispatcher(
rate_limiter=RateLimiter(
base_delay=(1.0, 3.0), max_delay=60.0, max_retries=3
base_delay=(1.0, 3.0),
max_delay=60.0,
max_retries=3,
robots_parser=self.robots_parser if respect_crawl_delay else None,
respect_crawl_delay=respect_crawl_delay,
default_user_agent=user_agent,
),
)

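One practical consequence of the block above: the user agent matched against robots.txt comes from the run config when set, otherwise from the browser config, otherwise `"*"`. A hedged sketch of pinning it explicitly (the agent string is a placeholder):

```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

config = CrawlerRunConfig(
    check_robots_txt=True,
    respect_crawl_delay=True,
    user_agent="MyCrawler/1.0",  # also becomes default_user_agent for Crawl-delay matching
)

async def crawl(urls):
    async with AsyncWebCrawler() as crawler:
        # No dispatcher is passed, so the default MemoryAdaptiveDispatcher built
        # above is created with default_user_agent="MyCrawler/1.0".
        return await crawler.arun_many(urls, config=config)
```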
1 change: 1 addition & 0 deletions crawl4ai/models.py
@@ -17,6 +17,7 @@ class DomainState:
last_request_time: float = 0
current_delay: float = 0
fail_count: int = 0
crawl_delay: Optional[float] = None # Crawl-delay from robots.txt


@dataclass
108 changes: 94 additions & 14 deletions crawl4ai/utils.py
@@ -259,6 +259,11 @@ def __init__(self, cache_dir=None, cache_ttl=None):
os.makedirs(self.cache_dir, exist_ok=True)
self.db_path = os.path.join(self.cache_dir, "robots_cache.db")
self._init_db()

# Shared session for efficient connection pooling
self._session: Optional[aiohttp.ClientSession] = None
# Track in-flight requests to prevent duplicate fetches
self._pending_fetches: Dict[str, asyncio.Future] = {}

def _init_db(self):
# Use WAL mode for better concurrency and performance
@@ -274,6 +279,54 @@ def _init_db(self):
""")
conn.execute("CREATE INDEX IF NOT EXISTS idx_domain ON robots_cache(domain)")

async def _get_session(self) -> aiohttp.ClientSession:
"""Get or create a shared aiohttp session for connection pooling."""
if self._session is None or self._session.closed:
timeout = aiohttp.ClientTimeout(total=2)
connector = aiohttp.TCPConnector(limit=100, limit_per_host=2)
self._session = aiohttp.ClientSession(timeout=timeout, connector=connector)
return self._session

async def _fetch_robots_txt(self, domain: str, scheme: str) -> Optional[str]:
"""
Fetch robots.txt with deduplication of in-flight requests.
Multiple concurrent requests for the same domain will share the result.
"""
# If there's already a pending fetch for this domain, wait for it
if domain in self._pending_fetches:
return await self._pending_fetches[domain]

# Create a future for this fetch so others can wait
loop = asyncio.get_event_loop()
future = loop.create_future()
self._pending_fetches[domain] = future

try:
robots_url = f"{scheme}://{domain}/robots.txt"
session = await self._get_session()

async with session.get(robots_url, ssl=False) as response:
if response.status == 200:
rules = await response.text()
self._cache_rules(domain, rules)
future.set_result(rules)
return rules
else:
future.set_result(None)
return None
except Exception as _ex:
future.set_result(None)
return None
finally:
# Clean up the pending fetch
self._pending_fetches.pop(domain, None)

async def close(self):
"""Close the shared session. Call this when done with the parser."""
if self._session and not self._session.closed:
await self._session.close()
self._session = None

def _get_cached_rules(self, domain: str) -> tuple[str, bool]:
"""Get cached rules. Returns (rules, is_fresh)"""
with sqlite3.connect(self.db_path) as conn:
@@ -335,20 +388,9 @@ async def can_fetch(self, url: str, user_agent: str = "*") -> bool:

# If rules not found or stale, fetch new ones
if not is_fresh:
try:
# Ensure we use the same scheme as the input URL
scheme = parsed.scheme or 'http'
robots_url = f"{scheme}://{domain}/robots.txt"

async with aiohttp.ClientSession() as session:
async with session.get(robots_url, timeout=2, ssl=False) as response:
if response.status == 200:
rules = await response.text()
self._cache_rules(domain, rules)
else:
return True
except Exception as _ex:
# On any error (timeout, connection failed, etc), allow access
scheme = parsed.scheme or 'http'
rules = await self._fetch_robots_txt(domain, scheme)
if rules is None:
return True

if not rules:
@@ -364,6 +406,44 @@ async def can_fetch(self, url: str, user_agent: str = "*") -> bool:

return parser.can_fetch(user_agent, url)

async def get_crawl_delay(self, url: str, user_agent: str = "*") -> Optional[float]:
"""
Get the Crawl-delay directive from robots.txt for a URL/user-agent.

Args:
url: The URL to check (used to determine the domain)
user_agent: User agent string to check against (default: "*")

Returns:
float: Crawl delay in seconds, or None if not specified
"""
# Handle empty/invalid URLs
try:
parsed = urlparse(url)
domain = parsed.netloc
if not domain:
return None
except Exception as _ex:
return None

# Check cache first
rules, is_fresh = self._get_cached_rules(domain)

# If rules not found or stale, fetch new ones
if not is_fresh:
scheme = parsed.scheme or 'http'
rules = await self._fetch_robots_txt(domain, scheme)

if not rules:
return None

# Create parser and extract crawl-delay
parser = RobotFileParser()
parser.parse(rules.splitlines())

delay = parser.crawl_delay(user_agent)
return float(delay) if delay is not None else None

def clear_cache(self):
"""Clear all cached robots.txt entries"""
with sqlite3.connect(self.db_path) as conn:
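The parser can also be used on its own; a minimal sketch of the new `get_crawl_delay()` next to the existing `can_fetch()` (the URL and agent are placeholders):

```python
import asyncio

from crawl4ai.utils import RobotsParser

async def main():
    parser = RobotsParser()  # SQLite-backed cache plus a shared aiohttp session
    url = "https://example.com/some/page"  # placeholder

    allowed = await parser.can_fetch(url, user_agent="MyCrawler/1.0")
    delay = await parser.get_crawl_delay(url, user_agent="MyCrawler/1.0")
    print(f"allowed={allowed}, crawl_delay={delay}")  # delay is None when unspecified

    await parser.close()  # release the pooled session when done

asyncio.run(main())
```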
6 changes: 4 additions & 2 deletions docs/md_v2/api/arun.md
@@ -23,7 +23,8 @@ async def main():
verbose=True, # Detailed logging
cache_mode=CacheMode.ENABLED, # Use normal read/write cache
check_robots_txt=True, # Respect robots.txt rules
# ... other parameters
respect_crawl_delay=True, # Honor Crawl-delay directives
# ... other parameters
)

async with AsyncWebCrawler() as crawler:
@@ -232,7 +233,8 @@ async def main():
# Core
verbose=True,
cache_mode=CacheMode.ENABLED,
check_robots_txt=True, # Respect robots.txt rules
check_robots_txt=True, # Respect robots.txt rules
respect_crawl_delay=True, # Honor Crawl-delay directives

# Content
word_count_threshold=10,