12 changes: 12 additions & 0 deletions crawl4ai/async_configs.py
@@ -1228,6 +1228,12 @@ def __init__(
table_extraction: TableExtractionStrategy = None,
exclude_external_images: bool = False,
exclude_all_images: bool = False,
# CSS Background Images Parameters
extract_css_images: bool = False,
css_image_min_width: int = 100,
css_image_min_height: int = 100,
css_image_score_threshold: int = 2,
css_exclude_repeating: bool = True,
# Link and Domain Handling Parameters
exclude_social_media_domains: list = None,
exclude_external_links: bool = False,
@@ -1343,6 +1349,12 @@ def __init__(
self.exclude_external_images = exclude_external_images
self.exclude_all_images = exclude_all_images
self.table_score_threshold = table_score_threshold
# CSS Background Images Parameters
self.extract_css_images = extract_css_images
self.css_image_min_width = css_image_min_width
self.css_image_min_height = css_image_min_height
self.css_image_score_threshold = css_image_score_threshold
self.css_exclude_repeating = css_exclude_repeating

# Table extraction strategy (default to DefaultTableExtraction if not specified)
if table_extraction is None:
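
For context, a minimal usage sketch of the new parameters. It assumes they are exposed on `CrawlerRunConfig` exactly as added above, that `CrawlResult.media` remains a plain dict so the new `"css_images"` key is reachable, and that the target URL is just a placeholder:

```python
import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

# Sketch: enable CSS background image extraction via the new parameters.
config = CrawlerRunConfig(
    extract_css_images=True,        # feature is off by default
    css_image_min_width=100,        # skip elements narrower than 100 px
    css_image_min_height=100,       # skip elements shorter than 100 px
    css_image_score_threshold=2,    # keep only images scoring above the threshold
    css_exclude_repeating=True,     # drop tiled/repeating backgrounds
)

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com", config=config)
        # Processed entries surface in the media payload's "css_images" bucket.
        for img in result.media.get("css_images", []):
            print(img["src"], img["score"])

asyncio.run(main())
```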
27 changes: 24 additions & 3 deletions crawl4ai/async_crawler_strategy.py
@@ -957,7 +957,7 @@ async def handle_request_failed_capture(request):
# Handle comma-separated selectors by splitting them
selectors = [s.strip() for s in config.css_selector.split(',')]
html_parts = []

for selector in selectors:
try:
content = await self.adapter.evaluate(page,
@@ -968,13 +968,33 @@ async def handle_request_failed_capture(request):
html_parts.append(content)
except Error as e:
print(f"Warning: Could not get content for selector '{selector}': {str(e)}")

# Wrap in a div to create a valid HTML structure
html = f"<div class='crawl4ai-result'>\n" + "\n".join(html_parts) + "\n</div>"
except Error as e:
raise RuntimeError(f"Failed to extract HTML content: {str(e)}")
else:
html = await page.content()

# Extract CSS background images if enabled
css_images_data = None
if config.extract_css_images:
try:
js_script = load_js_script("extract_css_backgrounds")
result = await self.adapter.evaluate(page, js_script)
css_images_data = result.get("css_images", []) if result else []
if self.logger and config.verbose:
self.logger.info(
message=f"Extracted {len(css_images_data)} CSS background images",
tag="CSS_IMAGES",
)
except Exception as e:
if self.logger:
self.logger.warning(
message=f"Failed to extract CSS background images: {str(e)}",
tag="CSS_IMAGES",
)
css_images_data = None

# # Get final HTML content
# html = await page.content()
@@ -1047,6 +1067,7 @@ async def get_delayed_content(delay: float = 5.0) -> str:
# Include captured data if enabled
network_requests=captured_requests if config.capture_network_requests else None,
console_messages=captured_console if config.capture_console_messages else None,
css_images_data=css_images_data,
)

except Exception as e:
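
The evaluated snippet is expected to resolve to an object with a `css_images` array, which is why the code above unwraps it defensively with `result.get("css_images", []) if result else []`. Based on the fields documented in the JS file further down, one entry of `css_images_data` would look roughly like this (illustrative values only):

```python
# Illustrative shape of one css_images_data entry, per the JS snippet's field list.
example_entry = {
    "src": "https://example.com/assets/hero.jpg",   # relative URLs are resolved later in the scraper
    "selector": "div.hero-banner",                   # best-effort unique selector for the element
    "element_tag": "div",
    "element_class": "hero-banner",
    "element_id": "",
    "style_property": "background-image",
    "computed_width": 1280,                          # element size in px, from getBoundingClientRect()
    "computed_height": 480,
    "is_repeating": False,                           # backgroundRepeat !== 'no-repeat'
    "background_position": "center center",
    "background_size": "cover",
}
```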
7 changes: 5 additions & 2 deletions crawl4ai/async_webcrawler.py
@@ -3,7 +3,7 @@
import sys
import time
from pathlib import Path
from typing import Optional, List
from typing import Optional, List, Dict
import json
import asyncio

@@ -342,6 +342,7 @@ async def arun(
screenshot_data = async_response.screenshot
pdf_data = async_response.pdf_data
js_execution_result = async_response.js_execution_result
css_images_data = async_response.css_images_data

t2 = time.perf_counter()
self.logger.url_status(
@@ -366,6 +367,7 @@ async def arun(
is_raw_html=True if url.startswith("raw:") else False,
redirected_url=async_response.redirected_url,
original_scheme=urlparse(url).scheme,
css_images_data=css_images_data,
**kwargs,
)

@@ -441,6 +443,7 @@ async def aprocess_html(
screenshot_data: str,
pdf_data: str,
verbose: bool,
css_images_data: List[Dict] = None,
**kwargs,
) -> CrawlResult:
"""
@@ -480,7 +483,7 @@ async def aprocess_html(
# Scraping Strategy Execution #
################################
result: ScrapingResult = scraping_strategy.scrap(
url, html, **params)
url, html, css_images_data=css_images_data, **params)

if result is None:
raise ValueError(
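
Because `css_images_data` travels as an ordinary keyword into `scraping_strategy.scrap(...)`, the scraper can be exercised without a browser, e.g. in a unit test. A rough sketch with hypothetical input values; it assumes the kwargs reach `_process_element` the same way the existing image options do, and that `extract_css_images=True` is passed so the new branch runs:

```python
from crawl4ai.content_scraping_strategy import WebScrapingStrategy

strategy = WebScrapingStrategy()
result = strategy.scrap(
    "https://example.com",
    "<html><body><div class='hero'>Hello</div></body></html>",
    extract_css_images=True,          # gate checked in _process_element
    css_images_data=[{                # normally produced by the JS snippet
        "src": "/assets/hero.jpg",
        "element_tag": "div",
        "selector": "div.hero",
        "computed_width": 1280,
        "computed_height": 480,
        "is_repeating": False,
    }],
)
# Matching entries are appended to the media payload's "css_images" bucket
# (see the media dict initialised in _scrap below).
```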
110 changes: 109 additions & 1 deletion crawl4ai/content_scraping_strategy.py
@@ -340,6 +340,18 @@ def _process_element(
except Exception as e:
self._log("error", f"Error processing image: {str(e)}", "SCRAPE")

# Process CSS background images (if enabled)
if kwargs.get("extract_css_images", False):
try:
css_images_data = kwargs.get("css_images_data")
processed_css_images = self.process_css_background_images(
css_images_data, url, **kwargs
)
if processed_css_images:
media["css_images"].extend(processed_css_images)
except Exception as e:
self._log("error", f"Error processing CSS images: {str(e)}", "SCRAPE")

# Process videos and audios
for media_type in ["video", "audio"]:
for elem in element.xpath(f".//{media_type}"):
@@ -514,6 +526,102 @@ def add_variant(src: str, width: Optional[str] = None):

return image_variants if image_variants else None

def process_css_background_images(
self, css_images_data: List[Dict], url: str, **kwargs
) -> Optional[List[Dict]]:
"""
Process CSS background images extracted from browser.

Args:
css_images_data: Raw data from JavaScript extraction
url: Page URL for resolving relative URLs
**kwargs: Configuration options

Returns:
List of MediaItem dictionaries or None if no images
"""
from urllib.parse import urljoin

if not css_images_data:
return None

processed_images = []
min_width = kwargs.get("css_image_min_width", 100)
min_height = kwargs.get("css_image_min_height", 100)
exclude_repeating = kwargs.get("css_exclude_repeating", True)
score_threshold = kwargs.get("css_image_score_threshold", 2)

for img_data in css_images_data:
# Filter by element size
computed_width = img_data.get("computed_width", 0)
computed_height = img_data.get("computed_height", 0)

if computed_width < min_width or computed_height < min_height:
continue

# Filter repeating patterns
if exclude_repeating and img_data.get("is_repeating", False):
continue

# Resolve URL
src = img_data["src"]
if not src.startswith(("http://", "https://", "data:")):
src = urljoin(url, src)

# Skip data URLs
if src.startswith("data:"):
continue

# Calculate score based on element properties
score = 0

# Larger elements get higher scores
if computed_width > 300:
score += 1
if computed_height > 300:
score += 1

# Non-repeating backgrounds are more likely to be content images
if not img_data.get("is_repeating", False):
score += 1

# Detect image format
image_formats = {"jpg", "jpeg", "png", "webp", "avif", "gif", "svg"}
detected_format = None
src_lower = src.lower()
for fmt in image_formats:
if fmt in src_lower:
detected_format = fmt
score += 1
break

# Apply score threshold
if score <= score_threshold:
continue

# Build selector string for description
element_tag = img_data.get("element_tag", "element")
selector = img_data.get("selector", "")
desc = f"CSS background image on {element_tag}"
if selector:
desc += f" ({selector})"

# Create MediaItem
media_item = {
"src": src,
"alt": f"Background of {element_tag}",
"desc": desc,
"score": score,
"type": "css_image",
"group_id": 0,
"format": detected_format,
"width": computed_width,
}

processed_images.append(media_item)

return processed_images if processed_images else None

def remove_empty_elements_fast(self, root, word_count_threshold=5):
"""
Remove elements that fall below the desired word threshold in a single pass from the bottom up.
@@ -730,7 +838,7 @@ def _scrap(
form.getparent().remove(form)

# Process content
media = {"images": [], "videos": [], "audios": [], "tables": []}
media = {"images": [], "videos": [], "audios": [], "tables": [], "css_images": []}
internal_links_dict = {}
external_links_dict = {}

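
To make the scoring concrete: a non-repeating `photo.jpg` background on a 1200×400 hero `<div>` scores +1 (width > 300), +1 (height > 300), +1 (non-repeating) and +1 (recognised format) = 4, which clears the default threshold of 2 (note the strict `score <= score_threshold` check, so a score must exceed the threshold to be kept). A 250×250 tiled texture is dropped earlier by the `css_exclude_repeating` filter and never reaches scoring. With illustrative values, the resulting MediaItem for the hero image would be:

```python
# Illustrative MediaItem built by process_css_background_images for the hero example.
example_item = {
    "src": "https://example.com/assets/photo.jpg",
    "alt": "Background of div",
    "desc": "CSS background image on div (div.hero)",
    "score": 4,
    "type": "css_image",
    "group_id": 0,
    "format": "jpg",
    "width": 1200,
}
```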
138 changes: 138 additions & 0 deletions crawl4ai/js_snippet/extract_css_backgrounds.js
@@ -0,0 +1,138 @@
/**
* Extract CSS background images from all elements on the page.
* This script is executed by crawl4ai to extract CSS background images.
*
* Returns a JSON object with css_images array containing:
* - src: Image URL
* - selector: CSS selector for the element
* - element_tag: Tag name of the element
* - element_class: Class names of the element
* - element_id: ID of the element
* - style_property: Which CSS property had the image
* - computed_width: Element width in pixels
* - computed_height: Element height in pixels
 * - is_repeating: Whether the background repeats
* - background_position: CSS background-position value
* - background_size: CSS background-size value
*/

(function() {
const results = [];
const allElements = document.querySelectorAll('*');
const processedUrls = new Set();

/**
* Generate a unique CSS selector for an element
*/
function getElementSelector(element) {
if (element.id) {
return '#' + element.id;
}

let selector = element.tagName.toLowerCase();

if (element.className && typeof element.className === 'string') {
const classes = element.className.trim().split(/\s+/).filter(c => c);
if (classes.length > 0) {
selector += '.' + classes.join('.');
}
}

// Add nth-child if needed for uniqueness
const parent = element.parentElement;
if (parent) {
const siblings = Array.from(parent.children).filter(
child => child.tagName === element.tagName
);
if (siblings.length > 1) {
const index = siblings.indexOf(element) + 1;
selector += `:nth-child(${index})`;
}
}

return selector;
}

/**
* Check if element is visible and has meaningful dimensions
*/
function isElementVisible(element) {
const rect = element.getBoundingClientRect();
const style = window.getComputedStyle(element);

// Check if element has display: none or visibility: hidden
if (style.display === 'none' || style.visibility === 'hidden') {
return false;
}

// Check if element has meaningful dimensions
return rect.width > 0 && rect.height > 0;
}

allElements.forEach(element => {
// Skip invisible elements
if (!isElementVisible(element)) {
return;
}

const style = window.getComputedStyle(element);
const backgroundImage = style.backgroundImage;

// Skip if no background image or if it's 'none'
if (!backgroundImage || backgroundImage === 'none' || backgroundImage === 'initial') {
return;
}

// Parse url() from background-image property
// Handles: url(...), url("..."), url('...'), and multiple backgrounds
const urlPattern = /url\(['"]?([^'")\s]+)['"]?\)/g;
let match;
const urls = [];

while ((match = urlPattern.exec(backgroundImage)) !== null) {
urls.push(match[1]);
}

if (urls.length === 0) {
return;
}

// Process each URL
urls.forEach(url => {
// Skip data URLs
if (url.startsWith('data:')) {
return;
}

// Skip already processed URLs (deduplication)
if (processedUrls.has(url)) {
return;
}
processedUrls.add(url);

// Get element dimensions
const rect = element.getBoundingClientRect();

// Create result object
const result = {
src: url,
selector: getElementSelector(element),
element_tag: element.tagName.toLowerCase(),
element_class: element.className || '',
element_id: element.id || '',
style_property: 'background-image',
computed_width: Math.round(rect.width),
computed_height: Math.round(rect.height),
is_repeating: style.backgroundRepeat !== 'no-repeat',
background_position: style.backgroundPosition,
background_size: style.backgroundSize
};

results.push(result);
});
});

return {
css_images: results
};
})();
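
Since the file is a self-evaluating expression, it can also be run stand-alone for debugging. A rough Playwright sketch, reusing the same `load_js_script` helper called in the crawler strategy change above (the target URL is just an example):

```python
import asyncio

from playwright.async_api import async_playwright
from crawl4ai.js_snippet import load_js_script

async def main():
    js = load_js_script("extract_css_backgrounds")
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.goto("https://example.com")
        # The IIFE evaluates to {"css_images": [...]} in the page context.
        data = await page.evaluate(js)
        for img in data["css_images"]:
            print(img["src"], img["computed_width"], img["computed_height"])
        await browser.close()

asyncio.run(main())
```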