Skip to content

enh: Add lxml for _fix_links.py#359

Open
hoxbro wants to merge 1 commit into
mainfrom
enh_lxml
Open

enh: Add lxml for _fix_links.py#359
hoxbro wants to merge 1 commit into
mainfrom
enh_lxml

Conversation

@hoxbro
Copy link
Copy Markdown
Collaborator

@hoxbro hoxbro commented Apr 29, 2026

Something that has been in my stack for the longest time.

Benchmark:

HTML size: 3,278 bytes

html.parser (BeautifulSoup)    200 iters  0.428s  (2.14 ms/iter)
lxml (BeautifulSoup)           200 iters  0.338s  (1.69 ms/iter)
lxml (pure)                    200 iters  0.023s  (0.11 ms/iter)
"""
Benchmark: BeautifulSoup html.parser vs lxml (BeautifulSoup) vs pure lxml for cleanup_links.
"""
import os
import re
import timeit
import warnings
from pathlib import Path

from bs4 import BeautifulSoup
from lxml import etree, html as lhtml


def make_html(n_links=50, n_imgs=20, n_external=10):
    """Build a synthetic HTML page used as input for the benchmark.

    Inside a single ``<div class="content">`` the page contains, in order:
    ``n_links`` relative anchors pointing at ``notebook_<i>.ipynb``,
    ``n_external`` absolute anchors pointing at
    ``https://example.com/page<i>``, ``n_imgs`` ``<img>`` tags whose source is
    ``assets/image_<i>.png``, and one fixed paragraph.

    Args:
        n_links: Number of relative ``.ipynb`` links to emit.
        n_imgs: Number of ``<img>`` tags to emit.
        n_external: Number of absolute ``https://`` links to emit. The
            default of 10 keeps the output identical to the earlier version,
            where this count was hard-coded.

    Returns:
        The complete document as a ``str`` starting with ``<!DOCTYPE html>``.
    """
    links = "\n".join(
        f'<a href="notebook_{i}.ipynb">link {i}</a>' for i in range(n_links)
    )
    imgs = "\n".join(
        f'<img src="assets/image_{i}.png" />' for i in range(n_imgs)
    )
    external = "\n".join(
        f'<a href="https://example.com/page{i}">ext {i}</a>'
        for i in range(n_external)
    )
    return f"""<!DOCTYPE html>
<html>
<head><title>Test</title></head>
<body>
<div class="content">
{links}
{external}
{imgs}
<p>Some <code>Element</code> text and more content here.</p>
</div>
</body>
</html>"""


def process_bs4(text, parser):
    """Rewrite notebook links in ``text`` using BeautifulSoup.

    Args:
        text: HTML markup to parse.
        parser: Parser name forwarded to BeautifulSoup as ``features``
            (the benchmark passes ``"html.parser"`` and ``"lxml"``).

    Returns:
        The serialized document, where every ``<a>`` whose ``href`` contains
        ``.ipynb`` and does not contain ``http`` has ``.ipynb`` replaced by
        ``.html``.
    """
    document = BeautifulSoup(text, features=parser)

    for anchor in document.find_all("a"):
        target = anchor.get("href", "")
        # Leave anything that is not a notebook link, or that looks absolute.
        if ".ipynb" not in target or "http" in target:
            continue
        anchor["href"] = target.replace(".ipynb", ".html")

    for image in document.find_all("img"):
        source = image.get("src", "")
        if "http" not in source and "assets" in source:
            # The real code would check the asset path here; the benchmark
            # deliberately skips filesystem work and only pays for the lookup.
            pass

    return str(document)


def process_lxml(text):
    """Rewrite notebook links in ``text`` using lxml directly.

    Args:
        text: HTML markup to parse with ``lxml.html.fromstring``.

    Returns:
        The document serialized as a unicode string with an explicit
        ``<!DOCTYPE html>`` doctype, where every ``<a>`` whose ``href``
        contains ``.ipynb`` and does not contain ``http`` has ``.ipynb``
        replaced by ``.html``.
    """
    root = lhtml.fromstring(text)

    for anchor in root.iter("a"):
        target = anchor.get("href", "")
        # Leave anything that is not a notebook link, or that looks absolute.
        if ".ipynb" not in target or "http" in target:
            continue
        anchor.set("href", target.replace(".ipynb", ".html"))

    for image in root.iter("img"):
        source = image.get("src", "")
        if "http" not in source and "assets" in source:
            # Placeholder for the asset-path check; no filesystem access here.
            pass

    return lhtml.tostring(root, doctype="<!DOCTYPE html>", encoding="unicode")


def run(label, fn, html, n=200):
    """Time ``fn(html)`` over ``n`` calls and print a one-line summary.

    Args:
        label: Name shown in the report, left-aligned in a 30-character column.
        fn: Callable taking the HTML string as its only argument.
        html: Input passed unchanged to every call of ``fn``.
        n: Number of timed calls handed to ``timeit.timeit``.

    Returns:
        None. The total seconds and milliseconds per call are printed.
    """

    def invoke():
        return fn(html)

    elapsed = timeit.timeit(invoke, number=n)
    per_iter_ms = elapsed / n * 1000
    print(f"{label:<30} {n} iters  {elapsed:.3f}s  ({per_iter_ms:.2f} ms/iter)")


if __name__ == "__main__":
    # Build one shared input so every variant parses exactly the same markup.
    sample_html = make_html(n_links=50, n_imgs=20)
    print(f"HTML size: {len(sample_html):,} bytes\n")

    # (report label, callable taking the HTML string) for each variant,
    # listed in the order they are timed and printed.
    benchmarks = (
        ("html.parser (BeautifulSoup)", lambda h: process_bs4(h, "html.parser")),
        ("lxml (BeautifulSoup)", lambda h: process_bs4(h, "lxml")),
        ("lxml (pure)", process_lxml),
    )
    for label, variant in benchmarks:
        run(label, variant, sample_html)

@hoxbro
Copy link
Copy Markdown
Collaborator Author

hoxbro commented Apr 29, 2026

We already used it here:

soup = bs4.BeautifulSoup(r.content, features='lxml')

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant