Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ MarkItDown currently supports the conversion from:
- Excel
- Images (EXIF metadata and OCR)
- Audio (EXIF metadata and speech transcription)
- HTML
- HTML and web pages, including WeChat public account articles
- Text-based formats (CSV, JSON, XML)
- ZIP files (iterates over contents)
- YouTube URLs
Expand Down Expand Up @@ -88,6 +88,19 @@ You can also pipe content:
cat path-to-file.pdf | markitdown
```

### Web Pages

MarkItDown can convert web pages directly:

```bash
markitdown "https://example.com/article" -o article.md
```

For WeChat public account articles (`mp.weixin.qq.com/s/...`), MarkItDown focuses on
the article title, account metadata, cover image, body content, and in-article images
while omitting common platform chrome such as reader prompts, QR/follow overlays,
related-reading lists, and social action labels.

### Optional Dependencies
MarkItDown has optional dependencies for activating various file formats. Earlier in this document, we installed all optional dependencies with the `[all]` option. However, you can also install them individually for more control. For example:

Expand Down
17 changes: 16 additions & 1 deletion packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from .converters import (
PlainTextConverter,
HtmlConverter,
WeChatConverter,
RssConverter,
WikipediaConverter,
YouTubeConverter,
Expand Down Expand Up @@ -58,6 +59,16 @@
PRIORITY_GENERIC_FILE_FORMAT = (
10.0 # Near catch-all converters for mimetypes like text/*, etc.
)
# Matches WeChat public-account article URLs in both the short form
# (https://mp.weixin.qq.com/s/<id>) and the long query-string form
# (https://mp.weixin.qq.com/s?__biz=...). Anchored at the start so that a
# WeChat URL embedded inside another URL's query string does not match.
WECHAT_ARTICLE_URL_RE = re.compile(r"^https?://mp\.weixin\.qq\.com/s[/?]")
# Browser-like request headers used for URLs matching WECHAT_ARTICLE_URL_RE.
# NOTE(review): presumably needed because the site serves different content to
# non-browser clients -- confirm against live behavior.
WECHAT_ARTICLE_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Referer": "https://mp.weixin.qq.com/",
}


_plugins: Union[None, List[Any]] = None # If None, plugins have not been loaded yet.
Expand Down Expand Up @@ -188,6 +199,7 @@ def enable_builtins(self, **kwargs) -> None:
self.register_converter(
HtmlConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT
)
self.register_converter(WeChatConverter())
self.register_converter(RssConverter())
self.register_converter(WikipediaConverter())
self.register_converter(YouTubeConverter())
Expand Down Expand Up @@ -472,7 +484,10 @@ def convert_uri(
)
# HTTP/HTTPS URIs
elif uri.startswith("http:") or uri.startswith("https:"):
response = self._requests_session.get(uri, stream=True)
request_kwargs: Dict[str, Any] = {"stream": True}
if WECHAT_ARTICLE_URL_RE.search(uri):
request_kwargs["headers"] = WECHAT_ARTICLE_HEADERS
response = self._requests_session.get(uri, **request_kwargs)
response.raise_for_status()
return self.convert_response(
response,
Expand Down
2 changes: 2 additions & 0 deletions packages/markitdown/src/markitdown/converters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from ._plain_text_converter import PlainTextConverter
from ._html_converter import HtmlConverter
from ._wechat_converter import WeChatConverter
from ._rss_converter import RssConverter
from ._wikipedia_converter import WikipediaConverter
from ._youtube_converter import YouTubeConverter
Expand Down Expand Up @@ -31,6 +32,7 @@
__all__ = [
"PlainTextConverter",
"HtmlConverter",
"WeChatConverter",
"RssConverter",
"WikipediaConverter",
"YouTubeConverter",
Expand Down
253 changes: 253 additions & 0 deletions packages/markitdown/src/markitdown/converters/_wechat_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,253 @@
import re
from typing import Any, BinaryIO

from bs4 import BeautifulSoup, Tag

from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from ._markdownify import _CustomMarkdownify

# MIME type prefixes that mark a stream as HTML. They are matched with
# ``str.startswith`` against the lower-cased mimetype.
ACCEPTED_MIME_TYPE_PREFIXES = ["text/html", "application/xhtml"]

# File extensions (leading dot included) that mark a stream as HTML. They are
# matched by membership against the lower-cased extension.
ACCEPTED_FILE_EXTENSIONS = [".html", ".htm"]


class WeChatConverter(DocumentConverter):
    """Handle WeChat public account articles, focusing on the article content.

    The converter locates the ``#js_content`` element, strips a fixed set of
    platform widgets and notices from it, and emits Markdown consisting of an
    optional cover image, the title, a one-line metadata summary, and the
    cleaned article body.
    """

    # Article URLs come in a short form (``/s/<id>``) and a long query-string
    # form (``/s?__biz=...``); both are accepted.
    _ARTICLE_URL_RE = re.compile(r"^https?://mp\.weixin\.qq\.com/s[/?]")
    # Markup fingerprints used to sniff article pages that lack a matching URL.
    _CONTENT_ID_RE = re.compile(r"id=[\"']js_content[\"']")
    _ARTICLE_HINT_RE = re.compile(r"id=[\"'](?:activity-name|img-content)[\"']")

    # CSS selectors for widgets removed from the article body.
    _CHROME_SELECTORS = (
        ".novel-card",
        ".qr_code_pc",
        ".wx_follow_context",
        ".wx_stream_article_slide_tip",
        ".weui-dialog",
        ".article-tag__error-tips",
        "[id^='js_minipro_dialog']",
        "[id^='js_pc_qr_code']",
    )
    # A block whose text contains one of these phrases is removed.
    _NOTICE_MARKERS = (
        "因微信平台规则调整",
        "建议您将本公众号设为星标",
        "预览时标签不可点",
    )
    # The first top-level child containing one of these phrases, and
    # everything after it, is removed.
    _TRAILER_MARKERS = (
        "点击下方“阅读原文”",
        '点击下方"阅读原文"',
        "相关阅读",
    )
    # Attributes that may carry an image's deferred source, in lookup order.
    _LAZY_IMAGE_ATTRS = ("data-src", "data-original", "data-lazy-src")

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """Return True when the stream looks like a WeChat article.

        A stream is accepted outright when its URL matches the article URL
        pattern. Otherwise it must be HTML (by extension or mimetype) and its
        first 64 KiB must contain ``id="js_content"`` together with at least
        one further article fingerprint. The stream position is restored
        before returning.
        """
        if self._ARTICLE_URL_RE.match(stream_info.url or ""):
            return True

        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()
        is_html = extension in ACCEPTED_FILE_EXTENSIONS or any(
            mimetype.startswith(prefix) for prefix in ACCEPTED_MIME_TYPE_PREFIXES
        )
        if not is_html:
            return False

        # Peek at the head of the stream without consuming it.
        cur_pos = file_stream.tell()
        try:
            preview = file_stream.read(65536)
        finally:
            file_stream.seek(cur_pos)

        if not isinstance(preview, bytes):
            return False
        # errors="ignore" tolerates a multi-byte sequence cut at the 64 KiB edge.
        preview_text = preview.decode("utf-8", errors="ignore")

        if self._CONTENT_ID_RE.search(preview_text) is None:
            return False
        return (
            "rich_media_content" in preview_text
            or self._ARTICLE_HINT_RE.search(preview_text) is not None
        )

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """Convert a WeChat article page to Markdown.

        Args:
            file_stream: Binary stream holding the HTML document.
            stream_info: Stream metadata; ``charset`` selects the decoding
                (UTF-8 when unset).
            **kwargs: Forwarded to ``_CustomMarkdownify``.

        Returns:
            A ``DocumentConverterResult`` with the Markdown text and the
            article title (``None`` when no title was found).

        Raises:
            ValueError: If the document has no ``#js_content`` element.
        """
        encoding = "utf-8" if stream_info.charset is None else stream_info.charset
        soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
        content_source = soup.find(id="js_content")
        if not isinstance(content_source, Tag):
            raise ValueError("Could not find WeChat article content.")

        # Drop non-content elements everywhere, including inside the body.
        for tag in soup(["script", "style", "noscript", "template", "iframe", "form"]):
            tag.extract()

        title = self._text(soup.find(id="activity-name")) or self._meta_content(
            soup, 'meta[property="og:title"]', 'meta[name="twitter:title"]'
        )
        author = self._text(soup.find(id="js_author_name"))
        account = self._text(soup.find(id="js_name"))
        publish_time = self._text(soup.find(id="publish_time"))
        cover_url = self._meta_content(
            soup, 'meta[property="og:image"]', 'meta[name="twitter:image"]'
        )

        # Work on a detached copy of the article body.
        content_soup = BeautifulSoup(str(content_source), "html.parser")
        content = content_soup.find(id="js_content") or content_soup

        self._remove_elements_by_selector(content, self._CHROME_SELECTORS)
        self._remove_blocks_containing_text(content, self._NOTICE_MARKERS)
        self._truncate_children_from_markers(content, self._TRAILER_MARKERS)
        self._normalize_lazy_images(content)

        # Assemble a minimal document: cover image, heading, metadata, body.
        article_soup = BeautifulSoup(
            "<!doctype html><html><head></head><body><article></article></body></html>",
            "html.parser",
        )
        article = article_soup.article
        assert article is not None

        if title:
            title_tag = article_soup.new_tag("title")
            title_tag.string = title
            assert article_soup.head is not None
            article_soup.head.append(title_tag)

        if cover_url:
            cover = article_soup.new_tag("img", src=cover_url)
            cover["alt"] = "cover_image"
            article.append(cover)

        if title:
            h1 = article_soup.new_tag("h1")
            h1.string = title
            article.append(h1)

        metadata = self._metadata(
            author=author, account=account, publish_time=publish_time
        )
        if metadata:
            metadata_tag = article_soup.new_tag("p")
            metadata_tag.string = metadata
            article.append(metadata_tag)

        # Serialize and re-parse the cleaned body before moving its children.
        body_fragment = BeautifulSoup(str(content), "html.parser")
        body = body_fragment.find(id="js_content") or body_fragment
        # Snapshot the children: append() moves each node out of ``body``.
        for child in list(body.children):
            article.append(child)

        webpage_text = _CustomMarkdownify(**kwargs).convert_soup(article).strip()

        return DocumentConverterResult(
            markdown=webpage_text,
            title=title,
        )

    def _text(self, element: Tag | None) -> str | None:
        """Return the element's whitespace-normalized text, or None if empty."""
        if element is None:
            return None
        return element.get_text(" ", strip=True) or None

    def _meta_content(self, soup: BeautifulSoup, *selectors: str) -> str | None:
        """Return the first non-blank ``content`` attribute among ``selectors``.

        Selectors are tried in order. An element whose ``content`` is missing
        or whitespace-only is skipped, so a later selector can still supply a
        value.
        """
        for selector in selectors:
            element = soup.select_one(selector)
            if element is None:
                continue
            value = str(element.get("content") or "").strip()
            if value:
                return value
        return None

    def _metadata(
        self, *, author: str | None, account: str | None, publish_time: str | None
    ) -> str | None:
        """Build the ``Author | Account | Published`` line, or None if empty.

        The account is omitted when it equals the author.
        """
        parts = []
        if author:
            parts.append(f"Author: {author}")
        if account and account != author:
            parts.append(f"Account: {account}")
        if publish_time:
            parts.append(f"Published: {publish_time}")
        return " | ".join(parts) or None

    def _remove_elements_by_selector(
        self, root: Tag | BeautifulSoup, selectors: tuple[str, ...]
    ) -> None:
        """Detach every descendant of ``root`` matching any CSS selector."""
        for selector in selectors:
            for element in root.select(selector):
                element.extract()

    def _remove_blocks_containing_text(
        self, root: Tag | BeautifulSoup, markers: tuple[str, ...]
    ) -> None:
        """Detach the enclosing block of every text node containing a marker.

        Blocks are collected first and removed afterwards so the tree is not
        mutated while it is being searched.
        """
        matches: list[Tag] = []
        for node in root.find_all(string=True):
            text = str(node)
            if not any(marker in text for marker in markers):
                continue
            block = self._nearest_small_block(root, node.parent)
            # Compare by identity: bs4 tags compare equal when their markup is
            # equal, so ``in`` would skip a second, identical notice block.
            if block is not None and not any(block is seen for seen in matches):
                matches.append(block)
        for block in matches:
            block.extract()

    def _nearest_small_block(
        self, root: Tag | BeautifulSoup, element: Tag | None
    ) -> Tag | None:
        """Return the closest block-level ancestor of ``element`` below ``root``.

        The search starts at ``element`` itself. A ``p``/``li``/``blockquote``
        anywhere on the path is preferred over a ``section``/``div``. Returns
        None when neither kind is found before reaching ``root``.
        """
        for names in ({"p", "li", "blockquote"}, {"section", "div"}):
            current = element
            while current is not None and current is not root:
                if current.name in names:
                    return current
                parent = current.parent
                current = parent if isinstance(parent, Tag) else None
        return None

    def _truncate_children_from_markers(
        self, root: Tag | BeautifulSoup, markers: tuple[str, ...]
    ) -> None:
        """Detach the first marker-bearing child of ``root`` and all that follow.

        Only direct ``Tag`` children are inspected for markers. Once one
        matches, it and every later child are removed, including bare text
        nodes, so no stray text survives past the cut.
        """
        children = list(root.children)
        for index, child in enumerate(children):
            if not isinstance(child, Tag):
                continue
            text = child.get_text(" ", strip=True)
            if any(marker in text for marker in markers):
                for trailing in children[index:]:
                    trailing.extract()
                return

    def _normalize_lazy_images(self, root: Tag | BeautifulSoup) -> None:
        """Promote deferred image sources to ``src`` and drop the lazy attributes.

        A missing or empty ``src`` is filled from the first available lazy
        attribute. A ``data:`` URI in ``src`` is treated as a lazy-loading
        placeholder and is likewise replaced, so the real image URL is not
        lost when the lazy attributes are deleted.
        """
        for image in root.find_all("img"):
            data_src = (
                image.get("data-src")
                or image.get("data-original")
                or image.get("data-lazy-src")
            )
            src = image.get("src")
            if data_src and (not src or str(src).startswith("data:")):
                image["src"] = data_src
            for attr in self._LAZY_IMAGE_ATTRS:
                if image.has_attr(attr):
                    del image[attr]
26 changes: 26 additions & 0 deletions packages/markitdown/tests/_test_vectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,32 @@ class FileTestVector(object):
"move to sidebar",
],
),
FileTestVector(
filename="test_wechat_article.html",
mimetype="text/html",
charset="utf-8",
url="https://mp.weixin.qq.com/s/test-wechat-article",
must_include=[
"# 示例微信文章",
"Author: 作者甲 | Account: 示例公众号 | Published: 2026-05-25",
"这是第一段正文,应当保留。",
"![配图](https://example.com/article-image.jpg)",
"这是第二段正文,也应当保留。",
],
must_not_include=[
"在小说阅读器读本章",
"因微信平台规则调整",
"点击下方",
"相关阅读",
"另一篇微信文章",
"微信扫一扫",
"继续滑动看下一个",
"取消",
"允许",
"留言",
"收藏",
],
),
FileTestVector(
filename="test_serp.html",
mimetype="text/html",
Expand Down
Loading