microsoft · Lego1997 · May 25, 2026
diff --git a/README.md b/README.md
@@ -17,7 +17,7 @@ MarkItDown currently supports the conversion from:
 - Excel
 - Images (EXIF metadata and OCR)
 - Audio (EXIF metadata and speech transcription)
-- HTML
+- HTML and web pages, including WeChat public account articles
 - Text-based formats (CSV, JSON, XML)
 - ZIP files (iterates over contents)
 - Youtube URLs
@@ -88,6 +88,19 @@ You can also pipe content:
 cat path-to-file.pdf | markitdown
 ```
 
+### Web Pages
+
+MarkItDown can convert web pages directly:
+
+```bash
+markitdown "https://example.com/article" -o article.md
+```
+
+For WeChat public account articles (`mp.weixin.qq.com/s/...`), MarkItDown focuses on
+the article title, account metadata, cover image, body content, and in-article images
+while omitting common platform chrome such as reader prompts, QR/follow overlays,
+related-reading lists, and social action labels.
+
 ### Optional Dependencies
 MarkItDown has optional dependencies for activating various file formats. Earlier in this document, we installed all optional dependencies with the `[all]` option. However, you can also install them individually for more control. For example:
 

diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
@@ -22,6 +22,7 @@
 from .converters import (
     PlainTextConverter,
     HtmlConverter,
+    WeChatConverter,
     RssConverter,
     WikipediaConverter,
     YouTubeConverter,
@@ -58,6 +59,16 @@
 PRIORITY_GENERIC_FILE_FORMAT = (
     10.0  # Near catch-all converters for mimetypes like text/*, etc.
 )
+WECHAT_ARTICLE_URL_RE = re.compile(r"^https?://mp\.weixin\.qq\.com/s[/?]")  # /s/<id> or /s?__biz=...
+WECHAT_ARTICLE_HEADERS = {
+    "User-Agent": (
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/123.0.0.0 Safari/537.36"
+    ),
+    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+    "Referer": "https://mp.weixin.qq.com/",
+}
 
 
 _plugins: Union[None, List[Any]] = None  # If None, plugins have not been loaded yet.
@@ -188,6 +199,7 @@ def enable_builtins(self, **kwargs) -> None:
             self.register_converter(
                 HtmlConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT
             )
+            self.register_converter(WeChatConverter())
             self.register_converter(RssConverter())
             self.register_converter(WikipediaConverter())
             self.register_converter(YouTubeConverter())
@@ -472,7 +484,10 @@ def convert_uri(
             )
         # HTTP/HTTPS URIs
         elif uri.startswith("http:") or uri.startswith("https:"):
-            response = self._requests_session.get(uri, stream=True)
+            request_kwargs: Dict[str, Any] = {"stream": True}
+            if WECHAT_ARTICLE_URL_RE.search(uri):
+                request_kwargs["headers"] = WECHAT_ARTICLE_HEADERS
+            response = self._requests_session.get(uri, **request_kwargs)
             response.raise_for_status()
             return self.convert_response(
                 response,

diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py
@@ -4,6 +4,7 @@
 
 from ._plain_text_converter import PlainTextConverter
 from ._html_converter import HtmlConverter
+from ._wechat_converter import WeChatConverter
 from ._rss_converter import RssConverter
 from ._wikipedia_converter import WikipediaConverter
 from ._youtube_converter import YouTubeConverter
@@ -31,6 +32,7 @@
 __all__ = [
     "PlainTextConverter",
     "HtmlConverter",
+    "WeChatConverter",
     "RssConverter",
     "WikipediaConverter",
     "YouTubeConverter",

diff --git a/packages/markitdown/src/markitdown/converters/_wechat_converter.py b/packages/markitdown/src/markitdown/converters/_wechat_converter.py
@@ -0,0 +1,253 @@
+import re
+from typing import Any, BinaryIO
+
+from bs4 import BeautifulSoup, Tag
+
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
+from ._markdownify import _CustomMarkdownify
+
+ACCEPTED_MIME_TYPE_PREFIXES = [  # MIME type prefixes accepts() treats as HTML
+    "text/html",
+    "application/xhtml",
+]
+# File extensions accepts() treats as HTML.
+ACCEPTED_FILE_EXTENSIONS = [
+    ".html",
+    ".htm",
+]
+
+
+class WeChatConverter(DocumentConverter):
+    """Handle WeChat public account articles, focusing on the article content."""
+
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        """Accept WeChat article URLs, or HTML whose markup looks like an article."""
+        url = stream_info.url or ""
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+        # Article links come as the short /s/<id> or the long /s?__biz=... form.
+        if re.search(r"^https?://mp\.weixin\.qq\.com/s[/?]", url):
+            return True
+
+        is_html = extension in ACCEPTED_FILE_EXTENSIONS or any(
+            mimetype.startswith(prefix) for prefix in ACCEPTED_MIME_TYPE_PREFIXES
+        )
+        if not is_html:
+            return False
+        # Peek at the head of the stream, then rewind so its position is unchanged.
+        cur_pos = file_stream.tell()
+        try:
+            preview = file_stream.read(65536)
+        finally:
+            file_stream.seek(cur_pos)
+        if not isinstance(preview, bytes):
+            return False
+        preview_text = preview.decode("utf-8", errors="ignore")
+
+        return bool(
+            re.search(r"id=[\"']js_content[\"']", preview_text)
+            and (
+                "rich_media_content" in preview_text
+                or re.search(r"id=[\"']activity-name[\"']", preview_text)
+                or re.search(r"id=[\"']img-content[\"']", preview_text)
+            )
+        )
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
+        encoding = "utf-8" if stream_info.charset is None else stream_info.charset
+        soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
+        content_source = soup.find(id="js_content")
+        if not isinstance(content_source, Tag):
+            raise ValueError("Could not find WeChat article content.")
+        # Strip non-content tags from the whole page, the article body included.
+        for tag in soup(["script", "style", "noscript", "template", "iframe", "form"]):
+            tag.extract()
+        # Collect title, byline fields and cover URL from the full page.
+        title = self._text(soup.find(id="activity-name")) or self._meta_content(
+            soup, 'meta[property="og:title"]', 'meta[name="twitter:title"]'
+        )
+        author = self._text(soup.find(id="js_author_name"))
+        account = self._text(soup.find(id="js_name"))
+        publish_time = self._text(soup.find(id="publish_time"))
+        cover_url = self._meta_content(
+            soup, 'meta[property="og:image"]', 'meta[name="twitter:image"]'
+        )
+        # Re-parse the body by itself; the clean-up below edits this separate tree.
+        content_soup = BeautifulSoup(str(content_source), "html.parser")
+        content = content_soup.find(id="js_content") or content_soup
+
+        self._remove_elements_by_selector(
+            content,
+            (
+                ".novel-card",
+                ".qr_code_pc",
+                ".wx_follow_context",
+                ".wx_stream_article_slide_tip",
+                ".weui-dialog",
+                ".article-tag__error-tips",
+                "[id^='js_minipro_dialog']",
+                "[id^='js_pc_qr_code']",
+            ),
+        )
+        self._remove_blocks_containing_text(
+            content,
+            (
+                "因微信平台规则调整",  # "Due to WeChat platform rule changes"
+                "建议您将本公众号设为星标",  # "We suggest starring this account"
+                "预览时标签不可点",  # "Tags are not clickable in preview"
+            ),
+        )
+        self._truncate_children_from_markers(
+            content,
+            (
+                "点击下方“阅读原文”",  # 'Tap "Read the original" below' (curly quotes)
+                '点击下方"阅读原文"',  # same phrase with straight quotes
+                "相关阅读",  # "Related reading"
+            ),
+        )
+        self._normalize_lazy_images(content)
+        # Assemble a minimal document; only its <article> element is rendered below.
+        article_soup = BeautifulSoup(
+            "<!doctype html><html><head></head><body><article></article></body></html>",
+            "html.parser",
+        )
+        article = article_soup.article
+        assert article is not None
+        # NOTE(review): this <title> lives in <head>, outside the rendered <article>.
+        if title:
+            title_tag = article_soup.new_tag("title")
+            title_tag.string = title
+            assert article_soup.head is not None
+            article_soup.head.append(title_tag)
+
+        if cover_url:
+            cover = article_soup.new_tag("img", src=cover_url)
+            cover["alt"] = "cover_image"
+            article.append(cover)
+
+        if title:
+            h1 = article_soup.new_tag("h1")
+            h1.string = title
+            article.append(h1)
+
+        metadata = self._metadata(
+            author=author, account=account, publish_time=publish_time
+        )
+        if metadata:
+            metadata_tag = article_soup.new_tag("p")
+            metadata_tag.string = metadata
+            article.append(metadata_tag)
+        # Move the cleaned body's children in after the cover, heading and byline.
+        body_fragment = BeautifulSoup(str(content), "html.parser")
+        body = body_fragment.find(id="js_content") or body_fragment
+        for child in list(body.children):
+            article.append(child)
+
+        webpage_text = _CustomMarkdownify(**kwargs).convert_soup(article).strip()
+
+        return DocumentConverterResult(
+            markdown=webpage_text,
+            title=title,
+        )
+
+    def _text(self, element: Tag | None) -> str | None:
+        """Return the element's space-joined, stripped text; None if absent or empty."""
+        if not element:
+            return None
+        return element.get_text(" ", strip=True) or None
+
+    def _meta_content(self, soup: BeautifulSoup, *selectors: str) -> str | None:
+        """Return the stripped ``content`` of the first match that has one, else None."""
+        for css in selectors:
+            if (found := soup.select_one(css)) and found.get("content"):
+                return str(found["content"]).strip()
+        return None
+
+    def _metadata(
+        self, *, author: str | None, account: str | None, publish_time: str | None
+    ) -> str | None:
+        """Build the ``Author | Account | Published`` byline, or None if empty."""
+        fields = []
+        if author:
+            fields.append(f"Author: {author}")
+        if account and account != author:  # skip when it merely repeats the author
+            fields.append(f"Account: {account}")
+        if publish_time:
+            fields.append(f"Published: {publish_time}")
+        # Each entry carries a distinct prefix, so the entries are already unique.
+        return " | ".join(fields) or None
+
+    def _remove_elements_by_selector(
+        self, root: Tag | BeautifulSoup, selectors: tuple[str, ...]
+    ) -> None:
+        """Detach every descendant of ``root`` matching any of ``selectors``."""
+        for css in selectors:
+            [unwanted.extract() for unwanted in root.select(css)]
+
+    def _remove_blocks_containing_text(
+        self, root: Tag | BeautifulSoup, markers: tuple[str, ...]
+    ) -> None:
+        """Detach the nearest small block around each text node with a marker."""
+        matches: dict[int, Tag] = {}  # by id(): bs4 "==" compares markup, not identity
+        for node in root.find_all(string=True):
+            if any(marker in str(node) for marker in markers):
+                block = self._nearest_small_block(root, node.parent)
+                if block is not None:
+                    matches.setdefault(id(block), block)
+        for block in matches.values():
+            block.extract()
+
+    def _nearest_small_block(
+        self, root: Tag | BeautifulSoup, element: Tag | None
+    ) -> Tag | None:
+        """Find the closest enclosing text block below ``root``.
+
+        Ancestors (starting at ``element``) are searched first for p/li/blockquote
+        and only then for section/div; ``root`` itself is never returned.
+        """
+        for wanted in ({"p", "li", "blockquote"}, {"section", "div"}):
+            cursor = element
+            while cursor and cursor is not root:
+                if cursor.name in wanted:
+                    return cursor
+                cursor = cursor.parent if isinstance(cursor.parent, Tag) else None
+        return None
+
+    def _truncate_children_from_markers(
+        self, root: Tag | BeautifulSoup, markers: tuple[str, ...]
+    ) -> None:
+        """Drop the first direct child tag whose text has a marker, plus later tags.
+
+        A marker nested anywhere inside a child counts, so a wrapper child goes whole.
+        """
+        tags = [child for child in root.children if isinstance(child, Tag)]
+        for start, tag in enumerate(tags):
+            label = tag.get_text(" ", strip=True)
+            if any(marker in label for marker in markers):
+                for doomed in tags[start:]:
+                    doomed.extract()
+                return
+
+    def _normalize_lazy_images(self, root: Tag | BeautifulSoup) -> None:
+        """Promote lazy-load URLs to ``src`` and drop the ``data-*`` source attrs."""
+        lazy_attrs = ("data-src", "data-original", "data-lazy-src")
+        for image in root.find_all("img"):
+            lazy_src = next((image[a] for a in lazy_attrs if image.get(a)), None)
+            current = str(image.get("src") or "").strip()
+            # An inline ``data:`` src is a lazy-load placeholder, not the picture.
+            if lazy_src and (not current or current.startswith("data:")):
+                image["src"] = lazy_src
+            for attr in lazy_attrs:
+                if image.has_attr(attr):
+                    del image[attr]
diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py
@@ -123,6 +123,32 @@ class FileTestVector(object):
             "move to sidebar",
         ],
     ),
+    FileTestVector(
+        filename="test_wechat_article.html",
+        mimetype="text/html",
+        charset="utf-8",
+        url="https://mp.weixin.qq.com/s/test-wechat-article",
+        must_include=[
+            "# 示例微信文章",
+            "Author: 作者甲 | Account: 示例公众号 | Published: 2026-05-25",
+            "这是第一段正文，应当保留。",
+            "![配图](https://example.com/article-image.jpg)",
+            "这是第二段正文，也应当保留。",
+        ],
+        must_not_include=[
+            "在小说阅读器读本章",
+            "因微信平台规则调整",
+            "点击下方",
+            "相关阅读",
+            "另一篇微信文章",
+            "微信扫一扫",
+            "继续滑动看下一个",
+            "取消",
+            "允许",
+            "留言",
+            "收藏",
+        ],
+    ),
     FileTestVector(
         filename="test_serp.html",
         mimetype="text/html",