Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
833 changes: 484 additions & 349 deletions markdownify/__init__.py

Large diffs are not rendered by default.

19 changes: 6 additions & 13 deletions markdownify/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,9 @@ RSTRIP: str
STRIP: str
STRIP_ONE: str


def markdownify(
html: str,
autolinks: bool = ...,
bs4_options: str = ...,
bullets: str = ...,
code_language: str = ...,
code_language_callback: Union[Callable[[Incomplete], Union[str, None]], None] = ...,
Expand All @@ -41,15 +39,15 @@ def markdownify(
wrap_width: int = ...,
) -> str: ...


class MarkdownConverter:
def __init__(
self,
autolinks: bool = ...,
bs4_options: str = ...,
bullets: str = ...,
code_language: str = ...,
code_language_callback: Union[Callable[[Incomplete], Union[str, None]], None] = ...,
code_language_callback: Union[
Callable[[Incomplete], Union[str, None]], None
] = ...,
convert: Union[list[str], None] = ...,
default_title: bool = ...,
escape_asterisks: bool = ...,
Expand All @@ -67,11 +65,6 @@ class MarkdownConverter:
table_infer_header: bool = ...,
wrap: bool = ...,
wrap_width: int = ...,
) -> None:
...

def convert(self, html: str) -> str:
...

def convert_soup(self, soup: Incomplete) -> str:
...
) -> None: ...
def convert(self, html: str) -> str: ...
def convert_soup(self, soup: Incomplete) -> str: ...
197 changes: 130 additions & 67 deletions markdownify/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,82 +3,145 @@
import argparse
import sys

from markdownify import markdownify, ATX, ATX_CLOSED, UNDERLINED, \
SPACES, BACKSLASH, ASTERISK, UNDERSCORE
from markdownify import (
markdownify,
ATX,
ATX_CLOSED,
UNDERLINED,
SPACES,
BACKSLASH,
ASTERISK,
UNDERSCORE,
)


def main(argv=None):
    """Run the ``markdownify`` command-line interface.

    Builds the argument parser, parses ``argv``, reads the selected HTML
    input and prints the result of ``markdownify`` to stdout.

    Args:
        argv: Command-line arguments without the program name. ``None``
            (the default) lets argparse read ``sys.argv[1:]`` when the
            function is called, instead of freezing the value at import
            time as a ``sys.argv[1:]`` default would.
    """
    parser = argparse.ArgumentParser(
        prog="markdownify",
        description="Converts html to markdown.",
    )

    parser.add_argument(
        "html",
        nargs="?",
        type=argparse.FileType("r"),
        default=sys.stdin,
        help="The html file to convert. Defaults to STDIN if not provided.",
    )
    parser.add_argument(
        "-s",
        "--strip",
        nargs="*",
        help="A list of tags to strip. This option can't be used with "
        "the --convert option.",
    )
    parser.add_argument(
        "-c",
        "--convert",
        nargs="*",
        help="A list of tags to convert. This option can't be used with "
        "the --strip option.",
    )
    parser.add_argument(
        "-a",
        "--autolinks",
        action="store_true",
        help="A boolean indicating whether the 'automatic link' style "
        "should be used when a 'a' tag's contents match its href.",
    )
    # The help text describes a flag that *enables* the behaviour, so the
    # flag must store True when present (it previously used store_false,
    # which switched the option off when the flag was given).
    parser.add_argument(
        "--default-title",
        action="store_true",
        help="A boolean to enable setting the title of a link to its "
        "href, if no title is given.",
    )
    parser.add_argument(
        "--heading-style",
        default=UNDERLINED,
        choices=(ATX, ATX_CLOSED, UNDERLINED),
        help="Defines how headings should be converted.",
    )
    parser.add_argument(
        "-b",
        "--bullets",
        default="*+-",
        help="A string of bullet styles to use; the bullet will "
        "alternate based on nesting level.",
    )
    parser.add_argument(
        "--strong-em-symbol",
        default=ASTERISK,
        choices=(ASTERISK, UNDERSCORE),
        help="Use * or _ to convert strong and italics text",
    )
    parser.add_argument(
        "--sub-symbol", default="", help="Define the chars that surround '<sub>'."
    )
    parser.add_argument(
        "--sup-symbol", default="", help="Define the chars that surround '<sup>'."
    )
    parser.add_argument(
        "--newline-style",
        default=SPACES,
        choices=(SPACES, BACKSLASH),
        help="Defines the style of <br> conversions: two spaces "
        "or backslash at the end of the line that should break.",
    )
    parser.add_argument(
        "--code-language",
        default="",
        help="Defines the language that should be assumed for all '<pre>' sections.",
    )
    parser.add_argument(
        "--no-escape-asterisks",
        dest="escape_asterisks",
        action="store_false",
        help="Do not escape '*' to '\\*' in text.",
    )
    parser.add_argument(
        "--no-escape-underscores",
        dest="escape_underscores",
        action="store_false",
        help="Do not escape '_' to '\\_' in text.",
    )
    parser.add_argument(
        "-i",
        "--keep-inline-images-in",
        default=[],
        nargs="*",
        help="Images are converted to their alt-text when the images are "
        "located inside headlines or table cells. If some inline images "
        "should be converted to markdown images instead, this option can "
        "be set to a list of parent tags that should be allowed to "
        "contain inline images.",
    )
    parser.add_argument(
        "--table-infer-header",
        dest="table_infer_header",
        action="store_true",
        help="When a table has no header row (as indicated by '<thead>' "
        "or '<th>'), use the first body row as the header row.",
    )
    parser.add_argument(
        "-w",
        "--wrap",
        action="store_true",
        help="Wrap all text paragraphs at --wrap-width characters.",
    )
    parser.add_argument("--wrap-width", type=int, default=80)
    # NOTE(review): the type stubs no longer list a ``bs4_options`` keyword,
    # yet this value is still forwarded to markdownify() below -- confirm
    # whether the option should be kept, ignored or removed.
    parser.add_argument(
        "--bs4-options",
        default="html.parser",
        help="Specifies the parser that BeautifulSoup should use to parse "
        "the HTML markup. Examples include 'html5.parser', 'lxml', and "
        "'html5lib'.",
    )

    options = vars(parser.parse_args(argv))

    # argparse hands back an open file object for ``html``; markdownify is
    # typed to take the markup as ``str``, so read it here and release the
    # handle (stdin is left open for the caller).
    source = options.pop("html")
    try:
        html = source.read()
    finally:
        if source is not sys.stdin:
            source.close()

    print(markdownify(html, **options))


# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
5 changes: 2 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "markdownify"
version = "1.2.2"
version = "2.0.0"
authors = [{name = "Matthew Tretter", email = "m@tthewwithanm.com"}]
description = "Convert HTML to markdown."
readme = "README.rst"
Expand All @@ -23,8 +23,7 @@ classifiers = [
"Topic :: Utilities",
]
dependencies = [
"beautifulsoup4>=4.9,<5",
"six>=1.15,<2"
"selectolax>0.4"
]

[project.urls]
Expand Down
4 changes: 2 additions & 2 deletions tests/test_custom_converter.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from markdownify import MarkdownConverter
from bs4 import BeautifulSoup
from selectolax.lexbor import LexborHTMLParser


class UnitTestConverter(MarkdownConverter):
Expand Down Expand Up @@ -40,5 +40,5 @@ def md(html, **options):

def test_soup():
    """convert_soup() accepts a document that has already been parsed."""
    html = '<b>test</b>'
    soup = LexborHTMLParser(html)
    assert MarkdownConverter().convert_soup(soup) == '**test**'
2 changes: 0 additions & 2 deletions tests/test_escaping.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import warnings
from bs4 import MarkupResemblesLocatorWarning
from .utils import md


Expand Down Expand Up @@ -32,7 +31,6 @@ def test_single_escaping_entities():

def test_misc():
# ignore the bs4 warning that "1.2" or "*" looks like a filename
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

assert md('\\*', escape_misc=True) == r'\\\*'
assert md('&lt;foo>', escape_misc=True) == r'\<foo\>'
Expand Down
7 changes: 3 additions & 4 deletions tests/types.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from markdownify import markdownify, ASTERISK, BACKSLASH, LSTRIP, RSTRIP, SPACES, STRIP, UNDERLINED, UNDERSCORE, MarkdownConverter
from bs4 import BeautifulSoup
from selectolax.lexbor import LexborHTMLParser, LexborNode
from typing import Union

markdownify("<p>Hello</p>") == "Hello" # test default of STRIP
Expand All @@ -11,7 +11,6 @@
# default options
MarkdownConverter(
autolinks=True,
bs4_options='html.parser',
bullets='*+-',
code_language='',
code_language_callback=None,
Expand Down Expand Up @@ -55,11 +54,11 @@
).convert("")

html = '<b>test</b>'
soup = BeautifulSoup(html, 'html.parser')
soup = LexborHTMLParser(html)
MarkdownConverter().convert_soup(soup) == '**test**'


def callback(el: LexborNode) -> Union[str, None]:
    """Return the first CSS class of ``el``, or ``None`` if it has none.

    ``LexborNode`` exposes attributes through its ``attributes`` mapping,
    where ``class`` is a single whitespace-separated string (or ``None``),
    not a list as with the bs4 ``Tag`` API.
    """
    classes = (el.attributes.get('class') or '').split()
    return classes[0] if classes else None


Expand Down
2 changes: 1 addition & 1 deletion tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

# for unit testing, disable document-level stripping by default so that
# separation newlines are included in testing
def md(html: str, **options) -> str:
    """Convert ``html`` to Markdown with test-friendly defaults.

    ``strip_document`` defaults to ``None`` here; any keyword passed by the
    caller (including ``strip_document`` itself) overrides that default.
    """
    options = {"strip_document": None, **options}

    return MarkdownConverter(**options).convert(html)