Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,11 @@ autolinks
A boolean indicating whether the "automatic link" style should be used when
an ``a`` tag's contents match its href. Defaults to ``True``.

base_url
A base URL to use for resolving relative URLs. When specified, relative URLs
in the HTML will be converted to absolute URLs using this base. Defaults to
no base URL.

default_title
A boolean to enable setting the title of a link to its href, if no title is
given. Defaults to ``False``.
Expand Down
20 changes: 20 additions & 0 deletions markdownify/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from textwrap import fill
import re
import six
import urllib.parse


# General-purpose regex patterns
Expand Down Expand Up @@ -176,6 +177,7 @@ def _next_block_content_sibling(el):
class MarkdownConverter(object):
class DefaultOptions:
autolinks = True
base_url = ''
bs4_options = 'html.parser'
bullets = '*+-' # An iterable of bullet types.
code_language = ''
Expand Down Expand Up @@ -435,6 +437,20 @@ def underline(self, text, pad_char):
text = (text or '').rstrip()
return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''

def _make_absolute(self, url):
"""Convert a URL to absolute using base_url if it's not already absolute."""
base_url = self.options['base_url']

# Check if URLs to join actually exist
if not url or not base_url:
return url

# Check if URL is already absolute
if urllib.parse.urlparse(url).netloc:
return url

return urllib.parse.urljoin(base_url, url)

def convert_a(self, el, text, parent_tags):
if '_noformat' in parent_tags:
return text
Expand All @@ -453,6 +469,7 @@ def convert_a(self, el, text, parent_tags):
if self.options['default_title'] and not title:
title = href
title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
href = self._make_absolute(href)
return '%s[%s](%s%s)%s' % (prefix, text, href, title_part, suffix) if href else text

convert_b = abstract_inline_conversion(lambda self: 2 * self.options['strong_em_symbol'])
Expand Down Expand Up @@ -588,6 +605,7 @@ def convert_img(self, el, text, parent_tags):
and el.parent.name not in self.options['keep_inline_images_in']):
return alt

src = self._make_absolute(src)
return '![%s](%s%s)' % (alt, src, title_part)

def convert_video(self, el, text, parent_tags):
Expand All @@ -600,6 +618,8 @@ def convert_video(self, el, text, parent_tags):
if sources:
src = sources[0].attrs.get('src', None) or ''
poster = el.attrs.get('poster', None) or ''
src = self._make_absolute(src)
poster = self._make_absolute(poster)
if src and poster:
return '[![%s](%s)](%s)' % (text, poster, src)
if src:
Expand Down
2 changes: 2 additions & 0 deletions markdownify/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ STRIP_ONE: str
def markdownify(
html: str,
autolinks: bool = ...,
base_url: str = ...,
bs4_options: str = ...,
bullets: str = ...,
code_language: str = ...,
Expand Down Expand Up @@ -46,6 +47,7 @@ class MarkdownConverter:
def __init__(
self,
autolinks: bool = ...,
base_url: str = ...,
bs4_options: str = ...,
bullets: str = ...,
code_language: str = ...,
Expand Down
7 changes: 7 additions & 0 deletions tests/test_conversions.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,3 +374,10 @@ def test_spaces():
assert md(' <ol> <li> x </li> <li> y </li> </ol> ') == '\n\n1. x\n2. y\n'
assert md(' <ul> <li> x </li> <li> y </li> </ol> ') == '\n\n* x\n* y\n'
assert md('test <pre> foo </pre> bar') == 'test\n\n```\n foo\n```\n\nbar'


def test_a_with_base_url():
    """Check how ``<a>`` hrefs are rendered with and without ``base_url``."""
    # (html, extra options passed to md, expected markdown)
    cases = [
        # Root-relative href is joined onto the base host.
        ('<a href="/path">Link</a>',
         {'base_url': 'https://example.com'},
         '[Link](https://example.com/path)'),
        # Parent-directory href is resolved relative to the base path.
        ('<a href="../other">Link</a>',
         {'base_url': 'https://example.com/page/'},
         '[Link](https://example.com/other)'),
        # An already-absolute href is left alone.
        ('<a href="https://other.com">Link</a>',
         {'base_url': 'https://example.com'},
         '[Link](https://other.com)'),
        # Without a base_url the href passes through unchanged.
        ('<a href="/path">Link</a>',
         {},
         '[Link](/path)'),
    ]
    for html, options, expected in cases:
        assert md(html, **options) == expected