Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,18 @@ html_string = md.render("some *Markdown*")
.. autofunction:: mdit_py_plugins.front_matter.front_matter_plugin
```

## GFM (GitHub Flavored Markdown)

```{eval-rst}
.. autofunction:: mdit_py_plugins.gfm.gfm_plugin
```

## GFM Autolinks

```{eval-rst}
.. autofunction:: mdit_py_plugins.gfm_autolink.gfm_autolink_plugin
```

## Footnotes

```{eval-rst}
Expand Down
102 changes: 102 additions & 0 deletions mdit_py_plugins/gfm/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
"""Composite GFM (GitHub Flavored Markdown) plugin.

Enables a set of plugins that together approximate GitHub's Markdown rendering:

- Tables (built-in)
- Strikethrough with single and double tildes (built-in)
- Autolinks (gfm_autolink plugin)
- Task lists (built-in, markdown-it-py >= 4.1.0)
- Alerts (built-in, markdown-it-py >= 4.1.0)
- Footnotes (``[^label]`` references and definitions)

Optional extras:

- Dollar math (``$...$`` / ``$$...$$``)
- Front matter (YAML)

.. note::
Tag filtering (disallowed raw HTML tags) is not yet implemented.

.. seealso::
- `GitHub Flavored Markdown Spec <https://github.github.com/gfm/>`__
- `GitHub basic formatting syntax
<https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax>`__

.. versionadded:: 0.5.0

Requires markdown-it-py >= 4.1.0.
"""

from __future__ import annotations

from functools import lru_cache

from markdown_it import MarkdownIt
from markdown_it import __version__ as _mdit_version

from mdit_py_plugins.dollarmath import dollarmath_plugin
from mdit_py_plugins.footnote import footnote_plugin
from mdit_py_plugins.front_matter import front_matter_plugin
from mdit_py_plugins.gfm_autolink import gfm_autolink_plugin

__all__ = ("gfm_plugin",)

# Oldest markdown-it-py release accepted by ``gfm_plugin``; the installed
# version is compared against this tuple each time the plugin is applied.
_MIN_VERSION = (4, 1, 0)


@lru_cache(maxsize=8)
def _parse_version(v: str) -> tuple[int, ...]:
"""Parse a version string like '4.1.0' into a tuple of ints."""
return tuple(int(x) for x in v.split(".")[:3])


def gfm_plugin(
    md: MarkdownIt,
    *,
    dollarmath: bool = False,
    front_matter: bool = False,
    tasklists_editable: bool = False,
) -> None:
    """Configure *md* for GFM-like rendering.

    Whatever configuration the parser already has is kept; the GFM
    components are switched on in addition to it.

    :param md: The parser instance to configure (modified in place).
    :param dollarmath: Enable dollar-delimited math (``$...$``, ``$$...$$``).
    :param front_matter: Enable YAML front matter (``---``).
    :param tasklists_editable: If True, rendered task list checkboxes are not
        disabled (i.e. they are interactive).
    :raises RuntimeError: If the installed markdown-it-py is older than
        ``_MIN_VERSION``.
    """
    installed = _parse_version(_mdit_version)
    if installed < _MIN_VERSION:
        raise RuntimeError(
            f"gfm_plugin requires markdown-it-py >= {'.'.join(str(x) for x in _MIN_VERSION)} "
            f"(installed: {_mdit_version})"
        )

    # Rules that ship with markdown-it-py itself: tables and strikethrough.
    for rule_name in ("table", "strikethrough"):
        md.enable(rule_name)

    # Option flags understood by markdown-it-py >= 4.1.0.
    gfm_options = (
        ("tasklists", True),
        ("tasklists_editable", tasklists_editable),
        ("alerts", True),
        ("strikethrough_single_tilde", True),
    )
    for option_name, option_value in gfm_options:
        md.options[option_name] = option_value

    # GFM autolinks
    md.use(gfm_autolink_plugin)

    # Footnotes; the inline form ^[...] is not part of GFM, so it stays off.
    md.use(footnote_plugin, inline=False)

    # Optional: dollar math (inline $...$ and block $$...$$)
    if dollarmath:
        md.use(dollarmath_plugin, allow_blank_lines=False)

    # TODO: Tag filter — replace leading `<` with `&lt;` for disallowed raw
    # HTML tags: <title>, <textarea>, <style>, <xmp>, <iframe>, <noembed>,
    # <noframes>, <script>, <plaintext>.
    # See https://github.github.com/gfm/#disallowed-raw-html-extension-

    # Optional: YAML front matter
    if front_matter:
        md.use(front_matter_plugin)
15 changes: 15 additions & 0 deletions mdit_py_plugins/gfm_autolink/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
"""GFM autolink extension plugin for markdown-it-py.

Implements the `GFM autolink extension
<https://github.github.com/gfm/#autolinks-extension->`_,
which recognises bare URLs (``http://``, ``https://``, ``www.``),
protocol links (``mailto:``, ``xmpp:``),
and bare email addresses without requiring angle brackets.

Ported from the Rust crate
`markdown_it_autolink <https://github.com/markdown-it-rust/markdown-it-plugins.rs>`_.
"""

from .index import gfm_autolink_plugin

__all__ = ("gfm_autolink_plugin",)
250 changes: 250 additions & 0 deletions mdit_py_plugins/gfm_autolink/_match.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
"""URL / email matching helpers for the GFM autolink extension.

Ported from the Rust ``gfm_autolinks`` crate.
"""

from __future__ import annotations

import unicodedata

# ---------------------------------------------------------------------------
# Character classification helpers
# ---------------------------------------------------------------------------

# Characters that may sit immediately before an autolink.
_VALID_PREV_CHARS = frozenset({" ", "\t", "\r", "\n", "*", "_", "~", "("})


def check_prev(ch: str) -> bool:
    """Tell whether an autolink may start right after the character *ch*.

    :param ch: the character preceding the candidate link.
    :return: ``True`` when *ch* is one of the accepted preceding characters.
    """
    is_allowed = ch in _VALID_PREV_CHARS
    return is_allowed


def _is_valid_hostchar(ch: str) -> bool:
"""Return ``True`` if *ch* is valid inside a domain label (not whitespace/punctuation)."""
if ch.isspace():
return False
cat = unicodedata.category(ch)
# Unicode punctuation categories: Pc, Pd, Pe, Pf, Pi, Po, Ps
return not cat.startswith("P")


# Characters that terminate a URL (before autolink_delim trimming).
_SPACE_CHARS = frozenset(" \t\r\n\x00\x0b\x0c")


def _isspace(ch: str) -> bool:
return ch in _SPACE_CHARS


_LINK_END_ASSORTMENT = frozenset("?!.,:*_~'\"[]")


def _autolink_delim(data: str, link_end: int) -> int:
"""Trim trailing punctuation from a URL according to GFM rules."""
# Truncate at first '<'
for i, ch in enumerate(data[:link_end]):
if ch == "<":
link_end = i
break

while link_end > 0:
cclose = data[link_end - 1]

copen = "(" if cclose == ")" else None

if cclose in _LINK_END_ASSORTMENT:
link_end -= 1
elif cclose == ";":
new_end = link_end - 2
while new_end > 0 and data[new_end].isalpha():
new_end -= 1
if new_end < link_end - 2 and data[new_end] == "&":
link_end = new_end
else:
link_end -= 1
elif copen is not None:
opening = data[:link_end].count(copen)
closing = data[:link_end].count(cclose)
if closing <= opening:
break
link_end -= 1
else:
break

return link_end


# ---------------------------------------------------------------------------
# Domain validation
# ---------------------------------------------------------------------------


def _check_domain(data: str, allow_short: bool) -> int | None:
"""Validate a domain name and return the length consumed, or ``None``."""
if not data:
return None

np = 0
uscore1 = 0
uscore2 = 0

for i, ch in enumerate(data):
if ch == "_":
uscore2 += 1
elif ch == ".":
uscore1 = uscore2
uscore2 = 0
np += 1
elif not _is_valid_hostchar(ch) and ch != "-":
if uscore1 == 0 and uscore2 == 0 and (allow_short or np > 0):
return i
return None
# else: valid hostchar or '-'

if (uscore1 > 0 or uscore2 > 0) and np <= 10:
return None
if allow_short or np > 0:
return len(data)
return None


# ---------------------------------------------------------------------------
# www matching
# ---------------------------------------------------------------------------

# ASCII punctuation accepted, alongside ASCII alphanumerics, in the local part
# of an email address (consulted by ``match_any_email``).
_EMAIL_OK = frozenset(".+-_")


def match_www(text: str) -> tuple[str, int] | None:
"""Match a bare ``www.`` URL at the start of *text*.

Returns ``(url_with_scheme, char_count)`` or ``None``.
"""
if not text.startswith("www."):
return None

link_end = _check_domain(text[4:], False)
if link_end is None:
return None
# link_end is offset from position 4
link_end += 4

# extend to the end of non-space characters
while link_end < len(text) and not _isspace(text[link_end]):
link_end += 1

link_end = _autolink_delim(text, link_end)

matched = text[:link_end]
url = "http://" + matched
return url, len(matched)


# ---------------------------------------------------------------------------
# http(s):// matching
# ---------------------------------------------------------------------------


def match_http(text: str) -> tuple[str, int] | None:
"""Match an ``http://`` or ``https://`` URL at the start of *text*.

Returns ``(url, char_count)`` or ``None``.
"""
if text.startswith("http://"):
prefix_len = 7
elif text.startswith("https://"):
prefix_len = 8
else:
return None

link_end = _check_domain(text[prefix_len:], True)
if link_end is None:
return None
link_end += prefix_len

while link_end < len(text) and not _isspace(text[link_end]):
link_end += 1

link_end = _autolink_delim(text, link_end)

url = text[:link_end]
return url, len(url)


# ---------------------------------------------------------------------------
# Email matching
# ---------------------------------------------------------------------------


def match_email(text: str) -> tuple[str, int] | None:
    """Match an email address at the start of *text*.

    The address may be bare or introduced by ``mailto:`` / ``xmpp:``; the
    actual scanning is delegated to ``match_any_email``, which is told where
    the local part begins and which protocol (if any) was found.
    """
    known_prefixes = (("mailto:", "mailto"), ("xmpp:", "xmpp"))
    for prefix, protocol in known_prefixes:
        if text.startswith(prefix):
            return match_any_email(text, len(prefix), protocol)

    # No protocol prefix: a bare address starting at offset 0.
    return match_any_email(text, 0, None)


def match_any_email(
text: str, pos: int, protocol: str | None
) -> tuple[str, int] | None:
"""Match an email address in *text* starting the local-part scan at *pos*.

*protocol* is ``"mailto"``, ``"xmpp"``, or ``None`` (bare address).
Returns ``(url, char_count)`` or ``None``.
"""
size = len(text)

# scan local part (before @)
start_pos = pos
while pos < size:
ch = text[pos]
if ch.isascii() and (ch.isalnum() or ch in _EMAIL_OK):
pos += 1
continue
if ch == "@":
break
return None

if pos == start_pos:
return None

# scan domain (after @)
link_end = pos + 1
np = 0
num_slash = 0

while link_end < size:
ch = text[link_end]
if ch.isascii() and ch.isalnum():
pass
elif ch == "@":
if protocol != "xmpp":
return None
elif (
ch == "."
and link_end < size - 1
and text[link_end + 1].isascii()
and text[link_end + 1].isalnum()
):
np += 1
elif ch == "/" and protocol == "xmpp" and num_slash == 0:
num_slash += 1
elif ch != "-" and ch != "_":
break
link_end += 1

if link_end < 2 or np == 0:
return None
last_ch = text[link_end - 1]
if not (last_ch.isascii() and last_ch.isalpha()) and last_ch != ".":
return None

url = "mailto:" + text[:link_end] if protocol is None else text[:link_end]

return url, link_end
Loading
Loading