Switch to markdown-it (#595)

Fixes #537, fixes #586
2025-06-03 22:41:20 +01:00
parent 98f27364f2
commit 8ed86b53ca
9 changed files with 301 additions and 223 deletions
--- a/app/init.py
+++ b/app/init.py
@@ -21,13 +21,12 @@ import redis
 from flask import redirect, url_for, render_template, flash, request, Flask, send_from_directory, make_response, render_template_string
 from flask_babel import Babel, gettext
 from flask_flatpages import FlatPages
-from flask_flatpages.utils import pygmented_markdown
 from flask_github import GitHub
 from flask_login import logout_user, current_user, LoginManager
 from flask_mail import Mail
 from flask_wtf.csrf import CSRFProtect

-from app.markdown import init_markdown, MARKDOWN_EXTENSIONS, MARKDOWN_EXTENSION_CONFIG
+from app.markdown import init_markdown, render_markdown

 import sentry_sdk
 from sentry_sdk.integrations.flask import FlaskIntegration
@@ -67,13 +66,11 @@ app = Flask(__name__, static_folder="public/static")
 def my_flatpage_renderer(text):
 	# Render with jinja first
 	prerendered_body = render_template_string(text)
-	return pygmented_markdown(prerendered_body, flatpages=pages)
+	return render_markdown(prerendered_body)


 app.config["FLATPAGES_ROOT"] = "flatpages"
 app.config["FLATPAGES_EXTENSION"] = ".md"
-app.config["FLATPAGES_MARKDOWN_EXTENSIONS"] = MARKDOWN_EXTENSIONS
-app.config["FLATPAGES_EXTENSION_CONFIG"] = MARKDOWN_EXTENSION_CONFIG
 app.config["FLATPAGES_HTML_RENDERER"] = my_flatpage_renderer
 app.config["WTF_CSRF_TIME_LIMIT"] = None

--- a/app/blueprints/threads/init.py
+++ b/app/blueprints/threads/init.py
@@ -29,7 +29,7 @@ from app.models import Package, db, User, Permission, Thread, UserRank, AuditSev
 from app.utils import add_notification, is_yes, add_audit_log, get_system_user, has_blocked_domains, \
 	normalize_line_endings
 from flask_wtf import FlaskForm
-from wtforms import StringField, TextAreaField, SubmitField, BooleanField
+from wtforms import StringField, TextAreaField, SubmitField
 from wtforms.validators import InputRequired, Length
 from app.utils import get_int_or_abort

--- a/app/markdown.py
+++ b/app/markdown.py
@@ -1,214 +0,0 @@
-# ContentDB
-# Copyright (C) rubenwardy
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU Affero General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with this program.  If not, see <https://www.gnu.org/licenses/>.
-
-from functools import partial
-from urllib.parse import urljoin
-
-import bleach
-from bleach import Cleaner
-from bleach.linkifier import LinkifyFilter
-from bs4 import BeautifulSoup
-from markdown import Markdown
-from flask import url_for
-from jinja2.utils import markupsafe
-from markdown.extensions import Extension
-from markdown.inlinepatterns import SimpleTagInlineProcessor
-from markdown.inlinepatterns import Pattern
-from markdown.extensions.codehilite import CodeHiliteExtension
-from xml.etree import ElementTree
-
-# Based on
-# https://github.com/Wenzil/mdx_bleach/blob/master/mdx_bleach/whitelist.py
-#
-# License: MIT
-
-ALLOWED_TAGS = {
-	"h1", "h2", "h3", "h4", "h5", "h6", "hr",
-	"ul", "ol", "li",
-	"p",
-	"br",
-	"pre",
-	"code",
-	"blockquote",
-	"strong",
-	"em",
-	"a",
-	"img",
-	"table", "thead", "tbody", "tr", "th", "td",
-	"div", "span", "del", "s",
-	"details",
-	"summary",
-}
-
-ALLOWED_CSS = [
-	"highlight", "codehilite",
-	"hll", "c", "err", "g", "k", "l", "n", "o", "x", "p", "ch", "cm", "cp", "cpf", "c1", "cs",
-	"gd", "ge", "gr", "gh", "gi", "go", "gp", "gs", "gu", "gt", "kc", "kd", "kn", "kp", "kr",
-	"kt", "ld", "m", "s", "na", "nb", "nc", "no", "nd", "ni", "ne", "nf", "nl", "nn", "nx",
-	"py", "nt", "nv", "ow", "w", "mb", "mf", "mh", "mi", "mo", "sa", "sb", "sc", "dl", "sd",
-	"s2", "se", "sh", "si", "sx", "sr", "s1", "ss", "bp", "fm", "vc", "vg", "vi", "vm", "il",
-]
-
-
-def allow_class(_tag, name, value):
-	return name == "class" and value in ALLOWED_CSS
-
-
-ALLOWED_ATTRIBUTES = {
-	"h1": ["id"],
-	"h2": ["id"],
-	"h3": ["id"],
-	"h4": ["id"],
-	"a": ["href", "title", "data-username"],
-	"img": ["src", "title", "alt"],
-	"code": allow_class,
-	"div": allow_class,
-	"span": allow_class,
-	"table": ["id"],
-}
-
-ALLOWED_PROTOCOLS = {"http", "https", "mailto"}
-
-md = None
-
-
-def linker_callback(attrs, new=False):
-	if new:
-		text = attrs.get("_text")
-		if not (text.startswith("http://") or text.startswith("https://")):
-			return None
-	return attrs
-
-
-def render_markdown(source):
-	html = md.convert(source)
-
-	cleaner = Cleaner(
-		tags=ALLOWED_TAGS,
-		attributes=ALLOWED_ATTRIBUTES,
-		protocols=ALLOWED_PROTOCOLS,
-		filters=[partial(LinkifyFilter,
-				callbacks=[linker_callback] + bleach.linkifier.DEFAULT_CALLBACKS,
-				skip_tags={"pre", "code"})])
-	return cleaner.clean(html)
-
-
-class DelInsExtension(Extension):
-	def extendMarkdown(self, md):
-		del_proc = SimpleTagInlineProcessor(r"(\~\~)(.+?)(\~\~)", "del")
-		md.inlinePatterns.register(del_proc, "del", 200)
-
-		ins_proc = SimpleTagInlineProcessor(r"(\+\+)(.+?)(\+\+)", "ins")
-		md.inlinePatterns.register(ins_proc, "ins", 200)
-
-
-RE_PARTS = dict(
-	USER=r"[A-Za-z0-9._-]*\b",
-	REPO=r"[A-Za-z0-9_]+\b"
-)
-
-
-class MentionPattern(Pattern):
-	ANCESTOR_EXCLUDES = ("a",)
-
-	def __init__(self, config, md):
-		MENTION_RE = r"(@({USER})(?:\/({REPO}))?)".format(**RE_PARTS)
-		super(MentionPattern, self).__init__(MENTION_RE, md)
-		self.config = config
-
-	def handleMatch(self, m):
-		from app.models import User
-
-		label = m.group(2)
-		user = m.group(3)
-		package_name = m.group(4)
-		if package_name:
-			el = ElementTree.Element("a")
-			el.text = label
-			el.set("href", url_for("packages.view", author=user, name=package_name))
-			return el
-		else:
-			if User.query.filter_by(username=user).count() == 0:
-				return None
-
-			el = ElementTree.Element("a")
-			el.text = label
-			el.set("href", url_for("users.profile", username=user))
-			el.set("data-username", user)
-			return el
-
-
-class MentionExtension(Extension):
-	def __init__(self, *args, **kwargs):
-		super(MentionExtension, self).__init__(*args, **kwargs)
-
-	def extendMarkdown(self, md):
-		md.ESCAPED_CHARS.append("@")
-		md.inlinePatterns.register(MentionPattern(self.getConfigs(), md), "mention", 20)
-
-
-MARKDOWN_EXTENSIONS = ["fenced_code", "tables", CodeHiliteExtension(guess_lang=False), "toc", DelInsExtension(), MentionExtension()]
-MARKDOWN_EXTENSION_CONFIG = {
-	"fenced_code": {},
-	"tables": {}
-}
-
-
-def init_markdown(app):
-	global md
-
-	md = Markdown(extensions=MARKDOWN_EXTENSIONS,
-			extension_configs=MARKDOWN_EXTENSION_CONFIG,
-			output_format="html")
-
-	@app.template_filter()
-	def markdown(source):
-		return markupsafe.Markup(render_markdown(source))
-
-
-def get_headings(html: str):
-	soup = BeautifulSoup(html, "html.parser")
-	headings = soup.find_all(["h1", "h2", "h3"])
-
-	root = []
-	stack = []
-	for heading in headings:
-		this = {"link": heading.get("id") or "", "text": heading.text, "children": []}
-		this_level = int(heading.name[1:]) - 1
-
-		while this_level <= len(stack):
-			stack.pop()
-
-		if len(stack) > 0:
-			stack[-1]["children"].append(this)
-		else:
-			root.append(this)
-
-		stack.append(this)
-
-	return root
-
-
-def get_user_mentions(html: str) -> set:
-	soup = BeautifulSoup(html, "html.parser")
-	links = soup.select("a[data-username]")
-	return set([x.get("data-username") for x in links])
-
-
-def get_links(html: str, url: str) -> set:
-	soup = BeautifulSoup(html, "html.parser")
-	links = soup.select("a[href]")
-	return set([urljoin(url, x.get("href")) for x in links])
--- a/app/markdown/init.py
+++ b/app/markdown/init.py
@@ -0,0 +1,106 @@
+# ContentDB
+# Copyright (C) rubenwardy
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+from typing import Sequence
+from urllib.parse import urljoin
+from bs4 import BeautifulSoup
+from jinja2.utils import markupsafe
+from markdown_it import MarkdownIt
+from markdown_it.common.utils import unescapeAll, escapeHtml
+from markdown_it.token import Token
+from markdown_it.presets import gfm_like
+from pygments import highlight
+from pygments.lexers import get_lexer_by_name
+from pygments.formatters.html import HtmlFormatter
+
+from .cleaner import clean_html
+from .mention import init_mention
+
+
+def highlight_code(code, name, attrs):
+	if name == "":
+		return None
+
+	lexer = get_lexer_by_name(name)
+	formatter = HtmlFormatter()
+
+	return highlight(code, lexer, formatter)
+
+
+def render_code(self, tokens: Sequence[Token], idx, options, env):
+	token = tokens[idx]
+	info = unescapeAll(token.info).strip() if token.info else ""
+	langName = info.split(maxsplit=1)[0] if info else ""
+
+	if options.highlight:
+		return options.highlight(
+			token.content, langName, ""
+		) or f"<pre><code>{escapeHtml(token.content)}</code></pre>"
+
+	return f"<pre><code>{escapeHtml(token.content)}</code></pre>"
+
+
+
+gfm_like.make()
+md = MarkdownIt("gfm-like", {"highlight": highlight_code})
+md.add_render_rule("fence", render_code)
+init_mention(md)
+
+
+def render_markdown(source):
+	html = md.render(source)
+	return clean_html(html)
+
+
+def init_markdown(app):
+	@app.template_filter()
+	def markdown(source):
+		return markupsafe.Markup(render_markdown(source))
+
+
+def get_headings(html: str):
+	soup = BeautifulSoup(html, "html.parser")
+	headings = soup.find_all(["h1", "h2", "h3"])
+
+	root = []
+	stack = []
+	for heading in headings:
+		this = {"link": heading.get("id") or "", "text": heading.text, "children": []}
+		this_level = int(heading.name[1:]) - 1
+
+		while this_level <= len(stack):
+			stack.pop()
+
+		if len(stack) > 0:
+			stack[-1]["children"].append(this)
+		else:
+			root.append(this)
+
+		stack.append(this)
+
+	return root
+
+
+def get_user_mentions(html: str) -> set:
+	soup = BeautifulSoup(html, "html.parser")
+	links = soup.select("a[data-username]")
+	return set([x.get("data-username") for x in links])
+
+
+def get_links(html: str, url: str) -> set:
+	soup = BeautifulSoup(html, "html.parser")
+	links = soup.select("a[href]")
+	return set([urljoin(url, x.get("href")) for x in links])
--- a/app/markdown/cleaner.py
+++ b/app/markdown/cleaner.py
@@ -0,0 +1,78 @@
+# ContentDB
+# Copyright (C) rubenwardy
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+from bleach import Cleaner
+
+
+# Based on
+# https://github.com/Wenzil/mdx_bleach/blob/master/mdx_bleach/whitelist.py
+#
+# License: MIT
+
+ALLOWED_TAGS = {
+	"h1", "h2", "h3", "h4", "h5", "h6", "hr",
+	"ul", "ol", "li",
+	"p",
+	"br",
+	"pre",
+	"code",
+	"blockquote",
+	"strong",
+	"em",
+	"a",
+	"img",
+	"table", "thead", "tbody", "tr", "th", "td",
+	"div", "span", "del", "s",
+	"details",
+	"summary",
+}
+
+ALLOWED_CSS = [
+	"highlight", "codehilite",
+	"hll", "c", "err", "g", "k", "l", "n", "o", "x", "p", "ch", "cm", "cp", "cpf", "c1", "cs",
+	"gd", "ge", "gr", "gh", "gi", "go", "gp", "gs", "gu", "gt", "kc", "kd", "kn", "kp", "kr",
+	"kt", "ld", "m", "s", "na", "nb", "nc", "no", "nd", "ni", "ne", "nf", "nl", "nn", "nx",
+	"py", "nt", "nv", "ow", "w", "mb", "mf", "mh", "mi", "mo", "sa", "sb", "sc", "dl", "sd",
+	"s2", "se", "sh", "si", "sx", "sr", "s1", "ss", "bp", "fm", "vc", "vg", "vi", "vm", "il",
+]
+
+
+def allow_class(_tag, name, value):
+	return name == "class" and value in ALLOWED_CSS
+
+
+ALLOWED_ATTRIBUTES = {
+	"h1": ["id"],
+	"h2": ["id"],
+	"h3": ["id"],
+	"h4": ["id"],
+	"a": ["href", "title", "data-username"],
+	"img": ["src", "title", "alt"],
+	"code": allow_class,
+	"div": allow_class,
+	"span": allow_class,
+	"table": ["id"],
+}
+
+ALLOWED_PROTOCOLS = {"http", "https", "mailto"}
+
+
+def clean_html(html: str):
+	cleaner = Cleaner(
+		tags=ALLOWED_TAGS,
+		attributes=ALLOWED_ATTRIBUTES,
+		protocols=ALLOWED_PROTOCOLS)
+	return cleaner.clean(html)
--- a/app/markdown/mention.py
+++ b/app/markdown/mention.py
@@ -0,0 +1,109 @@
+# ContentDB
+# Copyright (C) rubenwardy
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+import re
+
+from flask import url_for
+from markdown_it import MarkdownIt
+from markdown_it.token import Token
+from markdown_it.rules_core.state_core import StateCore
+from typing import Sequence, List
+
+
+def render_user_mention(self, tokens: Sequence[Token], idx, options, env):
+	token = tokens[idx]
+	username = token.content
+	url = url_for("users.profile", username=username)
+	return f"<a href=\"{url}\" data-username=\"{username}\">@{username}</a>"
+
+
+def render_package_mention(self, tokens: Sequence[Token], idx, options, env):
+	token = tokens[idx]
+	username = token.content
+	name = token.attrs["name"]
+	url = url_for("packages.view", author=username, name=name)
+	return f"<a href=\"{url}\">@{username}/{name}</a>"
+
+
+def parse_mentions(state: StateCore):
+	for block_token in state.tokens:
+		if block_token.type != "inline" or block_token.children is None:
+			continue
+
+		link_depth = 0
+		html_link_depth = 0
+
+		children = []
+		for token in block_token.children:
+			if token.type == "link_open":
+				link_depth += 1
+			elif token.type == "link_close":
+				link_depth -= 1
+			elif token.type == "html_inline":
+				# is link open / close?
+				pass
+
+			if link_depth > 0 or html_link_depth > 0 or token.type != "text":
+				children.append(token)
+			else:
+				children.extend(split_tokens(token, state))
+
+		block_token.children = children
+
+
+RE_PARTS = dict(
+	USER=r"[A-Za-z0-9._-]*\b",
+	NAME=r"[A-Za-z0-9_]+\b"
+)
+MENTION_RE = r"(@({USER})(?:\/({NAME}))?)".format(**RE_PARTS)
+
+
+def split_tokens(token: Token, state: StateCore) -> List[Token]:
+	tokens = []
+	content = token.content
+	pos = 0
+	for match in re.finditer(MENTION_RE, content):
+		username = match.group(2)
+		package_name = match.group(3)
+		(start, end) = match.span(0)
+
+		if start > pos:
+			token_text = Token("text", "", 0)
+			token_text.content = content[pos:start]
+			token_text.level = token.level
+			tokens.append(token_text)
+
+		mention = Token("package_mention" if package_name else "user_mention", "", 0)
+		mention.content = username
+		mention.attrSet("name", package_name)
+		mention.level = token.level
+		tokens.append(mention)
+
+		pos = end
+
+	if pos < len(content):
+		token_text = Token("text", "", 0)
+		token_text.content = content[pos:]
+		token_text.level = token.level
+		tokens.append(token_text)
+
+	return tokens
+
+
+def init_mention(md: MarkdownIt):
+	md.add_render_rule("user_mention", render_user_mention, "html")
+	md.add_render_rule("package_mention", render_package_mention, "html")
+	md.core.ruler.after("inline", "mention", parse_mentions)
--- a/app/template_filters.py
+++ b/app/template_filters.py
@@ -23,7 +23,7 @@ from flask_login import current_user
 from markupsafe import Markup

 from . import app, utils
-from .markdown import get_headings
+from app.markdown import get_headings
 from .models import Permission, Package, PackageState, PackageRelease
 from .utils import abs_url_for, url_set_query, url_set_anchor, url_current
 from .utils.minetest_hypertext import normalize_whitespace as do_normalize_whitespace
--- a/requirements.lock.txt
+++ b/requirements.lock.txt
@@ -40,7 +40,8 @@ kombu==5.3.7
 libsass==0.23.0
 lxml==5.2.2
 Mako==1.3.5
-Markdown==3.6
+markdown-it-py==3.0.0
+linkify-it-py==2.0.3
 MarkupSafe==2.1.5
 packaging==24.0
 passlib==1.7.4
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,7 +10,8 @@ GitHub-Flask
 SQLAlchemy-Searchable

 bcrypt
-markdown
+markdown-it-py
+linkify-it-py
 bleach
 passlib