Switch to markdown-it (#595)

Fixes #537, fixes #586
This commit is contained in:
rubenwardy
2025-06-03 22:41:20 +01:00
committed by GitHub
parent 98f27364f2
commit 8ed86b53ca
9 changed files with 301 additions and 223 deletions

View File

@@ -21,13 +21,12 @@ import redis
from flask import redirect, url_for, render_template, flash, request, Flask, send_from_directory, make_response, render_template_string
from flask_babel import Babel, gettext
from flask_flatpages import FlatPages
from flask_flatpages.utils import pygmented_markdown
from flask_github import GitHub
from flask_login import logout_user, current_user, LoginManager
from flask_mail import Mail
from flask_wtf.csrf import CSRFProtect
from app.markdown import init_markdown, MARKDOWN_EXTENSIONS, MARKDOWN_EXTENSION_CONFIG
from app.markdown import init_markdown, render_markdown
import sentry_sdk
from sentry_sdk.integrations.flask import FlaskIntegration
@@ -67,13 +66,11 @@ app = Flask(__name__, static_folder="public/static")
def my_flatpage_renderer(text):
# Render with jinja first
prerendered_body = render_template_string(text)
return pygmented_markdown(prerendered_body, flatpages=pages)
return render_markdown(prerendered_body)
app.config["FLATPAGES_ROOT"] = "flatpages"
app.config["FLATPAGES_EXTENSION"] = ".md"
app.config["FLATPAGES_MARKDOWN_EXTENSIONS"] = MARKDOWN_EXTENSIONS
app.config["FLATPAGES_EXTENSION_CONFIG"] = MARKDOWN_EXTENSION_CONFIG
app.config["FLATPAGES_HTML_RENDERER"] = my_flatpage_renderer
app.config["WTF_CSRF_TIME_LIMIT"] = None

View File

@@ -29,7 +29,7 @@ from app.models import Package, db, User, Permission, Thread, UserRank, AuditSev
from app.utils import add_notification, is_yes, add_audit_log, get_system_user, has_blocked_domains, \
normalize_line_endings
from flask_wtf import FlaskForm
from wtforms import StringField, TextAreaField, SubmitField, BooleanField
from wtforms import StringField, TextAreaField, SubmitField
from wtforms.validators import InputRequired, Length
from app.utils import get_int_or_abort

View File

@@ -1,214 +0,0 @@
# ContentDB
# Copyright (C) rubenwardy
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
from functools import partial
from urllib.parse import urljoin
import bleach
from bleach import Cleaner
from bleach.linkifier import LinkifyFilter
from bs4 import BeautifulSoup
from markdown import Markdown
from flask import url_for
from jinja2.utils import markupsafe
from markdown.extensions import Extension
from markdown.inlinepatterns import SimpleTagInlineProcessor
from markdown.inlinepatterns import Pattern
from markdown.extensions.codehilite import CodeHiliteExtension
from xml.etree import ElementTree
# Based on
# https://github.com/Wenzil/mdx_bleach/blob/master/mdx_bleach/whitelist.py
#
# License: MIT
ALLOWED_TAGS = {
"h1", "h2", "h3", "h4", "h5", "h6", "hr",
"ul", "ol", "li",
"p",
"br",
"pre",
"code",
"blockquote",
"strong",
"em",
"a",
"img",
"table", "thead", "tbody", "tr", "th", "td",
"div", "span", "del", "s",
"details",
"summary",
}
ALLOWED_CSS = [
"highlight", "codehilite",
"hll", "c", "err", "g", "k", "l", "n", "o", "x", "p", "ch", "cm", "cp", "cpf", "c1", "cs",
"gd", "ge", "gr", "gh", "gi", "go", "gp", "gs", "gu", "gt", "kc", "kd", "kn", "kp", "kr",
"kt", "ld", "m", "s", "na", "nb", "nc", "no", "nd", "ni", "ne", "nf", "nl", "nn", "nx",
"py", "nt", "nv", "ow", "w", "mb", "mf", "mh", "mi", "mo", "sa", "sb", "sc", "dl", "sd",
"s2", "se", "sh", "si", "sx", "sr", "s1", "ss", "bp", "fm", "vc", "vg", "vi", "vm", "il",
]
def allow_class(_tag, name, value):
return name == "class" and value in ALLOWED_CSS
ALLOWED_ATTRIBUTES = {
"h1": ["id"],
"h2": ["id"],
"h3": ["id"],
"h4": ["id"],
"a": ["href", "title", "data-username"],
"img": ["src", "title", "alt"],
"code": allow_class,
"div": allow_class,
"span": allow_class,
"table": ["id"],
}
ALLOWED_PROTOCOLS = {"http", "https", "mailto"}
md = None
def linker_callback(attrs, new=False):
if new:
text = attrs.get("_text")
if not (text.startswith("http://") or text.startswith("https://")):
return None
return attrs
def render_markdown(source):
html = md.convert(source)
cleaner = Cleaner(
tags=ALLOWED_TAGS,
attributes=ALLOWED_ATTRIBUTES,
protocols=ALLOWED_PROTOCOLS,
filters=[partial(LinkifyFilter,
callbacks=[linker_callback] + bleach.linkifier.DEFAULT_CALLBACKS,
skip_tags={"pre", "code"})])
return cleaner.clean(html)
class DelInsExtension(Extension):
def extendMarkdown(self, md):
del_proc = SimpleTagInlineProcessor(r"(\~\~)(.+?)(\~\~)", "del")
md.inlinePatterns.register(del_proc, "del", 200)
ins_proc = SimpleTagInlineProcessor(r"(\+\+)(.+?)(\+\+)", "ins")
md.inlinePatterns.register(ins_proc, "ins", 200)
RE_PARTS = dict(
USER=r"[A-Za-z0-9._-]*\b",
REPO=r"[A-Za-z0-9_]+\b"
)
class MentionPattern(Pattern):
ANCESTOR_EXCLUDES = ("a",)
def __init__(self, config, md):
MENTION_RE = r"(@({USER})(?:\/({REPO}))?)".format(**RE_PARTS)
super(MentionPattern, self).__init__(MENTION_RE, md)
self.config = config
def handleMatch(self, m):
from app.models import User
label = m.group(2)
user = m.group(3)
package_name = m.group(4)
if package_name:
el = ElementTree.Element("a")
el.text = label
el.set("href", url_for("packages.view", author=user, name=package_name))
return el
else:
if User.query.filter_by(username=user).count() == 0:
return None
el = ElementTree.Element("a")
el.text = label
el.set("href", url_for("users.profile", username=user))
el.set("data-username", user)
return el
class MentionExtension(Extension):
def __init__(self, *args, **kwargs):
super(MentionExtension, self).__init__(*args, **kwargs)
def extendMarkdown(self, md):
md.ESCAPED_CHARS.append("@")
md.inlinePatterns.register(MentionPattern(self.getConfigs(), md), "mention", 20)
MARKDOWN_EXTENSIONS = ["fenced_code", "tables", CodeHiliteExtension(guess_lang=False), "toc", DelInsExtension(), MentionExtension()]
MARKDOWN_EXTENSION_CONFIG = {
"fenced_code": {},
"tables": {}
}
def init_markdown(app):
global md
md = Markdown(extensions=MARKDOWN_EXTENSIONS,
extension_configs=MARKDOWN_EXTENSION_CONFIG,
output_format="html")
@app.template_filter()
def markdown(source):
return markupsafe.Markup(render_markdown(source))
def get_headings(html: str):
soup = BeautifulSoup(html, "html.parser")
headings = soup.find_all(["h1", "h2", "h3"])
root = []
stack = []
for heading in headings:
this = {"link": heading.get("id") or "", "text": heading.text, "children": []}
this_level = int(heading.name[1:]) - 1
while this_level <= len(stack):
stack.pop()
if len(stack) > 0:
stack[-1]["children"].append(this)
else:
root.append(this)
stack.append(this)
return root
def get_user_mentions(html: str) -> set:
soup = BeautifulSoup(html, "html.parser")
links = soup.select("a[data-username]")
return set([x.get("data-username") for x in links])
def get_links(html: str, url: str) -> set:
soup = BeautifulSoup(html, "html.parser")
links = soup.select("a[href]")
return set([urljoin(url, x.get("href")) for x in links])

106
app/markdown/__init__.py Normal file
View File

@@ -0,0 +1,106 @@
# ContentDB
# Copyright (C) rubenwardy
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
from typing import Sequence
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from jinja2.utils import markupsafe
from markdown_it import MarkdownIt
from markdown_it.common.utils import unescapeAll, escapeHtml
from markdown_it.token import Token
from markdown_it.presets import gfm_like
from pygments import highlight
from pygments.lexers import get_lexer_by_name
from pygments.formatters.html import HtmlFormatter
from .cleaner import clean_html
from .mention import init_mention
def highlight_code(code, name, attrs):
if name == "":
return None
lexer = get_lexer_by_name(name)
formatter = HtmlFormatter()
return highlight(code, lexer, formatter)
def render_code(self, tokens: Sequence[Token], idx, options, env):
token = tokens[idx]
info = unescapeAll(token.info).strip() if token.info else ""
langName = info.split(maxsplit=1)[0] if info else ""
if options.highlight:
return options.highlight(
token.content, langName, ""
) or f"<pre><code>{escapeHtml(token.content)}</code></pre>"
return f"<pre><code>{escapeHtml(token.content)}</code></pre>"
gfm_like.make()
md = MarkdownIt("gfm-like", {"highlight": highlight_code})
md.add_render_rule("fence", render_code)
init_mention(md)
def render_markdown(source):
html = md.render(source)
return clean_html(html)
def init_markdown(app):
@app.template_filter()
def markdown(source):
return markupsafe.Markup(render_markdown(source))
def get_headings(html: str):
soup = BeautifulSoup(html, "html.parser")
headings = soup.find_all(["h1", "h2", "h3"])
root = []
stack = []
for heading in headings:
this = {"link": heading.get("id") or "", "text": heading.text, "children": []}
this_level = int(heading.name[1:]) - 1
while this_level <= len(stack):
stack.pop()
if len(stack) > 0:
stack[-1]["children"].append(this)
else:
root.append(this)
stack.append(this)
return root
def get_user_mentions(html: str) -> set:
soup = BeautifulSoup(html, "html.parser")
links = soup.select("a[data-username]")
return set([x.get("data-username") for x in links])
def get_links(html: str, url: str) -> set:
soup = BeautifulSoup(html, "html.parser")
links = soup.select("a[href]")
return set([urljoin(url, x.get("href")) for x in links])

78
app/markdown/cleaner.py Normal file
View File

@@ -0,0 +1,78 @@
# ContentDB
# Copyright (C) rubenwardy
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
from bleach import Cleaner
# Based on
# https://github.com/Wenzil/mdx_bleach/blob/master/mdx_bleach/whitelist.py
#
# License: MIT
ALLOWED_TAGS = {
"h1", "h2", "h3", "h4", "h5", "h6", "hr",
"ul", "ol", "li",
"p",
"br",
"pre",
"code",
"blockquote",
"strong",
"em",
"a",
"img",
"table", "thead", "tbody", "tr", "th", "td",
"div", "span", "del", "s",
"details",
"summary",
}
ALLOWED_CSS = [
"highlight", "codehilite",
"hll", "c", "err", "g", "k", "l", "n", "o", "x", "p", "ch", "cm", "cp", "cpf", "c1", "cs",
"gd", "ge", "gr", "gh", "gi", "go", "gp", "gs", "gu", "gt", "kc", "kd", "kn", "kp", "kr",
"kt", "ld", "m", "s", "na", "nb", "nc", "no", "nd", "ni", "ne", "nf", "nl", "nn", "nx",
"py", "nt", "nv", "ow", "w", "mb", "mf", "mh", "mi", "mo", "sa", "sb", "sc", "dl", "sd",
"s2", "se", "sh", "si", "sx", "sr", "s1", "ss", "bp", "fm", "vc", "vg", "vi", "vm", "il",
]
def allow_class(_tag, name, value):
return name == "class" and value in ALLOWED_CSS
ALLOWED_ATTRIBUTES = {
"h1": ["id"],
"h2": ["id"],
"h3": ["id"],
"h4": ["id"],
"a": ["href", "title", "data-username"],
"img": ["src", "title", "alt"],
"code": allow_class,
"div": allow_class,
"span": allow_class,
"table": ["id"],
}
ALLOWED_PROTOCOLS = {"http", "https", "mailto"}
def clean_html(html: str):
cleaner = Cleaner(
tags=ALLOWED_TAGS,
attributes=ALLOWED_ATTRIBUTES,
protocols=ALLOWED_PROTOCOLS)
return cleaner.clean(html)

109
app/markdown/mention.py Normal file
View File

@@ -0,0 +1,109 @@
# ContentDB
# Copyright (C) rubenwardy
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import re
from flask import url_for
from markdown_it import MarkdownIt
from markdown_it.token import Token
from markdown_it.rules_core.state_core import StateCore
from typing import Sequence, List
def render_user_mention(self, tokens: Sequence[Token], idx, options, env):
token = tokens[idx]
username = token.content
url = url_for("users.profile", username=username)
return f"<a href=\"{url}\" data-username=\"{username}\">@{username}</a>"
def render_package_mention(self, tokens: Sequence[Token], idx, options, env):
token = tokens[idx]
username = token.content
name = token.attrs["name"]
url = url_for("packages.view", author=username, name=name)
return f"<a href=\"{url}\">@{username}/{name}</a>"
def parse_mentions(state: StateCore):
for block_token in state.tokens:
if block_token.type != "inline" or block_token.children is None:
continue
link_depth = 0
html_link_depth = 0
children = []
for token in block_token.children:
if token.type == "link_open":
link_depth += 1
elif token.type == "link_close":
link_depth -= 1
elif token.type == "html_inline":
# is link open / close?
pass
if link_depth > 0 or html_link_depth > 0 or token.type != "text":
children.append(token)
else:
children.extend(split_tokens(token, state))
block_token.children = children
RE_PARTS = dict(
USER=r"[A-Za-z0-9._-]*\b",
NAME=r"[A-Za-z0-9_]+\b"
)
MENTION_RE = r"(@({USER})(?:\/({NAME}))?)".format(**RE_PARTS)
def split_tokens(token: Token, state: StateCore) -> List[Token]:
tokens = []
content = token.content
pos = 0
for match in re.finditer(MENTION_RE, content):
username = match.group(2)
package_name = match.group(3)
(start, end) = match.span(0)
if start > pos:
token_text = Token("text", "", 0)
token_text.content = content[pos:start]
token_text.level = token.level
tokens.append(token_text)
mention = Token("package_mention" if package_name else "user_mention", "", 0)
mention.content = username
mention.attrSet("name", package_name)
mention.level = token.level
tokens.append(mention)
pos = end
if pos < len(content):
token_text = Token("text", "", 0)
token_text.content = content[pos:]
token_text.level = token.level
tokens.append(token_text)
return tokens
def init_mention(md: MarkdownIt):
md.add_render_rule("user_mention", render_user_mention, "html")
md.add_render_rule("package_mention", render_package_mention, "html")
md.core.ruler.after("inline", "mention", parse_mentions)

View File

@@ -23,7 +23,7 @@ from flask_login import current_user
from markupsafe import Markup
from . import app, utils
from .markdown import get_headings
from app.markdown import get_headings
from .models import Permission, Package, PackageState, PackageRelease
from .utils import abs_url_for, url_set_query, url_set_anchor, url_current
from .utils.minetest_hypertext import normalize_whitespace as do_normalize_whitespace

View File

@@ -40,7 +40,8 @@ kombu==5.3.7
libsass==0.23.0
lxml==5.2.2
Mako==1.3.5
Markdown==3.6
markdown-it-py==3.0.0
linkify-it-py==2.0.3
MarkupSafe==2.1.5
packaging==24.0
passlib==1.7.4

View File

@@ -10,7 +10,8 @@ GitHub-Flask
SQLAlchemy-Searchable
bcrypt
markdown
markdown-it-py
linkify-it-py
bleach
passlib