From 2a0545210b081bf4d8d40a39a1c40a76e4caea57 Mon Sep 17 00:00:00 2001 From: rubenwardy Date: Wed, 19 Apr 2023 19:47:11 +0100 Subject: [PATCH] hypertext: Add support for nested lists --- app/tests/unit/test_minetest_hypertext.py | 9 +- app/utils/minetest_hypertext.py | 109 ++++++++++++---------- 2 files changed, 66 insertions(+), 52 deletions(-) diff --git a/app/tests/unit/test_minetest_hypertext.py b/app/tests/unit/test_minetest_hypertext.py index ce2b832a..963ff589 100644 --- a/app/tests/unit/test_minetest_hypertext.py +++ b/app/tests/unit/test_minetest_hypertext.py @@ -75,12 +75,17 @@ def test_bullets(): html = """ """ - expected = "• One\n• two three\n• four\n" + expected = "• One\n" \ + "• two three\n" \ + "• sub one\n" \ + "• sub two\n\n" \ + "• four\n" + result = html_to_minetest(html) assert result["body"].strip() == expected.strip() diff --git a/app/utils/minetest_hypertext.py b/app/utils/minetest_hypertext.py index 25068cdf..ca2db02e 100644 --- a/app/utils/minetest_hypertext.py +++ b/app/utils/minetest_hypertext.py @@ -24,52 +24,59 @@ def get_attributes(attrs): return retval +def make_indent(w): + return f"" + + class MinetestHTMLParser(HTMLParser): def __init__(self, include_images): super().__init__() self.include_images = include_images - self.text_buffer = "" - self.has_line_started = False + self.completed_text = "" + self.current_line = "" self.links = {} self.images = {} self.image_tooltips = {} self.is_preserving = False self.remove_until = None + self.indent_level = 0 + + def finish_line(self): + self.completed_text += self.current_line.rstrip() + "\n" + self.current_line = "" def handle_starttag(self, tag, attrs): if self.is_preserving or self.remove_until: return - print("OPEN", tag, file=sys.stderr) - - self.has_line_started = True if tag == "p": - self.has_line_started = False + pass elif tag == "pre": - self.text_buffer += "" + self.current_line += "" self.is_preserving = True - self.has_line_started = False elif tag == "table": # Tables are currently unsupported and removed self.remove_until = "table" - self.text_buffer += "(table removed)\n" + self.current_line += "(table removed)" + self.finish_line() elif tag == "br": - self.text_buffer += "\n" - self.has_line_started = False + self.finish_line() elif tag == "h1" or tag == "h2": - self.text_buffer += "\n" + self.finish_line() + self.current_line += "" elif tag == "h3" or tag == "h4" or tag == "h5": - self.text_buffer += "\n" + self.finish_line() + self.current_line += "" elif tag == "a": for attr in attrs: if attr[0] == "href": name = f"link_{len(self.links)}" self.links[name] = attr[1] - self.text_buffer += f"" + self.current_line += f"" break else: - self.text_buffer += "" + self.current_line += "" elif tag == "img": attr_by_value = get_attributes(attrs) if "src" in attr_by_value and self.include_images: @@ -77,23 +84,29 @@ class MinetestHTMLParser(HTMLParser): self.images[name] = attr_by_value["src"] width = attr_by_value.get("width", 128) height = attr_by_value.get("height", 128) - self.text_buffer += f"" + self.current_line += f"" if "alt" in attr_by_value: self.image_tooltips[name] = attr_by_value["alt"] elif tag == "b" or tag == "strong": - self.text_buffer += "" + self.current_line += "" elif tag == "i" or tag == "em": - self.text_buffer += "" + self.current_line += "" elif tag == "u": - self.text_buffer += "" + self.current_line += "" elif tag == "li": - self.has_line_started = False - self.text_buffer += "• " + if self.current_line.strip() != "": + self.finish_line() + else: + self.current_line = "" + + self.current_line += make_indent(self.indent_level) + "• " elif tag == "code": - self.text_buffer += "" - elif tag == "span" or tag == "ul": + self.current_line += "" + elif tag == "span": pass + elif tag == "ul": + self.indent_level += 1 else: print("UNKNOWN TAG ", tag, attrs, file=sys.stderr) @@ -103,52 +116,46 @@ class MinetestHTMLParser(HTMLParser): self.remove_until = None return - print("CLOSE", tag, file=sys.stderr) - if tag == "pre": - self.text_buffer = self.text_buffer.rstrip() - self.text_buffer += "\n" + self.current_line = self.current_line.rstrip() + "" + self.finish_line() self.is_preserving = False - self.has_line_started = False elif self.is_preserving: return elif tag == "p": - self.text_buffer = self.text_buffer.rstrip() - self.text_buffer += "\n" - self.has_line_started = False + self.current_line = self.current_line.rstrip() + self.finish_line() elif tag == "h1" or tag == "h2": - self.text_buffer += "\n" - self.has_line_started = False + self.current_line += "" + self.finish_line() elif tag == "h3" or tag == "h4" or tag == "h5": - self.text_buffer += "\n" - self.has_line_started = False + self.current_line += "" + self.finish_line() elif tag == "a": - self.text_buffer += "" + self.current_line += "" elif tag == "code": - self.text_buffer += "" + self.current_line += "" elif tag == "b" or tag == "strong": - self.text_buffer += "" + self.current_line += "" elif tag == "i" or tag == "em": - self.text_buffer += "" + self.current_line += "" elif tag == "u": - self.text_buffer += "" + self.current_line += "" elif tag == "li": - self.text_buffer += "\n" - # else: - # print("END", tag, file=sys.stderr) + self.finish_line() + elif tag == "ul": + self.indent_level = max(self.indent_level - 1, 0) def handle_data(self, data): - print(f"DATA \"{data}\"", file=sys.stderr) if self.remove_until: return if not self.is_preserving: data = normalize_whitespace(data) - if not self.has_line_started: + if self.current_line.strip() == "": data = data.lstrip() - self.text_buffer += data - self.has_line_started = True + self.current_line += data def handle_entityref(self, name): to_value = { @@ -160,17 +167,19 @@ class MinetestHTMLParser(HTMLParser): } if name in to_value: - self.text_buffer += to_value[name] + self.current_line += to_value[name] else: - self.text_buffer += f"&{name};" + self.current_line += f"&{name};" def html_to_minetest(html, formspec_version=6, include_images=True): parser = MinetestHTMLParser(include_images) parser.feed(html) + parser.finish_line() + return { "head": HEAD, - "body": parser.text_buffer.strip() + "\n\n", + "body": parser.completed_text.strip() + "\n", "links": parser.links, "images": parser.images, "image_tooltips": parser.image_tooltips,