Implement forum parser to increase accuracy

2018-07-04 00:14:37 +01:00
parent eb6b1d6375
commit 19e1ed8b32
11 changed files with 226 additions and 86 deletions
--- a/app/tasks/phpbbparser.py
+++ b/app/tasks/phpbbparser.py
@@ -5,6 +5,7 @@
 import urllib, socket
 from bs4 import *
 from urllib.parse import urljoin
+from datetime import datetime
 import urllib.request
 import os.path
 import time, re
@@ -77,3 +78,72 @@ def getProfile(url, username):
 		__extract_properties(profile, soup)

 		return profile
+
+
+regex_id = re.compile(r"^.*t=([0-9]+).*$")
+
+def parseForumListPage(id, page, out, extra=None):
+	num_per_page = 30
+	start = page*num_per_page+1
+	print(" - Fetching page {} (topics {}-{})".format(page, start, start+num_per_page))
+
+	url = "https://forum.minetest.net/viewforum.php?f=" + str(id) + "&start=" + str(start)
+	r = urllib.request.urlopen(url).read().decode("utf-8")
+	soup = BeautifulSoup(r, "html.parser")
+
+	for row in soup.find_all("li", class_="row"):
+		classes = row.get("class")
+		if "sticky" in classes or "announce" in classes or "global-announce" in classes:
+			continue
+
+		topic = row.find("dl")
+
+		# Link info
+		link   = topic.find(class_="topictitle")
+		id	   = regex_id.match(link.get("href")).group(1)
+		title  = link.find(text=True)
+
+		# Date
+		left   = topic.find("dt")
+		date   = left.get_text().split("»")[1].strip()
+		date   = datetime.strptime(date, "%a %b %d, %Y %H:%M")
+		author = left.find_all("a")[-1].get_text().strip()
+
+		# Get counts
+		posts  = topic.find(class_="posts").find(text=True)
+		views  = topic.find(class_="views").find(text=True)
+
+		if id in out:
+			print("   - got {} again, title: {}".format(id, title))
+			assert(title == out[id]['title'])
+			return False
+
+		row = {
+			"id"    : id,
+			"title" : title,
+			"author": author,
+			"posts" : posts,
+			"views" : views,
+			"date"  : date
+		}
+
+		if extra is not None:
+			for key, value in extra.items():
+				row[key] = value
+
+		out[id] = row
+
+	return True
+
+def getTopicsFromForum(id, out={}, extra=None):
+	print("Fetching all topics from forum {}".format(id))
+	page = 0
+	while parseForumListPage(id, page, out, extra):
+		page = page + 1
+
+	return out
+
+def dumpTitlesToFile(topics, path):
+	with open(path, "w") as out_file:
+		for topic in topics.values():
+			out_file.write(topic["title"] + "\n")