1 # Copyright (c) 2016 Andrew "rubenwardy" Ward
3 # Source: https://github.com/rubenwardy/python_phpbb_parser
7 from urllib.parse import urljoin
8 from datetime import datetime
def urlEncodeNonAscii(b):
    """Percent-encode every non-ASCII character of ``b`` as its UTF-8 bytes.

    ASCII characters are returned untouched (including URL-reserved ones such
    as spaces or ``&``), so the result is only safe with respect to non-ASCII
    content.

    Args:
        b: Text (``str``) that is about to be embedded in a URL.

    Returns:
        ``b`` with each run of non-ASCII characters replaced by lowercase
        ``%xx`` escapes of its UTF-8 encoding.
    """
    def _escape(match):
        # One "%xx" per UTF-8 byte.  Escaping the code point itself would
        # yield a Latin-1 escape for U+0080-U+00FF and leave everything above
        # U+00FF unescaped, which urllib then refuses to send.
        return "".join("%%%02x" % byte for byte in match.group(0).encode("utf-8"))

    return re.sub(r"[^\x00-\x7f]+", _escape, b)
def __init__(self, username):
    """Create an empty profile for ``username``.

    Args:
        username: Forum user name this profile describes.
    """
    self.username = username
    # The other methods of this class read ``self.properties`` and
    # ``self.signature``, so both must exist on a freshly built instance.
    # NOTE(review): the empty-string default for the signature is an
    # assumption (it is only ever passed through ``str()`` here) - confirm.
    self.signature = ""
    self.properties = {}
def set(self, key, value):
    """Record ``value`` under ``key`` in this profile's property mapping.

    A value previously stored under the same key is replaced.
    """
    self.properties.update({key: value})
26 return self.properties[key] if key in self.properties else None
29 return self.username + "\n" + str(self.signature) + "\n" + str(self.properties)
def __extract_properties(profile, soup):
    """Copy the label/value pairs of a profile page's details list into ``profile``.

    The details are looked up as the ``dl`` element with class
    ``left-box details`` inside the element whose id is ``viewprofile``.
    Its ``dt`` children provide the keys and the following ``dd`` children the
    values.  Keys are lower-cased, lose their last character (presumably the
    trailing colon) and are stripped.  The ``groups`` entry is skipped.

    Args:
        profile: Object exposing ``set(key, value)``; receives every pair.
        soup: Parsed page exposing the BeautifulSoup ``find`` / ``find_all``
            API.

    Returns:
        Always ``None``; results are delivered through ``profile.set``.
    """
    container = soup.find(id="viewprofile")
    if container is None:
        # Not a profile page, so there is nothing to extract.
        return None

    detail_lists = container.find_all("dl", class_="left-box details")
    if len(detail_lists) != 1:
        # Zero or several candidates: do not guess which list is the right one.
        return None

    # Key announced by the most recent dt that has not met its dd yet.
    pending_key = None
    for element in detail_lists[0].children:
        if element.name == "dt":
            if pending_key is None:
                pending_key = element.text.lower()[:-1].strip()
            else:
                print("Unexpected dt!")
        elif element.name == "dd":
            if pending_key is None:
                print("Unexpected dd!")
            else:
                if pending_key != "groups":
                    profile.set(pending_key, element.text)
                # The value has been consumed; the next dt starts a new pair.
                pending_key = None
        elif element and element.name is not None:
            print("Unexpected other")
    return None
def __extract_signature(soup):
    """Return the signature element of a parsed profile page.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find_all`` API.

    Returns:
        The single ``div`` with class ``signature``, or ``None`` when the page
        does not contain exactly one.
    """
    matches = soup.find_all("div", class_="signature")
    # NOTE(review): the "exactly one match" rule is reconstructed - the
    # reviewed listing showed only the lookup and no return statement.  Confirm
    # that several signature blocks should count as "no signature".
    if len(matches) != 1:
        return None
    return matches[0]
def getProfile(url, username):
    """Download and parse the phpBB profile page of ``username``.

    Args:
        url: Base URL of the forum; ``/memberlist.php?...`` is appended to it
            verbatim, so it should not end with a slash.
        username: Forum user name to look up.

    Returns:
        A ``Profile`` whose signature and properties were filled from the page.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # NOTE(review): urlEncodeNonAscii leaves ASCII characters untouched, so a
    # user name containing spaces or "&" still reaches the URL unescaped.
    page_url = url + "/memberlist.php?mode=viewprofile&un=" + urlEncodeNonAscii(username)

    # Close the HTTP response deterministically instead of leaving the socket
    # open until the garbage collector gets to it.
    with urllib.request.urlopen(page_url) as response:
        contents = response.read().decode("utf-8")
    soup = BeautifulSoup(contents, "lxml")

    profile = Profile(username)
    profile.signature = __extract_signature(soup)
    __extract_properties(profile, soup)
    return profile
# Captures the numeric topic id from the ``t=`` query parameter of a topic
# link.  The word boundary stops the greedy ``.*`` from latching onto the tail
# of a longer parameter name: without it "...&t=1234&start=25" yields "25".
regex_id = re.compile(r"^.*\bt=([0-9]+).*$")
def parseForumListPage(id, page, out, extra=None):
    """Fetch one page of a forum's topic list and add its topics to ``out``.

    Rows carrying the ``sticky``, ``announce`` or ``global-announce`` class are
    ignored.  Every other row becomes one record in ``out``, keyed by the topic
    id (a string of digits taken from the topic link).

    NOTE(review): several statements of this function were absent from the
    reviewed listing (the page size, the skip of sticky rows, the duplicate
    check, the record layout and the return values).  They are reconstructed
    from how the visible lines and ``getTopicsFromForum`` use them - confirm.

    Args:
        id: Forum id, placed in the ``f=`` query parameter.
        page: Page index; multiplied by the page size to get the ``start``
            offset.
        out: Dict the topic records are added to (modified in place).
        extra: Optional mapping whose items are copied into every new record.

    Returns:
        ``False`` as soon as a topic already present in ``out`` is met (used
        as the end-of-list signal), ``True`` when the whole page was new.

    Raises:
        AssertionError: If a topic seen before now carries a different title.
        urllib.error.URLError: If the page cannot be fetched.
    """
    # NOTE(review): page size assumed to be 30 - verify against the forum.
    num_per_page = 30
    start = page * num_per_page + 1
    print(" - Fetching page {} (topics {}-{})".format(page, start, start + num_per_page))

    url = "https://forum.minetest.net/viewforum.php?f=" + str(id) + "&start=" + str(start)
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(url) as response:
        html = response.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")

    pinned_classes = {"sticky", "announce", "global-announce"}
    for row in soup.find_all("li", class_="row"):
        if pinned_classes.intersection(row.get("class")):
            continue

        topic = row.find("dl")

        # Link info.  A separate name keeps the forum id parameter intact.
        link = topic.find(class_="topictitle")
        topic_id = regex_id.match(link.get("href")).group(1)
        title = link.find(text=True)

        # Author and date share one cell, separated by U+00BB.  The separator
        # literal appears to have been corrupted by a bad re-encoding (it read
        # as two Thai letters); the escape keeps it immune to that.
        left = topic.find("dt")
        date_text = left.get_text().split("\u00bb")[1].strip()
        date = datetime.strptime(date_text, "%a %b %d, %Y %H:%M")
        author = left.find_all("a")[-1].get_text().strip()

        # Counters are kept as the raw page text.
        posts = topic.find(class_="posts").find(text=True)
        views = topic.find(class_="views").find(text=True)

        if topic_id in out:
            print(" - got {} again, title: {}".format(topic_id, title))
            assert(title == out[topic_id]['title'])
            return False

        # NOTE(review): apart from "title" the key names are assumed to
        # mirror the local variable names - confirm against consumers.
        record = {
            "id": topic_id,
            "title": title,
            "author": author,
            "posts": posts,
            "views": views,
            "date": date,
        }
        if extra is not None:
            record.update(extra.items())
        out[topic_id] = record

    return True
def getTopicsFromForum(id, out=None, extra=None):
    """Collect the topics of a forum by walking its list pages in order.

    Args:
        id: Forum id handed to ``parseForumListPage``.
        out: Dict to add the topics to.  A fresh dict is created when omitted;
            a ``{}`` default would be one shared object, so topics from
            earlier calls would leak into later ones.
        extra: Optional mapping forwarded to ``parseForumListPage``.

    Returns:
        The dict holding the collected topics (``out`` itself when given).
    """
    if out is None:
        out = {}

    print("Fetching all topics from forum {}".format(id))
    # NOTE(review): the page counter and the return were absent from the
    # reviewed listing; starting at page 0 is an assumption - confirm.
    page = 0
    while parseForumListPage(id, page, out, extra):
        page += 1

    return out
def dumpTitlesToFile(topics, path):
    """Write the title of every topic to ``path``, one title per line.

    Args:
        topics: Mapping whose values are dicts with a ``"title"`` string.
        path: File to create or overwrite.
    """
    # Titles may contain arbitrary Unicode.  Without an explicit encoding the
    # platform default is used, which can raise UnicodeEncodeError.
    with open(path, "w", encoding="utf-8") as out_file:
        out_file.writelines(topic["title"] + "\n" for topic in topics.values())