]> git.lizzy.rs Git - cheatdb.git/blob - app/tasks/phpbbparser.py
Fix accidental regression in phpbbparser
[cheatdb.git] / app / tasks / phpbbparser.py
1 # Copyright (c) 2016  Andrew "rubenwardy" Ward
2 # License: MIT
3 # Source: https://github.com/rubenwardy/python_phpbb_parser
4
import os.path
import re
import socket
import time
import urllib
import urllib.request
from datetime import datetime
from urllib.parse import quote, urljoin

from bs4 import *
12
def urlEncodeNonAscii(b):
	"""Percent-encode the string *b* for safe inclusion in a URL.

	The previous implementation only matched U+0080..U+00FF and emitted a
	single '%xx' escape of the raw code point, which is wrong for characters
	outside Latin-1 (e.g. CJK) and left unsafe ASCII such as spaces and '&'
	unencoded.  urllib.parse.quote percent-encodes the UTF-8 bytes of every
	unsafe character, which is what phpBB expects in query parameters.
	"""
	return quote(b)
15
class Profile:
	"""A phpBB user profile: username, signature markup, and the
	key/value pairs scraped from the member's "details" list."""

	def __init__(self, username):
		self.username = username  # forum username this profile belongs to
		self.signature = ""       # signature element, filled in by getProfile()
		self.properties = {}      # lower-cased detail label -> text value

	def set(self, key, value):
		"""Store a profile property under *key*."""
		self.properties[key] = value

	def get(self, key):
		"""Return the property value for *key*, or None if absent."""
		# dict.get replaces the old two-lookup "in then []" pattern.
		return self.properties.get(key)

	def __str__(self):
		return self.username + "\n" + str(self.signature) + "\n" + str(self.properties)
30
def __extract_properties(profile, soup):
	"""Populate *profile* with the key/value pairs found in the
	viewprofile "details" list, mutating it via profile.set().

	Returns None early when the expected markup is not present.  The
	"groups" entry is deliberately skipped.
	"""
	container = soup.find(id="viewprofile")
	if container is None:
		return None

	details = container.find_all("dl", class_="left-box details")
	if len(details) != 1:
		return None

	# Walk the <dt>/<dd> pairs as a tiny state machine: a <dt> provides
	# the key for the <dd> that follows it.
	pending_key = None
	for child in details[0].children:
		tag = child.name
		if tag == "dt":
			if pending_key is not None:
				print("Unexpected dt!")
			else:
				# Drop the trailing ':' from the label, normalise case.
				pending_key = child.text.lower()[:-1].strip()
		elif tag == "dd":
			if pending_key is None:
				print("Unexpected dd!")
			else:
				if pending_key != "groups":
					profile.set(pending_key, child.text)
				pending_key = None
		elif child and tag is not None:
			print("Unexpected other")
60
def __extract_signature(soup):
	"""Return the page's single signature <div>, or None when there is
	not exactly one."""
	matches = soup.find_all("div", class_="signature")
	return matches[0] if len(matches) == 1 else None
67
def getProfile(url, username):
	"""Fetch and parse the phpBB profile page for *username*.

	*url* is the forum root; returns a Profile, or None when parsing
	produced no soup.
	"""
	page_url = url + "/memberlist.php?mode=viewprofile&un=" + urlEncodeNonAscii(username)

	html = urllib.request.urlopen(page_url).read().decode("utf-8")
	soup = BeautifulSoup(html, "lxml")
	if soup is None:
		return None

	profile = Profile(username)
	profile.signature = __extract_signature(soup)
	__extract_properties(profile, soup)
	return profile
81
82
83 regex_id = re.compile(r"^.*t=([0-9]+).*$")
84
def parseForumListPage(id, page, out, extra=None):
	"""Scrape one page of a forum's topic listing into *out*.

	id:    forum id (the "f=" query parameter).  (Parameter name kept for
	       backward compatibility, though it shadows the builtin.)
	page:  zero-based page index.
	out:   dict of topic id -> topic info dict; mutated in place.
	extra: optional dict of additional keys copied into every entry.

	Returns True when the caller should fetch the next page, False once a
	topic already in *out* is seen again (i.e. pagination has wrapped).
	"""
	num_per_page = 30
	start = page * num_per_page + 1
	print(" - Fetching page {} (topics {}-{})".format(page, start, start + num_per_page))

	url = "https://forum.minetest.net/viewforum.php?f=" + str(id) + "&start=" + str(start)
	html = urllib.request.urlopen(url).read().decode("utf-8")
	soup = BeautifulSoup(html, "html.parser")

	for row in soup.find_all("li", class_="row"):
		classes = row.get("class")
		# Pinned/announcement threads repeat on every page; skip them.
		if "sticky" in classes or "announce" in classes or "global-announce" in classes:
			continue

		topic = row.find("dl")

		# Link info.  The old code reassigned `id` here, clobbering the
		# forum-id parameter after the first topic; use a distinct name.
		link = topic.find(class_="topictitle")
		topic_id = regex_id.match(link.get("href")).group(1)
		title = link.find(text=True)

		# Date and author from the left-hand cell.
		# NOTE(review): the separator below looks mojibake'd ("ยป" is the
		# UTF-8 bytes of "»" misread as TIS-620) — confirm it matches the
		# actual markup before changing it; kept byte-identical here.
		left = topic.find("dt")
		date = left.get_text().split("ยป")[1].strip()
		date = datetime.strptime(date, "%a %b %d, %Y %H:%M")
		author = left.find_all("a")[-1].get_text().strip()

		# Post/view counts.
		posts = topic.find(class_="posts").find(text=True)
		views = topic.find(class_="views").find(text=True)

		if topic_id in out:
			print("   - got {} again, title: {}".format(topic_id, title))
			assert(title == out[topic_id]['title'])
			return False

		entry = {
			"id"    : topic_id,
			"title" : title,
			"author": author,
			"posts" : posts,
			"views" : views,
			"date"  : date
		}

		if extra is not None:
			entry.update(extra)

		out[topic_id] = entry

	return True
137
def getTopicsFromForum(id, out=None, extra=None):
	"""Fetch every topic from forum *id*, page by page.

	out:   optional dict to accumulate into; a fresh dict is created when
	       omitted.  (Previously this was a mutable default argument, so
	       topics silently leaked between separate calls.)
	extra: optional dict of extra keys merged into every topic entry.

	Returns the dict of topic id -> topic info.
	"""
	if out is None:
		out = {}
	print("Fetching all topics from forum {}".format(id))
	page = 0
	while parseForumListPage(id, page, out, extra):
		page += 1

	return out
145
def dumpTitlesToFile(topics, path):
	"""Write one topic title per line to *path*.

	Opens the file as UTF-8 explicitly: forum titles are routinely
	non-ASCII and the old platform-default encoding could raise
	UnicodeEncodeError (e.g. cp1252 on Windows).
	"""
	with open(path, "w", encoding="utf-8") as out_file:
		out_file.writelines(topic["title"] + "\n" for topic in topics.values())