]> git.lizzy.rs Git - cheatdb.git/blob - app/tasks/phpbbparser.py
Fix accidental regression in phpbbparser
[cheatdb.git] / app / tasks / phpbbparser.py
1 # Copyright (c) 2016  Andrew "rubenwardy" Ward
2 # License: MIT
3 # Source: https://github.com/rubenwardy/python_phpbb_parser
4
import os.path
import re
import socket
import time
import urllib
import urllib.request
from datetime import datetime
from urllib.parse import quote, urljoin

from bs4 import *
12
def urlEncodeNonAscii(b):
	"""Percent-encode the string *b* for safe inclusion in a URL.

	The previous implementation only matched U+0080..U+00FF and emitted a
	single '%xx' escape of the raw code point, which is wrong for characters
	outside Latin-1 (e.g. CJK) and left unsafe ASCII such as spaces and '&'
	unencoded.  urllib.parse.quote percent-encodes the UTF-8 bytes of every
	unsafe character, which is what phpBB expects in query parameters.
	"""
	return quote(b)
15
class Profile:
	"""A phpBB user profile: username, signature markup, and the
	key/value pairs scraped from the member's "details" list."""

	def __init__(self, username):
		self.username = username  # forum username this profile belongs to
		self.signature = ""       # signature element, filled in by getProfile()
		self.properties = {}      # lower-cased detail label -> text value

	def set(self, key, value):
		"""Store a profile property under *key*."""
		self.properties[key] = value

	def get(self, key):
		"""Return the property value for *key*, or None if absent."""
		# dict.get replaces the old two-lookup "in then []" pattern.
		return self.properties.get(key)

	def __str__(self):
		return self.username + "\n" + str(self.signature) + "\n" + str(self.properties)
30
def __extract_properties(profile, soup):
	"""Populate *profile* with the key/value pairs found in the
	viewprofile "details" list, mutating it via profile.set().

	Returns None early when the expected markup is not present.  The
	"groups" entry is deliberately skipped.
	"""
	container = soup.find(id="viewprofile")
	if container is None:
		return None

	details = container.find_all("dl", class_="left-box details")
	if len(details) != 1:
		return None

	# Walk the <dt>/<dd> pairs as a tiny state machine: a <dt> provides
	# the key for the <dd> that follows it.
	pending_key = None
	for child in details[0].children:
		tag = child.name
		if tag == "dt":
			if pending_key is not None:
				print("Unexpected dt!")
			else:
				# Drop the trailing ':' from the label, normalise case.
				pending_key = child.text.lower()[:-1].strip()
		elif tag == "dd":
			if pending_key is None:
				print("Unexpected dd!")
			else:
				if pending_key != "groups":
					profile.set(pending_key, child.text)
				pending_key = None
		elif child and tag is not None:
			print("Unexpected other")
60
def __extract_signature(soup):
	"""Return the page's single signature <div>, or None when there is
	not exactly one."""
	matches = soup.find_all("div", class_="signature")
	return matches[0] if len(matches) == 1 else None
67
def getProfile(url, username):
	"""Fetch and parse the phpBB profile page for *username*.

	*url* is the forum root; returns a Profile, or None when parsing
	produced no soup.
	"""
	page_url = url + "/memberlist.php?mode=viewprofile&un=" + urlEncodeNonAscii(username)

	html = urllib.request.urlopen(page_url).read().decode("utf-8")
	soup = BeautifulSoup(html, "lxml")
	if soup is None:
		return None

	profile = Profile(username)
	profile.signature = __extract_signature(soup)
	__extract_properties(profile, soup)
	return profile
81
82
83 regex_id = re.compile(r"^.*t=([0-9]+).*$")
84
def parseForumListPage(id, page, out, extra=None):
	"""Scrape one page of a forum's topic listing into *out*.

	id:    forum id (the "f=" query parameter).  (Parameter name kept for
	       backward compatibility, though it shadows the builtin.)
	page:  zero-based page index.
	out:   dict of topic id -> topic info dict; mutated in place.
	extra: optional dict of additional keys copied into every entry.

	Returns True when the caller should fetch the next page, False once a
	topic already in *out* is seen again (i.e. pagination has wrapped).
	"""
	num_per_page = 30
	start = page * num_per_page + 1
	print(" - Fetching page {} (topics {}-{})".format(page, start, start + num_per_page))

	url = "https://forum.minetest.net/viewforum.php?f=" + str(id) + "&start=" + str(start)
	html = urllib.request.urlopen(url).read().decode("utf-8")
	soup = BeautifulSoup(html, "html.parser")

	for row in soup.find_all("li", class_="row"):
		classes = row.get("class")
		# Pinned/announcement threads repeat on every page; skip them.
		if "sticky" in classes or "announce" in classes or "global-announce" in classes:
			continue

		topic = row.find("dl")

		# Link info.  The old code reassigned `id` here, clobbering the
		# forum-id parameter after the first topic; use a distinct name.
		link = topic.find(class_="topictitle")
		topic_id = regex_id.match(link.get("href")).group(1)
		title = link.find(text=True)

		# Date and author from the left-hand cell.
		# NOTE(review): the separator below looks mojibake'd ("ยป" is the
		# UTF-8 bytes of "»" misread as TIS-620) — confirm it matches the
		# actual markup before changing it; kept byte-identical here.
		left = topic.find("dt")
		date = left.get_text().split("ยป")[1].strip()
		date = datetime.strptime(date, "%a %b %d, %Y %H:%M")
		author = left.find_all("a")[-1].get_text().strip()

		# Post/view counts.
		posts = topic.find(class_="posts").find(text=True)
		views = topic.find(class_="views").find(text=True)

		if topic_id in out:
			print("   - got {} again, title: {}".format(topic_id, title))
			assert(title == out[topic_id]['title'])
			return False

		entry = {
			"id"    : topic_id,
			"title" : title,
			"author": author,
			"posts" : posts,
			"views" : views,
			"date"  : date
		}

		if extra is not None:
			entry.update(extra)

		out[topic_id] = entry

	return True
137
def getTopicsFromForum(id, out=None, extra=None):
	"""Fetch every topic from forum *id*, page by page.

	out:   optional dict to accumulate into; a fresh dict is created when
	       omitted.  (Previously this was a mutable default argument, so
	       topics silently leaked between separate calls.)
	extra: optional dict of extra keys merged into every topic entry.

	Returns the dict of topic id -> topic info.
	"""
	if out is None:
		out = {}
	print("Fetching all topics from forum {}".format(id))
	page = 0
	while parseForumListPage(id, page, out, extra):
		page += 1

	return out
145
def dumpTitlesToFile(topics, path):
	"""Write one topic title per line to *path*.

	Opens the file as UTF-8 explicitly: forum titles are routinely
	non-ASCII and the old platform-default encoding could raise
	UnicodeEncodeError (e.g. cp1252 on Windows).
	"""
	with open(path, "w", encoding="utf-8") as out_file:
		out_file.writelines(topic["title"] + "\n" for topic in topics.values())