]> git.lizzy.rs Git - cheatdb.git/blob - app/tasks/importtasks.py
Implement forum parser to increase accuracy
[cheatdb.git] / app / tasks / importtasks.py
1 # Content DB
2 # Copyright (C) 2018  rubenwardy
3 #
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 # GNU General Public License for more details.
13 #
14 # You should have received a copy of the GNU General Public License
15 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
16
17
18 import flask, json, os, git, tempfile, shutil
19 from git import GitCommandError
20 from flask.ext.sqlalchemy import SQLAlchemy
21 from urllib.error import HTTPError
22 import urllib.request
23 from urllib.parse import urlparse, quote_plus, urlsplit
24 from app import app
25 from app.models import *
26 from app.tasks import celery, TaskError
27 from app.utils import randomString
28
29
class GithubURLMaker:
    """Builds GitHub URLs (raw files, API, archives) for one repository.

    Constructed from a parsed URL: any object with a ``path`` attribute, such
    as the result of ``urllib.parse.urlparse``. The path must look like
    ``/<user>/<repo>``, optionally with a ``.git`` suffix and/or a trailing
    slash. Call ``isValid()`` before using any of the URL getters; on an
    invalid maker the getters have nothing to build from.
    """

    def __init__(self, url):
        # Defaults first, so that isValid() works (instead of raising
        # AttributeError) when the path below does not match.
        self.baseUrl = None
        self.user = None
        self.repo = None

        # Rewrite path
        import re
        m = re.search(r"^/([^/]+)/([^/]+)/?$", url.path)
        if m is None:
            return

        user = m.group(1)
        repo = m.group(2)
        # Strip only a trailing ".git"; a blanket replace() would also mangle
        # names such as "my.github.io".
        if repo.endswith(".git"):
            repo = repo[:-len(".git")]

        self.baseUrl = "https://raw.githubusercontent.com/{}/{}/master" \
            .format(user, repo)
        self.user = user
        self.repo = repo

    def isValid(self):
        """Return True when the constructor recognised the URL path."""
        return self.baseUrl is not None

    def getRepoURL(self):
        """Return the canonical https://github.com/<user>/<repo> URL."""
        return "https://github.com/{}/{}".format(self.user, self.repo)

    def getScreenshotURL(self):
        """Return the raw URL of screenshot.png on the master branch."""
        return self.baseUrl + "/screenshot.png"

    def getModConfURL(self):
        """Return the raw URL of mod.conf on the master branch."""
        return self.baseUrl + "/mod.conf"

    def getDependsURL(self):
        """Return the raw URL of depends.txt on the master branch."""
        return self.baseUrl + "/depends.txt"

    def getCommitsURL(self, branch):
        """Return the GitHub API URL listing commits reachable from `branch`."""
        return "https://api.github.com/repos/{}/{}/commits?sha={}" \
            .format(self.user, self.repo, urllib.parse.quote_plus(branch))

    def getCommitDownload(self, commit):
        """Return the zip archive download URL for the given commit."""
        return "https://github.com/{}/{}/archive/{}.zip" \
            .format(self.user, self.repo, commit)
61
# Module-level cache for getKrockList(); both are filled on the first call.
krock_list_cache = None
krock_list_cache_by_name = None


def getKrockList():
    """Return the forum mod list as ``(entries, entries_by_name)``.

    On the first call the list is downloaded as JSON and filtered. An entry
    is kept only if it has "title", "author", "topicId" and a non-empty
    "link", and its title contains a ``[name]`` tag made of letters, digits
    and underscores. Each kept entry is a new dict with the keys "title",
    "author", "name", "topicId" and "link".

    ``entries_by_name`` maps each name to the list of entries carrying it.
    Later calls return the cached objects without touching the network.
    """
    global krock_list_cache
    global krock_list_cache_by_name

    if krock_list_cache is None:
        import re
        name_pattern = re.compile(r"\[([A-Za-z0-9_]+)\]")

        contents = urllib.request.urlopen("http://krock-works.16mb.com/MTstuff/modList.php").read().decode("utf-8")
        raw_entries = json.loads(contents)

        entries = []
        by_name = {}
        for raw in raw_entries:
            if not ("title" in raw and "author" in raw and
                    "topicId" in raw and "link" in raw and raw["link"] != ""):
                continue

            m = name_pattern.search(raw["title"])
            if m is None:
                continue

            entry = {
                "title": raw["title"],
                "author": raw["author"],
                "name": m.group(1),
                "topicId": raw["topicId"],
                "link": raw["link"],
            }
            entries.append(entry)
            by_name.setdefault(entry["name"], []).append(entry)

        # Publish the lookup first: krock_list_cache is the "already loaded"
        # flag, so it must never be set while the by-name map is still None.
        krock_list_cache_by_name = by_name
        krock_list_cache = entries

    return krock_list_cache, krock_list_cache_by_name
103
def findModInfo(author, name, link):
    """Find the forum list entry that best matches a package.

    Lookup order:
      1. By `name`: if exactly one entry has that name it is returned
         regardless of author; otherwise the first entry whose "author"
         equals `author` is returned.
      2. By `link`: only when `link` is longer than 15 characters, the first
         entry whose "link" contains `link` as a substring.

    Returns the matching entry dict, or None when nothing matches.
    """
    entries, by_name = getKrockList()

    if name is not None and name in by_name:
        same_name = by_name[name]
        if len(same_name) == 1:
            return same_name[0]

        by_author = next((e for e in same_name if e["author"] == author), None)
        if by_author is not None:
            return by_author

    # Very short links are skipped: they would match almost anything.
    if link is not None and len(link) > 15:
        return next((e for e in entries if link in e["link"]), None)

    return None
121
122
def parseConf(string):
    """Parse ``key = value`` lines into a dict.

    Each line is split at its first "="; key and value are stripped of
    surrounding whitespace. Lines without "=" or with "=" as their very
    first character are ignored. Later duplicates overwrite earlier ones.
    """
    parsed = {}
    for raw_line in string.split("\n"):
        key, sep, value = raw_line.partition("=")
        # Require a separator and at least one character before it
        # (checked before stripping, so a blank key is still accepted).
        if sep and key:
            parsed[key.strip()] = value.strip()

    return parsed
133
134
class PackageTreeNode:
    """One package directory on disk, plus the packages nested inside it.

    Constructing a node scans `baseDir`, detects whether it is a game, a mod
    or a modpack, reads its metadata files and recursively builds child nodes
    for the mods of a game or modpack.

    Attributes:
        baseDir, author, repo: values passed to the constructor.
        name: constructor argument, replaced by mod.conf's "name" if present.
        type: a PackageType member, or None for an empty directory.
        meta: dict of metadata, or None for an empty directory.
        children: list of PackageTreeNode.
    """

    def __init__(self, baseDir, author=None, repo=None, name=None):
        """Scan `baseDir`.

        Raises TaskError when the directory is non-empty but matches none of
        the known package layouts.
        """
        print("Scanning " + baseDir)
        self.baseDir = baseDir
        self.author = author
        self.name = name
        self.repo = repo
        self.meta = None
        self.children = []
        # Defined up front so the attribute exists even when we bail out
        # early on an empty directory.
        self.type = None

        # Detect type
        is_modpack = False
        if os.path.isfile(baseDir + "/game.conf"):
            detected = PackageType.GAME
        elif os.path.isfile(baseDir + "/init.lua"):
            detected = PackageType.MOD
        elif os.path.isfile(baseDir + "/modpack.txt"):
            detected = PackageType.MOD
            is_modpack = True
        elif os.path.isdir(baseDir + "/mods"):
            detected = PackageType.GAME
        elif os.listdir(baseDir) == []:
            # probably a submodule
            return
        else:
            raise TaskError("Unable to detect package type!")

        self.type = detected
        self.readMetaFiles()

        if self.type == PackageType.GAME:
            self.addChildrenFromModDir(baseDir + "/mods")
        elif is_modpack:
            self.addChildrenFromModDir(baseDir)

    @staticmethod
    def _splitDependsList(value):
        """Split a comma separated dependency string, dropping empty items."""
        stripped = (item.strip() for item in value.split(","))
        return [item for item in stripped if item]

    def _readModConf(self):
        """Return the recognised keys of mod.conf, or {} if it is missing."""
        found = {}
        try:
            with open(self.baseDir + "/mod.conf", "r") as myfile:
                conf = parseConf(myfile.read())
        except IOError:
            print("mod.conf does not exist!")
            return found

        for key in ["name", "description", "title", "depends", "optional_depends"]:
            if key in conf:
                found[key] = conf[key]
        return found

    def _readDependsTxt(self):
        """Return depends/optional_depends lists from depends.txt, or {}."""
        import re
        pattern = re.compile(r"^([a-z0-9_]+)\??$")
        try:
            with open(self.baseDir + "/depends.txt", "r") as myfile:
                contents = myfile.read()
        except IOError:
            print("depends.txt does not exist!")
            return {}

        soft = []
        hard = []
        for line in contents.split("\n"):
            line = line.strip()
            if not pattern.match(line):
                continue
            # A trailing "?" marks an optional dependency.
            if line.endswith("?"):
                soft.append(line[:-1])
            else:
                hard.append(line)

        return {"depends": hard, "optional_depends": soft}

    def readMetaFiles(self):
        """Populate ``self.meta`` (and possibly ``self.name``) from disk.

        Sources, in order: mod.conf, then description.txt (only if mod.conf
        gave no description), then depends.txt (only if mod.conf gave neither
        dependency key). A title and a short description are derived when
        possible, and the forum topic id is looked up with findModInfo().
        """
        result = self._readModConf()

        # description.txt
        if "description" not in result:
            try:
                with open(self.baseDir + "/description.txt", "r") as myfile:
                    result["description"] = myfile.read()
            except IOError:
                print("description.txt does not exist!")

        # depends.txt is only consulted when mod.conf declared nothing.
        if "depends" not in result and "optional_depends" not in result:
            result.update(self._readDependsTxt())
        else:
            for key in ("depends", "optional_depends"):
                if key in result:
                    result[key] = self._splitDependsList(result[key])

        # Calculate Title
        if "name" in result and "title" not in result:
            result["title"] = result["name"].replace("_", " ").title()

        # Calculate short description: the first sentence, unless the first
        # "." is missing or within the first few characters, in which case
        # the first 200 characters are used.
        if "description" in result:
            desc = result["description"]
            idx = desc.find(".") + 1
            cutIdx = min(len(desc), 200 if idx < 5 else idx)
            result["short_description"] = desc[:cutIdx]

        # Get forum ID
        info = findModInfo(self.author, result.get("name"), self.repo)
        if info is not None:
            result["forumId"] = info.get("topicId")

        # The name lives on the node itself rather than in the meta dict.
        if "name" in result:
            self.name = result.pop("name")

        self.meta = result

    def addChildrenFromModDir(self, dir):
        """Add a child node for every non-hidden subdirectory of `dir`.

        A missing `dir` is treated as "no children".
        """
        if not os.path.isdir(dir):
            return

        for entry in sorted(os.listdir(dir)):
            path = dir + "/" + entry
            if not entry.startswith('.') and os.path.isdir(path):
                self.children.append(PackageTreeNode(path, name=entry))

    def fold(self, attr, key=None, acc=None):
        """Collect a value from this node and all descendants into a set.

        Reads attribute `attr` of each node; when `key` is given the
        attribute is treated as a dict and ``attr.get(key)`` is used. List
        values are merged element-wise, None is skipped. A node without
        metadata contributes nothing and its children are not visited.

        Returns `acc` (a new set when none was passed).
        """
        if acc is None:
            acc = set()

        if self.meta is None:
            return acc

        at = getattr(self, attr)
        value = at if key is None else at.get(key)

        if isinstance(value, list):
            acc |= set(value)
        elif value is not None:
            acc.add(value)

        for child in self.children:
            child.fold(attr, key, acc)

        return acc

    def get(self, key):
        """Return ``meta[key]``, or None if absent or no metadata was read."""
        if self.meta is None:
            return None
        return self.meta.get(key)
276
def generateGitURL(urlstr):
    """Rewrite `urlstr` into the URL handed to git for cloning.

    The result always uses the http scheme with an empty username and
    password (``http://:@host/...``). The fragment is dropped; a query
    string, if any, is kept and re-attached with its "?" separator.
    """
    parts = urlsplit(urlstr)

    gitUrl = "http://:@" + parts.netloc + parts.path
    if parts.query:
        # urlsplit() strips the "?", so it has to be put back explicitly.
        gitUrl += "?" + parts.query
    return gitUrl
281
# Clones a repo from an unvalidated URL.
# Returns a tuple of path and repo on success.
# Throws `TaskError` on failure.
# Caller is responsible for deleting the returned directory on success;
# on failure the directory is removed here before the error propagates.
def cloneRepo(urlstr, ref=None, recursive=False):
    gitDir = tempfile.gettempdir() + "/" + randomString(10)

    err = None
    try:
        gitUrl = generateGitURL(urlstr)
        print("Cloning from " + gitUrl)
        repo = git.Repo.clone_from(gitUrl, gitDir, \
                progress=None, env=None, depth=1, recursive=recursive, kill_after_timeout=15)

        # NOTE(review): the clone is shallow (depth=1), so `ref` presumably
        # has to be reachable from what was fetched -- confirm.
        if ref is not None:
            repo.create_head("myhead", ref).checkout()
        return gitDir, repo
    except GitCommandError as e:
        # This is needed to stop the backtrace being weird
        err = e.stderr
    except Exception:
        # Any other failure (e.g. an unknown ref): nobody will receive the
        # path, so clean up whatever was created before re-raising.
        shutil.rmtree(gitDir, ignore_errors=True)
        raise

    # A failed clone or checkout may leave a partial directory behind.
    shutil.rmtree(gitDir, ignore_errors=True)

    raise TaskError(err.replace("stderr: ", "") \
            .replace("Cloning into '" + gitDir + "'...", "") \
            .strip())
306
@celery.task()
def getMeta(urlstr, author):
    """Clone a repository and return the package metadata found in it.

    The result is a dict with the keys "name", "provides", "type",
    "depends", "optional_depends", "title", "repo", "issueTracker",
    "forumId", "description" and "short_description". Set-valued entries
    are converted to lists before returning. Names the package provides
    itself are removed from both dependency lists.

    Raises TaskError if cloning fails or the package type cannot be detected.
    """
    gitDir, _ = cloneRepo(urlstr, recursive=True)
    try:
        tree = PackageTreeNode(gitDir, author=author, repo=urlstr)
    finally:
        # Always remove the clone, including when scanning raises.
        shutil.rmtree(gitDir)

    # An empty repository yields a node without a detected type.
    if getattr(tree, "type", None) is None:
        raise TaskError("Unable to detect package type!")

    result = {}
    result["name"] = tree.name
    result["provides"] = tree.fold("name")
    result["type"] = tree.type.name

    for key in ["depends", "optional_depends"]:
        result[key] = tree.fold("meta", key)

    for key in ["title", "repo", "issueTracker", "forumId", "description", "short_description"]:
        result[key] = tree.get(key)

    # A package never depends on something it ships itself.
    for mod in result["provides"]:
        result["depends"].discard(mod)
        result["optional_depends"].discard(mod)

    return {key: list(value) if isinstance(value, set) else value
            for key, value in result.items()}
333
334
def makeVCSReleaseFromGithub(id, branch, release, url):
    """Point `release` at GitHub's zip archive of the newest commit on `branch`.

    `url` is the parsed repository URL. The commit list is fetched from the
    GitHub API; the first commit's sha is used to build the download URL,
    which is stored on the release together with a cleared task id, then
    committed. Returns the release URL.

    Raises TaskError when the URL is not a valid GitHub repository or no
    commit with a "sha" is returned.
    """
    maker = GithubURLMaker(url)
    if not maker.isValid():
        raise TaskError("Invalid github repo URL")

    response = urllib.request.urlopen(maker.getCommitsURL(branch))
    commits = json.loads(response.read().decode("utf-8"))

    has_head_commit = len(commits) != 0 and "sha" in commits[0]
    if not has_head_commit:
        raise TaskError("No commits found")

    head_sha = commits[0]["sha"]
    release.url = maker.getCommitDownload(head_sha)
    print(release.url)
    release.task_id = None
    db.session.commit()

    return release.url
353
354
355
@celery.task()
def makeVCSRelease(id, branch):
    """Create the downloadable archive for release `id` from `branch`.

    GitHub repositories are delegated to makeVCSReleaseFromGithub(). Any
    other repository is cloned, archived as a zip file into the uploads
    directory, and the release is updated to point at that file. The clone
    is always removed afterwards. Returns the release URL.

    Raises TaskError when the release or its package does not exist, or
    when cloning fails.
    """
    release = PackageRelease.query.get(id)
    if release is None:
        raise TaskError("No such release!")
    elif release.package is None:
        raise TaskError("No package attached to release")

    url = urlparse(release.package.repo)
    if url.netloc == "github.com":
        return makeVCSReleaseFromGithub(id, branch, release, url)
    else:
        gitDir, repo = cloneRepo(release.package.repo, ref=branch, recursive=True)

        try:
            filename = randomString(10) + ".zip"
            destPath = os.path.join("app/public/uploads", filename)
            with open(destPath, "wb") as fp:
                # git-archive emits tar unless told otherwise; the file is
                # named .zip, so the format has to be requested explicitly.
                repo.archive(fp, format="zip")

            release.url = "/uploads/" + filename
            print(release.url)
            release.task_id = None
            db.session.commit()

            return release.url
        finally:
            shutil.rmtree(gitDir)
385
@celery.task()
def importRepoScreenshot(id):
    """Import screenshot.{png,jpg,jpeg} from a package's repository.

    Clones the repository of package `id` and copies the first screenshot
    file found (in the order png, jpg, jpeg) into the uploads directory,
    recording it as an approved PackageScreenshot. The clone is always
    removed afterwards.

    Returns the upload URL, or None when cloning fails or no screenshot
    exists. Raises Exception when the package is missing or soft-deleted.
    """
    package = Package.query.get(id)
    if package is None or package.soft_deleted:
        raise Exception("Unexpected none package")

    # Clone the repository; a failed download is not treated as an error.
    try:
        gitDir, _ = cloneRepo(package.repo)
    except TaskError as e:
        print(e)
        return None

    # Find and import screenshot
    try:
        for ext in ("png", "jpg", "jpeg"):
            sourcePath = gitDir + "/screenshot." + ext
            if not os.path.isfile(sourcePath):
                continue

            filename = randomString(10) + "." + ext
            uploadUrl = "/uploads/" + filename
            shutil.copyfile(sourcePath, os.path.join("app/public/uploads", filename))

            screenshot = PackageScreenshot()
            screenshot.approved = True
            screenshot.package = package
            screenshot.title = "screenshot.png"
            screenshot.url = uploadUrl
            db.session.add(screenshot)
            db.session.commit()

            return uploadUrl
    finally:
        shutil.rmtree(gitDir)

    print("screenshot.png does not exist")
    return None
423
424
425
def getDepends(package):
    """Fetch a package's dependency declarations from its GitHub repository.

    Only repositories hosted on github.com are supported; anything else, or
    an unrecognised GitHub URL, yields an empty dict.

    mod.conf is tried first: if it declares "depends" and/or
    "optional_depends", those raw values are returned as-is. Otherwise
    depends.txt is parsed and both keys are returned as comma separated
    strings. A missing file (HTTP error) is skipped.

    Returns a dict with zero, one or both of the keys "depends" and
    "optional_depends".
    """
    url = urlparse(package.repo)
    if url.netloc != "github.com":
        return {}

    urlmaker = GithubURLMaker(url)
    if not urlmaker.isValid():
        return {}

    result = {}

    #
    # Try getting depends on mod.conf
    #
    try:
        # Built from baseUrl directly: the URL maker has no dedicated
        # getter for this file.
        modConfUrl = urlmaker.baseUrl + "/mod.conf"
        contents = urllib.request.urlopen(modConfUrl).read().decode("utf-8")
        conf = parseConf(contents)
        for key in ("depends", "optional_depends"):
            if key in conf:
                result[key] = conf[key]
    except HTTPError:
        print("mod.conf does not exist")

    if result:
        return result

    #
    # Try depends.txt
    #
    import re
    pattern = re.compile(r"^([a-z0-9_]+)\??$")
    try:
        dependsUrl = urlmaker.baseUrl + "/depends.txt"
        contents = urllib.request.urlopen(dependsUrl).read().decode("utf-8")
        soft = []
        hard = []
        for line in contents.split("\n"):
            line = line.strip()
            if not pattern.match(line):
                continue
            # A trailing "?" marks an optional dependency.
            if line.endswith("?"):
                soft.append(line[:-1])
            else:
                hard.append(line)

        result["depends"] = ",".join(hard)
        result["optional_depends"] = ",".join(soft)
    except HTTPError:
        print("depends.txt does not exist")

    return result
480
481
def importDependencies(package, mpackage_cache):
    """Add Dependency rows for `package` to the session, unless it has some.

    Does nothing when the package already has at least one dependency row.
    Otherwise the declarations from getDepends() are converted with
    Dependency.SpecToList(); hard dependencies are added with
    ``optional = False`` and soft ones with ``optional = True``. The session
    is not committed here.
    """
    already_imported = Dependency.query.filter_by(depender=package).count() != 0
    if already_imported:
        return

    result = getDepends(package)

    passes = (
        ("depends", "{} hard: {}", False),
        ("optional_depends", "{} soft: {}", True),
    )
    for key, report, is_optional in passes:
        if key not in result:
            continue

        spec = result[key]
        deps = Dependency.SpecToList(package, spec, mpackage_cache)
        print(report.format(len(deps), spec))
        for dep in deps:
            dep.optional = is_optional
            db.session.add(dep)
501
@celery.task()
def importAllDependencies():
    """Rebuild the dependency table for every mod package.

    Deletes all existing Dependency rows, re-imports the dependencies of
    each package of type MOD, and commits once at the end.
    """
    Dependency.query.delete()
    # Shared across packages and handed to Dependency.SpecToList().
    mpackage_cache = {}
    packages = Package.query.filter_by(type=PackageType.MOD).all()
    # Count from 1 so the progress line runs from 1/N to N/N.
    for i, p in enumerate(packages, start=1):
        print("============= {} ({}/{}) =============".format(p.name, i, len(packages)))
        importDependencies(p, mpackage_cache)

    db.session.commit()