git.lizzy.rs Git - cheatdb.git/commitdiff
Implement forum parser to increase accuracy
author: rubenwardy <rw@rubenwardy.com>
Tue, 3 Jul 2018 23:14:37 +0000 (00:14 +0100)
committer: rubenwardy <rw@rubenwardy.com>
Tue, 3 Jul 2018 23:38:51 +0000 (00:38 +0100)
app/models.py
app/tasks/forumtasks.py
app/tasks/phpbbparser.py
app/templates/admin/list.html
app/templates/macros/topictable.html
app/templates/packages/view.html
app/views/admin.py
app/views/packages/__init__.py
app/views/packages/todo.py
app/views/users.py
migrations/versions/9fc23495713b_.py [new file with mode: 0644]

index 5332dbdede0d0b9ce5c54af494c646161dfeaf95..b5b48366e54af08caa35830c6b06ac0312820691 100644 (file)
@@ -743,23 +743,25 @@ REPO_BLACKLIST = [".zip", "mediafire.com", "dropbox.com", "weebly.com", \
                "digitalaudioconcepts.com", "hg.intevation.org", "www.wtfpl.net", \
                "imageshack.com", "imgur.com"]
 
-class KrockForumTopic(db.Model):
+class ForumTopic(db.Model):
        topic_id  = db.Column(db.Integer, primary_key=True, autoincrement=False)
        author_id = db.Column(db.Integer, db.ForeignKey("user.id"), nullable=False)
        author    = db.relationship("User")
 
-       ttype     = db.Column(db.Integer, nullable=False)
+       type      = db.Column(db.Enum(PackageType), nullable=False)
        title     = db.Column(db.String(200), nullable=False)
        name      = db.Column(db.String(30), nullable=True)
        link      = db.Column(db.String(200), nullable=True)
 
-       def getType(self):
-               if self.ttype == 1 or self.ttype == 2:
-                       return PackageType.MOD
-               elif self.ttype == 6:
-                       return PackageType.GAME
+       posts     = db.Column(db.Integer, nullable=False)
+       views     = db.Column(db.Integer, nullable=False)
+
+       created_at = db.Column(db.DateTime, nullable=False, default=datetime.utcnow)
 
        def getRepoURL(self):
+               if self.link is None:
+                       return None
+
                for item in REPO_BLACKLIST:
                        if item in self.link:
                                return None
index b2e0ca8838b2c66a32a4da7b826b202bccab0144..5513fb2b4cddf79cff5b02588c588687bb6c9c31 100644 (file)
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
 
-import flask, json
+import flask, json, re
 from flask.ext.sqlalchemy import SQLAlchemy
 from app import app
 from app.models import *
 from app.tasks import celery
-from .phpbbparser import getProfile
+from .phpbbparser import getProfile, getTopicsFromForum
 import urllib.request
 from urllib.parse import urlparse, quote_plus
 
@@ -51,71 +51,88 @@ def checkForumAccount(username, token=None):
        if needsSaving:
                db.session.commit()
 
-@celery.task()
-def importUsersFromModList():
-       contents = urllib.request.urlopen("http://krock-works.16mb.com/MTstuff/modList.php").read().decode("utf-8")
-       list = json.loads(contents)
-       found = {}
-       imported = []
-
-       for user in User.query.all():
-               found[user.username] = True
-               if user.forums_username is not None:
-                       found[user.forums_username] = True
-
-       for x in list:
-               author = x.get("author")
-               if author is not None and not author in found:
-                       user = User(author)
-                       user.forums_username = author
-                       imported.append(author)
-                       found[author] = True
-                       db.session.add(user)
 
-       db.session.commit()
-       for author in found:
-               checkForumAccount.delay(author, None)
+regex_tag    = re.compile(r"\[([a-z0-9_]+)\]")
+BANNED_NAMES = ["mod", "game", "old", "outdated", "wip", "api", "beta", "alpha", "git"]
+def getNameFromTaglist(taglist):
+       for tag in reversed(regex_tag.findall(taglist)):
+               if len(tag) < 30 and not tag in BANNED_NAMES and \
+                               not re.match(r"^[a-z]?[0-9]+$", tag):
+                       return tag
 
+       return None
 
-BANNED_NAMES = ["mod", "game", "old", "outdated", "wip", "api"]
-ALLOWED_TYPES = [1, 2, 6]
+regex_title = re.compile(r"^((?:\[[^\]]+\] *)*)([^\[]+) *((?:\[[^\]]+\] *)*)[^\[]*$")
+def parseTitle(title):
+       m = regex_title.match(title)
+       if m is None:
+               print("Invalid title format: " + title)
+               return title, getNameFromTaglist(title)
+       else:
+               return m.group(2).strip(), getNameFromTaglist(m.group(3))
+
+def getLinksFromModSearch():
+       links = {}
 
-@celery.task()
-def importKrocksModList():
        contents = urllib.request.urlopen("http://krock-works.16mb.com/MTstuff/modList.php").read().decode("utf-8")
-       list = json.loads(contents)
-       username_to_user = {}
+       for x in json.loads(contents):
+               link = x.get("link")
+               if link is not None:
+                       links[int(x["topicId"])] = link
 
-       KrockForumTopic.query.delete()
+       return links
+
+@celery.task()
+def importTopicList():
+       links_by_id = getLinksFromModSearch()
+
+       info_by_id = {}
+       getTopicsFromForum(11, out=info_by_id, extra={ 'type': PackageType.MOD })
+       getTopicsFromForum(15, out=info_by_id, extra={ 'type': PackageType.GAME })
+
+       # Caches
+       username_to_user = {}
+       topics_by_id     = {}
+       for topic in ForumTopic.query.all():
+               topics_by_id[topic.topic_id] = topic
 
-       for x in list:
-               type = int(x["type"])
-               if not type in ALLOWED_TYPES:
-                       continue
+       # Create or update
+       for info in info_by_id.values():
+               id = int(info["id"])
 
-               username = x["author"]
+               # Get author
+               username = info["author"]
                user = username_to_user.get(username)
                if user is None:
                        user = User.query.filter_by(forums_username=username).first()
-                       assert(user is not None)
+                       if user is None:
+                               print(username + " not found!")
+                               user = User(username)
+                               user.forums_username = username
+                               db.session.add(user)
                        username_to_user[username] = user
 
-               import re
-               tags = re.findall("\[([a-z0-9_]+)\]", x["title"])
-               name = None
-               for tag in reversed(tags):
-                       if len(tag) < 30 and not tag in BANNED_NAMES and \
-                                       not re.match("^([a-z][0-9]+)$", tag):
-                               name = tag
-                               break
-
-               topic = KrockForumTopic()
-               topic.topic_id  = x["topicId"]
-               topic.author_id = user.id
-               topic.ttype     = type
-               topic.title     = x["title"]
-               topic.name      = name
-               topic.link      = x.get("link")
-               db.session.add(topic)
+               # Get / add row
+               topic = topics_by_id.get(id)
+               if topic is None:
+                       topic = ForumTopic()
+                       db.session.add(topic)
+
+               # Parse title
+               title, name = parseTitle(info["title"])
+
+               # Get link
+               link = links_by_id.get(id)
+
+               # Fill row
+               topic.topic_id   = id
+               topic.author     = user
+               topic.type       = info["type"]
+               topic.title      = title
+               topic.name       = name
+               topic.link       = link
+               topic.posts      = info["posts"]
+               topic.views      = info["views"]
+               topic.created_at = info["date"]
 
        db.session.commit()
index d27ccecb8fc2de9efbe473b1f11e6a805ebe12e6..9984ad054dc71059ee577492663f2e1a9b1ec685 100644 (file)
@@ -5,6 +5,7 @@
 import urllib, socket
 from bs4 import *
 from urllib.parse import urljoin
+from datetime import datetime
 import urllib.request
 import os.path
 import time, re
@@ -77,3 +78,72 @@ def getProfile(url, username):
                __extract_properties(profile, soup)
 
                return profile
+
+
+regex_id = re.compile(r"^.*t=([0-9]+).*$")
+
+def parseForumListPage(id, page, out, extra=None):
+       num_per_page = 30
+       start = page*num_per_page+1
+       print(" - Fetching page {} (topics {}-{})".format(page, start, start+num_per_page))
+
+       url = "https://forum.minetest.net/viewforum.php?f=" + str(id) + "&start=" + str(start)
+       r = urllib.request.urlopen(url).read().decode("utf-8")
+       soup = BeautifulSoup(r, "html.parser")
+
+       for row in soup.find_all("li", class_="row"):
+               classes = row.get("class")
+               if "sticky" in classes or "announce" in classes or "global-announce" in classes:
+                       continue
+
+               topic = row.find("dl")
+
+               # Link info
+               link   = topic.find(class_="topictitle")
+               id         = regex_id.match(link.get("href")).group(1)
+               title  = link.find(text=True)
+
+               # Date
+               left   = topic.find("dt")
+               date   = left.get_text().split("»")[1].strip()
+               date   = datetime.strptime(date, "%a %b %d, %Y %H:%M")
+               author = left.find_all("a")[-1].get_text().strip()
+
+               # Get counts
+               posts  = topic.find(class_="posts").find(text=True)
+               views  = topic.find(class_="views").find(text=True)
+
+               if id in out:
+                       print("   - got {} again, title: {}".format(id, title))
+                       assert(title == out[id]['title'])
+                       return False
+
+               row = {
+                       "id"    : id,
+                       "title" : title,
+                       "author": author,
+                       "posts" : posts,
+                       "views" : views,
+                       "date"  : date
+               }
+
+               if extra is not None:
+                       for key, value in extra.items():
+                               row[key] = value
+
+               out[id] = row
+
+       return True
+
+def getTopicsFromForum(id, out={}, extra=None):
+       print("Fetching all topics from forum {}".format(id))
+       page = 0
+       while parseForumListPage(id, page, out, extra):
+               page = page + 1
+
+       return out
+
+def dumpTitlesToFile(topics, path):
+       with open(path, "w") as out_file:
+               for topic in topics.values():
+                       out_file.write(topic["title"] + "\n")
index e5049f904c15577d1c1002b28cf02041358fcb61..c565fe025b09f567b4136f5a17f79f57e8e1a94a 100644 (file)
@@ -17,8 +17,7 @@
                <form method="post" action="" class="box-body">
                        <input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
                        <select name="action">
-                               <option value="importusers">Create users from mod list</option>
-                               <option value="importmodlist">Import Krock's mod list</option>
+                               <option value="importmodlist">Import forum topics</option>
                                <option value="importscreenshots" selected>Import screenshots from VCS</option>
                                <option value="importdepends">Import dependencies from downloads</option>
                                <option value="modprovides">Set provides to mod name</option>
index a0c5b1ec651be6f32148f0d7c8ebb7add9766b52..7ae8a3596ebcc8d66c56d227203a394bcce4f558 100644 (file)
        {% for topic in topics %}
                <tr>
                        <td>{{ topic.topic_id }}</td>
-                       <td>[{{ topic.getType().value }}] <a href="https://forum.minetest.net/viewtopic.php?t={{ topic.topic_id}}">{{ topic.title }}</a></td>
+                       <td>[{{ topic.type.value }}] <a href="https://forum.minetest.net/viewtopic.php?t={{ topic.topic_id}}">{{ topic.title }}</a></td>
                        {% if show_author %}
                                <td><a href="{{ url_for('user_profile_page', username=topic.author.username) }}">{{ topic.author.display_name}}</a></td>
                        {% endif %}
                        <td>{{ topic.name or ""}}</td>
-                       <td><a href="{{ topic.link }}">{{ topic.link | domain }}</a></td>
+                       <td>{% if topic.link %}<a href="{{ topic.link }}">{{ topic.link | domain }}</a>{% endif %}</td>
                        <td>
                                <a href="{{ url_for('create_edit_package_page', author=topic.author.username, repo=topic.getRepoURL(), forums=topic.topic_id, title=topic.title, bname=topic.name) }}">Create</a>
                        </td>
index ab48c6ef95cbeaa309ef8468a833399950710b79..f69b5cfc78eb0bdfe46e78bf9c849f7568025d32 100644 (file)
                <ul>
                        {% for t in similar_topics %}
                                <li>
-                                       [{{ t.getType().value }}]
+                                       [{{ t.type.value }}]
                                        <a href="https://forum.minetest.net/viewtopic.php?t={{ t.topic_id }}">
                                                {{ t.title }} by {{ t.author.display_name }}
                                        </a>
index 65d526497f500686f72292269514987c32437cc0..92ee43742a8cbb96d4a21ebf6ffffcca49da6802 100644 (file)
@@ -21,7 +21,7 @@ from flask.ext import menu
 from app import app
 from app.models import *
 from app.tasks.importtasks import importRepoScreenshot, importAllDependencies
-from app.tasks.forumtasks  import importUsersFromModList, importKrocksModList
+from app.tasks.forumtasks  import importTopicList
 from flask_wtf import FlaskForm
 from wtforms import *
 from app.utils import loginUser, rank_required
@@ -31,11 +31,8 @@ from app.utils import loginUser, rank_required
 def admin_page():
        if request.method == "POST":
                action = request.form["action"]
-               if action == "importusers":
-                       task = importUsersFromModList.delay()
-                       return redirect(url_for("check_task", id=task.id, r=url_for("user_list_page")))
-               elif action == "importmodlist":
-                       task = importKrocksModList.delay()
+               if action == "importmodlist":
+                       task = importTopicList.delay()
                        return redirect(url_for("check_task", id=task.id, r=url_for("todo_topics_page")))
                elif action == "importscreenshots":
                        packages = Package.query \
index 6ef76ec9b557e56efe4fd49a720730f5aef6e0f4..4d357a64fba8c52a2c4803a1e20592d3f802df5a 100644 (file)
@@ -100,11 +100,11 @@ def package_page(package):
                        package.checkPerm(current_user, Permission.APPROVE_NEW)
 
        similar_topics = None if not show_similar_topics else \
-                       KrockForumTopic.query \
+                       ForumTopic.query \
                                .filter_by(name=package.name) \
-                               .filter(KrockForumTopic.topic_id != package.forums) \
-                               .filter(~ db.exists().where(Package.forums==KrockForumTopic.topic_id)) \
-                               .order_by(db.asc(KrockForumTopic.name), db.asc(KrockForumTopic.title)) \
+                               .filter(ForumTopic.topic_id != package.forums) \
+                               .filter(~ db.exists().where(Package.forums==ForumTopic.topic_id)) \
+                               .order_by(db.asc(ForumTopic.name), db.asc(ForumTopic.title)) \
                                .all()
 
        releases = getReleases(package)
index 81735ebe5b710dbc7eda1fd7579481cabd3c3805..84cfef4f517ebf11d0c80134f521bfef780052ff 100644 (file)
@@ -41,8 +41,8 @@ def todo_page():
                screenshots = PackageScreenshot.query.filter_by(approved=False).all()
 
 
-       topics_to_add = KrockForumTopic.query \
-                       .filter(~ db.exists().where(Package.forums==KrockForumTopic.topic_id)) \
+       topics_to_add = ForumTopic.query \
+                       .filter(~ db.exists().where(Package.forums==ForumTopic.topic_id)) \
                        .count()
 
        return render_template("todo/list.html", title="Reports and Work Queue",
@@ -54,11 +54,11 @@ def todo_page():
 @app.route("/todo/topics/")
 @login_required
 def todo_topics_page():
-       total = KrockForumTopic.query.count()
+       total = ForumTopic.query.count()
 
-       topics = KrockForumTopic.query \
-                       .filter(~ db.exists().where(Package.forums==KrockForumTopic.topic_id)) \
-                       .order_by(db.asc(KrockForumTopic.name), db.asc(KrockForumTopic.title)) \
+       topics = ForumTopic.query \
+                       .filter(~ db.exists().where(Package.forums==ForumTopic.topic_id)) \
+                       .order_by(db.asc(ForumTopic.name), db.asc(ForumTopic.title)) \
                        .all()
 
        return render_template("todo/topics.html", topics=topics, total=total)
index 256f7d1023635f9cfd5010b7230149f7cd8a9808..a96fce2f63db71acee6982e408c475bdbbc45cb3 100644 (file)
@@ -98,10 +98,10 @@ def user_profile_page(username):
 
        topics_to_add = None
        if current_user == user or user.checkPerm(current_user, Permission.CHANGE_AUTHOR):
-               topics_to_add = KrockForumTopic.query \
+               topics_to_add = ForumTopic.query \
                                        .filter_by(author_id=user.id) \
-                                       .filter(~ db.exists().where(Package.forums==KrockForumTopic.topic_id)) \
-                                       .order_by(db.asc(KrockForumTopic.name), db.asc(KrockForumTopic.title)) \
+                                       .filter(~ db.exists().where(Package.forums==ForumTopic.topic_id)) \
+                                       .order_by(db.asc(ForumTopic.name), db.asc(ForumTopic.title)) \
                                        .all()
 
        # Process GET or invalid POST
diff --git a/migrations/versions/9fc23495713b_.py b/migrations/versions/9fc23495713b_.py
new file mode 100644 (file)
index 0000000..f457ae5
--- /dev/null
@@ -0,0 +1,55 @@
+"""empty message
+
+Revision ID: 9fc23495713b
+Revises: de004661c5e1
+Create Date: 2018-07-04 00:03:20.123285
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = '9fc23495713b'
+down_revision = 'de004661c5e1'
+branch_labels = None
+depends_on = None
+from sqlalchemy.dialects.postgresql import ENUM
+
+type_enum = ENUM('MOD', 'GAME', 'TXP', name='packagetype', create_type=False)
+
+def upgrade():
+    type_enum.create(op.get_bind(), checkfirst=True)
+
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_table('krock_forum_topic')
+    op.create_table('forum_topic',
+    sa.Column('topic_id', sa.Integer(), autoincrement=False, nullable=False),
+    sa.Column('author_id', sa.Integer(), nullable=False),
+    sa.Column('type', type_enum, nullable=True),
+    sa.Column('title', sa.String(length=200), nullable=False),
+    sa.Column('name', sa.String(length=30), nullable=True),
+    sa.Column('link', sa.String(length=200), nullable=True),
+    sa.Column('posts', sa.Integer(), nullable=False),
+    sa.Column('views', sa.Integer(), nullable=False),
+    sa.Column('created_at', sa.DateTime(), nullable=False),
+    sa.ForeignKeyConstraint(['author_id'], ['user.id'], ),
+    sa.PrimaryKeyConstraint('topic_id')
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_table('forum_topic')
+    op.create_table('krock_forum_topic',
+    sa.Column('topic_id', sa.Integer(), autoincrement=False, nullable=False),
+    sa.Column('author_id', sa.Integer(), nullable=False),
+    sa.Column('ttype', sa.Integer(), nullable=False),
+    sa.Column('title', sa.String(length=200), nullable=False),
+    sa.Column('name', sa.String(length=30), nullable=True),
+    sa.Column('link', sa.String(length=50), nullable=True),
+    sa.ForeignKeyConstraint(['author_id'], ['user.id'], ),
+    sa.PrimaryKeyConstraint('topic_id')
+    )
+    # ### end Alembic commands ###