From 2fd631554d6494a38ffe3e2067679a3ade168e00 Mon Sep 17 00:00:00 2001
From: paulkernfeld
Date: Wed, 16 Sep 2020 12:52:55 -0400
Subject: [PATCH 01/19] Add unmodified bwds code to utilities

---
 .../utilities/bad_words_detection_system.py   | 276 ++++++++++++++++++
 editquality/utilities/dump_based_detection.py | 224 ++++++++++++++
 2 files changed, 500 insertions(+)
 create mode 100755 editquality/utilities/bad_words_detection_system.py
 create mode 100644 editquality/utilities/dump_based_detection.py

diff --git a/editquality/utilities/bad_words_detection_system.py b/editquality/utilities/bad_words_detection_system.py
new file mode 100755
index 0000000..240b52c
--- /dev/null
+++ b/editquality/utilities/bad_words_detection_system.py
@@ -0,0 +1,276 @@
+"""
+WIP
+The script to find bad words automatically.
+
+It gets a set of added words and determines tf-idf of words
+the it uses K-means algorithm to determin them.
+
+Some parts are copied from
+https://github.com/halfak/Objective-Revision-Evaluation-Service/blob/master/ores/label_reverted.py
+
+>>> from bad_words_detection_system import *
+>>> edits = [Edit(1, {'one':1, 'two': 2}, False), Edit(2, {'three':3}, True),
+...          Edit(3, {'one':5, 'four': 1}, False)]
+>>> bot = Bot()
+>>> bot.parse_edits(edits)
+>>> bot.parse_bad_edits()
+
+python3 bad_words_detection_system.py --rev-pages:f.txt
+    --api:https://en.wikipedia.org/w/api.php
+    --language:revscoring.languages.english
+
+Use cache:
+python3 bad_words_detection_system.py --cache:
+"""
+import math
+import sys
+import traceback
+import json
+import time
+from importlib import import_module
+from collections import OrderedDict
+# TODO: User argparse
+
+from revscoring.extractors import APIExtractor
+from revscoring.datasources import diff
+
+from mw import api
+from mw.lib import reverts
+
+base_file_path = '/data/project/dexbot/pywikibot-core/something_'
+
+
+class Edit(object):
+    def __init__(self, rev_id, added_words, reverted):
+        self.id = rev_id
+        self.added_words = added_words
+        if not isinstance(self.added_words, dict):
+            self.fix_added_words()
+        self.reverted = reverted
+
+    def fix_added_words(self):
+        temp = {}
+        for word in self.added_words:
+            temp[word] = temp.get(word, 0) + 1
+        self.added_words = temp
+
+
+class Bot(object):
+
+    def __init__(self, words_cache=None, bad_words_cache=None, no_docs=None):
+        self.bad_edits = Edit(-1, {}, True)
+        self.counter = 0
+        self.words_db = {}
+        self.bad_words_db = {}
+        self.bad_counter = 0
+        if bool(bad_words_cache) != bool(words_cache):
+            raise "You should define both"
+        if words_cache:
+            self.cache = True
+            self.initiate_cache(words_cache, bad_words_cache, no_docs)
+        else:
+            self.cache = False
+
+    def initiate_cache(self, words_cache, bad_words_cache, no_docs):
+        with open(words_cache, 'r') as f:
+            self.words_db = json.loads(f.read())
+        with open(bad_words_cache, 'r') as f:
+            self.bad_edits.added_words = json.loads(f.read())
+        with open(no_docs, 'r') as f:
+            self.counter = int(f.read())
+
+    def parse_edits(self, edits):
+        for edit in edits:
+            # Since edits can be gen and len doesn't mean there
+            self.counter += 1
+            if edit.reverted:
+                for word in edit.added_words:
+                    self.bad_edits.added_words[word] = \
+                        self.bad_edits.added_words.get(word, 0) + \
+                        edit.added_words[word]
+                    self.bad_words_db[word] = (
+                        self.bad_words_db.get(word, 0) + 1)
+                self.bad_counter += 1
+                continue
+            for word in edit.added_words:
+                self.words_db[word] = self.words_db.get(word, 0) + 1
+
+    def parse_bad_edits(self, numbers_to_show=10):
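+        # Comment added for clarity: this pass scores every word seen in
+        # reverted edits; tf-idf marks bad-word candidates, idf stop words.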
+        self.possible_bad_words = {}
+        self.stop_words = {}
+        if not self.cache:
+            self.counter += 1
+        for word in self.bad_edits.added_words:
+            if not self.cache:
+                self.words_db[word] = self.words_db.get(word, 0) + 1
+            if 'sh' in word or 'ch' in word:
+                continue
+            self.possible_bad_words[word] = self.tf_idf(word)
+            self.stop_words[word] = self.idf(word)
+        if numbers_to_show:
+            self.show_results(numbers_to_show)
+            self.show_results2(numbers_to_show)
+
+    def tf_idf(self, word):
+        tf = math.log(self.bad_edits.added_words[word]) + 1
+        idf = math.log(float(self.counter)/self.words_db[word])
+        return tf*idf
+
+    def idf(self, word):
+        return math.log(float(self.counter)/self.words_db[word])
+
+    def show_results(self, numbers_to_show):
+        print("Showing %d results" % numbers_to_show)
+        values = sorted(self.possible_bad_words.values())
+        lim = values[numbers_to_show*-1]
+        res = {}
+        for word in self.possible_bad_words:
+            if self.possible_bad_words[word] >= lim:
+                res[word] = self.possible_bad_words[word]
+        res = OrderedDict(
+            sorted(res.items(), key=lambda t: t[1], reverse=True))
+        res_text = []
+        for word in res:
+            res_text.append(word)
+        res_text.sort()
+        res_text = "#" + '\n#'.join(res_text)
+        self.bad_words_res_text = res_text
+        with open('%s_%s.txt' % (base_file_path, time.time()), 'w') as f:
+            f.write(res_text)
+
+    def show_results2(self, numbers_to_show):
+        print("Showing another %d results" % numbers_to_show)
+        values = sorted(self.stop_words.values(), reverse=True)
+        lim = values[numbers_to_show*-1]
+        res = {}
+        for word in self.stop_words:
+            if self.stop_words[word] <= lim:
+                res[word] = self.stop_words[word]
+        res = OrderedDict(sorted(res.items(), key=lambda t: t[1]))
+        res_text = []
+        for word in res:
+            res_text.append(word)
+        res_text.sort()
+        res_text = "#" + '\n#'.join(res_text)
+        self.stop_words_res_text = res_text
+        with open('%s2_%s.txt' % (base_file_path, time.time()), 'w') as f:
+            f.write(res_text)
+
+    def dump(self):
+        new_db = {}
+        for word in self.bad_edits.added_words:
+            new_db[word] = self.words_db[word]
+        with open('words_db.txt', 'w') as f:
+            f.write(json.dumps(new_db))
+        with open('bad_edits_words.txt', 'w') as f:
+            f.write(json.dumps(self.bad_edits.added_words))
+        with open('no_docs.txt', 'w') as f:
+            f.write(json.dumps(self.counter))
+
+
+def read_rev_pages(f):
+
+    for line in f:
+        parts = line.strip().split('\t')
+
+        if len(parts) == 1:
+            rev_id = parts
+            yield int(rev_id[0]), None
+        elif len(parts) == 2:
+            rev_id, page_id = parts
+            yield int(rev_id), int(page_id)
+
+
+def import_from_path(path):
+    parts = path.split(".")
+    module_path = ".".join(parts[:-1])
+    attribute_name = parts[-1]
+
+    module = import_module(module_path)
+
+    attribute = getattr(module, attribute_name)
+
+    return attribute
+
+
+def handle_args():
+    args = {}
+    for arg in sys.argv[1:]:
+        if arg.startswith('--rev-pages:'):
+            args['--rev-pages'] = arg[len('--rev-pages:'):]
+        elif arg.startswith('--language:'):
+            args['--language'] = arg[len('--language:'):]
+        elif arg.startswith('--api:'):
+            args['--api'] = arg[len('--api:'):]
+        elif arg.startswith('--cache:'):
+            args['--cache'] = arg[len('--cache:'):]
+        elif arg.startswith('--num_res:'):
+            args['--num_res'] = arg[len('--num_res:'):]
+        else:
+            print('Unknown argument')
+    return args
+
+
+def bot_gen(rev_pages, language, api_url):
+
+    session = api.Session(api_url)
+    extractor = Extractor(session, language=language)
+
+    for rev_id, page_id in rev_pages:
+        sys.stderr.write(".")
+        sys.stderr.flush()
+        try:
+
+            # Detect reverted status
+            revert = reverts.api.check(session, rev_id, page_id,
+                                       radius=3)
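+            # Comment added for clarity: with radius=3, a revision counts
+            # as reverted only if it is undone within the next 3 edits.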
+            reverted = revert is not None
+            added_words = list(
+                extractor.extract(rev_id, [diff.added_words]))[0]
+            yield Edit(rev_id, added_words, reverted)
+
+        except KeyboardInterrupt:
+            sys.stderr.write("\n^C Caught. Exiting...")
+            break
+
+        except:
+            sys.stderr.write(traceback.format_exc())
+            sys.stderr.write("\n")
+
+    sys.stderr.write("\n")
+
+
+def cache_parse(pathes, num_res):
+    if not pathes.strip():
+        pathes = 'words_db.txt,bad_edits_words.txt,no_docs.txt'
+    pathes = pathes.split(',')
+    bot = Bot(words_cache=pathes[0], bad_words_cache=pathes[1],
+              no_docs=pathes[2])
+    bot.parse_bad_edits(num_res)
+
+
+def main():
+    args = handle_args()
+    if '--num_res' in args:
+        num_res = int(args['--num_res'])
+    else:
+        num_res = 10
+    if '--cache' in args:
+        cache_parse(args['--cache'], num_res)
+        return
+    rev_pages = read_rev_pages(open(args['--rev-pages']))
+
+    if args['--language'] is not None:
+        language = import_from_path(args['--language'])
+    else:
+        language = None
+
+    api_url = args['--api']
+    gen = bot_gen(rev_pages, language, api_url)
+    bot = Bot()
+    bot.parse_edits(gen)
+    bot.parse_bad_edits(num_res)
+    bot.dump()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/editquality/utilities/dump_based_detection.py b/editquality/utilities/dump_based_detection.py
new file mode 100644
index 0000000..d1ccfd9
--- /dev/null
+++ b/editquality/utilities/dump_based_detection.py
@@ -0,0 +1,224 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+# Copyright © 2014 He7d3r
+# License: http://he7d3r.mit-license.org/
+"""
+Extermely under construction.
+Some parts are copied from
+https://gist.github.com/he7d3r/f99482f4f54f97895ccb/9205f3271fe8daa2f694f4ce3ba9b29213dbad6c
+"""
+from nltk.tokenize import RegexpTokenizer
+import sys
+from mw.lib import reverts
+from pywikibot import xmlreader
+import pywikibot
+import re
+import time
+import regex
+
+from bad_words_detection_system import Edit, Bot
+
+cache = {}
+
+languages_by_size = [
+    'en', 'sv', 'nl', 'de', 'fr', 'war', 'ru', 'ceb', 'it', 'es', 'vi',
+    'pl', 'ja', 'pt', 'zh', 'uk', 'ca', 'fa', 'no', 'sh', 'fi', 'ar',
+    'id', 'cs', 'sr', 'ro', 'ko', 'hu', 'ms', 'tr', 'min', 'eo', 'kk',
+    'eu', 'sk', 'da', 'bg', 'he', 'lt', 'hy', 'hr', 'sl', 'et', 'uz',
+    'gl', 'nn', 'vo', 'la', 'simple', 'el', 'hi', 'az', 'th', 'ka',
+    'ce', 'oc', 'be', 'mk', 'mg', 'new', 'ur', 'tt', 'ta', 'pms', 'cy',
+    'tl', 'lv', 'bs', 'te', 'be-tarask', 'br', 'ht', 'sq', 'jv', 'lb',
+    'mr', 'is', 'ml', 'zh-yue', 'bn', 'af', 'ba', 'ga', 'pnb', 'cv',
+    'fy', 'lmo', 'tg', 'sco', 'my', 'yo', 'an', 'ky', 'sw', 'io', 'ne',
+    'gu', 'scn', 'bpy', 'nds', 'ku', 'ast', 'qu', 'als', 'su', 'pa',
+    'kn', 'ckb', 'ia', 'mn', 'nap', 'bug', 'arz', 'bat-smg', 'wa',
+    'zh-min-nan', 'am', 'map-bms', 'gd', 'yi', 'mzn', 'si', 'fo',
+    'bar', 'vec', 'nah', 'sah', 'os', 'sa', 'roa-tara', 'li', 'hsb',
+    'pam', 'mrj', 'mhr', 'se', 'mi', 'ilo', 'hif', 'bcl', 'gan', 'rue',
+    'ps', 'glk', 'nds-nl', 'bo', 'vls', 'diq', 'fiu-vro', 'bh', 'xmf',
+    'tk', 'gv', 'sc', 'co', 'csb', 'hak', 'km', 'kv', 'vep', 'zea',
+    'crh', 'zh-classical', 'frr', 'eml', 'ay', 'stq', 'udm', 'wuu',
+    'nrm', 'kw', 'rm', 'szl', 'so', 'koi', 'as', 'lad', 'fur', 'mt',
+    'dv', 'gn', 'dsb', 'ie', 'pcd', 'sd', 'lij', 'cbk-zam', 'cdo',
+    'ksh', 'ext', 'mwl', 'gag', 'ang', 'ug', 'ace', 'pi', 'pag', 'nv',
+    'lez', 'frp', 'sn', 'kab', 'ln', 'myv', 'pfl', 'xal', 'krc', 'haw',
+    'rw', 'pdc', 'kaa', 'to', 'kl', 'arc', 'nov', 'kbd', 'av', 'bxr',
+    'lo', 'bjn', 'ha', 'tet', 'tpi', 'na', 'pap', 'lbe', 'jbo', 'ty',
+    'mdf', 'roa-rup', 'wo', 'tyv', 'ig', 'srn', 'nso', 'kg', 'ab',
+    'ltg', 'zu', 'om', 'za', 'chy', 'cu', 'rmy', 'tw', 'tn', 'chr',
+    'mai', 'pih', 'got', 'xh', 'bi', 'sm', 'ss', 'rn', 'ki', 'pnt',
+    'bm', 'iu', 'ee', 'lg', 'ts', 'fj', 'ak', 'ik', 'st', 'sg', 'ff',
+    'dz', 'ny', 'ch', 'ti', 've', 'ks', 'tum', 'cr', 'gom', 'lrc',
+    'azb', 'or'
+    ]
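+# Comment added for clarity: these are interwiki prefixes; page_info()
+# uses this list to strip interlanguage links such as [[en:...]]
+# before tokenizing.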
+cjk = (
+    r'\u4E00-\u62FF' +  # Unified Ideographs
+    r'\u6300-\u77FF' +
+    r'\u7800-\u8CFF' +
+    r'\u8D00-\u9FCC' +
+    r'\u3400-\u4DFF' +  # Unified Ideographs Ext A
+    r'\U00020000-\U000215FF' +  # Unified Ideographs Ext. B
+    r'\U00021600-\U000230FF' +
+    r'\U00023100-\U000245FF' +
+    r'\U00024600-\U000260FF' +
+    r'\U00026100-\U000275FF' +
+    r'\U00027600-\U000290FF' +
+    r'\U00029100-\U0002A6DF' +
+    r'\uF900-\uFAFF' +  # Compatibility Ideographs
+    r'\U0002F800-\U0002FA1F'  # Compatibility Ideographs Suppl.
+)
+
+chars = {
+    'az': u'A-Za-zÇçƏəĞğıİÖöŞşÜü',
+    'ar': u'غظضذخثتشرقصفعسنملكيطحزوهدجبا',
+    'et': u'A-Za-zŠšŽžÕõÄäÖöÜü',
+    'af': u'A-Za-züûöôïîëêè',
+    'en': u'A-Za-z',
+    'id': u'A-Za-z',
+    'ko': cjk,
+    'zh': cjk,
+    'ja': cjk,
+    'pt': u'A-Za-záàâãçéêíóôõúüÁÀÂÃÇÉÊÍÓÔÕÚ',
+    'tr': u'A-Za-zÇĞİÖŞÜçğıöşüâîûÂÎÛ',
+    'fa': u'ابپتثجچحخدذرزژسشصآضطظعغفقکگلمنوهی‌يك',
+    'fr': u'A-Za-zÀàÂâÆæÄäÇçÉéÈèÊêËëÎîÏïÔôŒœÖöÙùÛûÜüŸÿ',
+    'de': u'A-Za-zÄäÖöÜüß',
+    'es': u'A-Za-zÑñéÉüÜóÓ',
+    'uk': u'АаБбВвГгҐґДдЕеЄєЖжЗзИиІіЇїЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЬ'
+          u'ьЮюЯя',
+    'pl': u'AaĄąBbCcĆćDdEeĘęFfGgHhIiJjKkLlŁłMmNnŃńOoÓóPpRrSsŚśTtUuWwYyZzŹźŻż',
+    'he': u'למנסעפצקרשתםןףץאבגדהוזחטיכך',
+    'hy': u'ԱաԲբԳգԴդԵեԶզԷէԸըԹթԺժԻիԼլԽխԾծԿկՀհՁձՂղՃճՄմՅյՆնՇշՈոՉչՊպՋջՌռՍսՎվՏտՐր'
+          u'ՑցՈՒՈւուՒւՓփՔքևևՕօՖֆ',
+    'vi': u'AaĂăÂâBbCcDdĐđEeÊêGgHhIiKkLlMmNnOoÔôƠơPpQqRrSsTtUuƯưVvXxYy',
+    'ur': u'ابپتٹثجچحخدڈذرڑزژسشصضطظعغفقکگلمنوهھءیےٹڈڑ‌آّْیٰوَُِٗ',
+    'uz': 'A-Za-zʻ',
+    'sv': u'A-Za-zÅÄÖåäö',
+    'hu': u'A-Za-zËëÉéÓóÖöŐőÚúÜüŰűÁá',
+    'cs': u'A-Za-zÁáČčĎďÉéĚěÍíŇňÓóŘřŠšŤťÚúŮůÝýŽž',
+    'hi': u'कखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसहळक्षज्ञ:अपआपाइपिईपीउपुऊपूऋपृॠप'
+          u'ॄऌपॢॡपॣएपेऐपैओपोऔपौअंपंअःपः',
+    'no': u'A-Za-zÆØÅæøåéèêóòâôüáàé',
+    'ta': u'௰௱௲௳௴௵௶௷௸௹௺ௗௐொோௌ்ெேைீுூாிரறலளழவஶஷஸஹணதநனபம'
+          u'யஐஒஓஔகஙசஜஞடஂஃஅஆஇஈஉஊஎஏ',
+}
+
+
+def lower(a, lang):
+    if lang == 'tr':
+        return a.replace('I', u'ı').replace(u'İ', 'i').lower()
+    return a.lower()
+
+
+def page_info(dump, lang):
+    global tokenizer
+    c = 1
+    di_old = []
+    di = []
+    nombre = '3,' if lang not in ['ja', 'zh'] else '1'
+    for entry in dump.parse():
+        if entry.ns != '0':
+            continue
+        if c != entry.id:
+            if c != 1:
+                di_old = di[:]
+                di = []
+            if entry.id and int(entry.id[-1]) == 0:
+                print('new page', entry.id)
+            di.append(entry)
+        else:
+            di.append(entry)
+            continue
+        c = entry.id
+        firstRev = True
+        history = {}
+        detector = reverts.Detector(radius=3)
+        for revision in di_old:
+            revision.text = re.sub(
+                r'\[\[(%s)\:' % '|'.join(languages_by_size),
+                '',
+                revision.text)
+            words = set()
+            if lang in chars:
+                token_pattern = r'[%s]{%s}' % (chars[lang], nombre)
+                tokenizer = RegexpTokenizer(token_pattern)
+                tokens = tokenizer.tokenize(revision.text)
+            else:
+                token_pattern = r'\p{alpha}+'
+                tokens = regex.findall(token_pattern, revision.text)
+            for w in tokens:
+                words.add(lower(w, lang))
+            if firstRev:
+                prevIntersection = words
+                firstRev = False
+            added = words - prevIntersection
+            prevIntersection = words
+            history[revision.revisionid] = Edit(
+                revision.revisionid, added, False)
+            rev = detector.process(revision.text,
+                                   {'rev_id': revision.revisionid})
+            if rev:
+                for reverted in rev.reverteds:
+                    history[reverted['rev_id']].reverted = True
+
+        yield history
+
+
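+# Comment added for clarity: run() aggregates per-page Edit histories
+# from each dump, ranks words with the Bot, and publishes the resulting
+# word lists to a page on the meta wiki.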
+def run(dumps):
+    number = 500000
+    counter = 0
+    start_time = time.time()
+    for casee in dumps:
+        lang = casee.split('/')[-1].split('wiki')[0]
+        dump = xmlreader.XmlDump(casee, True)
+        bot = Bot()
+        for case in page_info(dump, lang):
+            counter += 1
+            if number and counter > number:
+                break
+            bot.parse_edits(case.values())
+        bot.parse_bad_edits(250)
+        bot.dump()
+        print(time.time() - start_time)
+        site = pywikibot.Site('meta', fam='meta')
+        page = pywikibot.Page(
+            site, 'Research:Revision scoring as a service/Word lists/' + lang)
+        try:
+            text = page.get()
+        except pywikibot.NoPage:
+            text = ("{{Research:Revision scoring as a service/template/word list "
+                    "data\n |lang=%s\n |gen=250\n |badwords=-\n |informal=-"
+                    "\n |stopwords=-\n |dictionary=-\n |stemmer=-\n |contact="
+                    "\n |features=no\n |labels=requested\n |campaign=no\n "
+                    "|needs=-\n |list-generated=\n |list-stop=\n}}\n" % lang)
+        except:
+            return False
+        new_text = text
+        if re.search(r'\|\s*?list\-generated\s*?\=\s*?', text):
+            if re.search(r'\|\s*?list\-generated\s*?\=\s*?(\||\}\})', text):
+                new_text = re.sub(
+                    r'(\|\s*?list\-generated\s*?\=\s*?)(\||\}\})',
+                    r'\1%s\2' % bot.bad_words_res_text,
+                    new_text)
+        else:
+            new_text = re.sub(
+                r'\}\}',
+                r'|list-generated=%s\n}}' % bot.bad_words_res_text,
+                new_text)
+        if re.search(r'\|\s*?list\-stop\s*?\=\s*?', text):
+            if re.search(r'\|\s*?list\-stop\s*?\=\s*?(\||\}\})', text):
+                new_text = re.sub(
+                    r'(\|\s*?list\-stop\s*?\=\s*?)(\||\}\})',
+                    r'\1%s\2' % bot.stop_words_res_text,
+                    new_text)
+        else:
+            new_text = re.sub(
+                r'\}\}',
+                r'|list-stop=%s\n}}' % bot.stop_words_res_text,
+                new_text)
+        if new_text != text:
+            page.text = new_text
+            page.save('Bot: update results')
+if __name__ == "__main__":
+    dumps = sys.argv[1:]
+    run(dumps)

From 5c75035a70f5ec88336a2487f1ec4ab8776b9de8 Mon Sep 17 00:00:00 2001
From: paulkernfeld
Date: Wed, 16 Sep 2020 15:27:15 -0400
Subject: [PATCH 02/19] Copy library code into bwds/__init__.py

---
 editquality/bwds/__init__.py | 158 +++++++++++++++++++++++++++++++++++
 1 file changed, 158 insertions(+)
 create mode 100644 editquality/bwds/__init__.py

diff --git a/editquality/bwds/__init__.py b/editquality/bwds/__init__.py
new file mode 100644
index 0000000..0c80707
--- /dev/null
+++ b/editquality/bwds/__init__.py
@@ -0,0 +1,158 @@
+"""
+Code to find bad words automatically.
+
+It gets a set of added words and determines tf-idf of words
+the it uses K-means algorithm to determine them.
+
+Some parts are copied from
+https://github.com/halfak/Objective-Revision-Evaluation-Service/blob/master/ores/label_reverted.py
+
+>>> from bad_words_detection_system import *
+>>> edits = [Edit(1, {'one':1, 'two': 2}, False), Edit(2, {'three':3}, True),
+...          Edit(3, {'one':5, 'four': 1}, False)]
+>>> bot = Bot()
+>>> bot.parse_edits(edits)
+>>> bot.parse_bad_edits()
+"""
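+# Note added for clarity: tf_idf() below computes
+# (1 + log(count in reverted edits)) * log(n_docs / doc_frequency).
+# E.g. a word added 3 times in reverted edits and seen in 1 of 4
+# documents scores (1 + log 3) * log 4, roughly 2.91 (illustrative
+# numbers, not from the original).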
+import math
+import sys
+import traceback
+import json
+import time
+from importlib import import_module
+from collections import OrderedDict
+# TODO: User argparse
+
+from revscoring.extractors.api import Extractor
+from revscoring.datasources import revision_oriented
+
+from mwapi import Session
+import mwreverts
+
+
+class Edit(object):
+    def __init__(self, rev_id, added_words, reverted):
+        self.id = rev_id
+        self.added_words = added_words
+        if not isinstance(self.added_words, dict):
+            self.fix_added_words()
+        self.reverted = reverted
+
+    def fix_added_words(self):
+        temp = {}
+        for word in self.added_words:
+            temp[word] = temp.get(word, 0) + 1
+        self.added_words = temp
+
+
+class Bot(object):
+
+    def __init__(self, words_cache=None, bad_words_cache=None, no_docs=None):
+        self.bad_edits = Edit(-1, {}, True)
+        self.counter = 0
+        self.words_db = {}
+        self.bad_words_db = {}
+        self.bad_counter = 0
+        if bool(bad_words_cache) != bool(words_cache):
+            raise "You should define both"
+        if words_cache:
+            self.cache = True
+            self.initiate_cache(words_cache, bad_words_cache, no_docs)
+        else:
+            self.cache = False
+
+    def initiate_cache(self, words_cache, bad_words_cache, no_docs):
+        with open(words_cache, 'r') as f:
+            self.words_db = json.loads(f.read())
+        with open(bad_words_cache, 'r') as f:
+            self.bad_edits.added_words = json.loads(f.read())
+        with open(no_docs, 'r') as f:
+            self.counter = int(f.read())
+
+    def parse_edits(self, edits):
+        for edit in edits:
+            # Since edits can be gen and len doesn't mean there
+            self.counter += 1
+            if edit.reverted:
+                for word in edit.added_words:
+                    self.bad_edits.added_words[word] = \
+                        self.bad_edits.added_words.get(word, 0) + \
+                        edit.added_words[word]
+                    self.bad_words_db[word] = (
+                        self.bad_words_db.get(word, 0) + 1)
+                self.bad_counter += 1
+                continue
+            for word in edit.added_words:
+                self.words_db[word] = self.words_db.get(word, 0) + 1
+
+    def parse_bad_edits(self, numbers_to_show=10):
+        self.possible_bad_words = {}
+        self.stop_words = {}
+        if not self.cache:
+            self.counter += 1
+        for word in self.bad_edits.added_words:
+            if not self.cache:
+                self.words_db[word] = self.words_db.get(word, 0) + 1
+            if 'sh' in word or 'ch' in word:
+                continue
+            self.possible_bad_words[word] = self.tf_idf(word)
+            self.stop_words[word] = self.idf(word)
+        if numbers_to_show:
+            self.show_results(numbers_to_show)
+            self.show_results2(numbers_to_show)
+
+    def tf_idf(self, word):
+        tf = math.log(self.bad_edits.added_words[word]) + 1
+        idf = math.log(float(self.counter)/self.words_db[word])
+        return tf*idf
+
+    def idf(self, word):
+        return math.log(float(self.counter)/self.words_db[word])
+
+    def show_results(self, numbers_to_show):
+        print("Showing %d results" % numbers_to_show)
+        values = sorted(self.possible_bad_words.values())
+        lim = values[numbers_to_show*-1]
+        res = {}
+        for word in self.possible_bad_words:
+            if self.possible_bad_words[word] >= lim:
+                res[word] = self.possible_bad_words[word]
+        res = OrderedDict(
+            sorted(res.items(), key=lambda t: t[1], reverse=True))
+        res_text = []
+        for word in res:
+            res_text.append(word)
+        res_text.sort()
+        res_text = "#" + '\n#'.join(res_text)
+        self.bad_words_res_text = res_text
+        with open('%s_%s.txt' % (base_file_path, time.time()), 'w') as f:
+            f.write(res_text)
+
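+    # Comment added for clarity: show_results2 reports stop-word
+    # candidates, i.e. the words with the lowest idf (those that
+    # appear in the most documents).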
+    def show_results2(self, numbers_to_show):
+        print("Showing another %d results" % numbers_to_show)
+        values = sorted(self.stop_words.values(), reverse=True)
+        lim = values[numbers_to_show*-1]
+        res = {}
+        for word in self.stop_words:
+            if self.stop_words[word] <= lim:
+                res[word] = self.stop_words[word]
+        res = OrderedDict(sorted(res.items(), key=lambda t: t[1]))
+        res_text = []
+        for word in res:
+            res_text.append(word)
+        res_text.sort()
+        res_text = "#" + '\n#'.join(res_text)
+        self.stop_words_res_text = res_text
+        with open('%s2_%s.txt' % (base_file_path, time.time()), 'w') as f:
+            f.write(res_text)
+
+    def dump(self):
+        new_db = {}
+        for word in self.bad_edits.added_words:
+            new_db[word] = self.words_db[word]
+        with open('words_db.txt', 'w') as f:
+            f.write(json.dumps(new_db))
+        with open('bad_edits_words.txt', 'w') as f:
+            f.write(json.dumps(self.bad_edits.added_words))
+        with open('no_docs.txt', 'w') as f:
+            f.write(json.dumps(self.counter))

From 69f35f2961e4a9086e16072b6e1255f5961cedd8 Mon Sep 17 00:00:00 2001
From: paulkernfeld
Date: Wed, 16 Sep 2020 15:53:27 -0400
Subject: [PATCH 03/19] Bot and Edit doctest works

---
 editquality/bwds/__init__.py | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/editquality/bwds/__init__.py b/editquality/bwds/__init__.py
index 0c80707..5777692 100644
--- a/editquality/bwds/__init__.py
+++ b/editquality/bwds/__init__.py
@@ -2,12 +2,12 @@
 Code to find bad words automatically.
 
 It gets a set of added words and determines tf-idf of words
-the it uses K-means algorithm to determine them.
+then it uses K-means algorithm to determine them.
 
 Some parts are copied from
 https://github.com/halfak/Objective-Revision-Evaluation-Service/blob/master/ores/label_reverted.py
 
->>> from bad_words_detection_system import *
+>>> from editquality.bwds import Bot, Edit
 >>> edits = [Edit(1, {'one':1, 'two': 2}, False), Edit(2, {'three':3}, True),
 ...          Edit(3, {'one':5, 'four': 1}, False)]
 >>> bot = Bot()
@@ -15,19 +15,11 @@
 >>> bot.parse_bad_edits()
 """
 import math
-import sys
-import traceback
 import json
 import time
-from importlib import import_module
 from collections import OrderedDict
-# TODO: User argparse
 
-from revscoring.extractors.api import Extractor
-from revscoring.datasources import revision_oriented
-
-from mwapi import Session
-import mwreverts
+base_file_path = '/data/project/dexbot/pywikibot-core/something_'
 
 
 class Edit(object):
@@ -54,7 +46,7 @@ def __init__(self, words_cache=None, bad_words_cache=None, no_docs=None):
         self.bad_words_db = {}
         self.bad_counter = 0
         if bool(bad_words_cache) != bool(words_cache):
-            raise "You should define both"
+            raise ValueError("bad_words_cache should be defined if and only words_cache is defined")
         if words_cache:
             self.cache = True
             self.initiate_cache(words_cache, bad_words_cache, no_docs)
@@ -103,16 +95,16 @@ def parse_bad_edits(self, numbers_to_show=10):
 
     def tf_idf(self, word):
         tf = math.log(self.bad_edits.added_words[word]) + 1
-        idf = math.log(float(self.counter)/self.words_db[word])
+        idf = math.log(self.counter/self.words_db[word])
         return tf*idf
 
     def idf(self, word):
-        return math.log(float(self.counter)/self.words_db[word])
+        return math.log(self.counter/self.words_db[word])
 
     def show_results(self, numbers_to_show):
         print("Showing %d results" % numbers_to_show)
         values = sorted(self.possible_bad_words.values())
-        lim = values[numbers_to_show*-1]
+        lim = values[max(0, len(values) - numbers_to_show)]
         res = {}
         for word in self.possible_bad_words:
             if self.possible_bad_words[word] >= lim:
@@ -131,7 +123,7 @@ def show_results(self, numbers_to_show):
     def show_results2(self, numbers_to_show):
         print("Showing another %d results" % numbers_to_show)
         values = sorted(self.stop_words.values(), reverse=True)
-        lim = values[numbers_to_show*-1]
+        lim = values[max(0, len(values) - numbers_to_show)]
         res = {}
         for word in self.stop_words:
             if self.stop_words[word] <= lim:

From a62d5c42c2095bef8ad2184e4ccf0281fa7dadb7 Mon Sep 17 00:00:00 2001
From: paulkernfeld
Date: Wed, 16 Sep 2020 16:05:33 -0400
Subject: [PATCH 04/19] Add BWDS scripts to sphinx

---
 editquality/utilities/__init__.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/editquality/utilities/__init__.py b/editquality/utilities/__init__.py
index b26cdd7..c6678f5 100644
--- a/editquality/utilities/__init__.py
+++ b/editquality/utilities/__init__.py
@@ -43,4 +43,12 @@
 merge_labels
 ++++++++++++
 .. automodule:: editquality.utilities.merge_labels
+
+bad_words_detection_system
+++++++++++++++++++++++++++
+.. automodule:: editquality.utilities.bad_words_detection_system
+
+dump_based_detection
+++++++++++++++++++++
+.. automodule:: editquality.utilities.dump_based_detection
 """

From d2cfe3348df279bab4e460ac6610c18d0040fdff Mon Sep 17 00:00:00 2001
From: paulkernfeld
Date: Fri, 18 Sep 2020 11:59:00 -0400
Subject: [PATCH 05/19] Add test_parse_bad_edits

---
 editquality/bwds/__init__.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/editquality/bwds/__init__.py b/editquality/bwds/__init__.py
index 5777692..65b731d 100644
--- a/editquality/bwds/__init__.py
+++ b/editquality/bwds/__init__.py
@@ -148,3 +148,10 @@ def dump(self):
         f.write(json.dumps(self.bad_edits.added_words))
     with open('no_docs.txt', 'w') as f:
         f.write(json.dumps(self.counter))
+
+
+def test_parse_bad_edits():
+    edits = [Edit(1, {'one': 1, 'two': 2}, False), Edit(2, {'three': 3}, True), Edit(3, {'one': 5, 'four': 1}, False)]
+    bot = Bot()
+    bot.parse_edits(edits)
+    bot.parse_bad_edits(numbers_to_show=0)

From 1962bc716e5bfc74ba8df65fa62039fcea1885ec Mon Sep 17 00:00:00 2001
From: paulkernfeld
Date: Fri, 18 Sep 2020 12:03:21 -0400
Subject: [PATCH 06/19] Remove dump_based_detection.py

I'm hoping to add this in a future PR
---
 editquality/utilities/dump_based_detection.py | 224 ------------------
 1 file changed, 224 deletions(-)
 delete mode 100644 editquality/utilities/dump_based_detection.py

diff --git a/editquality/utilities/dump_based_detection.py b/editquality/utilities/dump_based_detection.py
deleted file mode 100644
index d1ccfd9..0000000
--- a/editquality/utilities/dump_based_detection.py
+++ /dev/null
@@ -1,224 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: UTF-8 -*-
-# Copyright © 2014 He7d3r
-# License: http://he7d3r.mit-license.org/
-"""
-Extermely under construction.
-Some parts are copied from
-https://gist.github.com/he7d3r/f99482f4f54f97895ccb/9205f3271fe8daa2f694f4ce3ba9b29213dbad6c
-"""
-from nltk.tokenize import RegexpTokenizer
-import sys
-from mw.lib import reverts
-from pywikibot import xmlreader
-import pywikibot
-import re
-import time
-import regex
-
-from bad_words_detection_system import Edit, Bot
-
-cache = {}
-
-languages_by_size = [
-    'en', 'sv', 'nl', 'de', 'fr', 'war', 'ru', 'ceb', 'it', 'es', 'vi',
-    'pl', 'ja', 'pt', 'zh', 'uk', 'ca', 'fa', 'no', 'sh', 'fi', 'ar',
-    'id', 'cs', 'sr', 'ro', 'ko', 'hu', 'ms', 'tr', 'min', 'eo', 'kk',
-    'eu', 'sk', 'da', 'bg', 'he', 'lt', 'hy', 'hr', 'sl', 'et', 'uz',
-    'gl', 'nn', 'vo', 'la', 'simple', 'el', 'hi', 'az', 'th', 'ka',
-    'ce', 'oc', 'be', 'mk', 'mg', 'new', 'ur', 'tt', 'ta', 'pms', 'cy',
-    'tl', 'lv', 'bs', 'te', 'be-tarask', 'br', 'ht', 'sq', 'jv', 'lb',
-    'mr', 'is', 'ml', 'zh-yue', 'bn', 'af', 'ba', 'ga', 'pnb', 'cv',
-    'fy', 'lmo', 'tg', 'sco', 'my', 'yo', 'an', 'ky', 'sw', 'io', 'ne',
-    'gu', 'scn', 'bpy', 'nds', 'ku', 'ast', 'qu', 'als', 'su', 'pa',
-    'kn', 'ckb', 'ia', 'mn', 'nap', 'bug', 'arz', 'bat-smg', 'wa',
-    'zh-min-nan', 'am', 'map-bms', 'gd', 'yi', 'mzn', 'si', 'fo',
-    'bar', 'vec', 'nah', 'sah', 'os', 'sa', 'roa-tara', 'li', 'hsb',
-    'pam', 'mrj', 'mhr', 'se', 'mi', 'ilo', 'hif', 'bcl', 'gan', 'rue',
-    'ps', 'glk', 'nds-nl', 'bo', 'vls', 'diq', 'fiu-vro', 'bh', 'xmf',
-    'tk', 'gv', 'sc', 'co', 'csb', 'hak', 'km', 'kv', 'vep', 'zea',
-    'crh', 'zh-classical', 'frr', 'eml', 'ay', 'stq', 'udm', 'wuu',
-    'nrm', 'kw', 'rm', 'szl', 'so', 'koi', 'as', 'lad', 'fur', 'mt',
-    'dv', 'gn', 'dsb', 'ie', 'pcd', 'sd', 'lij', 'cbk-zam', 'cdo',
-    'ksh', 'ext', 'mwl', 'gag', 'ang', 'ug', 'ace', 'pi', 'pag', 'nv',
-    'lez', 'frp', 'sn', 'kab', 'ln', 'myv', 'pfl', 'xal', 'krc', 'haw',
-    'rw', 'pdc', 'kaa', 'to', 'kl', 'arc', 'nov', 'kbd', 'av', 'bxr',
-    'lo', 'bjn', 'ha', 'tet', 'tpi', 'na', 'pap', 'lbe', 'jbo', 'ty',
-    'mdf', 'roa-rup', 'wo', 'tyv', 'ig', 'srn', 'nso', 'kg', 'ab',
-    'ltg', 'zu', 'om', 'za', 'chy', 'cu', 'rmy', 'tw', 'tn', 'chr',
-    'mai', 'pih', 'got', 'xh', 'bi', 'sm', 'ss', 'rn', 'ki', 'pnt',
-    'bm', 'iu', 'ee', 'lg', 'ts', 'fj', 'ak', 'ik', 'st', 'sg', 'ff',
-    'dz', 'ny', 'ch', 'ti', 've', 'ks', 'tum', 'cr', 'gom', 'lrc',
-    'azb', 'or'
-    ]
-cjk = (
-    r'\u4E00-\u62FF' +  # Unified Ideographs
-    r'\u6300-\u77FF' +
-    r'\u7800-\u8CFF' +
-    r'\u8D00-\u9FCC' +
-    r'\u3400-\u4DFF' +  # Unified Ideographs Ext A
-    r'\U00020000-\U000215FF' +  # Unified Ideographs Ext. B
-    r'\U00021600-\U000230FF' +
-    r'\U00023100-\U000245FF' +
-    r'\U00024600-\U000260FF' +
-    r'\U00026100-\U000275FF' +
-    r'\U00027600-\U000290FF' +
-    r'\U00029100-\U0002A6DF' +
-    r'\uF900-\uFAFF' +  # Compatibility Ideographs
-    r'\U0002F800-\U0002FA1F'  # Compatibility Ideographs Suppl.
-)
-
-chars = {
-    'az': u'A-Za-zÇçƏəĞğıİÖöŞşÜü',
-    'ar': u'غظضذخثتشرقصفعسنملكيطحزوهدجبا',
-    'et': u'A-Za-zŠšŽžÕõÄäÖöÜü',
-    'af': u'A-Za-züûöôïîëêè',
-    'en': u'A-Za-z',
-    'id': u'A-Za-z',
-    'ko': cjk,
-    'zh': cjk,
-    'ja': cjk,
-    'pt': u'A-Za-záàâãçéêíóôõúüÁÀÂÃÇÉÊÍÓÔÕÚ',
-    'tr': u'A-Za-zÇĞİÖŞÜçğıöşüâîûÂÎÛ',
-    'fa': u'ابپتثجچحخدذرزژسشصآضطظعغفقکگلمنوهی‌يك',
-    'fr': u'A-Za-zÀàÂâÆæÄäÇçÉéÈèÊêËëÎîÏïÔôŒœÖöÙùÛûÜüŸÿ',
-    'de': u'A-Za-zÄäÖöÜüß',
-    'es': u'A-Za-zÑñéÉüÜóÓ',
-    'uk': u'АаБбВвГгҐґДдЕеЄєЖжЗзИиІіЇїЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЬ'
-          u'ьЮюЯя',
-    'pl': u'AaĄąBbCcĆćDdEeĘęFfGgHhIiJjKkLlŁłMmNnŃńOoÓóPpRrSsŚśTtUuWwYyZzŹźŻż',
-    'he': u'למנסעפצקרשתםןףץאבגדהוזחטיכך',
-    'hy': u'ԱաԲբԳգԴդԵեԶզԷէԸըԹթԺժԻիԼլԽխԾծԿկՀհՁձՂղՃճՄմՅյՆնՇշՈոՉչՊպՋջՌռՍսՎվՏտՐր'
-          u'ՑցՈՒՈւուՒւՓփՔքևևՕօՖֆ',
-    'vi': u'AaĂăÂâBbCcDdĐđEeÊêGgHhIiKkLlMmNnOoÔôƠơPpQqRrSsTtUuƯưVvXxYy',
-    'ur': u'ابپتٹثجچحخدڈذرڑزژسشصضطظعغفقکگلمنوهھءیےٹڈڑ‌آّْیٰوَُِٗ',
-    'uz': 'A-Za-zʻ',
-    'sv': u'A-Za-zÅÄÖåäö',
-    'hu': u'A-Za-zËëÉéÓóÖöŐőÚúÜüŰűÁá',
-    'cs': u'A-Za-zÁáČčĎďÉéĚěÍíŇňÓóŘřŠšŤťÚúŮůÝýŽž',
-    'hi': u'कखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसहळक्षज्ञ:अपआपाइपिईपीउपुऊपूऋपृॠप'
-          u'ॄऌपॢॡपॣएपेऐपैओपोऔपौअंपंअःपः',
-    'no': u'A-Za-zÆØÅæøåéèêóòâôüáàé',
-    'ta': u'௰௱௲௳௴௵௶௷௸௹௺ௗௐொோௌ்ெேைீுூாிரறலளழவஶஷஸஹணதநனபம'
-          u'யஐஒஓஔகஙசஜஞடஂஃஅஆஇஈஉஊஎஏ',
-}
-
-
-def lower(a, lang):
-    if lang == 'tr':
-        return a.replace('I', u'ı').replace(u'İ', 'i').lower()
-    return a.lower()
-
-
-def page_info(dump, lang):
-    global tokenizer
-    c = 1
-    di_old = []
-    di = []
-    nombre = '3,' if lang not in ['ja', 'zh'] else '1'
-    for entry in dump.parse():
-        if entry.ns != '0':
-            continue
-        if c != entry.id:
-            if c != 1:
-                di_old = di[:]
-                di = []
-            if entry.id and int(entry.id[-1]) == 0:
-                print('new page', entry.id)
-            di.append(entry)
-        else:
-            di.append(entry)
-            continue
-        c = entry.id
-        firstRev = True
-        history = {}
-        detector = reverts.Detector(radius=3)
-        for revision in di_old:
-            revision.text = re.sub(
-                r'\[\[(%s)\:' % '|'.join(languages_by_size),
-                '',
-                revision.text)
-            words = set()
-            if lang in chars:
-                token_pattern = r'[%s]{%s}' % (chars[lang], nombre)
-                tokenizer = RegexpTokenizer(token_pattern)
-                tokens = tokenizer.tokenize(revision.text)
-            else:
-                token_pattern = r'\p{alpha}+'
-                tokens = regex.findall(token_pattern, revision.text)
-            for w in tokens:
-                words.add(lower(w, lang))
-            if firstRev:
-                prevIntersection = words
-                firstRev = False
-            added = words - prevIntersection
-            prevIntersection = words
-            history[revision.revisionid] = Edit(
-                revision.revisionid, added, False)
-            rev = detector.process(revision.text,
-                                   {'rev_id': revision.revisionid})
-            if rev:
-                for reverted in rev.reverteds:
-                    history[reverted['rev_id']].reverted = True
-
-        yield history
-
-
-def run(dumps):
-    number = 500000
-    counter = 0
-    start_time = time.time()
-    for casee in dumps:
-        lang = casee.split('/')[-1].split('wiki')[0]
-        dump = xmlreader.XmlDump(casee, True)
-        bot = Bot()
-        for case in page_info(dump, lang):
-            counter += 1
-            if number and counter > number:
-                break
-            bot.parse_edits(case.values())
-        bot.parse_bad_edits(250)
-        bot.dump()
-        print(time.time() - start_time)
-        site = pywikibot.Site('meta', fam='meta')
-        page = pywikibot.Page(
-            site, 'Research:Revision scoring as a service/Word lists/' + lang)
-        try:
-            text = page.get()
-        except pywikibot.NoPage:
-            text = ("{{Research:Revision scoring as a service/template/word list "
-                    "data\n |lang=%s\n |gen=250\n |badwords=-\n |informal=-"
-                    "\n |stopwords=-\n |dictionary=-\n |stemmer=-\n |contact="
-                    "\n |features=no\n |labels=requested\n |campaign=no\n "
-                    "|needs=-\n |list-generated=\n |list-stop=\n}}\n" % lang)
-        except:
-            return False
-        new_text = text
-        if re.search(r'\|\s*?list\-generated\s*?\=\s*?', text):
-            if re.search(r'\|\s*?list\-generated\s*?\=\s*?(\||\}\})', text):
-                new_text = re.sub(
-                    r'(\|\s*?list\-generated\s*?\=\s*?)(\||\}\})',
-                    r'\1%s\2' % bot.bad_words_res_text,
-                    new_text)
-        else:
-            new_text = re.sub(
-                r'\}\}',
-                r'|list-generated=%s\n}}' % bot.bad_words_res_text,
-                new_text)
-        if re.search(r'\|\s*?list\-stop\s*?\=\s*?', text):
-            if re.search(r'\|\s*?list\-stop\s*?\=\s*?(\||\}\})', text):
-                new_text = re.sub(
-                    r'(\|\s*?list\-stop\s*?\=\s*?)(\||\}\})',
-                    r'\1%s\2' % bot.stop_words_res_text,
-                    new_text)
-        else:
-            new_text = re.sub(
-                r'\}\}',
-                r'|list-stop=%s\n}}' % bot.stop_words_res_text,
-                new_text)
-        if new_text != text:
-            page.text = new_text
-            page.save('Bot: update results')
-if __name__ == "__main__":
-    dumps = sys.argv[1:]
-    run(dumps)

From a5e06c23f08ed82ec3d55d50bb92dd79805d3528 Mon Sep 17 00:00:00 2001
From: paulkernfeld
Date: Fri, 18 Sep 2020 12:24:00 -0400
Subject: [PATCH 07/19] Add a few unit tests

I still haven't figured out how the script is supposed to work
---
 editquality/bwds/__init__.py                  |  38 +++-
 editquality/bwds/tests/__init__.py            |   0
 editquality/bwds/tests/bad_edits_words.txt    |   1 +
 editquality/bwds/tests/no_docs.txt            |   1 +
 editquality/bwds/tests/test_bwds.py           |  29 +++
 editquality/bwds/tests/words_db.txt           |   1 +
 .../utilities/bad_words_detection_system.py   | 208 ++----------------
 7 files changed, 81 insertions(+), 197 deletions(-)
 create mode 100644 editquality/bwds/tests/__init__.py
 create mode 100644 editquality/bwds/tests/bad_edits_words.txt
 create mode 100644 editquality/bwds/tests/no_docs.txt
 create mode 100644 editquality/bwds/tests/test_bwds.py
 create mode 100644 editquality/bwds/tests/words_db.txt

diff --git a/editquality/bwds/__init__.py b/editquality/bwds/__init__.py
index 65b731d..eee3552 100644
--- a/editquality/bwds/__init__.py
+++ b/editquality/bwds/__init__.py
@@ -18,6 +18,7 @@ import math
 import json
 import time
 from collections import OrderedDict
+from importlib import import_module
 
 base_file_path = '/data/project/dexbot/pywikibot-core/something_'
 
@@ -150,8 +151,35 @@ def dump(self):
             f.write(json.dumps(self.counter))
 
 
-def test_parse_bad_edits():
-    edits = [Edit(1, {'one': 1, 'two': 2}, False), Edit(2, {'three': 3}, True), Edit(3, {'one': 5, 'four': 1}, False)]
-    bot = Bot()
-    bot.parse_edits(edits)
-    bot.parse_bad_edits(numbers_to_show=0)
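+# Comment added for clarity: each input line is either 'rev_id' on its
+# own or 'rev_id<TAB>page_id'.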
+def read_rev_pages(f):
+
+    for line in f:
+        parts = line.strip().split('\t')
+
+        if len(parts) == 1:
+            rev_id = parts
+            yield int(rev_id[0]), None
+        elif len(parts) == 2:
+            rev_id, page_id = parts
+            yield int(rev_id), int(page_id)
+
+
+def import_from_path(path):
+    parts = path.split(".")
+    module_path = ".".join(parts[:-1])
+    attribute_name = parts[-1]
+
+    module = import_module(module_path)
+
+    attribute = getattr(module, attribute_name)
+
+    return attribute
+
+
+def cache_parse(pathes, num_res):
+    if not pathes.strip():
+        pathes = 'words_db.txt,bad_edits_words.txt,no_docs.txt'
+    pathes = pathes.split(',')
+    bot = Bot(words_cache=pathes[0], bad_words_cache=pathes[1],
+              no_docs=pathes[2])
+    bot.parse_bad_edits(num_res)
diff --git a/editquality/bwds/tests/__init__.py b/editquality/bwds/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/editquality/bwds/tests/bad_edits_words.txt b/editquality/bwds/tests/bad_edits_words.txt
new file mode 100644
index 0000000..073311b
--- /dev/null
+++ b/editquality/bwds/tests/bad_edits_words.txt
@@ -0,0 +1 @@
+bad_edits_words
\ No newline at end of file
diff --git a/editquality/bwds/tests/no_docs.txt b/editquality/bwds/tests/no_docs.txt
new file mode 100644
index 0000000..285d09b
--- /dev/null
+++ b/editquality/bwds/tests/no_docs.txt
@@ -0,0 +1 @@
+no_docs.txt
\ No newline at end of file
diff --git a/editquality/bwds/tests/test_bwds.py b/editquality/bwds/tests/test_bwds.py
new file mode 100644
index 0000000..e9ea48d
--- /dev/null
+++ b/editquality/bwds/tests/test_bwds.py
@@ -0,0 +1,29 @@
+from editquality.bwds import import_from_path, cache_parse, Edit, Bot
+from editquality.utilities.bad_words_detection_system import bot_gen
+
+
+def test_import_from_path():
+    import_from_path('revscoring.languages.english')
+
+
+def test_cache_parse():
+    cache_parse(
+        'editquality/bwds/tests/words_db.txt,'
+        'editquality/bwds/tests/bad_edits_words.txt,'
+        'editquality/bwds/tests/no_docs.txt',
+        num_res=1
+    )
+
+
+def test_bot_gen():
+    en_main_page_id = 232335
+    a_revision_id = 7101436
+    en_api_url = 'https://en.wikipedia.org/w/api.php'
+    bot_gen([(en_main_page_id, a_revision_id)], 'TODO', en_api_url)
+
+
+def test_parse_bad_edits():
+    edits = [Edit(1, {'one': 1, 'two': 2}, False), Edit(2, {'three': 3}, True), Edit(3, {'one': 5, 'four': 1}, False)]
+    bot = Bot()
+    bot.parse_edits(edits)
+    bot.parse_bad_edits(numbers_to_show=0)
diff --git a/editquality/bwds/tests/words_db.txt b/editquality/bwds/tests/words_db.txt
new file mode 100644
index 0000000..9316897
--- /dev/null
+++ b/editquality/bwds/tests/words_db.txt
@@ -0,0 +1 @@
+words_db
\ No newline at end of file
diff --git a/editquality/utilities/bad_words_detection_system.py b/editquality/utilities/bad_words_detection_system.py
index 240b52c..7113f93 100755
--- a/editquality/utilities/bad_words_detection_system.py
+++ b/editquality/utilities/bad_words_detection_system.py
@@ -2,19 +2,6 @@
 WIP
 The script to find bad words automatically.
 
-It gets a set of added words and determines tf-idf of words
-the it uses K-means algorithm to determin them.
-
-Some parts are copied from
-https://github.com/halfak/Objective-Revision-Evaluation-Service/blob/master/ores/label_reverted.py
-
->>> from bad_words_detection_system import *
->>> edits = [Edit(1, {'one':1, 'two': 2}, False), Edit(2, {'three':3}, True),
-...          Edit(3, {'one':5, 'four': 1}, False)]
->>> bot = Bot()
->>> bot.parse_edits(edits)
->>> bot.parse_bad_edits()
-
 python3 bad_words_detection_system.py --rev-pages:f.txt
     --api:https://en.wikipedia.org/w/api.php
     --language:revscoring.languages.english
@@ -22,175 +9,17 @@
 Use cache:
 python3 bad_words_detection_system.py --cache:
 """
-import math
 import sys
 import traceback
-import json
-import time
-from importlib import import_module
-from collections import OrderedDict
 # TODO: User argparse
 
-from revscoring.extractors import APIExtractor
-from revscoring.datasources import diff
-
-from mw import api
-from mw.lib import reverts
-
-base_file_path = '/data/project/dexbot/pywikibot-core/something_'
-
-
-class Edit(object):
-    def __init__(self, rev_id, added_words, reverted):
-        self.id = rev_id
-        self.added_words = added_words
-        if not isinstance(self.added_words, dict):
-            self.fix_added_words()
-        self.reverted = reverted
-
-    def fix_added_words(self):
-        temp = {}
-        for word in self.added_words:
-            temp[word] = temp.get(word, 0) + 1
-        self.added_words = temp
-
-
-class Bot(object):
-
-    def __init__(self, words_cache=None, bad_words_cache=None, no_docs=None):
-        self.bad_edits = Edit(-1, {}, True)
-        self.counter = 0
-        self.words_db = {}
-        self.bad_words_db = {}
-        self.bad_counter = 0
-        if bool(bad_words_cache) != bool(words_cache):
-            raise "You should define both"
-        if words_cache:
-            self.cache = True
-            self.initiate_cache(words_cache, bad_words_cache, no_docs)
-        else:
-            self.cache = False
-
-    def initiate_cache(self, words_cache, bad_words_cache, no_docs):
-        with open(words_cache, 'r') as f:
-            self.words_db = json.loads(f.read())
-        with open(bad_words_cache, 'r') as f:
-            self.bad_edits.added_words = json.loads(f.read())
-        with open(no_docs, 'r') as f:
-            self.counter = int(f.read())
-
-    def parse_edits(self, edits):
-        for edit in edits:
-            # Since edits can be gen and len doesn't mean there
-            self.counter += 1
-            if edit.reverted:
-                for word in edit.added_words:
-                    self.bad_edits.added_words[word] = \
-                        self.bad_edits.added_words.get(word, 0) + \
-                        edit.added_words[word]
-                    self.bad_words_db[word] = (
-                        self.bad_words_db.get(word, 0) + 1)
-                self.bad_counter += 1
-                continue
-            for word in edit.added_words:
-                self.words_db[word] = self.words_db.get(word, 0) + 1
-
-    def parse_bad_edits(self, numbers_to_show=10):
-        self.possible_bad_words = {}
-        self.stop_words = {}
-        if not self.cache:
-            self.counter += 1
-        for word in self.bad_edits.added_words:
-            if not self.cache:
-                self.words_db[word] = self.words_db.get(word, 0) + 1
-            if 'sh' in word or 'ch' in word:
-                continue
-            self.possible_bad_words[word] = self.tf_idf(word)
-            self.stop_words[word] = self.idf(word)
-        if numbers_to_show:
-            self.show_results(numbers_to_show)
-            self.show_results2(numbers_to_show)
-
-    def tf_idf(self, word):
-        tf = math.log(self.bad_edits.added_words[word]) + 1
-        idf = math.log(float(self.counter)/self.words_db[word])
-        return tf*idf
+from revscoring.extractors.api import Extractor
+from revscoring.features.wikitext import Diff
 
-    def idf(self, word):
-        return math.log(float(self.counter)/self.words_db[word])
+from mwapi import Session
+import mwreverts
 
-    def show_results(self, numbers_to_show):
-        print("Showing %d results" % numbers_to_show)
-        values = sorted(self.possible_bad_words.values())
-        lim = values[numbers_to_show*-1]
-        res = {}
-        for word in self.possible_bad_words:
-            if self.possible_bad_words[word] >= lim:
-                res[word] = self.possible_bad_words[word]
-        res = OrderedDict(
-            sorted(res.items(), key=lambda t: t[1], reverse=True))
-        res_text = []
-        for word in res:
-            res_text.append(word)
-        res_text.sort()
-        res_text = "#" + '\n#'.join(res_text)
-        self.bad_words_res_text = res_text
-        with open('%s_%s.txt' % (base_file_path, time.time()), 'w') as f:
-            f.write(res_text)
-
-    def show_results2(self, numbers_to_show):
-        print("Showing another %d results" % numbers_to_show)
-        values = sorted(self.stop_words.values(), reverse=True)
-        lim = values[numbers_to_show*-1]
-        res = {}
-        for word in self.stop_words:
-            if self.stop_words[word] <= lim:
-                res[word] = self.stop_words[word]
-        res = OrderedDict(sorted(res.items(), key=lambda t: t[1]))
-        res_text = []
-        for word in res:
-            res_text.append(word)
-        res_text.sort()
-        res_text = "#" + '\n#'.join(res_text)
-        self.stop_words_res_text = res_text
-        with open('%s2_%s.txt' % (base_file_path, time.time()), 'w') as f:
-            f.write(res_text)
-
-    def dump(self):
-        new_db = {}
-        for word in self.bad_edits.added_words:
-            new_db[word] = self.words_db[word]
-        with open('words_db.txt', 'w') as f:
-            f.write(json.dumps(new_db))
-        with open('bad_edits_words.txt', 'w') as f:
-            f.write(json.dumps(self.bad_edits.added_words))
-        with open('no_docs.txt', 'w') as f:
-            f.write(json.dumps(self.counter))
-
-
-def read_rev_pages(f):
-
-    for line in f:
-        parts = line.strip().split('\t')
-
-        if len(parts) == 1:
-            rev_id = parts
-            yield int(rev_id[0]), None
-        elif len(parts) == 2:
-            rev_id, page_id = parts
-            yield int(rev_id), int(page_id)
-
-
-def import_from_path(path):
-    parts = path.split(".")
-    module_path = ".".join(parts[:-1])
-    attribute_name = parts[-1]
-
-    module = import_module(module_path)
-
-    attribute = getattr(module, attribute_name)
-
-    return attribute
+from editquality.bwds import Bot, Edit, cache_parse, import_from_path, read_rev_pages
 
 
 def handle_args():
@@ -212,21 +42,24 @@ def handle_args():
 
 
 def bot_gen(rev_pages, language, api_url):
 
-    session = api.Session(api_url)
-    extractor = Extractor(session, language=language)
+    session = Session(api_url)
+    extractor = Extractor(session)
 
     for rev_id, page_id in rev_pages:
+        api_result = session.get(action='query', titles='Main Page', prop='revisions', rvlimit=500, rvprop='sha1|ids')
+        revisions = next(iter(api_result['query']['pages'].values()))['revisions']
+
         sys.stderr.write(".")
         sys.stderr.flush()
        try:
+            revisions = [revision for revision in revisions if 'sha1hidden' not in revision]
 
             # Detect reverted status
-            revert = reverts.api.check(session, rev_id, page_id,
-                                       radius=3)
-            reverted = revert is not None
-            added_words = list(
-                extractor.extract(rev_id, [diff.added_words]))[0]
-            yield Edit(rev_id, added_words, reverted)
+            for revert in mwreverts.detect((revision['sha1'], revision) for revision in revisions):
+                for reverted in revert.reverteds:
+                    # added_words = list(extractor.extract(rev_id, [diff.added_words]))[0]
+                    added_words = list()  # TODO how to upgrade this?
+                    yield Edit(rev_id, added_words, reverted)
 
         except KeyboardInterrupt:
            sys.stderr.write("\n^C Caught. Exiting...")
@@ -239,15 +72,6 @@ def bot_gen(rev_pages, language, api_url):
 
     sys.stderr.write("\n")
 
-
-def cache_parse(pathes, num_res):
-    if not pathes.strip():
-        pathes = 'words_db.txt,bad_edits_words.txt,no_docs.txt'
-    pathes = pathes.split(',')
-    bot = Bot(words_cache=pathes[0], bad_words_cache=pathes[1],
-              no_docs=pathes[2])
-    bot.parse_bad_edits(num_res)
-
-
 def main():
     args = handle_args()
     if '--num_res' in args:

From c14eeddb9b8eaccad74fc397c98eaa6fe02ba5ed Mon Sep 17 00:00:00 2001
From: paulkernfeld
Date: Fri, 18 Sep 2020 13:06:50 -0400
Subject: [PATCH 08/19] Unit tests pass

---
 editquality/bwds/__init__.py               | 10 +---------
 editquality/bwds/tests/bad_edits_words.txt |  2 +-
 editquality/bwds/tests/no_docs.txt         |  2 +-
 editquality/bwds/tests/test_bwds.py        |  2 +-
 editquality/bwds/tests/words_db.txt        |  2 +-
 5 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/editquality/bwds/__init__.py b/editquality/bwds/__init__.py
index eee3552..9b299c3 100644
--- a/editquality/bwds/__init__.py
+++ b/editquality/bwds/__init__.py
@@ -165,15 +165,7 @@ def read_rev_pages(f):
 
 
 def import_from_path(path):
-    parts = path.split(".")
-    module_path = ".".join(parts[:-1])
-    attribute_name = parts[-1]
-
-    module = import_module(module_path)
-
-    attribute = getattr(module, attribute_name)
-
-    return attribute
+    return import_module(path)
 
 
 def cache_parse(pathes, num_res):
diff --git a/editquality/bwds/tests/bad_edits_words.txt b/editquality/bwds/tests/bad_edits_words.txt
index 073311b..3fafbf7 100644
--- a/editquality/bwds/tests/bad_edits_words.txt
+++ b/editquality/bwds/tests/bad_edits_words.txt
@@ -1 +1 @@
-bad_edits_words
\ No newline at end of file
+{"x": 1}
\ No newline at end of file
diff --git a/editquality/bwds/tests/no_docs.txt b/editquality/bwds/tests/no_docs.txt
index 285d09b..56a6051 100644
--- a/editquality/bwds/tests/no_docs.txt
+++ b/editquality/bwds/tests/no_docs.txt
@@ -1 +1 @@
-no_docs.txt
\ No newline at end of file
+1
\ No newline at end of file
diff --git a/editquality/bwds/tests/test_bwds.py b/editquality/bwds/tests/test_bwds.py
index e9ea48d..d83d3d0 100644
--- a/editquality/bwds/tests/test_bwds.py
+++ b/editquality/bwds/tests/test_bwds.py
@@ -11,7 +11,7 @@ def test_cache_parse():
         'editquality/bwds/tests/words_db.txt,'
         'editquality/bwds/tests/bad_edits_words.txt,'
         'editquality/bwds/tests/no_docs.txt',
-        num_res=1
+        0
     )
 
 
diff --git a/editquality/bwds/tests/words_db.txt b/editquality/bwds/tests/words_db.txt
index 9316897..8429d4d 100644
--- a/editquality/bwds/tests/words_db.txt
+++ b/editquality/bwds/tests/words_db.txt
@@ -1 +1 @@
-words_db
\ No newline at end of file
+{"x": 2}
\ No newline at end of file

From 61ccb8d31abe54a1258f8a4559565eac95c3cbfb Mon Sep 17 00:00:00 2001
From: paulkernfeld
Date: Fri, 18 Sep 2020 13:42:22 -0400
Subject: [PATCH 09/19] Add test_dump

---
 editquality/bwds/tests/test_bwds.py           | 36 ++++++++++++++-
 editquality/feature_lists/tests/__init__.py   |  0
 .../feature_lists/tests/test_huwiki.py        | 29 ------------
 .../feature_lists/tests/test_wikidatawiki.py  | 45 -------------------
 4 files changed, 34 insertions(+), 76 deletions(-)
 delete mode 100644 editquality/feature_lists/tests/__init__.py
 delete mode 100644 editquality/feature_lists/tests/test_huwiki.py
 delete mode 100644 editquality/feature_lists/tests/test_wikidatawiki.py

diff --git a/editquality/bwds/tests/test_bwds.py b/editquality/bwds/tests/test_bwds.py
index d83d3d0..e26e870 100644
--- a/editquality/bwds/tests/test_bwds.py
+++ b/editquality/bwds/tests/test_bwds.py
@@ -2,6 +2,9 @@
 from editquality.utilities.bad_words_detection_system import bot_gen
 
+EDITS = [Edit(1, {'one': 1, 'two': 2}, False), Edit(2, {'three': 3}, True), Edit(3, {'one': 5, 'four': 1}, False)]
+
+
 def test_import_from_path():
     import_from_path('revscoring.languages.english')
 
@@ -23,7 +26,36 @@ def test_bot_gen():
 
 
 def test_parse_bad_edits():
-    edits = [Edit(1, {'one': 1, 'two': 2}, False), Edit(2, {'three': 3}, True), Edit(3, {'one': 5, 'four': 1}, False)]
     bot = Bot()
-    bot.parse_edits(edits)
+    bot.parse_edits(EDITS)
     bot.parse_bad_edits(numbers_to_show=0)
+
+
+def dump_empty():
+    bot = Bot()
+    bot.dump()
+    with open('words_db.txt') as words_db:
+        assert words_db.read() == '{}'
+    with open('bad_edits_words.txt') as bad_edits_words:
+        assert bad_edits_words.read() == '{}'
+    with open('no_docs.txt') as no_docs:
+        assert no_docs.read() == '0'
+
+
+def dump_toy_data():
+    bot = Bot()
+    bot.parse_edits(EDITS)
+    bot.parse_bad_edits(0)
+    bot.dump()
+    with open('words_db.txt') as words_db:
+        assert words_db.read() == '{"three": 1}'
+    with open('bad_edits_words.txt') as bad_edits_words:
+        assert bad_edits_words.read() == '{"three": 3}'
+    with open('no_docs.txt') as no_docs:
+        assert no_docs.read() == '4'
+
+
+def test_dump():
+    # Calling both tests from here because we want to ensure they're not run concurrently
+    dump_empty()
+    dump_toy_data()
diff --git a/editquality/feature_lists/tests/__init__.py b/editquality/feature_lists/tests/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/editquality/feature_lists/tests/test_huwiki.py b/editquality/feature_lists/tests/test_huwiki.py
deleted file mode 100644
index e2d27db..0000000
--- a/editquality/feature_lists/tests/test_huwiki.py
+++ /dev/null
@@ -1,29 +0,0 @@
-from revscoring.dependencies import solve
-
-from .. import huwiki
-
-OK_WORDS = [
-    "fartő",  # part of a cattle
-    "fartőke",  # part of a boat
-    "ok",
-    "dada",
-    "ha"
-]
-
-STILL_MATCH = [
-    "fart",
-    "farts",
-    "farting",
-    "farter"
-]
-
-
-def test_huwiki():
-    ok_cache = {'datasource.revision.text': " ".join(OK_WORDS)}
-    bad_cache = {'datasource.revision.text': " ".join(STILL_MATCH)}
-    assert (solve(huwiki.english_badwords_safe.revision.datasources.matches,
-                  cache=ok_cache) == [])
-    assert (solve(huwiki.english_informals_safe.revision.datasources.matches,
-                  cache=ok_cache) == [])
-    assert (solve(huwiki.english_badwords_safe.revision.datasources.matches,
-                  cache=bad_cache) == STILL_MATCH)
diff --git a/editquality/feature_lists/tests/test_wikidatawiki.py b/editquality/feature_lists/tests/test_wikidatawiki.py
deleted file mode 100644
index cbb5abe..0000000
--- a/editquality/feature_lists/tests/test_wikidatawiki.py
+++ /dev/null
@@ -1,45 +0,0 @@
-from revscoring.datasources import revision_oriented
-from revscoring.dependencies import solve
-
-from .. import wikidatawiki
-
-
-def test_comment_features():
-    comment_ds = revision_oriented.revision.comment
-    cache = {comment_ds: "/* wbmergeitems-to:0||Q928543 */ "}
-    assert solve(wikidatawiki.is_merge_into, cache=cache)
-    assert solve(wikidatawiki.is_merge_from, cache=cache) is False
-    assert solve(wikidatawiki.is_item_creation, cache=cache) is False
-
-    cache = {comment_ds: "/* wbmergeitems-from:0||Q928543 */ "}
-    assert solve(wikidatawiki.is_merge_from, cache=cache)
-    assert solve(wikidatawiki.is_merge_into, cache=cache) is False
-
-    cache = {comment_ds: "/* clientsitelink-remove:1||enwiki */ Boris Kok"}
-    assert solve(wikidatawiki.is_client_delete, cache=cache)
-    assert solve(wikidatawiki.is_client_move, cache=cache) is False
-
-    cache = {comment_ds: "/* clientsitelink-update:0|uk|uk:A|uk:B *"}
-    assert solve(wikidatawiki.is_client_move, cache=cache)
-    assert solve(wikidatawiki.is_client_delete, cache=cache) is False
-    assert solve(wikidatawiki.is_revert, cache=cache) is False
-
-    cache = {comment_ds: "Undid revision 1448592 by [[Special:Contributions/"}
-    assert solve(wikidatawiki.is_revert, cache=cache)
-    cache = {comment_ds: "Reverted edits by [[Special:Contributions/"}
-    assert solve(wikidatawiki.is_revert, cache=cache)
-    cache = {comment_ds: "rvv racial slurs"}
-    assert solve(wikidatawiki.is_revert, cache=cache)
-
-    cache = {comment_ds: "Restored revision 123456"}
-    assert solve(wikidatawiki.is_restore, cache=cache)
-    assert solve(wikidatawiki.is_item_creation, cache=cache) is False
-    assert solve(wikidatawiki.is_revert, cache=cache) is False
-
-    cache = {comment_ds: "/* wbeditentity-create:0| */"}
-    assert solve(wikidatawiki.is_item_creation, cache=cache)
-
-
-def test_property_features():
-    # assert False, "TODO"
-    pass

From e78ac9253d6bd326076fa9d2455c3e998057c458 Mon Sep 17 00:00:00 2001
From: paulkernfeld
Date: Fri, 18 Sep 2020 13:46:48 -0400
Subject: [PATCH 10/19] Add test_read_rev_pages

---
 editquality/bwds/tests/test_bwds.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/editquality/bwds/tests/test_bwds.py b/editquality/bwds/tests/test_bwds.py
index e26e870..179a630 100644
--- a/editquality/bwds/tests/test_bwds.py
+++ b/editquality/bwds/tests/test_bwds.py
@@ -1,4 +1,4 @@
-from editquality.bwds import import_from_path, cache_parse, Edit, Bot
+from editquality.bwds import import_from_path, cache_parse, Edit, Bot, read_rev_pages
 from editquality.utilities.bad_words_detection_system import bot_gen
 
 
@@ -25,6 +25,10 @@ def test_bot_gen():
     bot_gen([(en_main_page_id, a_revision_id)], 'TODO', en_api_url)
 
 
+def test_read_rev_pages():
+    assert list(read_rev_pages(["0", "1\t2"])) == [(0, None), (1, 2)]
+
+
 def test_parse_bad_edits():
     bot = Bot()
     bot.parse_edits(EDITS)

From f437ef671c44fea8c048a5cb5be424d381ee5278 Mon Sep 17 00:00:00 2001
From: paulkernfeld
Date: Fri, 18 Sep 2020 14:51:16 -0400
Subject: [PATCH 11/19] bot_gen test works except extractor.extract

---
 editquality/bwds/__init__.py                  | 55 ++++++++++++++++++-
 editquality/bwds/tests/test_bwds.py           |  9 ++-
 .../utilities/bad_words_detection_system.py   | 43 +--------------
 3 files changed, 59 insertions(+), 48 deletions(-)

diff --git a/editquality/bwds/__init__.py b/editquality/bwds/__init__.py
index 9b299c3..af3e8b0 100644
--- a/editquality/bwds/__init__.py
+++ b/editquality/bwds/__init__.py
@@ -16,13 +16,28 @@
 """
 import math
 import json
+import sys
 import time
-from collections import OrderedDict
+from collections import OrderedDict, namedtuple
 from importlib import import_module
+import traceback
+# TODO: User argparse
+
+from revscoring.extractors.api import Extractor
+from revscoring.features.wikitext import Diff
+
+from mwapi import Session
+import mwreverts
+
 
 base_file_path = '/data/project/dexbot/pywikibot-core/something_'
 
+# This is nice for debugging, e.g. printing this includes it values
+EditNamedTuple = namedtuple('EditNamedTuple', ['id', 'added_words', 'reverted'])
+
 
 class Edit(object):
     def __init__(self, rev_id, added_words, reverted):
         self.id = rev_id
@@ -37,6 +52,9 @@ def fix_added_words(self):
             temp[word] = temp.get(word, 0) + 1
         self.added_words = temp
 
+    def as_named_tuple(self):
+        return EditNamedTuple(self.id, self.added_words, self.reverted)
+
 
 class Bot(object):
 
@@ -175,3 +193,38 @@ def cache_parse(pathes, num_res):
     bot = Bot(words_cache=pathes[0], bad_words_cache=pathes[1],
               no_docs=pathes[2])
     bot.parse_bad_edits(num_res)
+
+
+def bot_gen(rev_pages, language, api_url):
+    session = Session(api_url)
+    extractor = Extractor(session)
+
+    for revision_id, page_id in rev_pages:
+        api_result = session.get(action='query', titles='Main Page', prop='revisions', rvlimit=500, rvprop='sha1|ids')
+        revisions = next(iter(api_result['query']['pages'].values()))['revisions']
+
+        sys.stderr.write(".")
+        sys.stderr.flush()
+        try:
+            revisions = [revision for revision in revisions if 'sha1hidden' not in revision]
+
+            reverted_revision_ids = set()
+            # Detect reverted status
+            for revert in mwreverts.detect((revision['sha1'], revision) for revision in revisions):
+                for reverted in revert.reverteds:
+                    reverted_revision_ids.add(reverted['revid'])
+
+            # added_words = list(extractor.extract(revision_id, [revision.diff.words_added]))[0]
+            added_words = list()  # TODO how to upgrade this?
+            yield Edit(revision_id, added_words, revision_id in reverted_revision_ids)
+
+        except KeyboardInterrupt:
+            sys.stderr.write("\n^C Caught. Exiting...")
Exiting...") + break + + except: + sys.stderr.write(traceback.format_exc()) + sys.stderr.write("\n") + + sys.stderr.write("\n") + diff --git a/editquality/bwds/tests/test_bwds.py b/editquality/bwds/tests/test_bwds.py index 179a630..bfcbf99 100644 --- a/editquality/bwds/tests/test_bwds.py +++ b/editquality/bwds/tests/test_bwds.py @@ -1,6 +1,4 @@ -from editquality.bwds import import_from_path, cache_parse, Edit, Bot, read_rev_pages -from editquality.utilities.bad_words_detection_system import bot_gen - +from editquality.bwds import import_from_path, cache_parse, Edit, Bot, read_rev_pages, bot_gen, EditNamedTuple EDITS = [Edit(1, {'one': 1, 'two': 2}, False), Edit(2, {'three': 3}, True), Edit(3, {'one': 5, 'four': 1}, False)] @@ -19,10 +17,11 @@ def test_cache_parse(): def test_bot_gen(): - en_main_page_id = 232335 a_revision_id = 7101436 + en_main_page_id = 232335 en_api_url = 'https://en.wikipedia.org/w/api.php' - bot_gen([(en_main_page_id, a_revision_id)], 'TODO', en_api_url) + assert list(bot_gen([(a_revision_id, en_main_page_id)], 'TODO', en_api_url))[0].as_named_tuple() == \ + EditNamedTuple(7101436, {"TODO"}, False) def test_read_rev_pages(): diff --git a/editquality/utilities/bad_words_detection_system.py b/editquality/utilities/bad_words_detection_system.py index 7113f93..e0b4417 100755 --- a/editquality/utilities/bad_words_detection_system.py +++ b/editquality/utilities/bad_words_detection_system.py @@ -10,16 +10,7 @@ python3 bad_words_detection_system.py --cache: """ import sys -import traceback -# TODO: User argparse - -from revscoring.extractors.api import Extractor -from revscoring.features.wikitext import Diff - -from mwapi import Session -import mwreverts - -from editquality.bwds import Bot, Edit, cache_parse, import_from_path, read_rev_pages +from editquality.bwds import Bot, Edit, cache_parse, import_from_path, read_rev_pages, bot_gen def handle_args(): @@ -40,38 +31,6 @@ def handle_args(): return args -def bot_gen(rev_pages, language, api_url): - - session = Session(api_url) - extractor = Extractor(session) - - for rev_id, page_id in rev_pages: - api_result = session.get(action='query', titles='Main Page', prop='revisions', rvlimit=500, rvprop='sha1|ids') - revisions = next(iter(api_result['query']['pages'].values()))['revisions'] - - sys.stderr.write(".") - sys.stderr.flush() - try: - revisions = [revision for revision in revisions if 'sha1hidden' not in revision] - - # Detect reverted status - for revert in mwreverts.detect((revision['sha1'], revision) for revision in revisions): - for reverted in revert.reverteds: - # added_words = list(extractor.extract(rev_id, [diff.added_words]))[0] - added_words = list() # TODO how to upgrade this? - yield Edit(rev_id, added_words, reverted) - - except KeyboardInterrupt: - sys.stderr.write("\n^C Caught. 
Exiting...") - break - - except: - sys.stderr.write(traceback.format_exc()) - sys.stderr.write("\n") - - sys.stderr.write("\n") - - def main(): args = handle_args() if '--num_res' in args: From 52e961c512b0fed0d203db286c796a6858819213 Mon Sep 17 00:00:00 2001 From: paulkernfeld Date: Fri, 18 Sep 2020 15:41:53 -0400 Subject: [PATCH 12/19] Try out a dummy Datasource --- editquality/bwds/__init__.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/editquality/bwds/__init__.py b/editquality/bwds/__init__.py index af3e8b0..1931817 100644 --- a/editquality/bwds/__init__.py +++ b/editquality/bwds/__init__.py @@ -23,13 +23,13 @@ import traceback # TODO: User argparse - +from revscoring.datasources import Datasource, revision_oriented from revscoring.extractors.api import Extractor -from revscoring.features.wikitext import Diff +from revscoring.features.wikitext import revision from mwapi import Session import mwreverts - +from tests.extractors.test_extractor import get_last_two base_file_path = '/data/project/dexbot/pywikibot-core/something_' @@ -214,8 +214,9 @@ def bot_gen(rev_pages, language, api_url): for reverted in revert.reverteds: reverted_revision_ids.add(reverted['revid']) - # added_words = list(extractor.extract(rev_id, [diff.added_words]))[0] - added_words = list() # TODO how to upgrade this? + # added_words = list(extractor.extract(revision_id, [revision.diff.words_added]))[0] + datasource = Datasource("last_two_in_id", get_last_two, depends_on=[revision_oriented.revision.id]) + added_words = {extractor.extract(revision_id, datasource)} yield Edit(revision_id, added_words, revision_id in reverted_revision_ids) except KeyboardInterrupt: From 43a222c9701766e4d1522db58352c35ba4af777b Mon Sep 17 00:00:00 2001 From: paulkernfeld Date: Fri, 18 Sep 2020 15:54:47 -0400 Subject: [PATCH 13/19] Remove dump_based_detection autodoc --- editquality/utilities/__init__.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/editquality/utilities/__init__.py b/editquality/utilities/__init__.py index c6678f5..ca4198f 100644 --- a/editquality/utilities/__init__.py +++ b/editquality/utilities/__init__.py @@ -47,8 +47,4 @@ bad_words_detection_system ++++++++++++++++++++++++++ .. automodule:: editquality.utilities.bad_words_detection_system - -dump_based_detection -++++++++++++++++++++ -.. automodule:: editquality.utilities.dump_based_detection """ From 65bbec185655026d53c842f6dc1f6c5cd06f606c Mon Sep 17 00:00:00 2001 From: paulkernfeld Date: Fri, 18 Sep 2020 15:56:39 -0400 Subject: [PATCH 14/19] Restore tests that I accidentally deleted --- editquality/feature_lists/tests/__init__.py | 0 .../feature_lists/tests/test_huwiki.py | 29 ++++++++++++ .../feature_lists/tests/test_wikidatawiki.py | 45 +++++++++++++++++++ 3 files changed, 74 insertions(+) create mode 100644 editquality/feature_lists/tests/__init__.py create mode 100644 editquality/feature_lists/tests/test_huwiki.py create mode 100644 editquality/feature_lists/tests/test_wikidatawiki.py diff --git a/editquality/feature_lists/tests/__init__.py b/editquality/feature_lists/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/editquality/feature_lists/tests/test_huwiki.py b/editquality/feature_lists/tests/test_huwiki.py new file mode 100644 index 0000000..e2d27db --- /dev/null +++ b/editquality/feature_lists/tests/test_huwiki.py @@ -0,0 +1,29 @@ +from revscoring.dependencies import solve + +from .. 
import huwiki + +OK_WORDS = [ + "fartő", # part of a cattle + "fartőke", # part of a boat + "ok", + "dada", + "ha" +] + +STILL_MATCH = [ + "fart", + "farts", + "farting", + "farter" +] + + +def test_huwiki(): + ok_cache = {'datasource.revision.text': " ".join(OK_WORDS)} + bad_cache = {'datasource.revision.text': " ".join(STILL_MATCH)} + assert (solve(huwiki.english_badwords_safe.revision.datasources.matches, + cache=ok_cache) == []) + assert (solve(huwiki.english_informals_safe.revision.datasources.matches, + cache=ok_cache) == []) + assert (solve(huwiki.english_badwords_safe.revision.datasources.matches, + cache=bad_cache) == STILL_MATCH) diff --git a/editquality/feature_lists/tests/test_wikidatawiki.py b/editquality/feature_lists/tests/test_wikidatawiki.py new file mode 100644 index 0000000..cbb5abe --- /dev/null +++ b/editquality/feature_lists/tests/test_wikidatawiki.py @@ -0,0 +1,45 @@ +from revscoring.datasources import revision_oriented +from revscoring.dependencies import solve + +from .. import wikidatawiki + + +def test_comment_features(): + comment_ds = revision_oriented.revision.comment + cache = {comment_ds: "/* wbmergeitems-to:0||Q928543 */ "} + assert solve(wikidatawiki.is_merge_into, cache=cache) + assert solve(wikidatawiki.is_merge_from, cache=cache) is False + assert solve(wikidatawiki.is_item_creation, cache=cache) is False + + cache = {comment_ds: "/* wbmergeitems-from:0||Q928543 */ "} + assert solve(wikidatawiki.is_merge_from, cache=cache) + assert solve(wikidatawiki.is_merge_into, cache=cache) is False + + cache = {comment_ds: "/* clientsitelink-remove:1||enwiki */ Boris Kok"} + assert solve(wikidatawiki.is_client_delete, cache=cache) + assert solve(wikidatawiki.is_client_move, cache=cache) is False + + cache = {comment_ds: "/* clientsitelink-update:0|uk|uk:A|uk:B *"} + assert solve(wikidatawiki.is_client_move, cache=cache) + assert solve(wikidatawiki.is_client_delete, cache=cache) is False + assert solve(wikidatawiki.is_revert, cache=cache) is False + + cache = {comment_ds: "Undid revision 1448592 by [[Special:Contributions/"} + assert solve(wikidatawiki.is_revert, cache=cache) + cache = {comment_ds: "Reverted edits by [[Special:Contributions/"} + assert solve(wikidatawiki.is_revert, cache=cache) + cache = {comment_ds: "rvv racial slurs"} + assert solve(wikidatawiki.is_revert, cache=cache) + + cache = {comment_ds: "Restored revision 123456"} + assert solve(wikidatawiki.is_restore, cache=cache) + assert solve(wikidatawiki.is_item_creation, cache=cache) is False + assert solve(wikidatawiki.is_revert, cache=cache) is False + + cache = {comment_ds: "/* wbeditentity-create:0| */"} + assert solve(wikidatawiki.is_item_creation, cache=cache) + + +def test_property_features(): + # assert False, "TODO" + pass From e1ed560e79554d46de504d900f0ccf8039635f96 Mon Sep 17 00:00:00 2001 From: paulkernfeld Date: Tue, 22 Sep 2020 15:03:22 -0400 Subject: [PATCH 15/19] Remove unused import --- editquality/utilities/bad_words_detection_system.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/editquality/utilities/bad_words_detection_system.py b/editquality/utilities/bad_words_detection_system.py index e0b4417..7a64fc7 100755 --- a/editquality/utilities/bad_words_detection_system.py +++ b/editquality/utilities/bad_words_detection_system.py @@ -10,7 +10,8 @@ python3 bad_words_detection_system.py --cache: """ import sys -from editquality.bwds import Bot, Edit, cache_parse, import_from_path, read_rev_pages, bot_gen + +from editquality.bwds import Bot, cache_parse, 
import_from_path, read_rev_pages, bot_gen def handle_args(): From df1ad80a87e1279cb624be7654d93eeda5714e7b Mon Sep 17 00:00:00 2001 From: paulkernfeld Date: Tue, 22 Sep 2020 15:06:45 -0400 Subject: [PATCH 16/19] Add test_bot_gen_empty --- editquality/bwds/tests/test_bwds.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/editquality/bwds/tests/test_bwds.py b/editquality/bwds/tests/test_bwds.py index bfcbf99..b2e4473 100644 --- a/editquality/bwds/tests/test_bwds.py +++ b/editquality/bwds/tests/test_bwds.py @@ -16,6 +16,11 @@ def test_cache_parse(): ) +def test_bot_gen_empty(): + en_api_url = 'https://en.wikipedia.org/w/api.php' + assert list(bot_gen([], 'TODO', en_api_url)) == [] + + def test_bot_gen(): a_revision_id = 7101436 en_main_page_id = 232335 From d75f06ac820def4e95486afb4e369e8fb5eba31a Mon Sep 17 00:00:00 2001 From: paulkernfeld Date: Wed, 23 Sep 2020 09:59:36 -0400 Subject: [PATCH 17/19] It works!!! --- editquality/bwds/__init__.py | 10 +++------- editquality/bwds/tests/test_bwds.py | 12 ++++++++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/editquality/bwds/__init__.py b/editquality/bwds/__init__.py index 1931817..ba3883e 100644 --- a/editquality/bwds/__init__.py +++ b/editquality/bwds/__init__.py @@ -25,11 +25,10 @@ # TODO: User argparse from revscoring.datasources import Datasource, revision_oriented from revscoring.extractors.api import Extractor -from revscoring.features.wikitext import revision +from revscoring.features import wikitext from mwapi import Session import mwreverts -from tests.extractors.test_extractor import get_last_two base_file_path = '/data/project/dexbot/pywikibot-core/something_' @@ -202,21 +201,18 @@ def bot_gen(rev_pages, language, api_url): for revision_id, page_id in rev_pages: api_result = session.get(action='query', titles='Main Page', prop='revisions', rvlimit=500, rvprop='sha1|ids') revisions = next(iter(api_result['query']['pages'].values()))['revisions'] + revisions = [revision for revision in revisions if 'sha1hidden' not in revision] sys.stderr.write(".") sys.stderr.flush() try: - revisions = [revision for revision in revisions if 'sha1hidden' not in revision] - reverted_revision_ids = set() # Detect reverted status for revert in mwreverts.detect((revision['sha1'], revision) for revision in revisions): for reverted in revert.reverteds: reverted_revision_ids.add(reverted['revid']) - # added_words = list(extractor.extract(revision_id, [revision.diff.words_added]))[0] - datasource = Datasource("last_two_in_id", get_last_two, depends_on=[revision_oriented.revision.id]) - added_words = {extractor.extract(revision_id, datasource)} + added_words = set(extractor.extract(revision_id, wikitext.revision.diff.datasources.words_added)) yield Edit(revision_id, added_words, revision_id in reverted_revision_ids) except KeyboardInterrupt: diff --git a/editquality/bwds/tests/test_bwds.py b/editquality/bwds/tests/test_bwds.py index b2e4473..86da579 100644 --- a/editquality/bwds/tests/test_bwds.py +++ b/editquality/bwds/tests/test_bwds.py @@ -1,3 +1,5 @@ +from deltas import Token + from editquality.bwds import import_from_path, cache_parse, Edit, Bot, read_rev_pages, bot_gen, EditNamedTuple EDITS = [Edit(1, {'one': 1, 'two': 2}, False), Edit(2, {'three': 3}, True), Edit(3, {'one': 5, 'four': 1}, False)] @@ -22,11 +24,13 @@ def test_bot_gen_empty(): def test_bot_gen(): - a_revision_id = 7101436 - en_main_page_id = 232335 + a_revision_id = 979192243 + pasta_page_id = 23871 en_api_url = 'https://en.wikipedia.org/w/api.php' - assert 
list(bot_gen([(a_revision_id, en_main_page_id)], 'TODO', en_api_url))[0].as_named_tuple() == \ - EditNamedTuple(7101436, {"TODO"}, False) + generated, = bot_gen([(a_revision_id, pasta_page_id)], '', en_api_url) + assert generated.id == a_revision_id + assert Token('unleavened', type='word') in generated.added_words + assert not generated.reverted def test_read_rev_pages(): From ed9acb579626774c035e00677112d8ebb8906a03 Mon Sep 17 00:00:00 2001 From: paulkernfeld Date: Wed, 23 Sep 2020 10:19:53 -0400 Subject: [PATCH 18/19] Remove unused code branch --- editquality/bwds/__init__.py | 9 +-------- editquality/bwds/tests/test_bwds.py | 10 +++------- editquality/utilities/bad_words_detection_system.py | 13 +++---------- 3 files changed, 7 insertions(+), 25 deletions(-) diff --git a/editquality/bwds/__init__.py b/editquality/bwds/__init__.py index ba3883e..526415b 100644 --- a/editquality/bwds/__init__.py +++ b/editquality/bwds/__init__.py @@ -19,11 +19,8 @@ import sys import time from collections import OrderedDict, namedtuple -from importlib import import_module import traceback -# TODO: User argparse -from revscoring.datasources import Datasource, revision_oriented from revscoring.extractors.api import Extractor from revscoring.features import wikitext @@ -181,10 +178,6 @@ def read_rev_pages(f): yield int(rev_id), int(page_id) -def import_from_path(path): - return import_module(path) - - def cache_parse(pathes, num_res): if not pathes.strip(): pathes = 'words_db.txt,bad_edits_words.txt,no_docs.txt' @@ -194,7 +187,7 @@ def cache_parse(pathes, num_res): bot.parse_bad_edits(num_res) -def bot_gen(rev_pages, language, api_url): +def bot_gen(rev_pages, api_url): session = Session(api_url) extractor = Extractor(session) diff --git a/editquality/bwds/tests/test_bwds.py b/editquality/bwds/tests/test_bwds.py index 86da579..6c6cfd5 100644 --- a/editquality/bwds/tests/test_bwds.py +++ b/editquality/bwds/tests/test_bwds.py @@ -1,14 +1,10 @@ from deltas import Token -from editquality.bwds import import_from_path, cache_parse, Edit, Bot, read_rev_pages, bot_gen, EditNamedTuple +from editquality.bwds import cache_parse, Edit, Bot, read_rev_pages, bot_gen EDITS = [Edit(1, {'one': 1, 'two': 2}, False), Edit(2, {'three': 3}, True), Edit(3, {'one': 5, 'four': 1}, False)] -def test_import_from_path(): - import_from_path('revscoring.languages.english') - - def test_cache_parse(): cache_parse( 'editquality/bwds/tests/words_db.txt,' @@ -20,14 +16,14 @@ def test_cache_parse(): def test_bot_gen_empty(): en_api_url = 'https://en.wikipedia.org/w/api.php' - assert list(bot_gen([], 'TODO', en_api_url)) == [] + assert list(bot_gen([], en_api_url)) == [] def test_bot_gen(): a_revision_id = 979192243 pasta_page_id = 23871 en_api_url = 'https://en.wikipedia.org/w/api.php' - generated, = bot_gen([(a_revision_id, pasta_page_id)], '', en_api_url) + generated, = bot_gen([(a_revision_id, pasta_page_id)], en_api_url) assert generated.id == a_revision_id assert Token('unleavened', type='word') in generated.added_words assert not generated.reverted diff --git a/editquality/utilities/bad_words_detection_system.py b/editquality/utilities/bad_words_detection_system.py index 7a64fc7..7ab7450 100755 --- a/editquality/utilities/bad_words_detection_system.py +++ b/editquality/utilities/bad_words_detection_system.py @@ -4,23 +4,21 @@ python3 bad_words_detection_system.py --rev-pages:f.txt --api:https://en.wikipedia.org/w/api.php - --language:revscoring.languages.english Use cache: python3 bad_words_detection_system.py --cache: """ import sys 
-from editquality.bwds import Bot, cache_parse, import_from_path, read_rev_pages, bot_gen +from editquality.bwds import Bot, cache_parse, read_rev_pages, bot_gen +# TODO: Use argparse def handle_args(): args = {} for arg in sys.argv[1:]: if arg.startswith('--rev-pages:'): args['--rev-pages'] = arg[len('--rev-pages:'):] - elif arg.startswith('--language:'): - args['--language'] = arg[len('--language:'):] elif arg.startswith('--api:'): args['--api'] = arg[len('--api:'):] elif arg.startswith('--cache:'): @@ -43,13 +41,8 @@ def main(): return rev_pages = read_rev_pages(open(args['--rev-pages'])) - if args['--language'] is not None: - language = import_from_path(args['--language']) - else: - language = None - api_url = args['--api'] - gen = bot_gen(rev_pages, language, api_url) + gen = bot_gen(rev_pages, api_url) bot = Bot() bot.parse_edits(gen) bot.parse_bad_edits(num_res) From f56c1f7d46729aa8192abd09f10a65da64edf14b Mon Sep 17 00:00:00 2001 From: paulkernfeld Date: Wed, 23 Sep 2020 10:33:47 -0400 Subject: [PATCH 19/19] Fix flake8 --- editquality/bwds/__init__.py | 34 ++++++++++++++++++++--------- editquality/bwds/tests/test_bwds.py | 9 ++++++-- 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/editquality/bwds/__init__.py b/editquality/bwds/__init__.py index 526415b..b0af7a6 100644 --- a/editquality/bwds/__init__.py +++ b/editquality/bwds/__init__.py @@ -61,7 +61,9 @@ def __init__(self, words_cache=None, bad_words_cache=None, no_docs=None): self.bad_words_db = {} self.bad_counter = 0 if bool(bad_words_cache) != bool(words_cache): - raise ValueError("bad_words_cache should be defined if and only words_cache is defined") + raise ValueError( + "bad_words_cache should be defined if and only words_cache is " + "defined") if words_cache: self.cache = True self.initiate_cache(words_cache, bad_words_cache, no_docs) @@ -110,11 +112,11 @@ def parse_bad_edits(self, numbers_to_show=10): def tf_idf(self, word): tf = math.log(self.bad_edits.added_words[word]) + 1 - idf = math.log(self.counter/self.words_db[word]) - return tf*idf + idf = math.log(self.counter / self.words_db[word]) + return tf * idf def idf(self, word): - return math.log(self.counter/self.words_db[word]) + return math.log(self.counter / self.words_db[word]) def show_results(self, numbers_to_show): print("Showing %d results" % numbers_to_show) @@ -192,21 +194,34 @@ def bot_gen(rev_pages, api_url): extractor = Extractor(session) for revision_id, page_id in rev_pages: - api_result = session.get(action='query', titles='Main Page', prop='revisions', rvlimit=500, rvprop='sha1|ids') + api_result = session.get( + action='query', + titles='Main Page', + prop='revisions', + rvlimit=500, + rvprop='sha1|ids' + ) revisions = next(iter(api_result['query']['pages'].values()))['revisions'] - revisions = [revision for revision in revisions if 'sha1hidden' not in revision] + revisions = [ + revision for revision in revisions if 'sha1hidden' not in revision] sys.stderr.write(".") sys.stderr.flush() try: reverted_revision_ids = set() # Detect reverted status - for revert in mwreverts.detect((revision['sha1'], revision) for revision in revisions): + for revert in mwreverts.detect( + (revision['sha1'], revision) for revision in revisions + ): for reverted in revert.reverteds: reverted_revision_ids.add(reverted['revid']) - added_words = set(extractor.extract(revision_id, wikitext.revision.diff.datasources.words_added)) - yield Edit(revision_id, added_words, revision_id in reverted_revision_ids) + added_words = set(extractor.extract( + revision_id, 
wikitext.revision.diff.datasources.words_added + )) + yield Edit( + revision_id, added_words, revision_id in reverted_revision_ids + ) except KeyboardInterrupt: sys.stderr.write("\n^C Caught. Exiting...") @@ -217,4 +232,3 @@ def bot_gen(rev_pages, api_url): sys.stderr.write("\n") sys.stderr.write("\n") - diff --git a/editquality/bwds/tests/test_bwds.py b/editquality/bwds/tests/test_bwds.py index 6c6cfd5..3c12b7f 100644 --- a/editquality/bwds/tests/test_bwds.py +++ b/editquality/bwds/tests/test_bwds.py @@ -2,7 +2,11 @@ from editquality.bwds import cache_parse, Edit, Bot, read_rev_pages, bot_gen -EDITS = [Edit(1, {'one': 1, 'two': 2}, False), Edit(2, {'three': 3}, True), Edit(3, {'one': 5, 'four': 1}, False)] +EDITS = [ + Edit(1, {'one': 1, 'two': 2}, False), + Edit(2, {'three': 3}, True), + Edit(3, {'one': 5, 'four': 1}, False) +] def test_cache_parse(): @@ -64,6 +68,7 @@ def dump_toy_data(): def test_dump(): - # Calling both tests from here because we want to ensure they're not run concurrently + # Calling both tests from here because we want to ensure they're not run + # concurrently dump_empty() dump_toy_data()
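
A note on the revert labeling used by bot_gen above: it relies on mwreverts.detect, which flags
revisions whose SHA1 checksum reappears later in a page's history (an identity revert). The step
can be exercised without any API round-trip. This is a minimal sketch with fabricated revision
dicts; the revids and checksums are made up for illustration and are not from any real page:

    import mwreverts

    # Fabricated history: revision 3 restores revision 1's checksum,
    # so revision 2 is the one that got reverted.
    revisions = [
        {'revid': 1, 'sha1': 'aaa'},
        {'revid': 2, 'sha1': 'bbb'},
        {'revid': 3, 'sha1': 'aaa'},
    ]

    reverted_ids = set()
    for revert in mwreverts.detect((r['sha1'], r) for r in revisions):
        for reverted in revert.reverteds:
            reverted_ids.add(reverted['revid'])

    print(reverted_ids)  # {2}

This mirrors the loop inside bot_gen, which then marks an Edit as reverted when its revision id
lands in that set.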
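
The scoring that Bot.parse_bad_edits applies to each word can also be checked in isolation:
tf_idf treats every processed edit as a document and the pooled reverted edits as the "bad"
document. The sketch below reproduces the arithmetic from Bot.tf_idf and Bot.idf with made-up
counts (n_docs, bad_count, and docs_with_word are hypothetical, not taken from any real run):

    import math

    # Hypothetical counts, for illustration only.
    n_docs = 1000         # Bot.counter: total edits processed
    bad_count = 8         # bad_edits.added_words[word]: times the word was added in reverted edits
    docs_with_word = 20   # words_db[word]: edits whose added words include this word

    tf = math.log(bad_count) + 1             # as in Bot.tf_idf
    idf = math.log(n_docs / docs_with_word)  # as in Bot.idf
    print(tf * idf)  # ~12.0 here; larger means more strongly tied to reverted edits

A rare word that shows up mostly in reverted edits gets a high score, while a word common to all
edits gets an idf near zero, which is what show_results and show_results2 sort on.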
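
End to end, main() in bad_words_detection_system.py wires these pieces together. The sketch below
inlines that flow against the module state at the end of this series; rev_pages.txt is a
hypothetical input file where each line is either a bare rev_id or a tab-separated rev_id/page_id
pair, the format test_read_rev_pages exercises:

    from editquality.bwds import Bot, bot_gen, read_rev_pages

    with open('rev_pages.txt') as f:  # hypothetical file name
        rev_pages = list(read_rev_pages(f))

    # bot_gen is a generator; the API calls happen as parse_edits consumes it.
    edits = bot_gen(rev_pages, 'https://en.wikipedia.org/w/api.php')

    bot = Bot()
    bot.parse_edits(edits)    # tally added words for reverted vs. other edits
    bot.parse_bad_edits(10)   # score by tf-idf and write the top ten words out

Note that the output locations are hardcoded in the module (base_file_path and the words_db.txt
family of cache files), so running this sketch as-is assumes those paths are writable.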