From 2fd631554d6494a38ffe3e2067679a3ade168e00 Mon Sep 17 00:00:00 2001
From: paulkernfeld
Date: Wed, 16 Sep 2020 12:52:55 -0400
Subject: [PATCH 01/19] Add unmodified bwds code to utilities

---
 .../utilities/bad_words_detection_system.py   | 276 ++++++++++++++++++
 editquality/utilities/dump_based_detection.py | 224 ++++++++++++++
 2 files changed, 500 insertions(+)
 create mode 100755 editquality/utilities/bad_words_detection_system.py
 create mode 100644 editquality/utilities/dump_based_detection.py

diff --git a/editquality/utilities/bad_words_detection_system.py b/editquality/utilities/bad_words_detection_system.py
new file mode 100755
index 0000000..240b52c
--- /dev/null
+++ b/editquality/utilities/bad_words_detection_system.py
@@ -0,0 +1,276 @@
+"""
+WIP
+The script to find bad words automatically.
+
+It gets a set of added words and determines tf-idf of words
+the it uses K-means algorithm to determin them.
+
+Some parts are copied from
+https://github.com/halfak/Objective-Revision-Evaluation-Service/blob/master/ores/label_reverted.py
+
+>>> from bad_words_detection_system import *
+>>> edits = [Edit(1, {'one':1, 'two': 2}, False), Edit(2, {'three':3}, True),
+...          Edit(3, {'one':5, 'four': 1}, False)]
+>>> bot = Bot()
+>>> bot.parse_edits(edits)
+>>> bot.parse_bad_edits()
+
+python3 bad_words_detection_system.py --rev-pages:f.txt
+    --api:https://en.wikipedia.org/w/api.php
+    --language:revscoring.languages.english
+
+Use cache:
+python3 bad_words_detection_system.py --cache:
+"""
+import math
+import sys
+import traceback
+import json
+import time
+from importlib import import_module
+from collections import OrderedDict
+# TODO: User argparse
+
+from revscoring.extractors import APIExtractor
+from revscoring.datasources import diff
+
+from mw import api
+from mw.lib import reverts
+
+base_file_path = '/data/project/dexbot/pywikibot-core/something_'
+
+
+class Edit(object):
+    def __init__(self, rev_id, added_words, reverted):
+        self.id = rev_id
+        self.added_words = added_words
+        if not isinstance(self.added_words, dict):
+            self.fix_added_words()
+        self.reverted = reverted
+
+    def fix_added_words(self):
+        temp = {}
+        for word in self.added_words:
+            temp[word] = temp.get(word, 0) + 1
+        self.added_words = temp
+
+
+class Bot(object):
+
+    def __init__(self, words_cache=None, bad_words_cache=None, no_docs=None):
+        self.bad_edits = Edit(-1, {}, True)
+        self.counter = 0
+        self.words_db = {}
+        self.bad_words_db = {}
+        self.bad_counter = 0
+        if bool(bad_words_cache) != bool(words_cache):
+            raise "You should define both"
+        if words_cache:
+            self.cache = True
+            self.initiate_cache(words_cache, bad_words_cache, no_docs)
+        else:
+            self.cache = False
+
+    def initiate_cache(self, words_cache, bad_words_cache, no_docs):
+        with open(words_cache, 'r') as f:
+            self.words_db = json.loads(f.read())
+        with open(bad_words_cache, 'r') as f:
+            self.bad_edits.added_words = json.loads(f.read())
+        with open(no_docs, 'r') as f:
+            self.counter = int(f.read())
+
+    def parse_edits(self, edits):
+        for edit in edits:
+            # Since edits can be gen and len doesn't mean there
+            self.counter += 1
+            if edit.reverted:
+                for word in edit.added_words:
+                    self.bad_edits.added_words[word] = \
+                        self.bad_edits.added_words.get(word, 0) + \
+                        edit.added_words[word]
+                    self.bad_words_db[word] = (
+                        self.bad_words_db.get(word, 0) + 1)
+                self.bad_counter += 1
+                continue
+            for word in edit.added_words:
+                self.words_db[word] = self.words_db.get(word, 0) + 1
+
+    def parse_bad_edits(self, numbers_to_show=10):
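+        # Comment added for clarity: this pass scores every word seen in
+        # reverted edits; tf-idf marks bad-word candidates, idf stop words.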
+        self.possible_bad_words = {}
+        self.stop_words = {}
+        if not self.cache:
+            self.counter += 1
+        for word in self.bad_edits.added_words:
+            if not self.cache:
+                self.words_db[word] = self.words_db.get(word, 0) + 1
+            if 'sh' in word or 'ch' in word:
+                continue
+            self.possible_bad_words[word] = self.tf_idf(word)
+            self.stop_words[word] = self.idf(word)
+        if numbers_to_show:
+            self.show_results(numbers_to_show)
+            self.show_results2(numbers_to_show)
+
+    def tf_idf(self, word):
+        tf = math.log(self.bad_edits.added_words[word]) + 1
+        idf = math.log(float(self.counter)/self.words_db[word])
+        return tf*idf
+
+    def idf(self, word):
+        return math.log(float(self.counter)/self.words_db[word])
+
+    def show_results(self, numbers_to_show):
+        print("Showing %d results" % numbers_to_show)
+        values = sorted(self.possible_bad_words.values())
+        lim = values[numbers_to_show*-1]
+        res = {}
+        for word in self.possible_bad_words:
+            if self.possible_bad_words[word] >= lim:
+                res[word] = self.possible_bad_words[word]
+        res = OrderedDict(
+            sorted(res.items(), key=lambda t: t[1], reverse=True))
+        res_text = []
+        for word in res:
+            res_text.append(word)
+        res_text.sort()
+        res_text = "#" + '\n#'.join(res_text)
+        self.bad_words_res_text = res_text
+        with open('%s_%s.txt' % (base_file_path, time.time()), 'w') as f:
+            f.write(res_text)
+
+    def show_results2(self, numbers_to_show):
+        print("Showing another %d results" % numbers_to_show)
+        values = sorted(self.stop_words.values(), reverse=True)
+        lim = values[numbers_to_show*-1]
+        res = {}
+        for word in self.stop_words:
+            if self.stop_words[word] <= lim:
+                res[word] = self.stop_words[word]
+        res = OrderedDict(sorted(res.items(), key=lambda t: t[1]))
+        res_text = []
+        for word in res:
+            res_text.append(word)
+        res_text.sort()
+        res_text = "#" + '\n#'.join(res_text)
+        self.stop_words_res_text = res_text
+        with open('%s2_%s.txt' % (base_file_path, time.time()), 'w') as f:
+            f.write(res_text)
+
+    def dump(self):
+        new_db = {}
+        for word in self.bad_edits.added_words:
+            new_db[word] = self.words_db[word]
+        with open('words_db.txt', 'w') as f:
+            f.write(json.dumps(new_db))
+        with open('bad_edits_words.txt', 'w') as f:
+            f.write(json.dumps(self.bad_edits.added_words))
+        with open('no_docs.txt', 'w') as f:
+            f.write(json.dumps(self.counter))
+
+
+def read_rev_pages(f):
+
+    for line in f:
+        parts = line.strip().split('\t')
+
+        if len(parts) == 1:
+            rev_id = parts
+            yield int(rev_id[0]), None
+        elif len(parts) == 2:
+            rev_id, page_id = parts
+            yield int(rev_id), int(page_id)
+
+
+def import_from_path(path):
+    parts = path.split(".")
+    module_path = ".".join(parts[:-1])
+    attribute_name = parts[-1]
+
+    module = import_module(module_path)
+
+    attribute = getattr(module, attribute_name)
+
+    return attribute
+
+
+def handle_args():
+    args = {}
+    for arg in sys.argv[1:]:
+        if arg.startswith('--rev-pages:'):
+            args['--rev-pages'] = arg[len('--rev-pages:'):]
+        elif arg.startswith('--language:'):
+            args['--language'] = arg[len('--language:'):]
+        elif arg.startswith('--api:'):
+            args['--api'] = arg[len('--api:'):]
+        elif arg.startswith('--cache:'):
+            args['--cache'] = arg[len('--cache:'):]
+        elif arg.startswith('--num_res:'):
+            args['--num_res'] = arg[len('--num_res:'):]
+        else:
+            print('Unknown argument')
+    return args
+
+
+def bot_gen(rev_pages, language, api_url):
+
+    session = api.Session(api_url)
+    extractor = Extractor(session, language=language)
+
+    for rev_id, page_id in rev_pages:
+        sys.stderr.write(".")
+        sys.stderr.flush()
+        try:
+
+            # Detect reverted status
+            revert = reverts.api.check(session, rev_id, page_id,
+                                       radius=3)
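+            # Comment added for clarity: with radius=3, a revision counts
+            # as reverted only if it is undone within the next 3 edits.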
+            reverted = revert is not None
+            added_words = list(
+                extractor.extract(rev_id, [diff.added_words]))[0]
+            yield Edit(rev_id, added_words, reverted)
+
+        except KeyboardInterrupt:
+            sys.stderr.write("\n^C Caught. Exiting...")
+            break
+
+        except:
+            sys.stderr.write(traceback.format_exc())
+            sys.stderr.write("\n")
+
+    sys.stderr.write("\n")
+
+
+def cache_parse(pathes, num_res):
+    if not pathes.strip():
+        pathes = 'words_db.txt,bad_edits_words.txt,no_docs.txt'
+    pathes = pathes.split(',')
+    bot = Bot(words_cache=pathes[0], bad_words_cache=pathes[1],
+              no_docs=pathes[2])
+    bot.parse_bad_edits(num_res)
+
+
+def main():
+    args = handle_args()
+    if '--num_res' in args:
+        num_res = int(args['--num_res'])
+    else:
+        num_res = 10
+    if '--cache' in args:
+        cache_parse(args['--cache'], num_res)
+        return
+    rev_pages = read_rev_pages(open(args['--rev-pages']))
+
+    if args['--language'] is not None:
+        language = import_from_path(args['--language'])
+    else:
+        language = None
+
+    api_url = args['--api']
+    gen = bot_gen(rev_pages, language, api_url)
+    bot = Bot()
+    bot.parse_edits(gen)
+    bot.parse_bad_edits(num_res)
+    bot.dump()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/editquality/utilities/dump_based_detection.py b/editquality/utilities/dump_based_detection.py
new file mode 100644
index 0000000..d1ccfd9
--- /dev/null
+++ b/editquality/utilities/dump_based_detection.py
@@ -0,0 +1,224 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+# Copyright © 2014 He7d3r
+# License: http://he7d3r.mit-license.org/
+"""
+Extermely under construction.
+Some parts are copied from
+https://gist.github.com/he7d3r/f99482f4f54f97895ccb/9205f3271fe8daa2f694f4ce3ba9b29213dbad6c
+"""
+from nltk.tokenize import RegexpTokenizer
+import sys
+from mw.lib import reverts
+from pywikibot import xmlreader
+import pywikibot
+import re
+import time
+import regex
+
+from bad_words_detection_system import Edit, Bot
+
+cache = {}
+
+languages_by_size = [
+    'en', 'sv', 'nl', 'de', 'fr', 'war', 'ru', 'ceb', 'it', 'es', 'vi',
+    'pl', 'ja', 'pt', 'zh', 'uk', 'ca', 'fa', 'no', 'sh', 'fi', 'ar',
+    'id', 'cs', 'sr', 'ro', 'ko', 'hu', 'ms', 'tr', 'min', 'eo', 'kk',
+    'eu', 'sk', 'da', 'bg', 'he', 'lt', 'hy', 'hr', 'sl', 'et', 'uz',
+    'gl', 'nn', 'vo', 'la', 'simple', 'el', 'hi', 'az', 'th', 'ka',
+    'ce', 'oc', 'be', 'mk', 'mg', 'new', 'ur', 'tt', 'ta', 'pms', 'cy',
+    'tl', 'lv', 'bs', 'te', 'be-tarask', 'br', 'ht', 'sq', 'jv', 'lb',
+    'mr', 'is', 'ml', 'zh-yue', 'bn', 'af', 'ba', 'ga', 'pnb', 'cv',
+    'fy', 'lmo', 'tg', 'sco', 'my', 'yo', 'an', 'ky', 'sw', 'io', 'ne',
+    'gu', 'scn', 'bpy', 'nds', 'ku', 'ast', 'qu', 'als', 'su', 'pa',
+    'kn', 'ckb', 'ia', 'mn', 'nap', 'bug', 'arz', 'bat-smg', 'wa',
+    'zh-min-nan', 'am', 'map-bms', 'gd', 'yi', 'mzn', 'si', 'fo',
+    'bar', 'vec', 'nah', 'sah', 'os', 'sa', 'roa-tara', 'li', 'hsb',
+    'pam', 'mrj', 'mhr', 'se', 'mi', 'ilo', 'hif', 'bcl', 'gan', 'rue',
+    'ps', 'glk', 'nds-nl', 'bo', 'vls', 'diq', 'fiu-vro', 'bh', 'xmf',
+    'tk', 'gv', 'sc', 'co', 'csb', 'hak', 'km', 'kv', 'vep', 'zea',
+    'crh', 'zh-classical', 'frr', 'eml', 'ay', 'stq', 'udm', 'wuu',
+    'nrm', 'kw', 'rm', 'szl', 'so', 'koi', 'as', 'lad', 'fur', 'mt',
+    'dv', 'gn', 'dsb', 'ie', 'pcd', 'sd', 'lij', 'cbk-zam', 'cdo',
+    'ksh', 'ext', 'mwl', 'gag', 'ang', 'ug', 'ace', 'pi', 'pag', 'nv',
+    'lez', 'frp', 'sn', 'kab', 'ln', 'myv', 'pfl', 'xal', 'krc', 'haw',
+    'rw', 'pdc', 'kaa', 'to', 'kl', 'arc', 'nov', 'kbd', 'av', 'bxr',
+    'lo', 'bjn', 'ha', 'tet', 'tpi', 'na', 'pap', 'lbe', 'jbo', 'ty',
+    'mdf', 'roa-rup', 'wo', 'tyv', 'ig', 'srn', 'nso', 'kg', 'ab',
+    'ltg', 'zu', 'om', 'za', 'chy', 'cu', 'rmy', 'tw', 'tn', 'chr',
+    'mai', 'pih', 'got', 'xh', 'bi', 'sm', 'ss', 'rn', 'ki', 'pnt',
+    'bm', 'iu', 'ee', 'lg', 'ts', 'fj', 'ak', 'ik', 'st', 'sg', 'ff',
+    'dz', 'ny', 'ch', 'ti', 've', 'ks', 'tum', 'cr', 'gom', 'lrc',
+    'azb', 'or'
+    ]
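+# Comment added for clarity: these are interwiki prefixes; page_info()
+# uses this list to strip interlanguage links such as [[en:...]]
+# before tokenizing.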
+cjk = (
+    r'\u4E00-\u62FF' +  # Unified Ideographs
+    r'\u6300-\u77FF' +
+    r'\u7800-\u8CFF' +
+    r'\u8D00-\u9FCC' +
+    r'\u3400-\u4DFF' +  # Unified Ideographs Ext A
+    r'\U00020000-\U000215FF' +  # Unified Ideographs Ext. B
+    r'\U00021600-\U000230FF' +
+    r'\U00023100-\U000245FF' +
+    r'\U00024600-\U000260FF' +
+    r'\U00026100-\U000275FF' +
+    r'\U00027600-\U000290FF' +
+    r'\U00029100-\U0002A6DF' +
+    r'\uF900-\uFAFF' +  # Compatibility Ideographs
+    r'\U0002F800-\U0002FA1F'  # Compatibility Ideographs Suppl.
+)
+
+chars = {
+    'az': u'A-Za-zÇçƏəĞğıİÖöŞşÜü',
+    'ar': u'غظضذخثتشرقصفعسنملكيطحزوهدجبا',
+    'et': u'A-Za-zŠšŽžÕõÄäÖöÜü',
+    'af': u'A-Za-züûöôïîëêè',
+    'en': u'A-Za-z',
+    'id': u'A-Za-z',
+    'ko': cjk,
+    'zh': cjk,
+    'ja': cjk,
+    'pt': u'A-Za-záàâãçéêíóôõúüÁÀÂÃÇÉÊÍÓÔÕÚ',
+    'tr': u'A-Za-zÇĞİÖŞÜçğıöşüâîûÂÎÛ',
+    'fa': u'ابپتثجچحخدذرزژسشصآضطظعغفقکگلمنوهی‌يك',
+    'fr': u'A-Za-zÀàÂâÆæÄäÇçÉéÈèÊêËëÎîÏïÔôŒœÖöÙùÛûÜüŸÿ',
+    'de': u'A-Za-zÄäÖöÜüß',
+    'es': u'A-Za-zÑñéÉüÜóÓ',
+    'uk': u'АаБбВвГгҐґДдЕеЄєЖжЗзИиІіЇїЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЬ'
+          u'ьЮюЯя',
+    'pl': u'AaĄąBbCcĆćDdEeĘęFfGgHhIiJjKkLlŁłMmNnŃńOoÓóPpRrSsŚśTtUuWwYyZzŹźŻż',
+    'he': u'למנסעפצקרשתםןףץאבגדהוזחטיכך',
+    'hy': u'ԱաԲբԳգԴդԵեԶզԷէԸըԹթԺժԻիԼլԽխԾծԿկՀհՁձՂղՃճՄմՅյՆնՇշՈոՉչՊպՋջՌռՍսՎվՏտՐր'
+          u'ՑցՈՒՈւուՒւՓփՔքևևՕօՖֆ',
+    'vi': u'AaĂăÂâBbCcDdĐđEeÊêGgHhIiKkLlMmNnOoÔôƠơPpQqRrSsTtUuƯưVvXxYy',
+    'ur': u'ابپتٹثجچحخدڈذرڑزژسشصضطظعغفقکگلمنوهھءیےٹڈڑ‌آّْیٰوَُِٗ',
+    'uz': 'A-Za-zʻ',
+    'sv': u'A-Za-zÅÄÖåäö',
+    'hu': u'A-Za-zËëÉéÓóÖöŐőÚúÜüŰűÁá',
+    'cs': u'A-Za-zÁáČčĎďÉéĚěÍíŇňÓóŘřŠšŤťÚúŮůÝýŽž',
+    'hi': u'कखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसहळक्षज्ञ:अपआपाइपिईपीउपुऊपूऋपृॠप'
+          u'ॄऌपॢॡपॣएपेऐपैओपोऔपौअंपंअःपः',
+    'no': u'A-Za-zÆØÅæøåéèêóòâôüáàé',
+    'ta': u'௰௱௲௳௴௵௶௷௸௹௺ௗௐொோௌ்ெேைீுூாிரறலளழவஶஷஸஹணதநனபம'
+          u'யஐஒஓஔகஙசஜஞடஂஃஅஆஇஈஉஊஎஏ',
+}
+
+
+def lower(a, lang):
+    if lang == 'tr':
+        return a.replace('I', u'ı').replace(u'İ', 'i').lower()
+    return a.lower()
+
+
+def page_info(dump, lang):
+    global tokenizer
+    c = 1
+    di_old = []
+    di = []
+    nombre = '3,' if lang not in ['ja', 'zh'] else '1'
+    for entry in dump.parse():
+        if entry.ns != '0':
+            continue
+        if c != entry.id:
+            if c != 1:
+                di_old = di[:]
+                di = []
+            if entry.id and int(entry.id[-1]) == 0:
+                print('new page', entry.id)
+            di.append(entry)
+        else:
+            di.append(entry)
+            continue
+        c = entry.id
+        firstRev = True
+        history = {}
+        detector = reverts.Detector(radius=3)
+        for revision in di_old:
+            revision.text = re.sub(
+                r'\[\[(%s)\:' % '|'.join(languages_by_size),
+                '',
+                revision.text)
+            words = set()
+            if lang in chars:
+                token_pattern = r'[%s]{%s}' % (chars[lang], nombre)
+                tokenizer = RegexpTokenizer(token_pattern)
+                tokens = tokenizer.tokenize(revision.text)
+            else:
+                token_pattern = r'\p{alpha}+'
+                tokens = regex.findall(token_pattern, revision.text)
+            for w in tokens:
+                words.add(lower(w, lang))
+            if firstRev:
+                prevIntersection = words
+                firstRev = False
+            added = words - prevIntersection
+            prevIntersection = words
+            history[revision.revisionid] = Edit(
+                revision.revisionid, added, False)
+            rev = detector.process(revision.text,
+                                   {'rev_id': revision.revisionid})
+            if rev:
+                for reverted in rev.reverteds:
+                    history[reverted['rev_id']].reverted = True
+
+        yield history
+
+
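+# Comment added for clarity: run() aggregates per-page Edit histories
+# from each dump, ranks words with the Bot, and publishes the resulting
+# word lists to a page on the meta wiki.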
+def run(dumps):
+    number = 500000
+    counter = 0
+    start_time = time.time()
+    for casee in dumps:
+        lang = casee.split('/')[-1].split('wiki')[0]
+        dump = xmlreader.XmlDump(casee, True)
+        bot = Bot()
+        for case in page_info(dump, lang):
+            counter += 1
+            if number and counter > number:
+                break
+            bot.parse_edits(case.values())
+        bot.parse_bad_edits(250)
+        bot.dump()
+        print(time.time() - start_time)
+        site = pywikibot.Site('meta', fam='meta')
+        page = pywikibot.Page(
+            site, 'Research:Revision scoring as a service/Word lists/' + lang)
+        try:
+            text = page.get()
+        except pywikibot.NoPage:
+            text = ("{{Research:Revision scoring as a service/template/word list "
+                    "data\n |lang=%s\n |gen=250\n |badwords=-\n |informal=-"
+                    "\n |stopwords=-\n |dictionary=-\n |stemmer=-\n |contact="
+                    "\n |features=no\n |labels=requested\n |campaign=no\n "
+                    "|needs=-\n |list-generated=\n |list-stop=\n}}\n" % lang)
+        except:
+            return False
+        new_text = text
+        if re.search(r'\|\s*?list\-generated\s*?\=\s*?', text):
+            if re.search(r'\|\s*?list\-generated\s*?\=\s*?(\||\}\})', text):
+                new_text = re.sub(
+                    r'(\|\s*?list\-generated\s*?\=\s*?)(\||\}\})',
+                    r'\1%s\2' % bot.bad_words_res_text,
+                    new_text)
+        else:
+            new_text = re.sub(
+                r'\}\}',
+                r'|list-generated=%s\n}}' % bot.bad_words_res_text,
+                new_text)
+        if re.search(r'\|\s*?list\-stop\s*?\=\s*?', text):
+            if re.search(r'\|\s*?list\-stop\s*?\=\s*?(\||\}\})', text):
+                new_text = re.sub(
+                    r'(\|\s*?list\-stop\s*?\=\s*?)(\||\}\})',
+                    r'\1%s\2' % bot.stop_words_res_text,
+                    new_text)
+        else:
+            new_text = re.sub(
+                r'\}\}',
+                r'|list-stop=%s\n}}' % bot.stop_words_res_text,
+                new_text)
+        if new_text != text:
+            page.text = new_text
+            page.save('Bot: update results')
+if __name__ == "__main__":
+    dumps = sys.argv[1:]
+    run(dumps)

From 5c75035a70f5ec88336a2487f1ec4ab8776b9de8 Mon Sep 17 00:00:00 2001
From: paulkernfeld
Date: Wed, 16 Sep 2020 15:27:15 -0400
Subject: [PATCH 02/19] Copy library code into bwds/__init__.py

---
 editquality/bwds/__init__.py | 158 +++++++++++++++++++++++++++++++++++
 1 file changed, 158 insertions(+)
 create mode 100644 editquality/bwds/__init__.py

diff --git a/editquality/bwds/__init__.py b/editquality/bwds/__init__.py
new file mode 100644
index 0000000..0c80707
--- /dev/null
+++ b/editquality/bwds/__init__.py
@@ -0,0 +1,158 @@
+"""
+Code to find bad words automatically.
+
+It gets a set of added words and determines tf-idf of words
+the it uses K-means algorithm to determine them.
+
+Some parts are copied from
+https://github.com/halfak/Objective-Revision-Evaluation-Service/blob/master/ores/label_reverted.py
+
+>>> from bad_words_detection_system import *
+>>> edits = [Edit(1, {'one':1, 'two': 2}, False), Edit(2, {'three':3}, True),
+...          Edit(3, {'one':5, 'four': 1}, False)]
+>>> bot = Bot()
+>>> bot.parse_edits(edits)
+>>> bot.parse_bad_edits()
+"""
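+# Note added for clarity: tf_idf() below computes
+# (1 + log(count in reverted edits)) * log(n_docs / doc_frequency).
+# E.g. a word added 3 times in reverted edits and seen in 1 of 4
+# documents scores (1 + log 3) * log 4, roughly 2.91 (illustrative
+# numbers, not from the original).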
+import math
+import sys
+import traceback
+import json
+import time
+from importlib import import_module
+from collections import OrderedDict
+# TODO: User argparse
+
+from revscoring.extractors.api import Extractor
+from revscoring.datasources import revision_oriented
+
+from mwapi import Session
+import mwreverts
+
+
+class Edit(object):
+    def __init__(self, rev_id, added_words, reverted):
+        self.id = rev_id
+        self.added_words = added_words
+        if not isinstance(self.added_words, dict):
+            self.fix_added_words()
+        self.reverted = reverted
+
+    def fix_added_words(self):
+        temp = {}
+        for word in self.added_words:
+            temp[word] = temp.get(word, 0) + 1
+        self.added_words = temp
+
+
+class Bot(object):
+
+    def __init__(self, words_cache=None, bad_words_cache=None, no_docs=None):
+        self.bad_edits = Edit(-1, {}, True)
+        self.counter = 0
+        self.words_db = {}
+        self.bad_words_db = {}
+        self.bad_counter = 0
+        if bool(bad_words_cache) != bool(words_cache):
+            raise "You should define both"
+        if words_cache:
+            self.cache = True
+            self.initiate_cache(words_cache, bad_words_cache, no_docs)
+        else:
+            self.cache = False
+
+    def initiate_cache(self, words_cache, bad_words_cache, no_docs):
+        with open(words_cache, 'r') as f:
+            self.words_db = json.loads(f.read())
+        with open(bad_words_cache, 'r') as f:
+            self.bad_edits.added_words = json.loads(f.read())
+        with open(no_docs, 'r') as f:
+            self.counter = int(f.read())
+
+    def parse_edits(self, edits):
+        for edit in edits:
+            # Since edits can be gen and len doesn't mean there
+            self.counter += 1
+            if edit.reverted:
+                for word in edit.added_words:
+                    self.bad_edits.added_words[word] = \
+                        self.bad_edits.added_words.get(word, 0) + \
+                        edit.added_words[word]
+                    self.bad_words_db[word] = (
+                        self.bad_words_db.get(word, 0) + 1)
+                self.bad_counter += 1
+                continue
+            for word in edit.added_words:
+                self.words_db[word] = self.words_db.get(word, 0) + 1
+
+    def parse_bad_edits(self, numbers_to_show=10):
+        self.possible_bad_words = {}
+        self.stop_words = {}
+        if not self.cache:
+            self.counter += 1
+        for word in self.bad_edits.added_words:
+            if not self.cache:
+                self.words_db[word] = self.words_db.get(word, 0) + 1
+            if 'sh' in word or 'ch' in word:
+                continue
+            self.possible_bad_words[word] = self.tf_idf(word)
+            self.stop_words[word] = self.idf(word)
+        if numbers_to_show:
+            self.show_results(numbers_to_show)
+            self.show_results2(numbers_to_show)
+
+    def tf_idf(self, word):
+        tf = math.log(self.bad_edits.added_words[word]) + 1
+        idf = math.log(float(self.counter)/self.words_db[word])
+        return tf*idf
+
+    def idf(self, word):
+        return math.log(float(self.counter)/self.words_db[word])
+
+    def show_results(self, numbers_to_show):
+        print("Showing %d results" % numbers_to_show)
+        values = sorted(self.possible_bad_words.values())
+        lim = values[numbers_to_show*-1]
+        res = {}
+        for word in self.possible_bad_words:
+            if self.possible_bad_words[word] >= lim:
+                res[word] = self.possible_bad_words[word]
+        res = OrderedDict(
+            sorted(res.items(), key=lambda t: t[1], reverse=True))
+        res_text = []
+        for word in res:
+            res_text.append(word)
+        res_text.sort()
+        res_text = "#" + '\n#'.join(res_text)
+        self.bad_words_res_text = res_text
+        with open('%s_%s.txt' % (base_file_path, time.time()), 'w') as f:
+            f.write(res_text)
+
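+    # Comment added for clarity: show_results2 reports stop-word
+    # candidates, i.e. the words with the lowest idf (those that
+    # appear in the most documents).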
+    def show_results2(self, numbers_to_show):
+        print("Showing another %d results" % numbers_to_show)
+        values = sorted(self.stop_words.values(), reverse=True)
+        lim = values[numbers_to_show*-1]
+        res = {}
+        for word in self.stop_words:
+            if self.stop_words[word] <= lim:
+                res[word] = self.stop_words[word]
+        res = OrderedDict(sorted(res.items(), key=lambda t: t[1]))
+        res_text = []
+        for word in res:
+            res_text.append(word)
+        res_text.sort()
+        res_text = "#" + '\n#'.join(res_text)
+        self.stop_words_res_text = res_text
+        with open('%s2_%s.txt' % (base_file_path, time.time()), 'w') as f:
+            f.write(res_text)
+
+    def dump(self):
+        new_db = {}
+        for word in self.bad_edits.added_words:
+            new_db[word] = self.words_db[word]
+        with open('words_db.txt', 'w') as f:
+            f.write(json.dumps(new_db))
+        with open('bad_edits_words.txt', 'w') as f:
+            f.write(json.dumps(self.bad_edits.added_words))
+        with open('no_docs.txt', 'w') as f:
+            f.write(json.dumps(self.counter))

From 69f35f2961e4a9086e16072b6e1255f5961cedd8 Mon Sep 17 00:00:00 2001
From: paulkernfeld
Date: Wed, 16 Sep 2020 15:53:27 -0400
Subject: [PATCH 03/19] Bot and Edit doctest works

---
 editquality/bwds/__init__.py | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/editquality/bwds/__init__.py b/editquality/bwds/__init__.py
index 0c80707..5777692 100644
--- a/editquality/bwds/__init__.py
+++ b/editquality/bwds/__init__.py
@@ -2,12 +2,12 @@
 Code to find bad words automatically.
 
 It gets a set of added words and determines tf-idf of words
-the it uses K-means algorithm to determine them.
+then it uses K-means algorithm to determine them.
 
 Some parts are copied from
 https://github.com/halfak/Objective-Revision-Evaluation-Service/blob/master/ores/label_reverted.py
 
->>> from bad_words_detection_system import *
+>>> from editquality.bwds import Bot, Edit
 >>> edits = [Edit(1, {'one':1, 'two': 2}, False), Edit(2, {'three':3}, True),
 ...          Edit(3, {'one':5, 'four': 1}, False)]
 >>> bot = Bot()
@@ -15,19 +15,11 @@
 >>> bot.parse_bad_edits()
 """
 import math
-import sys
-import traceback
 import json
 import time
-from importlib import import_module
 from collections import OrderedDict
-# TODO: User argparse
 
-from revscoring.extractors.api import Extractor
-from revscoring.datasources import revision_oriented
-
-from mwapi import Session
-import mwreverts
+base_file_path = '/data/project/dexbot/pywikibot-core/something_'
 
 
 class Edit(object):
@@ -54,7 +46,7 @@ def __init__(self, words_cache=None, bad_words_cache=None, no_docs=None):
         self.bad_words_db = {}
         self.bad_counter = 0
         if bool(bad_words_cache) != bool(words_cache):
-            raise "You should define both"
+            raise ValueError("bad_words_cache should be defined if and only words_cache is defined")
         if words_cache:
             self.cache = True
             self.initiate_cache(words_cache, bad_words_cache, no_docs)
@@ -103,16 +95,16 @@ def parse_bad_edits(self, numbers_to_show=10):
 
     def tf_idf(self, word):
         tf = math.log(self.bad_edits.added_words[word]) + 1
-        idf = math.log(float(self.counter)/self.words_db[word])
+        idf = math.log(self.counter/self.words_db[word])
         return tf*idf
 
     def idf(self, word):
-        return math.log(float(self.counter)/self.words_db[word])
+        return math.log(self.counter/self.words_db[word])
 
     def show_results(self, numbers_to_show):
         print("Showing %d results" % numbers_to_show)
         values = sorted(self.possible_bad_words.values())
-        lim = values[numbers_to_show*-1]
+        lim = values[max(0, len(values) - numbers_to_show)]
         res = {}
         for word in self.possible_bad_words:
             if self.possible_bad_words[word] >= lim:
@@ -131,7 +123,7 @@ def show_results(self, numbers_to_show):
     def show_results2(self, numbers_to_show):
         print("Showing another %d results" % numbers_to_show)
         values = sorted(self.stop_words.values(), reverse=True)
-        lim = values[numbers_to_show*-1]
+        lim = values[max(0, len(values) - numbers_to_show)]
         res = {}
         for word in self.stop_words:
             if self.stop_words[word] <= lim:

From a62d5c42c2095bef8ad2184e4ccf0281fa7dadb7 Mon Sep 17 00:00:00 2001
From: paulkernfeld
Date: Wed, 16 Sep 2020 16:05:33 -0400
Subject: [PATCH 04/19] Add BWDS scripts to sphinx

---
 editquality/utilities/__init__.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/editquality/utilities/__init__.py b/editquality/utilities/__init__.py
index b26cdd7..c6678f5 100644
--- a/editquality/utilities/__init__.py
+++ b/editquality/utilities/__init__.py
@@ -43,4 +43,12 @@
 merge_labels
 ++++++++++++
 .. automodule:: editquality.utilities.merge_labels
+
+bad_words_detection_system
+++++++++++++++++++++++++++
+.. automodule:: editquality.utilities.bad_words_detection_system
+
+dump_based_detection
+++++++++++++++++++++
+.. automodule:: editquality.utilities.dump_based_detection
 """

From d2cfe3348df279bab4e460ac6610c18d0040fdff Mon Sep 17 00:00:00 2001
From: paulkernfeld
Date: Fri, 18 Sep 2020 11:59:00 -0400
Subject: [PATCH 05/19] Add test_parse_bad_edits

---
 editquality/bwds/__init__.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/editquality/bwds/__init__.py b/editquality/bwds/__init__.py
index 5777692..65b731d 100644
--- a/editquality/bwds/__init__.py
+++ b/editquality/bwds/__init__.py
@@ -148,3 +148,10 @@ def dump(self):
         f.write(json.dumps(self.bad_edits.added_words))
     with open('no_docs.txt', 'w') as f:
         f.write(json.dumps(self.counter))
+
+
+def test_parse_bad_edits():
+    edits = [Edit(1, {'one': 1, 'two': 2}, False), Edit(2, {'three': 3}, True), Edit(3, {'one': 5, 'four': 1}, False)]
+    bot = Bot()
+    bot.parse_edits(edits)
+    bot.parse_bad_edits(numbers_to_show=0)

From 1962bc716e5bfc74ba8df65fa62039fcea1885ec Mon Sep 17 00:00:00 2001
From: paulkernfeld
Date: Fri, 18 Sep 2020 12:03:21 -0400
Subject: [PATCH 06/19] Remove dump_based_detection.py

I'm hoping to add this in a future PR
---
 editquality/utilities/dump_based_detection.py | 224 ------------------
 1 file changed, 224 deletions(-)
 delete mode 100644 editquality/utilities/dump_based_detection.py

diff --git a/editquality/utilities/dump_based_detection.py b/editquality/utilities/dump_based_detection.py
deleted file mode 100644
index d1ccfd9..0000000
--- a/editquality/utilities/dump_based_detection.py
+++ /dev/null
@@ -1,224 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: UTF-8 -*-
-# Copyright © 2014 He7d3r
-# License: http://he7d3r.mit-license.org/
-"""
-Extermely under construction.
-Some parts are copied from
-https://gist.github.com/he7d3r/f99482f4f54f97895ccb/9205f3271fe8daa2f694f4ce3ba9b29213dbad6c
-"""
-from nltk.tokenize import RegexpTokenizer
-import sys
-from mw.lib import reverts
-from pywikibot import xmlreader
-import pywikibot
-import re
-import time
-import regex
-
-from bad_words_detection_system import Edit, Bot
-
-cache = {}
-
-languages_by_size = [
-    'en', 'sv', 'nl', 'de', 'fr', 'war', 'ru', 'ceb', 'it', 'es', 'vi',
-    'pl', 'ja', 'pt', 'zh', 'uk', 'ca', 'fa', 'no', 'sh', 'fi', 'ar',
-    'id', 'cs', 'sr', 'ro', 'ko', 'hu', 'ms', 'tr', 'min', 'eo', 'kk',
-    'eu', 'sk', 'da', 'bg', 'he', 'lt', 'hy', 'hr', 'sl', 'et', 'uz',
-    'gl', 'nn', 'vo', 'la', 'simple', 'el', 'hi', 'az', 'th', 'ka',
-    'ce', 'oc', 'be', 'mk', 'mg', 'new', 'ur', 'tt', 'ta', 'pms', 'cy',
-    'tl', 'lv', 'bs', 'te', 'be-tarask', 'br', 'ht', 'sq', 'jv', 'lb',
-    'mr', 'is', 'ml', 'zh-yue', 'bn', 'af', 'ba', 'ga', 'pnb', 'cv',
-    'fy', 'lmo', 'tg', 'sco', 'my', 'yo', 'an', 'ky', 'sw', 'io', 'ne',
-    'gu', 'scn', 'bpy', 'nds', 'ku', 'ast', 'qu', 'als', 'su', 'pa',
-    'kn', 'ckb', 'ia', 'mn', 'nap', 'bug', 'arz', 'bat-smg', 'wa',
-    'zh-min-nan', 'am', 'map-bms', 'gd', 'yi', 'mzn', 'si', 'fo',
-    'bar', 'vec', 'nah', 'sah', 'os', 'sa', 'roa-tara', 'li', 'hsb',
-    'pam', 'mrj', 'mhr', 'se', 'mi', 'ilo', 'hif', 'bcl', 'gan', 'rue',
-    'ps', 'glk', 'nds-nl', 'bo', 'vls', 'diq', 'fiu-vro', 'bh', 'xmf',
-    'tk', 'gv', 'sc', 'co', 'csb', 'hak', 'km', 'kv', 'vep', 'zea',
-    'crh', 'zh-classical', 'frr', 'eml', 'ay', 'stq', 'udm', 'wuu',
-    'nrm', 'kw', 'rm', 'szl', 'so', 'koi', 'as', 'lad', 'fur', 'mt',
-    'dv', 'gn', 'dsb', 'ie', 'pcd', 'sd', 'lij', 'cbk-zam', 'cdo',
-    'ksh', 'ext', 'mwl', 'gag', 'ang', 'ug', 'ace', 'pi', 'pag', 'nv',
-    'lez', 'frp', 'sn', 'kab', 'ln', 'myv', 'pfl', 'xal', 'krc', 'haw',
-    'rw', 'pdc', 'kaa', 'to', 'kl', 'arc', 'nov', 'kbd', 'av', 'bxr',
-    'lo', 'bjn', 'ha', 'tet', 'tpi', 'na', 'pap', 'lbe', 'jbo', 'ty',
-    'mdf', 'roa-rup', 'wo', 'tyv', 'ig', 'srn', 'nso', 'kg', 'ab',
-    'ltg', 'zu', 'om', 'za', 'chy', 'cu', 'rmy', 'tw', 'tn', 'chr',
-    'mai', 'pih', 'got', 'xh', 'bi', 'sm', 'ss', 'rn', 'ki', 'pnt',
-    'bm', 'iu', 'ee', 'lg', 'ts', 'fj', 'ak', 'ik', 'st', 'sg', 'ff',
-    'dz', 'ny', 'ch', 'ti', 've', 'ks', 'tum', 'cr', 'gom', 'lrc',
-    'azb', 'or'
-    ]
-cjk = (
-    r'\u4E00-\u62FF' +  # Unified Ideographs
-    r'\u6300-\u77FF' +
-    r'\u7800-\u8CFF' +
-    r'\u8D00-\u9FCC' +
-    r'\u3400-\u4DFF' +  # Unified Ideographs Ext A
-    r'\U00020000-\U000215FF' +  # Unified Ideographs Ext. B
-    r'\U00021600-\U000230FF' +
-    r'\U00023100-\U000245FF' +
-    r'\U00024600-\U000260FF' +
-    r'\U00026100-\U000275FF' +
-    r'\U00027600-\U000290FF' +
-    r'\U00029100-\U0002A6DF' +
-    r'\uF900-\uFAFF' +  # Compatibility Ideographs
-    r'\U0002F800-\U0002FA1F'  # Compatibility Ideographs Suppl.
-)
-
-chars = {
-    'az': u'A-Za-zÇçƏəĞğıİÖöŞşÜü',
-    'ar': u'غظضذخثتشرقصفعسنملكيطحزوهدجبا',
-    'et': u'A-Za-zŠšŽžÕõÄäÖöÜü',
-    'af': u'A-Za-züûöôïîëêè',
-    'en': u'A-Za-z',
-    'id': u'A-Za-z',
-    'ko': cjk,
-    'zh': cjk,
-    'ja': cjk,
-    'pt': u'A-Za-záàâãçéêíóôõúüÁÀÂÃÇÉÊÍÓÔÕÚ',
-    'tr': u'A-Za-zÇĞİÖŞÜçğıöşüâîûÂÎÛ',
-    'fa': u'ابپتثجچحخدذرزژسشصآضطظعغفقکگلمنوهی‌يك',
-    'fr': u'A-Za-zÀàÂâÆæÄäÇçÉéÈèÊêËëÎîÏïÔôŒœÖöÙùÛûÜüŸÿ',
-    'de': u'A-Za-zÄäÖöÜüß',
-    'es': u'A-Za-zÑñéÉüÜóÓ',
-    'uk': u'АаБбВвГгҐґДдЕеЄєЖжЗзИиІіЇїЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЬ'
-          u'ьЮюЯя',
-    'pl': u'AaĄąBbCcĆćDdEeĘęFfGgHhIiJjKkLlŁłMmNnŃńOoÓóPpRrSsŚśTtUuWwYyZzŹźŻż',
-    'he': u'למנסעפצקרשתםןףץאבגדהוזחטיכך',
-    'hy': u'ԱաԲբԳգԴդԵեԶզԷէԸըԹթԺժԻիԼլԽխԾծԿկՀհՁձՂղՃճՄմՅյՆնՇշՈոՉչՊպՋջՌռՍսՎվՏտՐր'
-          u'ՑցՈՒՈւուՒւՓփՔքևևՕօՖֆ',
-    'vi': u'AaĂăÂâBbCcDdĐđEeÊêGgHhIiKkLlMmNnOoÔôƠơPpQqRrSsTtUuƯưVvXxYy',
-    'ur': u'ابپتٹثجچحخدڈذرڑزژسشصضطظعغفقکگلمنوهھءیےٹڈڑ‌آّْیٰوَُِٗ',
-    'uz': 'A-Za-zʻ',
-    'sv': u'A-Za-zÅÄÖåäö',
-    'hu': u'A-Za-zËëÉéÓóÖöŐőÚúÜüŰűÁá',
-    'cs': u'A-Za-zÁáČčĎďÉéĚěÍíŇňÓóŘřŠšŤťÚúŮůÝýŽž',
-    'hi': u'कखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसहळक्षज्ञ:अपआपाइपिईपीउपुऊपूऋपृॠप'
-          u'ॄऌपॢॡपॣएपेऐपैओपोऔपौअंपंअःपः',
-    'no': u'A-Za-zÆØÅæøåéèêóòâôüáàé',
-    'ta': u'௰௱௲௳௴௵௶௷௸௹௺ௗௐொோௌ்ெேைீுூாிரறலளழவஶஷஸஹணதநனபம'
-          u'யஐஒஓஔகஙசஜஞடஂஃஅஆஇஈஉஊஎஏ',
-}
-
-
-def lower(a, lang):
-    if lang == 'tr':
-        return a.replace('I', u'ı').replace(u'İ', 'i').lower()
-    return a.lower()
-
-
-def page_info(dump, lang):
-    global tokenizer
-    c = 1
-    di_old = []
-    di = []
-    nombre = '3,' if lang not in ['ja', 'zh'] else '1'
-    for entry in dump.parse():
-        if entry.ns != '0':
-            continue
-        if c != entry.id:
-            if c != 1:
-                di_old = di[:]
-                di = []
-            if entry.id and int(entry.id[-1]) == 0:
-                print('new page', entry.id)
-            di.append(entry)
-        else:
-            di.append(entry)
-            continue
-        c = entry.id
-        firstRev = True
-        history = {}
-        detector = reverts.Detector(radius=3)
-        for revision in di_old:
-            revision.text = re.sub(
-                r'\[\[(%s)\:' % '|'.join(languages_by_size),
-                '',
-                revision.text)
-            words = set()
-            if lang in chars:
-                token_pattern = r'[%s]{%s}' % (chars[lang], nombre)
-                tokenizer = RegexpTokenizer(token_pattern)
-                tokens = tokenizer.tokenize(revision.text)
-            else:
-                token_pattern = r'\p{alpha}+'
-                tokens = regex.findall(token_pattern, revision.text)
-            for w in tokens:
-                words.add(lower(w, lang))
-            if firstRev:
-                prevIntersection = words
-                firstRev = False
-            added = words - prevIntersection
-            prevIntersection = words
-            history[revision.revisionid] = Edit(
-                revision.revisionid, added, False)
-            rev = detector.process(revision.text,
-                                   {'rev_id': revision.revisionid})
-            if rev:
-                for reverted in rev.reverteds:
-                    history[reverted['rev_id']].reverted = True
-
-        yield history
-
-
-def run(dumps):
-    number = 500000
-    counter = 0
-    start_time = time.time()
-    for casee in dumps:
-        lang = casee.split('/')[-1].split('wiki')[0]
-        dump = xmlreader.XmlDump(casee, True)
-        bot = Bot()
-        for case in page_info(dump, lang):
-            counter += 1
-            if number and counter > number:
-                break
-            bot.parse_edits(case.values())
-        bot.parse_bad_edits(250)
-        bot.dump()
-        print(time.time() - start_time)
-        site = pywikibot.Site('meta', fam='meta')
-        page = pywikibot.Page(
-            site, 'Research:Revision scoring as a service/Word lists/' + lang)
-        try:
-            text = page.get()
-        except pywikibot.NoPage:
-            text = ("{{Research:Revision scoring as a service/template/word list "
-                    "data\n |lang=%s\n |gen=250\n |badwords=-\n |informal=-"
-                    "\n |stopwords=-\n |dictionary=-\n |stemmer=-\n |contact="
-                    "\n |features=no\n |labels=requested\n |campaign=no\n "
-                    "|needs=-\n |list-generated=\n |list-stop=\n}}\n" % lang)
-        except:
-            return False
-        new_text = text
-        if re.search(r'\|\s*?list\-generated\s*?\=\s*?', text):
-            if re.search(r'\|\s*?list\-generated\s*?\=\s*?(\||\}\})', text):
-                new_text = re.sub(
-                    r'(\|\s*?list\-generated\s*?\=\s*?)(\||\}\})',
-                    r'\1%s\2' % bot.bad_words_res_text,
-                    new_text)
-        else:
-            new_text = re.sub(
-                r'\}\}',
-                r'|list-generated=%s\n}}' % bot.bad_words_res_text,
-                new_text)
-        if re.search(r'\|\s*?list\-stop\s*?\=\s*?', text):
-            if re.search(r'\|\s*?list\-stop\s*?\=\s*?(\||\}\})', text):
-                new_text = re.sub(
-                    r'(\|\s*?list\-stop\s*?\=\s*?)(\||\}\})',
-                    r'\1%s\2' % bot.stop_words_res_text,
-                    new_text)
-        else:
-            new_text = re.sub(
-                r'\}\}',
-                r'|list-stop=%s\n}}' % bot.stop_words_res_text,
-                new_text)
-        if new_text != text:
-            page.text = new_text
-            page.save('Bot: update results')
-if __name__ == "__main__":
-    dumps = sys.argv[1:]
-    run(dumps)

From a5e06c23f08ed82ec3d55d50bb92dd79805d3528 Mon Sep 17 00:00:00 2001
From: paulkernfeld
Date: Fri, 18 Sep 2020 12:24:00 -0400
Subject: [PATCH 07/19] Add a few unit tests

I still haven't figured out how the script is supposed to work
---
 editquality/bwds/__init__.py                  |  38 +++-
 editquality/bwds/tests/__init__.py            |   0
 editquality/bwds/tests/bad_edits_words.txt    |   1 +
 editquality/bwds/tests/no_docs.txt            |   1 +
 editquality/bwds/tests/test_bwds.py           |  29 +++
 editquality/bwds/tests/words_db.txt           |   1 +
 .../utilities/bad_words_detection_system.py   | 208 ++----------------
 7 files changed, 81 insertions(+), 197 deletions(-)
 create mode 100644 editquality/bwds/tests/__init__.py
 create mode 100644 editquality/bwds/tests/bad_edits_words.txt
 create mode 100644 editquality/bwds/tests/no_docs.txt
 create mode 100644 editquality/bwds/tests/test_bwds.py
 create mode 100644 editquality/bwds/tests/words_db.txt

diff --git a/editquality/bwds/__init__.py b/editquality/bwds/__init__.py
index 65b731d..eee3552 100644
--- a/editquality/bwds/__init__.py
+++ b/editquality/bwds/__init__.py
@@ -18,6 +18,7 @@ import math
 import json
 import time
 from collections import OrderedDict
+from importlib import import_module
 
 base_file_path = '/data/project/dexbot/pywikibot-core/something_'
 
@@ -150,8 +151,35 @@ def dump(self):
             f.write(json.dumps(self.counter))
 
 
-def test_parse_bad_edits():
-    edits = [Edit(1, {'one': 1, 'two': 2}, False), Edit(2, {'three': 3}, True), Edit(3, {'one': 5, 'four': 1}, False)]
-    bot = Bot()
-    bot.parse_edits(edits)
-    bot.parse_bad_edits(numbers_to_show=0)
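+# Comment added for clarity: each input line is either 'rev_id' on its
+# own or 'rev_id<TAB>page_id'.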
+def read_rev_pages(f):
+
+    for line in f:
+        parts = line.strip().split('\t')
+
+        if len(parts) == 1:
+            rev_id = parts
+            yield int(rev_id[0]), None
+        elif len(parts) == 2:
+            rev_id, page_id = parts
+            yield int(rev_id), int(page_id)
+
+
+def import_from_path(path):
+    parts = path.split(".")
+    module_path = ".".join(parts[:-1])
+    attribute_name = parts[-1]
+
+    module = import_module(module_path)
+
+    attribute = getattr(module, attribute_name)
+
+    return attribute
+
+
+def cache_parse(pathes, num_res):
+    if not pathes.strip():
+        pathes = 'words_db.txt,bad_edits_words.txt,no_docs.txt'
+    pathes = pathes.split(',')
+    bot = Bot(words_cache=pathes[0], bad_words_cache=pathes[1],
+              no_docs=pathes[2])
+    bot.parse_bad_edits(num_res)
diff --git a/editquality/bwds/tests/__init__.py b/editquality/bwds/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/editquality/bwds/tests/bad_edits_words.txt b/editquality/bwds/tests/bad_edits_words.txt
new file mode 100644
index 0000000..073311b
--- /dev/null
+++ b/editquality/bwds/tests/bad_edits_words.txt
@@ -0,0 +1 @@
+bad_edits_words
\ No newline at end of file
diff --git a/editquality/bwds/tests/no_docs.txt b/editquality/bwds/tests/no_docs.txt
new file mode 100644
index 0000000..285d09b
--- /dev/null
+++ b/editquality/bwds/tests/no_docs.txt
@@ -0,0 +1 @@
+no_docs.txt
\ No newline at end of file
diff --git a/editquality/bwds/tests/test_bwds.py b/editquality/bwds/tests/test_bwds.py
new file mode 100644
index 0000000..e9ea48d
--- /dev/null
+++ b/editquality/bwds/tests/test_bwds.py
@@ -0,0 +1,29 @@
+from editquality.bwds import import_from_path, cache_parse, Edit, Bot
+from editquality.utilities.bad_words_detection_system import bot_gen
+
+
+def test_import_from_path():
+    import_from_path('revscoring.languages.english')
+
+
+def test_cache_parse():
+    cache_parse(
+        'editquality/bwds/tests/words_db.txt,'
+        'editquality/bwds/tests/bad_edits_words.txt,'
+        'editquality/bwds/tests/no_docs.txt',
+        num_res=1
+    )
+
+
+def test_bot_gen():
+    en_main_page_id = 232335
+    a_revision_id = 7101436
+    en_api_url = 'https://en.wikipedia.org/w/api.php'
+    bot_gen([(en_main_page_id, a_revision_id)], 'TODO', en_api_url)
+
+
+def test_parse_bad_edits():
+    edits = [Edit(1, {'one': 1, 'two': 2}, False), Edit(2, {'three': 3}, True), Edit(3, {'one': 5, 'four': 1}, False)]
+    bot = Bot()
+    bot.parse_edits(edits)
+    bot.parse_bad_edits(numbers_to_show=0)
diff --git a/editquality/bwds/tests/words_db.txt b/editquality/bwds/tests/words_db.txt
new file mode 100644
index 0000000..9316897
--- /dev/null
+++ b/editquality/bwds/tests/words_db.txt
@@ -0,0 +1 @@
+words_db
\ No newline at end of file
diff --git a/editquality/utilities/bad_words_detection_system.py b/editquality/utilities/bad_words_detection_system.py
index 240b52c..7113f93 100755
--- a/editquality/utilities/bad_words_detection_system.py
+++ b/editquality/utilities/bad_words_detection_system.py
@@ -2,19 +2,6 @@
 WIP
 The script to find bad words automatically.
 
-It gets a set of added words and determines tf-idf of words
-the it uses K-means algorithm to determin them.
-
-Some parts are copied from
-https://github.com/halfak/Objective-Revision-Evaluation-Service/blob/master/ores/label_reverted.py
-
->>> from bad_words_detection_system import *
->>> edits = [Edit(1, {'one':1, 'two': 2}, False), Edit(2, {'three':3}, True),
-...          Edit(3, {'one':5, 'four': 1}, False)]
->>> bot = Bot()
->>> bot.parse_edits(edits)
->>> bot.parse_bad_edits()
-
 python3 bad_words_detection_system.py --rev-pages:f.txt
     --api:https://en.wikipedia.org/w/api.php
     --language:revscoring.languages.english
@@ -22,175 +9,17 @@
 Use cache:
 python3 bad_words_detection_system.py --cache:
 """
-import math
 import sys
 import traceback
-import json
-import time
-from importlib import import_module
-from collections import OrderedDict
 # TODO: User argparse
 
-from revscoring.extractors import APIExtractor
-from revscoring.datasources import diff
-
-from mw import api
-from mw.lib import reverts
-
-base_file_path = '/data/project/dexbot/pywikibot-core/something_'
-
-
-class Edit(object):
-    def __init__(self, rev_id, added_words, reverted):
-        self.id = rev_id
-        self.added_words = added_words
-        if not isinstance(self.added_words, dict):
-            self.fix_added_words()
-        self.reverted = reverted
-
-    def fix_added_words(self):
-        temp = {}
-        for word in self.added_words:
-            temp[word] = temp.get(word, 0) + 1
-        self.added_words = temp
-
-
-class Bot(object):
-
-    def __init__(self, words_cache=None, bad_words_cache=None, no_docs=None):
-        self.bad_edits = Edit(-1, {}, True)
-        self.counter = 0
-        self.words_db = {}
-        self.bad_words_db = {}
-        self.bad_counter = 0
-        if bool(bad_words_cache) != bool(words_cache):
-            raise "You should define both"
-        if words_cache:
-            self.cache = True
-            self.initiate_cache(words_cache, bad_words_cache, no_docs)
-        else:
-            self.cache = False
-
-    def initiate_cache(self, words_cache, bad_words_cache, no_docs):
-        with open(words_cache, 'r') as f:
-            self.words_db = json.loads(f.read())
-        with open(bad_words_cache, 'r') as f:
-            self.bad_edits.added_words = json.loads(f.read())
-        with open(no_docs, 'r') as f:
-            self.counter = int(f.read())
-
-    def parse_edits(self, edits):
-        for edit in edits:
-            # Since edits can be gen and len doesn't mean there
-            self.counter += 1
-            if edit.reverted:
-                for word in edit.added_words:
-                    self.bad_edits.added_words[word] = \
-                        self.bad_edits.added_words.get(word, 0) + \
-                        edit.added_words[word]
-                    self.bad_words_db[word] = (
-                        self.bad_words_db.get(word, 0) + 1)
-                self.bad_counter += 1
-                continue
-            for word in edit.added_words:
-                self.words_db[word] = self.words_db.get(word, 0) + 1
-
-    def parse_bad_edits(self, numbers_to_show=10):
-        self.possible_bad_words = {}
-        self.stop_words = {}
-        if not self.cache:
-            self.counter += 1
-        for word in self.bad_edits.added_words:
-            if not self.cache:
-                self.words_db[word] = self.words_db.get(word, 0) + 1
-            if 'sh' in word or 'ch' in word:
-                continue
-            self.possible_bad_words[word] = self.tf_idf(word)
-            self.stop_words[word] = self.idf(word)
-        if numbers_to_show:
-            self.show_results(numbers_to_show)
-            self.show_results2(numbers_to_show)
-
-    def tf_idf(self, word):
-        tf = math.log(self.bad_edits.added_words[word]) + 1
-        idf = math.log(float(self.counter)/self.words_db[word])
-        return tf*idf
+from revscoring.extractors.api import Extractor
+from revscoring.features.wikitext import Diff
 
-    def idf(self, word):
-        return math.log(float(self.counter)/self.words_db[word])
+from mwapi import Session
+import mwreverts
 
-    def show_results(self, numbers_to_show):
-        print("Showing %d results" % numbers_to_show)
-        values = sorted(self.possible_bad_words.values())
-        lim = values[numbers_to_show*-1]
-        res = {}
-        for word in self.possible_bad_words:
-            if self.possible_bad_words[word] >= lim:
-                res[word] = self.possible_bad_words[word]
-        res = OrderedDict(
-            sorted(res.items(), key=lambda t: t[1], reverse=True))
-        res_text = []
-        for word in res:
-            res_text.append(word)
-        res_text.sort()
-        res_text = "#" + '\n#'.join(res_text)
-        self.bad_words_res_text = res_text
-        with open('%s_%s.txt' % (base_file_path, time.time()), 'w') as f:
-            f.write(res_text)
-
-    def show_results2(self, numbers_to_show):
-        print("Showing another %d results" % numbers_to_show)
-        values = sorted(self.stop_words.values(), reverse=True)
-        lim = values[numbers_to_show*-1]
-        res = {}
-        for word in self.stop_words:
-            if self.stop_words[word] <= lim:
-                res[word] = self.stop_words[word]
-        res = OrderedDict(sorted(res.items(), key=lambda t: t[1]))
-        res_text = []
-        for word in res:
-            res_text.append(word)
-        res_text.sort()
-        res_text = "#" + '\n#'.join(res_text)
-        self.stop_words_res_text = res_text
-        with open('%s2_%s.txt' % (base_file_path, time.time()), 'w') as f:
-            f.write(res_text)
-
-    def dump(self):
-        new_db = {}
-        for word in self.bad_edits.added_words:
-            new_db[word] = self.words_db[word]
-        with open('words_db.txt', 'w') as f:
-            f.write(json.dumps(new_db))
-        with open('bad_edits_words.txt', 'w') as f:
-            f.write(json.dumps(self.bad_edits.added_words))
-        with open('no_docs.txt', 'w') as f:
-            f.write(json.dumps(self.counter))
-
-
-def read_rev_pages(f):
-
-    for line in f:
-        parts = line.strip().split('\t')
-
-        if len(parts) == 1:
-            rev_id = parts
-            yield int(rev_id[0]), None
-        elif len(parts) == 2:
-            rev_id, page_id = parts
-            yield int(rev_id), int(page_id)
-
-
-def import_from_path(path):
-    parts = path.split(".")
-    module_path = ".".join(parts[:-1])
-    attribute_name = parts[-1]
-
-    module = import_module(module_path)
-
-    attribute = getattr(module, attribute_name)
-
-    return attribute
+from editquality.bwds import Bot, Edit, cache_parse, import_from_path, read_rev_pages
 
 
 def handle_args():
@@ -212,21 +42,24 @@ def handle_args():
 
 
 def bot_gen(rev_pages, language, api_url):
 
-    session = api.Session(api_url)
-    extractor = Extractor(session, language=language)
+    session = Session(api_url)
+    extractor = Extractor(session)
 
     for rev_id, page_id in rev_pages:
+        api_result = session.get(action='query', titles='Main Page', prop='revisions', rvlimit=500, rvprop='sha1|ids')
+        revisions = next(iter(api_result['query']['pages'].values()))['revisions']
+
         sys.stderr.write(".")
         sys.stderr.flush()
        try:
+            revisions = [revision for revision in revisions if 'sha1hidden' not in revision]
 
             # Detect reverted status
-            revert = reverts.api.check(session, rev_id, page_id,
-                                       radius=3)
-            reverted = revert is not None
-            added_words = list(
-                extractor.extract(rev_id, [diff.added_words]))[0]
-            yield Edit(rev_id, added_words, reverted)
+            for revert in mwreverts.detect((revision['sha1'], revision) for revision in revisions):
+                for reverted in revert.reverteds:
+                    # added_words = list(extractor.extract(rev_id, [diff.added_words]))[0]
+                    added_words = list()  # TODO how to upgrade this?
+                    yield Edit(rev_id, added_words, reverted)
 
         except KeyboardInterrupt:
            sys.stderr.write("\n^C Caught. Exiting...")
@@ -239,15 +72,6 @@ def bot_gen(rev_pages, language, api_url):
 
     sys.stderr.write("\n")
 
-
-def cache_parse(pathes, num_res):
-    if not pathes.strip():
-        pathes = 'words_db.txt,bad_edits_words.txt,no_docs.txt'
-    pathes = pathes.split(',')
-    bot = Bot(words_cache=pathes[0], bad_words_cache=pathes[1],
-              no_docs=pathes[2])
-    bot.parse_bad_edits(num_res)
-
-
 def main():
     args = handle_args()
     if '--num_res' in args:

From c14eeddb9b8eaccad74fc397c98eaa6fe02ba5ed Mon Sep 17 00:00:00 2001
From: paulkernfeld
Date: Fri, 18 Sep 2020 13:06:50 -0400
Subject: [PATCH 08/19] Unit tests pass

---
 editquality/bwds/__init__.py               | 10 +---------
 editquality/bwds/tests/bad_edits_words.txt |  2 +-
 editquality/bwds/tests/no_docs.txt         |  2 +-
 editquality/bwds/tests/test_bwds.py        |  2 +-
 editquality/bwds/tests/words_db.txt        |  2 +-
 5 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/editquality/bwds/__init__.py b/editquality/bwds/__init__.py
index eee3552..9b299c3 100644
--- a/editquality/bwds/__init__.py
+++ b/editquality/bwds/__init__.py
@@ -165,15 +165,7 @@ def read_rev_pages(f):
 
 
 def import_from_path(path):
-    parts = path.split(".")
-    module_path = ".".join(parts[:-1])
-    attribute_name = parts[-1]
-
-    module = import_module(module_path)
-
-    attribute = getattr(module, attribute_name)
-
-    return attribute
+    return import_module(path)
 
 
 def cache_parse(pathes, num_res):
diff --git a/editquality/bwds/tests/bad_edits_words.txt b/editquality/bwds/tests/bad_edits_words.txt
index 073311b..3fafbf7 100644
--- a/editquality/bwds/tests/bad_edits_words.txt
+++ b/editquality/bwds/tests/bad_edits_words.txt
@@ -1 +1 @@
-bad_edits_words
\ No newline at end of file
+{"x": 1}
\ No newline at end of file
diff --git a/editquality/bwds/tests/no_docs.txt b/editquality/bwds/tests/no_docs.txt
index 285d09b..56a6051 100644
--- a/editquality/bwds/tests/no_docs.txt
+++ b/editquality/bwds/tests/no_docs.txt
@@ -1 +1 @@
-no_docs.txt
\ No newline at end of file
+1
\ No newline at end of file
diff --git a/editquality/bwds/tests/test_bwds.py b/editquality/bwds/tests/test_bwds.py
index e9ea48d..d83d3d0 100644
--- a/editquality/bwds/tests/test_bwds.py
+++ b/editquality/bwds/tests/test_bwds.py
@@ -11,7 +11,7 @@ def test_cache_parse():
         'editquality/bwds/tests/words_db.txt,'
         'editquality/bwds/tests/bad_edits_words.txt,'
         'editquality/bwds/tests/no_docs.txt',
-        num_res=1
+        0
     )
 
 
diff --git a/editquality/bwds/tests/words_db.txt b/editquality/bwds/tests/words_db.txt
index 9316897..8429d4d 100644
--- a/editquality/bwds/tests/words_db.txt
+++ b/editquality/bwds/tests/words_db.txt
@@ -1 +1 @@
-words_db
\ No newline at end of file
+{"x": 2}
\ No newline at end of file

From 61ccb8d31abe54a1258f8a4559565eac95c3cbfb Mon Sep 17 00:00:00 2001
From: paulkernfeld
Date: Fri, 18 Sep 2020 13:42:22 -0400
Subject: [PATCH 09/19] Add test_dump

---
 editquality/bwds/tests/test_bwds.py           | 36 ++++++++++++++-
 editquality/feature_lists/tests/__init__.py   |  0
 .../feature_lists/tests/test_huwiki.py        | 29 ------------
 .../feature_lists/tests/test_wikidatawiki.py  | 45 -------------------
 4 files changed, 34 insertions(+), 76 deletions(-)
 delete mode 100644 editquality/feature_lists/tests/__init__.py
 delete mode 100644 editquality/feature_lists/tests/test_huwiki.py
 delete mode 100644 editquality/feature_lists/tests/test_wikidatawiki.py

diff --git a/editquality/bwds/tests/test_bwds.py b/editquality/bwds/tests/test_bwds.py
index d83d3d0..e26e870 100644
--- a/editquality/bwds/tests/test_bwds.py
+++ b/editquality/bwds/tests/test_bwds.py
@@ -2,6 +2,9 @@
 from editquality.utilities.bad_words_detection_system import bot_gen
 
+EDITS = [Edit(1, {'one': 1, 'two': 2}, False), Edit(2, {'three': 3}, True), Edit(3, {'one': 5, 'four': 1}, False)]
+
+
 def test_import_from_path():
     import_from_path('revscoring.languages.english')
 
@@ -23,7 +26,36 @@ def test_bot_gen():
 
 
 def test_parse_bad_edits():
-    edits = [Edit(1, {'one': 1, 'two': 2}, False), Edit(2, {'three': 3}, True), Edit(3, {'one': 5, 'four': 1}, False)]
     bot = Bot()
-    bot.parse_edits(edits)
+    bot.parse_edits(EDITS)
     bot.parse_bad_edits(numbers_to_show=0)
+
+
+def dump_empty():
+    bot = Bot()
+    bot.dump()
+    with open('words_db.txt') as words_db:
+        assert words_db.read() == '{}'
+    with open('bad_edits_words.txt') as bad_edits_words:
+        assert bad_edits_words.read() == '{}'
+    with open('no_docs.txt') as no_docs:
+        assert no_docs.read() == '0'
+
+
+def dump_toy_data():
+    bot = Bot()
+    bot.parse_edits(EDITS)
+    bot.parse_bad_edits(0)
+    bot.dump()
+    with open('words_db.txt') as words_db:
+        assert words_db.read() == '{"three": 1}'
+    with open('bad_edits_words.txt') as bad_edits_words:
+        assert bad_edits_words.read() == '{"three": 3}'
+    with open('no_docs.txt') as no_docs:
+        assert no_docs.read() == '4'
+
+
+def test_dump():
+    # Calling both tests from here because we want to ensure they're not run concurrently
+    dump_empty()
+    dump_toy_data()
diff --git a/editquality/feature_lists/tests/__init__.py b/editquality/feature_lists/tests/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/editquality/feature_lists/tests/test_huwiki.py b/editquality/feature_lists/tests/test_huwiki.py
deleted file mode 100644
index e2d27db..0000000
--- a/editquality/feature_lists/tests/test_huwiki.py
+++ /dev/null
@@ -1,29 +0,0 @@
-from revscoring.dependencies import solve
-
-from .. import huwiki
-
-OK_WORDS = [
-    "fartő",  # part of a cattle
-    "fartőke",  # part of a boat
-    "ok",
-    "dada",
-    "ha"
-]
-
-STILL_MATCH = [
-    "fart",
-    "farts",
-    "farting",
-    "farter"
-]
-
-
-def test_huwiki():
-    ok_cache = {'datasource.revision.text': " ".join(OK_WORDS)}
-    bad_cache = {'datasource.revision.text': " ".join(STILL_MATCH)}
-    assert (solve(huwiki.english_badwords_safe.revision.datasources.matches,
-                  cache=ok_cache) == [])
-    assert (solve(huwiki.english_informals_safe.revision.datasources.matches,
-                  cache=ok_cache) == [])
-    assert (solve(huwiki.english_badwords_safe.revision.datasources.matches,
-                  cache=bad_cache) == STILL_MATCH)
diff --git a/editquality/feature_lists/tests/test_wikidatawiki.py b/editquality/feature_lists/tests/test_wikidatawiki.py
deleted file mode 100644
index cbb5abe..0000000
--- a/editquality/feature_lists/tests/test_wikidatawiki.py
+++ /dev/null
@@ -1,45 +0,0 @@
-from revscoring.datasources import revision_oriented
-from revscoring.dependencies import solve
-
-from .. import wikidatawiki
-
-
-def test_comment_features():
-    comment_ds = revision_oriented.revision.comment
-    cache = {comment_ds: "/* wbmergeitems-to:0||Q928543 */ "}
-    assert solve(wikidatawiki.is_merge_into, cache=cache)
-    assert solve(wikidatawiki.is_merge_from, cache=cache) is False
-    assert solve(wikidatawiki.is_item_creation, cache=cache) is False
-
-    cache = {comment_ds: "/* wbmergeitems-from:0||Q928543 */ "}
-    assert solve(wikidatawiki.is_merge_from, cache=cache)
-    assert solve(wikidatawiki.is_merge_into, cache=cache) is False
-
-    cache = {comment_ds: "/* clientsitelink-remove:1||enwiki */ Boris Kok"}
-    assert solve(wikidatawiki.is_client_delete, cache=cache)
-    assert solve(wikidatawiki.is_client_move, cache=cache) is False
-
-    cache = {comment_ds: "/* clientsitelink-update:0|uk|uk:A|uk:B *"}
-    assert solve(wikidatawiki.is_client_move, cache=cache)
-    assert solve(wikidatawiki.is_client_delete, cache=cache) is False
-    assert solve(wikidatawiki.is_revert, cache=cache) is False
-
-    cache = {comment_ds: "Undid revision 1448592 by [[Special:Contributions/"}
-    assert solve(wikidatawiki.is_revert, cache=cache)
-    cache = {comment_ds: "Reverted edits by [[Special:Contributions/"}
-    assert solve(wikidatawiki.is_revert, cache=cache)
-    cache = {comment_ds: "rvv racial slurs"}
-    assert solve(wikidatawiki.is_revert, cache=cache)
-
-    cache = {comment_ds: "Restored revision 123456"}
-    assert solve(wikidatawiki.is_restore, cache=cache)
-    assert solve(wikidatawiki.is_item_creation, cache=cache) is False
-    assert solve(wikidatawiki.is_revert, cache=cache) is False
-
-    cache = {comment_ds: "/* wbeditentity-create:0| */"}
-    assert solve(wikidatawiki.is_item_creation, cache=cache)
-
-
-def test_property_features():
-    # assert False, "TODO"
-    pass

From e78ac9253d6bd326076fa9d2455c3e998057c458 Mon Sep 17 00:00:00 2001
From: paulkernfeld
Date: Fri, 18 Sep 2020 13:46:48 -0400
Subject: [PATCH 10/19] Add test_read_rev_pages

---
 editquality/bwds/tests/test_bwds.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/editquality/bwds/tests/test_bwds.py b/editquality/bwds/tests/test_bwds.py
index e26e870..179a630 100644
--- a/editquality/bwds/tests/test_bwds.py
+++ b/editquality/bwds/tests/test_bwds.py
@@ -1,4 +1,4 @@
-from editquality.bwds import import_from_path, cache_parse, Edit, Bot
+from editquality.bwds import import_from_path, cache_parse, Edit, Bot, read_rev_pages
 from editquality.utilities.bad_words_detection_system import bot_gen
 
 
@@ -25,6 +25,10 @@ def test_bot_gen():
     bot_gen([(en_main_page_id, a_revision_id)], 'TODO', en_api_url)
 
 
+def test_read_rev_pages():
+    assert list(read_rev_pages(["0", "1\t2"])) == [(0, None), (1, 2)]
+
+
 def test_parse_bad_edits():
     bot = Bot()
     bot.parse_edits(EDITS)

From f437ef671c44fea8c048a5cb5be424d381ee5278 Mon Sep 17 00:00:00 2001
From: paulkernfeld
Date: Fri, 18 Sep 2020 14:51:16 -0400
Subject: [PATCH 11/19] bot_gen test works except extractor.extract

---
 editquality/bwds/__init__.py                  | 55 ++++++++++++++++++-
 editquality/bwds/tests/test_bwds.py           |  9 ++-
 .../utilities/bad_words_detection_system.py   | 43 +--------------
 3 files changed, 59 insertions(+), 48 deletions(-)

diff --git a/editquality/bwds/__init__.py b/editquality/bwds/__init__.py
index 9b299c3..af3e8b0 100644
--- a/editquality/bwds/__init__.py
+++ b/editquality/bwds/__init__.py
@@ -16,13 +16,28 @@
 """
 import math
 import json
+import sys
 import time
-from collections import OrderedDict
+from collections import OrderedDict, namedtuple
 from importlib import import_module
+import traceback
+# TODO: User argparse
+
+from revscoring.extractors.api import Extractor
+from revscoring.features.wikitext import Diff
+
+from mwapi import Session
+import mwreverts
+
 
 base_file_path = '/data/project/dexbot/pywikibot-core/something_'
 
+# This is nice for debugging, e.g. printing this includes it values
+EditNamedTuple = namedtuple('EditNamedTuple', ['id', 'added_words', 'reverted'])
+
 
 class Edit(object):
     def __init__(self, rev_id, added_words, reverted):
         self.id = rev_id
@@ -37,6 +52,9 @@ def fix_added_words(self):
             temp[word] = temp.get(word, 0) + 1
         self.added_words = temp
 
+    def as_named_tuple(self):
+        return EditNamedTuple(self.id, self.added_words, self.reverted)
+
 
 class Bot(object):
 
@@ -175,3 +193,38 @@ def cache_parse(pathes, num_res):
     bot = Bot(words_cache=pathes[0], bad_words_cache=pathes[1],
               no_docs=pathes[2])
     bot.parse_bad_edits(num_res)
+
+
+def bot_gen(rev_pages, language, api_url):
+    session = Session(api_url)
+    extractor = Extractor(session)
+
+    for revision_id, page_id in rev_pages:
+        api_result = session.get(action='query', titles='Main Page', prop='revisions', rvlimit=500, rvprop='sha1|ids')
+        revisions = next(iter(api_result['query']['pages'].values()))['revisions']
+
+        sys.stderr.write(".")
+        sys.stderr.flush()
+        try:
+            revisions = [revision for revision in revisions if 'sha1hidden' not in revision]
+
+            reverted_revision_ids = set()
+            # Detect reverted status
+            for revert in mwreverts.detect((revision['sha1'], revision) for revision in revisions):
+                for reverted in revert.reverteds:
+                    reverted_revision_ids.add(reverted['revid'])
+
+            # added_words = list(extractor.extract(revision_id, [revision.diff.words_added]))[0]
+            added_words = list()  # TODO how to upgrade this?
+            yield Edit(revision_id, added_words, revision_id in reverted_revision_ids)
+
+        except KeyboardInterrupt:
+            sys.stderr.write("\n^C Caught. Exiting...")
Exiting...") + break + + except: + sys.stderr.write(traceback.format_exc()) + sys.stderr.write("\n") + + sys.stderr.write("\n") + diff --git a/editquality/bwds/tests/test_bwds.py b/editquality/bwds/tests/test_bwds.py index 179a630..bfcbf99 100644 --- a/editquality/bwds/tests/test_bwds.py +++ b/editquality/bwds/tests/test_bwds.py @@ -1,6 +1,4 @@ -from editquality.bwds import import_from_path, cache_parse, Edit, Bot, read_rev_pages -from editquality.utilities.bad_words_detection_system import bot_gen - +from editquality.bwds import import_from_path, cache_parse, Edit, Bot, read_rev_pages, bot_gen, EditNamedTuple EDITS = [Edit(1, {'one': 1, 'two': 2}, False), Edit(2, {'three': 3}, True), Edit(3, {'one': 5, 'four': 1}, False)] @@ -19,10 +17,11 @@ def test_cache_parse(): def test_bot_gen(): - en_main_page_id = 232335 a_revision_id = 7101436 + en_main_page_id = 232335 en_api_url = 'https://en.wikipedia.org/w/api.php' - bot_gen([(en_main_page_id, a_revision_id)], 'TODO', en_api_url) + assert list(bot_gen([(a_revision_id, en_main_page_id)], 'TODO', en_api_url))[0].as_named_tuple() == \ + EditNamedTuple(7101436, {"TODO"}, False) def test_read_rev_pages(): diff --git a/editquality/utilities/bad_words_detection_system.py b/editquality/utilities/bad_words_detection_system.py index 7113f93..e0b4417 100755 --- a/editquality/utilities/bad_words_detection_system.py +++ b/editquality/utilities/bad_words_detection_system.py @@ -10,16 +10,7 @@ python3 bad_words_detection_system.py --cache: """ import sys -import traceback -# TODO: User argparse - -from revscoring.extractors.api import Extractor -from revscoring.features.wikitext import Diff - -from mwapi import Session -import mwreverts - -from editquality.bwds import Bot, Edit, cache_parse, import_from_path, read_rev_pages +from editquality.bwds import Bot, Edit, cache_parse, import_from_path, read_rev_pages, bot_gen def handle_args(): @@ -40,38 +31,6 @@ def handle_args(): return args -def bot_gen(rev_pages, language, api_url): - - session = Session(api_url) - extractor = Extractor(session) - - for rev_id, page_id in rev_pages: - api_result = session.get(action='query', titles='Main Page', prop='revisions', rvlimit=500, rvprop='sha1|ids') - revisions = next(iter(api_result['query']['pages'].values()))['revisions'] - - sys.stderr.write(".") - sys.stderr.flush() - try: - revisions = [revision for revision in revisions if 'sha1hidden' not in revision] - - # Detect reverted status - for revert in mwreverts.detect((revision['sha1'], revision) for revision in revisions): - for reverted in revert.reverteds: - # added_words = list(extractor.extract(rev_id, [diff.added_words]))[0] - added_words = list() # TODO how to upgrade this? - yield Edit(rev_id, added_words, reverted) - - except KeyboardInterrupt: - sys.stderr.write("\n^C Caught. 
Exiting...") - break - - except: - sys.stderr.write(traceback.format_exc()) - sys.stderr.write("\n") - - sys.stderr.write("\n") - - def main(): args = handle_args() if '--num_res' in args: From 52e961c512b0fed0d203db286c796a6858819213 Mon Sep 17 00:00:00 2001 From: paulkernfeld Date: Fri, 18 Sep 2020 15:41:53 -0400 Subject: [PATCH 12/19] Try out a dummy Datasource --- editquality/bwds/__init__.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/editquality/bwds/__init__.py b/editquality/bwds/__init__.py index af3e8b0..1931817 100644 --- a/editquality/bwds/__init__.py +++ b/editquality/bwds/__init__.py @@ -23,13 +23,13 @@ import traceback # TODO: User argparse - +from revscoring.datasources import Datasource, revision_oriented from revscoring.extractors.api import Extractor -from revscoring.features.wikitext import Diff +from revscoring.features.wikitext import revision from mwapi import Session import mwreverts - +from tests.extractors.test_extractor import get_last_two base_file_path = '/data/project/dexbot/pywikibot-core/something_' @@ -214,8 +214,9 @@ def bot_gen(rev_pages, language, api_url): for reverted in revert.reverteds: reverted_revision_ids.add(reverted['revid']) - # added_words = list(extractor.extract(rev_id, [diff.added_words]))[0] - added_words = list() # TODO how to upgrade this? + # added_words = list(extractor.extract(revision_id, [revision.diff.words_added]))[0] + datasource = Datasource("last_two_in_id", get_last_two, depends_on=[revision_oriented.revision.id]) + added_words = {extractor.extract(revision_id, datasource)} yield Edit(revision_id, added_words, revision_id in reverted_revision_ids) except KeyboardInterrupt: From 43a222c9701766e4d1522db58352c35ba4af777b Mon Sep 17 00:00:00 2001 From: paulkernfeld Date: Fri, 18 Sep 2020 15:54:47 -0400 Subject: [PATCH 13/19] Remove dump_based_detection autodoc --- editquality/utilities/__init__.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/editquality/utilities/__init__.py b/editquality/utilities/__init__.py index c6678f5..ca4198f 100644 --- a/editquality/utilities/__init__.py +++ b/editquality/utilities/__init__.py @@ -47,8 +47,4 @@ bad_words_detection_system ++++++++++++++++++++++++++ .. automodule:: editquality.utilities.bad_words_detection_system - -dump_based_detection -++++++++++++++++++++ -.. automodule:: editquality.utilities.dump_based_detection """ From 65bbec185655026d53c842f6dc1f6c5cd06f606c Mon Sep 17 00:00:00 2001 From: paulkernfeld Date: Fri, 18 Sep 2020 15:56:39 -0400 Subject: [PATCH 14/19] Restore tests that I accidentally deleted --- editquality/feature_lists/tests/__init__.py | 0 .../feature_lists/tests/test_huwiki.py | 29 ++++++++++++ .../feature_lists/tests/test_wikidatawiki.py | 45 +++++++++++++++++++ 3 files changed, 74 insertions(+) create mode 100644 editquality/feature_lists/tests/__init__.py create mode 100644 editquality/feature_lists/tests/test_huwiki.py create mode 100644 editquality/feature_lists/tests/test_wikidatawiki.py diff --git a/editquality/feature_lists/tests/__init__.py b/editquality/feature_lists/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/editquality/feature_lists/tests/test_huwiki.py b/editquality/feature_lists/tests/test_huwiki.py new file mode 100644 index 0000000..e2d27db --- /dev/null +++ b/editquality/feature_lists/tests/test_huwiki.py @@ -0,0 +1,29 @@ +from revscoring.dependencies import solve + +from .. 
import huwiki + +OK_WORDS = [ + "fartő", # part of a cattle + "fartőke", # part of a boat + "ok", + "dada", + "ha" +] + +STILL_MATCH = [ + "fart", + "farts", + "farting", + "farter" +] + + +def test_huwiki(): + ok_cache = {'datasource.revision.text': " ".join(OK_WORDS)} + bad_cache = {'datasource.revision.text': " ".join(STILL_MATCH)} + assert (solve(huwiki.english_badwords_safe.revision.datasources.matches, + cache=ok_cache) == []) + assert (solve(huwiki.english_informals_safe.revision.datasources.matches, + cache=ok_cache) == []) + assert (solve(huwiki.english_badwords_safe.revision.datasources.matches, + cache=bad_cache) == STILL_MATCH) diff --git a/editquality/feature_lists/tests/test_wikidatawiki.py b/editquality/feature_lists/tests/test_wikidatawiki.py new file mode 100644 index 0000000..cbb5abe --- /dev/null +++ b/editquality/feature_lists/tests/test_wikidatawiki.py @@ -0,0 +1,45 @@ +from revscoring.datasources import revision_oriented +from revscoring.dependencies import solve + +from .. import wikidatawiki + + +def test_comment_features(): + comment_ds = revision_oriented.revision.comment + cache = {comment_ds: "/* wbmergeitems-to:0||Q928543 */ "} + assert solve(wikidatawiki.is_merge_into, cache=cache) + assert solve(wikidatawiki.is_merge_from, cache=cache) is False + assert solve(wikidatawiki.is_item_creation, cache=cache) is False + + cache = {comment_ds: "/* wbmergeitems-from:0||Q928543 */ "} + assert solve(wikidatawiki.is_merge_from, cache=cache) + assert solve(wikidatawiki.is_merge_into, cache=cache) is False + + cache = {comment_ds: "/* clientsitelink-remove:1||enwiki */ Boris Kok"} + assert solve(wikidatawiki.is_client_delete, cache=cache) + assert solve(wikidatawiki.is_client_move, cache=cache) is False + + cache = {comment_ds: "/* clientsitelink-update:0|uk|uk:A|uk:B *"} + assert solve(wikidatawiki.is_client_move, cache=cache) + assert solve(wikidatawiki.is_client_delete, cache=cache) is False + assert solve(wikidatawiki.is_revert, cache=cache) is False + + cache = {comment_ds: "Undid revision 1448592 by [[Special:Contributions/"} + assert solve(wikidatawiki.is_revert, cache=cache) + cache = {comment_ds: "Reverted edits by [[Special:Contributions/"} + assert solve(wikidatawiki.is_revert, cache=cache) + cache = {comment_ds: "rvv racial slurs"} + assert solve(wikidatawiki.is_revert, cache=cache) + + cache = {comment_ds: "Restored revision 123456"} + assert solve(wikidatawiki.is_restore, cache=cache) + assert solve(wikidatawiki.is_item_creation, cache=cache) is False + assert solve(wikidatawiki.is_revert, cache=cache) is False + + cache = {comment_ds: "/* wbeditentity-create:0| */"} + assert solve(wikidatawiki.is_item_creation, cache=cache) + + +def test_property_features(): + # assert False, "TODO" + pass From e1ed560e79554d46de504d900f0ccf8039635f96 Mon Sep 17 00:00:00 2001 From: paulkernfeld Date: Tue, 22 Sep 2020 15:03:22 -0400 Subject: [PATCH 15/19] Remove unused import --- editquality/utilities/bad_words_detection_system.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/editquality/utilities/bad_words_detection_system.py b/editquality/utilities/bad_words_detection_system.py index e0b4417..7a64fc7 100755 --- a/editquality/utilities/bad_words_detection_system.py +++ b/editquality/utilities/bad_words_detection_system.py @@ -10,7 +10,8 @@ python3 bad_words_detection_system.py --cache: """ import sys -from editquality.bwds import Bot, Edit, cache_parse, import_from_path, read_rev_pages, bot_gen + +from editquality.bwds import Bot, cache_parse, 
import_from_path, read_rev_pages, bot_gen def handle_args(): From df1ad80a87e1279cb624be7654d93eeda5714e7b Mon Sep 17 00:00:00 2001 From: paulkernfeld Date: Tue, 22 Sep 2020 15:06:45 -0400 Subject: [PATCH 16/19] Add test_bot_gen_empty --- editquality/bwds/tests/test_bwds.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/editquality/bwds/tests/test_bwds.py b/editquality/bwds/tests/test_bwds.py index bfcbf99..b2e4473 100644 --- a/editquality/bwds/tests/test_bwds.py +++ b/editquality/bwds/tests/test_bwds.py @@ -16,6 +16,11 @@ def test_cache_parse(): ) +def test_bot_gen_empty(): + en_api_url = 'https://en.wikipedia.org/w/api.php' + assert list(bot_gen([], 'TODO', en_api_url)) == [] + + def test_bot_gen(): a_revision_id = 7101436 en_main_page_id = 232335 From d75f06ac820def4e95486afb4e369e8fb5eba31a Mon Sep 17 00:00:00 2001 From: paulkernfeld Date: Wed, 23 Sep 2020 09:59:36 -0400 Subject: [PATCH 17/19] It works!!! --- editquality/bwds/__init__.py | 10 +++------- editquality/bwds/tests/test_bwds.py | 12 ++++++++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/editquality/bwds/__init__.py b/editquality/bwds/__init__.py index 1931817..ba3883e 100644 --- a/editquality/bwds/__init__.py +++ b/editquality/bwds/__init__.py @@ -25,11 +25,10 @@ # TODO: User argparse from revscoring.datasources import Datasource, revision_oriented from revscoring.extractors.api import Extractor -from revscoring.features.wikitext import revision +from revscoring.features import wikitext from mwapi import Session import mwreverts -from tests.extractors.test_extractor import get_last_two base_file_path = '/data/project/dexbot/pywikibot-core/something_' @@ -202,21 +201,18 @@ def bot_gen(rev_pages, language, api_url): for revision_id, page_id in rev_pages: api_result = session.get(action='query', titles='Main Page', prop='revisions', rvlimit=500, rvprop='sha1|ids') revisions = next(iter(api_result['query']['pages'].values()))['revisions'] + revisions = [revision for revision in revisions if 'sha1hidden' not in revision] sys.stderr.write(".") sys.stderr.flush() try: - revisions = [revision for revision in revisions if 'sha1hidden' not in revision] - reverted_revision_ids = set() # Detect reverted status for revert in mwreverts.detect((revision['sha1'], revision) for revision in revisions): for reverted in revert.reverteds: reverted_revision_ids.add(reverted['revid']) - # added_words = list(extractor.extract(revision_id, [revision.diff.words_added]))[0] - datasource = Datasource("last_two_in_id", get_last_two, depends_on=[revision_oriented.revision.id]) - added_words = {extractor.extract(revision_id, datasource)} + added_words = set(extractor.extract(revision_id, wikitext.revision.diff.datasources.words_added)) yield Edit(revision_id, added_words, revision_id in reverted_revision_ids) except KeyboardInterrupt: diff --git a/editquality/bwds/tests/test_bwds.py b/editquality/bwds/tests/test_bwds.py index b2e4473..86da579 100644 --- a/editquality/bwds/tests/test_bwds.py +++ b/editquality/bwds/tests/test_bwds.py @@ -1,3 +1,5 @@ +from deltas import Token + from editquality.bwds import import_from_path, cache_parse, Edit, Bot, read_rev_pages, bot_gen, EditNamedTuple EDITS = [Edit(1, {'one': 1, 'two': 2}, False), Edit(2, {'three': 3}, True), Edit(3, {'one': 5, 'four': 1}, False)] @@ -22,11 +24,13 @@ def test_bot_gen_empty(): def test_bot_gen(): - a_revision_id = 7101436 - en_main_page_id = 232335 + a_revision_id = 979192243 + pasta_page_id = 23871 en_api_url = 'https://en.wikipedia.org/w/api.php' - assert 
list(bot_gen([(a_revision_id, en_main_page_id)], 'TODO', en_api_url))[0].as_named_tuple() == \ - EditNamedTuple(7101436, {"TODO"}, False) + generated, = bot_gen([(a_revision_id, pasta_page_id)], '', en_api_url) + assert generated.id == a_revision_id + assert Token('unleavened', type='word') in generated.added_words + assert not generated.reverted def test_read_rev_pages(): From ed9acb579626774c035e00677112d8ebb8906a03 Mon Sep 17 00:00:00 2001 From: paulkernfeld Date: Wed, 23 Sep 2020 10:19:53 -0400 Subject: [PATCH 18/19] Remove unused code branch --- editquality/bwds/__init__.py | 9 +-------- editquality/bwds/tests/test_bwds.py | 10 +++------- editquality/utilities/bad_words_detection_system.py | 13 +++---------- 3 files changed, 7 insertions(+), 25 deletions(-) diff --git a/editquality/bwds/__init__.py b/editquality/bwds/__init__.py index ba3883e..526415b 100644 --- a/editquality/bwds/__init__.py +++ b/editquality/bwds/__init__.py @@ -19,11 +19,8 @@ import sys import time from collections import OrderedDict, namedtuple -from importlib import import_module import traceback -# TODO: User argparse -from revscoring.datasources import Datasource, revision_oriented from revscoring.extractors.api import Extractor from revscoring.features import wikitext @@ -181,10 +178,6 @@ def read_rev_pages(f): yield int(rev_id), int(page_id) -def import_from_path(path): - return import_module(path) - - def cache_parse(pathes, num_res): if not pathes.strip(): pathes = 'words_db.txt,bad_edits_words.txt,no_docs.txt' @@ -194,7 +187,7 @@ def cache_parse(pathes, num_res): bot.parse_bad_edits(num_res) -def bot_gen(rev_pages, language, api_url): +def bot_gen(rev_pages, api_url): session = Session(api_url) extractor = Extractor(session) diff --git a/editquality/bwds/tests/test_bwds.py b/editquality/bwds/tests/test_bwds.py index 86da579..6c6cfd5 100644 --- a/editquality/bwds/tests/test_bwds.py +++ b/editquality/bwds/tests/test_bwds.py @@ -1,14 +1,10 @@ from deltas import Token -from editquality.bwds import import_from_path, cache_parse, Edit, Bot, read_rev_pages, bot_gen, EditNamedTuple +from editquality.bwds import cache_parse, Edit, Bot, read_rev_pages, bot_gen EDITS = [Edit(1, {'one': 1, 'two': 2}, False), Edit(2, {'three': 3}, True), Edit(3, {'one': 5, 'four': 1}, False)] -def test_import_from_path(): - import_from_path('revscoring.languages.english') - - def test_cache_parse(): cache_parse( 'editquality/bwds/tests/words_db.txt,' @@ -20,14 +16,14 @@ def test_cache_parse(): def test_bot_gen_empty(): en_api_url = 'https://en.wikipedia.org/w/api.php' - assert list(bot_gen([], 'TODO', en_api_url)) == [] + assert list(bot_gen([], en_api_url)) == [] def test_bot_gen(): a_revision_id = 979192243 pasta_page_id = 23871 en_api_url = 'https://en.wikipedia.org/w/api.php' - generated, = bot_gen([(a_revision_id, pasta_page_id)], '', en_api_url) + generated, = bot_gen([(a_revision_id, pasta_page_id)], en_api_url) assert generated.id == a_revision_id assert Token('unleavened', type='word') in generated.added_words assert not generated.reverted diff --git a/editquality/utilities/bad_words_detection_system.py b/editquality/utilities/bad_words_detection_system.py index 7a64fc7..7ab7450 100755 --- a/editquality/utilities/bad_words_detection_system.py +++ b/editquality/utilities/bad_words_detection_system.py @@ -4,23 +4,21 @@ python3 bad_words_detection_system.py --rev-pages:f.txt --api:https://en.wikipedia.org/w/api.php - --language:revscoring.languages.english Use cache: python3 bad_words_detection_system.py --cache: """ import sys 
-from editquality.bwds import Bot, cache_parse, import_from_path, read_rev_pages, bot_gen +from editquality.bwds import Bot, cache_parse, read_rev_pages, bot_gen +# TODO: Use argparse def handle_args(): args = {} for arg in sys.argv[1:]: if arg.startswith('--rev-pages:'): args['--rev-pages'] = arg[len('--rev-pages:'):] - elif arg.startswith('--language:'): - args['--language'] = arg[len('--language:'):] elif arg.startswith('--api:'): args['--api'] = arg[len('--api:'):] elif arg.startswith('--cache:'): @@ -43,13 +41,8 @@ def main(): return rev_pages = read_rev_pages(open(args['--rev-pages'])) - if args['--language'] is not None: - language = import_from_path(args['--language']) - else: - language = None - api_url = args['--api'] - gen = bot_gen(rev_pages, language, api_url) + gen = bot_gen(rev_pages, api_url) bot = Bot() bot.parse_edits(gen) bot.parse_bad_edits(num_res) From f56c1f7d46729aa8192abd09f10a65da64edf14b Mon Sep 17 00:00:00 2001 From: paulkernfeld Date: Wed, 23 Sep 2020 10:33:47 -0400 Subject: [PATCH 19/19] Fix flake8 --- editquality/bwds/__init__.py | 34 ++++++++++++++++++++--------- editquality/bwds/tests/test_bwds.py | 9 ++++++-- 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/editquality/bwds/__init__.py b/editquality/bwds/__init__.py index 526415b..b0af7a6 100644 --- a/editquality/bwds/__init__.py +++ b/editquality/bwds/__init__.py @@ -61,7 +61,9 @@ def __init__(self, words_cache=None, bad_words_cache=None, no_docs=None): self.bad_words_db = {} self.bad_counter = 0 if bool(bad_words_cache) != bool(words_cache): - raise ValueError("bad_words_cache should be defined if and only words_cache is defined") + raise ValueError( + "bad_words_cache should be defined if and only words_cache is " + "defined") if words_cache: self.cache = True self.initiate_cache(words_cache, bad_words_cache, no_docs) @@ -110,11 +112,11 @@ def parse_bad_edits(self, numbers_to_show=10): def tf_idf(self, word): tf = math.log(self.bad_edits.added_words[word]) + 1 - idf = math.log(self.counter/self.words_db[word]) - return tf*idf + idf = math.log(self.counter / self.words_db[word]) + return tf * idf def idf(self, word): - return math.log(self.counter/self.words_db[word]) + return math.log(self.counter / self.words_db[word]) def show_results(self, numbers_to_show): print("Showing %d results" % numbers_to_show) @@ -192,21 +194,34 @@ def bot_gen(rev_pages, api_url): extractor = Extractor(session) for revision_id, page_id in rev_pages: - api_result = session.get(action='query', titles='Main Page', prop='revisions', rvlimit=500, rvprop='sha1|ids') + api_result = session.get( + action='query', + titles='Main Page', + prop='revisions', + rvlimit=500, + rvprop='sha1|ids' + ) revisions = next(iter(api_result['query']['pages'].values()))['revisions'] - revisions = [revision for revision in revisions if 'sha1hidden' not in revision] + revisions = [ + revision for revision in revisions if 'sha1hidden' not in revision] sys.stderr.write(".") sys.stderr.flush() try: reverted_revision_ids = set() # Detect reverted status - for revert in mwreverts.detect((revision['sha1'], revision) for revision in revisions): + for revert in mwreverts.detect( + (revision['sha1'], revision) for revision in revisions + ): for reverted in revert.reverteds: reverted_revision_ids.add(reverted['revid']) - added_words = set(extractor.extract(revision_id, wikitext.revision.diff.datasources.words_added)) - yield Edit(revision_id, added_words, revision_id in reverted_revision_ids) + added_words = set(extractor.extract( + revision_id, 
wikitext.revision.diff.datasources.words_added + )) + yield Edit( + revision_id, added_words, revision_id in reverted_revision_ids + ) except KeyboardInterrupt: sys.stderr.write("\n^C Caught. Exiting...") @@ -217,4 +232,3 @@ def bot_gen(rev_pages, api_url): sys.stderr.write("\n") sys.stderr.write("\n") - diff --git a/editquality/bwds/tests/test_bwds.py b/editquality/bwds/tests/test_bwds.py index 6c6cfd5..3c12b7f 100644 --- a/editquality/bwds/tests/test_bwds.py +++ b/editquality/bwds/tests/test_bwds.py @@ -2,7 +2,11 @@ from editquality.bwds import cache_parse, Edit, Bot, read_rev_pages, bot_gen -EDITS = [Edit(1, {'one': 1, 'two': 2}, False), Edit(2, {'three': 3}, True), Edit(3, {'one': 5, 'four': 1}, False)] +EDITS = [ + Edit(1, {'one': 1, 'two': 2}, False), + Edit(2, {'three': 3}, True), + Edit(3, {'one': 5, 'four': 1}, False) +] def test_cache_parse(): @@ -64,6 +68,7 @@ def dump_toy_data(): def test_dump(): - # Calling both tests from here because we want to ensure they're not run concurrently + # Calling both tests from here because we want to ensure they're not run + # concurrently dump_empty() dump_toy_data()
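
A note on the revert labeling used by bot_gen above: it relies on mwreverts.detect, which flags
revisions whose SHA1 checksum reappears later in a page's history (an identity revert). The step
can be exercised without any API round-trip. This is a minimal sketch with fabricated revision
dicts; the revids and checksums are made up for illustration and are not from any real page:

    import mwreverts

    # Fabricated history: revision 3 restores revision 1's checksum,
    # so revision 2 is the one that got reverted.
    revisions = [
        {'revid': 1, 'sha1': 'aaa'},
        {'revid': 2, 'sha1': 'bbb'},
        {'revid': 3, 'sha1': 'aaa'},
    ]

    reverted_ids = set()
    for revert in mwreverts.detect((r['sha1'], r) for r in revisions):
        for reverted in revert.reverteds:
            reverted_ids.add(reverted['revid'])

    print(reverted_ids)  # {2}

This mirrors the loop inside bot_gen, which then marks an Edit as reverted when its revision id
lands in that set.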
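
The scoring that Bot.parse_bad_edits applies to each word can also be checked in isolation:
tf_idf treats every processed edit as a document and the pooled reverted edits as the "bad"
document. The sketch below reproduces the arithmetic from Bot.tf_idf and Bot.idf with made-up
counts (n_docs, bad_count, and docs_with_word are hypothetical, not taken from any real run):

    import math

    # Hypothetical counts, for illustration only.
    n_docs = 1000         # Bot.counter: total edits processed
    bad_count = 8         # bad_edits.added_words[word]: times the word was added in reverted edits
    docs_with_word = 20   # words_db[word]: edits whose added words include this word

    tf = math.log(bad_count) + 1             # as in Bot.tf_idf
    idf = math.log(n_docs / docs_with_word)  # as in Bot.idf
    print(tf * idf)  # ~12.0 here; larger means more strongly tied to reverted edits

A rare word that shows up mostly in reverted edits gets a high score, while a word common to all
edits gets an idf near zero, which is what show_results and show_results2 sort on.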
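
End to end, main() in bad_words_detection_system.py wires these pieces together. The sketch below
inlines that flow against the module state at the end of this series; rev_pages.txt is a
hypothetical input file where each line is either a bare rev_id or a tab-separated rev_id/page_id
pair, the format test_read_rev_pages exercises:

    from editquality.bwds import Bot, bot_gen, read_rev_pages

    with open('rev_pages.txt') as f:  # hypothetical file name
        rev_pages = list(read_rev_pages(f))

    # bot_gen is a generator; the API calls happen as parse_edits consumes it.
    edits = bot_gen(rev_pages, 'https://en.wikipedia.org/w/api.php')

    bot = Bot()
    bot.parse_edits(edits)    # tally added words for reverted vs. other edits
    bot.parse_bad_edits(10)   # score by tf-idf and write the top ten words out

Note that the output locations are hardcoded in the module (base_file_path and the words_db.txt
family of cache files), so running this sketch as-is assumes those paths are writable.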