Commit 1f6aeeb: Major improvements

Ladsgroup committed May 7, 2016
1 parent 0363bb2 commit 1f6aeeb
Showing 2 changed files with 118 additions and 60 deletions.
42 changes: 23 additions & 19 deletions bad_words_detection_system.py
@@ -9,7 +9,8 @@
https://github.com/halfak/Objective-Revision-Evaluation-Service/blob/master/ores/label_reverted.py
>>> from bad_words_detection_system import *
>>> edits = [Edit(1, {'one':1, 'two': 2}, False), Edit(2, {'three':3}, True), Edit(3, {'one':5, 'four': 1}, False)]
>>> edits = [Edit(1, {'one':1, 'two': 2}, False), Edit(2, {'three':3}, True),
... Edit(3, {'one':5, 'four': 1}, False)]
>>> bot = Bot()
>>> bot.parse_edits(edits)
>>> bot.parse_bad_edits()
@@ -25,18 +26,18 @@
import sys
import traceback
import json
import codecs
import time
from importlib import import_module
from collections import OrderedDict
# TODO: User argparse
# import argparse

from revscoring.extractors import APIExtractor
from revscoring.datasources import diff

Comment from @halfak (Member), May 7, 2016:

You should be able to solve(wikitext.revision.diff.datasources.words_added, cache={revision_oriented.revision.text: ..., revision_oriented.revision.parent.text: ...}) to get the same wordlist that revscoring/ORES works with. It would be great if you could try that and point out any issues so we can repair them as appropriate for the whole system.
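As a rough sketch of that suggestion (not code from this commit): the import paths below follow revscoring's documented layout and should be checked against the version actually installed, and current_text / parent_text are hypothetical variables holding the wikitext of the revision and of its parent.

    # Hedged sketch of the reviewer's suggestion: let revscoring's dependency
    # solver compute words_added from a cache of the two revision texts, so the
    # bot uses the same tokenization that revscoring/ORES uses.
    from revscoring.dependencies import solve
    from revscoring.datasources import revision_oriented
    from revscoring.features import wikitext

    words_added = solve(
        wikitext.revision.diff.datasources.words_added,
        cache={
            revision_oriented.revision.text: current_text,        # new revision text
            revision_oriented.revision.parent.text: parent_text,  # parent revision text
        })

If that resolves cleanly, the bot would be scoring the same word lists that revscoring/ORES extracts, instead of approximating them with its own tokenizers.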


from mw import api
from mw.lib import reverts

#from revscoring.extractors import APIExtractor
#from revscoring.datasources import diff
base_file_path = '/data/project/dexbot/pywikibot-core/something_'


class Edit(object):
@@ -71,37 +72,39 @@ def __init__(self, words_cache=None, bad_words_cache=None, no_docs=None):
self.cache = False

def initiate_cache(self, words_cache, bad_words_cache, no_docs):
with codecs.open(words_cache, 'r', 'utf-8') as f:
with open(words_cache, 'r') as f:
self.words_db = json.loads(f.read())
with codecs.open(bad_words_cache, 'r', 'utf-8') as f:
with open(bad_words_cache, 'r') as f:
self.bad_edits.added_words = json.loads(f.read())
with codecs.open(no_docs, 'r', 'utf-8') as f:
with open(no_docs, 'r') as f:
self.counter = int(f.read())

def parse_edits(self, edits):
for edit in edits:
#Since edits can be gen and len doesn't mean there
# Since edits can be gen and len doesn't mean there
self.counter += 1
if edit.reverted:
for word in edit.added_words:
self.bad_edits.added_words[word] = \
self.bad_edits.added_words.get(word, 0) + \
edit.added_words[word]
self.bad_words_db[word] = self.bad_words_db.get(word, 0) + 1
self.bad_words_db[word] = (
self.bad_words_db.get(word, 0) + 1)
self.bad_counter += 1
continue
for word in edit.added_words:
self.words_db[word] = self.words_db.get(word, 0) + 1

def parse_bad_edits(self, numbers_to_show=10):
self.possible_bad_words = {}
#self.possible_bad_words2 = {}
self.stop_words = {}
if not self.cache:
self.counter += 1
for word in self.bad_edits.added_words:
if not self.cache:
self.words_db[word] = self.words_db.get(word, 0) + 1
if 'sh' in word or 'ch' in word:
continue
self.possible_bad_words[word] = self.tf_idf(word)
self.stop_words[word] = self.idf(word)
if numbers_to_show:
@@ -124,14 +127,15 @@ def show_results(self, numbers_to_show):
for word in self.possible_bad_words:
if self.possible_bad_words[word] >= lim:
res[word] = self.possible_bad_words[word]
res = OrderedDict(sorted(res.items(), key=lambda t: t[1], reverse=True))
res = OrderedDict(
sorted(res.items(), key=lambda t: t[1], reverse=True))
res_text = []
for word in res:
res_text.append(word)
res_text.sort()
res_text = "#" + '\n#'.join(res_text)
self.bad_words_res_text = res_text
with codecs.open('/data/project/dexbot/pywikibot-core/something_%s.txt' % time.time(), 'w', 'utf-8') as f:
with open('%s_%s.txt' % (base_file_path, time.time()), 'w') as f:
f.write(res_text)

def show_results2(self, numbers_to_show):
@@ -149,18 +153,18 @@ def show_results2(self, numbers_to_show):
res_text.sort()
res_text = "#" + '\n#'.join(res_text)
self.stop_words_res_text = res_text
with codecs.open('/data/project/dexbot/pywikibot-core/something2_%s.txt' % time.time(), 'w', 'utf-8') as f:
with open('%s2_%s.txt' % (base_file_path, time.time()), 'w') as f:
f.write(res_text)

def dump(self):
new_db = {}
for word in self.bad_edits.added_words:
new_db[word] = self.words_db[word]
with codecs.open('words_db.txt', 'w', 'utf-8') as f:
with open('words_db.txt', 'w') as f:
f.write(json.dumps(new_db))
with codecs.open('bad_edits_words.txt', 'w', 'utf-8') as f:
with open('bad_edits_words.txt', 'w') as f:
f.write(json.dumps(self.bad_edits.added_words))
with codecs.open('no_docs.txt', 'w', 'utf-8') as f:
with open('no_docs.txt', 'w') as f:
f.write(json.dumps(self.counter))


@@ -209,8 +213,8 @@ def handle_args():

def bot_gen(rev_pages, language, api_url):

#session = api.Session(api_url)
#extractor = APIExtractor(session, language=language)
session = api.Session(api_url)
extractor = APIExtractor(session, language=language)

Comment from @halfak (Member), May 7, 2016:

This shouldn't work. An extractor hasn't taken "language" as an argument in a long time.

Reply from @Ladsgroup (Author, Member), May 7, 2016:

We don't use this method, I did it just to shut up flake8 :D
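For context on the "language" point, a rough sketch of how an API extractor is typically built in recent revscoring releases is shown below. The mwapi dependency and the exact module path are assumptions, not part of this commit; api_url is the endpoint already passed to bot_gen().

    # Hedged sketch: recent revscoring extractors are constructed from a
    # session alone; no `language` argument is passed.
    import mwapi
    from revscoring.extractors import api

    session = mwapi.Session(api_url, user_agent="bad-words-detection bot")
    extractor = api.Extractor(session)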


for rev_id, page_id in rev_pages:
sys.stderr.write(".")
136 changes: 95 additions & 41 deletions dump_based_detection.py
100755 → 100644
@@ -8,17 +8,49 @@
https://gist.github.com/he7d3r/f99482f4f54f97895ccb/9205f3271fe8daa2f694f4ce3ba9b29213dbad6c
"""
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
import sys
from mw.lib import reverts
from pywikibot import xmlreader
import pywikibot
import re
import time
import regex

from bad_words_detection_system import Edit, Bot

stemmer = SnowballStemmer('portuguese')
cache = {}

languages_by_size = [
'en', 'sv', 'nl', 'de', 'fr', 'war', 'ru', 'ceb', 'it', 'es', 'vi',
'pl', 'ja', 'pt', 'zh', 'uk', 'ca', 'fa', 'no', 'sh', 'fi', 'ar',
'id', 'cs', 'sr', 'ro', 'ko', 'hu', 'ms', 'tr', 'min', 'eo', 'kk',
'eu', 'sk', 'da', 'bg', 'he', 'lt', 'hy', 'hr', 'sl', 'et', 'uz',
'gl', 'nn', 'vo', 'la', 'simple', 'el', 'hi', 'az', 'th', 'ka',
'ce', 'oc', 'be', 'mk', 'mg', 'new', 'ur', 'tt', 'ta', 'pms', 'cy',
'tl', 'lv', 'bs', 'te', 'be-tarask', 'br', 'ht', 'sq', 'jv', 'lb',
'mr', 'is', 'ml', 'zh-yue', 'bn', 'af', 'ba', 'ga', 'pnb', 'cv',
'fy', 'lmo', 'tg', 'sco', 'my', 'yo', 'an', 'ky', 'sw', 'io', 'ne',
'gu', 'scn', 'bpy', 'nds', 'ku', 'ast', 'qu', 'als', 'su', 'pa',
'kn', 'ckb', 'ia', 'mn', 'nap', 'bug', 'arz', 'bat-smg', 'wa',
'zh-min-nan', 'am', 'map-bms', 'gd', 'yi', 'mzn', 'si', 'fo',
'bar', 'vec', 'nah', 'sah', 'os', 'sa', 'roa-tara', 'li', 'hsb',
'pam', 'mrj', 'mhr', 'se', 'mi', 'ilo', 'hif', 'bcl', 'gan', 'rue',
'ps', 'glk', 'nds-nl', 'bo', 'vls', 'diq', 'fiu-vro', 'bh', 'xmf',
'tk', 'gv', 'sc', 'co', 'csb', 'hak', 'km', 'kv', 'vep', 'zea',
'crh', 'zh-classical', 'frr', 'eml', 'ay', 'stq', 'udm', 'wuu',
'nrm', 'kw', 'rm', 'szl', 'so', 'koi', 'as', 'lad', 'fur', 'mt',
'dv', 'gn', 'dsb', 'ie', 'pcd', 'sd', 'lij', 'cbk-zam', 'cdo',
'ksh', 'ext', 'mwl', 'gag', 'ang', 'ug', 'ace', 'pi', 'pag', 'nv',
'lez', 'frp', 'sn', 'kab', 'ln', 'myv', 'pfl', 'xal', 'krc', 'haw',
'rw', 'pdc', 'kaa', 'to', 'kl', 'arc', 'nov', 'kbd', 'av', 'bxr',
'lo', 'bjn', 'ha', 'tet', 'tpi', 'na', 'pap', 'lbe', 'jbo', 'ty',
'mdf', 'roa-rup', 'wo', 'tyv', 'ig', 'srn', 'nso', 'kg', 'ab',
'ltg', 'zu', 'om', 'za', 'chy', 'cu', 'rmy', 'tw', 'tn', 'chr',
'mai', 'pih', 'got', 'xh', 'bi', 'sm', 'ss', 'rn', 'ki', 'pnt',
'bm', 'iu', 'ee', 'lg', 'ts', 'fj', 'ak', 'ik', 'st', 'sg', 'ff',
'dz', 'ny', 'ch', 'ti', 've', 'ks', 'tum', 'cr', 'gom', 'lrc',
'azb', 'or'
]
cjk = (
r'\u4E00-\u62FF' + # Unified Ideographs
r'\u6300-\u77FF' +
@@ -47,18 +79,28 @@
'zh': cjk,
'ja': cjk,
'pt': u'A-Za-záàâãçéêíóôõúüÁÀÂÃÇÉÊÍÓÔÕÚ',
'tr': u'A-Za-zÇĞİÖŞÜçğıöşü',
'tr': u'A-Za-zÇĞİÖŞÜçğıöşüâîûÂÎÛ',
'fa': u'ابپتثجچحخدذرزژسشصآضطظعغفقکگلمنوهی‌يك',
'fr': u'A-Za-zÀàÂâÆæÄäÇçÉéÈèÊêËëÎîÏïÔôŒœÖöÙùÛûÜüŸÿ',
'de': u'A-Za-zÄäÖöÜüß',
'es': u'A-Za-zÑñéÉüÜóÓ',
'uk': u'АаБбВвГ㥴ДдЕеЄєЖжЗзИиІіЇїЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЬьЮю'
u'Яя',
'uk': u'АаБбВвГ㥴ДдЕеЄєЖжЗзИиІіЇїЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЬ'
u'ьЮюЯя',
'pl': u'AaĄąBbCcĆćDdEeĘęFfGgHhIiJjKkLlŁłMmNnŃńOoÓóPpRrSsŚśTtUuWwYyZzŹźŻż',
'he': u'למנסעפצקרשתםןףץאבגדהוזחטיכך',
'hy': u'ԱաԲբԳգԴդԵեԶզԷէԸըԹթԺժԻիԼլԽխԾծԿկՀհՁձՂղՃճՄմՅյՆնՇշՈոՉչՊպՋջՌռՍսՎվՏտՐրՑ'
u'ցՈՒՈւուՒւՓփՔքևևՕօՖֆ',
'hy': u'ԱաԲբԳգԴդԵեԶզԷէԸըԹթԺժԻիԼլԽխԾծԿկՀհՁձՂղՃճՄմՅյՆնՇշՈոՉչՊպՋջՌռՍսՎվՏտՐր'
u'ՑցՈՒՈւուՒւՓփՔքևևՕօՖֆ',
'vi': u'AaĂăÂâBbCcDdĐđEeÊêGgHhIiKkLlMmNnOoÔôƠơPpQqRrSsTtUuƯưVvXxYy',
'ur': u'ابپتٹثجچحخدڈذرڑزژسشصضطظعغفقکگلمنوهھءیےٹڈڑ‌آّْیٰوَُِٗ',
'uz': 'A-Za-zʻ',
'sv': u'A-Za-zÅÄÖåäö',
'hu': u'A-Za-zËëÉéÓóÖöŐőÚúÜüŰűÁá',
'cs': u'A-Za-zÁáČčĎďÉéĚěÍíŇňÓóŘřŠšŤťÚúŮůÝýŽž',
'hi': u'कखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसहळक्षज्ञ:अपआपाइपिईपीउपुऊपूऋपृॠप'
u'ॄऌपॢॡपॣएपेऐपैओपोऔपौअंपंअःपः',
'no': u'A-Za-zÆØÅæøåéèêóòâôüáàé',
'ta': u'௰௱௲௳௴௵௶௷௸௹௺ௗௐொோௌ்ெேைீுூாிரறலளழவஶஷஸஹணதநனபம'
u'யஐஒஓஔகஙசஜஞடஂஃஅஆஇஈஉஊஎஏ',
}


@@ -68,11 +110,12 @@ def lower(a, lang):
return a.lower()


def page_info(dump, lang, stemming=False):
global tokenizer, stemmer
def page_info(dump, lang):
global tokenizer
c = 1
di_old = []
di = []
nombre = '3,' if lang not in ['ja', 'zh'] else '1'
for entry in dump.parse():
if entry.ns != '0':
continue
@@ -91,29 +134,25 @@ def page_info(dump, lang, stemming=False):
history = {}
detector = reverts.Detector(radius=3)
for revision in di_old:
revision.text = pywikibot.textlib.removeLanguageLinks(
revision.text = re.sub(
r'\[\[(%s)\:' % '|'.join(languages_by_size),
'',
revision.text)
stems = set()
tokenizer = RegexpTokenizer(r'[%s]{3,}' %
chars.get(lang, chars['en']))
for w in tokenizer.tokenize(revision.text):
if stemming:
if len(w) < 3:
continue
elif len(w) == 3:
stems.add(w.lower())
continue
else:
if w not in cache:
cache[w] = stemmer.stem(w)
stems.add(cache[w].lower())
else:
stems.add(lower(w, lang))
words = set()
if lang in chars:
token_pattern = r'[%s]{%s}' % (chars[lang], nombre)
tokenizer = RegexpTokenizer(token_pattern)
tokens = tokenizer.tokenize(revision.text)
else:
token_pattern = r'\p{alpha}+'
tokens = regex.findall(token_pattern, revision.text)
for w in tokens:
words.add(lower(w, lang))
if firstRev:
prevIntersection = stems
prevIntersection = words
firstRev = False
added = stems - prevIntersection
prevIntersection = stems
added = words - prevIntersection
prevIntersection = words
history[revision.revisionid] = Edit(
revision.revisionid, added, False)
rev = detector.process(revision.text,
@@ -126,8 +165,9 @@


def run(dumps):
number = 0
number = 500000
counter = 0
start_time = time.time()
for casee in dumps:
lang = casee.split('/')[-1].split('wiki')[0]
dump = xmlreader.XmlDump(casee, True)
@@ -137,31 +177,45 @@ def run(dumps):
if number and counter > number:
break
bot.parse_edits(case.values())

bot.parse_bad_edits(250)
bot.dump()
print(time.time() - start_time)
site = pywikibot.Site('meta', fam='meta')
prefix = 'Research:Revision scoring as a service/Word lists/'
page = pywikibot.Page(site, prefix + lang)
page = pywikibot.Page(
site, 'Research:Revision scoring as a service/Word lists/' + lang)
try:
text = page.get()
except pywikibot.NoPage:
text = ("{{Research:Revision scoring as a service/template/word list "
"data\n |lang=%s\n |gen=250\n |badwords=-\n |informal=-"
"\n |stopwords=-\n |dictionary=-\n |stemmer=-\n |contact="
"\n |features=no\n |labels=requested\n |campaign=no\n "
"|needs=-\n |list-generated=\n |list-stop=\n}}\n" % lang)
except:
return False
new_text = text
if re.search(r'\|\s*?list\-generated\s*?\=\s*?', text):
if re.search(r'\|\s*?list\-generated\s*?\=\s*?(\||\}\})', text):
new_text = re.sub(r'(\|\s*?list\-generated\s*?\=\s*?)(\||\}\})',
r'\1%s\2' % bot.bad_words_res_text, new_text)
new_text = re.sub(
r'(\|\s*?list\-generated\s*?\=\s*?)(\||\}\})',
r'\1%s\2' % bot.bad_words_res_text,
new_text)
else:
new_text = re.sub(r'\}\}', r'|list-generated=%s\n}}' %
bot.bad_words_res_text, new_text)
new_text = re.sub(
r'\}\}',
r'|list-generated=%s\n}}' % bot.bad_words_res_text,
new_text)
if re.search(r'\|\s*?list\-stop\s*?\=\s*?', text):
if re.search(r'\|\s*?list\-stop\s*?\=\s*?(\||\}\})', text):
new_text = re.sub(r'(\|\s*?list\-stop\s*?\=\s*?)(\||\}\})',
r'\1%s\2' % bot.stop_words_res_text, new_text)
new_text = re.sub(
r'(\|\s*?list\-stop\s*?\=\s*?)(\||\}\})',
r'\1%s\2' % bot.stop_words_res_text,
new_text)
else:
new_text = re.sub(r'\}\}', r'|list-stop=%s\n}}'
% bot.stop_words_res_text, new_text)
new_text = re.sub(
r'\}\}',
r'|list-stop=%s\n}}' % bot.stop_words_res_text,
new_text)
if new_text != text:
page.text = new_text
page.save('Bot: update results')
