Skip to content

Commit

Permalink
init refactoring with basic python structure stuff.
Browse files Browse the repository at this point in the history
  • Loading branch information
halfak committed Mar 18, 2017
1 parent 1f6aeeb commit 53769bc
Show file tree
Hide file tree
Showing 10 changed files with 394 additions and 37 deletions.
Empty file added README.md
Empty file.
5 changes: 5 additions & 0 deletions bwds/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Re-export the package metadata defined in ``about.py`` at package level.
from .about import (__name__, __version__, __author__, __author_email__,
                    __description__, __license__, __url__)

# ``__all__`` has to list attribute *names* as strings.  Listing the values
# themselves (e.g. "0.0.1") makes ``from bwds import *`` raise AttributeError
# because the package has no attribute with such a name.
# ``__name__`` is deliberately not star-exported: doing so would overwrite the
# importing module's own ``__name__``.  It is still available as
# ``bwds.__name__``.
__all__ = ["__version__", "__author__", "__author_email__",
           "__description__", "__license__", "__url__"]
8 changes: 8 additions & 0 deletions bwds/about.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Package metadata, kept as plain module-level strings.

# Identity of the distribution.
__name__ = "bwds"
__version__ = "0.0.1"
__description__ = (
    "A library for performing automatic detection of the "
    "badwords added to Wikipedia articles"
)

# Where the project lives and how it is licensed.
__url__ = "https://github.com/wiki-ai/bwds"
__license__ = "MIT"

# Maintainer contact details.
__author__ = "Amir Sarabadani"
# NOTE(review): this value looks like a scrubbed placeholder rather than a
# real address -- confirm against the repository before releasing.
__author_email__ = "[email protected]"
47 changes: 47 additions & 0 deletions bwds/bwds.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""
This script provides access to a set of utilities for extracting features and
building edit quality predictors.
* process_api -- Processes a sample of revisions using the API
* process_dump -- Processes an XML dump
Usage:
bwds (-h | --help)
bwds <utility> [-h | --help]
Options:
-h | --help Prints this documentation
<utility> The name of the utility to run
"""
import sys
import traceback
from importlib import import_module


# Short usage summary written to stderr when the command line is malformed.
USAGE = """Usage:
bwds (-h | --help)
bwds <utility> [-h | --help]\n"""


def main():
    """Dispatch the command line to a utility module and run it.

    ``sys.argv[1]`` is taken as the name of a sub-module, imported as
    ``.utilities.<name>`` relative to the ``bwds`` package.  The remaining
    arguments (``sys.argv[2:]``) are handed to that module's ``main``.

    The process exits with status 1, after writing to stderr, when:

    * no utility name is given (usage summary);
    * ``-h`` / ``--help`` is the first argument (module documentation);
    * the first argument is any other option-like string (usage summary);
    * the utility module cannot be imported (traceback plus a message).
    """
    if len(sys.argv) < 2:
        sys.stderr.write(USAGE)
        sys.exit(1)

    first_arg = sys.argv[1]
    if first_arg in ("-h", "--help"):
        # ``__doc__`` is None when docstrings are stripped (``python -OO``);
        # fall back to the usage summary instead of failing with TypeError.
        help_text = __doc__ if __doc__ else USAGE
        sys.stderr.write(help_text + "\n")
        sys.exit(1)
    elif first_arg.startswith("-"):
        # Any other leading option is not a utility name.
        sys.stderr.write(USAGE)
        sys.exit(1)

    module_name = first_arg
    try:
        module = import_module(".utilities." + module_name,
                               package="bwds")
    except ImportError:
        # Show the full traceback: the ImportError may come from inside the
        # utility module itself rather than from a misspelled name.
        sys.stderr.write(traceback.format_exc())
        sys.stderr.write("Could not load utility {0}.\n".format(module_name))
        sys.exit(1)

    module.main(sys.argv[2:])
75 changes: 38 additions & 37 deletions dump_based_detection.py → bwds/dump_based_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,50 +7,51 @@
Some parts are copied from
https://gist.github.com/he7d3r/f99482f4f54f97895ccb/9205f3271fe8daa2f694f4ce3ba9b29213dbad6c
"""
from nltk.tokenize import RegexpTokenizer
import sys
from mw.lib import reverts
from pywikibot import xmlreader
import pywikibot
import re
import sys
import time

import regex
from mw.lib import reverts
from nltk.tokenize import RegexpTokenizer

from bad_words_detection_system import Edit, Bot
import pywikibot
from bad_words_detection_system import Bot, Edit
from pywikibot import xmlreader

# Module-level dict, created empty.  Its readers and writers are outside the
# visible part of this file -- presumably a memoization store (TODO confirm).
cache = {}

# List of language-code strings ('en', 'sv', 'nl', ...).
# NOTE(review): the name suggests the entries are ordered by wiki size, and
# the codes look like Wikipedia language-edition codes -- confirm both before
# relying on the ordering.
languages_by_size = [
'en', 'sv', 'nl', 'de', 'fr', 'war', 'ru', 'ceb', 'it', 'es', 'vi',
'pl', 'ja', 'pt', 'zh', 'uk', 'ca', 'fa', 'no', 'sh', 'fi', 'ar',
'id', 'cs', 'sr', 'ro', 'ko', 'hu', 'ms', 'tr', 'min', 'eo', 'kk',
'eu', 'sk', 'da', 'bg', 'he', 'lt', 'hy', 'hr', 'sl', 'et', 'uz',
'gl', 'nn', 'vo', 'la', 'simple', 'el', 'hi', 'az', 'th', 'ka',
'ce', 'oc', 'be', 'mk', 'mg', 'new', 'ur', 'tt', 'ta', 'pms', 'cy',
'tl', 'lv', 'bs', 'te', 'be-tarask', 'br', 'ht', 'sq', 'jv', 'lb',
'mr', 'is', 'ml', 'zh-yue', 'bn', 'af', 'ba', 'ga', 'pnb', 'cv',
'fy', 'lmo', 'tg', 'sco', 'my', 'yo', 'an', 'ky', 'sw', 'io', 'ne',
'gu', 'scn', 'bpy', 'nds', 'ku', 'ast', 'qu', 'als', 'su', 'pa',
'kn', 'ckb', 'ia', 'mn', 'nap', 'bug', 'arz', 'bat-smg', 'wa',
'zh-min-nan', 'am', 'map-bms', 'gd', 'yi', 'mzn', 'si', 'fo',
'bar', 'vec', 'nah', 'sah', 'os', 'sa', 'roa-tara', 'li', 'hsb',
'pam', 'mrj', 'mhr', 'se', 'mi', 'ilo', 'hif', 'bcl', 'gan', 'rue',
'ps', 'glk', 'nds-nl', 'bo', 'vls', 'diq', 'fiu-vro', 'bh', 'xmf',
'tk', 'gv', 'sc', 'co', 'csb', 'hak', 'km', 'kv', 'vep', 'zea',
'crh', 'zh-classical', 'frr', 'eml', 'ay', 'stq', 'udm', 'wuu',
'nrm', 'kw', 'rm', 'szl', 'so', 'koi', 'as', 'lad', 'fur', 'mt',
'dv', 'gn', 'dsb', 'ie', 'pcd', 'sd', 'lij', 'cbk-zam', 'cdo',
'ksh', 'ext', 'mwl', 'gag', 'ang', 'ug', 'ace', 'pi', 'pag', 'nv',
'lez', 'frp', 'sn', 'kab', 'ln', 'myv', 'pfl', 'xal', 'krc', 'haw',
'rw', 'pdc', 'kaa', 'to', 'kl', 'arc', 'nov', 'kbd', 'av', 'bxr',
'lo', 'bjn', 'ha', 'tet', 'tpi', 'na', 'pap', 'lbe', 'jbo', 'ty',
'mdf', 'roa-rup', 'wo', 'tyv', 'ig', 'srn', 'nso', 'kg', 'ab',
'ltg', 'zu', 'om', 'za', 'chy', 'cu', 'rmy', 'tw', 'tn', 'chr',
'mai', 'pih', 'got', 'xh', 'bi', 'sm', 'ss', 'rn', 'ki', 'pnt',
'bm', 'iu', 'ee', 'lg', 'ts', 'fj', 'ak', 'ik', 'st', 'sg', 'ff',
'dz', 'ny', 'ch', 'ti', 've', 'ks', 'tum', 'cr', 'gom', 'lrc',
'azb', 'or'
]
# List of language-code strings ('en', 'sv', 'nl', ...).
# NOTE(review): these look like Wikipedia language-edition codes; whether the
# order of the entries is significant cannot be told from here -- confirm
# before reordering or de-duplicating.
language_codes = [
'en', 'sv', 'nl', 'de', 'fr', 'war', 'ru', 'ceb', 'it', 'es', 'vi',
'pl', 'ja', 'pt', 'zh', 'uk', 'ca', 'fa', 'no', 'sh', 'fi', 'ar',
'id', 'cs', 'sr', 'ro', 'ko', 'hu', 'ms', 'tr', 'min', 'eo', 'kk',
'eu', 'sk', 'da', 'bg', 'he', 'lt', 'hy', 'hr', 'sl', 'et', 'uz',
'gl', 'nn', 'vo', 'la', 'simple', 'el', 'hi', 'az', 'th', 'ka',
'ce', 'oc', 'be', 'mk', 'mg', 'new', 'ur', 'tt', 'ta', 'pms', 'cy',
'tl', 'lv', 'bs', 'te', 'be-tarask', 'br', 'ht', 'sq', 'jv', 'lb',
'mr', 'is', 'ml', 'zh-yue', 'bn', 'af', 'ba', 'ga', 'pnb', 'cv',
'fy', 'lmo', 'tg', 'sco', 'my', 'yo', 'an', 'ky', 'sw', 'io', 'ne',
'gu', 'scn', 'bpy', 'nds', 'ku', 'ast', 'qu', 'als', 'su', 'pa',
'kn', 'ckb', 'ia', 'mn', 'nap', 'bug', 'arz', 'bat-smg', 'wa',
'zh-min-nan', 'am', 'map-bms', 'gd', 'yi', 'mzn', 'si', 'fo',
'bar', 'vec', 'nah', 'sah', 'os', 'sa', 'roa-tara', 'li', 'hsb',
'pam', 'mrj', 'mhr', 'se', 'mi', 'ilo', 'hif', 'bcl', 'gan', 'rue',
'ps', 'glk', 'nds-nl', 'bo', 'vls', 'diq', 'fiu-vro', 'bh', 'xmf',
'tk', 'gv', 'sc', 'co', 'csb', 'hak', 'km', 'kv', 'vep', 'zea',
'crh', 'zh-classical', 'frr', 'eml', 'ay', 'stq', 'udm', 'wuu',
'nrm', 'kw', 'rm', 'szl', 'so', 'koi', 'as', 'lad', 'fur', 'mt',
'dv', 'gn', 'dsb', 'ie', 'pcd', 'sd', 'lij', 'cbk-zam', 'cdo',
'ksh', 'ext', 'mwl', 'gag', 'ang', 'ug', 'ace', 'pi', 'pag', 'nv',
'lez', 'frp', 'sn', 'kab', 'ln', 'myv', 'pfl', 'xal', 'krc', 'haw',
'rw', 'pdc', 'kaa', 'to', 'kl', 'arc', 'nov', 'kbd', 'av', 'bxr',
'lo', 'bjn', 'ha', 'tet', 'tpi', 'na', 'pap', 'lbe', 'jbo', 'ty',
'mdf', 'roa-rup', 'wo', 'tyv', 'ig', 'srn', 'nso', 'kg', 'ab',
'ltg', 'zu', 'om', 'za', 'chy', 'cu', 'rmy', 'tw', 'tn', 'chr',
'mai', 'pih', 'got', 'xh', 'bi', 'sm', 'ss', 'rn', 'ki', 'pnt',
'bm', 'iu', 'ee', 'lg', 'ts', 'fj', 'ak', 'ik', 'st', 'sg', 'ff',
'dz', 'ny', 'ch', 'ti', 've', 'ks', 'tum', 'cr', 'gom', 'lrc',
'azb', 'or'
]
cjk = (
r'\u4E00-\u62FF' + # Unified Ideographs
r'\u6300-\u77FF' +
Expand Down
Loading

0 comments on commit 53769bc

Please sign in to comment.