-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprocessing_lib.py
94 lines (74 loc) · 3.6 KB
/
processing_lib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import glob
import os
import time
from operator import itemgetter

from article_lib import *
from mitie_lib import *
def process_folder_of_articles(in_folder, out_file, fakes, batch_size=1000, start=0):
    """Process every ``*.xml`` file of a folder in fixed-size batches.

    Each batch is handed to ``process_batch`` together with ``out_file`` and
    ``fakes``. Progress and a rough estimate of the remaining time are
    printed after every batch.

    :param in_folder: folder searched (non-recursively) for ``*.xml`` files.
    :param out_file: results path, passed through to ``process_batch``.
    :param fakes: passed through unchanged to ``process_batch``.
    :param batch_size: maximum number of articles per batch.
    :param start: index of the first article to process, e.g. to resume an
        interrupted run.
    """
    print("Started processing folder %s" % in_folder)
    # os.path.join instead of a hard-coded backslash keeps this portable.
    # The list is sorted because glob order is unspecified and `start` only
    # makes sense as a resume offset when the order is stable across runs.
    articles = sorted(glob.glob(os.path.join(in_folder, "*.xml")))
    length = len(articles)
    print("Found articles: %i." % length)

    all_fakes = 0
    for batch_start in range(start, length, batch_size):
        batch_num = batch_start // batch_size + 1
        print("Processing batch #%i..." % batch_num)
        # Separate name so the `start` parameter is not clobbered.
        started_at = time.time()

        # Slicing clamps at the end of the list, so the last batch may be short.
        batch = articles[batch_start:batch_start + batch_size]
        fakes_found = process_batch(batch, out_file, fakes)
        all_fakes += fakes_found
        print("Found %i fakes. Results saved in file: %s. " % (fakes_found, out_file))

        batch_time = time.time() - started_at
        # Articles still to do after this batch; never negative, and a
        # trailing partial batch counts as a whole one (ceiling division).
        remaining = max(0, length - batch_start - batch_size)
        batches_more = (remaining + batch_size - 1) // batch_size
        left_time = batch_time * batches_more
        print("Batch #%i finished. Time spent: %s. To finish: %s"
              % (batch_num, str(batch_time), str(left_time)))

    print("Folder %s done. %i fakes found." % (in_folder, all_fakes))
def process_batch(batch, out_file, fakes):
    """Score one batch of articles against every fake and append to a CSV.

    The batch is loaded with ``read_list_of_articles_as_dataset``. For every
    row of ``fakes`` and for both the ``Title`` and ``Body`` columns,
    ``assign_fake_score`` is called and its ``(score, text)`` result is
    stored in the columns ``fake_<Id>_<title|body>_score`` and
    ``fake_<Id>_<title|body>_text``. The ``BinaryRelationId``, ``Title`` and
    ``Body`` columns are dropped before writing.

    NOTE: this is Python 2 code (it relies on the ``unicode`` builtin).

    :param batch: list of article paths, passed to
        ``read_list_of_articles_as_dataset``.
    :param out_file: CSV path; appended to without a header if it already
        exists, otherwise created with a header.
    :param fakes: object providing ``iterrows()`` whose rows expose the keys
        ``Id``, ``Subject``, ``Object`` and ``Predicate``.
    :return: number of articles (as ``int``) with at least one non-zero score.
    """
    df = read_list_of_articles_as_dataset(batch)

    score_columns = []
    for _index, fake in fakes.iterrows():
        subject_text = fake["Subject"]
        object_text = fake["Object"]
        predicate = fake["Predicate"]
        for source, suffix in (("Title", "_title"), ("Body", "_body")):
            prefix = "fake_%s%s" % (fake["Id"], suffix)
            # One pass over the column; the (score, text) pairs are split
            # right away instead of being stored in a temporary tuple column.
            results = [assign_fake_score(value, subject_text, object_text, predicate)
                       for value in df[source]]
            df[prefix + "_score"] = [result[0] for result in results]
            df[prefix + "_text"] = [unicode(result[1], errors='replace') for result in results]
            score_columns.append(prefix + "_score")

    # An article counts once, however many of its scores are non-zero.
    # With no fakes there are no score columns, so the count is simply 0.
    if score_columns:
        fakes_found = int((df[score_columns] != 0).any(axis=1).sum())
    else:
        fakes_found = 0

    # remove extra data
    for column in ("BinaryRelationId", "Title", "Body"):
        del df[column]

    # append to existing file or create new
    if os.path.exists(out_file):
        with open(out_file, 'a') as f:
            df.to_csv(f, header=False, index=False, encoding='utf8')
    else:
        df.to_csv(out_file, index=False, encoding='utf8')
    return fakes_found
# selects the best result
def assign_fake_score(text, subject_text, object_text, predicate, min_length=5):
    """Return the best-scoring relation found in ``text`` as ``(score, text)``.

    ``find_binary_relation_in_text`` is run on the upper-cased, lower-cased
    and unchanged text. Of all relations returned, the one with the largest
    element at index 2 wins; on ties the earliest found is kept.

    :param text: text to search; may be ``None``.
    :param subject_text: passed through to ``find_binary_relation_in_text``.
    :param object_text: passed through to ``find_binary_relation_in_text``.
    :param predicate: passed through to ``find_binary_relation_in_text``.
    :param min_length: texts shorter than this are not searched at all
        (previously a hard-coded 5).
    :return: ``(0, "")`` when the text is missing, too short, or yields no
        relation; otherwise ``(relation[2], extracted_text)`` where
        ``extracted_text`` comes from ``extract_text_between_entities``
        applied to the relation's first two elements and the original text.
    """
    if text is None or len(text) < min_length:
        return 0, ""

    relations = []
    for variant in (text.upper(), text.lower(), text):
        relations.extend(
            find_binary_relation_in_text(variant, subject_text, object_text, predicate, False))
    if not relations:
        return 0, ""

    # max() keeps the first of several equal maxima, exactly like taking
    # element 0 after a stable descending sort.
    best_relation = max(relations, key=itemgetter(2))
    # NOTE(review): the relation may come from a case-changed variant but is
    # applied to the original text - assumes positions are unaffected by
    # upper()/lower(); confirm against mitie_lib.
    relation_text = extract_text_between_entities(best_relation[0], best_relation[1], text)
    return best_relation[2], relation_text