single_core_run.py
"""Single-core run of the abstract-to-Wikidata triple alignment pipeline."""
import os
import sys
import argparse
from timeit import default_timer

from pipeline.entitylinker import *
from pipeline.triplealigner import *
from pipeline.datareader import WikiDataAbstractsDataReader
from pipeline.writer import JsonWriter, JsonlWriter, OutputSplitter, NextFile
from pipeline.coreference import *
from utils.triplereader import *
from pipeline.filter import *
__START_DOC__ = 0  # start reading from this document number
__CORES__ = 7  # not used in this single-core run
parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
                                 formatter_class=argparse.RawDescriptionHelpFormatter,
                                 description=__doc__)
parser.add_argument("--input", default='text/zh',
                    help="input folder with the extracted wiki abstracts")
parser.add_argument("--output", default='./out/zh',
                    help="output folder for the aligned JSONL files")
parser.add_argument("--input_triples", default='data/zh/wikidata-triples-zh-subj.csv',
                    help="Wikidata triples file (.csv) or database (.db)")
parser.add_argument("--language", default='zh',
                    help="language to use (e.g. zh, en, es, ca, it)")
args = parser.parse_args()
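# Example invocation (using the defaults shown above):
#   python single_core_run.py --input text/zh --output ./out/zh \
#       --input_triples data/zh/wikidata-triples-zh-subj.csv --language zh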
# Read the Wikipedia abstracts dataset
reader = WikiDataAbstractsDataReader(args.input)
# Optional: WikidataSpotlightEntityLinker (DBpedia Spotlight with a DBpedia-to-Wikidata URI mapping)
# link = WikidataSpotlightEntityLinker('./datasets/wikidata/dbpedia-wikidata-sameas-dict.csv', support=10, confidence=0.4)
main_ent_lim = MainEntityLimiter()    # drop documents whose main entity is not annotated
min_ent_lim = EntityLimiter(2, 100)   # require between 2 and 100 linked entities
min_trip_lim = MinTriplesLimiter(1)   # require at least 1 aligned triple
# min_trip_lim = TriplesLimiter(5, 500)
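# Wikidata classes (P31 values) whose articles are skipped, e.g. disambiguation pages,
# list articles, and given/family names.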
filter_entities = ['Q4167410', 'Q13406463', 'Q18340514', 'Q12308941', 'Q11879590', 'Q101352']
# trip_read = TripleSPARQLReader('./datasets/wikidata/wikidata-triples.csv')
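# Load the Wikidata triples either from a database (.db) or from a CSV file,
# depending on the input file extension.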
if args.input_triples.endswith('.db'):
    trip_read = TripleDBReader(args.input_triples, args.language)
else:
    trip_read = TripleCSVReader(args.input_triples, args.language)
Salign = SimpleAligner(trip_read)
# prop = WikidataPropertyLinker('./datasets/wikidata/wikidata-properties.csv')
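# Pick a spaCy model for the target language (only needed if DateLinkerSpacy is
# enabled below); unknown languages fall back to the multilingual xx_ent_wiki_sm model.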
if args.language == 'zh':
    spacy_model = 'zh_core_web_sm'
elif args.language == 'en':
    spacy_model = 'en_core_web_sm'
elif args.language in ('es', 'ca'):
    spacy_model = 'es_core_news_sm'
elif args.language == 'it':
    spacy_model = 'it_core_news_sm'
else:
    spacy_model = 'xx_ent_wiki_sm'
# date = DateLinkerSpacy(spacy_model)
date = DateLinkerRegex(args.language)
# SPOalign = SPOAligner(trip_read)
NSalign = NoSubjectAlign(trip_read)
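# Write the aligned documents as JSONL, split across output files
# (filesize=5000, starting at __START_DOC__).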
writer = JsonlWriter(args.output, "rebel", filesize=5000, startfile=__START_DOC__)
def reading_documents():
    # Read documents and apply any non-parallelizable processing (unused by the
    # single-core main loop below, which calls reader.read_documents() directly).
    for d in reader.read_documents():
        # d = date.run(d)  # SUTime is not parallelizable
        yield d
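# Per-document pipeline: skip filtered entity classes, link dates, apply the entity
# and triple limiters, align triples, and write the surviving documents.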
def process_document(d):
    # Skip documents whose main entity is an instance of a filtered class
    if trip_read.get_exists(d.uri, 'P31', filter_entities):
        return None
    d = date.run(d)  # link dates in the text
    if not main_ent_lim.run(d):
        return None
    if not min_ent_lim.run(d):
        return None
    d = NSalign.run(d)  # no-subject alignment against the loaded triples
    d = Salign.run(d)   # simple subject-object alignment
    if not min_trip_lim.run(d):
        return None
    writer.run(d)
    del d
    # print("Error processing document %s" % d.title)
if __name__ == '__main__':
    interval_start = default_timer()
    for d in reader.read_documents():
        process_document(d)
    print(f'Finished in {default_timer() - interval_start:.2f} s')