-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathmakethes.awk
executable file
·97 lines (85 loc) · 2.9 KB
/
makethes.awk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#Todo:
# Group by [1], [2] etc. (Do we want that when e.g. there are two [1] within the same *heiti section, or only across sections?)
# Add word definitions
# if there is more than one and each is a single word?
# if they are not nouns, including proper nouns, male and female given names (and more than one)?
# add alternative spellings, "see also", (idioms, "derived meanings" and etymologies).
# What about words that belong to more than one part of speech, e.g. "dýr"?
# Before any input is read, write the literal line "UTF-8" as the first line
# of the output (presumably the encoding header of the generated thesaurus
# file -- confirm against the consumer of this output).
BEGIN {
    print "UTF-8"
}
# Main rule: scan the input line by line and collect, for every page, the
# related words listed in its Icelandic ({{-is-}}) section under the
# {{-samheiti-}}, {{-andheiti-}}, {{-yfirheiti-}} and {{-undirheiti-}} headers.
#
# State carried between records:
#   word          title of the page currently being read
#   icelandic     1 while inside the {{-is-}} language section, else 0
#   inList        1 while reading the list that follows a relation header
#   explaination  suffix appended to every word of the current list
#   nMeanings     number of list lines collected so far for `word`
#   lines         collected output lines, joined with "\n|"
#
# The list is tracked with the `inList` flag instead of a getline loop, so
# every line (including the one that ends a list) is still checked for
# <title>, language and relation headers, and end of input cannot cause an
# endless loop.
{
    if (match($0, /<title>.*<\/title>/)) {
        # Remember where the tags were found before doing anything else.
        titleStart = RSTART;
        titleLength = RLENGTH;
        # A new page starts: emit what was gathered for the previous one.
        flushEntry();
        # Text between "<title>" (7 chars) and "</title>" (8 chars); taken
        # from the match position so the indentation of the line is irrelevant.
        word = substr($0, titleStart + 7, titleLength - 15);
        icelandic = 0;
        inList = 0;
    } else if (match($0, /{{-..-}}/)) {
        # Two-letter language header; only the Icelandic section is used.
        if (match($0, /{{-is-}}/)) {
            icelandic = 1;
        } else {
            icelandic = 0;
        }
        inList = 0;
    }
    if (icelandic) {
        if (inList && NF > 1) {
            # List items have at least two fields; collect this one.
            addListLine($0);
        } else {
            # Either no list is open or this line ends it. In both cases the
            # line itself may be the header that opens the next list.
            inList = 0;
            if (match($1, /{{-samheiti-}}/)) {
                explaination = "";
                inList = 1;
            } else if (match($1, /{{-andheiti-}}/)) {
                explaination = " (andheiti)";
                inList = 1;
            } else if (match($1, /{{-yfirheiti-}}/)) {
                explaination = " (yfirheiti)";
                inList = 1;
            } else if (match($1, /{{-undirheiti-}}/)) {
                explaination = " (undirheiti)";
                inList = 1;
            }
        }
    }
}

# The final page has no following <title> line to trigger its output.
END {
    flushEntry();
}

# Print the entry gathered for `word` (if it has any list lines) as
# "word|count" followed by "|" and the collected lines, then reset the
# per-page accumulators.
function flushEntry() {
    if (nMeanings > 0) {
        print word"|"nMeanings;
        print "|"lines;
    }
    nMeanings = 0;
    lines = "";
}

# Clean one raw list line and, unless nothing is left of it, append it
# (followed by the current `explaination` suffix) to `lines`.
function addListLine(raw,    thes) {
    thes = cleanEntry(raw);
    if (thes != "") {
        nMeanings++;
        if (lines == "") {
            lines = thes explaination;
        } else {
            lines = lines "\n|" thes explaination;
        }
    }
}

# Strip wiki markup from one list line and turn its top-level ", " separators
# into "|" (each preceded by the current `explaination` suffix).
# Returns the cleaned text, which may be empty.
function cleanEntry(thes) {
    # remove the optional listing marker at the beginning, e.g. ":[1] "
    # todo: make use of this.
    sub(/:+(\[([[:alnum:]]|,|-)+\])? */, "", thes);
    # remove link template: drop "{{tengill|" and open a parenthesis at the
    # next "|"
    if (match(thes, /{{tengill\|/)) {
        sub(/{{tengill\|/, "", thes);
        sub(/\|/, " (", thes);
    }
    # remove [1] after word
    gsub(/ \[[0-9]\]/, "", thes);
    # remove links and brackets (for "[[target|label" only the label is kept)
    gsub(/\[(([[:alnum:]]| )*\|)?|\]/, "", thes);
    # remove html tags and text inside.
    gsub(/ *<[[:alpha:]]+>.*<\/[[:alpha:]]+>/, "", thes);
    # replace quotes (\047) and curly brackets with parentheses.
    while (match(thes, /[\047{}]/)) {
        sub(/[\047{]+/, "(", thes);
        sub(/[\047}]+/, ")", thes);
    }
    # remove colon
    gsub(/:\)/, ")", thes);
    sub(/[^:(]+: ?/, "", thes);
    # remove double parentheses possibly added by the previous replacements.
    gsub(/\(+/, "(", thes);
    gsub(/\)+/, ")", thes);
    # replace ", " with "|" unless the comma is inside parentheses; commas
    # inside parentheses are parked as a placeholder character meanwhile.
    while (match(thes, /[^,\(\)]*, /)) {
        # substr() positions are 1-based. A start of 0 is not portable: some
        # awks count position 0 against the length and return one character
        # fewer, which would cut off the last letter before the comma.
        if (RSTART > 1 && substr(thes, RSTART-1, 1) == "(") {
            thes = substr(thes, 1, RSTART+RLENGTH-3) "𐌸" substr(thes, RSTART+RLENGTH);
        } else {
            thes = substr(thes, 1, RSTART+RLENGTH-3) explaination"|" substr(thes, RSTART+RLENGTH);
        }
    }
    gsub(/𐌸/, ", ", thes); # re-add commas inside parentheses
    return thes;
}