-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathword_error_rate.py
105 lines (89 loc) · 5.16 KB
/
word_error_rate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
"""
@author Kiettiphong Manovisut
References:
https://en.wikipedia.org/wiki/Word_error_rate
https://www.github.com/mission-peace/interview/wiki
"""
import string
import numpy
class WER:
def __init__(self, human_transcript=None, system_transcript=None, wer_html=None):
self.human_transcript = human_transcript
self.system_transcript = system_transcript
self.r = self.text_format(filename=self.human_transcript) # baseline reference
self.h = self.text_format(filename=self.system_transcript) # hypothesis, system generated
self.wer_html = wer_html
def text_format(self, filename=None):
"""Method to take in two file paths, and break into lists"""
with open(filename, 'r', encoding='utf-8') as f:
a = f.readlines()
# Take out punctuation
text_string = a[0]
converted = text_string.translate(str.maketrans('', '', string.punctuation))
# Convert to lowercase
converted = converted.lower()
# Convert to split
stripped_values = converted.split(' ')
return stripped_values
def print_to_html(self, d):
#filename = "diff.html"
x = len(self.r)
y = len(self.h)
html = '<html><body><head><meta charset="utf-8"></head>' \
'<style>.g{background-color:#0080004d}.r{background-color:#ff00004d}.y{background-color:#ffa50099}</style>'
while True:
if x == 0 or y == 0:
break
if self.r[x - 1] == self.h[y - 1]:
x = x - 1
y = y - 1
html = '%s ' % self.h[y] + html
elif d[x][y] == d[x - 1][y - 1] + 1: # substitution
x = x - 1
y = y - 1
html = '<span class="y">%s(%s)</span> ' % (self.h[y], self.r[x]) + html
elif d[x][y] == d[x - 1][y] + 1: # deletion
x = x - 1
html = '<span class="r">%s</span> ' % self.r[x] + html
elif d[x][y] == d[x][y - 1] + 1: # insertion
y = y - 1
html = '<span class="g">%s</span> ' % self.h[y] + html
else:
print('\nWe got an error.')
break
html += '</body></html>'
with open(self.wer_html, 'w') as f:
f.write(html)
#f.close()
print("Printed comparison to: {0}".format(self.wer_html))
def get_word_error_rate(self):
"""Given two list of strings how many word error rate(insert, delete or substitution)."""
d = numpy.zeros((len(self.r) + 1) * (len(self.h) + 1), dtype=numpy.uint16)
d = d.reshape((len(self.r) + 1, len(self.h) + 1))
for i in range(len(self.r) + 1):
for j in range(len(self.h) + 1):
if i == 0:
d[0][j] = j
elif j == 0:
d[i][0] = i
for i in range(1, len(self.r) + 1):
for j in range(1, len(self.h) + 1):
if self.r[i - 1] == self.h[j - 1]:
d[i][j] = d[i - 1][j - 1]
else:
substitution = d[i - 1][j - 1] + 1
insertion = d[i][j - 1] + 1
deletion = d[i - 1][j] + 1
d[i][j] = min(substitution, insertion, deletion)
result = float(d[len(self.r)][len(self.h)]) / len(self.r) * 100
self.print_to_html(d)
return result
if __name__ == "__main__":
w = WER(human_transcript='./../data-feeds/human-transcripts/martha-human.txt',
system_transcript='./../data-feeds/system-transcripts/martha.txt',
wer_html='./../data-feeds/wer-outputs/martha.html')
response = w.get_word_error_rate()
print(f"Word error rate: {response:<6.6}%")#response)
# Sample of how the lists look before being compared for WER
#r = ['In', 'computational', 'linguistics', 'and', 'computer', 'science', ',', 'edit', 'distance', 'is', 'a', 'way', 'of', 'quantifying', 'how', 'dissimilar', 'two', 'strings', 'are', 'to', 'one', 'another', 'by', 'counting', 'the', 'minimum', 'number', 'of', 'operations', 'required', 'to', 'transform', 'one', 'string', 'into', 'the', 'other.', 'Edit', 'distances', 'find', 'applications', 'in', 'natural', 'language', 'processing,', 'where', 'automatic', 'spelling', 'correction', 'can', 'determine', 'candidate', 'corrections', 'for', 'a', 'misspelled', 'word', 'by', 'selecting', 'words', 'from', 'a', 'dictionary', 'that', 'have', 'a', 'low', 'distance', 'to', 'the', 'word', 'in', 'question']
#h = ['In', 'linguistics', 'and', 'computer', 'science', 'theory', ',', 'edit', 'distance', 'iss', 'a', 'way', 'of', 'quantifying', 'how', 'dissimilar', 'the', 'two', 'string', 'is', 'to', 'one', 'another', 'by', 'counting', 'the', 'number', 'of', 'operations', 'required', 'to', 'transform', 'one', 'string', 'into', 'the', 'other.', 'Edit', 'distances', 'find', 'applications', 'in', 'natural', 'language', 'processing,', 'where', 'automatic', 'spelling', 'correction', 'can', 'determine', 'candidate', 'corrections', 'for', 'a', 'misspelled', 'word', 'by', 'selecting', 'words', 'from', 'a', 'dictionary', 'that', 'have', 'a', 'low', 'distance', 'to', 'the', 'words', 'in', 'question']