-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathbuild_confusion_matrix.py
129 lines (104 loc) · 4.82 KB
/
build_confusion_matrix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# -*- coding: utf-8 -*-
import argparse
import numpy as np
import pickle as pkl
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
# Directory that holds the pickled constant files loaded by the script entry point.
CONSTANTS_PATH = 'constants'

# Human-readable diacritic class labels used on the confusion-matrix axes.
# Index 0 is the "no diacritic" label; get_diacritic_class maps position i of
# the diacritic list it receives to entry i + 1 of this list.
CLASSES_NAMES = [
    'No Diacritic',
    'Fatha',
    'Fathatan',
    'Damma',
    'Dammatan',
    'Kasra',
    'Kasratan',
    'Sukun',
    'Shadda',
    'Shadda + Fatha',
    'Shadda + Fathatan',
    'Shadda + Damma',
    'Shadda + Dammatan',
    'Shadda + Kasra',
    'Shadda + Kasratan',
]
def get_diacritic_class(idx, line, arabic_letters, diacritic_classes):
    """Return the CLASSES_NAMES label for the diacritic(s) that follow line[idx].

    Args:
        idx: Position in ``line`` of the character being classified.
        line: The text being scanned.
        arabic_letters: Not used by this function; kept so the signature
            stays compatible with existing callers.
        diacritic_classes: Sequence of known diacritic strings. Entry ``i`` of
            this sequence is reported as ``CLASSES_NAMES[i + 1]``.

    Returns:
        ``CLASSES_NAMES[0]`` when the next character is missing or is not a
        known diacritic; otherwise the label of the one or two diacritics
        that follow ``line[idx]``.
    """
    def label_for(diacritic):
        # Slot 0 of CLASSES_NAMES is the "no diacritic" label, hence the +1.
        return CLASSES_NAMES[diacritic_classes.index(diacritic) + 1]

    if idx + 1 >= len(line) or line[idx + 1] not in diacritic_classes:
        # No diacritic
        return CLASSES_NAMES[0]

    first = line[idx + 1]
    if idx + 2 >= len(line) or line[idx + 2] not in diacritic_classes:
        # Only one diacritic
        return label_for(first)

    pair = first + line[idx + 2]
    # Try the double diacritic as written, then in reversed order.
    for candidate in (pair, pair[::-1]):
        try:
            return label_for(candidate)
        except (ValueError, IndexError):
            # ValueError: the combination is not a known class.
            # IndexError: it is known but has no label in CLASSES_NAMES.
            # Anything else (e.g. KeyboardInterrupt) must propagate.
            continue

    # Otherwise consider only the first diacritic
    return label_for(pair[0])
def get_diacritics_classes(line, arabic_letters, diacritic_classes):
    """Collect the diacritic class label of every Arabic letter in ``line``.

    Characters that are not in ``arabic_letters`` are skipped. The result has
    one entry per matching character, in the order they appear in ``line``.
    """
    return [
        get_diacritic_class(position, line, arabic_letters, diacritic_classes)
        for position, symbol in enumerate(line)
        if symbol in arabic_letters
    ]
def plot_confusion_matrix(y_true, y_pred, subplot,
                          normalize=False,
                          cmap=plt.cm.Blues):
    """Draw and show a confusion matrix of ``y_true`` against ``y_pred``.

    Args:
        y_true: Sequence of reference class labels.
        y_pred: Sequence of predicted class labels.
        subplot: Not used by this function; kept so the signature stays
            compatible with existing callers.
        normalize: When true, each row is divided by its total so cells show
            the fraction of that true class. Rows with no true samples are
            left at 0 instead of producing NaN.
        cmap: Matplotlib colormap used for the heat map.

    Returns:
        The matplotlib Axes the matrix was drawn on.
    """
    # Only use the labels that appear in the data, ordered as in CLASSES_NAMES
    present = set(unique_labels(y_true, y_pred))
    classes = [class_name for class_name in CLASSES_NAMES if class_name in present]

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred, labels=classes)
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class that only appears in y_pred has an all-zero row; skip the
        # division there so the row stays 0 rather than becoming NaN (which
        # would also turn the colour threshold below into NaN).
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        print('Normalized confusion matrix')
    else:
        print('Confusion matrix, without normalization')

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes)
    ax.set_xlabel('Predicted label', fontsize=20)
    ax.set_ylabel('True label', fontsize=20)

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right',
             rotation_mode='anchor')
    # Set the tick label size through tick_params; the per-tick ``label``
    # attribute is deprecated and absent from recent Matplotlib releases.
    ax.tick_params(axis='both', which='major', labelsize=16)

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha='center', va='center', fontsize=14,
                    color='white' if cm[i, j] > thresh else 'black')
    plt.tight_layout()
    plt.show()
    return ax
if __name__ == '__main__':
    # Script entry point: reads an original text and two target texts, maps
    # every Arabic letter in each to its diacritic class, and shows one
    # normalized confusion matrix per target (original vs. target).
    parser = argparse.ArgumentParser(description='Builds DER Figure')
    parser.add_argument('-ofp', '--original-file-path', help='File path to original text', required=True)
    parser.add_argument('-stfp', '--small-target-file-path', help='File path to target text', required=True)
    parser.add_argument('-btfp', '--big-target-file-path', help='File path to target text', required=True)
    args = parser.parse_args()

    # NOTE(review): pickle.load can execute arbitrary code from the file, so
    # the constants directory must only contain trusted files.
    with open(CONSTANTS_PATH + '/ARABIC_LETTERS_LIST.pickle', 'rb') as file:
        ARABIC_LETTERS_LIST = pkl.load(file)
    with open(CONSTANTS_PATH + '/FFNN_CLASSES_MAPPING.pickle', 'rb') as file:
        CLASSES_LIST = list(pkl.load(file))

    # Read the texts as UTF-8 explicitly; the platform default encoding is
    # not guaranteed to decode Arabic text.
    with open(args.original_file_path, 'r', encoding='utf-8') as file:
        original_content = file.readlines()
    with open(args.small_target_file_path, 'r', encoding='utf-8') as file:
        small_target_content = file.readlines()
    with open(args.big_target_file_path, 'r', encoding='utf-8') as file:
        big_target_content = file.readlines()

    original_classes = list()
    small_target_classes = list()
    big_target_classes = list()

    # zip stops at the shortest file, so extra trailing lines in the longer
    # files are ignored.
    for original_line, small_target_line, big_target_line in zip(original_content, small_target_content, big_target_content):
        original_classes.extend(get_diacritics_classes(original_line, ARABIC_LETTERS_LIST, CLASSES_LIST))
        small_target_classes.extend(get_diacritics_classes(small_target_line, ARABIC_LETTERS_LIST, CLASSES_LIST))
        big_target_classes.extend(get_diacritics_classes(big_target_line, ARABIC_LETTERS_LIST, CLASSES_LIST))

    plot_confusion_matrix(original_classes, small_target_classes, 1, True)
    plot_confusion_matrix(original_classes, big_target_classes, 2, True)