-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdatamodel.puml
163 lines (145 loc) · 4.05 KB
/
datamodel.puml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
@startuml
'' Lexical category of a token: a qid together with a postag.
'' The postag is needed to distinguish forms,
'' because e.g. VERB and AUX link to the same qid.
entity lexical_category {
    + id INT PRIMARY UNIQUE
    + qid INT
    + postag TEXT
}
'' A language. iso_code holds the ISO 639-1 code
'' in the same form as in the yml, e.g. 'en'.
'' Declared with the entity keyword like every other table in this diagram.
entity language {
    + id INT PRIMARY AUTOINCREMENT
    + name_en TEXT NOT NULL UNIQUE
    + iso_code TEXT NOT NULL UNIQUE
    + qid INT NOT NULL UNIQUE
}
'' A provider, e.g. Riksdagsförvaltningen (Q10655176)
entity provider {
    + id INT PRIMARY AUTOINCREMENT
    + title TEXT NOT NULL
    + qid VARCHAR(30) NOT NULL UNIQUE
}
'' A collection, e.g. Riksdagens öppna data (Q108560253)
'' QID for this term: Q59294700
'' NOTE(review): provider presumably holds provider.id - no FK is declared here
entity collection {
    + id INT PRIMARY AUTOINCREMENT
    + title TEXT NOT NULL
    + qid VARCHAR(30) NOT NULL UNIQUE
    + provider INT NOT NULL
}
'' A dataset, e.g. departementsserien (Q123501464)
'' NOTE(review): collection presumably holds collection.id - no FK is declared here
entity dataset {
    + id INT PRIMARY AUTOINCREMENT
    + title TEXT NOT NULL
    + qid VARCHAR(30) NOT NULL UNIQUE
    + collection INT NOT NULL
}
'' A document inside a dataset.
'' Example (presumably the external_id): GNB465, see
'' https://www.riksdagen.se/sv/dokument-och-lagar/dokument/departementsserien/ds-1999-65-_gnb465/
entity document {
    + id INT PRIMARY AUTOINCREMENT
    + dataset INT NOT NULL
    + external_id TEXT NOT NULL
}
'' Sentences are detected by the spaCy NLP.
'' Each sentence carries its text, a unique uuid, and references
'' to a document, a score and a language.
entity sentence {
    + id INT PRIMARY AUTOINCREMENT
    + text TEXT NOT NULL
    + uuid TEXT NOT NULL UNIQUE
    + document INT NOT NULL
    + score INT NOT NULL
    + language INT NOT NULL
}
'' Linking table between sentence and rawtoken.
'' Both columns are marked PRIMARY.
entity rawtoken_sentence_linking {
    + sentence INT PRIMARY
    + rawtoken INT PRIMARY
}
'' Pair of lexeme id and form id.
'' These ids can be matched to forms later on.
entity lexeme_form {
    + lexeme INT PRIMARY
    + form INT PRIMARY
}
'' Linking table between rawtoken and lexeme_form.
'' All three columns are marked PRIMARY.
entity rawtoken_lexeme_form_linking {
    + rawtoken INT PRIMARY
    + lexeme INT PRIMARY
    + form INT PRIMARY
}
'' Raw tokens appear in sentences as output from the NLP and have a lexical category.
'' Each raw token is tied to one specific lexical category:
'' text + lexical_category + language = UNIQUE
'' NOTE(review): an earlier note called this a composite primary key, but the only
'' PRIMARY column declared below is id - confirm how the uniqueness is enforced.
entity rawtoken {
    + id INT PRIMARY AUTOINCREMENT
    + lexical_category INT
    + text TEXT NOT NULL
    + score INT NOT NULL
    + language INT NOT NULL
}
'' Scores live in their own table so that the value is not
'' duplicated in the database on every token and sentence.
entity score {
    + id INTEGER PRIMARY AUTOINCREMENT
    + score FLOAT NOT NULL UNIQUE
}
'' Normalized tokens are derived from raw tokens and have no lexical category.
'' They help users who want to look up variations of any given token.
'' Examples:
''   the form Hus           -> normalized token hus
''   the form statsminister -> normalized token statsminister
entity normtoken {
    + id INT PRIMARY AUTOINCREMENT
    + text TEXT
}
'' Linking table between raw tokens and normalized tokens.
'' Both columns are marked PRIMARY.
entity rawtoken_normtoken_linking {
    + rawtoken INT PRIMARY
    + normtoken INT PRIMARY
}
'' Holds the NER labels of spaCy:
'' a unique short label plus a longer description.
entity ner_label {
    + id SMALLINT PRIMARY
    + label VARCHAR(30) UNIQUE
    + description VARCHAR(255)
}
'' A unique NER occurrence.
'' E.g. label: Europe, ner_label_id -> id of 'LOC'
'' ner_label_id is SMALLINT so that it matches the type of
'' the referenced key ner_label(id).
entity entity {
    + id INT
    + label VARCHAR(255)
    + ner_label_id SMALLINT
    --
    + PK (id)
    + FK (ner_label_id) REFERENCES ner_label(id)
}
'' Linking table between sentence and entity.
'' Only one unique NER is stored per sentence: the same entity
'' appearing multiple times in a sentence is not recorded again.
entity sentence_entity_linking {
    + sentence INT
    + entity INT
    --
    + PK (sentence, entity)
}
' many to one (the crow's foot "}" sits on the "many" side)
' NOTE(review): the next three pairs are also modelled through linking
' tables in the many-to-many section - confirm the direct links are intended.
lexeme_form }-- rawtoken
normtoken }-- rawtoken
rawtoken }-- sentence
sentence }-- document
' provider chain, following the declared columns:
' document.dataset -> dataset.collection -> collection.provider
document }-- dataset
dataset }-- collection
collection }-- provider
' many to one towards shared lookup rows:
' many rows may reference the same label, score, language or lexical category
entity }-- ner_label
sentence }-- score
sentence }-- language
rawtoken }-- score
rawtoken }-- lexical_category
rawtoken }-- language
' many to many (via linking tables)
rawtoken }--{ rawtoken_normtoken_linking
normtoken }--{ rawtoken_normtoken_linking
rawtoken }--{ rawtoken_sentence_linking
sentence }--{ rawtoken_sentence_linking
rawtoken }--{ rawtoken_lexeme_form_linking
lexeme_form }--{ rawtoken_lexeme_form_linking
sentence_entity_linking }--{ sentence
entity }--{ sentence_entity_linking
@enduml