-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlexer.py
313 lines (278 loc) · 12.5 KB
/
lexer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
from config import *
from shared import LexerError, ImplementationError, Token, ConfigError
class Lexer:
def __init__(self, program: str):
self.program = program
self.pos = 0
self.line = 0
self.col = 0
def current_char(self) -> str:
return self.program[self.pos]
def peekChar(self) -> str | None:
return self.program[self.pos + 1] if self.pos + 1 < len(self.program) else None
def peekDigit(self) -> bool:
return self.peekChar().isdigit() if self.peekChar() else False
def checkEOS(self) -> bool:
return self.pos >= len(self.program)
def eat_char(self, chars: list[str] = []) -> str:
char = self.current_char()
if chars and char not in chars:
raise self.getError(
f"Unexpected character. Expected one of: {str(chars)} but got {char}"
)
self.pos += 1
self.col += 1
if char == "\n":
self.line += 1
self.col = 0
return char
def charIsOkForWord(self, char: str) -> bool:
return char.isalnum() or char in OK_IN_WORD
def getError(self, message: str) -> LexerError:
return LexerError(message, self.line, self.col, self.program)
def getImplementationError(self, message: str) -> ImplementationError:
return ImplementationError(
f"{message} This is unexpected and probably a bug in the lexer.",
self.line,
self.col,
self.program,
)
def comment(self):
# Eat the stuff after the comment mark
while not self.checkEOS() and self.current_char() != "\n":
self.eat_char()
def unescapeChar(self):
try:
self.eat_char(["\\"])
except LexerError:
raise self.getImplementationError(
"Expected escape sequence but didn't get one."
)
char = self.eat_char()
if char == "n":
return "\n"
elif char == "t":
return "\t"
elif char == "r":
return "\r"
elif char == "\\":
return "\\"
elif char == "'":
return "'"
elif char == '"':
return '"'
elif char == "`":
return "`"
else:
raise self.getError(f"Unknown escape sequence: \\{char}")
def getString(self, delimiter: str) -> str:
string = ""
try:
self.eat_char([delimiter])
except LexerError:
raise self.getImplementationError(
"Expected string start delimiter but didn't get one."
)
while not self.checkEOS() and self.current_char() != delimiter:
if self.current_char() == "\n":
raise self.getError("Line break before the end of string literal")
if self.current_char() == "\\":
string += self.unescapeChar()
else:
string += self.eat_char()
try:
self.eat_char([delimiter])
except IndexError:
raise self.getError("Unterminated string")
return string
def skipWhitespace(self):
while not self.checkEOS() and self.current_char() in IGNORE:
self.eat_char(IGNORE)
def getNumber(self):
number = ""
try:
number = self.eat_char(["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"])
except LexerError:
raise self.getImplementationError("Expected number but didn't get one.")
encountered_dot = False
# While there's a digit OR a dot followed by a digit
while not self.checkEOS() and (
self.current_char().isdigit()
or (self.current_char() == "." and not encountered_dot and self.peekDigit())
):
if self.current_char() == ".":
encountered_dot = True
number += self.eat_char(["."])
else:
number += self.eat_char(
["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
)
if not encountered_dot:
return Token("literal_integer", int(number), self.line, self.col)
return Token("literal_float", float(number), self.line, self.col)
def getWord(self):
word = ""
if not self.charIsOkForWord(self.current_char()):
raise self.getImplementationError("Expected word but didn't get one.")
word += self.eat_char()
while not self.checkEOS() and self.charIsOkForWord(self.current_char()):
word += self.eat_char()
return word
def symbolThatStartsWith(self, string: str) -> str | None:
# Return the symbol that starts with the given string
for symbol in SYMBOLS:
if symbol.startswith(string):
return symbol
return None
def symbolExists(self, string: str) -> bool:
# Return true if the given string is a symbol
for symbol in SYMBOLS:
if symbol == string:
return True
return False
def charIsOkForSymbol(self, char: str) -> bool:
return not self.charIsOkForWord(char) and char not in RESERVED_CHARS
def getLongest(self, list: list[str]) -> str:
longest = None
for item in list:
if longest is None or len(item) > len(longest):
longest = item
return longest
def getSymbol(self) -> str:
symbol = ""
symbols = []
while (
not self.checkEOS()
and self.symbolThatStartsWith(symbol + self.current_char())
and self.charIsOkForSymbol(self.current_char())
):
symbol += self.eat_char()
if self.symbolExists(symbol):
symbols.append(symbol)
if len(symbols) == 0:
raise self.getError(
f"Expected symbol but didn't get one ('{symbol}' not in SYMBOLS)"
)
return self.getLongest(symbols)
def internal_name_to_token(self, name: str) -> Token:
if name == "comment":
while not self.checkEOS() and self.current_char() != "\n":
self.eat_char()
elif name in ASSIGNMENT_OPERATORS:
return Token("assignment_operator", name, self.line, self.col)
elif name in BINARY_OPERATORS:
if name in UNARY_OPERATORS:
return Token("ambiguous_operator", name, self.line, self.col)
return Token("binary_operator", name, self.line, self.col)
elif name in UNARY_OPERATORS:
return Token("unary_operator", name, self.line, self.col)
elif name in OPEN_BRACKETS:
return Token("open_bracket", BRACKET_TYPES[name], self.line, self.col)
elif name in CLOSE_BRACKETS:
return Token("close_bracket", BRACKET_TYPES[name], self.line, self.col)
elif name in MISC_SYMBOLS:
return Token("misc_symbol", name, self.line, self.col)
elif name in BOOLS:
return Token("literal_boolean", name == "true", self.line, self.col)
elif name in KEYWORDS:
return Token("keyword", name, self.line, self.col)
else:
raise self.getError(
f"Reserved word or symbol not in any of the known token types (assignment_operator, binary_operator, unary_operator, open_bracket, close_bracket, misc_symbol, literal_boolean, keyword). Add the internal name to one of those lists or modify getSymbolToken to include a new list"
)
def getNextToken(self) -> Token:
if self.checkEOS():
return Token("EOS", "", self.line, self.col)
current_char = self.current_char()
if current_char in IGNORE:
# Ignore whitespaces
self.skipWhitespace()
return self.getNextToken()
elif current_char in STRING_DELIMITERS:
string = self.getString(current_char)
return Token("literal_string", string, self.line, self.col)
elif current_char.isdigit():
return self.getNumber()
elif self.charIsOkForSymbol(current_char):
# If char IS ok for symbol, it's NOT ok for word, so we can assume it's a symbol
internal_name = SYMBOLS[self.getSymbol()]
else:
if not self.charIsOkForWord(current_char):
raise self.getError(
f"Unexpected character. Expected a word but got {current_char}"
)
lexeme = self.getWord()
if lexeme in RESERVED_WORDS:
internal_name = RESERVED_WORDS[lexeme]
else:
return Token("identifier", lexeme, self.line, self.col)
token = self.internal_name_to_token(internal_name)
return token if token else self.getNextToken()
def getArrDiff(
self, arr1: list[str], arr2: list[str]
) -> tuple[list[str], list[str]]:
not_in_arr1 = []
for item in arr2:
if item not in arr1:
not_in_arr1.append(item)
not_in_arr2 = []
for item in arr1:
if item not in arr2:
not_in_arr2.append(item)
return (not_in_arr1, not_in_arr2)
def listEqual(self, list1: list[str], list2: list[str]) -> bool:
diff = self.getArrDiff(list1, list2)
return len(diff[0]) == 0 and len(diff[1]) == 0
def preCheck(self):
for symbol in SYMBOLS:
for char in symbol:
if not self.charIsOkForSymbol(char):
raise ConfigError(
f"CONFIG ERROR: Character '{char}' is not allowed in symbols, but occurs in '{symbol}' in the SYMBOLS list."
)
for word in RESERVED_WORDS.keys():
for char in word:
if not self.charIsOkForWord(char):
raise ConfigError(
f"CONFIG ERROR: Character '{char}' is not allowed in keywords, but occurs in '{word}' in the RESERVED_WORDS list."
)
if len(BOOLS) != 2:
raise ConfigError("CONFIG ERROR: BOOLS list must have two elements.")
if not self.listEqual(BINARY_OPERATORS, BINARY_OPERATOR_PRECEDENCE.keys()):
diff = self.getArrDiff(BINARY_OPERATORS, BINARY_OPERATOR_PRECEDENCE.keys())
raise ConfigError(
f"CONFIG ERROR: BINARY_OPERATOR_PRECEDENCE's keys must match BINARY_OPERATORS (BINARY_OPERATORS is missing {diff[0]}, BINARY_OPERATOR_PRECEDENCE is missing {diff[1]})."
)
if not self.listEqual(ASSIGNMENT_OPERATORS, ASSIGNMENT_OPERATIONS.keys()):
diff = self.getArrDiff(ASSIGNMENT_OPERATORS, ASSIGNMENT_OPERATIONS.keys())
raise ConfigError(
f"CONFIG ERROR: ASSIGNMENT_OPERATIONS's keys must match ASSIGNMENT_OPERATORS (ASSIGNMENT_OPERATORS is missing {diff[0]}, ASSIGNMENT_OPERATIONS is missing {diff[1]})."
)
if not self.listEqual(BINARY_OPERATORS, BINARY_OPERATIONS.keys()):
diff = self.getArrDiff(BINARY_OPERATORS, BINARY_OPERATIONS.keys())
raise ConfigError(
f"CONFIG ERROR: BINARY_OPERATIONS's keys must match BINARY_OPERATORS (BINARY_OPERATORS is missing {diff[0]}, BINARY_OPERATIONS is missing {diff[1]})."
)
if not self.listEqual(UNARY_OPERATORS, UNARY_OPERATIONS.keys()):
diff = self.getArrDiff(UNARY_OPERATORS, UNARY_OPERATIONS.keys())
raise ConfigError(
f"CONFIG ERROR: UNARY_OPERATIONS's keys must match UNARY_OPERATORS (UNARY_OPERATORS is missing {diff[0]}, UNARY_OPERATIONS is missing {diff[1]})."
)
if not self.listEqual(LANGUAGE_TYPES, LANGUAGE_TYPE_CONVERSIONS.keys()):
diff = self.getArrDiff(LANGUAGE_TYPES, LANGUAGE_TYPE_CONVERSIONS.keys())
raise ConfigError(
f"CONFIG ERROR: LANGUAGE_TYPE_CONVERSIONS's keys must match LANGUAGE_TYPES (LANGUAGE_TYPES is missing {diff[0]}, LANGUAGE_TYPE_CONVERSIONS is missing {diff[1]})."
)
if not self.listEqual(LANGUAGE_TYPES, LANGUAGE_TYPE_CHECKS.keys()):
diff = self.getArrDiff(LANGUAGE_TYPES, LANGUAGE_TYPE_CHECKS.keys())
raise ConfigError(
f"CONFIG ERROR: LANGUAGE_TYPE_CHECKS's keys must match LANGUAGE_TYPES (LANGUAGE_TYPES is missing {diff[0]}, LANGUAGE_TYPE_CHECKS is missing {diff[1]})."
)
def tokenize(self) -> list[Token]:
self.preCheck()
tokens = []
# The not tokens is to make sure we don't index an empty list
while not tokens or tokens[-1].type != "EOS":
token = self.getNextToken()
tokens.append(token)
return tokens