-
Notifications
You must be signed in to change notification settings - Fork 39
Expand file tree
/
Copy pathtokenizer.py
More file actions
165 lines (115 loc) · 4.46 KB
/
tokenizer.py
File metadata and controls
165 lines (115 loc) · 4.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import re
# Single characters that end a word and can stand as a token on their own.
# NOTE: every entry needs its own trailing comma. Adjacent string literals are
# silently concatenated by Python, which would merge two symbols into one entry.
SPECIAL_CHARS = [
    '+', '-', '*', '/', '=', '^', '%',       # Math
    '[', ']', '(', ')', '{', '}', '<', '>',  # Brackets
    '!', '~', '#', '&', '|', '?',            # Logic
    '@', '$', '\\', ';', ',', '.', ':',      # Symbols
]
# Multi-character operators that must be kept together as a single token.
# Order matters: the token regex tries these alternatives left to right, so a
# longer operator has to come before any operator that is its prefix
# ('...' before '..').
# NOTE: every entry needs its own trailing comma. Adjacent string literals are
# silently concatenated by Python, which would merge two operators into one.
SPECIAL_STRINGS = [
    '...', '..', '::',
    '||', '&&',
    '==', '!=', '~=',
    '>=', '<=',
    '>>', '<<',
]
# Language keywords and literal names; is_word() never accepts these.
BUILTIN_WORDS = (
    "and break do else elseif end "
    "false for function goto if in "
    "local nil not or repeat return "
    "then true until while "
    "continue"  # 'continue' only in gLua
).split()

# Additional reserved names (presumably Garry's Mod globals -- confirm).
GM_WORDS = "ENT WEAPON SWEP GAMEMODE".split()

# Everything is_word() rejects by name.
RESERVED_WORDS = [*BUILTIN_WORDS, *GM_WORDS]
# A single capture group wrapping four alternatives, tried in this order:
#   1. a number with optional leading digits and required trailing digits,
#      ex: ".5", "0.5", "5"
#   2. a run of characters that are neither whitespace nor in SPECIAL_CHARS
#   3. one of the multi-character SPECIAL_STRINGS
#   4. a single character from SPECIAL_CHARS
TOKEN_PATTERN = re.compile(
    r"(\d*\.?\d+|[^\s{chars}]+|{strings}|[{chars}])".format(
        chars=re.escape("".join(SPECIAL_CHARS)),
        strings="|".join(re.escape(s) for s in SPECIAL_STRINGS),
    )
)
# Matches a whole string that contains no whitespace and no SPECIAL_CHARS.
WORD_PATTERN = re.compile(
    r"^[^\s{}]+$".format(re.escape("".join(SPECIAL_CHARS)))
)
# Tokens counted as opening a scope when searching for the end of a function.
SCOPE_IN = [
    "do",
    "then",
    "function",
]
# Tokens counted as closing a scope.
# 'elseif' is included because it comes with a second 'then'.
SCOPE_OUT = [
    "end",
    "elseif",
]
def tokenize(lua):
    """
    Split source text into a flat list of token strings.

    Every non-overlapping match of TOKEN_PATTERN becomes one token, in
    order of appearance. Whitespace is never part of a token.
    """
    # TOKEN_PATTERN has exactly one capture group spanning the whole match.
    return [match.group(1) for match in TOKEN_PATTERN.finditer(lua)]
def fix_functions(tokens, strings):
    """
    All functions called with string literals and table literals
    will be given parenthesis
    ex: print "abc" -> print("abc")

    Semicolons inside table definitions are turned into commas as a
    final step.

    tokens:  list of token strings
    strings: mapping whose keys are the tokens to treat as string literals
    Returns the processed token list.
    """
    with_string_calls = _fix_functions_string_literals(tokens, strings)
    with_table_calls = _fix_functions_table_literals(with_string_calls)
    return fix_table_semicolons(with_table_calls)
def fix_table_semicolons(tokens, start_index=0, start_depth=0):
    """
    Replaces semicolons in table definitions with commas
    {            {
      "a";  ->     "a",
      "b"          "b"
    }            }

    The list is modified in place and also returned.

    tokens:      list of token strings
    start_index: index of the first token to look at
    start_depth: number of '{' assumed to be already open at start_index

    Function bodies are skipped: on a 'function' token the scan jumps to the
    index reported by find_function_end, keeping the current brace depth.

    The scan is iterative, so the number of functions in the input is not
    limited by Python's recursion limit.
    """
    depth = start_depth
    i = start_index
    total = len(tokens)
    while i < total:
        t = tokens[i]
        if t == "{":
            depth += 1
        elif t == "}":
            depth -= 1
        elif t == "function":
            # Skip ahead to the end of the function
            end = find_function_end(tokens, i)
            # find_function_end returns its start index when it finds no
            # matching end; step over the keyword so the scan always advances.
            i = end if end > i else i + 1
            continue
        elif depth > 0 and t == ";":
            tokens[i] = ","
        i += 1
    return tokens
def _fix_functions_string_literals(tokens, strings, start_index=0):
last = None
for i in range(start_index, len(tokens)):
t = tokens[i]
if last is None:
last = t
continue
if is_word(last) and is_word(t):
if t in strings.keys():
tokens.insert(i, "(")
tokens.insert(i + 2, ")")
return _fix_functions_string_literals(tokens, strings, i)
last = t
return tokens
def _fix_functions_table_literals(tokens, start_index=0):
last = None
for i in range(start_index, len(tokens)):
t = tokens[i]
if last is None:
last = t
continue
if is_word(last) and t == "{":
end = find_table_end(tokens, i)
tokens = tokens[:i] + ["("] + tokens[i:end] + [")"] + tokens[end:]
return _fix_functions_table_literals(tokens, end)
last = t
return tokens
def find_table_end(tokens, start_index):
    """
    Find the end of a table declaration where start_index is the first '{'

    tokens:      list of token strings
    start_index: index of the opening '{'

    Returns the index just past the matching '}'.

    If the braces never balance, an error message is written to stderr and
    the program terminates with exit status 1 (SystemExit).
    """
    import sys

    depth = 0
    for i in range(start_index, len(tokens)):
        t = tokens[i]
        if t == "{":
            depth += 1
        elif t == "}":
            depth -= 1
            if depth == 0:
                return i + 1
    # Unbalanced input: report on stderr and exit with a failure status so
    # that calling scripts can tell the run did not succeed.
    print("Fatal error occurred.", "Please check that your input is syntactically correct.", sep="\n", file=sys.stderr)
    sys.exit(1)
def find_function_end(tokens, start_index):
    """
    Find the end of the function that begins at start_index.

    Scans forward counting SCOPE_IN tokens as scope openers and SCOPE_OUT
    tokens as scope closers.

    tokens:      list of token strings
    start_index: index where the scan begins (expected to be the opening
                 'function' token)

    Returns the index just past the closing token that brings the scope
    count back to zero, or start_index unchanged when no such token exists.
    """
    open_scopes = 0
    position = start_index
    token_count = len(tokens)
    while position < token_count:
        token = tokens[position]
        position += 1
        if token in SCOPE_IN:
            open_scopes += 1
            continue
        if token not in SCOPE_OUT:
            continue
        open_scopes -= 1
        if open_scopes == 0:
            # position already points one past the closing token
            return position
    return start_index
def is_word(s):
    """
    Tell whether a token is a plain word.

    A word is not None, is not one of RESERVED_WORDS, consists only of
    characters allowed by WORD_PATTERN (no whitespace, no special
    characters) and is not a number according to is_number().
    """
    if s is None or s in RESERVED_WORDS:
        return False
    if WORD_PATTERN.match(s) is None:
        return False
    return not is_number(s)
def is_number(s):
    """
    Tell whether a token consists solely of digits.

    Only unsigned whole numbers qualify; anything containing another
    character (ex: "0.5", "-1") or the empty string gives False.
    """
    digits_only = s.isdigit()
    return digits_only