forked from grant-olson/pyasm
-
Notifications
You must be signed in to change notification settings - Fork 0
/
x86tokenizer.py
executable file
·208 lines (178 loc) · 7.73 KB
/
x86tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
# Copyright 2004-2010 Grant T. Olson.
# See license.txt for terms.
"""
x86tokenizer.py
---------------
This provides the framework to tokenize x86 instructions from strings. There
are two major functions:
tokenizeInstDef()
This tokenizes an instruction definition so we can build the appropriate
tables to lookup at a later date.
tokenizeInst()
This tokenizes an actual instruction, in code, to help build an instruction
instance.
Much of the stuff going on here is the same, but an actual instruction may (and
will) contain info that is invalid for a definition. i.e.::
# an instance can have concrete values, but a def can't
tokenizeInst("MOV EAX,0xF000")
tokenizeInst("MOV [EBP+8],0x10")
Both functions are implemented below; each runs tokenizeString() with its own
pattern and then validates the order of the resulting tokens.
"""
import re
#
class tokenizeError(Exception):
    """Raised when a string cannot be tokenized or its tokens are out of order."""
    pass
# Token IDs: small consecutive integers, one per kind of token.
(REGISTER, OPCODE, COMMA, OPERAND, LBRACKET,
 RBRACKET, NUMBER, SYMBOL, STRING) = range(1, 10)

# Name -> token ID.  Every ID defined above has an entry, including STRING,
# so a lookup by name works for any token kind the tokenizer can emit.
tokLookup = {
    'REGISTER': REGISTER,
    'OPCODE': OPCODE,
    'COMMA': COMMA,
    'OPERAND': OPERAND,
    'LBRACKET': LBRACKET,
    'RBRACKET': RBRACKET,
    'NUMBER': NUMBER,
    'SYMBOL': SYMBOL,
    'STRING': STRING,
}
# Regular expressions.
#
# Patterns containing backslashes are raw strings so the backslash reaches the
# regex engine unchanged and Python does not warn about invalid string escapes.

# Re's used by both tokenizers
commaRe = '(?P<COMMA>,)'

# Instruction defs can have hard-coded indirect r/m's in them.
# An actual instruction will extract the LBRACKET and RBRACKET
# and include any appropriate offsets: 'MOV [EBP + 4], AX'
# This will never happen in an instruction def so we need to parse differently
basicRegisterRe = 'AL|CL|DL|BL|AH|CH|DH|BH|AX|CX|DX|BX|SP|BP|SI|DI|'\
                  'EAX|ECX|EDX|EBX|ESP|EBP|ESI|EDI'
indirectRegisterRe = r'\[AL\]|\[CL\]|\[DL\]|\[BL\]|\[AH\]|\[CH\]|\[DH\]|\[BH\]|'\
                     r'\[AX\]|\[CX\]|\[DX\]|\[BX\]|\[SP\]|\[BP\]|\[SI\]|\[DI\]|'\
                     r'\[EAX\]|\[ECX\]|\[EDX\]|\[EBX\]|\[ESP\]|\[EBP\]|\[ESI\]|\[EDI\]'

# A bare register name only counts as a register when it is not immediately
# followed by another identifier character.  Without this guard the register
# alternative (tried before the symbol alternative) would split 'DIV' into
# 'DI' + 'V' and 'CLC' into 'CL' + 'C'.
_regEndRe = r'(?![A-Za-z_@0-9])'
defRegRe = '(?P<REGISTER>(?:%s)%s|%s)' % (basicRegisterRe, _regEndRe,
                                          indirectRegisterRe)
instRegRe = '(?P<REGISTER>(?:%s)%s)' % (basicRegisterRe, _regEndRe)

# def specific stuff
#TODO: fix this better
ptrRe = r'ptr16\:16|ptr32\:32'
memRe = r'm16\:16|m32\:32'
sregRe = 'Sreg'
moffsRe = 'moffs8|moffs16|moffs32'
immediateRe = 'imm8|imm16|imm32'
relativeRe = 'rel8|rel16|rel32'
rRe = 'r8|r16|r32'
mmRe = 'mm|xmm'
rmRe = 'r/m8|r/m16|r/m32|m8|m16|m32|m'
otherRe = '/digit|REG'
operandRe = '(?P<OPERAND>%s|%s|%s|%s|%s|%s|%s|%s|%s|%s)' % (ptrRe, memRe, sregRe,
                                                           moffsRe, immediateRe,
                                                           relativeRe, rRe, mmRe,
                                                           rmRe, otherRe)

# '+' is skipped together with whitespace, so 'EBP+8' and 'EBP 8' produce the
# same tokens.
whitespaceRe = r'[\s\+]'

#opcode is a symbol that is all caps, no lowercase or underscore
opcodeRe = re.compile('^[A-Z]+$')

#instructionRe specific stuff
# A quoted string; the closing quote must be the same character as the opener.
stringRe = "((?P<q>'|\")(?P<STRING>.*)(?P=q))"
lbracketRe = r'(?P<LBRACKET>\[)'
rbracketRe = r'(?P<RBRACKET>\])'
numberRe = r'(?P<NUMBER>[\+\-]?(0x[0-9A-Fa-f]+|[0-9]+))'
symbolRe = '(?P<SYMBOL>[A-Za-z_@][A-Za-z_@0-9]*)'

#define final re's
# Each pattern matches exactly one token at the start of the input (after any
# skipped whitespace) and captures the remainder in the 'rest' group.
instructionDefRe = re.compile("(?:%s*(?:%s|%s|%s|%s|%s|%s|%s)(?P<rest>.*))" % \
                              (whitespaceRe, defRegRe, operandRe, symbolRe,
                               commaRe, numberRe, lbracketRe, rbracketRe))
instructionRe = re.compile("(?:%s*(?:%s|%s|%s|%s|%s|%s|%s)(?P<rest>.*))" % \
                           (whitespaceRe, lbracketRe, rbracketRe, instRegRe,
                            commaRe, numberRe, symbolRe, stringRe))
def tokenizeString(s, reToProcess):
    """Split *s* into a tuple of (token id, text) pairs.

    reToProcess is a compiled pattern that matches one token at the start of
    its input and exposes the remainder in a group named 'rest'; the token
    itself is reported through one of the named groups REGISTER, SYMBOL,
    COMMA, OPERAND, LBRACKET, RBRACKET, NUMBER or STRING.  The pattern is
    applied repeatedly to the remainder until nothing is left.

    A SYMBOL that matches opcodeRe is emitted as an OPCODE token.  The operand
    codes 'm8', 'm16' and 'm32' are emitted as 'r/m8', 'r/m16' and 'r/m32'.

    Raises tokenizeError if the pattern fails to match the remaining text or
    if a match carries none of the known token groups.
    """
    lst = []
    rest = s
    while rest:
        instMatch = reToProcess.match(rest)
        if not instMatch:
            raise tokenizeError("Couldn't find match for string '%s' from '%s'" % (rest, s))

        instDict = instMatch.groupdict()
        # The two patterns do not define the same groups (only defs have
        # OPERAND, only instructions have STRING), so read every group with
        # .get() instead of indexing.  This also replaces dict.has_key(),
        # which does not exist on Python 3.
        group = instDict.get

        if group('REGISTER'):
            lst.append((REGISTER, instDict['REGISTER']))
        elif group('SYMBOL'):
            if opcodeRe.match(instDict['SYMBOL']):
                lst.append((OPCODE, instDict['SYMBOL']))
            else:
                lst.append((SYMBOL, instDict['SYMBOL']))
        elif group('COMMA'):
            lst.append((COMMA, instDict['COMMA']))
        elif group('OPERAND'):
            opText = instDict['OPERAND']
            # Hack for 'm' codes.  These are stored in the RM field,
            # But register values are technically invalid.  Should this be
            # verified while compiling?
            if opText == 'm32':
                opText = 'r/m32'
            elif opText == 'm16':
                opText = 'r/m16'
            elif opText == 'm8':
                opText = 'r/m8'
            lst.append((OPERAND, opText))
        elif group('LBRACKET'):
            lst.append((LBRACKET, instDict['LBRACKET']))
        elif group('RBRACKET'):
            lst.append((RBRACKET, instDict['RBRACKET']))
        elif group('NUMBER'):
            lst.append((NUMBER, instDict['NUMBER']))
        elif group('STRING') is not None:
            # Compare against None rather than testing truthiness: an empty
            # quoted string captures '' and is still a valid STRING token.
            lst.append((STRING, instDict['STRING']))
        else:
            raise tokenizeError("Tokenization failed on string %s, match %s" \
                                % (s, rest))
        rest = instDict['rest']
    return tuple(lst)
def tokenizeInstDef(s):
    """Tokenize an instruction definition and check its overall layout.

    The string is split with instructionDefRe.  The result must begin with one
    or more OPCODE tokens; everything after them must alternate between an
    operand-like token (REGISTER, OPERAND or NUMBER) and a COMMA.

    Returns the tuple of (token id, text) pairs produced by tokenizeString().
    Raises tokenizeError if there are no tokens or the layout is violated.
    """
    tokens = tokenizeString(s, instructionDefRe)
    count = len(tokens)
    if not count:
        raise tokenizeError("Invalid Instruction. Cannot be blank")
    if tokens[0][0] != OPCODE:
        raise tokenizeError("Invalid Instruction: '%s' Must start with an OPCODE" % s)

    # Position of the first token after the leading run of opcodes.
    firstOperand = next((pos for pos, tok in enumerate(tokens)
                         if tok[0] != OPCODE), count)

    # Walk the remaining tokens, flipping between "want operand" and
    # "want comma" after each one.
    wantComma = False
    for kind, _text in tokens[firstOperand:]:
        if wantComma:
            if kind != COMMA:
                raise tokenizeError("Invalid Instruction Def: '%s' Expected a COMMA" % s)
        elif kind not in (REGISTER, OPERAND, NUMBER):
            raise tokenizeError("Invalid Instruction Definition: '%s' "
                                "Expected a REGISTER OR OPERAND ENTRY" % s)
        wantComma = not wantComma
    return tokens
def tokenizeInst(s):
    """Tokenize a concrete instruction and check its overall layout.

    The string is split with instructionRe.  The result must begin with one or
    more OPCODE tokens, followed by operands separated by COMMA tokens.  An
    operand is either a single REGISTER, NUMBER, SYMBOL or STRING token, or a
    bracketed expression of one of these shapes:

        [ NUMBER|SYMBOL ]
        [ REGISTER ]
        [ REGISTER NUMBER|SYMBOL ]

    Returns the tuple of (token id, text) pairs produced by tokenizeString().
    Raises tokenizeError if there are no tokens or the layout is violated,
    including when the input ends in the middle of a bracketed expression.
    """
    toks = tokenizeString(s, instructionRe)
    index, length = 0, len(toks)
    if length == 0:
        raise tokenizeError("Invalid Instruction. Cannot be blank")
    if toks[index][0] != OPCODE:
        raise tokenizeError("Invalid Instruction: '%s' " \
                            "Must start with an OPCODE" % s)

    def tokType(i):
        # Token id at position i, or None once we have run off the end.  None
        # never equals a token id, so a truncated bracket expression falls
        # into the normal tokenizeError branches instead of an IndexError.
        if i < length:
            return toks[i][0]
        return None

    while index < length and toks[index][0] == OPCODE:
        index += 1

    while index < length:
        if toks[index][0] in (REGISTER, NUMBER, SYMBOL, STRING):
            index += 1
        elif toks[index][0] == LBRACKET:
            index += 1
            if tokType(index) in (NUMBER, SYMBOL):
                index += 1
            elif tokType(index) == REGISTER:
                index += 1
                # Optional offset after the register, e.g. [EBP+8].
                if tokType(index) in (NUMBER, SYMBOL):
                    index += 1
            else:
                raise tokenizeError("Invalid Instruction: '%s' Expected a " \
                                    "REGISTER inside the [brackets]" % s)
            if tokType(index) != RBRACKET:
                raise tokenizeError("Invalid Instruction: '%s' Expected an " \
                                    "ending BRACKET here." % s)
            else:
                index += 1
        else:
            raise tokenizeError("Invalid Instruction: '%s' " \
                                "Expected a REGISTER,LBRACKET,NUMBER,SYMBOL, or STRING" % s)

        if index < length:
            if toks[index][0] != COMMA:
                raise tokenizeError("Invalid Instruction: '%s' Expected " \
                                    "a COMMA" % s)
            index += 1
    return toks