-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlexer.go
241 lines (214 loc) · 5.55 KB
/
lexer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
// generated by Textmapper; DO NOT EDIT
package cool
import (
"strings"
"unicode/utf8"
)
// Lexer states.
const (
	// StateInitial is the default scanning state.
	StateInitial = 0
	// StateInComment is active while inside a (* ... *) block comment
	// (entered by enterBlockComment, left by exitBlockComment).
	StateInComment = 1
)
// Lexer uses a generated DFA to scan through a utf-8 encoded input string. If
// the string starts with a BOM character, it gets skipped.
type Lexer struct {
	source string // the full input text being tokenized

	ch          rune // current character, -1 means EOI
	offset      int  // character offset (byte offset of ch within source)
	tokenOffset int  // last token offset
	line        int  // current line number (1-based)
	tokenLine   int  // last token line
	scanOffset  int  // scanning offset (byte offset just past ch)
	value       interface{} // value associated with the last token; see Value()

	State int // lexer state, modifiable

	commentLevel      int               // number of open nested block comments
	invalidTokenClass InvalidTokenClass // reason for the last invalid token found
}
// bomSeq is the UTF-8 encoding of the byte order mark; Init skips it when the
// input starts with it.
var bomSeq = "\xef\xbb\xbf"
// Init prepares the lexer l to tokenize source by performing the full reset
// of the internal state.
func (l *Lexer) Init(source string) {
	l.source = source
	l.ch = 0
	l.offset = 0
	l.tokenOffset = 0
	l.line, l.tokenLine = 1, 1
	l.State = StateInitial
	l.commentLevel = 0
	l.invalidTokenClass = InvalidTokenUnknown

	// A leading byte order mark is not part of the token stream; start
	// scanning right after it.
	if strings.HasPrefix(source, bomSeq) {
		l.offset = len(bomSeq)
	}
	l.rewind(l.offset)
}
// Next finds and returns the next token in l.source. The source end is
// indicated by Token.EOI.
//
// The token text can be retrieved later by calling the Text() method.
func (l *Lexer) Next() Token {
restart:
	// Remember where this token starts so Pos()/Line()/Text() can report it.
	l.tokenLine = l.line
	l.tokenOffset = l.offset
	// Walk the generated DFA until it reaches an accepting (negative) state.
	// tmStateMap/tmRuneClass/tmLexerAction/tmNumClasses/tmFirstRule are
	// generated tables defined elsewhere in this package.
	state := tmStateMap[l.State]
	for state >= 0 {
		// Map the current rune to its character class for the transition table.
		var ch int
		if uint(l.ch) < tmRuneClassLen {
			ch = int(tmRuneClass[l.ch])
		} else if l.ch < 0 {
			// End of input: take the class-0 (EOI) transition without
			// consuming anything, and re-check the loop condition.
			state = int(tmLexerAction[state*tmNumClasses])
			continue
		} else {
			// Runes beyond the class table share a single catch-all class.
			ch = 1
		}
		state = int(tmLexerAction[state*tmNumClasses+ch])
		if state > tmFirstRule {
			// Still in a non-accepting state: consume the current character.
			if l.ch == '\n' {
				l.line++
			}
			// Scan the next character.
			// Note: the following code is inlined to avoid performance implications.
			l.offset = l.scanOffset
			if l.offset < len(l.source) {
				r, w := rune(l.source[l.offset]), 1
				if r >= 0x80 {
					// not ASCII
					r, w = utf8.DecodeRuneInString(l.source[l.offset:])
				}
				l.scanOffset += w
				l.ch = r
			} else {
				l.ch = -1 // EOI
			}
		}
	}
	// Negative states encode the matched rule; translate it to a token.
	rule := tmFirstRule - state
	token := tmToken[rule]
	space := false
	switch rule {
	case 0:
		// No rule matched. If nothing was consumed, skip one character so
		// the lexer cannot loop forever on the same invalid input.
		if l.offset == l.tokenOffset {
			l.rewind(l.scanOffset)
		}
	case 2: // invalid_token: /\x00/
		{
			l.invalidTokenClass = InvalidTokenNullCharInCode
		}
	case 3: // whitespace: /[\n\r\t\f\v ]+/
		space = true
	case 4: // EnterBlockComment: /\(\*/
		space = true
		{
			l.enterBlockComment()
		}
	case 5: // invalid_token: /\*\)/
		{
			l.invalidTokenClass = InvalidTokenUnmatchedBlockComment
		}
	case 6: // invalid_token: /{eoi}/
		{
			// EOI inside a comment: reset the state so subsequent calls
			// resume normal scanning.
			l.State = StateInitial
			l.invalidTokenClass = InvalidTokenEoiInComment
		}
	case 7: // ExitBlockComment: /\*\)/
		space = true
		{
			l.exitBlockComment()
		}
	case 8: // BlockComment: /[^\(\)\*]+|[\*\(\)]/
		space = true
	case 9: // LineComment: /\-\-.*/
		space = true
	case 14: // invalid_token: /"({strRune}*\x00{strRune}*)+"/
		{
			l.invalidTokenClass = InvalidTokenNullCharInString
		}
	case 15: // invalid_token: /"({strRune}*\\\x00{strRune}*)+"/
		{
			l.invalidTokenClass = InvalidTokenEscapedNullCharInString
		}
	case 16: // invalid_token: /"{strRune}*{eoi}/
		{
			l.invalidTokenClass = InvalidTokenEoiInString
		}
	}
	// Whitespace and comments are not reported; restart on the next token.
	if space {
		goto restart
	}
	return token
}
// Pos returns the start and end byte offsets of the last token returned by
// Next().
func (l *Lexer) Pos() (start, end int) {
	return l.tokenOffset, l.offset
}
// Line returns the (1-based) line number of the last token returned by Next().
func (l *Lexer) Line() int {
	return l.tokenLine
}
// Text returns the substring of the input corresponding to the last token.
func (l *Lexer) Text() string {
	return l.source[l.tokenOffset:l.offset]
}
// Value returns the value associated with the last returned token.
// NOTE(review): nothing in this file writes l.value, so it appears to be set
// by generated rule actions elsewhere — confirm before relying on it.
func (l *Lexer) Value() interface{} {
	return l.value
}
// rewind can be used in lexer actions to accept a portion of a scanned token, or to include
// more text into it.
func (l *Lexer) rewind(offset int) {
	// Keep the line counter in sync with the move, counting newlines in the
	// skipped-over region (backwards or forwards).
	switch {
	case offset < l.offset:
		l.line -= strings.Count(l.source[offset:l.offset], "\n")
	default:
		if offset > len(l.source) {
			offset = len(l.source)
		}
		l.line += strings.Count(l.source[l.offset:offset], "\n")
	}

	// Re-read the character at the new position.
	l.offset = offset
	l.scanOffset = offset
	if l.offset >= len(l.source) {
		l.ch = -1 // EOI
		return
	}
	r, w := rune(l.source[l.offset]), 1
	if r >= 0x80 {
		// not ASCII
		r, w = utf8.DecodeRuneInString(l.source[l.offset:])
	}
	l.scanOffset += w
	l.ch = r
}
// InvalidTokenClass classifies the reason an invalid token was produced.
type InvalidTokenClass int

// Invalid-token reasons. InvalidTokenUnknown (-1) means no invalid token has
// been recorded yet; the remaining values start at 0 and keep their original
// order and numeric values.
//
// Fix: the first constant previously had no explicit type, leaving the whole
// group as untyped ints despite the declared InvalidTokenClass type; typing
// it makes the constants InvalidTokenClass without changing any value.
const (
	InvalidTokenUnknown InvalidTokenClass = iota - 1
	InvalidTokenEoiInComment
	InvalidTokenEoiInString
	InvalidTokenUnterminatedStringLiteral
	InvalidTokenNullCharInString
	InvalidTokenEscapedNullCharInString
	InvalidTokenNullCharInCode
	InvalidTokenUnmatchedBlockComment
)
// InvalidTokenReason returns the error class that led to the
// last invalid token found during lexing.
func (l *Lexer) InvalidTokenReason() InvalidTokenClass {
	return l.invalidTokenClass
}
// enterBlockComment marks the beginning of a comment block
// and makes the lexer transition to the "inComment" state.
// commentLevel tracks nesting so that nested (* ... *) pairs
// only leave the state once all of them are closed.
func (l *Lexer) enterBlockComment() {
	l.commentLevel++
	l.State = StateInComment
}
// exitBlockComment marks the end of a comment block and returns the lexer to
// the "initial" state once no nested comment blocks remain open.
func (l *Lexer) exitBlockComment() {
	if l.commentLevel--; l.commentLevel <= 0 {
		l.State = StateInitial
	}
}