This adds formal support for the IDL that is used in the specification. I plan to eventually replace the JSON representation with this IDL, which is much easier for humans to work with: for example, the Otel STEF IDL is 151 easily readable lines, whereas the equivalent JSON is 487 harder-to-read lines.

The IDL will only be used by the generator. The lexer/parser is not subject to inputs that may come from malicious sources, so I am setting the testing and fuzzing bar relatively low: there is no attack risk, and in the worst case the generator will fail to run or will run incorrectly.

The IDL implementation is not yet used anywhere; it is only verified in tests. Future PRs will replace JSON usage with IDL in the generator.

The parser is a hand-coded recursive descent parser with a separate lexer for tokenization. I looked at alternatives, in particular at defining the grammar in ABNF and using the ABNF parsers available in Go. Unfortunately none that I tried worked, and it was simpler to just hand-code it.

A couple of additional changes that I had to make at the same time for simplicity:

- Rename "MainStruct" to "Root" in the generator (for naming consistency).
- Change examples in specification.md to use lower-case `key` and `value` (since these are keywords in the IDL).
Commit df41803 (1 parent: 587bc0a). Showing 17 changed files with 1,573 additions and 118 deletions.
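To give a feel for the lexer API in the diff below, here is a minimal usage sketch, written as an in-package Go example test. It is not part of this commit: the ExampleLexer name and the sample IDL fragment are illustrative only, and everything else it uses (NewLexer, Token, Next, Ident) appears in the diff.

package idl

import (
	"bytes"
	"fmt"
)

// ExampleLexer tokenizes a small IDL fragment and prints each token,
// including the text of identifiers.
func ExampleLexer() {
	l := NewLexer(bytes.NewBufferString("struct Spans { spans bytes }"))
	for l.Token() != tEOF && l.Token() != tError {
		if l.Token() == tIdent {
			fmt.Printf("%v(%s) ", l.Token(), l.Ident())
		} else {
			fmt.Printf("%v ", l.Token())
		}
		l.Next()
	}
	// Output: struct identifier(Spans) { identifier(spans) bytes }
}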
@@ -0,0 +1,275 @@
package idl

import (
	"bufio"
	"errors"
	"fmt"
	"io"
	"unicode"
)

// Lexer splits a UTF-8-encoded input into tokens.
type Lexer struct {
	input *bufio.Reader
	token Token // the most recently read token

	nextRune  rune // one rune of lookahead
	prevWasCR bool // true if the previous rune was a carriage return

	isEOF   bool
	isError bool
	errMsg  string

	curPos  Pos // current position in the input
	prevPos Pos // position where the current token started (see TokenStartPos)

	identRunes []rune // reusable buffer for accumulating identifiers
	ident      string // text of the identifier when token is tIdent
}

// Pos indicates a position in the input stream.
type Pos struct {
	ByteOfs uint
	Line    uint
	Col     uint
}

type Token uint

const (
	tError Token = iota
	tEOF

	tPackage
	tIdent

	tStruct
	tOneof
	tMultimap

	tOptional
	tRoot
	tDict
	tKey
	tValue

	tBool
	tInt64
	tUint64
	tFloat64
	tString
	tBytes

	tLBracket = '['
	tRBracket = ']'
	tLParen   = '('
	tRParen   = ')'
	tLBrace   = '{'
	tRBrace   = '}'
)
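// Note that the single-character tokens above reuse their literal rune
// values, so Next() can switch directly on l.nextRune and assign the
// matched rune to l.token.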

func (t Token) String() string {
	str, ok := keywordsReverse[t]
	if ok {
		return str
	}
	switch t {
	case tEOF:
		return "EOF"
	case tIdent:
		return "identifier"
	default:
		return string(byte(t))
	}
}

var keywords = map[string]Token{
	"package":  tPackage,
	"struct":   tStruct,
	"oneof":    tOneof,
	"multimap": tMultimap,
	"optional": tOptional,
	"root":     tRoot,
	"dict":     tDict,
	"key":      tKey,
	"value":    tValue,
	"bool":     tBool,
	"int64":    tInt64,
	"uint64":   tUint64,
	"float64":  tFloat64,
	"string":   tString,
	"bytes":    tBytes,
}

var keywordsReverse = func() map[Token]string {
	m := make(map[Token]string)
	for k, v := range keywords {
		m[v] = k
	}
	return m
}()

func NewLexer(input io.Reader) *Lexer {
	l := &Lexer{
		input: bufio.NewReader(input),
		curPos: Pos{
			ByteOfs: 0,
			Line:    1,
			Col:     1,
		},
	}
	// Fetch the first rune.
	l.readNextRune()

	// Fetch the first token.
	l.Next()
	return l
}

func (l *Lexer) Token() Token {
	return l.token
}

// Next reads the input for the next token. After that Token() will return
// the token that was read.
//
// If the Lexer input is at EOF then the next Token() call will return tEOF.
// If reading failed (e.g. if the input is not valid UTF-8) the next Token()
// call will return tError.
func (l *Lexer) Next() {
	l.prevPos = l.curPos

	l.skipWhiteSpaceOrComment()

	if l.isEOF {
		l.token = tEOF
		return
	} else if l.isError {
		l.token = tError
		l.isError = false
		return
	}

	switch l.nextRune {
	case tLParen:
		l.token = tLParen
	case tRParen:
		l.token = tRParen
	case tLBracket:
		l.token = tLBracket
	case tRBracket:
		l.token = tRBracket
	case tRBrace:
		l.token = tRBrace
	case tLBrace:
		l.token = tLBrace
	default:
		if unicode.IsLetter(l.nextRune) {
			// This is a letter. It must be the start of an identifier or keyword.
			l.readIdentOrKeyword()
			return
		}
		l.token = tError
		l.errMsg = fmt.Sprintf("invalid character: %c", l.nextRune)
	}
	l.readNextRune()
}

func (l *Lexer) skipWhiteSpaceOrComment() {
	for !l.isEOF && !l.isError {
		if unicode.IsSpace(l.nextRune) {
			l.readNextRune()
		} else if l.nextRune == '/' {
			l.skipComment()
		} else {
			break
		}
	}
}

func (l *Lexer) skipComment() {
	l.readNextRune()
	if l.isEOF || l.isError || l.nextRune != '/' {
		// Set isError so that the enclosing skip loop stops and Next()
		// reports tError instead of overwriting it with the next token.
		l.isError = true
		l.errMsg = "expected start of comment"
		return
	}

	// Consume the rest of the line; only "//" line comments are supported.
	for !l.isEOF && !l.isError && l.nextRune != '\r' && l.nextRune != '\n' {
		l.readNextRune()
	}
}

func (l *Lexer) readNextRune() {
	nextRune, size, err := l.input.ReadRune()
	if err != nil {
		if errors.Is(err, io.EOF) {
			l.isEOF = true
		} else {
			l.isError = true
			l.errMsg = fmt.Sprintf("read error: %v", err)
		}
		return
	}
	l.nextRune = nextRune
	l.curPos.ByteOfs += uint(size)
	l.curPos.Col++

	// Handle any of CR, LF or CRLF as a new line.
	const cCR = '\r'
	const cLF = '\n'
	if l.nextRune == cCR {
		l.curPos.Line++
		l.curPos.Col = 1
		l.prevWasCR = true
	} else if l.nextRune == cLF {
		if !l.prevWasCR {
			// A solitary LF starts a new line; the LF of a CRLF pair does
			// not, since the preceding CR already advanced the line counter.
			l.curPos.Line++
			l.curPos.Col = 1
		}
		l.prevWasCR = false
	} else {
		l.prevWasCR = false
	}
}

func (l *Lexer) readIdentOrKeyword() Token {
	l.identRunes = l.identRunes[:0]

	// The first character is already read. Subsequent characters must be
	// letters, digits or underscore.
	for (unicode.IsLetter(l.nextRune) || unicode.IsDigit(l.nextRune) || l.nextRune == '_') && !l.isError {
		l.identRunes = append(l.identRunes, l.nextRune)
		l.readNextRune()
		if l.isEOF {
			break
		}
		if l.isError {
			l.token = tError
			return tError
		}
	}

	l.ident = string(l.identRunes)

	// Check if it is a keyword.
	if token, ok := keywords[l.ident]; ok {
		l.token = token
		return token
	}

	l.token = tIdent
	return tIdent
}

// Ident will return the identifier if the current token is tIdent.
// Use Token() first.
func (l *Lexer) Ident() string {
	return l.ident
}

// TokenStartPos will return the starting position of the last read
// token after Next() call.
func (l *Lexer) TokenStartPos() Pos {
	return l.prevPos
}
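The commit message above notes that the parser built on this lexer is hand-coded recursive descent. That parser is in a separate file, but the API here already suggests the usual consumption pattern. A minimal sketch, assuming a hypothetical in-package helper (expect is not part of this commit):

func expect(l *Lexer, want Token) error {
	// Verify the current token, then advance to the next one. A recursive
	// descent parser would call this for each token it requires.
	if l.Token() != want {
		pos := l.TokenStartPos()
		return fmt.Errorf("line %d, col %d: expected %v, got %v",
			pos.Line, pos.Col, want, l.Token())
	}
	l.Next()
	return nil
}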
@@ -0,0 +1,52 @@
package idl

import (
	"bytes"
	"os"
	"testing"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

func TestLexer(t *testing.T) {
	l := NewLexer(bytes.NewBufferString("struct abc {}"))

	tokens := []Token{tStruct, tIdent, tLBrace, tRBrace, tEOF}
	i := 0
	for {
		token := l.Token()
		assert.Equal(t, tokens[i], token, i)
		i++
		if token == tEOF {
			break
		}
		l.Next()
	}
}

func FuzzLexer(f *testing.F) {
	f.Add([]byte(nil))
	f.Add([]byte(""))
	f.Add([]byte("struct abc {}"))

	testFiles := []string{"testdata/example.stef", "testdata/otel.stef"}
	for _, file := range testFiles {
		content, err := os.ReadFile(file)
		require.NoError(f, err)
		f.Add(content)
	}

	f.Fuzz(
		func(t *testing.T, content []byte) {
			l := NewLexer(bytes.NewBuffer(content))
			for {
				token := l.Token()
				if token == tEOF || token == tError {
					break
				}
				l.Next()
			}
		},
	)
}
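For reference, the fuzz target can be run locally with the standard Go fuzzing tooling from the package directory; the seed corpus consists of the f.Add entries plus the two .stef files under testdata:

go test -fuzz=FuzzLexer

As the commit message notes, the bar here is intentionally low: the fuzzer only checks that the lexer terminates without panicking on arbitrary input, not that the resulting token stream is meaningful.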