Begin implementing STEF IDL (#43)
This adds formal support for the IDL that is used in the specification.
I plan to eventually replace the JSON representation with this IDL, which is
much easier for humans to use. For example, the Otel STEF IDL is 151 easily
readable lines, whereas the equivalent JSON is 487 harder-to-read lines.
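
For a flavor of the syntax, here is a hypothetical fragment built only from the
keywords and primitive types that the lexer below recognizes. The exact grammar
is not shown in this commit, so the field layout here is an assumption:

// Hypothetical STEF IDL sketch; the real syntax may differ.
package example

struct Point {
    x float64
    y float64
}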

The IDL will only be used by the generator. The lexer/parser is not subject
to inputs that may come from malicious sources, so I am setting the testing and
fuzzing bar relatively low: there is no attack risk, and in the worst case the
generator will fail to run or will run incorrectly.

The IDL implementation is not yet used anywhere; it is only exercised in tests.
Future PRs will replace JSON usage with the IDL in the generator.

The parser is a hand-coded recursive descent parser with a separate lexer
for tokenization. I looked at alternatives, in particular at defining the grammar
in ABNF and using the ABNF parsers available in Go. Unfortunately, none that I
tried worked, and it was simpler to hand-code it.
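
Sketched below is how a recursive-descent parse step can drive the lexer's
one-token look-ahead API (Token() returns the current token, Next() advances).
This is an illustrative sketch only, written as if inside package idl; expect()
and parseStruct() are hypothetical names, not the actual parser in this PR:

// Hypothetical helpers, assuming access to the Lexer API defined in lexer.go.
func expect(l *Lexer, want Token) error {
	if l.Token() != want {
		pos := l.TokenStartPos()
		return fmt.Errorf("line %d, col %d: expected %s, got %s", pos.Line, pos.Col, want, l.Token())
	}
	l.Next() // consume the matched token
	return nil
}

// parseStruct parses a minimal `struct <name> { ... }` form.
func parseStruct(l *Lexer) (string, error) {
	if err := expect(l, tStruct); err != nil {
		return "", err
	}
	if l.Token() != tIdent {
		return "", fmt.Errorf("expected struct name")
	}
	name := l.Ident()
	l.Next()
	if err := expect(l, tLBrace); err != nil {
		return "", err
	}
	// Field parsing would go here, looping until tRBrace.
	return name, expect(l, tRBrace)
}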

A couple of additional changes that I had to make at the same time for simplicity:
- Rename "MainStruct" to "Root" in the generator (for naming consistency).
- Change examples in specification.md to use lower-case `key` and `value` (since these are keywords in the IDL).
tigrannajaryan authored Feb 25, 2025
1 parent 587bc0a commit df41803
Showing 17 changed files with 1,573 additions and 118 deletions.
2 changes: 1 addition & 1 deletion go/otel/oteltef/spans.go

Some generated files are not rendered by default.

275 changes: 275 additions & 0 deletions go/pkg/idl/lexer.go
@@ -0,0 +1,275 @@
package idl

import (
	"bufio"
	"errors"
	"fmt"
	"io"
	"unicode"
)

// Lexer splits a UTF8-encoded input into tokens.
type Lexer struct {
	input *bufio.Reader
	token Token

	nextRune  rune
	prevWasCR bool

	isEOF   bool
	isError bool
	errMsg  string

	curPos  Pos
	prevPos Pos

	identRunes []rune
	ident      string
}

// Pos indicates a position in the input stream.
type Pos struct {
	ByteOfs uint
	Line    uint
	Col     uint
}

type Token uint

const (
	tError Token = iota
	tEOF

	tPackage
	tIdent

	tStruct
	tOneof
	tMultimap

	tOptional
	tRoot
	tDict
	tKey
	tValue

	tBool
	tInt64
	tUint64
	tFloat64
	tString
	tBytes

	tLBracket = '['
	tRBracket = ']'
	tLParen   = '('
	tRParen   = ')'
	tLBrace   = '{'
	tRBrace   = '}'
)

func (t Token) String() string {
	str, ok := keywordsReverse[t]
	if ok {
		return str
	}
	switch t {
	case tEOF:
		return "EOF"
	case tIdent:
		return "identifier"
	default:
		return string(byte(t))
	}
}

var keywords = map[string]Token{
	"package":  tPackage,
	"struct":   tStruct,
	"oneof":    tOneof,
	"multimap": tMultimap,
	"optional": tOptional,
	"root":     tRoot,
	"dict":     tDict,
	"key":      tKey,
	"value":    tValue,
	"bool":     tBool,
	"int64":    tInt64,
	"uint64":   tUint64,
	"float64":  tFloat64,
	"string":   tString,
	"bytes":    tBytes,
}

var keywordsReverse = func() map[Token]string {
	m := make(map[Token]string)
	for k, v := range keywords {
		m[v] = k
	}
	return m
}()

func NewLexer(input io.Reader) *Lexer {
	l := &Lexer{
		input: bufio.NewReader(input),
		curPos: Pos{
			ByteOfs: 0,
			Line:    1,
			Col:     1,
		},
	}
	// Fetch the first rune.
	l.readNextRune()

	// Fetch the first token.
	l.Next()
	return l
}

func (l *Lexer) Token() Token {
	return l.token
}

// Next reads the input for the next token. After that Token() will return
// the token that was read.
//
// If the Lexer input is at EOF then the next Token() call will return tEOF.
// If reading failed (e.g. if the input is not valid UTF8) the next Token() call
// will return tError.
func (l *Lexer) Next() {
	l.prevPos = l.curPos

	l.skipWhiteSpaceOrComment()

	if l.isEOF {
		l.token = tEOF
		return
	} else if l.isError {
		l.token = tError
		l.isError = false
		return
	}

	switch l.nextRune {
	case tLParen:
		l.token = tLParen
	case tRParen:
		l.token = tRParen
	case tLBracket:
		l.token = tLBracket
	case tRBracket:
		l.token = tRBracket
	case tRBrace:
		l.token = tRBrace
	case tLBrace:
		l.token = tLBrace
	default:
		if unicode.IsLetter(l.nextRune) {
			// This is a letter. It must be the start of an identifier or keyword.
			l.readIdentOrKeyword()
			return
		}
		l.token = tError
		l.errMsg = fmt.Sprintf("invalid character: %c", l.nextRune)
	}
	l.readNextRune()
}

func (l *Lexer) skipWhiteSpaceOrComment() {
	for !l.isEOF && !l.isError {
		if unicode.IsSpace(l.nextRune) {
			l.readNextRune()
		} else if l.nextRune == '/' {
			l.skipComment()
		} else {
			break
		}
	}
}

func (l *Lexer) skipComment() {
	l.readNextRune()
	if l.isEOF || l.isError || l.nextRune != '/' {
		// Set the error flag so that the enclosing skip loop stops and Next()
		// reports tError; setting only the token here would let it be overwritten.
		l.isError = true
		l.errMsg = "expected start of comment"
		return
	}

	for !l.isEOF && !l.isError && l.nextRune != '\r' && l.nextRune != '\n' {
		l.readNextRune()
	}
}

func (l *Lexer) readNextRune() {
	nextRune, size, err := l.input.ReadRune()
	if err != nil {
		if errors.Is(err, io.EOF) {
			l.isEOF = true
		} else {
			l.isError = true
			l.errMsg = fmt.Sprintf("read error: %v", err)
		}
		return
	}
	l.nextRune = nextRune
	l.curPos.ByteOfs += uint(size)
	l.curPos.Col++

	// Handle any of CR, LF, CRLF as a single new line.
	const cCR = '\r'
	const cLF = '\n'
	if l.nextRune == cCR {
		l.curPos.Line++
		l.curPos.Col = 1
		l.prevWasCR = true
	} else if l.nextRune == cLF {
		if !l.prevWasCR {
			l.curPos.Line++
			l.curPos.Col = 1
		}
		l.prevWasCR = false
	} else {
		l.prevWasCR = false
	}
}

func (l *Lexer) readIdentOrKeyword() Token {
	l.identRunes = l.identRunes[:0]

	// The first character is already read. Subsequent characters must be
	// letters, digits, or underscores.
	for (unicode.IsLetter(l.nextRune) || unicode.IsDigit(l.nextRune) || l.nextRune == '_') && !l.isError {
		l.identRunes = append(l.identRunes, l.nextRune)
		l.readNextRune()
		if l.isEOF {
			break
		}
		if l.isError {
			l.token = tError
			return tError
		}
	}

	l.ident = string(l.identRunes)

	// Check if it is a keyword.
	if token, ok := keywords[l.ident]; ok {
		l.token = token
		return token
	}

	l.token = tIdent
	return tIdent
}

// Ident will return the identifier if the current token is tIdent.
// Use Token() first.
func (l *Lexer) Ident() string {
	return l.ident
}

// TokenStartPos will return the starting position of the last read
// token after a Next() call.
func (l *Lexer) TokenStartPos() Pos {
	return l.prevPos
}
52 changes: 52 additions & 0 deletions go/pkg/idl/lexer_test.go
@@ -0,0 +1,52 @@
package idl

import (
	"bytes"
	"os"
	"testing"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

func TestLexer(t *testing.T) {
	l := NewLexer(bytes.NewBufferString("struct abc {}"))

	tokens := []Token{tStruct, tIdent, tLBrace, tRBrace, tEOF}
	i := 0
	for {
		token := l.Token()
		assert.Equal(t, tokens[i], token, i)
		i++
		if token == tEOF {
			break
		}
		l.Next()
	}
}

func FuzzLexer(f *testing.F) {
	f.Add([]byte(nil))
	f.Add([]byte(""))
	f.Add([]byte("struct abc {}"))

	testFiles := []string{"testdata/example.stef", "testdata/otel.stef"}
	for _, file := range testFiles {
		content, err := os.ReadFile(file)
		require.NoError(f, err)
		f.Add(content)
	}

	f.Fuzz(
		func(t *testing.T, content []byte) {
			l := NewLexer(bytes.NewBuffer(content))
			for {
				token := l.Token()
				if token == tEOF || token == tError {
					break
				}
				l.Next()
			}
		},
	)
}
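
The fuzz target above can be exercised with Go's native fuzzing. Assuming the
command is run from the Go module root (so the package path matches the file
tree shown here), something like:

    go test -fuzz=FuzzLexer ./pkg/idl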
