From df41803acef3f7fe14eac2cabc8016589fd2eda0 Mon Sep 17 00:00:00 2001 From: Tigran Najaryan <4194920+tigrannajaryan@users.noreply.github.com> Date: Tue, 25 Feb 2025 12:24:39 -0500 Subject: [PATCH] Begin implementing STEF IDL (#43) This adds formal support for the IDL that is used in the specification. I plan to eventually replace the JSON representation by this IDL. It is much easier to use for humans. For example the Otel STEF IDL is 151 easily readable lines, whereas the equivalent in JSON is 487 harder to read lines. The IDL will only be used by the generator. The lexer/parser is not subject to inputs that may come from malicious sources, so I am setting the testing and fuzzing bar relatively low since there is no attack risk and in the worst case the generator will fail to run or will run incorrectly. The IDL implementation is not yet used anywhere. It is only verified in tests. Future PRs will replace JSON usage by IDL in the generator. The parser is a hand-coded recursive descent parser with a separate lexer for tokenization. I looked at alternatives, particularly at defining the grammar in ABNF and using ABNF parsers available in Go. Unfortunately none that I tried worked and it was simpler to just hand code it. A couple additional changes that I had to do at the same time for simplicity: - Rename "MainStruct" to "Root" in generator (for naming consistency). - Change examples in specification.md to use lower-case `key` and `value` (since these are keywords in IDL). 
--- go/otel/oteltef/spans.go | 2 +- go/pkg/idl/lexer.go | 275 +++++++++++++ go/pkg/idl/lexer_test.go | 52 +++ go/pkg/idl/parser.go | 411 +++++++++++++++++++ go/pkg/idl/parser_test.go | 103 +++++ go/pkg/idl/testdata/example.stef | 38 ++ go/pkg/idl/testdata/otel.stef | 151 +++++++ go/pkg/idl/testdata/oteltef.wire.json | 487 +++++++++++++++++++++++ go/pkg/schema/schema.go | 14 +- go/pkg/schema/schema_test.go | 20 +- go/pkg/schema/stef.abnf | 65 --- go/pkg/schema/testdata/oteltef.wire.json | 20 + stef-spec/specification.md | 36 +- stefgen/generator/compileschema.go | 9 +- stefgen/generator/generator.go | 1 - stefgen/generator/genschema.go | 5 +- stefgen/generator/structs.go | 2 +- 17 files changed, 1573 insertions(+), 118 deletions(-) create mode 100644 go/pkg/idl/lexer.go create mode 100644 go/pkg/idl/lexer_test.go create mode 100644 go/pkg/idl/parser.go create mode 100644 go/pkg/idl/parser_test.go create mode 100644 go/pkg/idl/testdata/example.stef create mode 100755 go/pkg/idl/testdata/otel.stef create mode 100755 go/pkg/idl/testdata/oteltef.wire.json delete mode 100644 go/pkg/schema/stef.abnf diff --git a/go/otel/oteltef/spans.go b/go/otel/oteltef/spans.go index a419b2b..19524fa 100644 --- a/go/otel/oteltef/spans.go +++ b/go/otel/oteltef/spans.go @@ -422,7 +422,7 @@ func (d *SpansDecoder) Init(state *ReaderState, columns *pkg.ReadColumnSet) erro d.column = columns.Column() - d.lastVal.init(nil, 0) + d.lastVal.Init() d.lastValPtr = &d.lastVal var err error diff --git a/go/pkg/idl/lexer.go b/go/pkg/idl/lexer.go new file mode 100644 index 0000000..d097b6e --- /dev/null +++ b/go/pkg/idl/lexer.go @@ -0,0 +1,275 @@ +package idl + +import ( + "bufio" + "errors" + "fmt" + "io" + "unicode" +) + +// Lexer splits a UTF8-encoded input into tokens. 
+type Lexer struct { + input *bufio.Reader + token Token + + nextRune rune + prevWasCR bool + + isEOF bool + isError bool + errMsg string + + curPos Pos + prevPos Pos + + identRunes []rune + ident string +} + +// Pos indicates a position in the input stream. +type Pos struct { + ByteOfs uint + Line uint + Col uint +} + +type Token uint + +const ( + tError Token = iota + tEOF + + tPackage + tIdent + + tStruct + tOneof + tMultimap + + tOptional + tRoot + tDict + tKey + tValue + + tBool + tInt64 + tUint64 + tFloat64 + tString + tBytes + + tLBracket = '[' + tRBracket = ']' + tLParen = '(' + tRParen = ')' + tLBrace = '{' + tRBrace = '}' +) + +func (t Token) String() string { + str, ok := keywordsReverse[t] + if ok { + return str + } + switch t { + case tEOF: + return "EOF" + case tIdent: + return "identifier" + default: + return string(byte(t)) + } +} + +var keywords = map[string]Token{ + "package": tPackage, + "struct": tStruct, + "oneof": tOneof, + "multimap": tMultimap, + "optional": tOptional, + "root": tRoot, + "dict": tDict, + "key": tKey, + "value": tValue, + "bool": tBool, + "int64": tInt64, + "uint64": tUint64, + "float64": tFloat64, + "string": tString, + "bytes": tBytes, +} + +var keywordsReverse = func() map[Token]string { + m := make(map[Token]string) + for k, v := range keywords { + m[v] = k + } + return m +}() + +func NewLexer(input io.Reader) *Lexer { + l := &Lexer{ + input: bufio.NewReader(input), + curPos: Pos{ + ByteOfs: 0, + Line: 1, + Col: 1, + }, + } + // Fetch the first rune. + l.readNextRune() + + // Fetch the first token. + l.Next() + return l +} + +func (l *Lexer) Token() Token { + return l.token +} + +// Next reads the input for the next token. After that Token() will return +// the token that was read. +// +// If Lexer input is at EOF then the next Token() call will return tEOF. +// If reading failed (e.g. if the input is not valid UTF8) the next Token() call +// will return tError. 
+func (l *Lexer) Next() { + l.prevPos = l.curPos + + l.skipWhiteSpaceOrComment() + + if l.isEOF { + l.token = tEOF + return + } else if l.isError { + l.token = tError + l.isError = false + return + } + + switch l.nextRune { + case tLParen: + l.token = tLParen + case tRParen: + l.token = tRParen + case tLBracket: + l.token = tLBracket + case tRBracket: + l.token = tRBracket + case tRBrace: + l.token = tRBrace + case tLBrace: + l.token = tLBrace + default: + if unicode.IsLetter(l.nextRune) { + // This is a letter. It must a start of an identifier or keyword. + l.readIdentOrKeyword() + return + } + l.token = tError + l.errMsg = fmt.Sprintf("invalid character: %c", l.nextRune) + } + l.readNextRune() +} + +func (l *Lexer) skipWhiteSpaceOrComment() { + for !l.isEOF && !l.isError { + if unicode.IsSpace(l.nextRune) { + l.readNextRune() + } else if l.nextRune == '/' { + l.skipComment() + } else { + break + } + } +} + +func (l *Lexer) skipComment() { + l.readNextRune() + if l.isEOF || l.isError || l.nextRune != '/' { + l.token = tError + l.errMsg = "expected start of comment" + return + } + + for !l.isEOF && !l.isError && l.nextRune != '\r' && l.nextRune != '\n' { + l.readNextRune() + } +} + +func (l *Lexer) readNextRune() { + nextRune, size, err := l.input.ReadRune() + if err != nil { + if errors.Is(err, io.EOF) { + l.isEOF = true + } else { + l.isError = true + l.errMsg = fmt.Sprintf("invalid character") + } + return + } + l.nextRune = nextRune + l.curPos.ByteOfs += uint(size) + l.curPos.Col++ + + // Handle any of CR,LF,CRLF as a new line. + const cCR = '\r' + const cLF = '\n' + if l.nextRune == cCR { + l.curPos.Line++ + l.curPos.Col = 1 + l.prevWasCR = true + } else if l.nextRune == cLF { + if !l.prevWasCR { + l.curPos.Line++ + l.curPos.Col = 1 + } + l.prevWasCR = false + } else { + l.prevWasCR = false + } +} + +func (l *Lexer) readIdentOrKeyword() Token { + l.identRunes = l.identRunes[:0] + + // The first character is already read. 
Subsequent characters must be + // letters, digits or underscore. + for (unicode.IsLetter(l.nextRune) || unicode.IsDigit(l.nextRune) || l.nextRune == '_') && !l.isError { + l.identRunes = append(l.identRunes, l.nextRune) + l.readNextRune() + if l.isEOF { + break + } + if l.isError { + l.token = tError + return tError + } + } + + l.ident = string(l.identRunes) + + // Check if it is a keyword. + if token, ok := keywords[l.ident]; ok { + l.token = token + return token + } + + l.token = tIdent + return tIdent +} + +// Ident will return the identifier if the current token is tIdent. +// Use Token() first. +func (l *Lexer) Ident() string { + return l.ident +} + +// TokenStartPos will return the starting position of the last read +// token after Next() call. +func (l *Lexer) TokenStartPos() Pos { + return l.prevPos +} diff --git a/go/pkg/idl/lexer_test.go b/go/pkg/idl/lexer_test.go new file mode 100644 index 0000000..1362cba --- /dev/null +++ b/go/pkg/idl/lexer_test.go @@ -0,0 +1,52 @@ +package idl + +import ( + "bytes" + "os" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestLexer(t *testing.T) { + l := NewLexer(bytes.NewBufferString("struct abc {}")) + + tokens := []Token{tStruct, tIdent, tLBrace, tRBrace, tEOF} + i := 0 + for { + token := l.Token() + assert.Equal(t, tokens[i], token, i) + i++ + if token == tEOF { + break + } + l.Next() + } +} + +func FuzzLexer(f *testing.F) { + f.Add([]byte(nil)) + f.Add([]byte("")) + f.Add([]byte("struct abc {}")) + + testFiles := []string{"testdata/example.stef", "testdata/otel.stef"} + for _, file := range testFiles { + content, err := os.ReadFile(file) + require.NoError(f, err) + f.Add(content) + } + + f.Fuzz( + func(t *testing.T, content []byte) { + l := NewLexer(bytes.NewBuffer(content)) + for { + token := l.Token() + if token == tEOF || token == tError { + break + } + l.Next() + } + }, + ) +} diff --git a/go/pkg/idl/parser.go b/go/pkg/idl/parser.go new file mode 100644 
index 0000000..5f78bc3 --- /dev/null +++ b/go/pkg/idl/parser.go @@ -0,0 +1,411 @@ +package idl + +import ( + "fmt" + + "github.com/splunk/stef/go/pkg/schema" +) + +// Parser parses a STEF IDL input into Schema. +// +// This is a recursive descent parser with separate lexer for tokenization. +type Parser struct { + lexer *Lexer + schema *schema.Schema + fileName string +} + +// Error represents a parsing error. +type Error struct { + Msg string + Filename string + Pos Pos +} + +func (e *Error) Error() string { + return fmt.Sprintf("%s:%d:%d: %s", e.Filename, e.Pos.Line, e.Pos.Col, e.Msg) +} + +var _ error = (*Error)(nil) + +// NewParser creates a new parser with specified lexer as the input. +// fileName is used for composing error messages (if any). +func NewParser(lexer *Lexer, fileName string) *Parser { + p := &Parser{fileName: fileName} + p.lexer = lexer + p.schema = &schema.Schema{} + return p +} + +// Schema returns the parsed Schema, assuming Parse() returned nil. +func (p *Parser) Schema() *schema.Schema { + return p.schema +} + +// Parse an IDL input into Schema. +// Will return an error if the input syntax is invalid. 
+func (p *Parser) Parse() error { + p.schema = &schema.Schema{ + Structs: map[string]*schema.Struct{}, + Multimaps: map[string]*schema.Multimap{}, + } + + if err := p.parsePackage(); err != nil { + return err + } + + for { + var err error + switch p.lexer.Token() { + case tStruct: + _, err = p.parseStruct() + case tOneof: + err = p.parseOneof() + case tMultimap: + err = p.parseMultimap() + default: + return p.error("expected struct, oneof or multimap") + } + if err != nil { + return err + } + if p.lexer.Token() == tEOF { + break + } + } + return p.resolveFieldTypes() +} + +func (p *Parser) parseStruct() (*schema.Struct, error) { + p.lexer.Next() // skip "struct" + + if p.lexer.Token() != tIdent { + return nil, p.error("struct name expected") + } + structName := p.lexer.Ident() + p.lexer.Next() + + str := &schema.Struct{ + Name: structName, + } + p.schema.Structs[str.Name] = str + + if err := p.parseStructModifiers(str); err != nil { + return nil, err + } + + if err := p.eat(tLBrace); err != nil { + return nil, err + } + + if err := p.parseStructFields(str); err != nil { + return nil, err + } + + if err := p.eat(tRBrace); err != nil { + return nil, err + } + + return str, nil +} + +func (p *Parser) parseOneof() error { + // "oneof" syntax is identical to struct, except we need to set "OneOf" flag. + str, err := p.parseStruct() + if err != nil { + return err + } + str.OneOf = true + return nil +} + +func (p *Parser) parseMultimap() error { + p.lexer.Next() // skip "multimap" + + if p.lexer.Token() != tIdent { + return p.error("multimap name expected") + } + multimapName := p.lexer.Ident() + p.lexer.Next() + + mm := &schema.Multimap{ + Name: multimapName, + } + p.schema.Multimaps[mm.Name] = mm + + if err := p.eat(tLBrace); err != nil { + return err + } + + // Parse the key. + if err := p.eat(tKey); err != nil { + return err + } + if err := p.parseMultimapField(&mm.Key); err != nil { + return err + } + + // Parse the value. 
+ if err := p.eat(tValue); err != nil { + return err + } + if err := p.parseMultimapField(&mm.Value); err != nil { + return err + } + + if err := p.eat(tRBrace); err != nil { + return err + } + + return nil +} + +func (p *Parser) error(msg string) error { + return &Error{ + Msg: msg, + Filename: p.fileName, + Pos: p.lexer.TokenStartPos(), + } +} + +func (p *Parser) parseStructModifiers(str *schema.Struct) error { + for { + err, ok := p.parseStructModifier(str) + if err != nil { + return err + } + if !ok { + break + } + } + return nil +} + +func (p *Parser) parseStructModifier(str *schema.Struct) (error, bool) { + switch p.lexer.Token() { + case tDict: + dictName, err := p.parseDictModifier() + if err != nil { + return err, false + } + str.DictName = dictName + case tRoot: + str.IsRoot = true + p.lexer.Next() + default: + return nil, false + } + return nil, false +} + +func (p *Parser) parseDictModifier() (string, error) { + p.lexer.Next() // skip "dict" + + if err := p.eat(tLParen); err != nil { + return "", err + } + + if p.lexer.Token() != tIdent { + return "", p.error("dict name expected") + } + dictName := p.lexer.Ident() + p.lexer.Next() + + if err := p.eat(tRParen); err != nil { + return "", err + } + return dictName, nil +} + +// eat checks that the current token is the expected one and skips it. 
+func (p *Parser) eat(token Token) error { + if p.lexer.Token() != token { + return p.error(fmt.Sprintf("expected %s but got %s", token, p.lexer.Token())) + } + p.lexer.Next() + return nil +} + +func (p *Parser) parseStructFields(str *schema.Struct) error { + for { + err, ok := p.parseStructField(str) + if err != nil { + return err + } + if !ok { + break + } + } + return nil +} + +func (p *Parser) parseStructField(str *schema.Struct) (error, bool) { + if p.lexer.Token() != tIdent { + return nil, false + } + + str.Fields = append(str.Fields, schema.StructField{Name: p.lexer.Ident()}) + field := &str.Fields[len(str.Fields)-1] + + p.lexer.Next() + + if err := p.parseFieldType(&field.FieldType); err != nil { + return err, false + } + if err := p.parseStructFieldModifiers(field); err != nil { + return err, false + } + + return nil, true +} + +func (p *Parser) parseFieldType(field *schema.FieldType) error { + isArray := false + if p.lexer.Token() == tLBracket { + isArray = true + p.lexer.Next() + // We expect a matching right bracket. + if err := p.eat(tRBracket); err != nil { + return err + } + } + + ft := schema.FieldType{} + switch p.lexer.Token() { + case tIdent: + // Temporarily store in "Struct", but this may also be a oneof or multimap. + // We will resolve to the correct type it later, after all input is read, + // since it may be a forward reference. 
+ ft.Struct = p.lexer.Ident() + + case tBool: + v := schema.PrimitiveTypeBool + ft.Primitive = &v + + case tInt64: + v := schema.PrimitiveTypeInt64 + ft.Primitive = &v + + case tUint64: + v := schema.PrimitiveTypeUint64 + ft.Primitive = &v + + case tFloat64: + v := schema.PrimitiveTypeFloat64 + ft.Primitive = &v + + case tString: + v := schema.PrimitiveTypeString + ft.Primitive = &v + + case tBytes: + v := schema.PrimitiveTypeBytes + ft.Primitive = &v + + default: + if isArray { + return p.error("type specifier expected after []") + } + return nil + } + p.lexer.Next() + + if isArray { + field.Array = &ft + } else { + *field = ft + } + + return nil +} + +func (p *Parser) parseStructFieldModifiers(field *schema.StructField) error { + for { + err, ok := p.parseStructFieldModifier(field) + if err != nil { + return err + } + if !ok { + break + } + } + return nil +} + +func (p *Parser) parseStructFieldModifier(field *schema.StructField) (error, bool) { + switch p.lexer.Token() { + case tDict: + dictName, err := p.parseDictModifier() + if err != nil { + return err, false + } + field.DictName = dictName + return nil, true + case tOptional: + field.Optional = true + p.lexer.Next() + return nil, true + default: + return nil, false + } +} + +func (p *Parser) parseMultimapField(field *schema.MultimapField) error { + if err := p.parseFieldType(&field.Type); err != nil { + return err + } + + if p.lexer.Token() == tDict { + dictName, err := p.parseDictModifier() + if err != nil { + return err + } + field.Type.DictName = dictName + } + + return nil +} + +func (p *Parser) resolveFieldTypes() error { + for _, v := range p.schema.Structs { + for i := range v.Fields { + field := &v.Fields[i] + if err := p.resolveFieldType(&field.FieldType); err != nil { + return err + } + } + } + for _, v := range p.schema.Multimaps { + if err := p.resolveFieldType(&v.Key.Type); err != nil { + return err + } + if err := p.resolveFieldType(&v.Value.Type); err != nil { + return err + } + } + return nil 
+} + +func (p *Parser) resolveFieldType(fieldType *schema.FieldType) error { + if fieldType.Struct != "" { + _, ok := p.schema.Multimaps[fieldType.Struct] + if ok { + fieldType.MultiMap = fieldType.Struct + fieldType.Struct = "" + } + } + return nil +} + +func (p *Parser) parsePackage() error { + if p.lexer.Token() == tPackage { + p.lexer.Next() // skip "package" + + if p.lexer.Token() != tIdent { + return p.error("package name expected") + } + p.schema.PackageName = p.lexer.Ident() + p.lexer.Next() + } + return nil +} diff --git a/go/pkg/idl/parser_test.go b/go/pkg/idl/parser_test.go new file mode 100644 index 0000000..ecdbb92 --- /dev/null +++ b/go/pkg/idl/parser_test.go @@ -0,0 +1,103 @@ +package idl + +import ( + "bytes" + "encoding/json" + "os" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/splunk/stef/go/pkg/schema" +) + +func TestParserErrors(t *testing.T) { + tests := []struct { + input string + err string + }{ + { + input: "package ", + err: "test.stef:1:9: package name expected", + }, + { + input: "package abc\nhello", + err: "test.stef:2:1: expected struct, oneof or multimap", + }, + { + input: "package abc\nstruct string", + err: "test.stef:2:8: struct name expected", + }, + { + input: "package abc\nmultimap [", + err: "test.stef:2:10: multimap name expected", + }, + { + input: "package abc\nstruct MyStruct dict()", + err: "test.stef:2:23: dict name expected", + }, + { + input: "package abc\nstruct MyStruct dict[]", + err: "test.stef:2:22: expected ( but got [", + }, + { + input: "package abc\nstruct MyStruct {\nField []struct", + err: "test.stef:3:10: type specifier expected after []", + }, + } + + for _, test := range tests { + lexer := NewLexer(bytes.NewBufferString(test.input)) + parser := NewParser(lexer, "test.stef") + err := parser.Parse() + require.Error(t, err) + require.Equal(t, test.err, err.Error()) + } +} + +func TestParseExample(t *testing.T) { + inputFile := "testdata/example.stef" + idlBytes, err := 
os.ReadFile(inputFile) + require.NoError(t, err) + + lexer := NewLexer(bytes.NewBuffer(idlBytes)) + parser := NewParser(lexer, inputFile) + err = parser.Parse() + require.NoError(t, err) +} + +func TestParserOtelSTEF(t *testing.T) { + inputFile := "testdata/otel.stef" + idlBytes, err := os.ReadFile(inputFile) + require.NoError(t, err) + + lexer := NewLexer(bytes.NewBuffer(idlBytes)) + parser := NewParser(lexer, inputFile) + err = parser.Parse() + require.NoError(t, err) + + jsonBytes, err := os.ReadFile("testdata/oteltef.wire.json") + require.NoError(t, err) + + var schem schema.Schema + err = json.Unmarshal(jsonBytes, &schem) + require.NoError(t, err) + + require.EqualValues(t, &schem, parser.Schema()) +} + +func FuzzParser(f *testing.F) { + testFiles := []string{"testdata/example.stef", "testdata/otel.stef"} + for _, file := range testFiles { + content, err := os.ReadFile(file) + require.NoError(f, err) + f.Add(content) + } + + f.Fuzz( + func(t *testing.T, content []byte) { + p := NewParser(NewLexer(bytes.NewBuffer(content)), "temp.stef") + _ = p.Parse() + }, + ) +} diff --git a/go/pkg/idl/testdata/example.stef b/go/pkg/idl/testdata/example.stef new file mode 100644 index 0000000..632e8fd --- /dev/null +++ b/go/pkg/idl/testdata/example.stef @@ -0,0 +1,38 @@ +// Records of events that happened with books. This is the main record struct. +struct BookRecords root { + Book Book // Which book the event is about. + Event BookEvent // The event that happened. +} + +struct Book { + Title string // The title of the book. + PublishedOn Date // When was it published. + Publisher string dict(Publisher) // Publishers name, encoded with a dict. + Authors []Person // Zero or more authors of the book. +} + +// BookEvent describes either a checkout or a checkin event. +oneof BookEvent { + Checkout CheckoutEvent + Checkin CheckinEvent +} + +struct CheckoutEvent { + Date Date // when was it checked out + Person Person // who checked out the book. 
+} + +struct CheckinEvent { + Date Date + DamageValue float64 optional // Amount of damage assessed for the book. +} + +struct Person { + Name string +} + +struct Date { + Year uint64 + Month uint64 + Day uint64 +} diff --git a/go/pkg/idl/testdata/otel.stef b/go/pkg/idl/testdata/otel.stef new file mode 100755 index 0000000..5f15f92 --- /dev/null +++ b/go/pkg/idl/testdata/otel.stef @@ -0,0 +1,151 @@ +// This is Otel/STEF schema: a representation of OpenTelemetry data model +// as STEF records. Data model is virtually a direct mapping from OpenTelemetry +// Protobuf IDL for metrics and traces, see: https://github.com/open-telemetry/opentelemetry-proto/tree/main/opentelemetry/proto +// TODO: add Logs and Profiles. + +package oteltef + +multimap Attributes { + key string dict(AttributeKey) + value AnyValue +} + +multimap EnvelopeAttributes { + key string + value bytes +} + +multimap KeyValueList { + key string + value AnyValue +} + +struct Resource dict(Resource) { + // All SchemaURL fields use the same (shared) dict. + SchemaURL string dict(SchemaURL) + Attributes Attributes + DroppedAttributesCount uint64 +} + +// Point represents a metric data point. +struct Point { + StartTimestamp uint64 + Timestamp uint64 + Value PointValue + Exemplars []Exemplar +} + +struct Span { + TraceID bytes + SpanID bytes + TraceState string + ParentSpanID bytes + Flags uint64 + Name string dict(SpanName) + Kind uint64 + StartTimeUnixNano uint64 + EndTimeUnixNano uint64 + Attributes Attributes + DroppedAttributesCount uint64 + Events []Event + Links []Link + Status SpanStatus +} + +oneof PointValue { + Int64 int64 + Float64 float64 + Histogram HistogramValue + // TODO: Add Summary and Exponential Histogram value support. 
+} + +struct Metric dict(Metric) { + Name string dict(MetricName) + Description string dict(MetricDescription) + Unit string dict(MetricUnit) + Type uint64 + Metadata Attributes + HistogramBounds []float64 + AggregationTemporality uint64 + Monotonic bool +} + +struct Metrics root { + Envelope Envelope + Metric Metric + Resource Resource + Scope Scope + Attributes Attributes + Point Point +} + +struct Scope dict(Scope) { + Name string dict(ScopeName) + Version string dict(ScopeVersion) + SchemaURL string dict(SchemaURL) + Attributes Attributes + DroppedAttributesCount uint64 +} + +struct Link { + TraceID bytes + SpanID bytes + TraceState string + Flags uint64 + Attributes Attributes + DroppedAttributesCount uint64 +} + +struct HistogramValue { + Count int64 + Sum float64 optional + Min float64 optional + Max float64 optional + BucketCounts []int64 +} + +oneof AnyValue { + String string dict(AnyValueString) + Bool bool + Int64 int64 + Float64 float64 + Array []AnyValue + KVList KeyValueList + Bytes bytes +} + +struct Event { + Name string dict(SpanEventName) + TimeUnixNano uint64 + Attributes Attributes + DroppedAttributesCount uint64 +} + +struct SpanStatus { + Message string + Code uint64 +} + +struct Spans root { + Envelope Envelope + Resource Resource + Scope Scope + Span Span +} + +struct Envelope { + Attributes EnvelopeAttributes +} + +struct Exemplar { + Timestamp uint64 + Value ExemplarValue + SpanID bytes dict(Span) + TraceID bytes dict(Trace) + FilteredAttributes Attributes +} + +oneof ExemplarValue { + Int64 int64 + Float64 float64 +} diff --git a/go/pkg/idl/testdata/oteltef.wire.json b/go/pkg/idl/testdata/oteltef.wire.json new file mode 100755 index 0000000..ea938a5 --- /dev/null +++ b/go/pkg/idl/testdata/oteltef.wire.json @@ -0,0 +1,487 @@ +{ + "package": "oteltef", + "structs": { + "AnyValue": { + "name": "AnyValue", + "oneof": true, + "fields": [ + { + "primitive": 4, + "dict": "AnyValueString", + "name": "String" + }, + { + "primitive": 3, + "name": 
"Bool" + }, + { + "primitive": 0, + "name": "Int64" + }, + { + "primitive": 2, + "name": "Float64" + }, + { + "array": { + "struct": "AnyValue" + }, + "name": "Array", + "recursive": true + }, + { + "multimap": "KeyValueList", + "name": "KVList", + "recursive": true + }, + { + "primitive": 5, + "name": "Bytes" + } + ] + }, + "Envelope": { + "name": "Envelope", + "fields": [ + { + "multimap": "EnvelopeAttributes", + "name": "Attributes" + } + ] + }, + "Exemplar": { + "name": "Exemplar", + "fields": [ + { + "primitive": 1, + "name": "Timestamp" + }, + { + "struct": "ExemplarValue", + "name": "Value" + }, + { + "primitive": 5, + "dict": "Span", + "name": "SpanID" + }, + { + "primitive": 5, + "dict": "Trace", + "name": "TraceID" + }, + { + "multimap": "Attributes", + "name": "FilteredAttributes" + } + ] + }, + "ExemplarValue": { + "name": "ExemplarValue", + "oneof": true, + "fields": [ + { + "primitive": 0, + "name": "Int64" + }, + { + "primitive": 2, + "name": "Float64" + } + ] + }, + "HistogramValue": { + "name": "HistogramValue", + "fields": [ + { + "primitive": 0, + "name": "Count" + }, + { + "primitive": 2, + "name": "Sum", + "optional": true + }, + { + "primitive": 2, + "name": "Min", + "optional": true + }, + { + "primitive": 2, + "name": "Max", + "optional": true + }, + { + "array": { + "primitive": 0 + }, + "name": "BucketCounts" + } + ] + }, + "Metric": { + "name": "Metric", + "dict": "Metric", + "fields": [ + { + "primitive": 4, + "dict": "MetricName", + "name": "Name" + }, + { + "primitive": 4, + "dict": "MetricDescription", + "name": "Description" + }, + { + "primitive": 4, + "dict": "MetricUnit", + "name": "Unit" + }, + { + "primitive": 1, + "name": "Type" + }, + { + "multimap": "Attributes", + "name": "Metadata" + }, + { + "array": { + "primitive": 2 + }, + "name": "HistogramBounds" + }, + { + "primitive": 1, + "name": "AggregationTemporality" + }, + { + "primitive": 3, + "name": "Monotonic" + } + ] + }, + "Point": { + "name": "Point", + "fields": [ + { 
+ "primitive": 1, + "name": "StartTimestamp" + }, + { + "primitive": 1, + "name": "Timestamp" + }, + { + "struct": "PointValue", + "name": "Value" + }, + { + "array": { + "struct": "Exemplar" + }, + "name": "Exemplars" + } + ] + }, + "PointValue": { + "name": "PointValue", + "oneof": true, + "fields": [ + { + "primitive": 0, + "name": "Int64" + }, + { + "primitive": 2, + "name": "Float64" + }, + { + "struct": "HistogramValue", + "name": "Histogram" + } + ] + }, + "Metrics": { + "name": "Metrics", + "root": true, + "fields": [ + { + "struct": "Envelope", + "name": "Envelope" + }, + { + "struct": "Metric", + "name": "Metric" + }, + { + "struct": "Resource", + "name": "Resource" + }, + { + "struct": "Scope", + "name": "Scope" + }, + { + "multimap": "Attributes", + "name": "Attributes" + }, + { + "struct": "Point", + "name": "Point" + } + ] + }, + "Resource": { + "name": "Resource", + "dict": "Resource", + "fields": [ + { + "primitive": 4, + "dict": "SchemaURL", + "name": "SchemaURL" + }, + { + "multimap": "Attributes", + "name": "Attributes" + }, + { + "name": "DroppedAttributesCount", + "primitive": 1 + } + ] + }, + "Scope": { + "name": "Scope", + "dict": "Scope", + "fields": [ + { + "primitive": 4, + "dict": "ScopeName", + "name": "Name" + }, + { + "primitive": 4, + "dict": "ScopeVersion", + "name": "Version" + }, + { + "primitive": 4, + "dict": "SchemaURL", + "name": "SchemaURL" + }, + { + "multimap": "Attributes", + "name": "Attributes" + }, + { + "name": "DroppedAttributesCount", + "primitive": 1 + } + ] + }, + + "Span": { + "name": "Span", + "fields": [ + { + "primitive": 5, + "name": "TraceID" + }, + { + "primitive": 5, + "name": "SpanID" + }, + { + "primitive": 4, + "name": "TraceState" + }, + { + "primitive": 5, + "name": "ParentSpanID" + }, + { + "primitive": 1, + "name": "Flags" + }, + { + "primitive": 4, + "dict": "SpanName", + "name": "Name" + }, + { + "primitive": 1, + "name": "Kind" + }, + { + "primitive": 1, + "name": "StartTimeUnixNano" + }, + { + 
"primitive": 1, + "name": "EndTimeUnixNano" + }, + { + "multimap": "Attributes", + "name": "Attributes" + }, + { + "name": "DroppedAttributesCount", + "primitive": 1 + }, + { + "array": { + "struct": "Event" + }, + "name": "Events" + }, + { + "array": { + "struct": "Link" + }, + "name": "Links" + }, + { + "struct": "SpanStatus", + "name": "Status" + } + ] + }, + + "Link": { + "name": "Link", + "fields": [ + { + "primitive": 5, + "name": "TraceID" + }, + { + "primitive": 5, + "name": "SpanID" + }, + { + "primitive": 4, + "name": "TraceState" + }, + { + "primitive": 1, + "name": "Flags" + }, + { + "multimap": "Attributes", + "name": "Attributes" + }, + { + "name": "DroppedAttributesCount", + "primitive": 1 + } + ] + }, + + "Event": { + "name": "Event", + "fields": [ + { + "primitive": 4, + "dict": "SpanEventName", + "name": "Name" + }, + { + "primitive": 1, + "name": "TimeUnixNano" + }, + { + "multimap": "Attributes", + "name": "Attributes" + }, + { + "name": "DroppedAttributesCount", + "primitive": 1 + } + ] + }, + + "SpanStatus": { + "name": "SpanStatus", + "fields": [ + { + "primitive": 4, + "name": "Message" + }, + { + "primitive": 1, + "name": "Code" + } + ] + }, + + "Spans": { + "name": "Spans", + "root": true, + "fields": [ + { + "struct": "Envelope", + "name": "Envelope" + }, + { + "struct": "Resource", + "name": "Resource" + }, + { + "struct": "Scope", + "name": "Scope" + }, + { + "struct": "Span", + "name": "Span" + } + ] + } + }, + "multimaps": { + "Attributes": { + "name": "Attributes", + "key": { + "type": { + "primitive": 4, + "dict": "AttributeKey" + } + }, + "value": { + "type": { + "struct": "AnyValue" + } + } + }, + "EnvelopeAttributes": { + "name": "EnvelopeAttributes", + "key": { + "type": { + "primitive": 4 + } + }, + "value": { + "type": { + "primitive": 5 + } + } + }, + "KeyValueList": { + "name": "KeyValueList", + "key": { + "type": { + "primitive": 4 + } + }, + "value": { + "type": { + "struct": "AnyValue" + }, + "recursive": true + } + } + 
}, + "main": "Metrics" +} \ No newline at end of file diff --git a/go/pkg/schema/schema.go b/go/pkg/schema/schema.go index 2149730..1156182 100644 --- a/go/pkg/schema/schema.go +++ b/go/pkg/schema/schema.go @@ -9,7 +9,6 @@ type Schema struct { PackageName string `json:"package,omitempty"` Structs map[string]*Struct `json:"structs"` Multimaps map[string]*Multimap `json:"multimaps"` - MainStruct string `json:"main"` } type Compatibility int @@ -23,14 +22,6 @@ const ( // Compatible checks backward compatibility of this schema with oldSchema. // If the schemas are incompatible returns CompatibilityIncompatible and an error. func (d *Schema) Compatible(oldSchema *Schema) (Compatibility, error) { - if d.MainStruct != oldSchema.MainStruct { - return CompatibilityIncompatible, - fmt.Errorf( - "mismatched main structure names (old=%s, new=%s)", - oldSchema.MainStruct, d.MainStruct, - ) - } - // Exact compatibility is only possible if the number of structs is exactly the same. exact := len(d.Structs) == len(oldSchema.Structs) @@ -184,9 +175,8 @@ func isCompatibleFieldType( // are excluded. 
func (d *Schema) PrunedForRoot(rootStructName string) (*Schema, error) { out := Schema{ - Structs: map[string]*Struct{}, - Multimaps: map[string]*Multimap{}, - MainStruct: rootStructName, + Structs: map[string]*Struct{}, + Multimaps: map[string]*Multimap{}, } if err := d.copyPrunedStruct(rootStructName, &out); err != nil { return nil, err diff --git a/go/pkg/schema/schema_test.go b/go/pkg/schema/schema_test.go index 9d1bf13..bb9a0b2 100644 --- a/go/pkg/schema/schema_test.go +++ b/go/pkg/schema/schema_test.go @@ -97,7 +97,6 @@ func TestSchemaSelfCompatible(t *testing.T) { Structs: map[string]*Struct{ "Root": {Name: "Root"}, }, - MainStruct: "Root", }, { PackageName: "pkg", @@ -119,7 +118,6 @@ func TestSchemaSelfCompatible(t *testing.T) { Value: MultimapField{Type: FieldType{Primitive: &p}}, }, }, - MainStruct: "Root", }, } @@ -155,8 +153,7 @@ func TestSchemaSuperset(t *testing.T) { }, }, }, - Multimaps: nil, - MainStruct: "Root", + Multimaps: nil, }, new: &Schema{ PackageName: "def", @@ -179,8 +176,7 @@ func TestSchemaSuperset(t *testing.T) { }, }, }, - Multimaps: nil, - MainStruct: "Root", + Multimaps: nil, }, }, { @@ -253,7 +249,6 @@ func TestSchemaSuperset(t *testing.T) { Value: MultimapField{Type: FieldType{Primitive: &primitiveTypeString}}, }, }, - MainStruct: "Root", }, new: &Schema{ PackageName: "def", @@ -347,7 +342,6 @@ func TestSchemaSuperset(t *testing.T) { Value: MultimapField{Type: FieldType{Primitive: &primitiveTypeString}}, }, }, - MainStruct: "Root2", }, }, } @@ -392,8 +386,7 @@ func TestSchemaIncompatible(t *testing.T) { }, }, }, - Multimaps: nil, - MainStruct: "Root", + Multimaps: nil, }, new: &Schema{ PackageName: "def", @@ -410,7 +403,6 @@ func TestSchemaIncompatible(t *testing.T) { }, }, }, - MainStruct: "Root", }, err: "struct Root has fewer fields in new schema (1 vs 2)", }, @@ -428,7 +420,7 @@ func TestSchemaIncompatible(t *testing.T) { } func expandSchema(t *testing.T, r *rand.Rand, orig *Schema) (cpy *Schema) { - cpy, err := 
orig.PrunedForRoot(orig.MainStruct) + cpy, err := orig.PrunedForRoot("Metrics") require.NoError(t, err) for { for _, str := range cpy.Structs { @@ -509,7 +501,7 @@ func expandStruct(t *testing.T, r *rand.Rand, schema *Schema, str *Struct) bool } func shrinkSchema(t *testing.T, r *rand.Rand, orig *Schema) (cpy *Schema) { - cpy, err := orig.PrunedForRoot(orig.MainStruct) + cpy, err := orig.PrunedForRoot("Metrics") require.NoError(t, err) for { for _, str := range cpy.Structs { @@ -548,7 +540,7 @@ func TestSchemaExpand(t *testing.T) { orig := &Schema{} err = json.Unmarshal(schemaJson, &orig) require.NoError(t, err) - orig, err = orig.PrunedForRoot(orig.MainStruct) + orig, err = orig.PrunedForRoot("Metrics") require.NoError(t, err) r := rand.New(rand.NewSource(42)) diff --git a/go/pkg/schema/stef.abnf b/go/pkg/schema/stef.abnf deleted file mode 100644 index ee16bdb..0000000 --- a/go/pkg/schema/stef.abnf +++ /dev/null @@ -1,65 +0,0 @@ -; Validated by https://author-tools.ietf.org/abnf - -schema = struct oneof multimap - -; struct rules - -struct = "struct" struct-name *struct-modifier "{" struct-body "}" - -struct-name = identifier - -identifier = ALPHA *(ALPHA / DIGIT) - -struct-modifier = dict-modifier / "main" - -dict-modifier = "(" identifier ")" - -struct-body = 1*struct-field - -struct-field = identifier field-type [struct-field-modifier] - -struct-field-modifier = dict-modifier / optional-modifier - -optional-modifier = "optional" - -; oneof rules - -oneof = "oneof" oneof-name "{" oneof-body "}" - -oneof-name = identifier - -oneof-body = 1*oneof-field - -oneof-field = identifier field-type [oneof-field-modifier] - -oneof-field-modifier = dict-modifier - -; multimap rules - -multimap = "multimap" multimap-name "{" multimap-body "}" - -multimap-name = identifier - -multimap-body = multimap-key multimap-value - -multimap-key = "Key" field-type [multimap-field-modifier] - -multimap-value = "Value" field-type [multimap-field-modifier] - -multimap-field-modifier = 
dict-modifier - -; type definition - -field-type = [array-specifier] builtin-type / userdefined-type - -array-specifier = "[]" - -builtin-type = "bool" / "int64" / "float64" / "string" / "bytes" - -userdefined-type = identifier - -; core rules - -ALPHA = %x41-5A / %x61-7A ; A-Z / a-z - -DIGIT = %x30-39 ; 0-9 diff --git a/go/pkg/schema/testdata/oteltef.wire.json b/go/pkg/schema/testdata/oteltef.wire.json index d9cd4ad..ea938a5 100755 --- a/go/pkg/schema/testdata/oteltef.wire.json +++ b/go/pkg/schema/testdata/oteltef.wire.json @@ -246,6 +246,10 @@ { "multimap": "Attributes", "name": "Attributes" + }, + { + "name": "DroppedAttributesCount", + "primitive": 1 } ] }, @@ -271,6 +275,10 @@ { "multimap": "Attributes", "name": "Attributes" + }, + { + "name": "DroppedAttributesCount", + "primitive": 1 } ] }, @@ -319,6 +327,10 @@ "multimap": "Attributes", "name": "Attributes" }, + { + "name": "DroppedAttributesCount", + "primitive": 1 + }, { "array": { "struct": "Event" @@ -360,6 +372,10 @@ { "multimap": "Attributes", "name": "Attributes" + }, + { + "name": "DroppedAttributesCount", + "primitive": 1 } ] }, @@ -379,6 +395,10 @@ { "multimap": "Attributes", "name": "Attributes" + }, + { + "name": "DroppedAttributesCount", + "primitive": 1 } ] }, diff --git a/stef-spec/specification.md b/stef-spec/specification.md index a82f84b..3bc8ec5 100644 --- a/stef-spec/specification.md +++ b/stef-spec/specification.md @@ -169,8 +169,8 @@ oneof PointValue { } multimap Attributes { - Key string - Value string + key string + value string } ``` @@ -271,8 +271,8 @@ above `Measurement` example by the following tree: root Measurement |- MetricName string |- Attributes Attributes - | |- Key string - | |- Value string + | |- key string + | |- value string |- Timestamp uint64 |- Value PointValue |- Int64 int64 @@ -313,8 +313,8 @@ oneof PointValue { } multimap Attributes { - Key string - Value AnyValue + key string + value AnyValue } oneof AnyValue { @@ -324,8 +324,8 @@ oneof AnyValue { } multimap 
KVList { - Key string - Value AnyValue + key string + value AnyValue } ``` @@ -336,13 +336,13 @@ and KVList mutually refer to each other. The corresponding schema tree looks lik root Measurement |- MetricName string |- Attributes Attributes - | |- Key string - | |- Value AnyValue + | |- key string + | |- value AnyValue | |- String string | |- Array []AnyValue <--- loop detected here, backtrack. Non-primitive leaf. | |- KVList KVList - | |- Key string - | |- Value AnyValue <--- loop detected here, backtrack. Non-primitive leaf. + | |- key string + | |- value AnyValue <--- loop detected here, backtrack. Non-primitive leaf. |- Timestamp uint64 |- Value PointValue |- Int64 int64 @@ -365,7 +365,7 @@ Secondly, because the schema allows recursive types a record may contain more th value associated with the same node in the schema tree. Consider the following `AnyValue`: ``` -AnyValue = { KVList = { Key = "abc", Value = { AnyValue = { String = "xyx" } } } } +AnyValue = { KVList = { key = "abc", value = { AnyValue = { String = "xyx" } } } } ``` Represented as a tree this AnyValue can be laid out as: @@ -373,7 +373,7 @@ Represented as a tree this AnyValue can be laid out as: ``` AnyValue |- KVList - |- Key = "abc" + |- key = "abc" |- Value |- AnyValue |- String = "xyz" @@ -523,13 +523,13 @@ Tree Column Codec Type root Measurement 1 struct |- MetricName string 2 string |- Attributes Attributes 3 multimap - | |- Key string 4 string - | |- Value AnyValue 5 oneof + | |- key string 4 string + | |- value AnyValue 5 oneof | |- String string 6 string | |- Array []AnyValue 7 array | |- KVList KVList 8 multimap - | |- Key string 9 string - | |- Value AnyValue 10 oneof + | |- key string 9 string + | |- value AnyValue 10 oneof |- Timestamp uint64 11 uint64 |- Value PointValue 12 oneof |- Int64 int64 13 int64 diff --git a/stefgen/generator/compileschema.go b/stefgen/generator/compileschema.go index 70c8d1a..7df09ed 100644 --- a/stefgen/generator/compileschema.go +++ 
b/stefgen/generator/compileschema.go @@ -11,7 +11,6 @@ func compileSchema(src *schema.Schema) (*genSchema, error) { PackageName: src.PackageName, Structs: map[string]*genStructDef{}, Multimaps: map[string]*genMapDef{}, - MainStruct: src.MainStruct, } for name, struc := range src.Structs { @@ -26,8 +25,12 @@ func compileSchema(src *schema.Schema) (*genSchema, error) { return nil, err } - stack := recurseStack{asMap: map[string]bool{}} - computeRecursiveStruct(dst.Structs[dst.MainStruct], &stack) + for _, struc := range dst.Structs { + if struc.IsRoot { + stack := recurseStack{asMap: map[string]bool{}} + computeRecursiveStruct(struc, &stack) + } + } return dst, nil } diff --git a/stefgen/generator/generator.go b/stefgen/generator/generator.go index 58c87a9..590c9ea 100644 --- a/stefgen/generator/generator.go +++ b/stefgen/generator/generator.go @@ -108,7 +108,6 @@ func (g *Generator) oStartFile(fileName string) error { func (g *Generator) oTemplate(templateName, outputFileName string, data map[string]any) error { data["PackageName"] = g.compiledSchema.PackageName - data["MainStructName"] = g.compiledSchema.MainStruct if err := g.oStartFile(outputFileName); err != nil { return err diff --git a/stefgen/generator/genschema.go b/stefgen/generator/genschema.go index 183a2bd..2d07a97 100644 --- a/stefgen/generator/genschema.go +++ b/stefgen/generator/genschema.go @@ -12,7 +12,6 @@ type genSchema struct { PackageName string Structs map[string]*genStructDef Multimaps map[string]*genMapDef - MainStruct string } func (s *genSchema) SchemaStr() string { @@ -30,8 +29,8 @@ func (s *genSchema) SchemaStr() string { str += "struct" } str += " " + struc.Name - if struc.Name == s.MainStruct { - str += " main" + if struc.IsRoot { + str += " root" } if struc.Dict != "" { str += " dict(" + struc.Dict + ")" diff --git a/stefgen/generator/structs.go b/stefgen/generator/structs.go index cacc02b..b852336 100644 --- a/stefgen/generator/structs.go +++ b/stefgen/generator/structs.go @@ -74,7 
+74,7 @@ func (g *Generator) oStruct(str *genStructDef) error { "Fields": fields, "DictName": str.Dict, "Type": str, - "IsMainStruct": str.Name == g.compiledSchema.MainStruct, + "IsMainStruct": str.IsRoot, "OptionalFieldCount": optionalFieldIndex, }