Make bundle JSON schema modular with $defs (#1700)
## Changes
This PR makes sweeping changes to the way we generate and test the
bundle JSON schema. The main benefits are:

1. More modular JSON schema. Every definition in the schema is now one
level deep and points to references instead of inlining the entire
schema for a field (see the sketch after this list). This unblocks
PyDABs from taking a dependency on the JSON schema.

2. Generate the JSON schema during CLI code generation and stream the
pre-generated artifact directly, instead of computing it at runtime
whenever a user calls `databricks bundle schema`. This is nice because
we no longer need to embed a partial OpenAPI spec in the CLI. Down the
line, we can add a `Schema()` method to every struct in the Databricks
Go SDK and remove the dependency on the OpenAPI spec altogether. This
will become more important once we decouple Go SDK structs and methods
from the underlying APIs.

3. Add enum values for Go SDK fields to the JSON schema, giving better
autocompletion and validation for these fields. As a follow-up, we can
add enum values for non-Go-SDK enums as well (an internal ticket has
been created to track this).

4. Use "packageName.structName" as a key to read JSON schemas from the
OpenAPI spec for Go SDK structs. Before, we would use an unrolled
presentation of the JSON schema (stored in `bundle_descriptions.json`),
which was complex to parse and include in the final JSON schema output.
This also means loading values from the OpenAPI spec for `target` schema
works automatically and no longer needs custom code.
5. Support recursive types (eg: `for_each_task`). With us now using
$refs everywhere it's trivial to support.
6. Using complex variables would be invalid according to the schema
generated before this PR. Now that bug is fixed. In the future adding
more custom rules will be easier as well due to the single level nature
of the JSON schema.

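To make the new shape concrete, here is a rough hand-written sketch of
what the generated `jsonschema.json` now looks like. The definition
names, fields, and enum values below are illustrative rather than
copied from the real artifact: every type lives under `$defs` and is
referenced via `$ref`, enum values from the OpenAPI spec appear inline,
object and array fields gain a string alternative matching the
`${var.*}` interpolation pattern (so complex variables validate), and
recursive types simply reference their own definition.

```json
{
  "type": "object",
  "properties": {
    "resources": { "$ref": "#/$defs/Resources" }
  },
  "$defs": {
    "Resources": {
      "type": "object",
      "properties": {
        "jobs": {
          "anyOf": [
            { "$ref": "#/$defs/JobsMap" },
            { "type": "string", "pattern": "\\$\\{(var(\\.[a-zA-Z]+([-_]?[a-zA-Z0-9]+)*(\\[[0-9]+\\])*)+)\\}" }
          ]
        }
      }
    },
    "Task": {
      "type": "object",
      "properties": {
        "for_each_task": { "$ref": "#/$defs/ForEachTask" }
      }
    },
    "ForEachTask": {
      "type": "object",
      "properties": {
        "task": { "$ref": "#/$defs/Task" }
      }
    },
    "SomeEnumField": {
      "type": "string",
      "enum": ["VALUE_A", "VALUE_B"]
    }
  }
}
```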

Since this is a complete change of approach in how we generate the JSON
schema, there are a few (very minor) regressions worth calling out.
1. We lose a few custom descriptions for non-Go-SDK structs that were
part of `bundle_descriptions.json`. Support for those can be added as a
follow-up.
2. Since the final JSON schema is now a static artifact, we lose some
lead time on the signal that the JSON schema integration tests are
failing. That is acceptable, since the existing unit tests already
provide a lot of coverage.

## Tests
Unit tests. End-to-end tests are being added in #1726.

The previous unit tests were all deleted because they were bloated.
Effort was made to give the new unit tests (almost) equivalent coverage.
shreyas-goenka authored Sep 10, 2024
1 parent d3e221a commit 28b39cd
Showing 26 changed files with 6,731 additions and 9,880 deletions.
4 changes: 2 additions & 2 deletions .codegen.json
@@ -11,10 +11,10 @@
   "toolchain": {
     "required": ["go"],
     "post_generate": [
-      "go run ./bundle/internal/bundle/schema/main.go ./bundle/schema/docs/bundle_descriptions.json",
+      "go run ./bundle/internal/schema/*.go ./bundle/schema/jsonschema.json",
       "echo 'bundle/internal/tf/schema/\\*.go linguist-generated=true' >> ./.gitattributes",
       "echo 'go.sum linguist-generated=true' >> ./.gitattributes",
-      "echo 'bundle/schema/docs/bundle_descriptions.json linguist-generated=true' >> ./.gitattributes"
+      "echo 'bundle/schema/jsonschema.json linguist-generated=true' >> ./.gitattributes"
     ]
   }
 }
2 changes: 1 addition & 1 deletion .gitattributes
@@ -120,4 +120,4 @@ cmd/workspace/workspace-conf/workspace-conf.go linguist-generated=true
 cmd/workspace/workspace/workspace.go linguist-generated=true
 bundle/internal/tf/schema/\*.go linguist-generated=true
 go.sum linguist-generated=true
-bundle/schema/docs/bundle_descriptions.json linguist-generated=true
+bundle/schema/jsonschema.json linguist-generated=true
42 changes: 0 additions & 42 deletions bundle/internal/bundle/schema/main.go

This file was deleted.

93 changes: 93 additions & 0 deletions bundle/internal/schema/main.go
@@ -0,0 +1,93 @@
package main

import (
	"encoding/json"
	"fmt"
	"log"
	"os"
	"reflect"

	"github.com/databricks/cli/bundle/config"
	"github.com/databricks/cli/bundle/config/variable"
	"github.com/databricks/cli/libs/jsonschema"
)

func interpolationPattern(s string) string {
	return fmt.Sprintf(`\$\{(%s(\.[a-zA-Z]+([-_]?[a-zA-Z0-9]+)*(\[[0-9]+\])*)+)\}`, s)
}

func addInterpolationPatterns(typ reflect.Type, s jsonschema.Schema) jsonschema.Schema {
	if typ == reflect.TypeOf(config.Root{}) || typ == reflect.TypeOf(variable.Variable{}) {
		return s
	}

	switch s.Type {
	case jsonschema.ArrayType, jsonschema.ObjectType:
		// arrays and objects can have complex variable values specified.
		return jsonschema.Schema{
			AnyOf: []jsonschema.Schema{
				s,
				{
					Type:    jsonschema.StringType,
					Pattern: interpolationPattern("var"),
				}},
		}
	case jsonschema.IntegerType, jsonschema.NumberType, jsonschema.BooleanType:
		// primitives can have variable values, or references like ${bundle.xyz}
		// or ${workspace.xyz}
		return jsonschema.Schema{
			AnyOf: []jsonschema.Schema{
				s,
				{Type: jsonschema.StringType, Pattern: interpolationPattern("resources")},
				{Type: jsonschema.StringType, Pattern: interpolationPattern("bundle")},
				{Type: jsonschema.StringType, Pattern: interpolationPattern("workspace")},
				{Type: jsonschema.StringType, Pattern: interpolationPattern("artifacts")},
				{Type: jsonschema.StringType, Pattern: interpolationPattern("var")},
			},
		}
	default:
		return s
	}
}

func main() {
	if len(os.Args) != 2 {
		fmt.Println("Usage: go run main.go <output-file>")
		os.Exit(1)
	}

	// Output file, where the generated JSON schema will be written to.
	outputFile := os.Args[1]

	// Input file, the databricks openapi spec.
	inputFile := os.Getenv("DATABRICKS_OPENAPI_SPEC")
	if inputFile == "" {
		log.Fatal("DATABRICKS_OPENAPI_SPEC environment variable not set")
	}

	p, err := newParser(inputFile)
	if err != nil {
		log.Fatal(err)
	}

	// Generate the JSON schema from the bundle Go struct.
	s, err := jsonschema.FromType(reflect.TypeOf(config.Root{}), []func(reflect.Type, jsonschema.Schema) jsonschema.Schema{
		p.addDescriptions,
		p.addEnums,
		addInterpolationPatterns,
	})
	if err != nil {
		log.Fatal(err)
	}

	b, err := json.MarshalIndent(s, "", " ")
	if err != nil {
		log.Fatal(err)
	}

	// Write the schema descriptions to the output file.
	err = os.WriteFile(outputFile, b, 0644)
	if err != nil {
		log.Fatal(err)
	}
}
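As a side note, here is a small standalone sketch (not part of this
commit; the sample values are made up) of which strings the regex built
by `interpolationPattern("var")` above would accept:

```go
package main

import (
	"fmt"
	"regexp"
)

// interpolationPattern mirrors the helper defined in main.go above.
func interpolationPattern(s string) string {
	return fmt.Sprintf(`\$\{(%s(\.[a-zA-Z]+([-_]?[a-zA-Z0-9]+)*(\[[0-9]+\])*)+)\}`, s)
}

func main() {
	re := regexp.MustCompile(interpolationPattern("var"))

	// Hypothetical bundle values, chosen only to exercise the regex.
	for _, v := range []string{
		"${var.cluster_id}",     // matches: simple variable reference
		"${var.clusters[0].id}", // matches: nested fields and indexing
		"${bundle.target}",      // no match for the "var" pattern (the "bundle" pattern covers it)
		"plain string",          // no match: no interpolation at all
	} {
		fmt.Printf("%-24q -> %v\n", v, re.MatchString(v))
	}
}
```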
123 changes: 123 additions & 0 deletions bundle/internal/schema/parser.go
@@ -0,0 +1,123 @@
package main

import (
	"encoding/json"
	"fmt"
	"os"
	"path"
	"reflect"
	"strings"

	"github.com/databricks/cli/libs/jsonschema"
)

type Components struct {
	Schemas map[string]jsonschema.Schema `json:"schemas,omitempty"`
}

type Specification struct {
	Components Components `json:"components"`
}

type openapiParser struct {
	ref map[string]jsonschema.Schema
}

func newParser(path string) (*openapiParser, error) {
	b, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}

	spec := Specification{}
	err = json.Unmarshal(b, &spec)
	if err != nil {
		return nil, err
	}

	p := &openapiParser{}
	p.ref = spec.Components.Schemas
	return p, nil
}

// This function checks if the input type:
// 1. Is a Databricks Go SDK type.
// 2. Has a Databricks Go SDK type embedded in it.
//
// If the above conditions are met, the function returns the JSON schema
// corresponding to the Databricks Go SDK type from the OpenAPI spec.
func (p *openapiParser) findRef(typ reflect.Type) (jsonschema.Schema, bool) {
	typs := []reflect.Type{typ}

	// Check for embedded Databricks Go SDK types.
	if typ.Kind() == reflect.Struct {
		for i := 0; i < typ.NumField(); i++ {
			if !typ.Field(i).Anonymous {
				continue
			}

			// Dereference current type if it's a pointer.
			ctyp := typ.Field(i).Type
			for ctyp.Kind() == reflect.Ptr {
				ctyp = ctyp.Elem()
			}

			typs = append(typs, ctyp)
		}
	}

	for _, ctyp := range typs {
		// Skip if it's not a Go SDK type.
		if !strings.HasPrefix(ctyp.PkgPath(), "github.com/databricks/databricks-sdk-go") {
			continue
		}

		pkgName := path.Base(ctyp.PkgPath())
		k := fmt.Sprintf("%s.%s", pkgName, ctyp.Name())

		// Skip if the type is not in the openapi spec.
		_, ok := p.ref[k]
		if !ok {
			continue
		}

		// Return the first Go SDK type found in the openapi spec.
		return p.ref[k], true
	}

	return jsonschema.Schema{}, false
}

// Use the OpenAPI spec to load descriptions for the given type.
func (p *openapiParser) addDescriptions(typ reflect.Type, s jsonschema.Schema) jsonschema.Schema {
	ref, ok := p.findRef(typ)
	if !ok {
		return s
	}

	s.Description = ref.Description
	for k, v := range s.Properties {
		if refProp, ok := ref.Properties[k]; ok {
			v.Description = refProp.Description
		}
	}

	return s
}

// Use the OpenAPI spec to add enum values for the given type.
func (p *openapiParser) addEnums(typ reflect.Type, s jsonschema.Schema) jsonschema.Schema {
	ref, ok := p.findRef(typ)
	if !ok {
		return s
	}

	s.Enum = append(s.Enum, ref.Enum...)
	for k, v := range s.Properties {
		if refProp, ok := ref.Properties[k]; ok {
			v.Enum = append(v.Enum, refProp.Enum...)
		}
	}

	return s
}
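For illustration, here is a standalone sketch (not part of this commit)
of how the OpenAPI lookup key described in point 4 of the PR description
is derived for a Go SDK struct. `jobs.Task` is used only as an example
type; whether a given key exists in the spec depends on the spec
contents.

```go
package main

import (
	"fmt"
	"path"
	"reflect"

	"github.com/databricks/databricks-sdk-go/service/jobs"
)

func main() {
	// Mirror findRef: the lookup key is "<package name>.<struct name>".
	typ := reflect.TypeOf(jobs.Task{})
	key := fmt.Sprintf("%s.%s", path.Base(typ.PkgPath()), typ.Name())
	fmt.Println(key) // prints "jobs.Task"
}
```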
18 changes: 0 additions & 18 deletions bundle/schema/README.md

This file was deleted.

(Remaining changed files not shown.)