Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PythonMutator: propagate source locations #1783

Merged
merged 11 commits into from
Jan 22, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions bundle/config/mutator/python/python_diagnostics.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"github.com/databricks/cli/libs/dyn"
)

// pythonDiagnostic is a single entry in diagnostics.json
type pythonDiagnostic struct {
Severity pythonSeverity `json:"severity"`
Summary string `json:"summary"`
Expand Down
194 changes: 194 additions & 0 deletions bundle/config/mutator/python/python_locations.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
package python

import (
"encoding/json"
"fmt"
"io"
"path/filepath"

"github.com/databricks/cli/libs/dyn"
)

// generatedFileName is used as the virtual file name for YAML generated by Python code.
//
// For every value that has a location loaded from locations.json, mergePythonLocations
// puts that location first and drops any existing location whose file base name is
// generatedFileName.
const generatedFileName = "__generated_by_python__.yml"

// pythonLocations is a data structure for efficient location lookup for a given path
kanterov marked this conversation as resolved.
Show resolved Hide resolved
//
// Locations form a tree, and we assign locations of the closest ancestor to each dyn.Value based on its path.
// We implement it as a trie (prefix tree) where keys are components of the path. With that, lookups are O(n)
// where n is the number of components in the path.
//
// For example, with locations.json:
//
// {"path": "resources.jobs.job_0", "file": "resources/job_0.py", "line": 3, "column": 5}
// {"path": "resources.jobs.job_0.tasks[0].task_key", "file": "resources/job_0.py", "line": 10, "column": 5}
// {"path": "resources.jobs.job_1", "file": "resources/job_1.py", "line": 5, "column": 7}
//
// - resources.jobs.job_0.tasks[0].task_key is located at job_0.py:10:5
//
// - resources.jobs.job_0.tasks[0].email_notifications is located at job_0.py:3:5,
// because we use the location of the job as the most precise approximation.
//
// See pythonLocationEntry for the structure of a single entry in locations.json
type pythonLocations struct {
	// descendants referenced by key, e.g. '.foo'
	keys map[string]*pythonLocations

	// descendants referenced by index, e.g. '[0]'
	indexes map[int]*pythonLocations

	// location for the current node if it exists
	location dyn.Location

	// if true, location is present; if false, this node is only an
	// intermediate step towards a deeper path and 'location' is unset
	exists bool
}

// pythonLocationEntry is a single entry in locations.json
//
// Each entry associates a configuration path with a position in a source file.
type pythonLocationEntry struct {
	// Path is the path of the value, e.g. "resources.jobs.job_0"
	Path string `json:"path"`

	// File is the name of the file that the value comes from
	File string `json:"file"`

	// Line is the line number within File
	Line int `json:"line"`

	// Column is the column number within Line
	Column int `json:"column"`
}

// mergePythonLocations walks the given dyn.Value and attaches locations reported by
// the Python mutator to every value that has one (exact or inherited from the closest
// ancestor in the trie).
//
// It is primarily used to combine locations.json with output.json, so that validation
// errors point at Python source code rather than at the generated YAML.
func mergePythonLocations(value dyn.Value, locations *pythonLocations) (dyn.Value, error) {
	visit := func(p dyn.Path, v dyn.Value) (dyn.Value, error) {
		pythonLocation, found := findPythonLocation(locations, p)
		if !found {
			// nothing is known about this path, leave the value untouched
			return v, nil
		}

		// Locations pointing at the virtual generated file are dropped, because
		// pythonLocation supersedes them. Any other pre-existing location is kept,
		// which covers resources defined in YAML and then modified by a Python function.
		retained := removeVirtualLocations(v.Locations())

		// The first element is the one used for error reporting, so the Python
		// location goes to the front.
		merged := append([]dyn.Location{pythonLocation}, retained...)

		return v.WithLocations(merged), nil
	}

	return dyn.Walk(value, visit)
}

func removeVirtualLocations(locations []dyn.Location) []dyn.Location {
var newLocations []dyn.Location

for _, location := range locations {
if filepath.Base(location.File) == generatedFileName {
continue
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This logic is hard to parse in combination with the append above.

If I understand correctly, you want to want the output to:

  • Have the Python location as the first element
  • Filter out the locations with generatedFileName for the filename

It would be clearer if newLocations was initialized by assignment and if the loop got a comment saying that it only appends locations that are relevant (i.e. not the ones with the virtual file path).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've extracted the code into a separate function, and added elaborate comment


newLocations = append(newLocations, location)
}

return newLocations
}

// parsePythonLocations parses locations.json from the Python mutator.
//
// locations file is newline-separated JSON objects with pythonLocationEntry structure.
//
// It returns a trie populated with one location per entry. If an entry cannot be
// decoded or its path cannot be parsed, it returns nil and an error that wraps the
// underlying cause, so callers can inspect it with errors.Is and errors.As.
func parsePythonLocations(input io.Reader) (*pythonLocations, error) {
	decoder := json.NewDecoder(input)
	locations := newPythonLocations()

	// the decoder consumes a stream of JSON values, one entry per iteration
	for decoder.More() {
		var entry pythonLocationEntry

		err := decoder.Decode(&entry)
		if err != nil {
			return nil, fmt.Errorf("failed to parse python location: %w", err)
		}

		path, err := dyn.NewPathFromString(entry.Path)
		if err != nil {
			return nil, fmt.Errorf("failed to parse python location: %w", err)
		}

		location := dyn.Location{
			File:   entry.File,
			Line:   entry.Line,
			Column: entry.Column,
		}

		putPythonLocation(locations, path, location)
	}

	return locations, nil
}

// putPythonLocation stores the location in the trie under the given path.
//
// Missing intermediate nodes are created on the way down. If the path already
// has a location, it is overwritten.
func putPythonLocation(trie *pythonLocations, path dyn.Path, location dyn.Location) {
	node := trie

	for _, step := range path {
		var (
			child *pythonLocations
			found bool
		)

		// a component with a non-empty key is a key lookup, anything else is an index
		if name := step.Key(); name != "" {
			child, found = node.keys[name]
			if !found {
				child = newPythonLocations()
				node.keys[name] = child
			}
		} else {
			position := step.Index()
			child, found = node.indexes[position]
			if !found {
				child = newPythonLocations()
				node.indexes[position] = child
			}
		}

		node = child
	}

	node.location = location
	node.exists = true
}
kanterov marked this conversation as resolved.
Show resolved Hide resolved

// newPythonLocations allocates an empty trie node: both child maps are
// initialized and no location is set.
func newPythonLocations() *pythonLocations {
	node := new(pythonLocations)
	node.keys = map[string]*pythonLocations{}
	node.indexes = map[int]*pythonLocations{}

	return node
}

// findPythonLocation looks up the location for the given path in the trie.
//
// If the path itself has no location, the location of its closest ancestor is
// returned instead. The boolean result is false if neither the path nor any of
// its ancestors (including the root) has a location.
func findPythonLocation(locations *pythonLocations, path dyn.Path) (dyn.Location, bool) {
	node := locations
	best, found := locations.location, locations.exists

	for _, step := range path {
		var (
			child *pythonLocations
			ok    bool
		)

		// a component with a non-empty key is a key lookup, anything else is an index
		if name := step.Key(); name != "" {
			child, ok = node.keys[name]
		} else {
			child, ok = node.indexes[step.Index()]
		}

		// the trie has nothing deeper along this path, keep what we found so far
		if !ok {
			break
		}

		node = child

		if node.exists {
			best, found = node.location, true
		}
	}

	return best, found
}
179 changes: 179 additions & 0 deletions bundle/config/mutator/python/python_locations_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
package python

import (
"bytes"
"path/filepath"
"testing"

"github.com/databricks/cli/libs/diag"
"github.com/stretchr/testify/require"

"github.com/databricks/cli/libs/dyn"
assert "github.com/databricks/cli/libs/dyn/dynassert"
)

// TestMergeLocations checks how mergePythonLocations combines a known Python location
// with locations already attached to values.
func TestMergeLocations(t *testing.T) {
	fromPython := dyn.Location{File: "foo.py", Line: 1, Column: 1}
	fromGenerated := dyn.Location{File: generatedFileName, Line: 1, Column: 1}
	fromYaml := dyn.Location{File: "foo.yml", Line: 1, Column: 1}

	// only "foo" (and therefore everything below it) has a Python location
	trie := newPythonLocations()
	putPythonLocation(trie, dyn.MustPathFromString("foo"), fromPython)

	inputFoo := dyn.NewValue(
		map[string]dyn.Value{
			"baz": dyn.NewValue("baz", []dyn.Location{fromYaml}),
			"qux": dyn.NewValue("baz", []dyn.Location{fromGenerated, fromYaml}),
		},
		[]dyn.Location{},
	)
	input := dyn.NewValue(
		map[string]dyn.Value{
			"foo": inputFoo,
			"bar": dyn.NewValue("baz", []dyn.Location{fromGenerated}),
		},
		[]dyn.Location{fromYaml},
	)

	wantFoo := dyn.NewValue(
		map[string]dyn.Value{
			// the Python location is put in front of the existing YAML location
			"baz": dyn.NewValue("baz", []dyn.Location{fromPython, fromYaml}),
			// the generated location is dropped in favor of the Python location
			"qux": dyn.NewValue("baz", []dyn.Location{fromPython, fromYaml}),
		},
		[]dyn.Location{fromPython},
	)
	want := dyn.NewValue(
		map[string]dyn.Value{
			"foo": wantFoo,
			// values without a known Python location are left as they are
			"bar": dyn.NewValue("baz", []dyn.Location{fromGenerated}),
		},
		[]dyn.Location{fromYaml},
	)

	got, err := mergePythonLocations(input, trie)

	assert.NoError(t, err)
	assert.Equal(t, want, got)
}

// TestFindLocation checks that an exact match is preferred over the location of its parent.
func TestFindLocation(t *testing.T) {
	atFoo := dyn.Location{File: "foo.py", Line: 1, Column: 1}
	atFooBar := dyn.Location{File: "foo.py", Line: 2, Column: 1}

	trie := newPythonLocations()
	for _, entry := range []struct {
		path     string
		location dyn.Location
	}{
		{"foo", atFoo},
		{"foo.bar", atFooBar},
	} {
		putPythonLocation(trie, dyn.MustPathFromString(entry.path), entry.location)
	}

	got, found := findPythonLocation(trie, dyn.MustPathFromString("foo.bar"))

	assert.True(t, found)
	assert.Equal(t, atFooBar, got)
}

// TestFindLocation_indexPathComponent checks that paths ending in an index component,
// e.g. 'foo.bar[0]', are stored and looked up correctly.
func TestFindLocation_indexPathComponent(t *testing.T) {
	atFoo := dyn.Location{File: "foo.py", Line: 1, Column: 1}
	atFooBar := dyn.Location{File: "foo.py", Line: 2, Column: 1}
	atFirstItem := dyn.Location{File: "foo.py", Line: 3, Column: 1}

	trie := newPythonLocations()
	for _, entry := range []struct {
		path     string
		location dyn.Location
	}{
		{"foo", atFoo},
		{"foo.bar", atFooBar},
		{"foo.bar[0]", atFirstItem},
	} {
		putPythonLocation(trie, dyn.MustPathFromString(entry.path), entry.location)
	}

	got, found := findPythonLocation(trie, dyn.MustPathFromString("foo.bar[0]"))

	assert.True(t, found)
	assert.Equal(t, atFirstItem, got)
}

// TestFindLocation_closestAncestorLocation checks that a path without its own location
// falls back to the location of the closest ancestor.
func TestFindLocation_closestAncestorLocation(t *testing.T) {
	atFoo := dyn.Location{File: "foo.py", Line: 1, Column: 1}
	atFooBar := dyn.Location{File: "foo.py", Line: 2, Column: 1}

	trie := newPythonLocations()
	for _, entry := range []struct {
		path     string
		location dyn.Location
	}{
		{"foo", atFoo},
		{"foo.bar", atFooBar},
	} {
		putPythonLocation(trie, dyn.MustPathFromString(entry.path), entry.location)
	}

	// 'foo.bar.baz' is not in the trie, 'foo.bar' is its closest known ancestor
	got, found := findPythonLocation(trie, dyn.MustPathFromString("foo.bar.baz"))

	assert.True(t, found)
	assert.Equal(t, atFooBar, got)
}

// TestFindLocation_unknownLocation checks that nothing is found for a path that
// neither has a location nor an ancestor with a location.
func TestFindLocation_unknownLocation(t *testing.T) {
	atFoo := dyn.Location{File: "foo.py", Line: 1, Column: 1}
	atFooBar := dyn.Location{File: "foo.py", Line: 2, Column: 1}

	trie := newPythonLocations()
	for _, entry := range []struct {
		path     string
		location dyn.Location
	}{
		{"foo", atFoo},
		{"foo.bar", atFooBar},
	} {
		putPythonLocation(trie, dyn.MustPathFromString(entry.path), entry.location)
	}

	_, found := findPythonLocation(trie, dyn.MustPathFromString("bar"))

	assert.False(t, found)
}

// TestLoadOutput checks which locations end up on values returned by loadOutput
// when it is given JSON output together with a trie of Python locations.
func TestLoadOutput(t *testing.T) {
	location := dyn.Location{File: "my_job.py", Line: 1, Column: 1}
	bundleRoot := t.TempDir()
	output := `{
"resources": {
"jobs": {
"my_job": {
"name": "my_job",
"tasks": [
{
"task_key": "my_task",
"notebook_task": {
"notebook_path": "my_notebook"
}
}
]
}
}
}
}`

	// only the job itself has a Python location
	locations := newPythonLocations()
	putPythonLocation(
		locations,
		dyn.MustPathFromString("resources.jobs.my_job"),
		location,
	)

	value, diags := loadOutput(
		bundleRoot,
		bytes.NewReader([]byte(output)),
		locations,
	)

	assert.Equal(t, diag.Diagnostics{}, diags)

	// the job name is expected to carry only the Python location of the job
	name, err := dyn.Get(value, "resources.jobs.my_job.name")
	require.NoError(t, err)
	require.Equal(t, []dyn.Location{location}, name.Locations())

	// until we implement path normalization, we have to keep locations of values
	// that change semantic depending on their location
	//
	// note: it's important to have absolute path including 'bundleRoot'
	// because mutator pipeline already has expanded locations into absolute path
	notebookPath, err := dyn.Get(value, "resources.jobs.my_job.tasks[0].notebook_task.notebook_path")
	require.NoError(t, err)
	require.Len(t, notebookPath.Locations(), 1)
	require.Equal(t, filepath.Join(bundleRoot, generatedFileName), notebookPath.Locations()[0].File)
}

// TestParsePythonLocations checks that a single locations.json entry is parsed and
// stored in the trie under its path.
func TestParsePythonLocations(t *testing.T) {
	expected := dyn.Location{File: "foo.py", Line: 1, Column: 2}

	input := `{"path": "foo", "file": "foo.py", "line": 1, "column": 2}`
	reader := bytes.NewReader([]byte(input))
	locations, err := parsePythonLocations(reader)

	// parsePythonLocations returns a nil trie on failure, so stop here instead of
	// dereferencing it below
	require.NoError(t, err)

	// a missing key yields a nil node; fail the test instead of panicking on it
	node, ok := locations.keys["foo"]
	require.True(t, ok)

	assert.True(t, node.exists)
	assert.Equal(t, expected, node.location)
}
Loading
Loading