Skip to content

Commit

Permalink
Improve error messages for modular pipelines (#3716)
Browse files Browse the repository at this point in the history
* Improve error messages with more details

Signed-off-by: Ahdra Merali <[email protected]>

* Fix existing tests

Signed-off-by: Ahdra Merali <[email protected]>

* Add tests

Signed-off-by: Ahdra Merali <[email protected]>

* Apply suggestions from code review

Signed-off-by: Ahdra Merali <[email protected]>

* Apply suggestions from code review pt 2

Signed-off-by: Ahdra Merali <[email protected]>

---------

Signed-off-by: Ahdra Merali <[email protected]>
  • Loading branch information
AhdraMeraliQB authored Mar 18, 2024
1 parent 32b87be commit 06cf288
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 11 deletions.
24 changes: 19 additions & 5 deletions kedro/pipeline/modular_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from __future__ import annotations

import copy
import difflib
from typing import AbstractSet, Iterable

from kedro.pipeline.node import Node
Expand Down Expand Up @@ -52,10 +53,14 @@ def _validate_inputs_outputs(
free_inputs = {_strip_transcoding(i) for i in pipe.inputs()}

if not inputs <= free_inputs:
raise ModularPipelineError("Inputs should be free inputs to the pipeline")
raise ModularPipelineError(
"Inputs must not be outputs from another node in the same pipeline"
)

if outputs & free_inputs:
raise ModularPipelineError("Outputs can't contain free inputs to the pipeline")
raise ModularPipelineError(
"All outputs must be generated by some node within the pipeline"
)


def _validate_datasets_exist(
Expand All @@ -64,16 +69,25 @@ def _validate_datasets_exist(
parameters: AbstractSet[str],
pipe: Pipeline,
) -> None:
"""Validate that inputs, parameters and outputs map correctly onto the provided nodes."""
inputs = {_strip_transcoding(k) for k in inputs}
outputs = {_strip_transcoding(k) for k in outputs}

existing = {_strip_transcoding(ds) for ds in pipe.datasets()}
non_existent = (inputs | outputs | parameters) - existing
if non_existent:
raise ModularPipelineError(
f"Failed to map datasets and/or parameters: "
f"{', '.join(sorted(non_existent))}"
sorted_non_existent = sorted(non_existent)
possible_matches = []
for non_existent_input in sorted_non_existent:
possible_matches += difflib.get_close_matches(non_existent_input, existing)

error_msg = f"Failed to map datasets and/or parameters onto the nodes provided: {', '.join(sorted_non_existent)}"
suggestions = (
f" - did you mean one of these instead: {', '.join(possible_matches)}"
if possible_matches
else ""
)
raise ModularPipelineError(error_msg + suggestions)


def _get_dataset_names_mapping(
Expand Down
63 changes: 57 additions & 6 deletions tests/pipeline/test_modular_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,16 +217,67 @@ def test_empty_output(self):
),
],
)
def test_missing_dataset_name(
def test_missing_dataset_name_no_suggestion(
self, func, inputs, outputs, inputs_map, outputs_map, expected_missing
): # noqa: PLR0913
raw_pipeline = modular_pipeline([node(func, inputs, outputs)])

with pytest.raises(ModularPipelineError, match=r"Failed to map datasets") as e:
with pytest.raises(
ModularPipelineError, match=r"Failed to map datasets and/or parameters"
) as e:
pipeline(
raw_pipeline, namespace="PREFIX", inputs=inputs_map, outputs=outputs_map
)
assert ", ".join(expected_missing) in str(e.value)
assert " - did you mean one of these instead: " not in str(e.value)

@pytest.mark.parametrize(
"func, inputs, outputs, inputs_map, outputs_map, expected_missing, expected_suggestion",
[
# Testing inputs
(identity, "IN", "OUT", {"I": "in"}, {}, ["I"], ["IN"]),
(
biconcat,
["IN1", "IN2"],
"OUT",
{"IN": "input"},
{},
["IN"],
["IN2", "IN1"],
),
# Testing outputs
(identity, "IN", "OUT", {}, {"OUT_": "output"}, ["OUT_"], ["OUT"]),
(
identity,
"IN",
["OUT1", "OUT2"],
{},
{"OUT": "out"},
["OUT"],
["OUT2", "OUT1"],
),
],
)
def test_missing_dataset_with_suggestion(
self,
func,
inputs,
outputs,
inputs_map,
outputs_map,
expected_missing,
expected_suggestion,
):
raw_pipeline = modular_pipeline([node(func, inputs, outputs)])

with pytest.raises(
ModularPipelineError, match=r"Failed to map datasets and/or parameters"
) as e:
pipeline(
raw_pipeline, namespace="PREFIX", inputs=inputs_map, outputs=outputs_map
)
assert ", ".join(expected_missing) in str(e.value)
assert ", ".join(expected_suggestion) in str(e.value)

def test_node_properties_preserved(self):
"""
Expand Down Expand Up @@ -378,11 +429,11 @@ def test_non_existent_parameters_mapped(self):
]
)

pattern = r"Failed to map datasets and/or parameters: params:beta"
pattern = r"Failed to map datasets and/or parameters onto the nodes provided: params:beta"
with pytest.raises(ModularPipelineError, match=pattern):
pipeline(raw_pipeline, parameters={"beta": "gamma"})

pattern = r"Failed to map datasets and/or parameters: parameters"
pattern = r"Failed to map datasets and/or parameters onto the nodes provided: parameters"
with pytest.raises(ModularPipelineError, match=pattern):
pipeline(raw_pipeline, parameters={"parameters": "some_yaml_dataset"})

Expand All @@ -394,7 +445,7 @@ def test_bad_inputs_mapping(self):
]
)

pattern = "Inputs should be free inputs to the pipeline"
pattern = "Inputs must not be outputs from another node in the same pipeline"
with pytest.raises(ModularPipelineError, match=pattern):
pipeline(raw_pipeline, inputs={"AA": "CC"})

Expand All @@ -406,7 +457,7 @@ def test_bad_outputs_mapping(self):
]
)

pattern = "Outputs can't contain free inputs to the pipeline"
pattern = "All outputs must be generated by some node within the pipeline"
with pytest.raises(ModularPipelineError, match=pattern):
pipeline(raw_pipeline, outputs={"A": "C"})

Expand Down

0 comments on commit 06cf288

Please sign in to comment.