Improve error messages for modular pipelines (#3716)

* Improve error messages with more details Signed-off-by: Ahdra Merali <[email protected]> * Fix existing tests Signed-off-by: Ahdra Merali <[email protected]> * Add tests Signed-off-by: Ahdra Merali <[email protected]> * Apply suggestions from code review Signed-off-by: Ahdra Merali <[email protected]> * Apply suggestions from code review pt 2 Signed-off-by: Ahdra Merali <[email protected]> --------- Signed-off-by: Ahdra Merali <[email protected]>
kedro-org · Mar 18, 2024 · 06cf288 · 06cf288
1 parent 32b87be
commit 06cf288
Show file tree

Hide file tree

Showing 2 changed files with 76 additions and 11 deletions.
diff --git a/kedro/pipeline/modular_pipeline.py b/kedro/pipeline/modular_pipeline.py
@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import copy
+import difflib
 from typing import AbstractSet, Iterable
 
 from kedro.pipeline.node import Node
@@ -52,10 +53,14 @@ def _validate_inputs_outputs(
     free_inputs = {_strip_transcoding(i) for i in pipe.inputs()}
 
     if not inputs <= free_inputs:
-        raise ModularPipelineError("Inputs should be free inputs to the pipeline")
+        raise ModularPipelineError(
+            "Inputs must not be outputs from another node in the same pipeline"
+        )
 
     if outputs & free_inputs:
-        raise ModularPipelineError("Outputs can't contain free inputs to the pipeline")
+        raise ModularPipelineError(
+            "All outputs must be generated by some node within the pipeline"
+        )
 
 
 def _validate_datasets_exist(
@@ -64,16 +69,25 @@ def _validate_datasets_exist(
     parameters: AbstractSet[str],
     pipe: Pipeline,
 ) -> None:
+    """Validate that inputs, parameters and outputs map correctly onto the provided nodes."""
     inputs = {_strip_transcoding(k) for k in inputs}
     outputs = {_strip_transcoding(k) for k in outputs}
 
     existing = {_strip_transcoding(ds) for ds in pipe.datasets()}
     non_existent = (inputs | outputs | parameters) - existing
     if non_existent:
-        raise ModularPipelineError(
-            f"Failed to map datasets and/or parameters: "
-            f"{', '.join(sorted(non_existent))}"
+        sorted_non_existent = sorted(non_existent)
+        possible_matches = []
+        for non_existent_input in sorted_non_existent:
+            possible_matches += difflib.get_close_matches(non_existent_input, existing)
+
+        error_msg = f"Failed to map datasets and/or parameters onto the nodes provided: {', '.join(sorted_non_existent)}"
+        suggestions = (
+            f" - did you mean one of these instead: {', '.join(possible_matches)}"
+            if possible_matches
+            else ""
         )
+        raise ModularPipelineError(error_msg + suggestions)
 
 
 def _get_dataset_names_mapping(

diff --git a/tests/pipeline/test_modular_pipeline.py b/tests/pipeline/test_modular_pipeline.py
@@ -217,16 +217,67 @@ def test_empty_output(self):
             ),
         ],
     )
-    def test_missing_dataset_name(
+    def test_missing_dataset_name_no_suggestion(
         self, func, inputs, outputs, inputs_map, outputs_map, expected_missing
     ):  # noqa: PLR0913
         raw_pipeline = modular_pipeline([node(func, inputs, outputs)])
 
-        with pytest.raises(ModularPipelineError, match=r"Failed to map datasets") as e:
+        with pytest.raises(
+            ModularPipelineError, match=r"Failed to map datasets and/or parameters"
+        ) as e:
             pipeline(
                 raw_pipeline, namespace="PREFIX", inputs=inputs_map, outputs=outputs_map
             )
         assert ", ".join(expected_missing) in str(e.value)
+        assert " - did you mean one of these instead: " not in str(e.value)
+
+    @pytest.mark.parametrize(
+        "func, inputs, outputs, inputs_map, outputs_map, expected_missing, expected_suggestion",
+        [
+            # Testing inputs
+            (identity, "IN", "OUT", {"I": "in"}, {}, ["I"], ["IN"]),
+            (
+                biconcat,
+                ["IN1", "IN2"],
+                "OUT",
+                {"IN": "input"},
+                {},
+                ["IN"],
+                ["IN2", "IN1"],
+            ),
+            # Testing outputs
+            (identity, "IN", "OUT", {}, {"OUT_": "output"}, ["OUT_"], ["OUT"]),
+            (
+                identity,
+                "IN",
+                ["OUT1", "OUT2"],
+                {},
+                {"OUT": "out"},
+                ["OUT"],
+                ["OUT2", "OUT1"],
+            ),
+        ],
+    )
+    def test_missing_dataset_with_suggestion(
+        self,
+        func,
+        inputs,
+        outputs,
+        inputs_map,
+        outputs_map,
+        expected_missing,
+        expected_suggestion,
+    ):
+        raw_pipeline = modular_pipeline([node(func, inputs, outputs)])
+
+        with pytest.raises(
+            ModularPipelineError, match=r"Failed to map datasets and/or parameters"
+        ) as e:
+            pipeline(
+                raw_pipeline, namespace="PREFIX", inputs=inputs_map, outputs=outputs_map
+            )
+        assert ", ".join(expected_missing) in str(e.value)
+        assert ", ".join(expected_suggestion) in str(e.value)
 
     def test_node_properties_preserved(self):
         """
@@ -378,11 +429,11 @@ def test_non_existent_parameters_mapped(self):
             ]
         )
 
-        pattern = r"Failed to map datasets and/or parameters: params:beta"
+        pattern = r"Failed to map datasets and/or parameters onto the nodes provided: params:beta"
         with pytest.raises(ModularPipelineError, match=pattern):
             pipeline(raw_pipeline, parameters={"beta": "gamma"})
 
-        pattern = r"Failed to map datasets and/or parameters: parameters"
+        pattern = r"Failed to map datasets and/or parameters onto the nodes provided: parameters"
         with pytest.raises(ModularPipelineError, match=pattern):
             pipeline(raw_pipeline, parameters={"parameters": "some_yaml_dataset"})
 
@@ -394,7 +445,7 @@ def test_bad_inputs_mapping(self):
             ]
         )
 
-        pattern = "Inputs should be free inputs to the pipeline"
+        pattern = "Inputs must not be outputs from another node in the same pipeline"
         with pytest.raises(ModularPipelineError, match=pattern):
             pipeline(raw_pipeline, inputs={"AA": "CC"})
 
@@ -406,7 +457,7 @@ def test_bad_outputs_mapping(self):
             ]
         )
 
-        pattern = "Outputs can't contain free inputs to the pipeline"
+        pattern = "All outputs must be generated by some node within the pipeline"
         with pytest.raises(ModularPipelineError, match=pattern):
             pipeline(raw_pipeline, outputs={"A": "C"})