From 06cf288ac96f7d0fd8dd4825c8c6b577f5529654 Mon Sep 17 00:00:00 2001
From: Ahdra Merali <90615669+AhdraMeraliQB@users.noreply.github.com>
Date: Mon, 18 Mar 2024 17:33:18 +0000
Subject: [PATCH] Improve error messages for modular pipelines (#3716)

* Improve error messages with more details

Signed-off-by: Ahdra Merali <ahdra.merali@quantumblack.com>

* Fix existing tests

Signed-off-by: Ahdra Merali <ahdra.merali@quantumblack.com>

* Add tests

Signed-off-by: Ahdra Merali <ahdra.merali@quantumblack.com>

* Apply suggestions from code review

Signed-off-by: Ahdra Merali <ahdra.merali@quantumblack.com>

* Apply suggestions from code review pt 2

Signed-off-by: Ahdra Merali <ahdra.merali@quantumblack.com>

---------

Signed-off-by: Ahdra Merali <ahdra.merali@quantumblack.com>
---
 kedro/pipeline/modular_pipeline.py      | 24 ++++++++--
 tests/pipeline/test_modular_pipeline.py | 63 ++++++++++++++++++++++---
 2 files changed, 76 insertions(+), 11 deletions(-)

diff --git a/kedro/pipeline/modular_pipeline.py b/kedro/pipeline/modular_pipeline.py
index 6e89b810b4..9ac2d6a1d1 100644
--- a/kedro/pipeline/modular_pipeline.py
+++ b/kedro/pipeline/modular_pipeline.py
@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import copy
+import difflib
 from typing import AbstractSet, Iterable
 
 from kedro.pipeline.node import Node
@@ -52,10 +53,14 @@ def _validate_inputs_outputs(
     free_inputs = {_strip_transcoding(i) for i in pipe.inputs()}
 
     if not inputs <= free_inputs:
-        raise ModularPipelineError("Inputs should be free inputs to the pipeline")
+        raise ModularPipelineError(
+            "Inputs must not be outputs from another node in the same pipeline"
+        )
 
     if outputs & free_inputs:
-        raise ModularPipelineError("Outputs can't contain free inputs to the pipeline")
+        raise ModularPipelineError(
+            "All outputs must be generated by some node within the pipeline"
+        )
 
 
 def _validate_datasets_exist(
@@ -64,16 +69,25 @@ def _validate_datasets_exist(
     parameters: AbstractSet[str],
     pipe: Pipeline,
 ) -> None:
+    """Validate that inputs, parameters and outputs map correctly onto the provided nodes."""
     inputs = {_strip_transcoding(k) for k in inputs}
     outputs = {_strip_transcoding(k) for k in outputs}
 
     existing = {_strip_transcoding(ds) for ds in pipe.datasets()}
     non_existent = (inputs | outputs | parameters) - existing
     if non_existent:
-        raise ModularPipelineError(
-            f"Failed to map datasets and/or parameters: "
-            f"{', '.join(sorted(non_existent))}"
+        sorted_non_existent = sorted(non_existent)
+        possible_matches = []
+        for non_existent_input in sorted_non_existent:
+            possible_matches += difflib.get_close_matches(non_existent_input, existing)
+
+        error_msg = f"Failed to map datasets and/or parameters onto the nodes provided: {', '.join(sorted_non_existent)}"
+        suggestions = (
+            f" - did you mean one of these instead: {', '.join(possible_matches)}"
+            if possible_matches
+            else ""
         )
+        raise ModularPipelineError(error_msg + suggestions)
 
 
 def _get_dataset_names_mapping(
diff --git a/tests/pipeline/test_modular_pipeline.py b/tests/pipeline/test_modular_pipeline.py
index bfb2e347c3..c1e76867b5 100644
--- a/tests/pipeline/test_modular_pipeline.py
+++ b/tests/pipeline/test_modular_pipeline.py
@@ -217,16 +217,67 @@ def test_empty_output(self):
             ),
         ],
     )
-    def test_missing_dataset_name(
+    def test_missing_dataset_name_no_suggestion(
         self, func, inputs, outputs, inputs_map, outputs_map, expected_missing
     ):  # noqa: PLR0913
         raw_pipeline = modular_pipeline([node(func, inputs, outputs)])
 
-        with pytest.raises(ModularPipelineError, match=r"Failed to map datasets") as e:
+        with pytest.raises(
+            ModularPipelineError, match=r"Failed to map datasets and/or parameters"
+        ) as e:
             pipeline(
                 raw_pipeline, namespace="PREFIX", inputs=inputs_map, outputs=outputs_map
             )
         assert ", ".join(expected_missing) in str(e.value)
+        assert " - did you mean one of these instead: " not in str(e.value)
+
+    @pytest.mark.parametrize(
+        "func, inputs, outputs, inputs_map, outputs_map, expected_missing, expected_suggestion",
+        [
+            # Testing inputs
+            (identity, "IN", "OUT", {"I": "in"}, {}, ["I"], ["IN"]),
+            (
+                biconcat,
+                ["IN1", "IN2"],
+                "OUT",
+                {"IN": "input"},
+                {},
+                ["IN"],
+                ["IN2", "IN1"],
+            ),
+            # Testing outputs
+            (identity, "IN", "OUT", {}, {"OUT_": "output"}, ["OUT_"], ["OUT"]),
+            (
+                identity,
+                "IN",
+                ["OUT1", "OUT2"],
+                {},
+                {"OUT": "out"},
+                ["OUT"],
+                ["OUT2", "OUT1"],
+            ),
+        ],
+    )
+    def test_missing_dataset_with_suggestion(
+        self,
+        func,
+        inputs,
+        outputs,
+        inputs_map,
+        outputs_map,
+        expected_missing,
+        expected_suggestion,
+    ):
+        raw_pipeline = modular_pipeline([node(func, inputs, outputs)])
+
+        with pytest.raises(
+            ModularPipelineError, match=r"Failed to map datasets and/or parameters"
+        ) as e:
+            pipeline(
+                raw_pipeline, namespace="PREFIX", inputs=inputs_map, outputs=outputs_map
+            )
+        assert ", ".join(expected_missing) in str(e.value)
+        assert ", ".join(expected_suggestion) in str(e.value)
 
     def test_node_properties_preserved(self):
         """
@@ -378,11 +429,11 @@ def test_non_existent_parameters_mapped(self):
             ]
         )
 
-        pattern = r"Failed to map datasets and/or parameters: params:beta"
+        pattern = r"Failed to map datasets and/or parameters onto the nodes provided: params:beta"
         with pytest.raises(ModularPipelineError, match=pattern):
             pipeline(raw_pipeline, parameters={"beta": "gamma"})
 
-        pattern = r"Failed to map datasets and/or parameters: parameters"
+        pattern = r"Failed to map datasets and/or parameters onto the nodes provided: parameters"
         with pytest.raises(ModularPipelineError, match=pattern):
             pipeline(raw_pipeline, parameters={"parameters": "some_yaml_dataset"})
 
@@ -394,7 +445,7 @@ def test_bad_inputs_mapping(self):
             ]
         )
 
-        pattern = "Inputs should be free inputs to the pipeline"
+        pattern = "Inputs must not be outputs from another node in the same pipeline"
         with pytest.raises(ModularPipelineError, match=pattern):
             pipeline(raw_pipeline, inputs={"AA": "CC"})
 
@@ -406,7 +457,7 @@ def test_bad_outputs_mapping(self):
             ]
         )
 
-        pattern = "Outputs can't contain free inputs to the pipeline"
+        pattern = "All outputs must be generated by some node within the pipeline"
         with pytest.raises(ModularPipelineError, match=pattern):
             pipeline(raw_pipeline, outputs={"A": "C"})