From 06cf288ac96f7d0fd8dd4825c8c6b577f5529654 Mon Sep 17 00:00:00 2001 From: Ahdra Merali <90615669+AhdraMeraliQB@users.noreply.github.com> Date: Mon, 18 Mar 2024 17:33:18 +0000 Subject: [PATCH] Improve error messages for modular pipelines (#3716) * Improve error messages with more details Signed-off-by: Ahdra Merali * Fix existing tests Signed-off-by: Ahdra Merali * Add tests Signed-off-by: Ahdra Merali * Apply suggestions from code review Signed-off-by: Ahdra Merali * Apply suggestions from code review pt 2 Signed-off-by: Ahdra Merali --------- Signed-off-by: Ahdra Merali --- kedro/pipeline/modular_pipeline.py | 24 ++++++++-- tests/pipeline/test_modular_pipeline.py | 63 ++++++++++++++++++++++--- 2 files changed, 76 insertions(+), 11 deletions(-) diff --git a/kedro/pipeline/modular_pipeline.py b/kedro/pipeline/modular_pipeline.py index 6e89b810b4..9ac2d6a1d1 100644 --- a/kedro/pipeline/modular_pipeline.py +++ b/kedro/pipeline/modular_pipeline.py @@ -2,6 +2,7 @@ from __future__ import annotations import copy +import difflib from typing import AbstractSet, Iterable from kedro.pipeline.node import Node @@ -52,10 +53,14 @@ def _validate_inputs_outputs( free_inputs = {_strip_transcoding(i) for i in pipe.inputs()} if not inputs <= free_inputs: - raise ModularPipelineError("Inputs should be free inputs to the pipeline") + raise ModularPipelineError( + "Inputs must not be outputs from another node in the same pipeline" + ) if outputs & free_inputs: - raise ModularPipelineError("Outputs can't contain free inputs to the pipeline") + raise ModularPipelineError( + "All outputs must be generated by some node within the pipeline" + ) def _validate_datasets_exist( @@ -64,16 +69,25 @@ def _validate_datasets_exist( parameters: AbstractSet[str], pipe: Pipeline, ) -> None: + """Validate that inputs, parameters and outputs map correctly onto the provided nodes.""" inputs = {_strip_transcoding(k) for k in inputs} outputs = {_strip_transcoding(k) for k in outputs} existing = {_strip_transcoding(ds) for ds in pipe.datasets()} non_existent = (inputs | outputs | parameters) - existing if non_existent: - raise ModularPipelineError( - f"Failed to map datasets and/or parameters: " - f"{', '.join(sorted(non_existent))}" + sorted_non_existent = sorted(non_existent) + possible_matches = [] + for non_existent_input in sorted_non_existent: + possible_matches += difflib.get_close_matches(non_existent_input, existing) + + error_msg = f"Failed to map datasets and/or parameters onto the nodes provided: {', '.join(sorted_non_existent)}" + suggestions = ( + f" - did you mean one of these instead: {', '.join(possible_matches)}" + if possible_matches + else "" ) + raise ModularPipelineError(error_msg + suggestions) def _get_dataset_names_mapping( diff --git a/tests/pipeline/test_modular_pipeline.py b/tests/pipeline/test_modular_pipeline.py index bfb2e347c3..c1e76867b5 100644 --- a/tests/pipeline/test_modular_pipeline.py +++ b/tests/pipeline/test_modular_pipeline.py @@ -217,16 +217,67 @@ def test_empty_output(self): ), ], ) - def test_missing_dataset_name( + def test_missing_dataset_name_no_suggestion( self, func, inputs, outputs, inputs_map, outputs_map, expected_missing ): # noqa: PLR0913 raw_pipeline = modular_pipeline([node(func, inputs, outputs)]) - with pytest.raises(ModularPipelineError, match=r"Failed to map datasets") as e: + with pytest.raises( + ModularPipelineError, match=r"Failed to map datasets and/or parameters" + ) as e: pipeline( raw_pipeline, namespace="PREFIX", inputs=inputs_map, outputs=outputs_map ) assert ", ".join(expected_missing) in str(e.value) + assert " - did you mean one of these instead: " not in str(e.value) + + @pytest.mark.parametrize( + "func, inputs, outputs, inputs_map, outputs_map, expected_missing, expected_suggestion", + [ + # Testing inputs + (identity, "IN", "OUT", {"I": "in"}, {}, ["I"], ["IN"]), + ( + biconcat, + ["IN1", "IN2"], + "OUT", + {"IN": "input"}, + {}, + ["IN"], + ["IN2", "IN1"], + ), + # Testing outputs + (identity, "IN", "OUT", {}, {"OUT_": "output"}, ["OUT_"], ["OUT"]), + ( + identity, + "IN", + ["OUT1", "OUT2"], + {}, + {"OUT": "out"}, + ["OUT"], + ["OUT2", "OUT1"], + ), + ], + ) + def test_missing_dataset_with_suggestion( + self, + func, + inputs, + outputs, + inputs_map, + outputs_map, + expected_missing, + expected_suggestion, + ): + raw_pipeline = modular_pipeline([node(func, inputs, outputs)]) + + with pytest.raises( + ModularPipelineError, match=r"Failed to map datasets and/or parameters" + ) as e: + pipeline( + raw_pipeline, namespace="PREFIX", inputs=inputs_map, outputs=outputs_map + ) + assert ", ".join(expected_missing) in str(e.value) + assert ", ".join(expected_suggestion) in str(e.value) def test_node_properties_preserved(self): """ @@ -378,11 +429,11 @@ def test_non_existent_parameters_mapped(self): ] ) - pattern = r"Failed to map datasets and/or parameters: params:beta" + pattern = r"Failed to map datasets and/or parameters onto the nodes provided: params:beta" with pytest.raises(ModularPipelineError, match=pattern): pipeline(raw_pipeline, parameters={"beta": "gamma"}) - pattern = r"Failed to map datasets and/or parameters: parameters" + pattern = r"Failed to map datasets and/or parameters onto the nodes provided: parameters" with pytest.raises(ModularPipelineError, match=pattern): pipeline(raw_pipeline, parameters={"parameters": "some_yaml_dataset"}) @@ -394,7 +445,7 @@ def test_bad_inputs_mapping(self): ] ) - pattern = "Inputs should be free inputs to the pipeline" + pattern = "Inputs must not be outputs from another node in the same pipeline" with pytest.raises(ModularPipelineError, match=pattern): pipeline(raw_pipeline, inputs={"AA": "CC"}) @@ -406,7 +457,7 @@ def test_bad_outputs_mapping(self): ] ) - pattern = "Outputs can't contain free inputs to the pipeline" + pattern = "All outputs must be generated by some node within the pipeline" with pytest.raises(ModularPipelineError, match=pattern): pipeline(raw_pipeline, outputs={"A": "C"})