Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change how kedro import datasets and raise explicit error when dependencies is missing #3272

Merged
merged 20 commits into from
Nov 9, 2023
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
* The new spaceflights starters, `spaceflights-pandas`, `spaceflights-pandas-viz`, `spaceflights-pyspark`, and `spaceflights-pyspark-viz` can be used with the `kedro new` command with the `--starter` flag.
* Added the `--conf-source` option to `%reload_kedro`, allowing users to specify a source for project configuration.
* Added the functionality to choose a merging strategy for config files loaded with `OmegaConfigLoader`.
* Modified the mechanism of importing datasets, raise more explicit error when dependencies are missing.


## Bug fixes and other changes
Expand Down
28 changes: 12 additions & 16 deletions kedro/io/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,25 +376,22 @@ def parse_dataset_definition(
if "type" not in config:
raise DatasetError("'type' is missing from dataset catalog configuration")

class_obj = config.pop("type")
if isinstance(class_obj, str):
if len(class_obj.strip(".")) != len(class_obj):
dataset_type = config.pop("type")
if isinstance(dataset_type, str):
if len(dataset_type.strip(".")) != len(dataset_type):
raise DatasetError(
"'type' class path does not support relative "
"paths or paths ending with a dot."
)
class_paths = (prefix + class_obj for prefix in _DEFAULT_PACKAGES)
class_paths = (prefix + dataset_type for prefix in _DEFAULT_PACKAGES)
Comment on lines +379 to +386
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

refactor the name because class_obj is overloaded and confusing. Originally it is a str but load as a class later, it's better to make it explicit.


for class_path in class_paths:
tmp = _load_obj(class_path)
if tmp is not None:
class_obj = tmp
break
else:
raise DatasetError(
f"Class '{class_obj}' not found or one of its dependencies "
f"has not been installed."
)
raise DatasetError(f"Class '{dataset_type}' not found, is this a typo?")

if not issubclass(class_obj, AbstractDataset):
raise DatasetError(
Expand Down Expand Up @@ -422,8 +419,9 @@ def parse_dataset_definition(
return class_obj, config


def _load_obj(class_path: str) -> object | None:
def _load_obj(class_path: str) -> Any | None:
mod_path, _, class_name = class_path.rpartition(".")
# Check if the module exists
try:
available_classes = load_obj(f"{mod_path}.__all__")
# ModuleNotFoundError: When `load_obj` can't find `mod_path` (e.g `kedro.io.pandas`)
Expand All @@ -432,18 +430,16 @@ def _load_obj(class_path: str) -> object | None:
# `__all__` attribute -- either because it's a custom or a kedro.io dataset
except (ModuleNotFoundError, AttributeError, ValueError):
available_classes = None

try:
class_obj = load_obj(class_path)
except (ModuleNotFoundError, ValueError):
return None
except AttributeError as exc:
except (ModuleNotFoundError, ValueError, AttributeError) as exc:
# If it's available, module exist but dependencies are missing
if available_classes and class_name in available_classes:
raise DatasetError(
f"{exc} Please see the documentation on how to "
f"{exc}. Please see the documentation on how to "
f"install relevant dependencies for {class_path}:\n"
f"https://kedro.readthedocs.io/en/stable/"
f"kedro_project_setup/dependencies.html"
f"https://docs.kedro.org/en/stable/kedro_project_setup/"
f"dependencies.html#install-dependencies-related-to-the-data-catalog"
Comment on lines +439 to +442
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated the link according to review suggestion

) from exc
return None

Expand Down
2 changes: 0 additions & 2 deletions kedro/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,4 @@ def load_obj(obj_path: str, default_obj_path: str = "") -> Any:
obj_path = obj_path_list.pop(0) if len(obj_path_list) > 1 else default_obj_path
obj_name = obj_path_list[0]
module_obj = importlib.import_module(obj_path)
if not hasattr(module_obj, obj_name):
raise AttributeError(f"Object '{obj_name}' cannot be loaded from '{obj_path}'.")
Comment on lines -26 to -27
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can now create a ModuleNotFoundError due to lazy-loading. I decide to remove it because it adds more complexity but the error message itself isn't helpful at all.

return getattr(module_obj, obj_name)
27 changes: 27 additions & 0 deletions tests/io/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
generate_timestamp,
get_filepath_str,
get_protocol_and_path,
parse_dataset_definition,
validate_on_forbidden_chars,
)

Expand Down Expand Up @@ -265,6 +266,32 @@ def test_validate_forbidden_chars(self, input):
with pytest.raises(DatasetError, match=expected_error_message):
validate_on_forbidden_chars(**input)

def test_dataset_name_typo(self, mocker):
# If the module doesn't exist, it return None instead ModuleNotFoundError
mocker.patch("kedro.io.core.load_obj", return_value=None)
dataset_name = "lAmbDaDaTAsET"

with pytest.raises(
DatasetError, match=f"Class '{dataset_name}' not found, is this a typo?"
):
parse_dataset_definition({"type": dataset_name})

def test_dataset_missing_dependencies(self, mocker):
# If the module is found but import the dataset trigger ModuleNotFoundError
dataset_name = "LambdaDataset"

def side_effect_function(value):
if "__all__" in value:
return [dataset_name]
else:
raise ModuleNotFoundError

mocker.patch("kedro.io.core.load_obj", side_effect=side_effect_function)

pattern = "Please see the documentation on how to install relevant dependencies"
with pytest.raises(DatasetError, match=pattern):
parse_dataset_definition({"type": dataset_name})

Comment on lines +269 to +294
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added some tests for testing the two common scenarios.


class TestAbstractVersionedDataset:
def test_version_str_repr(self, load_version, save_version):
Expand Down
6 changes: 0 additions & 6 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,6 @@ def test_load_obj_default_path(self):
extracted_obj = load_obj("DummyClass", "tests.test_utils")
assert extracted_obj is DummyClass

def test_load_obj_invalid_attribute(self):
with pytest.raises(
AttributeError, match=r"Object 'InvalidClass' cannot be loaded"
):
load_obj("InvalidClass", "tests.test_utils")

def test_load_obj_invalid_module(self):
with pytest.raises(ImportError, match=r"No module named 'missing_path'"):
load_obj("InvalidClass", "missing_path")
Loading