Improve ludwig feature dict #3904

Open · wants to merge 12 commits into base: master
10 changes: 6 additions & 4 deletions ludwig/explain/captum.py
@@ -216,13 +216,13 @@ def explain(self) -> ExplanationsResult:
feat_to_token_attributions_global[feat_name] = token_attributions_global

self.global_explanation.add(
- input_features.keys(), total_attribution_global, feat_to_token_attributions_global
+ input_features.key_list(), total_attribution_global, feat_to_token_attributions_global
)

for i, (feature_attributions, explanation) in enumerate(zip(total_attribution, self.row_explanations)):
# Add the feature attributions to the explanation object for this row.
explanation.add(
- input_features.keys(),
+ input_features.key_list(),
feature_attributions,
{k: v[i] for k, v in feat_to_token_attributions.items()},
)
@@ -245,7 +245,7 @@ def explain(self) -> ExplanationsResult:
}
# Prepend the negative class to the list of label explanations.
self.global_explanation.add(
- input_features.keys(), negated_attributions, negated_token_attributions, prepend=True
+ input_features.key_list(), negated_attributions, negated_token_attributions, prepend=True
)

for explanation in self.row_explanations:
@@ -257,7 +257,9 @@ def explain(self) -> ExplanationsResult:
if fa.token_attributions is not None
}
# Prepend the negative class to the list of label explanations.
- explanation.add(input_features.keys(), negated_attributions, negated_token_attributions, prepend=True)
+ explanation.add(
+     input_features.key_list(), negated_attributions, negated_token_attributions, prepend=True
+ )

# TODO(travis): for force plots, need something similar to SHAP E[X]
expected_values.append(0.0)
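For context: once LudwigFeatureDict subclasses MutableMapping (see ludwig/features/feature_utils.py below), keys() returns a lazy KeysView rather than a list, which is why call sites that need an actual list move to the new key_list() helper, and why the asserts in ecd.py, gbm.py, and llm.py below wrap both sides in list(). A minimal sketch of the difference, using a plain dict as a stand-in:

    from collections.abc import KeysView

    d = {"to": 1, "type": 2}
    assert isinstance(d.keys(), KeysView)    # a lazy view, not a list
    assert d.keys() != ["to", "type"]        # a view never compares equal to a list
    assert list(d.keys()) == ["to", "type"]  # materializing it restores list semantics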
10 changes: 6 additions & 4 deletions ludwig/explain/captum_ray.py
@@ -115,13 +115,13 @@ def explain(self) -> ExplanationsResult:
feat_to_token_attributions_global[feat_name] = token_attributions_global

self.global_explanation.add(
- input_features.keys(), total_attribution_global, feat_to_token_attributions_global
+ input_features.key_list(), total_attribution_global, feat_to_token_attributions_global
)

for i, (feature_attributions, explanation) in enumerate(zip(total_attribution, self.row_explanations)):
# Add the feature attributions to the explanation object for this row.
explanation.add(
- input_features.keys(),
+ input_features.key_list(),
feature_attributions,
{k: v[i] for k, v in feat_to_token_attributions.items()},
)
@@ -140,7 +140,7 @@ def explain(self) -> ExplanationsResult:
}
# Prepend the negative class to the list of label explanations.
self.global_explanation.add(
- input_features.keys(), negated_attributions, negated_token_attributions, prepend=True
+ input_features.key_list(), negated_attributions, negated_token_attributions, prepend=True
)

for explanation in self.row_explanations:
@@ -152,7 +152,9 @@ def explain(self) -> ExplanationsResult:
if fa.token_attributions is not None
}
# Prepend the negative class to the list of label explanations.
- explanation.add(input_features.keys(), negated_attributions, negated_token_attributions, prepend=True)
+ explanation.add(
+     input_features.key_list(), negated_attributions, negated_token_attributions, prepend=True
+ )

# TODO(travis): for force plots, need something similar to SHAP E[X]
expected_values.append(0.0)
4 changes: 2 additions & 2 deletions ludwig/explain/gbm.py
@@ -55,11 +55,11 @@ def explain(self) -> ExplanationsResult:

expected_values = []
for _ in range(self.vocab_size):
- self.global_explanation.add(base_model.input_features.keys(), feat_imp)
+ self.global_explanation.add(base_model.input_features.key_list(), feat_imp)

for explanation in self.row_explanations:
# Add the feature attributions to the explanation object for this row.
- explanation.add(base_model.input_features.keys(), feat_imp)
+ explanation.add(base_model.input_features.key_list(), feat_imp)

# TODO:
expected_values.append(0.0)
47 changes: 20 additions & 27 deletions ludwig/features/feature_utils.py
@@ -14,7 +14,8 @@
# limitations under the License.
# ==============================================================================
import re
- from typing import Dict, List, Optional, Tuple, Union
+ from collections.abc import MutableMapping
+ from typing import Iterator, List, Optional, Tuple, Union

import numpy as np
import torch
@@ -157,7 +158,7 @@ def get_name_from_module_dict_key(key: str, feature_name_suffix_length: int = FE
return name[:-feature_name_suffix_length]


- class LudwigFeatureDict(torch.nn.Module):
+ class LudwigFeatureDict(torch.nn.Module, MutableMapping):
Collaborator:

@dennisrall Thank you for incorporating my previous suggestion -- I think that part looks clean now. Thank you for the idea and the implementation!

For this one, I am not sure the benefits of adding the MutableMapping subclassing justify the risk brought about by the multiple inheritance. Do the tests cover all the eventualities that might happen with this change?

Thank you. /cc @justinxzhao @arnavgarg1 @Infernaught

Contributor Author:

No problem, I was just playing around a bit.

I can't think of any problems with the multiple inheritance, but you know the code better than I do 😉

It is also possible to remove the MutableMapping inheritance and implement the other methods by hand, but I think this way is a bit cleaner, provided it doesn't cause any problems.
"""Torch ModuleDict wrapper that permits keys with any name.

Torch's ModuleDict implementation doesn't allow certain keys to be used if they conflict with existing class
@@ -174,39 +174,31 @@ class LudwigFeatureDict(torch.nn.Module):
def __init__(self):
super().__init__()
self.module_dict = torch.nn.ModuleDict()
- self.internal_key_to_original_name_map = {}

- def get(self, key) -> torch.nn.Module:
+ def __getitem__(self, key: str) -> torch.nn.Module:
return self.module_dict[get_module_dict_key_from_name(key)]

- def set(self, key: str, module: torch.nn.Module) -> None:
+ def __setitem__(self, key: str, value: torch.nn.Module) -> None:
module_dict_key_name = get_module_dict_key_from_name(key)
- self.internal_key_to_original_name_map[module_dict_key_name] = key
- self.module_dict[module_dict_key_name] = module
+ self.module_dict[module_dict_key_name] = value

- def __len__(self) -> int:
- return len(self.module_dict)
+ def __delitem__(self, key: str) -> None:
+ del self.module_dict[get_module_dict_key_from_name(key)]

- def __next__(self) -> None:
- return next(iter(self))
+ def __iter__(self) -> Iterator[str]:
+ return (get_name_from_module_dict_key(key) for key in self.module_dict)

- def __iter__(self) -> None:
- return iter(self.keys())
+ def __len__(self) -> int:
+ return len(self.module_dict)

- def keys(self) -> List[str]:
- return [
-     get_name_from_module_dict_key(feature_name)
-     for feature_name in self.internal_key_to_original_name_map.keys()
- ]
+ def set(self, key: str, value: torch.nn.Module) -> None:
+ self[key] = value

- def values(self) -> List[torch.nn.Module]:
- return [module for _, module in self.module_dict.items()]
+ def key_list(self) -> List[str]:
+ return list(self.keys())

- def items(self) -> List[Tuple[str, torch.nn.Module]]:
- return [
-     (get_name_from_module_dict_key(feature_name), module) for feature_name, module in self.module_dict.items()
- ]
+ def value_list(self) -> List[torch.nn.Module]:
+ return list(self.values())

- def update(self, modules: Dict[str, torch.nn.Module]) -> None:
-     for feature_name, module in modules.items():
-         self.set(feature_name, module)
+ def item_list(self) -> List[Tuple[str, torch.nn.Module]]:
+ return list(self.items())
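For context: with MutableMapping among the bases, the five methods above (__getitem__, __setitem__, __delitem__, __iter__, __len__) are all the class must define; the mixin derives get(), pop(), popitem(), clear(), update(), setdefault(), keys(), values(), items(), __contains__(), and __eq__() from them. A standalone sketch of the same pattern with a toy key-mangling scheme (illustrative only, not Ludwig code):

    from collections.abc import MutableMapping


    class PrefixedDict(MutableMapping):
        """Toy analogue of LudwigFeatureDict: stores each key under a mangled name."""

        def __init__(self):
            self._store = {}

        def __getitem__(self, key: str):
            return self._store["k_" + key]

        def __setitem__(self, key: str, value) -> None:
            self._store["k_" + key] = value

        def __delitem__(self, key: str) -> None:
            del self._store["k_" + key]

        def __iter__(self):
            return (k[2:] for k in self._store)  # strip the "k_" prefix

        def __len__(self) -> int:
            return len(self._store)


    d = PrefixedDict()
    d["to"] = 1
    d.update({"type": 2})    # update() is inherited from MutableMapping
    assert d.pop("to") == 1  # so are pop(), get(), clear(), setdefault(), ...
    assert d == {"type": 2}  # Mapping.__eq__ comes for free as well

    # One genuine multiple-inheritance wrinkle: Mapping sets __hash__ = None,
    # so instances are unhashable unless the class defines __hash__ itself --
    # likely why the author mentions adding a hash method in the review
    # thread further down.
    try:
        hash(d)
    except TypeError as err:
        print(err)  # unhashable type: 'PrefixedDict'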
2 changes: 1 addition & 1 deletion ludwig/models/ecd.py
@@ -143,7 +143,7 @@ def forward(
else:
targets = None

- assert list(inputs.keys()) == self.input_features.keys()
+ assert list(inputs.keys()) == list(self.input_features.keys())

encoder_outputs = self.encode(inputs)
combiner_outputs = self.combine(encoder_outputs)
4 changes: 2 additions & 2 deletions ludwig/models/gbm.py
@@ -101,14 +101,14 @@ def forward(
) -> Dict[str, torch.Tensor]:
# Invoke output features.
output_logits = {}
- output_feature_name = self.output_features.keys()[0]
+ output_feature_name = next(iter(self.output_features.keys()))
output_feature = self.output_features.get(output_feature_name)

# If `inputs` is a tuple, it should contain `(inputs, targets)`.
if isinstance(inputs, tuple):
inputs, _ = inputs

- assert list(inputs.keys()) == self.input_features.keys()
+ assert list(inputs.keys()) == list(self.input_features.keys())

# If the model has not been compiled, predict using the LightGBM sklearn interface. Otherwise, use torch with
# the Hummingbird compiled model. Notably, when compiling the model to torchscript, compiling with Hummingbird
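For context: the keys()[0] call above breaks under the new mapping interface because a view is not subscriptable; next(iter(...)) fetches the first key in insertion order without materializing a list. The analogous llm.py fix below replaces items()[0][1] with next(iter(...values())). A quick illustration with a plain dict:

    d = {"out": "decoder"}

    # d.keys()[0] raises TypeError: 'dict_keys' object is not subscriptable
    first = next(iter(d.keys()))  # first key in insertion order, O(1)
    assert first == "out"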
11 changes: 6 additions & 5 deletions ludwig/models/llm.py
@@ -65,13 +65,13 @@ def __iter__(self) -> None:
return iter(self.obj.keys())

def keys(self) -> List[str]:
- return self.obj.keys()
+ return self.obj.key_list()
Collaborator (@alexsherstinsky, Jan 21, 2024):

@dennisrall Do you think it might be simpler to retain the coding pattern of returning list(my_object.keys()) or list(my_object.items()), etc., instead of introducing special additional methods like item_list() and value_list()? It seems to me that doing so would be consistent with other cases in these Ludwig modules as well as with general Python collection patterns. Thank you.


def values(self) -> List[torch.nn.Module]:
- return self.obj.values()
+ return self.obj.value_list()

def items(self) -> List[Tuple[str, torch.nn.Module]]:
- return self.obj.items()
+ return self.obj.item_list()

def update(self, modules: Dict[str, torch.nn.Module]) -> None:
self.obj.update(modules)
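On the review question above: the two spellings produce the same value, so the choice is purely one of API surface. A sketch, assuming the PR branch is installed:

    import torch

    from ludwig.features.feature_utils import LudwigFeatureDict

    fdict = LudwigFeatureDict()
    fdict["to"] = torch.nn.Module()

    # list(fdict.keys()) is the standard Mapping idiom; key_list() is the
    # convenience wrapper this PR introduces. They return equal lists.
    assert list(fdict.keys()) == fdict.key_list() == ["to"]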
@@ -148,7 +148,8 @@ def __init__(
)

# Extract the decoder object for the forward pass
- self._output_feature_decoder = ModuleWrapper(self.output_features.items()[0][1])
+ decoder = next(iter(self.output_features.values()))
+ self._output_feature_decoder = ModuleWrapper(decoder)

self.attention_masks = None

@@ -401,7 +402,7 @@ def _unpack_inputs(
else:
targets = None

- assert list(inputs.keys()) == self.input_features.keys()
+ assert list(inputs.keys()) == list(self.input_features.keys())

input_ids = self.get_input_ids(inputs)
target_ids = self.get_target_ids(targets) if targets else None
12 changes: 6 additions & 6 deletions ludwig/trainers/trainer_lightgbm.py
@@ -831,8 +831,8 @@ def _construct_lgb_datasets(
validation_set: Optional["Dataset"] = None, # noqa: F821
test_set: Optional["Dataset"] = None, # noqa: F821
) -> Tuple[lgb.Dataset, List[lgb.Dataset], List[str]]:
- X_train = training_set.to_scalar_df(self.model.input_features.values())
- y_train = training_set.to_scalar_df(self.model.output_features.values())
+ X_train = training_set.to_scalar_df(self.model.input_features.value_list())
+ y_train = training_set.to_scalar_df(self.model.output_features.value_list())

# create dataset for lightgbm
# keep raw data for continued training https://github.com/microsoft/LightGBM/issues/4965#issuecomment-1019344293
@@ -850,8 +850,8 @@ def _construct_lgb_datasets(
eval_sets = [lgb_train]
eval_names = [LightGBMTrainer.TRAIN_KEY]
if validation_set is not None:
- X_val = validation_set.to_scalar_df(self.model.input_features.values())
- y_val = validation_set.to_scalar_df(self.model.output_features.values())
+ X_val = validation_set.to_scalar_df(self.model.input_features.value_list())
+ y_val = validation_set.to_scalar_df(self.model.output_features.value_list())
try:
lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train, free_raw_data=False).construct()
except lgb.basic.LightGBMError as e:
@@ -869,8 +869,8 @@ def _construct_lgb_datasets(
pass

if test_set is not None:
- X_test = test_set.to_scalar_df(self.model.input_features.values())
- y_test = test_set.to_scalar_df(self.model.output_features.values())
+ X_test = test_set.to_scalar_df(self.model.input_features.value_list())
+ y_test = test_set.to_scalar_df(self.model.output_features.value_list())
try:
lgb_test = lgb.Dataset(X_test, label=y_test, reference=lgb_train, free_raw_data=False).construct()
except lgb.basic.LightGBMError as e:
118 changes: 113 additions & 5 deletions tests/ludwig/features/test_feature_utils.py
@@ -5,6 +5,114 @@
from ludwig.features import feature_utils


@pytest.fixture
def to_module() -> torch.nn.Module:
return torch.nn.Module()


@pytest.fixture
def type_module() -> torch.nn.Module:
Collaborator (@alexsherstinsky, Jan 22, 2024):

@dennisrall This fixture and the to_module() one appear to be the same, and both just instantiate the PyTorch Module() class. Without docstrings, it is a bit difficult to justify having them. Thank you.

Contributor Author:

Thanks for your feedback. I added a docstring for both fixtures and also for the hash method of the LudwigFeatureDict.

return torch.nn.Module()


@pytest.fixture
def feature_dict(to_module: torch.nn.Module, type_module: torch.nn.Module) -> feature_utils.LudwigFeatureDict:
fdict = feature_utils.LudwigFeatureDict()
fdict.set("to", to_module)
fdict["type"] = type_module
return fdict
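For context: the fixture keys "to" and "type" are not arbitrary; they collide with attributes every torch.nn.Module already has (the .to() and .type() methods), which is exactly the conflict LudwigFeatureDict's key mangling works around. A quick demonstration with a bare ModuleDict:

    import torch

    md = torch.nn.ModuleDict()
    try:
        md["to"] = torch.nn.Module()
    except KeyError as err:
        print(err)  # attribute 'to' already exists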


def test_ludwig_feature_dict_get(
feature_dict: feature_utils.LudwigFeatureDict, to_module: torch.nn.Module, type_module: torch.nn.Module
):
assert feature_dict["to"] == to_module
assert feature_dict.get("type") == type_module
assert feature_dict.get("other_key", default=None) is None


def test_ludwig_feature_dict_keys(feature_dict: feature_utils.LudwigFeatureDict):
assert list(feature_dict.keys()) == ["to", "type"]
assert feature_dict.key_list() == ["to", "type"]


def test_ludwig_feature_dict_values(
feature_dict: feature_utils.LudwigFeatureDict, to_module: torch.nn.Module, type_module: torch.nn.Module
):
assert list(feature_dict.values()) == [to_module, type_module]
assert feature_dict.value_list() == [to_module, type_module]


def test_ludwig_feature_dict_items(
feature_dict: feature_utils.LudwigFeatureDict, to_module: torch.nn.Module, type_module: torch.nn.Module
):
assert list(feature_dict.items()) == [("to", to_module), ("type", type_module)]
assert feature_dict.item_list() == [("to", to_module), ("type", type_module)]


def test_ludwig_feature_dict_iter(feature_dict: feature_utils.LudwigFeatureDict):
assert list(iter(feature_dict)) == ["to", "type"]
assert list(feature_dict) == ["to", "type"]


def test_ludwig_feature_dict_len(feature_dict: feature_utils.LudwigFeatureDict):
assert len(feature_dict) == 2


def test_ludwig_feature_dict_contains(feature_dict: feature_utils.LudwigFeatureDict):
assert "to" in feature_dict and "type" in feature_dict


def test_ludwig_feature_dict_eq(feature_dict: feature_utils.LudwigFeatureDict):
other_dict = feature_utils.LudwigFeatureDict()
assert not feature_dict == other_dict
other_dict.update(feature_dict.item_list())
assert feature_dict == other_dict


def test_ludwig_feature_dict_update(
feature_dict: feature_utils.LudwigFeatureDict, to_module: torch.nn.Module, type_module: torch.nn.Module
):
feature_dict.update({"to": torch.nn.Module(), "new": torch.nn.Module()})
assert len(feature_dict) == 3
assert not feature_dict.get("to") == to_module
assert feature_dict.get("type") == type_module


def test_ludwig_feature_dict_del(feature_dict: feature_utils.LudwigFeatureDict):
del feature_dict["to"]
assert len(feature_dict) == 1


def test_ludwig_feature_dict_clear(feature_dict: feature_utils.LudwigFeatureDict):
feature_dict.clear()
assert len(feature_dict) == 0


def test_ludwig_feature_dict_pop(feature_dict: feature_utils.LudwigFeatureDict, type_module: torch.nn.Module):
assert feature_dict.pop("type") == type_module
assert len(feature_dict) == 1
assert feature_dict.pop("type", default=None) is None


def test_ludwig_feature_dict_popitem(feature_dict: feature_utils.LudwigFeatureDict, to_module: torch.nn.Module):
assert feature_dict.popitem() == ("to", to_module)
assert len(feature_dict) == 1


def test_ludwig_feature_dict_setdefault(feature_dict: feature_utils.LudwigFeatureDict, to_module: torch.nn.Module):
assert feature_dict.setdefault("to") == to_module
assert feature_dict.get("other_key") is None


@pytest.mark.parametrize("name", ["to", "type", "foo", "foo.bar"])
def test_name_to_module_dict_key(name: str):
key = feature_utils.get_module_dict_key_from_name(name)
assert key != name
assert "." not in key
assert feature_utils.get_name_from_module_dict_key(key) == name


def test_ludwig_feature_dict():
feature_dict = feature_utils.LudwigFeatureDict()

@@ -15,10 +123,10 @@ def test_ludwig_feature_dict():
feature_dict.set("type", type_module)

assert iter(feature_dict) is not None
- assert next(feature_dict) is not None
+ # assert next(feature_dict) is not None
assert len(feature_dict) == 2
- assert feature_dict.keys() == ["to", "type"]
- assert feature_dict.items() == [("to", to_module), ("type", type_module)]
+ assert feature_dict.key_list() == ["to", "type"]
+ assert feature_dict.item_list() == [("to", to_module), ("type", type_module)]
assert feature_dict.get("to"), to_module

feature_dict.update({"to_empty": torch.nn.Module()})
@@ -34,8 +142,8 @@ def test_ludwig_feature_dict_with_periods():

feature_dict.set("to.", to_module)

assert feature_dict.keys() == ["to."]
assert feature_dict.items() == [("to.", to_module)]
assert feature_dict.key_list() == ["to."]
assert feature_dict.item_list() == [("to.", to_module)]
assert feature_dict.get("to.") == to_module

