Skip to content

Commit

Permalink
Improve error messages + support bool as categorical
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 576463711
  • Loading branch information
achoum authored and copybara-github committed Oct 25, 2023
1 parent 04668ca commit 5d8c812
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 13 deletions.
43 changes: 30 additions & 13 deletions yggdrasil_decision_forests/port/python/ydf/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ def _add_column(
if column_data.dtype != np.float32:
# TODO: Add control for warning (flag or count).
logging.info(
"Column '%s' with numerical semantic has dtype %s. Casting value to"
"Column '%s' with NUMERICAL semantic has dtype %s. Casting value to"
" float32.",
column.name,
column_data.dtype.name,
Expand All @@ -309,11 +309,12 @@ def _add_column(
column_data = column_data.astype(np.float32)
except ValueError as e:
raise ValueError(
f"Cannot convert NUMERICAL column {column.name!r} with"
f" content={column_data!r} to np.float32 values. If"
" the column is a label, make sure the training task is"
" compatible. For example, you cannot train a regression model"
" (task=ydf.Task.REGRESSION) on a string column."
f"Cannot convert NUMERICAL column {column.name!r} of type"
f" {_type(column_data)} and with content={column_data!r} to"
" np.float32 values.\nNote: If the column is a label, make sure"
" the training task is compatible. For example, you cannot train"
" a regression model (task=ydf.Task.REGRESSION) on a string"
" column."
) from e

self._dataset.PopulateColumnNumericalNPFloat32(
Expand All @@ -336,6 +337,7 @@ def _add_column(
elif column_data.dtype.type in [
np.object_,
np.string_,
np.bool_,
np.int8,
np.int16,
np.int32,
Expand All @@ -352,11 +354,12 @@ def _add_column(
np.float64,
]:
raise ValueError(
f"Column {column.name!r} with semantic={column.semantic} should not"
f" contain floating point values. Got {original_column_data!r}. If"
" the column is a label, make sure the correct task is selected."
" For example, you cannot train a classification model"
" (task=ydf.Task.CLASSIFICATION) with floating point labels."
f"Cannot import column {column.name!r} with"
f" semantic={column.semantic} as it contains floating point values."
f" Got {original_column_data!r}.\nNote: If the column is a label,"
" make sure the correct task is selected. For example, you cannot"
" train a classification model (task=ydf.Task.CLASSIFICATION) with"
" floating point labels."
)

if column_data.dtype.type == np.bytes_:
Expand All @@ -372,8 +375,13 @@ def _add_column(
return

raise ValueError(
f"Column {column.name!r} with semantic={column.semantic} and"
f" content={original_column_data!r} is not supported"
f"Cannot import column {column.name!r} with semantic={column.semantic},"
f" type={_type(original_column_data)} and"
f" content={original_column_data!r}.\nNote: If the column is a label,"
" the semantic was selected based on the task. For example,"
" task=ydf.Task.CLASSIFICATION requires a CATEGORICAL compatible label"
" column, and task=ydf.Task.REGRESSION requires a NUMERICAL compatible"
" label column."
)

def _initialize_from_data_spec(
Expand Down Expand Up @@ -826,3 +834,12 @@ def _normalize_monotonic_constraint(
" Monotonic.INCREASING, or Monotonic.DECREASING. Got"
f" {constraint!r} instead"
)


def _type(value: Any) -> str:
  """Describes the type of `value` as text, for use in error messages.

  Args:
    value: Any Python object.

  Returns:
    For a numpy array, a description naming the array's dtype. For any other
    value, the string form of its Python type.
  """
  # Guard clause: everything that is not a numpy array is described by its
  # plain Python type.
  if not isinstance(value, np.ndarray):
    return str(type(value))

  dtype_name = value.dtype.name
  return f"numpy's array of '{dtype_name}'"
18 changes: 18 additions & 0 deletions yggdrasil_decision_forests/port/python/ydf/learner/learner_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,24 @@ def test_tuner_predefined(self):
self.assertIsNotNone(logs)
self.assertLen(logs.steps, 5)

def test_label_type_error_message(self):
  """Checks the error raised when the label values do not fit the task."""
  # Case 1: classification task trained on floating point labels.
  categorical_error = (
      "Cannot import column 'l' with semantic=Semantic.CATEGORICAL"
  )
  with self.assertRaisesRegex(ValueError, categorical_error):
    classifier = specialized_learners.GradientBoostedTreesLearner(
        label="l", task=generic_learner.Task.CLASSIFICATION
    )
    float_labels = pd.DataFrame({"l": [1.0, 2.0], "f": [0, 1]})
    _ = classifier.train(float_labels)

  # Case 2: regression task trained on string labels.
  numerical_error = (
      "Cannot convert NUMERICAL column 'l' of type numpy's array of 'object'"
      " and with content="
  )
  with self.assertRaisesRegex(ValueError, numerical_error):
    regressor = specialized_learners.GradientBoostedTreesLearner(
        label="l", task=generic_learner.Task.REGRESSION
    )
    string_labels = pd.DataFrame({"l": ["A", "B"], "f": [0, 1]})
    _ = regressor.train(string_labels)


class CARTLearnerTest(LearnerTest):

Expand Down

0 comments on commit 5d8c812

Please sign in to comment.