Skip to content

Commit

Permalink
[YDF] Emit errors when using lists for multi-dim features
Browse files Browse the repository at this point in the history
Before this change, passing Python lists as multi-dimensional NUMERICAL or BOOLEAN features did not raise an error, and an incorrect dataset was consumed instead.

PiperOrigin-RevId: 691391034
  • Loading branch information
rstz authored and copybara-github committed Oct 30, 2024
1 parent a94d04b commit de37514
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 0 deletions.
22 changes: 22 additions & 0 deletions yggdrasil_decision_forests/port/python/ydf/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,13 @@ def normalize_categorical_string_value(value: Any) -> bytes:
" task is selected. For example, you cannot train a classification"
" model (task=ydf.Task.CLASSIFICATION) with floating point labels."
)
if isinstance(value, list):
raise ValueError(
f"Cannot import column {column.name!r} with"
f" semantic={column.semantic} as it contains lists.\nNote:"
" Unrolling multi-dimensional columns is only supported for numpy"
" arrays"
)
raise ValueError(
f"Cannot import column {column.name!r} with"
f" semantic={column.semantic} and"
Expand Down Expand Up @@ -134,6 +141,13 @@ def _add_column(
" a regression model (task=ydf.Task.REGRESSION) on a string"
" column."
) from e
if column_data.ndim != 1:
raise ValueError(
f"Cannot convert {column.semantic.name} column {column.name!r} "
f" with content={column_data!r} to a 1-dimensional array of"
" np.float32 values. Note: Unrolling multi-dimensional columns is"
" only supported for numpy arrays"
)

if column.semantic == dataspec.Semantic.NUMERICAL:
self._dataset.PopulateColumnNumericalNPFloat32(
Expand Down Expand Up @@ -172,6 +186,13 @@ def _add_column(
f" values. Got {original_column_data!r}."
)
raise ValueError(message)
if column_data.ndim != 1:
raise ValueError(
f"Cannot convert BOOLEAN column {column.name!r}"
f" with content={column_data!r} to a 1-dimensional array of"
" np.float32 values. Note: Unrolling multi-dimensional columns is"
" only supported for numpy arrays"
)

self._dataset.PopulateColumnBooleanNPBool(
column.name,
Expand Down Expand Up @@ -221,6 +242,7 @@ def _add_column(
)
message += f"\nGot {original_column_data!r}."
raise ValueError(message)
assert column_data.ndim == 1, "Categorical columns must be 1-dimensional"

if column_data.dtype.type == np.bytes_:
if inference_args is not None:
Expand Down
39 changes: 39 additions & 0 deletions yggdrasil_decision_forests/port/python/ydf/dataset/dataset_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1561,6 +1561,45 @@ def test_fail_gracefully_for_incorrect_boolean_type(self):
columns=[("f1", dataspec.Semantic.BOOLEAN)],
)

def test_multidim_numerical_list(self):
  """Nested Python lists in a NUMERICAL column must raise a ValueError."""
  # Pattern checked against the message of the raised ValueError.
  expected_error = (
      ".*Unrolling multi-dimensional columns is only supported for numpy"
      " arrays.*"
  )
  # Each row of "f1" is itself a list, i.e. the column is two-dimensional.
  nested_rows = {"f1": [[1, 2], [3, 4], [5, 6]]}
  with self.assertRaisesRegex(ValueError, expected_error):
    dataset.create_vertical_dataset(
        nested_rows,
        columns=[("f1", dataspec.Semantic.NUMERICAL)],
    )

def test_multidim_boolean_list(self):
  """Nested Python lists in a BOOLEAN column must raise a ValueError."""
  # Pattern checked against the message of the raised ValueError.
  expected_error = (
      ".*Unrolling multi-dimensional columns is only supported for numpy"
      " arrays.*"
  )
  # Each row of "f1" is itself a list, i.e. the column is two-dimensional.
  nested_rows = {"f1": [[True, False], [True, False], [True, False]]}
  with self.assertRaisesRegex(ValueError, expected_error):
    dataset.create_vertical_dataset(
        nested_rows,
        columns=[("f1", dataspec.Semantic.BOOLEAN)],
    )

def test_multidim_categorical_list(self):
  """Nested Python lists in a CATEGORICAL column must raise a ValueError."""
  # Pattern checked against the message of the raised ValueError.
  expected_error = (
      ".*Unrolling multi-dimensional columns is only supported for numpy"
      " arrays.*"
  )
  # Each row of "f1" is itself a list, i.e. the column is two-dimensional.
  nested_rows = {"f1": [[1, 2], [3, 4], [5, 6]]}
  with self.assertRaisesRegex(ValueError, expected_error):
    dataset.create_vertical_dataset(
        nested_rows,
        columns=[("f1", dataspec.Semantic.CATEGORICAL)],
    )


class CategoricalSetTest(absltest.TestCase):

Expand Down

0 comments on commit de37514

Please sign in to comment.