Skip to content

Commit

Permalink
OrdinalEncoder handle encoded_missing_value and unknown_value (#…
Browse files Browse the repository at this point in the history
…1132)

* Ordinal handle `encoded_missing_value` and `unknown_value`

Signed-off-by: Lukas Kreussel <[email protected]>

* black

Signed-off-by: Lukas Kreussel <[email protected]>

* `ruff` +  CodeQL

Signed-off-by: Lukas Kreussel <[email protected]>

* CodeQL again

Signed-off-by: Lukas Kreussel <[email protected]>

---------

Signed-off-by: Lukas Kreussel <[email protected]>
  • Loading branch information
LLukas22 authored Nov 14, 2024
1 parent 2bd2d8f commit c672994
Show file tree
Hide file tree
Showing 2 changed files with 142 additions and 2 deletions.
28 changes: 27 additions & 1 deletion skl2onnx/operator_converters/ordinal_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,12 @@ def convert_sklearn_ordinal_encoder(
result = []
input_idx = 0
dimension_idx = 0

# handle the 'handle_unknown=use_encoded_value' case
default_value = (
None if ordinal_op.handle_unknown == "error" else int(ordinal_op.unknown_value)
)

for categories in ordinal_op.categories_:
if len(categories) == 0:
continue
Expand Down Expand Up @@ -82,6 +88,7 @@ def convert_sklearn_ordinal_encoder(
feature_column = casted_feature_column

attrs = {"name": scope.get_unique_operator_name("LabelEncoder")}

if isinstance(feature_column.type, FloatTensorType):
attrs["keys_floats"] = np.array(
[float(s) for s in categories], dtype=np.float32
Expand All @@ -94,7 +101,26 @@ def convert_sklearn_ordinal_encoder(
attrs["keys_strings"] = np.array(
[str(s).encode("utf-8") for s in categories]
)
attrs["values_int64s"] = np.arange(len(categories)).astype(np.int64)

# handle encoded_missing_value
if not np.isnan(ordinal_op.encoded_missing_value) and (
isinstance(categories[-1], float) and np.isnan(categories[-1])
):
# sklearn always places np.nan as the last entry
# in its categories if it was in the training data
# => we simply add the 'ordinal_op.encoded_missing_value'
# as our last entry in 'values_int64s' if it was in the training data
encoded_missing_value = np.array(
[int(ordinal_op.encoded_missing_value)]
).astype(np.int64)
attrs["values_int64s"] = np.concatenate(
(np.arange(len(categories) - 1).astype(np.int64), encoded_missing_value)
)
else:
attrs["values_int64s"] = np.arange(len(categories)).astype(np.int64)

if default_value:
attrs["default_int64"] = default_value

result.append(scope.get_unique_variable_name("ordinal_output"))
label_encoder_output = scope.get_unique_variable_name("label_encoder")
Expand Down
116 changes: 115 additions & 1 deletion tests/test_sklearn_ordinal_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def test_model_ordinal_encoder(self):
[("input", Int64TensorType([None, 3]))],
target_opset=TARGET_OPSET,
)
self.assertTrue(model_onnx is not None)
self.assertIsNotNone(model_onnx)
dump_data_and_model(
data, model, model_onnx, basename="SklearnOrdinalEncoderInt64-SkipDim1"
)
Expand Down Expand Up @@ -182,6 +182,120 @@ def test_model_ordinal_encoder_cat_list(self):
data, model, model_onnx, basename="SklearnOrdinalEncoderCatList"
)

@unittest.skipIf(
    not ordinal_encoder_support(),
    reason="OrdinalEncoder was not available before 0.20",
)
def test_model_ordinal_encoder_unknown_value(self):
    """Compare ONNX and scikit-learn outputs when ``unknown_value`` is set.

    An ``OrdinalEncoder`` configured with
    ``handle_unknown="use_encoded_value"`` and ``unknown_value=42`` is
    fitted on four string categories and converted to ONNX.  Both models
    are then evaluated on an input that additionally contains ``np.nan``,
    ``"e"`` and ``None`` (none of which were seen during fitting), and the
    flattened results are required to match.
    """
    from onnxruntime import InferenceSession

    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=42)
    train_data = np.array([["a"], ["b"], ["c"], ["d"]], dtype=np.object_)
    eval_data = np.array(
        [["a"], ["b"], ["c"], ["d"], [np.nan], ["e"], [None]], dtype=np.object_
    )

    encoder.fit(train_data)
    # 'np.nan', 'e' and 'None' become 42.
    reference = encoder.transform(eval_data)

    onnx_model = convert_sklearn(
        encoder,
        "scikit-learn ordinal encoder",
        [("input", StringTensorType([None, 1]))],
        target_opset=TARGET_OPSET,
    )
    self.assertIsNotNone(onnx_model)
    dump_data_and_model(
        train_data,
        encoder,
        onnx_model,
        basename="SklearnOrdinalEncoderUnknownValue",
    )

    # Run the converted graph on the CPU provider with the extended input.
    session = InferenceSession(
        onnx_model.SerializeToString(), providers=["CPUExecutionProvider"]
    )
    feeds = {"input": eval_data}
    outputs = session.run(None, feeds)

    assert_almost_equal(reference.reshape(-1), outputs[0].reshape(-1))

@unittest.skipIf(
    not ordinal_encoder_support(),
    reason="OrdinalEncoder was not available before 0.20",
)
def test_model_ordinal_encoder_encoded_missing_value(self):
    """Compare ONNX and scikit-learn outputs when ``encoded_missing_value`` is set.

    An ``OrdinalEncoder`` built with ``encoded_missing_value=42`` is fitted
    on string data that contains one ``np.nan`` entry.  The converted ONNX
    model is run on the same data and its flattened output must match the
    flattened output of ``fit_transform``.
    """
    from onnxruntime import InferenceSession

    encoder = OrdinalEncoder(encoded_missing_value=42)
    samples = np.array([["a"], ["b"], [np.nan], ["c"], ["d"]], dtype=np.object_)

    # 'np.nan' becomes 42
    reference = encoder.fit_transform(samples)

    onnx_model = convert_sklearn(
        encoder,
        "scikit-learn ordinal encoder",
        [("input", StringTensorType([None, 1]))],
        target_opset=TARGET_OPSET,
    )
    self.assertIsNotNone(onnx_model)
    dump_data_and_model(
        samples,
        encoder,
        onnx_model,
        basename="SklearnOrdinalEncoderEncodedMissingValue",
    )

    # Run the converted graph on the CPU provider with the training data.
    session = InferenceSession(
        onnx_model.SerializeToString(), providers=["CPUExecutionProvider"]
    )
    feeds = {"input": samples}
    outputs = session.run(None, feeds)

    assert_almost_equal(reference.reshape(-1), outputs[0].reshape(-1))

@unittest.skipIf(
    not ordinal_encoder_support(),
    reason="OrdinalEncoder was not available before 0.20",
)
def test_model_ordinal_encoder_encoded_missing_value_no_nan(self):
    """Compare ONNX and scikit-learn outputs when no NaN is in the training data.

    The encoder is built with ``encoded_missing_value=42`` but fitted on
    four plain string categories with no ``np.nan`` entry.  The converted
    ONNX model is run on the same data and its flattened output must match
    the flattened output of ``fit_transform``.
    """
    from onnxruntime import InferenceSession

    encoder = OrdinalEncoder(encoded_missing_value=42)
    samples = np.array([["a"], ["b"], ["c"], ["d"]], dtype=np.object_)

    reference = encoder.fit_transform(samples)

    onnx_model = convert_sklearn(
        encoder,
        "scikit-learn ordinal encoder",
        [("input", StringTensorType([None, 1]))],
        target_opset=TARGET_OPSET,
    )
    self.assertIsNotNone(onnx_model)
    dump_data_and_model(
        samples,
        encoder,
        onnx_model,
        basename="SklearnOrdinalEncoderEncodedMissingValueNoNan",
    )

    # Run the converted graph on the CPU provider with the training data.
    session = InferenceSession(
        onnx_model.SerializeToString(), providers=["CPUExecutionProvider"]
    )
    feeds = {"input": samples}
    outputs = session.run(None, feeds)

    assert_almost_equal(reference.reshape(-1), outputs[0].reshape(-1))

@unittest.skipIf(
not set_output_support(),
reason="'ColumnTransformer' object has no attribute 'set_output'",
Expand Down

0 comments on commit c672994

Please sign in to comment.