onnx · xadupre · May 28, 2024 · May 28, 2024 · May 28, 2024 · May 28, 2024
diff --git a/CHANGELOGS.md b/CHANGELOGS.md
@@ -2,6 +2,12 @@
 
 ## 1.17.0 (development)
 
+* Upgrade the maximum supported opset to 21 and
+  raise the minimum requirement to scikit-learn>=1.1
+  (older versions are no longer tested)
+  [#1098](https://github.com/onnx/sklearn-onnx/pull/1098)
+* Support infrequent categories for OneHotEncoder
+  [#1029](https://github.com/onnx/sklearn-onnx/pull/1029)
 * Support kernel Matern in Gaussian Process
   [#978](https://github.com/onnx/sklearn-onnx/pull/978)
 * Fix for multidimensional gaussian process

diff --git a/README.md b/README.md
@@ -18,7 +18,7 @@ including models or transformers coming from external libraries.
 ## Documentation
 Full documentation including tutorials is available at [https://onnx.ai/sklearn-onnx/](https://onnx.ai/sklearn-onnx/).
 [Supported scikit-learn Models](https://onnx.ai/sklearn-onnx/supported.html)
-Last supported opset is 19.
+Last supported opset is 21.
 
 You may also find answers in [existing issues](https://github.com/onnx/sklearn-onnx/issues?utf8=%E2%9C%93&q=is%3Aissue)
 or submit a new one.

diff --git a/docs/api_summary.rst b/docs/api_summary.rst
@@ -124,6 +124,6 @@ Topology
 --------
 
 .. autoclass:: skl2onnx.common._topology.Topology
-    :members: compile, topological_operator_iterator
+    :members:
 
 .. autofunction:: skl2onnx.common._topology.convert_topology
diff --git a/docs/exts/sphinx_skl2onnx_extension.py b/docs/exts/sphinx_skl2onnx_extension.py
@@ -174,6 +174,19 @@ def make_ref(name):
 
         rows.append("")
         for name in sorted_keys:
+            if name in {
+                "OnnxDecorrelateTransformer",
+                "OnnxGrowthCalculator",
+                "OnnxPredictableTSNE",
+                "OnnxSklearnLGBMClassifier",
+                "OnnxSklearnLGBMRegressor",
+                "OnnxSklearnXGBClassifier",
+                "OnnxSklearnXGBRegressor",
+                "OnnxSklearnPipeline",
+                "OnnxSklearnColumnTransformer",
+                "OnnxSklearnFeatureUnion",
+            }:
+                continue
             rows = []
             cl = cls[name]
             rows.append(".. _l-sklops-{}:".format(cl.__name__))

diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -1,5 +1,6 @@
 # tests
 black
+jinja2
 onnxruntime-extensions
 onnxscript
 pandas

diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,3 @@
 onnx>=1.2.1
-scikit-learn>=0.19
+scikit-learn>=1.1
 onnxconverter-common>=1.7.0
diff --git a/skl2onnx/__init__.py b/skl2onnx/__init__.py
@@ -9,7 +9,7 @@
 __producer_version__ = __version__
 __domain__ = "ai.onnx"
 __model_version__ = 0
-__max_supported_opset__ = 19  # Converters are tested up to this version.
+__max_supported_opset__ = 21  # Converters are tested up to this version.
 
 
 from .convert import convert_sklearn, to_onnx, wrap_as_onnx_mixin  # noqa

diff --git a/skl2onnx/algebra/_cache/__init__.py b/skl2onnx/algebra/_cache/__init__.py
diff --git a/skl2onnx/algebra/automation.py b/skl2onnx/algebra/automation.py
@@ -3,7 +3,6 @@
 import textwrap
 import onnx
 import onnx.defs  # noqa
-from onnx.defs import OpSchema
 
 
 def _get_doc_template():
@@ -115,154 +114,6 @@ def get_domain_list():
     )
 
 
-def get_rst_doc(op_name=None):
-    """
-    Returns a documentation in RST format
-    for all :class:`OnnxOperator`.
-
-    :param op_name: operator name of None for all
-    :return: string
-
-    The function relies on module *jinja2* or replaces it
-    with a simple rendering if not present.
-    """
-    if op_name is None:
-        schemas = onnx.defs.get_all_schemas_with_history()
-    elif isinstance(op_name, str):
-        schemas = [
-            schema
-            for schema in onnx.defs.get_all_schemas_with_history()
-            if schema.name == op_name
-        ]
-        if len(schemas) > 1:
-            raise RuntimeError(
-                "Multiple operators have the same name '{}'.".format(op_name)
-            )
-    elif not isinstance(op_name, list):
-        schemas = [op_name]
-    if len(schemas) == 0:
-        raise ValueError("Unable to find any operator with name '{}'.".format(op_name))
-
-    # from onnx.backend.sample.ops import collect_sample_implementations
-    # from onnx.backend.test.case import collect_snippets
-    # SNIPPETS = collect_snippets()
-    # SAMPLE_IMPLEMENTATIONS = collect_sample_implementations()
-    def format_name_with_domain(sch):
-        if sch.domain:
-            return "{} ({})".format(sch.name, sch.domain)
-        return sch.name
-
-    def get_is_homogeneous(obj):
-        try:
-            return obj.is_homogeneous
-        except AttributeError:
-            try:
-                return obj.isHomogeneous
-            except AttributeError:
-                return False
-
-    def format_option(obj):
-        opts = []
-        if OpSchema.FormalParameterOption.Optional == obj.option:
-            opts.append("optional")
-        elif OpSchema.FormalParameterOption.Variadic == obj.option:
-            opts.append("variadic")
-        if get_is_homogeneous(obj):
-            opts.append("heterogeneous")
-        if opts:
-            return " (%s)" % ", ".join(opts)
-        return ""
-
-    def getconstraint(const, ii):
-        if const.type_param_str:
-            name = const.type_param_str
-        else:
-            name = str(ii)
-        if const.allowed_type_strs:
-            name += " " + ", ".join(const.allowed_type_strs)
-        return name
-
-    def getname(obj, i):
-        name = obj.name
-        if len(name) == 0:
-            return str(i)
-        return name
-
-    def process_documentation(doc):
-        if doc is None:
-            doc = ""
-        doc = textwrap.dedent(doc)
-        main_docs_url = "https://github.com/onnx/onnx/blob/main/"
-        rep = {
-            "[the doc](IR.md)": "`ONNX <{0}docs/IR.md>`_",
-            "[the doc](Broadcasting.md)": (
-                "`Broadcasting in ONNX <{0}docs/Broadcasting.md>`_"
-            ),
-            "<dl>": "",
-            "</dl>": "",
-            "<dt>": "* ",
-            "<dd>": "  ",
-            "</dt>": "",
-            "</dd>": "",
-            "<tt>": "``",
-            "</tt>": "``",
-            "<br>": "\n",
-        }
-        for k, v in rep.items():
-            doc = doc.replace(k, v.format(main_docs_url))
-        move = 0
-        lines = []
-        for line in doc.split("\n"):
-            if line.startswith("```"):
-                if move > 0:
-                    move -= 4
-                    lines.append("\n")
-                else:
-                    lines.append("::\n")
-                    move += 4
-            elif move > 0:
-                lines.append(" " * move + line)
-            else:
-                lines.append(line)
-        return "\n".join(lines)
-
-    def build_doc_url(sch):
-        doc_url = "https://github.com/onnx/onnx/blob/main/docs/Operators"
-        if "ml" in sch.domain:
-            doc_url += "-ml"
-        doc_url += ".md"
-        doc_url += "#"
-        if sch.domain not in (None, "", "ai.onnx"):
-            doc_url += sch.domain + "."
-        return doc_url
-
-    def get_type_str(inou):
-        try:
-            return inou.type_str
-        except AttributeError:
-            return inou.typeStr
-
-    fnwd = format_name_with_domain
-    tmpl = _template_operator
-    docs = tmpl.render(
-        schemas=schemas,
-        OpSchema=OpSchema,
-        len=len,
-        getattr=getattr,
-        sorted=sorted,
-        format_option=format_option,
-        getconstraint=getconstraint,
-        getname=getname,
-        enumerate=enumerate,
-        format_name_with_domain=fnwd,
-        process_documentation=process_documentation,
-        build_doc_url=build_doc_url,
-        str=str,
-        get_type_str=get_type_str,
-    )
-    return docs
-
-
 def _get_doc_template_sklearn():
     try:
         from jinja2 import Template

diff --git a/skl2onnx/algebra/onnx_ops.py b/skl2onnx/algebra/onnx_ops.py
@@ -4,7 +4,6 @@
 Place holder for all ONNX operators.
 """
 import sys
-import os
 import numpy as np
 
 try:
@@ -14,8 +13,6 @@
 import onnx
 from ..common.data_types import DataType
 from ..common._topology import Variable
-from .automation import get_rst_doc
-from ._cache import cache_folder
 
 
 def ClassFactory(
@@ -169,7 +166,6 @@ def dynamic_class_creation(cache=False):
     <https://github.com/onnx/onnx/blob/main/docs/
     Operators-ml.md>`_.
     """
-    cache_dir = cache_folder()
     res = {}
     for schema in onnx.defs.get_all_schemas_with_history():
         if schema.support_level == schema.SupportType.EXPERIMENTAL:
@@ -199,22 +195,8 @@ def _c(obj, label, i):
         outputs = [_c(o, "O", i) for i, o in enumerate(schema.outputs)]
         args = [p for p in schema.attributes]
 
-        if "_" in name:
-            class_name = "Onnx" + name
-        else:
-            class_name = "Onnx" + schema.name
-
-        filename = os.path.join(
-            cache_dir, schema.name + "_" + str(schema.since_version) + ".rst"
-        )
-        if not cache and os.path.exists(filename):
-            with open(filename, "r", encoding="utf-8") as f:
-                doc = f.read()
-        else:
-            doc = get_rst_doc(schema)
-            if cache:
-                with open(filename, "w", encoding="utf-8") as f:
-                    f.write(doc)
+        class_name = "Onnx" + (name if "_" in name else schema.name)
+        doc = f"See `{name} <https://onnx.ai/onnx/operators/onnx__{name}.html>`_."
 
         cl = ClassFactory(
             class_name,

diff --git a/skl2onnx/common/_topology.py b/skl2onnx/common/_topology.py
@@ -63,6 +63,7 @@ def _default_OPSET_TO_IR_VERSION():
         18: 8,
         19: 9,
         20: 9,
+        21: 10,
     }
 
 

diff --git a/skl2onnx/operator_converters/one_hot_encoder.py b/skl2onnx/operator_converters/one_hot_encoder.py
@@ -45,13 +45,6 @@ def convert_sklearn_one_hot_encoder(
 
         enum_cats = []
         index_inputs = 0
-        to_drop = ohe_op._drop_idx_after_grouping
-        if to_drop is not None:
-            # raise NotImplementedError(
-            #    f"The converter is not implemented when "
-            #    f"_drop_idx_after_grouping is not None: {to_drop}."
-            # )
-            pass
 
         for index, cats in enumerate(ohe_op.categories_):
             filtered_cats = ohe_op._compute_transformed_categories(index)

diff --git a/tests/test_algebra_onnx_doc.py b/tests/test_algebra_onnx_doc.py
@@ -6,7 +6,7 @@
 from numpy.testing import assert_almost_equal
 import onnx
 from skl2onnx.algebra.onnx_ops import dynamic_class_creation
-from skl2onnx.algebra.automation import get_rst_doc_sklearn, get_rst_doc
+from skl2onnx.algebra.automation import get_rst_doc_sklearn
 from test_utils import TARGET_OPSET
 
 
@@ -43,27 +43,16 @@ def test_transpose2(self):
         res = self.predict_with_onnxruntime(model_def, X)
         assert_almost_equal(res["Y"], X)
 
-    @unittest.skipIf(
-        sys.platform.startswith("win"), reason="onnx schema are incorrect on Windows"
-    )
-    @unittest.skipIf(TARGET_OPSET <= 20, reason="not available")
-    def test_doc_onnx(self):
-        rst = get_rst_doc()
-        assert "**Summary**" in rst
-
     @unittest.skipIf(
         sys.platform.startswith("win"), reason="onnx schema are incorrect on Windows"
     )
     @unittest.skipIf(TARGET_OPSET <= 20, reason="not available")
     def test_doc_sklearn(self):
-        try:
-            rst = get_rst_doc_sklearn()
-            assert ".. _l-sklops-OnnxSklearnBernoulliNB:" in rst
-        except KeyError as e:
-            assert "SklearnGaussianProcessRegressor" in str(
-                e
-            ) or "SklearnGaussianProcessClassifier" in str(e)
+        rst = get_rst_doc_sklearn()
+        assert (
+            ".. _l-sklops-OnnxSklearnBernoulliNB:" in rst
+        ), f"Unable to find a substring in {rst}"
 
 
 if __name__ == "__main__":
-    unittest.main()
+    unittest.main(verbosity=2)
diff --git a/tests/test_sklearn_one_hot_encoder_converter.py b/tests/test_sklearn_one_hot_encoder_converter.py
@@ -472,6 +472,7 @@ def test_shape_inference_onnx(self):
     def test_shape_inference_onnxruntime(self):
         self._shape_inference("onnxruntime")
 
+    @unittest.skipIf(not skl12(), reason="sparse output not available")
     def test_min_frequency(self):
         data = pandas.DataFrame(
             [

diff --git a/tests/test_utils/__init__.py b/tests/test_utils/__init__.py
@@ -60,6 +60,8 @@ def create_tensor(N, C, H=None, W=None):
 
 
 def _get_ir_version(opv):
+    if opv >= 21:
+        return 10
     if opv >= 19:
         return 9
     if opv >= 15:
@@ -83,7 +85,11 @@ def max_onnxruntime_opset():
     <https://github.com/microsoft/onnxruntime/blob/main/docs/Versioning.md>`_.
     """
     vi = pv.Version(ort_version.split("+")[0])
-    if vi >= pv.Version("1.16.0"):
+    if vi >= pv.Version("1.18.0"):
+        return 21
+    if vi >= pv.Version("1.17.0"):
+        return 20
+    if vi >= pv.Version("1.15.0"):
         return 19
     if vi >= pv.Version("1.14.0"):
         return 18
@@ -120,6 +126,7 @@ def max_onnxruntime_opset():
     )
 )
 
+# opset-ml == 4 is still not implemented in onnxruntime
 value_ml = 3
 if TARGET_OPSET <= 16:
     # TreeEnsemble* for opset-ml == 3 is implemented in onnxruntime==1.12.0
-Original file line number
+Diff line change
@@ Expand Up / @@ -63,6 +63,7 @@ def _default_OPSET_TO_IR_VERSION(): @@
 : 8,
 : 9,
 : 9,
+: 10,
         }
@@ Expand Down @@