From 484ba1888b446ee89ac182c2be9aad8aff3f3afb Mon Sep 17 00:00:00 2001 From: Ryan Soley Date: Thu, 12 Oct 2023 14:31:33 -0400 Subject: [PATCH] add rubicon schema support (#393) * port `rubicon_schema` source over * formatting * add tests * add notebooks * update docs * add recent XGB changes * add recent LGBM changes * reset versions * linting & formatting --- MANIFEST.in | 3 + docs/source/api_reference.rst | 9 + docs/source/contribute-schema.rst | 67 +++ docs/source/index.rst | 5 + docs/source/schema-representation.rst | 440 ++++++++++++++++ environment.yml | 2 + .../logging-examples/log-with-schema.ipynb | 474 ++++++++++++++++++ .../register-custom-schema.ipynb | 356 +++++++++++++ notebooks/logging-examples/set-schema.ipynb | 193 +++++++ rubicon_ml/client/project.py | 3 +- rubicon_ml/schema/__init__.py | 10 + rubicon_ml/schema/logger.py | 221 ++++++++ rubicon_ml/schema/registry.py | 77 +++ .../schema/lightgbm__LGBMClassifier.yaml | 15 + .../schema/schema/lightgbm__LGBMModel.yaml | 75 +++ .../schema/lightgbm__LGBMRegressor.yaml | 9 + .../sklearn__RandomForestClassifier.yaml | 61 +++ .../schema/xgboost__DaskXGBClassifier.yaml | 9 + .../schema/xgboost__DaskXGBRegressor.yaml | 9 + .../schema/schema/xgboost__XGBClassifier.yaml | 9 + .../schema/schema/xgboost__XGBModel.yaml | 120 +++++ .../schema/schema/xgboost__XGBRegressor.yaml | 9 + setup.cfg | 1 + tests/fixtures.py | 280 ++++++++++- tests/integration/test_schema.py | 60 +++ tests/unit/schema/__init__.py | 0 tests/unit/schema/test_schema_logger.py | 355 +++++++++++++ tests/unit/schema/test_schema_registry.py | 56 +++ 28 files changed, 2921 insertions(+), 7 deletions(-) create mode 100644 docs/source/contribute-schema.rst create mode 100644 docs/source/schema-representation.rst create mode 100644 notebooks/logging-examples/log-with-schema.ipynb create mode 100644 notebooks/logging-examples/register-custom-schema.ipynb create mode 100644 notebooks/logging-examples/set-schema.ipynb create mode 100644 
rubicon_ml/schema/__init__.py create mode 100644 rubicon_ml/schema/logger.py create mode 100644 rubicon_ml/schema/registry.py create mode 100644 rubicon_ml/schema/schema/lightgbm__LGBMClassifier.yaml create mode 100644 rubicon_ml/schema/schema/lightgbm__LGBMModel.yaml create mode 100644 rubicon_ml/schema/schema/lightgbm__LGBMRegressor.yaml create mode 100644 rubicon_ml/schema/schema/sklearn__RandomForestClassifier.yaml create mode 100644 rubicon_ml/schema/schema/xgboost__DaskXGBClassifier.yaml create mode 100644 rubicon_ml/schema/schema/xgboost__DaskXGBRegressor.yaml create mode 100644 rubicon_ml/schema/schema/xgboost__XGBClassifier.yaml create mode 100644 rubicon_ml/schema/schema/xgboost__XGBModel.yaml create mode 100644 rubicon_ml/schema/schema/xgboost__XGBRegressor.yaml create mode 100644 tests/integration/test_schema.py create mode 100644 tests/unit/schema/__init__.py create mode 100644 tests/unit/schema/test_schema_logger.py create mode 100644 tests/unit/schema/test_schema_registry.py diff --git a/MANIFEST.in b/MANIFEST.in index 50ddf56a..ebcbfd06 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,7 @@ graft rubicon_ml/viz/assets graft rubicon_ml/viz/assets/css + include versioneer.py include rubicon_ml/_version.py + +recursive-include rubicon_ml/schema *.yaml diff --git a/docs/source/api_reference.rst b/docs/source/api_reference.rst index 4a93aee5..a300d208 100644 --- a/docs/source/api_reference.rst +++ b/docs/source/api_reference.rst @@ -87,6 +87,15 @@ RubiconJSON .. _library-reference-sklearn: +schema +====== + +.. automodule:: rubicon_ml.schema.logger + :members: + +.. automodule:: rubicon_ml.schema.registry + :members: + sklearn ======= ``rubicon_ml`` offers direct integration with **Scikit-learn** via our diff --git a/docs/source/contribute-schema.rst b/docs/source/contribute-schema.rst new file mode 100644 index 00000000..2b6c1a14 --- /dev/null +++ b/docs/source/contribute-schema.rst @@ -0,0 +1,67 @@ +.. 
_contribute-schema: + +Contribute a schema +******************* + +Consider the following schema that was created in the "Register a custom schema" section: + +.. code-block:: python + + extended_schema = { + "name": "sklearn__RandomForestClassifier__ext", + "extends": "sklearn__RandomForestClassifier", + + "parameters": [ + {"name": "runtime_environment", "value_env": "RUNTIME_ENV"}, + ], + } + +To contribute "sklearn__RandomForestClassifier__ext" to the ``rubicon_ml.schema`` registry, +first write the dictionary out to a YAML file. + +.. code-block:: python + + import yaml + + schema_filename = "sklearn__RandomForestClassifier__ext.yaml" + + with open(schema_filename, "w") as file: + file.write(yaml.dump(extended_schema)) + +Once "sklearn__RandomForestClassifier__ext.yaml" is created, follow the "Developer +instructions" to fork the rubicon-ml GitHub repository and prepare to make a contribution. + +From the root of the forked repository, copy the new schema into the library's schema directory: + +.. code-block:: bash + + cp [PATH_TO]/sklearn__RandomForestClassifier__ext.yaml rubicon_ml/schema/schema/ + +Then update **rubicon_ml/schema/registry.py**, adding the new schema to the +``RUBICON_SCHEMA_REGISTRY``: + +.. code-block:: python + + RUBICON_SCHEMA_REGISTRY = { + # other schema entries... + "sklearn__RandomForestClassifier__ext": lambda: _load_schema( + os.path.join("schema", "sklearn__RandomForestClassifier__ext.yaml") + ), + } + +Finally refer back to the "Contribute" section of the "Developer instructions" to push your +changes to GitHub and open a pull request. Once the pull request is merged, +"sklearn__RandomForestClassifier__ext" will be available in the next release of +``rubicon_ml``. + +Schema naming conventions +========================= + +When naming a schema that extends a schema already made available by ``rubicon_ml.schema``, simply +append a double-underscore and a unique identifier. 
The "sklearn__RandomForestClassifier__ext" +above is named following this convention. + +When naming a schema that represents an object that is not yet present in schema, +leverage the ``registry.get_schema_name`` function to generate a name. For example, if +you are making a schema for an object ``my_obj`` of class ``Model`` from a module ``my_model``, +``registry.get_schema_name(my_obj)`` will return the name "my_model__Model". diff --git a/docs/source/index.rst b/docs/source/index.rst index 63b0b5d4..a3494283 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -121,6 +121,7 @@ To install all extra modules, use the ``all`` extra. logging-examples/logging-training-metadata logging-examples/logging-plots logging-examples/logging-concurrently + logging-examples/log-with-schema logging-examples/tagging logging-examples/rubiconJSON-querying visualizations.rst @@ -136,6 +137,8 @@ To install all extra modules, use the ``all`` extra. integrations/integration-sklearn logging-examples/logging-feature-plots logging-examples/multiple-backend + logging-examples/register-custom-schema + logging-examples/set-schema logging-examples/visualizing-logged-dataframes .. toctree:: @@ -152,6 +155,7 @@ To install all extra modules, use the ``all`` extra. :caption: Reference api_reference.rst + schema-representation.rst .. toctree:: :maxdepth: 2 @@ -159,6 +163,7 @@ To install all extra modules, use the ``all`` extra. :caption: Community contributing.rst + contribute-schema.rst Changelog Feedback GitHub diff --git a/docs/source/schema-representation.rst b/docs/source/schema-representation.rst new file mode 100644 index 00000000..8653da5f --- /dev/null +++ b/docs/source/schema-representation.rst @@ -0,0 +1,440 @@ +.. 
_schema-representation: + +Representing model metadata with a schema +***************************************** + +A rubicon-ml schema is a YAML file defining how attributes of a Python object, generally +representing a model, will be logged to a rubicon-ml experiment. Schema can +be used to automatically instrument and standardize the rubicon-ml logging of commonly +used model objects. + +Schema are used to log experiments to an existing rubicon-ml project. +Experiments consist of features, parameters, metrics, artifacts, and dataframes. More info +on each of these can be found in rubicon-ml's glossary. + +A simple schema +=============== + +Consider the following objects from a module called ``my_model``: + +.. code-block:: python + + import pandas as pd + + class Optimizer: + def optimize(X, y, target): + self.optimized_ = True + + return "optimized" + + class Model: + def __init__(self, alpha=1e-3, gamma=1e-3): + self.alpha = alpha + self.gamma = gamma + + def fit(self, X, y): + self.optimizer = Optimizer() + self.target = "y" + + self.feature_names_in_ = X.columns + self.feature_importances_ = [1.0 / len(X.columns)] * len(X.columns) + + self.learned_attribute_ = optimizer.optimize(X, y, target) + + return self + + def score(self, X): + self.score_ = 1.0 + self.summary_ = pd.DataFrame( + [[self.alpha, self.gamma, self.learned_attribute_, self.score_]], + columns=["alpha", "gamma", "learned_attribute", "score"], + ) + + return self.score_ + +The following is a complete YAML representation of the ``Model`` object's schema: + +.. 
code-block:: yaml + + name: my_model__Model + version: 1.0.0 + + compatibility: + pandas: + max_version: + min_version: 1.0.5 + docs_url: https://my-docs.com/my-model/Model.html + + artifacts: + - self + - name: optimizer + data_object_attr: optimizer + dataframes: + - name: summary + df_attr: summary_ + features: + - names_attr: feature_names_in_ + importances_attr: feature_importances_ + optional: true + - name_attr: target + metrics: + - name: learned_attribute + value_attr: learned_attribute_ + optional: true + - name: score + value_attr: score_ + - name: env_metric + value_env: METRIC + parameters: + - name: alpha + value_attr: alpha + - name: gamma + value_attr: gamma + - name: env_param + value_env: PARAMETER + +Schema metadata +--------------- + +The first section of the schema defines metadata about the schema itself, +like the name and version. **The name of a schema should be the name of the +library the class it represents comes from and the name of the Python class itself separated +by a double underscore.** + +.. code-block:: yaml + + name: my_model__Model + version: 1.0.0 + +The next section defines any dependencies the model object has on external Python libraries. +Generally, this will be at least the library the object is imported from. Reference documentation +for the object to be logged can also be included in this section. + +.. code-block:: yaml + + compatibility: + pandas: + max_version: + min_version: 1.0.5 + docs_url: https://my-docs.com/my-model/Model.html + +The remaining sections define how the attributes of the object will be logged to the +``rubicon-ml`` experiment. In general, each section is a list of attributes to log to +``rubicon-ml`` with a name for the logged metadata and the name of the attribute +containing the value to log. + +Artifacts +--------- + +Define a :ref:`rubicon_ml.Artifact` +for logging by providing a ``name`` for the logged artifact and the attribute ``data_object_attr`` +containing the object to log. 
The special keyword ``self`` will log the full object the schema +represents as an artifact with the same name as the object's class. + +.. code-block:: yaml + + artifacts: + - self # logs this Model as an artifact named "Model" + - name: optimizer # logs Optimizer in `optimizer` attribute as an artifact named "optimizer" + data_object_attr: optimizer + +Dataframes +---------- + +Define a :ref:`rubicon_ml.Dataframe` +for logging by providing a ``name`` for the logged dataframe and the attribute ``df_attr`` +containing the DataFrame to log. + +.. code-block:: yaml + + dataframes: + - name: summary # logs DataFrame in `summary_` attribute as a dataframe named "summary" + df_attr: summary_ + +Features +-------- + +Define a single :ref:`rubicon_ml.Feature` +for logging by providing the attribute ``name_attr`` containing the name of the feature to log +and optionally the attribute ``importance_attr`` containing the feature's importance. + +Lists of features can be defined for logging with the attributes ``names_attr`` containing a +list of feature names to log and optionally ``importances_attr`` containing the corresponding +importances. + +.. code-block:: yaml + + features: + - names_attr: feature_names_in_ # for each value in the `feature_names_in_` attribute, logs a feature named that + # value with the corresponding importance in the `feature_importances_` attribute + importances_attr: feature_importances_ + optional: true + - name_attr: target # logs a feature named the value of the `target` attribute + +Metrics +------- + +Define a :ref:`rubicon_ml.Metric` +for logging by providing a ``name`` for the logged metric and the attribute ``value_attr`` +containing the metric value to log. + +Metric values can also be extracted from the runtime environment. Replace ``value_attr`` with ``value_env`` to +leverage ``os.environ`` to read the metric value from the available environment variables. + +.. 
code-block:: yaml + + metrics: + - name: learned_attribute # logs value in `learned_attribute_` attribute as a metric named "learned_attribute" + value_attr: learned_attribute_ + optional: true + - name: score # logs value in `score_` attribute as a metric named "score" + value_attr: score_ + - name: env_metric # logs value in `METRIC` environment variable as a metric named "env_metric" + value_env: METRIC + +Parameters +---------- + +Define a :ref:`rubicon_ml.Parameter` +for logging by providing a ``name`` for the logged parameter and the attribute ``value_attr`` +containing the parameter value to log. + +Parameter values can also be extracted from the runtime environment. Replace ``value_attr`` with ``value_env`` to +leverage ``os.environ`` to read the parameter value from the available environment variables. + +.. code-block:: yaml + + parameters: + - name: alpha # logs value in `alpha` attribute as a parameter named "alpha" + value_attr: alpha + - name: gamma # logs value in `gamma` attribute as a parameter named "gamma" + value_attr: gamma + - name: env_param # logs value in `PARAMETER` environment variable as a parameter named "env_param" + value_env: PARAMETER + +Optional attributes +=================== + +In some cases, the attribute containing the value to log may not always be set on the underlying object. A model +may have been trained on a dataset with no feature names, or perhaps some learned attributes are only learned +if certain parameters have certain values while fitting. + +By default, schema logging will raise an exception if the attribute to be logged is not set. To suppress the errors +and simply move on, items in the ``artifacts``, ``dataframes``, ``features``, ``metrics``, ``parameters`` and +``schema`` lists may optionally contain a key ``optional`` with a **true** value. 
+ +The ``feature_names_in_`` and ``learned_attribute_`` attributes are both marked optional in the example schema +above to handle cases where no feature names were present in the training data and ``learned_attribute_`` was +not learned: + +.. code-block:: yaml + + features: + - names_attr: feature_names_in_ + importances_attr: feature_importances_ + optional: true # will not error if `feature_importances_` attribute is not set + - name_attr: target # **will** error if `target` attribute is not set + metrics: + - name: learned_attribute + value_attr: learned_attribute_ + optional: true # will not error if `learned_attribute_` attribute is not set + +**Note:** Optional items in ``artifacts``, ``dataframes``, ``features``, and ``schema`` will omit the associated +entity from logging entirely if an optional attribute is not set. Optional items in ``metrics`` and ``parameters`` +will log the associated entity with the given name and a value of **None** if an optional attribute is not set. + +Nested schema +============= + +The following is a complete YAML representation of the ``Optimizer`` object's schema: + +.. code-block:: yaml + + name: my_model__Optimizer + version: 1.0.0 + + metrics: + - name: optimized + value_attr: optimized_ + +To apply another schema to one of the attributes of the original object, provide the schema ``name`` +to be retrieved via ``registry.get_schema`` and the attribute ``attr`` containing the +object to apply the schema to. + +.. code-block:: yaml + + schema: + - name: my_model__Optimizer # logs a metric according to the above schema using the object in `optimizer` + attr: optimizer + +**Note:** Nested schema will add the logged entities to the original experiment created by the parent schema, +not a new experiment. Nested schema cannot have names that conflict with the entities logged by the parent +schema. 
+ +The complete schema now looks like this and will log an additional metric ``optimized`` as defined by the +``Optimizer`` schema to the original experiment: + +.. code-block:: yaml + + name: my_model__Model + version: 1.0.0 + + compatibility: + pandas: + max_version: + min_version: 1.0.5 + docs_url: https://my-docs.com/my-model/Model.html + + artifacts: + - self + - name: optimizer + data_object_attr: optimizer + dataframes: + - name: summary + df_attr: summary_ + features: + - names_attr: feature_names_in_ + importances_attr: feature_importances_ + optional: true + - name_attr: target + metrics: + - name: learned_attribute + value_attr: learned_attribute_ + optional: true + - name: score + value_attr: score_ + - name: env_metric + value_env: METRIC + parameters: + - name: alpha + value_attr: alpha + - name: gamma + value_attr: gamma + - name: env_param + value_env: PARAMETER + schema: + - name: my_model__Optimizer + attr: optimizer + +Hierarchical schema +=================== + +Some objects may contain a list of other objects that are already represented by a schema, like +a feature eliminator or hyperparameter optimizer that trained multiple iterations of an underlying model +object. + +The ``children`` key can be provided to log each of these underlying objects to a **new experiment**. This +means that a single call to ``project.log_with_schema`` will log **1+n** experiments to ``project`` where +**n** is the number of objects in the list specified by ``children``. + +Within the ``children`` key, provide the schema ``name`` for the children objects to be retrieved via +``registry.get_schema`` and the attribute ``attr`` containing the list of child objects. + +.. 
code-block:: yaml + + children: + - name: my_model__Optimizer # defines the children's schema + attr: optimizers # logs an experiment according to the schema for each object in `optimizers` + +If we replace the nested schema from the previous example with a list of children that adhere to the same +``Optimizer`` schema, the complete schema now looks like this. It will log a single experiment for ``Model`` +containing all the information in the original ``Model`` schema, as well as an additional experiment as +defined by the ``Optimizer`` schema for each of the objects in ``Model``'s ``optimizers`` list. + +.. code-block:: yaml + + name: my_model__Model + version: 1.0.0 + + compatibility: + pandas: + max_version: + min_version: 1.0.5 + docs_url: https://my-docs.com/my-model/Model.html + + artifacts: + - self + - name: optimizer + data_object_attr: optimizer + children: + - name: my_model__Optimizer + attr: optimizers + dataframes: + - name: summary + df_attr: summary_ + features: + - names_attr: feature_names_in_ + importances_attr: feature_importances_ + optional: true + - name_attr: target + metrics: + - name: learned_attribute + value_attr: learned_attribute_ + optional: true + - name: score + value_attr: score_ + - name: env_metric + value_env: METRIC + parameters: + - name: alpha + value_attr: alpha + - name: gamma + value_attr: gamma + - name: env_param + value_env: PARAMETER + +Extending a schema +================== + +Consider an extension of ``Model`` named ``NewModel``: + +.. code-block:: python + + class NewModel(Model): + def __init__(self, alpha=1e-3, gamma=1e-3, delta=1e-3): + super().__init__(alpha=alpha, gamma=gamma) + + self.delta = delta + + def fit(self, X, y): + super().fit(X, y) + + self.other_learned_attribute_ = self.delta * self.learned_attribute_ + + return self + +To extend an existing schema, provide the name of the schema to extend as the +``extends`` key's value after the new schema's name. 
This new schema will log everything +in the schema represented by ``extends`` plus any additional values. + +.. code-block:: yaml + + name: my_model__NewModel + extends: my_model__Model + version: 1.0.0 + +The following is a complete YAML representation of the ``NewModel`` object's schema. +This schema will log everything that the ``Model`` schema would with the addition of the +``other_learned_attribute`` metric and ``delta`` parameter from ``NewModel``. + +.. code-block:: yaml + + name: my_model__NewModel + extends: my_model__Model + version: 1.0.0 + + compatibility: + pandas: + max_version: + min_version: 1.0.5 + docs_url: https://my-docs.com/my-model/NewModel.html + + metrics: + - name: other_learned_attribute + value_attr: other_learned_attribute_ + parameters: + - name: delta + value_attr: delta + +To see an extended schema in action, check out the "Register a custom +schema" section. diff --git a/environment.yml b/environment.yml index 38be381b..852a9523 100644 --- a/environment.yml +++ b/environment.yml @@ -32,9 +32,11 @@ dependencies: - ipykernel - isort - jupyterlab + - lightgbm - nbconvert - pytest - pytest-cov + - xgboost # for versioning - versioneer diff --git a/notebooks/logging-examples/log-with-schema.ipynb b/notebooks/logging-examples/log-with-schema.ipynb new file mode 100644 index 00000000..37eb2a37 --- /dev/null +++ b/notebooks/logging-examples/log-with-schema.ipynb @@ -0,0 +1,474 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "# Logging with a schema\n", + "\n", + "Create a ``rubicon_ml`` project" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from rubicon_ml import Rubicon\n", + "\n", + "rubicon = Rubicon(persistence=\"memory\", auto_git_enabled=True)\n", + "project = 
rubicon.create_project(name=\"apply schema\")\n", + "project" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train a ``RandomForestClassifier``\n", + "\n", + "Load a training dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from sklearn.datasets import load_wine\n", + "\n", + "X, y = load_wine(return_X_y=True, as_frame=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Train an instance of the model the schema represents" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RandomForestClassifier(ccp_alpha=0.005, criterion='log_loss',\n", + " max_features='log2', n_estimators=24, oob_score=True,\n", + " random_state=121)\n" + ] + } + ], + "source": [ + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "rfc = RandomForestClassifier(\n", + " ccp_alpha=5e-3,\n", + " criterion=\"log_loss\",\n", + " max_features=\"log2\",\n", + " n_estimators=24,\n", + " oob_score=True,\n", + " random_state=121,\n", + ")\n", + "rfc.fit(X, y)\n", + "\n", + "print(rfc)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Infer schema and log model metadata\n", + "\n", + "Log the model metadata defined in the applied schema to a new experiment in ``project`` with ``project.log_with_schema``\n", + "\n", + "**Note:** ``project.log_with_schema`` will infer the correct schema based on the given object to log" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "inferred schema name: sklearn__RandomForestClassifier\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + 
"source": [ + "experiment = project.log_with_schema(\n", + " rfc,\n", + " experiment_kwargs={ # additional kwargs to be passed to `project.log_experiment`\n", + " \"name\": \"log with schema\",\n", + " \"model_name\": \"RandomForestClassifier\",\n", + " \"description\": \"logged with the `RandomForestClassifier` `rubicon_schema`\",\n", + " },\n", + ")\n", + "\n", + "print(f\"inferred schema name: {project.schema_['name']}\")\n", + "experiment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## View the experiment's logged metadata\n", + "\n", + "Each experiment contains all the data represented in the schema - more information on the data captured by\n", + "a ``rubicon_schema`` can be found in the \"Representing model metadata with a ``rubicon_schema``\" section" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'project_name': 'apply schema',\n", + " 'id': 'ec4c3ead-3337-4623-9a97-c61f48e8de3d',\n", + " 'name': 'log with schema',\n", + " 'description': 'logged with the `RandomForestClassifier` `rubicon_schema`',\n", + " 'model_name': 'RandomForestClassifier',\n", + " 'branch_name': 'schema',\n", + " 'commit_hash': 'c9f696408a03c6a6fbf2fbff39fa48bbf722bae1',\n", + " 'training_metadata': None,\n", + " 'tags': [],\n", + " 'created_at': datetime.datetime(2023, 9, 25, 15, 47, 37, 552091)}" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vars(experiment._domain)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The features and their importances are logged as defined in the schema's \"features\" section" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'names_attr': 'feature_names_in_',\n", + " 'importances_attr': 'feature_importances_',\n", + " 'optional': 
True}]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "project.schema_[\"features\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "alcohol (0.1276831830349219)\n", + "malic_acid (0.03863837532736449)\n", + "ash (0.006168227239831861)\n", + "alcalinity_of_ash (0.025490751927615605)\n", + "magnesium (0.02935763050777937)\n", + "total_phenols (0.058427899304369986)\n", + "flavanoids (0.15309812550131274)\n", + "nonflavanoid_phenols (0.007414542189797497)\n", + "proanthocyanins (0.012615187741781065)\n", + "color_intensity (0.13608806341133572)\n", + "hue (0.0892558912217226)\n", + "od280/od315_of_diluted_wines (0.15604181694153108)\n", + "proline (0.15972030565063608)\n" + ] + } + ], + "source": [ + "for feature in experiment.features():\n", + " print(f\"{feature.name} ({feature.importance})\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Each parameter and its value are logged as defined in the schema's \"parameters\" section" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'name': 'bootstrap', 'value_attr': 'bootstrap'},\n", + " {'name': 'ccp_alpha', 'value_attr': 'ccp_alpha'},\n", + " {'name': 'class_weight', 'value_attr': 'class_weight'},\n", + " {'name': 'criterion', 'value_attr': 'criterion'},\n", + " {'name': 'max_depth', 'value_attr': 'max_depth'},\n", + " {'name': 'max_features', 'value_attr': 'max_features'},\n", + " {'name': 'min_impurity_decrease', 'value_attr': 'min_impurity_decrease'},\n", + " {'name': 'max_leaf_nodes', 'value_attr': 'max_leaf_nodes'},\n", + " {'name': 'max_samples', 'value_attr': 'max_samples'},\n", + " {'name': 'min_samples_split', 'value_attr': 'min_samples_split'},\n", + " {'name': 'min_samples_leaf', 'value_attr': 
'min_samples_leaf'},\n", + " {'name': 'min_weight_fraction_leaf',\n", + " 'value_attr': 'min_weight_fraction_leaf'},\n", + " {'name': 'n_estimators', 'value_attr': 'n_estimators'},\n", + " {'name': 'oob_score', 'value_attr': 'oob_score'},\n", + " {'name': 'random_state', 'value_attr': 'random_state'}]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "project.schema_[\"parameters\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "bootstrap: True\n", + "ccp_alpha: 0.005\n", + "class_weight: None\n", + "criterion: log_loss\n", + "max_depth: None\n", + "max_features: log2\n", + "min_impurity_decrease: 0.0\n", + "max_leaf_nodes: None\n", + "max_samples: None\n", + "min_samples_split: 2\n", + "min_samples_leaf: 1\n", + "min_weight_fraction_leaf: 0.0\n", + "n_estimators: 24\n", + "oob_score: True\n", + "random_state: 121\n" + ] + } + ], + "source": [ + "for parameter in experiment.parameters():\n", + " print(f\"{parameter.name}: {parameter.value}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Each metric and its value are logged as defined in the schema's \"metrics\" section" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'name': 'classes', 'value_attr': 'classes_'},\n", + " {'name': 'n_classes', 'value_attr': 'n_classes_'},\n", + " {'name': 'n_features_in', 'value_attr': 'n_features_in_'},\n", + " {'name': 'n_outputs', 'value_attr': 'n_outputs_'},\n", + " {'name': 'oob_decision_function',\n", + " 'value_attr': 'oob_decision_function_',\n", + " 'optional': True},\n", + " {'name': 'oob_score', 'value_attr': 'oob_score_', 'optional': True}]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"project.schema_[\"metrics\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "classes: ...\n", + "n_classes: 3\n", + "n_features_in: 13\n", + "n_outputs: 1\n", + "oob_decision_function: ...\n", + "oob_score: 0.9775280898876404\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "\n", + "for metric in experiment.metrics():\n", + " if np.isscalar(metric.value):\n", + " print(f\"{metric.name}: {metric.value}\")\n", + " else: # don't print long metrics\n", + " print(f\"{metric.name}: ...\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A copy of the trained model is logged as defined in the schema's \"artifacts\" section" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['self']" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "project.schema_[\"artifacts\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RandomForestClassifier:\n", + "RandomForestClassifier(ccp_alpha=0.005, criterion='log_loss',\n", + " max_features='log2', n_estimators=24, oob_score=True,\n", + " random_state=121)\n" + ] + } + ], + "source": [ + "for artifact in experiment.artifacts():\n", + " print(f\"{artifact.name}:\\n{artifact.get_data(unpickle=True)}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + 
"nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/logging-examples/register-custom-schema.ipynb b/notebooks/logging-examples/register-custom-schema.ipynb new file mode 100644 index 00000000..c3a95b1a --- /dev/null +++ b/notebooks/logging-examples/register-custom-schema.ipynb @@ -0,0 +1,356 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6898ac1c-c6e3-40d8-a787-c70f2b4e0b03", + "metadata": { + "tags": [] + }, + "source": [ + "# Register a custom schema\n", + "\n", + "``rubicon_schema`` can be constructed within a Python session in addition to being read from\n", + "the registry's YAML files\n", + "\n", + "## Define additional metadata to log\n", + "\n", + "Add an additional variable to the environment to record with our ``rubicon_schema``" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "75a9fb48-2c0f-4fdc-91df-9105fde2892f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AWS\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "os.environ[\"RUNTIME_ENV\"] = \"AWS\"\n", + "\n", + "! echo $RUNTIME_ENV" + ] + }, + { + "cell_type": "markdown", + "id": "6ce08db5-c532-4f1a-9357-61a6b3d1eadd", + "metadata": {}, + "source": [ + "## Construct a custom schema\n", + "\n", + "Create a dictionary representation of the new, custom schema. 
This new schema will extend\n", + "the existing ``RandomForestClassifier`` schema with an additional parameter that logs the\n", + "new environment variable\n", + "\n", + "**Note:** The ``extends`` key is not required - custom schema do not need to extend existing schema" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "020156b3-d0b2-4b99-8c5a-bfb60e02612d", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'extends': 'sklearn__RandomForestClassifier',\n", + " 'name': 'sklearn__RandomForestClassifier__ext',\n", + " 'parameters': [{'name': 'runtime_environment', 'value_env': 'RUNTIME_ENV'}]}\n" + ] + } + ], + "source": [ + "import pprint\n", + "\n", + "extended_schema = {\n", + " \"name\": \"sklearn__RandomForestClassifier__ext\",\n", + " \"extends\": \"sklearn__RandomForestClassifier\", \n", + "\n", + " \"parameters\": [\n", + " {\"name\": \"runtime_environment\", \"value_env\": \"RUNTIME_ENV\"},\n", + " ],\n", + "}\n", + "pprint.pprint(extended_schema)" + ] + }, + { + "cell_type": "markdown", + "id": "ae6d5dfa-e511-4901-ae61-f6f2aea55cb0", + "metadata": {}, + "source": [ + "## Apply a custom schema to a project\n", + "\n", + "Create a ``rubicon_ml`` project" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "6fde24fc-1ab3-49fc-8ea4-fb3573e3bb29", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from rubicon_ml import Rubicon\n", + "\n", + "rubicon = Rubicon(persistence=\"memory\", auto_git_enabled=True)\n", + "project = rubicon.create_project(name=\"apply schema\")\n", + "project" + ] + }, + { + "cell_type": "markdown", + "id": "b9c46f5b-27da-42bb-b2d2-4761e35cdd4a", + "metadata": { + "tags": [] + }, + "source": [ + "Apply the custom schema to the project" + ] + }, + { + "cell_type": "code", + "execution_count": 4, 
+ "id": "e91c3d60-806a-49d4-a4b8-ec13489e6a11", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "project.set_schema(extended_schema)" + ] + }, + { + "cell_type": "markdown", + "id": "885b3e61-7875-445e-993f-3359bd4bb7ad", + "metadata": {}, + "source": [ + "## Log model metadata with a custom schema\n", + "\n", + "Load a training dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f71158d7-208d-4094-92b9-94b49a45cb6b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from sklearn.datasets import load_wine\n", + "\n", + "X, y = load_wine(return_X_y=True, as_frame=True)" + ] + }, + { + "cell_type": "markdown", + "id": "7b779808-771f-4c40-8250-a347e3b67c19", + "metadata": { + "tags": [] + }, + "source": [ + "Train an instance of the model the schema represents" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "838d254b-de2a-4155-909b-707728f343d9", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RandomForestClassifier(ccp_alpha=0.005, criterion='log_loss',\n", + " max_features='log2', n_estimators=24, oob_score=True,\n", + " random_state=121)\n" + ] + } + ], + "source": [ + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "rfc = RandomForestClassifier(\n", + " ccp_alpha=5e-3,\n", + " criterion=\"log_loss\",\n", + " max_features=\"log2\",\n", + " n_estimators=24,\n", + " oob_score=True,\n", + " random_state=121,\n", + ")\n", + "rfc.fit(X, y)\n", + "\n", + "print(rfc)" + ] + }, + { + "cell_type": "markdown", + "id": "60e4c75b-1b92-4b8e-b938-81603162f2f4", + "metadata": {}, + "source": [ + "Log the model metadata defined in the base ``RandomForestClassifier`` schema plus the additional parameter\n", + "from the environment to a new experiment in ``project`` with ``project.log_with_schema``" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "34341f0d-32a8-4a39-aaf6-dad8ccc8bf1b", + "metadata": { + 
"tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "experiment = project.log_with_schema(\n", + " rfc,\n", + " experiment_kwargs={\n", + " \"name\": \"log with extended schema\",\n", + " \"model_name\": \"RandomForestClassifier\",\n", + " \"description\": \"logged with an extended `rubicon_schema`\",\n", + " },\n", + ")\n", + "experiment" + ] + }, + { + "cell_type": "markdown", + "id": "793daa8e-693b-4d2e-8c31-c71cd236291e", + "metadata": {}, + "source": [ + "## View the experiment's logged metadata\n", + "\n", + "Each experiment contains all the data represented in the base ``RandomForestClassifier`` schema plus the\n", + "additional parameter from the environment" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "c656f695-5a30-4333-9aa8-a206f52a6d31", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "bootstrap: True\n", + "ccp_alpha: 0.005\n", + "class_weight: None\n", + "criterion: log_loss\n", + "max_depth: None\n", + "max_features: log2\n", + "min_impurity_decrease: 0.0\n", + "max_leaf_nodes: None\n", + "max_samples: None\n", + "min_samples_split: 2\n", + "min_samples_leaf: 1\n", + "min_weight_fraction_leaf: 0.0\n", + "n_estimators: 24\n", + "oob_score: True\n", + "random_state: 121\n", + "runtime_environment: AWS\n" + ] + } + ], + "source": [ + "for parameter in experiment.parameters():\n", + " print(f\"{parameter.name}: {parameter.value}\")" + ] + }, + { + "cell_type": "markdown", + "id": "d33f1585-00b3-42e6-9741-ac849a6cc8a9", + "metadata": {}, + "source": [ + "Don't forget to clean up" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "ec8e4159-0f97-4c4d-923a-b8f283184b66", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "del os.environ[\"RUNTIME_ENV\"]" + ] + }, + { + "cell_type": "markdown", + "id": 
"d4757cb2-00e8-4ba1-aa64-04959dfea5d8", + "metadata": {}, + "source": [ + "## Persisting and sharing a custom schema\n", + "\n", + "To share custom schema with all ``rubicon_schema`` users, check out the \"Contribute a ``rubicon_schema``\" section" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/logging-examples/set-schema.ipynb b/notebooks/logging-examples/set-schema.ipynb new file mode 100644 index 00000000..64285a53 --- /dev/null +++ b/notebooks/logging-examples/set-schema.ipynb @@ -0,0 +1,193 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "# Set a schema on a project\n", + "\n", + "\"Log a ``rubicon_ml`` experiment with a ``rubicon_schema``\" showed how ``rubicon_schema`` can\n", + "infer schema from the object to log - sometimes, this may not be possible and a schema may need to be set manually\n", + "\n", + "## Select a schema\n", + "\n", + "View all available schema" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['sklearn__RandomForestClassifier',\n", + " 'xgboost__XGBClassifier',\n", + " 'xgboost__DaskXGBClassifier']" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from rubicon_ml.schema import registry\n", + "\n", + "available_schema = registry.available_schema()\n", + "available_schema" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load a schema" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + 
"metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'artifacts': ['self'],\n", + " 'compatibility': {'scikit-learn': {'max_version': None,\n", + " 'min_version': '1.0.2'}},\n", + " 'docs_url': 'https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html',\n", + " 'features': [{'importances_attr': 'feature_importances_',\n", + " 'names_attr': 'feature_names_in_',\n", + " 'optional': True}],\n", + " 'metrics': [{'name': 'classes', 'value_attr': 'classes_'},\n", + " {'name': 'n_classes', 'value_attr': 'n_classes_'},\n", + " {'name': 'n_features_in', 'value_attr': 'n_features_in_'},\n", + " {'name': 'n_outputs', 'value_attr': 'n_outputs_'},\n", + " {'name': 'oob_decision_function',\n", + " 'optional': True,\n", + " 'value_attr': 'oob_decision_function_'},\n", + " {'name': 'oob_score',\n", + " 'optional': True,\n", + " 'value_attr': 'oob_score_'}],\n", + " 'name': 'sklearn__RandomForestClassifier',\n", + " 'parameters': [{'name': 'bootstrap', 'value_attr': 'bootstrap'},\n", + " {'name': 'ccp_alpha', 'value_attr': 'ccp_alpha'},\n", + " {'name': 'class_weight', 'value_attr': 'class_weight'},\n", + " {'name': 'criterion', 'value_attr': 'criterion'},\n", + " {'name': 'max_depth', 'value_attr': 'max_depth'},\n", + " {'name': 'max_features', 'value_attr': 'max_features'},\n", + " {'name': 'min_impurity_decrease',\n", + " 'value_attr': 'min_impurity_decrease'},\n", + " {'name': 'max_leaf_nodes', 'value_attr': 'max_leaf_nodes'},\n", + " {'name': 'max_samples', 'value_attr': 'max_samples'},\n", + " {'name': 'min_samples_split',\n", + " 'value_attr': 'min_samples_split'},\n", + " {'name': 'min_samples_leaf', 'value_attr': 'min_samples_leaf'},\n", + " {'name': 'min_weight_fraction_leaf',\n", + " 'value_attr': 'min_weight_fraction_leaf'},\n", + " {'name': 'n_estimators', 'value_attr': 'n_estimators'},\n", + " {'name': 'oob_score', 'value_attr': 'oob_score'},\n", + " {'name': 
'random_state', 'value_attr': 'random_state'}],\n", + " 'verison': '1.0.0'}\n" + ] + } + ], + "source": [ + "import pprint\n", + "\n", + "rfc_schema = registry.get_schema(\"sklearn__RandomForestClassifier\")\n", + "pprint.pprint(rfc_schema)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Apply the schema to a project\n", + "\n", + "Create a ``rubicon_ml`` project" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from rubicon_ml import Rubicon\n", + "\n", + "rubicon = Rubicon(persistence=\"memory\")\n", + "project = rubicon.create_project(name=\"apply schema\")\n", + "project" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set the schema on the project" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "project.set_schema(rfc_schema)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, ``log_with_schema`` will leverage the schema ``rfc_schema`` instead of trying to infer one" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/rubicon_ml/client/project.py b/rubicon_ml/client/project.py index 62543b76..e5ddb067 100644 --- a/rubicon_ml/client/project.py +++ b/rubicon_ml/client/project.py @@ -12,6 +12,7 @@ from rubicon_ml.client.utils.exception_handling import failsafe from rubicon_ml.client.utils.tags import 
filter_children from rubicon_ml.exceptions import RubiconException +from rubicon_ml.schema.logger import SchemaMixin if TYPE_CHECKING: from rubicon_ml import Rubicon @@ -19,7 +20,7 @@ from rubicon_ml.domain import Project as ProjectDomain -class Project(Base, ArtifactMixin, DataframeMixin): +class Project(Base, ArtifactMixin, DataframeMixin, SchemaMixin): """A client project. A `project` is a collection of `experiments`, diff --git a/rubicon_ml/schema/__init__.py b/rubicon_ml/schema/__init__.py new file mode 100644 index 00000000..d8c2df5b --- /dev/null +++ b/rubicon_ml/schema/__init__.py @@ -0,0 +1,10 @@ +"""``schema`` submodule initialization.""" + +from rubicon_ml.schema.registry import ( + available_schema, + get_schema, + get_schema_name, + register_schema, +) + +__all__ = ["available_schema", "get_schema", "get_schema_name", "register_schema"] diff --git a/rubicon_ml/schema/logger.py b/rubicon_ml/schema/logger.py new file mode 100644 index 00000000..cfd83ae8 --- /dev/null +++ b/rubicon_ml/schema/logger.py @@ -0,0 +1,221 @@ +"""Methods and a mixin to enable schema logging. + +The functions available in the ``schema`` submodule are applied to +``rubicon_ml.Project`` via the ``SchemaMixin`` class. They can be +called directly as a method of an existing project. 
+""" + +import os +from contextlib import contextmanager +from typing import Any, Dict, Optional + +from rubicon_ml.client.experiment import Experiment +from rubicon_ml.exceptions import RubiconException +from rubicon_ml.schema import registry + + +def _get_value(obj, entity_schema): + optional = entity_schema.get("optional", False) + value = None + + if "value_attr" in entity_schema: + value = _safe_getattr(obj, entity_schema["value_attr"], optional) + if "value_env" in entity_schema: + value = _safe_environ(entity_schema["value_env"], optional) + if "value_func" in entity_schema: + value = _safe_call_func(obj, entity_schema["value_func"], optional) + + return value + + +def _get_df(obj, entity_schema): + optional = entity_schema.get("optional", False) + df_value = None + + if "df_attr" in entity_schema: + df_value = _safe_getattr(obj, entity_schema["df_attr"], optional) + if "df_func" in entity_schema: + df_value = _safe_call_func(obj, entity_schema["df_func"], optional) + + return df_value + + +def _get_data_object(obj, entity_schema): + optional = entity_schema.get("optional", False) + data_object = None + + if "data_object_func" in entity_schema: + data_object = _safe_call_func(obj, entity_schema["data_object_func"], optional) + elif "data_object_attr" in entity_schema: + data_object = _safe_getattr(obj, entity_schema["data_object_attr"], optional) + + return data_object + + +def _safe_getattr(obj, attr, optional, default=None): + try: + value = getattr(obj, attr) + except (TypeError, AttributeError) as err: + if optional or (attr is None and isinstance(err, TypeError)): + return default + + raise err + + return value + + +def _safe_environ(environ_var, optional, default=None): + try: + value = os.environ[environ_var] + except KeyError as err: + if optional: + return default + + raise RubiconException(f"Environment variable '{environ_var}' not set.") from err + + return value + + +def _safe_call_func(obj, func, optional, default=None): + method = 
_safe_getattr(obj, func, optional, default) + value = None + + if method is not None: + try: + value = method() + except Exception as err: + if optional: + return default + + raise err + + return value + + +@contextmanager +def _set_temporary_schema(project, schema_name): + original_schema = project.schema_ + project.set_schema(registry.get_schema(schema_name)) + + yield + + project.set_schema(original_schema) + + +class SchemaMixin: + """Adds schema logging support to a client object.""" + + def log_with_schema( + self, + obj: Any, + experiment: Experiment = None, + experiment_kwargs: Optional[Dict[str, Any]] = None, + ) -> Any: + """Log an experiment leveraging ``self.schema_``.""" + + if not hasattr(self, "schema_"): + try: + schema_name = registry.get_schema_name(obj) + self.schema_ = registry.get_schema(schema_name) + except ValueError as err: + raise ValueError( + f"No schema set and no schema could be inferred from object {obj}. " + f"Set a schema with `Project.set_schema(schema)`." 
+ ) from err + + if experiment_kwargs is None: + experiment_kwargs = {} + + if experiment is None: + experiment = self.log_experiment(**experiment_kwargs) + + base_schema_name = self.schema_.get("extends") + if base_schema_name is not None: + with _set_temporary_schema(self, base_schema_name): + self.log_with_schema(obj, experiment=experiment) + + for feature in self.schema_.get("features", []): + is_optional = feature.get("optional", False) + + if "names_attr" in feature: + feature_names = _safe_getattr(obj, feature["names_attr"], is_optional) + + if feature_names is not None: + feature_importances = _safe_getattr( + obj, + feature.get("importances_attr"), + is_optional, + default=[None] * len(feature["names_attr"]), + ) + + for name, importance in zip(feature_names, feature_importances): + experiment.log_feature(name=name, importance=importance) + + elif "name_attr" in feature: + feature_name = _safe_getattr(obj, feature["name_attr"], is_optional) + + if feature_name is not None: + feature_importance = _safe_getattr( + obj, feature.get("importance_attr"), is_optional + ) + + experiment.log_feature(name=feature_name, importance=feature_importance) + + for parameter in self.schema_.get("parameters", []): + experiment.log_parameter( + name=parameter["name"], + value=_get_value(obj, parameter), + ) + + for metric in self.schema_.get("metrics", []): + experiment.log_metric( + name=metric["name"], + value=_get_value(obj, metric), + ) + for artifact in self.schema_.get("artifacts", []): + if isinstance(artifact, str): + if artifact == "self": + experiment.log_artifact(name=obj.__class__.__name__, data_object=obj) + elif isinstance(artifact, dict): + data_object = _get_data_object(obj, artifact) + if data_object is not None: + experiment.log_artifact(name=artifact["name"], data_object=data_object) + + for dataframe in self.schema_.get("dataframes", []): + df_value = _get_df(obj, dataframe) + + if df_value is not None: + experiment.log_dataframe(df=df_value, 
name=dataframe["name"]) + + for schema in self.schema_.get("schema", []): + object_to_log = _safe_getattr(obj, schema["attr"], schema.get("optional", False)) + + if object_to_log is not None: + with _set_temporary_schema(self, schema["name"]): + self.log_with_schema(object_to_log, experiment=experiment) + + has_children = False + + for children in self.schema_.get("children", []): + children_objects = _safe_getattr( + obj, children["attr"], children.get("optional", False), default=[] + ) + + for child in children_objects: + has_children = True + + child_experiment = self.log_experiment(**experiment_kwargs) + child_experiment.add_tags(tags=["child", f"parent_id:{experiment.id}"]) + + with _set_temporary_schema(self, children["name"]): + self.log_with_schema(child, experiment=child_experiment) + + if has_children: + experiment.add_tags(tags=["parent"]) + + return experiment + + def set_schema(self, schema: Dict[str, Any]) -> None: + """Set the schema for this client object.""" + + self.schema_ = schema diff --git a/rubicon_ml/schema/registry.py b/rubicon_ml/schema/registry.py new file mode 100644 index 00000000..ad73c4f1 --- /dev/null +++ b/rubicon_ml/schema/registry.py @@ -0,0 +1,77 @@ +"""Methods for interacting with the existing rubicon-ml ``schema``.""" + +import os +from typing import Any, List + +import yaml + +RUBICON_SCHEMA_REGISTRY = { + "lightgbm__LGBMModel": lambda: _load_schema(os.path.join("schema", "lightgbm__LGBMModel.yaml")), + "lightgbm__LGBMClassifier": lambda: _load_schema( + os.path.join("schema", "lightgbm__LGBMClassifier.yaml") + ), + "lightgbm__LGBMRegressor": lambda: _load_schema( + os.path.join("schema", "lightgbm__LGBMRegressor.yaml") + ), + "sklearn__RandomForestClassifier": lambda: _load_schema( + os.path.join("schema", "sklearn__RandomForestClassifier.yaml") + ), + "xgboost__XGBModel": lambda: _load_schema(os.path.join("schema", "xgboost__XGBModel.yaml")), + "xgboost__XGBClassifier": lambda: _load_schema( + os.path.join("schema", 
"xgboost__XGBClassifier.yaml") + ), + "xgboost__XGBRegressor": lambda: _load_schema( + os.path.join("schema", "xgboost__XGBRegressor.yaml") + ), + "xgboost__DaskXGBClassifier": lambda: _load_schema( + os.path.join("schema", "xgboost__DaskXGBClassifier.yaml") + ), + "xgboost__DaskXGBRegressor": lambda: _load_schema( + os.path.join("schema", "xgboost__DaskXGBRegressor.yaml") + ), +} + + +def _load_schema(path: str) -> Any: + """Loads a schema YAML file from ``path`` relative to this file.""" + + full_path = os.path.join(os.path.dirname(__file__), path) + with open(full_path, "r") as file: + schema = yaml.safe_load(file) + + return schema + + +def available_schema() -> List[str]: + """Get the names of all available schema.""" + + return list(RUBICON_SCHEMA_REGISTRY.keys()) + + +def get_schema(name: str) -> Any: + """Get the schema with name ``name``.""" + + if name not in RUBICON_SCHEMA_REGISTRY: + raise ValueError( + f"'{name}' is not the name of an available rubicon schema. " + "For a list of schema names, use `registry.available_schema()`." 
+ ) + + return RUBICON_SCHEMA_REGISTRY[name]() + + +def get_schema_name(obj: Any) -> str: + """Get the name of the schema that represents object ``obj``.""" + + obj_cls = obj.__class__ + + cls_name = obj_cls.__name__ + module_name = obj_cls.__module__.split(".")[0] + + return f"{module_name}__{cls_name}" + + +def register_schema(name: str, schema: dict): + """Add a schema to the schema registry.""" + + RUBICON_SCHEMA_REGISTRY[name] = lambda: schema diff --git a/rubicon_ml/schema/schema/lightgbm__LGBMClassifier.yaml b/rubicon_ml/schema/schema/lightgbm__LGBMClassifier.yaml new file mode 100644 index 00000000..b4990e09 --- /dev/null +++ b/rubicon_ml/schema/schema/lightgbm__LGBMClassifier.yaml @@ -0,0 +1,15 @@ +name: lightgbm__LGBMClassifier +extends: lightgbm__LGBMModel +version: 1.0.0 + +compatibility: + lightgbm: + max_version: + min_version: 3.1.1 +docs_url: https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMModel.html#lightgbm.LGBMClassifier + +metrics: + - name: classes + value_attr: classes_ + - name: n_classes + value_attr: n_classes_ diff --git a/rubicon_ml/schema/schema/lightgbm__LGBMModel.yaml b/rubicon_ml/schema/schema/lightgbm__LGBMModel.yaml new file mode 100644 index 00000000..a501c1f3 --- /dev/null +++ b/rubicon_ml/schema/schema/lightgbm__LGBMModel.yaml @@ -0,0 +1,75 @@ +name: lightgbm__LGBMModel +version: 1.0.0 + +compatibility: + lightgbm: + max_version: + min_version: 3.1.1 +docs_url: https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMModel.html#lightgbm.LGBMModel + +artifacts: + - self + - name: booster + data_object_attr: booster_ +features: + - names_attr: feature_name_ + importances_attr: feature_importances_ + optional: true +metrics: + - name: best_iteration + value_attr: best_iteration_ + optional: true + - name: best_score + value_attr: best_score_ + optional: true + - name: evals_result + value_attr: evals_result_ + optional: true + - name: n_features + value_attr: n_features_ + - name: n_features_in + value_attr: 
n_features_in_ + - name: n_iter + value_attr: n_iter_ + optional: true + - name: objective + value_attr: objective_ +parameters: + - name: boosting_type + value_attr: boosting_type + - name: num_leaves + value_attr: num_leaves + - name: max_depth + value_attr: max_depth + - name: learning_rate + value_attr: learning_rate + - name: n_estimators + value_attr: n_estimators + - name: subsample_for_bin + value_attr: subsample_for_bin + - name: objective + value_attr: objective + - name: class_weight + value_attr: class_weight + - name: min_split_gain + value_attr: min_split_gain + - name: min_child_weight + value_attr: min_child_weight + - name: min_child_samples + value_attr: min_child_samples + - name: subsample + value_attr: subsample + - name: subsample_freq + value_attr: subsample_freq + - name: colsample_bytree + value_attr: colsample_bytree + - name: reg_alpha + value_attr: reg_alpha + - name: reg_lambda + value_attr: reg_lambda + - name: random_state + value_attr: random_state + - name: n_jobs + value_attr: n_jobs + - name: importance_type + value_attr: importance_type diff --git a/rubicon_ml/schema/schema/lightgbm__LGBMRegressor.yaml b/rubicon_ml/schema/schema/lightgbm__LGBMRegressor.yaml new file mode 100644 index 00000000..39c67532 --- /dev/null +++ b/rubicon_ml/schema/schema/lightgbm__LGBMRegressor.yaml @@ -0,0 +1,9 @@ +name: lightgbm__LGBMRegressor +extends: lightgbm__LGBMModel +version: 1.0.0 + +compatibility: + lightgbm: + max_version: + min_version: 3.1.1 +docs_url: https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMModel.html#lightgbm.LGBMRegressor diff --git a/rubicon_ml/schema/schema/sklearn__RandomForestClassifier.yaml b/rubicon_ml/schema/schema/sklearn__RandomForestClassifier.yaml new file mode 100644 index 00000000..028e9c00 --- /dev/null +++ b/rubicon_ml/schema/schema/sklearn__RandomForestClassifier.yaml @@ -0,0 +1,61 @@ +name: sklearn__RandomForestClassifier +version: 1.0.0 + +compatibility: + scikit-learn: + max_version: + 
min_version: 1.0.2 +docs_url: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html + +artifacts: + - self +features: + - names_attr: feature_names_in_ + importances_attr: feature_importances_ + optional: true +metrics: + - name: classes + value_attr: classes_ + - name: n_classes + value_attr: n_classes_ + - name: n_features_in + value_attr: n_features_in_ + - name: n_outputs + value_attr: n_outputs_ + - name: oob_decision_function + value_attr: oob_decision_function_ + optional: true + - name: oob_score + value_attr: oob_score_ + optional: true +parameters: + - name: bootstrap + value_attr: bootstrap + - name: ccp_alpha + value_attr: ccp_alpha + - name: class_weight + value_attr: class_weight + - name: criterion + value_attr: criterion + - name: max_depth + value_attr: max_depth + - name: max_features + value_attr: max_features + - name: min_impurity_decrease + value_attr: min_impurity_decrease + - name: max_leaf_nodes + value_attr: max_leaf_nodes + - name: max_samples + value_attr: max_samples + - name: min_samples_split + value_attr: min_samples_split + - name: min_samples_leaf + value_attr: min_samples_leaf + - name: min_weight_fraction_leaf + value_attr: min_weight_fraction_leaf + - name: n_estimators + value_attr: n_estimators + - name: oob_score + value_attr: oob_score + - name: random_state + value_attr: random_state diff --git a/rubicon_ml/schema/schema/xgboost__DaskXGBClassifier.yaml b/rubicon_ml/schema/schema/xgboost__DaskXGBClassifier.yaml new file mode 100644 index 00000000..ae54ebe1 --- /dev/null +++ b/rubicon_ml/schema/schema/xgboost__DaskXGBClassifier.yaml @@ -0,0 +1,9 @@ +name: xgboost__DaskXGBClassifier +extends: xgboost__XGBModel +version: 1.0.0 + +compatibility: + xgboost: + max_version: + min_version: 1.7.0 +docs_url: https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.dask.DaskXGBClassifier diff --git a/rubicon_ml/schema/schema/xgboost__DaskXGBRegressor.yaml 
b/rubicon_ml/schema/schema/xgboost__DaskXGBRegressor.yaml new file mode 100644 index 00000000..be3e93e4 --- /dev/null +++ b/rubicon_ml/schema/schema/xgboost__DaskXGBRegressor.yaml @@ -0,0 +1,9 @@ +name: xgboost__DaskXGBRegressor +extends: xgboost__XGBModel +version: 1.0.0 + +compatibility: + xgboost: + max_version: + min_version: 1.7.0 +docs_url: https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.dask.DaskXGBRegressor diff --git a/rubicon_ml/schema/schema/xgboost__XGBClassifier.yaml b/rubicon_ml/schema/schema/xgboost__XGBClassifier.yaml new file mode 100644 index 00000000..6d463629 --- /dev/null +++ b/rubicon_ml/schema/schema/xgboost__XGBClassifier.yaml @@ -0,0 +1,9 @@ +name: xgboost__XGBClassifier +extends: xgboost__XGBModel +version: 1.0.0 + +compatibility: + xgboost: + max_version: + min_version: 1.7.0 +docs_url: https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBClassifier diff --git a/rubicon_ml/schema/schema/xgboost__XGBModel.yaml b/rubicon_ml/schema/schema/xgboost__XGBModel.yaml new file mode 100644 index 00000000..74844434 --- /dev/null +++ b/rubicon_ml/schema/schema/xgboost__XGBModel.yaml @@ -0,0 +1,120 @@ +name: xgboost__XGBModel +version: 1.0.0 + +compatibility: + xgboost: + max_version: + min_version: 1.7.0 +docs_url: https://xgboost.readthedocs.io/en/stable/python/python_api.html + +artifacts: + - self + - name: booster + data_object_func: get_booster +features: + - names_attr: feature_names_in_ + importances_attr: feature_importances_ + optional: true +metrics: + - name: best_iteration + value_attr: best_iteration + optional: true + - name: best_score + value_attr: best_score + optional: true + - name: coef_ + value_attr: coef_ + optional: true + - name: intercept_ + value_attr: intercept_ + optional: true + - name: n_features_in_ + value_attr: n_features_in_ + - name: evals_result + value_func: evals_result + optional: true + - name: num_boosting_rounds + value_func: get_num_boosting_rounds +parameters: + 
- name: n_estimators + value_attr: n_estimators + - name: max_depth + value_attr: max_depth + - name: max_leaves + value_attr: max_leaves + - name: max_bin + value_attr: max_bin + - name: grow_policy + value_attr: grow_policy + - name: learning_rate + value_attr: learning_rate + - name: verbosity + value_attr: verbosity + - name: objective + value_attr: objective + - name: booster + value_attr: booster + - name: tree_method + value_attr: tree_method + - name: n_jobs + value_attr: n_jobs + - name: gamma + value_attr: gamma + - name: min_child_weight + value_attr: min_child_weight + - name: max_delta_step + value_attr: max_delta_step + - name: subsample + value_attr: subsample + - name: sampling_method + value_attr: sampling_method + - name: colsample_bytree + value_attr: colsample_bytree + - name: colsample_bylevel + value_attr: colsample_bylevel + - name: colsample_bynode + value_attr: colsample_bynode + - name: reg_alpha + value_attr: reg_alpha + - name: reg_lambda + value_attr: reg_lambda + - name: scale_pos_weight + value_attr: scale_pos_weight + - name: base_score + value_attr: base_score + - name: random_state + value_attr: random_state + - name: missing + value_attr: missing + - name: num_parallel_tree + value_attr: num_parallel_tree + - name: monotone_constraints + value_attr: monotone_constraints + - name: interaction_constraints + value_attr: interaction_constraints + - name: importance_type + value_attr: importance_type + - name: gpu_id + value_attr: gpu_id + optional: true # removed in xgboost 2.0.0 + - name: device + value_attr: device + optional: true # added in xgboost 2.0.0 + - name: validate_parameters + value_attr: validate_parameters + - name: predictor + value_attr: predictor + optional: true # removed in xgboost 2.0.0 + - name: enable_categorical + value_attr: enable_categorical + - name: max_cat_to_onehot + value_attr: max_cat_to_onehot + - name: max_cat_threshold + value_attr: max_cat_threshold + - name: multi-strategy + value_attr: 
multi_strategy + optional: true # added in xgboost 2.0.0 + - name: eval_metric + value_attr: eval_metric + - name: early_stopping_rounds + value_attr: early_stopping_rounds diff --git a/rubicon_ml/schema/schema/xgboost__XGBRegressor.yaml b/rubicon_ml/schema/schema/xgboost__XGBRegressor.yaml new file mode 100644 index 00000000..2092d152 --- /dev/null +++ b/rubicon_ml/schema/schema/xgboost__XGBRegressor.yaml @@ -0,0 +1,9 @@ +name: xgboost__XGBRegressor +extends: xgboost__XGBModel +version: 1.0.0 + +compatibility: + xgboost: + max_version: + min_version: 1.7.0 +docs_url: https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBRegressor diff --git a/setup.cfg b/setup.cfg index 6c941456..63a7dc09 100644 --- a/setup.cfg +++ b/setup.cfg @@ -110,6 +110,7 @@ deps = pytest pytest-cov prefect + xgboost extras = all upgrade = diff --git a/tests/fixtures.py b/tests/fixtures.py index 71579d1b..b3807f9b 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -1,17 +1,59 @@ import os import random +import uuid +import dask.array as da +import dask.dataframe as dd import numpy as np import pandas as pd import pytest +from dask.distributed import Client +from sklearn.datasets import make_classification +from rubicon_ml import Rubicon from rubicon_ml.repository import MemoryRepository -class MockCompletedProcess: - """Use to mock a CompletedProcess result from - `subprocess.run()`. 
- """ +class _AnotherObject: + """Another object to log for schema testing.""" + + def __init__(self): + self.another_parameter = 100 + self.another_metric = 100 + + +class _ObjectToLog: + """An object to log for schema testing.""" + + def __init__(self): + """Initialize an object to log.""" + + self.object_ = _AnotherObject() + self.feature_names_ = ["var_001", "var_002"] + self.other_feature_names_ = ["var_003", "var_004"] + self.feature_importances_ = [0.75, 0.25] + self.feature_name_ = "var_005" + self.other_feature_name_ = "var_006" + self.feature_importance_ = 1.0 + self.dataframe = pd.DataFrame([[100, 0], [0, 100]], columns=["x", "y"]) + self.parameter = 100 + self.metric = 100 + + def metric_function(self): + return self.metric + + def artifact_function(self): + return self + + def dataframe_function(self): + return pd.DataFrame([[100, 0], [0, 100]], columns=["x", "y"]) + + def erroring_function(self): + raise RuntimeError("raised from `_ObjectToLog.erroring_function`") + + +class _MockCompletedProcess: + """Use to mock a CompletedProcess result from `subprocess.run()`.""" def __init__(self, stdout="", returncode=0): self.stdout = stdout @@ -20,12 +62,12 @@ def __init__(self, stdout="", returncode=0): @pytest.fixture def mock_completed_process_empty(): - return MockCompletedProcess(stdout=b"\n") + return _MockCompletedProcess(stdout=b"\n") @pytest.fixture def mock_completed_process_git(): - return MockCompletedProcess(stdout=b"origin github.com (fetch)\n") + return _MockCompletedProcess(stdout=b"origin github.com (fetch)\n") @pytest.fixture @@ -250,3 +292,229 @@ def viz_experiments(rubicon_and_project_client): experiment.log_dataframe(data_df, name="test dataframe") return project.experiments() + + +@pytest.fixture +def objects_to_log(): + """Returns objects for testing.""" + + return _ObjectToLog(), _AnotherObject() + + +@pytest.fixture +def another_object_schema(): + """Returns a schema representing ``_AnotherObject``.""" + + return { + "parameters": 
[{"name": "another_parameter", "value_attr": "another_parameter"}], + "metrics": [{"name": "another_metric", "value_attr": "another_metric"}], + } + + +@pytest.fixture +def artifact_schema(): + """Returns a schema for testing artifacts.""" + + return { + "artifacts": [ + "self", + {"name": "object_", "data_object_attr": "object_"}, + {"name": "object_b", "data_object_func": "artifact_function"}, + ] + } + + +@pytest.fixture +def dataframe_schema(): + """Returns a schema for testing dataframes.""" + + return { + "dataframes": [ + {"name": "dataframe", "df_attr": "dataframe"}, + {"name": "dataframe_b", "df_func": "dataframe_function"}, + ] + } + + +@pytest.fixture +def feature_schema(): + """Returns a schema for testing features.""" + + return { + "features": [ + { + "names_attr": "feature_names_", + "importances_attr": "feature_importances_", + }, + {"names_attr": "other_feature_names_"}, + {"name_attr": "feature_name_", "importance_attr": "feature_importance_"}, + {"name_attr": "other_feature_name_"}, + ] + } + + +@pytest.fixture +def metric_schema(): + """Returns a schema for testing metrics.""" + + return { + "metrics": [ + {"name": "metric_a", "value_attr": "metric"}, + {"name": "metric_b", "value_env": "METRIC"}, + {"name": "metric_c", "value_func": "metric_function"}, + ], + } + + +@pytest.fixture +def parameter_schema(): + """Returns a schema for testing parameters.""" + + return { + "parameters": [ + {"name": "parameter_a", "value_attr": "parameter"}, + {"name": "parameter_b", "value_env": "PARAMETER"}, + ], + } + + +@pytest.fixture +def nested_schema(): + """Returns a schema for testing nested schema.""" + + return {"schema": [{"name": "AnotherObject", "attr": "object_"}]} + + +@pytest.fixture +def optional_schema(): + """Returns a schema for testing optional attributes.""" + + return { + "artifacts": [ + { + "name": "object", + "data_object_attr": "missing_object", + "optional": "true", + }, + { + "name": "object_b", + "data_object_func": 
"missing_object_func", + "optional": "true", + }, + ], + "dataframes": [ + {"name": "dataframe", "df_attr": "missing_dataframe", "optional": "true"}, + { + "name": "dataframe_b", + "df_func": "missing_dataframe_func", + "optional": "true", + }, + ], + "features": [ + {"names_attr": "missing_feature_names", "optional": "true"}, + {"name_attr": "missing_feature_name", "optional": "true"}, + ], + "metrics": [ + {"name": "metric_a", "value_attr": "missing_metric", "optional": "true"}, + {"name": "metric_b", "value_env": "MISSING_METRIC", "optional": "true"}, + { + "name": "metric_c", + "value_func": "missing_metric_func", + "optional": "true", + }, + ], + "parameters": [ + { + "name": "parameter_a", + "value_attr": "missing_parameter", + "optional": "true", + }, + { + "name": "parameter_b", + "value_env": "MISSING_PARAMETER", + "optional": "true", + }, + ], + "schema": [ + { + "name": "MissingObject", + "attr": "another_missing_object", + "optional": "true", + } + ], + } + + +@pytest.fixture +def hierarchical_schema(): + """Returns a schema for testing hierarchical schema.""" + + return {"children": [{"name": "AnotherObject", "attr": "children"}]} + + +@pytest.fixture +def rubicon_project(): + """Returns an in-memory rubicon project for testing.""" + + rubicon = Rubicon(persistence="memory", root_dir="/tmp") + + random_name = str(uuid.uuid4()) + return rubicon.create_project(name=random_name) + + +@pytest.fixture +def make_classification_array(): + """Returns classification data generated by scikit-learn as an array.""" + + X, y = make_classification( + n_samples=1000, + n_features=10, + n_informative=5, + n_redundant=5, + n_classes=2, + class_sep=1, + random_state=3211, + ) + + return X, y + + +@pytest.fixture +def make_classification_df(make_classification_array): + """Returns classification data generated by scikit-learn as dataframes.""" + + X, y = make_classification_array + X_df = pd.DataFrame(X, columns=[f"var_{i}" for i in range(10)]) + + return X_df, y + + 
+@pytest.fixture +def dask_client(): + """Returns a dask client and shuts it down upon test completion.""" + + client = Client() + + yield client + + client.shutdown() + + +@pytest.fixture +def make_classification_dask_array(make_classification_array): + """Returns classification data generated by scikit-learn as a dask array.""" + + X, y = make_classification_array + X_da, y_da = da.from_array(X), da.from_array(y) + + return X_da, y_da + + +@pytest.fixture +def make_classification_dask_df(make_classification_df): + """Returns classification data generated by scikit-learn as a dask dataframe.""" + + X, y = make_classification_df + X_df, y_da = dd.from_pandas(X, npartitions=1), da.from_array(y) + + return X_df, y_da diff --git a/tests/integration/test_schema.py b/tests/integration/test_schema.py new file mode 100644 index 00000000..065201f9 --- /dev/null +++ b/tests/integration/test_schema.py @@ -0,0 +1,60 @@ +import pytest +from lightgbm import LGBMClassifier, LGBMRegressor +from sklearn.ensemble import RandomForestClassifier +from xgboost import XGBClassifier, XGBRegressor +from xgboost.dask import DaskXGBClassifier, DaskXGBRegressor + +PANDAS_SCHEMA_CLS = [ + LGBMClassifier, + LGBMRegressor, + RandomForestClassifier, + XGBClassifier, + XGBRegressor, +] +DASK_SCHEMA_CLS = [DaskXGBClassifier, DaskXGBRegressor] + + +def _fit_and_log(X, y, schema_cls, rubicon_project): + model = schema_cls() + model.fit(X, y) + + rubicon_project.log_with_schema(model) + + +@pytest.mark.integration +@pytest.mark.parametrize("schema_cls", PANDAS_SCHEMA_CLS) +def test_estimator_schema_fit_array(schema_cls, make_classification_array, rubicon_project): + X, y = make_classification_array + + _fit_and_log(X, y, schema_cls, rubicon_project) + + +@pytest.mark.integration +@pytest.mark.parametrize("schema_cls", PANDAS_SCHEMA_CLS) +def test_estimator_schema_fit_df(schema_cls, make_classification_df, rubicon_project): + X, y = make_classification_df + + _fit_and_log(X, y, schema_cls, rubicon_project) 
+ + +@pytest.mark.integration +@pytest.mark.parametrize("schema_cls", DASK_SCHEMA_CLS) +def test_estimator_schema_fit_dask_array( + schema_cls, + make_classification_dask_array, + rubicon_project, + dask_client, +): + X_da, y_da = make_classification_dask_array + + _fit_and_log(X_da, y_da, schema_cls, rubicon_project) + + +@pytest.mark.integration +@pytest.mark.parametrize("schema_cls", DASK_SCHEMA_CLS) +def test_estimator_schema_fit_dask_df( + schema_cls, make_classification_dask_df, rubicon_project, dask_client +): + X_df, y_da = make_classification_dask_df + + _fit_and_log(X_df, y_da, schema_cls, rubicon_project) diff --git a/tests/unit/schema/__init__.py b/tests/unit/schema/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/schema/test_schema_logger.py b/tests/unit/schema/test_schema_logger.py new file mode 100644 index 00000000..0931a6d7 --- /dev/null +++ b/tests/unit/schema/test_schema_logger.py @@ -0,0 +1,355 @@ +"""Testing ``schema_logger``.""" + +import os +from copy import deepcopy +from unittest import mock + +import pandas as pd +import pytest + +from rubicon_ml.exceptions import RubiconException +from rubicon_ml.schema import logger +from rubicon_ml.schema.registry import RUBICON_SCHEMA_REGISTRY + + +def test_safe_getattr_raises_error(objects_to_log): + """Testing ``_safe_getattr`` raises an error when not optional.""" + + object_to_log, _ = objects_to_log + missing_attr_name = "missing_attr" + + with pytest.raises(AttributeError) as e: + logger._safe_getattr( + object_to_log, + missing_attr_name, + optional=False, + ) + + assert f"no attribute '{missing_attr_name}'" in str(e) + + +def test_safe_call_func_raises_error(objects_to_log): + """Testing ``_safe_call_func`` raises an error when not optional.""" + + object_to_log, _ = objects_to_log + missing_func_name = "missing_func" + + with pytest.raises(AttributeError) as e: + logger._safe_call_func( + object_to_log, + missing_func_name, + optional=False, + ) + + assert f"no 
attribute '{missing_func_name}'" in str(e) + + +def test_safe_call_func_reraises_error(objects_to_log): + """Testing ``_safe_call_func`` reraises an error when not optional.""" + + object_to_log, _ = objects_to_log + erroring_func_name = "erroring_function" + + with pytest.raises(Exception) as e: + logger._safe_call_func( + object_to_log, + erroring_func_name, + optional=False, + ) + + assert "raised from `_ObjectToLog.erroring_function`" in str(e) + + +def test_safe_environ_raises_error(objects_to_log): + """Testing ``_safe_environ`` raises an error when not optional.""" + + object_to_log, _ = objects_to_log + missing_environ_name = "missing_environ" + + with pytest.raises(RubiconException) as e: + logger._safe_environ( + missing_environ_name, + optional=False, + ) + + assert f"'{missing_environ_name}' not set" in str(e) + + +def test_log_inferred_schema(objects_to_log, rubicon_project, another_object_schema): + """Testing ``Project.log_with_schema`` can log inferred schema.""" + + _, another_object = objects_to_log + schema_to_patch = {"tests___AnotherObject": lambda: another_object_schema} + + with mock.patch.dict(RUBICON_SCHEMA_REGISTRY, schema_to_patch, clear=True): + experiment = rubicon_project.log_with_schema(another_object) + + parameter = experiment.parameter(name=another_object_schema["parameters"][0]["name"]) + metric = experiment.metric(name=another_object_schema["metrics"][0]["name"]) + + assert rubicon_project.schema_ == another_object_schema + assert parameter.value == getattr( + another_object, + another_object_schema["parameters"][0]["value_attr"], + ) + assert metric.value == getattr( + another_object, + another_object_schema["metrics"][0]["value_attr"], + ) + + +def test_log_artifacts_with_schema(objects_to_log, rubicon_project, artifact_schema): + """Testing ``Project.log_with_schema`` can log artifacts.""" + + object_to_log, another_object = objects_to_log + object_b = object_to_log + otl_cls, ao_cls, obj_b_cls = ( + object_to_log.__class__, + 
another_object.__class__, + object_b.__class__, + ) + + rubicon_project.set_schema(artifact_schema) + experiment = rubicon_project.log_with_schema(object_to_log) + + otl_artifact = experiment.artifact(name=otl_cls.__name__) + ao_artifact = experiment.artifact(name=artifact_schema["artifacts"][1]["name"]) + obj_b_artifact = experiment.artifact(name=artifact_schema["artifacts"][2]["name"]) + + assert isinstance(otl_artifact.get_data(unpickle=True), otl_cls) + assert isinstance(ao_artifact.get_data(unpickle=True), ao_cls) + assert isinstance(obj_b_artifact.get_data(unpickle=True), obj_b_cls) + + +def test_log_dataframes_with_schema(objects_to_log, rubicon_project, dataframe_schema): + """Testing ``Project.log_with_schema`` can log dataframes.""" + + object_to_log, _ = objects_to_log + + rubicon_project.set_schema(dataframe_schema) + experiment = rubicon_project.log_with_schema(object_to_log) + + dataframe = experiment.dataframe(name=dataframe_schema["dataframes"][0]["name"]) + dataframe_b = experiment.dataframe(name=dataframe_schema["dataframes"][1]["name"]) + + assert isinstance(dataframe.get_data(), pd.DataFrame) + assert isinstance(dataframe_b.get_data(), pd.DataFrame) + assert dataframe.get_data().equals(object_to_log.dataframe) + assert dataframe_b.get_data().equals(object_to_log.dataframe_function()) + + +def test_log_features_with_schema(objects_to_log, rubicon_project, feature_schema): + """Testing ``Project.log_with_schema`` can log features.""" + + object_to_log, _ = objects_to_log + + rubicon_project.set_schema(feature_schema) + experiment = rubicon_project.log_with_schema(object_to_log) + + expected_feature_names = getattr(object_to_log, feature_schema["features"][0]["names_attr"]) + expected_feature_names.extend( + getattr(object_to_log, feature_schema["features"][1]["names_attr"]) + ) + expected_feature_names.append( + getattr(object_to_log, feature_schema["features"][2]["name_attr"]) + ) + expected_feature_names.append( + getattr(object_to_log, 
feature_schema["features"][3]["name_attr"]) + ) + + expected_feature_importances = getattr( + object_to_log, feature_schema["features"][0].get("importances_attr") + ) + expected_feature_importances.extend([None, None]) + expected_feature_importances.append( + getattr(object_to_log, feature_schema["features"][2].get("importance_attr")) + ) + expected_feature_importances.append(None) + + for name, importance in zip(expected_feature_names, expected_feature_importances): + feature = experiment.feature(name=name) + + assert feature.importance == importance + + +def test_log_metrics_with_schema(objects_to_log, rubicon_project, metric_schema): + """Testing ``Project.log_with_schema`` can log metrics.""" + + object_to_log, _ = objects_to_log + + rubicon_project.set_schema(metric_schema) + + with mock.patch.dict(os.environ, {"METRIC": "metric env value"}, clear=True): + experiment = rubicon_project.log_with_schema(object_to_log) + + metric_a = experiment.metric(name=metric_schema["metrics"][0]["name"]) + metric_b = experiment.metric(name=metric_schema["metrics"][1]["name"]) + metric_c = experiment.metric(name=metric_schema["metrics"][2]["name"]) + + assert metric_a.value == getattr(object_to_log, metric_schema["metrics"][0]["value_attr"]) + assert metric_b.value == "metric env value" + + method = getattr(object_to_log, metric_schema["metrics"][2]["value_func"]) + assert metric_c.value == method() + + +def test_log_parameters_with_schema(objects_to_log, rubicon_project, parameter_schema): + """Testing ``Project.log_with_schema`` can log parameters.""" + + object_to_log, _ = objects_to_log + + rubicon_project.set_schema(parameter_schema) + + with mock.patch.dict(os.environ, {"PARAMETER": "param env value"}, clear=True): + experiment = rubicon_project.log_with_schema(object_to_log) + + parameter_a = experiment.parameter(name=parameter_schema["parameters"][0]["name"]) + parameter_b = experiment.parameter(name=parameter_schema["parameters"][1]["name"]) + + assert 
parameter_a.value == getattr( + object_to_log, parameter_schema["parameters"][0]["value_attr"] + ) + assert parameter_b.value == "param env value" + + +def test_log_nested_schema(objects_to_log, rubicon_project, another_object_schema, nested_schema): + """Testing ``Project.log_with_schema`` can log nested schema.""" + + object_to_log, another_object = objects_to_log + schema_to_patch = {"AnotherObject": lambda: another_object_schema} + + with mock.patch.dict(RUBICON_SCHEMA_REGISTRY, schema_to_patch, clear=True): + rubicon_project.set_schema(nested_schema) + experiment = rubicon_project.log_with_schema(object_to_log) + + parameter = experiment.parameter(name=another_object_schema["parameters"][0]["name"]) + metric = experiment.metric(name=another_object_schema["metrics"][0]["name"]) + + assert parameter.value == getattr( + another_object, + another_object_schema["parameters"][0]["value_attr"], + ) + assert metric.value == getattr( + another_object, + another_object_schema["metrics"][0]["value_attr"], + ) + + +def test_log_extended_schema(objects_to_log, rubicon_project, another_object_schema): + """Testing ``Project.log_with_schema`` can log extended schema.""" + + _, another_object = objects_to_log + + feature_name_attr = "extended_schema_feature" + feature_name_value = "extended schema feature" + setattr(another_object, feature_name_attr, feature_name_value) + + schema_to_patch = {"AnotherObject": lambda: another_object_schema} + extended_schema = { + "extends": "AnotherObject", + "features": [{"name_attr": feature_name_attr}], + } + + with mock.patch.dict(RUBICON_SCHEMA_REGISTRY, schema_to_patch, clear=True): + rubicon_project.set_schema(extended_schema) + experiment = rubicon_project.log_with_schema(another_object) + + feature = experiment.feature(name=feature_name_value) + parameter = experiment.parameter(name=another_object_schema["parameters"][0]["name"]) + metric = experiment.metric(name=another_object_schema["metrics"][0]["name"]) + + assert feature.name == 
feature_name_value + assert parameter.value == getattr( + another_object, + another_object_schema["parameters"][0]["value_attr"], + ) + assert metric.value == getattr( + another_object, + another_object_schema["metrics"][0]["value_attr"], + ) + + +def test_log_optional_schema(objects_to_log, rubicon_project, optional_schema): + """Testing ``Project.log_with_schema`` can log optional schema.""" + + object_to_log, _ = objects_to_log + schema_to_patch = {"MissingObject": lambda: {}} + + with mock.patch.dict(RUBICON_SCHEMA_REGISTRY, schema_to_patch, clear=True): + rubicon_project.set_schema(optional_schema) + experiment = rubicon_project.log_with_schema(object_to_log) + + assert len(experiment.artifacts()) == 0 + assert len(experiment.dataframes()) == 0 + assert len(experiment.features()) == 0 + + assert len(experiment.parameters()) == 2 + for parameter in experiment.parameters(): + assert parameter.value is None + + assert len(experiment.metrics()) == 3 + for metric in experiment.metrics(): + assert metric.value is None + + +def test_log_with_children( + objects_to_log, rubicon_project, another_object_schema, hierarchical_schema +): + """Testing ``Project.log_with_schema`` can log hierarchical schema.""" + + object_to_log, another_object = objects_to_log + schema_to_patch = {"AnotherObject": lambda: another_object_schema} + + num_children = 4 + object_to_log.children = [deepcopy(another_object) for _ in range(num_children)] + + with mock.patch.dict(RUBICON_SCHEMA_REGISTRY, schema_to_patch, clear=True): + rubicon_project.set_schema(hierarchical_schema) + parent_experiment = rubicon_project.log_with_schema(object_to_log) + + assert len(rubicon_project.experiments()) == 1 + num_children + assert len(rubicon_project.experiments(tags=["parent"])) == 1 + + child_experiments = rubicon_project.experiments(tags=["child"]) + assert len(child_experiments) == num_children + + for child_experiment in child_experiments: + assert f"parent_id:{parent_experiment.id}" in 
child_experiment.tags + + +def test_log_with_schema_and_experiment_kwargs( + objects_to_log, + rubicon_project, + artifact_schema, +): + """Testing ``Project.log_with_schema`` can log experiment kwargs.""" + + object_to_log, _ = objects_to_log + + rubicon_project.set_schema(artifact_schema) + experiment = rubicon_project.log_with_schema( + object_to_log, + experiment_kwargs={"name": "name", "description": "description"}, + ) + + assert experiment.name == "name" + assert experiment.description == "description" + + +def test_log_with_schema_raises_error(objects_to_log, rubicon_project): + """Testing ``Project.log_with_schema`` raises an error when no schema is set.""" + + object_to_log, _ = objects_to_log + + with pytest.raises(ValueError) as err: + _ = rubicon_project.log_with_schema(object_to_log) + + assert "No schema set and no schema could be inferred" in str(err) + + +def test_set_schema(rubicon_project, artifact_schema): + """Testing ``Project.set_schema``.""" + + rubicon_project.set_schema(artifact_schema) + + assert rubicon_project.schema_ == artifact_schema diff --git a/tests/unit/schema/test_schema_registry.py b/tests/unit/schema/test_schema_registry.py new file mode 100644 index 00000000..dcd1f865 --- /dev/null +++ b/tests/unit/schema/test_schema_registry.py @@ -0,0 +1,56 @@ +"""Testing ``schema.registry``.""" + +import pytest + +from rubicon_ml.schema import registry + + +def test_available_schema(): + """Testing ``schema.registry.available_schema``.""" + + for schema_name in registry.available_schema(): + assert schema_name in registry.RUBICON_SCHEMA_REGISTRY + + assert len(registry.available_schema()) == len(registry.RUBICON_SCHEMA_REGISTRY) + + +def test_get_schema(): + """Testing ``schema.registry.get_schema``.""" + + for name, load_func in registry.RUBICON_SCHEMA_REGISTRY.items(): + schema = registry.get_schema(name) + + assert load_func() == schema + + +def test_get_schema_raises_error(): + """Testing ``schema.registry.get_schema`` raises an error 
when an invalid name is given.""" + + with pytest.raises(ValueError) as err: + registry.get_schema("InvalidSchemaClass") + + assert "'InvalidSchemaClass' is not the name of an available rubicon schema." in str(err) + + +def test_get_schema_name(objects_to_log): + """Testing ``schema.registry.get_schema_name``.""" + + object_to_log, _ = objects_to_log + + schema_name = registry.get_schema_name(object_to_log) + + assert schema_name == "tests___ObjectToLog" + + +def test_register_schema(): + """Testing ``schema.registry.register_schema``.""" + + schema_name = "c1-rubicon-schema__TestRegisterSchema" + schema = {"name": schema_name} + + registry.register_schema(schema_name, schema) + + assert schema_name in registry.RUBICON_SCHEMA_REGISTRY + assert registry.get_schema(schema_name) == schema + + del registry.RUBICON_SCHEMA_REGISTRY[schema_name]