From eb0a28a005154c570f6e4245206e221c61305a54 Mon Sep 17 00:00:00 2001 From: zhangxingmeng Date: Mon, 16 Oct 2023 17:01:55 +0800 Subject: [PATCH] repo-sync-2023-10-16T17:01:44+0800 --- .bazelrc | 2 +- .gitmodules | 3 + CHANGELOG.md | 12 + CONTRIBUTING.md | 28 +- README.md | 2 +- README.zh-CN.md | 2 +- dev-requirements.txt | 3 +- docker/comp_list.json | 292 +- docker/translation.json | 44 +- docs/component/comp_guide.rst | 490 +- docs/component/comp_list.md | 59 +- docs/component/comp_spec.md | 585 +-- docs/component/comp_spec.tmpl | 6 +- docs/component/comp_spec_design.rst | 216 +- docs/component/index.rst | 42 +- docs/component/update_comp_list.py | 24 +- docs/getting_started/deployment.md | 4 +- .../zh_CN/LC_MESSAGES/component/comp_guide.po | 114 +- .../zh_CN/LC_MESSAGES/component/comp_list.po | 454 +- .../zh_CN/LC_MESSAGES/component/comp_spec.po | 1414 +---- .../LC_MESSAGES/component/comp_spec_design.po | 531 +- .../zh_CN/LC_MESSAGES/component/index.po | 171 +- .../LC_MESSAGES/getting_started/deployment.po | 6 +- .../data_preprocessing_with_data_frame.po | 4 +- docs/tutorial/WeightOfEvidenceEncoding.ipynb | 61 +- .../data_preprocessing_with_data_frame.ipynb | 117 +- ...l_case_walkthrough_using_sf_with_spu.ipynb | 55 +- docs/tutorial/risk_control_scenario.ipynb | 4576 +++++++++-------- docs/update_data.sh | 9 +- .../horizontal_federated_learning/index.rst | 15 +- .../WeightOfEvidenceEncoding.ipynb | 54 +- secretflow/__init__.py | 2 - secretflow/cli.py | 14 +- secretflow/component/component.py | 31 +- secretflow/component/data_utils.py | 48 +- secretflow/component/entry.py | 27 +- secretflow/component/eval_param_reader.py | 12 +- secretflow/component/i18n.py | 4 +- secretflow/component/ml/boost/sgb/sgb.py | 7 +- .../component/ml/boost/ss_xgb/ss_xgb.py | 7 +- .../ml/eval/biclassification_eval.py | 15 +- .../component/ml/eval/prediction_bias_eval.py | 13 +- secretflow/component/ml/eval/ss_pvalue.py | 8 +- secretflow/component/ml/linear/ss_glm.py | 16 +- secretflow/component/ml/linear/ss_sgd.py | 7 +- .../preprocessing/condition_filter.py | 4 +- .../component/preprocessing/feature_filter.py | 5 +- secretflow/component/preprocessing/psi.py | 23 +- .../preprocessing/train_test_split.py | 4 +- .../component/preprocessing/vert_binning.py | 205 + .../vert_woe_binning.py | 101 +- secretflow/component/stats/ss_pearsonr.py | 8 +- secretflow/component/stats/ss_vif.py | 8 +- .../component/stats/table_statistics.py | 12 +- secretflow/data/__init__.py | 7 +- secretflow/data/base.py | 219 +- .../component => data/core}/__init__.py | 7 + secretflow/data/core/agent.py | 392 ++ secretflow/data/core/base.py | 604 +++ secretflow/data/{partition => core}/io.py | 12 +- .../{partition => core}/pandas/__init__.py | 2 +- secretflow/data/core/pandas/dataframe.py | 236 + secretflow/data/core/partition.py | 355 ++ .../{partition => core}/polars/__init__.py | 7 +- secretflow/data/core/polars/dataframe.py | 452 ++ .../data/{partition => core}/polars/util.py | 34 +- secretflow/data/horizontal/dataframe.py | 59 +- secretflow/data/horizontal/io.py | 14 +- secretflow/data/mix/dataframe.py | 21 +- secretflow/data/ndarray/ndarray.py | 3 +- secretflow/data/partition/pandas/partition.py | 444 -- secretflow/data/partition/polars/partition.py | 317 -- secretflow/data/split.py | 65 +- secretflow/data/vertical/dataframe.py | 535 +- secretflow/data/vertical/io.py | 24 +- secretflow/device/device/base.py | 2 + secretflow/device/device/pyu.py | 10 +- secretflow/device/driver.py | 64 +- secretflow/device/link.py | 3 +- secretflow/device/proxy.py | 13 +- secretflow/distributed/__init__.py | 17 +- secretflow/distributed/memory_actor.py | 90 + secretflow/distributed/memory_api.py | 115 + secretflow/distributed/memory_function.py | 48 + secretflow/distributed/primitive.py | 107 +- secretflow/kuscia/README.md | 2 - secretflow/kuscia/entry.py | 48 +- secretflow/kuscia/sf_config.py | 72 +- secretflow/kuscia/task_config.py | 5 +- secretflow/ml/nn/sl/sl_model.py | 6 +- ...bstitution.py => vert_bin_substitution.py} | 36 +- .../preprocessing/binning/vert_binning.py | 110 + .../preprocessing/binning/vert_binning_pyu.py | 254 + .../preprocessing/binning/vert_woe_binning.py | 14 +- .../binning/vert_woe_binning_pyu.py | 6 +- secretflow/preprocessing/cond_filter_v.py | 3 +- secretflow/preprocessing/discretization.py | 11 +- secretflow/preprocessing/encoder.py | 31 +- secretflow/preprocessing/scaler.py | 20 +- secretflow/preprocessing/transformer.py | 37 +- secretflow/protos/README.md | 8 - secretflow/protos/component/README.md | 26 - secretflow/protos/component/cluster_pb2.py | 566 -- secretflow/protos/component/comp.proto | 223 - secretflow/protos/component/comp_pb2.py | 734 --- secretflow/protos/component/data.proto | 159 - secretflow/protos/component/data_pb2.py | 473 -- secretflow/protos/component/evaluation.proto | 61 - secretflow/protos/component/evaluation_pb2.py | 159 - secretflow/protos/component/report.proto | 117 - secretflow/protos/component/report_pb2.py | 565 -- .../spec/extend}/cluster.proto | 26 +- .../protos/secretflow/spec/extend/data.proto | 34 + .../{component/feature => spec}/__init__.py | 0 .../partition => spec/extend}/__init__.py | 0 secretflow/spec/extend/cluster_pb2.py | 111 + secretflow/spec/extend/data_pb2.py | 46 + secretflow/{protos => spec/v1}/__init__.py | 0 secretflow/spec/v1/component_pb2.py | 125 + secretflow/spec/v1/data_pb2.py | 108 + secretflow/spec/v1/evaluation_pb2.py | 47 + secretflow/spec/v1/report_pb2.py | 120 + secretflow/stats/ss_pearsonr_v.py | 7 +- secretflow/utils/simulation/data/dataframe.py | 9 +- setup.py | 6 - submodules/spec | 1 + tests/component/test_biclassification_eval.py | 27 +- tests/component/test_component.py | 16 +- tests/component/test_condition_filter.py | 17 +- tests/component/test_eval_param_reader.py | 34 +- tests/component/test_feature_filter.py | 17 +- tests/component/test_i18n.py | 9 +- tests/component/test_prediction_bias_eval.py | 23 +- tests/component/test_psi.py | 37 +- tests/component/test_sgb.py | 23 +- tests/component/test_ss_glm.py | 23 +- tests/component/test_ss_pearsonr.py | 19 +- tests/component/test_ss_pvalue.py | 25 +- tests/component/test_ss_sgd.py | 23 +- tests/component/test_ss_vif.py | 19 +- tests/component/test_ss_xgb.py | 23 +- tests/component/test_table_statistics.py | 34 +- tests/component/test_train_test_split.py | 17 +- tests/component/test_vert_binning.py | 156 + tests/component/test_woe_binning.py | 40 +- tests/conftest.py | 51 +- tests/data/horizontal/test_io.py | 6 +- tests/data/mix/test_mixdataframe.py | 2 +- tests/data/test_ndarray.py | 2 +- tests/data/vertical/test_io.py | 2 +- tests/data/vertical/test_vdataframe.py | 6 +- tests/data/vertical/test_vdataframe_polars.py | 461 ++ tests/device/test_driver_brpc.py | 4 +- tests/device/test_teeu.py | 3 +- tests/kuscia/test_kuscia.py | 47 +- tests/ml/linear/test_fl_lr_mix.py | 5 +- tests/ml/linear/test_fl_lr_v.py | 5 +- tests/ml/linear/test_hess_lr.py | 3 +- tests/ml/nn/test_fl_model_tf.py | 43 + tests/preprocessing/test_cond_filter_v.py | 2 +- tests/preprocessing/test_discretizer.py | 2 +- tests/preprocessing/test_encoder.py | 7 +- tests/preprocessing/test_encoder_polars.py | 551 ++ .../test_function_transformer.py | 2 +- .../test_function_transformer_polars.py | 305 ++ tests/preprocessing/test_minmax_scaler.py | 2 +- .../test_minmax_scaler_polars.py | 264 + tests/preprocessing/test_standard_scaler.py | 2 +- tests/preprocessing/test_vert_binning.py | 28 +- tests/preprocessing/test_vert_substitution.py | 92 +- tests/stats/test_biclassification_eval.py | 2 +- tests/stats/test_prediction_bias.py | 2 +- tests/stats/test_psi.py | 2 +- tests/stats/test_ss_pvalue.py | 2 +- tests/stats/test_ss_vif_v.py | 2 +- tests/stats/test_table_statistics.py | 2 +- 176 files changed, 10477 insertions(+), 10654 deletions(-) create mode 100644 .gitmodules create mode 100644 secretflow/component/preprocessing/vert_binning.py rename secretflow/component/{feature => preprocessing}/vert_woe_binning.py (72%) rename secretflow/{protos/component => data/core}/__init__.py (86%) create mode 100644 secretflow/data/core/agent.py create mode 100644 secretflow/data/core/base.py rename secretflow/data/{partition => core}/io.py (83%) rename secretflow/data/{partition => core}/pandas/__init__.py (93%) create mode 100644 secretflow/data/core/pandas/dataframe.py create mode 100644 secretflow/data/core/partition.py rename secretflow/data/{partition => core}/polars/__init__.py (87%) create mode 100644 secretflow/data/core/polars/dataframe.py rename secretflow/data/{partition => core}/polars/util.py (72%) delete mode 100644 secretflow/data/partition/pandas/partition.py delete mode 100644 secretflow/data/partition/polars/partition.py create mode 100644 secretflow/distributed/memory_actor.py create mode 100644 secretflow/distributed/memory_api.py create mode 100644 secretflow/distributed/memory_function.py delete mode 100644 secretflow/kuscia/README.md rename secretflow/preprocessing/binning/{vert_woe_substitution.py => vert_bin_substitution.py} (77%) create mode 100644 secretflow/preprocessing/binning/vert_binning.py create mode 100644 secretflow/preprocessing/binning/vert_binning_pyu.py delete mode 100644 secretflow/protos/README.md delete mode 100644 secretflow/protos/component/README.md delete mode 100644 secretflow/protos/component/cluster_pb2.py delete mode 100644 secretflow/protos/component/comp.proto delete mode 100644 secretflow/protos/component/comp_pb2.py delete mode 100644 secretflow/protos/component/data.proto delete mode 100644 secretflow/protos/component/data_pb2.py delete mode 100644 secretflow/protos/component/evaluation.proto delete mode 100644 secretflow/protos/component/evaluation_pb2.py delete mode 100644 secretflow/protos/component/report.proto delete mode 100644 secretflow/protos/component/report_pb2.py rename secretflow/protos/{component => secretflow/spec/extend}/cluster.proto (87%) create mode 100644 secretflow/protos/secretflow/spec/extend/data.proto rename secretflow/{component/feature => spec}/__init__.py (100%) rename secretflow/{data/partition => spec/extend}/__init__.py (100%) create mode 100644 secretflow/spec/extend/cluster_pb2.py create mode 100644 secretflow/spec/extend/data_pb2.py rename secretflow/{protos => spec/v1}/__init__.py (100%) create mode 100644 secretflow/spec/v1/component_pb2.py create mode 100644 secretflow/spec/v1/data_pb2.py create mode 100644 secretflow/spec/v1/evaluation_pb2.py create mode 100644 secretflow/spec/v1/report_pb2.py create mode 160000 submodules/spec create mode 100644 tests/component/test_vert_binning.py create mode 100644 tests/data/vertical/test_vdataframe_polars.py create mode 100644 tests/preprocessing/test_encoder_polars.py create mode 100644 tests/preprocessing/test_function_transformer_polars.py create mode 100644 tests/preprocessing/test_minmax_scaler_polars.py diff --git a/.bazelrc b/.bazelrc index 101451312..13b386351 100644 --- a/.bazelrc +++ b/.bazelrc @@ -1,6 +1,6 @@ common --experimental_repo_remote_exec -build --incompatible_new_actions_api=false +build --incompatible_new_actions_api=false build --copt=-fdiagnostics-color=always build --enable_platform_specific_config diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..bc2c18cca --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "submodules/spec"] + path = submodules/spec + url = https://github.com/secretflow/spec.git diff --git a/CHANGELOG.md b/CHANGELOG.md index 20ead4279..b0da89066 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 `Removed` for now removed features. `Fixed` for any bug fixes. `Security` in case of vulnerabilities. +## [1.2.0.dev231016] - 2023-10-16 +### Added +- Secretflow support debug mode. +- Add vert binning for equal range bining method + +### Changed +- The data preprocessing module (VDataFrame, Partition) has been refactored, enhancing the data processing performance (primarily targeting the Polars backend). + +### Fixed +- Fix error when flmodel with tf backend use gpu. +- Fix kuscia adapter + ## [1.2.0.dev231009] - 2023-10-9 ### Added diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0318ce516..4f19dea70 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -66,4 +66,30 @@ Follow follwing steps to update documentation: 4. All `fuzzy` should be removed in `*.po` file, because it won't take effect in the Chinese version of the documentation. 5. All strings which start with `#~` such as `#~ msgid ` or `#~ msgstr` should be removed, because it is redundant. 6. Only commit the files which you update and pull request. -7. If your document is conflict with the main branch of SecretFlow, you are supposed to solve the conflict locally and commit. \ No newline at end of file +7. If your document is conflict with the main branch of SecretFlow, you are supposed to solve the conflict locally and commit. + + +## Compiling Protocol Buffers + +You should use [protoc v3.19.6](https://github.com/protocolbuffers/protobuf/releases/tag/v3.19.6) + + +### Compiling SecretFlow Open Specification + +Protocol Buffers resides at submodules/spec/ as git submodules. + +```bash + +~/protoc-3.19.6/bin/protoc --proto_path submodules/spec/ --python_out . submodules/spec/secretflow/spec/v1/*.proto + +``` + +### Compiling Extended Specification + +Protocol Buffers resides at secretflow/protos. + +```bash +~/protoc-3.19.6/bin/protoc --proto_path secretflow/protos/ --python_out . secretflow/protos/secretflow/spec/extend/*.proto +``` + +All generated Python code resides at secretflow/spec. diff --git a/README.md b/README.md index a144d04f4..dc86caab1 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ it provides: ## SecretFlow Related Projects -- [Kuscia](https://github.com/secretflow/kuscia): A K8s-based privacy-preserving computing task orchestration framework. +- [Kuscia](https://github.com/secretflow/kuscia): A lightweight privacy-preserving computing task orchestration framework based on K3s. - [SCQL](https://github.com/secretflow/scql): A system that allows multiple distrusting parties to run joint analysis without revealing their private data. - [SPU](https://github.com/secretflow/spu): A provable, measurable secure computation device, which provides computation ability while keeping your private data protected. - [HEU](https://github.com/secretflow/heu): A high-performance homomorphic encryption algorithm library. diff --git a/README.zh-CN.md b/README.zh-CN.md index fe2bbaaa1..1dc2c0ae0 100644 --- a/README.zh-CN.md +++ b/README.zh-CN.md @@ -29,7 +29,7 @@ SecretFlow是一个统一的框架,用于保护隐私的数据智能和机器 - [教程](https://www.secretflow.org.cn/docs/secretflow/zh_CN/tutorial/index.html) ## 相关项目 -- [Kuscia](https://github.com/secretflow/kuscia): 一个基于 K8s 的隐私计算任务编排框架。 +- [Kuscia](https://github.com/secretflow/kuscia): 一款基于 K3s 的轻量级隐私计算任务编排框架。 - [SCQL](https://github.com/secretflow/scql): 允许多个不信任方在不泄露其私人数据的情况下进行联合分析的系统。 - [SPU](https://github.com/secretflow/spu): 一种可证明、可测量的安全计算设备,在提供计算能力的同时保护您的数据隐私。 - [HEU](https://github.com/secretflow/heu): 一个高性能的同态加密算法库。 diff --git a/dev-requirements.txt b/dev-requirements.txt index 770aa0d94..9dba01893 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,7 +1,6 @@ matplotlib -protobuf-distutils statsmodels==0.13.2 -polars==0.18.15 +polars==0.19.3 pytest pytest-cov pytest-xdist diff --git a/docker/comp_list.json b/docker/comp_list.json index d9ef665b1..0e1ae4630 100644 --- a/docker/comp_list.json +++ b/docker/comp_list.json @@ -3,6 +3,100 @@ "desc": "First-party SecretFlow components.", "version": "0.0.1", "comps": [ + { + "domain": "feature", + "name": "vert_bin_substitution", + "desc": "Substitute datasets' value by bin substitution rules.", + "version": "0.0.1", + "inputs": [ + { + "name": "input_data", + "desc": "Vertical partitioning dataset to be substituted.", + "types": [ + "sf.table.vertical_table" + ] + }, + { + "name": "bin_rule", + "desc": "Input bin substitution rule.", + "types": [ + "sf.rule.binning" + ] + } + ], + "outputs": [ + { + "name": "output_data", + "desc": "Output vertical table.", + "types": [ + "sf.table.vertical_table" + ] + } + ] + }, + { + "domain": "feature", + "name": "vert_binning", + "desc": "Generate equal frequency or equal range binning rules for vertical partitioning datasets.", + "version": "0.0.1", + "attrs": [ + { + "name": "binning_method", + "desc": "How to bin features with numeric types: \"quantile\"(equal frequency)/\"eq_range\"(equal range)", + "type": "AT_STRING", + "atomic": { + "isOptional": true, + "defaultValue": { + "s": "eq_range" + }, + "allowedValues": { + "ss": [ + "eq_range", + "quantile" + ] + } + } + }, + { + "name": "bin_num", + "desc": "Max bin counts for one features.", + "type": "AT_INT", + "atomic": { + "isOptional": true, + "defaultValue": { + "i64": "10" + }, + "lowerBoundEnabled": true, + "lowerBound": {} + } + } + ], + "inputs": [ + { + "name": "input_data", + "desc": "Input vertical table.", + "types": [ + "sf.table.vertical_table" + ], + "attrs": [ + { + "name": "feature_selects", + "desc": "which features should be binned.", + "colMinCntInclusive": "1" + } + ] + } + ], + "outputs": [ + { + "name": "bin_rule", + "desc": "Output bin rule.", + "types": [ + "sf.rule.binning" + ] + } + ] + }, { "domain": "feature", "name": "vert_woe_binning", @@ -52,7 +146,7 @@ "defaultValue": { "i64": "10" }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {} } }, @@ -76,7 +170,7 @@ "defaultValue": { "i64": "100" }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": { "i64": "2" } @@ -91,7 +185,7 @@ "defaultValue": { "i64": "10" }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": { "i64": "2" }, @@ -107,9 +201,9 @@ "defaultValue": { "f": 0.1 }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {}, - "hasUpperBound": true, + "upperBoundEnabled": true, "upperBound": { "f": 1.0 }, @@ -135,41 +229,10 @@ ], "outputs": [ { - "name": "woe_rule", + "name": "bin_rule", "desc": "Output WOE rule.", "types": [ - "sf.rule.woe_binning" - ] - } - ] - }, - { - "domain": "feature", - "name": "vert_woe_substitution", - "desc": "Substitute datasets' value by WOE substitution rules.", - "version": "0.0.1", - "inputs": [ - { - "name": "input_data", - "desc": "Vertical partitioning dataset to be substituted.", - "types": [ - "sf.table.vertical_table" - ] - }, - { - "name": "woe_rule", - "desc": "Input WOE substitution rule.", - "types": [ - "sf.rule.woe_binning" - ] - } - ], - "outputs": [ - { - "name": "output_data", - "desc": "Output vertical table.", - "types": [ - "sf.table.vertical_table" + "sf.rule.binning" ] } ] @@ -189,7 +252,7 @@ "defaultValue": { "i64": "10" }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": { "i64": "1" }, @@ -205,7 +268,7 @@ "defaultValue": { "i64": "5" }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": { "i64": "5" }, @@ -270,7 +333,7 @@ "defaultValue": { "i64": "10" }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": { "i64": "1" }, @@ -286,7 +349,7 @@ "defaultValue": { "i64": "2" }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": { "i64": "2" }, @@ -544,7 +607,7 @@ "defaultValue": { "i64": "1024" }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {} } }, @@ -693,7 +756,7 @@ "defaultValue": { "i64": "10" }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": { "i64": "1" }, @@ -709,12 +772,12 @@ "defaultValue": { "i64": "5" }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": { "i64": "1" }, "lowerBoundInclusive": true, - "hasUpperBound": true, + "upperBoundEnabled": true, "upperBound": { "i64": "16" }, @@ -730,9 +793,9 @@ "defaultValue": { "f": 0.1 }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {}, - "hasUpperBound": true, + "upperBoundEnabled": true, "upperBound": { "f": 1.0 }, @@ -765,10 +828,10 @@ "defaultValue": { "f": 0.1 }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {}, "lowerBoundInclusive": true, - "hasUpperBound": true, + "upperBoundEnabled": true, "upperBound": { "f": 10000.0 }, @@ -784,10 +847,10 @@ "defaultValue": { "f": 0.1 }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {}, "lowerBoundInclusive": true, - "hasUpperBound": true, + "upperBoundEnabled": true, "upperBound": { "f": 10000.0 }, @@ -803,9 +866,9 @@ "defaultValue": { "f": 1.0 }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {}, - "hasUpperBound": true, + "upperBoundEnabled": true, "upperBound": { "f": 1.0 }, @@ -821,9 +884,9 @@ "defaultValue": { "f": 0.1 }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {}, - "hasUpperBound": true, + "upperBoundEnabled": true, "upperBound": { "f": 1.0 }, @@ -837,7 +900,7 @@ "atomic": { "isOptional": true, "defaultValue": {}, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {}, "lowerBoundInclusive": true } @@ -851,7 +914,7 @@ "defaultValue": { "i64": "42" }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {}, "lowerBoundInclusive": true } @@ -865,12 +928,12 @@ "defaultValue": { "i64": "20" }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": { "i64": "1" }, "lowerBoundInclusive": true, - "hasUpperBound": true, + "upperBoundEnabled": true, "upperBound": { "i64": "100" }, @@ -915,10 +978,10 @@ "defaultValue": { "f": 10000.0 }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {}, "lowerBoundInclusive": true, - "hasUpperBound": true, + "upperBoundEnabled": true, "upperBound": { "f": 10000000.0 }, @@ -934,12 +997,12 @@ "defaultValue": { "i64": "15" }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": { "i64": "1" }, "lowerBoundInclusive": true, - "hasUpperBound": true, + "upperBoundEnabled": true, "upperBound": { "i64": "32768" }, @@ -955,9 +1018,9 @@ "defaultValue": { "f": 1.0 }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {}, - "hasUpperBound": true, + "upperBoundEnabled": true, "upperBound": { "f": 1.0 }, @@ -982,9 +1045,9 @@ "defaultValue": { "f": 0.3 }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {}, - "hasUpperBound": true, + "upperBoundEnabled": true, "upperBound": { "f": 1.0 }, @@ -1000,9 +1063,9 @@ "defaultValue": { "f": 0.5 }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {}, - "hasUpperBound": true, + "upperBoundEnabled": true, "upperBound": { "f": 1.0 }, @@ -1016,7 +1079,7 @@ "atomic": { "isOptional": true, "defaultValue": {}, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {}, "lowerBoundInclusive": true } @@ -1028,10 +1091,10 @@ "atomic": { "isOptional": true, "defaultValue": {}, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {}, "lowerBoundInclusive": true, - "hasUpperBound": true, + "upperBoundEnabled": true, "upperBound": { "f": 1.0 }, @@ -1084,7 +1147,7 @@ "defaultValue": { "i64": "10" }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": { "i64": "1" }, @@ -1100,7 +1163,7 @@ "defaultValue": { "f": 0.1 }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {} } }, @@ -1113,7 +1176,7 @@ "defaultValue": { "i64": "1024" }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {} } }, @@ -1156,10 +1219,10 @@ "defaultValue": { "f": 1.0 }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {}, "lowerBoundInclusive": true, - "hasUpperBound": true, + "upperBoundEnabled": true, "upperBound": { "f": 2.0 }, @@ -1175,7 +1238,7 @@ "defaultValue": { "f": 1.0 }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": { "f": 1.0 }, @@ -1191,7 +1254,7 @@ "defaultValue": { "f": 0.0001 }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {}, "lowerBoundInclusive": true } @@ -1203,7 +1266,7 @@ "atomic": { "isOptional": true, "defaultValue": {}, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {}, "lowerBoundInclusive": true } @@ -1215,7 +1278,7 @@ "atomic": { "isOptional": true, "defaultValue": {}, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {}, "lowerBoundInclusive": true } @@ -1227,10 +1290,10 @@ "atomic": { "isOptional": true, "defaultValue": {}, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {}, "lowerBoundInclusive": true, - "hasUpperBound": true, + "upperBoundEnabled": true, "upperBound": { "f": 1.0 } @@ -1302,7 +1365,7 @@ "defaultValue": { "i64": "10" }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": { "i64": "1" }, @@ -1318,7 +1381,7 @@ "defaultValue": { "f": 0.1 }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {} } }, @@ -1331,7 +1394,7 @@ "defaultValue": { "i64": "1024" }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {} } }, @@ -1401,7 +1464,7 @@ "defaultValue": { "f": 0.5 }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {}, "lowerBoundInclusive": true } @@ -1415,7 +1478,7 @@ "defaultValue": { "f": 0.001 }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {}, "lowerBoundInclusive": true } @@ -1455,7 +1518,7 @@ "defaultValue": { "i64": "10" }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": { "i64": "1" }, @@ -1471,12 +1534,12 @@ "defaultValue": { "i64": "5" }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": { "i64": "1" }, "lowerBoundInclusive": true, - "hasUpperBound": true, + "upperBoundEnabled": true, "upperBound": { "i64": "16" }, @@ -1492,9 +1555,9 @@ "defaultValue": { "f": 0.1 }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {}, - "hasUpperBound": true, + "upperBoundEnabled": true, "upperBound": { "f": 1.0 }, @@ -1527,10 +1590,10 @@ "defaultValue": { "f": 0.1 }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {}, "lowerBoundInclusive": true, - "hasUpperBound": true, + "upperBoundEnabled": true, "upperBound": { "f": 10000.0 }, @@ -1546,9 +1609,9 @@ "defaultValue": { "f": 0.1 }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {}, - "hasUpperBound": true, + "upperBoundEnabled": true, "upperBound": { "f": 1.0 }, @@ -1564,9 +1627,9 @@ "defaultValue": { "f": 0.1 }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {}, - "hasUpperBound": true, + "upperBoundEnabled": true, "upperBound": { "f": 1.0 }, @@ -1582,9 +1645,9 @@ "defaultValue": { "f": 0.1 }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {}, - "hasUpperBound": true, + "upperBoundEnabled": true, "upperBound": { "f": 1.0 }, @@ -1598,7 +1661,7 @@ "atomic": { "isOptional": true, "defaultValue": {}, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {}, "lowerBoundInclusive": true } @@ -1612,7 +1675,7 @@ "defaultValue": { "i64": "42" }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {}, "lowerBoundInclusive": true } @@ -1691,6 +1754,15 @@ } } }, + { + "name": "sort", + "desc": "Sort the output.", + "type": "AT_BOOL", + "atomic": { + "isOptional": true, + "defaultValue": {} + } + }, { "name": "bucket_size", "desc": "Specify the hash bucket size used in PSI. Larger values consume more memory.", @@ -1700,7 +1772,7 @@ "defaultValue": { "i64": "1048576" }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {} } }, @@ -1777,10 +1849,10 @@ "defaultValue": { "f": 0.75 }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {}, "lowerBoundInclusive": true, - "hasUpperBound": true, + "upperBoundEnabled": true, "upperBound": { "f": 1.0 }, @@ -1796,10 +1868,10 @@ "defaultValue": { "f": 0.25 }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {}, "lowerBoundInclusive": true, - "hasUpperBound": true, + "upperBoundEnabled": true, "upperBound": { "f": 1.0 }, @@ -1815,7 +1887,7 @@ "defaultValue": { "i64": "1024" }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {} } }, diff --git a/docker/translation.json b/docker/translation.json index e812fb5fc..937d17643 100644 --- a/docker/translation.json +++ b/docker/translation.json @@ -3,6 +3,34 @@ "secretflow": "secretflow", "First-party SecretFlow components.": "官方SecretFlow组件" }, + "feature/vert_bin_substitution:0.0.1": { + "feature": "特征", + "vert_bin_substitution": "vert_bin_substitution", + "Substitute datasets' value by bin substitution rules.": "用箱替换规则替换数据集的值。", + "0.0.1": "0.0.1", + "input_data": "input_data", + "Vertical partitioning dataset to be substituted.": "要替换的垂直分区数据集。", + "bin_rule": "bin_rule", + "Input bin substitution rule.": "输入箱替换规则。", + "output_data": "output_data", + "Output vertical table.": "输出垂直表。" + }, + "feature/vert_binning:0.0.1": { + "feature": "特征", + "vert_binning": "vert_binning", + "Generate equal frequency or equal range binning rules for vertical partitioning datasets.": "为垂直分区数据集生成等频或等范围分箱规则。", + "0.0.1": "0.0.1", + "binning_method": "binning_method", + "How to bin features with numeric types: \"quantile\"(equal frequency)/\"eq_range\"(equal range)": "如何使用数值类型对要素进行装箱:“分位数”(等频)/“eq_range”(相等范围)", + "bin_num": "bin_num", + "Max bin counts for one features.": "一个要素的最大条柱计数。", + "input_data": "input_data", + "Input vertical table.": "输入垂直表。", + "feature_selects": "feature_selects", + "which features should be binned.": "应对哪些要素进行分组。", + "bin_rule": "bin_rule", + "Output bin rule.": "输出箱规则。" + }, "feature/vert_woe_binning:0.0.1": { "feature": "特征工程", "vert_woe_binning": "WOE分箱", @@ -26,21 +54,9 @@ "Input vertical table.": "输入联合表", "feature_selects": "选择特征", "which features should be binned.": "应对哪些特征进行分组", - "woe_rule": "WOE 规则", + "bin_rule": "bin_rule", "Output WOE rule.": "输出 WOE 规则" }, - "feature/vert_woe_substitution:0.0.1": { - "feature": "特征工程", - "vert_woe_substitution": "WOE转换", - "Substitute datasets' value by WOE substitution rules.": "根据WOE分箱规则替换数据集的值", - "0.0.1": "0.0.1", - "input_data": "输入数据", - "Vertical partitioning dataset to be substituted.": "要替换的垂直分割数据集", - "woe_rule": "WOE 规则", - "Input WOE substitution rule.": "输入WOE分箱规则", - "output_data": "输出数据", - "Output vertical table.": "输出联合表" - }, "ml.eval/biclassification_eval:0.0.1": { "ml.eval": "模型评估", "biclassification_eval": "二分类评估", @@ -343,6 +359,8 @@ "0.0.1": "0.0.1", "protocol": "协议", "PSI protocol.": "PSI 协议", + "sort": "排序", + "Sort the output.": "对输出进行排序", "bucket_size": "分桶数", "Specify the hash bucket size used in PSI. Larger values consume more memory.": "指定 PSI 中使用的哈希桶大小;值越大,占用的内存越多", "ecdh_curve_type": "ECDH 曲线类型", diff --git a/docs/component/comp_guide.rst b/docs/component/comp_guide.rst index 082a8b286..deaab0765 100644 --- a/docs/component/comp_guide.rst +++ b/docs/component/comp_guide.rst @@ -56,100 +56,102 @@ Get Definition of Component(s) You must specify a component with the following format: **domain/name:version**. +e.g. Let's check the component definition of PSI. + .. code-block:: sh - $ secretflow component inspect preprocessing/train_test_split:0.0.1 - You are inspecting definition of component with id [preprocessing/train_test_split:0.0.1]. + $ secretflow component inspect preprocessing/psi:0.0.1 + You are inspecting definition of component with id [preprocessing/psi:0.0.1]. --------------------------------------------------------------------------------------------------------- { "domain": "preprocessing", - "name": "train_test_split", - "desc": "Split datasets into random train and test subsets. Please check: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html", + "name": "psi", + "desc": "PSI between two parties.", "version": "0.0.1", "attrs": [ { - "name": "train_size", - "desc": "Proportion of the dataset to include in the train subset.", - "type": "AT_FLOAT", + "name": "protocol", + "desc": "PSI protocol.", + "type": "AT_STRING", "atomic": { "isOptional": true, "defaultValue": { - "f": 0.75 - }, - "hasLowerBound": true, - "lowerBound": {}, - "lowerBoundInclusive": true, - "hasUpperBound": true, - "upperBound": { - "f": 1.0 + "s": "ECDH_PSI_2PC" }, - "upperBoundInclusive": true - } - }, - { - "name": "test_size", - "desc": "Proportion of the dataset to include in the test subset.", - "type": "AT_FLOAT", - "atomic": { - "isOptional": true, - "defaultValue": { - "f": 0.25 - }, - "hasLowerBound": true, - "lowerBound": {}, - "lowerBoundInclusive": true, - "hasUpperBound": true, - "upperBound": { - "f": 1.0 - }, - "upperBoundInclusive": true + "allowedValues": { + "ss": [ + "ECDH_PSI_2PC", + "KKRT_PSI_2PC", + "BC22_PSI_2PC" + ] + } } }, { - "name": "random_state", - "desc": "Specify the random seed of the shuffling.", + "name": "bucket_size", + "desc": "Specify the hash bucket size used in PSI. Larger values consume more memory.", "type": "AT_INT", "atomic": { "isOptional": true, "defaultValue": { - "i64": "1024" + "i64": "1048576" }, - "hasLowerBound": true, + "lowerBoundEnabled": true, "lowerBound": {} } }, { - "name": "shuffle", - "desc": "Whether to shuffle the data before splitting.", - "type": "AT_BOOL", + "name": "ecdh_curve_type", + "desc": "Curve type for ECDH PSI.", + "type": "AT_STRING", "atomic": { "isOptional": true, "defaultValue": { - "b": true + "s": "CURVE_FOURQ" + }, + "allowedValues": { + "ss": [ + "CURVE_25519", + "CURVE_FOURQ", + "CURVE_SM2", + "CURVE_SECP256K1" + ] } } } ], "inputs": [ { - "name": "input_data", - "desc": "Input dataset.", + "name": "receiver_input", + "desc": "Individual table for receiver", "types": [ - "sf.table.vertical_table" + "sf.table.individual" + ], + "attrs": [ + { + "name": "key", + "desc": "Column(s) used to join. If not provided, ids of the dataset will be used." + } ] - } - ], - "outputs": [ + }, { - "name": "train", - "desc": "Output train dataset.", + "name": "sender_input", + "desc": "Individual table for sender", "types": [ - "sf.table.vertical_table" + "sf.table.individual" + ], + "attrs": [ + { + "name": "key", + "desc": "Column(s) used to join. If not provided, ids of the dataset will be used." + } ] - }, + } + ], + "outputs": [ { - "name": "test", - "desc": "Output test dataset.", + "name": "psi_output", + "desc": "Output vertical table", "types": [ "sf.table.vertical_table" ] @@ -158,7 +160,7 @@ You must specify a component with the following format: **domain/name:version**. } -You could inspect all components at once by +You could inspect all components at once by .. code-block:: sh @@ -178,166 +180,306 @@ You may save the list to file by: Evaluate a Node --------------- -You should use **secretflow.component.entry.comp_eval** to evaluate a node. - -The following code demonstrate how to use this API and could not be run directly. - Python API ^^^^^^^^^^ +In the following examples, we would demonstrate how to evaluate a node with Python API. + +We are going to test PSI component with tiny datasets. + +1. Save the following bash script as *generate_csv.sh* + +.. code-block:: bash + + #!/bin/bash + + set -e + show_help() { + echo "Usage: bash generate_csv.sh -c {col_name} -p {file_name}" + echo " -c" + echo " the column name of id." + echo " -p" + echo " the path of output csv." + } + if [[ "$#" -lt 1 ]]; then + show_help + exit + fi + + while getopts ":c:p:" OPTION; do + case $OPTION in + c) + COL_NAME=$OPTARG + ;; + p) + FILE_PATH=$OPTARG + ;; + *) + echo "Incorrect options provided" + exit 1 + ;; + esac + done + + + # header + echo $COL_NAME > $FILE_PATH + + # generate 800 random int + for ((i=0; i<800; i++)) + do + # from 0 to 1000 + id=$(shuf -i 0-1000 -n 1) + + # check duplicates + while grep -q "^$id$" $FILE_PATH + do + id=$(shuf -i 0-1000 -n 1) + done + + # write + echo "$id" >> $FILE_PATH + done + + echo "Generated csv file is $FILE_PATH." + + +Then generate input for two parties. + +.. code-block:: bash + + mkdir -p /tmp/alice + sh generate_csv.sh -c id1 -p /tmp/alice/input.csv + + mkdir -p /tmp/bob + sh generate_csv.sh -c id2 -p /tmp/bob/input.csv + + +2. Save the following Python code as *psi_demo.py* + .. code-block:: python import json from secretflow.component.entry import comp_eval - from secretflow.protos.component.cluster_pb2 import ( + from secretflow.spec.extend.cluster_pb2 import ( SFClusterConfig, SFClusterDesc, + ) + from secretflow.spec.v1.component_pb2 import Attribute + from secretflow.spec.v1.data_pb2 import ( + DistData, + TableSchema, + IndividualTable, StorageConfig, ) - from secretflow.protos.component.comp_pb2 import Attribute - from secretflow.protos.component.data_pb2 import DistData, TableSchema, VerticalTable - from secretflow.protos.component.evaluation_pb2 import NodeEvalParam - - desc = SFClusterDesc( - parties=["alice", "bob"], - devices=[ - SFClusterDesc.DeviceDesc( - name="spu", - type="spu", - parties=["alice", "bob"], - config=json.dumps( - { - "runtime_config": {"protocol": "REF2K", "field": "FM64"}, - "link_desc": { - "connect_retry_times": 60, - "connect_retry_interval_ms": 1000, - "brpc_channel_protocol": "http", - "brpc_channel_connection_type": "pooled", - "recv_timeout_ms": 1200 * 1000, - "http_timeout_ms": 1200 * 1000, - }, - } + from secretflow.spec.v1.evaluation_pb2 import NodeEvalParam + import click + + + @click.command() + @click.argument("party", type=str) + def run(party: str): + desc = SFClusterDesc( + parties=["alice", "bob"], + devices=[ + SFClusterDesc.DeviceDesc( + name="spu", + type="spu", + parties=["alice", "bob"], + config=json.dumps( + { + "runtime_config": {"protocol": "REF2K", "field": "FM64"}, + "link_desc": { + "connect_retry_times": 60, + "connect_retry_interval_ms": 1000, + "brpc_channel_protocol": "http", + "brpc_channel_connection_type": "pooled", + "recv_timeout_ms": 1200 * 1000, + "http_timeout_ms": 1200 * 1000, + }, + } + ), ), - ), - SFClusterDesc.DeviceDesc( - name="heu", - type="heu", - parties=[], - config=json.dumps( - { - "mode": "PHEU", - "schema": "paillier", - "key_size": 2048, - } + SFClusterDesc.DeviceDesc( + name="heu", + type="heu", + parties=[], + config=json.dumps( + { + "mode": "PHEU", + "schema": "paillier", + "key_size": 2048, + } + ), ), - ), - ], - ) + ], + ) - sf_cluster_config = SFClusterConfig( - desc=desc, - public_config=SFClusterConfig.PublicConfig( - rayfed_config=SFClusterConfig.RayFedConfig( - parties=["alice", "bob", "carol", "davy"], - addresses=[ - "127.0.0.1:61041", - "127.0.0.1:61042", - ], - ), - spu_configs=[ - SFClusterConfig.SPUConfig( - name="spu", + sf_cluster_config = SFClusterConfig( + desc=desc, + public_config=SFClusterConfig.PublicConfig( + ray_fed_config=SFClusterConfig.RayFedConfig( parties=["alice", "bob"], addresses=[ - "127.0.0.1:61045", - "127.0.0.1:61046", + "127.0.0.1:61041", + "127.0.0.1:61042", ], - ) + ), + spu_configs=[ + SFClusterConfig.SPUConfig( + name="spu", + parties=["alice", "bob"], + addresses=[ + "127.0.0.1:61045", + "127.0.0.1:61046", + ], + ) + ], + ), + private_config=SFClusterConfig.PrivateConfig( + self_party=party, + ray_head_addr="local", # local means setup a Ray cluster instead connecting to an existed one. + ), + ) + + # check https://www.secretflow.org.cn/docs/spec/latest/zh-Hans/intro#nodeevalparam for details. + sf_node_eval_param = NodeEvalParam( + domain="preprocessing", + name="psi", + version="0.0.1", + attr_paths=[ + "protocol", + "sort", + "bucket_size", + "ecdh_curve_type", + "input/receiver_input/key", + "input/sender_input/key", + ], + attrs=[ + Attribute(s="ECDH_PSI_2PC"), + Attribute(b=True), + Attribute(i64=1048576), + Attribute(s="CURVE_FOURQ"), + Attribute(ss=["id1"]), + Attribute(ss=["id2"]), + ], + inputs=[ + DistData( + name="receiver_input", + type="sf.table.individual", + data_refs=[ + DistData.DataRef(uri="input.csv", party="alice", format="csv"), + ], + ), + DistData( + name="sender_input", + type="sf.table.individual", + data_refs=[ + DistData.DataRef(uri="input.csv", party="bob", format="csv"), + ], + ), ], - ), - private_config=SFClusterConfig.PrivateConfig( - self_party="self_party", - ray_head_addr="local", # local means setup a Ray cluster instead connecting to an existed one. - storage_config=StorageConfig( - type="local_fs", - local_fs=StorageConfig.LocalFSConfig(wd="storage_path"), + output_uris=[ + "output.csv", + ], + ) + + sf_node_eval_param.inputs[0].meta.Pack( + IndividualTable( + schema=TableSchema( + id_types=["str"], + ids=["id1"], + ), + line_count=-1, ), - ), - ) + ) + sf_node_eval_param.inputs[1].meta.Pack( + IndividualTable( + schema=TableSchema( + id_types=["str"], + ids=["id2"], + ), + line_count=-1, + ), + ) - sf_node_eval_param = NodeEvalParam( - domain="preprocessing", - name="train_test_split", - version="0.0.1", - attr_paths=["train_size", "test_size", "random_state", "shuffle"], - attrs=[ - Attribute(f=0.75), - Attribute(f=0.25), - Attribute(i64=1234), - Attribute(b=False), - ], - inputs=[ - DistData( - name="input_data", - type=str("sf.table.vertical_table"), - data_refs=[ - DistData.DataRef(uri="bob_input_path", party="bob", format="csv"), - DistData.DataRef(uri="alice_input_path", party="alice", format="csv"), - ], - ) - ], - output_uris=[ - "train_output_path", - "test_output_path", - ], - ) + storage_config = StorageConfig( + type="local_fs", + local_fs=StorageConfig.LocalFSConfig(wd=f"/tmp/{party}"), + ) - meta = VerticalTable( - schemas=[ - TableSchema( - id_types=["str"], - ids=["id2"], - feature_types=["f32", "str", "f32"], - features=["b4", "b5", "b6"], - ), - TableSchema( - id_types=["str"], - ids=["id1"], - feature_types=["str", "str", "f32"], - features=["a1", "a2", "a3"], - label_types=["f32"], - labels=["y"], - ), - ], - ) + res = comp_eval(sf_node_eval_param, storage_config, sf_cluster_config) - sf_node_eval_param.inputs[0].meta.Pack(meta) + print(f'Node eval res is \n{res}') - res = comp_eval(sf_node_eval_param, sf_cluster_config) + if __name__ == "__main__": + run() + + +3. In two separate terminals, run + +.. code-block:: python + $ python psi_demo.py alice + +.. code-block:: python + $ python psi_demo.py bob + +You should see the following output at both terminals: + +.. code-block:: python + + Node eval res is + outputs { + name: "output.csv" + type: "sf.table.vertical_table" + system_info { + } + meta { + type_url: "type.googleapis.com/secretflow.spec.v1.VerticalTable" + value: "\n\n\n\003id1\"\003str\n\n\n\003id2\"\003str\020\211\005" + } + data_refs { + uri: "output.csv" + party: "alice" + format: "csv" + } + data_refs { + uri: "output.csv" + party: "bob" + format: "csv" + } + } + +4. Check result at */tmp/alice/output.csv* and */tmp/bob/output.csv*. The content of two files should be same except the header. CLI ^^^ +You could also use SecretFlow CLI to evaluate a node. + .. code-block:: sh - $ secretflow component run --log_file={log_file} --result_file={result_file_path} --eval_param={encoded_eval_param} --cluster={encoded_cluster_def} + $ secretflow component run --log_file={log_file} --result_file={result_file_path} --eval_param={encoded_eval_param} --storage={encoded_storage_config} --cluster={encoded_cluster_def} - log_file: log file path. - result_file: result file path. - eval_param: base64-encoded NodeEvalParam prototext. +- storage: base64-encoded StorageConfig prototext. - cluster: base64-encoded SFClusterConfig prototext. +Since you need to encode prototext to use CLI, we don't expect you to use SecretFlow CLI for node evaluation. + Create a Component ------------------ Python API ^^^^^^^^^^ -If you want to create a new component in SecretFlow, you may check one of simplest component: +If you want to create a new component in SecretFlow, you may check one of simplest component: `secretflow/component/preprocessing/train_test_split.py `_ The brief steps to build a SecretFlow Component are: @@ -359,7 +501,7 @@ The brief steps to build a SecretFlow Component are: """, ) -3. Declare attributes and IO. +3. Declare attributes and IO. .. code-block:: python @@ -430,7 +572,7 @@ The brief steps to build a SecretFlow Component are: .. code-block:: python - from secretflow.protos.component.data_pb2 import DistData + from secretflow.spec.v1.data_pb2 import DistData # Signature of eval_fn must be # func(*, ctx, attr_0, attr_1, ..., input_0, input_1, ..., output_0, output_1, ...) -> typing.Dict[str, DistData] @@ -446,4 +588,4 @@ The brief steps to build a SecretFlow Component are: return {"train": DistData(), "test": DistData()} -5. Put your new component in ALL_COMPONENTS of `secretflow.component.entry `_ . \ No newline at end of file +5. Put your new component in ALL_COMPONENTS of `secretflow.component.entry `_ . diff --git a/docs/component/comp_list.md b/docs/component/comp_list.md index 49e39560f..621eb06f6 100644 --- a/docs/component/comp_list.md +++ b/docs/component/comp_list.md @@ -6,31 +6,47 @@ SecretFlow Component List ========================= -Last update: Mon Sep 18 16:30:27 2023 +Last update: Sat Oct 14 16:41:07 2023 Version: 0.0.1 First-party SecretFlow components. ## feature -### vert_woe_binning +### vert_bin_substitution Component version: 0.0.1 -Generate Weight of Evidence (WOE) binning rules for vertical partitioning datasets. +Substitute datasets' value by bin substitution rules. +#### Inputs + + +|Name|Description|Type(s)|Notes| +| :--- | :--- | :--- | :--- | +|input_data|Vertical partitioning dataset to be substituted.|['sf.table.vertical_table']|| +|bin_rule|Input bin substitution rule.|['sf.rule.binning']|| + +#### Outputs + + +|Name|Description|Type(s)|Notes| +| :--- | :--- | :--- | :--- | +|output_data|Output vertical table.|['sf.table.vertical_table']|| + +### vert_binning + + +Component version: 0.0.1 + +Generate equal frequency or equal range binning rules for vertical partitioning datasets. #### Attrs |Name|Description|Type|Required|Notes| | :--- | :--- | :--- | :--- | :--- | -|secure_device_type|Use SPU(Secure multi-party computation or MPC) or HEU(Homomorphic encryption or HE) to secure bucket summation.|String|N|Default: spu. Allowed: ['spu', 'heu'].| -|binning_method|How to bin features with numeric types: "quantile"(equal frequency)/"chimerge"(ChiMerge from AAAI92-019: https://www.aaai.org/Papers/AAAI/1992/AAAI92-019.pdf)|String|N|Default: quantile. Allowed: ['quantile', 'chimerge'].| +|binning_method|How to bin features with numeric types: "quantile"(equal frequency)/"eq_range"(equal range)|String|N|Default: eq_range. Allowed: ['eq_range', 'quantile'].| |bin_num|Max bin counts for one features.|Integer|N|Default: 10. Range: (0, $\infty$).| -|positive_label|Which value represent positive value in label.|String|N|Default: 1.| -|chimerge_init_bins|Max bin counts for initialization binning in ChiMerge.|Integer|N|Default: 100. Range: (2, $\infty$).| -|chimerge_target_bins|Stop merging if remaining bin counts is less than or equal to this value.|Integer|N|Default: 10. Range: [2, $\infty$).| -|chimerge_target_pvalue|Stop merging if biggest pvalue of remaining bins is greater than this value.|Float|N|Default: 0.1. Range: (0.0, 1.0].| #### Inputs @@ -44,28 +60,40 @@ Generate Weight of Evidence (WOE) binning rules for vertical partitioning datase |Name|Description|Type(s)|Notes| | :--- | :--- | :--- | :--- | -|woe_rule|Output WOE rule.|['sf.rule.woe_binning']|| +|bin_rule|Output bin rule.|['sf.rule.binning']|| -### vert_woe_substitution +### vert_woe_binning Component version: 0.0.1 -Substitute datasets' value by WOE substitution rules. +Generate Weight of Evidence (WOE) binning rules for vertical partitioning datasets. +#### Attrs + + +|Name|Description|Type|Required|Notes| +| :--- | :--- | :--- | :--- | :--- | +|secure_device_type|Use SPU(Secure multi-party computation or MPC) or HEU(Homomorphic encryption or HE) to secure bucket summation.|String|N|Default: spu. Allowed: ['spu', 'heu'].| +|binning_method|How to bin features with numeric types: "quantile"(equal frequency)/"chimerge"(ChiMerge from AAAI92-019: https://www.aaai.org/Papers/AAAI/1992/AAAI92-019.pdf)|String|N|Default: quantile. Allowed: ['quantile', 'chimerge'].| +|bin_num|Max bin counts for one features.|Integer|N|Default: 10. Range: (0, $\infty$).| +|positive_label|Which value represent positive value in label.|String|N|Default: 1.| +|chimerge_init_bins|Max bin counts for initialization binning in ChiMerge.|Integer|N|Default: 100. Range: (2, $\infty$).| +|chimerge_target_bins|Stop merging if remaining bin counts is less than or equal to this value.|Integer|N|Default: 10. Range: [2, $\infty$).| +|chimerge_target_pvalue|Stop merging if biggest pvalue of remaining bins is greater than this value.|Float|N|Default: 0.1. Range: (0.0, 1.0].| + #### Inputs |Name|Description|Type(s)|Notes| | :--- | :--- | :--- | :--- | -|input_data|Vertical partitioning dataset to be substituted.|['sf.table.vertical_table']|| -|woe_rule|Input WOE substitution rule.|['sf.rule.woe_binning']|| +|input_data|Input vertical table.|['sf.table.vertical_table']|Extra table attributes.(0) feature_selects - which features should be binned. Min column number to select(inclusive): 1. | #### Outputs |Name|Description|Type(s)|Notes| | :--- | :--- | :--- | :--- | -|output_data|Output vertical table.|['sf.table.vertical_table']|| +|bin_rule|Output WOE rule.|['sf.rule.binning']|| ## ml.eval @@ -491,6 +519,7 @@ PSI between two parties. |Name|Description|Type|Required|Notes| | :--- | :--- | :--- | :--- | :--- | |protocol|PSI protocol.|String|N|Default: ECDH_PSI_2PC. Allowed: ['ECDH_PSI_2PC', 'KKRT_PSI_2PC', 'BC22_PSI_2PC'].| +|sort|Sort the output.|Boolean|N|Default: False.| |bucket_size|Specify the hash bucket size used in PSI. Larger values consume more memory.|Integer|N|Default: 1048576. Range: (0, $\infty$).| |ecdh_curve_type|Curve type for ECDH PSI.|String|N|Default: CURVE_FOURQ. Allowed: ['CURVE_25519', 'CURVE_FOURQ', 'CURVE_SM2', 'CURVE_SECP256K1'].| diff --git a/docs/component/comp_spec.md b/docs/component/comp_spec.md index 603ec799e..0e0a8217b 100644 --- a/docs/component/comp_spec.md +++ b/docs/component/comp_spec.md @@ -1,4 +1,4 @@ -# Component Specification +# Extended Specification ## Table of Contents @@ -16,8 +16,6 @@ - [SFClusterDesc](#sfclusterdesc) - [SFClusterDesc.DeviceDesc](#sfclusterdescdevicedesc) - [SFClusterDesc.RayFedConfig](#sfclusterdescrayfedconfig) - - [StorageConfig](#storageconfig) - - [StorageConfig.LocalFSConfig](#storageconfiglocalfsconfig) @@ -30,65 +28,6 @@ - Messages - [DeviceObjectCollection](#deviceobjectcollection) - [DeviceObjectCollection.DeviceObject](#deviceobjectcollectiondeviceobject) - - [DistData](#distdata) - - [DistData.DataRef](#distdatadataref) - - [IndividualTable](#individualtable) - - [SystemInfo](#systeminfo) - - [TableSchema](#tableschema) - - [VerticalTable](#verticaltable) - - - - - - ### comp.proto - - - -- Messages - - [Attribute](#attribute) - - [AttributeDef](#attributedef) - - [AttributeDef.AtomicAttrDesc](#attributedefatomicattrdesc) - - [AttributeDef.UnionAttrGroupDesc](#attributedefunionattrgroupdesc) - - [CompListDef](#complistdef) - - [ComponentDef](#componentdef) - - [IoDef](#iodef) - - [IoDef.TableAttrDef](#iodeftableattrdef) - - - -- Enums - - [AttrType](#attrtype) - - - - - ### evaluation.proto - - - -- Messages - - [NodeEvalParam](#nodeevalparam) - - [NodeEvalResult](#nodeevalresult) - - - - - - ### report.proto - - - -- Messages - - [Descriptions](#descriptions) - - [Descriptions.Item](#descriptionsitem) - - [Div](#div) - - [Div.Child](#divchild) - - [Report](#report) - - [Tab](#tab) - - [Table](#table) - - [Table.HeaderItem](#tableheaderitem) - - [Table.Row](#tablerow) @@ -98,7 +37,7 @@ ## cluster.proto -Proto file: [secretflow/protos/component/cluster.proto](https://github.com/secretflow/secretflow/tree/main/secretflow/protos/component/cluster.proto) +Proto file: [secretflow/protos/secretflow/spec/extend/cluster.proto](https://github.com/secretflow/secretflow/tree/main/secretflow/protos/secretflow/spec/extend/cluster.proto) @@ -129,7 +68,6 @@ Private and unique to each party. | ----- | ---- | ----------- | | self_party | [ string](#string) | none | | ray_head_addr | [ string](#string) | none | -| storage_config | [ StorageConfig](#storageconfig) | none | @@ -216,6 +154,11 @@ We are going to formalize this part in future. } } ``` +Referrences: +SPU: +https://www.secretflow.org.cn/docs/spu/latest/en-US/reference/runtime_config#runtimeconfig +HEU: +https://www.secretflow.org.cn/docs/secretflow/latest/en-US/source/secretflow.device.device.device#secretflow.device.device.heu.HEU.__init__ | Field | Type | Description | @@ -237,31 +180,6 @@ We are going to formalize this part in future. | cross_silo_comm_backend | [ string](#string) | Indicates communication backend of RayFed. Accepted: 'grpc', 'brpc_link' Dafault is 'grpc' | - - -#### StorageConfig -A StorageConfig specifies the root for all data for one party. -- At this moment, only local_fs is supported. We would support OSS, database -in future. - - -| Field | Type | Description | -| ----- | ---- | ----------- | -| type | [ string](#string) | Supported: local_fs. | -| local_fs | [ StorageConfig.LocalFSConfig](#storageconfiglocalfsconfig) | local_fs config. | - - - - -#### StorageConfig.LocalFSConfig - - - -| Field | Type | Description | -| ----- | ---- | ----------- | -| wd | [ string](#string) | Working directory. | - - ### Enums @@ -270,7 +188,7 @@ in future. ## data.proto -Proto file: [secretflow/protos/component/data.proto](https://github.com/secretflow/secretflow/tree/main/secretflow/protos/component/data.proto) +Proto file: [secretflow/protos/secretflow/spec/extend/data.proto](https://github.com/secretflow/secretflow/tree/main/secretflow/protos/secretflow/spec/extend/data.proto) @@ -300,493 +218,6 @@ Descibes public storage info for a collection of Device Objects. | data_ref_idxs | [repeated int32](#int32) | Index of data_ref in the parent DistData message. | - - -#### DistData -A public record for a general distributed data. - -The type of this distributed data, should be meaningful to components. - -The concrete data format (include public and private parts) is defined by -other protos. - -Suggested names, i.e. -- sf.table.vertical_table represent a secretflow vertical table -- sf.model.* represent a secretflow models. -- sf.rule.* represent a secretflow rules. - - -| Field | Type | Description | -| ----- | ---- | ----------- | -| name | [ string](#string) | The name of this distributed data. | -| type | [ string](#string) | none | -| sys_info | [ SystemInfo](#systeminfo) | Describe the system information that used to generate this distributed data. | -| meta | [ google.protobuf.Any](#googleprotobufany) | Public information, known to all parties. i.e. VerticalTable | -| data_refs | [repeated DistData.DataRef](#distdatadataref) | Remote data references. | - - - - -#### DistData.DataRef -A reference to a data that is stored in the remote path. - - -| Field | Type | Description | -| ----- | ---- | ----------- | -| uri | [ string](#string) | The path information relative to StorageConfig of the party. | -| party | [ string](#string) | The owner party. | -| format | [ string](#string) | The storage format, i.e. csv. | - - - - -#### IndividualTable -IndividualTable describes a table owned by a single party. - - -| Field | Type | Description | -| ----- | ---- | ----------- | -| schema | [ TableSchema](#tableschema) | none | -| num_lines | [ int64](#int64) | If -1, the number is unknown. | - - - - -#### SystemInfo -Describe the application related to data. -- SCQL, GRM related meta information should be here. -- You can add more field here, when another application is added. - - -| Field | Type | Description | -| ----- | ---- | ----------- | -| app_name | [ string](#string) | The application name. Supported: `secretflow` | -| secretflow | [ SFClusterDesc](#sfclusterdesc) | For secretflow. | - - - - -#### TableSchema -The schema of a table. -- A col must be one of `id | feature | label`. By default, it should be a -feature. -- All names must match the regexp `[A-Za-z0-9.][A-Za-z0-9_>./]*`. -- All data type must be one of -* int8 -* int16 -* int32 -* int64 -* uint8 -* uint16 -* uint32 -* uint64 -* float16 -* float32 -* float64 -* bool -* int -* float -* str - - -| Field | Type | Description | -| ----- | ---- | ----------- | -| ids | [repeated string](#string) | Id column name(s). Optional, can be empty. | -| features | [repeated string](#string) | Feature column name(s). | -| labels | [repeated string](#string) | Label column name(s). Optional, can be empty. | -| id_types | [repeated string](#string) | Id column data type(s). Len(id) should match len(id_types). | -| feature_types | [repeated string](#string) | Feature column data type(s). Len(features) should match len(feature_types). | -| label_types | [repeated string](#string) | Label column data type(s). Len(labels) should match len(label_types). | - - - - -#### VerticalTable -VerticalTable describes a vertical virtual table from multiple parties. -> TODO: move this to secretflow/protos/builtin/ - -> Guide: if some type is only required to be handle inside a specific system, -for instance woe.rule file in engine, we don't need to define a new type -here. - - -| Field | Type | Description | -| ----- | ---- | ----------- | -| schemas | [repeated TableSchema](#tableschema) | The vertical partitioned slices' schema. Must match data_refs in the parent DistData message. | -| num_lines | [ int64](#int64) | If -1, the number is unknown. | - - - - -### Enums - - - - ## comp.proto - -Proto file: [secretflow/protos/component/comp.proto](https://github.com/secretflow/secretflow/tree/main/secretflow/protos/component/comp.proto) - - - - -### Messages - - -#### Attribute -The value of an attribute - - -| Field | Type | Description | -| ----- | ---- | ----------- | -| f | [ float](#float) | FLOAT | -| i64 | [ int64](#int64) | INT NOTE(junfeng): "is" is preserved by Python. Replaced with "i64". | -| s | [ string](#string) | STRING | -| b | [ bool](#bool) | BOOL | -| fs | [repeated float](#float) | FLOATS | -| i64s | [repeated int64](#int64) | INTS | -| ss | [repeated string](#string) | STRINGS | -| bs | [repeated bool](#bool) | BOOLS | -| is_na | [ bool](#bool) | Indicates the value is missing explicitly. | - - - - -#### AttributeDef -Describe an attribute. - - -| Field | Type | Description | -| ----- | ---- | ----------- | -| prefixes | [repeated string](#string) | Indicates the ancestors of a node, e.g. `[name_a, name_b, name_c]` means the path prefixes of current Attribute is `name_a/name_b/name_c/`. Only `^[a-zA-Z0-9_.-]*$` is allowed. `input` and `output` are reserved. | -| name | [ string](#string) | Must be unique in the same level just like Linux file systems. Only `^[a-zA-Z0-9_.-]*$` is allowed. `input` and `output` are reserved. | -| desc | [ string](#string) | none | -| type | [ AttrType](#attrtype) | none | -| atomic | [ AttributeDef.AtomicAttrDesc](#attributedefatomicattrdesc) | none | -| union | [ AttributeDef.UnionAttrGroupDesc](#attributedefunionattrgroupdesc) | none | - - - - -#### AttributeDef.AtomicAttrDesc -Extras for an atomic attribute. -Including: `AT_FLOAT | AT_INT | AT_STRING | AT_BOOL | AT_FLOATS | AT_INTS | -AT_STRINGS | AT_BOOLS`. - - -| Field | Type | Description | -| ----- | ---- | ----------- | -| list_min_length_inclusive | [ int64](#int64) | Only valid when type is `AT_FLOATS \| AT_INTS \| AT_STRINGS \| AT_BOOLS`. | -| list_max_length_inclusive | [ int64](#int64) | Only valid when type is `AT_FLOATS \| AT_INTS \| AT_STRINGS \| AT_BOOLS`. | -| is_optional | [ bool](#bool) | If True, when Atmoic Attr is not provided or is_na, default_value would be used. Else, Atmoic Attr must be provided. | -| default_value | [ Attribute](#attribute) | A reasonable default for this attribute if the user does not supply a value. | -| allowed_values | [ Attribute](#attribute) | Only valid when type is `AT_FLOAT \| AT_INT \| AT_STRING \| AT_FLOATS \| AT_INTS \| AT_STRINGS`. Please use list fields of AtomicParameter, i.e. `ss`, `i64s`, `fs`. If the attribute is a list, allowed_values is applied to each element. | -| has_lower_bound | [ bool](#bool) | Only valid when type is `AT_FLOAT \| AT_INT \| AT_FLOATS \| AT_INTS `. If the attribute is a list, lower_bound is applied to each element. | -| lower_bound | [ Attribute](#attribute) | none | -| lower_bound_inclusive | [ bool](#bool) | none | -| has_upper_bound | [ bool](#bool) | Only valid when type is `AT_FLOAT \| AT_INT \| AT_FLOATS \| AT_INTS `. If the attribute is a list, upper_bound is applied to each element. | -| upper_bound | [ Attribute](#attribute) | none | -| upper_bound_inclusive | [ bool](#bool) | none | - - - - -#### AttributeDef.UnionAttrGroupDesc -Extras for a union attribute group. - - -| Field | Type | Description | -| ----- | ---- | ----------- | -| default_selection | [ string](#string) | The default selected child. | - - - - -#### CompListDef -A list of components - - -| Field | Type | Description | -| ----- | ---- | ----------- | -| name | [ string](#string) | none | -| desc | [ string](#string) | none | -| version | [ string](#string) | none | -| comps | [repeated ComponentDef](#componentdef) | none | - - - - -#### ComponentDef -The definition of a comp. - - -| Field | Type | Description | -| ----- | ---- | ----------- | -| domain | [ string](#string) | Namespace of the comp. | -| name | [ string](#string) | Should be unique among all comps of the same domain. | -| desc | [ string](#string) | none | -| version | [ string](#string) | Version of the comp. | -| attrs | [repeated AttributeDef](#attributedef) | none | -| inputs | [repeated IoDef](#iodef) | none | -| outputs | [repeated IoDef](#iodef) | none | - - - - -#### IoDef -Define an input/output for component. - - -| Field | Type | Description | -| ----- | ---- | ----------- | -| name | [ string](#string) | should be unique among all IOs of the component. | -| desc | [ string](#string) | none | -| types | [repeated string](#string) | Must be one of DistData.type in data.proto | -| attrs | [repeated IoDef.TableAttrDef](#iodeftableattrdef) | Only valid for tables. The attribute path for a TableAttrDef is `{input\|output}/{IoDef name}/{TableAttrDef name}`. | - - - - -#### IoDef.TableAttrDef -An extra attribute for a table. - -If provided in a IoDef, e.g. -```json -{ - "name": "feature", - "types": [ - "int", - "float" - ], - "col_min_cnt_inclusive": 1, - "col_max_cnt": 3, - "attrs": [ - { - "name": "bucket_size", - "type": "AT_INT" - } - ] -} -``` -means after a user provide a table as IO, they should also specify -cols as "feature": -- col_min_cnt_inclusive is 1: At least 1 col to be selected. -- col_max_cnt_inclusive is 3: At most 3 cols to be selected. -And afterwards, user have to fill an int attribute called bucket_size for -each selected cols. - - -| Field | Type | Description | -| ----- | ---- | ----------- | -| name | [ string](#string) | Must be unique among all attributes for the table. | -| desc | [ string](#string) | none | -| types | [repeated string](#string) | Accepted col data types. Please check DistData.VerticalTable in data.proto. | -| col_min_cnt_inclusive | [ int64](#int64) | inclusive | -| col_max_cnt_inclusive | [ int64](#int64) | none | -| attrs | [repeated AttributeDef](#attributedef) | extra attribute for specified col. | - - - - -### Enums - - -#### AttrType -Supported attribute types. - -| Name | Number | Description | -| ---- | ------ | ----------- | -| AT_UNDEFINED | 0 | none | -| AT_FLOAT | 1 | FLOAT | -| AT_INT | 2 | INT | -| AT_STRING | 3 | STRING | -| AT_BOOL | 4 | BOOL | -| AT_FLOATS | 5 | FLOATS | -| AT_INTS | 6 | INTS | -| AT_STRINGS | 7 | STRINGS | -| AT_BOOLS | 8 | BOOLS | -| AT_STRUCT_GROUP | 9 | none | -| AT_UNION_GROUP | 10 | none | -| AT_SF_TABLE_COL | 11 | none | - - - - - - ## evaluation.proto - -Proto file: [secretflow/protos/component/evaluation.proto](https://github.com/secretflow/secretflow/tree/main/secretflow/protos/component/evaluation.proto) - - - - -### Messages - - -#### NodeEvalParam -Evaluate a node. -- comp.evaluate(NodeEvalParam, SFClusterConfig) -> NodeEvalResult - -NodeEvalParam contains all the information to evaluate a component. - - -| Field | Type | Description | -| ----- | ---- | ----------- | -| domain | [ string](#string) | Domain of the component. | -| name | [ string](#string) | Name of the component. | -| version | [ string](#string) | Version of the component. | -| attr_paths | [repeated string](#string) | The path of attributes. The attribute path for a TableAttrDef is `{input\|output}/{IoDef name}/{TableAttrDef name}`. | -| attrs | [repeated Attribute](#attribute) | The value of the attribute. Must match attr_paths. | -| inputs | [repeated DistData](#distdata) | The input data, the order of inputs must match inputs in ComponentDef. NOTE: Names of DistData doesn't need to match those of inputs in ComponentDef definition. | -| output_uris | [repeated string](#string) | The output data uris, the order of output_uris must match outputs in ComponentDef. | - - - - -#### NodeEvalResult -NodeEvalResult contains outputs of a component evaluation. - - -| Field | Type | Description | -| ----- | ---- | ----------- | -| outputs | [repeated DistData](#distdata) | Output data. | - - - - -### Enums - - - - ## report.proto - -Proto file: [secretflow/protos/component/report.proto](https://github.com/secretflow/secretflow/tree/main/secretflow/protos/component/report.proto) - - - - -### Messages - - -#### Descriptions -Displays multiple read-only fields in groups. - - -| Field | Type | Description | -| ----- | ---- | ----------- | -| name | [ string](#string) | Name of the Descriptions. | -| desc | [ string](#string) | none | -| items | [repeated Descriptions.Item](#descriptionsitem) | none | - - - - -#### Descriptions.Item - - - -| Field | Type | Description | -| ----- | ---- | ----------- | -| name | [ string](#string) | Name of the field. | -| desc | [ string](#string) | none | -| type | [ string](#string) | Must be one of bool/int/float/str | -| value | [ Attribute](#attribute) | none | - - - - -#### Div -A division or a section of a page. - - -| Field | Type | Description | -| ----- | ---- | ----------- | -| name | [ string](#string) | Name of the Div. | -| desc | [ string](#string) | none | -| children | [repeated Div.Child](#divchild) | none | - - - - -#### Div.Child - - - -| Field | Type | Description | -| ----- | ---- | ----------- | -| type | [ string](#string) | Supported: descriptions, table, div. | -| descriptions | [ Descriptions](#descriptions) | none | -| table | [ Table](#table) | none | -| div | [ Div](#div) | none | - - - - -#### Report - - - -| Field | Type | Description | -| ----- | ---- | ----------- | -| name | [ string](#string) | Name of the Report. | -| desc | [ string](#string) | none | -| tabs | [repeated Tab](#tab) | none | -| err_code | [ int32](#int32) | none | -| err_detail | [ string](#string) | Structed error detail (JSON encoded message). | - - - - -#### Tab -A page of a report. - - -| Field | Type | Description | -| ----- | ---- | ----------- | -| name | [ string](#string) | Name of the Tab. | -| desc | [ string](#string) | none | -| divs | [repeated Div](#div) | none | - - - - -#### Table -Displays rows of data. - - -| Field | Type | Description | -| ----- | ---- | ----------- | -| name | [ string](#string) | Name of the Table. | -| desc | [ string](#string) | none | -| headers | [repeated Table.HeaderItem](#tableheaderitem) | none | -| rows | [repeated Table.Row](#tablerow) | none | - - - - -#### Table.HeaderItem - - - -| Field | Type | Description | -| ----- | ---- | ----------- | -| name | [ string](#string) | none | -| desc | [ string](#string) | none | -| type | [ string](#string) | Must be one of bool/int/float/str | - - - - -#### Table.Row - - - -| Field | Type | Description | -| ----- | ---- | ----------- | -| name | [ string](#string) | none | -| desc | [ string](#string) | none | -| items | [repeated Attribute](#attribute) | none | - - ### Enums diff --git a/docs/component/comp_spec.tmpl b/docs/component/comp_spec.tmpl index 481922333..01dd69a22 100644 --- a/docs/component/comp_spec.tmpl +++ b/docs/component/comp_spec.tmpl @@ -1,9 +1,9 @@ -# Component Specification +# Extended Specification ## Table of Contents {{range .Files}} -{{$file_name := .Name}} ### {{.Name | replace "secretflow/protos/component/" ""}} +{{$file_name := .Name}} ### {{.Name | replace "secretflow/protos/secretflow/spec/extend/" ""}} {{if .HasServices}} - Services @@ -24,7 +24,7 @@ {{range .Files}} -{{$file_name := .Name}} ## {{.Name | replace "secretflow/protos/component/" ""}} +{{$file_name := .Name}} ## {{.Name | replace "secretflow/protos/secretflow/spec/extend/" ""}} Proto file: [{{.Name }}](https://github.com/secretflow/secretflow/tree/main/{{.Name }}) diff --git a/docs/component/comp_spec_design.rst b/docs/component/comp_spec_design.rst index d390cff88..b5e0d1df0 100644 --- a/docs/component/comp_spec_design.rst +++ b/docs/component/comp_spec_design.rst @@ -1,149 +1,29 @@ -Design of Component Specification -================================= - -Component Specification could be splitted into two parts: Core protocols including Data, Component and Node Evaluation Protocol and Ancillary protocols -including SecretFlow Cluster Configurations and SecretFlow Data. Core protocols apply to all applications integrated to -SecretFlow Ecosystem while Ancillary protocols are only for SecretFlow. - -Component Specification takes some assumptions from our point of view. Please read this document before -checking :doc:`/component/comp_spec`. - -.. note:: At this moment, we don't have an official protocol for Pipelines and Scheduling. - - -Data ----- - -Defined in data.proto. - -SystemInfo -^^^^^^^^^^ -For input data, **SystemInfo** describes the application and environment which could consume the data. -For output data, it bears such information. - - -DataRef -^^^^^^^ - -A **DataRef** is a pointer to a single file belongs to one party. **uri** is the relative path to storage root of its owner. -**DataRef** is public and open to all parties. Don't try to store any secret with uris. You need to protect the files pointed by uris instead. - - -DistData -^^^^^^^^ - -We call all the data consumed and generated by components as **DistData**, aka Distributed Data, which means it is consists multiple parts owned by -different parties. - -Don't confused with data partitioning in Database systems. In such systems, dividing a large dataset into several small partitions placed on different machines is quite -common. However, there are no owner enforcements on partitions, which means which machine is selected to place a partition is random. - -In a privacy-preserving system, some models and tables are splitted and partitions are owned by different parties. One partition of **DistData** is called **DataRef**. - -For simplicity, all data are DistDatas even it only consists one partition (even none at all). - -Besides **DataRef**, **DistData** also contains **meta** to store any extra public information for different types of **DistData**. -We also proposed some common types of **DistData** including Vertical tables and Individual tables. - -**DistData** is public and open to all parties, again don't store any secret with **DistData**. - - -Component ---------- - -Defined in comp.proto. - -AttributeDef -^^^^^^^^^^^^ - -We organize all attributes of a component as attribute trees. The leaves of the tree are called **Atomic Attributes**, -which represent solid fields for users to fill-in e.g. bucket size or learning rate. The non-leaf nodes of -the tree are called **Attribute Group**. There are two kind of Attribute Groups: - -* Struct Group: all children of the group need to fill-in together. -* Union Group: select one child of the group to fill-in. - -The child of an Attribute Group could be another Attribute Group. - -A **AttributeDef** represents a node of a component attribute tree. - -.. note:: - - **Attribute Groups** are advanced usage in Component Attribute declaration. Only a small part of audiences may utilize - this feature one day. You may check **Attribute Groups** when you need to organize groups of attributes. - -For **Atomic Attributes**, you should check **AtomicAttrDesc** to specify the types of attributes, default_value, allowed_values, ranges, etc. - -For **Union Group**, you should use **UnionAttrGroupDesc** to specify the default selection of children. - -If you are going to build a attribute tree, you may use **prefixes** to indicate the parent of the attribute tree node. - - -IoDef -^^^^^ - -IoDef is to specify the requirement of an input or output of the component. You should use **types** to declare accepted types -of **DistData**. - -For table **DistDatas**, we found many SecretFlow applications would require users to select columns and fill-in extra attributes for each selected -columns, this is satisfied by **TableAttrDef**. - -.. note:: - Again, you could leave **TableAttrDef** alone at this moment since it is unusual to use. - - - -ComponentDef -^^^^^^^^^^^^ - -You could use ComponentDef to define a component: - -- domain: namespace of component. -- name: should be unique among the domain. -- version: component version is separate from component list version. -- attrs, inputs and outputs. - -Only a tuple of domain, name and version could locate a component. - -CompListDef -^^^^^^^^^^^ - -A group of a components could be organized by a **CompListDef**. -SecretFlow Component List is an instance of a CompListDef. +Notes on Extended Specification +=============================== +Extended Specification are for SecretFlow only and not application to other privacy-preserving applications. +If you are looking for definition like DistData, NodeEvalParam, etc. They are part of `SecretFlow Open Specification `_. SecretFlow Cluster ------------------ Defined in cluster.proto. -.. note:: - SecretFlow Cluster protos are not application to other privacy-preserving applications. - - SFClusterDesc ^^^^^^^^^^^^^ SFClusterDesc stores intrinsic properties of a SecretFlow cluster, including: -- SecretFlow version -- Python version -- parties participated in computation -- Security configs of secret devices like protocol of SPU devices. +- sf_version: SecretFlow version +- py_version: Python version +- parties: Parties participated in computation +- devices: Security configs of secret devices like protocol of SPU devices. +- ray_fed_config: Backend selection of RayFed. The reason we regard SFClusterDesc as intrinsic properties is because SFClusterDesc is important to data security and integrity. -For example, a DistData generated by one cluster could only consumed by another cluster only -if their SFClusterDesc are compatible. - -SFClusterDesc is part of **SystemInfo**. - +For example, a DistData generated by one cluster could only consumed by another cluster only if their SFClusterDesc are compatible. -StorageConfig -^^^^^^^^^^^^^ - -**StorageConfig** specifies the storage root of a party. It could be a local file path, -a database table or an OSS bucket. SFClusterConfig @@ -154,89 +34,21 @@ public configs and private configs. **PublicConfig** should be revealed to all parties, including: -- Addresses for RayFed. Parties need this information to communicate to each other. -- Addresses for SPU devices. SPU Runtimes of all parties need to this information to connect. +- ray_fed_config: Addresses for RayFed. Parties need this information to communicate to each other. +- spu_configs: Addresses for SPU devices. SPU Runtimes of all parties need this information to connect. **PrivateConfig** is unique to each party, including: - self_party: who am I? - ray_head_addr: The address of Ray cluster. -- storage_config: storage root of the party. - -.. note:: - You may be surprised at storage_config is private since it seems fine to be public. - Well, SFClusterConfig is the minimum set of configs to start a SecretFlow cluster, so we just make storage_config private since it is unnecessary to broadcast everyone's storage root. SecretFlow Data Types --------------------- -Based on **DistData**, we also proposed some common types for SecretFlow applications. - -**IndividualTable**, **VerticalTable** and **DeviceObjectCollection** are defined in data.proto. -**Report** is defined in report.proto. - - -IndividualTable -^^^^^^^^^^^^^^^ - -**IndividualTable** is a table owned by one party, which means there is a single item in data_refs field of DistData. -**IndividualTable** should be packed into **meta** field of DistData which includes **schema** and **num_lines**. - - -VerticalTable -^^^^^^^^^^^^^ - -**VerticalTable** is a vertical partitioned table owned by multiple parties. **VerticalTable** contains multiple **schema**. -Correspondingly, there should be multiple data_refs in DistData. - +Defined in data.proto. DeviceObjectCollection ^^^^^^^^^^^^^^^^^^^^^^ -We use **DeviceObjectCollection** to an MPC models. We would provide more details for this part later. - - -Report -^^^^^^ - -Report is a special DistData which is totally public and doesn't own any data_ref. -We use a **Report** to reveal statistic outputs in most cases. - -Report related protos are: - -- Descriptions: Displays multiple read-only fields in groups. -- Table: Displays rows of data. -- Div: A division or a section of a page, consists of Descriptions, Tables or Divs. -- Tab: A page of a report, consists of Divs. -- Report: The top-level of a report, consists of Tabs. - - -Node Evalution --------------- - -A runtime instance of a component is called a Node. We use **NodeEvalParam** to -fill-in all required attributes and inputs and get outputs in **NodeEvalResult** from node. - -NodeEvalParam -^^^^^^^^^^^^^ - -It contains: - -- domain, name, version: To locate a component. -- attr_paths, attrs: Attributes of the component. -- inputs: Inputs of the component, should be DistDatas. -- output_uris: Output uris for each output. - -.. note:: - **Why only one uri for each output?** For each output, only one uri is provided. It will be used by - all parties to generate all data_refs of this output DistData. It looks weird since we may give each party - a different uri. However, this is not a good idea: - - - When we have multiple parties, the list of output uris would be extremely long. - - Each party has the full control of the storage root and they could move the files afterwards. We hope to keep our system simple and don't invest any effort in file system management. - -NodeEvalResult -^^^^^^^^^^^^^^ - -It contains output DistDatas. +We usually use **DeviceObjectCollection** to represent an MPC models. We would provide more details for this part later. diff --git a/docs/component/index.rst b/docs/component/index.rst index cab38f816..12f3f2297 100644 --- a/docs/component/index.rst +++ b/docs/component/index.rst @@ -3,32 +3,50 @@ Component ========= -.. Note:: Both Component Specification and SecretFlow Component List are subject to modify at this moment. +SecretFlow Component is based on two Specifications. On the one hand, we obey `SecretFlow Open Specification `_. On the other, we defined cluster config and +some other data types with Extended Specification at :doc:`/component/comp_spec`. Extended Specification only applies on SecretFlow and may not used in other privacy-preserving applications in SecretFlow ecosystem. +The detailed explanation of Extended Specification is at :doc:`/component/comp_spec_design`. -Component Specification is a protocol brought by SecretFlow Ecosystem. It mainly includes Data, Component and Node Evaluation protocol. -Logically, if you wrap any program with Component Specification, it will be recognized by Kuscia(Deployment and scheduling of SecretFlow stack) and -SecretFlow Platform [#f1]_ (User Interface and Control Panel) . In the one word, for system developers, if you wrap your application with Component Specification, -you could utilize other open-source SecretFlow projects with little effort. +We wrapped some commonly used SecretFlow applications as SecretFlow Component List. However, it is not final and we are updating the list promptly. The full SecretFlow Component List is available at :doc:`/component/comp_list`. -Based on Component Specification, we wrapped some commonly used SecretFlow applications. The SecretFlow Component List is not final, -we are updating the list promptly. Now, you could run SecretFlow applications with component API or CLI apart from writing Python programs. +Now, you could run SecretFlow applications with component API or CLI apart from writing Python programs. Please check :doc:`/component/comp_guide`. -Besides SecretFlow, we are going to wrap other applications including SCQL and TECC with Component Specification. -For developers who are not familiar with :doc:`/component/comp_spec`, they are encouraged to check :doc:`/component/comp_spec_design` first. -After that, please check :doc:`/component/comp_list` and corresponding guides at :doc:`/component/comp_guide`. +Migration to `SecretFlow Open Specification `_ +----------------------------------------------------------------------------------------------- +There are some breaking changes after introduction of **SecretFlow Open Specification**, including -.. rubric:: Footnotes +comp.proto +^^^^^^^^^^ + +1. *comp.proto* is renamed to *component.proto* +2. In message *AttrType*, *AT_UNDEFINED* is replaced with *ATTR_TYPE_UNSPECIFIED* +3. In message *Attribute*, *has_lower_bound* is renamed to *lower_bound_enabled* and *has_upper_bound* is renamed to *upper_bound_enabled* +4. In message *IoDef.TableAttrDef*, *attrs* is renamed to *extra_attrs* -.. [#f1] Product name is undecided. +data.proto +^^^^^^^^^^ +1. In message *SystemInfo*, *app_name* is renamed to *app* while *secretflow* (SFClusterDesc) is replaced with *app_meta* (Any). +2. Message *StorageConfig* is moved from *cluster.proto* +3. In message *IndividualTable* and *VerticalTable*, *num_lines* is renamed to *line_count*. +4. In message *DistData*, *sys_info* is renamed to *system_info*. +We are sorry for inconvenience. Announcement ------------ +October, 2023 +^^^^^^^^^^^^^ + +1. We officially launched `SecretFlow Open Specification `_ which contains the most common parts which are shared by other privacy-preserving applications. +2. The remaining part are called Extended Specification. + + + July, 2023 ^^^^^^^^^^ diff --git a/docs/component/update_comp_list.py b/docs/component/update_comp_list.py index eeb6c29fe..3c5eae130 100644 --- a/docs/component/update_comp_list.py +++ b/docs/component/update_comp_list.py @@ -15,7 +15,7 @@ import os from secretflow.component.entry import COMP_LIST -from secretflow.protos.component.comp_pb2 import AttrType, Attribute +from secretflow.spec.v1.component_pb2 import AttrType, Attribute from mdutils.mdutils import MdUtils import datetime @@ -33,7 +33,7 @@ mdFile.new_paragraph(COMP_LIST.desc) AttrTypeStrMap = { - AttrType.AT_UNDEFINED: 'Undefined', + AttrType.ATTR_TYPE_UNSPECIFIED: 'Undefined', AttrType.AT_FLOAT: 'Float', AttrType.AT_INT: 'Integer', AttrType.AT_STRING: 'String', @@ -82,24 +82,24 @@ def get_allowed_atomic_attr_value(at: AttrType, attr: Attribute): def get_bound( at: AttrType, - has_lower_bound: bool, + lower_bound_enabled: bool, lower_bound: Attribute, lower_bound_inclusive: bool, - has_upper_bound: bool, + upper_bound_enabled: bool, upper_bound: Attribute, upper_bound_inclusive: bool, ): if at in [AttrType.AT_FLOAT, AttrType.AT_FLOATS, AttrType.AT_INT, AttrType.AT_INTS]: - if has_lower_bound or has_upper_bound: + if lower_bound_enabled or upper_bound_enabled: ret = '' - if has_lower_bound: + if lower_bound_enabled: ret += '[' if lower_bound_inclusive else '(' ret += str(get_atomic_attr_value(at, lower_bound)) ret += ', ' else: ret += '($-\infty$, ' - if has_upper_bound: + if upper_bound_enabled: ret += str(get_atomic_attr_value(at, upper_bound)) ret += ']' if upper_bound_inclusive else ')' else: @@ -129,8 +129,10 @@ def parse_comp_io(md, io_defs): if attr.col_max_cnt_inclusive > 0: notes_str += f'Max column number to select(inclusive): {attr.col_max_cnt_inclusive}. ' - if len(attr.attrs): - raise NotImplementedError('todo: parse attrs of TableAttrDef.') + if len(attr.extra_attrs): + raise NotImplementedError( + 'todo: parse extra_attrs of TableAttrDef.' + ) io_table_text.extend( [io_def.name, io_def.desc, str(list(io_def.types)), notes_str] @@ -217,10 +219,10 @@ def parse_comp_io(md, io_defs): bound = get_bound( attr.type, - attr.atomic.has_lower_bound, + attr.atomic.lower_bound_enabled, attr.atomic.lower_bound, attr.atomic.lower_bound_inclusive, - attr.atomic.has_upper_bound, + attr.atomic.upper_bound_enabled, attr.atomic.upper_bound, attr.atomic.upper_bound_inclusive, ) diff --git a/docs/getting_started/deployment.md b/docs/getting_started/deployment.md index 587b04f29..b0a2570b2 100644 --- a/docs/getting_started/deployment.md +++ b/docs/getting_started/deployment.md @@ -6,8 +6,8 @@ SecretFlow uses Ray as its distributed framework. A Ray cluster consists of a he ## Deploy based on Kuscia -Kuscia is a K8s-based privacy computing task orchestration framework. -It provides a unified privacy computing foundation that can abstract away heterogeneous infrastructure and protocols. +Kuscia is a lightweight privacy-preserving computing task orchestration framework based on K3s. +It provides a unified privacy-preserving computing foundation that can abstract away heterogeneous infrastructure and protocols. With Kuscia, you can easily manage and execute SecretFlow jobs through kubectl commands or apis without paying attention to the details of SecretFlow networking. In addition, Kuscia supports communication security and running SecretFlow jobs concurrently. diff --git a/docs/locales/zh_CN/LC_MESSAGES/component/comp_guide.po b/docs/locales/zh_CN/LC_MESSAGES/component/comp_guide.po index f3727348a..129112a1c 100644 --- a/docs/locales/zh_CN/LC_MESSAGES/component/comp_guide.po +++ b/docs/locales/zh_CN/LC_MESSAGES/component/comp_guide.po @@ -7,7 +7,7 @@ msgid "" msgstr "" "Project-Id-Version: SecretFlow \n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2023-07-02 23:05+0800\n" +"POT-Creation-Date: 2023-10-14 16:59+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -24,8 +24,8 @@ msgstr "SecretFlow 组件指南" msgid "Get the Component List" msgstr "获取组件列表" -#: ../../component/comp_guide.rst:8 ../../component/comp_guide.rst:186 -#: ../../component/comp_guide.rst:338 +#: ../../component/comp_guide.rst:8 ../../component/comp_guide.rst:184 +#: ../../component/comp_guide.rst:480 msgid "Python API" msgstr "" @@ -37,7 +37,7 @@ msgstr "可以通过下列方式检查 SecretFlow 组件列表:" msgid "**COMP_LIST** is a CompListDef instance." msgstr "**COMP_LIST** 是 CompListDef 实例。" -#: ../../component/comp_guide.rst:19 ../../component/comp_guide.rst:322 +#: ../../component/comp_guide.rst:19 ../../component/comp_guide.rst:459 msgid "CLI" msgstr "" @@ -59,87 +59,131 @@ msgid "" "**domain/name:version**." msgstr "您必须用以下格式指定一个组件:domain/name:version。" -#: ../../component/comp_guide.rst:161 +#: ../../component/comp_guide.rst:59 +msgid "e.g. Let's check the component definition of PSI." +msgstr "比如我们开查看 PSI 的组件定义。" + +#: ../../component/comp_guide.rst:163 msgid "You could inspect all components at once by" msgstr "您可以通过以下方式一次性检查所有组件:" -#: ../../component/comp_guide.rst:168 +#: ../../component/comp_guide.rst:170 msgid "You may save the list to file by:" msgstr "您可以通过以下方式将列表保存到文件:" -#: ../../component/comp_guide.rst:179 +#: ../../component/comp_guide.rst:181 msgid "Evaluate a Node" msgstr "执行节点" -#: ../../component/comp_guide.rst:181 +#: ../../component/comp_guide.rst:186 msgid "" -"You should use **secretflow.component.entry.comp_eval** to evaluate a " -"node." -msgstr "您应该使用 secretflow.component.entry.comp_eval 来执行节点。" +"In the following examples, we would demonstrate how to evaluate a node " +"with Python API." +msgstr "在下面的例子中,我们展示如何用 Python API 来执行节点。" + +#: ../../component/comp_guide.rst:188 +msgid "We are going to test PSI component with tiny datasets." +msgstr "我们将用一个极小的数据集来检测 PSI 组件。" + +#: ../../component/comp_guide.rst:190 +msgid "Save the following bash script as *generate_csv.sh*" +msgstr "将以下 bash 脚本保存为 *generate_csv.sh* 。" + +#: ../../component/comp_guide.rst:247 +msgid "Then generate input for two parties." +msgstr "然后为两方产生输入。" + +#: ../../component/comp_guide.rst:258 +msgid "Save the following Python code as *psi_demo.py*" +msgstr "将以下 Python 代码保存为 *psi_demo.py* 。" + +#: ../../component/comp_guide.rst:422 +msgid "In two separate terminals, run" +msgstr "在两个终端中执行" -#: ../../component/comp_guide.rst:183 +#: ../../component/comp_guide.rst:430 +msgid "You should see the following output at both terminals:" +msgstr "你应该在两个终端中看到以下输出:" + +#: ../../component/comp_guide.rst:456 msgid "" -"The following code demonstrate how to use this API and could not be run " -"directly." -msgstr "以下代码演示了如何使用此API,但不能直接运行。" +"Check result at */tmp/alice/output.csv* and */tmp/bob/output.csv*. The " +"content of two files should be same except the header." +msgstr "在 */tmp/alice/output.csv* 和 */tmp/bob/output.csv* 中检查结果。" +"两个文件的内容除了表头应该是一致的。" + +#: ../../component/comp_guide.rst:461 +msgid "You could also use SecretFlow CLI to evaluate a node." +msgstr "您也可以使用 SecretFlow CLI 来执行节点。" -#: ../../component/comp_guide.rst:329 +#: ../../component/comp_guide.rst:468 msgid "log_file: log file path." msgstr "log_file: 日志文件路径" -#: ../../component/comp_guide.rst:330 +#: ../../component/comp_guide.rst:469 msgid "result_file: result file path." msgstr "result_file: 结果文件路径。" -#: ../../component/comp_guide.rst:331 +#: ../../component/comp_guide.rst:470 msgid "eval_param: base64-encoded NodeEvalParam prototext." -msgstr "eval_param: base64编码的NodeEvalParam prototext。" +msgstr "eval_param: base64编码的 NodeEvalParam prototext。" -#: ../../component/comp_guide.rst:332 +#: ../../component/comp_guide.rst:471 +msgid "storage: base64-encoded StorageConfig prototext." +msgstr "storage: Base64编码的StorageConfig prototext。" + +#: ../../component/comp_guide.rst:472 msgid "cluster: base64-encoded SFClusterConfig prototext." -msgstr "cluster: Base64编码的SFClusterConfig prototext。" +msgstr "cluster: Base64编码的 SFClusterConfig prototext。" + +#: ../../component/comp_guide.rst:474 +msgid "" +"Since you need to encode prototext to use CLI, we don't expect you to use" +" SecretFlow CLI for node evaluation." +msgstr "因为你必须将 prototext 编码之后才能使用 CLI , 因此我们不推荐你用 CLI 的方式做节点执行。" -#: ../../component/comp_guide.rst:335 +#: ../../component/comp_guide.rst:477 msgid "Create a Component" msgstr "创建组件" -#: ../../component/comp_guide.rst:340 +#: ../../component/comp_guide.rst:482 msgid "" "If you want to create a new component in SecretFlow, you may check one of" " simplest component: " "`secretflow/component/preprocessing/train_test_split.py " "`_" -msgstr "如果你想要在 SecretFlow 中创建一个新的组件,你可以参考" -" `_ " -"这个是最简单的组件。" +msgstr "" +"如果你想要在 SecretFlow 中创建一个新的组件,你可以参考 `secretflow/component/preprocessing/train_test_split.py `_ 。" +"这几乎是最简单的组件。" -#: ../../component/comp_guide.rst:343 +#: ../../component/comp_guide.rst:485 msgid "The brief steps to build a SecretFlow Component are:" msgstr "构建一个 SecretFlow 组件的简要步骤如下:" -#: ../../component/comp_guide.rst:345 +#: ../../component/comp_guide.rst:487 msgid "Create a new file under **secretflow/component/** ." msgstr "在 secretflow/component/ 目录下创建一个新的文件。" -#: ../../component/comp_guide.rst:347 +#: ../../component/comp_guide.rst:489 msgid "" "Create a Component class with " "**secretflow.component.component.Component**:" msgstr "使用 **secretflow.component.component.Component** 创建一个组件类:" -#: ../../component/comp_guide.rst:362 +#: ../../component/comp_guide.rst:504 msgid "Declare attributes and IO." msgstr "定义属性和输入输出。" -#: ../../component/comp_guide.rst:429 +#: ../../component/comp_guide.rst:571 msgid "Declare evaluation function." msgstr "定义执行函数。" -#: ../../component/comp_guide.rst:449 +#: ../../component/comp_guide.rst:591 msgid "" "Put your new component in ALL_COMPONENTS of `secretflow.component.entry " "`_" " ." -msgstr "将你的新组件加入到" -" `secretflow.component.entry `_ " -"的 ALL_COMPONENTS 中。" +msgstr "" +"将你的新组件加入到 `secretflow.component.entry " +"`_" +" 的 ALL_COMPONENTS 中。" diff --git a/docs/locales/zh_CN/LC_MESSAGES/component/comp_list.po b/docs/locales/zh_CN/LC_MESSAGES/component/comp_list.po index 16cf7cf84..7d16a17b5 100644 --- a/docs/locales/zh_CN/LC_MESSAGES/component/comp_list.po +++ b/docs/locales/zh_CN/LC_MESSAGES/component/comp_list.po @@ -7,7 +7,7 @@ msgid "" msgstr "" "Project-Id-Version: SecretFlow \n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2023-08-16 15:21+0800\n" +"POT-Creation-Date: 2023-10-14 16:59+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -18,10 +18,10 @@ msgstr "" #: ../../component/comp_list.md:5 msgid "SecretFlow Component List" -msgstr "SecretFlow组件列表" +msgstr "SecretFlow 组件列表" #: ../../component/comp_list.md:9 -msgid "Last update: Wed Aug 16 15:20:41 2023" +msgid "Last update: Sat Oct 14 16:41:07 2023" msgstr "" #: ../../component/comp_list.md:11 @@ -37,37 +37,38 @@ msgid "feature" msgstr "" #: ../../component/comp_list.md:16 -msgid "vert_woe_binning" -msgstr "" - -#: ../../component/comp_list.md:19 ../../component/comp_list.md:52 -#: ../../component/comp_list.md:75 ../../component/comp_list.md:110 -#: ../../component/comp_list.md:140 ../../component/comp_list.md:165 -#: ../../component/comp_list.md:196 ../../component/comp_list.md:228 -#: ../../component/comp_list.md:260 ../../component/comp_list.md:293 -#: ../../component/comp_list.md:345 ../../component/comp_list.md:388 -#: ../../component/comp_list.md:424 ../../component/comp_list.md:465 -#: ../../component/comp_list.md:485 ../../component/comp_list.md:515 -#: ../../component/comp_list.md:549 ../../component/comp_list.md:571 -#: ../../component/comp_list.md:593 +msgid "vert_bin_substitution" +msgstr "" + +#: ../../component/comp_list.md:19 ../../component/comp_list.md:40 +#: ../../component/comp_list.md:68 ../../component/comp_list.md:103 +#: ../../component/comp_list.md:138 ../../component/comp_list.md:168 +#: ../../component/comp_list.md:193 ../../component/comp_list.md:224 +#: ../../component/comp_list.md:256 ../../component/comp_list.md:288 +#: ../../component/comp_list.md:321 ../../component/comp_list.md:373 +#: ../../component/comp_list.md:416 ../../component/comp_list.md:452 +#: ../../component/comp_list.md:493 ../../component/comp_list.md:513 +#: ../../component/comp_list.md:544 ../../component/comp_list.md:578 +#: ../../component/comp_list.md:600 ../../component/comp_list.md:622 msgid "Component version: 0.0.1" msgstr "组件版本:0.0.1" #: ../../component/comp_list.md:21 -msgid "" -"Generate Weight of Evidence (WOE) binning rules for vertical partitioning" -" datasets." -msgstr "为垂直分割数据集生成权重信息(WOE)分箱规则。" - -#: ../../component/comp_list.md:22 ../../component/comp_list.md:84 -#: ../../component/comp_list.md:113 ../../component/comp_list.md:168 -#: ../../component/comp_list.md:199 ../../component/comp_list.md:231 -#: ../../component/comp_list.md:263 ../../component/comp_list.md:299 -#: ../../component/comp_list.md:351 ../../component/comp_list.md:393 -#: ../../component/comp_list.md:430 ../../component/comp_list.md:488 -#: ../../component/comp_list.md:519 -msgid "Attrs" -msgstr "参数" +msgid "Substitute datasets' value by bin substitution rules." +msgstr "使用分箱规则替代数据集的值。" + +#: ../../component/comp_list.md:22 ../../component/comp_list.md:51 +#: ../../component/comp_list.md:84 ../../component/comp_list.md:120 +#: ../../component/comp_list.md:150 ../../component/comp_list.md:173 +#: ../../component/comp_list.md:206 ../../component/comp_list.md:238 +#: ../../component/comp_list.md:270 ../../component/comp_list.md:301 +#: ../../component/comp_list.md:356 ../../component/comp_list.md:399 +#: ../../component/comp_list.md:435 ../../component/comp_list.md:474 +#: ../../component/comp_list.md:496 ../../component/comp_list.md:526 +#: ../../component/comp_list.md:558 ../../component/comp_list.md:583 +#: ../../component/comp_list.md:605 ../../component/comp_list.md:653 +msgid "Inputs" +msgstr "输入" #: ../../component/comp_list.md msgid "Name" @@ -78,38 +79,85 @@ msgid "Description" msgstr "描述" #: ../../component/comp_list.md -msgid "Type" +msgid "Type(s)" msgstr "类型" -#: ../../component/comp_list.md -msgid "Required" -msgstr "必填" - #: ../../component/comp_list.md msgid "Notes" msgstr "注释" #: ../../component/comp_list.md -msgid "secure_device_type" +msgid "input_data" msgstr "" #: ../../component/comp_list.md -msgid "" -"Use SPU(Secure multi-party computation or MPC) or HEU(Homomorphic " -"encryption or HE) to secure bucket summation." -msgstr "使用 SPU(安全多方计算, MPC)或 HEU(同态加密, HE)来保护桶求和。" +msgid "Vertical partitioning dataset to be substituted." +msgstr "需要进行替代的垂直分区数据集。" #: ../../component/comp_list.md -msgid "String" +msgid "['sf.table.vertical_table']" msgstr "" #: ../../component/comp_list.md -msgid "N" +msgid "bin_rule" msgstr "" #: ../../component/comp_list.md -msgid "Default: spu. Allowed: ['spu', 'heu']." -msgstr "默认:spu。可选:['spu','heu']。" +msgid "Input bin substitution rule." +msgstr "输入分箱规则" + +#: ../../component/comp_list.md +msgid "['sf.rule.binning']" +msgstr "" + +#: ../../component/comp_list.md:30 ../../component/comp_list.md:58 +#: ../../component/comp_list.md:91 ../../component/comp_list.md:128 +#: ../../component/comp_list.md:158 ../../component/comp_list.md:181 +#: ../../component/comp_list.md:214 ../../component/comp_list.md:246 +#: ../../component/comp_list.md:278 ../../component/comp_list.md:309 +#: ../../component/comp_list.md:363 ../../component/comp_list.md:406 +#: ../../component/comp_list.md:442 ../../component/comp_list.md:481 +#: ../../component/comp_list.md:503 ../../component/comp_list.md:534 +#: ../../component/comp_list.md:565 ../../component/comp_list.md:590 +#: ../../component/comp_list.md:612 ../../component/comp_list.md:660 +msgid "Outputs" +msgstr "输出" + +#: ../../component/comp_list.md +msgid "output_data" +msgstr "" + +#: ../../component/comp_list.md +msgid "Output vertical table." +msgstr "输出联合表" + +#: ../../component/comp_list.md:37 +msgid "vert_binning" +msgstr "" + +#: ../../component/comp_list.md:42 +msgid "" +"Generate equal frequency or equal range binning rules for vertical " +"partitioning datasets." +msgstr "为垂直分割数据集生成分箱规则。" + +#: ../../component/comp_list.md:43 ../../component/comp_list.md:71 +#: ../../component/comp_list.md:112 ../../component/comp_list.md:141 +#: ../../component/comp_list.md:196 ../../component/comp_list.md:227 +#: ../../component/comp_list.md:259 ../../component/comp_list.md:291 +#: ../../component/comp_list.md:327 ../../component/comp_list.md:379 +#: ../../component/comp_list.md:421 ../../component/comp_list.md:458 +#: ../../component/comp_list.md:516 ../../component/comp_list.md:548 +msgid "Attrs" +msgstr "参数" + +#: ../../component/comp_list.md +msgid "Type" +msgstr "类型" + +#: ../../component/comp_list.md +msgid "Required" +msgstr "必填" #: ../../component/comp_list.md msgid "binning_method" @@ -118,15 +166,20 @@ msgstr "" #: ../../component/comp_list.md msgid "" "How to bin features with numeric types: \"quantile\"(equal " -"frequency)/\"chimerge\"(ChiMerge from AAAI92-019: " -"https://www.aaai.org/Papers/AAAI/1992/AAAI92-019.pdf)" +"frequency)/\"eq_range\"(equal range)" +msgstr "如何对数值型特征进行分箱:\"quantile\"(等频), \"eq_range\" (等距)" + +#: ../../component/comp_list.md +msgid "String" msgstr "" -"如何对数值型特征进行分箱:\"quantile\"(等频), " -"\"chimerge\"(来自AAAI92-019的ChiMerge算法:https://www.aaai.org/Papers/AAAI/1992/AAAI92-019.pdf)" #: ../../component/comp_list.md -msgid "Default: quantile. Allowed: ['quantile', 'chimerge']." -msgstr "默认:quantile。可选:['quantile','chimerge']。" +msgid "N" +msgstr "" + +#: ../../component/comp_list.md +msgid "Default: eq_range. Allowed: ['eq_range', 'quantile']." +msgstr "默认:eq_range。可选:['eq_range', 'quantile']。" #: ../../component/comp_list.md msgid "bin_num" @@ -144,6 +197,57 @@ msgstr "" msgid "Default: 10. Range: (0, $\\infty$)." msgstr "默认: 10. 范围: (0, $\\infty$)." +#: ../../component/comp_list.md +msgid "Input vertical table." +msgstr "输入联合表" + +#: ../../component/comp_list.md +msgid "" +"Extra table attributes.(0) feature_selects - which features should be " +"binned. Min column number to select(inclusive): 1." +msgstr "额外的表属性。 (0) feature_selects - 应该对哪些特征进行分箱。要选择的最小数量(包括在内):1。" + +#: ../../component/comp_list.md +msgid "Output bin rule." +msgstr "输出分桶规则" + +#: ../../component/comp_list.md:65 +msgid "vert_woe_binning" +msgstr "" + +#: ../../component/comp_list.md:70 +msgid "" +"Generate Weight of Evidence (WOE) binning rules for vertical partitioning" +" datasets." +msgstr "为垂直分割数据集生成权重信息(WOE)分箱规则。" + +#: ../../component/comp_list.md +msgid "secure_device_type" +msgstr "" + +#: ../../component/comp_list.md +msgid "" +"Use SPU(Secure multi-party computation or MPC) or HEU(Homomorphic " +"encryption or HE) to secure bucket summation." +msgstr "使用 SPU(安全多方计算, MPC)或 HEU(同态加密, HE)来保护桶求和。" + +#: ../../component/comp_list.md +msgid "Default: spu. Allowed: ['spu', 'heu']." +msgstr "默认:spu。可选:['spu','heu']。" + +#: ../../component/comp_list.md +msgid "" +"How to bin features with numeric types: \"quantile\"(equal " +"frequency)/\"chimerge\"(ChiMerge from AAAI92-019: " +"https://www.aaai.org/Papers/AAAI/1992/AAAI92-019.pdf)" +msgstr "" +"如何对数值型特征进行分箱:\"quantile\"(等频), " +"\"chimerge\"(来自AAAI92-019的ChiMerge算法: https://www.aaai.org/Papers/AAAI/1992/AAAI92-019.pdf )" + +#: ../../component/comp_list.md +msgid "Default: quantile. Allowed: ['quantile', 'chimerge']." +msgstr "默认:quantile。可选:['quantile','chimerge']。" + #: ../../component/comp_list.md msgid "positive_label" msgstr "" @@ -198,119 +302,39 @@ msgstr "" msgid "Default: 0.1. Range: (0.0, 1.0]." msgstr "默认: 0.1. 范围: (0.0, 1.0]." -#: ../../component/comp_list.md:35 ../../component/comp_list.md:55 -#: ../../component/comp_list.md:92 ../../component/comp_list.md:122 -#: ../../component/comp_list.md:145 ../../component/comp_list.md:178 -#: ../../component/comp_list.md:210 ../../component/comp_list.md:242 -#: ../../component/comp_list.md:273 ../../component/comp_list.md:328 -#: ../../component/comp_list.md:371 ../../component/comp_list.md:407 -#: ../../component/comp_list.md:446 ../../component/comp_list.md:468 -#: ../../component/comp_list.md:497 ../../component/comp_list.md:529 -#: ../../component/comp_list.md:554 ../../component/comp_list.md:576 -#: ../../component/comp_list.md:624 -msgid "Inputs" -msgstr "输入" - -#: ../../component/comp_list.md -msgid "Type(s)" -msgstr "类型" - -#: ../../component/comp_list.md -msgid "input_data" -msgstr "" - -#: ../../component/comp_list.md -msgid "Input vertical table." -msgstr "输入联合表" - -#: ../../component/comp_list.md -msgid "['sf.table.vertical_table']" -msgstr "" - -#: ../../component/comp_list.md -msgid "" -"Extra table attributes.(0) feature_selects - which features should be " -"binned. Min column number to select(inclusive): 1." -msgstr "额外的表属性。 (0) feature_selects - 应该对哪些特征进行分箱。要选择的最小数量(包括在内):1。" - -#: ../../component/comp_list.md:42 ../../component/comp_list.md:63 -#: ../../component/comp_list.md:100 ../../component/comp_list.md:130 -#: ../../component/comp_list.md:153 ../../component/comp_list.md:186 -#: ../../component/comp_list.md:218 ../../component/comp_list.md:250 -#: ../../component/comp_list.md:281 ../../component/comp_list.md:335 -#: ../../component/comp_list.md:378 ../../component/comp_list.md:414 -#: ../../component/comp_list.md:453 ../../component/comp_list.md:475 -#: ../../component/comp_list.md:505 ../../component/comp_list.md:536 -#: ../../component/comp_list.md:561 ../../component/comp_list.md:583 -#: ../../component/comp_list.md:631 -msgid "Outputs" -msgstr "输出" - -#: ../../component/comp_list.md -msgid "woe_rule" -msgstr "" - #: ../../component/comp_list.md msgid "Output WOE rule." msgstr "输出WOE规则" -#: ../../component/comp_list.md -msgid "['sf.rule.woe_binning']" -msgstr "" - -#: ../../component/comp_list.md:49 -msgid "vert_woe_substitution" -msgstr "" - -#: ../../component/comp_list.md:54 -msgid "Substitute datasets' value by WOE substitution rules." -msgstr "使用WOE替代规则替代数据集的值。" - -#: ../../component/comp_list.md -msgid "Vertical partitioning dataset to be substituted." -msgstr "需要进行替代的垂直分区数据集。" - -#: ../../component/comp_list.md -msgid "Input WOE substitution rule." -msgstr "输入WOE分箱规则" - -#: ../../component/comp_list.md -msgid "output_data" -msgstr "" - -#: ../../component/comp_list.md -msgid "Output vertical table." -msgstr "输出联合表" - -#: ../../component/comp_list.md:70 +#: ../../component/comp_list.md:98 msgid "ml.eval" msgstr "" -#: ../../component/comp_list.md:72 +#: ../../component/comp_list.md:100 msgid "biclassification_eval" msgstr "" -#: ../../component/comp_list.md:77 +#: ../../component/comp_list.md:105 msgid "Statistics evaluation for a bi-classification model on a dataset." msgstr "数据集上二分类模型的统计评估。" -#: ../../component/comp_list.md:78 +#: ../../component/comp_list.md:106 msgid "summary_report: SummaryReport" msgstr "" -#: ../../component/comp_list.md:79 +#: ../../component/comp_list.md:107 msgid "group_reports: List[GroupReport]" msgstr "" -#: ../../component/comp_list.md:80 +#: ../../component/comp_list.md:108 msgid "eq_frequent_bin_report: List[EqBinReport]" msgstr "" -#: ../../component/comp_list.md:81 +#: ../../component/comp_list.md:109 msgid "eq_range_bin_report: List[EqBinReport]" msgstr "" -#: ../../component/comp_list.md:82 +#: ../../component/comp_list.md:110 msgid "" "head_report: List[PrReport] reports for fpr = 0.001, 0.005, 0.01, 0.05, " "0.1, 0.2" @@ -382,11 +406,11 @@ msgstr "输出报告" msgid "['sf.report']" msgstr "" -#: ../../component/comp_list.md:107 +#: ../../component/comp_list.md:135 msgid "prediction_bias_eval" msgstr "" -#: ../../component/comp_list.md:112 +#: ../../component/comp_list.md:140 msgid "Calculate prediction bias, ie. average of predictions - average of labels." msgstr "计算预测偏差,即预测平均值 - 标签平均值" @@ -433,11 +457,11 @@ msgstr "输入预测表" msgid "result" msgstr "" -#: ../../component/comp_list.md:137 +#: ../../component/comp_list.md:165 msgid "ss_pvalue" msgstr "" -#: ../../component/comp_list.md:142 +#: ../../component/comp_list.md:170 msgid "" "Calculate P-Value for LR model training on vertical partitioning dataset " "by using secret sharing. For large dataset(large than 10w samples & 200 " @@ -467,15 +491,15 @@ msgstr "" msgid "Output P-Value report." msgstr "输出P-VALUE结果表" -#: ../../component/comp_list.md:160 +#: ../../component/comp_list.md:188 msgid "ml.predict" msgstr "" -#: ../../component/comp_list.md:162 +#: ../../component/comp_list.md:190 msgid "sgb_predict" msgstr "" -#: ../../component/comp_list.md:167 +#: ../../component/comp_list.md:195 msgid "Predict using SGB model." msgstr "使用SGB模型进行预测。" @@ -557,11 +581,11 @@ msgstr "输出预测值" msgid "['sf.table.individual']" msgstr "" -#: ../../component/comp_list.md:193 +#: ../../component/comp_list.md:221 msgid "ss_glm_predict" msgstr "" -#: ../../component/comp_list.md:198 +#: ../../component/comp_list.md:226 msgid "Predict using the SSGLM model." msgstr "使用 SSGLM 模型进行预测" @@ -581,11 +605,11 @@ msgstr "" msgid "['sf.model.ss_glm']" msgstr "" -#: ../../component/comp_list.md:225 +#: ../../component/comp_list.md:253 msgid "ss_sgd_predict" msgstr "" -#: ../../component/comp_list.md:230 +#: ../../component/comp_list.md:258 msgid "Predict using the SS-SGD model." msgstr "使用 SS-SGD 模型预测" @@ -601,11 +625,11 @@ msgstr "在一次迭代中使用的训练样本数量。" msgid "Default: 1024. Range: (0, $\\infty$)." msgstr "默认: 1024. 范围: (0, $\\infty$)." -#: ../../component/comp_list.md:257 +#: ../../component/comp_list.md:285 msgid "ss_xgb_predict" msgstr "" -#: ../../component/comp_list.md:262 +#: ../../component/comp_list.md:290 msgid "Predict using the SS-XGB model." msgstr "使用 SS-XGB 模型预测" @@ -613,29 +637,29 @@ msgstr "使用 SS-XGB 模型预测" msgid "['sf.model.ss_xgb']" msgstr "" -#: ../../component/comp_list.md:288 +#: ../../component/comp_list.md:316 msgid "ml.train" msgstr "" -#: ../../component/comp_list.md:290 +#: ../../component/comp_list.md:318 msgid "sgb_train" msgstr "" -#: ../../component/comp_list.md:295 +#: ../../component/comp_list.md:323 msgid "" "Provides both classification and regression tree boosting (also known as " "GBDT, GBM) for vertical split dataset setting by using secure boost." msgstr "此方法使用秘密共享,为垂直分区的数据集设置提供了分类和回归树提升(也称为GBDT、GBM)。" -#: ../../component/comp_list.md:297 +#: ../../component/comp_list.md:325 msgid "" "SGB is short for SecureBoost. Compared to its safer counterpart SS-XGB, " "SecureBoost focused on protecting label holder." msgstr "SGB是SecureBoost的缩写。与更安全的SS-XGB相比,SecureBoost专注于保护标签持有人。" -#: ../../component/comp_list.md:298 +#: ../../component/comp_list.md:326 msgid "Check https://arxiv.org/abs/1901.08755." -msgstr "详情请见:https://arxiv.org/pdf/2005.08479.pdf。" +msgstr "详情请见:https://arxiv.org/pdf/2005.08479.pdf 。" #: ../../component/comp_list.md msgid "num_boost_round" @@ -900,11 +924,11 @@ msgstr "" msgid "Output model." msgstr "输出模型" -#: ../../component/comp_list.md:342 +#: ../../component/comp_list.md:370 msgid "ss_glm_train" msgstr "" -#: ../../component/comp_list.md:347 +#: ../../component/comp_list.md:375 msgid "" "generalized linear model (GLM) is a flexible generalization of ordinary " "linear regression. The GLM generalizes linear regression by allowing the " @@ -1045,18 +1069,18 @@ msgstr "" msgid "Specify a column to use for the observation weights" msgstr "" -#: ../../component/comp_list.md:385 +#: ../../component/comp_list.md:413 msgid "ss_sgd_train" msgstr "" -#: ../../component/comp_list.md:390 +#: ../../component/comp_list.md:418 msgid "" "Train both linear and logistic regression linear models for vertical " "partitioning dataset with mini batch SGD training solver by using secret " "sharing." msgstr "使用秘密共享,通过小批量SGD训练求解器,在垂直划分数据集上训练线性和逻辑回归线性模型。" -#: ../../component/comp_list.md:392 +#: ../../component/comp_list.md:420 msgid "SS-SGD is short for secret sharing SGD training." msgstr "SS-SGD是秘密共享SGD训练的缩写" @@ -1105,27 +1129,27 @@ msgid "Default: 0.5. Range: [0.0, $\\infty$)." msgstr "默认: 0.5. 范围: [0.0, $\\infty$)." #: ../../component/comp_list.md -msgid "Default: 0.001. Range: (0.0, $\\infty$)." +msgid "Default: 0.001. Range: [0.0, $\\infty$)." msgstr "默认: 0.001. 范围: (0.0, $\\infty$)." -#: ../../component/comp_list.md:421 +#: ../../component/comp_list.md:449 msgid "ss_xgb_train" msgstr "" -#: ../../component/comp_list.md:426 +#: ../../component/comp_list.md:454 msgid "" "This method provides both classification and regression tree boosting " "(also known as GBDT, GBM) for vertical partitioning dataset setting by " "using secret sharing." msgstr "此方法使用秘密共享,为垂直分区的数据集设置提供了分类和回归树提升(也称为GBDT、GBM)。" -#: ../../component/comp_list.md:428 +#: ../../component/comp_list.md:456 msgid "SS-XGB is short for secret sharing XGB." msgstr "SS-XGB是秘密分享XGB的缩写。" -#: ../../component/comp_list.md:429 +#: ../../component/comp_list.md:457 msgid "More details: https://arxiv.org/pdf/2005.08479.pdf" -msgstr "详情请见:https://arxiv.org/pdf/2005.08479.pdf。" +msgstr "详情请见:https://arxiv.org/pdf/2005.08479.pdf 。" #: ../../component/comp_list.md msgid "Step size shrinkage used in updates to prevent overfitting." @@ -1139,15 +1163,15 @@ msgstr "" msgid "Subsample ratio of the training instances." msgstr "训练实例的子采样比率。" -#: ../../component/comp_list.md:460 +#: ../../component/comp_list.md:488 msgid "preprocessing" msgstr "" -#: ../../component/comp_list.md:462 +#: ../../component/comp_list.md:490 msgid "feature_filter" msgstr "" -#: ../../component/comp_list.md:467 +#: ../../component/comp_list.md:495 msgid "Drop features from the dataset." msgstr "从数据集中删除特征" @@ -1163,11 +1187,11 @@ msgstr "额外的表属性。(0) drop_features - 要删除的特征。" msgid "out_ds" msgstr "" -#: ../../component/comp_list.md:482 +#: ../../component/comp_list.md:510 msgid "psi" msgstr "" -#: ../../component/comp_list.md:487 +#: ../../component/comp_list.md:515 msgid "PSI between two parties." msgstr "两方之间的PSI(隐私保护交集计算)。" @@ -1185,6 +1209,14 @@ msgid "" "'BC22_PSI_2PC']." msgstr "默认: ECDH_PSI_2PC. 可选: ['ECDH_PSI_2PC', 'KKRT_PSI_2PC', 'BC22_PSI_2PC']." +#: ../../component/comp_list.md +msgid "sort" +msgstr "" + +#: ../../component/comp_list.md +msgid "Sort the output." +msgstr "将结果排序。" + #: ../../component/comp_list.md msgid "" "Specify the hash bucket size used in PSI. Larger values consume more " @@ -1241,15 +1273,15 @@ msgstr "" msgid "Output vertical table" msgstr "输出联合表" -#: ../../component/comp_list.md:512 +#: ../../component/comp_list.md:541 msgid "train_test_split" msgstr "" -#: ../../component/comp_list.md:517 +#: ../../component/comp_list.md:546 msgid "Split datasets into random train and test subsets." msgstr "将数据集随机切分成训练集和测试集。" -#: ../../component/comp_list.md:518 +#: ../../component/comp_list.md:547 msgid "" "Please check: https://scikit-" "learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html" @@ -1313,21 +1345,21 @@ msgstr "" msgid "Output test dataset." msgstr "输出测试数据集" -#: ../../component/comp_list.md:544 +#: ../../component/comp_list.md:573 msgid "stats" msgstr "" -#: ../../component/comp_list.md:546 +#: ../../component/comp_list.md:575 msgid "ss_pearsonr" msgstr "" -#: ../../component/comp_list.md:551 +#: ../../component/comp_list.md:580 msgid "" "Calculate Pearson's product-moment correlation coefficient for vertical " "partitioning dataset by using secret sharing." msgstr "使用秘密共享计算垂直分区数据集的Pearson积矩相关系数。" -#: ../../component/comp_list.md:553 ../../component/comp_list.md:575 +#: ../../component/comp_list.md:582 ../../component/comp_list.md:604 msgid "" "For large dataset(large than 10w samples & 200 features), recommend to " "use [Ring size: 128, Fxp: 40] options for SPU device." @@ -1344,11 +1376,11 @@ msgstr "额外的表属性。(0) feature_selects - 指定要计算相关系数 msgid "Output Pearson's product-moment correlation coefficient report." msgstr "输出相关系数矩阵表。" -#: ../../component/comp_list.md:568 +#: ../../component/comp_list.md:597 msgid "ss_vif" msgstr "" -#: ../../component/comp_list.md:573 +#: ../../component/comp_list.md:602 msgid "" "Calculate Variance Inflation Factor(VIF) for vertical partitioning " "dataset by using secret sharing." @@ -1364,119 +1396,119 @@ msgstr "额外的表属性。(0) feature_selects - 指定要计算VIF的特征 msgid "Output Variance Inflation Factor(VIF) report." msgstr "" -#: ../../component/comp_list.md:590 +#: ../../component/comp_list.md:619 msgid "table_statistics" msgstr "" -#: ../../component/comp_list.md:595 +#: ../../component/comp_list.md:624 msgid "Get a table of statistics, including each column's" msgstr "获取统计信息表,包括每列的" -#: ../../component/comp_list.md:597 +#: ../../component/comp_list.md:626 msgid "datatype" msgstr "" -#: ../../component/comp_list.md:598 +#: ../../component/comp_list.md:627 msgid "total_count" msgstr "" -#: ../../component/comp_list.md:599 +#: ../../component/comp_list.md:628 msgid "count" msgstr "" -#: ../../component/comp_list.md:600 +#: ../../component/comp_list.md:629 msgid "count_na" msgstr "" -#: ../../component/comp_list.md:601 +#: ../../component/comp_list.md:630 msgid "min" msgstr "" -#: ../../component/comp_list.md:602 +#: ../../component/comp_list.md:631 msgid "max" msgstr "" -#: ../../component/comp_list.md:603 +#: ../../component/comp_list.md:632 msgid "var" msgstr "" -#: ../../component/comp_list.md:604 +#: ../../component/comp_list.md:633 msgid "std" msgstr "" -#: ../../component/comp_list.md:605 +#: ../../component/comp_list.md:634 msgid "sem" msgstr "" -#: ../../component/comp_list.md:606 +#: ../../component/comp_list.md:635 msgid "skewness" msgstr "" -#: ../../component/comp_list.md:607 +#: ../../component/comp_list.md:636 msgid "kurtosis" msgstr "" -#: ../../component/comp_list.md:608 +#: ../../component/comp_list.md:637 msgid "q1" msgstr "" -#: ../../component/comp_list.md:609 +#: ../../component/comp_list.md:638 msgid "q2" msgstr "" -#: ../../component/comp_list.md:610 +#: ../../component/comp_list.md:639 msgid "q3" msgstr "" -#: ../../component/comp_list.md:611 +#: ../../component/comp_list.md:640 msgid "moment_2" msgstr "" -#: ../../component/comp_list.md:612 +#: ../../component/comp_list.md:641 msgid "moment_3" msgstr "" -#: ../../component/comp_list.md:613 +#: ../../component/comp_list.md:642 msgid "moment_4" msgstr "" -#: ../../component/comp_list.md:614 +#: ../../component/comp_list.md:643 msgid "central_moment_2" msgstr "" -#: ../../component/comp_list.md:615 +#: ../../component/comp_list.md:644 msgid "central_moment_3" msgstr "" -#: ../../component/comp_list.md:616 +#: ../../component/comp_list.md:645 msgid "central_moment_4" msgstr "" -#: ../../component/comp_list.md:617 +#: ../../component/comp_list.md:646 msgid "sum" msgstr "" -#: ../../component/comp_list.md:618 +#: ../../component/comp_list.md:647 msgid "sum_2" msgstr "" -#: ../../component/comp_list.md:619 +#: ../../component/comp_list.md:648 msgid "sum_3" msgstr "" -#: ../../component/comp_list.md:620 +#: ../../component/comp_list.md:649 msgid "sum_4" msgstr "" -#: ../../component/comp_list.md:621 +#: ../../component/comp_list.md:650 msgid "moment_2 means E[X^2]." msgstr "moment_2 表示 E[X^2]." -#: ../../component/comp_list.md:622 +#: ../../component/comp_list.md:651 msgid "central_moment_2 means E[(X - mean(X))^2]." msgstr "central_moment_2 表示 E[(X - mean(X))^2]." -#: ../../component/comp_list.md:623 +#: ../../component/comp_list.md:652 msgid "sum_2 means sum(X^2)." msgstr "sum_2 表示 sum(X^2)." diff --git a/docs/locales/zh_CN/LC_MESSAGES/component/comp_spec.po b/docs/locales/zh_CN/LC_MESSAGES/component/comp_spec.po index ba9125c94..d14cfef5e 100644 --- a/docs/locales/zh_CN/LC_MESSAGES/component/comp_spec.po +++ b/docs/locales/zh_CN/LC_MESSAGES/component/comp_spec.po @@ -7,7 +7,7 @@ msgid "" msgstr "" "Project-Id-Version: SecretFlow \n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2023-08-16 15:21+0800\n" +"POT-Creation-Date: 2023-10-14 16:59+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -17,22 +17,19 @@ msgstr "" "Generated-By: Babel 2.12.1\n" #: ../../component/comp_spec.md:1 -msgid "Component Specification" -msgstr "组件规范" +msgid "Extended Specification" +msgstr "扩展标准" #: ../../component/comp_spec.md:3 msgid "Table of Contents" msgstr "目录" -#: ../../component/comp_spec.md:6 ../../component/comp_spec.md:98 +#: ../../component/comp_spec.md:6 ../../component/comp_spec.md:38 msgid "cluster.proto" msgstr "" -#: ../../component/comp_spec.md:10 ../../component/comp_spec.md:29 -#: ../../component/comp_spec.md:47 ../../component/comp_spec.md:69 -#: ../../component/comp_spec.md:81 ../../component/comp_spec.md:105 -#: ../../component/comp_spec.md:265 ../../component/comp_spec.md:419 -#: ../../component/comp_spec.md:609 ../../component/comp_spec.md:654 +#: ../../component/comp_spec.md:10 ../../component/comp_spec.md:28 +#: ../../component/comp_spec.md:45 ../../component/comp_spec.md:196 msgid "Messages" msgstr "" @@ -65,158 +62,32 @@ msgid "[SFClusterDesc.DeviceDesc](#sfclusterdescdevicedesc)" msgstr "" #: ../../component/comp_spec.md:18 -msgid "[StorageConfig](#storageconfig)" +msgid "[SFClusterDesc.RayFedConfig](#sfclusterdescrayfedconfig)" msgstr "" -#: ../../component/comp_spec.md:19 -msgid "[StorageConfig.LocalFSConfig](#storageconfiglocalfsconfig)" -msgstr "" - -#: ../../component/comp_spec.md:25 ../../component/comp_spec.md:258 +#: ../../component/comp_spec.md:24 ../../component/comp_spec.md:189 msgid "data.proto" msgstr "" -#: ../../component/comp_spec.md:30 +#: ../../component/comp_spec.md:29 msgid "[DeviceObjectCollection](#deviceobjectcollection)" msgstr "" -#: ../../component/comp_spec.md:31 +#: ../../component/comp_spec.md:30 msgid "[DeviceObjectCollection.DeviceObject](#deviceobjectcollectiondeviceobject)" msgstr "" -#: ../../component/comp_spec.md:32 -msgid "[DistData](#distdata)" -msgstr "" - -#: ../../component/comp_spec.md:33 -msgid "[DistData.DataRef](#distdatadataref)" -msgstr "" - -#: ../../component/comp_spec.md:34 -msgid "[IndividualTable](#individualtable)" -msgstr "" - -#: ../../component/comp_spec.md:35 -msgid "[SystemInfo](#systeminfo)" -msgstr "" - -#: ../../component/comp_spec.md:36 -msgid "[TableSchema](#tableschema)" -msgstr "" - -#: ../../component/comp_spec.md:37 -msgid "[VerticalTable](#verticaltable)" -msgstr "" - -#: ../../component/comp_spec.md:43 ../../component/comp_spec.md:412 -msgid "comp.proto" -msgstr "" - -#: ../../component/comp_spec.md:48 -msgid "[Attribute](#attribute)" -msgstr "" - -#: ../../component/comp_spec.md:49 -msgid "[AttributeDef](#attributedef)" -msgstr "" - -#: ../../component/comp_spec.md:50 -msgid "[AttributeDef.AtomicAttrDesc](#attributedefatomicattrdesc)" -msgstr "" - -#: ../../component/comp_spec.md:51 -msgid "[AttributeDef.UnionAttrGroupDesc](#attributedefunionattrgroupdesc)" -msgstr "" - -#: ../../component/comp_spec.md:52 -msgid "[CompListDef](#complistdef)" -msgstr "" - -#: ../../component/comp_spec.md:53 -msgid "[ComponentDef](#componentdef)" -msgstr "" - -#: ../../component/comp_spec.md:54 -msgid "[IoDef](#iodef)" -msgstr "" - -#: ../../component/comp_spec.md:55 -msgid "[IoDef.TableAttrDef](#iodeftableattrdef)" -msgstr "" - -#: ../../component/comp_spec.md:59 ../../component/comp_spec.md:254 -#: ../../component/comp_spec.md:408 ../../component/comp_spec.md:577 -#: ../../component/comp_spec.md:643 ../../component/comp_spec.md:779 -msgid "Enums" -msgstr "" - -#: ../../component/comp_spec.md:60 -msgid "[AttrType](#attrtype)" -msgstr "" - -#: ../../component/comp_spec.md:65 ../../component/comp_spec.md:602 -msgid "evaluation.proto" -msgstr "" - -#: ../../component/comp_spec.md:70 -msgid "[NodeEvalParam](#nodeevalparam)" -msgstr "" - -#: ../../component/comp_spec.md:71 -msgid "[NodeEvalResult](#nodeevalresult)" -msgstr "" - -#: ../../component/comp_spec.md:77 ../../component/comp_spec.md:647 -msgid "report.proto" -msgstr "" - -#: ../../component/comp_spec.md:82 -msgid "[Descriptions](#descriptions)" -msgstr "" - -#: ../../component/comp_spec.md:83 -msgid "[Descriptions.Item](#descriptionsitem)" -msgstr "" - -#: ../../component/comp_spec.md:84 -msgid "[Div](#div)" -msgstr "" - -#: ../../component/comp_spec.md:85 -msgid "[Div.Child](#divchild)" -msgstr "" - -#: ../../component/comp_spec.md:86 -msgid "[Report](#report)" -msgstr "" - -#: ../../component/comp_spec.md:87 -msgid "[Tab](#tab)" -msgstr "" - -#: ../../component/comp_spec.md:88 -msgid "[Table](#table)" -msgstr "" - -#: ../../component/comp_spec.md:89 -msgid "[Table.HeaderItem](#tableheaderitem)" -msgstr "" - -#: ../../component/comp_spec.md:90 -msgid "[Table.Row](#tablerow)" -msgstr "" - -#: ../../component/comp_spec.md:100 +#: ../../component/comp_spec.md:40 msgid "" "Proto file: " -"[secretflow/protos/component/cluster.proto](https://github.com/secretflow/secretflow/tree/main/secretflow/protos/component/cluster.proto)" +"[secretflow/protos/secretflow/spec/extend/cluster.proto](https://github.com/secretflow/secretflow/tree/main/secretflow/protos/secretflow/spec/extend/cluster.proto)" msgstr "" -#: ../../component/comp_spec.md:108 +#: ../../component/comp_spec.md:48 msgid "SFClusterConfig" msgstr "" -#: ../../component/comp_spec.md:109 +#: ../../component/comp_spec.md:49 msgid "" "Runtime Config for a SecretFlow cluster. Besides intrinsic SFClusterDesc," " dynamic network configs are provided." @@ -270,11 +141,11 @@ msgstr "" msgid "Dynamic runtime private configs." msgstr "动态运行时私有配置。" -#: ../../component/comp_spec.md:122 +#: ../../component/comp_spec.md:62 msgid "SFClusterConfig.PrivateConfig" msgstr "" -#: ../../component/comp_spec.md:123 +#: ../../component/comp_spec.md:63 msgid "Different for each party. Private and unique to each party." msgstr "对于每个参与方都不同。对于每个参与方是私有且唯一的。" @@ -294,25 +165,17 @@ msgstr "" msgid "ray_head_addr" msgstr "" -#: ../../component/comp_spec.md -msgid "storage_config" -msgstr "" - -#: ../../component/comp_spec.md -msgid "[ StorageConfig](#storageconfig)" -msgstr "" - -#: ../../component/comp_spec.md:136 +#: ../../component/comp_spec.md:75 msgid "SFClusterConfig.PublicConfig" msgstr "" -#: ../../component/comp_spec.md:137 +#: ../../component/comp_spec.md:76 msgid "Public and shared to all parties." msgstr "对所有参与方公开且共享的。" #: ../../component/comp_spec.md -msgid "rayfed_config" -msgstr "" +msgid "ray_fed_config" +msgstr "ray_fed 配置。" #: ../../component/comp_spec.md msgid "[ SFClusterConfig.RayFedConfig](#sfclusterconfigrayfedconfig)" @@ -326,11 +189,11 @@ msgstr "" msgid "[repeated SFClusterConfig.SPUConfig](#sfclusterconfigspuconfig)" msgstr "" -#: ../../component/comp_spec.md:148 +#: ../../component/comp_spec.md:87 msgid "SFClusterConfig.RayFedConfig" msgstr "" -#: ../../component/comp_spec.md:149 +#: ../../component/comp_spec.md:88 msgid "Addresses for the RayFed cluster of each party." msgstr "每个参与方的 RayFed 集群地址。" @@ -350,11 +213,11 @@ msgstr "" msgid "listen_addresses" msgstr "" -#: ../../component/comp_spec.md:161 +#: ../../component/comp_spec.md:100 msgid "SFClusterConfig.SPUConfig" msgstr "" -#: ../../component/comp_spec.md:162 +#: ../../component/comp_spec.md:101 msgid "Contains addresses for one SPU device." msgstr "包含一个 SPU 设备的地址。" @@ -366,23 +229,23 @@ msgstr "" msgid "Should match SPU name in SFClusterDesc.devices." msgstr "应与 SFClusterDesc.devices 中的 SPU 名称匹配" -#: ../../component/comp_spec.md:175 +#: ../../component/comp_spec.md:114 msgid "SFClusterDesc" msgstr "" -#: ../../component/comp_spec.md:176 +#: ../../component/comp_spec.md:115 msgid "Intrinsic properties of a SecretFlow cluster, including:" msgstr "SecretFlow 集群的内在属性,包括" -#: ../../component/comp_spec.md:178 +#: ../../component/comp_spec.md:117 msgid "Version info." msgstr "版本信息。" -#: ../../component/comp_spec.md:179 +#: ../../component/comp_spec.md:118 msgid "Parties: who participate in the computation." msgstr "参与计算的各方。" -#: ../../component/comp_spec.md:180 +#: ../../component/comp_spec.md:119 msgid "Secret devices including and their configs." msgstr "包括秘密设备及其配置" @@ -418,28 +281,41 @@ msgstr "" msgid "Description of secret devices" msgstr "秘密设备的描述" -#: ../../component/comp_spec.md:193 +#: ../../component/comp_spec.md +msgid "[ SFClusterDesc.RayFedConfig](#sfclusterdescrayfedconfig)" +msgstr "" + +#: ../../component/comp_spec.md:133 msgid "SFClusterDesc.DeviceDesc" msgstr "" -#: ../../component/comp_spec.md:194 +#: ../../component/comp_spec.md:134 msgid "Description for a secret device." msgstr "一个秘密设备的描述。" -#: ../../component/comp_spec.md:195 +#: ../../component/comp_spec.md:135 msgid "PYUs do not need to claim since they are plaintext devices." msgstr "PYU 不需要声明,因为它们是明文设备。" -#: ../../component/comp_spec.md:196 +#: ../../component/comp_spec.md:136 msgid "" "Notes for config: At this moment, you have to provide a JSON string for " "different devices. We are going to formalize this part in future." msgstr "配置说明:目前,您必须为不同的设备提供一个 JSON 字符串。我们将在未来规范化此部分。" -#: ../../component/comp_spec.md:199 +#: ../../component/comp_spec.md:139 msgid "Example SPU config:" msgstr "示例 SPU 配置:" +#: ../../component/comp_spec.md:157 +msgid "" +"Referrences: SPU: https://www.secretflow.org.cn/docs/spu/latest/en-" +"US/reference/runtime_config#runtimeconfig HEU: " +"https://www.secretflow.org.cn/docs/secretflow/latest/en-" +"US/source/secretflow.device.device.device#secretflow.device.device.heu.HEU.__init__" +msgstr "" +"参考: [SPU](https://www.secretflow.org.cn/docs/spu/latest/en-US/reference/runtime_config#runtimeconfig) | [HEU](https://www.secretflow.org.cn/docs/secretflow/latest/en-US/source/secretflow.device.device.device#secretflow.device.device.heu.HEU.__init__) 。" + #: ../../component/comp_spec.md msgid "Name of the device." msgstr "设备名称。" @@ -464,59 +340,35 @@ msgstr "" msgid "Specific config for the secret device." msgstr "秘密设备的特别配置。" -#: ../../component/comp_spec.md:229 -msgid "StorageConfig" -msgstr "" - -#: ../../component/comp_spec.md:230 -msgid "A StorageConfig specifies the root for all data for one party." -msgstr "StorageConfig 指定了一个参与方的所有数据的根目录。" - -#: ../../component/comp_spec.md:231 -msgid "" -"At this moment, only local_fs is supported. We would support OSS, " -"database in future." -msgstr "目前只支持 local_fs。我们将来会支持 OSS、数据库等。" - -#: ../../component/comp_spec.md -msgid "Supported: local_fs." -msgstr "可选: local_fs." - -#: ../../component/comp_spec.md -msgid "local_fs" +#: ../../component/comp_spec.md:174 +msgid "SFClusterDesc.RayFedConfig" msgstr "" #: ../../component/comp_spec.md -msgid "[ StorageConfig.LocalFSConfig](#storageconfiglocalfsconfig)" +msgid "cross_silo_comm_backend" msgstr "" #: ../../component/comp_spec.md -msgid "local_fs config." -msgstr "local_fs 配置。" - -#: ../../component/comp_spec.md:243 -msgid "StorageConfig.LocalFSConfig" +msgid "" +"Indicates communication backend of RayFed. Accepted: 'grpc', 'brpc_link' " +"Dafault is 'grpc'" msgstr "" -#: ../../component/comp_spec.md -msgid "wd" +#: ../../component/comp_spec.md:185 ../../component/comp_spec.md:223 +msgid "Enums" msgstr "" -#: ../../component/comp_spec.md -msgid "Working directory." -msgstr "工作目录。" - -#: ../../component/comp_spec.md:260 +#: ../../component/comp_spec.md:191 msgid "" "Proto file: " -"[secretflow/protos/component/data.proto](https://github.com/secretflow/secretflow/tree/main/secretflow/protos/component/data.proto)" +"[secretflow/protos/secretflow/spec/extend/data.proto](https://github.com/secretflow/secretflow/tree/main/secretflow/protos/secretflow/spec/extend/data.proto)" msgstr "" -#: ../../component/comp_spec.md:268 +#: ../../component/comp_spec.md:199 msgid "DeviceObjectCollection" msgstr "" -#: ../../component/comp_spec.md:269 +#: ../../component/comp_spec.md:200 msgid "Descibes public storage info for a collection of Device Objects." msgstr "描述设备对象集合的公共存储信息。" @@ -538,7 +390,7 @@ msgstr "" msgid "Any public information." msgstr "任何公共信息。" -#: ../../component/comp_spec.md:280 +#: ../../component/comp_spec.md:211 msgid "DeviceObjectCollection.DeviceObject" msgstr "" @@ -557,1149 +409,3 @@ msgstr "" #: ../../component/comp_spec.md msgid "Index of data_ref in the parent DistData message." msgstr "data_ref 在父级 DistData 消息中的索引。" - -#: ../../component/comp_spec.md:292 -msgid "DistData" -msgstr "" - -#: ../../component/comp_spec.md:293 -msgid "A public record for a general distributed data." -msgstr "分布式数据的公共记录。" - -#: ../../component/comp_spec.md:295 -msgid "The type of this distributed data, should be meaningful to components." -msgstr "该分布式数据的类型,应对组件具有意义。" - -#: ../../component/comp_spec.md:297 -msgid "" -"The concrete data format (include public and private parts) is defined by" -" other protos." -msgstr "具体的数据格式(包括公共部分和私有部分)由其他协议定义。" - -#: ../../component/comp_spec.md:300 -msgid "Suggested names, i.e." -msgstr "建议的名称,例如" - -#: ../../component/comp_spec.md:301 -msgid "sf.table.vertical_table represent a secretflow vertical table" -msgstr "sf.table.vertical_table 代表 SecretFlow 垂直表" - -#: ../../component/comp_spec.md:302 -msgid "sf.model.* represent a secretflow models." -msgstr "sf.model.* 代表 SecretFlow 模型。" - -#: ../../component/comp_spec.md:303 -msgid "sf.rule.* represent a secretflow rules." -msgstr "sf.rule.* 代表 SecretFlow 规则。" - -#: ../../component/comp_spec.md -msgid "The name of this distributed data." -msgstr "该分布式数据的名称。" - -#: ../../component/comp_spec.md -msgid "sys_info" -msgstr "" - -#: ../../component/comp_spec.md -msgid "[ SystemInfo](#systeminfo)" -msgstr "" - -#: ../../component/comp_spec.md -msgid "" -"Describe the system information that used to generate this distributed " -"data." -msgstr "描述用于生成此分布式数据的系统信息。" - -#: ../../component/comp_spec.md -msgid "meta" -msgstr "" - -#: ../../component/comp_spec.md -msgid "[ google.protobuf.Any](#googleprotobufany)" -msgstr "" - -#: ../../component/comp_spec.md -msgid "Public information, known to all parties. i.e. VerticalTable" -msgstr "公共信息,所有参与方都知道。例如:VerticalTable" - -#: ../../component/comp_spec.md -msgid "data_refs" -msgstr "" - -#: ../../component/comp_spec.md -msgid "[repeated DistData.DataRef](#distdatadataref)" -msgstr "" - -#: ../../component/comp_spec.md -msgid "Remote data references." -msgstr "远程数据引用。" - -#: ../../component/comp_spec.md:317 -msgid "DistData.DataRef" -msgstr "" - -#: ../../component/comp_spec.md:318 -msgid "A reference to a data that is stored in the remote path." -msgstr "引用存储在远程路径中的数据。" - -#: ../../component/comp_spec.md -msgid "uri" -msgstr "" - -#: ../../component/comp_spec.md -msgid "The path information relative to StorageConfig of the party." -msgstr "相对于参与方的 StorageConfig 的路径信息。" - -#: ../../component/comp_spec.md -msgid "party" -msgstr "" - -#: ../../component/comp_spec.md -msgid "The owner party." -msgstr "拥有方" - -#: ../../component/comp_spec.md -msgid "format" -msgstr "" - -#: ../../component/comp_spec.md -msgid "The storage format, i.e. csv." -msgstr "存储类型,例如csv" - -#: ../../component/comp_spec.md:330 -msgid "IndividualTable" -msgstr "" - -#: ../../component/comp_spec.md:331 -msgid "IndividualTable describes a table owned by a single party." -msgstr "IndividualTable 描述属于单个参与方的表。" - -#: ../../component/comp_spec.md -msgid "schema" -msgstr "" - -#: ../../component/comp_spec.md -msgid "[ TableSchema](#tableschema)" -msgstr "" - -#: ../../component/comp_spec.md -msgid "num_lines" -msgstr "" - -#: ../../component/comp_spec.md -msgid "[ int64](#int64)" -msgstr "" - -#: ../../component/comp_spec.md -msgid "If -1, the number is unknown." -msgstr "如果是 -1,则表示数字未知。" - -#: ../../component/comp_spec.md:342 -msgid "SystemInfo" -msgstr "" - -#: ../../component/comp_spec.md:343 -msgid "Describe the application related to data." -msgstr "描述与数据相关的应用程序。" - -#: ../../component/comp_spec.md:344 -msgid "SCQL, GRM related meta information should be here." -msgstr "SCQL,GRM 相关的元信息应该在这里。" - -#: ../../component/comp_spec.md:345 -msgid "You can add more field here, when another application is added." -msgstr "当添加另一个应用程序时,您可以在此添加更多字段。" - -#: ../../component/comp_spec.md -msgid "app_name" -msgstr "" - -#: ../../component/comp_spec.md -msgid "The application name. Supported: `secretflow`" -msgstr "应用程序名称。支持:secretflow" - -#: ../../component/comp_spec.md -msgid "secretflow" -msgstr "" - -#: ../../component/comp_spec.md -msgid "For secretflow." -msgstr "专属于secretflow。" - -#: ../../component/comp_spec.md:356 -msgid "TableSchema" -msgstr "" - -#: ../../component/comp_spec.md:357 -msgid "The schema of a table." -msgstr "表的模式。" - -#: ../../component/comp_spec.md:358 -msgid "" -"A col must be one of `id | feature | label`. By default, it should be a " -"feature." -msgstr "Col 必须是 id | feature | label 其中之一。默认情况下,应该是一个 feature。" - -#: ../../component/comp_spec.md:360 -msgid "All names must match the regexp `[A-Za-z0-9.][A-Za-z0-9_>./]*`." -msgstr "所有名称必须匹配正则表达式 [A-Za-z0-9.][A-Za-z0-9_>./]*。" - -#: ../../component/comp_spec.md:361 -msgid "All data type must be one of" -msgstr "所有数据类型必须属于" - -#: ../../component/comp_spec.md:362 -msgid "int8" -msgstr "" - -#: ../../component/comp_spec.md:363 -msgid "int16" -msgstr "" - -#: ../../component/comp_spec.md:364 -msgid "int32" -msgstr "" - -#: ../../component/comp_spec.md:365 -msgid "int64" -msgstr "" - -#: ../../component/comp_spec.md:366 -msgid "uint8" -msgstr "" - -#: ../../component/comp_spec.md:367 -msgid "uint16" -msgstr "" - -#: ../../component/comp_spec.md:368 -msgid "uint32" -msgstr "" - -#: ../../component/comp_spec.md:369 -msgid "uint64" -msgstr "" - -#: ../../component/comp_spec.md:370 -msgid "float16" -msgstr "" - -#: ../../component/comp_spec.md:371 -msgid "float32" -msgstr "" - -#: ../../component/comp_spec.md:372 -msgid "float64" -msgstr "" - -#: ../../component/comp_spec.md:373 -msgid "bool" -msgstr "" - -#: ../../component/comp_spec.md:374 -msgid "int" -msgstr "" - -#: ../../component/comp_spec.md:375 -msgid "float" -msgstr "" - -#: ../../component/comp_spec.md:376 -msgid "str" -msgstr "" - -#: ../../component/comp_spec.md -msgid "ids" -msgstr "" - -#: ../../component/comp_spec.md -msgid "Id column name(s). Optional, can be empty." -msgstr "ID 列名。可选,可以为空。" - -#: ../../component/comp_spec.md -msgid "features" -msgstr "" - -#: ../../component/comp_spec.md -msgid "Feature column name(s)." -msgstr "特征列名。" - -#: ../../component/comp_spec.md -msgid "labels" -msgstr "" - -#: ../../component/comp_spec.md -msgid "Label column name(s). Optional, can be empty." -msgstr "标签列名。可选,可以为空。" - -#: ../../component/comp_spec.md -msgid "id_types" -msgstr "" - -#: ../../component/comp_spec.md -msgid "Id column data type(s). Len(id) should match len(id_types)." -msgstr "ID 列的数据类型。len(id) 应该和 len(id_types) 匹配。" - -#: ../../component/comp_spec.md -msgid "feature_types" -msgstr "" - -#: ../../component/comp_spec.md -msgid "" -"Feature column data type(s). Len(features) should match " -"len(feature_types)." -msgstr "特征列的数据类型。len(features) 应该和 len(feature_types) 匹配。" - -#: ../../component/comp_spec.md -msgid "label_types" -msgstr "" - -#: ../../component/comp_spec.md -msgid "Label column data type(s). Len(labels) should match len(label_types)." -msgstr "标签列的数据类型。len(labels) 应该和 len(label_types) 匹配。" - -#: ../../component/comp_spec.md:391 -msgid "VerticalTable" -msgstr "" - -#: ../../component/comp_spec.md:392 -msgid "VerticalTable describes a vertical virtual table from multiple parties." -msgstr "VerticalTable 描述来自多个参与方的垂直虚拟表。" - -#: ../../component/comp_spec.md:393 -msgid "TODO: move this to secretflow/protos/builtin/" -msgstr "TODO:将此移动到 secretflow/protos/builtin/。" - -#: ../../component/comp_spec.md:395 -msgid "" -"Guide: if some type is only required to be handle inside a specific " -"system, for instance woe.rule file in engine, we don't need to define a " -"new type here." -msgstr "指南:如果某个类型仅需要在特定系统内处理,例如引擎中的 woe.rule 文件,我们不需要在这里定义新类型。" - -#: ../../component/comp_spec.md -msgid "schemas" -msgstr "" - -#: ../../component/comp_spec.md -msgid "[repeated TableSchema](#tableschema)" -msgstr "" - -#: ../../component/comp_spec.md -msgid "" -"The vertical partitioned slices' schema. Must match data_refs in the " -"parent DistData message." -msgstr "垂直分区片的模式。必须与父 DistData 消息中的 data_refs 匹配。" - -#: ../../component/comp_spec.md:414 -msgid "" -"Proto file: " -"[secretflow/protos/component/comp.proto](https://github.com/secretflow/secretflow/tree/main/secretflow/protos/component/comp.proto)" -msgstr "" - -#: ../../component/comp_spec.md:422 -msgid "Attribute" -msgstr "" - -#: ../../component/comp_spec.md:423 -msgid "The value of an attribute" -msgstr "属性的值" - -#: ../../component/comp_spec.md -msgid "f" -msgstr "" - -#: ../../component/comp_spec.md -msgid "[ float](#float)" -msgstr "" - -#: ../../component/comp_spec.md -msgid "FLOAT" -msgstr "" - -#: ../../component/comp_spec.md -msgid "i64" -msgstr "" - -#: ../../component/comp_spec.md -msgid "INT NOTE(junfeng): \"is\" is preserved by Python. Replaced with \"i64\"." -msgstr "INT NOTE(junfeng):\"is\" 受 Python 保留。替换为 \"i64\"。" - -#: ../../component/comp_spec.md -msgid "s" -msgstr "" - -#: ../../component/comp_spec.md -msgid "STRING" -msgstr "" - -#: ../../component/comp_spec.md -msgid "b" -msgstr "" - -#: ../../component/comp_spec.md -msgid "[ bool](#bool)" -msgstr "" - -#: ../../component/comp_spec.md -msgid "BOOL" -msgstr "" - -#: ../../component/comp_spec.md -msgid "fs" -msgstr "" - -#: ../../component/comp_spec.md -msgid "[repeated float](#float)" -msgstr "" - -#: ../../component/comp_spec.md -msgid "FLOATS" -msgstr "" - -#: ../../component/comp_spec.md -msgid "i64s" -msgstr "" - -#: ../../component/comp_spec.md -msgid "[repeated int64](#int64)" -msgstr "" - -#: ../../component/comp_spec.md -msgid "INTS" -msgstr "" - -#: ../../component/comp_spec.md -msgid "ss" -msgstr "" - -#: ../../component/comp_spec.md -msgid "STRINGS" -msgstr "" - -#: ../../component/comp_spec.md -msgid "bs" -msgstr "" - -#: ../../component/comp_spec.md -msgid "[repeated bool](#bool)" -msgstr "" - -#: ../../component/comp_spec.md -msgid "BOOLS" -msgstr "" - -#: ../../component/comp_spec.md -msgid "is_na" -msgstr "" - -#: ../../component/comp_spec.md -msgid "Indicates the value is missing explicitly." -msgstr "指示该值明确缺失。" - -#: ../../component/comp_spec.md:441 -msgid "AttributeDef" -msgstr "" - -#: ../../component/comp_spec.md:442 -msgid "Describe an attribute." -msgstr "描述一个属性。" - -#: ../../component/comp_spec.md -msgid "prefixes" -msgstr "" - -#: ../../component/comp_spec.md -msgid "" -"Indicates the ancestors of a node, e.g. `[name_a, name_b, name_c]` means " -"the path prefixes of current Attribute is `name_a/name_b/name_c/`. Only " -"`^[a-zA-Z0-9_.-]*$` is allowed. `input` and `output` are reserved." -msgstr "" -"表示节点的祖先,例如 `[name_a, name_b, name_c]` 意味着当前属性的路径前缀为 " -"`name_a/name_b/name_c/` 。仅允许使用 `^[a-zA-Z0-9_.-]*$` ,其中 `input` 和 `output`" -" 保留。" - -#: ../../component/comp_spec.md -msgid "" -"Must be unique in the same level just like Linux file systems. Only " -"`^[a-zA-Z0-9_.-]*$` is allowed. `input` and `output` are reserved." -msgstr "" -"在同一级别中必须是唯一的,就像 Linux 文件系统一样。仅允许使用 `^[a-zA-Z0-9_.-]*$` ,其中 `input` 和 " -"`output` 保留。" - -#: ../../component/comp_spec.md -msgid "[ AttrType](#attrtype)" -msgstr "" - -#: ../../component/comp_spec.md -msgid "atomic" -msgstr "" - -#: ../../component/comp_spec.md -msgid "[ AttributeDef.AtomicAttrDesc](#attributedefatomicattrdesc)" -msgstr "" - -#: ../../component/comp_spec.md -msgid "union" -msgstr "" - -#: ../../component/comp_spec.md -msgid "[ AttributeDef.UnionAttrGroupDesc](#attributedefunionattrgroupdesc)" -msgstr "" - -#: ../../component/comp_spec.md:457 -msgid "AttributeDef.AtomicAttrDesc" -msgstr "" - -#: ../../component/comp_spec.md:458 -msgid "" -"Extras for an atomic attribute. Including: `AT_FLOAT | AT_INT | AT_STRING" -" | AT_BOOL | AT_FLOATS | AT_INTS | AT_STRINGS | AT_BOOLS`." -msgstr "" -"原子属性的额外信息,包括: `AT_FLOAT | AT_INT | AT_STRING | AT_BOOL | AT_FLOATS | " -"AT_INTS | AT_STRINGS | AT_BOOLS` 。" - -#: ../../component/comp_spec.md -msgid "list_min_length_inclusive" -msgstr "" - -#: ../../component/comp_spec.md -msgid "Only valid when type is `AT_FLOATS | AT_INTS | AT_STRINGS | AT_BOOLS`." -msgstr "仅当类型为 `AT_FLOATS | AT_INTS | AT_STRINGS | AT_BOOLS` 时有效。" - -#: ../../component/comp_spec.md -msgid "list_max_length_inclusive" -msgstr "" - -#: ../../component/comp_spec.md -msgid "is_optional" -msgstr "" - -#: ../../component/comp_spec.md -msgid "" -"If True, when Atmoic Attr is not provided or is_na, default_value would " -"be used. Else, Atmoic Attr must be provided." -msgstr "如果为True,当Atmoic Attr没有提供或者is_na为True是,default_value将会被使用。否则,Atmoic Attr必须提供。" - -#: ../../component/comp_spec.md -msgid "default_value" -msgstr "" - -#: ../../component/comp_spec.md -msgid "[ Attribute](#attribute)" -msgstr "" - -#: ../../component/comp_spec.md -msgid "" -"A reasonable default for this attribute if the user does not supply a " -"value." -msgstr "如果用户没有提供值,则为此属性提供一个合理的默认值。" - -#: ../../component/comp_spec.md -msgid "allowed_values" -msgstr "" - -#: ../../component/comp_spec.md -msgid "" -"Only valid when type is `AT_FLOAT | AT_INT | AT_STRING | AT_FLOATS | " -"AT_INTS | AT_STRINGS`. Please use list fields of AtomicParameter, i.e. " -"`ss`, `i64s`, `fs`. If the attribute is a list, allowed_values is applied" -" to each element." -msgstr "" -"仅当类型为 `AT_FLOAT | AT_INT | AT_STRING | AT_FLOATS | AT_INTS | AT_STRINGS` " -"时有效。请使用 AtomicParameter 的列表字段,即 `ss、i64s、fs` 。如果属性是列表,则 allowed_values " -"应用于每个元素。" - -#: ../../component/comp_spec.md -msgid "has_lower_bound" -msgstr "" - -#: ../../component/comp_spec.md -msgid "" -"Only valid when type is `AT_FLOAT | AT_INT | AT_FLOATS | AT_INTS `. If " -"the attribute is a list, lower_bound is applied to each element." -msgstr "" -"仅当类型为 `AT_FLOAT | AT_INT | AT_FLOATS | AT_INTS` 时有效。如果属性是列表,则 lower_bound" -" 应用于每个元素。" - -#: ../../component/comp_spec.md -msgid "lower_bound" -msgstr "" - -#: ../../component/comp_spec.md -msgid "lower_bound_inclusive" -msgstr "" - -#: ../../component/comp_spec.md -msgid "has_upper_bound" -msgstr "" - -#: ../../component/comp_spec.md -msgid "" -"Only valid when type is `AT_FLOAT | AT_INT | AT_FLOATS | AT_INTS `. If " -"the attribute is a list, upper_bound is applied to each element." -msgstr "" -"仅当类型为 `AT_FLOAT | AT_INT | AT_FLOATS | AT_INTS` 时有效。如果属性是列表,则 upper_bound" -" 应用于每个元素。" - -#: ../../component/comp_spec.md -msgid "upper_bound" -msgstr "" - -#: ../../component/comp_spec.md -msgid "upper_bound_inclusive" -msgstr "" - -#: ../../component/comp_spec.md:480 -msgid "AttributeDef.UnionAttrGroupDesc" -msgstr "" - -#: ../../component/comp_spec.md:481 -msgid "Extras for a union attribute group." -msgstr "union attribute group的额外信息。" - -#: ../../component/comp_spec.md -msgid "default_selection" -msgstr "" - -#: ../../component/comp_spec.md -msgid "The default selected child." -msgstr "默认选中的子节点。" - -#: ../../component/comp_spec.md:491 -msgid "CompListDef" -msgstr "" - -#: ../../component/comp_spec.md:492 -msgid "A list of components" -msgstr "一个组件列表。" - -#: ../../component/comp_spec.md -msgid "version" -msgstr "" - -#: ../../component/comp_spec.md -msgid "comps" -msgstr "" - -#: ../../component/comp_spec.md -msgid "[repeated ComponentDef](#componentdef)" -msgstr "" - -#: ../../component/comp_spec.md:505 -msgid "ComponentDef" -msgstr "" - -#: ../../component/comp_spec.md:506 -msgid "The definition of a comp." -msgstr "组件的定义。" - -#: ../../component/comp_spec.md -msgid "domain" -msgstr "" - -#: ../../component/comp_spec.md -msgid "Namespace of the comp." -msgstr "组件的命名空间。" - -#: ../../component/comp_spec.md -msgid "Should be unique among all comps of the same domain." -msgstr "应该在同一域中的所有组件中是唯一的。" - -#: ../../component/comp_spec.md -msgid "Version of the comp." -msgstr "组件版本" - -#: ../../component/comp_spec.md -msgid "attrs" -msgstr "" - -#: ../../component/comp_spec.md -msgid "[repeated AttributeDef](#attributedef)" -msgstr "" - -#: ../../component/comp_spec.md -msgid "inputs" -msgstr "" - -#: ../../component/comp_spec.md -msgid "[repeated IoDef](#iodef)" -msgstr "" - -#: ../../component/comp_spec.md -msgid "outputs" -msgstr "" - -#: ../../component/comp_spec.md:522 -msgid "IoDef" -msgstr "" - -#: ../../component/comp_spec.md:523 -msgid "Define an input/output for component." -msgstr "定义组件的输入/输出。" - -#: ../../component/comp_spec.md -msgid "should be unique among all IOs of the component." -msgstr "应该在组件的所有 IO 中是唯一的。" - -#: ../../component/comp_spec.md -msgid "types" -msgstr "" - -#: ../../component/comp_spec.md -msgid "Must be one of DistData.type in data.proto" -msgstr "必须是 data.proto 中的 DistData.type 之一。" - -#: ../../component/comp_spec.md -msgid "[repeated IoDef.TableAttrDef](#iodeftableattrdef)" -msgstr "" - -#: ../../component/comp_spec.md -msgid "" -"Only valid for tables. The attribute path for a TableAttrDef is " -"`{input|output}/{IoDef name}/{TableAttrDef name}`." -msgstr "" -"仅对表格有效。TableAttrDef 的属性路径为 `{input|output}/{IoDef name}/{TableAttrDef " -"name}` 。" - -#: ../../component/comp_spec.md:536 -msgid "IoDef.TableAttrDef" -msgstr "" - -#: ../../component/comp_spec.md:537 -msgid "An extra attribute for a table." -msgstr "表格的额外属性。" - -#: ../../component/comp_spec.md:539 -msgid "If provided in a IoDef, e.g." -msgstr "如果在 IoDef 中提供,例如:" - -#: ../../component/comp_spec.md:557 -msgid "" -"means after a user provide a table as IO, they should also specify cols " -"as \"feature\":" -msgstr "意味着用户提供表格作为 IO 数据后,他们还应该指定 cols 作为 \"feature\" :" - -#: ../../component/comp_spec.md:559 -msgid "col_min_cnt_inclusive is 1: At least 1 col to be selected." -msgstr "col_min_cnt_inclusive 为 1:至少选择 1 列。" - -#: ../../component/comp_spec.md:560 -msgid "" -"col_max_cnt_inclusive is 3: At most 3 cols to be selected. And " -"afterwards, user have to fill an int attribute called bucket_size for " -"each selected cols." -msgstr "col_max_cnt_inclusive 为 3:最多选择 3 列。然后,用户必须为每个所选列填写名为 bucket_size 的整数属性。" - -#: ../../component/comp_spec.md -msgid "Must be unique among all attributes for the table." -msgstr "表格中所有属性必须是唯一的。" - -#: ../../component/comp_spec.md -msgid "" -"Accepted col data types. Please check DistData.VerticalTable in " -"data.proto." -msgstr "接受的列数据类型。请参阅 data.proto 中的 DistData.VerticalTable。" - -#: ../../component/comp_spec.md -msgid "col_min_cnt_inclusive" -msgstr "" - -#: ../../component/comp_spec.md -msgid "inclusive" -msgstr "" - -#: ../../component/comp_spec.md -msgid "col_max_cnt_inclusive" -msgstr "" - -#: ../../component/comp_spec.md -msgid "extra attribute for specified col." -msgstr "指定列的额外属性。" - -#: ../../component/comp_spec.md:580 -msgid "AttrType" -msgstr "" - -#: ../../component/comp_spec.md:581 -msgid "Supported attribute types." -msgstr "支持的属性类型。" - -#: ../../component/comp_spec.md -msgid "Name" -msgstr "" - -#: ../../component/comp_spec.md -msgid "Number" -msgstr "" - -#: ../../component/comp_spec.md -msgid "AT_UNDEFINED" -msgstr "" - -#: ../../component/comp_spec.md -msgid "0" -msgstr "" - -#: ../../component/comp_spec.md -msgid "AT_FLOAT" -msgstr "" - -#: ../../component/comp_spec.md -msgid "1" -msgstr "" - -#: ../../component/comp_spec.md -msgid "AT_INT" -msgstr "" - -#: ../../component/comp_spec.md -msgid "2" -msgstr "" - -#: ../../component/comp_spec.md -msgid "INT" -msgstr "" - -#: ../../component/comp_spec.md -msgid "AT_STRING" -msgstr "" - -#: ../../component/comp_spec.md -msgid "3" -msgstr "" - -#: ../../component/comp_spec.md -msgid "AT_BOOL" -msgstr "" - -#: ../../component/comp_spec.md -msgid "4" -msgstr "" - -#: ../../component/comp_spec.md -msgid "AT_FLOATS" -msgstr "" - -#: ../../component/comp_spec.md -msgid "5" -msgstr "" - -#: ../../component/comp_spec.md -msgid "AT_INTS" -msgstr "" - -#: ../../component/comp_spec.md -msgid "6" -msgstr "" - -#: ../../component/comp_spec.md -msgid "AT_STRINGS" -msgstr "" - -#: ../../component/comp_spec.md -msgid "7" -msgstr "" - -#: ../../component/comp_spec.md -msgid "AT_BOOLS" -msgstr "" - -#: ../../component/comp_spec.md -msgid "8" -msgstr "" - -#: ../../component/comp_spec.md -msgid "AT_STRUCT_GROUP" -msgstr "" - -#: ../../component/comp_spec.md -msgid "9" -msgstr "" - -#: ../../component/comp_spec.md -msgid "AT_UNION_GROUP" -msgstr "" - -#: ../../component/comp_spec.md -msgid "10" -msgstr "" - -#: ../../component/comp_spec.md -msgid "AT_SF_TABLE_COL" -msgstr "" - -#: ../../component/comp_spec.md -msgid "11" -msgstr "" - -#: ../../component/comp_spec.md:604 -msgid "" -"Proto file: " -"[secretflow/protos/component/evaluation.proto](https://github.com/secretflow/secretflow/tree/main/secretflow/protos/component/evaluation.proto)" -msgstr "" - -#: ../../component/comp_spec.md:612 -msgid "NodeEvalParam" -msgstr "" - -#: ../../component/comp_spec.md:613 -msgid "Evaluate a node." -msgstr "执行节点" - -#: ../../component/comp_spec.md:614 -msgid "comp.evaluate(NodeEvalParam, SFClusterConfig) -> NodeEvalResult" -msgstr "" - -#: ../../component/comp_spec.md:616 -msgid "NodeEvalParam contains all the information to evaluate a component." -msgstr "NodeEvalParam 包含评估组件所需的所有信息。" - -#: ../../component/comp_spec.md -msgid "Domain of the component." -msgstr "组件的域。" - -#: ../../component/comp_spec.md -msgid "Name of the component." -msgstr "组件的名称。" - -#: ../../component/comp_spec.md -msgid "Version of the component." -msgstr "组件的版本。" - -#: ../../component/comp_spec.md -msgid "attr_paths" -msgstr "" - -#: ../../component/comp_spec.md -msgid "" -"The path of attributes. The attribute path for a TableAttrDef is " -"`{input|output}/{IoDef name}/{TableAttrDef name}`." -msgstr "" -"属性的路径。TableAttrDef 的属性路径为 `{input|output}/{IoDef name}/{TableAttrDef " -"name}` 。" - -#: ../../component/comp_spec.md -msgid "[repeated Attribute](#attribute)" -msgstr "" - -#: ../../component/comp_spec.md -msgid "The value of the attribute. Must match attr_paths." -msgstr "属性的值。必须与 attr_paths 匹配。" - -#: ../../component/comp_spec.md -msgid "[repeated DistData](#distdata)" -msgstr "" - -#: ../../component/comp_spec.md -msgid "" -"The input data, the order of inputs must match inputs in ComponentDef. " -"NOTE: Names of DistData doesn't need to match those of inputs in " -"ComponentDef definition." -msgstr "" -"输入数据,输入顺序必须与 ComponentDef 中定义的输入相匹配。注意:DistData 的名称不需要与 ComponentDef " -"定义中的输入名称匹配。" - -#: ../../component/comp_spec.md -msgid "output_uris" -msgstr "" - -#: ../../component/comp_spec.md -msgid "" -"The output data uris, the order of output_uris must match outputs in " -"ComponentDef." -msgstr "输出数据 URI,输出 URI 的顺序必须与 ComponentDef 中定义的输出顺序相匹配。" - -#: ../../component/comp_spec.md:632 -msgid "NodeEvalResult" -msgstr "" - -#: ../../component/comp_spec.md:633 -msgid "NodeEvalResult contains outputs of a component evaluation." -msgstr "NodeEvalResult 包含组件执行的输出。" - -#: ../../component/comp_spec.md -msgid "Output data." -msgstr "输出数据。" - -#: ../../component/comp_spec.md:649 -msgid "" -"Proto file: " -"[secretflow/protos/component/report.proto](https://github.com/secretflow/secretflow/tree/main/secretflow/protos/component/report.proto)" -msgstr "" - -#: ../../component/comp_spec.md:657 -msgid "Descriptions" -msgstr "" - -#: ../../component/comp_spec.md:658 -msgid "Displays multiple read-only fields in groups." -msgstr "以组的形式显示多个只读字段。" - -#: ../../component/comp_spec.md -msgid "Name of the Descriptions." -msgstr "Descriptions名称" - -#: ../../component/comp_spec.md -msgid "items" -msgstr "" - -#: ../../component/comp_spec.md -msgid "[repeated Descriptions.Item](#descriptionsitem)" -msgstr "" - -#: ../../component/comp_spec.md:670 -msgid "Descriptions.Item" -msgstr "" - -#: ../../component/comp_spec.md -msgid "Name of the field." -msgstr "字段的名称。" - -#: ../../component/comp_spec.md -msgid "Must be one of bool/int/float/str" -msgstr "必须属于 bool/int/float/str 之一" - -#: ../../component/comp_spec.md -msgid "value" -msgstr "" - -#: ../../component/comp_spec.md:684 -msgid "Div" -msgstr "" - -#: ../../component/comp_spec.md:685 -msgid "A division or a section of a page." -msgstr "页面的一个部分或一个章节。" - -#: ../../component/comp_spec.md -msgid "Name of the Div." -msgstr "Div名称" - -#: ../../component/comp_spec.md -msgid "children" -msgstr "" - -#: ../../component/comp_spec.md -msgid "[repeated Div.Child](#divchild)" -msgstr "" - -#: ../../component/comp_spec.md:697 -msgid "Div.Child" -msgstr "" - -#: ../../component/comp_spec.md -msgid "Supported: descriptions, table, div." -msgstr "支持:descriptions, table, div 。" - -#: ../../component/comp_spec.md -msgid "descriptions" -msgstr "" - -#: ../../component/comp_spec.md -msgid "[ Descriptions](#descriptions)" -msgstr "" - -#: ../../component/comp_spec.md -msgid "table" -msgstr "" - -#: ../../component/comp_spec.md -msgid "[ Table](#table)" -msgstr "" - -#: ../../component/comp_spec.md -msgid "div" -msgstr "" - -#: ../../component/comp_spec.md -msgid "[ Div](#div)" -msgstr "" - -#: ../../component/comp_spec.md:711 -msgid "Report" -msgstr "" - -#: ../../component/comp_spec.md -msgid "Name of the Report." -msgstr "Report 名称" - -#: ../../component/comp_spec.md -msgid "tabs" -msgstr "" - -#: ../../component/comp_spec.md -msgid "[repeated Tab](#tab)" -msgstr "" - -#: ../../component/comp_spec.md -msgid "err_code" -msgstr "" - -#: ../../component/comp_spec.md -msgid "[ int32](#int32)" -msgstr "" - -#: ../../component/comp_spec.md -msgid "err_detail" -msgstr "" - -#: ../../component/comp_spec.md -msgid "Structed error detail (JSON encoded message)." -msgstr "结构化的错误详情(JSON 编码的消息)。" - -#: ../../component/comp_spec.md:726 -msgid "Tab" -msgstr "" - -#: ../../component/comp_spec.md:727 -msgid "A page of a report." -msgstr "report的一页" - -#: ../../component/comp_spec.md -msgid "Name of the Tab." -msgstr "Tab名称" - -#: ../../component/comp_spec.md -msgid "divs" -msgstr "" - -#: ../../component/comp_spec.md -msgid "[repeated Div](#div)" -msgstr "" - -#: ../../component/comp_spec.md:739 -msgid "Table" -msgstr "" - -#: ../../component/comp_spec.md:740 -msgid "Displays rows of data." -msgstr "以行的形式展示数据。" - -#: ../../component/comp_spec.md -msgid "Name of the Table." -msgstr "Table名称" - -#: ../../component/comp_spec.md -msgid "headers" -msgstr "" - -#: ../../component/comp_spec.md -msgid "[repeated Table.HeaderItem](#tableheaderitem)" -msgstr "" - -#: ../../component/comp_spec.md -msgid "rows" -msgstr "" - -#: ../../component/comp_spec.md -msgid "[repeated Table.Row](#tablerow)" -msgstr "" - -#: ../../component/comp_spec.md:753 -msgid "Table.HeaderItem" -msgstr "" - -#: ../../component/comp_spec.md:766 -msgid "Table.Row" -msgstr "" diff --git a/docs/locales/zh_CN/LC_MESSAGES/component/comp_spec_design.po b/docs/locales/zh_CN/LC_MESSAGES/component/comp_spec_design.po index 2baa06ddf..a38f9ed27 100644 --- a/docs/locales/zh_CN/LC_MESSAGES/component/comp_spec_design.po +++ b/docs/locales/zh_CN/LC_MESSAGES/component/comp_spec_design.po @@ -7,7 +7,7 @@ msgid "" msgstr "" "Project-Id-Version: SecretFlow \n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2023-07-02 23:05+0800\n" +"POT-Creation-Date: 2023-10-14 16:59+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -17,535 +17,126 @@ msgstr "" "Generated-By: Babel 2.12.1\n" #: ../../component/comp_spec_design.rst:2 -msgid "Design of Component Specification" -msgstr "组件规范的设计" +msgid "Notes on Extended Specification" +msgstr "扩展标准注释" #: ../../component/comp_spec_design.rst:4 msgid "" -"Component Specification could be splitted into two parts: Core protocols " -"including Data, Component and Node Evaluation Protocol and Ancillary " -"protocols including SecretFlow Cluster Configurations and SecretFlow " -"Data. Core protocols apply to all applications integrated to SecretFlow " -"Ecosystem while Ancillary protocols are only for SecretFlow." -msgstr "" -"组件规范可以分为两部分:核心协议包括数据协议、组件协议和节点评估协议,附属协议包括SecretFlow集群配置和SecretFlow数据。核心协议适用于集成到SecretFlow生态系统的所有应用程序,而附属协议仅适用于SecretFlow。" - -#: ../../component/comp_spec_design.rst:8 -msgid "" -"Component Specification takes some assumptions from our point of view. " -"Please read this document before checking :doc:`/component/comp_spec`." -msgstr "" -"组件规范从我们的角度做出了一些假设。请在查看 :doc:`/component/comp_spec` 之前阅读本文档。" - -#: ../../component/comp_spec_design.rst:11 -msgid "" -"At this moment, we don't have an official protocol for Pipelines and " -"Scheduling." -msgstr "目前,我们还没有针对管道和调度的官方协议。" - -#: ../../component/comp_spec_design.rst:15 -msgid "Data" -msgstr "" - -#: ../../component/comp_spec_design.rst:17 -msgid "Defined in data.proto." -msgstr "定义在data.proto中。" - -#: ../../component/comp_spec_design.rst:20 -msgid "SystemInfo" -msgstr "" - -#: ../../component/comp_spec_design.rst:21 -msgid "" -"For input data, **SystemInfo** describes the application and environment " -"which could consume the data. For output data, it bears such information." -msgstr "对于输入数据,**SystemInfo** 描述了可能使用数据的应用程序和环境。对于输出数据,它包含了这些信息。" - -#: ../../component/comp_spec_design.rst:26 -msgid "DataRef" -msgstr "" - -#: ../../component/comp_spec_design.rst:28 -msgid "" -"A **DataRef** is a pointer to a single file belongs to one party. **uri**" -" is the relative path to storage root of its owner. **DataRef** is public" -" and open to all parties. Don't try to store any secret with uris. You " -"need to protect the files pointed by uris instead." -msgstr "" -"**DataRef** 是指向属于一个参与方的单个文件的指针。**uri** 是其所有者存储根的相对路径。**DataRef** 是公开的,并对所有参与方开放。不要试图使用uris存储任何机密信息。而是需要保护由uris指向的文件。" +"Extended Specification are for SecretFlow only and not application to " +"other privacy-preserving applications." +msgstr "扩展协议仅适用于隐语,不适用于其他隐私保护应用程序。" -#: ../../component/comp_spec_design.rst:33 -msgid "DistData" -msgstr "" - -#: ../../component/comp_spec_design.rst:35 -msgid "" -"We call all the data consumed and generated by components as " -"**DistData**, aka Distributed Data, which means it is consists multiple " -"parts owned by different parties." -msgstr "我们称组件消费和生成的所有数据为DistData,即分布式数据,这意味着它由不同参与方拥有的多个部分组成。" - -#: ../../component/comp_spec_design.rst:38 -msgid "" -"Don't confused with data partitioning in Database systems. In such " -"systems, dividing a large dataset into several small partitions placed on" -" different machines is quite common. However, there are no owner " -"enforcements on partitions, which means which machine is selected to " -"place a partition is random." -msgstr "" -"不要将此概念与数据库系统中的数据分区混淆。在这样的系统中,将大型数据集分成放置在不同计算机上的几个小分区是非常常见的。但是,对于分区没有所有者的限制,这意味着选择哪个计算机来放置分区是随机的。" - -#: ../../component/comp_spec_design.rst:41 -msgid "" -"In a privacy-preserving system, some models and tables are splitted and " -"partitions are owned by different parties. One partition of **DistData** " -"is called **DataRef**." -msgstr "在隐私保护系统中,有些模型和表被分裂,分区由不同的参与方拥有。**DistData** 的一个分区称为 **DataRef** 。" - -#: ../../component/comp_spec_design.rst:43 +#: ../../component/comp_spec_design.rst:6 msgid "" -"For simplicity, all data are DistDatas even it only consists one " -"partition (even none at all)." -msgstr "为了简化,即使只有一个分区(甚至没有),我们也将所有数据都视为 DistDatas。" - -#: ../../component/comp_spec_design.rst:45 -msgid "" -"Besides **DataRef**, **DistData** also contains **meta** to store any " -"extra public information for different types of **DistData**. We also " -"proposed some common types of **DistData** including Vertical tables and " -"Individual tables." -msgstr "" -"除了 DataRef之外,DistData 还包含 meta,用于存储不同类型DistData的任何额外公共信息。我们还提出了一些常见的 DistData 类型,包括垂直表和个体表。" - -#: ../../component/comp_spec_design.rst:48 -msgid "" -"**DistData** is public and open to all parties, again don't store any " -"secret with **DistData**." -msgstr "" -"**DistData** 是公开的,对所有参与方开放,同样不要将任何机密信息存储到 **DistData** 中。" - -#: ../../component/comp_spec_design.rst:52 -msgid "Component" -msgstr "" - -#: ../../component/comp_spec_design.rst:54 -msgid "Defined in comp.proto." -msgstr "定义在comp.proto中。" +"If you are looking for definition like DistData, NodeEvalParam, etc. They" +" are part of `SecretFlow Open Specification " +"`_." +msgstr "如果你在查阅 DistData, NodeEvalParam message的定义。 请查阅 `隐语开放标准 `_ 。" -#: ../../component/comp_spec_design.rst:57 -msgid "AttributeDef" -msgstr "" - -#: ../../component/comp_spec_design.rst:59 -msgid "" -"We organize all attributes of a component as attribute trees. The leaves " -"of the tree are called **Atomic Attributes**, which represent solid " -"fields for users to fill-in e.g. bucket size or learning rate. The non-" -"leaf nodes of the tree are called **Attribute Group**. There are two kind" -" of Attribute Groups:" -msgstr "" -"我们将组件的所有属性组织为属性树。树的叶子被称为 Atomic Attributes,它们代表用户填写的实体字段,例如桶大小或学习率。树的非叶节点称为 Attribute Group。有两种属性组:" - -#: ../../component/comp_spec_design.rst:63 -msgid "Struct Group: all children of the group need to fill-in together." -msgstr "" -"Struct Group:该组的所有子项需要一起填写。" - -#: ../../component/comp_spec_design.rst:64 -msgid "Union Group: select one child of the group to fill-in." -msgstr "" -"Union Group:从组中选择一个子项填写。" - -#: ../../component/comp_spec_design.rst:66 -msgid "The child of an Attribute Group could be another Attribute Group." -msgstr "" -"Attribute Group的子项可以是另一个Attribute Group。" - -#: ../../component/comp_spec_design.rst:68 -msgid "A **AttributeDef** represents a node of a component attribute tree." -msgstr "**AttributeDef** 表示组件属性树的节点。" - -#: ../../component/comp_spec_design.rst:72 -msgid "" -"**Attribute Groups** are advanced usage in Component Attribute " -"declaration. Only a small part of audiences may utilize this feature one " -"day. You may check **Attribute Groups** when you need to organize groups " -"of attributes." -msgstr "" -"**Attribute Groups** 是组件属性声明中的高级用法。只有少部分受众可能会在某一天利用此功能。只有您需要组织属性组时,才需要了解 **Attribute Groups**。" - -#: ../../component/comp_spec_design.rst:75 -msgid "" -"For **Atomic Attributes**, you should check **AtomicAttrDesc** to specify" -" the types of attributes, default_value, allowed_values, ranges, etc." -msgstr "对于 **Atomic Attributes** ,您应该查看 **AtomicAttrDesc** 来指定属性类型、默认值、允许的值范围等。" - -#: ../../component/comp_spec_design.rst:77 -msgid "" -"For **Union Group**, you should use **UnionAttrGroupDesc** to specify the" -" default selection of children." -msgstr "对于 Union Group,应使用 UnionAttrGroupDesc 来指定子项的默认选择。" - -#: ../../component/comp_spec_design.rst:79 -msgid "" -"If you are going to build a attribute tree, you may use **prefixes** to " -"indicate the parent of the attribute tree node." -msgstr "如果要构建属性树,您可以使用 **prefixes** 来指示属性树节点的父级。" - -#: ../../component/comp_spec_design.rst:83 -msgid "IoDef" -msgstr "" - -#: ../../component/comp_spec_design.rst:85 -msgid "" -"IoDef is to specify the requirement of an input or output of the " -"component. You should use **types** to declare accepted types of " -"**DistData**." -msgstr "IoDef 用于指定组件的输入或输出要求。您应该使用 types 来声明所接受的 **DistData** 的类型。" - -#: ../../component/comp_spec_design.rst:88 -msgid "" -"For table **DistDatas**, we found many SecretFlow applications would " -"require users to select columns and fill-in extra attributes for each " -"selected columns, this is satisfied by **TableAttrDef**." -msgstr "对于表格 **DistDatas** ,我们发现许多 SecretFlow 应用程序需要用户选择列并为每个所选列填写额外的属性,这可以通过 **TableAttrDef** 满足" - -#: ../../component/comp_spec_design.rst:92 -msgid "" -"Again, you could leave **TableAttrDef** alone at this moment since it is " -"unusual to use." -msgstr "同样,您可以在此时忽略 **TableAttrDef**,因为它不常用。" - -#: ../../component/comp_spec_design.rst:97 -msgid "ComponentDef" -msgstr "" - -#: ../../component/comp_spec_design.rst:99 -msgid "You could use ComponentDef to define a component:" -msgstr "您可以使用 ComponentDef 定义组件:" - -#: ../../component/comp_spec_design.rst:101 -msgid "domain: namespace of component." -msgstr "domain:组件的命名空间。" - -#: ../../component/comp_spec_design.rst:102 -msgid "name: should be unique among the domain." -msgstr "name:应在命名空间内唯一。" - -#: ../../component/comp_spec_design.rst:103 -msgid "version: component version is separate from component list version." -msgstr "version:组件版本与组件列表版本不同。" - -#: ../../component/comp_spec_design.rst:104 -msgid "attrs, inputs and outputs." -msgstr "attrs,inputs 和 outputs。" - -#: ../../component/comp_spec_design.rst:106 -msgid "Only a tuple of domain, name and version could locate a component." -msgstr "只有domain、name和version的元组才能定位组件。" - -#: ../../component/comp_spec_design.rst:109 -msgid "CompListDef" -msgstr "" - -#: ../../component/comp_spec_design.rst:111 -msgid "" -"A group of a components could be organized by a **CompListDef**. " -"SecretFlow Component List is an instance of a CompListDef." -msgstr "一组组件可以由 **CompListDef** 组织。SecretFlow 组件列表是 **CompListDef** 的一个实例。" - -#: ../../component/comp_spec_design.rst:117 +#: ../../component/comp_spec_design.rst:9 msgid "SecretFlow Cluster" msgstr "SecretFlow 集群" -#: ../../component/comp_spec_design.rst:119 +#: ../../component/comp_spec_design.rst:11 msgid "Defined in cluster.proto." msgstr "定义在 cluster.proto 中。" -#: ../../component/comp_spec_design.rst:122 -msgid "" -"SecretFlow Cluster protos are not application to other privacy-preserving" -" applications." -msgstr "SecretFlow 集群协议不适用于其他隐私保护应用程序。" - -#: ../../component/comp_spec_design.rst:126 +#: ../../component/comp_spec_design.rst:14 msgid "SFClusterDesc" msgstr "" -#: ../../component/comp_spec_design.rst:128 +#: ../../component/comp_spec_design.rst:16 msgid "" "SFClusterDesc stores intrinsic properties of a SecretFlow cluster, " "including:" -msgstr "" -"SFClusterDesc 存储 SecretFlow 集群的固有属性,包括:" +msgstr "SFClusterDesc 存储 SecretFlow 集群的固有属性,包括:" -#: ../../component/comp_spec_design.rst:130 -msgid "SecretFlow version" -msgstr "" -"SecretFlow 版本" +#: ../../component/comp_spec_design.rst:18 +msgid "sf_version: SecretFlow version" +msgstr "sf_version: SecretFlow 版本" -#: ../../component/comp_spec_design.rst:131 -msgid "Python version" -msgstr "" -"Python 版本" +#: ../../component/comp_spec_design.rst:19 +msgid "py_version: Python version" +msgstr "py_version: Python 版本" -#: ../../component/comp_spec_design.rst:132 -msgid "parties participated in computation" -msgstr "" -"参与计算的参与方" +#: ../../component/comp_spec_design.rst:20 +msgid "parties: Parties participated in computation" +msgstr "parties: 计算参与方" -#: ../../component/comp_spec_design.rst:133 -msgid "Security configs of secret devices like protocol of SPU devices." -msgstr "" -"用于保护设备安全的安全配置,如 SPU 设备的协议。" +#: ../../component/comp_spec_design.rst:21 +msgid "devices: Security configs of secret devices like protocol of SPU devices." +msgstr "devices: 用于保护设备安全的安全配置,如 SPU 设备的协议。" -#: ../../component/comp_spec_design.rst:135 +#: ../../component/comp_spec_design.rst:22 +msgid "ray_fed_config: Backend selection of RayFed." +msgstr "ray_fed_config: RayFed使用的的后段。" + +#: ../../component/comp_spec_design.rst:24 msgid "" "The reason we regard SFClusterDesc as intrinsic properties is because " "SFClusterDesc is important to data security and integrity. For example, a" " DistData generated by one cluster could only consumed by another cluster" " only if their SFClusterDesc are compatible." msgstr "" -"我们之所以将 SFClusterDesc 视为固有属性,是因为 SFClusterDesc 对于数据安全和完整性非常重要。例如,只有当两个集群的 SFClusterDesc 兼容时,一个集群生成的 DistData 才能被另一个集群使用。" - -#: ../../component/comp_spec_design.rst:139 -msgid "SFClusterDesc is part of **SystemInfo**." -msgstr "" -"SFClusterDesc 是 **SystemInfo** 的一部分。" +"我们之所以将 SFClusterDesc 视为固有属性,是因为 SFClusterDesc 对于数据安全和完整性非常重要。例如,只有当两个集群的 " +"SFClusterDesc 兼容时,一个集群生成的 DistData 才能被另一个集群使用。" -#: ../../component/comp_spec_design.rst:143 -msgid "StorageConfig" -msgstr "" - -#: ../../component/comp_spec_design.rst:145 -msgid "" -"**StorageConfig** specifies the storage root of a party. It could be a " -"local file path, a database table or an OSS bucket." -msgstr "" -"**StorageConfig** 指定参与方的存储根目录。它可以是本地文件路径、数据库表或 OSS 存储桶。" - -#: ../../component/comp_spec_design.rst:150 +#: ../../component/comp_spec_design.rst:30 msgid "SFClusterConfig" msgstr "" -#: ../../component/comp_spec_design.rst:152 +#: ../../component/comp_spec_design.rst:32 msgid "" "A SFClusterConfig contains all information to setup a SecretFlow cluster," " besides **SFClusterDesc**, it contains public configs and private " "configs." msgstr "" -"**SFClusterConfig** 包含所有设置 SecretFlow 集群所需的信息,除了 **SFClusterDesc** ,它包含公共配置和私有配置。" +"**SFClusterConfig** 包含所有设置 SecretFlow 集群所需的信息,除了 **SFClusterDesc** " +",它包含公共配置和私有配置。" -#: ../../component/comp_spec_design.rst:155 +#: ../../component/comp_spec_design.rst:35 msgid "**PublicConfig** should be revealed to all parties, including:" -msgstr "" -"**PublicConfig** 应该向所有参与方公开,包括:" +msgstr "**PublicConfig** 应该向所有参与方公开,包括:" -#: ../../component/comp_spec_design.rst:157 +#: ../../component/comp_spec_design.rst:37 msgid "" -"Addresses for RayFed. Parties need this information to communicate to " -"each other." -msgstr "" -"RayFed 的地址。参与方需要此信息以相互通信。" +"ray_fed_config: Addresses for RayFed. Parties need this information to " +"communicate to each other." +msgstr "ray_fed_config: RayFed 的地址。参与方需要此信息以相互通信。" -#: ../../component/comp_spec_design.rst:158 +#: ../../component/comp_spec_design.rst:38 msgid "" -"Addresses for SPU devices. SPU Runtimes of all parties need to this " -"information to connect." -msgstr "" -"SPU 设备的地址。所有参与方的 SPU 运行时都需要此信息以连接设备。" +"spu_configs: Addresses for SPU devices. SPU Runtimes of all parties need " +"this information to connect." +msgstr "spu_configs: SPU 设备的地址。所有参与方的 SPU 运行时都需要此信息以连接设备。" -#: ../../component/comp_spec_design.rst:160 +#: ../../component/comp_spec_design.rst:40 msgid "**PrivateConfig** is unique to each party, including:" msgstr "**PrivateConfig** 对于每个参与方都是唯一的,包括:" -#: ../../component/comp_spec_design.rst:162 +#: ../../component/comp_spec_design.rst:42 msgid "self_party: who am I?" msgstr "self_party:我是谁?" -#: ../../component/comp_spec_design.rst:163 +#: ../../component/comp_spec_design.rst:43 msgid "ray_head_addr: The address of Ray cluster." msgstr "ray_head_addr:Ray 集群的地址。" -#: ../../component/comp_spec_design.rst:164 -msgid "storage_config: storage root of the party." -msgstr "storage_config:参与方的存储根目录。" - -#: ../../component/comp_spec_design.rst:167 -msgid "" -"You may be surprised at storage_config is private since it seems fine to " -"be public. Well, SFClusterConfig is the minimum set of configs to start a" -" SecretFlow cluster, so we just make storage_config private since it is " -"unnecessary to broadcast everyone's storage root." -msgstr "" -"您可能对 storage_config 是私有的感到惊讶,因为它似乎可以公开。嗯,SFClusterConfig 是启动 SecretFlow 集群所需的最小配置集,因此我们只需将 storage_config 设为私有,因为没有必要广播每个人的存储根目录。" - -#: ../../component/comp_spec_design.rst:172 +#: ../../component/comp_spec_design.rst:47 msgid "SecretFlow Data Types" msgstr "" -#: ../../component/comp_spec_design.rst:174 -msgid "" -"Based on **DistData**, we also proposed some common types for SecretFlow " -"applications." -msgstr "" -"基于 **DistData**,我们还为 SecretFlow 应用程序提出了一些常见类型。" - -#: ../../component/comp_spec_design.rst:176 -msgid "" -"**IndividualTable**, **VerticalTable** and **DeviceObjectCollection** are" -" defined in data.proto. **Report** is defined in report.proto." -msgstr "" -"**IndividualTable**、**VerticalTable** 和 **DeviceObjectCollection** 定义在 data.proto 中。**Report** 定义在 report.proto 中。" - -#: ../../component/comp_spec_design.rst:181 -msgid "IndividualTable" -msgstr "" - -#: ../../component/comp_spec_design.rst:183 -msgid "" -"**IndividualTable** is a table owned by one party, which means there is a" -" single item in data_refs field of DistData. **IndividualTable** should " -"be packed into **meta** field of DistData which includes **schema** and " -"**num_lines**." -msgstr "" -"IndividualTable 是一个由一个参与方拥有的表,这意味着 DistData 的 data_refs 字段中只有一个条目。IndividualTable 应该打包到 DistData 的 meta 字段中,其中包括 schema 和 num_lines。" - -#: ../../component/comp_spec_design.rst:188 -msgid "VerticalTable" -msgstr "" - -#: ../../component/comp_spec_design.rst:190 -msgid "" -"**VerticalTable** is a vertical partitioned table owned by multiple " -"parties. **VerticalTable** contains multiple **schema**. Correspondingly," -" there should be multiple data_refs in DistData." -msgstr "" -"**VerticalTable** 是由多个参与方拥有的垂直分割表。**VerticalTable** 包含多个 schema。相应地,DistData 中应该有多个 data_refs。" +#: ../../component/comp_spec_design.rst:49 +msgid "Defined in data.proto." +msgstr "定义在data.proto中。" -#: ../../component/comp_spec_design.rst:195 +#: ../../component/comp_spec_design.rst:52 msgid "DeviceObjectCollection" msgstr "" -#: ../../component/comp_spec_design.rst:197 -msgid "" -"We use **DeviceObjectCollection** to an MPC models. We would provide more" -" details for this part later." -msgstr "" -"我们使用 **DeviceObjectCollection** 来处理 MPC 模型。稍后我们将为这部分提供更多细节。" - -#: ../../component/comp_spec_design.rst:201 -msgid "Report" -msgstr "" - -#: ../../component/comp_spec_design.rst:203 -msgid "" -"Report is a special DistData which is totally public and doesn't own any " -"data_ref. We use a **Report** to reveal statistic outputs in most cases." -msgstr "" -"**Report** 是一个完全公开的特殊 **DistData** ,不拥有任何 data_ref。我们在大多数情况下使用 Report 来显示统计输出。" - -#: ../../component/comp_spec_design.rst:206 -msgid "Report related protos are:" -msgstr "" -"Report 相关的protos包括:" - -#: ../../component/comp_spec_design.rst:208 -msgid "Descriptions: Displays multiple read-only fields in groups." -msgstr "" -"Descriptions:显示多个只读字段分组。" - -#: ../../component/comp_spec_design.rst:209 -msgid "Table: Displays rows of data." -msgstr "" -"Table:显示行数据。" - -#: ../../component/comp_spec_design.rst:210 -msgid "" -"Div: A division or a section of a page, consists of Descriptions, Tables " -"or Divs." -msgstr "" - -#: ../../component/comp_spec_design.rst:211 -msgid "Tab: A page of a report, consists of Divs." -msgstr "" -"Div:页面的一个部分或一个章节,由 Descriptions、Tables 或 Divs 组成。" - -#: ../../component/comp_spec_design.rst:212 -msgid "Report: The top-level of a report, consists of Tabs." -msgstr "" -"Report:报告的顶层,由标签页组成。" - -#: ../../component/comp_spec_design.rst:216 -msgid "Node Evalution" -msgstr "节点执行" - -#: ../../component/comp_spec_design.rst:218 -msgid "" -"A runtime instance of a component is called a Node. We use " -"**NodeEvalParam** to fill-in all required attributes and inputs and get " -"outputs in **NodeEvalResult** from node." -msgstr "" -"组件的运行时实例称为节点。我们使用 **NodeEvalParam** 来填写所有必需的属性和输入,并从节点中获取 **NodeEvalResult** 输出。" - -#: ../../component/comp_spec_design.rst:222 -msgid "NodeEvalParam" -msgstr "" - -#: ../../component/comp_spec_design.rst:224 -msgid "It contains:" -msgstr "" -"它包含:" - -#: ../../component/comp_spec_design.rst:226 -msgid "domain, name, version: To locate a component." -msgstr "" -"domain、name、version:用于定位组件。" - -#: ../../component/comp_spec_design.rst:227 -msgid "attr_paths, attrs: Attributes of the component." -msgstr "" -"attr_paths、attrs:组件的属性。" - -#: ../../component/comp_spec_design.rst:228 -msgid "inputs: Inputs of the component, should be DistDatas." -msgstr "" -"inputs:组件的输入,应该是 DistDatas。" - -#: ../../component/comp_spec_design.rst:229 -msgid "output_uris: Output uris for each output." -msgstr "" -"output_uris:每个输出的输出 URI。" - -#: ../../component/comp_spec_design.rst:232 -msgid "" -"**Why only one uri for each output?** For each output, only one uri is " -"provided. It will be used by all parties to generate all data_refs of " -"this output DistData. It looks weird since we may give each party a " -"different uri. However, this is not a good idea:" -msgstr "" -"**为什么每个输出只有一个 URI?** 对于每个输出,只提供一个 URI。它将被所有参与方用于生成此输出 DistData 的所有 data_ref。这看起来很奇怪,因为我们也可以会给每个参与方一个不同的 URI。然而,这不是一个好主意:" - -#: ../../component/comp_spec_design.rst:236 -msgid "" -"When we have multiple parties, the list of output uris would be extremely" -" long." -msgstr "" -"当我们有多个参与方时,输出 URI 列表会变得非常长。" - -#: ../../component/comp_spec_design.rst:237 +#: ../../component/comp_spec_design.rst:54 msgid "" -"Each party has the full control of the storage root and they could move " -"the files afterwards. We hope to keep our system simple and don't invest " -"any effort in file system management." -msgstr "" -"每个参与方都完全控制存储根目录,他们可以在之后移动文件。我们希望保持我们的系统简单,并不投入任何精力进行文件系统管理。" - -#: ../../component/comp_spec_design.rst:240 -msgid "NodeEvalResult" -msgstr "" - -#: ../../component/comp_spec_design.rst:242 -msgid "It contains output DistDatas." -msgstr "它包含输出 DistDatas。" +"We usually use **DeviceObjectCollection** to represent an MPC models. We " +"would provide more details for this part later." +msgstr "我们通常使用 **DeviceObjectCollection** 来代表 MPC 模型。稍后我们将为这部分提供更多细节。" diff --git a/docs/locales/zh_CN/LC_MESSAGES/component/index.po b/docs/locales/zh_CN/LC_MESSAGES/component/index.po index f5e9e52d8..ae6a6cb5c 100644 --- a/docs/locales/zh_CN/LC_MESSAGES/component/index.po +++ b/docs/locales/zh_CN/LC_MESSAGES/component/index.po @@ -7,7 +7,7 @@ msgid "" msgstr "" "Project-Id-Version: SecretFlow \n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2023-07-02 23:05+0800\n" +"POT-Creation-Date: 2023-10-14 16:59+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -16,98 +16,149 @@ msgstr "" "Content-Transfer-Encoding: 8bit\n" "Generated-By: Babel 2.12.1\n" -#: ../../component/index.rst:40 +#: ../../component/index.rst:58 msgid "References" -msgstr "" -"参考资料" +msgstr "参考资料" -#: ../../component/index.rst:47 +#: ../../component/index.rst:65 msgid "Documentation" -msgstr "" -"文档" +msgstr "文档" #: ../../component/index.rst:4 msgid "Component" -msgstr "" -"组件" +msgstr "组件" #: ../../component/index.rst:6 msgid "" -"Both Component Specification and SecretFlow Component List are subject to" -" modify at this moment." -msgstr "" -"当前组件规范和 SecretFlow 组件列表均有可能发生变化。" +"SecretFlow Component is based on two Specifications. On the one hand, we " +"obey `SecretFlow Open Specification " +"`_. On the other, we " +"defined cluster config and some other data types with Extended " +"Specification at :doc:`/component/comp_spec`. Extended Specification only" +" applies on SecretFlow and may not used in other privacy-preserving " +"applications in SecretFlow ecosystem. The detailed explanation of " +"Extended Specification is at :doc:`/component/comp_spec_design`." +msgstr "隐语组件基于两个标准。" +"首先我们遵守 `隐语开放标准 `_ 。" +"并且我们通过扩展标准定义了集群定义和其他数据类型,请查看 :doc:`/component/comp_spec` 。" +"扩展标准仅适用于隐语,并不适用于隐语生态下的其他隐私保护应用。对于扩展标准的详细解释在 :doc:`/component/comp_spec_design` 。" -#: ../../component/index.rst:8 +#: ../../component/index.rst:10 msgid "" -"Component Specification is a protocol brought by SecretFlow Ecosystem. It" -" mainly includes Data, Component and Node Evaluation protocol. Logically," -" if you wrap any program with Component Specification, it will be " -"recognized by Kuscia(Deployment and scheduling of SecretFlow stack) and " -"SecretFlow Platform [#f1]_ (User Interface and Control Panel) . In the " -"one word, for system developers, if you wrap your application with " -"Component Specification, you could utilize other open-source SecretFlow " -"projects with little effort." +"We wrapped some commonly used SecretFlow applications as SecretFlow " +"Component List. However, it is not final and we are updating the list " +"promptly. The full SecretFlow Component List is available at " +":doc:`/component/comp_list`." msgstr "" -"组件规范是 SecretFlow 生态系统提出的一种协议。它主要包括数据、组件" -"和节点执行协议。从逻辑上讲,如果您使用组件规范包装任何程序,它将被" -" Kuscia(SecretFlow 技术栈的部署和调度系统) 和 SecretFlow 平台" -" [#f1]_ (用户界面和控制面板) 认可。简而言之,对于系统开发人员,如果您使" -"用组件规范包装应用程序,您可以轻松地利用其他开源 SecretFlow 项目。" +"我们将一些常用的SecretFlow 应用程序包装为组件。SecretFlow 组件列表不是最终版本,我们正在不断更新列表。" +"完整的组件列表在 :doc:`/component/comp_list` 。" + +#: ../../component/index.rst:12 +msgid "" +"Now, you could run SecretFlow applications with component API or CLI " +"apart from writing Python programs. Please check " +":doc:`/component/comp_guide`." +msgstr "现在除了在Python程序中引入lib, 还还可以使用组件API或者CLI调用隐语程序,请查阅 :doc:`/component/comp_guide` 。" + +#: ../../component/index.rst:17 +msgid "" +"Migration to `SecretFlow Open Specification " +"`_" +msgstr "迁移至 `隐语开放标准 `_ " -#: ../../component/index.rst:13 +#: ../../component/index.rst:19 msgid "" -"Based on Component Specification, we wrapped some commonly used " -"SecretFlow applications. The SecretFlow Component List is not final, we " -"are updating the list promptly. Now, you could run SecretFlow " -"applications with component API or CLI apart from writing Python " -"programs." +"There are some breaking changes after introduction of **SecretFlow Open " +"Specification**, including" +msgstr "引入 **隐语开放标准** 导致了一些不兼容的改动,包括" + +#: ../../component/index.rst:22 +msgid "comp.proto" msgstr "" -"基于组件规范,我们将一些常用的 SecretFlow 应用程序包装起来。SecretFlow" -" 组件列表不是最终版本,我们正在及时更新列表。现在,您可以使用组件 API" -" 或 CLI 运行 SecretFlow 应用程序,而不必编写 Python 程序。" -#: ../../component/index.rst:16 +#: ../../component/index.rst:24 +msgid "*comp.proto* is renamed to *component.proto*" +msgstr "*comp.proto* 重命名为 *component.proto*" + +#: ../../component/index.rst:25 msgid "" -"Besides SecretFlow, we are going to wrap other applications including " -"SCQL and TECC with Component Specification." +"In message *AttrType*, *AT_UNDEFINED* is replaced with " +"*ATTR_TYPE_UNSPECIFIED*" +msgstr "在 message *AttrType* 中, *AT_UNDEFINED* 替换为 *ATTR_TYPE_UNSPECIFIED*" + +#: ../../component/index.rst:26 +msgid "" +"In message *Attribute*, *has_lower_bound* is renamed to " +"*lower_bound_enabled* and *has_upper_bound* is renamed to " +"*upper_bound_enabled*" msgstr "" -"除了 SecretFlow,我们还将使用组件规范包装其他应用,包括 SCQL 和 TECC。" +"在 message *Attribute* 中, *has_lower_bound* 重命名为 *lower_bound_enabled*, " +"*has_upper_bound* 重命名为 *upper_bound_enabled*" + +#: ../../component/index.rst:27 +msgid "In message *IoDef.TableAttrDef*, *attrs* is renamed to *extra_attrs*" +msgstr "在 message *IoDef.TableAttrDef* 中, *attrs* 重命名为 *extra_attrs*" -#: ../../component/index.rst:18 +#: ../../component/index.rst:30 +msgid "data.proto" +msgstr "" + +#: ../../component/index.rst:32 msgid "" -"For developers who are not familiar with :doc:`/component/comp_spec`, " -"they are encouraged to check :doc:`/component/comp_spec_design` first." +"In message *SystemInfo*, *app_name* is renamed to *app* while " +"*secretflow* (SFClusterDesc) is replaced with *app_meta* (Any)." msgstr "" -"对于不熟悉 :doc:`/component/comp_spec` 的开发人员,建议首先查看 :doc:`/component/comp_spec_design` 。" +"在 message *SystemInfo*, *app_name* 重命名为 *app*, *secretflow* " +"(SFClusterDesc) 替换为 *app_meta* (Any)" + +#: ../../component/index.rst:33 +msgid "Message *StorageConfig* is moved from *cluster.proto*" +msgstr "Message *StorageConfig* 迁移至 *cluster.proto*" -#: ../../component/index.rst:20 +#: ../../component/index.rst:34 msgid "" -"After that, please check :doc:`/component/comp_list` and corresponding " -"guides at :doc:`/component/comp_guide`." +"In message *IndividualTable* and *VerticalTable*, *num_lines* is renamed " +"to *line_count*." msgstr "" -"之后,请查看 :doc:`/component/comp_list` 以及 :doc:`/component/comp_guide` 中的相关指南。" +"在 message *IndividualTable* 和 *VerticalTable*, *num_lines* 重命名为 " +"*line_count* 。" -#: ../../component/index.rst:24 -msgid "Footnotes" -msgstr "脚注" +#: ../../component/index.rst:35 +msgid "In message *DistData*, *sys_info* is renamed to *system_info*." +msgstr "在 message *DistData*, *sys_info* 重命名为 *system_info* 。" -#: ../../component/index.rst:25 -msgid "Product name is undecided." -msgstr "产品名称未定。" +#: ../../component/index.rst:37 +msgid "We are sorry for inconvenience." +msgstr "我们对带来的不便感到抱歉。" -#: ../../component/index.rst:30 +#: ../../component/index.rst:40 msgid "Announcement" msgstr "公告" -#: ../../component/index.rst:33 +#: ../../component/index.rst:43 +msgid "October, 2023" +msgstr "" + +#: ../../component/index.rst:45 +msgid "" +"We officially launched `SecretFlow Open Specification " +"`_ which contains the " +"most common parts which are shared by other privacy-preserving " +"applications." +msgstr "" +"我们正式发布了 `隐语开放标准 `_ , " +"包含了所有隐私计算应用共享的部分。" + +#: ../../component/index.rst:46 +msgid "The remaining part are called Extended Specification." +msgstr "剩余部分被称为 **扩展标准** 。" + +#: ../../component/index.rst:51 msgid "July, 2023" msgstr "" -#: ../../component/index.rst:35 +#: ../../component/index.rst:53 msgid "" "From SecretFlow 1.0.0, we officially launch the first version of " "Component Specification and SecretFlow Component List based on it." -msgstr "" -"从 SecretFlow 1.0.0 开始,我们正式推出基于组件规范的第一个版本和相" -"应的 SecretFlow 组件列表。" +msgstr "从 SecretFlow 1.0.0 开始,我们正式推出基于组件规范的第一个版本和相应的 SecretFlow 组件列表。" diff --git a/docs/locales/zh_CN/LC_MESSAGES/getting_started/deployment.po b/docs/locales/zh_CN/LC_MESSAGES/getting_started/deployment.po index 2b458d783..d5aa0c476 100644 --- a/docs/locales/zh_CN/LC_MESSAGES/getting_started/deployment.po +++ b/docs/locales/zh_CN/LC_MESSAGES/getting_started/deployment.po @@ -37,10 +37,10 @@ msgstr "基于Kuscia部署" #: ../../getting_started/deployment.md:9 msgid "" -"Kuscia is a K8s-based privacy computing task orchestration framework. It " -"provides a unified privacy computing foundation that can abstract away " +"Kuscia is a lightweight privacy-preserving computing task orchestration framework based on K3s. It " +"provides a unified privacy-preserving computing foundation that can abstract away " "heterogeneous infrastructure and protocols." -msgstr "Kuscia是一款基于K8s的隐私计算任务编排框架,旨在屏蔽异构的基础设施和协议,提供统一的隐私计算底座。" +msgstr "Kuscia是一款基于K3s的轻量级隐私计算任务编排框架,旨在屏蔽异构的基础设施和协议,提供统一的隐私计算底座。" #: ../../getting_started/deployment.md:12 msgid "" diff --git a/docs/locales/zh_CN/LC_MESSAGES/tutorial/data_preprocessing_with_data_frame.po b/docs/locales/zh_CN/LC_MESSAGES/tutorial/data_preprocessing_with_data_frame.po index a91dc8101..2a68884f4 100644 --- a/docs/locales/zh_CN/LC_MESSAGES/tutorial/data_preprocessing_with_data_frame.po +++ b/docs/locales/zh_CN/LC_MESSAGES/tutorial/data_preprocessing_with_data_frame.po @@ -187,11 +187,11 @@ msgstr "WOE编码" msgid "" "secretflow provides ``VertWoeBinning`` to bin the features into buckets " "by quantile or chimerge method, and calculate the woe value and iv value " -"in each bucket. And ``VertWOESubstitution`` can substitute the features " +"in each bucket. And ``VertBinSubstitution`` can substitute the features " "with the woe value." msgstr "" "隐语提供了 `VertWoeBinning` " -",可以对特征按照数量或者chimerge方法进行分桶,并计算每个分桶的WOE和IV值。**VertWOESubstitution** " +",可以对特征按照数量或者chimerge方法进行分桶,并计算每个分桶的WOE和IV值。**VertBinSubstitution** " "可以把特征值替换成WOE值。" #: ../../tutorial/data_preprocessing_with_data_frame.ipynb:921 diff --git a/docs/tutorial/WeightOfEvidenceEncoding.ipynb b/docs/tutorial/WeightOfEvidenceEncoding.ipynb index 791bce618..91f13f745 100644 --- a/docs/tutorial/WeightOfEvidenceEncoding.ipynb +++ b/docs/tutorial/WeightOfEvidenceEncoding.ipynb @@ -71,7 +71,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-06-26 13:58:28,126\tINFO worker.py:1544 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8266 \u001b[39m\u001b[22m\n" + "2023-10-07 18:03:23,909\tINFO worker.py:1538 -- Started a local Ray instance.\n" ] } ], @@ -123,36 +123,53 @@ "output_type": "stream", "text": [ "INFO:root:Create proxy actor with party alice.\n", - "INFO:root:Create proxy actor with party bob.\n", - "\u001b[2m\u001b[36m(_run pid=3833768)\u001b[0m INFO:jax._src.lib.xla_bridge:Unable to initialize backend 'tpu_driver': NOT_FOUND: Unable to find driver in registry given worker: \n", - "\u001b[2m\u001b[36m(_run pid=3833768)\u001b[0m INFO:jax._src.lib.xla_bridge:Unable to initialize backend 'cuda': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'\n", - "\u001b[2m\u001b[36m(_run pid=3833768)\u001b[0m INFO:jax._src.lib.xla_bridge:Unable to initialize backend 'rocm': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'\n", - "\u001b[2m\u001b[36m(_run pid=3833768)\u001b[0m INFO:jax._src.lib.xla_bridge:Unable to initialize backend 'tpu': INVALID_ARGUMENT: TpuPlatform is not available.\n", - "\u001b[2m\u001b[36m(_run pid=3833768)\u001b[0m INFO:jax._src.lib.xla_bridge:Unable to initialize backend 'plugin': xla_extension has no attributes named get_plugin_device_client. Compile TensorFlow with //tensorflow/compiler/xla/python:enable_plugin_device set to true (defaults to false) to enable this.\n", - "\u001b[2m\u001b[36m(_run pid=3833768)\u001b[0m WARNING:jax._src.lib.xla_bridge:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)\n" + "INFO:root:Create proxy actor with party bob.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(_run pid=3833768)\u001b[0m [2023-06-26 13:58:34.369] [info] [thread_pool.cc:30] Create a fixed thread pool with size 63\n", - "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3839631)\u001b[0m 2023-06-26 13:58:34.779 [info] [thread_pool.cc:ThreadPool:30] Create a fixed thread pool with size 63\n", - "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3839630)\u001b[0m 2023-06-26 13:58:34.869 [info] [thread_pool.cc:ThreadPool:30] Create a fixed thread pool with size 63\n" + "\u001b[2m\u001b[36m(SPURuntime pid=3800442)\u001b[0m 2023-10-07 18:03:26.686 [info] [default_brpc_retry_policy.cc:DoRetry:52] socket error, sleep=1000000us and retry\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "INFO:root:Create proxy actor with party bob.\n", - "INFO:root:Create proxy actor with party alice.\n" + "\u001b[2m\u001b[36m(_run pid=3793086)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'cuda': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'\n", + "\u001b[2m\u001b[36m(_run pid=3793086)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'rocm': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'\n", + "\u001b[2m\u001b[36m(_run pid=3793086)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'tpu': INVALID_ARGUMENT: TpuPlatform is not available.\n", + "\u001b[2m\u001b[36m(_run pid=3793086)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'plugin': xla_extension has no attributes named get_plugin_device_client. Compile TensorFlow with //tensorflow/compiler/xla/python:enable_plugin_device set to true (defaults to false) to enable this.\n", + "\u001b[2m\u001b[36m(_run pid=3793086)\u001b[0m WARNING:jax._src.xla_bridge:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ + "\u001b[2m\u001b[36m(_run pid=3793086)\u001b[0m [2023-10-07 18:03:27.436] [info] [thread_pool.cc:30] Create a fixed thread pool with size 63\n", + "\u001b[2m\u001b[36m(SPURuntime pid=3800442)\u001b[0m 2023-10-07 18:03:27.686 [info] [default_brpc_retry_policy.cc:LogHttpDetail:29] cntl ErrorCode '112', http status code '200', response header '', error msg '[E111]Fail to connect Socket{id=0 addr=127.0.0.1:34547} (0x0x32463c0): Connection refused [R1][E112]Not connected to 127.0.0.1:34547 yet, server_id=0'\n", + "\u001b[2m\u001b[36m(SPURuntime pid=3800442)\u001b[0m 2023-10-07 18:03:27.686 [info] [default_brpc_retry_policy.cc:DoRetry:75] aggressive retry, sleep=1000000us and retry\n", + "\u001b[2m\u001b[36m(SPURuntime pid=3800442)\u001b[0m 2023-10-07 18:03:28.686 [info] [default_brpc_retry_policy.cc:LogHttpDetail:29] cntl ErrorCode '112', http status code '200', response header '', error msg '[E111]Fail to connect Socket{id=0 addr=127.0.0.1:34547} (0x0x32463c0): Connection refused [R1][E112]Not connected to 127.0.0.1:34547 yet, server_id=0 [R2][E112]Not connected to 127.0.0.1:34547 yet, server_id=0'\n", + "\u001b[2m\u001b[36m(SPURuntime pid=3800442)\u001b[0m 2023-10-07 18:03:28.686 [info] [default_brpc_retry_policy.cc:DoRetry:75] aggressive retry, sleep=1000000us and retry\n", + "\u001b[2m\u001b[36m(SPURuntime pid=3800441)\u001b[0m 2023-10-07 18:03:28.687 [info] [default_brpc_retry_policy.cc:DoRetry:69] not retry for reached rcp timeout, ErrorCode '1008', error msg '[E1008]Reached timeout=2000ms @127.0.0.1:57517'\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:Create proxy actor with party bob.\n", + "INFO:root:Create proxy actor with party alice.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3800442)\u001b[0m 2023-10-07 18:03:30.776 [info] [thread_pool.cc:ThreadPool:30] Create a fixed thread pool with size 63\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3800441)\u001b[0m 2023-10-07 18:03:30.794 [info] [thread_pool.cc:ThreadPool:30] Create a fixed thread pool with size 63\n", " x11 x12 x13 x14 x15 x16 x17 \\\n", "0 0.241531 -0.705729 -0.020094 -0.486932 0.851992 0.035219 -0.796096 \n", "1 -0.402727 0.115744 0.468149 -0.697152 0.386395 0.712798 0.239583 \n", @@ -212,10 +229,10 @@ ], "source": [ "from secretflow.preprocessing.binning.vert_woe_binning import VertWoeBinning\n", - "from secretflow.preprocessing.binning.vert_woe_substitution import VertWOESubstitution\n", + "from secretflow.preprocessing.binning.vert_bin_substitution import VertBinSubstitution\n", "\n", "binning = VertWoeBinning(spu)\n", - "woe_rules = binning.binning(\n", + "bin_rules = binning.binning(\n", " vdf,\n", " binning_method=\"chimerge\",\n", " bin_num=4,\n", @@ -223,8 +240,8 @@ " label_name=\"y\",\n", ")\n", "\n", - "woe_sub = VertWOESubstitution()\n", - "vdf = woe_sub.substitution(vdf, woe_rules)\n", + "woe_sub = VertBinSubstitution()\n", + "vdf = woe_sub.substitution(vdf, bin_rules)\n", "\n", "# this is for demo only, be careful with reveal\n", "print(sf.reveal(vdf.partitions[alice].data))\n", @@ -250,7 +267,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Recall that the woe_rules is a dictionary `{PYU: PYUObject}`, where each `PYUObject` itself is a dictionary of the following type:\n", + "Recall that the bin_rules is a dictionary `{PYU: PYUObject}`, where each `PYUObject` itself is a dictionary of the following type:\n", "```\n", "{\n", " \"variables\":[\n", @@ -261,8 +278,8 @@ " \"split_points\": list[float], # left-open right-close split points\n", " \"total_counts\": list[int], # total samples count in each bins.\n", " \"else_counts\": int, # np.nan samples count\n", - " \"woes\": list[float], # woe values for each bins.\n", - " \"else_woe\": float, # woe value for np.nan samples.\n", + " \"filling_values\": list[float], # woe values for each bins.\n", + " \"else_filling_value\": float, # woe value for np.nan samples.\n", " },\n", " # ... others feature\n", " ],\n", @@ -291,7 +308,7 @@ "outputs": [], "source": [ "# alice is label holder\n", - "dict_pyu_object = woe_rules[alice]\n", + "dict_pyu_object = bin_rules[alice]\n", "\n", "\n", "def extract_name_and_feature_iv(list_of_feature_iv_info):\n", @@ -363,7 +380,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.15" + "version": "3.8.17" }, "orig_nbformat": 4 }, diff --git a/docs/tutorial/data_preprocessing_with_data_frame.ipynb b/docs/tutorial/data_preprocessing_with_data_frame.ipynb index 7ef32bfe5..de64fa920 100644 --- a/docs/tutorial/data_preprocessing_with_data_frame.ipynb +++ b/docs/tutorial/data_preprocessing_with_data_frame.ipynb @@ -60,10 +60,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "9ad74320-2c3a-4c86-aea4-6688d96d2230", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The version of SecretFlow: 1.2.0.dev20231007\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-10-07 18:02:54,076\tINFO worker.py:1538 -- Started a local Ray instance.\n" + ] + } + ], "source": [ "import secretflow as sf\n", "\n", @@ -526,7 +541,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Columns: Index(['target_setosa', 'target_versicolor', 'target_virginica'], dtype='object')\n", + "Columns: ['target_setosa', 'target_versicolor', 'target_virginica']\n", "Min: \n", " target_setosa 0.0\n", "target_versicolor 0.0\n", @@ -578,7 +593,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Columns: Index(['target'], dtype='object')\n", + "Columns: ['target']\n", "Min: \n", " target 0\n", "dtype: int64\n", @@ -586,6 +601,16 @@ " target 2\n", "dtype: int64\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[36m(_run pid=3770865)\u001b[0m /home/zoupeicheng.zpc/miniconda3/envs/py38/lib/python3.8/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but MinMaxScaler was fitted without feature names\n", + "\u001b[2m\u001b[36m(_run pid=3770865)\u001b[0m warnings.warn(\n", + "\u001b[2m\u001b[36m(_run pid=3770865)\u001b[0m /home/zoupeicheng.zpc/miniconda3/envs/py38/lib/python3.8/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but StandardScaler was fitted without feature names\n", + "\u001b[2m\u001b[36m(_run pid=3770865)\u001b[0m warnings.warn(\n" + ] } ], "source": [ @@ -649,14 +674,14 @@ "source": [ "#### WOE encoding\n", "\n", - "secretflow provides `VertWoeBinning` to bin the features into buckets by quantile or chimerge method, and calculate the woe value and iv value in each bucket. And `VertWOESubstitution` can substitute the features with the woe value.\n", + "secretflow provides `VertWoeBinning` to bin the features into buckets by quantile or chimerge method, and calculate the woe value and iv value in each bucket. And `VertBinSubstitution` can substitute the features with the woe value.\n", "\n", "Here is an example to encode features to woe." ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 12, "id": "16e22ed1", "metadata": {}, "outputs": [ @@ -693,11 +718,67 @@ "9998 0.914291 -0.473056 0.616257 1\n", "9999 -0.602927 -0.021368 0.885519 0\n", "\n", - "[10000 rows x 4 columns]\n", - "woe_rules for alice:\n", - " {'variables': [{'name': 'x1', 'type': 'numeric', 'split_points': [-0.6048731088638305, -0.2093792676925656, 0.1864844083786014, 0.59245548248291], 'woes': [0.13818949789069251, 0.1043626580338657, 0.012473718947119546, -0.08312553911263658, -0.16055365315128886], 'ivs': [0.003719895277030594, 0.0021358508878795324, 3.104792224414912e-05, 0.001402356358949689, 0.005298781871642081], 'total_counts': [2000, 2000, 2000, 2000, 2000], 'else_woe': -0.7620562438001163, 'else_iv': 6.385923806287768e-05, 'else_counts': 0}, {'name': 'x2', 'type': 'numeric', 'split_points': [-0.6180597543716427, -0.21352910995483343, 0.18739376068115243, 0.5941788196563724], 'woes': [-0.5795513521445242, -0.17800092651085536, 0.02175062133493428, 0.32061945260518093, 0.5508555713857505], 'ivs': [0.07282166470764677, 0.006530977061248972, 9.424153452104671e-05, 0.019271267842100412, 0.05393057944296773], 'total_counts': [2000, 2000, 2000, 2000, 2000], 'else_woe': -0.7620562438001163, 'else_iv': 6.385923806287768e-05, 'else_counts': 0}, {'name': 'x3', 'type': 'numeric', 'split_points': [-0.5902724504470824, -0.19980529546737677, 0.2072824716567998, 0.6102998018264773], 'woes': [-0.5371125119817587, -0.25762552591997334, -0.022037294110497735, 0.3445721198562295, 0.6304998785437507], 'ivs': [0.062290057806557865, 0.013846189451494859, 9.751520288052276e-05, 0.022140413942886045, 0.06928418076454097], 'total_counts': [2000, 2000, 2000, 2000, 2000], 'else_woe': -0.7620562438001163, 'else_iv': 6.385923806287768e-05, 'else_counts': 0}]}\n", - "woe_rules for bob:\n", - " {'variables': [{'name': 'x18', 'type': 'numeric', 'split_points': [-0.595701837539673, -0.18646149635314926, 0.20281808376312258, 0.5969645977020259], 'woes': [0.7644870924575128, 0.3796894156855692, 0.09717493242210018, -0.3856750302449858, -0.6258460389655672], 'ivs': [0.0984553650514661, 0.026672043182215357, 0.0018543743703697353, 0.031572379289703925, 0.08527363286990977], 'total_counts': [2000, 2000, 2000, 2000, 2000], 'else_woe': -0.7620562438001163, 'else_iv': 6.385923806287768e-05, 'else_counts': 0}, {'name': 'x19', 'type': 'numeric', 'split_points': [-0.5988080263137814, -0.2046342611312865, 0.1958462238311768, 0.6044608354568479], 'woes': [-0.24268812281101115, -0.18886157950622262, 0.061543825157264156, 0.15773711862524092, 0.24528753075504478], 'ivs': [0.012260322787780317, 0.0073647296376464335, 0.0007489127774465146, 0.0048277504221347, 0.011464529974064627], 'total_counts': [2000, 2000, 2000, 2000, 2000], 'else_woe': -0.7620562438001163, 'else_iv': 6.385923806287768e-05, 'else_counts': 0}, {'name': 'x20', 'type': 'numeric', 'split_points': [-0.6013513207435608, -0.2053116083145139, 0.19144065380096467, 0.5987063169479374], 'woes': [1.1083043875152403, 0.5598579367731444, 0.15773711862524092, -0.4618210945247346, -0.9083164208649596], 'ivs': [0.1887116758575296, 0.055586120695474514, 0.0048277504221347, 0.04568212645465593, 0.18279475271010598], 'total_counts': [2000, 2000, 2000, 2000, 2000], 'else_woe': -0.7620562438001163, 'else_iv': 6.385923806287768e-05, 'else_counts': 0}]}\n", + "[10000 rows x 4 columns]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[36m(_run pid=3770865)\u001b[0m /home/zoupeicheng.zpc/miniconda3/envs/py38/lib/python3.8/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but KBinsDiscretizer was fitted without feature names\n", + "\u001b[2m\u001b[36m(_run pid=3770865)\u001b[0m warnings.warn(\n", + "INFO:root:Create proxy actor with party alice.\n", + "INFO:root:Create proxy actor with party bob.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[36m(SPURuntime pid=3776105)\u001b[0m 2023-10-07 18:02:57.469 [info] [default_brpc_retry_policy.cc:DoRetry:52] socket error, sleep=1000000us and retry\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[36m(_run pid=3770127)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'cuda': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'\n", + "\u001b[2m\u001b[36m(_run pid=3770127)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'rocm': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'\n", + "\u001b[2m\u001b[36m(_run pid=3770127)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'tpu': INVALID_ARGUMENT: TpuPlatform is not available.\n", + "\u001b[2m\u001b[36m(_run pid=3770127)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'plugin': xla_extension has no attributes named get_plugin_device_client. Compile TensorFlow with //tensorflow/compiler/xla/python:enable_plugin_device set to true (defaults to false) to enable this.\n", + "\u001b[2m\u001b[36m(_run pid=3770127)\u001b[0m WARNING:jax._src.xla_bridge:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[36m(_run pid=3770127)\u001b[0m [2023-10-07 18:02:57.670] [info] [thread_pool.cc:30] Create a fixed thread pool with size 63\n", + "\u001b[2m\u001b[36m(SPURuntime pid=3776105)\u001b[0m 2023-10-07 18:02:58.469 [info] [default_brpc_retry_policy.cc:LogHttpDetail:29] cntl ErrorCode '112', http status code '200', response header '', error msg '[E111]Fail to connect Socket{id=0 addr=127.0.0.1:51263} (0x0x32ed440): Connection refused [R1][E112]Not connected to 127.0.0.1:51263 yet, server_id=0'\n", + "\u001b[2m\u001b[36m(SPURuntime pid=3776105)\u001b[0m 2023-10-07 18:02:58.469 [info] [default_brpc_retry_policy.cc:DoRetry:75] aggressive retry, sleep=1000000us and retry\n", + "\u001b[2m\u001b[36m(SPURuntime pid=3776107)\u001b[0m 2023-10-07 18:02:59.479 [info] [default_brpc_retry_policy.cc:DoRetry:69] not retry for reached rcp timeout, ErrorCode '1008', error msg '[E1008]Reached timeout=2000ms @127.0.0.1:44205'\n", + "\u001b[2m\u001b[36m(SPURuntime pid=3776105)\u001b[0m 2023-10-07 18:02:59.469 [info] [default_brpc_retry_policy.cc:LogHttpDetail:29] cntl ErrorCode '112', http status code '200', response header '', error msg '[E111]Fail to connect Socket{id=0 addr=127.0.0.1:51263} (0x0x32ed440): Connection refused [R1][E112]Not connected to 127.0.0.1:51263 yet, server_id=0 [R2][E112]Not connected to 127.0.0.1:51263 yet, server_id=0'\n", + "\u001b[2m\u001b[36m(SPURuntime pid=3776105)\u001b[0m 2023-10-07 18:02:59.470 [info] [default_brpc_retry_policy.cc:DoRetry:75] aggressive retry, sleep=1000000us and retry\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:Create proxy actor with party alice.\n", + "INFO:root:Create proxy actor with party bob.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3776107)\u001b[0m 2023-10-07 18:03:01.500 [info] [thread_pool.cc:ThreadPool:30] Create a fixed thread pool with size 63\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3776105)\u001b[0m 2023-10-07 18:03:01.503 [info] [thread_pool.cc:ThreadPool:30] Create a fixed thread pool with size 63\n", + "bin_rules for alice:\n", + " {'variables': [{'name': 'x1', 'type': 'numeric', 'split_points': [-0.6048731088638305, -0.2093792676925656, 0.1864844083786014, 0.59245548248291], 'filling_values': [0.13818949789069251, 0.1043626580338657, 0.012473718947119546, -0.08312553911263658, -0.16055365315128886], 'total_counts': [2000, 2000, 2000, 2000, 2000], 'else_filling_value': -0.7620562438001163, 'else_counts': 0}, {'name': 'x2', 'type': 'numeric', 'split_points': [-0.6180597543716427, -0.21352910995483343, 0.18739376068115243, 0.5941788196563724], 'filling_values': [-0.5795513521445242, -0.17800092651085536, 0.02175062133493428, 0.32061945260518093, 0.5508555713857505], 'total_counts': [2000, 2000, 2000, 2000, 2000], 'else_filling_value': -0.7620562438001163, 'else_counts': 0}, {'name': 'x3', 'type': 'numeric', 'split_points': [-0.5902724504470824, -0.19980529546737677, 0.2072824716567998, 0.6102998018264773], 'filling_values': [-0.5371125119817587, -0.25762552591997334, -0.022037294110497735, 0.3445721198562295, 0.6304998785437507], 'total_counts': [2000, 2000, 2000, 2000, 2000], 'else_filling_value': -0.7620562438001163, 'else_counts': 0}]}\n", + "bin_rules for bob:\n", + " {'variables': [{'name': 'x18', 'type': 'numeric', 'split_points': [-0.595701837539673, -0.18646149635314926, 0.20281808376312258, 0.5969645977020259], 'filling_values': [0.7644870924575128, 0.3796894156855692, 0.09717493242210018, -0.3856750302449858, -0.6258460389655672], 'total_counts': [2000, 2000, 2000, 2000, 2000], 'else_filling_value': -0.7620562438001163, 'else_counts': 0}, {'name': 'x19', 'type': 'numeric', 'split_points': [-0.5988080263137814, -0.2046342611312865, 0.1958462238311768, 0.6044608354568479], 'filling_values': [-0.24268812281101115, -0.18886157950622262, 0.061543825157264156, 0.15773711862524092, 0.24528753075504478], 'total_counts': [2000, 2000, 2000, 2000, 2000], 'else_filling_value': -0.7620562438001163, 'else_counts': 0}, {'name': 'x20', 'type': 'numeric', 'split_points': [-0.6013513207435608, -0.2053116083145139, 0.19144065380096467, 0.5987063169479374], 'filling_values': [1.1083043875152403, 0.5598579367731444, 0.15773711862524092, -0.4618210945247346, -0.9083164208649596], 'total_counts': [2000, 2000, 2000, 2000, 2000], 'else_filling_value': -0.7620562438001163, 'else_counts': 0}], 'feature_iv_info': [{'name': 'x18', 'ivs': (0.0984553650514661, 0.026672043182215357, 0.0018543743703697353, 0.031572379289703925, 0.08527363286990977), 'else_iv': 6.385923806287768e-05, 'feature_iv': 0.24382779476366487}, {'name': 'x19', 'ivs': (0.012260322787780317, 0.0073647296376464335, 0.0007489127774465146, 0.0048277504221347, 0.011464529974064627), 'else_iv': 6.385923806287768e-05, 'feature_iv': 0.03666624559907259}, {'name': 'x20', 'ivs': (0.1887116758575296, 0.055586120695474514, 0.0048277504221347, 0.04568212645465593, 0.18279475271010598), 'else_iv': 6.385923806287768e-05, 'feature_iv': 0.4776024261399007}, {'name': 'x1', 'ivs': (0.003719895277030594, 0.0021358508878795324, 3.104792224414912e-05, 0.001402356358949689, 0.005298781871642081), 'else_iv': 6.385923806287768e-05, 'feature_iv': 0.012587932317746046}, {'name': 'x2', 'ivs': (0.07282166470764677, 0.006530977061248972, 9.424153452104671e-05, 0.019271267842100412, 0.05393057944296773), 'else_iv': 6.385923806287768e-05, 'feature_iv': 0.15264873058848494}, {'name': 'x3', 'ivs': (0.062290057806557865, 0.013846189451494859, 9.751520288052276e-05, 0.022140413942886045, 0.06928418076454097), 'else_iv': 6.385923806287768e-05, 'feature_iv': 0.16765835716836025}]}\n", "substituted ds in alice:\n", " x1 x2 x3\n", "0 0.104363 0.550856 -0.537113\n", @@ -746,7 +827,7 @@ "from secretflow.preprocessing.binning.vert_woe_binning import VertWoeBinning\n", "\n", "binning = VertWoeBinning(spu)\n", - "woe_rules = binning.binning(\n", + "bin_rules = binning.binning(\n", " vdf,\n", " binning_method=\"quantile\",\n", " bin_num=5,\n", @@ -754,13 +835,13 @@ " label_name=\"y\",\n", ")\n", "\n", - "print(f\"woe_rules for alice:\\n {sf.reveal(woe_rules[alice])}\")\n", - "print(f\"woe_rules for bob:\\n {sf.reveal(woe_rules[bob])}\")\n", + "print(f\"bin_rules for alice:\\n {sf.reveal(bin_rules[alice])}\")\n", + "print(f\"bin_rules for bob:\\n {sf.reveal(bin_rules[bob])}\")\n", "\n", - "from secretflow.preprocessing.binning.vert_woe_substitution import VertWOESubstitution\n", + "from secretflow.preprocessing.binning.vert_bin_substitution import VertBinSubstitution\n", "\n", - "woe_sub = VertWOESubstitution()\n", - "sub_data = woe_sub.substitution(vdf, woe_rules)\n", + "woe_sub = VertBinSubstitution()\n", + "sub_data = woe_sub.substitution(vdf, bin_rules)\n", "\n", "print(f\"substituted ds in alice:\\n {sf.reveal(sub_data.partitions[alice].data)}\")\n", "print(f\"substituted ds in bob:\\n {sf.reveal(sub_data.partitions[bob].data)}\")" @@ -810,7 +891,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.8.17" }, "vscode": { "interpreter": { diff --git a/docs/tutorial/practical_case_walkthrough_using_sf_with_spu.ipynb b/docs/tutorial/practical_case_walkthrough_using_sf_with_spu.ipynb index d105d3308..1deba5209 100644 --- a/docs/tutorial/practical_case_walkthrough_using_sf_with_spu.ipynb +++ b/docs/tutorial/practical_case_walkthrough_using_sf_with_spu.ipynb @@ -1,7 +1,6 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -21,7 +20,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -29,7 +27,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "collapsed": true @@ -45,7 +42,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -59,7 +55,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -67,7 +62,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -77,7 +71,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -106,7 +99,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -140,7 +132,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -178,7 +169,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -217,7 +207,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -225,7 +214,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -256,7 +244,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -285,7 +272,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -332,7 +318,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -342,7 +327,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -611,7 +595,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -1141,7 +1124,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -1246,7 +1228,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -1280,7 +1261,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -1289,7 +1269,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -1341,7 +1320,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -1595,31 +1573,13 @@ "\n", "# NOTE: Reveal is only used for demo only!!\n", "print(f\"orig ds in alice:\\n {sf.reveal(train_ds_v2.partitions[alice].data)}\")\n", - "train_ds_v2.partitions[alice].data = alice(replace)(\n", - " train_ds_v2.partitions[alice].data, col_maps\n", - ")\n", - "train_ds_v2.partitions[bob].data = bob(replace)(\n", - " train_ds_v2.partitions[bob].data, col_maps\n", - ")\n", - "train_ds_v2.partitions[carol].data = carol(replace)(\n", - " train_ds_v2.partitions[carol].data, col_maps\n", - ")\n", + "train_ds_v2 = train_ds_v2.apply_func(replace, col_maps=col_maps)\n", "\n", "print(f\"orig ds in alice:\\n {sf.reveal(train_ds_v2.partitions[alice].data)}\")\n", - "\n", - "test_ds_v2.partitions[alice].data = alice(replace)(\n", - " test_ds_v2.partitions[alice].data, col_maps\n", - ")\n", - "test_ds_v2.partitions[bob].data = bob(replace)(\n", - " test_ds_v2.partitions[bob].data, col_maps\n", - ")\n", - "test_ds_v2.partitions[carol].data = carol(replace)(\n", - " test_ds_v2.partitions[carol].data, col_maps\n", - ")" + "test_ds_v2 = test_ds_v2.apply_func(replace, col_maps=col_maps)" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -1628,7 +1588,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -1761,7 +1720,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -1872,7 +1830,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -1886,7 +1843,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -2040,7 +1996,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -2048,7 +2003,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [] @@ -2181,7 +2135,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -2491,7 +2444,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -2510,7 +2462,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -2523,7 +2474,7 @@ ], "metadata": { "kernelspec": { - "display_name": "sf", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, diff --git a/docs/tutorial/risk_control_scenario.ipynb b/docs/tutorial/risk_control_scenario.ipynb index 89afade86..a31eb8d41 100644 --- a/docs/tutorial/risk_control_scenario.ipynb +++ b/docs/tutorial/risk_control_scenario.ipynb @@ -15,7 +15,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "> Last updated: May 12, 2023\n", + "> Last updated: Oct 7, 2023\n", ">\n", "> 请使用v0.8.3或以上版本的隐语进行实验。\n", ">\n", @@ -82,12 +82,18 @@ "execution_count": 1, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The version of SecretFlow: 1.2.0.dev20231007\n" + ] + }, { "name": "stderr", "output_type": "stream", "text": [ - "Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.\n", - "2023-06-19 19:58:52,337\tINFO worker.py:1538 -- Started a local Ray instance.\n" + "2023-10-07 18:03:47,500\tINFO worker.py:1538 -- Started a local Ray instance.\n" ] } ], @@ -560,64 +566,64 @@ " \n", " \n", " \n", - " 34307\n", - " 30\n", - " self-employed\n", - " married\n", + " 29713\n", + " 55\n", + " management\n", + " single\n", " tertiary\n", " no\n", - " 2666\n", + " -350\n", " no\n", " no\n", - " 34308\n", + " 29714\n", " \n", " \n", - " 39488\n", - " 34\n", - " management\n", + " 37441\n", + " 49\n", + " services\n", " married\n", " secondary\n", " no\n", - " 724\n", + " 2409\n", " yes\n", " no\n", - " 39489\n", + " 37442\n", " \n", " \n", - " 15177\n", - " 29\n", - " management\n", + " 16828\n", + " 37\n", + " entrepreneur\n", " married\n", " tertiary\n", " no\n", - " 1\n", - " yes\n", + " 1514\n", " no\n", - " 15178\n", + " yes\n", + " 16829\n", " \n", " \n", - " 8390\n", - " 42\n", - " technician\n", - " single\n", - " unknown\n", + " 20613\n", + " 55\n", + " blue-collar\n", + " married\n", + " secondary\n", + " no\n", + " 388\n", " no\n", - " 344\n", - " yes\n", " no\n", - " 8391\n", + " 20614\n", " \n", " \n", - " 24188\n", - " 58\n", - " technician\n", + " 25582\n", + " 54\n", + " admin.\n", " married\n", " secondary\n", " no\n", - " -942\n", + " 3136\n", " yes\n", " no\n", - " 24189\n", + " 25583\n", " \n", " \n", " ...\n", @@ -632,64 +638,64 @@ " ...\n", " \n", " \n", - " 30242\n", - " 29\n", - " services\n", + " 42934\n", + " 34\n", + " technician\n", " married\n", - " primary\n", + " secondary\n", " no\n", - " 549\n", + " 3000\n", " yes\n", - " no\n", - " 30243\n", + " yes\n", + " 42935\n", " \n", " \n", - " 29936\n", - " 31\n", + " 4141\n", + " 35\n", " technician\n", " single\n", - " tertiary\n", + " secondary\n", " no\n", - " 665\n", + " -206\n", " yes\n", " no\n", - " 29937\n", + " 4142\n", " \n", " \n", - " 13661\n", - " 32\n", + " 5478\n", + " 31\n", " admin.\n", " single\n", - " secondary\n", - " yes\n", - " -17\n", + " tertiary\n", " no\n", + " 701\n", + " yes\n", " no\n", - " 13662\n", + " 5479\n", " \n", " \n", - " 8968\n", - " 41\n", - " management\n", + " 6257\n", + " 33\n", + " blue-collar\n", " married\n", - " tertiary\n", - " no\n", - " 5\n", - " no\n", + " secondary\n", " no\n", - " 8969\n", + " -241\n", + " yes\n", + " yes\n", + " 6258\n", " \n", " \n", - " 23030\n", - " 41\n", - " management\n", + " 2776\n", + " 40\n", + " blue-collar\n", " married\n", - " tertiary\n", - " no\n", - " 725\n", + " secondary\n", " no\n", + " -31\n", + " yes\n", " no\n", - " 23031\n", + " 2777\n", " \n", " \n", "\n", @@ -697,31 +703,31 @@ "" ], "text/plain": [ - " age job marital education default balance housing loan \\\n", - "34307 30 self-employed married tertiary no 2666 no no \n", - "39488 34 management married secondary no 724 yes no \n", - "15177 29 management married tertiary no 1 yes no \n", - "8390 42 technician single unknown no 344 yes no \n", - "24188 58 technician married secondary no -942 yes no \n", - "... ... ... ... ... ... ... ... ... \n", - "30242 29 services married primary no 549 yes no \n", - "29936 31 technician single tertiary no 665 yes no \n", - "13661 32 admin. single secondary yes -17 no no \n", - "8968 41 management married tertiary no 5 no no \n", - "23030 41 management married tertiary no 725 no no \n", + " age job marital education default balance housing loan \\\n", + "29713 55 management single tertiary no -350 no no \n", + "37441 49 services married secondary no 2409 yes no \n", + "16828 37 entrepreneur married tertiary no 1514 no yes \n", + "20613 55 blue-collar married secondary no 388 no no \n", + "25582 54 admin. married secondary no 3136 yes no \n", + "... ... ... ... ... ... ... ... ... \n", + "42934 34 technician married secondary no 3000 yes yes \n", + "4141 35 technician single secondary no -206 yes no \n", + "5478 31 admin. single tertiary no 701 yes no \n", + "6257 33 blue-collar married secondary no -241 yes yes \n", + "2776 40 blue-collar married secondary no -31 yes no \n", "\n", " uid \n", - "34307 34308 \n", - "39488 39489 \n", - "15177 15178 \n", - "8390 8391 \n", - "24188 24189 \n", + "29713 29714 \n", + "37441 37442 \n", + "16828 16829 \n", + "20613 20614 \n", + "25582 25583 \n", "... ... \n", - "30242 30243 \n", - "29936 29937 \n", - "13661 13662 \n", - "8968 8969 \n", - "23030 23031 \n", + "42934 42935 \n", + "4141 4142 \n", + "5478 5479 \n", + "6257 6258 \n", + "2776 2777 \n", "\n", "[40690 rows x 9 columns]" ] @@ -779,69 +785,69 @@ " \n", " \n", " \n", - " 21927\n", + " 35491\n", " cellular\n", - " 20\n", - " aug\n", - " 210\n", + " 7\n", + " may\n", + " 600\n", " 2\n", - " -1\n", - " 0\n", - " unknown\n", + " 330\n", + " 17\n", + " failure\n", " no\n", - " 21928\n", + " 35492\n", " \n", " \n", - " 11391\n", + " 371\n", " unknown\n", - " 19\n", - " jun\n", - " 573\n", + " 6\n", + " may\n", + " 195\n", " 1\n", " -1\n", " 0\n", " unknown\n", " no\n", - " 11392\n", + " 372\n", " \n", " \n", - " 26036\n", + " 33382\n", " cellular\n", - " 19\n", - " nov\n", - " 266\n", - " 2\n", + " 20\n", + " apr\n", + " 198\n", + " 1\n", " -1\n", " 0\n", " unknown\n", " no\n", - " 26037\n", + " 33383\n", " \n", " \n", - " 43335\n", + " 26770\n", " cellular\n", - " 22\n", - " mar\n", - " 254\n", - " 2\n", + " 20\n", + " nov\n", + " 207\n", + " 1\n", " -1\n", " 0\n", " unknown\n", - " yes\n", - " 43336\n", + " no\n", + " 26771\n", " \n", " \n", - " 43222\n", + " 33620\n", " cellular\n", - " 5\n", - " mar\n", - " 162\n", - " 1\n", - " 179\n", - " 2\n", - " failure\n", + " 20\n", + " apr\n", + " 20\n", + " 4\n", + " -1\n", + " 0\n", + " unknown\n", " no\n", - " 43223\n", + " 33621\n", " \n", " \n", " ...\n", @@ -857,69 +863,69 @@ " ...\n", " \n", " \n", - " 28562\n", + " 28643\n", " cellular\n", " 29\n", " jan\n", - " 146\n", + " 94\n", " 2\n", " -1\n", " 0\n", " unknown\n", " no\n", - " 28563\n", + " 28644\n", " \n", " \n", - " 37451\n", + " 16449\n", " cellular\n", - " 13\n", - " may\n", - " 777\n", - " 2\n", + " 23\n", + " jul\n", + " 122\n", + " 3\n", " -1\n", " 0\n", " unknown\n", " no\n", - " 37452\n", + " 16450\n", " \n", " \n", - " 10843\n", - " unknown\n", - " 17\n", - " jun\n", - " 491\n", - " 1\n", + " 29287\n", + " cellular\n", + " 2\n", + " feb\n", + " 175\n", + " 2\n", " -1\n", " 0\n", " unknown\n", " no\n", - " 10844\n", + " 29288\n", " \n", " \n", - " 17215\n", - " cellular\n", - " 28\n", - " jul\n", - " 68\n", + " 33126\n", + " telephone\n", + " 20\n", + " apr\n", + " 73\n", " 3\n", - " -1\n", - " 0\n", - " unknown\n", + " 325\n", + " 3\n", + " failure\n", " no\n", - " 17216\n", + " 33127\n", " \n", " \n", - " 26220\n", + " 34736\n", " cellular\n", - " 20\n", - " nov\n", - " 110\n", - " 2\n", + " 6\n", + " may\n", + " 87\n", + " 1\n", " -1\n", " 0\n", " unknown\n", " no\n", - " 26221\n", + " 34737\n", " \n", " \n", "\n", @@ -927,31 +933,31 @@ "" ], "text/plain": [ - " contact day month duration campaign pdays previous poutcome y \\\n", - "21927 cellular 20 aug 210 2 -1 0 unknown no \n", - "11391 unknown 19 jun 573 1 -1 0 unknown no \n", - "26036 cellular 19 nov 266 2 -1 0 unknown no \n", - "43335 cellular 22 mar 254 2 -1 0 unknown yes \n", - "43222 cellular 5 mar 162 1 179 2 failure no \n", - "... ... ... ... ... ... ... ... ... ... \n", - "28562 cellular 29 jan 146 2 -1 0 unknown no \n", - "37451 cellular 13 may 777 2 -1 0 unknown no \n", - "10843 unknown 17 jun 491 1 -1 0 unknown no \n", - "17215 cellular 28 jul 68 3 -1 0 unknown no \n", - "26220 cellular 20 nov 110 2 -1 0 unknown no \n", + " contact day month duration campaign pdays previous poutcome y \\\n", + "35491 cellular 7 may 600 2 330 17 failure no \n", + "371 unknown 6 may 195 1 -1 0 unknown no \n", + "33382 cellular 20 apr 198 1 -1 0 unknown no \n", + "26770 cellular 20 nov 207 1 -1 0 unknown no \n", + "33620 cellular 20 apr 20 4 -1 0 unknown no \n", + "... ... ... ... ... ... ... ... ... .. \n", + "28643 cellular 29 jan 94 2 -1 0 unknown no \n", + "16449 cellular 23 jul 122 3 -1 0 unknown no \n", + "29287 cellular 2 feb 175 2 -1 0 unknown no \n", + "33126 telephone 20 apr 73 3 325 3 failure no \n", + "34736 cellular 6 may 87 1 -1 0 unknown no \n", "\n", " uid \n", - "21927 21928 \n", - "11391 11392 \n", - "26036 26037 \n", - "43335 43336 \n", - "43222 43223 \n", + "35491 35492 \n", + "371 372 \n", + "33382 33383 \n", + "26770 26771 \n", + "33620 33621 \n", "... ... \n", - "28562 28563 \n", - "37451 37452 \n", - "10843 10844 \n", - "17215 17216 \n", - "26220 26221 \n", + "28643 28644 \n", + "16449 16450 \n", + "29287 29288 \n", + "33126 33127 \n", + "34736 34737 \n", "\n", "[40690 rows x 10 columns]" ] @@ -1036,66 +1042,67 @@ "execution_count": 6, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001B[2m\u001B[36m(pid=12271)\u001B[0m Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.\n", - "\u001B[2m\u001B[36m(pid=12278)\u001B[0m Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=alice) pid=12271)\u001B[0m 2023-06-19 19:58:57.878 [info] [bucket_psi.cc:Init:228] bucket size set to 1048576\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=alice) pid=12271)\u001B[0m 2023-06-19 19:58:57.879 [info] [bucket_psi.cc:Run:97] Begin sanity check for input file: /tmp/tmp694nk0jt, precheck_switch:true\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=alice) pid=12271)\u001B[0m 2023-06-19 19:58:57.900 [info] [csv_checker.cc:CsvChecker:121] Executing duplicated scripts: LC_ALL=C sort --buffer-size=1G --temporary-directory=/tmp --stable selected-keys.1687175937879190715 | LC_ALL=C uniq -d > duplicate-keys.1687175937879190715\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=alice) pid=12271)\u001B[0m 2023-06-19 19:58:57.917 [info] [bucket_psi.cc:Run:115] End sanity check for input file: /tmp/tmp694nk0jt, size=40690\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=alice) pid=12271)\u001B[0m 2023-06-19 19:58:57.917 [info] [bucket_psi.cc:RunPsi:419] Run psi protocol=1, self_items_count=40690\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=alice) pid=12271)\u001B[0m 2023-06-19 19:58:57.917 [info] [cryptor_selector.cc:GetSodiumCryptor:45] Using libSodium\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=alice) pid=12271)\u001B[0m 2023-06-19 19:58:57.917 [info] [cipher_store.cc:DiskCipherStore:33] Disk cache choose num_bins=64\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=alice) pid=12271)\u001B[0m 2023-06-19 19:58:57.921 [info] [thread_pool.cc:ThreadPool:30] Create a fixed thread pool with size 63\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=bob) pid=12278)\u001B[0m 2023-06-19 19:58:57.879 [info] [bucket_psi.cc:Init:228] bucket size set to 1048576\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=bob) pid=12278)\u001B[0m 2023-06-19 19:58:57.879 [info] [bucket_psi.cc:Run:97] Begin sanity check for input file: /tmp/tmpuk_ondd_, precheck_switch:true\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=bob) pid=12278)\u001B[0m 2023-06-19 19:58:57.901 [info] [csv_checker.cc:CsvChecker:121] Executing duplicated scripts: LC_ALL=C sort --buffer-size=1G --temporary-directory=/tmp --stable selected-keys.1687175937879550920 | LC_ALL=C uniq -d > duplicate-keys.1687175937879550920\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=bob) pid=12278)\u001B[0m 2023-06-19 19:58:57.917 [info] [bucket_psi.cc:Run:115] End sanity check for input file: /tmp/tmpuk_ondd_, size=40690\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=bob) pid=12278)\u001B[0m 2023-06-19 19:58:57.917 [info] [bucket_psi.cc:RunPsi:419] Run psi protocol=1, self_items_count=40690\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=bob) pid=12278)\u001B[0m 2023-06-19 19:58:57.917 [info] [cryptor_selector.cc:GetSodiumCryptor:45] Using libSodium\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=bob) pid=12278)\u001B[0m 2023-06-19 19:58:57.917 [info] [cipher_store.cc:DiskCipherStore:33] Disk cache choose num_bins=64\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=bob) pid=12278)\u001B[0m 2023-06-19 19:58:57.921 [info] [thread_pool.cc:ThreadPool:30] Create a fixed thread pool with size 63\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=alice) pid=12271)\u001B[0m 2023-06-19 19:58:58.899 [info] [ecdh_psi.cc:MaskSelf:75] MaskSelf:root--finished, batch_count=10, self_item_count=40690\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=alice) pid=12271)\u001B[0m 2023-06-19 19:58:58.908 [info] [ecdh_psi.cc:MaskPeer:119] MaskPeer:root--finished, batch_count=10, peer_item_count=40690\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=bob) pid=12278)\u001B[0m 2023-06-19 19:58:58.864 [info] [ecdh_psi.cc:MaskSelf:75] MaskSelf:root--finished, batch_count=10, self_item_count=40690\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=bob) pid=12278)\u001B[0m 2023-06-19 19:58:58.910 [info] [ecdh_psi.cc:RecvDualMaskedSelf:148] root recv last batch finished, batch_count=10\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=alice) pid=12271)\u001B[0m 2023-06-19 19:58:58.940 [info] [ecdh_psi.cc:RecvDualMaskedSelf:148] root recv last batch finished, batch_count=10\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=alice) pid=12271)\u001B[0m 2023-06-19 19:58:58.977 [info] [bucket_psi.cc:Run:178] Begin post filtering, indices.size=36623, should_sort=true\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=alice) pid=12271)\u001B[0m 2023-06-19 19:58:58.986 [info] [utils.cc:MultiKeySort:88] Executing sort scripts: tail -n +2 /tmp/tmp-sort-in-1687175938978090836 | LC_ALL=C sort --buffer-size=3G --parallel=8 --temporary-directory=./ --stable --field-separator=, --key=9,9 >>/tmp/tmp-sort-out-1687175938978090836\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=alice) pid=12271)\u001B[0m 2023-06-19 19:58:59.015 [info] [utils.cc:MultiKeySort:90] Finished sort scripts: tail -n +2 /tmp/tmp-sort-in-1687175938978090836 | LC_ALL=C sort --buffer-size=3G --parallel=8 --temporary-directory=./ --stable --field-separator=, --key=9,9 >>/tmp/tmp-sort-out-1687175938978090836, ret=0\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=alice) pid=12271)\u001B[0m 2023-06-19 19:58:59.015 [info] [bucket_psi.cc:Run:216] End post filtering, in=/tmp/tmp694nk0jt, out=/tmp/tmpsm2_cbxb\n" + "\u001b[2m\u001b[36m(SPURuntime pid=3812799)\u001b[0m 2023-10-07 18:03:49.999 [info] [default_brpc_retry_policy.cc:DoRetry:52] socket error, sleep=1000000us and retry\n", + "\u001b[2m\u001b[36m(SPURuntime pid=3812799)\u001b[0m 2023-10-07 18:03:50.999 [info] [default_brpc_retry_policy.cc:LogHttpDetail:29] cntl ErrorCode '112', http status code '200', response header '', error msg '[E111]Fail to connect Socket{id=0 addr=127.0.0.1:33409} (0x0x3bbdcc0): Connection refused [R1][E112]Not connected to 127.0.0.1:33409 yet, server_id=0'\n", + "\u001b[2m\u001b[36m(SPURuntime pid=3812799)\u001b[0m 2023-10-07 18:03:50.999 [info] [default_brpc_retry_policy.cc:DoRetry:75] aggressive retry, sleep=1000000us and retry\n", + "\u001b[2m\u001b[36m(SPURuntime pid=3812799)\u001b[0m 2023-10-07 18:03:51.999 [info] [default_brpc_retry_policy.cc:LogHttpDetail:29] cntl ErrorCode '112', http status code '200', response header '', error msg '[E111]Fail to connect Socket{id=0 addr=127.0.0.1:33409} (0x0x3bbdcc0): Connection refused [R1][E112]Not connected to 127.0.0.1:33409 yet, server_id=0 [R2][E112]Not connected to 127.0.0.1:33409 yet, server_id=0'\n", + "\u001b[2m\u001b[36m(SPURuntime pid=3812799)\u001b[0m 2023-10-07 18:03:51.999 [info] [default_brpc_retry_policy.cc:DoRetry:75] aggressive retry, sleep=1000000us and retry\n", + "\u001b[2m\u001b[36m(SPURuntime pid=3812800)\u001b[0m 2023-10-07 18:03:52.009 [info] [default_brpc_retry_policy.cc:DoRetry:69] not retry for reached rcp timeout, ErrorCode '1008', error msg '[E1008]Reached timeout=2000ms @127.0.0.1:41217'\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.012 [info] [bucket_psi.cc:Init:315] bucket size set to 1048576\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.013 [info] [bucket_psi.cc:CheckInput:229] Begin sanity check for input file: /tmp/tmpalym9ko6, precheck_switch:true\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.012 [info] [bucket_psi.cc:Init:315] bucket size set to 1048576\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.012 [info] [bucket_psi.cc:CheckInput:229] Begin sanity check for input file: /tmp/tmpr7dudg52, precheck_switch:true\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.022 [info] [csv_checker.cc:CsvChecker:121] Executing duplicated scripts: LC_ALL=C sort --buffer-size=1G --temporary-directory=/tmp --stable selected-keys.1696673034013188158 | LC_ALL=C uniq -d > duplicate-keys.1696673034013188158\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.023 [info] [csv_checker.cc:CsvChecker:121] Executing duplicated scripts: LC_ALL=C sort --buffer-size=1G --temporary-directory=/tmp --stable selected-keys.1696673034012852841 | LC_ALL=C uniq -d > duplicate-keys.1696673034012852841\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.032 [info] [bucket_psi.cc:CheckInput:246] End sanity check for input file: /tmp/tmpalym9ko6, size=40690\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.032 [info] [bucket_psi.cc:RunPsi:348] Run psi protocol=1, self_items_count=40690\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.032 [info] [cryptor_selector.cc:GetSodiumCryptor:46] Using libSodium\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.032 [info] [cipher_store.cc:DiskCipherStore:33] Disk cache choose num_bins=64\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.036 [info] [thread_pool.cc:ThreadPool:30] Create a fixed thread pool with size 63\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.078 [info] [bucket_psi.cc:operator():385] ECDH progress 10%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.102 [info] [bucket_psi.cc:operator():385] ECDH progress 20%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.122 [info] [bucket_psi.cc:operator():385] ECDH progress 30%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.032 [info] [bucket_psi.cc:CheckInput:246] End sanity check for input file: /tmp/tmpr7dudg52, size=40690\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.033 [info] [bucket_psi.cc:RunPsi:348] Run psi protocol=1, self_items_count=40690\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.033 [info] [cryptor_selector.cc:GetSodiumCryptor:46] Using libSodium\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.033 [info] [cipher_store.cc:DiskCipherStore:33] Disk cache choose num_bins=64\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.036 [info] [thread_pool.cc:ThreadPool:30] Create a fixed thread pool with size 63\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.079 [info] [bucket_psi.cc:operator():385] ECDH progress 10%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.097 [info] [bucket_psi.cc:operator():385] ECDH progress 20%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.115 [info] [bucket_psi.cc:operator():385] ECDH progress 30%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.133 [info] [bucket_psi.cc:operator():385] ECDH progress 40%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.146 [info] [bucket_psi.cc:operator():385] ECDH progress 50%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.171 [info] [bucket_psi.cc:operator():385] ECDH progress 60%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.184 [info] [bucket_psi.cc:operator():385] ECDH progress 70%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.205 [info] [ecdh_psi.cc:MaskSelf:75] MaskSelf:root--finished, batch_count=10, self_item_count=40690\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.209 [info] [bucket_psi.cc:operator():385] ECDH progress 80%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.225 [info] [bucket_psi.cc:operator():385] ECDH progress 90%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.135 [info] [bucket_psi.cc:operator():385] ECDH progress 40%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.159 [info] [bucket_psi.cc:operator():385] ECDH progress 50%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.185 [info] [bucket_psi.cc:operator():385] ECDH progress 60%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.204 [info] [bucket_psi.cc:operator():385] ECDH progress 70%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.217 [info] [bucket_psi.cc:operator():385] ECDH progress 80%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.230 [info] [bucket_psi.cc:operator():385] ECDH progress 90%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.234 [info] [ecdh_psi.cc:MaskPeer:120] MaskPeer:root--finished, batch_count=10, peer_item_count=40690\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.234 [info] [bucket_psi.cc:operator():385] ECDH progress 100%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.234 [info] [ecdh_psi.cc:RecvDualMaskedSelf:149] root recv last batch finished, batch_count=10\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.235 [info] [ecdh_psi.cc:MaskSelf:75] MaskSelf:root--finished, batch_count=10, self_item_count=40690\n" ] }, { "data": { "text/plain": [ - "[{'party': 'alice', 'original_count': 40690, 'intersection_count': 36623},\n", - " {'party': 'bob', 'original_count': 40690, 'intersection_count': 36623}]" + "[{'party': 'alice', 'original_count': 40690, 'intersection_count': 36634},\n", + " {'party': 'bob', 'original_count': 40690, 'intersection_count': 36634}]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=bob) pid=12278)\u001B[0m 2023-06-19 19:58:58.939 [info] [ecdh_psi.cc:MaskPeer:119] MaskPeer:root--finished, batch_count=10, peer_item_count=40690\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=bob) pid=12278)\u001B[0m 2023-06-19 19:58:58.976 [info] [bucket_psi.cc:Run:178] Begin post filtering, indices.size=36623, should_sort=true\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=bob) pid=12278)\u001B[0m 2023-06-19 19:58:58.985 [info] [utils.cc:MultiKeySort:88] Executing sort scripts: tail -n +2 /tmp/tmp-sort-in-1687175938977402931 | LC_ALL=C sort --buffer-size=3G --parallel=8 --temporary-directory=./ --stable --field-separator=, --key=10,10 >>/tmp/tmp-sort-out-1687175938977402931\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=bob) pid=12278)\u001B[0m 2023-06-19 19:58:59.012 [info] [utils.cc:MultiKeySort:90] Finished sort scripts: tail -n +2 /tmp/tmp-sort-in-1687175938977402931 | LC_ALL=C sort --buffer-size=3G --parallel=8 --temporary-directory=./ --stable --field-separator=, --key=10,10 >>/tmp/tmp-sort-out-1687175938977402931, ret=0\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=bob) pid=12278)\u001B[0m 2023-06-19 19:58:59.012 [info] [bucket_psi.cc:Run:216] End post filtering, in=/tmp/tmpuk_ondd_, out=/tmp/tmpktc3mmb9\n" - ] } ], "source": [ @@ -1131,61 +1138,91 @@ "execution_count": 7, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001B[2m\u001B[36m(pid=4578)\u001B[0m Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=alice) pid=12271)\u001B[0m 2023-06-19 19:59:01.035 [info] [bucket_psi.cc:Init:228] bucket size set to 1048576\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=alice) pid=12271)\u001B[0m 2023-06-19 19:59:01.035 [info] [bucket_psi.cc:Run:97] Begin sanity check for input file: /tmp/tmp694nk0jt, precheck_switch:true\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=alice) pid=12271)\u001B[0m 2023-06-19 19:59:01.058 [info] [csv_checker.cc:CsvChecker:121] Executing duplicated scripts: LC_ALL=C sort --buffer-size=1G --temporary-directory=/tmp --stable selected-keys.1687175941035742943 | LC_ALL=C uniq -d > duplicate-keys.1687175941035742943\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=bob) pid=12278)\u001B[0m 2023-06-19 19:59:01.035 [info] [bucket_psi.cc:Init:228] bucket size set to 1048576\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=bob) pid=12278)\u001B[0m 2023-06-19 19:59:01.035 [info] [bucket_psi.cc:Run:97] Begin sanity check for input file: /tmp/tmpuk_ondd_, precheck_switch:true\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=bob) pid=12278)\u001B[0m 2023-06-19 19:59:01.058 [info] [csv_checker.cc:CsvChecker:121] Executing duplicated scripts: LC_ALL=C sort --buffer-size=1G --temporary-directory=/tmp --stable selected-keys.1687175941036362722 | LC_ALL=C uniq -d > duplicate-keys.1687175941036362722\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=alice) pid=12271)\u001B[0m 2023-06-19 19:59:01.086 [info] [bucket_psi.cc:Run:115] End sanity check for input file: /tmp/tmp694nk0jt, size=40690\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=alice) pid=12271)\u001B[0m 2023-06-19 19:59:01.086 [info] [bucket_psi.cc:RunPsi:419] Run psi protocol=1, self_items_count=40690\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=alice) pid=12271)\u001B[0m 2023-06-19 19:59:01.086 [info] [cryptor_selector.cc:GetSodiumCryptor:45] Using libSodium\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=alice) pid=12271)\u001B[0m 2023-06-19 19:59:01.086 [info] [cipher_store.cc:DiskCipherStore:33] Disk cache choose num_bins=64\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=bob) pid=12278)\u001B[0m 2023-06-19 19:59:01.086 [info] [bucket_psi.cc:Run:115] End sanity check for input file: /tmp/tmpuk_ondd_, size=40690\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=bob) pid=12278)\u001B[0m 2023-06-19 19:59:01.086 [info] [bucket_psi.cc:RunPsi:419] Run psi protocol=1, self_items_count=40690\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=bob) pid=12278)\u001B[0m 2023-06-19 19:59:01.086 [info] [cryptor_selector.cc:GetSodiumCryptor:45] Using libSodium\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=bob) pid=12278)\u001B[0m 2023-06-19 19:59:01.086 [info] [cipher_store.cc:DiskCipherStore:33] Disk cache choose num_bins=64\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=alice) pid=12271)\u001B[0m 2023-06-19 19:59:02.371 [info] [ecdh_psi.cc:MaskSelf:75] MaskSelf:root--finished, batch_count=10, self_item_count=40690\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=bob) pid=12278)\u001B[0m 2023-06-19 19:59:02.480 [info] [ecdh_psi.cc:MaskSelf:75] MaskSelf:root--finished, batch_count=10, self_item_count=40690\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=alice) pid=12271)\u001B[0m 2023-06-19 19:59:02.556 [info] [ecdh_psi.cc:MaskPeer:119] MaskPeer:root--finished, batch_count=10, peer_item_count=40690\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=alice) pid=12271)\u001B[0m 2023-06-19 19:59:02.565 [info] [ecdh_psi.cc:RecvDualMaskedSelf:148] root recv last batch finished, batch_count=10\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=alice) pid=12271)\u001B[0m 2023-06-19 19:59:02.602 [info] [bucket_psi.cc:Run:178] Begin post filtering, indices.size=36623, should_sort=true\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=alice) pid=12271)\u001B[0m 2023-06-19 19:59:02.610 [info] [utils.cc:MultiKeySort:88] Executing sort scripts: tail -n +2 /tmp/tmp-sort-in-1687175942602523600 | LC_ALL=C sort --buffer-size=3G --parallel=8 --temporary-directory=./ --stable --field-separator=, --key=9,9 >>/tmp/tmp-sort-out-1687175942602523600\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=alice) pid=12271)\u001B[0m 2023-06-19 19:59:02.639 [info] [utils.cc:MultiKeySort:90] Finished sort scripts: tail -n +2 /tmp/tmp-sort-in-1687175942602523600 | LC_ALL=C sort --buffer-size=3G --parallel=8 --temporary-directory=./ --stable --field-separator=, --key=9,9 >>/tmp/tmp-sort-out-1687175942602523600, ret=0\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=alice) pid=12271)\u001B[0m 2023-06-19 19:59:02.640 [info] [bucket_psi.cc:Run:216] End post filtering, in=/tmp/tmp694nk0jt, out=/tmp/tmp694nk0jt.psi_output_13006\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=bob) pid=12278)\u001B[0m 2023-06-19 19:59:02.557 [info] [ecdh_psi.cc:RecvDualMaskedSelf:148] root recv last batch finished, batch_count=10\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=bob) pid=12278)\u001B[0m 2023-06-19 19:59:02.563 [info] [ecdh_psi.cc:MaskPeer:119] MaskPeer:root--finished, batch_count=10, peer_item_count=40690\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=bob) pid=12278)\u001B[0m 2023-06-19 19:59:02.600 [info] [bucket_psi.cc:Run:178] Begin post filtering, indices.size=36623, should_sort=true\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=bob) pid=12278)\u001B[0m 2023-06-19 19:59:02.608 [info] [utils.cc:MultiKeySort:88] Executing sort scripts: tail -n +2 /tmp/tmp-sort-in-1687175942600777595 | LC_ALL=C sort --buffer-size=3G --parallel=8 --temporary-directory=./ --stable --field-separator=, --key=10,10 >>/tmp/tmp-sort-out-1687175942600777595\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=bob) pid=12278)\u001B[0m 2023-06-19 19:59:02.637 [info] [utils.cc:MultiKeySort:90] Finished sort scripts: tail -n +2 /tmp/tmp-sort-in-1687175942600777595 | LC_ALL=C sort --buffer-size=3G --parallel=8 --temporary-directory=./ --stable --field-separator=, --key=10,10 >>/tmp/tmp-sort-out-1687175942600777595, ret=0\n", - "\u001B[2m\u001B[36m(SPURuntime(device_id=None, party=bob) pid=12278)\u001B[0m 2023-06-19 19:59:02.637 [info] [bucket_psi.cc:Run:216] End post filtering, in=/tmp/tmpuk_ondd_, out=/tmp/tmpuk_ondd_.psi_output_13006\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001B[2m\u001B[36m(pid=4620)\u001B[0m Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.\n" + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.242 [info] [ecdh_psi.cc:MaskPeer:120] MaskPeer:root--finished, batch_count=10, peer_item_count=40690\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.257 [info] [bucket_psi.cc:ProduceOutput:267] Begin post filtering, indices.size=36634, should_sort=true\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.260 [info] [utils.cc:MultiKeySort:88] Executing sort scripts: tail -n +2 /tmp/tmp-sort-in-1696673034257569380 | LC_ALL=C sort --buffer-size=3G --parallel=8 --temporary-directory=./ --stable --field-separator=, --key=9,9 >>/tmp/tmp-sort-out-1696673034257569380\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.275 [info] [utils.cc:MultiKeySort:90] Finished sort scripts: tail -n +2 /tmp/tmp-sort-in-1696673034257569380 | LC_ALL=C sort --buffer-size=3G --parallel=8 --temporary-directory=./ --stable --field-separator=, --key=9,9 >>/tmp/tmp-sort-out-1696673034257569380, ret=0\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.276 [info] [bucket_psi.cc:ProduceOutput:305] End post filtering, in=/tmp/tmpalym9ko6, out=/tmp/tmplvme8bi7\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.243 [info] [bucket_psi.cc:operator():385] ECDH progress 100%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.243 [info] [ecdh_psi.cc:RecvDualMaskedSelf:149] root recv last batch finished, batch_count=10\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.258 [info] [bucket_psi.cc:ProduceOutput:267] Begin post filtering, indices.size=36634, should_sort=true\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.261 [info] [utils.cc:MultiKeySort:88] Executing sort scripts: tail -n +2 /tmp/tmp-sort-in-1696673034258490369 | LC_ALL=C sort --buffer-size=3G --parallel=8 --temporary-directory=./ --stable --field-separator=, --key=10,10 >>/tmp/tmp-sort-out-1696673034258490369\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.275 [info] [utils.cc:MultiKeySort:90] Finished sort scripts: tail -n +2 /tmp/tmp-sort-in-1696673034258490369 | LC_ALL=C sort --buffer-size=3G --parallel=8 --temporary-directory=./ --stable --field-separator=, --key=10,10 >>/tmp/tmp-sort-out-1696673034258490369, ret=0\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.275 [info] [bucket_psi.cc:ProduceOutput:305] End post filtering, in=/tmp/tmpr7dudg52, out=/tmp/tmpagzpomeb\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.819 [info] [bucket_psi.cc:Init:315] bucket size set to 1048576\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.819 [info] [bucket_psi.cc:CheckInput:229] Begin sanity check for input file: /tmp/tmpalym9ko6, precheck_switch:true\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.829 [info] [csv_checker.cc:CsvChecker:121] Executing duplicated scripts: LC_ALL=C sort --buffer-size=1G --temporary-directory=/tmp --stable selected-keys.1696673034819762103 | LC_ALL=C uniq -d > duplicate-keys.1696673034819762103\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.840 [info] [bucket_psi.cc:CheckInput:246] End sanity check for input file: /tmp/tmpalym9ko6, size=40690\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.840 [info] [bucket_psi.cc:RunPsi:348] Run psi protocol=1, self_items_count=40690\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.840 [info] [cryptor_selector.cc:GetSodiumCryptor:46] Using libSodium\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.840 [info] [cipher_store.cc:DiskCipherStore:33] Disk cache choose num_bins=64\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.819 [info] [bucket_psi.cc:Init:315] bucket size set to 1048576\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.819 [info] [bucket_psi.cc:CheckInput:229] Begin sanity check for input file: /tmp/tmpr7dudg52, precheck_switch:true\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.830 [info] [csv_checker.cc:CsvChecker:121] Executing duplicated scripts: LC_ALL=C sort --buffer-size=1G --temporary-directory=/tmp --stable selected-keys.1696673034819646273 | LC_ALL=C uniq -d > duplicate-keys.1696673034819646273\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.840 [info] [bucket_psi.cc:CheckInput:246] End sanity check for input file: /tmp/tmpr7dudg52, size=40690\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.841 [info] [bucket_psi.cc:RunPsi:348] Run psi protocol=1, self_items_count=40690\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.841 [info] [cryptor_selector.cc:GetSodiumCryptor:46] Using libSodium\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.841 [info] [cipher_store.cc:DiskCipherStore:33] Disk cache choose num_bins=64\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.871 [info] [bucket_psi.cc:operator():385] ECDH progress 10%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.891 [info] [bucket_psi.cc:operator():385] ECDH progress 20%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.915 [info] [bucket_psi.cc:operator():385] ECDH progress 30%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.934 [info] [bucket_psi.cc:operator():385] ECDH progress 40%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.955 [info] [bucket_psi.cc:operator():385] ECDH progress 50%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.884 [info] [bucket_psi.cc:operator():385] ECDH progress 10%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.905 [info] [bucket_psi.cc:operator():385] ECDH progress 20%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.921 [info] [bucket_psi.cc:operator():385] ECDH progress 30%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.942 [info] [bucket_psi.cc:operator():385] ECDH progress 40%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.957 [info] [bucket_psi.cc:operator():385] ECDH progress 50%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:54.992 [info] [bucket_psi.cc:operator():385] ECDH progress 60%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:55.003 [info] [bucket_psi.cc:operator():385] ECDH progress 70%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:55.017 [info] [bucket_psi.cc:operator():385] ECDH progress 80%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:55.029 [info] [bucket_psi.cc:operator():385] ECDH progress 90%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:55.031 [info] [ecdh_psi.cc:MaskSelf:75] MaskSelf:root--finished, batch_count=10, self_item_count=40690\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:55.042 [info] [bucket_psi.cc:operator():385] ECDH progress 100%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:55.042 [info] [ecdh_psi.cc:RecvDualMaskedSelf:149] root recv last batch finished, batch_count=10\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:55.043 [info] [ecdh_psi.cc:MaskPeer:120] MaskPeer:root--finished, batch_count=10, peer_item_count=40690\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:55.058 [info] [bucket_psi.cc:ProduceOutput:267] Begin post filtering, indices.size=36634, should_sort=true\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:55.061 [info] [utils.cc:MultiKeySort:88] Executing sort scripts: tail -n +2 /tmp/tmp-sort-in-1696673035058333794 | LC_ALL=C sort --buffer-size=3G --parallel=8 --temporary-directory=./ --stable --field-separator=, --key=9,9 >>/tmp/tmp-sort-out-1696673035058333794\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:54.990 [info] [bucket_psi.cc:operator():385] ECDH progress 60%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:55.007 [info] [bucket_psi.cc:operator():385] ECDH progress 70%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:55.022 [info] [bucket_psi.cc:operator():385] ECDH progress 80%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:55.023 [info] [ecdh_psi.cc:MaskSelf:75] MaskSelf:root--finished, batch_count=10, self_item_count=40690\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:55.038 [info] [bucket_psi.cc:operator():385] ECDH progress 90%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:55.041 [info] [ecdh_psi.cc:MaskPeer:120] MaskPeer:root--finished, batch_count=10, peer_item_count=40690\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:55.044 [info] [bucket_psi.cc:operator():385] ECDH progress 100%\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:55.044 [info] [ecdh_psi.cc:RecvDualMaskedSelf:149] root recv last batch finished, batch_count=10\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:55.059 [info] [bucket_psi.cc:ProduceOutput:267] Begin post filtering, indices.size=36634, should_sort=true\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:55.062 [info] [utils.cc:MultiKeySort:88] Executing sort scripts: tail -n +2 /tmp/tmp-sort-in-1696673035059340444 | LC_ALL=C sort --buffer-size=3G --parallel=8 --temporary-directory=./ --stable --field-separator=, --key=10,10 >>/tmp/tmp-sort-out-1696673035059340444\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:55.076 [info] [utils.cc:MultiKeySort:90] Finished sort scripts: tail -n +2 /tmp/tmp-sort-in-1696673035058333794 | LC_ALL=C sort --buffer-size=3G --parallel=8 --temporary-directory=./ --stable --field-separator=, --key=9,9 >>/tmp/tmp-sort-out-1696673035058333794, ret=0\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3812799)\u001b[0m 2023-10-07 18:03:55.076 [info] [bucket_psi.cc:ProduceOutput:305] End post filtering, in=/tmp/tmpalym9ko6, out=/tmp/tmpalym9ko6.psi_output_97135\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:55.076 [info] [utils.cc:MultiKeySort:90] Finished sort scripts: tail -n +2 /tmp/tmp-sort-in-1696673035059340444 | LC_ALL=C sort --buffer-size=3G --parallel=8 --temporary-directory=./ --stable --field-separator=, --key=10,10 >>/tmp/tmp-sort-out-1696673035059340444, ret=0\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3812800)\u001b[0m 2023-10-07 18:03:55.076 [info] [bucket_psi.cc:ProduceOutput:305] End post filtering, in=/tmp/tmpr7dudg52, out=/tmp/tmpr7dudg52.psi_output_97135\n" ] }, { "data": { "text/plain": [ - "Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',\n", - " 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',\n", - " 'previous', 'poutcome', 'y'],\n", - " dtype='object')" + "['age',\n", + " 'job',\n", + " 'marital',\n", + " 'education',\n", + " 'default',\n", + " 'balance',\n", + " 'housing',\n", + " 'loan',\n", + " 'contact',\n", + " 'day',\n", + " 'month',\n", + " 'duration',\n", + " 'campaign',\n", + " 'pdays',\n", + " 'previous',\n", + " 'poutcome',\n", + " 'y']" ] }, "execution_count": 7, @@ -1246,19 +1283,18 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001B[2m\u001B[36m(_run pid=4620)\u001B[0m /home/t4/fengjun.feng/repos/secretflow/secretflow/secretflow/device/device/pyu.py:154: FutureWarning: The default value of numeric_only in DataFrame.quantile is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.\n", - "\u001B[2m\u001B[36m(_run pid=4620)\u001B[0m return fn(*args, **kwargs)\n", - "\u001B[2m\u001B[36m(_run pid=4620)\u001B[0m /home/t4/fengjun.feng/repos/secretflow/secretflow/secretflow/device/device/pyu.py:154: FutureWarning: The default value of numeric_only in DataFrame.quantile is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.\n", - "\u001B[2m\u001B[36m(_run pid=4620)\u001B[0m return fn(*args, **kwargs)\n", - "\u001B[2m\u001B[36m(_run pid=4578)\u001B[0m /home/t4/fengjun.feng/repos/secretflow/secretflow/secretflow/device/device/pyu.py:154: FutureWarning: The default value of numeric_only in DataFrame.quantile is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.\n", - "\u001B[2m\u001B[36m(_run pid=4578)\u001B[0m return fn(*args, **kwargs)\n", - "\u001B[2m\u001B[36m(_run pid=4578)\u001B[0m /home/t4/fengjun.feng/repos/secretflow/secretflow/secretflow/device/device/pyu.py:154: FutureWarning: The default value of numeric_only in DataFrame.quantile is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.\n", - "\u001B[2m\u001B[36m(_run pid=4578)\u001B[0m return fn(*args, **kwargs)\n", - "\u001B[2m\u001B[36m(_run pid=4620)\u001B[0m /home/t4/fengjun.feng/repos/secretflow/secretflow/secretflow/device/device/pyu.py:154: FutureWarning: The default value of numeric_only in DataFrame.quantile is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.\n", - "\u001B[2m\u001B[36m(_run pid=4620)\u001B[0m return fn(*args, **kwargs)\n", - "\u001B[2m\u001B[36m(_run pid=4620)\u001B[0m /home/t4/fengjun.feng/repos/secretflow/secretflow/secretflow/device/device/pyu.py:154: FutureWarning: The default value of numeric_only in DataFrame.quantile is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.\n", - "\u001B[2m\u001B[36m(_run pid=4620)\u001B[0m return fn(*args, **kwargs)\n", - "\u001B[2m\u001B[36m(pid=4609)\u001B[0m Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.\n" + "\u001b[2m\u001b[36m(_run pid=3805540)\u001b[0m /home/zoupeicheng.zpc/miniconda3/envs/py38/lib/python3.8/site-packages/secretflow/device/device/pyu.py:154: FutureWarning: The default value of numeric_only in DataFrame.quantile is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.\n", + "\u001b[2m\u001b[36m(_run pid=3805540)\u001b[0m return fn(*args, **kwargs)\n", + "\u001b[2m\u001b[36m(_run pid=3805540)\u001b[0m /home/zoupeicheng.zpc/miniconda3/envs/py38/lib/python3.8/site-packages/secretflow/device/device/pyu.py:154: FutureWarning: The default value of numeric_only in DataFrame.quantile is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.\n", + "\u001b[2m\u001b[36m(_run pid=3805540)\u001b[0m return fn(*args, **kwargs)\n", + "\u001b[2m\u001b[36m(_run pid=3805590)\u001b[0m /home/zoupeicheng.zpc/miniconda3/envs/py38/lib/python3.8/site-packages/secretflow/device/device/pyu.py:154: FutureWarning: The default value of numeric_only in DataFrame.quantile is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.\n", + "\u001b[2m\u001b[36m(_run pid=3805590)\u001b[0m return fn(*args, **kwargs)\n", + "\u001b[2m\u001b[36m(_run pid=3805590)\u001b[0m /home/zoupeicheng.zpc/miniconda3/envs/py38/lib/python3.8/site-packages/secretflow/device/device/pyu.py:154: FutureWarning: The default value of numeric_only in DataFrame.quantile is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.\n", + "\u001b[2m\u001b[36m(_run pid=3805590)\u001b[0m return fn(*args, **kwargs)\n", + "\u001b[2m\u001b[36m(_run pid=3805590)\u001b[0m /home/zoupeicheng.zpc/miniconda3/envs/py38/lib/python3.8/site-packages/secretflow/device/device/pyu.py:154: FutureWarning: The default value of numeric_only in DataFrame.quantile is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.\n", + "\u001b[2m\u001b[36m(_run pid=3805590)\u001b[0m return fn(*args, **kwargs)\n", + "\u001b[2m\u001b[36m(_run pid=3805590)\u001b[0m /home/zoupeicheng.zpc/miniconda3/envs/py38/lib/python3.8/site-packages/secretflow/device/device/pyu.py:154: FutureWarning: The default value of numeric_only in DataFrame.quantile is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.\n", + "\u001b[2m\u001b[36m(_run pid=3805590)\u001b[0m return fn(*args, **kwargs)\n" ] }, { @@ -1309,32 +1345,32 @@ " \n", " age\n", " int64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", " 18.0\n", " 95.0\n", - " 40.973842\n", - " 1.132227e+02\n", - " 10.640615\n", - " ...\n", - " 1.792075e+03\n", - " 8.352986e+04\n", - " 4.136732e+06\n", - " 1.132196e+02\n", - " 8.235717e+02\n", - " 4.271965e+04\n", - " 1500585.0\n", - " 6.563117e+07\n", - " 3.059114e+09\n", - " 1.514995e+11\n", + " 40.938636\n", + " 1.125268e+02\n", + " 10.607867\n", + " ...\n", + " 1.788496e+03\n", + " 8.324620e+04\n", + " 4.115919e+06\n", + " 1.125238e+02\n", + " 8.144866e+02\n", + " 4.214078e+04\n", + " 1499746.0\n", + " 6.551975e+07\n", + " 3.049641e+09\n", + " 1.507826e+11\n", " \n", " \n", " job\n", " object\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", " NaN\n", @@ -1357,8 +1393,8 @@ " \n", " marital\n", " object\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", " NaN\n", @@ -1381,8 +1417,8 @@ " \n", " education\n", " object\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", " NaN\n", @@ -1405,8 +1441,8 @@ " \n", " default\n", " object\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", " NaN\n", @@ -1429,32 +1465,32 @@ " \n", " balance\n", " int64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -8019.0\n", - " 102127.0\n", - " 1364.172569\n", - " 9.389804e+06\n", - " 3064.278642\n", - " ...\n", - " 1.125051e+07\n", - " 2.846731e+11\n", - " 4.381241e+15\n", - " 9.389547e+06\n", - " 2.437076e+11\n", - " 1.301696e+16\n", - " 49960092.0\n", - " 4.120276e+11\n", - " 1.042558e+16\n", - " -5.566494e+18\n", + " -3372.0\n", + " 98417.0\n", + " 1366.720424\n", + " 9.148037e+06\n", + " 3024.572264\n", + " ...\n", + " 1.101571e+07\n", + " 2.605110e+11\n", + " 4.548695e+15\n", + " 9.147788e+06\n", + " 2.204507e+11\n", + " 1.079063e+16\n", + " 50068436.0\n", + " 4.035496e+11\n", + " 9.543560e+15\n", + " 6.161953e+17\n", " \n", " \n", " housing\n", " object\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", " NaN\n", @@ -1477,8 +1513,8 @@ " \n", " loan\n", " object\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", " NaN\n", @@ -1501,8 +1537,8 @@ " \n", " contact\n", " object\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", " NaN\n", @@ -1525,32 +1561,32 @@ " \n", " day\n", " int64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", " 1.0\n", " 31.0\n", - " 15.802528\n", - " 6.931243e+01\n", - " 8.325409\n", - " ...\n", - " 3.190304e+02\n", - " 7.285526e+03\n", - " 1.789105e+05\n", - " 6.931054e+01\n", - " 5.347452e+01\n", - " 9.321046e+03\n", - " 578736.0\n", - " 1.168385e+07\n", - " 2.668178e+08\n", - " 6.552241e+09\n", + " 15.798930\n", + " 6.932877e+01\n", + " 8.326390\n", + " ...\n", + " 3.189331e+02\n", + " 7.282283e+03\n", + " 1.787904e+05\n", + " 6.932688e+01\n", + " 5.290034e+01\n", + " 9.317523e+03\n", + " 578778.0\n", + " 1.168379e+07\n", + " 2.667791e+08\n", + " 6.549806e+09\n", " \n", " \n", " month\n", " object\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", " NaN\n", @@ -1573,104 +1609,104 @@ " \n", " duration\n", " int64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", " 0.0\n", - " 3422.0\n", - " 258.212681\n", - " 6.502099e+04\n", - " 254.992134\n", - " ...\n", - " 1.316930e+05\n", - " 1.151632e+08\n", - " 1.482931e+11\n", - " 6.501921e+04\n", - " 4.758080e+07\n", - " 6.869341e+10\n", - " 9456523.0\n", - " 4.822993e+09\n", - " 4.217621e+12\n", - " 5.430939e+15\n", + " 4918.0\n", + " 257.611563\n", + " 6.625854e+04\n", + " 257.407347\n", + " ...\n", + " 1.326205e+05\n", + " 1.224237e+08\n", + " 1.824202e+11\n", + " 6.625673e+04\n", + " 5.412212e+07\n", + " 9.586385e+10\n", + " 9437342.0\n", + " 4.858418e+09\n", + " 4.484869e+12\n", + " 6.682781e+15\n", " \n", " \n", " campaign\n", " int64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", " 1.0\n", " 63.0\n", - " 2.744778\n", - " 9.337596e+00\n", - " 3.055748\n", - " ...\n", - " 1.687115e+01\n", - " 2.382004e+02\n", - " 5.782482e+03\n", - " 9.337341e+00\n", - " 1.406350e+02\n", - " 3.759602e+03\n", - " 100522.0\n", - " 6.178720e+05\n", - " 8.723614e+06\n", - " 2.117718e+08\n", + " 2.762379\n", + " 9.535351e+00\n", + " 3.087936\n", + " ...\n", + " 1.716583e+01\n", + " 2.437880e+02\n", + " 5.893892e+03\n", + " 9.535091e+00\n", + " 1.436904e+02\n", + " 3.811396e+03\n", + " 101197.0\n", + " 6.288530e+05\n", + " 8.930929e+06\n", + " 2.159168e+08\n", " \n", " \n", " pdays\n", " int64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", " -1.0\n", " 871.0\n", - " 40.501816\n", - " 1.005977e+04\n", - " 100.298386\n", - " ...\n", - " 1.169989e+04\n", - " 3.916406e+06\n", - " 1.531144e+09\n", - " 1.005949e+04\n", - " 2.627684e+06\n", - " 1.003740e+09\n", - " 1483298.0\n", - " 4.284850e+08\n", - " 1.434305e+11\n", - " 5.607510e+13\n", + " 40.435634\n", + " 1.010338e+04\n", + " 100.515581\n", + " ...\n", + " 1.173815e+04\n", + " 3.951940e+06\n", + " 1.554830e+09\n", + " 1.010311e+04\n", + " 2.660249e+06\n", + " 1.022767e+09\n", + " 1481319.0\n", + " 4.300153e+08\n", + " 1.447754e+11\n", + " 5.695965e+13\n", " \n", " \n", " previous\n", " int64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", " 0.0\n", " 275.0\n", - " 0.589302\n", - " 5.796070e+00\n", - " 2.407503\n", - " ...\n", - " 6.143189e+00\n", - " 6.326302e+02\n", - " 1.579898e+05\n", - " 5.795912e+00\n", - " 6.221789e+02\n", - " 1.565110e+05\n", - " 21582.0\n", - " 2.249820e+05\n", - " 2.316881e+07\n", - " 5.786061e+09\n", + " 0.590189\n", + " 5.856919e+00\n", + " 2.420107\n", + " ...\n", + " 6.205083e+00\n", + " 6.336154e+02\n", + " 1.579490e+05\n", + " 5.856759e+00\n", + " 6.230400e+02\n", + " 1.564658e+05\n", + " 21621.0\n", + " 2.273170e+05\n", + " 2.321187e+07\n", + " 5.786303e+09\n", " \n", " \n", " poutcome\n", " object\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", " NaN\n", @@ -1693,8 +1729,8 @@ " \n", " y\n", " object\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", " NaN\n", @@ -1721,97 +1757,97 @@ ], "text/plain": [ " datatype total_count count(non-NA count) count_na(NA count) \\\n", - "age int64 36623 36623 0 \n", - "job object 36623 36623 0 \n", - "marital object 36623 36623 0 \n", - "education object 36623 36623 0 \n", - "default object 36623 36623 0 \n", - "balance int64 36623 36623 0 \n", - "housing object 36623 36623 0 \n", - "loan object 36623 36623 0 \n", - "contact object 36623 36623 0 \n", - "day int64 36623 36623 0 \n", - "month object 36623 36623 0 \n", - "duration int64 36623 36623 0 \n", - "campaign int64 36623 36623 0 \n", - "pdays int64 36623 36623 0 \n", - "previous int64 36623 36623 0 \n", - "poutcome object 36623 36623 0 \n", - "y object 36623 36623 0 \n", + "age int64 36634 36634 0 \n", + "job object 36634 36634 0 \n", + "marital object 36634 36634 0 \n", + "education object 36634 36634 0 \n", + "default object 36634 36634 0 \n", + "balance int64 36634 36634 0 \n", + "housing object 36634 36634 0 \n", + "loan object 36634 36634 0 \n", + "contact object 36634 36634 0 \n", + "day int64 36634 36634 0 \n", + "month object 36634 36634 0 \n", + "duration int64 36634 36634 0 \n", + "campaign int64 36634 36634 0 \n", + "pdays int64 36634 36634 0 \n", + "previous int64 36634 36634 0 \n", + "poutcome object 36634 36634 0 \n", + "y object 36634 36634 0 \n", "\n", - " na_ratio min max mean var(variance) \\\n", - "age 0.0 18.0 95.0 40.973842 1.132227e+02 \n", - "job 0.0 NaN NaN NaN NaN \n", - "marital 0.0 NaN NaN NaN NaN \n", - "education 0.0 NaN NaN NaN NaN \n", - "default 0.0 NaN NaN NaN NaN \n", - "balance 0.0 -8019.0 102127.0 1364.172569 9.389804e+06 \n", - "housing 0.0 NaN NaN NaN NaN \n", - "loan 0.0 NaN NaN NaN NaN \n", - "contact 0.0 NaN NaN NaN NaN \n", - "day 0.0 1.0 31.0 15.802528 6.931243e+01 \n", - "month 0.0 NaN NaN NaN NaN \n", - "duration 0.0 0.0 3422.0 258.212681 6.502099e+04 \n", - "campaign 0.0 1.0 63.0 2.744778 9.337596e+00 \n", - "pdays 0.0 -1.0 871.0 40.501816 1.005977e+04 \n", - "previous 0.0 0.0 275.0 0.589302 5.796070e+00 \n", - "poutcome 0.0 NaN NaN NaN NaN \n", - "y 0.0 NaN NaN NaN NaN \n", + " na_ratio min max mean var(variance) \\\n", + "age 0.0 18.0 95.0 40.938636 1.125268e+02 \n", + "job 0.0 NaN NaN NaN NaN \n", + "marital 0.0 NaN NaN NaN NaN \n", + "education 0.0 NaN NaN NaN NaN \n", + "default 0.0 NaN NaN NaN NaN \n", + "balance 0.0 -3372.0 98417.0 1366.720424 9.148037e+06 \n", + "housing 0.0 NaN NaN NaN NaN \n", + "loan 0.0 NaN NaN NaN NaN \n", + "contact 0.0 NaN NaN NaN NaN \n", + "day 0.0 1.0 31.0 15.798930 6.932877e+01 \n", + "month 0.0 NaN NaN NaN NaN \n", + "duration 0.0 0.0 4918.0 257.611563 6.625854e+04 \n", + "campaign 0.0 1.0 63.0 2.762379 9.535351e+00 \n", + "pdays 0.0 -1.0 871.0 40.435634 1.010338e+04 \n", + "previous 0.0 0.0 275.0 0.590189 5.856919e+00 \n", + "poutcome 0.0 NaN NaN NaN NaN \n", + "y 0.0 NaN NaN NaN NaN \n", "\n", " std(standard deviation) ... moment_2 moment_3 \\\n", - "age 10.640615 ... 1.792075e+03 8.352986e+04 \n", + "age 10.607867 ... 1.788496e+03 8.324620e+04 \n", "job NaN ... NaN NaN \n", "marital NaN ... NaN NaN \n", "education NaN ... NaN NaN \n", "default NaN ... NaN NaN \n", - "balance 3064.278642 ... 1.125051e+07 2.846731e+11 \n", + "balance 3024.572264 ... 1.101571e+07 2.605110e+11 \n", "housing NaN ... NaN NaN \n", "loan NaN ... NaN NaN \n", "contact NaN ... NaN NaN \n", - "day 8.325409 ... 3.190304e+02 7.285526e+03 \n", + "day 8.326390 ... 3.189331e+02 7.282283e+03 \n", "month NaN ... NaN NaN \n", - "duration 254.992134 ... 1.316930e+05 1.151632e+08 \n", - "campaign 3.055748 ... 1.687115e+01 2.382004e+02 \n", - "pdays 100.298386 ... 1.169989e+04 3.916406e+06 \n", - "previous 2.407503 ... 6.143189e+00 6.326302e+02 \n", + "duration 257.407347 ... 1.326205e+05 1.224237e+08 \n", + "campaign 3.087936 ... 1.716583e+01 2.437880e+02 \n", + "pdays 100.515581 ... 1.173815e+04 3.951940e+06 \n", + "previous 2.420107 ... 6.205083e+00 6.336154e+02 \n", "poutcome NaN ... NaN NaN \n", "y NaN ... NaN NaN \n", "\n", " moment_4 central_moment_2 central_moment_3 central_moment_4 \\\n", - "age 4.136732e+06 1.132196e+02 8.235717e+02 4.271965e+04 \n", + "age 4.115919e+06 1.125238e+02 8.144866e+02 4.214078e+04 \n", "job NaN NaN NaN NaN \n", "marital NaN NaN NaN NaN \n", "education NaN NaN NaN NaN \n", "default NaN NaN NaN NaN \n", - "balance 4.381241e+15 9.389547e+06 2.437076e+11 1.301696e+16 \n", + "balance 4.548695e+15 9.147788e+06 2.204507e+11 1.079063e+16 \n", "housing NaN NaN NaN NaN \n", "loan NaN NaN NaN NaN \n", "contact NaN NaN NaN NaN \n", - "day 1.789105e+05 6.931054e+01 5.347452e+01 9.321046e+03 \n", + "day 1.787904e+05 6.932688e+01 5.290034e+01 9.317523e+03 \n", "month NaN NaN NaN NaN \n", - "duration 1.482931e+11 6.501921e+04 4.758080e+07 6.869341e+10 \n", - "campaign 5.782482e+03 9.337341e+00 1.406350e+02 3.759602e+03 \n", - "pdays 1.531144e+09 1.005949e+04 2.627684e+06 1.003740e+09 \n", - "previous 1.579898e+05 5.795912e+00 6.221789e+02 1.565110e+05 \n", + "duration 1.824202e+11 6.625673e+04 5.412212e+07 9.586385e+10 \n", + "campaign 5.893892e+03 9.535091e+00 1.436904e+02 3.811396e+03 \n", + "pdays 1.554830e+09 1.010311e+04 2.660249e+06 1.022767e+09 \n", + "previous 1.579490e+05 5.856759e+00 6.230400e+02 1.564658e+05 \n", "poutcome NaN NaN NaN NaN \n", "y NaN NaN NaN NaN \n", "\n", " sum sum_2 sum_3 sum_4 \n", - "age 1500585.0 6.563117e+07 3.059114e+09 1.514995e+11 \n", + "age 1499746.0 6.551975e+07 3.049641e+09 1.507826e+11 \n", "job NaN NaN NaN NaN \n", "marital NaN NaN NaN NaN \n", "education NaN NaN NaN NaN \n", "default NaN NaN NaN NaN \n", - "balance 49960092.0 4.120276e+11 1.042558e+16 -5.566494e+18 \n", + "balance 50068436.0 4.035496e+11 9.543560e+15 6.161953e+17 \n", "housing NaN NaN NaN NaN \n", "loan NaN NaN NaN NaN \n", "contact NaN NaN NaN NaN \n", - "day 578736.0 1.168385e+07 2.668178e+08 6.552241e+09 \n", + "day 578778.0 1.168379e+07 2.667791e+08 6.549806e+09 \n", "month NaN NaN NaN NaN \n", - "duration 9456523.0 4.822993e+09 4.217621e+12 5.430939e+15 \n", - "campaign 100522.0 6.178720e+05 8.723614e+06 2.117718e+08 \n", - "pdays 1483298.0 4.284850e+08 1.434305e+11 5.607510e+13 \n", - "previous 21582.0 2.249820e+05 2.316881e+07 5.786061e+09 \n", + "duration 9437342.0 4.858418e+09 4.484869e+12 6.682781e+15 \n", + "campaign 101197.0 6.288530e+05 8.930929e+06 2.159168e+08 \n", + "pdays 1481319.0 4.300153e+08 1.447754e+11 5.695965e+13 \n", + "previous 21621.0 2.273170e+05 2.321187e+07 5.786303e+09 \n", "poutcome NaN NaN NaN NaN \n", "y NaN NaN NaN NaN \n", "\n", @@ -1894,32 +1930,32 @@ "text": [ " age job marital education default balance housing loan\n", "0 58 management married 3.0 0 2143 1 0\n", - "1 43 technician single 2.0 0 593 1 0\n", + "1 46 management married 3.0 0 229 1 0\n", "2 33 technician single 2.0 0 56 0 0\n", "3 42 technician married 2.0 0 8036 0 0\n", "4 38 admin. married 1.0 0 1487 0 0\n", "... ... ... ... ... ... ... ... ...\n", - "36618 46 technician married 2.0 0 399 1 0\n", - "36619 36 self-employed single 3.0 0 4844 0 0\n", - "36620 49 technician married 2.0 0 378 0 0\n", - "36621 49 housemaid married 1.0 0 3376 0 0\n", - "36622 40 blue-collar married 1.0 0 48 0 0\n", + "36629 38 blue-collar married 1.0 0 3289 0 0\n", + "36630 36 self-employed single 3.0 0 4844 0 0\n", + "36631 49 technician married 2.0 0 378 0 0\n", + "36632 40 blue-collar married 1.0 0 48 0 0\n", + "36633 46 services married 3.0 0 474 0 0\n", "\n", - "[36623 rows x 8 columns]\n", + "[36634 rows x 8 columns]\n", " contact day month duration campaign pdays previous poutcome y\n", "0 unknown 5 5 261 1 -1 0 unknown 0\n", - "1 unknown 5 5 55 1 -1 0 unknown 0\n", + "1 unknown 5 5 197 1 -1 0 unknown 0\n", "2 unknown 7 5 236 2 -1 0 unknown 0\n", "3 unknown 9 6 948 5 -1 0 unknown 0\n", "4 unknown 9 6 332 2 -1 0 unknown 0\n", "... ... ... ... ... ... ... ... ... ..\n", - "36618 unknown 9 6 699 2 -1 0 unknown 0\n", - "36619 unknown 9 6 1137 3 -1 0 unknown 1\n", - "36620 unknown 9 6 189 2 -1 0 unknown 0\n", - "36621 unknown 9 6 119 3 -1 0 unknown 0\n", - "36622 unknown 9 6 100 5 -1 0 unknown 0\n", + "36629 unknown 9 6 553 2 -1 0 unknown 0\n", + "36630 unknown 9 6 1137 3 -1 0 unknown 1\n", + "36631 unknown 9 6 189 2 -1 0 unknown 0\n", + "36632 unknown 9 6 100 5 -1 0 unknown 0\n", + "36633 unknown 9 6 445 2 -1 0 unknown 0\n", "\n", - "[36623 rows x 9 columns]\n" + "[36634 rows x 9 columns]\n" ] } ], @@ -1997,32 +2033,32 @@ "text": [ " age job marital education default balance housing loan\n", "0 58 management married 3.0 0 2143 1 0\n", - "1 43 technician single 2.0 0 593 1 0\n", + "1 46 management married 3.0 0 229 1 0\n", "2 33 technician single 2.0 0 56 0 0\n", "3 42 technician married 2.0 0 8036 0 0\n", "4 38 admin. married 1.0 0 1487 0 0\n", "... ... ... ... ... ... ... ... ...\n", - "36618 46 technician married 2.0 0 399 1 0\n", - "36619 36 self-employed single 3.0 0 4844 0 0\n", - "36620 49 technician married 2.0 0 378 0 0\n", - "36621 49 housemaid married 1.0 0 3376 0 0\n", - "36622 40 blue-collar married 1.0 0 48 0 0\n", + "36629 38 blue-collar married 1.0 0 3289 0 0\n", + "36630 36 self-employed single 3.0 0 4844 0 0\n", + "36631 49 technician married 2.0 0 378 0 0\n", + "36632 40 blue-collar married 1.0 0 48 0 0\n", + "36633 46 services married 3.0 0 474 0 0\n", "\n", - "[36623 rows x 8 columns]\n", + "[36634 rows x 8 columns]\n", " contact day month duration campaign pdays previous poutcome y\n", "0 unknown 5 5 261 1 -1 0 unknown 0\n", - "1 unknown 5 5 55 1 -1 0 unknown 0\n", + "1 unknown 5 5 197 1 -1 0 unknown 0\n", "2 unknown 7 5 236 2 -1 0 unknown 0\n", "3 unknown 9 6 948 5 -1 0 unknown 0\n", "4 unknown 9 6 332 2 -1 0 unknown 0\n", "... ... ... ... ... ... ... ... ... ..\n", - "36618 unknown 9 6 699 2 -1 0 unknown 0\n", - "36619 unknown 9 6 1137 3 -1 0 unknown 1\n", - "36620 unknown 9 6 189 2 -1 0 unknown 0\n", - "36621 unknown 9 6 119 3 -1 0 unknown 0\n", - "36622 unknown 9 6 100 5 -1 0 unknown 0\n", + "36629 unknown 9 6 553 2 -1 0 unknown 0\n", + "36630 unknown 9 6 1137 3 -1 0 unknown 1\n", + "36631 unknown 9 6 189 2 -1 0 unknown 0\n", + "36632 unknown 9 6 100 5 -1 0 unknown 0\n", + "36633 unknown 9 6 445 2 -1 0 unknown 0\n", "\n", - "[36623 rows x 9 columns]\n" + "[36634 rows x 9 columns]\n" ] } ], @@ -2070,21 +2106,25 @@ "output_type": "stream", "text": [ "INFO:root:Create proxy actor with party alice.\n", - "INFO:root:Create proxy actor with party bob.\n", - "\u001B[2m\u001B[36m(pid=17408)\u001B[0m Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.\n", - "\u001B[2m\u001B[36m(pid=17366)\u001B[0m Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.\n", - "INFO:root:Create proxy actor with party alice.\n", - "INFO:root:Create proxy actor with party bob.\n", - "\u001B[2m\u001B[36m(_run pid=4620)\u001B[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'cuda': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'\n", - "\u001B[2m\u001B[36m(_run pid=4620)\u001B[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'rocm': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'\n", - "\u001B[2m\u001B[36m(_run pid=4620)\u001B[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'tpu': INVALID_ARGUMENT: TpuPlatform is not available.\n", - "\u001B[2m\u001B[36m(_run pid=4620)\u001B[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'plugin': xla_extension has no attributes named get_plugin_device_client. Compile TensorFlow with //tensorflow/compiler/xla/python:enable_plugin_device set to true (defaults to false) to enable this.\n", - "\u001B[2m\u001B[36m(_run pid=4620)\u001B[0m WARNING:jax._src.xla_bridge:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)\n", - "\u001B[2m\u001B[36m(_run pid=4609)\u001B[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'cuda': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'\n", - "\u001B[2m\u001B[36m(_run pid=4609)\u001B[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'rocm': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'\n", - "\u001B[2m\u001B[36m(_run pid=4609)\u001B[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'tpu': INVALID_ARGUMENT: TpuPlatform is not available.\n", - "\u001B[2m\u001B[36m(_run pid=4609)\u001B[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'plugin': xla_extension has no attributes named get_plugin_device_client. Compile TensorFlow with //tensorflow/compiler/xla/python:enable_plugin_device set to true (defaults to false) to enable this.\n", - "\u001B[2m\u001B[36m(_run pid=4609)\u001B[0m WARNING:jax._src.xla_bridge:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)\n" + "INFO:root:Create proxy actor with party bob.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[36m(_run pid=3805540)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'cuda': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'\n", + "\u001b[2m\u001b[36m(_run pid=3805540)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'rocm': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'\n", + "\u001b[2m\u001b[36m(_run pid=3805540)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'tpu': INVALID_ARGUMENT: TpuPlatform is not available.\n", + "\u001b[2m\u001b[36m(_run pid=3805540)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'plugin': xla_extension has no attributes named get_plugin_device_client. Compile TensorFlow with //tensorflow/compiler/xla/python:enable_plugin_device set to true (defaults to false) to enable this.\n", + "\u001b[2m\u001b[36m(_run pid=3805540)\u001b[0m WARNING:jax._src.xla_bridge:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)\n", + "\u001b[2m\u001b[36m(_run pid=3805590)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'cuda': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'\n", + "\u001b[2m\u001b[36m(_run pid=3805590)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'rocm': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'\n", + "\u001b[2m\u001b[36m(_run pid=3805590)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'tpu': INVALID_ARGUMENT: TpuPlatform is not available.\n", + "\u001b[2m\u001b[36m(_run pid=3805590)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'plugin': xla_extension has no attributes named get_plugin_device_client. Compile TensorFlow with //tensorflow/compiler/xla/python:enable_plugin_device set to true (defaults to false) to enable this.\n", + "\u001b[2m\u001b[36m(_run pid=3805590)\u001b[0m WARNING:jax._src.xla_bridge:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)\n", + "INFO:root:Create proxy actor with party alice.\n", + "INFO:root:Create proxy actor with party bob.\n" ] }, { @@ -2093,41 +2133,41 @@ "text": [ " age job marital education default balance housing loan\n", "0 58 management married 3.0 0 2143 1 0\n", - "1 43 technician single 2.0 0 593 1 0\n", + "1 46 management married 3.0 0 229 1 0\n", "2 33 technician single 2.0 0 56 0 0\n", "3 42 technician married 2.0 0 8036 0 0\n", "4 38 admin. married 1.0 0 1487 0 0\n", "... ... ... ... ... ... ... ... ...\n", - "36618 46 technician married 2.0 0 399 1 0\n", - "36619 36 self-employed single 3.0 0 4844 0 0\n", - "36620 49 technician married 2.0 0 378 0 0\n", - "36621 49 housemaid married 1.0 0 3376 0 0\n", - "36622 40 blue-collar married 1.0 0 48 0 0\n", + "36629 38 blue-collar married 1.0 0 3289 0 0\n", + "36630 36 self-employed single 3.0 0 4844 0 0\n", + "36631 49 technician married 2.0 0 378 0 0\n", + "36632 40 blue-collar married 1.0 0 48 0 0\n", + "36633 46 services married 3.0 0 474 0 0\n", "\n", - "[36623 rows x 8 columns]\n", + "[36634 rows x 8 columns]\n", " contact day month duration campaign pdays previous poutcome y\n", - "0 unknown 5 5 0.204988 1 -1 0 unknown 0\n", - "1 unknown 5 5 -5.700589 1 -1 0 unknown 0\n", - "2 unknown 7 5 -0.212423 2 -1 0 unknown 0\n", - "3 unknown 9 6 2.276706 5 -1 0 unknown 0\n", - "4 unknown 9 6 0.220221 2 -1 0 unknown 0\n", + "0 unknown 5 5 0.355090 1 -1 0 unknown 0\n", + "1 unknown 5 5 -1.136022 1 -1 0 unknown 0\n", + "2 unknown 7 5 -0.061611 2 -1 0 unknown 0\n", + "3 unknown 9 6 2.328878 5 -1 0 unknown 0\n", + "4 unknown 9 6 0.202048 2 -1 0 unknown 0\n", "... ... ... ... ... ... ... ... ... ..\n", - "36618 unknown 9 6 1.729960 2 -1 0 unknown 0\n", - "36619 unknown 9 6 2.276706 3 -1 0 unknown 1\n", - "36620 unknown 9 6 -0.444719 2 -1 0 unknown 0\n", - "36621 unknown 9 6 -1.564663 3 -1 0 unknown 0\n", - "36622 unknown 9 6 -1.564663 5 -1 0 unknown 0\n", + "36629 unknown 9 6 1.131077 2 -1 0 unknown 0\n", + "36630 unknown 9 6 2.328878 3 -1 0 unknown 1\n", + "36631 unknown 9 6 -0.394791 2 -1 0 unknown 0\n", + "36632 unknown 9 6 -1.708979 5 -1 0 unknown 0\n", + "36633 unknown 9 6 0.813627 2 -1 0 unknown 0\n", "\n", - "[36623 rows x 9 columns]\n" + "[36634 rows x 9 columns]\n" ] } ], "source": [ "from secretflow.preprocessing.binning.vert_woe_binning import VertWoeBinning\n", - "from secretflow.preprocessing.binning.vert_woe_substitution import VertWOESubstitution\n", + "from secretflow.preprocessing.binning.vert_bin_substitution import VertBinSubstitution\n", "\n", "binning = VertWoeBinning(spu)\n", - "woe_rules = binning.binning(\n", + "bin_rules = binning.binning(\n", " vdf,\n", " binning_method=\"chimerge\",\n", " bin_num=4,\n", @@ -2135,8 +2175,8 @@ " label_name=\"y\",\n", ")\n", "\n", - "woe_sub = VertWOESubstitution()\n", - "vdf = woe_sub.substitution(vdf, woe_rules)\n", + "woe_sub = VertBinSubstitution()\n", + "vdf = woe_sub.substitution(vdf, bin_rules)\n", "\n", "print(sf.reveal(vdf.partitions[alice].data))\n", "print(sf.reveal(vdf.partitions[bob].data))" @@ -2179,16 +2219,16 @@ "text": [ " age education default balance housing loan job_admin. \\\n", "0 58 3.0 0 2143 1 0 0.0 \n", - "1 43 2.0 0 593 1 0 0.0 \n", + "1 46 3.0 0 229 1 0 0.0 \n", "2 33 2.0 0 56 0 0 0.0 \n", "3 42 2.0 0 8036 0 0 0.0 \n", "4 38 1.0 0 1487 0 0 1.0 \n", "... ... ... ... ... ... ... ... \n", - "36618 46 2.0 0 399 1 0 0.0 \n", - "36619 36 3.0 0 4844 0 0 0.0 \n", - "36620 49 2.0 0 378 0 0 0.0 \n", - "36621 49 1.0 0 3376 0 0 0.0 \n", - "36622 40 1.0 0 48 0 0 0.0 \n", + "36629 38 1.0 0 3289 0 0 0.0 \n", + "36630 36 3.0 0 4844 0 0 0.0 \n", + "36631 49 2.0 0 378 0 0 0.0 \n", + "36632 40 1.0 0 48 0 0 0.0 \n", + "36633 46 3.0 0 474 0 0 0.0 \n", "\n", " job_blue-collar job_entrepreneur job_housemaid ... job_retired \\\n", "0 0.0 0.0 0.0 ... 0.0 \n", @@ -2197,64 +2237,64 @@ "3 0.0 0.0 0.0 ... 0.0 \n", "4 0.0 0.0 0.0 ... 0.0 \n", "... ... ... ... ... ... \n", - "36618 0.0 0.0 0.0 ... 0.0 \n", - "36619 0.0 0.0 0.0 ... 0.0 \n", - "36620 0.0 0.0 0.0 ... 0.0 \n", - "36621 0.0 0.0 1.0 ... 0.0 \n", - "36622 1.0 0.0 0.0 ... 0.0 \n", + "36629 1.0 0.0 0.0 ... 0.0 \n", + "36630 0.0 0.0 0.0 ... 0.0 \n", + "36631 0.0 0.0 0.0 ... 0.0 \n", + "36632 1.0 0.0 0.0 ... 0.0 \n", + "36633 0.0 0.0 0.0 ... 0.0 \n", "\n", " job_self-employed job_services job_student job_technician \\\n", "0 0.0 0.0 0.0 0.0 \n", - "1 0.0 0.0 0.0 1.0 \n", + "1 0.0 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 1.0 \n", "3 0.0 0.0 0.0 1.0 \n", "4 0.0 0.0 0.0 0.0 \n", "... ... ... ... ... \n", - "36618 0.0 0.0 0.0 1.0 \n", - "36619 1.0 0.0 0.0 0.0 \n", - "36620 0.0 0.0 0.0 1.0 \n", - "36621 0.0 0.0 0.0 0.0 \n", - "36622 0.0 0.0 0.0 0.0 \n", + "36629 0.0 0.0 0.0 0.0 \n", + "36630 1.0 0.0 0.0 0.0 \n", + "36631 0.0 0.0 0.0 1.0 \n", + "36632 0.0 0.0 0.0 0.0 \n", + "36633 0.0 1.0 0.0 0.0 \n", "\n", " job_unemployed job_unknown marital_divorced marital_married \\\n", "0 0.0 0.0 0.0 1.0 \n", - "1 0.0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 1.0 \n", "2 0.0 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 1.0 \n", "4 0.0 0.0 0.0 1.0 \n", "... ... ... ... ... \n", - "36618 0.0 0.0 0.0 1.0 \n", - "36619 0.0 0.0 0.0 0.0 \n", - "36620 0.0 0.0 0.0 1.0 \n", - "36621 0.0 0.0 0.0 1.0 \n", - "36622 0.0 0.0 0.0 1.0 \n", + "36629 0.0 0.0 0.0 1.0 \n", + "36630 0.0 0.0 0.0 0.0 \n", + "36631 0.0 0.0 0.0 1.0 \n", + "36632 0.0 0.0 0.0 1.0 \n", + "36633 0.0 0.0 0.0 1.0 \n", "\n", " marital_single \n", "0 0.0 \n", - "1 1.0 \n", + "1 0.0 \n", "2 1.0 \n", "3 0.0 \n", "4 0.0 \n", "... ... \n", - "36618 0.0 \n", - "36619 1.0 \n", - "36620 0.0 \n", - "36621 0.0 \n", - "36622 0.0 \n", + "36629 0.0 \n", + "36630 1.0 \n", + "36631 0.0 \n", + "36632 0.0 \n", + "36633 0.0 \n", "\n", - "[36623 rows x 21 columns]\n", + "[36634 rows x 21 columns]\n", " duration campaign pdays previous y contact_cellular \\\n", - "0 0.204988 1 -1 0 0 0.0 \n", - "1 -5.700589 1 -1 0 0 0.0 \n", - "2 -0.212423 2 -1 0 0 0.0 \n", - "3 2.276706 5 -1 0 0 0.0 \n", - "4 0.220221 2 -1 0 0 0.0 \n", + "0 0.355090 1 -1 0 0 0.0 \n", + "1 -1.136022 1 -1 0 0 0.0 \n", + "2 -0.061611 2 -1 0 0 0.0 \n", + "3 2.328878 5 -1 0 0 0.0 \n", + "4 0.202048 2 -1 0 0 0.0 \n", "... ... ... ... ... .. ... \n", - "36618 1.729960 2 -1 0 0 0.0 \n", - "36619 2.276706 3 -1 0 1 0.0 \n", - "36620 -0.444719 2 -1 0 0 0.0 \n", - "36621 -1.564663 3 -1 0 0 0.0 \n", - "36622 -1.564663 5 -1 0 0 0.0 \n", + "36629 1.131077 2 -1 0 0 0.0 \n", + "36630 2.328878 3 -1 0 1 0.0 \n", + "36631 -0.394791 2 -1 0 0 0.0 \n", + "36632 -1.708979 5 -1 0 0 0.0 \n", + "36633 0.813627 2 -1 0 0 0.0 \n", "\n", " contact_telephone contact_unknown month_1.0 month_2.0 ... \\\n", "0 0.0 1.0 0.0 0.0 ... \n", @@ -2263,11 +2303,11 @@ "3 0.0 1.0 0.0 0.0 ... \n", "4 0.0 1.0 0.0 0.0 ... \n", "... ... ... ... ... ... \n", - "36618 0.0 1.0 0.0 0.0 ... \n", - "36619 0.0 1.0 0.0 0.0 ... \n", - "36620 0.0 1.0 0.0 0.0 ... \n", - "36621 0.0 1.0 0.0 0.0 ... \n", - "36622 0.0 1.0 0.0 0.0 ... \n", + "36629 0.0 1.0 0.0 0.0 ... \n", + "36630 0.0 1.0 0.0 0.0 ... \n", + "36631 0.0 1.0 0.0 0.0 ... \n", + "36632 0.0 1.0 0.0 0.0 ... \n", + "36633 0.0 1.0 0.0 0.0 ... \n", "\n", " day_26.0 day_27.0 day_28.0 day_29.0 day_30.0 day_31.0 \\\n", "0 0.0 0.0 0.0 0.0 0.0 0.0 \n", @@ -2276,11 +2316,11 @@ "3 0.0 0.0 0.0 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 0.0 0.0 0.0 \n", "... ... ... ... ... ... ... \n", - "36618 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "36619 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "36620 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "36621 0.0 0.0 0.0 0.0 0.0 0.0 \n", - "36622 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "36629 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "36630 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "36631 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "36632 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "36633 0.0 0.0 0.0 0.0 0.0 0.0 \n", "\n", " poutcome_failure poutcome_other poutcome_success poutcome_unknown \n", "0 0.0 0.0 0.0 1.0 \n", @@ -2289,13 +2329,13 @@ "3 0.0 0.0 0.0 1.0 \n", "4 0.0 0.0 0.0 1.0 \n", "... ... ... ... ... \n", - "36618 0.0 0.0 0.0 1.0 \n", - "36619 0.0 0.0 0.0 1.0 \n", - "36620 0.0 0.0 0.0 1.0 \n", - "36621 0.0 0.0 0.0 1.0 \n", - "36622 0.0 0.0 0.0 1.0 \n", + "36629 0.0 0.0 0.0 1.0 \n", + "36630 0.0 0.0 0.0 1.0 \n", + "36631 0.0 0.0 0.0 1.0 \n", + "36632 0.0 0.0 0.0 1.0 \n", + "36633 0.0 0.0 0.0 1.0 \n", "\n", - "[36623 rows x 55 columns]\n" + "[36634 rows x 55 columns]\n" ] } ], @@ -2358,11 +2398,10 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001B[2m\u001B[36m(_run pid=4609)\u001B[0m /home/fengjun.feng/anaconda3/envs/sf/lib/python3.8/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but StandardScaler was fitted without feature names\n", - "\u001B[2m\u001B[36m(_run pid=4609)\u001B[0m warnings.warn(\n", - "\u001B[2m\u001B[36m(_run pid=4620)\u001B[0m /home/fengjun.feng/anaconda3/envs/sf/lib/python3.8/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but StandardScaler was fitted without feature names\n", - "\u001B[2m\u001B[36m(_run pid=4620)\u001B[0m warnings.warn(\n", - "\u001B[2m\u001B[36m(pid=4610)\u001B[0m Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.\n" + "\u001b[2m\u001b[36m(_run pid=3805540)\u001b[0m /home/zoupeicheng.zpc/miniconda3/envs/py38/lib/python3.8/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but StandardScaler was fitted without feature names\n", + "\u001b[2m\u001b[36m(_run pid=3805540)\u001b[0m warnings.warn(\n", + "\u001b[2m\u001b[36m(_run pid=3805590)\u001b[0m /home/zoupeicheng.zpc/miniconda3/envs/py38/lib/python3.8/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but StandardScaler was fitted without feature names\n", + "\u001b[2m\u001b[36m(_run pid=3805590)\u001b[0m warnings.warn(\n" ] }, { @@ -2370,124 +2409,124 @@ "output_type": "stream", "text": [ " age education default balance housing loan \\\n", - "0 1.600132 1.317305 -0.135366 0.254167 0.896115 -0.438311 \n", - "1 0.190420 -0.216431 -0.135366 -0.251669 0.896115 -0.438311 \n", - "2 -0.749388 -0.216431 -0.135366 -0.426916 -1.115928 -0.438311 \n", - "3 0.096439 -0.216431 -0.135366 2.177321 -1.115928 -0.438311 \n", - "4 -0.279484 -1.750166 -0.135366 0.040084 -1.115928 -0.438311 \n", + "0 1.608391 1.314552 -0.135554 0.256661 0.893944 -0.436771 \n", + "1 0.477140 1.314552 -0.135554 -0.376164 0.893944 -0.436771 \n", + "2 -0.748383 -0.217705 -0.135554 -0.433363 -1.118638 -0.436771 \n", + "3 0.100056 -0.217705 -0.135554 2.205062 -1.118638 -0.436771 \n", + "4 -0.277028 -1.749962 -0.135554 0.039768 -1.118638 -0.436771 \n", "... ... ... ... ... ... ... \n", - "36618 0.472362 -0.216431 -0.135366 -0.314980 0.896115 -0.438311 \n", - "36619 -0.467446 1.317305 -0.135366 1.135626 -1.115928 -0.438311 \n", - "36620 0.754305 -0.216431 -0.135366 -0.321833 -1.115928 -0.438311 \n", - "36621 0.754305 -1.750166 -0.135366 0.656551 -1.115928 -0.438311 \n", - "36622 -0.091522 -1.750166 -0.135366 -0.429527 -1.115928 -0.438311 \n", + "36629 -0.277028 -1.749962 -0.135554 0.635563 -1.118638 -0.436771 \n", + "36630 -0.465570 1.314552 -0.135554 1.149692 -1.118638 -0.436771 \n", + "36631 0.759952 -0.217705 -0.135554 -0.326900 -1.118638 -0.436771 \n", + "36632 -0.088486 -1.749962 -0.135554 -0.436008 -1.118638 -0.436771 \n", + "36633 0.477140 1.314552 -0.135554 -0.295160 -1.118638 -0.436771 \n", "\n", " job_admin. job_blue-collar job_entrepreneur job_housemaid ... \\\n", - "0 -0.359042 -0.52347 -0.184134 -0.167632 ... \n", - "1 -0.359042 -0.52347 -0.184134 -0.167632 ... \n", - "2 -0.359042 -0.52347 -0.184134 -0.167632 ... \n", - "3 -0.359042 -0.52347 -0.184134 -0.167632 ... \n", - "4 2.785192 -0.52347 -0.184134 -0.167632 ... \n", + "0 -0.360094 -0.525105 -0.185291 -0.16795 ... \n", + "1 -0.360094 -0.525105 -0.185291 -0.16795 ... \n", + "2 -0.360094 -0.525105 -0.185291 -0.16795 ... \n", + "3 -0.360094 -0.525105 -0.185291 -0.16795 ... \n", + "4 2.777051 -0.525105 -0.185291 -0.16795 ... \n", "... ... ... ... ... ... \n", - "36618 -0.359042 -0.52347 -0.184134 -0.167632 ... \n", - "36619 -0.359042 -0.52347 -0.184134 -0.167632 ... \n", - "36620 -0.359042 -0.52347 -0.184134 -0.167632 ... \n", - "36621 -0.359042 -0.52347 -0.184134 5.965435 ... \n", - "36622 -0.359042 1.91033 -0.184134 -0.167632 ... \n", + "36629 -0.360094 1.904383 -0.185291 -0.16795 ... \n", + "36630 -0.360094 -0.525105 -0.185291 -0.16795 ... \n", + "36631 -0.360094 -0.525105 -0.185291 -0.16795 ... \n", + "36632 -0.360094 1.904383 -0.185291 -0.16795 ... \n", + "36633 -0.360094 -0.525105 -0.185291 -0.16795 ... \n", "\n", " job_retired job_self-employed job_services job_student \\\n", - "0 -0.230656 -0.191305 -0.316887 -0.148 \n", - "1 -0.230656 -0.191305 -0.316887 -0.148 \n", - "2 -0.230656 -0.191305 -0.316887 -0.148 \n", - "3 -0.230656 -0.191305 -0.316887 -0.148 \n", - "4 -0.230656 -0.191305 -0.316887 -0.148 \n", + "0 -0.227915 -0.189196 -0.316887 -0.145356 \n", + "1 -0.227915 -0.189196 -0.316887 -0.145356 \n", + "2 -0.227915 -0.189196 -0.316887 -0.145356 \n", + "3 -0.227915 -0.189196 -0.316887 -0.145356 \n", + "4 -0.227915 -0.189196 -0.316887 -0.145356 \n", "... ... ... ... ... \n", - "36618 -0.230656 -0.191305 -0.316887 -0.148 \n", - "36619 -0.230656 5.227241 -0.316887 -0.148 \n", - "36620 -0.230656 -0.191305 -0.316887 -0.148 \n", - "36621 -0.230656 -0.191305 -0.316887 -0.148 \n", - "36622 -0.230656 -0.191305 -0.316887 -0.148 \n", + "36629 -0.227915 -0.189196 -0.316887 -0.145356 \n", + "36630 -0.227915 5.285528 -0.316887 -0.145356 \n", + "36631 -0.227915 -0.189196 -0.316887 -0.145356 \n", + "36632 -0.227915 -0.189196 -0.316887 -0.145356 \n", + "36633 -0.227915 -0.189196 3.155697 -0.145356 \n", "\n", " job_technician job_unemployed job_unknown marital_divorced \\\n", - "0 -0.450251 -0.171298 -0.081048 -0.359187 \n", - "1 2.220984 -0.171298 -0.081048 -0.359187 \n", - "2 2.220984 -0.171298 -0.081048 -0.359187 \n", - "3 2.220984 -0.171298 -0.081048 -0.359187 \n", - "4 -0.450251 -0.171298 -0.081048 -0.359187 \n", + "0 -0.449906 -0.172953 -0.080522 -0.362219 \n", + "1 -0.449906 -0.172953 -0.080522 -0.362219 \n", + "2 2.222685 -0.172953 -0.080522 -0.362219 \n", + "3 2.222685 -0.172953 -0.080522 -0.362219 \n", + "4 -0.449906 -0.172953 -0.080522 -0.362219 \n", "... ... ... ... ... \n", - "36618 2.220984 -0.171298 -0.081048 -0.359187 \n", - "36619 -0.450251 -0.171298 -0.081048 -0.359187 \n", - "36620 2.220984 -0.171298 -0.081048 -0.359187 \n", - "36621 -0.450251 -0.171298 -0.081048 -0.359187 \n", - "36622 -0.450251 -0.171298 -0.081048 -0.359187 \n", + "36629 -0.449906 -0.172953 -0.080522 -0.362219 \n", + "36630 -0.449906 -0.172953 -0.080522 -0.362219 \n", + "36631 2.222685 -0.172953 -0.080522 -0.362219 \n", + "36632 -0.449906 -0.172953 -0.080522 -0.362219 \n", + "36633 -0.449906 -0.172953 -0.080522 -0.362219 \n", "\n", " marital_married marital_single \n", - "0 0.811943 -0.628323 \n", - "1 -1.231613 1.591538 \n", - "2 -1.231613 1.591538 \n", - "3 0.811943 -0.628323 \n", - "4 0.811943 -0.628323 \n", + "0 0.815216 -0.628656 \n", + "1 0.815216 -0.628656 \n", + "2 -1.226669 1.590694 \n", + "3 0.815216 -0.628656 \n", + "4 0.815216 -0.628656 \n", "... ... ... \n", - "36618 0.811943 -0.628323 \n", - "36619 -1.231613 1.591538 \n", - "36620 0.811943 -0.628323 \n", - "36621 0.811943 -0.628323 \n", - "36622 0.811943 -0.628323 \n", + "36629 0.815216 -0.628656 \n", + "36630 -1.226669 1.590694 \n", + "36631 0.815216 -0.628656 \n", + "36632 0.815216 -0.628656 \n", + "36633 0.815216 -0.628656 \n", "\n", - "[36623 rows x 21 columns]\n", + "[36634 rows x 21 columns]\n", " duration campaign pdays previous y contact_cellular \\\n", - "0 0.572287 -0.570990 -0.413789 -0.244781 0 -1.3578 \n", - "1 -2.892304 -0.570990 -0.413789 -0.244781 0 -1.3578 \n", - "2 0.327407 -0.243733 -0.413789 -0.244781 0 -1.3578 \n", - "3 1.787691 0.738036 -0.413789 -0.244781 0 -1.3578 \n", - "4 0.581224 -0.243733 -0.413789 -0.244781 0 -1.3578 \n", + "0 0.646636 -0.570738 -0.412237 -0.243872 0 -1.359335 \n", + "1 -0.174118 -0.570738 -0.412237 -0.243872 0 -1.359335 \n", + "2 0.417271 -0.246893 -0.412237 -0.243872 0 -1.359335 \n", + "3 1.733068 0.724643 -0.412237 -0.243872 0 -1.359335 \n", + "4 0.562397 -0.246893 -0.412237 -0.243872 0 -1.359335 \n", "... ... ... ... ... .. ... \n", - "36618 1.466934 -0.243733 -0.413789 -0.244781 0 -1.3578 \n", - "36619 1.787691 0.083523 -0.413789 -0.244781 1 -1.3578 \n", - "36620 0.191127 -0.243733 -0.413789 -0.244781 0 -1.3578 \n", - "36621 -0.465904 0.083523 -0.413789 -0.244781 0 -1.3578 \n", - "36622 -0.465904 0.738036 -0.413789 -0.244781 0 -1.3578 \n", + "36629 1.073762 -0.246893 -0.412237 -0.243872 0 -1.359335 \n", + "36630 1.733068 0.076952 -0.412237 -0.243872 1 -1.359335 \n", + "36631 0.233878 -0.246893 -0.412237 -0.243872 0 -1.359335 \n", + "36632 -0.489491 0.724643 -0.412237 -0.243872 0 -1.359335 \n", + "36633 0.899028 -0.246893 -0.412237 -0.243872 0 -1.359335 \n", "\n", " contact_telephone contact_unknown month_1.0 month_2.0 ... \\\n", - "0 -0.262983 1.576257 -0.180053 -0.25201 ... \n", - "1 -0.262983 1.576257 -0.180053 -0.25201 ... \n", - "2 -0.262983 1.576257 -0.180053 -0.25201 ... \n", - "3 -0.262983 1.576257 -0.180053 -0.25201 ... \n", - "4 -0.262983 1.576257 -0.180053 -0.25201 ... \n", + "0 -0.261573 1.575748 -0.178158 -0.249263 ... \n", + "1 -0.261573 1.575748 -0.178158 -0.249263 ... \n", + "2 -0.261573 1.575748 -0.178158 -0.249263 ... \n", + "3 -0.261573 1.575748 -0.178158 -0.249263 ... \n", + "4 -0.261573 1.575748 -0.178158 -0.249263 ... \n", "... ... ... ... ... ... \n", - "36618 -0.262983 1.576257 -0.180053 -0.25201 ... \n", - "36619 -0.262983 1.576257 -0.180053 -0.25201 ... \n", - "36620 -0.262983 1.576257 -0.180053 -0.25201 ... \n", - "36621 -0.262983 1.576257 -0.180053 -0.25201 ... \n", - "36622 -0.262983 1.576257 -0.180053 -0.25201 ... \n", + "36629 -0.261573 1.575748 -0.178158 -0.249263 ... \n", + "36630 -0.261573 1.575748 -0.178158 -0.249263 ... \n", + "36631 -0.261573 1.575748 -0.178158 -0.249263 ... \n", + "36632 -0.261573 1.575748 -0.178158 -0.249263 ... \n", + "36633 -0.261573 1.575748 -0.178158 -0.249263 ... \n", "\n", " day_26.0 day_27.0 day_28.0 day_29.0 day_30.0 day_31.0 \\\n", - "0 -0.152467 -0.158816 -0.207087 -0.199218 -0.190383 -0.119779 \n", - "1 -0.152467 -0.158816 -0.207087 -0.199218 -0.190383 -0.119779 \n", - "2 -0.152467 -0.158816 -0.207087 -0.199218 -0.190383 -0.119779 \n", - "3 -0.152467 -0.158816 -0.207087 -0.199218 -0.190383 -0.119779 \n", - "4 -0.152467 -0.158816 -0.207087 -0.199218 -0.190383 -0.119779 \n", + "0 -0.15375 -0.160947 -0.204243 -0.200148 -0.188032 -0.120812 \n", + "1 -0.15375 -0.160947 -0.204243 -0.200148 -0.188032 -0.120812 \n", + "2 -0.15375 -0.160947 -0.204243 -0.200148 -0.188032 -0.120812 \n", + "3 -0.15375 -0.160947 -0.204243 -0.200148 -0.188032 -0.120812 \n", + "4 -0.15375 -0.160947 -0.204243 -0.200148 -0.188032 -0.120812 \n", "... ... ... ... ... ... ... \n", - "36618 -0.152467 -0.158816 -0.207087 -0.199218 -0.190383 -0.119779 \n", - "36619 -0.152467 -0.158816 -0.207087 -0.199218 -0.190383 -0.119779 \n", - "36620 -0.152467 -0.158816 -0.207087 -0.199218 -0.190383 -0.119779 \n", - "36621 -0.152467 -0.158816 -0.207087 -0.199218 -0.190383 -0.119779 \n", - "36622 -0.152467 -0.158816 -0.207087 -0.199218 -0.190383 -0.119779 \n", + "36629 -0.15375 -0.160947 -0.204243 -0.200148 -0.188032 -0.120812 \n", + "36630 -0.15375 -0.160947 -0.204243 -0.200148 -0.188032 -0.120812 \n", + "36631 -0.15375 -0.160947 -0.204243 -0.200148 -0.188032 -0.120812 \n", + "36632 -0.15375 -0.160947 -0.204243 -0.200148 -0.188032 -0.120812 \n", + "36633 -0.15375 -0.160947 -0.204243 -0.200148 -0.188032 -0.120812 \n", "\n", " poutcome_failure poutcome_other poutcome_success poutcome_unknown \n", - "0 -0.349768 -0.207302 -0.18775 0.475263 \n", - "1 -0.349768 -0.207302 -0.18775 0.475263 \n", - "2 -0.349768 -0.207302 -0.18775 0.475263 \n", - "3 -0.349768 -0.207302 -0.18775 0.475263 \n", - "4 -0.349768 -0.207302 -0.18775 0.475263 \n", + "0 -0.350446 -0.204025 -0.18733 0.473664 \n", + "1 -0.350446 -0.204025 -0.18733 0.473664 \n", + "2 -0.350446 -0.204025 -0.18733 0.473664 \n", + "3 -0.350446 -0.204025 -0.18733 0.473664 \n", + "4 -0.350446 -0.204025 -0.18733 0.473664 \n", "... ... ... ... ... \n", - "36618 -0.349768 -0.207302 -0.18775 0.475263 \n", - "36619 -0.349768 -0.207302 -0.18775 0.475263 \n", - "36620 -0.349768 -0.207302 -0.18775 0.475263 \n", - "36621 -0.349768 -0.207302 -0.18775 0.475263 \n", - "36622 -0.349768 -0.207302 -0.18775 0.475263 \n", + "36629 -0.350446 -0.204025 -0.18733 0.473664 \n", + "36630 -0.350446 -0.204025 -0.18733 0.473664 \n", + "36631 -0.350446 -0.204025 -0.18733 0.473664 \n", + "36632 -0.350446 -0.204025 -0.18733 0.473664 \n", + "36633 -0.350446 -0.204025 -0.18733 0.473664 \n", "\n", - "[36623 rows x 55 columns]\n" + "[36634 rows x 55 columns]\n" ] } ], @@ -2620,1826 +2659,1826 @@ " \n", " age\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -2.159100\n", - " 5.077423\n", - " 1.490039e-16\n", + " -2.162447\n", + " 5.096416\n", + " -1.877506e-16\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 0.683627\n", - " 3.332610\n", + " 0.682366\n", + " 3.328235\n", " 1.000000\n", - " 0.683627\n", - " 3.332610\n", - " 5.456968e-12\n", - " 36623.0\n", - " 2.503646e+04\n", - " 1.220502e+05\n", + " 0.682366\n", + " 3.328235\n", + " -6.878054e-12\n", + " 36634.0\n", + " 2.499780e+04\n", + " 1.219266e+05\n", " \n", " \n", " education\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -1.750166\n", - " 1.317305\n", - " -2.235058e-16\n", + " -1.749962\n", + " 1.314552\n", + " 9.309945e-18\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " -0.150311\n", - " 2.309413\n", + " -0.152303\n", + " 2.305096\n", " 1.000000\n", - " -0.150311\n", - " 2.309413\n", - " -8.185452e-12\n", - " 36623.0\n", - " -5.504831e+03\n", - " 8.457763e+04\n", + " -0.152303\n", + " 2.305096\n", + " 3.410605e-13\n", + " 36634.0\n", + " -5.579460e+03\n", + " 8.444490e+04\n", " \n", " \n", " default\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.135366\n", - " 7.387394\n", - " -2.483398e-17\n", + " -0.135554\n", + " 7.377133\n", + " 4.965304e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 7.252029\n", - " 53.591920\n", + " 7.241579\n", + " 53.440463\n", " 1.000000\n", - " 7.252029\n", - " 53.591920\n", - " -9.094947e-13\n", - " 36623.0\n", - " 2.655910e+05\n", - " 1.962697e+06\n", + " 7.241579\n", + " 53.440463\n", + " 1.818989e-12\n", + " 36634.0\n", + " 2.652880e+05\n", + " 1.957738e+06\n", " \n", " \n", " balance\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -3.062156\n", - " 32.883499\n", - " 3.880309e-17\n", + " -1.566762\n", + " 32.087712\n", + " -1.396492e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 8.470364\n", - " 147.645486\n", + " 7.967780\n", + " 128.947991\n", " 1.000000\n", - " 8.470364\n", - " 147.645486\n", - " 1.421085e-12\n", - " 36623.0\n", - " 3.102101e+05\n", - " 5.407221e+06\n", + " 7.967780\n", + " 128.947991\n", + " -5.115908e-13\n", + " 36634.0\n", + " 2.918916e+05\n", + " 4.723881e+06\n", " \n", " \n", " housing\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -1.115928\n", - " 0.896115\n", - " 2.110888e-16\n", + " -1.118638\n", + " 0.893944\n", + " -3.103315e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " -0.219812\n", - " 1.048317\n", + " -0.224695\n", + " 1.050488\n", " 1.000000\n", - " -0.219812\n", - " 1.048317\n", - " 7.730705e-12\n", - " 36623.0\n", - " -8.050184e+03\n", - " 3.839253e+04\n", + " -0.224695\n", + " 1.050488\n", + " -1.136868e-12\n", + " 36634.0\n", + " -8.231462e+03\n", + " 3.848356e+04\n", " \n", " \n", " loan\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.438311\n", - " 2.281487\n", - " 7.062162e-17\n", + " -0.436771\n", + " 2.289530\n", + " 5.779924e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 1.843177\n", - " 4.397301\n", + " 1.852760\n", + " 4.432718\n", " 1.000000\n", - " 1.843177\n", - " 4.397301\n", - " 2.586376e-12\n", - " 36623.0\n", - " 6.750267e+04\n", - " 1.610423e+05\n", + " 1.852760\n", + " 4.432718\n", + " 2.117417e-12\n", + " 36634.0\n", + " 6.787399e+04\n", + " 1.623882e+05\n", " \n", " \n", " job_admin.\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.359042\n", - " 2.785192\n", - " -2.561004e-17\n", + " -0.360094\n", + " 2.777051\n", + " -2.288695e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 2.426150\n", - " 6.886204\n", + " 2.416956\n", + " 6.841677\n", " 1.000000\n", - " 2.426150\n", - " 6.886204\n", - " -9.379164e-13\n", - " 36623.0\n", - " 8.885289e+04\n", - " 2.521935e+05\n", + " 2.416956\n", + " 6.841677\n", + " -8.384404e-13\n", + " 36634.0\n", + " 8.854277e+04\n", + " 2.506380e+05\n", " \n", " \n", " job_blue-collar\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.523470\n", - " 1.910330\n", - " -4.035521e-17\n", + " -0.525105\n", + " 1.904383\n", + " 3.103315e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 1.386860\n", - " 2.923380\n", + " 1.379278\n", + " 2.902408\n", " 1.000000\n", - " 1.386860\n", - " 2.923380\n", - " -1.477929e-12\n", - " 36623.0\n", - " 5.079096e+04\n", - " 1.070629e+05\n", + " 1.379278\n", + " 2.902408\n", + " 1.136868e-12\n", + " 36634.0\n", + " 5.052848e+04\n", + " 1.063268e+05\n", " \n", " \n", " job_entrepreneur\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.184134\n", - " 5.430815\n", - " 1.940154e-18\n", + " -0.185291\n", + " 5.396911\n", + " -9.697859e-18\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 5.246681\n", - " 28.527661\n", + " 5.211619\n", + " 28.160978\n", " 1.000000\n", - " 5.246681\n", - " 28.527661\n", - " 7.105427e-14\n", - " 36623.0\n", - " 1.921492e+05\n", - " 1.044769e+06\n", + " 5.211619\n", + " 28.160978\n", + " -3.552714e-13\n", + " 36634.0\n", + " 1.909225e+05\n", + " 1.031649e+06\n", " \n", " \n", " job_housemaid\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.167632\n", - " 5.965435\n", - " -5.432432e-18\n", + " -0.167950\n", + " 5.954136\n", + " 5.430801e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 5.797803\n", - " 34.614514\n", + " 5.786186\n", + " 34.479949\n", " 1.000000\n", - " 5.797803\n", - " 34.614514\n", - " -1.989520e-13\n", - " 36623.0\n", - " 2.123329e+05\n", - " 1.267687e+06\n", + " 5.786186\n", + " 34.479949\n", + " 1.989520e-12\n", + " 36634.0\n", + " 2.119711e+05\n", + " 1.263138e+06\n", " \n", " \n", " job_management\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.513253\n", - " 1.948358\n", - " -1.179614e-16\n", + " -0.513622\n", + " 1.946956\n", + " 5.430801e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 1.435105\n", - " 3.059526\n", + " 1.433333\n", + " 3.054445\n", " 1.000000\n", - " 1.435105\n", - " 3.059526\n", - " -4.320100e-12\n", - " 36623.0\n", - " 5.255784e+04\n", - " 1.120490e+05\n", + " 1.433333\n", + " 3.054445\n", + " 1.989520e-12\n", + " 36634.0\n", + " 5.250874e+04\n", + " 1.118965e+05\n", " \n", " \n", " job_retired\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.230656\n", - " 4.335460\n", - " -2.172973e-17\n", + " -0.227915\n", + " 4.387592\n", + " 6.361796e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 4.104804\n", - " 17.849418\n", + " 4.159677\n", + " 18.302913\n", " 1.000000\n", - " 4.104804\n", - " 17.849418\n", - " -7.958079e-13\n", - " 36623.0\n", - " 1.503302e+05\n", - " 6.536993e+05\n", + " 4.159677\n", + " 18.302913\n", + " 2.330580e-12\n", + " 36634.0\n", + " 1.523856e+05\n", + " 6.705089e+05\n", " \n", " \n", " job_self-employed\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.191305\n", - " 5.227241\n", - " -5.820463e-17\n", + " -0.189196\n", + " 5.285528\n", + " -4.267058e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 5.035936\n", - " 26.360650\n", + " 5.096332\n", + " 26.972604\n", " 1.000000\n", - " 5.035936\n", - " 26.360650\n", - " -2.131628e-12\n", - " 36623.0\n", - " 1.844311e+05\n", - " 9.654061e+05\n", + " 5.096332\n", + " 26.972604\n", + " -1.563194e-12\n", + " 36634.0\n", + " 1.866990e+05\n", + " 9.881144e+05\n", " \n", " \n", " job_services\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", " -0.316887\n", - " 3.155695\n", - " 2.483398e-17\n", + " 3.155697\n", + " 6.361796e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 2.838807\n", - " 9.058826\n", + " 2.838809\n", + " 9.058838\n", " 1.000000\n", - " 2.838807\n", - " 9.058826\n", - " 9.094947e-13\n", - " 36623.0\n", - " 1.039656e+05\n", - " 3.317614e+05\n", + " 2.838809\n", + " 9.058838\n", + " 2.330580e-12\n", + " 36634.0\n", + " 1.039969e+05\n", + " 3.318615e+05\n", " \n", " \n", " job_student\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.148000\n", - " 6.756738\n", - " -1.241699e-17\n", + " -0.145356\n", + " 6.879667\n", + " -9.309945e-18\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 6.608737\n", - " 44.675407\n", + " 6.734311\n", + " 46.350944\n", " 1.000000\n", - " 6.608737\n", - " 44.675407\n", - " -4.547474e-13\n", - " 36623.0\n", - " 2.420318e+05\n", - " 1.636147e+06\n", + " 6.734311\n", + " 46.350944\n", + " -3.410605e-13\n", + " 36634.0\n", + " 2.467047e+05\n", + " 1.698020e+06\n", " \n", " \n", " job_technician\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.450251\n", - " 2.220984\n", - " 4.074324e-17\n", + " -0.449906\n", + " 2.222685\n", + " -1.861989e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 1.770734\n", - " 4.135498\n", + " 1.772778\n", + " 4.142743\n", " 1.000000\n", - " 1.770734\n", - " 4.135498\n", - " 1.492140e-12\n", - " 36623.0\n", - " 6.484958e+04\n", - " 1.514543e+05\n", + " 1.772778\n", + " 4.142743\n", + " -6.821210e-13\n", + " 36634.0\n", + " 6.494396e+04\n", + " 1.517653e+05\n", " \n", " \n", " job_unemployed\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.171298\n", - " 5.837765\n", - " -1.668533e-17\n", + " -0.172953\n", + " 5.781907\n", + " 1.474075e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 5.666467\n", - " 33.108845\n", + " 5.608954\n", + " 32.460364\n", " 1.000000\n", - " 5.666467\n", - " 33.108845\n", - " -6.110668e-13\n", - " 36623.0\n", - " 2.075230e+05\n", - " 1.212545e+06\n", + " 5.608954\n", + " 32.460364\n", + " 5.400125e-13\n", + " 36634.0\n", + " 2.054784e+05\n", + " 1.189153e+06\n", " \n", " \n", " job_unknown\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.081048\n", - " 12.338327\n", - " -6.984556e-18\n", + " -0.080522\n", + " 12.418889\n", + " -1.512866e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 12.257279\n", - " 151.240878\n", + " 12.338367\n", + " 153.235297\n", " 1.000000\n", - " 12.257279\n", - " 151.240878\n", - " -2.557954e-13\n", - " 36623.0\n", - " 4.488983e+05\n", - " 5.538895e+06\n", + " 12.338367\n", + " 153.235297\n", + " -5.542233e-13\n", + " 36634.0\n", + " 4.520037e+05\n", + " 5.613622e+06\n", " \n", " \n", " marital_divorced\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.359187\n", - " 2.784065\n", - " 1.319305e-17\n", + " -0.362219\n", + " 2.760760\n", + " 2.754192e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 2.424877\n", - " 6.880031\n", + " 2.398540\n", + " 6.752996\n", " 1.000000\n", - " 2.424877\n", - " 6.880031\n", - " 4.831691e-13\n", - " 36623.0\n", - " 8.880629e+04\n", - " 2.519674e+05\n", + " 2.398540\n", + " 6.752996\n", + " 1.008971e-12\n", + " 36634.0\n", + " 8.786813e+04\n", + " 2.473893e+05\n", " \n", " \n", " marital_married\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -1.231613\n", - " 0.811943\n", - " -2.696815e-17\n", + " -1.226669\n", + " 0.815216\n", + " -1.334425e-16\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " -0.419670\n", - " 1.176123\n", + " -0.411454\n", + " 1.169294\n", " 1.000000\n", - " -0.419670\n", - " 1.176123\n", - " -9.876544e-13\n", - " 36623.0\n", - " -1.536959e+04\n", - " 4.307316e+04\n", + " -0.411454\n", + " 1.169294\n", + " -4.888534e-12\n", + " 36634.0\n", + " -1.507319e+04\n", + " 4.283592e+04\n", " \n", " \n", " marital_single\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.628323\n", - " 1.591538\n", - " 1.416313e-16\n", + " -0.628656\n", + " 1.590694\n", + " 6.012673e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 0.963215\n", - " 1.927782\n", + " 0.962038\n", + " 1.925516\n", " 1.000000\n", - " 0.963215\n", - " 1.927782\n", - " 5.186962e-12\n", - " 36623.0\n", - " 3.527581e+04\n", - " 7.060118e+04\n", + " 0.962038\n", + " 1.925516\n", + " 2.202682e-12\n", + " 36634.0\n", + " 3.524328e+04\n", + " 7.053936e+04\n", " \n", " \n", " duration\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -2.892304\n", - " 1.941783\n", - " 4.908591e-17\n", + " -2.809886\n", + " 1.846421\n", + " 6.633336e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " -0.800767\n", - " 3.760395\n", + " -0.976752\n", + " 4.099032\n", " 1.000000\n", - " -0.800767\n", - " 3.760395\n", - " 1.797673e-12\n", - " 36623.0\n", - " -2.932649e+04\n", - " 1.377170e+05\n", + " -0.976752\n", + " 4.099032\n", + " 2.430056e-12\n", + " 36634.0\n", + " -3.578234e+04\n", + " 1.501639e+05\n", " \n", " \n", " campaign\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.570990\n", - " 19.718918\n", - " 6.208494e-17\n", + " -0.570738\n", + " 19.507670\n", + " 4.034309e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 4.928998\n", - " 43.121657\n", + " 4.880232\n", + " 41.921269\n", " 1.000000\n", - " 4.928998\n", - " 43.121657\n", - " 2.273737e-12\n", - " 36623.0\n", - " 1.805147e+05\n", - " 1.579244e+06\n", + " 4.880232\n", + " 41.921269\n", + " 1.477929e-12\n", + " 36634.0\n", + " 1.787824e+05\n", + " 1.535744e+06\n", " \n", " \n", " pdays\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.413789\n", - " 8.280388\n", - " 2.483398e-17\n", + " -0.412237\n", + " 8.263154\n", + " -6.206630e-18\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 2.604408\n", - " 9.919031\n", + " 2.619630\n", + " 10.019985\n", " 1.000000\n", - " 2.604408\n", - " 9.919031\n", - " 9.094947e-13\n", - " 36623.0\n", - " 9.538123e+04\n", - " 3.632647e+05\n", + " 2.619630\n", + " 10.019985\n", + " -2.273737e-13\n", + " 36634.0\n", + " 9.596753e+04\n", + " 3.670721e+05\n", " \n", " \n", " previous\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.244781\n", - " 113.983016\n", - " 1.862548e-17\n", + " -0.243872\n", + " 113.389007\n", + " -4.034309e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 44.589497\n", - " 4659.092137\n", + " 43.957189\n", + " 4561.467798\n", " 1.000000\n", - " 44.589497\n", - " 4659.092137\n", - " 6.821210e-13\n", - " 36623.0\n", - " 1.633001e+06\n", - " 1.706299e+08\n", + " 43.957189\n", + " 4561.467798\n", + " -1.477929e-12\n", + " 36634.0\n", + " 1.610328e+06\n", + " 1.671048e+08\n", " \n", " \n", " y\n", " int64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", " 0.000000\n", " 1.000000\n", - " 1.165388e-01\n", - " 0.102960\n", - " 0.320874\n", - " ...\n", - " 0.116539\n", - " 0.116539\n", - " 0.116539\n", - " 0.102957\n", - " 0.078960\n", - " 0.071157\n", - " 4.268000e+03\n", - " 4268.0\n", - " 4.268000e+03\n", - " 4.268000e+03\n", + " 1.174592e-01\n", + " 0.103665\n", + " 0.321971\n", + " ...\n", + " 0.117459\n", + " 0.117459\n", + " 0.117459\n", + " 0.103663\n", + " 0.079310\n", + " 0.071425\n", + " 4.303000e+03\n", + " 4303.0\n", + " 4.303000e+03\n", + " 4.303000e+03\n", " \n", " \n", " contact_cellular\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -1.357800\n", - " 0.736485\n", - " 7.450193e-17\n", + " -1.359335\n", + " 0.735654\n", + " -4.965304e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " -0.621315\n", - " 1.386032\n", + " -0.623682\n", + " 1.388979\n", " 1.000000\n", - " -0.621315\n", - " 1.386032\n", - " 2.728484e-12\n", - " 36623.0\n", - " -2.275441e+04\n", - " 5.076065e+04\n", + " -0.623682\n", + " 1.388979\n", + " -1.818989e-12\n", + " 36634.0\n", + " -2.284795e+04\n", + " 5.088384e+04\n", " \n", " \n", " contact_telephone\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.262983\n", - " 3.802534\n", - " 2.483398e-17\n", + " -0.261573\n", + " 3.823024\n", + " 5.896298e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 3.539552\n", - " 13.528425\n", + " 3.561451\n", + " 13.683936\n", " 1.000000\n", - " 3.539552\n", - " 13.528425\n", - " 9.094947e-13\n", - " 36623.0\n", - " 1.296290e+05\n", - " 4.954515e+05\n", + " 3.561451\n", + " 13.683936\n", + " 2.160050e-12\n", + " 36634.0\n", + " 1.304702e+05\n", + " 5.012973e+05\n", " \n", " \n", " contact_unknown\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.634414\n", - " 1.576257\n", - " -1.241699e-16\n", + " -0.634619\n", + " 1.575748\n", + " -1.241326e-16\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 0.941843\n", - " 1.887068\n", + " 0.941129\n", + " 1.885723\n", " 1.000000\n", - " 0.941843\n", - " 1.887068\n", + " 0.941129\n", + " 1.885723\n", " -4.547474e-12\n", - " 36623.0\n", - " 3.449310e+04\n", - " 6.911008e+04\n", + " 36634.0\n", + " 3.447731e+04\n", + " 6.908158e+04\n", " \n", " \n", " month_1.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.180053\n", - " 5.553925\n", - " 6.208494e-18\n", + " -0.178158\n", + " 5.613000\n", + " -2.482652e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 5.373873\n", - " 29.878506\n", + " 5.434842\n", + " 30.537508\n", " 1.000000\n", - " 5.373873\n", - " 29.878506\n", - " 2.273737e-13\n", - " 36623.0\n", - " 1.968073e+05\n", - " 1.094241e+06\n", + " 5.434842\n", + " 30.537508\n", + " -9.094947e-13\n", + " 36634.0\n", + " 1.991000e+05\n", + " 1.118711e+06\n", " \n", " \n", " month_2.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.252010\n", - " 3.968094\n", - " 3.104247e-18\n", + " -0.249263\n", + " 4.011823\n", + " 8.378950e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 3.716084\n", - " 14.809280\n", + " 3.762560\n", + " 15.156859\n", " 1.000000\n", - " 3.716084\n", - " 14.809280\n", - " 1.136868e-13\n", - " 36623.0\n", - " 1.360941e+05\n", - " 5.423602e+05\n", + " 3.762560\n", + " 15.156859\n", + " 3.069545e-12\n", + " 36634.0\n", + " 1.378376e+05\n", + " 5.552564e+05\n", " \n", " \n", " month_3.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.102803\n", - " 9.727354\n", - " -2.483398e-17\n", + " -0.103058\n", + " 9.703260\n", + " -5.275635e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 9.624551\n", - " 93.631978\n", + " 9.600201\n", + " 93.163868\n", " 1.000000\n", - " 9.624551\n", - " 93.631978\n", - " -9.094947e-13\n", - " 36623.0\n", - " 3.524799e+05\n", - " 3.429084e+06\n", + " 9.600201\n", + " 93.163868\n", + " -1.932676e-12\n", + " 36634.0\n", + " 3.516938e+05\n", + " 3.412965e+06\n", " \n", " \n", " month_4.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.262091\n", - " 3.815465\n", - " -2.793822e-17\n", + " -0.265247\n", + " 3.770074\n", + " 1.861989e-16\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 3.553374\n", - " 13.626466\n", + " 3.504827\n", + " 13.283811\n", " 1.000000\n", - " 3.553374\n", - " 13.626466\n", - " -1.023182e-12\n", - " 36623.0\n", - " 1.301352e+05\n", - " 4.990421e+05\n", + " 3.504827\n", + " 13.283811\n", + " 6.821210e-12\n", + " 36634.0\n", + " 1.283958e+05\n", + " 4.866391e+05\n", " \n", " \n", " month_5.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.659087\n", - " 1.517250\n", - " 1.614208e-16\n", + " -0.658775\n", + " 1.517969\n", + " -1.241326e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 0.858162\n", - " 1.736443\n", + " 0.859194\n", + " 1.738215\n", " 1.000000\n", - " 0.858162\n", - " 1.736443\n", - " 5.911716e-12\n", - " 36623.0\n", - " 3.142848e+04\n", - " 6.359374e+04\n", + " 0.859194\n", + " 1.738215\n", + " -4.547474e-13\n", + " 36634.0\n", + " 3.147572e+04\n", + " 6.367775e+04\n", " \n", " \n", " month_6.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.364928\n", - " 2.740267\n", - " 1.986718e-16\n", + " -0.366401\n", + " 2.729249\n", + " -7.447956e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 2.375339\n", - " 6.642234\n", + " 2.362848\n", + " 6.583051\n", " 1.000000\n", - " 2.375339\n", - " 6.642234\n", - " 7.275958e-12\n", - " 36623.0\n", - " 8.699203e+04\n", - " 2.432585e+05\n", + " 2.362848\n", + " 6.583051\n", + " -2.728484e-12\n", + " 36634.0\n", + " 8.656057e+04\n", + " 2.411635e+05\n", " \n", " \n", " month_7.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.424911\n", - " 2.353434\n", - " 0.000000e+00\n", + " -0.425328\n", + " 2.351127\n", + " -1.551657e-16\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 1.928524\n", - " 4.719203\n", + " 1.925799\n", + " 4.708701\n", " 1.000000\n", - " 1.928524\n", - " 4.719203\n", - " 0.000000e+00\n", - " 36623.0\n", - " 7.062832e+04\n", - " 1.728314e+05\n", + " 1.925799\n", + " 4.708701\n", + " -5.684342e-12\n", + " 36634.0\n", + " 7.054972e+04\n", + " 1.724986e+05\n", " \n", " \n", " month_8.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.400668\n", - " 2.495832\n", - " 2.980077e-16\n", + " -0.402386\n", + " 2.485176\n", + " -1.861989e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 2.095163\n", - " 5.389710\n", + " 2.082791\n", + " 5.338016\n", " 1.000000\n", - " 2.095163\n", - " 5.389710\n", - " 1.091394e-11\n", - " 36623.0\n", - " 7.673117e+04\n", - " 1.973873e+05\n", + " 2.082791\n", + " 5.338016\n", + " -6.821210e-13\n", + " 36634.0\n", + " 7.630095e+04\n", + " 1.955529e+05\n", " \n", " \n", " month_9.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.114876\n", - " 8.705043\n", - " 4.190733e-17\n", + " -0.112144\n", + " 8.917078\n", + " -1.706823e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 8.590167\n", - " 74.790974\n", + " 8.804934\n", + " 78.526862\n", " 1.000000\n", - " 8.590167\n", - " 74.790974\n", - " 1.534772e-12\n", - " 36623.0\n", - " 3.145977e+05\n", - " 2.739070e+06\n", + " 8.804934\n", + " 78.526862\n", + " -6.252776e-13\n", + " 36634.0\n", + " 3.225600e+05\n", + " 2.876753e+06\n", " \n", " \n", " month_10.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.129930\n", - " 7.696441\n", - " 4.345946e-17\n", + " -0.127389\n", + " 7.849982\n", + " -9.309945e-18\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 7.566510\n", - " 58.252079\n", + " 7.722593\n", + " 60.638450\n", " 1.000000\n", - " 7.566510\n", - " 58.252079\n", - " 1.591616e-12\n", - " 36623.0\n", - " 2.771083e+05\n", - " 2.133366e+06\n", + " 7.722593\n", + " 60.638450\n", + " -3.410605e-13\n", + " 36634.0\n", + " 2.829095e+05\n", + " 2.221429e+06\n", " \n", " \n", " month_11.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.310534\n", - " 3.220260\n", - " -4.966795e-17\n", + " -0.310483\n", + " 3.220790\n", + " 5.585967e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 2.909726\n", - " 9.466503\n", + " 2.910307\n", + " 9.469886\n", " 1.000000\n", - " 2.909726\n", - " 9.466503\n", - " -1.818989e-12\n", - " 36623.0\n", - " 1.065629e+05\n", - " 3.466917e+05\n", + " 2.910307\n", + " 9.469886\n", + " 2.046363e-12\n", + " 36634.0\n", + " 1.066162e+05\n", + " 3.469198e+05\n", " \n", " \n", " month_12.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.070476\n", - " 14.189328\n", - " 9.312741e-18\n", + " -0.068280\n", + " 14.645618\n", + " 7.758287e-18\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 14.118852\n", - " 200.341983\n", + " 14.577338\n", + " 213.498780\n", " 1.000000\n", - " 14.118852\n", - " 200.341983\n", - " 3.410605e-13\n", - " 36623.0\n", - " 5.170747e+05\n", - " 7.337124e+06\n", + " 14.577338\n", + " 213.498780\n", + " 2.842171e-13\n", + " 36634.0\n", + " 5.340262e+05\n", + " 7.821314e+06\n", " \n", " \n", " day_1.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.085536\n", - " 11.691041\n", - " -2.172973e-17\n", + " -0.086489\n", + " 11.562172\n", + " 3.103315e-18\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 11.605506\n", - " 135.687767\n", + " 11.475683\n", + " 132.691304\n", " 1.000000\n", - " 11.605506\n", - " 135.687767\n", - " -7.958079e-13\n", - " 36623.0\n", - " 4.250284e+05\n", - " 4.969293e+06\n", + " 11.475683\n", + " 132.691304\n", + " 1.136868e-13\n", + " 36634.0\n", + " 4.204002e+05\n", + " 4.861013e+06\n", " \n", " \n", " day_2.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.171552\n", - " 5.829150\n", - " -7.450193e-17\n", + " -0.171018\n", + " 5.847321\n", + " -5.585967e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 5.657598\n", - " 33.008418\n", + " 5.676302\n", + " 33.220410\n", " 1.000000\n", - " 5.657598\n", - " 33.008418\n", - " -2.728484e-12\n", - " 36623.0\n", - " 2.071982e+05\n", - " 1.208867e+06\n", + " 5.676302\n", + " 33.220410\n", + " -2.046363e-12\n", + " 36634.0\n", + " 2.079457e+05\n", + " 1.216996e+06\n", " \n", " \n", " day_3.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.156817\n", - " 6.376864\n", - " -4.966795e-17\n", + " -0.158339\n", + " 6.315549\n", + " 0.000000e+00\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 6.220047\n", - " 39.688983\n", + " 6.157210\n", + " 38.911232\n", " 1.000000\n", - " 6.220047\n", - " 39.688983\n", - " -1.818989e-12\n", - " 36623.0\n", - " 2.277968e+05\n", - " 1.453530e+06\n", + " 6.157210\n", + " 38.911232\n", + " 0.000000e+00\n", + " 36634.0\n", + " 2.255632e+05\n", + " 1.425474e+06\n", " \n", " \n", " day_4.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.181101\n", - " 5.521785\n", - " -3.414672e-17\n", + " -0.180429\n", + " 5.542360\n", + " -3.258481e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 5.340684\n", - " 29.522909\n", + " 5.361931\n", + " 29.750303\n", " 1.000000\n", - " 5.340684\n", - " 29.522909\n", - " -1.250555e-12\n", - " 36623.0\n", - " 1.955919e+05\n", - " 1.081218e+06\n", + " 5.361931\n", + " 29.750303\n", + " -1.193712e-12\n", + " 36634.0\n", + " 1.964290e+05\n", + " 1.089873e+06\n", " \n", " \n", " day_5.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.211141\n", - " 4.736161\n", - " 9.312741e-17\n", + " -0.211179\n", + " 4.735322\n", + " -7.447956e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 4.525020\n", - " 21.475803\n", + " 4.524143\n", + " 21.467870\n", " 1.000000\n", - " 4.525020\n", - " 21.475803\n", - " 3.410605e-12\n", - " 36623.0\n", - " 1.657198e+05\n", - " 7.865083e+05\n", + " 4.524143\n", + " 21.467870\n", + " -2.728484e-12\n", + " 36634.0\n", + " 1.657375e+05\n", + " 7.864540e+05\n", " \n", " \n", " day_6.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.211212\n", - " 4.734579\n", - " -6.829343e-17\n", + " -0.212024\n", + " 4.716452\n", + " 6.827293e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 4.523367\n", - " 21.460851\n", + " 4.504429\n", + " 21.289878\n", " 1.000000\n", - " 4.523367\n", - " 21.460851\n", - " -2.501110e-12\n", - " 36623.0\n", - " 1.656593e+05\n", - " 7.859607e+05\n", + " 4.504429\n", + " 21.289878\n", + " 2.501110e-12\n", + " 36634.0\n", + " 1.650152e+05\n", + " 7.799334e+05\n", " \n", " \n", " day_7.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.206154\n", - " 4.850753\n", - " 2.793822e-17\n", + " -0.203808\n", + " 4.906588\n", + " -4.344641e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 4.644600\n", - " 22.572305\n", + " 4.702780\n", + " 23.116144\n", " 1.000000\n", - " 4.644600\n", - " 22.572305\n", - " 1.023182e-12\n", - " 36623.0\n", - " 1.700992e+05\n", - " 8.266655e+05\n", + " 4.702780\n", + " 23.116144\n", + " -1.591616e-12\n", + " 36634.0\n", + " 1.722817e+05\n", + " 8.468368e+05\n", " \n", " \n", " day_8.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.204710\n", - " 4.884963\n", - " 5.587645e-17\n", + " -0.204967\n", + " 4.878830\n", + " -8.999613e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 4.680253\n", - " 22.904771\n", + " 4.673862\n", + " 22.844991\n", " 1.000000\n", - " 4.680253\n", - " 22.904771\n", - " 2.046363e-12\n", - " 36623.0\n", - " 1.714049e+05\n", - " 8.388414e+05\n", + " 4.673862\n", + " 22.844991\n", + " -3.296918e-12\n", + " 36634.0\n", + " 1.712223e+05\n", + " 8.369034e+05\n", " \n", " \n", " day_9.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.187516\n", - " 5.332890\n", - " 6.829343e-17\n", + " -0.189814\n", + " 5.268311\n", + " 1.861989e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 5.145374\n", - " 27.474873\n", + " 5.078497\n", + " 26.791131\n", " 1.000000\n", - " 5.145374\n", - " 27.474873\n", - " 2.501110e-12\n", - " 36623.0\n", - " 1.884390e+05\n", - " 1.006212e+06\n", + " 5.078497\n", + " 26.791131\n", + " 6.821210e-13\n", + " 36634.0\n", + " 1.860457e+05\n", + " 9.814663e+05\n", " \n", " \n", " day_10.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.108613\n", - " 9.206962\n", - " 7.760617e-18\n", + " -0.109110\n", + " 9.165025\n", + " -4.034309e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 9.098349\n", - " 83.779947\n", + " 9.055914\n", + " 83.009585\n", " 1.000000\n", - " 9.098349\n", - " 83.779947\n", - " 2.842171e-13\n", - " 36623.0\n", - " 3.332088e+05\n", - " 3.068273e+06\n", + " 9.055914\n", + " 83.009585\n", + " -1.477929e-12\n", + " 36634.0\n", + " 3.317544e+05\n", + " 3.040973e+06\n", " \n", " \n", " day_11.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.185556\n", - " 5.389200\n", - " -1.241699e-17\n", + " -0.183073\n", + " 5.462298\n", + " -4.344641e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 5.203644\n", - " 28.077909\n", + " 5.279225\n", + " 28.870216\n", " 1.000000\n", - " 5.203644\n", - " 28.077909\n", - " -4.547474e-13\n", - " 36623.0\n", - " 1.905730e+05\n", - " 1.028297e+06\n", + " 5.279225\n", + " 28.870216\n", + " -1.591616e-12\n", + " 36634.0\n", + " 1.933991e+05\n", + " 1.057631e+06\n", " \n", " \n", " day_12.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.190229\n", - " 5.256814\n", - " -1.862548e-17\n", + " -0.191352\n", + " 5.225961\n", + " -9.309945e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 5.066584\n", - " 26.670276\n", + " 5.034608\n", + " 26.347280\n", " 1.000000\n", - " 5.066584\n", - " 26.670276\n", - " -6.821210e-13\n", - " 36623.0\n", - " 1.855535e+05\n", - " 9.767455e+05\n", + " 5.034608\n", + " 26.347280\n", + " -3.410605e-12\n", + " 36634.0\n", + " 1.844378e+05\n", + " 9.652063e+05\n", " \n", " \n", " day_13.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.189766\n", - " 5.269635\n", - " -6.518919e-17\n", + " -0.190661\n", + " 5.244897\n", + " 4.034309e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 5.079868\n", - " 26.805061\n", + " 5.054236\n", + " 26.545301\n", " 1.000000\n", - " 5.079868\n", - " 26.805061\n", - " -2.387424e-12\n", - " 36623.0\n", - " 1.860400e+05\n", - " 9.816817e+05\n", + " 5.054236\n", + " 26.545301\n", + " 1.477929e-12\n", + " 36634.0\n", + " 1.851569e+05\n", + " 9.724606e+05\n", " \n", " \n", " day_14.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.206872\n", - " 4.833901\n", - " -2.172973e-17\n", + " -0.206696\n", + " 4.838016\n", + " -1.551657e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 4.627029\n", - " 22.409396\n", + " 4.631319\n", + " 22.449119\n", " 1.000000\n", - " 4.627029\n", - " 22.409396\n", - " -7.958079e-13\n", - " 36623.0\n", - " 1.694557e+05\n", - " 8.206993e+05\n", + " 4.631319\n", + " 22.449119\n", + " -5.684342e-13\n", + " 36634.0\n", + " 1.696638e+05\n", + " 8.224010e+05\n", " \n", " \n", " day_15.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.197060\n", - " 5.074608\n", - " 1.862548e-17\n", + " -0.195980\n", + " 5.102564\n", + " 6.827293e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 4.877548\n", - " 24.790476\n", + " 4.906584\n", + " 25.074570\n", " 1.000000\n", - " 4.877548\n", - " 24.790476\n", - " 6.821210e-13\n", - " 36623.0\n", - " 1.786304e+05\n", - " 9.079016e+05\n", + " 4.906584\n", + " 25.074570\n", + " 2.501110e-12\n", + " 36634.0\n", + " 1.797478e+05\n", + " 9.185818e+05\n", " \n", " \n", " day_16.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.180053\n", - " 5.553925\n", - " 2.793822e-17\n", + " -0.178728\n", + " 5.595097\n", + " 9.309945e-18\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 5.373873\n", - " 29.878506\n", + " 5.416369\n", + " 30.337058\n", " 1.000000\n", - " 5.373873\n", - " 29.878506\n", - " 1.023182e-12\n", - " 36623.0\n", - " 1.968073e+05\n", - " 1.094241e+06\n", + " 5.416369\n", + " 30.337058\n", + " 3.410605e-13\n", + " 36634.0\n", + " 1.984233e+05\n", + " 1.111368e+06\n", " \n", " \n", " day_17.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.212268\n", - " 4.711029\n", - " 4.345946e-17\n", + " -0.213286\n", + " 4.688543\n", + " 8.689282e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 4.498761\n", - " 21.238851\n", + " 4.475257\n", + " 21.027925\n", " 1.000000\n", - " 4.498761\n", - " 21.238851\n", - " 1.591616e-12\n", - " 36623.0\n", - " 1.647581e+05\n", - " 7.778304e+05\n", + " 4.475257\n", + " 21.027925\n", + " 3.183231e-12\n", + " 36634.0\n", + " 1.639466e+05\n", + " 7.703370e+05\n", " \n", " \n", " day_18.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.230787\n", - " 4.332994\n", - " 4.035521e-17\n", + " -0.231079\n", + " 4.327530\n", + " 2.482652e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 4.102207\n", - " 17.828101\n", + " 4.096451\n", + " 17.780915\n", " 1.000000\n", - " 4.102207\n", - " 17.828101\n", - " 1.477929e-12\n", - " 36623.0\n", - " 1.502351e+05\n", - " 6.529185e+05\n", + " 4.096451\n", + " 17.780915\n", + " 9.094947e-13\n", + " 36634.0\n", + " 1.500694e+05\n", + " 6.513860e+05\n", " \n", " \n", " day_19.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.200768\n", - " 4.980865\n", - " 5.587645e-17\n", + " -0.201472\n", + " 4.963478\n", + " -3.723978e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 4.780097\n", - " 23.849328\n", + " 4.762006\n", + " 23.676700\n", " 1.000000\n", - " 4.780097\n", - " 23.849328\n", - " 2.046363e-12\n", - " 36623.0\n", - " 1.750615e+05\n", - " 8.734340e+05\n", + " 4.762006\n", + " 23.676700\n", + " -1.364242e-12\n", + " 36634.0\n", + " 1.744513e+05\n", + " 8.673722e+05\n", " \n", " \n", " day_20.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.256151\n", - " 3.903946\n", - " 1.303784e-16\n", + " -0.256473\n", + " 3.899047\n", + " -4.344641e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 3.647795\n", - " 14.306412\n", + " 3.642574\n", + " 14.268344\n", " 1.000000\n", - " 3.647795\n", - " 14.306412\n", - " 4.774847e-12\n", - " 36623.0\n", - " 1.335932e+05\n", - " 5.239437e+05\n", + " 3.642574\n", + " 14.268344\n", + " -1.591616e-12\n", + " 36634.0\n", + " 1.334420e+05\n", + " 5.227065e+05\n", " \n", " \n", " day_21.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.218308\n", - " 4.580676\n", - " 2.172973e-17\n", + " -0.214333\n", + " 4.665638\n", + " -6.206630e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 4.362368\n", - " 20.030252\n", + " 4.451305\n", + " 20.814118\n", " 1.000000\n", - " 4.362368\n", - " 20.030252\n", - " 7.958079e-13\n", - " 36623.0\n", - " 1.597630e+05\n", - " 7.335679e+05\n", + " 4.451305\n", + " 20.814118\n", + " -2.273737e-12\n", + " 36634.0\n", + " 1.630691e+05\n", + " 7.625044e+05\n", " \n", " \n", " day_22.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.142911\n", - " 6.997368\n", - " -6.208494e-18\n", + " -0.142988\n", + " 6.993574\n", + " -6.206630e-18\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 6.854458\n", - " 47.983589\n", + " 6.850586\n", + " 47.930527\n", " 1.000000\n", - " 6.854458\n", - " 47.983589\n", + " 6.850586\n", + " 47.930527\n", " -2.273737e-13\n", - " 36623.0\n", - " 2.510308e+05\n", - " 1.757303e+06\n", + " 36634.0\n", + " 2.509644e+05\n", + " 1.755887e+06\n", " \n", " \n", " day_23.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.146062\n", - " 6.846401\n", - " -5.587645e-17\n", + " -0.146526\n", + " 6.824707\n", + " -3.103315e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 6.700339\n", - " 45.894537\n", + " 6.678180\n", + " 45.598093\n", " 1.000000\n", - " 6.700339\n", - " 45.894537\n", - " -2.046363e-12\n", - " 36623.0\n", - " 2.453865e+05\n", - " 1.680796e+06\n", + " 6.678180\n", + " 45.598093\n", + " -1.136868e-12\n", + " 36634.0\n", + " 2.446485e+05\n", + " 1.670441e+06\n", " \n", " \n", " day_24.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.098230\n", - " 10.180233\n", - " -2.172973e-17\n", + " -0.100040\n", + " 9.996005\n", + " 2.017155e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 10.082003\n", - " 102.646792\n", + " 9.895965\n", + " 98.930118\n", " 1.000000\n", - " 10.082003\n", - " 102.646792\n", - " -7.958079e-13\n", - " 36623.0\n", - " 3.692332e+05\n", - " 3.759233e+06\n", + " 9.895965\n", + " 98.930118\n", + " 7.389644e-13\n", + " 36634.0\n", + " 3.625288e+05\n", + " 3.624206e+06\n", " \n", " \n", " day_25.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.135052\n", - " 7.404576\n", - " 0.000000e+00\n", + " -0.138142\n", + " 7.238946\n", + " -5.896298e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 7.269524\n", - " 53.845983\n", + " 7.100804\n", + " 51.421415\n", " 1.000000\n", - " 7.269524\n", - " 53.845983\n", - " 0.000000e+00\n", - " 36623.0\n", - " 2.662318e+05\n", - " 1.972001e+06\n", + " 7.100804\n", + " 51.421415\n", + " -2.160050e-12\n", + " 36634.0\n", + " 2.601308e+05\n", + " 1.883772e+06\n", " \n", " \n", " day_26.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.152467\n", - " 6.558813\n", - " -2.483398e-17\n", + " -0.153750\n", + " 6.504045\n", + " -1.861989e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 6.406346\n", - " 42.041275\n", + " 6.350294\n", + " 41.326240\n", " 1.000000\n", - " 6.406346\n", - " 42.041275\n", - " -9.094947e-13\n", - " 36623.0\n", - " 2.346196e+05\n", - " 1.539678e+06\n", + " 6.350294\n", + " 41.326240\n", + " -6.821210e-13\n", + " 36634.0\n", + " 2.326367e+05\n", + " 1.513945e+06\n", " \n", " \n", " day_27.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.158816\n", - " 6.296591\n", - " -2.483398e-17\n", + " -0.160947\n", + " 6.213238\n", + " 2.482652e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 6.137775\n", - " 38.672281\n", + " 6.052291\n", + " 37.630228\n", " 1.000000\n", - " 6.137775\n", - " 38.672281\n", - " -9.094947e-13\n", - " 36623.0\n", - " 2.247837e+05\n", - " 1.416295e+06\n", + " 6.052291\n", + " 37.630228\n", + " 9.094947e-13\n", + " 36634.0\n", + " 2.217196e+05\n", + " 1.378546e+06\n", " \n", " \n", " day_28.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.207087\n", - " 4.828878\n", - " -7.450193e-17\n", + " -0.204243\n", + " 4.896126\n", + " 3.103315e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 4.621790\n", - " 22.360946\n", + " 4.691883\n", + " 23.013767\n", " 1.000000\n", - " 4.621790\n", - " 22.360946\n", - " -2.728484e-12\n", - " 36623.0\n", - " 1.692638e+05\n", - " 8.189249e+05\n", + " 4.691883\n", + " 23.013767\n", + " 1.136868e-12\n", + " 36634.0\n", + " 1.718824e+05\n", + " 8.430863e+05\n", " \n", " \n", " day_29.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.199218\n", - " 5.019632\n", - " -7.450193e-17\n", + " -0.200148\n", + " 4.996313\n", + " -1.861989e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 4.820415\n", - " 24.236397\n", + " 4.796166\n", + " 24.003206\n", " 1.000000\n", - " 4.820415\n", - " 24.236397\n", - " -2.728484e-12\n", - " 36623.0\n", - " 1.765380e+05\n", - " 8.876096e+05\n", + " 4.796166\n", + " 24.003206\n", + " -6.821210e-13\n", + " 36634.0\n", + " 1.757027e+05\n", + " 8.793334e+05\n", " \n", " \n", " day_30.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.190383\n", - " 5.252560\n", - " 0.000000e+00\n", + " -0.188032\n", + " 5.318249\n", + " 3.103315e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 5.062176\n", - " 26.625629\n", + " 5.130217\n", + " 27.319129\n", " 1.000000\n", - " 5.062176\n", - " 26.625629\n", - " 0.000000e+00\n", - " 36623.0\n", - " 1.853921e+05\n", - " 9.751104e+05\n", + " 5.130217\n", + " 27.319129\n", + " 1.136868e-12\n", + " 36634.0\n", + " 1.879404e+05\n", + " 1.000809e+06\n", " \n", " \n", " day_31.0\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.119779\n", - " 8.348699\n", - " -3.104247e-18\n", + " -0.120812\n", + " 8.277332\n", + " 3.103315e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 8.228920\n", - " 68.715119\n", + " 8.156521\n", + " 67.528827\n", " 1.000000\n", - " 8.228920\n", - " 68.715119\n", - " -1.136868e-13\n", - " 36623.0\n", - " 3.013677e+05\n", - " 2.516554e+06\n", + " 8.156521\n", + " 67.528827\n", + " 1.136868e-12\n", + " 36634.0\n", + " 2.988060e+05\n", + " 2.473851e+06\n", " \n", " \n", " poutcome_failure\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.349768\n", - " 2.859038\n", - " 3.104247e-17\n", + " -0.350446\n", + " 2.853507\n", + " -3.103315e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 2.509270\n", - " 7.296436\n", + " 2.503061\n", + " 7.265313\n", " 1.000000\n", - " 2.509270\n", - " 7.296436\n", - " 1.136868e-12\n", - " 36623.0\n", - " 9.189699e+04\n", - " 2.672174e+05\n", + " 2.503061\n", + " 7.265313\n", + " -1.136868e-12\n", + " 36634.0\n", + " 9.169713e+04\n", + " 2.661575e+05\n", " \n", " \n", " poutcome_other\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.207302\n", - " 4.823869\n", - " 6.208494e-18\n", + " -0.204025\n", + " 4.901349\n", + " -7.447956e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 4.616567\n", - " 22.312689\n", + " 4.697324\n", + " 23.064850\n", " 1.000000\n", - " 4.616567\n", - " 22.312689\n", - " 2.273737e-13\n", - " 36623.0\n", - " 1.690725e+05\n", - " 8.171576e+05\n", + " 4.697324\n", + " 23.064850\n", + " -2.728484e-12\n", + " 36634.0\n", + " 1.720818e+05\n", + " 8.449577e+05\n", " \n", " \n", " poutcome_success\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -0.187750\n", - " 5.326245\n", - " -1.862548e-17\n", + " -0.187330\n", + " 5.338162\n", + " -3.413646e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " 5.138495\n", - " 27.404135\n", + " 5.150832\n", + " 27.531067\n", " 1.000000\n", - " 5.138495\n", - " 27.404135\n", - " -6.821210e-13\n", - " 36623.0\n", - " 1.881871e+05\n", - " 1.003622e+06\n", + " 5.150832\n", + " 27.531067\n", + " -1.250555e-12\n", + " 36634.0\n", + " 1.886956e+05\n", + " 1.008573e+06\n", " \n", " \n", " poutcome_unknown\n", " float64\n", - " 36623\n", - " 36623\n", + " 36634\n", + " 36634\n", " 0\n", " 0.0\n", - " -2.104100\n", - " 0.475263\n", - " 1.490039e-16\n", + " -2.111202\n", + " 0.473664\n", + " -1.241326e-17\n", " 1.000027\n", " 1.000014\n", " ...\n", " 1.000000\n", - " -1.628838\n", - " 3.653112\n", + " -1.637538\n", + " 3.681530\n", " 1.000000\n", - " -1.628838\n", - " 3.653112\n", - " 5.456968e-12\n", - " 36623.0\n", - " -5.965292e+04\n", - " 1.337879e+05\n", + " -1.637538\n", + " 3.681530\n", + " -4.547474e-13\n", + " 36634.0\n", + " -5.998956e+04\n", + " 1.348692e+05\n", " \n", " \n", "\n", @@ -4448,472 +4487,472 @@ ], "text/plain": [ " datatype total_count count(non-NA count) \\\n", - "age float64 36623 36623 \n", - "education float64 36623 36623 \n", - "default float64 36623 36623 \n", - "balance float64 36623 36623 \n", - "housing float64 36623 36623 \n", - "loan float64 36623 36623 \n", - "job_admin. float64 36623 36623 \n", - "job_blue-collar float64 36623 36623 \n", - "job_entrepreneur float64 36623 36623 \n", - "job_housemaid float64 36623 36623 \n", - "job_management float64 36623 36623 \n", - "job_retired float64 36623 36623 \n", - "job_self-employed float64 36623 36623 \n", - "job_services float64 36623 36623 \n", - "job_student float64 36623 36623 \n", - "job_technician float64 36623 36623 \n", - "job_unemployed float64 36623 36623 \n", - "job_unknown float64 36623 36623 \n", - "marital_divorced float64 36623 36623 \n", - "marital_married float64 36623 36623 \n", - "marital_single float64 36623 36623 \n", - "duration float64 36623 36623 \n", - "campaign float64 36623 36623 \n", - "pdays float64 36623 36623 \n", - "previous float64 36623 36623 \n", - "y int64 36623 36623 \n", - "contact_cellular float64 36623 36623 \n", - "contact_telephone float64 36623 36623 \n", - "contact_unknown float64 36623 36623 \n", - "month_1.0 float64 36623 36623 \n", - "month_2.0 float64 36623 36623 \n", - "month_3.0 float64 36623 36623 \n", - "month_4.0 float64 36623 36623 \n", - "month_5.0 float64 36623 36623 \n", - "month_6.0 float64 36623 36623 \n", - "month_7.0 float64 36623 36623 \n", - "month_8.0 float64 36623 36623 \n", - "month_9.0 float64 36623 36623 \n", - "month_10.0 float64 36623 36623 \n", - "month_11.0 float64 36623 36623 \n", - "month_12.0 float64 36623 36623 \n", - "day_1.0 float64 36623 36623 \n", - "day_2.0 float64 36623 36623 \n", - "day_3.0 float64 36623 36623 \n", - "day_4.0 float64 36623 36623 \n", - "day_5.0 float64 36623 36623 \n", - "day_6.0 float64 36623 36623 \n", - "day_7.0 float64 36623 36623 \n", - "day_8.0 float64 36623 36623 \n", - "day_9.0 float64 36623 36623 \n", - "day_10.0 float64 36623 36623 \n", - "day_11.0 float64 36623 36623 \n", - "day_12.0 float64 36623 36623 \n", - "day_13.0 float64 36623 36623 \n", - "day_14.0 float64 36623 36623 \n", - "day_15.0 float64 36623 36623 \n", - "day_16.0 float64 36623 36623 \n", - "day_17.0 float64 36623 36623 \n", - "day_18.0 float64 36623 36623 \n", - "day_19.0 float64 36623 36623 \n", - "day_20.0 float64 36623 36623 \n", - "day_21.0 float64 36623 36623 \n", - "day_22.0 float64 36623 36623 \n", - "day_23.0 float64 36623 36623 \n", - "day_24.0 float64 36623 36623 \n", - "day_25.0 float64 36623 36623 \n", - "day_26.0 float64 36623 36623 \n", - "day_27.0 float64 36623 36623 \n", - "day_28.0 float64 36623 36623 \n", - "day_29.0 float64 36623 36623 \n", - "day_30.0 float64 36623 36623 \n", - "day_31.0 float64 36623 36623 \n", - "poutcome_failure float64 36623 36623 \n", - "poutcome_other float64 36623 36623 \n", - "poutcome_success float64 36623 36623 \n", - "poutcome_unknown float64 36623 36623 \n", + "age float64 36634 36634 \n", + "education float64 36634 36634 \n", + "default float64 36634 36634 \n", + "balance float64 36634 36634 \n", + "housing float64 36634 36634 \n", + "loan float64 36634 36634 \n", + "job_admin. float64 36634 36634 \n", + "job_blue-collar float64 36634 36634 \n", + "job_entrepreneur float64 36634 36634 \n", + "job_housemaid float64 36634 36634 \n", + "job_management float64 36634 36634 \n", + "job_retired float64 36634 36634 \n", + "job_self-employed float64 36634 36634 \n", + "job_services float64 36634 36634 \n", + "job_student float64 36634 36634 \n", + "job_technician float64 36634 36634 \n", + "job_unemployed float64 36634 36634 \n", + "job_unknown float64 36634 36634 \n", + "marital_divorced float64 36634 36634 \n", + "marital_married float64 36634 36634 \n", + "marital_single float64 36634 36634 \n", + "duration float64 36634 36634 \n", + "campaign float64 36634 36634 \n", + "pdays float64 36634 36634 \n", + "previous float64 36634 36634 \n", + "y int64 36634 36634 \n", + "contact_cellular float64 36634 36634 \n", + "contact_telephone float64 36634 36634 \n", + "contact_unknown float64 36634 36634 \n", + "month_1.0 float64 36634 36634 \n", + "month_2.0 float64 36634 36634 \n", + "month_3.0 float64 36634 36634 \n", + "month_4.0 float64 36634 36634 \n", + "month_5.0 float64 36634 36634 \n", + "month_6.0 float64 36634 36634 \n", + "month_7.0 float64 36634 36634 \n", + "month_8.0 float64 36634 36634 \n", + "month_9.0 float64 36634 36634 \n", + "month_10.0 float64 36634 36634 \n", + "month_11.0 float64 36634 36634 \n", + "month_12.0 float64 36634 36634 \n", + "day_1.0 float64 36634 36634 \n", + "day_2.0 float64 36634 36634 \n", + "day_3.0 float64 36634 36634 \n", + "day_4.0 float64 36634 36634 \n", + "day_5.0 float64 36634 36634 \n", + "day_6.0 float64 36634 36634 \n", + "day_7.0 float64 36634 36634 \n", + "day_8.0 float64 36634 36634 \n", + "day_9.0 float64 36634 36634 \n", + "day_10.0 float64 36634 36634 \n", + "day_11.0 float64 36634 36634 \n", + "day_12.0 float64 36634 36634 \n", + "day_13.0 float64 36634 36634 \n", + "day_14.0 float64 36634 36634 \n", + "day_15.0 float64 36634 36634 \n", + "day_16.0 float64 36634 36634 \n", + "day_17.0 float64 36634 36634 \n", + "day_18.0 float64 36634 36634 \n", + "day_19.0 float64 36634 36634 \n", + "day_20.0 float64 36634 36634 \n", + "day_21.0 float64 36634 36634 \n", + "day_22.0 float64 36634 36634 \n", + "day_23.0 float64 36634 36634 \n", + "day_24.0 float64 36634 36634 \n", + "day_25.0 float64 36634 36634 \n", + "day_26.0 float64 36634 36634 \n", + "day_27.0 float64 36634 36634 \n", + "day_28.0 float64 36634 36634 \n", + "day_29.0 float64 36634 36634 \n", + "day_30.0 float64 36634 36634 \n", + "day_31.0 float64 36634 36634 \n", + "poutcome_failure float64 36634 36634 \n", + "poutcome_other float64 36634 36634 \n", + "poutcome_success float64 36634 36634 \n", + "poutcome_unknown float64 36634 36634 \n", "\n", " count_na(NA count) na_ratio min max \\\n", - "age 0 0.0 -2.159100 5.077423 \n", - "education 0 0.0 -1.750166 1.317305 \n", - "default 0 0.0 -0.135366 7.387394 \n", - "balance 0 0.0 -3.062156 32.883499 \n", - "housing 0 0.0 -1.115928 0.896115 \n", - "loan 0 0.0 -0.438311 2.281487 \n", - "job_admin. 0 0.0 -0.359042 2.785192 \n", - "job_blue-collar 0 0.0 -0.523470 1.910330 \n", - "job_entrepreneur 0 0.0 -0.184134 5.430815 \n", - "job_housemaid 0 0.0 -0.167632 5.965435 \n", - "job_management 0 0.0 -0.513253 1.948358 \n", - "job_retired 0 0.0 -0.230656 4.335460 \n", - "job_self-employed 0 0.0 -0.191305 5.227241 \n", - "job_services 0 0.0 -0.316887 3.155695 \n", - "job_student 0 0.0 -0.148000 6.756738 \n", - "job_technician 0 0.0 -0.450251 2.220984 \n", - "job_unemployed 0 0.0 -0.171298 5.837765 \n", - "job_unknown 0 0.0 -0.081048 12.338327 \n", - "marital_divorced 0 0.0 -0.359187 2.784065 \n", - "marital_married 0 0.0 -1.231613 0.811943 \n", - "marital_single 0 0.0 -0.628323 1.591538 \n", - "duration 0 0.0 -2.892304 1.941783 \n", - "campaign 0 0.0 -0.570990 19.718918 \n", - "pdays 0 0.0 -0.413789 8.280388 \n", - "previous 0 0.0 -0.244781 113.983016 \n", + "age 0 0.0 -2.162447 5.096416 \n", + "education 0 0.0 -1.749962 1.314552 \n", + "default 0 0.0 -0.135554 7.377133 \n", + "balance 0 0.0 -1.566762 32.087712 \n", + "housing 0 0.0 -1.118638 0.893944 \n", + "loan 0 0.0 -0.436771 2.289530 \n", + "job_admin. 0 0.0 -0.360094 2.777051 \n", + "job_blue-collar 0 0.0 -0.525105 1.904383 \n", + "job_entrepreneur 0 0.0 -0.185291 5.396911 \n", + "job_housemaid 0 0.0 -0.167950 5.954136 \n", + "job_management 0 0.0 -0.513622 1.946956 \n", + "job_retired 0 0.0 -0.227915 4.387592 \n", + "job_self-employed 0 0.0 -0.189196 5.285528 \n", + "job_services 0 0.0 -0.316887 3.155697 \n", + "job_student 0 0.0 -0.145356 6.879667 \n", + "job_technician 0 0.0 -0.449906 2.222685 \n", + "job_unemployed 0 0.0 -0.172953 5.781907 \n", + "job_unknown 0 0.0 -0.080522 12.418889 \n", + "marital_divorced 0 0.0 -0.362219 2.760760 \n", + "marital_married 0 0.0 -1.226669 0.815216 \n", + "marital_single 0 0.0 -0.628656 1.590694 \n", + "duration 0 0.0 -2.809886 1.846421 \n", + "campaign 0 0.0 -0.570738 19.507670 \n", + "pdays 0 0.0 -0.412237 8.263154 \n", + "previous 0 0.0 -0.243872 113.389007 \n", "y 0 0.0 0.000000 1.000000 \n", - "contact_cellular 0 0.0 -1.357800 0.736485 \n", - "contact_telephone 0 0.0 -0.262983 3.802534 \n", - "contact_unknown 0 0.0 -0.634414 1.576257 \n", - "month_1.0 0 0.0 -0.180053 5.553925 \n", - "month_2.0 0 0.0 -0.252010 3.968094 \n", - "month_3.0 0 0.0 -0.102803 9.727354 \n", - "month_4.0 0 0.0 -0.262091 3.815465 \n", - "month_5.0 0 0.0 -0.659087 1.517250 \n", - "month_6.0 0 0.0 -0.364928 2.740267 \n", - "month_7.0 0 0.0 -0.424911 2.353434 \n", - "month_8.0 0 0.0 -0.400668 2.495832 \n", - "month_9.0 0 0.0 -0.114876 8.705043 \n", - "month_10.0 0 0.0 -0.129930 7.696441 \n", - "month_11.0 0 0.0 -0.310534 3.220260 \n", - "month_12.0 0 0.0 -0.070476 14.189328 \n", - "day_1.0 0 0.0 -0.085536 11.691041 \n", - "day_2.0 0 0.0 -0.171552 5.829150 \n", - "day_3.0 0 0.0 -0.156817 6.376864 \n", - "day_4.0 0 0.0 -0.181101 5.521785 \n", - "day_5.0 0 0.0 -0.211141 4.736161 \n", - "day_6.0 0 0.0 -0.211212 4.734579 \n", - "day_7.0 0 0.0 -0.206154 4.850753 \n", - "day_8.0 0 0.0 -0.204710 4.884963 \n", - "day_9.0 0 0.0 -0.187516 5.332890 \n", - "day_10.0 0 0.0 -0.108613 9.206962 \n", - "day_11.0 0 0.0 -0.185556 5.389200 \n", - "day_12.0 0 0.0 -0.190229 5.256814 \n", - "day_13.0 0 0.0 -0.189766 5.269635 \n", - "day_14.0 0 0.0 -0.206872 4.833901 \n", - "day_15.0 0 0.0 -0.197060 5.074608 \n", - "day_16.0 0 0.0 -0.180053 5.553925 \n", - "day_17.0 0 0.0 -0.212268 4.711029 \n", - "day_18.0 0 0.0 -0.230787 4.332994 \n", - "day_19.0 0 0.0 -0.200768 4.980865 \n", - "day_20.0 0 0.0 -0.256151 3.903946 \n", - "day_21.0 0 0.0 -0.218308 4.580676 \n", - "day_22.0 0 0.0 -0.142911 6.997368 \n", - "day_23.0 0 0.0 -0.146062 6.846401 \n", - "day_24.0 0 0.0 -0.098230 10.180233 \n", - "day_25.0 0 0.0 -0.135052 7.404576 \n", - "day_26.0 0 0.0 -0.152467 6.558813 \n", - "day_27.0 0 0.0 -0.158816 6.296591 \n", - "day_28.0 0 0.0 -0.207087 4.828878 \n", - "day_29.0 0 0.0 -0.199218 5.019632 \n", - "day_30.0 0 0.0 -0.190383 5.252560 \n", - "day_31.0 0 0.0 -0.119779 8.348699 \n", - "poutcome_failure 0 0.0 -0.349768 2.859038 \n", - "poutcome_other 0 0.0 -0.207302 4.823869 \n", - "poutcome_success 0 0.0 -0.187750 5.326245 \n", - "poutcome_unknown 0 0.0 -2.104100 0.475263 \n", + "contact_cellular 0 0.0 -1.359335 0.735654 \n", + "contact_telephone 0 0.0 -0.261573 3.823024 \n", + "contact_unknown 0 0.0 -0.634619 1.575748 \n", + "month_1.0 0 0.0 -0.178158 5.613000 \n", + "month_2.0 0 0.0 -0.249263 4.011823 \n", + "month_3.0 0 0.0 -0.103058 9.703260 \n", + "month_4.0 0 0.0 -0.265247 3.770074 \n", + "month_5.0 0 0.0 -0.658775 1.517969 \n", + "month_6.0 0 0.0 -0.366401 2.729249 \n", + "month_7.0 0 0.0 -0.425328 2.351127 \n", + "month_8.0 0 0.0 -0.402386 2.485176 \n", + "month_9.0 0 0.0 -0.112144 8.917078 \n", + "month_10.0 0 0.0 -0.127389 7.849982 \n", + "month_11.0 0 0.0 -0.310483 3.220790 \n", + "month_12.0 0 0.0 -0.068280 14.645618 \n", + "day_1.0 0 0.0 -0.086489 11.562172 \n", + "day_2.0 0 0.0 -0.171018 5.847321 \n", + "day_3.0 0 0.0 -0.158339 6.315549 \n", + "day_4.0 0 0.0 -0.180429 5.542360 \n", + "day_5.0 0 0.0 -0.211179 4.735322 \n", + "day_6.0 0 0.0 -0.212024 4.716452 \n", + "day_7.0 0 0.0 -0.203808 4.906588 \n", + "day_8.0 0 0.0 -0.204967 4.878830 \n", + "day_9.0 0 0.0 -0.189814 5.268311 \n", + "day_10.0 0 0.0 -0.109110 9.165025 \n", + "day_11.0 0 0.0 -0.183073 5.462298 \n", + "day_12.0 0 0.0 -0.191352 5.225961 \n", + "day_13.0 0 0.0 -0.190661 5.244897 \n", + "day_14.0 0 0.0 -0.206696 4.838016 \n", + "day_15.0 0 0.0 -0.195980 5.102564 \n", + "day_16.0 0 0.0 -0.178728 5.595097 \n", + "day_17.0 0 0.0 -0.213286 4.688543 \n", + "day_18.0 0 0.0 -0.231079 4.327530 \n", + "day_19.0 0 0.0 -0.201472 4.963478 \n", + "day_20.0 0 0.0 -0.256473 3.899047 \n", + "day_21.0 0 0.0 -0.214333 4.665638 \n", + "day_22.0 0 0.0 -0.142988 6.993574 \n", + "day_23.0 0 0.0 -0.146526 6.824707 \n", + "day_24.0 0 0.0 -0.100040 9.996005 \n", + "day_25.0 0 0.0 -0.138142 7.238946 \n", + "day_26.0 0 0.0 -0.153750 6.504045 \n", + "day_27.0 0 0.0 -0.160947 6.213238 \n", + "day_28.0 0 0.0 -0.204243 4.896126 \n", + "day_29.0 0 0.0 -0.200148 4.996313 \n", + "day_30.0 0 0.0 -0.188032 5.318249 \n", + "day_31.0 0 0.0 -0.120812 8.277332 \n", + "poutcome_failure 0 0.0 -0.350446 2.853507 \n", + "poutcome_other 0 0.0 -0.204025 4.901349 \n", + "poutcome_success 0 0.0 -0.187330 5.338162 \n", + "poutcome_unknown 0 0.0 -2.111202 0.473664 \n", "\n", " mean var(variance) std(standard deviation) ... \\\n", - "age 1.490039e-16 1.000027 1.000014 ... \n", - "education -2.235058e-16 1.000027 1.000014 ... \n", - "default -2.483398e-17 1.000027 1.000014 ... \n", - "balance 3.880309e-17 1.000027 1.000014 ... \n", - "housing 2.110888e-16 1.000027 1.000014 ... \n", - "loan 7.062162e-17 1.000027 1.000014 ... \n", - "job_admin. -2.561004e-17 1.000027 1.000014 ... \n", - "job_blue-collar -4.035521e-17 1.000027 1.000014 ... \n", - "job_entrepreneur 1.940154e-18 1.000027 1.000014 ... \n", - "job_housemaid -5.432432e-18 1.000027 1.000014 ... \n", - "job_management -1.179614e-16 1.000027 1.000014 ... \n", - "job_retired -2.172973e-17 1.000027 1.000014 ... \n", - "job_self-employed -5.820463e-17 1.000027 1.000014 ... \n", - "job_services 2.483398e-17 1.000027 1.000014 ... \n", - "job_student -1.241699e-17 1.000027 1.000014 ... \n", - "job_technician 4.074324e-17 1.000027 1.000014 ... \n", - "job_unemployed -1.668533e-17 1.000027 1.000014 ... \n", - "job_unknown -6.984556e-18 1.000027 1.000014 ... \n", - "marital_divorced 1.319305e-17 1.000027 1.000014 ... \n", - "marital_married -2.696815e-17 1.000027 1.000014 ... \n", - "marital_single 1.416313e-16 1.000027 1.000014 ... \n", - "duration 4.908591e-17 1.000027 1.000014 ... \n", - "campaign 6.208494e-17 1.000027 1.000014 ... \n", - "pdays 2.483398e-17 1.000027 1.000014 ... \n", - "previous 1.862548e-17 1.000027 1.000014 ... \n", - "y 1.165388e-01 0.102960 0.320874 ... \n", - "contact_cellular 7.450193e-17 1.000027 1.000014 ... \n", - "contact_telephone 2.483398e-17 1.000027 1.000014 ... \n", - "contact_unknown -1.241699e-16 1.000027 1.000014 ... \n", - "month_1.0 6.208494e-18 1.000027 1.000014 ... \n", - "month_2.0 3.104247e-18 1.000027 1.000014 ... \n", - "month_3.0 -2.483398e-17 1.000027 1.000014 ... \n", - "month_4.0 -2.793822e-17 1.000027 1.000014 ... \n", - "month_5.0 1.614208e-16 1.000027 1.000014 ... \n", - "month_6.0 1.986718e-16 1.000027 1.000014 ... \n", - "month_7.0 0.000000e+00 1.000027 1.000014 ... \n", - "month_8.0 2.980077e-16 1.000027 1.000014 ... \n", - "month_9.0 4.190733e-17 1.000027 1.000014 ... \n", - "month_10.0 4.345946e-17 1.000027 1.000014 ... \n", - "month_11.0 -4.966795e-17 1.000027 1.000014 ... \n", - "month_12.0 9.312741e-18 1.000027 1.000014 ... \n", - "day_1.0 -2.172973e-17 1.000027 1.000014 ... \n", - "day_2.0 -7.450193e-17 1.000027 1.000014 ... \n", - "day_3.0 -4.966795e-17 1.000027 1.000014 ... \n", - "day_4.0 -3.414672e-17 1.000027 1.000014 ... \n", - "day_5.0 9.312741e-17 1.000027 1.000014 ... \n", - "day_6.0 -6.829343e-17 1.000027 1.000014 ... \n", - "day_7.0 2.793822e-17 1.000027 1.000014 ... \n", - "day_8.0 5.587645e-17 1.000027 1.000014 ... \n", - "day_9.0 6.829343e-17 1.000027 1.000014 ... \n", - "day_10.0 7.760617e-18 1.000027 1.000014 ... \n", - "day_11.0 -1.241699e-17 1.000027 1.000014 ... \n", - "day_12.0 -1.862548e-17 1.000027 1.000014 ... \n", - "day_13.0 -6.518919e-17 1.000027 1.000014 ... \n", - "day_14.0 -2.172973e-17 1.000027 1.000014 ... \n", - "day_15.0 1.862548e-17 1.000027 1.000014 ... \n", - "day_16.0 2.793822e-17 1.000027 1.000014 ... \n", - "day_17.0 4.345946e-17 1.000027 1.000014 ... \n", - "day_18.0 4.035521e-17 1.000027 1.000014 ... \n", - "day_19.0 5.587645e-17 1.000027 1.000014 ... \n", - "day_20.0 1.303784e-16 1.000027 1.000014 ... \n", - "day_21.0 2.172973e-17 1.000027 1.000014 ... \n", - "day_22.0 -6.208494e-18 1.000027 1.000014 ... \n", - "day_23.0 -5.587645e-17 1.000027 1.000014 ... \n", - "day_24.0 -2.172973e-17 1.000027 1.000014 ... \n", - "day_25.0 0.000000e+00 1.000027 1.000014 ... \n", - "day_26.0 -2.483398e-17 1.000027 1.000014 ... \n", - "day_27.0 -2.483398e-17 1.000027 1.000014 ... \n", - "day_28.0 -7.450193e-17 1.000027 1.000014 ... \n", - "day_29.0 -7.450193e-17 1.000027 1.000014 ... \n", - "day_30.0 0.000000e+00 1.000027 1.000014 ... \n", - "day_31.0 -3.104247e-18 1.000027 1.000014 ... \n", - "poutcome_failure 3.104247e-17 1.000027 1.000014 ... \n", - "poutcome_other 6.208494e-18 1.000027 1.000014 ... \n", - "poutcome_success -1.862548e-17 1.000027 1.000014 ... \n", - "poutcome_unknown 1.490039e-16 1.000027 1.000014 ... \n", + "age -1.877506e-16 1.000027 1.000014 ... \n", + "education 9.309945e-18 1.000027 1.000014 ... \n", + "default 4.965304e-17 1.000027 1.000014 ... \n", + "balance -1.396492e-17 1.000027 1.000014 ... \n", + "housing -3.103315e-17 1.000027 1.000014 ... \n", + "loan 5.779924e-17 1.000027 1.000014 ... \n", + "job_admin. -2.288695e-17 1.000027 1.000014 ... \n", + "job_blue-collar 3.103315e-17 1.000027 1.000014 ... \n", + "job_entrepreneur -9.697859e-18 1.000027 1.000014 ... \n", + "job_housemaid 5.430801e-17 1.000027 1.000014 ... \n", + "job_management 5.430801e-17 1.000027 1.000014 ... \n", + "job_retired 6.361796e-17 1.000027 1.000014 ... \n", + "job_self-employed -4.267058e-17 1.000027 1.000014 ... \n", + "job_services 6.361796e-17 1.000027 1.000014 ... \n", + "job_student -9.309945e-18 1.000027 1.000014 ... \n", + "job_technician -1.861989e-17 1.000027 1.000014 ... \n", + "job_unemployed 1.474075e-17 1.000027 1.000014 ... \n", + "job_unknown -1.512866e-17 1.000027 1.000014 ... \n", + "marital_divorced 2.754192e-17 1.000027 1.000014 ... \n", + "marital_married -1.334425e-16 1.000027 1.000014 ... \n", + "marital_single 6.012673e-17 1.000027 1.000014 ... \n", + "duration 6.633336e-17 1.000027 1.000014 ... \n", + "campaign 4.034309e-17 1.000027 1.000014 ... \n", + "pdays -6.206630e-18 1.000027 1.000014 ... \n", + "previous -4.034309e-17 1.000027 1.000014 ... \n", + "y 1.174592e-01 0.103665 0.321971 ... \n", + "contact_cellular -4.965304e-17 1.000027 1.000014 ... \n", + "contact_telephone 5.896298e-17 1.000027 1.000014 ... \n", + "contact_unknown -1.241326e-16 1.000027 1.000014 ... \n", + "month_1.0 -2.482652e-17 1.000027 1.000014 ... \n", + "month_2.0 8.378950e-17 1.000027 1.000014 ... \n", + "month_3.0 -5.275635e-17 1.000027 1.000014 ... \n", + "month_4.0 1.861989e-16 1.000027 1.000014 ... \n", + "month_5.0 -1.241326e-17 1.000027 1.000014 ... \n", + "month_6.0 -7.447956e-17 1.000027 1.000014 ... \n", + "month_7.0 -1.551657e-16 1.000027 1.000014 ... \n", + "month_8.0 -1.861989e-17 1.000027 1.000014 ... \n", + "month_9.0 -1.706823e-17 1.000027 1.000014 ... \n", + "month_10.0 -9.309945e-18 1.000027 1.000014 ... \n", + "month_11.0 5.585967e-17 1.000027 1.000014 ... \n", + "month_12.0 7.758287e-18 1.000027 1.000014 ... \n", + "day_1.0 3.103315e-18 1.000027 1.000014 ... \n", + "day_2.0 -5.585967e-17 1.000027 1.000014 ... \n", + "day_3.0 0.000000e+00 1.000027 1.000014 ... \n", + "day_4.0 -3.258481e-17 1.000027 1.000014 ... \n", + "day_5.0 -7.447956e-17 1.000027 1.000014 ... \n", + "day_6.0 6.827293e-17 1.000027 1.000014 ... \n", + "day_7.0 -4.344641e-17 1.000027 1.000014 ... \n", + "day_8.0 -8.999613e-17 1.000027 1.000014 ... \n", + "day_9.0 1.861989e-17 1.000027 1.000014 ... \n", + "day_10.0 -4.034309e-17 1.000027 1.000014 ... \n", + "day_11.0 -4.344641e-17 1.000027 1.000014 ... \n", + "day_12.0 -9.309945e-17 1.000027 1.000014 ... \n", + "day_13.0 4.034309e-17 1.000027 1.000014 ... \n", + "day_14.0 -1.551657e-17 1.000027 1.000014 ... \n", + "day_15.0 6.827293e-17 1.000027 1.000014 ... \n", + "day_16.0 9.309945e-18 1.000027 1.000014 ... \n", + "day_17.0 8.689282e-17 1.000027 1.000014 ... \n", + "day_18.0 2.482652e-17 1.000027 1.000014 ... \n", + "day_19.0 -3.723978e-17 1.000027 1.000014 ... \n", + "day_20.0 -4.344641e-17 1.000027 1.000014 ... \n", + "day_21.0 -6.206630e-17 1.000027 1.000014 ... \n", + "day_22.0 -6.206630e-18 1.000027 1.000014 ... \n", + "day_23.0 -3.103315e-17 1.000027 1.000014 ... \n", + "day_24.0 2.017155e-17 1.000027 1.000014 ... \n", + "day_25.0 -5.896298e-17 1.000027 1.000014 ... \n", + "day_26.0 -1.861989e-17 1.000027 1.000014 ... \n", + "day_27.0 2.482652e-17 1.000027 1.000014 ... \n", + "day_28.0 3.103315e-17 1.000027 1.000014 ... \n", + "day_29.0 -1.861989e-17 1.000027 1.000014 ... \n", + "day_30.0 3.103315e-17 1.000027 1.000014 ... \n", + "day_31.0 3.103315e-17 1.000027 1.000014 ... \n", + "poutcome_failure -3.103315e-17 1.000027 1.000014 ... \n", + "poutcome_other -7.447956e-17 1.000027 1.000014 ... \n", + "poutcome_success -3.413646e-17 1.000027 1.000014 ... \n", + "poutcome_unknown -1.241326e-17 1.000027 1.000014 ... \n", "\n", " moment_2 moment_3 moment_4 central_moment_2 \\\n", - "age 1.000000 0.683627 3.332610 1.000000 \n", - "education 1.000000 -0.150311 2.309413 1.000000 \n", - "default 1.000000 7.252029 53.591920 1.000000 \n", - "balance 1.000000 8.470364 147.645486 1.000000 \n", - "housing 1.000000 -0.219812 1.048317 1.000000 \n", - "loan 1.000000 1.843177 4.397301 1.000000 \n", - "job_admin. 1.000000 2.426150 6.886204 1.000000 \n", - "job_blue-collar 1.000000 1.386860 2.923380 1.000000 \n", - "job_entrepreneur 1.000000 5.246681 28.527661 1.000000 \n", - "job_housemaid 1.000000 5.797803 34.614514 1.000000 \n", - "job_management 1.000000 1.435105 3.059526 1.000000 \n", - "job_retired 1.000000 4.104804 17.849418 1.000000 \n", - "job_self-employed 1.000000 5.035936 26.360650 1.000000 \n", - "job_services 1.000000 2.838807 9.058826 1.000000 \n", - "job_student 1.000000 6.608737 44.675407 1.000000 \n", - "job_technician 1.000000 1.770734 4.135498 1.000000 \n", - "job_unemployed 1.000000 5.666467 33.108845 1.000000 \n", - "job_unknown 1.000000 12.257279 151.240878 1.000000 \n", - "marital_divorced 1.000000 2.424877 6.880031 1.000000 \n", - "marital_married 1.000000 -0.419670 1.176123 1.000000 \n", - "marital_single 1.000000 0.963215 1.927782 1.000000 \n", - "duration 1.000000 -0.800767 3.760395 1.000000 \n", - "campaign 1.000000 4.928998 43.121657 1.000000 \n", - "pdays 1.000000 2.604408 9.919031 1.000000 \n", - "previous 1.000000 44.589497 4659.092137 1.000000 \n", - "y 0.116539 0.116539 0.116539 0.102957 \n", - "contact_cellular 1.000000 -0.621315 1.386032 1.000000 \n", - "contact_telephone 1.000000 3.539552 13.528425 1.000000 \n", - "contact_unknown 1.000000 0.941843 1.887068 1.000000 \n", - "month_1.0 1.000000 5.373873 29.878506 1.000000 \n", - "month_2.0 1.000000 3.716084 14.809280 1.000000 \n", - "month_3.0 1.000000 9.624551 93.631978 1.000000 \n", - "month_4.0 1.000000 3.553374 13.626466 1.000000 \n", - "month_5.0 1.000000 0.858162 1.736443 1.000000 \n", - "month_6.0 1.000000 2.375339 6.642234 1.000000 \n", - "month_7.0 1.000000 1.928524 4.719203 1.000000 \n", - "month_8.0 1.000000 2.095163 5.389710 1.000000 \n", - "month_9.0 1.000000 8.590167 74.790974 1.000000 \n", - "month_10.0 1.000000 7.566510 58.252079 1.000000 \n", - "month_11.0 1.000000 2.909726 9.466503 1.000000 \n", - "month_12.0 1.000000 14.118852 200.341983 1.000000 \n", - "day_1.0 1.000000 11.605506 135.687767 1.000000 \n", - "day_2.0 1.000000 5.657598 33.008418 1.000000 \n", - "day_3.0 1.000000 6.220047 39.688983 1.000000 \n", - "day_4.0 1.000000 5.340684 29.522909 1.000000 \n", - "day_5.0 1.000000 4.525020 21.475803 1.000000 \n", - "day_6.0 1.000000 4.523367 21.460851 1.000000 \n", - "day_7.0 1.000000 4.644600 22.572305 1.000000 \n", - "day_8.0 1.000000 4.680253 22.904771 1.000000 \n", - "day_9.0 1.000000 5.145374 27.474873 1.000000 \n", - "day_10.0 1.000000 9.098349 83.779947 1.000000 \n", - "day_11.0 1.000000 5.203644 28.077909 1.000000 \n", - "day_12.0 1.000000 5.066584 26.670276 1.000000 \n", - "day_13.0 1.000000 5.079868 26.805061 1.000000 \n", - "day_14.0 1.000000 4.627029 22.409396 1.000000 \n", - "day_15.0 1.000000 4.877548 24.790476 1.000000 \n", - "day_16.0 1.000000 5.373873 29.878506 1.000000 \n", - "day_17.0 1.000000 4.498761 21.238851 1.000000 \n", - "day_18.0 1.000000 4.102207 17.828101 1.000000 \n", - "day_19.0 1.000000 4.780097 23.849328 1.000000 \n", - "day_20.0 1.000000 3.647795 14.306412 1.000000 \n", - "day_21.0 1.000000 4.362368 20.030252 1.000000 \n", - "day_22.0 1.000000 6.854458 47.983589 1.000000 \n", - "day_23.0 1.000000 6.700339 45.894537 1.000000 \n", - "day_24.0 1.000000 10.082003 102.646792 1.000000 \n", - "day_25.0 1.000000 7.269524 53.845983 1.000000 \n", - "day_26.0 1.000000 6.406346 42.041275 1.000000 \n", - "day_27.0 1.000000 6.137775 38.672281 1.000000 \n", - "day_28.0 1.000000 4.621790 22.360946 1.000000 \n", - "day_29.0 1.000000 4.820415 24.236397 1.000000 \n", - "day_30.0 1.000000 5.062176 26.625629 1.000000 \n", - "day_31.0 1.000000 8.228920 68.715119 1.000000 \n", - "poutcome_failure 1.000000 2.509270 7.296436 1.000000 \n", - "poutcome_other 1.000000 4.616567 22.312689 1.000000 \n", - "poutcome_success 1.000000 5.138495 27.404135 1.000000 \n", - "poutcome_unknown 1.000000 -1.628838 3.653112 1.000000 \n", + "age 1.000000 0.682366 3.328235 1.000000 \n", + "education 1.000000 -0.152303 2.305096 1.000000 \n", + "default 1.000000 7.241579 53.440463 1.000000 \n", + "balance 1.000000 7.967780 128.947991 1.000000 \n", + "housing 1.000000 -0.224695 1.050488 1.000000 \n", + "loan 1.000000 1.852760 4.432718 1.000000 \n", + "job_admin. 1.000000 2.416956 6.841677 1.000000 \n", + "job_blue-collar 1.000000 1.379278 2.902408 1.000000 \n", + "job_entrepreneur 1.000000 5.211619 28.160978 1.000000 \n", + "job_housemaid 1.000000 5.786186 34.479949 1.000000 \n", + "job_management 1.000000 1.433333 3.054445 1.000000 \n", + "job_retired 1.000000 4.159677 18.302913 1.000000 \n", + "job_self-employed 1.000000 5.096332 26.972604 1.000000 \n", + "job_services 1.000000 2.838809 9.058838 1.000000 \n", + "job_student 1.000000 6.734311 46.350944 1.000000 \n", + "job_technician 1.000000 1.772778 4.142743 1.000000 \n", + "job_unemployed 1.000000 5.608954 32.460364 1.000000 \n", + "job_unknown 1.000000 12.338367 153.235297 1.000000 \n", + "marital_divorced 1.000000 2.398540 6.752996 1.000000 \n", + "marital_married 1.000000 -0.411454 1.169294 1.000000 \n", + "marital_single 1.000000 0.962038 1.925516 1.000000 \n", + "duration 1.000000 -0.976752 4.099032 1.000000 \n", + "campaign 1.000000 4.880232 41.921269 1.000000 \n", + "pdays 1.000000 2.619630 10.019985 1.000000 \n", + "previous 1.000000 43.957189 4561.467798 1.000000 \n", + "y 0.117459 0.117459 0.117459 0.103663 \n", + "contact_cellular 1.000000 -0.623682 1.388979 1.000000 \n", + "contact_telephone 1.000000 3.561451 13.683936 1.000000 \n", + "contact_unknown 1.000000 0.941129 1.885723 1.000000 \n", + "month_1.0 1.000000 5.434842 30.537508 1.000000 \n", + "month_2.0 1.000000 3.762560 15.156859 1.000000 \n", + "month_3.0 1.000000 9.600201 93.163868 1.000000 \n", + "month_4.0 1.000000 3.504827 13.283811 1.000000 \n", + "month_5.0 1.000000 0.859194 1.738215 1.000000 \n", + "month_6.0 1.000000 2.362848 6.583051 1.000000 \n", + "month_7.0 1.000000 1.925799 4.708701 1.000000 \n", + "month_8.0 1.000000 2.082791 5.338016 1.000000 \n", + "month_9.0 1.000000 8.804934 78.526862 1.000000 \n", + "month_10.0 1.000000 7.722593 60.638450 1.000000 \n", + "month_11.0 1.000000 2.910307 9.469886 1.000000 \n", + "month_12.0 1.000000 14.577338 213.498780 1.000000 \n", + "day_1.0 1.000000 11.475683 132.691304 1.000000 \n", + "day_2.0 1.000000 5.676302 33.220410 1.000000 \n", + "day_3.0 1.000000 6.157210 38.911232 1.000000 \n", + "day_4.0 1.000000 5.361931 29.750303 1.000000 \n", + "day_5.0 1.000000 4.524143 21.467870 1.000000 \n", + "day_6.0 1.000000 4.504429 21.289878 1.000000 \n", + "day_7.0 1.000000 4.702780 23.116144 1.000000 \n", + "day_8.0 1.000000 4.673862 22.844991 1.000000 \n", + "day_9.0 1.000000 5.078497 26.791131 1.000000 \n", + "day_10.0 1.000000 9.055914 83.009585 1.000000 \n", + "day_11.0 1.000000 5.279225 28.870216 1.000000 \n", + "day_12.0 1.000000 5.034608 26.347280 1.000000 \n", + "day_13.0 1.000000 5.054236 26.545301 1.000000 \n", + "day_14.0 1.000000 4.631319 22.449119 1.000000 \n", + "day_15.0 1.000000 4.906584 25.074570 1.000000 \n", + "day_16.0 1.000000 5.416369 30.337058 1.000000 \n", + "day_17.0 1.000000 4.475257 21.027925 1.000000 \n", + "day_18.0 1.000000 4.096451 17.780915 1.000000 \n", + "day_19.0 1.000000 4.762006 23.676700 1.000000 \n", + "day_20.0 1.000000 3.642574 14.268344 1.000000 \n", + "day_21.0 1.000000 4.451305 20.814118 1.000000 \n", + "day_22.0 1.000000 6.850586 47.930527 1.000000 \n", + "day_23.0 1.000000 6.678180 45.598093 1.000000 \n", + "day_24.0 1.000000 9.895965 98.930118 1.000000 \n", + "day_25.0 1.000000 7.100804 51.421415 1.000000 \n", + "day_26.0 1.000000 6.350294 41.326240 1.000000 \n", + "day_27.0 1.000000 6.052291 37.630228 1.000000 \n", + "day_28.0 1.000000 4.691883 23.013767 1.000000 \n", + "day_29.0 1.000000 4.796166 24.003206 1.000000 \n", + "day_30.0 1.000000 5.130217 27.319129 1.000000 \n", + "day_31.0 1.000000 8.156521 67.528827 1.000000 \n", + "poutcome_failure 1.000000 2.503061 7.265313 1.000000 \n", + "poutcome_other 1.000000 4.697324 23.064850 1.000000 \n", + "poutcome_success 1.000000 5.150832 27.531067 1.000000 \n", + "poutcome_unknown 1.000000 -1.637538 3.681530 1.000000 \n", "\n", " central_moment_3 central_moment_4 sum sum_2 \\\n", - "age 0.683627 3.332610 5.456968e-12 36623.0 \n", - "education -0.150311 2.309413 -8.185452e-12 36623.0 \n", - "default 7.252029 53.591920 -9.094947e-13 36623.0 \n", - "balance 8.470364 147.645486 1.421085e-12 36623.0 \n", - "housing -0.219812 1.048317 7.730705e-12 36623.0 \n", - "loan 1.843177 4.397301 2.586376e-12 36623.0 \n", - "job_admin. 2.426150 6.886204 -9.379164e-13 36623.0 \n", - "job_blue-collar 1.386860 2.923380 -1.477929e-12 36623.0 \n", - "job_entrepreneur 5.246681 28.527661 7.105427e-14 36623.0 \n", - "job_housemaid 5.797803 34.614514 -1.989520e-13 36623.0 \n", - "job_management 1.435105 3.059526 -4.320100e-12 36623.0 \n", - "job_retired 4.104804 17.849418 -7.958079e-13 36623.0 \n", - "job_self-employed 5.035936 26.360650 -2.131628e-12 36623.0 \n", - "job_services 2.838807 9.058826 9.094947e-13 36623.0 \n", - "job_student 6.608737 44.675407 -4.547474e-13 36623.0 \n", - "job_technician 1.770734 4.135498 1.492140e-12 36623.0 \n", - "job_unemployed 5.666467 33.108845 -6.110668e-13 36623.0 \n", - "job_unknown 12.257279 151.240878 -2.557954e-13 36623.0 \n", - "marital_divorced 2.424877 6.880031 4.831691e-13 36623.0 \n", - "marital_married -0.419670 1.176123 -9.876544e-13 36623.0 \n", - "marital_single 0.963215 1.927782 5.186962e-12 36623.0 \n", - "duration -0.800767 3.760395 1.797673e-12 36623.0 \n", - "campaign 4.928998 43.121657 2.273737e-12 36623.0 \n", - "pdays 2.604408 9.919031 9.094947e-13 36623.0 \n", - "previous 44.589497 4659.092137 6.821210e-13 36623.0 \n", - "y 0.078960 0.071157 4.268000e+03 4268.0 \n", - "contact_cellular -0.621315 1.386032 2.728484e-12 36623.0 \n", - "contact_telephone 3.539552 13.528425 9.094947e-13 36623.0 \n", - "contact_unknown 0.941843 1.887068 -4.547474e-12 36623.0 \n", - "month_1.0 5.373873 29.878506 2.273737e-13 36623.0 \n", - "month_2.0 3.716084 14.809280 1.136868e-13 36623.0 \n", - "month_3.0 9.624551 93.631978 -9.094947e-13 36623.0 \n", - "month_4.0 3.553374 13.626466 -1.023182e-12 36623.0 \n", - "month_5.0 0.858162 1.736443 5.911716e-12 36623.0 \n", - "month_6.0 2.375339 6.642234 7.275958e-12 36623.0 \n", - "month_7.0 1.928524 4.719203 0.000000e+00 36623.0 \n", - "month_8.0 2.095163 5.389710 1.091394e-11 36623.0 \n", - "month_9.0 8.590167 74.790974 1.534772e-12 36623.0 \n", - "month_10.0 7.566510 58.252079 1.591616e-12 36623.0 \n", - "month_11.0 2.909726 9.466503 -1.818989e-12 36623.0 \n", - "month_12.0 14.118852 200.341983 3.410605e-13 36623.0 \n", - "day_1.0 11.605506 135.687767 -7.958079e-13 36623.0 \n", - "day_2.0 5.657598 33.008418 -2.728484e-12 36623.0 \n", - "day_3.0 6.220047 39.688983 -1.818989e-12 36623.0 \n", - "day_4.0 5.340684 29.522909 -1.250555e-12 36623.0 \n", - "day_5.0 4.525020 21.475803 3.410605e-12 36623.0 \n", - "day_6.0 4.523367 21.460851 -2.501110e-12 36623.0 \n", - "day_7.0 4.644600 22.572305 1.023182e-12 36623.0 \n", - "day_8.0 4.680253 22.904771 2.046363e-12 36623.0 \n", - "day_9.0 5.145374 27.474873 2.501110e-12 36623.0 \n", - "day_10.0 9.098349 83.779947 2.842171e-13 36623.0 \n", - "day_11.0 5.203644 28.077909 -4.547474e-13 36623.0 \n", - "day_12.0 5.066584 26.670276 -6.821210e-13 36623.0 \n", - "day_13.0 5.079868 26.805061 -2.387424e-12 36623.0 \n", - "day_14.0 4.627029 22.409396 -7.958079e-13 36623.0 \n", - "day_15.0 4.877548 24.790476 6.821210e-13 36623.0 \n", - "day_16.0 5.373873 29.878506 1.023182e-12 36623.0 \n", - "day_17.0 4.498761 21.238851 1.591616e-12 36623.0 \n", - "day_18.0 4.102207 17.828101 1.477929e-12 36623.0 \n", - "day_19.0 4.780097 23.849328 2.046363e-12 36623.0 \n", - "day_20.0 3.647795 14.306412 4.774847e-12 36623.0 \n", - "day_21.0 4.362368 20.030252 7.958079e-13 36623.0 \n", - "day_22.0 6.854458 47.983589 -2.273737e-13 36623.0 \n", - "day_23.0 6.700339 45.894537 -2.046363e-12 36623.0 \n", - "day_24.0 10.082003 102.646792 -7.958079e-13 36623.0 \n", - "day_25.0 7.269524 53.845983 0.000000e+00 36623.0 \n", - "day_26.0 6.406346 42.041275 -9.094947e-13 36623.0 \n", - "day_27.0 6.137775 38.672281 -9.094947e-13 36623.0 \n", - "day_28.0 4.621790 22.360946 -2.728484e-12 36623.0 \n", - "day_29.0 4.820415 24.236397 -2.728484e-12 36623.0 \n", - "day_30.0 5.062176 26.625629 0.000000e+00 36623.0 \n", - "day_31.0 8.228920 68.715119 -1.136868e-13 36623.0 \n", - "poutcome_failure 2.509270 7.296436 1.136868e-12 36623.0 \n", - "poutcome_other 4.616567 22.312689 2.273737e-13 36623.0 \n", - "poutcome_success 5.138495 27.404135 -6.821210e-13 36623.0 \n", - "poutcome_unknown -1.628838 3.653112 5.456968e-12 36623.0 \n", + "age 0.682366 3.328235 -6.878054e-12 36634.0 \n", + "education -0.152303 2.305096 3.410605e-13 36634.0 \n", + "default 7.241579 53.440463 1.818989e-12 36634.0 \n", + "balance 7.967780 128.947991 -5.115908e-13 36634.0 \n", + "housing -0.224695 1.050488 -1.136868e-12 36634.0 \n", + "loan 1.852760 4.432718 2.117417e-12 36634.0 \n", + "job_admin. 2.416956 6.841677 -8.384404e-13 36634.0 \n", + "job_blue-collar 1.379278 2.902408 1.136868e-12 36634.0 \n", + "job_entrepreneur 5.211619 28.160978 -3.552714e-13 36634.0 \n", + "job_housemaid 5.786186 34.479949 1.989520e-12 36634.0 \n", + "job_management 1.433333 3.054445 1.989520e-12 36634.0 \n", + "job_retired 4.159677 18.302913 2.330580e-12 36634.0 \n", + "job_self-employed 5.096332 26.972604 -1.563194e-12 36634.0 \n", + "job_services 2.838809 9.058838 2.330580e-12 36634.0 \n", + "job_student 6.734311 46.350944 -3.410605e-13 36634.0 \n", + "job_technician 1.772778 4.142743 -6.821210e-13 36634.0 \n", + "job_unemployed 5.608954 32.460364 5.400125e-13 36634.0 \n", + "job_unknown 12.338367 153.235297 -5.542233e-13 36634.0 \n", + "marital_divorced 2.398540 6.752996 1.008971e-12 36634.0 \n", + "marital_married -0.411454 1.169294 -4.888534e-12 36634.0 \n", + "marital_single 0.962038 1.925516 2.202682e-12 36634.0 \n", + "duration -0.976752 4.099032 2.430056e-12 36634.0 \n", + "campaign 4.880232 41.921269 1.477929e-12 36634.0 \n", + "pdays 2.619630 10.019985 -2.273737e-13 36634.0 \n", + "previous 43.957189 4561.467798 -1.477929e-12 36634.0 \n", + "y 0.079310 0.071425 4.303000e+03 4303.0 \n", + "contact_cellular -0.623682 1.388979 -1.818989e-12 36634.0 \n", + "contact_telephone 3.561451 13.683936 2.160050e-12 36634.0 \n", + "contact_unknown 0.941129 1.885723 -4.547474e-12 36634.0 \n", + "month_1.0 5.434842 30.537508 -9.094947e-13 36634.0 \n", + "month_2.0 3.762560 15.156859 3.069545e-12 36634.0 \n", + "month_3.0 9.600201 93.163868 -1.932676e-12 36634.0 \n", + "month_4.0 3.504827 13.283811 6.821210e-12 36634.0 \n", + "month_5.0 0.859194 1.738215 -4.547474e-13 36634.0 \n", + "month_6.0 2.362848 6.583051 -2.728484e-12 36634.0 \n", + "month_7.0 1.925799 4.708701 -5.684342e-12 36634.0 \n", + "month_8.0 2.082791 5.338016 -6.821210e-13 36634.0 \n", + "month_9.0 8.804934 78.526862 -6.252776e-13 36634.0 \n", + "month_10.0 7.722593 60.638450 -3.410605e-13 36634.0 \n", + "month_11.0 2.910307 9.469886 2.046363e-12 36634.0 \n", + "month_12.0 14.577338 213.498780 2.842171e-13 36634.0 \n", + "day_1.0 11.475683 132.691304 1.136868e-13 36634.0 \n", + "day_2.0 5.676302 33.220410 -2.046363e-12 36634.0 \n", + "day_3.0 6.157210 38.911232 0.000000e+00 36634.0 \n", + "day_4.0 5.361931 29.750303 -1.193712e-12 36634.0 \n", + "day_5.0 4.524143 21.467870 -2.728484e-12 36634.0 \n", + "day_6.0 4.504429 21.289878 2.501110e-12 36634.0 \n", + "day_7.0 4.702780 23.116144 -1.591616e-12 36634.0 \n", + "day_8.0 4.673862 22.844991 -3.296918e-12 36634.0 \n", + "day_9.0 5.078497 26.791131 6.821210e-13 36634.0 \n", + "day_10.0 9.055914 83.009585 -1.477929e-12 36634.0 \n", + "day_11.0 5.279225 28.870216 -1.591616e-12 36634.0 \n", + "day_12.0 5.034608 26.347280 -3.410605e-12 36634.0 \n", + "day_13.0 5.054236 26.545301 1.477929e-12 36634.0 \n", + "day_14.0 4.631319 22.449119 -5.684342e-13 36634.0 \n", + "day_15.0 4.906584 25.074570 2.501110e-12 36634.0 \n", + "day_16.0 5.416369 30.337058 3.410605e-13 36634.0 \n", + "day_17.0 4.475257 21.027925 3.183231e-12 36634.0 \n", + "day_18.0 4.096451 17.780915 9.094947e-13 36634.0 \n", + "day_19.0 4.762006 23.676700 -1.364242e-12 36634.0 \n", + "day_20.0 3.642574 14.268344 -1.591616e-12 36634.0 \n", + "day_21.0 4.451305 20.814118 -2.273737e-12 36634.0 \n", + "day_22.0 6.850586 47.930527 -2.273737e-13 36634.0 \n", + "day_23.0 6.678180 45.598093 -1.136868e-12 36634.0 \n", + "day_24.0 9.895965 98.930118 7.389644e-13 36634.0 \n", + "day_25.0 7.100804 51.421415 -2.160050e-12 36634.0 \n", + "day_26.0 6.350294 41.326240 -6.821210e-13 36634.0 \n", + "day_27.0 6.052291 37.630228 9.094947e-13 36634.0 \n", + "day_28.0 4.691883 23.013767 1.136868e-12 36634.0 \n", + "day_29.0 4.796166 24.003206 -6.821210e-13 36634.0 \n", + "day_30.0 5.130217 27.319129 1.136868e-12 36634.0 \n", + "day_31.0 8.156521 67.528827 1.136868e-12 36634.0 \n", + "poutcome_failure 2.503061 7.265313 -1.136868e-12 36634.0 \n", + "poutcome_other 4.697324 23.064850 -2.728484e-12 36634.0 \n", + "poutcome_success 5.150832 27.531067 -1.250555e-12 36634.0 \n", + "poutcome_unknown -1.637538 3.681530 -4.547474e-13 36634.0 \n", "\n", " sum_3 sum_4 \n", - "age 2.503646e+04 1.220502e+05 \n", - "education -5.504831e+03 8.457763e+04 \n", - "default 2.655910e+05 1.962697e+06 \n", - "balance 3.102101e+05 5.407221e+06 \n", - "housing -8.050184e+03 3.839253e+04 \n", - "loan 6.750267e+04 1.610423e+05 \n", - "job_admin. 8.885289e+04 2.521935e+05 \n", - "job_blue-collar 5.079096e+04 1.070629e+05 \n", - "job_entrepreneur 1.921492e+05 1.044769e+06 \n", - "job_housemaid 2.123329e+05 1.267687e+06 \n", - "job_management 5.255784e+04 1.120490e+05 \n", - "job_retired 1.503302e+05 6.536993e+05 \n", - "job_self-employed 1.844311e+05 9.654061e+05 \n", - "job_services 1.039656e+05 3.317614e+05 \n", - "job_student 2.420318e+05 1.636147e+06 \n", - "job_technician 6.484958e+04 1.514543e+05 \n", - "job_unemployed 2.075230e+05 1.212545e+06 \n", - "job_unknown 4.488983e+05 5.538895e+06 \n", - "marital_divorced 8.880629e+04 2.519674e+05 \n", - "marital_married -1.536959e+04 4.307316e+04 \n", - "marital_single 3.527581e+04 7.060118e+04 \n", - "duration -2.932649e+04 1.377170e+05 \n", - "campaign 1.805147e+05 1.579244e+06 \n", - "pdays 9.538123e+04 3.632647e+05 \n", - "previous 1.633001e+06 1.706299e+08 \n", - "y 4.268000e+03 4.268000e+03 \n", - "contact_cellular -2.275441e+04 5.076065e+04 \n", - "contact_telephone 1.296290e+05 4.954515e+05 \n", - "contact_unknown 3.449310e+04 6.911008e+04 \n", - "month_1.0 1.968073e+05 1.094241e+06 \n", - "month_2.0 1.360941e+05 5.423602e+05 \n", - "month_3.0 3.524799e+05 3.429084e+06 \n", - "month_4.0 1.301352e+05 4.990421e+05 \n", - "month_5.0 3.142848e+04 6.359374e+04 \n", - "month_6.0 8.699203e+04 2.432585e+05 \n", - "month_7.0 7.062832e+04 1.728314e+05 \n", - "month_8.0 7.673117e+04 1.973873e+05 \n", - "month_9.0 3.145977e+05 2.739070e+06 \n", - "month_10.0 2.771083e+05 2.133366e+06 \n", - "month_11.0 1.065629e+05 3.466917e+05 \n", - "month_12.0 5.170747e+05 7.337124e+06 \n", - "day_1.0 4.250284e+05 4.969293e+06 \n", - "day_2.0 2.071982e+05 1.208867e+06 \n", - "day_3.0 2.277968e+05 1.453530e+06 \n", - "day_4.0 1.955919e+05 1.081218e+06 \n", - "day_5.0 1.657198e+05 7.865083e+05 \n", - "day_6.0 1.656593e+05 7.859607e+05 \n", - "day_7.0 1.700992e+05 8.266655e+05 \n", - "day_8.0 1.714049e+05 8.388414e+05 \n", - "day_9.0 1.884390e+05 1.006212e+06 \n", - "day_10.0 3.332088e+05 3.068273e+06 \n", - "day_11.0 1.905730e+05 1.028297e+06 \n", - "day_12.0 1.855535e+05 9.767455e+05 \n", - "day_13.0 1.860400e+05 9.816817e+05 \n", - "day_14.0 1.694557e+05 8.206993e+05 \n", - "day_15.0 1.786304e+05 9.079016e+05 \n", - "day_16.0 1.968073e+05 1.094241e+06 \n", - "day_17.0 1.647581e+05 7.778304e+05 \n", - "day_18.0 1.502351e+05 6.529185e+05 \n", - "day_19.0 1.750615e+05 8.734340e+05 \n", - "day_20.0 1.335932e+05 5.239437e+05 \n", - "day_21.0 1.597630e+05 7.335679e+05 \n", - "day_22.0 2.510308e+05 1.757303e+06 \n", - "day_23.0 2.453865e+05 1.680796e+06 \n", - "day_24.0 3.692332e+05 3.759233e+06 \n", - "day_25.0 2.662318e+05 1.972001e+06 \n", - "day_26.0 2.346196e+05 1.539678e+06 \n", - "day_27.0 2.247837e+05 1.416295e+06 \n", - "day_28.0 1.692638e+05 8.189249e+05 \n", - "day_29.0 1.765380e+05 8.876096e+05 \n", - "day_30.0 1.853921e+05 9.751104e+05 \n", - "day_31.0 3.013677e+05 2.516554e+06 \n", - "poutcome_failure 9.189699e+04 2.672174e+05 \n", - "poutcome_other 1.690725e+05 8.171576e+05 \n", - "poutcome_success 1.881871e+05 1.003622e+06 \n", - "poutcome_unknown -5.965292e+04 1.337879e+05 \n", + "age 2.499780e+04 1.219266e+05 \n", + "education -5.579460e+03 8.444490e+04 \n", + "default 2.652880e+05 1.957738e+06 \n", + "balance 2.918916e+05 4.723881e+06 \n", + "housing -8.231462e+03 3.848356e+04 \n", + "loan 6.787399e+04 1.623882e+05 \n", + "job_admin. 8.854277e+04 2.506380e+05 \n", + "job_blue-collar 5.052848e+04 1.063268e+05 \n", + "job_entrepreneur 1.909225e+05 1.031649e+06 \n", + "job_housemaid 2.119711e+05 1.263138e+06 \n", + "job_management 5.250874e+04 1.118965e+05 \n", + "job_retired 1.523856e+05 6.705089e+05 \n", + "job_self-employed 1.866990e+05 9.881144e+05 \n", + "job_services 1.039969e+05 3.318615e+05 \n", + "job_student 2.467047e+05 1.698020e+06 \n", + "job_technician 6.494396e+04 1.517653e+05 \n", + "job_unemployed 2.054784e+05 1.189153e+06 \n", + "job_unknown 4.520037e+05 5.613622e+06 \n", + "marital_divorced 8.786813e+04 2.473893e+05 \n", + "marital_married -1.507319e+04 4.283592e+04 \n", + "marital_single 3.524328e+04 7.053936e+04 \n", + "duration -3.578234e+04 1.501639e+05 \n", + "campaign 1.787824e+05 1.535744e+06 \n", + "pdays 9.596753e+04 3.670721e+05 \n", + "previous 1.610328e+06 1.671048e+08 \n", + "y 4.303000e+03 4.303000e+03 \n", + "contact_cellular -2.284795e+04 5.088384e+04 \n", + "contact_telephone 1.304702e+05 5.012973e+05 \n", + "contact_unknown 3.447731e+04 6.908158e+04 \n", + "month_1.0 1.991000e+05 1.118711e+06 \n", + "month_2.0 1.378376e+05 5.552564e+05 \n", + "month_3.0 3.516938e+05 3.412965e+06 \n", + "month_4.0 1.283958e+05 4.866391e+05 \n", + "month_5.0 3.147572e+04 6.367775e+04 \n", + "month_6.0 8.656057e+04 2.411635e+05 \n", + "month_7.0 7.054972e+04 1.724986e+05 \n", + "month_8.0 7.630095e+04 1.955529e+05 \n", + "month_9.0 3.225600e+05 2.876753e+06 \n", + "month_10.0 2.829095e+05 2.221429e+06 \n", + "month_11.0 1.066162e+05 3.469198e+05 \n", + "month_12.0 5.340262e+05 7.821314e+06 \n", + "day_1.0 4.204002e+05 4.861013e+06 \n", + "day_2.0 2.079457e+05 1.216996e+06 \n", + "day_3.0 2.255632e+05 1.425474e+06 \n", + "day_4.0 1.964290e+05 1.089873e+06 \n", + "day_5.0 1.657375e+05 7.864540e+05 \n", + "day_6.0 1.650152e+05 7.799334e+05 \n", + "day_7.0 1.722817e+05 8.468368e+05 \n", + "day_8.0 1.712223e+05 8.369034e+05 \n", + "day_9.0 1.860457e+05 9.814663e+05 \n", + "day_10.0 3.317544e+05 3.040973e+06 \n", + "day_11.0 1.933991e+05 1.057631e+06 \n", + "day_12.0 1.844378e+05 9.652063e+05 \n", + "day_13.0 1.851569e+05 9.724606e+05 \n", + "day_14.0 1.696638e+05 8.224010e+05 \n", + "day_15.0 1.797478e+05 9.185818e+05 \n", + "day_16.0 1.984233e+05 1.111368e+06 \n", + "day_17.0 1.639466e+05 7.703370e+05 \n", + "day_18.0 1.500694e+05 6.513860e+05 \n", + "day_19.0 1.744513e+05 8.673722e+05 \n", + "day_20.0 1.334420e+05 5.227065e+05 \n", + "day_21.0 1.630691e+05 7.625044e+05 \n", + "day_22.0 2.509644e+05 1.755887e+06 \n", + "day_23.0 2.446485e+05 1.670441e+06 \n", + "day_24.0 3.625288e+05 3.624206e+06 \n", + "day_25.0 2.601308e+05 1.883772e+06 \n", + "day_26.0 2.326367e+05 1.513945e+06 \n", + "day_27.0 2.217196e+05 1.378546e+06 \n", + "day_28.0 1.718824e+05 8.430863e+05 \n", + "day_29.0 1.757027e+05 8.793334e+05 \n", + "day_30.0 1.879404e+05 1.000809e+06 \n", + "day_31.0 2.988060e+05 2.473851e+06 \n", + "poutcome_failure 9.169713e+04 2.661575e+05 \n", + "poutcome_other 1.720818e+05 8.449577e+05 \n", + "poutcome_success 1.886956e+05 1.008573e+06 \n", + "poutcome_unknown -5.998956e+04 1.348692e+05 \n", "\n", "[76 rows x 26 columns]" ] @@ -4971,50 +5010,50 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001B[2m\u001B[36m(_run pid=4620)\u001B[0m /home/fengjun.feng/anaconda3/envs/sf/lib/python3.8/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but StandardScaler was fitted without feature names\n", - "\u001B[2m\u001B[36m(_run pid=4620)\u001B[0m warnings.warn(\n", - "\u001B[2m\u001B[36m(_run pid=4610)\u001B[0m /home/fengjun.feng/anaconda3/envs/sf/lib/python3.8/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but StandardScaler was fitted without feature names\n", - "\u001B[2m\u001B[36m(_run pid=4610)\u001B[0m warnings.warn(\n", - "\u001B[2m\u001B[36m(_run pid=4610)\u001B[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'cuda': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'\n", - "\u001B[2m\u001B[36m(_run pid=4610)\u001B[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'rocm': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'\n", - "\u001B[2m\u001B[36m(_run pid=4610)\u001B[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'tpu': INVALID_ARGUMENT: TpuPlatform is not available.\n", - "\u001B[2m\u001B[36m(_run pid=4610)\u001B[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'plugin': xla_extension has no attributes named get_plugin_device_client. Compile TensorFlow with //tensorflow/compiler/xla/python:enable_plugin_device set to true (defaults to false) to enable this.\n", - "\u001B[2m\u001B[36m(_run pid=4610)\u001B[0m WARNING:jax._src.xla_bridge:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)\n" + "\u001b[2m\u001b[36m(_run pid=3805540)\u001b[0m /home/zoupeicheng.zpc/miniconda3/envs/py38/lib/python3.8/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but StandardScaler was fitted without feature names\n", + "\u001b[2m\u001b[36m(_run pid=3805540)\u001b[0m warnings.warn(\n", + "\u001b[2m\u001b[36m(_run pid=3805590)\u001b[0m /home/zoupeicheng.zpc/miniconda3/envs/py38/lib/python3.8/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but StandardScaler was fitted without feature names\n", + "\u001b[2m\u001b[36m(_run pid=3805590)\u001b[0m warnings.warn(\n", + "\u001b[2m\u001b[36m(_run pid=3807059)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'cuda': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'\n", + "\u001b[2m\u001b[36m(_run pid=3807059)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'rocm': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'\n", + "\u001b[2m\u001b[36m(_run pid=3807059)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'tpu': INVALID_ARGUMENT: TpuPlatform is not available.\n", + "\u001b[2m\u001b[36m(_run pid=3807059)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'plugin': xla_extension has no attributes named get_plugin_device_client. Compile TensorFlow with //tensorflow/compiler/xla/python:enable_plugin_device set to true (defaults to false) to enable this.\n", + "\u001b[2m\u001b[36m(_run pid=3807059)\u001b[0m WARNING:jax._src.xla_bridge:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001B[2m\u001B[36m(_run pid=4620)\u001B[0m [2023-06-19 19:59:23.463] [info] [thread_pool.cc:30] Create a fixed thread pool with size 63\n", - "\u001B[2m\u001B[36m(_run pid=4610)\u001B[0m [2023-06-19 19:59:23.478] [info] [thread_pool.cc:30] Create a fixed thread pool with size 63\n" + "\u001b[2m\u001b[36m(_run pid=3805590)\u001b[0m [2023-10-07 18:04:03.441] [info] [thread_pool.cc:30] Create a fixed thread pool with size 63\n", + "\u001b[2m\u001b[36m(_run pid=3807059)\u001b[0m [2023-10-07 18:04:03.419] [info] [thread_pool.cc:30] Create a fixed thread pool with size 63\n" ] }, { "data": { "text/plain": [ - "array([[1.000, -0.162, -0.018, 0.096, -0.186, -0.017, -0.011, 0.006,\n", - " -0.027, -0.000, 0.028],\n", - " [-0.162, 1.000, -0.013, 0.068, -0.075, -0.022, -0.004, 0.003,\n", - " 0.012, 0.030, 0.066],\n", - " [-0.018, -0.013, 1.000, -0.066, -0.006, 0.078, -0.007, 0.012,\n", - " -0.028, -0.017, -0.020],\n", - " [0.096, 0.068, -0.066, 1.000, -0.068, -0.084, 0.015, -0.012,\n", - " 0.004, 0.016, 0.050],\n", - " [-0.186, -0.075, -0.006, -0.068, 1.000, 0.040, 0.002, -0.028,\n", - " 0.123, 0.034, -0.137],\n", - " [-0.017, -0.022, 0.078, -0.084, 0.040, 1.000, -0.012, 0.013,\n", - " -0.023, -0.009, -0.068],\n", - " [-0.011, -0.004, -0.007, 0.015, 0.002, -0.012, 1.000, -0.173,\n", - " 0.016, 0.011, 0.325],\n", - " [0.006, 0.003, 0.012, -0.012, -0.028, 0.013, -0.173, 1.000,\n", - " -0.089, -0.032, -0.072],\n", - " [-0.027, 0.012, -0.028, 0.004, 0.123, -0.023, 0.016, -0.089,\n", - " 1.000, 0.439, 0.106],\n", - " [-0.000, 0.030, -0.017, 0.016, 0.034, -0.009, 0.011, -0.032,\n", - " 0.439, 1.000, 0.092],\n", - " [0.028, 0.066, -0.020, 0.050, -0.137, -0.068, 0.325, -0.072,\n", - " 0.106, 0.092, 1.000]], dtype=float32)" + "array([[1.000, -0.166, -0.020, 0.098, -0.185, -0.014, -0.010, 0.007,\n", + " -0.026, 0.000, 0.023],\n", + " [-0.166, 1.000, -0.010, 0.066, -0.079, -0.025, -0.002, 0.003,\n", + " 0.004, 0.023, 0.070],\n", + " [-0.020, -0.010, 1.000, -0.066, -0.005, 0.078, -0.006, 0.019,\n", + " -0.031, -0.016, -0.025],\n", + " [0.098, 0.066, -0.066, 1.000, -0.070, -0.084, 0.017, -0.019,\n", + " 0.003, 0.014, 0.054],\n", + " [-0.185, -0.079, -0.005, -0.070, 1.000, 0.042, 0.002, -0.027,\n", + " 0.127, 0.039, -0.136],\n", + " [-0.014, -0.025, 0.078, -0.084, 0.042, 1.000, -0.008, 0.011,\n", + " -0.020, -0.008, -0.067],\n", + " [-0.010, -0.002, -0.006, 0.017, 0.002, -0.008, 1.000, -0.181,\n", + " 0.018, 0.009, 0.319],\n", + " [0.007, 0.003, 0.019, -0.019, -0.027, 0.011, -0.181, 1.000,\n", + " -0.089, -0.032, -0.070],\n", + " [-0.026, 0.004, -0.031, 0.003, 0.127, -0.020, 0.018, -0.089,\n", + " 1.000, 0.440, 0.102],\n", + " [0.000, 0.023, -0.016, 0.014, 0.039, -0.008, 0.009, -0.032, 0.440,\n", + " 1.000, 0.087],\n", + " [0.023, 0.070, -0.025, 0.054, -0.136, -0.067, 0.319, -0.070,\n", + " 0.102, 0.087, 1.000]], dtype=float32)" ] }, "execution_count": 17, @@ -5065,20 +5104,18 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001B[2m\u001B[36m(_run pid=4610)\u001B[0m /home/fengjun.feng/anaconda3/envs/sf/lib/python3.8/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but StandardScaler was fitted without feature names\n", - "\u001B[2m\u001B[36m(_run pid=4610)\u001B[0m warnings.warn(\n", - "\u001B[2m\u001B[36m(_run pid=4620)\u001B[0m /home/fengjun.feng/anaconda3/envs/sf/lib/python3.8/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but StandardScaler was fitted without feature names\n", - "\u001B[2m\u001B[36m(_run pid=4620)\u001B[0m warnings.warn(\n" + "\u001b[2m\u001b[36m(_run pid=3805590)\u001b[0m /home/zoupeicheng.zpc/miniconda3/envs/py38/lib/python3.8/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but StandardScaler was fitted without feature names\n", + "\u001b[2m\u001b[36m(_run pid=3805590)\u001b[0m warnings.warn(\n", + "\u001b[2m\u001b[36m(_run pid=3807059)\u001b[0m /home/zoupeicheng.zpc/miniconda3/envs/py38/lib/python3.8/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but StandardScaler was fitted without feature names\n", + "\u001b[2m\u001b[36m(_run pid=3807059)\u001b[0m warnings.warn(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Index(['age', 'education', 'default', 'balance', 'housing', 'loan', 'duration',\n", - " 'campaign', 'pdays', 'previous', 'y'],\n", - " dtype='object')\n", - "[1.081 1.051 1.011 1.030 1.091 1.018 1.151 1.040 1.276 1.243 1.173]\n" + "['age', 'education', 'default', 'balance', 'housing', 'loan', 'duration', 'campaign', 'pdays', 'previous', 'y']\n", + "[1.084 1.053 1.012 1.031 1.093 1.018 1.150 1.043 1.279 1.244 1.168]\n" ] } ], @@ -5203,7 +5240,7 @@ { "data": { "text/plain": [ - "Array(0.000, dtype=float32)" + "Array(inf, dtype=float32)" ] }, "execution_count": 22, @@ -5249,20 +5286,13 @@ "execution_count": 23, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001B[2m\u001B[36m(_run pid=4609)\u001B[0m [2023-06-19 19:59:26.102] [info] [thread_pool.cc:30] Create a fixed thread pool with size 63\n" - ] - }, { "name": "stderr", "output_type": "stream", "text": [ - "INFO:root:epoch 1 times: 1.8808467388153076s\n", - "INFO:root:epoch 2 times: 0.7152931690216064s\n", - "INFO:root:epoch 3 times: 0.6313662528991699s\n" + "INFO:root:epoch 1 times: 1.159546136856079s\n", + "INFO:root:epoch 2 times: 0.9041106700897217s\n", + "INFO:root:epoch 3 times: 0.8574354648590088s\n" ] } ], @@ -5330,64 +5360,86 @@ "text": [ "INFO:root:Create proxy actor with party alice.\n", "INFO:root:Create proxy actor with party bob.\n", - "INFO:root:fragment_count 1\n", - "INFO:root:prepare time 0.17217350006103516s\n", - "\u001B[2m\u001B[36m(pid=28283)\u001B[0m Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.\n", - "\u001B[2m\u001B[36m(pid=28284)\u001B[0m Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.\n", - "INFO:root:global_setup time 3.0523507595062256s\n", + "INFO:root:fragment_count 1\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:prepare time 0.0977473258972168s\n", + "\u001b[2m\u001b[36m(_run pid=3806453)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'cuda': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'\n", + "\u001b[2m\u001b[36m(_run pid=3806453)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'rocm': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'\n", + "\u001b[2m\u001b[36m(_run pid=3806453)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'tpu': INVALID_ARGUMENT: TpuPlatform is not available.\n", + "\u001b[2m\u001b[36m(_run pid=3806453)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'plugin': xla_extension has no attributes named get_plugin_device_client. Compile TensorFlow with //tensorflow/compiler/xla/python:enable_plugin_device set to true (defaults to false) to enable this.\n", + "\u001b[2m\u001b[36m(_run pid=3806453)\u001b[0m WARNING:jax._src.xla_bridge:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)\n", + "INFO:root:global_setup time 1.0207350254058838s\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[36m(_run pid=3806453)\u001b[0m [2023-10-07 18:04:08.905] [info] [thread_pool.cc:30] Create a fixed thread pool with size 63\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ "INFO:root:build & infeed bucket_map fragments [0, 0]\n", - "INFO:root:build & infeed bucket_map time 0.8674089908599854s\n", - "INFO:root:init_pred time 0.0363163948059082s\n", - "INFO:root:epoch 0 tree_setup time 0.09707164764404297s\n", - "INFO:root:fragment[0, 0] gradient sum time 1.1570940017700195s\n", - "INFO:root:level 0 time 1.4122493267059326s\n", - "INFO:root:fragment[0, 0] gradient sum time 1.8844733238220215s\n", - "INFO:root:level 1 time 2.151341676712036s\n", - "INFO:root:fragment[0, 0] gradient sum time 3.6136951446533203s\n", - "INFO:root:level 2 time 3.904376983642578s\n", - "INFO:root:fragment[0, 0] gradient sum time 7.211242914199829s\n", - "INFO:root:level 3 time 7.576739311218262s\n", - "INFO:root:fragment[0, 0] gradient sum time 14.350088357925415s\n", - "INFO:root:level 4 time 14.898044347763062s\n" + "INFO:root:build & infeed bucket_map time 0.47269535064697266s\n", + "INFO:root:init_pred time 0.01999831199645996s\n", + "INFO:root:epoch 0 tree_setup time 0.06561398506164551s\n", + "INFO:root:fragment[0, 0] gradient sum time 0.4283578395843506s\n", + "INFO:root:level 0 time 0.633394718170166s\n", + "INFO:root:fragment[0, 0] gradient sum time 0.7379474639892578s\n", + "INFO:root:level 1 time 0.8940839767456055s\n", + "INFO:root:fragment[0, 0] gradient sum time 1.374051809310913s\n", + "INFO:root:level 2 time 1.6118881702423096s\n", + "INFO:root:fragment[0, 0] gradient sum time 2.799717903137207s\n", + "INFO:root:level 3 time 3.1707115173339844s\n", + "INFO:root:fragment[0, 0] gradient sum time 5.533723831176758s\n", + "INFO:root:level 4 time 5.947847843170166s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001B[2m\u001B[36m(XgbTreeWorker pid=28283)\u001B[0m [2023-06-19 20:00:03.794] [info] [thread_pool.cc:30] Create a fixed thread pool with size 63\n", - "\u001B[2m\u001B[36m(XgbTreeWorker pid=28284)\u001B[0m [2023-06-19 20:00:03.816] [info] [thread_pool.cc:30] Create a fixed thread pool with size 63\n" + "\u001b[2m\u001b[36m(XgbTreeWorker pid=3820043)\u001b[0m [2023-10-07 18:04:21.826] [info] [thread_pool.cc:30] Create a fixed thread pool with size 63\n", + "\u001b[2m\u001b[36m(XgbTreeWorker pid=3820042)\u001b[0m [2023-10-07 18:04:21.809] [info] [thread_pool.cc:30] Create a fixed thread pool with size 63\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "INFO:root:epoch 0 time 30.738550424575806s\n", - "INFO:root:epoch 1 tree_setup time 0.5257089138031006s\n", - "INFO:root:fragment[0, 0] gradient sum time 1.7339739799499512s\n", - "INFO:root:level 0 time 1.875471830368042s\n", - "INFO:root:fragment[0, 0] gradient sum time 1.8286621570587158s\n", - "INFO:root:level 1 time 2.0614569187164307s\n", - "INFO:root:fragment[0, 0] gradient sum time 3.5874507427215576s\n", - "INFO:root:level 2 time 3.8770413398742676s\n", - "INFO:root:fragment[0, 0] gradient sum time 7.182074069976807s\n", - "INFO:root:level 3 time 7.561537027359009s\n", - "INFO:root:fragment[0, 0] gradient sum time 14.358884334564209s\n", - "INFO:root:level 4 time 14.837937831878662s\n", - "INFO:root:epoch 1 time 30.98349380493164s\n", - "INFO:root:epoch 2 tree_setup time 0.525902271270752s\n", - "INFO:root:fragment[0, 0] gradient sum time 1.7702536582946777s\n", - "INFO:root:level 0 time 1.983457088470459s\n", - "INFO:root:fragment[0, 0] gradient sum time 1.7969560623168945s\n", - "INFO:root:level 1 time 1.9531784057617188s\n", - "INFO:root:fragment[0, 0] gradient sum time 3.668266534805298s\n", - "INFO:root:level 2 time 3.9047152996063232s\n", - "INFO:root:fragment[0, 0] gradient sum time 7.151939868927002s\n", - "INFO:root:level 3 time 7.485116720199585s\n", - "INFO:root:fragment[0, 0] gradient sum time 14.194502830505371s\n", - "INFO:root:level 4 time 14.669843435287476s\n", - "INFO:root:epoch 2 time 30.274888277053833s\n" + "INFO:root:epoch 0 time 12.676203489303589s\n", + "INFO:root:epoch 1 tree_setup time 0.32253217697143555s\n", + "INFO:root:fragment[0, 0] gradient sum time 0.6513419151306152s\n", + "INFO:root:level 0 time 0.8437290191650391s\n", + "INFO:root:fragment[0, 0] gradient sum time 0.7146239280700684s\n", + "INFO:root:level 1 time 0.8953475952148438s\n", + "INFO:root:fragment[0, 0] gradient sum time 1.4147369861602783s\n", + "INFO:root:level 2 time 1.6838665008544922s\n", + "INFO:root:fragment[0, 0] gradient sum time 2.8456201553344727s\n", + "INFO:root:level 3 time 3.1713719367980957s\n", + "INFO:root:fragment[0, 0] gradient sum time 5.684190511703491s\n", + "INFO:root:level 4 time 6.1044602394104s\n", + "INFO:root:epoch 1 time 13.149580478668213s\n", + "INFO:root:epoch 2 tree_setup time 0.3217151165008545s\n", + "INFO:root:fragment[0, 0] gradient sum time 0.6628327369689941s\n", + "INFO:root:level 0 time 0.8769242763519287s\n", + "INFO:root:fragment[0, 0] gradient sum time 0.7444479465484619s\n", + "INFO:root:level 1 time 0.9811975955963135s\n", + "INFO:root:fragment[0, 0] gradient sum time 1.4514589309692383s\n", + "INFO:root:level 2 time 1.7142736911773682s\n", + "INFO:root:fragment[0, 0] gradient sum time 2.8057940006256104s\n", + "INFO:root:level 3 time 3.1123876571655273s\n", + "INFO:root:fragment[0, 0] gradient sum time 5.516232490539551s\n", + "INFO:root:level 4 time 5.923535585403442s\n", + "INFO:root:epoch 2 time 12.82395052909851s\n" ] } ], @@ -5487,17 +5539,15 @@ "output_type": "stream", "text": [ "INFO:root:Create proxy actor with party alice.\n", - "INFO:root:Create proxy actor with party bob.\n", - "\u001B[2m\u001B[36m(pid=51208)\u001B[0m Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.\n", - "\u001B[2m\u001B[36m(pid=51207)\u001B[0m Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.\n" + "INFO:root:Create proxy actor with party bob.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001B[2m\u001B[36m(XgbTreeWorker pid=51207)\u001B[0m [2023-06-19 20:01:09.580] [info] [thread_pool.cc:30] Create a fixed thread pool with size 63\n", - "\u001B[2m\u001B[36m(XgbTreeWorker pid=51208)\u001B[0m [2023-06-19 20:01:09.595] [info] [thread_pool.cc:30] Create a fixed thread pool with size 63\n" + "\u001b[2m\u001b[36m(XgbTreeWorker pid=3821892)\u001b[0m [2023-10-07 18:04:49.886] [info] [thread_pool.cc:30] Create a fixed thread pool with size 63\n", + "\u001b[2m\u001b[36m(XgbTreeWorker pid=3821893)\u001b[0m [2023-10-07 18:04:49.900] [info] [thread_pool.cc:30] Create a fixed thread pool with size 63\n" ] } ], @@ -5572,11 +5622,11 @@ "output_type": "stream", "text": [ "positive_samples: 884.0\n", - "negative_samples: 6441.0\n", - "total_samples: 7325.0\n", - "auc: 0.8973933458328247\n", - "ks: 0.6497097015380859\n", - "f1_score: 0.5536627173423767\n" + "negative_samples: 6443.0\n", + "total_samples: 7327.0\n", + "auc: 0.8957031965255737\n", + "ks: 0.6429727077484131\n", + "f1_score: 0.547467052936554\n" ] } ], @@ -5611,11 +5661,11 @@ "output_type": "stream", "text": [ "positive_samples: 884.0\n", - "negative_samples: 6441.0\n", - "total_samples: 7325.0\n", - "auc: 0.8754899501800537\n", - "ks: 0.5985674858093262\n", - "f1_score: 0.5030487775802612\n" + "negative_samples: 6443.0\n", + "total_samples: 7327.0\n", + "auc: 0.8601635694503784\n", + "ks: 0.5916707515716553\n", + "f1_score: 0.4953111708164215\n" ] } ], @@ -5646,7 +5696,7 @@ { "data": { "text/plain": [ - "PredictionBiasReport(buckets=[BucketPredictionBiasReport(left_endpoint=0.0, left_closed=True, right_endpoint=0.25, right_closed=False, isna=False, avg_prediction=0.0, avg_label=0.13951896131038666, bias=0.13951896131038666, absolute=True), BucketPredictionBiasReport(left_endpoint=0.25, left_closed=True, right_endpoint=0.5, right_closed=False, isna=True, avg_prediction=0, avg_label=0, bias=0, absolute=True), BucketPredictionBiasReport(left_endpoint=0.5, left_closed=True, right_endpoint=0.75, right_closed=False, isna=True, avg_prediction=0, avg_label=0, bias=0, absolute=True), BucketPredictionBiasReport(left_endpoint=0.75, left_closed=True, right_endpoint=1.0, right_closed=True, isna=False, avg_prediction=1.0, avg_label=0.36417245864868164, bias=0.6358275413513184, absolute=True)])" + "PredictionBiasReport(buckets=[BucketPredictionBiasReport(left_endpoint=0.0, left_closed=True, right_endpoint=0.25, right_closed=False, isna=False, avg_prediction=0.0, avg_label=0.14190296828746796, bias=0.14190296828746796, absolute=True), BucketPredictionBiasReport(left_endpoint=0.25, left_closed=True, right_endpoint=0.5, right_closed=False, isna=True, avg_prediction=0, avg_label=0, bias=0, absolute=True), BucketPredictionBiasReport(left_endpoint=0.5, left_closed=True, right_endpoint=0.75, right_closed=False, isna=True, avg_prediction=0, avg_label=0, bias=0, absolute=True), BucketPredictionBiasReport(left_endpoint=0.75, left_closed=True, right_endpoint=1.0, right_closed=True, isna=False, avg_prediction=1.0, avg_label=0.35504791140556335, bias=0.6449520587921143, absolute=True)])" ] }, "execution_count": 31, @@ -5672,7 +5722,7 @@ { "data": { "text/plain": [ - "PredictionBiasReport(buckets=[BucketPredictionBiasReport(left_endpoint=0.0, left_closed=True, right_endpoint=0.25, right_closed=False, isna=False, avg_prediction=0.0, avg_label=0.1599043607711792, bias=0.1599043607711792, absolute=True), BucketPredictionBiasReport(left_endpoint=0.25, left_closed=True, right_endpoint=0.5, right_closed=False, isna=True, avg_prediction=0, avg_label=0, bias=0, absolute=True), BucketPredictionBiasReport(left_endpoint=0.5, left_closed=True, right_endpoint=0.75, right_closed=False, isna=True, avg_prediction=0, avg_label=0, bias=0, absolute=True), BucketPredictionBiasReport(left_endpoint=0.75, left_closed=True, right_endpoint=1.0, right_closed=True, isna=False, avg_prediction=1.0, avg_label=0.370947003364563, bias=0.629052996635437, absolute=True)])" + "PredictionBiasReport(buckets=[BucketPredictionBiasReport(left_endpoint=0.0, left_closed=True, right_endpoint=0.25, right_closed=False, isna=False, avg_prediction=0.0, avg_label=0.1611528992652893, bias=0.1611528992652893, absolute=True), BucketPredictionBiasReport(left_endpoint=0.25, left_closed=True, right_endpoint=0.5, right_closed=False, isna=True, avg_prediction=0, avg_label=0, bias=0, absolute=True), BucketPredictionBiasReport(left_endpoint=0.5, left_closed=True, right_endpoint=0.75, right_closed=False, isna=True, avg_prediction=0, avg_label=0, bias=0, absolute=True), BucketPredictionBiasReport(left_endpoint=0.75, left_closed=True, right_endpoint=1.0, right_closed=True, isna=False, avg_prediction=1.0, avg_label=0.36208581924438477, bias=0.6379141807556152, absolute=True)])" ] }, "execution_count": 32, @@ -5705,15 +5755,15 @@ { "data": { "text/plain": [ - "array([0.729, 0.477, 0.890, 0.463, 0.022, 0.144, 0.997, 0.991, 0.987,\n", - " 0.984, 0.999, 0.959, 0.988, 0.992, 0.939, 0.995, 0.982, 0.993,\n", - " 0.996, 0.990, 0.990, 0.000, 0.686, 0.796, 0.779, 0.981, 0.990,\n", - " 0.976, 0.955, 0.992, 0.790, 0.969, 0.978, 0.974, 0.976, 0.976,\n", - " 0.836, 0.820, 0.985, 0.830, 0.972, 0.984, 0.995, 0.990, 0.996,\n", - " 0.984, 0.973, 0.994, 0.996, 0.953, 0.985, 0.977, 0.964, 0.994,\n", - " 0.984, 0.984, 0.964, 0.999, 0.966, 0.978, 0.992, 0.982, 0.976,\n", - " 0.994, 0.976, 0.993, 0.969, 0.995, 0.983, 0.963, 0.987, 0.976,\n", - " 0.992, 0.824, 0.979, 0.000])" + "array([0.773, 0.324, 0.785, 0.513, 0.011, 0.155, 0.992, 0.991, 0.980,\n", + " 0.983, 0.999, 0.957, 0.995, 0.990, 0.939, 0.997, 0.996, 0.993,\n", + " 0.999, 0.992, 0.990, 0.000, 0.701, 0.828, 0.891, 0.982, 0.986,\n", + " 0.976, 0.953, 0.991, 0.745, 0.969, 0.983, 0.976, 0.973, 0.973,\n", + " 0.843, 0.839, 0.986, 0.859, 0.940, 0.993, 0.984, 1.000, 0.988,\n", + " 0.992, 0.979, 0.999, 0.993, 0.972, 0.997, 0.979, 0.976, 0.999,\n", + " 0.970, 0.995, 0.970, 0.994, 0.964, 0.974, 0.998, 0.994, 0.984,\n", + " 0.999, 0.972, 0.998, 0.980, 0.998, 0.983, 0.972, 0.975, 0.976,\n", + " 0.993, 0.824, 0.979, 0.000])" ] }, "execution_count": 33, @@ -5754,13 +5804,13 @@ { "data": { "text/plain": [ - "array([[456.557],\n", - " [453.931],\n", - " [487.774],\n", + "array([[489.410],\n", + " [452.695],\n", + " [453.148],\n", " ...,\n", - " [456.557],\n", - " [487.774],\n", - " [453.931]])" + " [453.148],\n", + " [480.906],\n", + " [453.148]])" ] }, "execution_count": 34, @@ -5843,7 +5893,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.16" + "version": "3.8.17" }, "vscode": { "interpreter": { diff --git a/docs/update_data.sh b/docs/update_data.sh index d20ced811..d55497cc3 100644 --- a/docs/update_data.sh +++ b/docs/update_data.sh @@ -15,17 +15,14 @@ # limitations under the License. # -echo "1. Update component spec doc." +echo "1. Update extend spec doc." # comp_spec.tmpl is adapted from https://github.com/pseudomuto/protoc-gen-doc/blob/master/examples/templates/grpc-md.tmpl. docker run --rm -v $(pwd)/component/:/out \ -v $(pwd)/../:/protos \ pseudomuto/protoc-gen-doc \ --doc_opt=/out/comp_spec.tmpl,comp_spec.md \ - protos/secretflow/protos/component/cluster.proto \ - protos/secretflow/protos/component/data.proto \ - protos/secretflow/protos/component/comp.proto \ - protos/secretflow/protos/component/evaluation.proto \ - protos/secretflow/protos/component/report.proto + protos/secretflow/protos/secretflow/spec/extend/cluster.proto \ + protos/secretflow/protos/secretflow/spec/extend/data.proto echo "2. Update comp list doc." env PYTHONPATH=$PYTHONPATH:$PWD/.. python $PWD/component/update_comp_list.py \ No newline at end of file diff --git a/docs/user_guide/federated_learning/horizontal_federated_learning/index.rst b/docs/user_guide/federated_learning/horizontal_federated_learning/index.rst index 5ed4f9e1c..602b82bd1 100644 --- a/docs/user_guide/federated_learning/horizontal_federated_learning/index.rst +++ b/docs/user_guide/federated_learning/horizontal_federated_learning/index.rst @@ -1,14 +1,13 @@ Horizontal Federated Learning ============================== -What is Horizontal Federated Learning -------------------------------------- - -The federated learning here refers specifically to the federated learning of horizontal scenarios. -This mode applies to the situation where each participant has the same business but different customer groups are reached. -In this case, samples from various parties can be combined to train a joint model with better performance. -For example, in the medical scene, each hospital has its own unique patient group, -and hospitals in different regions almost do not overlap each other, +**What is Horizontal Federated Learning** + +The federated learning here refers specifically to the federated learning of horizontal scenarios. +This mode applies to the situation where each participant has the same business but different customer groups are reached. +In this case, samples from various parties can be combined to train a joint model with better performance. +For example, in the medical scene, each hospital has its own unique patient group, +and hospitals in different regions almost do not overlap each other, but their examination records for medical records (such as images, blood tests, etc.) are of the same type. .. image:: ../../../tutorial/resources/federate_learning.png diff --git a/docs/user_guide/preprocessing/WeightOfEvidenceEncoding.ipynb b/docs/user_guide/preprocessing/WeightOfEvidenceEncoding.ipynb index fa88af3ca..490f7b52e 100644 --- a/docs/user_guide/preprocessing/WeightOfEvidenceEncoding.ipynb +++ b/docs/user_guide/preprocessing/WeightOfEvidenceEncoding.ipynb @@ -72,7 +72,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-07-04 11:44:35,254\tINFO worker.py:1544 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n" + "2023-10-07 18:03:18,152\tINFO worker.py:1538 -- Started a local Ray instance.\n" ] } ], @@ -124,42 +124,54 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:root:Create proxy actor with party alice.\n" + "INFO:root:Create proxy actor with party alice.\n", + "INFO:root:Create proxy actor with party bob.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[36m(SPURuntime pid=3788331)\u001b[0m 2023-10-07 18:03:20.845 [info] [default_brpc_retry_policy.cc:DoRetry:52] socket error, sleep=1000000us and retry\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "INFO:root:Create proxy actor with party bob.\n", - "\u001b[2m\u001b[36m(_run pid=394229)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'cuda': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'\n", - "\u001b[2m\u001b[36m(_run pid=394229)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'rocm': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'\n", - "\u001b[2m\u001b[36m(_run pid=394229)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'tpu': INVALID_ARGUMENT: TpuPlatform is not available.\n", - "\u001b[2m\u001b[36m(_run pid=394229)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'plugin': xla_extension has no attributes named get_plugin_device_client. Compile TensorFlow with //tensorflow/compiler/xla/python:enable_plugin_device set to true (defaults to false) to enable this.\n", - "\u001b[2m\u001b[36m(_run pid=394229)\u001b[0m WARNING:jax._src.xla_bridge:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)\n" + "\u001b[2m\u001b[36m(_run pid=3781429)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'cuda': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'\n", + "\u001b[2m\u001b[36m(_run pid=3781429)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'rocm': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'\n", + "\u001b[2m\u001b[36m(_run pid=3781429)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'tpu': INVALID_ARGUMENT: TpuPlatform is not available.\n", + "\u001b[2m\u001b[36m(_run pid=3781429)\u001b[0m INFO:jax._src.xla_bridge:Unable to initialize backend 'plugin': xla_extension has no attributes named get_plugin_device_client. Compile TensorFlow with //tensorflow/compiler/xla/python:enable_plugin_device set to true (defaults to false) to enable this.\n", + "\u001b[2m\u001b[36m(_run pid=3781429)\u001b[0m WARNING:jax._src.xla_bridge:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(_run pid=394229)\u001b[0m [2023-07-04 11:44:41.549] [info] [thread_pool.cc:30] Create a fixed thread pool with size 63\n" + "\u001b[2m\u001b[36m(_run pid=3781429)\u001b[0m [2023-10-07 18:03:21.594] [info] [thread_pool.cc:30] Create a fixed thread pool with size 63\n", + "\u001b[2m\u001b[36m(SPURuntime pid=3788331)\u001b[0m 2023-10-07 18:03:21.846 [info] [default_brpc_retry_policy.cc:LogHttpDetail:29] cntl ErrorCode '112', http status code '200', response header '', error msg '[E111]Fail to connect Socket{id=0 addr=127.0.0.1:58305} (0x0x48efdc0): Connection refused [R1][E112]Not connected to 127.0.0.1:58305 yet, server_id=0'\n", + "\u001b[2m\u001b[36m(SPURuntime pid=3788331)\u001b[0m 2023-10-07 18:03:21.846 [info] [default_brpc_retry_policy.cc:DoRetry:75] aggressive retry, sleep=1000000us and retry\n", + "\u001b[2m\u001b[36m(SPURuntime pid=3788331)\u001b[0m 2023-10-07 18:03:22.846 [info] [default_brpc_retry_policy.cc:LogHttpDetail:29] cntl ErrorCode '112', http status code '200', response header '', error msg '[E111]Fail to connect Socket{id=0 addr=127.0.0.1:58305} (0x0x48efdc0): Connection refused [R1][E112]Not connected to 127.0.0.1:58305 yet, server_id=0 [R2][E112]Not connected to 127.0.0.1:58305 yet, server_id=0'\n", + "\u001b[2m\u001b[36m(SPURuntime pid=3788331)\u001b[0m 2023-10-07 18:03:22.846 [info] [default_brpc_retry_policy.cc:DoRetry:75] aggressive retry, sleep=1000000us and retry\n", + "\u001b[2m\u001b[36m(SPURuntime pid=3788330)\u001b[0m 2023-10-07 18:03:22.849 [info] [default_brpc_retry_policy.cc:DoRetry:69] not retry for reached rcp timeout, ErrorCode '1008', error msg '[E1008]Reached timeout=2000ms @127.0.0.1:34077'\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "INFO:root:Create proxy actor with party bob.\n", - "INFO:root:Create proxy actor with party alice.\n" + "INFO:root:Create proxy actor with party bob.\n", + "INFO:root:Create proxy actor with party alice.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=400041)\u001b[0m 2023-07-04 11:44:41.790 [info] [thread_pool.cc:ThreadPool:30] Create a fixed thread pool with size 63\n", - "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=400034)\u001b[0m 2023-07-04 11:44:41.851 [info] [thread_pool.cc:ThreadPool:30] Create a fixed thread pool with size 63\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=bob) pid=3788331)\u001b[0m 2023-10-07 18:03:25.092 [info] [thread_pool.cc:ThreadPool:30] Create a fixed thread pool with size 63\n", + "\u001b[2m\u001b[36m(SPURuntime(device_id=None, party=alice) pid=3788330)\u001b[0m 2023-10-07 18:03:25.107 [info] [thread_pool.cc:ThreadPool:30] Create a fixed thread pool with size 63\n", " x11 x12 x13 x14 x15 x16 x17 \\\n", "0 0.241531 -0.705729 -0.020094 -0.493792 0.851992 0.035219 -0.796096 \n", "1 -0.402727 0.115744 0.468149 -0.735388 0.386395 0.712798 0.239583 \n", @@ -219,10 +231,10 @@ ], "source": [ "from secretflow.preprocessing.binning.vert_woe_binning import VertWoeBinning\n", - "from secretflow.preprocessing.binning.vert_woe_substitution import VertWOESubstitution\n", + "from secretflow.preprocessing.binning.vert_bin_substitution import VertBinSubstitution\n", "\n", "binning = VertWoeBinning(spu)\n", - "woe_rules = binning.binning(\n", + "bin_rules = binning.binning(\n", " vdf,\n", " binning_method=\"chimerge\",\n", " bin_num=4,\n", @@ -230,8 +242,8 @@ " label_name=\"y\",\n", ")\n", "\n", - "woe_sub = VertWOESubstitution()\n", - "vdf = woe_sub.substitution(vdf, woe_rules)\n", + "woe_sub = VertBinSubstitution()\n", + "vdf = woe_sub.substitution(vdf, bin_rules)\n", "\n", "# this is for demo only, be careful with reveal\n", "print(sf.reveal(vdf.partitions[alice].data))\n", @@ -268,8 +280,8 @@ " \"split_points\": list[float], # left-open right-close split points\n", " \"total_counts\": list[int], # total samples count in each bins.\n", " \"else_counts\": int, # np.nan samples count\n", - " \"woes\": list[float], # woe values for each bins.\n", - " \"else_woe\": float, # woe value for np.nan samples.\n", + " \"filling_values\": list[float], # woe values for each bins.\n", + " \"else_filling_value\": float, # woe value for np.nan samples.\n", " },\n", " # ... others feature\n", " ],\n", @@ -298,7 +310,7 @@ "outputs": [], "source": [ "# alice is label holder\n", - "dict_pyu_object = woe_rules[alice]\n", + "dict_pyu_object = bin_rules[alice]\n", "\n", "\n", "def extract_name_and_feature_iv(list_of_feature_iv_info):\n", @@ -416,7 +428,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.15" + "version": "3.8.17" }, "orig_nbformat": 4 }, diff --git a/secretflow/__init__.py b/secretflow/__init__.py index 96997b82e..e99b738d6 100644 --- a/secretflow/__init__.py +++ b/secretflow/__init__.py @@ -20,7 +20,6 @@ kuscia, ml, preprocessing, - protos, security, utils, ) @@ -44,7 +43,6 @@ from .version import __version__ # type: ignore __all__ = [ - 'protos', 'kuscia', 'data', 'device', diff --git a/secretflow/cli.py b/secretflow/cli.py index fb86acb9f..7d9256868 100644 --- a/secretflow/cli.py +++ b/secretflow/cli.py @@ -21,10 +21,11 @@ import click from google.protobuf.json_format import MessageToJson +from secretflow.spec.v1.data_pb2 import StorageConfig +from secretflow.spec.v1.evaluation_pb2 import NodeEvalParam from secretflow.component.entry import COMP_LIST, COMP_MAP, comp_eval -from secretflow.protos.component.cluster_pb2 import SFClusterConfig -from secretflow.protos.component.evaluation_pb2 import NodeEvalParam +from secretflow.spec.extend.cluster_pb2 import SFClusterConfig from secretflow.utils.logging import LOG_FORMAT, get_logging_level, set_logging_level from secretflow.version import __version__ @@ -134,8 +135,9 @@ def inspect(comp_id, all, file): @click.option("--log_level", required=False, default="INFO") @click.option("--mem_trace", is_flag=True) @click.option("--eval_param", required=True, help="base64ed NodeEvalParam binary") +@click.option("--storage", required=True, help="base64ed Storage binary") @click.option("--cluster", required=True, help="base64ed SFClusterConfig binary") -def run(eval_param, cluster, log_file, result_file, log_level, mem_trace): +def run(eval_param, storage, cluster, log_file, result_file, log_level, mem_trace): def _get_peak_mem() -> float: # only works inside docker # use docker's default cgroup @@ -166,6 +168,8 @@ def _get_peak_mem() -> float: try: eval = NodeEvalParam() eval.ParseFromString(base64.b64decode(eval_param.encode('utf-8'))) + sto = StorageConfig() + sto.ParseFromString(base64.b64decode(storage.encode('utf-8'))) clu = SFClusterConfig() clu.ParseFromString(base64.b64decode(cluster.encode('utf-8'))) except Exception as e: @@ -180,9 +184,9 @@ def _get_peak_mem() -> float: if log_file: with open(log_file, "w") as f: with redirect_stdout(f), redirect_stderr(f): - result = comp_eval(eval, clu, tracer_report=True) + result = comp_eval(eval, sto, clu, tracer_report=True) else: - result = comp_eval(eval, clu, tracer_report=True) + result = comp_eval(eval, sto, clu, tracer_report=True) if mem_trace: ret["mem_peak"] = _get_peak_mem() diff --git a/secretflow/component/component.py b/secretflow/component/component.py index 49039db0d..2e87342c3 100644 --- a/secretflow/component/component.py +++ b/secretflow/component/component.py @@ -23,18 +23,19 @@ import cleantext import spu - -from secretflow.component.data_utils import DistDataType, check_dist_data, check_io_def -from secretflow.component.eval_param_reader import EvalParamReader -from secretflow.device.driver import init, shutdown -from secretflow.protos.component.cluster_pb2 import SFClusterConfig -from secretflow.protos.component.comp_pb2 import ( +from secretflow.spec.v1.component_pb2 import ( AttributeDef, AttrType, ComponentDef, IoDef, ) -from secretflow.protos.component.evaluation_pb2 import NodeEvalParam, NodeEvalResult +from secretflow.spec.v1.data_pb2 import StorageConfig +from secretflow.spec.v1.evaluation_pb2 import NodeEvalParam, NodeEvalResult + +from secretflow.component.data_utils import DistDataType, check_dist_data, check_io_def +from secretflow.component.eval_param_reader import EvalParamReader +from secretflow.device.driver import init, shutdown +from secretflow.spec.extend.cluster_pb2 import SFClusterConfig def clean_text(x: str, no_line_breaks: bool = True) -> str: @@ -256,12 +257,12 @@ def float_attr( attr.atomic.allowed_values.fs.extend(allowed_values) if lower_bound is not None: - attr.atomic.has_lower_bound = True + attr.atomic.lower_bound_enabled = True attr.atomic.lower_bound_inclusive = lower_bound_inclusive attr.atomic.lower_bound.f = lower_bound if upper_bound is not None: - attr.atomic.has_upper_bound = True + attr.atomic.upper_bound_enabled = True attr.atomic.upper_bound_inclusive = upper_bound_inclusive attr.atomic.upper_bound.f = upper_bound @@ -400,12 +401,12 @@ def int_attr( attr.atomic.allowed_values.i64s.extend(allowed_values) if lower_bound is not None: - attr.atomic.has_lower_bound = True + attr.atomic.lower_bound_enabled = True attr.atomic.lower_bound_inclusive = lower_bound_inclusive attr.atomic.lower_bound.i64 = lower_bound if upper_bound is not None: - attr.atomic.has_upper_bound = True + attr.atomic.upper_bound_enabled = True attr.atomic.upper_bound_inclusive = upper_bound_inclusive attr.atomic.upper_bound.i64 = upper_bound @@ -737,9 +738,8 @@ def _setup_sf_cluster(self, config: SFClusterConfig): enable_waiting_for_other_parties_ready=True, ) - def _check_storage(self, config: SFClusterConfig): + def _check_storage(self, storage: StorageConfig): # only local fs is supported at this moment. - storage = config.private_config.storage_config if storage.type and storage.type != "local_fs": raise CompEvalError("only local_fs is supported.") return storage.local_fs.wd @@ -859,6 +859,7 @@ def _extract_device_config(self, config: SFClusterConfig): def eval( self, param: NodeEvalParam, + storage_config: StorageConfig = None, cluster_config: SFClusterConfig = None, tracer_report: bool = False, ) -> Union[NodeEvalResult, Dict]: @@ -879,8 +880,10 @@ def eval( # sanity check on sf config ctx = CompEvalContext() + if storage_config is not None: + ctx.local_fs_wd = self._check_storage(storage_config) + if cluster_config is not None: - ctx.local_fs_wd = self._check_storage(cluster_config) ctx.spu_configs, ctx.heu_config = self._extract_device_config( cluster_config ) diff --git a/secretflow/component/data_utils.py b/secretflow/component/data_utils.py index 4a09cbcc7..4090b9b4c 100644 --- a/secretflow/component/data_utils.py +++ b/secretflow/component/data_utils.py @@ -20,15 +20,8 @@ import numpy as np import pandas as pd - -from secretflow.data.vertical import read_csv -from secretflow.data.vertical.dataframe import VDataFrame -from secretflow.device.device.pyu import PYU, PYUObject -from secretflow.device.device.spu import SPU, SPUObject -from secretflow.device.driver import DeviceObject, wait -from secretflow.protos.component.comp_pb2 import IoDef -from secretflow.protos.component.data_pb2 import ( - DeviceObjectCollection, +from secretflow.spec.v1.component_pb2 import IoDef +from secretflow.spec.v1.data_pb2 import ( DistData, IndividualTable, SystemInfo, @@ -36,6 +29,13 @@ VerticalTable, ) +from secretflow.data.vertical import read_csv +from secretflow.data.vertical.dataframe import VDataFrame +from secretflow.device.device.pyu import PYU, PYUObject +from secretflow.device.device.spu import SPU, SPUObject +from secretflow.device.driver import DeviceObject, wait +from secretflow.spec.extend.data_pb2 import DeviceObjectCollection + class MetaEnum(enum.EnumMeta): def __contains__(cls, item): @@ -64,7 +64,7 @@ class DistDataType(BaseEnum): SS_SGD_MODEL = "sf.model.ss_sgd" SS_GLM_MODEL = "sf.model.ss_glm" SGB_MODEL = "sf.model.sgb" - WOE_RUNNING_RULE = "sf.rule.woe_binning" + BIN_RUNNING_RULE = "sf.rule.binning" SS_XGB_MODEL = "sf.model.ss_xgb" REPORT = "sf.report" @@ -142,7 +142,7 @@ def merge_individuals_to_vtable(srcs: List[DistData], dest: DistData) -> DistDat imeta = IndividualTable() assert s.meta.Unpack(imeta) vmeta.schemas.append(imeta.schema) - vmeta.num_lines = imeta.num_lines + vmeta.line_count = imeta.line_count dest.meta.Pack(vmeta) @@ -267,7 +267,7 @@ def load_table( assert ( parties_path_format[p].format.lower() in DataSetFormatSupported ), f"Illegal path format: {parties_path_format[p].format.lower()}, path format of party {p} should be in DataSetFormatSupported" - # TODO: assert sys_info + # TODO: assert system_info with ctx.tracer.trace_io(): pyus = {p: PYU(p) for p in v_headers} @@ -284,7 +284,7 @@ def load_table( @dataclass class VerticalTableWrapper: - num_lines: int + line_count: int schema_map: Dict[str, TableSchema] def to_vertical_table(self, order: List[str] = None): @@ -293,15 +293,15 @@ def to_vertical_table(self, order: List[str] = None): else: schemas = [self.schema_map[k] for k in order] - return VerticalTable(schemas=schemas, num_lines=self.num_lines) + return VerticalTable(schemas=schemas, line_count=self.line_count) @classmethod - def from_dist_data(cls, data: DistData, num_lines: int = None): + def from_dist_data(cls, data: DistData, line_count: int = None): meta = VerticalTable() assert data.meta.Unpack(meta) return cls( - num_lines=meta.num_lines if num_lines is None else num_lines, + line_count=meta.line_count if line_count is None else line_count, schema_map={ data_ref.party: schema for data_ref, schema in zip(list(data.data_refs), list(meta.schemas)) @@ -314,7 +314,7 @@ def dump_vertical_table( v_data: VDataFrame, uri: str, meta: VerticalTableWrapper, - sys_info: SystemInfo, + system_info: SystemInfo, ) -> DistData: assert isinstance(v_data, VDataFrame) @@ -329,7 +329,7 @@ def dump_vertical_table( ret = DistData( name=uri, type=str(DistDataType.VERTICAL_TABLE), - sys_info=sys_info, + system_info=system_info, data_refs=[ DistData.DataRef(uri=output_uri[p], party=p.party, format="csv") for p in output_uri @@ -349,7 +349,7 @@ def model_dumps( public_info: Any, storage_root: str, dist_data_uri: str, - sys_info: SystemInfo, + system_info: SystemInfo, ) -> DistData: objs_uri = [] objs_party = [] @@ -409,7 +409,7 @@ def dumps(path: str, obj: Any): dist_data = DistData( name=model_name, type=str(model_type), - sys_info=sys_info, + system_info=system_info, data_refs=[ DistData.DataRef(uri=uri, party=p, format="pickle") for uri, p in zip(objs_uri, objs_party) @@ -428,8 +428,8 @@ def model_loads( storage_root: str, pyus: Dict[str, PYU] = None, spu: SPU = None, - # TODO: assert sys_info - # sys_info: SystemInfo = None, + # TODO: assert system_info + # system_info: SystemInfo = None, ) -> Tuple[List[DeviceObject], str]: assert dist_data.type == model_type model_meta = DeviceObjectCollection() @@ -513,7 +513,7 @@ def gen_prediction_csv_meta( label_header: Dict[str, Dict[str, np.dtype]], party: str, pred_name: str, - num_lines: int = None, + line_count: int = None, id_keys: List[str] = None, label_keys: List[str] = None, ) -> IndividualTable: @@ -531,5 +531,5 @@ def gen_prediction_csv_meta( ) + ["float"], ), - num_lines=num_lines if num_lines is not None else -1, + line_count=line_count if line_count is not None else -1, ) diff --git a/secretflow/component/entry.py b/secretflow/component/entry.py index b53140da1..ddff0e3a3 100644 --- a/secretflow/component/entry.py +++ b/secretflow/component/entry.py @@ -12,10 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from secretflow.component.feature.vert_woe_binning import ( - vert_woe_binning_comp, - vert_woe_substitution_comp, -) +from secretflow.component.preprocessing.vert_woe_binning import vert_woe_binning_comp + from secretflow.component.ml.boost.sgb.sgb import sgb_predict_comp, sgb_train_comp from secretflow.component.ml.boost.ss_xgb.ss_xgb import ( ss_xgb_predict_comp, @@ -26,17 +24,22 @@ ) from secretflow.component.ml.eval.prediction_bias_eval import prediction_bias_comp from secretflow.component.ml.eval.ss_pvalue import ss_pvalue_comp +from secretflow.component.ml.linear.ss_glm import ss_glm_predict_comp, ss_glm_train_comp from secretflow.component.ml.linear.ss_sgd import ss_sgd_predict_comp, ss_sgd_train_comp from secretflow.component.preprocessing.feature_filter import feature_filter_comp +from secretflow.component.preprocessing.vert_binning import ( + vert_binning_comp, + vert_bin_substitution_comp, +) from secretflow.component.preprocessing.psi import psi_comp from secretflow.component.preprocessing.train_test_split import train_test_split_comp from secretflow.component.stats.ss_pearsonr import ss_pearsonr_comp from secretflow.component.stats.ss_vif import ss_vif_comp from secretflow.component.stats.table_statistics import table_statistics_comp -from secretflow.protos.component.cluster_pb2 import SFClusterConfig -from secretflow.protos.component.comp_pb2 import CompListDef, ComponentDef -from secretflow.protos.component.evaluation_pb2 import NodeEvalParam, NodeEvalResult -from secretflow.component.ml.linear.ss_glm import ss_glm_predict_comp, ss_glm_train_comp +from secretflow.spec.extend.cluster_pb2 import SFClusterConfig +from secretflow.spec.v1.data_pb2 import StorageConfig +from secretflow.spec.v1.evaluation_pb2 import NodeEvalParam, NodeEvalResult +from secretflow.spec.v1.component_pb2 import CompListDef, ComponentDef ALL_COMPONENTS = [ train_test_split_comp, @@ -44,8 +47,9 @@ ss_sgd_train_comp, ss_sgd_predict_comp, feature_filter_comp, + vert_binning_comp, vert_woe_binning_comp, - vert_woe_substitution_comp, + vert_bin_substitution_comp, ss_vif_comp, ss_pearsonr_comp, ss_pvalue_comp, @@ -97,12 +101,15 @@ def get_comp_def(domain: str, name: str, version: str) -> ComponentDef: # FIXME(junfeng): Should load storage config from .sf_storage JSON file def comp_eval( param: NodeEvalParam, + storage_config: StorageConfig, cluster_config: SFClusterConfig, tracer_report: bool = False, ) -> NodeEvalResult: key = gen_key(param.domain, param.name, param.version) if key in COMP_MAP: comp = COMP_MAP[key] - return comp.eval(param, cluster_config, tracer_report=tracer_report) + return comp.eval( + param, storage_config, cluster_config, tracer_report=tracer_report + ) else: raise RuntimeError("component is not found.") diff --git a/secretflow/component/eval_param_reader.py b/secretflow/component/eval_param_reader.py index a70425664..70132043f 100644 --- a/secretflow/component/eval_param_reader.py +++ b/secretflow/component/eval_param_reader.py @@ -14,14 +14,14 @@ import math -from secretflow.protos.component.comp_pb2 import ( +from secretflow.spec.v1.component_pb2 import ( Attribute, AttributeDef, AttrType, ComponentDef, IoDef, ) -from secretflow.protos.component.evaluation_pb2 import NodeEvalParam +from secretflow.spec.v1.evaluation_pb2 import NodeEvalParam class EvalParamError(Exception): @@ -48,7 +48,7 @@ def check_allowed_values(value: Attribute, definition: AttributeDef): def check_lower_bound(value: Attribute, definition: AttributeDef): - if not definition.atomic.has_lower_bound: + if not definition.atomic.lower_bound_enabled: return True if definition.type == AttrType.AT_FLOAT: return value.f > definition.atomic.lower_bound.f or ( @@ -64,7 +64,7 @@ def check_lower_bound(value: Attribute, definition: AttributeDef): def check_upper_bound(value: Attribute, definition: AttributeDef): - if not definition.atomic.has_upper_bound: + if not definition.atomic.upper_bound_enabled: return True if definition.type == AttrType.AT_FLOAT: return value.f < definition.atomic.upper_bound.f or ( @@ -92,7 +92,7 @@ def check_table_attr_col_cnt(value: Attribute, definition: IoDef.TableAttrDef): def get_value(value: Attribute, at: AttrType): - if at == AttrType.AT_UNDEFINED: + if at == AttrType.ATTR_TYPE_UNSPECIFIED: raise EvalParamError("Type of Attribute is undefined.") elif at == AttrType.AT_FLOAT: return value.f @@ -201,7 +201,7 @@ def _preprocess(self): self._instance_inputs[input_def.name] = input_instance for input_attr in input_def.attrs: - if len(input_attr.attrs): + if len(input_attr.extra_attrs): raise EvalParamError( "extra attribute is unsupported at this moment." ) diff --git a/secretflow/component/i18n.py b/secretflow/component/i18n.py index 21560465c..76ad57f67 100644 --- a/secretflow/component/i18n.py +++ b/secretflow/component/i18n.py @@ -18,9 +18,9 @@ import json import click +from secretflow.spec.v1.component_pb2 import CompListDef from secretflow.component.entry import COMP_LIST, gen_key -from secretflow.protos.component.comp_pb2 import CompListDef ROOT = "." @@ -72,7 +72,7 @@ def restore_from_archives(text, key, archives=None): for t_attr in io.attrs: text[t_attr.name] = "" text[t_attr.desc] = "" - for t_attr_a in t_attr.attrs: + for t_attr_a in t_attr.extra_attrs: text[t_attr_a.name] = "" text[t_attr_a.desc] = "" diff --git a/secretflow/component/ml/boost/sgb/sgb.py b/secretflow/component/ml/boost/sgb/sgb.py index a496e6250..4f8b3caac 100644 --- a/secretflow/component/ml/boost/sgb/sgb.py +++ b/secretflow/component/ml/boost/sgb/sgb.py @@ -14,6 +14,8 @@ import json import os +from secretflow.spec.v1.data_pb2 import DistData + from secretflow.component.component import Component, IoType from secretflow.component.data_utils import ( DistDataType, @@ -29,7 +31,6 @@ from secretflow.device.driver import wait from secretflow.ml.boost.sgb_v import Sgb, SgbModel from secretflow.ml.boost.sgb_v.model import from_dict -from secretflow.protos.component.data_pb2 import DistData sgb_train_comp = Component( "sgb_train", @@ -387,7 +388,7 @@ def sgb_train_eval_fn( json.dumps(m_dict), ctx.local_fs_wd, output_model, - train_dataset.sys_info, + train_dataset.system_info, ) return {"output_model": model_db} @@ -559,7 +560,7 @@ def sgb_predict_eval_fn( label_header=label_header_map, party=receiver, pred_name=pred_name, - num_lines=x.shape[0], + line_count=x.shape[0], id_keys=id_header, label_keys=label_header, ) diff --git a/secretflow/component/ml/boost/ss_xgb/ss_xgb.py b/secretflow/component/ml/boost/ss_xgb/ss_xgb.py index 4e2478a7c..4375af3b4 100644 --- a/secretflow/component/ml/boost/ss_xgb/ss_xgb.py +++ b/secretflow/component/ml/boost/ss_xgb/ss_xgb.py @@ -14,6 +14,8 @@ import json import os +from secretflow.spec.v1.data_pb2 import DistData + from secretflow.component.component import CompEvalError, Component, IoType from secretflow.component.data_utils import ( DistDataType, @@ -29,7 +31,6 @@ from secretflow.device.driver import wait from secretflow.ml.boost.ss_xgb_v import Xgb, XgbModel from secretflow.ml.boost.ss_xgb_v.core.node_split import RegType -from secretflow.protos.component.data_pb2 import DistData ss_xgb_train_comp = Component( "ss_xgb_train", @@ -231,7 +232,7 @@ def ss_xgb_train_eval_fn( json.dumps(m_dict), ctx.local_fs_wd, output_model, - train_dataset.sys_info, + train_dataset.system_info, ) return {"output_model": model_db} @@ -416,7 +417,7 @@ def ss_xgb_predict_eval_fn( label_header=label_header_map, party=receiver, pred_name=pred_name, - num_lines=x.shape[0], + line_count=x.shape[0], id_keys=id_header, label_keys=label_header, ) diff --git a/secretflow/component/ml/eval/biclassification_eval.py b/secretflow/component/ml/eval/biclassification_eval.py index 95948c649..85b232877 100644 --- a/secretflow/component/ml/eval/biclassification_eval.py +++ b/secretflow/component/ml/eval/biclassification_eval.py @@ -12,12 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +from secretflow.spec.v1.component_pb2 import Attribute +from secretflow.spec.v1.data_pb2 import DistData +from secretflow.spec.v1.report_pb2 import Descriptions, Div, Report, Tab, Table + from secretflow.component.component import Component, IoType, TableColParam from secretflow.component.data_utils import DistDataType, load_table from secretflow.device.driver import reveal -from secretflow.protos.component.comp_pb2 import Attribute -from secretflow.protos.component.data_pb2 import DistData -from secretflow.protos.component.report_pb2 import Descriptions, Div, Report, Tab, Table from secretflow.stats.biclassification_eval import BiClassificationEval biclassification_eval_comp = Component( @@ -131,13 +132,15 @@ def biclassification_eval_fn( ).get_all_reports() ) - return {"reports": dump_biclassification_reports(reports, labels.sys_info, result)} + return { + "reports": dump_biclassification_reports(reports, labels.system_info, result) + } -def dump_biclassification_reports(name, sys_info, reports): +def dump_biclassification_reports(name, system_info, reports): ret = DistData( name=name, - sys_info=sys_info, + system_info=system_info, type=str(DistDataType.REPORT), ) diff --git a/secretflow/component/ml/eval/prediction_bias_eval.py b/secretflow/component/ml/eval/prediction_bias_eval.py index 2eaf5151c..5f4871669 100644 --- a/secretflow/component/ml/eval/prediction_bias_eval.py +++ b/secretflow/component/ml/eval/prediction_bias_eval.py @@ -12,12 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +from secretflow.spec.v1.component_pb2 import Attribute +from secretflow.spec.v1.data_pb2 import DistData +from secretflow.spec.v1.report_pb2 import Div, Report, Tab, Table + from secretflow.component.component import Component, IoType, TableColParam from secretflow.component.data_utils import DistDataType, load_table from secretflow.device.driver import reveal -from secretflow.protos.component.comp_pb2 import Attribute -from secretflow.protos.component.data_pb2 import DistData -from secretflow.protos.component.report_pb2 import Div, Report, Tab, Table from secretflow.stats.core.prediction_bias_core import PredictionBiasReport from secretflow.stats.prediction_bias_eval import prediction_bias_eval @@ -93,7 +94,7 @@ ) -def dump_report(name, sys_info, report: PredictionBiasReport) -> DistData: +def dump_report(name, system_info, report: PredictionBiasReport) -> DistData: table = Table( name="Prediction Bias Table", desc="Calculate prediction bias, ie. average of predictions - average of labels.", @@ -185,7 +186,7 @@ def gen_interval_str(left_endpoint, left_closed, right_endpoint, right_closed): report_dd = DistData( name=name, type=str(DistDataType.REPORT), - sys_info=sys_info, + system_info=system_info, ) report_dd.meta.Pack(report_meta) @@ -230,4 +231,4 @@ def prediction_bias_eval_fn( ) ) - return {"result": dump_report(result, labels.sys_info, res)} + return {"result": dump_report(result, labels.system_info, res)} diff --git a/secretflow/component/ml/eval/ss_pvalue.py b/secretflow/component/ml/eval/ss_pvalue.py index ba91a4041..c9247ce65 100644 --- a/secretflow/component/ml/eval/ss_pvalue.py +++ b/secretflow/component/ml/eval/ss_pvalue.py @@ -13,14 +13,14 @@ # limitations under the License. import numpy as np +from secretflow.spec.v1.component_pb2 import Attribute +from secretflow.spec.v1.data_pb2 import DistData +from secretflow.spec.v1.report_pb2 import Descriptions, Div, Report, Tab from secretflow.component.component import CompEvalError, Component, IoType from secretflow.component.data_utils import DistDataType, load_table from secretflow.component.ml.linear.ss_sgd import load_ss_sgd_model from secretflow.device.device.spu import SPU -from secretflow.protos.component.comp_pb2 import Attribute -from secretflow.protos.component.data_pb2 import DistData -from secretflow.protos.component.report_pb2 import Descriptions, Div, Report, Tab from secretflow.stats.ss_pvalue_v import PValue ss_pvalue_comp = Component( @@ -121,7 +121,7 @@ def ss_pearsonr_eval_fn( report_dd = DistData( name=report, type=str(DistDataType.REPORT), - sys_info=input_data.sys_info, + system_info=input_data.system_info, ) report_dd.meta.Pack(report_mate) diff --git a/secretflow/component/ml/linear/ss_glm.py b/secretflow/component/ml/linear/ss_glm.py index edac7c7a2..b7644fc9b 100644 --- a/secretflow/component/ml/linear/ss_glm.py +++ b/secretflow/component/ml/linear/ss_glm.py @@ -14,14 +14,11 @@ import json import os - from typing import Tuple -from secretflow.component.component import ( - CompEvalError, - Component, - IoType, -) +from secretflow.spec.v1.data_pb2 import DistData + +from secretflow.component.component import CompEvalError, Component, IoType from secretflow.component.data_utils import ( DistDataType, extract_table_header, @@ -35,8 +32,7 @@ from secretflow.device.device.spu import SPU, SPUObject from secretflow.device.driver import wait from secretflow.ml.linear import SSGLM -from secretflow.ml.linear.ss_glm.core import get_link, Linker -from secretflow.protos.component.data_pb2 import DistData +from secretflow.ml.linear.ss_glm.core import Linker, get_link ss_glm_train_comp = Component( "ss_glm_train", @@ -295,7 +291,7 @@ def ss_glm_train_eval_fn( json.dumps(model_meta), local_fs_wd, output_model, - train_dataset.sys_info, + train_dataset.system_info, ) return {"output_model": model_db} @@ -488,7 +484,7 @@ def ss_glm_predict_eval_fn( label_header=label_header_map, party=receiver, pred_name=pred_name, - num_lines=x.shape[0], + line_count=x.shape[0], id_keys=id_header, label_keys=label_header, ) diff --git a/secretflow/component/ml/linear/ss_sgd.py b/secretflow/component/ml/linear/ss_sgd.py index e2b94a52b..4155a4017 100644 --- a/secretflow/component/ml/linear/ss_sgd.py +++ b/secretflow/component/ml/linear/ss_sgd.py @@ -15,6 +15,8 @@ import json import os +from secretflow.spec.v1.data_pb2 import DistData + from secretflow.component.component import CompEvalError, Component, IoType from secretflow.component.data_utils import ( DistDataType, @@ -29,7 +31,6 @@ from secretflow.device.device.spu import SPU, SPUObject from secretflow.device.driver import wait from secretflow.ml.linear import LinearModel, RegType, SSRegression -from secretflow.protos.component.data_pb2 import DistData from secretflow.utils.sigmoid import SigType ss_sgd_train_comp = Component( @@ -192,7 +193,7 @@ def ss_sgd_train_eval_fn( json.dumps(model_meta), local_fs_wd, output_model, - train_dataset.sys_info, + train_dataset.system_info, ) return {"output_model": model_db} @@ -383,7 +384,7 @@ def ss_sgd_predict_eval_fn( label_header=label_header_map, party=receiver, pred_name=pred_name, - num_lines=x.shape[0], + line_count=x.shape[0], id_keys=id_header, label_keys=label_header, ) diff --git a/secretflow/component/preprocessing/condition_filter.py b/secretflow/component/preprocessing/condition_filter.py index 1bf33f7cc..26e8cc756 100644 --- a/secretflow/component/preprocessing/condition_filter.py +++ b/secretflow/component/preprocessing/condition_filter.py @@ -123,7 +123,7 @@ def condition_filter_comp_eval_fn( ds, out_ds, VerticalTableWrapper.from_dist_data(in_ds, ds.shape[0]), - in_ds.sys_info, + in_ds.system_info, ) out_db_else = dump_vertical_table( @@ -131,7 +131,7 @@ def condition_filter_comp_eval_fn( else_ds, out_ds_else, VerticalTableWrapper.from_dist_data(in_ds, else_ds.shape[0]), - in_ds.sys_info, + in_ds.system_info, ) return {"out_ds": out_db, "out_ds_else": out_db_else} diff --git a/secretflow/component/preprocessing/feature_filter.py b/secretflow/component/preprocessing/feature_filter.py index 3b6affda1..3e9fa4e78 100644 --- a/secretflow/component/preprocessing/feature_filter.py +++ b/secretflow/component/preprocessing/feature_filter.py @@ -14,10 +14,11 @@ import os +from secretflow.spec.v1.data_pb2 import DistData, TableSchema, VerticalTable + from secretflow.component.component import Component, IoType, TableColParam from secretflow.component.data_utils import DistDataType, load_table from secretflow.device.driver import wait -from secretflow.protos.component.data_pb2 import DistData, TableSchema, VerticalTable feature_filter_comp = Component( "feature_filter", @@ -51,7 +52,7 @@ def feature_filter_eval_fn(*, ctx, in_ds, in_ds_drop_features, out_ds): in_ds.meta.Unpack(in_meta) out_meta = VerticalTable() - out_meta.num_lines = in_meta.num_lines + out_meta.line_count = in_meta.line_count for s in in_meta.schemas: s_meta = TableSchema() diff --git a/secretflow/component/preprocessing/psi.py b/secretflow/component/preprocessing/psi.py index a2d3faca9..95e6c023a 100644 --- a/secretflow/component/preprocessing/psi.py +++ b/secretflow/component/preprocessing/psi.py @@ -15,6 +15,8 @@ import os from typing import List +from secretflow.spec.v1.data_pb2 import DistData, IndividualTable, VerticalTable + from secretflow.component.component import ( CompEvalError, Component, @@ -29,11 +31,6 @@ ) from secretflow.device.device.pyu import PYU from secretflow.device.device.spu import SPU -from secretflow.protos.component.data_pb2 import ( - DistData, - IndividualTable, - VerticalTable, -) psi_comp = Component( "psi", @@ -49,6 +46,13 @@ default_value="ECDH_PSI_2PC", allowed_values=["ECDH_PSI_2PC", "KKRT_PSI_2PC", "BC22_PSI_2PC"], ) +psi_comp.bool_attr( + name="sort", + desc="Sort the output.", + is_list=False, + is_optional=True, + default_value=False, +) psi_comp.int_attr( name="bucket_size", desc="Specify the hash bucket size used in PSI. Larger values consume more memory.", @@ -135,7 +139,7 @@ def modify_schema(x: DistData, keys: List[str]) -> DistData: new_meta.schema.labels.extend(list(imeta.schema.labels)) new_meta.schema.label_types.extend(list(imeta.schema.label_types)) - new_meta.num_lines = imeta.num_lines + new_meta.line_count = imeta.line_count new_x.meta.Pack(new_meta) @@ -147,6 +151,7 @@ def two_party_balanced_psi_eval_fn( *, ctx, protocol, + sort, bucket_size, ecdh_curve_type, receiver_input, @@ -207,7 +212,7 @@ def two_party_balanced_psi_eval_fn( sender_pyu: os.path.join(local_fs_wd, psi_output), }, receiver=receiver_party, - sort=False, + sort=sort, protocol=protocol, bucket_size=bucket_size, curve_type=ecdh_curve_type, @@ -216,7 +221,7 @@ def two_party_balanced_psi_eval_fn( output_db = DistData( name=psi_output, type=str(DistDataType.VERTICAL_TABLE), - sys_info=receiver_input.sys_info, + system_info=receiver_input.system_info, data_refs=[ DistData.DataRef( uri=psi_output, @@ -240,7 +245,7 @@ def two_party_balanced_psi_eval_fn( ) vmeta = VerticalTable() assert output_db.meta.Unpack(vmeta) - vmeta.num_lines = intersection_count + vmeta.line_count = intersection_count output_db.meta.Pack(vmeta) return {"psi_output": output_db} diff --git a/secretflow/component/preprocessing/train_test_split.py b/secretflow/component/preprocessing/train_test_split.py index 459443d5c..03e5983db 100644 --- a/secretflow/component/preprocessing/train_test_split.py +++ b/secretflow/component/preprocessing/train_test_split.py @@ -121,7 +121,7 @@ def train_test_split_eval_fn( train_df, train, VerticalTableWrapper.from_dist_data(input_data, train_df.shape[0]), - input_data.sys_info, + input_data.system_info, ) test_db = dump_vertical_table( @@ -129,7 +129,7 @@ def train_test_split_eval_fn( test_df, test, VerticalTableWrapper.from_dist_data(input_data, test_df.shape[0]), - input_data.sys_info, + input_data.system_info, ) return {"train": train_db, "test": test_db} diff --git a/secretflow/component/preprocessing/vert_binning.py b/secretflow/component/preprocessing/vert_binning.py new file mode 100644 index 000000000..960a2a4ee --- /dev/null +++ b/secretflow/component/preprocessing/vert_binning.py @@ -0,0 +1,205 @@ +# Copyright 2023 Ant Group Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from secretflow.component.component import ( + Component, + IoType, + TableColParam, +) +from secretflow.component.data_utils import ( + DistDataType, + VerticalTableWrapper, + dump_vertical_table, + load_table, + model_dumps, + model_loads, +) + +from secretflow.device.device.pyu import PYUObject +from secretflow.preprocessing.binning.vert_binning import VertBinning +from secretflow.preprocessing.binning.vert_bin_substitution import VertBinSubstitution + +vert_binning_comp = Component( + "vert_binning", + domain="feature", + version="0.0.1", + desc="Generate equal frequency or equal range binning rules for vertical partitioning datasets.", +) + +vert_binning_comp.str_attr( + name="binning_method", + desc="How to bin features with numeric types: " + '"quantile"(equal frequency)/"eq_range"(equal range)', + is_list=False, + is_optional=True, + default_value="eq_range", + allowed_values=["eq_range", "quantile"], +) +vert_binning_comp.int_attr( + name="bin_num", + desc="Max bin counts for one features.", + is_list=False, + is_optional=True, + default_value=10, + lower_bound=0, + lower_bound_inclusive=False, +) + +vert_binning_comp.io( + io_type=IoType.INPUT, + name="input_data", + desc="Input vertical table.", + types=[DistDataType.VERTICAL_TABLE], + col_params=[ + TableColParam( + name="feature_selects", + desc="which features should be binned.", + col_min_cnt_inclusive=1, + ) + ], +) +vert_binning_comp.io( + io_type=IoType.OUTPUT, + name="bin_rule", + desc="Output bin rule.", + types=[DistDataType.BIN_RUNNING_RULE], + col_params=None, +) + +# current version 0.1 +MODEL_MAX_MAJOR_VERSION = 0 +MODEL_MAX_MINOR_VERSION = 1 + + +@vert_binning_comp.eval_fn +def vert_binning_eval_fn( + *, + ctx, + binning_method, + bin_num, + input_data, + input_data_feature_selects, + bin_rule, +): + input_df = load_table( + ctx, + input_data, + load_features=True, + feature_selects=input_data_feature_selects, + load_labels=False, + ) + + with ctx.tracer.trace_running(): + bining = VertBinning() + col_index = input_df._col_index(input_data_feature_selects) + bin_names = { + party: bins if isinstance(bins, list) else [bins] + for party, bins in col_index.items() + } + rules = bining.binning(input_df, binning_method, bin_num, bin_names) + + model_dist_data = model_dumps( + "bin_rule", + DistDataType.BIN_RUNNING_RULE, + MODEL_MAX_MAJOR_VERSION, + MODEL_MAX_MINOR_VERSION, + [o for o in rules.values()], + input_data_feature_selects, + ctx.local_fs_wd, + bin_rule, + input_data.system_info, + ) + + return {"bin_rule": model_dist_data} + + +vert_bin_substitution_comp = Component( + "vert_bin_substitution", + domain="feature", + version="0.0.1", + desc="Substitute datasets' value by bin substitution rules.", +) + +vert_bin_substitution_comp.io( + io_type=IoType.INPUT, + name="input_data", + desc="Vertical partitioning dataset to be substituted.", + types=[DistDataType.VERTICAL_TABLE], + col_params=None, +) +vert_bin_substitution_comp.io( + io_type=IoType.INPUT, + name="bin_rule", + desc="Input bin substitution rule.", + types=[DistDataType.BIN_RUNNING_RULE], + col_params=None, +) +vert_bin_substitution_comp.io( + io_type=IoType.OUTPUT, + name="output_data", + desc="Output vertical table.", + types=[DistDataType.VERTICAL_TABLE], + col_params=None, +) + + +@vert_bin_substitution_comp.eval_fn +def vert_bin_substitution_eval_fn( + *, + ctx, + input_data, + bin_rule, + output_data, +): + input_df = load_table( + ctx, input_data, load_features=True, load_labels=True, load_ids=True + ) + + pyus = {p.party: p for p in input_df.partitions} + + model_objs, feature_selects = model_loads( + bin_rule, + MODEL_MAX_MAJOR_VERSION, + MODEL_MAX_MINOR_VERSION, + DistDataType.BIN_RUNNING_RULE, + ctx.local_fs_wd, + pyus=pyus, + ) + + bin_rule = {} + for obj in model_objs: + assert isinstance(obj, PYUObject) + bin_rule[obj.device] = obj + + with ctx.tracer.trace_running(): + output_df = VertBinSubstitution().substitution(input_df, bin_rule) + + vt_wrapper = VerticalTableWrapper.from_dist_data(input_data, output_df.shape[0]) + + # modify types of feature_selects to float + for v in vt_wrapper.schema_map.values(): + for i, f in enumerate(list(v.features)): + if f in feature_selects: + v.feature_types[i] = 'float' + + return { + "output_data": dump_vertical_table( + ctx, + output_df, + output_data, + vt_wrapper, + input_data.system_info, + ) + } diff --git a/secretflow/component/feature/vert_woe_binning.py b/secretflow/component/preprocessing/vert_woe_binning.py similarity index 72% rename from secretflow/component/feature/vert_woe_binning.py rename to secretflow/component/preprocessing/vert_woe_binning.py index 8a42a545f..276907949 100644 --- a/secretflow/component/feature/vert_woe_binning.py +++ b/secretflow/component/preprocessing/vert_woe_binning.py @@ -23,18 +23,13 @@ ) from secretflow.component.data_utils import ( DistDataType, - VerticalTableWrapper, - dump_vertical_table, extract_table_header, load_table, model_dumps, - model_loads, ) from secretflow.device.device.heu import HEU -from secretflow.device.device.pyu import PYUObject from secretflow.device.device.spu import SPU from secretflow.preprocessing.binning.vert_woe_binning import VertWoeBinning -from secretflow.preprocessing.binning.vert_woe_substitution import VertWOESubstitution vert_woe_binning_comp = Component( "vert_woe_binning", @@ -121,9 +116,9 @@ ) vert_woe_binning_comp.io( io_type=IoType.OUTPUT, - name="woe_rule", + name="bin_rule", desc="Output WOE rule.", - types=[DistDataType.WOE_RUNNING_RULE], + types=[DistDataType.BIN_RUNNING_RULE], col_params=None, ) @@ -145,7 +140,7 @@ def vert_woe_binning_eval_fn( chimerge_target_pvalue, input_data, input_data_feature_selects, - woe_rule, + bin_rule, ): input_df = load_table( ctx, @@ -208,95 +203,15 @@ def vert_woe_binning_eval_fn( ) model_dist_data = model_dumps( - "woe_rule", - DistDataType.WOE_RUNNING_RULE, + "bin_rule", + DistDataType.BIN_RUNNING_RULE, MODEL_MAX_MAJOR_VERSION, MODEL_MAX_MINOR_VERSION, [o for o in rules.values()], input_data_feature_selects, ctx.local_fs_wd, - woe_rule, - input_data.sys_info, + bin_rule, + input_data.system_info, ) - return {"woe_rule": model_dist_data} - - -vert_woe_substitution_comp = Component( - "vert_woe_substitution", - domain="feature", - version="0.0.1", - desc="Substitute datasets' value by WOE substitution rules.", -) - -vert_woe_substitution_comp.io( - io_type=IoType.INPUT, - name="input_data", - desc="Vertical partitioning dataset to be substituted.", - types=[DistDataType.VERTICAL_TABLE], - col_params=None, -) -vert_woe_substitution_comp.io( - io_type=IoType.INPUT, - name="woe_rule", - desc="Input WOE substitution rule.", - types=[DistDataType.WOE_RUNNING_RULE], - col_params=None, -) -vert_woe_substitution_comp.io( - io_type=IoType.OUTPUT, - name="output_data", - desc="Output vertical table.", - types=[DistDataType.VERTICAL_TABLE], - col_params=None, -) - - -@vert_woe_substitution_comp.eval_fn -def vert_woe_substitution_eval_fn( - *, - ctx, - input_data, - woe_rule, - output_data, -): - input_df = load_table( - ctx, input_data, load_features=True, load_labels=True, load_ids=True - ) - - pyus = {p.party: p for p in input_df.partitions} - - model_objs, feature_selects = model_loads( - woe_rule, - MODEL_MAX_MAJOR_VERSION, - MODEL_MAX_MINOR_VERSION, - DistDataType.WOE_RUNNING_RULE, - ctx.local_fs_wd, - pyus=pyus, - ) - - woe_rule = {} - for obj in model_objs: - assert isinstance(obj, PYUObject) - woe_rule[obj.device] = obj - - with ctx.tracer.trace_running(): - output_df = VertWOESubstitution().substitution(input_df, woe_rule) - - vt_wrapper = VerticalTableWrapper.from_dist_data(input_data, output_df.shape[0]) - - # modify types of feature_selects to float - for v in vt_wrapper.schema_map.values(): - for i, f in enumerate(list(v.features)): - if f in feature_selects: - v.feature_types[i] = 'float' - - return { - "output_data": dump_vertical_table( - ctx, - output_df, - output_data, - vt_wrapper, - input_data.sys_info, - ) - } + return {"bin_rule": model_dist_data} diff --git a/secretflow/component/stats/ss_pearsonr.py b/secretflow/component/stats/ss_pearsonr.py index 2f8f928aa..31bb0396a 100644 --- a/secretflow/component/stats/ss_pearsonr.py +++ b/secretflow/component/stats/ss_pearsonr.py @@ -13,6 +13,9 @@ # limitations under the License. import numpy as np +from secretflow.spec.v1.component_pb2 import Attribute +from secretflow.spec.v1.data_pb2 import DistData +from secretflow.spec.v1.report_pb2 import Div, Report, Tab, Table from secretflow.component.component import ( CompEvalError, @@ -22,9 +25,6 @@ ) from secretflow.component.data_utils import DistDataType, load_table from secretflow.device.device.spu import SPU -from secretflow.protos.component.comp_pb2 import Attribute -from secretflow.protos.component.data_pb2 import DistData -from secretflow.protos.component.report_pb2 import Div, Report, Tab, Table from secretflow.stats.ss_pearsonr_v import PearsonR ss_pearsonr_comp = Component( @@ -125,7 +125,7 @@ def ss_pearsonr_eval_fn( report_dd = DistData( name=report, type=str(DistDataType.REPORT), - sys_info=input_data.sys_info, + system_info=input_data.system_info, ) report_dd.meta.Pack(report_mate) diff --git a/secretflow/component/stats/ss_vif.py b/secretflow/component/stats/ss_vif.py index d71051f2b..8b79210e7 100644 --- a/secretflow/component/stats/ss_vif.py +++ b/secretflow/component/stats/ss_vif.py @@ -13,6 +13,9 @@ # limitations under the License. import numpy as np +from secretflow.spec.v1.component_pb2 import Attribute +from secretflow.spec.v1.data_pb2 import DistData +from secretflow.spec.v1.report_pb2 import Descriptions, Div, Report, Tab from secretflow.component.component import ( CompEvalError, @@ -22,9 +25,6 @@ ) from secretflow.component.data_utils import DistDataType, load_table from secretflow.device.device.spu import SPU -from secretflow.protos.component.comp_pb2 import Attribute -from secretflow.protos.component.data_pb2 import DistData -from secretflow.protos.component.report_pb2 import Descriptions, Div, Report, Tab from secretflow.stats.ss_vif_v import VIF ss_vif_comp = Component( @@ -121,7 +121,7 @@ def ss_vif_eval_fn( report_dd = DistData( name=report, type=str(DistDataType.REPORT), - sys_info=input_data.sys_info, + system_info=input_data.system_info, ) report_dd.meta.Pack(report_mate) diff --git a/secretflow/component/stats/table_statistics.py b/secretflow/component/stats/table_statistics.py index 627cce48f..8d4368558 100644 --- a/secretflow/component/stats/table_statistics.py +++ b/secretflow/component/stats/table_statistics.py @@ -13,12 +13,12 @@ # limitations under the License. import pandas as pd +from secretflow.spec.v1.component_pb2 import Attribute +from secretflow.spec.v1.data_pb2 import DistData +from secretflow.spec.v1.report_pb2 import Div, Report, Tab, Table from secretflow.component.component import Component, IoType from secretflow.component.data_utils import DistDataType, load_table -from secretflow.protos.component.comp_pb2 import Attribute -from secretflow.protos.component.data_pb2 import DistData -from secretflow.protos.component.report_pb2 import Div, Report, Tab, Table from secretflow.stats.table_statistics import table_statistics table_statistics_comp = Component( @@ -110,11 +110,11 @@ def gen_table_statistic_report(df: pd.DataFrame) -> Report: ) -def dump_table_statistics(name, sys_info, df: pd.DataFrame) -> DistData: +def dump_table_statistics(name, system_info, df: pd.DataFrame) -> DistData: report_mate = gen_table_statistic_report(df) res = DistData( name=name, - sys_info=sys_info, + system_info=system_info, type=str(DistDataType.REPORT), data_refs=[], ) @@ -131,4 +131,4 @@ def table_statistics_eval_fn(*, ctx, input_data, report): with ctx.tracer.trace_running(): stat = table_statistics(input_df) - return {"report": dump_table_statistics(report, input_data.sys_info, stat)} + return {"report": dump_table_statistics(report, input_data.system_info, stat)} diff --git a/secretflow/data/__init__.py b/secretflow/data/__init__.py index cb8a99a34..575b75df4 100644 --- a/secretflow/data/__init__.py +++ b/secretflow/data/__init__.py @@ -12,16 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -from . import horizontal, vertical, mix, partition, io +from . import core, horizontal, io, mix, ndarray, vertical +from .core import Partition, partition from .ndarray import FedNdarray, PartitionWay __all__ = [ 'horizontal', 'vertical', 'mix', - 'partition', + 'core', 'io', 'ndarray', 'FedNdarray', 'PartitionWay', + 'partition', + 'Partition', ] diff --git a/secretflow/data/base.py b/secretflow/data/base.py index 37348b63f..879c98c46 100644 --- a/secretflow/data/base.py +++ b/secretflow/data/base.py @@ -11,98 +11,116 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + import logging from abc import ABC, abstractmethod -from typing import List, Union - -from jax.tree_util import tree_map +from typing import Callable, List, Union -from secretflow.device import PYUObject, reveal +import numpy as np +import pandas as pd class DataFrameBase(ABC): - """Abstract base class for horizontal, vertical and mixed partitioned DataFrame""" + """Abstract base class for: + - API Level: horizontal, vertical and mixed partitioned DataFrame; + - Partition Level: Partition and Partition agent; + - Partitin implement Level: PdPartDataFrame or other implements. + """ @abstractmethod - def __getitem__(self, item): - """Get columns""" + def __getitem__(self, item) -> "DataFrameBase": + """Get items from indicate columns""" pass @abstractmethod def __setitem__(self, key, value): - """Set columns""" + """Set item from indicate columns""" pass @abstractmethod - def sum(self, *args, **kwargs): - """Returns the sum of the values over the requested axis.""" + def __len__(self): + """Get len of the dataframe""" pass @abstractmethod - def min(self, *args, **kwargs): - """Gets minimum value of all columns""" + def columns(self) -> list: + """Returns the column names of the DataFrame.""" pass @abstractmethod - def max(self, *args, **kwargs): - """Gets maximum value of all columns""" + def dtypes(self) -> dict: + """Returns the dtypes in the DataFrame. + Returns dict always. + """ pass @abstractmethod - def count(self, *args, **kwargs): - """Gets number of rows""" + def shape(self) -> tuple: + """Returns a tuple representing the dimensionality of the DataFrame.""" pass @abstractmethod - def values(self): - """Get underlying ndarray""" + def index(self) -> list: + """Returns the index (row labels) of the DataFrame.""" + pass + + @abstractmethod + def count(self, *args, **kwargs) -> pd.Series: + """Count non-NA cells for each column.""" + pass + + @abstractmethod + def sum(self, *args, **kwargs) -> pd.Series: + """Returns the sum of the values over the requested axis.""" + pass + + @abstractmethod + def min(self, *args, **kwargs) -> pd.Series: + """Gets minimum value of all columns""" + pass + + @abstractmethod + def max(self, *args, **kwargs) -> pd.Series: + """Gets maximum value of all columns""" pass @abstractmethod - def mean(self, *args, **kwargs): + def mean(self, *args, **kwargs) -> pd.Series: """Returns the mean of the values over the requested axis.""" pass @abstractmethod - def var(self, *args, **kwargs): + def var(self, *args, **kwargs) -> pd.Series: """Returns the variance of the values over the requested axis.""" pass @abstractmethod - def std(self, *args, **kwargs): + def std(self, *args, **kwargs) -> pd.Series: """Returns the standard deviation of the values over the requested axis.""" pass @abstractmethod - def sem(self, *args, **kwargs): + def sem(self, *args, **kwargs) -> pd.Series: """Returns the standard error of the mean over the requested axis.""" pass @abstractmethod - def skew(self, *args, **kwargs): + def skew(self, *args, **kwargs) -> pd.Series: """Returns the skewness over the requested axis.""" pass @abstractmethod - def kurtosis(self, *args, **kwargs): + def kurtosis(self, *args, **kwargs) -> pd.Series: """Returns the kurtosis over the requested axis.""" pass @abstractmethod - def replace(self, *args, **kwargs): - """Replace values given in to_replace with value. - Same as pandas.DataFrame.replace - Values of the DataFrame are replaced with other values dynamically. - """ - pass - - @abstractmethod - def quantile(self, q=0.5, axis=0): + def quantile(self, *args, **kwargs) -> pd.Series: """Returns values at the given quantile over requested axis.""" pass @abstractmethod - def mode(self, *args, **kwargs): + def mode(self, *args, **kwargs) -> pd.Series: """Returns the mode of the values over the requested axis. For data protection reasons, only one mode will be returned. @@ -111,7 +129,17 @@ def mode(self, *args, **kwargs): pass @abstractmethod - def isna(self): + def value_counts(self, *args, **kwargs) -> pd.Series: + """Return a Series containing counts of unique values.""" + pass + + @abstractmethod + def values(self) -> np.ndarray: + """Get underlying ndarray""" + pass + + @abstractmethod + def isna(self) -> "DataFrameBase": """Detects missing values for an array-like object. Same as pandas.DataFrame.isna Returns @@ -121,13 +149,17 @@ def isna(self): pass @abstractmethod - def dtypes(self) -> dict: - """Returns the dtypes in the DataFrame.""" - # return series always. + def replace(self, *args, **kwargs) -> "DataFrameBase": + """Replace values given in to_replace with value. + Same as pandas.DataFrame.replace + Values of the DataFrame are replaced with other values dynamically. + """ pass @abstractmethod - def astype(self, dtype, copy: bool = True, errors: str = "raise"): + def astype( + self, dtype, copy: bool = True, errors: str = "raise" + ) -> "DataFrameBase": """ Cast a pandas object to a specified dtype ``dtype``. @@ -136,17 +168,7 @@ def astype(self, dtype, copy: bool = True, errors: str = "raise"): pass @abstractmethod - def columns(self): - """Returns the column labels of the DataFrame.""" - pass - - @abstractmethod - def shape(self): - """Returns a tuple representing the dimensionality of the DataFrame.""" - pass - - @abstractmethod - def copy(self): + def copy(self) -> "DataFrameBase": """Shallow copy.""" pass @@ -160,7 +182,7 @@ def drop( level=None, inplace=False, errors='raise', - ): + ) -> "DataFrameBase": pass @abstractmethod @@ -172,7 +194,7 @@ def fillna( inplace=False, limit=None, downcast=None, - ) -> Union['PartitionBase', None]: + ) -> Union['DataFrameBase', None]: pass @abstractmethod @@ -180,44 +202,15 @@ def to_csv(self, filepath, **kwargs): """Save DataFrame to csv file.""" pass - -class PartitionBase(DataFrameBase): - """Slice of data that makes up horizontal, vertical and mixed partitioned DataFrame. - - Attributes: - data (PYUObject): Reference to pandas.DataFrame located in local node. - """ - - def __init__(self, data: PYUObject = None, backend="pandas"): - self.data = data - self.backend = backend - - def __unwrap(self, args, kwargs): - new_args = tree_map(lambda x: x.data if (type(x) == type(self)) else x, args) - new_kwargs = tree_map( - lambda x: x.data if (type(x) == type(self)) else x, kwargs - ) - return new_args, new_kwargs - - @reveal - def __len__(self): - """Returns the number of rows.""" - return self.data.device(lambda df: len(df))(self.data) - - @abstractmethod - def index(self): - """Returns the index (row labels) of the DataFrame.""" - pass - @abstractmethod - def iloc(self, index: Union[int, slice, List[int]]) -> 'PartitionBase': + def iloc(self, index: Union[int, slice, List[int]]) -> 'DataFrameBase': """Integer-location based indexing for selection by position. Args: index (Union[int, slice, List[int]]): rows index. Returns: - PartitionBase: Selected DataFrame. + DataFrameBase: Selected DataFrame. """ pass @@ -232,18 +225,13 @@ def rename( inplace=False, level=None, errors='ignore', - ) -> Union['PartitionBase', None]: + ) -> Union['DataFrameBase', None]: """See :py:meth:`pandas.DataFrame.rename`""" pass @abstractmethod - def value_counts(self, *args, **kwargs) -> 'PartitionBase': - """Return a Series containing counts of unique values.""" - pass - - @abstractmethod - def pow(self, *args, **kwargs) -> 'PartitionBase': + def pow(self, *args, **kwargs) -> 'DataFrameBase': """Gets Exponential power of (partition of) dataframe and other, element-wise (binary operator pow). Equivalent to dataframe ** other, but with support to substitute a fill_value for missing data in one of the inputs. With reverse version, rpow. @@ -252,17 +240,17 @@ def pow(self, *args, **kwargs) -> 'PartitionBase': pass @abstractmethod - def round(self, *args, **kwargs) -> 'PartitionBase': + def round(self, *args, **kwargs) -> 'DataFrameBase': """Round the (partition of) DataFrame to a variable number of decimal places.""" pass @abstractmethod - def select_dtypes(self, *args, **kwargs) -> 'PartitionBase': + def select_dtypes(self, *args, **kwargs) -> 'DataFrameBase': """Returns a subset of the DataFrame's columns based on the column dtypes.""" pass @abstractmethod - def subtract(self, *args, **kwargs) -> 'PartitionBase': + def subtract(self, *args, **kwargs) -> 'DataFrameBase': """Gets Subtraction of (partition of) dataframe and other, element-wise (binary operator sub). Equivalent to dataframe - other, but with support to substitute a fill_value for missing data in one of the inputs. With reverse version, rsub. @@ -271,32 +259,37 @@ def subtract(self, *args, **kwargs) -> 'PartitionBase': pass @abstractmethod - def to_pandas(self): + def apply_func( + self, func: Callable, *, nums_return: int = 1, **kwargs + ) -> 'DataFrameBase': + """ + Apply any function inside the dataframe actor. + Please make sure the function retures a dataframe type same as the type of the self.data + Args: + func: any function, with first argument must be dataframe itself. + nums_return: the return nums, defualt to 1. + kwargs: contains the dataframe executed by func, the dataframe should be real df like pandas. + Returns: + A Partition with data applyed by this function. + """ pass - -def partition(data, backend="pandas") -> PartitionBase: - """Construct a Parititon with input data and backend. - Default backend is Pandas, support Polars as well. - """ - if backend == "pandas": - from secretflow.data.partition.pandas.partition import PdPartition - - return PdPartition(data) - elif backend == "polars": - from secretflow.data.partition.polars.partition import PolarsPartition - - logging.warning( - "Currently, polars is still an experimental version. Please use it with caution." - ) - return PolarsPartition(data) - else: - raise RuntimeError(f"Unknown backend {backend}") + @abstractmethod + def to_pandas(self) -> 'DataFrameBase': + """ + Convert myself to pandas type. + Returns: + A DataFrameBase within pandas type. + """ + pass def Partition(data): """For compatibility of some legacy cases, which should be replaced by function partition().""" logging.warning( - "Constructor 'Partition' is deperated, please use partition(data) instead." + "Constructor 'Partition' is deperated, please use 'partition(data)' instead and import like 'from " + "secretflow.data.partition import partition'." ) + from .core import partition + return partition(data) diff --git a/secretflow/protos/component/__init__.py b/secretflow/data/core/__init__.py similarity index 86% rename from secretflow/protos/component/__init__.py rename to secretflow/data/core/__init__.py index 724997cbd..ef16c2129 100644 --- a/secretflow/protos/component/__init__.py +++ b/secretflow/data/core/__init__.py @@ -11,3 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from .partition import Partition, partition + +__all__ = [ + "partition", + "Partition", +] diff --git a/secretflow/data/core/agent.py b/secretflow/data/core/agent.py new file mode 100644 index 000000000..1b05d4cdd --- /dev/null +++ b/secretflow/data/core/agent.py @@ -0,0 +1,392 @@ +# Copyright 2022 Ant Group Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Callable, Dict, List, Union + +import numpy as np +import pandas as pd + +from secretflow.device import PYUObject, proxy + +from .base import AgentIndex, PartDataFrameBase, PartitionAgentBase +from .pandas import PdPartDataFrame +from .polars import PlPartDataFrame + + +def partition_data(source, backend="pandas") -> "PartDataFrameBase": + if backend == "pandas": + return PdPartDataFrame(source) + elif backend == "polars": + from secretflow.data.core.polars import PlPartDataFrame + + return PlPartDataFrame(source) + else: + raise RuntimeError(f"Unknown backend {backend}") + + +@proxy(PYUObject) +class PartitionAgent(PartitionAgentBase): + """A partition agent actor. + In order to achieve the best performance and prevent copying PYUObject, + Each party resides with a partition agent actor during the data process phase. + The agent actor create in initialization (such as read_csv) and is shared among multiple partitions. + The relation between partition and partition agent like this: + Partition0 ... ... pd.DataFrame + . . + Partition1 .... PartitionAgent ... pd.DataFrame + . (actor) . + Partition2 ... ... pd.DataFrame + """ + + working_objects: Dict[AgentIndex, PartDataFrameBase] + + def __init__(self): + super().__init__() + self.cur_id = 0 + self.working_objects = {} + + def append_data( + self, source: Union[Callable, Any], backend="pandas", **kwargs + ) -> AgentIndex: + cur_idx = self.__next_agent_index() + if callable(source): + source = source(**kwargs) + data = partition_data(source, backend) + self.working_objects[cur_idx] = data + return cur_idx + + def __next_agent_index(self): + self.cur_id += 1 + return AgentIndex(self.cur_id - 1) + + def get_data(self, idx: AgentIndex): + # returns pd.DataFrame or any dataframe backend. + return self.working_objects[idx].get_data() + + def del_object(self, idx: AgentIndex): + if idx in self.working_objects: + del self.working_objects[idx] + + def get_backend(self, idx: AgentIndex) -> str: + working_object = self.working_objects[idx] + if isinstance(working_object, PdPartDataFrame): + return "pandas" + elif isinstance(working_object, PlPartDataFrame): + return "polars" + return "unknown" + + def __getitem__(self, idx: AgentIndex, item) -> AgentIndex: + working_object = self.working_objects[idx] + data = working_object.__getitem__(item) + cur_idx = self.__next_agent_index() + self.working_objects[cur_idx] = data + return cur_idx + + def __setitem__(self, idx: AgentIndex, key, value: Union[AgentIndex, PYUObject]): + working_object = self.working_objects[idx] + if isinstance(value, AgentIndex): + value = self.working_objects[value] + + working_object.__setitem__(key, value) + + def __len__(self, idx: AgentIndex) -> int: + working_object = self.working_objects[idx] + return working_object.__len__() + + def columns(self, idx: AgentIndex) -> list: + working_object = self.working_objects[idx] + return working_object.columns() + + def dtypes(self, idx: AgentIndex) -> dict: + working_object = self.working_objects[idx] + + return working_object.dtypes() + + def shape(self, idx: AgentIndex) -> tuple: + working_object = self.working_objects[idx] + return working_object.shape() + + def index(self, idx: AgentIndex) -> list: + working_object = self.working_objects[idx] + return working_object.index() + + def count(self, idx: AgentIndex, *args, **kwargs) -> pd.Series: + """Count non-NA cells for each column.""" + working_object = self.working_objects[idx] + return working_object.count(*args, **kwargs) + + def sum(self, idx: AgentIndex, *args, **kwargs) -> pd.Series: + """Returns the sum of the values over the requested axis.""" + working_object = self.working_objects[idx] + return working_object.sum(*args, **kwargs) + + def min(self, idx: AgentIndex, *args, **kwargs) -> pd.Series: + """Gets minimum value of all columns""" + working_object = self.working_objects[idx] + return working_object.min(*args, **kwargs) + + def max(self, idx: AgentIndex, *args, **kwargs) -> pd.Series: + """Gets maximum value of all columns""" + working_object = self.working_objects[idx] + return working_object.max(*args, **kwargs) + + def mean(self, idx: AgentIndex, *args, **kwargs) -> pd.Series: + """Returns the mean of the values over the requested axis.""" + working_object = self.working_objects[idx] + return working_object.mean(*args, **kwargs) + + def var(self, idx: AgentIndex, *args, **kwargs) -> pd.Series: + """Returns the variance of the values over the requested axis.""" + working_object = self.working_objects[idx] + return working_object.var(*args, **kwargs) + + def std(self, idx: AgentIndex, *args, **kwargs) -> pd.Series: + """Returns the standard deviation of the values over the requested axis.""" + working_object = self.working_objects[idx] + return working_object.std(*args, **kwargs) + + def sem(self, idx: AgentIndex, *args, **kwargs) -> pd.Series: + """Returns the standard error of the mean over the requested axis.""" + working_object = self.working_objects[idx] + return working_object.sem(*args, **kwargs) + + def skew(self, idx: AgentIndex, *args, **kwargs) -> pd.Series: + """Returns the skewness over the requested axis.""" + working_object = self.working_objects[idx] + return working_object.skew(*args, **kwargs) + + def kurtosis(self, idx: AgentIndex, *args, **kwargs) -> pd.Series: + """Returns the kurtosis over the requested axis.""" + working_object = self.working_objects[idx] + return working_object.kurtosis(*args, **kwargs) + + def quantile(self, idx: AgentIndex, *args, **kwargs) -> pd.Series: + """Returns values at the given quantile over requested axis.""" + working_object = self.working_objects[idx] + return working_object.quantile(*args, **kwargs) + + def mode(self, idx: AgentIndex, *args, **kwargs) -> pd.Series: + """Returns the mode of the values over the requested axis. + + For data protection reasons, only one mode will be returned. + """ + + working_object = self.working_objects[idx] + return working_object.mode(*args, **kwargs) + + def value_counts(self, idx: AgentIndex, *args, **kwargs) -> pd.Series: + """Return a Series containing counts of unique values.""" + working_object = self.working_objects[idx] + return working_object.value_counts(*args, **kwargs) + + def values(self, idx: AgentIndex) -> np.ndarray: + """Get underlying ndarray""" + working_object = self.working_objects[idx] + return working_object.values() + + def isna(self, idx: AgentIndex) -> AgentIndex: + """Detects missing values for an array-like object. + Same as pandas.DataFrame.isna + Returns + Mask of bool values for each element in DataFrame + that indicates whether an element is an NA value. + """ + working_object = self.working_objects[idx] + data = working_object.isna() + cur_idx = self.__next_agent_index() + self.working_objects[cur_idx] = data + return cur_idx + + def replace(self, idx: AgentIndex, *args, **kwargs) -> AgentIndex: + """Replace values given in to_replace with value. + Same as pandas.DataFrame.replace + Values of the DataFrame are replaced with other values dynamically. + """ + working_object = self.working_objects[idx] + data = working_object.replace(*args, **kwargs) + cur_idx = self.__next_agent_index() + self.working_objects[cur_idx] = data + return cur_idx + + def astype( + self, idx: AgentIndex, dtype, copy: bool = True, errors: str = "raise" + ) -> AgentIndex: + """ + Cast a pandas object to a specified dtype ``dtype``. + + All args are same as :py:meth:`pandas.DataFrame.astype`. + """ + working_object = self.working_objects[idx] + data = working_object.astype(dtype, copy, errors) + if copy: + cur_idx = self.__next_agent_index() + self.working_objects[cur_idx] = data + return cur_idx + else: + return idx + + def copy(self, idx: AgentIndex) -> AgentIndex: + """Shallow copy.""" + working_object = self.working_objects[idx] + cur_idx = self.__next_agent_index() + self.working_objects[cur_idx] = working_object.copy() + return cur_idx + + def drop( + self, + idx: AgentIndex, + labels=None, + axis=0, + index=None, + columns=None, + level=None, + inplace=False, + errors='raise', + ) -> AgentIndex: + working_object = self.working_objects[idx] + data = working_object.drop(labels, axis, index, columns, level, inplace, errors) + if not inplace: + cur_idx = self.__next_agent_index() + self.working_objects[cur_idx] = data + return cur_idx + else: + return idx + + def fillna( + self, + idx: AgentIndex, + value=None, + method=None, + axis=None, + inplace=False, + limit=None, + downcast=None, + ) -> Union[AgentIndex, None]: + working_object = self.working_objects[idx] + data = working_object.fillna(value, method, axis, inplace, limit, downcast) + if not inplace: + cur_idx = self.__next_agent_index() + self.working_objects[cur_idx] = data + return cur_idx + else: + return idx + + def to_csv(self, idx: AgentIndex, filepath, **kwargs): + """Save DataFrame to csv file.""" + working_object = self.working_objects[idx] + working_object.to_csv(filepath, **kwargs) + + def iloc(self, idx: AgentIndex, index: Union[int, slice, List[int]]) -> AgentIndex: + working_object = self.working_objects[idx] + data = working_object.iloc(index) + cur_idx = self.__next_agent_index() + self.working_objects[cur_idx] = data + return cur_idx + + def rename( + self, + idx: AgentIndex, + mapper=None, + index=None, + columns=None, + axis=None, + copy=True, + inplace=False, + level=None, + errors='ignore', + ) -> Union[AgentIndex, None]: + """See :py:meth:`pandas.DataFrame.rename`""" + + working_object = self.working_objects[idx] + data = working_object.rename( + mapper, index, columns, axis, copy, inplace, level, errors + ) + if not inplace: + cur_idx = self.__next_agent_index() + self.working_objects[cur_idx] = data + return cur_idx + else: + return idx + + def pow(self, idx: AgentIndex, *args, **kwargs) -> AgentIndex: + """Gets Exponential power of (partition of) dataframe and other, element-wise (binary operator pow). + Equivalent to dataframe ** other, but with support to substitute a fill_value for missing data in one of the inputs. + With reverse version, rpow. + Among flexible wrappers (add, sub, mul, div, mod, pow) to arithmetic operators: +, -, *, /, //, %, **. + """ + working_object = self.working_objects[idx] + data = working_object.pow(*args, **kwargs) + cur_idx = self.__next_agent_index() + self.working_objects[cur_idx] = data + return cur_idx + + def round(self, idx: AgentIndex, *args, **kwargs) -> AgentIndex: + """Round the (partition of) DataFrame to a variable number of decimal places.""" + working_object = self.working_objects[idx] + data = working_object.round(*args, **kwargs) + cur_idx = self.__next_agent_index() + self.working_objects[cur_idx] = data + return cur_idx + + def select_dtypes(self, idx: AgentIndex, *args, **kwargs) -> AgentIndex: + """Returns a subset of the DataFrame's columns based on the column dtypes.""" + working_object = self.working_objects[idx] + data = working_object.select_dtypes(*args, **kwargs) + cur_idx = self.__next_agent_index() + self.working_objects[cur_idx] = data + return cur_idx + + def subtract(self, idx: AgentIndex, other, *args, **kwargs) -> AgentIndex: + """Gets Subtraction of (partition of) dataframe and other, element-wise (binary operator sub). + Equivalent to dataframe - other, but with support to substitute a fill_value for missing data in one of the inputs. + With reverse version, rsub. + Among flexible wrappers (add, sub, mul, div, mod, pow) to arithmetic operators: +, -, *, /, //, %, **. + """ + if isinstance(other, AgentIndex): + other = self.working_objects[other] + working_object = self.working_objects[idx] + data = working_object.subtract(other, *args, **kwargs) + cur_idx = self.__next_agent_index() + self.working_objects[cur_idx] = data + return cur_idx + + def apply_func( + self, idx: AgentIndex, func: Callable, *, nums_return: int = 1, **kwargs + ) -> Union[AgentIndex, List[AgentIndex]]: + working_object = self.working_objects[idx] + data = working_object.apply_func(func, nums_return=nums_return, **kwargs) + if nums_return != 1: + assert isinstance(data, list) and len(data) == nums_return + ret = [] + for d in data: + cur_idx = self.__next_agent_index() + self.working_objects[cur_idx] = d + ret.append(cur_idx) + return ret + else: + cur_idx = self.__next_agent_index() + self.working_objects[cur_idx] = data + return cur_idx + + def to_pandas(self, idx: AgentIndex) -> AgentIndex: + """ + Convert myself to pandas type. + Returns: + A DataFrameBase within pandas type. + """ + working_object = self.working_objects[idx] + data = working_object.to_pandas() + cur_idx = self.__next_agent_index() + self.working_objects[cur_idx] = data + return cur_idx diff --git a/secretflow/data/core/base.py b/secretflow/data/core/base.py new file mode 100644 index 000000000..0dfd8b8e1 --- /dev/null +++ b/secretflow/data/core/base.py @@ -0,0 +1,604 @@ +# Copyright 2022 Ant Group Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from typing import Callable, List, Union + +import numpy as np +import pandas as pd + +from ...device import PYUObject +from ..base import DataFrameBase + + +class AgentIndex: + def __init__(self, idx: int): + self.idx = idx + + def __eq__(self, other: 'AgentIndex'): + return self.idx == other.idx + + def __hash__(self): + return self.idx.__hash__() + + def __str__(self): + return f"AgentIndex: {self.idx}" + + def __repr__(self): + return f"AgentIndex: {self.idx}" + + +class PartitionAgentBase: + """ + Abstract PartDataFrame Base class. + The implement of this class should add @proxy(PYUObject) as an actor. + Therefore, all function who returns something will return PYUObject instead. + """ + + @abstractmethod + def append_data( + self, source: Union[Callable, 'AnyDataFrame'], backend="pandas", **kwargs + ) -> AgentIndex: + """ + Append data or construct data into this remote agent. + Args: + source: the source to construct part dataframe + It can be a Callable func type which takes kwargs as parameters and retures a real dataframe. + or, it can be any dataframe (like pd.DataFrame). + backend: The partition backend, default to pandas. + **kwargs: if the source is a function, use kwargs as its parameters. + + Returns: + AgentIndex of the generated dataframe. + """ + pass + + @abstractmethod + def get_data(self, idx: AgentIndex) -> PYUObject: + """ + Get the real data of the idx inside the partition agent. + Args: + idx: the data's index + + Returns: + the real dataframe such as pd.DataFrame. + """ + pass + + @abstractmethod + def del_object(self, idx: PYUObject): + pass + + @abstractmethod + def get_backend(self, idx: AgentIndex) -> PYUObject: + pass + + @abstractmethod + def __getitem__(self, idx: AgentIndex, item) -> PYUObject: + """Get columns""" + pass + + @abstractmethod + def __setitem__(self, idx: AgentIndex, key, value: Union[AgentIndex, PYUObject]): + """Set columns""" + pass + + @abstractmethod + def __len__(self, idx: AgentIndex) -> PYUObject: + pass + + @abstractmethod + def columns(self, idx: AgentIndex) -> PYUObject: + """Returns the column labels of the DataFrame.""" + pass + + @abstractmethod + def dtypes(self, idx: AgentIndex) -> PYUObject: + """Returns the dtypes in the DataFrame.""" + # return series always. + pass + + @abstractmethod + def shape(self, idx: AgentIndex) -> PYUObject: + """Returns a tuple representing the dimensionality of the DataFrame.""" + pass + + @abstractmethod + def index(self, idx: AgentIndex) -> PYUObject: + """Returns the index (row labels) of the DataFrame.""" + pass + + @abstractmethod + def count(self, idx: AgentIndex, *args, **kwargs) -> PYUObject: + """Count non-NA cells for each column.""" + pass + + @abstractmethod + def sum(self, idx: AgentIndex, *args, **kwargs) -> PYUObject: + """Returns the sum of the values over the requested axis.""" + pass + + @abstractmethod + def min(self, idx: AgentIndex, *args, **kwargs) -> PYUObject: + """Gets minimum value of all columns""" + pass + + @abstractmethod + def max(self, idx: AgentIndex, *args, **kwargs) -> PYUObject: + """Gets maximum value of all columns""" + pass + + @abstractmethod + def mean(self, idx: AgentIndex, *args, **kwargs) -> PYUObject: + """Returns the mean of the values over the requested axis.""" + pass + + @abstractmethod + def var(self, idx: AgentIndex, *args, **kwargs) -> PYUObject: + """Returns the variance of the values over the requested axis.""" + pass + + @abstractmethod + def std(self, idx: AgentIndex, *args, **kwargs) -> PYUObject: + """Returns the standard deviation of the values over the requested axis.""" + pass + + @abstractmethod + def sem(self, idx: AgentIndex, *args, **kwargs) -> PYUObject: + """Returns the standard error of the mean over the requested axis.""" + pass + + @abstractmethod + def skew(self, idx: AgentIndex, *args, **kwargs) -> PYUObject: + """Returns the skewness over the requested axis.""" + pass + + @abstractmethod + def kurtosis(self, idx: AgentIndex, *args, **kwargs) -> PYUObject: + """Returns the kurtosis over the requested axis.""" + pass + + @abstractmethod + def quantile(self, idx: AgentIndex, *args, **kwargs) -> PYUObject: + """Returns values at the given quantile over requested axis.""" + pass + + @abstractmethod + def mode(self, idx: AgentIndex, *args, **kwargs) -> PYUObject: + """Returns the mode of the values over the requested axis. + + For data protection reasons, only one mode will be returned. + """ + + pass + + @abstractmethod + def value_counts(self, idx: AgentIndex, *args, **kwargs) -> PYUObject: + """Return a Series containing counts of unique values.""" + pass + + @abstractmethod + def values(self, idx: AgentIndex) -> PYUObject: + """Get underlying ndarray""" + pass + + @abstractmethod + def isna(self, idx: AgentIndex) -> PYUObject: + """Detects missing values for an array-like object. + Same as pandas.DataFrame.isna + Returns + Mask of bool values for each element in DataFrame + that indicates whether an element is an NA value. + """ + pass + + @abstractmethod + def replace(self, idx: AgentIndex, *args, **kwargs) -> PYUObject: + """Replace values given in to_replace with value. + Same as pandas.DataFrame.replace + Values of the DataFrame are replaced with other values dynamically. + """ + pass + + @abstractmethod + def astype( + self, idx: AgentIndex, dtype, copy: bool = True, errors: str = "raise" + ) -> PYUObject: + """ + Cast a pandas object to a specified dtype ``dtype``. + + All args are same as :py:meth:`pandas.DataFrame.astype`. + """ + pass + + @abstractmethod + def copy(self, idx: AgentIndex) -> PYUObject: + """Shallow copy.""" + pass + + @abstractmethod + def drop( + self, + idx: AgentIndex, + labels=None, + axis=0, + index=None, + columns=None, + level=None, + inplace=False, + errors='raise', + ) -> PYUObject: + pass + + @abstractmethod + def fillna( + self, + idx: AgentIndex, + value=None, + method=None, + axis=None, + inplace=False, + limit=None, + downcast=None, + ) -> Union[PYUObject, None]: + pass + + @abstractmethod + def to_csv(self, idx: AgentIndex, filepath, **kwargs): + """Save DataFrame to csv file.""" + pass + + @abstractmethod + def iloc(self, idx: AgentIndex, index: Union[int, slice, List[int]]) -> PYUObject: + """Integer-location based indexing for selection by position. + + Args: + idx: data index in agent. + index (Union[int, slice, List[int]]): rows index. + + Returns: + Selected DataFrame. + """ + pass + + @abstractmethod + def rename( + self, + idx: AgentIndex, + mapper=None, + index=None, + columns=None, + axis=None, + copy=True, + inplace=False, + level=None, + errors='ignore', + ) -> Union[PYUObject, None]: + """See :py:meth:`pandas.DataFrame.rename`""" + + pass + + @abstractmethod + def pow(self, idx: AgentIndex, *args, **kwargs) -> PYUObject: + """Gets Exponential power of (partition of) dataframe and other, element-wise (binary operator pow). + Equivalent to dataframe ** other, but with support to substitute a fill_value for missing data in one of the inputs. + With reverse version, rpow. + Among flexible wrappers (add, sub, mul, div, mod, pow) to arithmetic operators: +, -, *, /, //, %, **. + """ + pass + + @abstractmethod + def round(self, idx: AgentIndex, *args, **kwargs) -> PYUObject: + """Round the (partition of) DataFrame to a variable number of decimal places.""" + pass + + @abstractmethod + def select_dtypes(self, idx: AgentIndex, *args, **kwargs) -> PYUObject: + """Returns a subset of the DataFrame's columns based on the column dtypes.""" + pass + + @abstractmethod + def subtract(self, idx: AgentIndex, other, *args, **kwargs) -> PYUObject: + """Gets Subtraction of (partition of) dataframe and other, element-wise (binary operator sub). + Equivalent to dataframe - other, but with support to substitute a fill_value for missing data in one of the inputs. + With reverse version, rsub. + Among flexible wrappers (add, sub, mul, div, mod, pow) to arithmetic operators: +, -, *, /, //, %, **. + """ + pass + + @abstractmethod + def apply_func( + self, idx: AgentIndex, func: Callable, *, nums_return: int = 1, **kwargs + ) -> PYUObject: + """ + Apply any function inside the dataframe actor. + Please make sure the function retures a dataframe type same as the type of the self.data + Args: + idx: the agent index to indicate which data to apply func. + func: any function, with first argument must be dataframe itself. + nums_return: nums to return, default to 1. + kwargs: contains the dataframe executed by func, the dataframe should be real df like pandas. + Returns: + A Partition with data applyed by this function. + """ + pass + + @abstractmethod + def to_pandas(self, idx: AgentIndex) -> PYUObject: + """ + Convert myself to pandas type. + Returns: + A DataFrameBase within pandas type. + """ + pass + + +class PartDataFrameBase(DataFrameBase, ABC): + @abstractmethod + def get_data(self) -> Union["pd.DataFrame", "pl.DataFrame"]: + pass + + @abstractmethod + def __getitem__(self, item) -> "PartDataFrameBase": + """Get columns""" + pass + + @abstractmethod + def __setitem__(self, key, value): + """Set columns""" + pass + + @abstractmethod + def __len__(self): + pass + + @abstractmethod + def columns(self) -> list: + """Returns the column labels of the DataFrame.""" + pass + + @abstractmethod + def dtypes(self) -> dict: + """Returns the dtypes in the DataFrame.""" + # return series always. + pass + + @abstractmethod + def shape(self) -> tuple: + """Returns a tuple representing the dimensionality of the DataFrame.""" + pass + + @abstractmethod + def index(self) -> list: + """Returns the index (row labels) of the DataFrame.""" + pass + + @abstractmethod + def count(self, *args, **kwargs) -> pd.Series: + """Count non-NA cells for each column.""" + pass + + @abstractmethod + def sum(self, *args, **kwargs) -> pd.Series: + """Returns the sum of the values over the requested axis.""" + pass + + @abstractmethod + def min(self, *args, **kwargs) -> pd.Series: + """Gets minimum value of all columns""" + pass + + @abstractmethod + def max(self, *args, **kwargs) -> pd.Series: + """Gets maximum value of all columns""" + pass + + @abstractmethod + def mean(self, *args, **kwargs) -> pd.Series: + """Returns the mean of the values over the requested axis.""" + pass + + @abstractmethod + def var(self, *args, **kwargs) -> pd.Series: + """Returns the variance of the values over the requested axis.""" + pass + + @abstractmethod + def std(self, *args, **kwargs) -> pd.Series: + """Returns the standard deviation of the values over the requested axis.""" + pass + + @abstractmethod + def sem(self, *args, **kwargs) -> pd.Series: + """Returns the standard error of the mean over the requested axis.""" + pass + + @abstractmethod + def skew(self, *args, **kwargs) -> pd.Series: + """Returns the skewness over the requested axis.""" + pass + + @abstractmethod + def kurtosis(self, *args, **kwargs) -> pd.Series: + """Returns the kurtosis over the requested axis.""" + pass + + @abstractmethod + def quantile(self, *args, **kwargs) -> pd.Series: + """Returns values at the given quantile over requested axis.""" + pass + + @abstractmethod + def mode(self, *args, **kwargs) -> pd.Series: + """Returns the mode of the values over the requested axis. + + For data protection reasons, only one mode will be returned. + """ + + pass + + @abstractmethod + def value_counts(self, *args, **kwargs) -> pd.Series: + """Return a Series containing counts of unique values.""" + pass + + @abstractmethod + def values(self) -> np.ndarray: + """Get underlying ndarray""" + pass + + @abstractmethod + def isna(self) -> "PartDataFrameBase": + """Detects missing values for an array-like object. + Same as pandas.DataFrame.isna + Returns + Mask of bool values for each element in DataFrame + that indicates whether an element is an NA value. + """ + pass + + @abstractmethod + def replace(self, *args, **kwargs) -> "PartDataFrameBase": + """Replace values given in to_replace with value. + Same as pandas.DataFrame.replace + Values of the DataFrame are replaced with other values dynamically. + """ + pass + + @abstractmethod + def astype( + self, dtype, copy: bool = True, errors: str = "raise" + ) -> "PartDataFrameBase": + """ + Cast a pandas object to a specified dtype ``dtype``. + + All args are same as :py:meth:`pandas.DataFrame.astype`. + """ + pass + + @abstractmethod + def copy(self) -> "PartDataFrameBase": + """Shallow copy.""" + pass + + @abstractmethod + def drop( + self, + labels=None, + axis=0, + index=None, + columns=None, + level=None, + inplace=False, + errors='raise', + ) -> "PartDataFrameBase": + pass + + @abstractmethod + def fillna( + self, + value=None, + method=None, + axis=None, + inplace=False, + limit=None, + downcast=None, + ) -> Union['PartDataFrameBase', None]: + pass + + @abstractmethod + def to_csv(self, filepath, **kwargs): + """Save DataFrame to csv file.""" + pass + + @abstractmethod + def iloc(self, index: Union[int, slice, List[int]]) -> 'PartDataFrameBase': + """Integer-location based indexing for selection by position. + + Args: + index (Union[int, slice, List[int]]): rows index. + + Returns: + DataFrameBase: Selected DataFrame. + """ + pass + + @abstractmethod + def rename( + self, + mapper=None, + index=None, + columns=None, + axis=None, + copy=True, + inplace=False, + level=None, + errors='ignore', + ) -> Union['PartDataFrameBase', None]: + """See :py:meth:`pandas.DataFrame.rename`""" + + pass + + @abstractmethod + def pow(self, *args, **kwargs) -> 'PartDataFrameBase': + """Gets Exponential power of (partition of) dataframe and other, element-wise (binary operator pow). + Equivalent to dataframe ** other, but with support to substitute a fill_value for missing data in one of the inputs. + With reverse version, rpow. + Among flexible wrappers (add, sub, mul, div, mod, pow) to arithmetic operators: +, -, *, /, //, %, **. + """ + pass + + @abstractmethod + def round(self, *args, **kwargs) -> 'PartDataFrameBase': + """Round the (partition of) DataFrame to a variable number of decimal places.""" + pass + + @abstractmethod + def select_dtypes(self, *args, **kwargs) -> 'PartDataFrameBase': + """Returns a subset of the DataFrame's columns based on the column dtypes.""" + pass + + @abstractmethod + def subtract(self, *args, **kwargs) -> 'PartDataFrameBase': + """Gets Subtraction of (partition of) dataframe and other, element-wise (binary operator sub). + Equivalent to dataframe - other, but with support to substitute a fill_value for missing data in one of the inputs. + With reverse version, rsub. + Among flexible wrappers (add, sub, mul, div, mod, pow) to arithmetic operators: +, -, *, /, //, %, **. + """ + pass + + @abstractmethod + def apply_func( + self, func: Callable, *, nums_return: int = 1, **kwargs + ) -> 'PartDataFrameBase': + """ + Apply any function inside the dataframe actor. + Please make sure the function retures a dataframe type same as the type of the self.data + Args: + func: any function, with first argument must be dataframe itself. + nums_return: the return nums, defualt to 1. + kwargs: contains the dataframe executed by func, the dataframe should be real df like pandas. + Returns: + A Partition with data applyed by this function. + """ + pass + + @abstractmethod + def to_pandas(self) -> 'PartDataFrameBase': + """ + Convert myself to pandas type. + Returns: + A DataFrameBase within pandas type. + """ + pass diff --git a/secretflow/data/partition/io.py b/secretflow/data/core/io.py similarity index 83% rename from secretflow/data/partition/io.py rename to secretflow/data/core/io.py index 0cc6f77bc..c5b38f8ca 100644 --- a/secretflow/data/partition/io.py +++ b/secretflow/data/core/io.py @@ -11,19 +11,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from typing import Union + import pandas as pd def read_csv_wrapper( - filepath: str, auto_gen_header_prefix: str = "", backend="pandas", **kwargs + filepath: str, auto_gen_header_prefix: str = "", read_backend="pandas", **kwargs ) -> Union[pd.DataFrame, "pl.DataFrame"]: """A wrapper of pandas read_csv and supports oss file. Args: filepath: the file path. auto_gen_header_prefix: If set, the format of generated headers would be {gen_header_prefix}_{col_idx}. - backend: reading backend. + read_backend: reading backend. kwargs: all other arguments are same with :py:meth:`pandas.DataFrame.read_csv`. Returns: @@ -34,7 +36,7 @@ def _read_csv(_filepath, _backend, *_args, **_kwargs): if _backend == "pandas": return pd.read_csv(open(_filepath), *_args, **_kwargs) elif _backend == "polars": - from secretflow.data.partition.polars.util import read_polars_csv + from secretflow.data.core.polars.util import read_polars_csv return read_polars_csv(_filepath, *_args, **_kwargs) else: @@ -42,11 +44,11 @@ def _read_csv(_filepath, _backend, *_args, **_kwargs): if auto_gen_header_prefix: kwargs['header'] = None - df = _read_csv(filepath, backend, **kwargs) + df = _read_csv(filepath, read_backend, **kwargs) df.columns = [ "{}_{}".format(auto_gen_header_prefix, i) for i in range(df.shape[1]) ] return df else: - df = _read_csv(filepath, backend, **kwargs) + df = _read_csv(filepath, read_backend, **kwargs) return df diff --git a/secretflow/data/partition/pandas/__init__.py b/secretflow/data/core/pandas/__init__.py similarity index 93% rename from secretflow/data/partition/pandas/__init__.py rename to secretflow/data/core/pandas/__init__.py index 2fbe146b7..7f5fb0cf9 100644 --- a/secretflow/data/partition/pandas/__init__.py +++ b/secretflow/data/core/pandas/__init__.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .partition import PdPartition +from .dataframe import PdPartDataFrame diff --git a/secretflow/data/core/pandas/dataframe.py b/secretflow/data/core/pandas/dataframe.py new file mode 100644 index 000000000..4f2fd3765 --- /dev/null +++ b/secretflow/data/core/pandas/dataframe.py @@ -0,0 +1,236 @@ +# Copyright 2022 Ant Group Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path +from typing import Callable, List, Union + +import pandas as pd +from jax import tree_map +from pandas import Index +from pandas._typing import IgnoreRaise + +from ..base import PartDataFrameBase +from ...io.util import is_local_file + + +class PdPartDataFrame(PartDataFrameBase): + data: pd.DataFrame + + def __init__(self, data: pd.DataFrame): + super().__init__() + assert isinstance( + data, (pd.DataFrame, pd.Series) + ), f"need pd.DataFrame/pd.Series, got {type(data)}" + self.data = data.copy(deep=True) + + def get_data(self): + return self.data + + def __unwrap(self, args, kwargs): + new_args = tree_map(lambda x: x.data if (type(x) == type(self)) else x, args) + new_kwargs = tree_map( + lambda x: x.data if (type(x) == type(self)) else x, kwargs + ) + return new_args, new_kwargs + + def __getitem__(self, item) -> "PdPartDataFrame": + item_list = item + + if not isinstance(item, (list, tuple, Index)): + item_list = [item_list] + return PdPartDataFrame(self.data.__getitem__(item_list)) + + def __setitem__(self, key, value): + if isinstance(value, PdPartDataFrame): + value = value.get_data() + + self.data.__setitem__(key, value) + + def __len__(self): + return self.data.__len__() + + def columns(self) -> list: + return self.data.columns.tolist() + + def dtypes(self) -> dict: + return self.data.dtypes.to_dict() + + def shape(self) -> tuple: + return self.data.shape + + def index(self) -> list: + return self.data.index + + def count(self, *args, **kwargs) -> pd.Series: + return self.data.count(*args, **kwargs) + + def sum(self, *args, **kwargs) -> pd.Series: + return self.data.sum(*args, **kwargs) + + def min(self, *args, **kwargs) -> pd.Series: + return self.data.min(*args, **kwargs) + + def max(self, *args, **kwargs) -> pd.Series: + return self.data.max(*args, **kwargs) + + def mean(self, *args, **kwargs) -> pd.Series: + return self.data.mean(*args, **kwargs) + + def var(self, *args, **kwargs) -> pd.Series: + return self.data.var(*args, **kwargs) + + def std(self, *args, **kwargs) -> pd.Series: + return self.data.std(*args, **kwargs) + + def sem(self, *args, **kwargs) -> pd.Series: + return self.data.sem(*args, **kwargs) + + def skew(self, *args, **kwargs) -> pd.Series: + return self.data.skew(*args, **kwargs) + + def kurtosis(self, *args, **kwargs) -> pd.Series: + return self.data.kurtosis(*args, **kwargs) + + def quantile(self, q=0.5, axis=0, **kwargs) -> pd.Series: + return self.data.quantile(q=q, axis=axis, **kwargs) + + def mode(self, *args, **kwargs) -> pd.Series: + return self.data.mode(*args, **kwargs).iloc[0, :] + + def value_counts(self, *args, **kwargs) -> pd.Series: + return self.data.value_counts(*args, **kwargs) + + def values(self): + return self.data.values + + def isna(self) -> "PdPartDataFrame": + return PdPartDataFrame(self.data.isna()) + + def replace(self, *args, **kwargs) -> "PdPartDataFrame": + return PdPartDataFrame(self.data.replace(*args, **kwargs)) + + def astype( + self, dtype, copy: bool = True, errors: str = "raise" + ) -> "PdPartDataFrame": + return PdPartDataFrame(self.data.astype(dtype, copy, errors)) + + def copy(self) -> "PdPartDataFrame": + return PdPartDataFrame(self.data.copy(False)) + + def drop( + self, + labels=None, + axis=0, + index=None, + columns=None, + level=None, + inplace=False, + errors: IgnoreRaise = 'raise', + ) -> "PdPartDataFrame": + data = self.data.drop( + labels, + axis=axis, + index=index, + columns=columns, + level=level, + inplace=inplace, + errors=errors, + ) + if not inplace: + return PdPartDataFrame(data) + + def fillna( + self, + value=None, + method=None, + axis=None, + inplace=False, + limit=None, + downcast=None, + ) -> Union['PdPartDataFrame', None]: + data = self.data.fillna( + value, + method=method, + axis=axis, + inplace=inplace, + limit=limit, + downcast=downcast, + ) + if not inplace: + return PdPartDataFrame(data) + + def to_csv(self, filepath, **kwargs): + if is_local_file(filepath): + Path(filepath).parent.mkdir(parents=True, exist_ok=True) + self.data.to_csv(filepath, **kwargs) + + def iloc(self, index: Union[int, slice, List[int]]) -> Union['PdPartDataFrame']: + return PdPartDataFrame(self.data.iloc[index]) + + def rename( + self, + mapper=None, + index=None, + columns=None, + axis=None, + copy=True, + inplace=False, + level=None, + errors: IgnoreRaise = 'ignore', + ) -> Union['PdPartDataFrame', None]: + data = self.data.rename( + mapper, + index=index, + columns=columns, + axis=axis, + copy=copy, + inplace=inplace, + level=level, + errors=errors, + ) + if not inplace: + return PdPartDataFrame(data) + + def pow(self, *args, **kwargs) -> 'PdPartDataFrame': + new_args, new_kwargs = self.__unwrap(args, kwargs) + + return PdPartDataFrame(self.data.__pow__(*new_args, **new_kwargs)) + + def round(self, *args, **kwargs) -> 'PdPartDataFrame': + new_args, new_kwargs = self.__unwrap(args, kwargs) + + return PdPartDataFrame(self.data.round(*new_args, **new_kwargs)) + + def select_dtypes(self, *args, **kwargs) -> 'PdPartDataFrame': + return PdPartDataFrame(self.data.select_dtypes(*args, **kwargs)) + + def subtract(self, *args, **kwargs) -> 'PdPartDataFrame': + new_args, new_kwargs = self.__unwrap(args, kwargs) + + return PdPartDataFrame(self.data.__sub__(*new_args, **new_kwargs)) + + def apply_func( + self, func: Callable, *, nums_return: int = 1, **kwargs + ) -> Union['PdPartDataFrame', 'List[PdPartDataFrame]']: + dfs = func(self.data, **kwargs) + if nums_return != 1: + assert isinstance(dfs, tuple) and len(dfs) == nums_return + return [PdPartDataFrame(df) for df in dfs] + else: + return PdPartDataFrame(dfs) + + def to_pandas(self) -> 'PdPartDataFrame': + raise RuntimeError( + "Should not got here, since it will loss performance, try to_pandas outside." + ) diff --git a/secretflow/data/core/partition.py b/secretflow/data/core/partition.py new file mode 100644 index 000000000..2eeada49a --- /dev/null +++ b/secretflow/data/core/partition.py @@ -0,0 +1,355 @@ +# Copyright 2022 Ant Group Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from typing import Callable, List, Union + +from secretflow.device import PYU, Device, PYUObject, reveal +from secretflow.distributed.primitive import get_current_cluster_idx + +from ..base import DataFrameBase +from .agent import PartitionAgent +from .base import AgentIndex, PartitionAgentBase + + +def partition( + data: Union[Callable, PYUObject], + device: Device = None, + backend="pandas", + **kwargs, +) -> "Partition": + """ + Construct a Partition with input data. + Notes: This function should not be called multiple times. + Args: + data: The source to construct a partiton. + It can be a Callable func type which takes kwargs as parameters and retures a real dataframe. + or, it can be a PYUObject within a real dataframe type. + We suggest to use function source to improve performance. + device: Which device use this partition. + backend: The partition backend, default to pandas. + kwargs: if the source is a function, use kwargs as its parameters. + Returns: + A Partition. + """ + if callable(data): + assert device is not None, f"cannot infer device from a callable source." + else: + assert ( + device is None or device == data.device + ), f"source's device does not match input device." + device = data.device + logging.warning("To create a Partitoin, we suggest to use function source.") + agent: PartitionAgentBase = PartitionAgent(device=device) + index: AgentIndex = agent.append_data(data, backend, **kwargs) + return Partition(part_agent=agent, agent_idx=index, device=device, backend=backend) + + +class StatPartition: + """ + A wrapper of pd.Series. + Generally, it will be used to wrap a statistic results, which is pd.Series type. + """ + + data: PYUObject + + def __init__(self, data: PYUObject): + self.data = data + + @property + def index(self) -> PYUObject: + # inside it returns list + return reveal(self.data.device(lambda series: series.index)(self.data)) + + @property + def values(self) -> PYUObject: + # inside it returns np.ndarray + return self.data.device(lambda series: series.values)(self.data) + + +class Partition(DataFrameBase): + """Partition of a party""" + + part_agent: PartitionAgentBase + agent_idx: Union[AgentIndex, PYUObject] + device: PYU + backend: str + active_cluster_idx: int + + def __init__( + self, + part_agent: PartitionAgentBase, + agent_idx: Union[AgentIndex, PYUObject], + device: PYU, + backend="pandas", + ): + """ + A party's partition, which use a part_agent wrapper to access real dataframe. + Args: + part_agent: A PYU actor which resident on one party. Generally, to ensure processing performance, + only create a part_agent when initializing data (such as read_csv). + agent_idx: A index to access real data inside the part_agent. + device: The PYU device. + backend: The read backend, default to pandas. + """ + self.part_agent: PartitionAgentBase = part_agent + self.agent_idx = agent_idx + self.device = device + self.backend: str = backend + self.active_cluster_idx: int = get_current_cluster_idx() + + def __del__(self): + """ + When a partition id deleted, we should make shure that the data corresponding to partition's agent_idx + in the part_agent actor is also deleted. So we need to explicit call the del_object ont eh part_agent. + However, in following cases, when the varible 'temp' is deleted, the first cluster is already shutdown, + calling the 'del_object()' on the actor in a died cluster will cause unrecoverable errors. + Therefore, the partition record the cluter index to which it belongs. + When the current active cluster is not which the partition belongs, the __del__ do nothing. + - sf.init() + - temp = Partition(xx) + - sf.shutdown() + - sf.init() + - temp = Partition(xx) # temp will be deleted first. + - sf.shutdown() + """ + if get_current_cluster_idx() == self.active_cluster_idx: + self.part_agent.del_object(self.agent_idx) + + def __getitem__(self, item) -> "Partition": + data_idx = self.part_agent.__getitem__(self.agent_idx, item) + return Partition(self.part_agent, data_idx, self.device, self.backend) + + def __setitem__(self, key, value: Union['Partition', PYUObject]): + if isinstance(value, (PYUObject, Partition)): + assert ( + self.device == value.device + ), f'Can not assign a partition with different device.' + if isinstance(value, Partition): + if self.part_agent == value.part_agent: + # same part_agent directly set with index. + self.part_agent.__setitem__(self.agent_idx, key, value.agent_idx) + else: + # different part_agent need to get data and then set into it. + self.part_agent.__setitem__( + self.agent_idx, key, value.part_agent.get_data(value.agent_idx) + ) + else: + self.part_agent.__setitem__(self.agent_idx, key, value) + + def __len__(self): + return reveal(self.part_agent.__len__(self.agent_idx)) + + @property + def columns(self) -> list: + return reveal(self.part_agent.columns(self.agent_idx)) + + @property + def dtypes(self) -> dict: + return reveal(self.part_agent.dtypes(self.agent_idx)) + + @property + def shape(self) -> tuple: + return reveal(self.part_agent.shape(self.agent_idx)) + + @property + def index(self) -> list: + return reveal(self.part_agent.index(self.agent_idx)) + + def count(self, *args, **kwargs) -> StatPartition: + return StatPartition(self.part_agent.count(self.agent_idx, *args, **kwargs)) + + def sum(self, *args, **kwargs) -> StatPartition: + return StatPartition(self.part_agent.sum(self.agent_idx, *args, **kwargs)) + + def min(self, *args, **kwargs) -> StatPartition: + return StatPartition(self.part_agent.min(self.agent_idx, *args, **kwargs)) + + def max(self, *args, **kwargs) -> StatPartition: + return StatPartition(self.part_agent.max(self.agent_idx, *args, **kwargs)) + + def mean(self, *args, **kwargs) -> StatPartition: + return StatPartition(self.part_agent.mean(self.agent_idx, *args, **kwargs)) + + def var(self, *args, **kwargs) -> StatPartition: + return StatPartition(self.part_agent.var(self.agent_idx, *args, **kwargs)) + + def std(self, *args, **kwargs) -> StatPartition: + return StatPartition(self.part_agent.std(self.agent_idx, *args, **kwargs)) + + def sem(self, *args, **kwargs) -> StatPartition: + return StatPartition(self.part_agent.sem(self.agent_idx, *args, **kwargs)) + + def skew(self, *args, **kwargs) -> StatPartition: + return StatPartition(self.part_agent.skew(self.agent_idx, *args, **kwargs)) + + def kurtosis(self, *args, **kwargs) -> StatPartition: + return StatPartition(self.part_agent.kurtosis(self.agent_idx, *args, **kwargs)) + + def quantile(self, *args, **kwargs) -> StatPartition: + return StatPartition(self.part_agent.quantile(self.agent_idx, *args, **kwargs)) + + def mode(self, *args, **kwargs) -> StatPartition: + return StatPartition(self.part_agent.mode(self.agent_idx, *args, **kwargs)) + + def value_counts(self, *args, **kwargs) -> StatPartition: + return StatPartition( + self.part_agent.value_counts(self.agent_idx, *args, **kwargs) + ) + + @property + def values(self) -> PYUObject: + # Will return a PYUObject within np.ndarray type. + return self.part_agent.values( + self.agent_idx, + ) + + def isna(self) -> "Partition": + data_idx = self.part_agent.isna(self.agent_idx) + return Partition(self.part_agent, data_idx, self.device, self.backend) + + def replace(self, *args, **kwargs) -> "Partition": + data_idx = self.part_agent.replace(self.agent_idx, *args, **kwargs) + return Partition(self.part_agent, data_idx, self.device, self.backend) + + def astype(self, dtype, copy: bool = True, errors: str = "raise") -> "Partition": + data_idx = self.part_agent.astype(self.agent_idx, dtype, copy, errors) + if copy: + return Partition(self.part_agent, data_idx, self.device, self.backend) + + def copy(self) -> "Partition": + data_idx = self.part_agent.copy(self.agent_idx) + return Partition(self.part_agent, data_idx, self.device, self.backend) + + def drop( + self, + labels=None, + axis=0, + index=None, + columns=None, + level=None, + inplace=False, + errors='raise', + ) -> "Partition": + data_idx = self.part_agent.drop( + self.agent_idx, labels, axis, index, columns, level, inplace, errors + ) + if not inplace: + return Partition(self.part_agent, data_idx, self.device, self.backend) + + def fillna( + self, + value=None, + method=None, + axis=None, + inplace=False, + limit=None, + downcast=None, + ) -> Union['Partition', None]: + data_idx = self.part_agent.fillna( + self.agent_idx, value, method, axis, inplace, limit, downcast + ) + if not inplace: + return Partition(self.part_agent, data_idx, self.device, self.backend) + + def to_csv(self, filepath, **kwargs): + self.part_agent.to_csv(self.agent_idx, filepath, **kwargs) + + def iloc(self, index: Union[int, slice, List[int]]) -> 'Partition': + data_idx = self.part_agent.iloc(self.agent_idx, index) + return Partition(self.part_agent, data_idx, self.device, self.backend) + + def rename( + self, + mapper=None, + index=None, + columns=None, + axis=None, + copy=True, + inplace=False, + level=None, + errors='ignore', + ) -> Union['Partition', None]: + data_idx = self.part_agent.rename( + self.agent_idx, mapper, index, columns, axis, copy, inplace, level, errors + ) + if not inplace: + return Partition(self.part_agent, data_idx, self.device, self.backend) + + def pow(self, *args, **kwargs) -> 'Partition': + data_idx = self.part_agent.pow(self.agent_idx, *args, **kwargs) + return Partition(self.part_agent, data_idx, self.device, self.backend) + + def round(self, *args, **kwargs) -> 'Partition': + data_idx = self.part_agent.round(self.agent_idx, *args, **kwargs) + return Partition(self.part_agent, data_idx, self.device, self.backend) + + def select_dtypes(self, *args, **kwargs) -> 'Partition': + data_idx = self.part_agent.select_dtypes(self.agent_idx, *args, **kwargs) + return Partition(self.part_agent, data_idx, self.device, self.backend) + + def subtract(self, other) -> 'Partition': + sub_other = other + if isinstance(other, (StatPartition)): + sub_other = other.data + elif isinstance(other, Partition): + assert self.device == other.device + sub_other = other.agent_idx + data_idx = self.part_agent.subtract(self.agent_idx, sub_other) + return Partition(self.part_agent, data_idx, self.device, self.backend) + + def apply_func( + self, func: Callable, *, nums_return: int = 1, **kwargs + ) -> Union['Partition', 'tuple[Partition]']: + data_idx = self.part_agent.apply_func( + self.agent_idx, func, nums_return=nums_return, **kwargs + ) + if nums_return == 1: + data_idx = reveal(data_idx) + return Partition( + self.part_agent, + data_idx, + self.device, + reveal(self.part_agent.get_backend(data_idx)), + ) + else: + data_idx_list = reveal(data_idx) + assert ( + isinstance(data_idx_list, list) and len(data_idx_list) == nums_return + ), ( + f"nums_return not match, got type = {type(data_idx_list)}, len = {len(data_idx_list)} " + f"while nums_return = {nums_return}" + ) + return tuple( + [ + Partition( + self.part_agent, + idx, + self.device, + reveal(self.part_agent.get_backend(idx)), + ) + for idx in data_idx_list + ] + ) + + def to_pandas(self) -> 'Partition': + if self.backend == "pandas": + return self + else: + data_idx = self.part_agent.to_pandas(self.agent_idx) + return Partition(self.part_agent, data_idx, self.device, self.backend) + + @property + def data(self): + return self.part_agent.get_data(self.agent_idx) diff --git a/secretflow/data/partition/polars/__init__.py b/secretflow/data/core/polars/__init__.py similarity index 87% rename from secretflow/data/partition/polars/__init__.py rename to secretflow/data/core/polars/__init__.py index 234c9fecd..500bdaa1e 100644 --- a/secretflow/data/partition/polars/__init__.py +++ b/secretflow/data/core/polars/__init__.py @@ -11,12 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + import sys -PolarsPartition = None +PlPartDataFrame = None try: - from .partition import PolarsPartition + from .dataframe import PlPartDataFrame from .util import infer_pl_dtype, read_polars_csv except ImportError as exc: exc_tb = sys.exc_info()[2] @@ -26,4 +27,4 @@ ) raise type(exc)(msg).with_traceback(exc_tb) from None -__all__ = ["PolarsPartition", "infer_pl_dtype", "read_polars_csv"] +__all__ = ["PlPartDataFrame", "infer_pl_dtype", "read_polars_csv"] diff --git a/secretflow/data/core/polars/dataframe.py b/secretflow/data/core/polars/dataframe.py new file mode 100644 index 000000000..3215457bc --- /dev/null +++ b/secretflow/data/core/polars/dataframe.py @@ -0,0 +1,452 @@ +# Copyright 2022 Ant Group Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from pathlib import Path +from typing import Callable, List, Optional, Union + +import pandas as pd +import polars as pl +import polars.selectors as cs +from pandas.core.dtypes.inference import is_list_like + +from ...io.util import is_local_file +from ..base import PartDataFrameBase +from ..pandas import PdPartDataFrame +from .util import infer_pd_dtype, infer_pl_dtype + + +class PlPartDataFrame(PartDataFrameBase): + df: pl.DataFrame + use_item: Optional[List[str]] + + def __init__(self, df: pl.DataFrame, use_item=None): + super().__init__() + self.df = df.clone() + self.use_item = use_item + + def set_data(self, data: pl.DataFrame): + assert isinstance(data, pl.DataFrame) + self.df = data + + def get_data(self): + self._collect() + return self.df + + def _collect(self): + """compute result if it is lazy mode""" + if self.use_item is not None: + self.df = self.df.select(pl.col(self.use_item)) + self.use_item = None + + def __getitem__(self, item) -> "PlPartDataFrame": + item_list = item + if not isinstance(item, (list, tuple)): + item_list = [item_list] + lazy_df = PlPartDataFrame(self.df) + if self.use_item is not None: + assert all([it in self.use_item for it in item_list]) + lazy_df.use_item = item_list + + return lazy_df + + def __setitem__(self, key, value): + self._collect() + if isinstance(value, PlPartDataFrame): + if value.use_item is not None: + if isinstance(key, str): + key = [key] + assert len(key) == len( + value.use_item + ), f"expect key len({len(key)}) == value.expr len({len(value.use_item)})" + if value.df is self.df: + self.df = self.df.with_columns( + [ + pl.col(value.use_item[i]).alias(key[i]) + for i in range(len(value.use_item)) + ] + ) + return + value = value.get_data() + if isinstance(key, str): + if isinstance(value, pl.DataFrame): + self.__setitem__([key], value) + elif isinstance(value, pl.Series): + self.df = self.df.with_columns(value.alias(key)) + elif is_list_like(value): + self.df = self.df.with_columns( + pl.Series(name=key, value=value).alias(key) + ) + else: + # is a str or int to fullfill all values in this columns + self.df = self.df.with_columns( + pl.Series( + name=key, values=[value for i in range(len(self.df))] + ).alias(key) + ) + elif isinstance(key, list): + if isinstance(value, pl.DataFrame): + assert ( + isinstance(key, list) and len(key) == value.shape[1] + ), f"input value len({value.shape[1]}) does not equal with len of key({len(key)})" + self.df = self.df.with_columns( + [value_series.alias(key[i]) for i, value_series in enumerate(value)] + ) + elif isinstance(value, pl.Series): + assert ( + len(key) == 1 + ), f"to set a pl.Series, key must be str or ['xxx'] with len = 1" + self.df = self.df.with_columns(value.alias(key[0])) + elif not is_list_like(value): + # is a str or int to fullfill all values in this columns + self.df = self.df.with_columns( + [ + pl.Series( + name=k, values=[value for i in range(len(self.df))] + ).alias(k) + for k in key + ] + ) + else: + self.df.__setitem__(key, value) + else: + self.df.__setitem__(key, value) + + def __len__(self): + self._collect() + return self.df.__len__() + + def columns(self) -> list: + self._collect() + return self.df.columns + + def dtypes(self) -> dict: + self._collect() + schema = self.df.schema + return {k: infer_pd_dtype(v) for k, v in schema.items()} + + def shape(self) -> tuple: + self._collect() + return self.df.shape + + def index(self) -> list: + raise NotImplementedError("polars does not support index.") + + @staticmethod + def __pd_series_convert(df: pl.DataFrame) -> pd.Series: + """ + For compatibility with Pandas when execute statistic function. + Take 'max()' as an example, in pandas, it returns a pd.Series. + However, in polars, 'max()' returns a pl.DataFrame, which has 1 row for each colum's result. + We need to convert pl.DataFrame to pd.DataFrame, and then convert a row to pd.Series. + In addition, pd.Series converted from polars has a default name, while the result in Pandas do not. + So we set it to None insted. + Args: + df: the statistic result polars DataFrame with 1 row. + + Returns: + a pd.Series type result. + """ + series = df.to_pandas().iloc[0, :] + series.name = None + return series + + def count(self, *args, **kwargs) -> pd.Series: + raise NotImplementedError() + + def sum(self, *args, **kwargs) -> pd.Series: + self._collect() + axis = kwargs.get('axis', 0) + null_strategy = kwargs.get('null_strategy', 'ignore') + numeric_only = kwargs.get('numeric_only', False) + if numeric_only: + return self.__pd_series_convert( + self.df.select(pl.col(pl.NUMERIC_DTYPES)).sum( + axis=axis, null_strategy=null_strategy + ) + ) + else: + return self.__pd_series_convert( + self.df.sum(axis=axis, null_strategy=null_strategy) + ) + + def min(self, *args, **kwargs) -> pd.Series: + self._collect() + axis = kwargs.get('axis', 0) + numeric_only = kwargs.get('numeric_only', False) + if numeric_only: + return self.__pd_series_convert( + self.df.select(pl.col(pl.NUMERIC_DTYPES)).min(axis=axis) + ) + else: + return self.__pd_series_convert(self.df.min(axis=axis)) + + def max(self, *args, **kwargs) -> pd.Series: + self._collect() + axis = kwargs.get('axis', 0) + numeric_only = kwargs.get('numeric_only', False) + if numeric_only: + return self.__pd_series_convert( + self.df.select(pl.col(pl.NUMERIC_DTYPES)).max(axis=axis) + ) + else: + return self.__pd_series_convert(self.df.max(axis=axis)) + + def mean(self, *args, **kwargs) -> pd.Series: + self._collect() + axis = kwargs.get('axis', 0) + null_strategy = kwargs.get('null_strategy', 'ignore') + numeric_only = kwargs.get('numeric_only', False) + if numeric_only: + return self.__pd_series_convert( + self.df.select(pl.col(pl.NUMERIC_DTYPES)).mean( + axis=axis, null_strategy=null_strategy + ) + ) + else: + return self.__pd_series_convert( + self.df.mean(axis=axis, null_strategy=null_strategy) + ) + + def var(self, *args, **kwargs) -> pd.Series: + self._collect() + ddof = kwargs.get('ddof', 1) + numeric_only = kwargs.get('numeric_only', False) + if numeric_only: + return self.__pd_series_convert( + self.df.select(pl.col(pl.NUMERIC_DTYPES)).var(ddof=ddof) + ) + else: + return self.__pd_series_convert(self.df.var(ddof=ddof)) + + def std(self, *args, **kwargs) -> pd.Series: + self._collect() + ddof = kwargs.get('ddof', 1) + + numeric_only = kwargs.get('numeric_only', False) + if numeric_only: + return self.__pd_series_convert( + self.df.select(pl.col(pl.NUMERIC_DTYPES)).std(ddof=ddof) + ) + else: + return self.__pd_series_convert(self.df.std(ddof=ddof)) + + def sem(self, *args, **kwargs) -> pd.Series: + raise NotImplementedError() + + def skew(self, *args, **kwargs) -> pd.Series: + raise NotImplementedError() + + def kurtosis(self, *args, **kwargs) -> pd.Series: + raise NotImplementedError() + + def quantile(self, *args, **kwargs) -> pd.Series: + self._collect() + assert len(args) == 0, f"please use keyword arguments to input q and axis" + q = kwargs.get('q', 0.5) + interpolation = kwargs.get('interpolation', 'linear') + numeric_only = kwargs.get('numeric_only', False) + if numeric_only: + ret = self.__pd_series_convert( + self.df.select(pl.col(pl.NUMERIC_DTYPES)).quantile( + quantile=q, interpolation=interpolation + ) + ) + else: + ret = self.__pd_series_convert( + self.df.quantile(quantile=q, interpolation=interpolation) + ) + ret.name = q + return ret + + def mode(self, *args, **kwargs) -> pd.Series: + raise NotImplementedError() + + def value_counts(self, *args, **kwargs) -> pd.Series: + raise NotImplementedError() + + def values(self): + raise NotImplementedError() + + def isna(self) -> "PlPartDataFrame": + raise NotImplementedError() + + def replace(self, *args, **kwargs) -> "PlPartDataFrame": + raise NotImplementedError("'replace' in polars is different with it in pandas.") + + def astype( + self, dtype, copy: bool = True, errors: str = "raise" + ) -> "PlPartDataFrame": + self._collect() + exprs = [] + if isinstance(dtype, dict): + for col in dtype: + exprs.append(pl.col(col).cast(infer_pl_dtype(dtype[col]))) + else: + exprs.append(pl.col("*").cast(infer_pl_dtype(dtype))) + new_data = self.df.with_columns(*exprs) + if copy: + return PlPartDataFrame(new_data) + else: + # In this actor coly = false since not work. + return self + + def copy(self) -> "PlPartDataFrame": + self._collect() + cp = self.df.__copy__() + return PlPartDataFrame(cp) + + def drop( + self, + labels=None, + axis=0, + index=None, + columns=None, + level=None, + inplace=False, + errors='raise', + ) -> "PlPartDataFrame": + self._collect() + new_data = self.df.drop(columns=columns) + if not inplace: + return PlPartDataFrame(new_data) + else: + self.df = new_data + + def fillna( + self, + value=None, + method=None, + axis=None, + inplace=False, + limit=None, + downcast=None, + ) -> Union['PlPartDataFrame', None]: + self._collect() + logging.warning( + "polars' fillna (actually pl.DataFrame.fill_null) does not act same as pandas, col whose tpye diffrent " + "with value will not be filled." + ) + new_data = self.df.fill_null(value=value) + if not inplace: + return PlPartDataFrame(new_data) + else: + self.df = new_data + + def to_csv(self, filepath, **kwargs): + self._collect() + if is_local_file(filepath): + Path(filepath).parent.mkdir(parents=True, exist_ok=True) + self.df.write_csv(filepath) + + def iloc(self, index: Union[int, slice, List[int]]) -> 'PlPartDataFrame': + raise NotImplementedError() + + def rename( + self, + mapper=None, + index=None, + columns=None, + axis=None, + copy=True, + inplace=False, + level=None, + errors='ignore', + ) -> Union['PlPartDataFrame', None]: + self._collect() + if index is not None: + logging.warning( + "Polars dataframe only support rename column names, index parameters will be ignored." + ) + if axis is not None and axis != 1: + logging.warning( + f"Polars dataframe only support axis = 0 and rename columns names. Got axis = {axis}" + ) + if columns is None and mapper is not None: + columns = mapper + new_data = self.df.rename(columns) + if not inplace: + return PlPartDataFrame(new_data) + + def pow(self, *args, **kwargs) -> 'PlPartDataFrame': + raise NotImplementedError() + + def round(self, decimals) -> 'PlPartDataFrame': + """ + + Args: + decimals: An int or dict decimals. + + Returns: + round value. + + Note: + Polars round's working principle is differnt from Pandas. + It works more like round (2.4-> 2, 2.5 -> 3, 2.6 -> 3). + But in Pandas, it will be rounded to Even Numbers (2.4 -> 2, 2.5 -> 2, 2.6 -> 3). + """ + self._collect() + df = None + if isinstance(decimals, dict): + for col, decimal in decimals.items(): + df = self.df.with_columns(pl.col(col).round(decimals=decimal)) + else: + df = self.df.with_columns( + pl.col(pl.FLOAT_DTYPES).round(decimals=decimals), + ) + return PlPartDataFrame(df) + + def select_dtypes(self, include=None, exclude=None) -> 'PlPartDataFrame': + self._collect() + assert exclude is None, f"does not support exclude dtypes in polars yet." + assert include is not None, f"inxlude must indicate" + if include == 'number': + include = pl.NUMERIC_DTYPES + else: + if not isinstance(include, (list, tuple)): + include = [include] + include = [infer_pl_dtype(icld) for icld in include] + return PlPartDataFrame(self.df.select(cs.by_dtype(include))) + + def subtract(self, *args, **kwargs) -> 'PlPartDataFrame': + raise NotImplementedError() + + def apply_func( + self, func: Callable, *, nums_return: int = 1, **kwargs + ) -> Union['PlPartDataFrame', 'List[PlPartDataFrame]']: + self._collect() + dfs = func(self.df, **kwargs) + if nums_return != 1: + assert isinstance(dfs, tuple) and len(dfs) == nums_return + for df in dfs: + assert isinstance(df, (pl.DataFrame, pd.DataFrame)) + return [ + PlPartDataFrame(df) + if isinstance(df, pl.DataFrame) + else PdPartDataFrame(df) + for df in dfs + ] + else: + assert isinstance( + dfs, (pl.DataFrame, pd.DataFrame) + ), f"need DataFrame, got {type(dfs)}" + return ( + PlPartDataFrame(dfs) + if isinstance(dfs, pl.DataFrame) + else PdPartDataFrame(dfs) + ) + + def to_pandas(self) -> 'PdPartDataFrame': + self._collect() + return PdPartDataFrame(self.df.to_pandas()) diff --git a/secretflow/data/partition/polars/util.py b/secretflow/data/core/polars/util.py similarity index 72% rename from secretflow/data/partition/polars/util.py rename to secretflow/data/core/polars/util.py index 35004c283..c798d982c 100644 --- a/secretflow/data/partition/polars/util.py +++ b/secretflow/data/core/polars/util.py @@ -11,19 +11,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import polars as pl + import logging + import numpy as np +import polars as pl +from polars.datatypes import DataTypeClass def read_polars_csv(filepath, *args, **kwargs): - if 'delimiter' in kwargs: + if 'delimiter' in kwargs and kwargs['delimiter'] is not None: kwargs['separator'] = kwargs['delimiter'] - if 'usecols' in kwargs: + if 'usecols' in kwargs and kwargs['usecols'] is not None: # polars only recognized list columns but not dictkeys. kwargs['columns'] = list(kwargs['usecols']) - if 'dtype' in kwargs: - logging.warning("dtypes with np.xxx in polars backend will go wrong.") + if 'dtype' in kwargs and kwargs['dtype'] is not None: pl_dtypes = {} for col, dt in kwargs['dtype'].items(): pl_dtypes[col] = infer_pl_dtype(dt) @@ -61,9 +63,29 @@ def infer_pl_dtype(tp): np.int32: pl.Int32, np.int64: pl.Int64, np.bool8: pl.Boolean, - np.str_: str, + np.string_: str, + np.str0: str, np.str: str, } if tp in np_relation: return np_relation[tp] return tp + + +def infer_pd_dtype(tp: DataTypeClass): + assert isinstance(tp, DataTypeClass) + np_relation = { + pl.Float32: np.float32, + pl.Float64: np.float64, + pl.Int8: np.int8, + pl.Int16: np.int16, + pl.Int32: np.int32, + pl.Int64: np.int64, + pl.Boolean: np.bool8, + pl.Utf8: np.string_, + pl.Object: np.object_, + } + if tp in np_relation: + return np_relation[tp] + logging.warning(f"Cannot infer padas dtype with input tp: {tp}") + return tp diff --git a/secretflow/data/horizontal/dataframe.py b/secretflow/data/horizontal/dataframe.py index 6cf2a09f7..a6917fdf5 100644 --- a/secretflow/data/horizontal/dataframe.py +++ b/secretflow/data/horizontal/dataframe.py @@ -13,7 +13,7 @@ # limitations under the License. from dataclasses import dataclass, field -from typing import Dict, Union +from typing import Callable, Dict, List, Union import numpy as np import pandas as pd @@ -22,7 +22,9 @@ from secretflow.security.aggregation.aggregator import Aggregator from secretflow.security.compare.comparator import Comparator from secretflow.utils.errors import InvalidArgumentError -from ..base import DataFrameBase, PartitionBase + +from ..base import DataFrameBase +from ..core.partition import Partition from ..ndarray import FedNdarray, PartitionWay @@ -84,7 +86,7 @@ class 150 >>> h_df.fillna({'sepal_length': 2}) """ - partitions: Dict[PYU, PartitionBase] = field(default_factory=dict) + partitions: Dict[PYU, Partition] = field(default_factory=dict) aggregator: Aggregator = None comparator: Comparator = None @@ -102,8 +104,7 @@ def mean(self, *args, **kwargs) -> pd.Series: """ assert self.aggregator is not None, 'Aggregator should be provided for mean.' means = [part.mean(*args, **kwargs) for part in self.partitions.values()] - if 'numeric_only' in kwargs: - numeric_only = kwargs['numeric_only'] + numeric_only = kwargs.get('numeric_only', False) cnts = [ part.count(numeric_only=numeric_only) for part in self.partitions.values() ] @@ -443,7 +444,7 @@ def __setitem__(self, key, value): ) for pyu, part in value.partitions.items(): self.partitions[pyu][key] = part - elif isinstance(value, PartitionBase): + elif isinstance(value, Partition): assert ( value.data.device in self.partitions ), f'Partition to assgin is not in this dataframe pyu list.' @@ -452,6 +453,52 @@ def __setitem__(self, key, value): for part in self.partitions.values(): part[key] = value + def index(self) -> list: + raise NotImplementedError() + + def value_counts(self, *args, **kwargs) -> pd.Series: + raise NotImplementedError() + + def iloc(self, index: Union[int, slice, List[int]]) -> 'HDataFrame': + raise NotImplementedError() + + def rename( + self, + mapper=None, + index=None, + columns=None, + axis=None, + copy=True, + inplace=False, + level=None, + errors='ignore', + ) -> Union['HDataFrame', None]: + raise NotImplementedError() + + def pow(self, *args, **kwargs) -> 'HDataFrame': + raise NotImplementedError() + + def round(self, *args, **kwargs) -> 'HDataFrame': + raise NotImplementedError() + + def select_dtypes(self, *args, **kwargs) -> 'HDataFrame': + raise NotImplementedError() + + def subtract(self, *args, **kwargs) -> 'HDataFrame': + raise NotImplementedError() + + def apply_func( + self, func: Callable, *, nums_return: int = 1, **kwargs + ) -> 'HDataFrame': + return HDataFrame( + partitions={ + pyu: part.apply_func(func, nums_return=nums_return, **kwargs) + for pyu, part in self.partitions.items() + }, + aggregator=self.aggregator, + comparator=self.comparator, + ) + def to_pandas(self): return HDataFrame( partitions={pyu: part.to_pandas() for pyu, part in self.partitions.items()}, diff --git a/secretflow/data/horizontal/io.py b/secretflow/data/horizontal/io.py index d79edbf50..b4a6de87e 100644 --- a/secretflow/data/horizontal/io.py +++ b/secretflow/data/horizontal/io.py @@ -17,9 +17,10 @@ from secretflow.device import PYU from secretflow.security.aggregation.aggregator import Aggregator from secretflow.security.compare.comparator import Comparator + +from ..core import partition +from ..core.io import read_csv_wrapper from .dataframe import HDataFrame -from ..base import partition -from ..partition.io import read_csv_wrapper def read_csv( @@ -35,7 +36,7 @@ def read_csv( filepath: a dict {PYU: file path}. aggregator: optionla; the aggregator assigned to the dataframe. comparator: optionla; the comparator assigned to the dataframe. - backend optionla; the backend of the dataframe, default to 'pandas'. + backend: optional; the backend of the dataframe, default to 'pandas'. kwargs: all other arguments. Returns: @@ -48,7 +49,12 @@ def read_csv( df = HDataFrame(aggregator=aggregator, comparator=comparator) for device, path in filepath.items(): df.partitions[device] = partition( - device(read_csv_wrapper)(path, backend=backend, **kwargs), backend=backend + data=read_csv_wrapper, + device=device, + backend=backend, + filepath=path, + read_backend=backend, + **kwargs, ) # Check column and dtype. dtypes = None diff --git a/secretflow/data/mix/dataframe.py b/secretflow/data/mix/dataframe.py index 7e35ba9ae..33bbdcd2d 100644 --- a/secretflow/data/mix/dataframe.py +++ b/secretflow/data/mix/dataframe.py @@ -21,7 +21,8 @@ from pandas.core.indexes.base import Index from secretflow.utils.errors import InvalidArgumentError, NotFoundError, UnexpectedError -from ..base import PartitionBase + +from ..core.partition import Partition from ..horizontal import HDataFrame from ..vertical import VDataFrame @@ -91,6 +92,10 @@ class MixDataFrame: or VDataFrame, and shall not be mixed. """ + def __init__(self, partitions=None): + self.partitions = partitions + self.__post__init() + def __post__init(self): self._check_partitions(self.partitions) if not isinstance(self.partitions, tuple): @@ -152,8 +157,7 @@ def mean(self, *args, **kwargs) -> pd.Series: """ means = [part.mean(*args, **kwargs) for part in self.partitions] if self.partition_way == PartitionWay.HORIZONTAL: - if 'numeric_only' in kwargs: - numeric_only = kwargs['numeric_only'] + numeric_only = kwargs.get('numeric_only', False) cnts = [part.count(numeric_only=numeric_only) for part in self.partitions] return pd.Series( np.average(means, weights=cnts, axis=0), index=means[0].index @@ -229,11 +233,10 @@ def var(self, *args, **kwargs): @property def values(self): - # TODO - pass + raise NotImplementedError() @property - def dtypes(self) -> pd.Series: + def dtypes(self) -> dict: """ Returns the dtypes in the DataFrame. @@ -411,7 +414,7 @@ def __getitem__(self, item) -> 'MixDataFrame': ) def __setitem__(self, key, value): - if isinstance(value, (HDataFrame, VDataFrame, PartitionBase)): + if isinstance(value, (HDataFrame, VDataFrame, Partition)): raise InvalidArgumentError( 'Can not assgin a HDataFrame/VDataFrame/Partition to MixDataFrame.' ) @@ -425,8 +428,8 @@ def __setitem__(self, key, value): f'{type(value.partitions[0])} differs with {type(self.partitions[0])}.' ) if self.partition_way == PartitionWay.HORIZONTAL: - for i, part in enumerate(value.partitions): - self.partitions[i][key] = part + for i, part_df in enumerate(value.partitions): + self.partitions[i][key] = part_df else: part_key = self._col_index(key) for idx, key in part_key.items(): diff --git a/secretflow/data/ndarray/ndarray.py b/secretflow/data/ndarray/ndarray.py index f6835c7f7..f760b9864 100644 --- a/secretflow/data/ndarray/ndarray.py +++ b/secretflow/data/ndarray/ndarray.py @@ -22,6 +22,8 @@ from secretflow.device import PYU, SPU, PYUObject, reveal from secretflow.utils.errors import InvalidArgumentError + +from ..io import util as io_util from .math_utils import ( mean_of_difference_abs, mean_of_difference_ratio_abs, @@ -30,7 +32,6 @@ sum_of_difference_ratio_abs, sum_of_difference_squares, ) -from ..io import util as io_util # 下面的函数是同时支持水平和垂直的。 __ndarray = "__ndarray_type__" diff --git a/secretflow/data/partition/pandas/partition.py b/secretflow/data/partition/pandas/partition.py deleted file mode 100644 index dd050089a..000000000 --- a/secretflow/data/partition/pandas/partition.py +++ /dev/null @@ -1,444 +0,0 @@ -# Copyright 2022 Ant Group Co., Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from pathlib import Path -from typing import Union, List, Callable - -import pandas as pd -from jax.tree_util import tree_map -from pandas.core.indexes.base import Index - -from secretflow.device import PYUObject, reveal -from ...base import PartitionBase -from ...io.util import is_local_file - - -class PdPartition(PartitionBase): - """Slice of data that makes up horizontal, vertical and mixed partitioned DataFrame. - - Attributes: - data (PYUObject): Reference to pandas.DataFrame located in local node. - """ - - def __init__(self, data: PYUObject = None): - super().__init__(data, backend="pandas") - - def __partition_wrapper(self, fn: Callable, *args, **kwargs) -> 'PartitionBase': - return PdPartition(self.data.device(fn)(self.data, *args, **kwargs)) - - def mean(self, *args, **kwargs) -> 'PartitionBase': - """Returns the mean of the values over the requested axis. - - Returns: - PartitionBase: mean values series. - """ - return self.__partition_wrapper(pd.DataFrame.mean, *args, **kwargs) - - def var(self, *args, **kwargs) -> 'PartitionBase': - """Returns the variance of the values over the requested axis. - - Returns: - PartitionBase: variance values series. - """ - return self.__partition_wrapper(pd.DataFrame.var, *args, **kwargs) - - def std(self, *args, **kwargs) -> 'PartitionBase': - """Returns the standard deviation of the values over the requested axis. - - Returns: - PartitionBase: standard deviation values series. - - """ - return self.__partition_wrapper(pd.DataFrame.std, *args, **kwargs) - - def sem(self, *args, **kwargs) -> 'PartitionBase': - """Returns the standard error of the mean over the requested axis. - - Returns: - PartitionBase: standard error of the mean series. - - """ - return self.__partition_wrapper(pd.DataFrame.sem, *args, **kwargs) - - def skew(self, *args, **kwargs) -> 'PartitionBase': - """Returns the skewness over the requested axis. - - Returns: - PartitionBase: skewness series. - - """ - return self.__partition_wrapper(pd.DataFrame.skew, *args, **kwargs) - - def kurtosis(self, *args, **kwargs) -> 'PartitionBase': - """Returns the kurtosis over the requested axis. - - Returns: - PartitionBase: kurtosis series. - - """ - return self.__partition_wrapper(pd.DataFrame.kurtosis, *args, **kwargs) - - def sum(self, *args, **kwargs) -> 'PartitionBase': - """Returns the sum of the values over the requested axis. - - Returns: - PartitionBase: sum values series. - """ - return self.__partition_wrapper(pd.DataFrame.sum, *args, **kwargs) - - def replace(self, *args, **kwargs) -> 'PartitionBase': - """Replace values given in to_replace with value. - Same as pandas.DataFrame.replace - Values of the DataFrame are replaced with other values dynamically. - - Returns: - PartitionBase: same shape except value replaced - """ - return self.__partition_wrapper(pd.DataFrame.replace, *args, **kwargs) - - def quantile(self, q=0.5, axis=0) -> 'PartitionBase': - """Returns values at the given quantile over requested axis. - - Returns: - PartitionBase: quantile values series. - """ - # q is between 0 and 1 - q = max(min(q, 1), 0) - # limit q to one of 0, 0.25, 0.5, 0.75 and 1 - q = round(4 * q) / 4 - return self.__partition_wrapper(pd.DataFrame.quantile, q=q, axis=axis) - - def min(self, *args, **kwargs) -> 'PartitionBase': - """Returns the minimum of the values over the requested axis. - - Returns: - PartitionBase: minimum values series. - """ - return self.__partition_wrapper(pd.DataFrame.min, *args, **kwargs) - - def mode(self, *args, **kwargs) -> 'PartitionBase': - """Returns the mode of the values over the requested axis. - - For data protection reasons, only one mode will be returned. - - Returns: - PartitionBase: mode values series. - """ - - def _mode(*_args, **_kwargs): - return pd.DataFrame.mode(*_args, **_kwargs).iloc[0, :] - - return self.__partition_wrapper(_mode, *args, **kwargs) - - def max(self, *args, **kwargs) -> 'PartitionBase': - """Returns the maximum of the values over the requested axis. - - Returns: - PartitionBase: maximum values series. - """ - return self.__partition_wrapper(pd.DataFrame.max, *args, **kwargs) - - def count(self, *args, **kwargs) -> 'PartitionBase': - """Counts non-NA cells for each column or row. - - Returns: - PartitionBase: count values series. - """ - return self.__partition_wrapper(pd.DataFrame.count, *args, **kwargs) - - def isna(self) -> 'PartitionBase': - """Detects missing values for an array-like object. - Same as pandas.DataFrame.isna - Returns - DataFrame: Mask of bool values for each element in DataFrame - that indicates whether an element is an NA value. - """ - return self.__partition_wrapper(pd.DataFrame.isna) - - def __unwrap(self, args, kwargs): - new_args = tree_map(lambda x: x.data if (type(x) == type(self)) else x, args) - new_kwargs = tree_map( - lambda x: x.data if (type(x) == type(self)) else x, kwargs - ) - return new_args, new_kwargs - - def pow(self, *args, **kwargs) -> 'PartitionBase': - """Gets Exponential power of (partition of) dataframe and other, element-wise (binary operator pow). - Equivalent to dataframe ** other, but with support to substitute a fill_value for missing data in one of the inputs. - With reverse version, rpow. - Among flexible wrappers (add, sub, mul, div, mod, pow) to arithmetic operators: +, -, *, /, //, %, **. - - Reference: - pd.DataFrame.pow - """ - new_args, new_kwargs = self.__unwrap(args, kwargs) - - return self.__partition_wrapper(pd.DataFrame.pow, *new_args, **new_kwargs) - - def subtract(self, *args, **kwargs) -> 'PartitionBase': - """Gets Subtraction of (partition of) dataframe and other, element-wise (binary operator sub). - Equivalent to dataframe - other, but with support to substitute a fill_value for missing data in one of the inputs. - With reverse version, rsub. - Among flexible wrappers (add, sub, mul, div, mod, pow) to arithmetic operators: +, -, *, /, //, %, **. - - Reference: - pd.DataFrame.subtract - """ - new_args, new_kwargs = self.__unwrap(args, kwargs) - - return self.__partition_wrapper(pd.DataFrame.subtract, *new_args, **new_kwargs) - - def round(self, *args, **kwargs) -> 'PartitionBase': - """Round the (partition of) DataFrame to a variable number of decimal places. - - Reference: - pd.DataFrame.round - """ - new_args, new_kwargs = self.__unwrap(args, kwargs) - - return self.__partition_wrapper(pd.DataFrame.round, *new_args, **new_kwargs) - - def select_dtypes(self, *args, **kwargs) -> 'PartitionBase': - """Returns a subset of the DataFrame's columns based on the column dtypes. - - Reference: - pandas.DataFrame.select_dtypes - """ - return self.__partition_wrapper(pd.DataFrame.select_dtypes, *args, **kwargs) - - @property - def values(self): - """Returns the underlying ndarray.""" - return self.data.device(lambda df: df.values)(self.data) - - @property - @reveal - def index(self): - """Returns the index (row labels) of the DataFrame.""" - return self.data.device(lambda df: df.index)(self.data) - - @property - @reveal - def dtypes(self): - """Returns the dtypes in the DataFrame.""" - # return dict always. - return self.data.device( - lambda df: df.dtypes.to_dict() - if isinstance(df, pd.DataFrame) - else {df.name: df.types} - )(self.data) - - def astype(self, dtype, copy: bool = True, errors: str = "raise"): - """ - Cast a pandas object to a specified dtype ``dtype``. - - All args are same as :py:meth:`pandas.DataFrame.astype`. - """ - return PdPartition( - self.data.device(pd.DataFrame.astype)(self.data, dtype, copy, errors) - ) - - @property - @reveal - def columns(self): - """Returns the column labels of the DataFrame.""" - return self.data.device(lambda df: df.columns.to_list())(self.data) - - @property - @reveal - def shape(self): - """Returns a tuple representing the dimensionality of the DataFrame.""" - return self.data.device(lambda df: df.shape)(self.data) - - def iloc(self, index: Union[int, slice, List[int]]) -> 'PartitionBase': - """Integer-location based indexing for selection by position. - - Args: - index (Union[int, slice, List[int]]): rows index. - - Returns: - PartitionBase: Selected DataFrame. - """ - return PdPartition( - self.data.device(lambda df, idx: df.iloc[idx])(self.data, index) - ) - - def drop( - self, - labels=None, - axis=0, - index=None, - columns=None, - level=None, - inplace=False, - errors='raise', - ) -> Union['PartitionBase', None]: - """See `pandas.DataFrame.drop`""" - - def _drop(df: pd.DataFrame, **kwargs): - if inplace: - new_df = df.copy(deep=True) - new_df.drop(**kwargs) - return new_df - else: - return df.drop(**kwargs) - - new_data = self.data.device(_drop)( - self.data, - labels=labels, - axis=axis, - index=index, - columns=columns, - level=level, - inplace=inplace, - errors=errors, - ) - if inplace: - self.data = new_data - else: - return PdPartition(new_data) - - def fillna( - self, - value=None, - method=None, - axis=None, - inplace=False, - limit=None, - downcast=None, - ) -> Union['PartitionBase', None]: - """See :py:meth:`pandas.DataFrame.fillna`""" - - def _fillna(df: pd.DataFrame, **kwargs): - if inplace: - new_df = df.copy(deep=True) - new_df.fillna(**kwargs) - return new_df - else: - return df.fillna(**kwargs) - - new_data = self.data.device(_fillna)( - self.data, - value=value, - method=method, - axis=axis, - inplace=inplace, - limit=limit, - downcast=downcast, - ) - if inplace: - self.data = new_data - return self - else: - return PdPartition(new_data) - - def rename( - self, - mapper=None, - index=None, - columns=None, - axis=None, - copy=True, - inplace=False, - level=None, - errors='ignore', - ) -> Union['PartitionBase', None]: - """See :py:meth:`pandas.DataFrame.rename`""" - - def _rename(df: pd.DataFrame, **kwargs): - if inplace: - new_df = df.copy(deep=True) - new_df.rename(**kwargs) - return new_df - else: - return df.rename(**kwargs) - - new_data = self.data.device(_rename)( - self.data, - mapper=mapper, - index=index, - columns=columns, - axis=axis, - copy=copy, - inplace=inplace, - level=level, - errors=errors, - ) - if inplace: - self.data = new_data - else: - return PdPartition(new_data) - - def value_counts(self, *args, **kwargs) -> 'PartitionBase': - """Return a Series containing counts of unique values.""" - return self.__partition_wrapper(pd.DataFrame.value_counts, *args, **kwargs) - - def to_csv(self, filepath, **kwargs): - """Save DataFrame to csv file.""" - - def _to_csv_wrapper(df: pd.DataFrame, path, **_kwargs): - if is_local_file(path): - Path(path).parent.mkdir(parents=True, exist_ok=True) - df.to_csv(open(path, 'wb'), **_kwargs) - - return self.data.device(_to_csv_wrapper)(self.data, filepath, **kwargs) - - @reveal - def __len__(self): - """Returns the number of rows.""" - return self.data.device(lambda df: len(df))(self.data) - - def __getitem__(self, item: Union[str, List[str]]) -> 'PartitionBase': - """Get columns from DataFrame. - - NOTE: Always return DataFrame even if specify single column. - - Args: - item (Union[str, List[str]]): Columns to get. - - Returns: - PartitionBase: DataFrame. - """ - item_list = item - if not isinstance(item, (list, tuple, Index)): - item_list = [item_list] - return self.__partition_wrapper(pd.DataFrame.__getitem__, item_list) - - def __setitem__(self, key, value): - """Assign values to columns. - - Args: - key (Union[str, List[str]]): columns to be assigned. - value (PartitionBase): assigned values. - """ - if isinstance(value, PartitionBase): - assert ( - self.data.device == value.data.device - ), f'Can not assign a partition with different device.' - - def _setitem(df: pd.DataFrame, k, v): - # Deep copy DataFrame since ray object store is immutable. - df = df.copy(deep=True) - df[k] = v - return df - - self.data = self.data.device(_setitem)( - self.data, key, value if not isinstance(value, PdPartition) else value.data - ) - - def copy(self): - """Shallow copy.""" - return PdPartition(self.data) - - def to_pandas(self): - return self diff --git a/secretflow/data/partition/polars/partition.py b/secretflow/data/partition/polars/partition.py deleted file mode 100644 index aad9f1b48..000000000 --- a/secretflow/data/partition/polars/partition.py +++ /dev/null @@ -1,317 +0,0 @@ -# Copyright 2023 Ant Group Co., Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import logging -from pathlib import Path -from typing import Callable, List, Union - -import polars as pl -import polars.selectors as cs -from pandas.core.indexes.base import Index - -from secretflow.data.io.util import is_local_file -from secretflow.device import PYUObject -from secretflow.device import reveal -from .util import infer_pl_dtype -from ..pandas.partition import PdPartition -from ...base import PartitionBase - - -class PolarsPartition(PartitionBase): - def __init__(self, data: PYUObject = None): - super().__init__(data, backend="polars") - - def __partition_wrapper(self, fn: Callable, *args, **kwargs) -> 'PartitionBase': - return PolarsPartition(self.data.device(fn)(self.data, *args, **kwargs)) - - def __pd_stat_wrapper(self, fn: Callable, *args, **kwargs) -> 'PartitionBase': - return PdPartition(self.data.device(fn)(self.data, *args, **kwargs)) - - def mean(self, *args, **kwargs) -> 'PartitionBase': - def _mean(df, *_args, **_kwargs): - mean_df = df.mean(*_args, **_kwargs) - return mean_df.to_pandas().iloc[0, :] - - return self.__pd_stat_wrapper(_mean, *args, **kwargs) - - def var(self, *args, **kwargs) -> 'PartitionBase': - def _var(df, *_args, **_kwargs): - var_df = df.var(*_args, **_kwargs) - return var_df.to_pandas().iloc[0, :] - - return self.__pd_stat_wrapper(_var, *args, **kwargs) - - def std(self, *args, **kwargs) -> 'PartitionBase': - def _std(df, *_args, **_kwargs): - std_df = df.std(*_args, **_kwargs) - return std_df.to_pandas().iloc[0, :] - - return self.__pd_stat_wrapper(_std, *args, **kwargs) - - def sem(self, *args, **kwargs) -> 'PartitionBase': - raise NotImplementedError() - - def skew(self, *args, **kwargs) -> 'PartitionBase': - raise NotImplementedError() - - def kurtosis(self, *args, **kwargs) -> 'PartitionBase': - raise NotImplementedError() - - def sum(self, *args, **kwargs) -> 'PartitionBase': - def _sum(df, *_args, **_kwargs): - sum_df = df.sum(*_args, **_kwargs) - return sum_df.to_pandas().iloc[0, :] - - return self.__pd_stat_wrapper(_sum, *args, **kwargs) - - def replace(self, *args, **kwargs) -> 'PartitionBase': - return self.__partition_wrapper(pl.DataFrame.replace, *args, **kwargs) - - @reveal - def quantile(self, q=0.5, axis=0) -> 'PartitionBase': - def _quantile(df, _q, _axis): - quantile_df = df.quantile(q=_q, axis=_axis) - return quantile_df.to_pandas().iloc[0, :] - - return self.__pd_stat_wrapper(_quantile, q, axis) - - def min(self, *args, **kwargs) -> 'PartitionBase': - def _min(df, *_args, **_kwargs): - axis = 0 - if "axis" in _kwargs: - axis = _kwargs["axis"] - min_df = df.min(axis) - return min_df.to_pandas().iloc[0, :] - - return self.__pd_stat_wrapper(_min, *args, **kwargs) - - def mode(self, *args, **kwargs) -> 'PartitionBase': - raise NotImplementedError() - - def max(self, *args, **kwargs) -> 'PartitionBase': - def _max(df, *_args, **_kwargs): - axis = 0 - if "axis" in _kwargs: - axis = _kwargs["axis"] - max_df = df.max(axis) - return max_df.to_pandas().iloc[0, :] - - return self.__pd_stat_wrapper(_max, *args, **kwargs) - - def count(self, *args, **kwargs) -> 'PartitionBase': - raise NotImplementedError() - - def isna(self) -> 'PartitionBase': - raise NotImplementedError() - - def pow(self, *args, **kwargs) -> 'PartitionBase': - raise NotImplementedError() - - def subtract(self, *args, **kwargs) -> 'PartitionBase': - raise NotImplementedError() - - def round(self, *args, **kwargs) -> 'PartitionBase': - return PolarsPartition( - self.data.device(pl.DataFrame.with_columns)( - self.data, - cs.by_dtype(pl.FLOAT_DTYPES).round(*args, **kwargs), - ) - ) - - def select_dtypes(self, *args, **kwargs) -> 'PartitionBase': - raise NotImplementedError() - - @property - def values(self): - return self.to_pandas().values - - @property - @reveal - def index(self): - raise NotImplementedError() - - @property - @reveal - def dtypes(self): - def get_dict_dtypes(_df: pl.DataFrame): - return {_df.columns[i]: _df.dtypes[i] for i in range(len(_df.columns))} - - return self.data.device(get_dict_dtypes)(self.data) - - def astype(self, dtype, copy: bool = True, errors: str = "raise"): - def _cast_type(df: pl.DataFrame, _dtype, _copy): - if _copy: - new_df = df.clone() - else: - new_df = df - exprs = [] - if isinstance(_dtype, dict): - for col in _dtype: - exprs.append(pl.col(col).cast(infer_pl_dtype(_dtype[col]))) - else: - exprs.append(pl.col("*").cast(infer_pl_dtype(_dtype))) - new_df = new_df.with_columns(*exprs) - return new_df - - if copy: - return PolarsPartition(self.data.device(_cast_type)(self.data, dtype, copy)) - else: - self.data = self.data.device(_cast_type)(self.data, dtype, copy) - return self - - @property - @reveal - def columns(self): - return self.data.device(lambda df: df.columns)(self.data) - - @property - @reveal - def shape(self): - return self.data.device(lambda df: df.shape)(self.data) - - def iloc(self, index: Union[int, slice, List[int]]) -> 'PartitionBase': - raise NotImplementedError() - - def drop( - self, - labels=None, - axis=0, - index=None, - columns=None, - level=None, - inplace=False, - errors='raise', - ) -> Union['PartitionBase', None]: - def _drop(df: pl.DataFrame, **kwargs): - if inplace: - new_df = df.clone() - new_df.drop(**kwargs) - return new_df - else: - return df.drop(**kwargs) - - new_data = self.data.device(_drop)( - self.data, - columns=columns, - ) - if inplace: - self.data = new_data - else: - return PolarsPartition(new_data) - - def fillna( - self, - value=None, - method=None, - axis=None, - inplace=False, - limit=None, - downcast=None, - ) -> Union['PartitionBase', None]: - def _fillna(df: pl.DataFrame, **kwargs): - if inplace: - new_df = df.clone() - new_df.fill_null(**kwargs) # TODO: 明确fill_null和fill_na的区别 - return new_df - else: - return df.fill_null(**kwargs) - - new_data = self.data.device(_fillna)(self.data, value=value) - if inplace: - self.data = new_data - return self - else: - return PolarsPartition(new_data) - - def rename( - self, - mapper=None, - index=None, - columns=None, - axis=None, - copy=True, - inplace=False, - level=None, - errors='ignore', - ) -> Union['PartitionBase', None]: - if mapper is not None: - logging.warning("Polars dataframe does not support mapper in reaname.") - if index is not None: - logging.warning("Polars dataframe does not support index in reaname.") - if axis is not None: - logging.warning("Polars dataframe does not support index in reaname.") - - def _rename(df: pl.DataFrame, rename_col_dict): - if inplace: - new_df = df.clone() - new_df.rename(rename_col_dict) - return new_df - else: - return df.rename(rename_col_dict) - - new_data = self.data.device(_rename)(self.data, columns) - if inplace: - self.data = new_data - else: - return PolarsPartition(new_data) - - def value_counts(self, *args, **kwargs) -> 'PartitionBase': - raise NotImplementedError() - - def to_csv(self, filepath, **kwargs): - def _to_csv_wrapper(df: pl.DataFrame, path): - if is_local_file(path): - Path(path).parent.mkdir(parents=True, exist_ok=True) - df.write_csv(path) - - return self.data.device(_to_csv_wrapper)(self.data, filepath) - - @reveal - def __len__(self): - """Returns the number of rows.""" - return self.data.device(lambda df: len(df))(self.data) - - def __getitem__(self, item): - item_list = item - if not isinstance(item, (list, tuple, Index)): - item_list = [item_list] - return self.__partition_wrapper(pl.DataFrame.__getitem__, item_list) - - def __setitem__(self, key, value): - if isinstance(value, PolarsPartition): - assert ( - self.data.device == value.data.device - ), f'Can not assign a partition with different device.' - - def _setitem(df: pl.DataFrame, ks, v): - # Deep copy DataFrame since ray object store is immutable. - df = df.clone() - if isinstance(v, pl.DataFrame): - """set a DataFrame""" - assert ks == v.columns - return df.with_columns(v) - df[ks] = v - return df - - self.data = self.data.device(_setitem)( - self.data, - key, - value if not isinstance(value, PolarsPartition) else value.data, - ) - - def copy(self): - """shallow copy""" - return PolarsPartition(self.data) - - def to_pandas(self): - return PdPartition(self.data.device(pl.DataFrame.to_pandas)(self.data)) diff --git a/secretflow/data/split.py b/secretflow/data/split.py index eefab5e41..1ee5757be 100644 --- a/secretflow/data/split.py +++ b/secretflow/data/split.py @@ -19,10 +19,9 @@ import pandas as pd from sklearn.model_selection import train_test_split as _train_test_split -from secretflow.data.horizontal.dataframe import HDataFrame -from secretflow.data.ndarray import FedNdarray -from secretflow.data.base import partition -from secretflow.data.vertical.dataframe import VDataFrame +from .horizontal.dataframe import HDataFrame +from .ndarray import FedNdarray +from .vertical.dataframe import VDataFrame def train_test_split( @@ -39,7 +38,7 @@ def train_test_split( test_size (float): test dataset size, default is None. train_size (float): train dataset size, default is None. random_state (int): Controls the shuffling applied to the data before applying the split. - shuffle (bool): Whether or not to shuffle the data before splitting, default is True. + shuffle (bool): Whether to shuffle the data before splitting, default is True. Returns splitting : list, length=2 * len(arrays) @@ -67,8 +66,8 @@ def train_test_split( >>> df_alice = df_alice >>> df_bob = df_bob >>> vdf = VDataFrame( - ... {alice: Partition(data=cls.alice(lambda: df_alice)()), - ... bob: Partition(data=cls.bob(lambda: df_bob)())}) + ... {alice: partition(data=cls.alice(lambda: df_alice)()), + ... bob: partition(data=cls.bob(lambda: df_bob)())}) >>> train_vdf, test_vdf = train_test_split(vdf, test_size=0.33, random_state=42) """ @@ -102,49 +101,47 @@ def split(*args, **kwargs) -> Tuple[object, object]: parts_train, parts_test = {}, {} for device, part in data.partitions.items(): if isinstance(data, FedNdarray): - part_data = part + parts_train[device], parts_test[device] = device(split)( + part, + train_size=train_size, + test_size=test_size, + random_state=random_state, + shuffle=shuffle, + ) else: - part_data = part.data - parts_train[device], parts_test[device] = device(split)( - part_data, - train_size=train_size, - test_size=test_size, - random_state=random_state, - shuffle=shuffle, - ) + parts_train[device], parts_test[device] = part.apply_func( + split, + nums_return=2, + train_size=train_size, + test_size=test_size, + random_state=random_state, + shuffle=shuffle, + ) if isinstance(data, VDataFrame): return VDataFrame( - partitions={ - pyu: partition(data=part, backend=data.partitions[pyu].backend) - for pyu, part in parts_train.items() - }, + partitions=parts_train, aligned=data.aligned, ), VDataFrame( - partitions={ - pyu: partition(data=part, backend=data.partitions[pyu].backend) - for pyu, part in parts_test.items() - }, + partitions=parts_test, aligned=data.aligned, ) elif isinstance(data, HDataFrame): return HDataFrame( - partitions={ - pyu: partition(data=part, backend=data.partitions[pyu].backend) - for pyu, part in parts_train.items() - }, + partitions=parts_train, aggregator=data.aggregator, comparator=data.comparator, ), HDataFrame( - partitions={ - pyu: partition(data=part, backend=data.partitions[pyu].backend) - for pyu, part in parts_test.items() - }, + partitions=parts_test, aggregator=data.aggregator, comparator=data.comparator, ) else: return ( - FedNdarray(parts_train, data.partition_way), - FedNdarray(parts_test, data.partition_way), + FedNdarray( + {pyu: part for pyu, part in parts_train.items()}, data.partition_way + ), + FedNdarray( + {pyu: part for pyu, part in parts_test.items()}, data.partition_way + ), ) diff --git a/secretflow/data/vertical/dataframe.py b/secretflow/data/vertical/dataframe.py index 8bf4ee15d..edb0ab277 100644 --- a/secretflow/data/vertical/dataframe.py +++ b/secretflow/data/vertical/dataframe.py @@ -13,16 +13,18 @@ # limitations under the License. from dataclasses import dataclass -from typing import Dict, List, Union +from typing import Callable, Dict, List, Union import pandas as pd from pandas import Index -from secretflow.data.base import DataFrameBase, PartitionBase -from secretflow.data.ndarray import FedNdarray, PartitionWay -from secretflow.device import PYU, Device, reveal +from secretflow.device import PYU, reveal from secretflow.utils.errors import InvalidArgumentError, NotFoundError +from ..base import DataFrameBase +from ..core import Partition +from ..ndarray import FedNdarray, PartitionWay + @dataclass class VDataFrame(DataFrameBase): @@ -76,13 +78,13 @@ class 150 >>> v_df.fillna({'sepal_length': 2}) """ - partitions: Dict[PYU, PartitionBase] + partitions: Dict[PYU, Partition] aligned: bool = True def _check_parts(self): assert self.partitions, 'Partitions in the VDataFrame is None or empty.' - def __concat_reveal_apply(self, fn: str, *args, **kwargs) -> pd.Series: + def __concat_apply_reveal(self, fn: Callable, *args, **kwargs) -> pd.Series: """Helper function to concatenate the revealed results of fn applied on each partition. Args: @@ -93,10 +95,7 @@ def __concat_reveal_apply(self, fn: str, *args, **kwargs) -> pd.Series: """ return pd.concat( reveal( - [ - getattr(part, fn)(*args, **kwargs).data - for part in self.partitions.values() - ] + [fn(part, *args, **kwargs).data for part in self.partitions.values()] ) ) @@ -110,30 +109,126 @@ def __part_apply(self, fn, *args, **kwargs) -> 'VDataFrame': """ # Note the [par.columns] is here to make sure alice does not see bob's columns. - # and it is only effective for subtract two VDataFrames with the same partitions roles and shapes.∂ + # and it is only effective for subtract two VDataFrames with the same partitions roles and shapes. return VDataFrame( { - pyu: getattr(part, fn)(*args, **kwargs)[part.columns] + pyu: fn(part, *args, **kwargs)[part.columns] for pyu, part in self.partitions.items() }, self.aligned, ) - def mode(self, numeric_only=False, dropna=True) -> pd.Series: + def _col_index(self, col) -> Dict[PYU, Union[str, List[str]]]: + assert ( + col.tolist() if isinstance(col, Index) else col + ), f'Column to index is None or empty!' + pyu_col = {} + listed_col = col.tolist() if isinstance(col, Index) else col + if not isinstance(listed_col, (list, tuple)): + listed_col = [listed_col] + for key in listed_col: + found = False + for pyu, part in self.partitions.items(): + if key not in part.dtypes: + continue + + found = True + if pyu not in pyu_col: + pyu_col[pyu] = [] # ensure the output is [] + pyu_col[pyu].append(key) + else: + if not isinstance(pyu_col[pyu], list): + # Convert to list if more than one column. + pyu_col[pyu] = [pyu_col[pyu]] + pyu_col[pyu].append(key) + + break + + if not found: + raise NotFoundError(f'Item {key} does not exist.') + return pyu_col + + def __getitem__(self, item) -> "VDataFrame": + item_index = self._col_index(item) + return VDataFrame( + partitions={ + pyu: self.partitions[pyu][keys] for pyu, keys in item_index.items() + } + ) + + def __setitem__(self, key, value): + if isinstance(value, Partition): + assert ( + value.device in self.partitions + ), 'Device of the partition to assgin is not in this dataframe devices.' + self.partitions[value.device][key] = value + return + elif isinstance(value, VDataFrame): + for pyu in value.partitions.keys(): + assert ( + pyu in self.partitions + ), 'Partitions to assgin is not same with this dataframe partitions.' + try: + key_index = self._col_index(key) + for pyu, col in key_index.items(): + self.partitions[pyu][col] = value.partitions[pyu] + except NotFoundError: + # Insert as a new key if not seen. + for pyu, part in value.partitions.items(): + self.partitions[pyu][list(part.dtypes.keys())] = part + else: + key_index = self._col_index(key) + for pyu, col in key_index.items(): + self.partitions[pyu][col] = value + + @property + def columns(self) -> list: """ - Return the mode of the values over the axis 0. - The mode of a set of values is the value that appears most often. - Restrict mode on axis 0 in VDataFrame for data protection reasons. + The column labels of the DataFrame. + """ + self._check_parts() + cols = None + for part in self.partitions.values(): + if cols is None: + cols = part.columns + elif isinstance(cols, list): + cols.extend(part.columns) + else: + cols = cols.append(part.columns) + return cols + + @property + def dtypes(self) -> dict: + """ + Return the dtypes in the DataFrame. + + Returns: + dict: the data type of each column. + """ + my_dtypes = {} + for part in self.partitions.values(): + my_dtypes.update(part.dtypes) + return my_dtypes + + @property + def shape(self) -> tuple: + """Return a tuple representing the dimensionality of the DataFrame.""" + self._check_parts() + shapes = [part.shape for part in self.partitions.values()] + return shapes[0][0], sum([shape[1] for shape in shapes]) + + def count(self, numeric_only=False) -> pd.Series: + """Count non-NA cells for each column. + + Restrict count on axis 0 in VDataFrame for data protection reasons. Returns: pd.Series """ - return self.__concat_reveal_apply( - "mode", - numeric_only=numeric_only, - dropna=dropna, - axis=0, - ) + return self.__concat_apply_reveal(Partition.count, numeric_only=numeric_only) + + def index(self) -> list: + raise NotImplementedError() def sum(self, numeric_only=False) -> pd.Series: """ @@ -144,7 +239,7 @@ def sum(self, numeric_only=False) -> pd.Series: Returns: pd.Series """ - return self.__concat_reveal_apply("sum", numeric_only=numeric_only) + return self.__concat_apply_reveal(Partition.sum, numeric_only=numeric_only) def min(self, numeric_only=False) -> pd.Series: """ @@ -157,7 +252,7 @@ def min(self, numeric_only=False) -> pd.Series: Returns: pd.Series """ - return self.__concat_reveal_apply("min", numeric_only=numeric_only) + return self.__concat_apply_reveal(Partition.min, numeric_only=numeric_only) def max(self, numeric_only=False): """ @@ -170,136 +265,7 @@ def max(self, numeric_only=False): Returns: pd.Series """ - return self.__concat_reveal_apply("max", numeric_only=numeric_only) - - def pow(self, *args, **kwargs) -> 'VDataFrame': - """Gets Exponential power of dataframe and other, element-wise (binary operator pow). - Equivalent to dataframe ** other, but with support to substitute a fill_value for missing data in one of the inputs. - With reverse version, rpow. - Among flexible wrappers (add, sub, mul, div, mod, pow) to arithmetic operators: +, -, *, /, //, %, **. - - Returns: - VDataFrame - - Reference: - pd.DataFrame.pow - """ - return self.__part_apply("pow", *args, **kwargs) - - def round(self, *args, **kwargs) -> 'VDataFrame': - """Round the DataFrame to a variable number of decimal places. - - Returns: - VDataFrame: same shape except value rounded - - Reference: - pd.DataFrame.round - """ - return self.__part_apply("round", *args, **kwargs) - - def select_dtypes(self, *args, **kwargs) -> 'VDataFrame': - """Returns a subset of the DataFrame's columns based on the column dtypes. - - Reference: - pandas.DataFrame.select_dtypes - """ - return VDataFrame( - { - pyu: part.select_dtypes(*args, **kwargs) - for pyu, part in self.partitions.items() - }, - self.aligned, - ) - - def replace(self, *args, **kwargs) -> 'VDataFrame': - """Replace values given in to_replace with value. - Same as pandas.DataFrame.replace - Values of the DataFrame are replaced with other values dynamically. - - Returns: - VDataFrame: same shape except value replaced - """ - return self.__part_apply("replace", *args, **kwargs) - - def subtract(self, *args, **kwargs) -> 'VDataFrame': - """Gets Subtraction of dataframe and other, element-wise (binary operator sub). - Equivalent to dataframe - other, but with support to substitute a fill_value for missing data in one of the inputs. - With reverse version, rsub. - Among flexible wrappers (add, sub, mul, div, mod, pow) to arithmetic operators: +, -, *, /, //, %, **. - - Note each part only will contains its own columns. - - Reference: - pd.DataFrame.subtract - """ - return self.__part_apply("subtract", *args, **kwargs) - - @property - def dtypes(self) -> dict: - """ - Return the dtypes in the DataFrame. - - Returns: - dict: the data type of each column. - """ - my_dtypes = {} - for part in self.partitions.values(): - my_dtypes.update(part.to_pandas().dtypes) - return my_dtypes - - def astype(self, dtype, copy: bool = True, errors: str = "raise"): - """ - Cast object to a specified dtype ``dtype``. - - All args are same as :py:meth:`pandas.DataFrame.astype`. - """ - if isinstance(dtype, dict): - item_index = self._col_index(list(dtype.keys())) - new_parts = {} - for pyu, part in self.partitions.items(): - if pyu not in item_index: - new_parts[pyu] = part.copy() - else: - cols = item_index[pyu] - if not isinstance(cols, list): - cols = [cols] - new_parts[pyu] = part.astype( - dtype={col: dtype[col] for col in cols}, - copy=copy, - errors=errors, - ) - return VDataFrame(partitions=new_parts, aligned=self.aligned) - - return VDataFrame( - partitions={ - pyu: part.astype(dtype, copy, errors) - for pyu, part in self.partitions.items() - }, - aligned=self.aligned, - ) - - @property - def columns(self) -> list: - """ - The column labels of the DataFrame. - """ - self._check_parts() - cols = None - for part in self.partitions.values(): - if cols is None: - cols = part.columns - elif isinstance(cols, list): - cols.extend(part.columns) - else: - cols = cols.append(part.columns) - return cols - - @property - def shape(self): - """Return a tuple representing the dimensionality of the DataFrame.""" - self._check_parts() - shapes = [part.shape for part in self.partitions.values()] - return shapes[0][0], sum([shape[1] for shape in shapes]) + return self.__concat_apply_reveal(Partition.max, numeric_only=numeric_only) def mean(self, numeric_only=False) -> pd.Series: """ @@ -312,7 +278,7 @@ def mean(self, numeric_only=False) -> pd.Series: Returns: pd.Series """ - return self.__concat_reveal_apply("mean", numeric_only=numeric_only) + return self.__concat_apply_reveal(Partition.mean, numeric_only=numeric_only) # TODO(zoupeicheng.zpc): support in HDataFrame is not scheduled yet # TODO(zoupeicheng.zpc): DataFrame variance currently ignore None columns. @@ -328,7 +294,7 @@ def var(self, numeric_only=False) -> pd.Series: Returns: pd.Series """ - return self.__concat_reveal_apply("var", numeric_only=numeric_only) + return self.__concat_apply_reveal(Partition.var, numeric_only=numeric_only) # TODO(zoupeicheng.zpc): support in HDataFrame is not scheduled yet # TODO(zoupeicheng.zpc): DataFrame std currently ignore None columns. @@ -344,7 +310,7 @@ def std(self, numeric_only=False) -> pd.Series: Returns: pd.Series """ - return self.__concat_reveal_apply("std", numeric_only=numeric_only) + return self.__concat_apply_reveal(Partition.std, numeric_only=numeric_only) # TODO(zoupeicheng.zpc): support in HDataFrame is not scheduled yet # TODO(zoupeicheng.zpc): DataFrame sem currently ignore None columns. @@ -360,7 +326,7 @@ def sem(self, numeric_only=False) -> pd.Series: Returns: pd.Series """ - return self.__concat_reveal_apply("sem", numeric_only=numeric_only) + return self.__concat_apply_reveal(Partition.sem, numeric_only=numeric_only) # TODO(zoupeicheng.zpc): support in HDataFrame is not scheduled yet # TODO(zoupeicheng.zpc): DataFrame skew currently ignore None columns. @@ -376,7 +342,7 @@ def skew(self, numeric_only=False) -> pd.Series: Returns: pd.Series """ - return self.__concat_reveal_apply("skew", numeric_only=numeric_only) + return self.__concat_apply_reveal(Partition.skew, numeric_only=numeric_only) # TODO(zoupeicheng.zpc): support in HDataFrame is not scheduled yet # TODO(zoupeicheng.zpc): DataFrame kurtosis currently ignore None columns. @@ -392,9 +358,9 @@ def kurtosis(self, numeric_only=False) -> pd.Series: Returns: pd.Series """ - return self.__concat_reveal_apply("kurtosis", numeric_only=numeric_only) + return self.__concat_apply_reveal(Partition.kurtosis, numeric_only=numeric_only) - def quantile(self, q=0.5) -> pd.Series: + def quantile(self, q=0.5, numeric_only=True, **kwargs) -> pd.Series: """Returns values at the given quantile over axis 0. Note columns containing None values are ignored. Fill before proceed. @@ -404,19 +370,43 @@ def quantile(self, q=0.5) -> pd.Series: Returns: pd.Series """ - return self.__concat_reveal_apply("quantile", q=q) - - def count(self, numeric_only=False) -> pd.Series: - """Count non-NA cells for each column. + return self.__concat_apply_reveal( + Partition.quantile, q=q, axis=0, numeric_only=numeric_only, **kwargs + ) - Restrict count on axis 0 in VDataFrame for data protection reasons. + def mode(self, numeric_only=False, dropna=True) -> pd.Series: + """ + Return the mode of the values over the axis 0. + The mode of a set of values is the value that appears most often. + Restrict mode on axis 0 in VDataFrame for data protection reasons. Returns: pd.Series """ - return self.__concat_reveal_apply("count", numeric_only=numeric_only) + return self.__concat_apply_reveal( + Partition.mode, + numeric_only=numeric_only, + dropna=dropna, + axis=0, + ) + + def value_counts(self, *args, **kwargs) -> pd.Series: + return self.__concat_apply_reveal(Partition.value_counts, *args, **kwargs) + + @property + def values(self) -> FedNdarray: + """ + Return a federated Numpy representation of the DataFrame. - def isna(self) -> 'VDataFrame': + Returns: + FedNdarray. + """ + return FedNdarray( + partitions={pyu: part.values for pyu, part in self.partitions.items()}, + partition_way=PartitionWay.VERTICAL, + ) + + def isna(self) -> "DataFrameBase": """ "Detects missing values for an array-like object. Same as pandas.DataFrame.isna Returns @@ -429,19 +419,47 @@ def isna(self) -> 'VDataFrame': Reference: pd.DataFrame.isna """ - return self.__part_apply("isna") + return self.__part_apply(Partition.isna) - @property - def values(self) -> FedNdarray: - """ - Return a federated Numpy representation of the DataFrame. + def replace(self, *args, **kwargs) -> "VDataFrame": + """Replace values given in to_replace with value. + Same as pandas.DataFrame.replace + Values of the DataFrame are replaced with other values dynamically. Returns: - FedNdarray. + VDataFrame: same shape except value replaced """ - return FedNdarray( - partitions={pyu: part.values for pyu, part in self.partitions.items()}, - partition_way=PartitionWay.VERTICAL, + return self.__part_apply(Partition.replace, *args, **kwargs) + + def astype(self, dtype, copy: bool = True, errors: str = "raise") -> "VDataFrame": + """ + Cast object to a specified dtype ``dtype``. + + All args are same as :py:meth:`pandas.DataFrame.astype`. + """ + if isinstance(dtype, dict): + item_index = self._col_index(list(dtype.keys())) + new_parts = {} + for pyu, part in self.partitions.items(): + if pyu not in item_index: + new_parts[pyu] = part.copy() + else: + cols = item_index[pyu] + if not isinstance(cols, list): + cols = [cols] + new_parts[pyu] = part.astype( + dtype={col: dtype[col] for col in cols}, + copy=copy, + errors=errors, + ) + return VDataFrame(partitions=new_parts, aligned=self.aligned) + + return VDataFrame( + partitions={ + pyu: part.astype(dtype, copy, errors) + for pyu, part in self.partitions.items() + }, + aligned=self.aligned, ) def copy(self) -> 'VDataFrame': @@ -451,7 +469,7 @@ def copy(self) -> 'VDataFrame': Returns: VDataFrame. """ - return self.__part_apply("copy") + return self.__part_apply(Partition.copy) def drop( self, @@ -512,7 +530,7 @@ def drop( ) else: return self.__part_apply( - "drop", + Partition.drop, labels=labels, axis=axis, index=index, @@ -548,10 +566,9 @@ def fillna( limit=limit, downcast=downcast, ) - return self else: return self.__part_apply( - "fillna", + Partition.fillna, value=value, method=method, axis=axis, @@ -580,84 +597,98 @@ def to_csv(self, fileuris: Dict[PYU, str], **kwargs): for device, uri in fileuris.items() ] - def __len__(self): - """Return the max length if not aligned.""" - parts = list(self.partitions.values()) - assert parts, 'No partitions in VDataFrame.' - return max([len(part) for part in parts]) + def iloc(self, index: Union[int, slice, List[int]]) -> 'DataFrameBase': + raise NotImplementedError() - def _col_index(self, col) -> Dict[Device, Union[str, List[str]]]: - assert ( - col.tolist() if isinstance(col, Index) else col - ), f'Column to index is None or empty!' - pyu_col = {} - listed_col = col.tolist() if isinstance(col, Index) else col - if not isinstance(listed_col, (list, tuple)): - listed_col = [listed_col] - for key in listed_col: - found = False - for pyu, part in self.partitions.items(): - if key not in part.dtypes: - continue + def rename( + self, + mapper=None, + index=None, + columns=None, + axis=None, + copy=True, + inplace=False, + level=None, + errors='ignore', + ) -> Union['DataFrameBase', None]: + raise NotImplementedError() - found = True - if pyu not in pyu_col: - pyu_col[pyu] = [] # ensure the output is [] - pyu_col[pyu].append(key) - else: - if not isinstance(pyu_col[pyu], list): - # Convert to list if more than one column. - pyu_col[pyu] = [pyu_col[pyu]] - pyu_col[pyu].append(key) + def pow(self, *args, **kwargs) -> 'VDataFrame': + """Gets Exponential power of dataframe and other, element-wise (binary operator pow). + Equivalent to dataframe ** other, but with support to substitute a fill_value for missing data in one of the inputs. + With reverse version, rpow. + Among flexible wrappers (add, sub, mul, div, mod, pow) to arithmetic operators: +, -, *, /, //, %, **. - break + Returns: + VDataFrame - if not found: - raise NotFoundError(f'Item {key} does not exist.') - return pyu_col + Reference: + pd.DataFrame.pow + """ + return self.__part_apply(Partition.pow, *args, **kwargs) - def __getitem__(self, item) -> 'VDataFrame': - item_index = self._col_index(item) + def round(self, decimals: Union[int, dict]) -> 'VDataFrame': + """Round the DataFrame to a variable number of decimal places. + + Returns: + VDataFrame: same shape except value rounded + + Reference: + pd.DataFrame.round + """ + if isinstance(decimals, dict): + col_idx = self._col_index(list(decimals.keys())) + ret_part = {} + for pyu, part in self.partitions.items(): + if pyu in col_idx: + cols = col_idx[pyu] + if not isinstance(cols, list): + cols = [cols] + ret_part[pyu] = part.round({col: decimals[col] for col in cols}) + else: + ret_part[pyu] = part + return VDataFrame( + ret_part, + self.aligned, + ) + else: + return self.__part_apply(Partition.round, decimals) + + def select_dtypes(self, *args, **kwargs) -> 'VDataFrame': + """Returns a subset of the DataFrame's columns based on the column dtypes. + + Reference: + pandas.DataFrame.select_dtypes + """ return VDataFrame( - partitions={ - pyu: self.partitions[pyu][keys] for pyu, keys in item_index.items() - } + { + pyu: part.select_dtypes(*args, **kwargs) + for pyu, part in self.partitions.items() + }, + self.aligned, ) - def __setitem__(self, key, value): - if isinstance(value, PartitionBase): - assert ( - value.data.device in self.partitions - ), 'Device of the partition to assgin is not in this dataframe devices.' - self.partitions[value.data.device][key] = value - return - elif isinstance(value, VDataFrame): - for pyu in value.partitions.keys(): - assert ( - pyu in self.partitions - ), 'Partitions to assgin is not same with this dataframe partitions.' - try: - key_index = self._col_index(key) - for pyu, col in key_index.items(): - self.partitions[pyu][col] = ( - value.partitions[pyu] - if isinstance(value, VDataFrame) - else value - ) - except NotFoundError: - # Insert as a new key if not seen. - for pyu, part in value.partitions.items(): - self.partitions[pyu][list(part.dtypes.keys())] = part - else: - key_index = self._col_index(key) - for pyu, col in key_index.items(): - self.partitions[pyu][col] = ( - value.partitions[pyu] if isinstance(value, VDataFrame) else value - ) + def subtract(self, *args, **kwargs) -> 'VDataFrame': + """Gets Subtraction of dataframe and other, element-wise (binary operator sub). + Equivalent to dataframe - other, but with support to substitute a fill_value for missing data in one of the inputs. + With reverse version, rsub. + Among flexible wrappers (add, sub, mul, div, mod, pow) to arithmetic operators: +, -, *, /, //, %, **. + + Note each part only will contain its own columns. + + Reference: + pd.DataFrame.subtract + """ + return self.__part_apply(Partition.subtract, *args, **kwargs) + + def apply_func(self, func, *, nums_return: int = 1, **kwargs) -> 'VDataFrame': + return self.__part_apply( + Partition.apply_func, nums_return=nums_return, func=func, **kwargs + ) @reveal def partition_shape(self): - """Return shapes of each partition. + """Return shapes of each core. Returns: a dict of {pyu: shape} @@ -678,7 +709,13 @@ def partition_columns(self): device: partition.columns for device, partition in self.partitions.items() } - def to_pandas(self): + def __len__(self): + """Return the max length if not aligned.""" + parts = list(self.partitions.values()) + assert parts, 'No partitions in VDataFrame.' + return max([len(part) for part in parts]) + + def to_pandas(self) -> "VDataFrame": return VDataFrame( {pyu: part.to_pandas() for pyu, part in self.partitions.items()}, self.aligned, diff --git a/secretflow/data/vertical/io.py b/secretflow/data/vertical/io.py index 090240cee..d2fc968cd 100644 --- a/secretflow/data/vertical/io.py +++ b/secretflow/data/vertical/io.py @@ -14,13 +14,14 @@ from typing import Dict, List, Union -from secretflow.data.partition.io import read_csv_wrapper -from secretflow.data.base import partition -from secretflow.data.vertical.dataframe import VDataFrame from secretflow.device import PYU, SPU, Device from secretflow.utils.errors import InvalidArgumentError from secretflow.utils.random import global_random +from ..core import partition +from ..core.io import read_csv_wrapper +from .dataframe import VDataFrame + def read_csv( filepath: Dict[PYU, str], @@ -122,17 +123,16 @@ def get_keys( usecols = dtypes[device].keys() if dtypes is not None else None dtype = dtypes[device] if dtypes is not None else None partitions[device] = partition( - device(read_csv_wrapper)( - path, - auto_gen_header_prefix=str(device) if no_header else "", - delimiter=delimiter, - usecols=usecols, - dtype=dtype, - backend=backend, - ), + data=read_csv_wrapper, + device=device, backend=backend, + filepath=path, + auto_gen_header_prefix=str(device) if no_header else "", + delimiter=delimiter, + usecols=usecols, + dtype=dtype, + read_backend=backend, ) - if drop_keys: for device, part in partitions.items(): device_drop_key = get_keys(device, drop_keys) diff --git a/secretflow/device/device/base.py b/secretflow/device/device/base.py index 9b3267f2b..bf7b6d227 100644 --- a/secretflow/device/device/base.py +++ b/secretflow/device/device/base.py @@ -42,6 +42,8 @@ def _name_of_to(device_type: DeviceType): class DeviceObject(ABC): + device: 'Device' + def __init__(self, device: Device): """Abstraction device object base class. diff --git a/secretflow/device/device/pyu.py b/secretflow/device/device/pyu.py index ab2230ec0..84330ff68 100644 --- a/secretflow/device/device/pyu.py +++ b/secretflow/device/device/pyu.py @@ -33,6 +33,8 @@ class PYUObject(DeviceObject): data: Reference to underlying data. """ + device: 'PYU' + def __init__(self, device: 'PYU', data: Union[ray.ObjectRef, fed.FedObject]): super().__init__(device) self.data = data @@ -145,10 +147,10 @@ def _run(fn, *args, **kwargs): for pos, arg in enumerate(arg_flat) if isinstance(arg, ray.ObjectRef) } - - actual_vals = ray.get(list(refs.values())) - for pos, actual_val in zip(refs.keys(), actual_vals): - arg_flat[pos] = actual_val + if refs: + actual_vals = ray.get(list(refs.values())) + for pos, actual_val in zip(refs.keys(), actual_vals): + arg_flat[pos] = actual_val args, kwargs = jax.tree_util.tree_unflatten(arg_tree, arg_flat) return fn(*args, **kwargs) diff --git a/secretflow/device/driver.py b/secretflow/device/driver.py index 300d7bc8d..c298084f4 100644 --- a/secretflow/device/driver.py +++ b/secretflow/device/driver.py @@ -25,6 +25,7 @@ import secretflow.distributed as sfd from secretflow.device import global_state +from secretflow.distributed.primitive import DISTRIBUTION_MODE from secretflow.utils.errors import InvalidArgumentError from secretflow.utils.logging import set_logging_level from secretflow.utils.ray_compatibility import ray_version_less_than_2_0_0 @@ -42,7 +43,6 @@ TEEUObject, ) - _CROSS_SILO_COMM_BACKENDS = ['grpc', 'brpc_link'] @@ -225,6 +225,7 @@ def init( auth_manager_config: Dict = None, party_key_pair: Dict[str, Dict] = None, tee_simulation: bool = False, + debug_mode=False, **kwargs, ): """Connect to an existing Ray cluster or start one and connect to it. @@ -399,11 +400,16 @@ def init( tee_simulation: optional, enable TEE simulation if True. When simulation is enabled, the remote attestation for auth manager will be ignored. This is for test only and keep it False when for production. + debug_mode: Whether to enable debug mode. In debug mode, single-process simulation + will be used instead of ray scheduling, and lazy mode will be changed to + synchronous mode to facilitate debugging.and will use PYU to simulate SPU device + ONLY DEBUG! + **kwargs: see :py:meth:`ray.init` parameters. """ set_logging_level(logging_level) simluation_mode = True if parties else False - + sfd.active_sf_cluster() if auth_manager_config and simluation_mode: raise InvalidArgumentError( 'TEE abilities is available only in production mode.' @@ -457,16 +463,6 @@ def init( global_state.set_tee_simulation(tee_simulation=tee_simulation) if simluation_mode: - if cluster_config: - raise InvalidArgumentError( - 'Simulation mode is enabled when `parties` is provided, ' - 'but you provide `cluster_config` at the same time. ' - '`cluster_config` is for production mode only and should be `None` in simulation mode. ' - 'Or if you want to run SecretFlow in product mode, ' - 'please keep `parties` with `None`.' - ) - # Simulation mode - sfd.set_production(False) if not isinstance(parties, (str, Tuple, List)): raise InvalidArgumentError('parties must be str or list of str.') if isinstance(parties, str): @@ -474,26 +470,40 @@ def init( else: assert len(set(parties)) == len(parties), f'duplicated parties {parties}.' - if local_mode: - resources = {party: num_cpus for party in parties} + if debug_mode: + # debug mode + sfd.set_distribution_mode(mode=DISTRIBUTION_MODE.DEBUG) else: - resources = None + if cluster_config: + raise InvalidArgumentError( + 'Simulation mode is enabled when `parties` is provided, ' + 'but you provide `cluster_config` at the same time. ' + '`cluster_config` is for production mode only and should be `None` in simulation mode. ' + 'Or if you want to run SecretFlow in product mode, ' + 'please keep `parties` with `None`.' + ) + # Simulation mode + sfd.set_distribution_mode(mode=DISTRIBUTION_MODE.SIMULATION) + if local_mode: + resources = {party: num_cpus for party in parties} + else: + resources = None - if not address: - if omp_num_threads: + if not address and omp_num_threads: os.environ['OMP_NUM_THREADS'] = f'{omp_num_threads}' - ray.init( - address, - num_cpus=num_cpus, - num_gpus=num_gpus, - resources=resources, - log_to_driver=log_to_driver, - **kwargs, - ) + ray.init( + address, + num_cpus=num_cpus, + num_gpus=num_gpus, + resources=resources, + log_to_driver=log_to_driver, + **kwargs, + ) global_state.set_parties(parties=parties) + else: - sfd.set_production(True) + sfd.set_distribution_mode(mode=DISTRIBUTION_MODE.PRODUCTION) global _CROSS_SILO_COMM_BACKENDS assert cross_silo_comm_backend.lower() in _CROSS_SILO_COMM_BACKENDS, ( 'Invalid cross_silo_comm_backend, ' @@ -565,7 +575,7 @@ def init( def barrier(): - if sfd.production_mode(): + if sfd.get_distribution_mode() == DISTRIBUTION_MODE.PRODUCTION: barriers = [] for party in global_state.parties(): barriers.append(PYU(party)(lambda: None)()) diff --git a/secretflow/device/link.py b/secretflow/device/link.py index c094783b6..3d391eb5f 100644 --- a/secretflow/device/link.py +++ b/secretflow/device/link.py @@ -23,6 +23,7 @@ import secretflow.distributed as sfd from secretflow.device.driver import reveal +from secretflow.distributed.primitive import DISTRIBUTION_MODE from .device import PYU @@ -342,7 +343,7 @@ def recv( def init_link(link: Link, partners: List[Link]): if not isinstance(partners, list): partners = [partners] - if sfd.production_mode(): + if sfd.get_distribution_mode() == DISTRIBUTION_MODE.PRODUCTION: comm = FedCommunicator([partner.device for partner in partners]) # Use `get` here as a barrier to make sure that initialize is done at first. # Note that link should be a `proxy`ed actor. diff --git a/secretflow/device/proxy.py b/secretflow/device/proxy.py index 5adf40c7b..8517a2cac 100644 --- a/secretflow/device/proxy.py +++ b/secretflow/device/proxy.py @@ -21,6 +21,7 @@ import ray import secretflow.distributed as sfd +from secretflow.distributed.primitive import DISTRIBUTION_MODE from secretflow.utils.logging import LOG_FORMAT, get_logging_level from . import link @@ -31,7 +32,6 @@ def _actor_wrapper(device_object_type, name, num_returns): def wrapper(self, *args, **kwargs): - # device object type check and unwrap _num_returns = kwargs.pop('_num_returns', num_returns) value_flat, value_tree = jax.tree_util.tree_flatten((args, kwargs)) for i, value in enumerate(value_flat): @@ -49,9 +49,7 @@ def wrapper(self, *args, **kwargs): ) ) handle = getattr(self.data, name) - # TODO @raofei: 支持public_reveal装饰器 res = handle.options(num_returns=_num_returns).remote(*args, **kwargs) - if _num_returns == 1: return device_object_type(self.device, res) else: @@ -75,7 +73,6 @@ def wrapper(*args, **kwargs): for pos, actual_val in zip(refs.keys(), actual_vals): arg_flat[pos] = actual_val args, kwargs = jax.tree_util.tree_unflatten(arg_tree, arg_flat) - return method(*args, **kwargs) return wrapper @@ -175,7 +172,7 @@ def __init__(self, *args, **kwargs): if ( max_concur is None and _simulation_max_concurrency is not None - and not sfd.production_mode() + and sfd.get_distribution_mode() == DISTRIBUTION_MODE.SIMULATION ): max_concur = _simulation_max_concurrency @@ -210,8 +207,11 @@ def __init__(self, *args, **kwargs): num_returns = len(sig.return_annotation) else: num_returns = 1 + wrapped_method = wraps(method)( - _actor_wrapper(device_object_type, name, num_returns) + _actor_wrapper( + device_object_type, name, num_returns + ) # DeviceObject, method_name, num_returns ) setattr(ActorProxy, name, wrapped_method) @@ -219,7 +219,6 @@ def __init__(self, *args, **kwargs): ActorProxy.__module__ = cls.__module__ ActorProxy.__name__ = name ActorProxy.__qualname__ = name - return ActorProxy return make_proxy diff --git a/secretflow/distributed/__init__.py b/secretflow/distributed/__init__.py index 4baf0447e..a4ceaaa0a 100644 --- a/secretflow/distributed/__init__.py +++ b/secretflow/distributed/__init__.py @@ -13,13 +13,24 @@ # limitations under the License. -from .primitive import get, kill, production_mode, remote, set_production, shutdown +from .primitive import ( + get, + kill, + remote, + shutdown, + set_distribution_mode, + get_distribution_mode, + get_current_cluster_idx, + active_sf_cluster, +) __all__ = [ 'get', 'kill', - 'production_mode', 'remote', - 'set_production', 'shutdown', + 'set_distribution_mode', + 'get_distribution_mode', + 'get_current_cluster_idx', + 'active_sf_cluster', ] diff --git a/secretflow/distributed/memory_actor.py b/secretflow/distributed/memory_actor.py new file mode 100644 index 000000000..893e6c402 --- /dev/null +++ b/secretflow/distributed/memory_actor.py @@ -0,0 +1,90 @@ +# Copyright 2023 Ant Group Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging + +from secretflow.distributed.memory_function import MemCallHolder + + +class MemActorHandle: + def __init__( + self, + cls, + node_party, + options, + ) -> None: + self._body = cls + self._node_party = node_party + self._options = options + self._actor_handle = None + + def __getattr__(self, method_name: str): + # User trying to call .bind() without a bind class method + if method_name == "remote" and "remote" not in dir(self._body): + raise AttributeError(f".remote() cannot be used again on {type(self)} ") + # Raise an error if the method is invalid. + getattr(self._body, method_name) + method_actor = MemActorMethod( + self._node_party, + self, + method_name, + ).options(**self._options) + return method_actor + + def execute_impl(self, cls_args, cls_kwargs): + """Executor of ClassNode by mem.remote() + + Args and kwargs are to match base class signature, but not in the + implementation. All args and kwargs should be resolved and replaced + with value in bound_args and bound_kwargs via bottom-up recursion when + current node is executed. + """ + self._actor_handle = self._body(*cls_args, **cls_kwargs) + + def _execute_method(self, method_name, options, *args, **kwargs): + num_returns = 1 + if options and 'num_returns' in options: + num_returns = options['num_returns'] + logging.debug(f"Actor method call: {method_name}, num_returns: {num_returns}") + # execute method call + function = getattr(self._actor_handle, method_name) + + return function(*args, **kwargs) + + +class MemActorMethod: + def __init__( + self, + node_party, + mem_actor_handle, + method_name, + ) -> None: + self._node_party = node_party + self._mem_actor_handle = mem_actor_handle + self._method_name = method_name + self._options = {} + self._mem_call_holder = MemCallHolder(node_party, self._execute_impl) + + def remote(self, *args, **kwargs) -> object: + return self._mem_call_holder.internal_remote(*args, **kwargs) + + def options(self, **options): + self._options = options + self._mem_call_holder.options(**options) + return self + + def _execute_impl(self, *args, **kwargs): + return self._mem_actor_handle._execute_method( + self._method_name, self._options, *args, **kwargs + ) diff --git a/secretflow/distributed/memory_api.py b/secretflow/distributed/memory_api.py new file mode 100644 index 000000000..c54f6b25d --- /dev/null +++ b/secretflow/distributed/memory_api.py @@ -0,0 +1,115 @@ +# Copyright 2023 Ant Group Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import inspect +from functools import partial + +from secretflow.distributed.memory_actor import MemActorHandle +from secretflow.distributed.memory_function import MemCallHolder + + +def _is_cython(obj): + """Check if an object is a Cython function or method""" + + def check_cython(x): + return type(x).__name__ == "cython_function_or_method" + + # Check if function or method, respectively + return check_cython(obj) or ( + hasattr(obj, "__func__") and check_cython(obj.__func__) + ) + + +class MemRemoteFunction: + def __init__(self, func_or_class) -> None: + self._node_party = None + self._func_body = func_or_class + self._options = {} + self._mem_call_holder = None + + def party(self, party: str): + self._node_party = party + + self._mem_call_holder = MemCallHolder( + self._node_party, + self._execute_impl, + self._options, + ) + return self + + def options(self, **options): + self._options = options + if self._mem_call_holder: + self._mem_call_holder.options(**options) + return self + + def remote(self, *args, **kwargs): + if not self.party: + raise ValueError("You should specify a party name on the remote function.") + + result = self._mem_call_holder.internal_remote(*args, **kwargs) + return result + + def _execute_impl(self, *args, **kwargs): + return self._func_body(*args, **kwargs) + + +class MemRemoteClass: + def __init__(self, func_or_class) -> None: + self._party = None + self._cls = func_or_class + self._options = {} + + def party(self, party: str): + self._party = party + return self + + def options(self, **options): + self._options = options + return self + + def remote(self, *cls_args, **cls_kwargs): + mem_actor_handle = MemActorHandle( + self._cls, + self._party, + self._options, + ) + mem_actor_handle.execute_impl(cls_args, cls_kwargs) + return mem_actor_handle + + +def _make_actor(cls, actor_options): + return MemRemoteClass(func_or_class=cls).options(**actor_options) + + +def _make_remote(function_or_class, options): + if inspect.isfunction(function_or_class) or _is_cython(function_or_class): + return MemRemoteFunction(function_or_class).options(**options) + + if inspect.isclass(function_or_class): + return _make_actor(function_or_class, options) + + raise TypeError( + "The mem.remote decorator must be applied to either a function or a class." + ) + + +def mem_remote(*args, **kwargs): + if len(args) == 1 and len(kwargs) == 0 and callable(args[0]): + # This is the case where the decorator is just @mem.remote. + # "args[0]" is the class or function under the decorator. + return _make_remote(args[0], {}) + assert len(args) == 0 and len(kwargs) > 0, "args cannot be none" + return partial(_make_remote, options=kwargs) diff --git a/secretflow/distributed/memory_function.py b/secretflow/distributed/memory_function.py new file mode 100644 index 000000000..b995fedab --- /dev/null +++ b/secretflow/distributed/memory_function.py @@ -0,0 +1,48 @@ +# Copyright 2023 Ant Group Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class MemCallHolder: + """ + `MemCallHolder` represents a call node holder when submitting tasks. + For example, + + f.party("ALICE").remote() + ~~~~~~~~~~~~~~~~ + ^ + | + it's a holder. + + """ + + def __init__( + self, + node_party, + execute_impl, + options={}, + ) -> None: + self._party = None + self._node_party = node_party + self._options = options + self._execute_impl = execute_impl + + def options(self, **options): + self._options = options + return self + + def internal_remote(self, *args, **kwargs): + if not self._node_party: + raise ValueError("You should specify a party name on the actor.") + + return self._execute_impl(*args, **kwargs) diff --git a/secretflow/distributed/primitive.py b/secretflow/distributed/primitive.py index 5f7a65294..d516da1bd 100644 --- a/secretflow/distributed/primitive.py +++ b/secretflow/distributed/primitive.py @@ -14,6 +14,8 @@ import inspect +import logging +from enum import Enum, unique from functools import partial from typing import List, Union @@ -25,19 +27,73 @@ from ray.actor import ActorClass, _inject_tracing_into_class, ray_constants from ray.remote_function import RemoteFunction +from secretflow.distributed.memory_api import mem_remote from secretflow.utils.ray_compatibility import ray_version_less_than_2_0_0 -_production_mode = False +@unique +class DISTRIBUTION_MODE(Enum): + PRODUCTION = 1 + SIMULATION = 2 + DEBUG = 3 -def set_production(mode: bool): - global _production_mode - _production_mode = mode +_distribution_mode = DISTRIBUTION_MODE.SIMULATION +_is_cluster_active = False +_current_cluster_id = -1 -def production_mode(): - global _production_mode - return _production_mode + +def set_distribution_mode(mode: DISTRIBUTION_MODE): + global _distribution_mode + if mode in DISTRIBUTION_MODE: + _distribution_mode = mode + else: + raise Exception(f"Illegal distribute mode, only support ({DISTRIBUTION_MODE})") + + +def get_distribution_mode(): + global _distribution_mode + return _distribution_mode + + +def get_current_cluster_idx(): + """ + Get the current secretflow cluster index. + In general situation, users will execute sf.init() at the first of their codes, + and execute sf.shutdown() at the end. + But in some cases, the following code may appear: + - sf.init() + - do_something_in_sf() + - sf.shutdown() + - do_something_else() + - sf.init() + - do_something_in_sf() + - sf.shutdown() + To ensure some variable can work as expect when it across different clusters, + we use an index to indicate diferent active cluster. + As shown above, between the first sf.init() and sf.shutdown(), the cluster index is 0, + and the between the second sf.init() and sf.shutdown(), the cluster index is 1, + In do_something_else(), the cluster index is -1 (no active cluster). + + Returns: The current cluster id, -1 for not active. + + """ + global _current_cluster_id + global _is_cluster_active + if _is_cluster_active: + return _current_cluster_id + else: + return -1 + + +def active_sf_cluster(): + """ + Record the cluster's active status and generate the current cluster index. + """ + global _current_cluster_id + global _is_cluster_active + _current_cluster_id = _current_cluster_id + 1 + _is_cluster_active = True def _is_cython(obj): @@ -57,35 +113,56 @@ def check_cython(x): def remote(*args, **kwargs): - if production_mode(): + if get_distribution_mode() == DISTRIBUTION_MODE.PRODUCTION: return fed.remote(*args, **kwargs) - else: + elif get_distribution_mode() == DISTRIBUTION_MODE.SIMULATION: return ray_remote(*args, **kwargs) + elif get_distribution_mode() == DISTRIBUTION_MODE.DEBUG: + return mem_remote(*args, **kwargs) + else: + raise Exception(f"Illegal distribute mode, only support ({DISTRIBUTION_MODE})") def get( object_refs: Union[ Union[ray.ObjectRef, List[ray.ObjectRef]], Union[fed.FedObject, List[fed.FedObject]], + Union[object, List[object]], ] ): - if production_mode(): + if get_distribution_mode() == DISTRIBUTION_MODE.PRODUCTION: return fed.get(object_refs) - else: + elif get_distribution_mode() == DISTRIBUTION_MODE.SIMULATION: return ray.get(object_refs) + elif get_distribution_mode() == DISTRIBUTION_MODE.DEBUG: + return object_refs + else: + raise Exception(f"Illegal distribute mode, only support ({DISTRIBUTION_MODE})") def kill(actor, *, no_restart=True): - if production_mode(): + if get_distribution_mode() == DISTRIBUTION_MODE.PRODUCTION: return fed.kill(actor, no_restart=no_restart) - else: + elif get_distribution_mode() == DISTRIBUTION_MODE.SIMULATION: return ray.kill(actor, no_restart=no_restart) + elif get_distribution_mode() == DISTRIBUTION_MODE.DEBUG: + logging.warning("Actor be killed") + else: + raise Exception(f"Illegal distribute mode, only support ({DISTRIBUTION_MODE})") def shutdown(): - if production_mode(): + global _is_cluster_active + _is_cluster_active = False + if get_distribution_mode() == DISTRIBUTION_MODE.PRODUCTION: fed.shutdown() - ray.shutdown() + ray.shutdown() + elif get_distribution_mode() == DISTRIBUTION_MODE.SIMULATION: + ray.shutdown() + elif get_distribution_mode() == DISTRIBUTION_MODE.DEBUG: + logging.warning("Shut Down!") + else: + raise Exception(f"Illegal distribute mode, only support ({DISTRIBUTION_MODE})") def _resolve_args(*args, **kwargs): diff --git a/secretflow/kuscia/README.md b/secretflow/kuscia/README.md deleted file mode 100644 index 4c6bf093a..000000000 --- a/secretflow/kuscia/README.md +++ /dev/null @@ -1,2 +0,0 @@ -**NOTE: this folder is still experimental.** -This folder is the adapter code of SecretFlow to Kuscia. For those who use SecretFlow directly, you don't need to care about this folder. diff --git a/secretflow/kuscia/entry.py b/secretflow/kuscia/entry.py index c431a1f5d..3a0aecaa0 100644 --- a/secretflow/kuscia/entry.py +++ b/secretflow/kuscia/entry.py @@ -27,17 +27,20 @@ from secretflow.kuscia.datamesh import ( create_domain_data, create_domain_data_service_stub, + create_domain_data_source_service_stub, get_domain_data, + get_domain_data_source, ) from secretflow.kuscia.ray_config import RayConfig from secretflow.kuscia.sf_config import get_sf_cluster_config from secretflow.kuscia.task_config import KusciaTaskConfig -from secretflow.protos.component.data_pb2 import ( +from secretflow.spec.v1.data_pb2 import ( DistData, IndividualTable, + StorageConfig, VerticalTable, ) -from secretflow.protos.component.evaluation_pb2 import NodeEvalParam, NodeEvalResult +from secretflow.spec.v1.evaluation_pb2 import NodeEvalParam, NodeEvalResult _LOG_FORMAT = "%(asctime)s|{}|%(levelname)s|secretflow|%(filename)s:%(funcName)s:%(lineno)d| %(message)s" @@ -209,6 +212,41 @@ def try_to_get_datasource_id(task_conf: KusciaTaskConfig): datasource_id = sf_datasource_config[party_name]["id"] +def get_storage_config( + kuscia_config: KusciaTaskConfig, + datamesh_addr: str, + datasource_id: str = None, +) -> StorageConfig: + party_id = kuscia_config.task_cluster_def.self_party_idx + party_name = kuscia_config.task_cluster_def.parties[party_id].name + kuscia_record = kuscia_config.sf_storage_config + + if ( + kuscia_record is not None + and party_name not in kuscia_record + and datasource_id is None + ): + raise RuntimeError( + f"storage config of party [{party_name}] is missing. It must be provided with sf_storage_config explicitly or be inferred from sf_input_ids with DataMesh services." + ) + + if kuscia_record is not None and party_name in kuscia_record: + storage_config = kuscia_record[party_name] + else: + # try to get storage config with sf_datasource_config + stub = create_domain_data_source_service_stub(datamesh_addr) + domain_data_source = get_domain_data_source(stub, datasource_id) + + storage_config = StorageConfig( + type="local_fs", + local_fs=StorageConfig.LocalFSConfig( + wd=domain_data_source.info.localfs.path + ), + ) + + return storage_config + + @click.command() @click.argument("task_config_path", type=click.Path(exists=True)) @click.option("--datamesh_addr", required=False, default=DEFAULT_DATAMESH_ADDRESS) @@ -234,9 +272,11 @@ def main(task_config_path, datamesh_addr): task_conf.sf_output_uris, ) - sf_cluster_config = get_sf_cluster_config(task_conf, datamesh_addr, datasource_id) + sf_cluster_config = get_sf_cluster_config(task_conf) + + storage_config = get_storage_config(task_conf, datamesh_addr, datasource_id) - res = comp_eval(sf_node_eval_param, sf_cluster_config) + res = comp_eval(sf_node_eval_param, storage_config, sf_cluster_config) postprocess_sf_node_eval_result( res, diff --git a/secretflow/kuscia/sf_config.py b/secretflow/kuscia/sf_config.py index 9bbaf2ba7..10c574199 100644 --- a/secretflow/kuscia/sf_config.py +++ b/secretflow/kuscia/sf_config.py @@ -13,33 +13,17 @@ # limitations under the License. -from kuscia.proto.api.v1alpha1.kusciatask.kuscia_task_pb2 import ( - AllocatedPorts, - ClusterDefine, -) - -from secretflow.kuscia.datamesh import ( - create_domain_data_source_service_stub, - get_domain_data_source, -) from secretflow.kuscia.ray_config import RayConfig from secretflow.kuscia.task_config import KusciaTaskConfig -from secretflow.protos.component.cluster_pb2 import ( - SFClusterConfig, - SFClusterDesc, - StorageConfig, -) - - -def compose_sf_cluster_config( - sf_cluster_desc: SFClusterDesc, - datamesh_addr: str, - kuscia_task_cluster_def: ClusterDefine, - kuscia_task_allocated_ports: AllocatedPorts, - ray_config: RayConfig, - sf_storage_config: StorageConfig, - datasource_id: str = None, -) -> SFClusterConfig: +from secretflow.spec.extend.cluster_pb2 import SFClusterConfig + + +def get_sf_cluster_config(kuscia_config: KusciaTaskConfig) -> SFClusterConfig: + sf_cluster_desc = kuscia_config.sf_cluster_desc + kuscia_task_cluster_def = kuscia_config.task_cluster_def + kuscia_task_allocated_ports = kuscia_config.task_allocated_ports + ray_config = RayConfig.from_kuscia_task_config(kuscia_config) + party_id = kuscia_task_cluster_def.self_party_idx party_name = kuscia_task_cluster_def.parties[party_id].name @@ -69,35 +53,11 @@ def compose_sf_cluster_config( # add "http://" to force brpc to set the correct Host spu_address["spu"][party.name] = f"http://{service.endpoints[0]}" - if ( - sf_storage_config is not None - and party_name not in sf_storage_config - and datasource_id is None - ): - raise RuntimeError( - f"storage config of party [{party_name}] is missing. It must be provided with sf_storage_config explicitly or be inferred from sf_input_ids with DataMesh services." - ) - - if sf_storage_config is not None and party_name in sf_storage_config: - storage_config = sf_storage_config[party_name] - else: - # try to get storage config with sf_datasource_config - stub = create_domain_data_source_service_stub(datamesh_addr) - domain_data_source = get_domain_data_source(stub, datasource_id) - - storage_config = StorageConfig( - type="local_fs", - local_fs=StorageConfig.LocalFSConfig( - wd=domain_data_source.info.localfs.path - ), - ) - res = SFClusterConfig( desc=sf_cluster_desc, private_config=SFClusterConfig.PrivateConfig( self_party=party_name, ray_head_addr=f"{ray_config.ray_node_ip_address}:{ray_config.ray_gcs_port}", - storage_config=storage_config, ), ) @@ -134,17 +94,3 @@ def compose_sf_cluster_config( res.public_config.spu_configs.append(pulic_spu_config) return res - - -def get_sf_cluster_config( - kuscia_config: KusciaTaskConfig, datamesh_addr: str, datasource_id: str = None -) -> SFClusterConfig: - return compose_sf_cluster_config( - kuscia_config.sf_cluster_desc, - datamesh_addr, - kuscia_config.task_cluster_def, - kuscia_config.task_allocated_ports, - RayConfig.from_kuscia_task_config(kuscia_config), - kuscia_config.sf_storage_config, - datasource_id, - ) diff --git a/secretflow/kuscia/task_config.py b/secretflow/kuscia/task_config.py index 95b6f6e6d..c327257d5 100644 --- a/secretflow/kuscia/task_config.py +++ b/secretflow/kuscia/task_config.py @@ -22,8 +22,9 @@ ClusterDefine, ) -from secretflow.protos.component.cluster_pb2 import SFClusterDesc, StorageConfig -from secretflow.protos.component.evaluation_pb2 import NodeEvalParam +from secretflow.spec.extend.cluster_pb2 import SFClusterDesc +from secretflow.spec.v1.data_pb2 import StorageConfig +from secretflow.spec.v1.evaluation_pb2 import NodeEvalParam @dataclass diff --git a/secretflow/ml/nn/sl/sl_model.py b/secretflow/ml/nn/sl/sl_model.py index 3416a0f5d..f746337b1 100644 --- a/secretflow/ml/nn/sl/sl_model.py +++ b/secretflow/ml/nn/sl/sl_model.py @@ -27,7 +27,7 @@ from multiprocess import cpu_count from tqdm import tqdm -from secretflow.data.base import PartitionBase +from secretflow.data import Partition from secretflow.data.horizontal import HDataFrame from secretflow.data.ndarray import FedNdarray from secretflow.data.vertical import VDataFrame @@ -174,7 +174,7 @@ def handle_data( xs = ( [ xi.partitions[device].data # xi is FedDataframe - if isinstance(xi.partitions[device], PartitionBase) + if isinstance(xi.partitions[device], Partition) else xi.partitions[device] # xi is FedNdarray for xi in x ] @@ -208,7 +208,7 @@ def handle_data( if device in self.base_model_dict else [None] ) - xs = [t.data if isinstance(t, PartitionBase) else t for t in xs] + xs = [t.data if isinstance(t, Partition) else t for t in xs] worker.build_dataset_from_numeric( *xs, y=y_partitions, diff --git a/secretflow/preprocessing/binning/vert_woe_substitution.py b/secretflow/preprocessing/binning/vert_bin_substitution.py similarity index 77% rename from secretflow/preprocessing/binning/vert_woe_substitution.py rename to secretflow/preprocessing/binning/vert_bin_substitution.py index 108aed1f8..b2e261a5f 100644 --- a/secretflow/preprocessing/binning/vert_woe_substitution.py +++ b/secretflow/preprocessing/binning/vert_bin_substitution.py @@ -17,20 +17,20 @@ import numpy as np import pandas as pd -from secretflow.data.base import partition +from secretflow.data import partition from secretflow.data.vertical import VDataFrame from secretflow.device import PYU, PYUObject, proxy @proxy(PYUObject) -class VertWOESubstitutionPyuWorker: +class VertBinSubstitutionPyuWorker: def sub(self, data: pd.DataFrame, r: Dict) -> pd.DataFrame: """ - PYU functions for woe substitution. + PYU functions for binning substitution. Args: data: input dataset for this party. - r: woe substitution rules. + r: bin substitution rules. Returns: data: dataset after substituted. @@ -50,8 +50,8 @@ def sub(self, data: pd.DataFrame, r: Dict) -> pd.DataFrame: rule = rules[v] if rule["type"] == "string": condlist = [col_data == c for c in rule["categories"]] - choicelist = rule["woes"] - data[v] = np.select(condlist, choicelist, rule["else_woe"]) + choicelist = rule["filling_values"] + data[v] = np.select(condlist, choicelist, rule["else_filling_value"]) else: condlist = list() split_points = rule["split_points"] @@ -68,40 +68,42 @@ def sub(self, data: pd.DataFrame, r: Dict) -> pd.DataFrame: ) if len(split_points) > 0: condlist.append(col_data > split_points[-1]) - choicelist = rule["woes"] - data[v] = np.select(condlist, choicelist, rule["else_woe"]) + choicelist = rule["filling_values"] + assert len(choicelist) == len(split_points) + 1, f"{choicelist}" + assert len(condlist) == len(split_points) + 1, f"{condlist}" + data[v] = np.select(condlist, choicelist, rule["else_filling_value"]) return data -class VertWOESubstitution: +class VertBinSubstitution: def substitution( - self, vdata: VDataFrame, woe_rules: Dict[PYU, PYUObject] + self, vdata: VDataFrame, rules: Dict[PYU, PYUObject] ) -> VDataFrame: """ - substitute dataset's value by woe substitution rules. + substitute dataset's value by binning substitution rules. Args: vdata: vertical slice dataset to be substituted. - woe_rules: woe substitution rules build by VertWoeBinning. + rules: binning substitution rules build by VertBinning. Returns: new_vdata: vertical slice dataset after substituted. """ - works: Dict[PYU, VertWOESubstitutionPyuWorker] = {} - for device in woe_rules: + works: Dict[PYU, VertBinSubstitutionPyuWorker] = {} + for device in rules: assert ( device in vdata.partitions.keys() ), f"device {device} not exist in vdata" - works[device] = VertWOESubstitutionPyuWorker(device=device) + works[device] = VertBinSubstitutionPyuWorker(device=device) new_vdata = VDataFrame( { d: partition( - data=works[d].sub(vdata.partitions[d].data, woe_rules[d]), + data=works[d].sub(vdata.partitions[d].data, rules[d]), backend=vdata.partitions[d].backend, ) - for d in woe_rules + for d in rules } ) diff --git a/secretflow/preprocessing/binning/vert_binning.py b/secretflow/preprocessing/binning/vert_binning.py new file mode 100644 index 000000000..8878c985a --- /dev/null +++ b/secretflow/preprocessing/binning/vert_binning.py @@ -0,0 +1,110 @@ +# Copyright 2023 Ant Group Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Dict, List +from secretflow.data.vertical import VDataFrame +from secretflow.device import PYU, PYUObject + +from secretflow.preprocessing.binning.vert_binning_pyu import VertBinningPyuWorker + + +class VertBinning: + """ + equal range or frequency binning for vertical slice datasets. + + Split all features into bins by equal range or equal frequency. + + Finally, this method will output binning rules which are essentially split points. + """ + + def __init__(self): + return + + def binning( + self, + vdata: VDataFrame, + binning_method: str = "eq_range", + bin_num: int = 10, + bin_names: Dict[PYU, List[str]] = {}, + ): + """ + Build bin substitution rules base on vdata. + The split points for bins remains at each party. + + Attributes: + vdata (VDataFrame): vertical slice datasets + use {binning_method} to bin all number type features. + for string type feature bin by it's categories. + else bin is count for np.nan samples + binning_method (str): how to bin number type features. + Options: "quantile"(equal frequency)/"eq_range"(equal range) + Default: "eq_range" + bin_num (int): max bin counts for one features. + Range: (0, ∞] + Default: 10 + bin_names (Dict[PYU, List[str]]): which features should be binned. + Default: {} + + Return: + Dict[PYU, PYUObject], PYUObject contain a dict for all features' rule in this party. + + .. code:: python + + { + "variables":[ + { + "name": str, # feature name + "type": str, # "string" or "numeric", if feature is discrete or continuous + "categories": list[str], # categories for discrete feature + "split_points": list[float], # left-open right-close split points + "total_counts": list[int], # total samples count in each bins. + "else_counts": int, # np.nan samples count + # for this binning method, we use [*range(f_num_bins)] as filling values + # that is 0 for bin with index 0, 1 for bin with index 1 etc. + "filling_values": list[float], # filling values for each bins. + # for this binning method, we use -1 as filling value for nan samples + "else_filling_value": float, # filling value for np.nan samples. + }, + # ... others feature + ], + } + + + """ + assert binning_method in ( + "quantile", + "eq_range", + ), f"binning_method only support ('quantile', 'eq_range'), got {binning_method}" + assert bin_num > 0, f"bin_num range (0, ∞], got {bin_num}" + + workers: Dict[PYU, VertBinningPyuWorker] = {} + + for device in bin_names: + assert ( + device in vdata.partitions.keys() + ), f"device {device} in bin_names not exist in vdata" + workers[device] = VertBinningPyuWorker( + vdata.partitions[device].data.data, + binning_method, + bin_num, + bin_names[device], + device=device, + ) + + rules: Dict[PYU, PYUObject] = {} + for device in bin_names: + rules[device] = workers[device].bin_feature_and_produce_rules() + + return rules diff --git a/secretflow/preprocessing/binning/vert_binning_pyu.py b/secretflow/preprocessing/binning/vert_binning_pyu.py new file mode 100644 index 000000000..b26f991a5 --- /dev/null +++ b/secretflow/preprocessing/binning/vert_binning_pyu.py @@ -0,0 +1,254 @@ +# Copyright 2023 Ant Group Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Tuple, Union + +import numpy as np +import pandas as pd + +from secretflow.device import PYUObject, proxy + + +@proxy(PYUObject) +class VertBinningPyuWorker: + """ + PYU functions for binning + Attributes: see VertBinning + """ + + def __init__( + self, + data: pd.DataFrame, + binning_method: str, + bin_num: int, + bin_names: List[str], + ): + data_columns = data.columns + assert isinstance( + bin_names, list + ), f"bin names should be a list of string but got {type(bin_names)}" + assert np.isin( + bin_names, data_columns + ).all(), ( + f"bin_names[{bin_names}] not exist in input data columns[{data_columns}]" + ) + self.data = data + self.bin_names = bin_names + self.bin_num = bin_num + self.binning_method = binning_method + + def _build_feature_bin( + self, f_data: pd.DataFrame + ) -> Tuple[List[np.ndarray], Union[np.ndarray, List[str]], np.ndarray]: + ''' + split one feature column into {bin_num} bins. + + Attributes: + f_data: feature column to be split. + + Return: + First: sample indices for each bins. + Second: split points for number column (np.array) or + categories for string column (List[str]) + Third: sample indices for np.nan values. + ''' + if f_data.dtype == np.dtype(object): + # for string type col, split into bins by categories. + categories = {d for d in f_data if not pd.isna(d)} + for c in categories: + assert isinstance( + c, str + ), f"only support str if dtype == np.obj, but got {type(c)}" + split_points = sorted(list(categories)) + bin_indices = list() + for b in split_points: + bin_indices.append(np.flatnonzero(f_data == b)) + return bin_indices, split_points, np.flatnonzero(pd.isna(f_data)) + else: + # for number type col, first binning by pd.qcut. + bin_num = self.bin_num + + if self.binning_method == "quantile": + bins, split_points = pd.qcut( + f_data, bin_num, labels=False, duplicates='drop', retbins=True + ) + elif self.binning_method == "eq_range": + bins, split_points = pd.cut( + f_data, bin_num, labels=False, duplicates='drop', retbins=True + ) + else: + raise ValueError(f"binning_method {self.binning_method} not supported") + + bin_indices = list() + assert split_points.size >= 2, f"split_points.size {split_points.size}" + empty_bins = [0, split_points.size - 1] + for b in range(split_points.size - 1): + bin = np.flatnonzero(bins == b) + if bin.size == 0: + # Then, remove empty bin in pd.qcut's result. + empty_bins.append(b + 1) + else: + bin_indices.append(bin) + + return ( + bin_indices, + # remove start/end value & empty bins in pd.qcut's range result. + # remain only left-open right-close split points + np.delete(split_points, empty_bins), + np.flatnonzero(pd.isna(f_data)), + ) + + def _build_feature_bins( + self, data: pd.DataFrame + ) -> Tuple[List[np.ndarray], List[Union[np.ndarray, List[str]]], List[np.ndarray]]: + ''' + split all columns into {bin_num} bins. + Attributes: + data: dataset to be split. + + Return: + First: sample indices for each bins in all features. + Second: split points for number column (np.array) or + categories for string column (List[str]) for all features. + Third: sample indices for np.nan values in all features. + ''' + ret_bins_idx = list() + ret_points = list() + ret_else_bins = list() + assert isinstance(data, pd.DataFrame), type(data) + for f_name in self.bin_names: + f_data = data.loc[:, f_name] + bin_idx, split_point, else_bin = self._build_feature_bin(f_data) + if isinstance(split_point, list): + # use List[str] for string column + # split_point means categories, so length of it need equal to bin_idx + assert len(bin_idx) == len(split_point), ( + f"len(bin_idx) {len(bin_idx)}," + f" len(split_point) {len(split_point)}" + ) + else: + # use np.array for number column + # split_point contain left-open right-close split points between each bins. + # so length of it need equal to len(bin_idx) - 1 + assert len(bin_idx) == split_point.size + 1, ( + f"len(bin_idx) {len(bin_idx)}," + f" split_point.size {split_point.size}" + ) + ret_bins_idx += bin_idx + ret_points.append(split_point) + ret_else_bins.append(else_bin) + + return ret_bins_idx, ret_points, ret_else_bins + + def _build_report_dict( + self, + f_name: str, + split_points: Union[np.ndarray, List[str]], + total_counts: List[int], + else_counts: int, + ) -> Dict: + ''' + build report dict for one feature. + Attributes: + f_name: feature name. + split_points: see _build_feature_bin. + total_counts: total samples in each bins. + else_counts: total samples for np.nan values. + + Return: + Dict report. + ''' + ret = dict() + ret['name'] = f_name + + f_bin_size = 0 + if isinstance(split_points, list): + ret['type'] = "string" + ret['categories'] = split_points + f_bin_size = len(split_points) + else: + ret['type'] = "numeric" + ret['split_points'] = list(split_points) + f_bin_size = split_points.size + 1 + + ret['total_counts'] = list() + for i in range(len(total_counts)): + ret['total_counts'].append(total_counts[i]) + + ret['else_counts'] = else_counts + + ret['filling_values'] = [*range(f_bin_size)] + + ret['else_filling_value'] = -1 + return ret + + def _build_report( + self, + split_points: List[Union[np.ndarray, List[str]]], + total_counts: List[int], + else_counts: List[int], + ) -> Dict: + ''' + Attributes: + split_points: see _build_feature_bin. + total_counts: total samples all features' bins. + else_counts: np.nan samples in all features. + + Return: + Dict report + ''' + assert len(else_counts) == len(self.bin_names), ( + f"len(else_counts) {len(else_counts)}," + f" len(self.bin_names) {len(self.bin_names)}" + ) + assert len(split_points) == len(self.bin_names), ( + f"len(split_points) {len(split_points)}," + f" len(self.bin_names) {len(self.bin_names)}" + ) + + pos = 0 + variables = list() + for f_idx in range(len(split_points)): + split_point = split_points[f_idx] + f_bin_size = 0 + if isinstance(split_point, list): + f_bin_size = len(split_point) + else: + f_bin_size = split_point.size + 1 + + variables.append( + self._build_report_dict( + self.bin_names[f_idx], + split_points[f_idx], + total_counts[pos : pos + f_bin_size], + else_counts[f_idx], + ) + ) + pos += f_bin_size + + assert len(variables) == len(self.bin_names), ( + f"len(variables) {len(variables)}, " + f"len(self.bin_names) {len(self.bin_names)}" + ) + return {"variables": variables} + + def bin_feature_and_produce_rules(self) -> Dict: + ''' + bin all the features in self.data and return rules + ''' + bins_idx, split_points, else_bins = self._build_feature_bins(self.data) + + total_counts = [b.size for b in bins_idx] + else_counts = [b.size for b in else_bins] + return self._build_report(split_points, total_counts, else_counts) diff --git a/secretflow/preprocessing/binning/vert_woe_binning.py b/secretflow/preprocessing/binning/vert_woe_binning.py index a868f2aac..f15b83d3e 100644 --- a/secretflow/preprocessing/binning/vert_woe_binning.py +++ b/secretflow/preprocessing/binning/vert_woe_binning.py @@ -33,7 +33,7 @@ class VertWoeBinning: Split all features into bins by equal frequency or ChiMerge. Then calculate woe value & iv value for each bin by SS or HE secure device to protect Y label. - Finally, this method will output binning rules used to substitute features' value into woe by VertWOESubstitution. + Finally, this method will output binning rules used to substitute features' value into woe by VertBinSubstitution. more details about woe/iv value: https://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html @@ -131,8 +131,8 @@ def binning( "split_points": list[float], # left-open right-close split points "total_counts": list[int], # total samples count in each bins. "else_counts": int, # np.nan samples count - "woes": list[float], # woe values for each bins. - "else_woe": float, # woe value for np.nan samples. + "filling_values": list[float], # woe values for each bins. + "else_filling_value": float, # woe value for np.nan samples. }, # ... others feature ], @@ -208,7 +208,7 @@ def binning( device=device, ) - woe_rules: Dict[PYU, PYUObject] = {} + bin_rules: Dict[PYU, PYUObject] = {} # label_holder build woe rules label_holder_worker = workers[label_holder_device] @@ -280,12 +280,12 @@ def spu_work(label, select): ivs, bim_sum_info ) report = worker.participant_build_report(woes.to(device)) - woe_rules[device] = report + bin_rules[device] = report # feature ivs are in label_holder report, which may be later shared to worker label_holder_report = label_holder_worker.generate_iv_report( label_holder_report ) - woe_rules[label_holder_device] = label_holder_report + bin_rules[label_holder_device] = label_holder_report - return woe_rules + return bin_rules diff --git a/secretflow/preprocessing/binning/vert_woe_binning_pyu.py b/secretflow/preprocessing/binning/vert_woe_binning_pyu.py index ac3d8dc3e..71d579a89 100644 --- a/secretflow/preprocessing/binning/vert_woe_binning_pyu.py +++ b/secretflow/preprocessing/binning/vert_woe_binning_pyu.py @@ -251,16 +251,16 @@ def _build_report_dict( ret['type'] = "numeric" ret['split_points'] = list(split_points) - ret['woes'] = list() + ret['filling_values'] = list() ret['total_counts'] = list() assert len(total_counts) == len(woes), ( f"len(total_counts) {len(total_counts)}," f" len(woes) {len(woes)}" ) for i in range(len(woes)): ret['total_counts'].append(total_counts[i]) - ret['woes'].append(woes[i]) + ret['filling_values'].append(woes[i]) - ret['else_woe'] = else_woe + ret['else_filling_value'] = else_woe ret['else_counts'] = else_counts return ret diff --git a/secretflow/preprocessing/cond_filter_v.py b/secretflow/preprocessing/cond_filter_v.py index 86e2b3913..12fb9e2ce 100644 --- a/secretflow/preprocessing/cond_filter_v.py +++ b/secretflow/preprocessing/cond_filter_v.py @@ -8,10 +8,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from functools import reduce from typing import Any, Dict, List -from secretflow.data.base import partition +from secretflow.data import partition from secretflow.data.vertical import VDataFrame from secretflow.preprocessing.base import _PreprocessBase diff --git a/secretflow/preprocessing/discretization.py b/secretflow/preprocessing/discretization.py index cc573cb51..3ae5c5759 100644 --- a/secretflow/preprocessing/discretization.py +++ b/secretflow/preprocessing/discretization.py @@ -20,7 +20,6 @@ from secretflow.data.horizontal import HDataFrame from secretflow.data.mix.dataframe import MixDataFrame, PartitionWay -from secretflow.data.base import partition from secretflow.data.vertical import VDataFrame from secretflow.device.driver import reveal from secretflow.preprocessing.base import _PreprocessBase @@ -192,16 +191,14 @@ def _transform( ) -> Union[HDataFrame, VDataFrame]: transformed_parts = {} - def _df_transform(est: SkKBinsDiscretizer, df: pd.DataFrame): + def _df_transform(df: pd.DataFrame, est: SkKBinsDiscretizer): new_df = df.copy() new_df.iloc[:, :] = est.transform(df) return new_df if isinstance(df, HDataFrame): for device, part in df.partitions.items(): - transformed_parts[device] = partition( - device(_df_transform)(est, part.data), part.backend - ) + transformed_parts[device] = part.apply_func(_df_transform, est=est) else: # VDataFrame start_idx = 0 @@ -213,9 +210,7 @@ def _df_transform(est: SkKBinsDiscretizer, df: pd.DataFrame): ) est_part.bin_edges_ = est.bin_edges_[start_idx:end_idx] est_part.n_bins_ = est.n_bins_[start_idx:end_idx] - transformed_parts[device] = partition( - device(_df_transform)(est_part, part.data), part.backend - ) + transformed_parts[device] = part.apply_func(_df_transform, est=est_part) start_idx = end_idx new_df = df.copy() diff --git a/secretflow/preprocessing/encoder.py b/secretflow/preprocessing/encoder.py index 79a50071c..fb4e68938 100644 --- a/secretflow/preprocessing/encoder.py +++ b/secretflow/preprocessing/encoder.py @@ -22,7 +22,6 @@ from secretflow.data.horizontal import HDataFrame from secretflow.data.mix import MixDataFrame -from secretflow.data.base import partition from secretflow.data.vertical import VDataFrame from secretflow.device import reveal from secretflow.preprocessing.base import _PreprocessBase @@ -79,7 +78,7 @@ def _transform( self, df: Union[HDataFrame, VDataFrame] ) -> Union[HDataFrame, VDataFrame]: def _df_transform( - encoder: SkLabelEncoder, _df: Union[pd.DataFrame, "pl.DataFrame"] + _df: Union[pd.DataFrame, "pl.DataFrame"], encoder: SkLabelEncoder ): pl = None try: @@ -101,10 +100,9 @@ def _df_transform( transformed_parts = {} for device, part in df.partitions.items(): - transformed_parts[device] = partition( - device(_df_transform)(self._encoder, part.data), part.backend + transformed_parts[device] = part.apply_func( + _df_transform, encoder=self._encoder ) - new_df = df.copy() new_df.partitions = transformed_parts return new_df @@ -270,10 +268,21 @@ def _transform( ) -> Union[HDataFrame, VDataFrame]: transformed_parts = {} - def _encode(encoder: SkOneHotEncoder, df: pd.DataFrame): - return pd.DataFrame( - encoder.transform(df).toarray(), columns=encoder.get_feature_names_out() - ) + def _encode(_df, _encoder: SkOneHotEncoder): + transformed_df = encoder.transform(_df).toarray() + columns = encoder.get_feature_names_out() + + pl = None + try: + import polars as pl + except ImportError: + pass + if isinstance(_df, pd.DataFrame): + return pd.DataFrame(transformed_df, columns=columns) + elif pl is not None and isinstance(_df, pl.DataFrame): + return pl.DataFrame(transformed_df, schema=columns.tolist()) + else: + raise RuntimeError("Unknown dataframe type.") for device, part in df.partitions.items(): if self.min_frequency or self.max_categories: @@ -293,9 +302,7 @@ def _encode(encoder: SkOneHotEncoder, df: pd.DataFrame): } self._fill_categories(categories) encoder.fit(pd.DataFrame(categories)) - transformed_parts[device] = partition( - device(_encode)(encoder, part.data), backend=part.backend - ) + transformed_parts[device] = part.apply_func(_encode, _encoder=encoder) new_df = df.copy() new_df.partitions = transformed_parts return new_df diff --git a/secretflow/preprocessing/scaler.py b/secretflow/preprocessing/scaler.py index ed32ca81b..b1059e034 100644 --- a/secretflow/preprocessing/scaler.py +++ b/secretflow/preprocessing/scaler.py @@ -19,7 +19,7 @@ from sklearn.preprocessing import MinMaxScaler as SkMinMaxScaler from sklearn.preprocessing import StandardScaler as SkStandardScaler -from secretflow.data.base import PartitionBase, partition +from secretflow.data import Partition from secretflow.data.horizontal import HDataFrame from secretflow.data.mix import MixDataFrame, PartitionWay from secretflow.data.vertical import VDataFrame @@ -66,7 +66,7 @@ def _transform( ) -> Union[HDataFrame, VDataFrame]: transformed_parts = {} - def _df_transform(_scaler: SkMinMaxScaler, _df): + def _df_transform(_df, _scaler: SkMinMaxScaler): pl = None try: import polars as pl @@ -90,9 +90,7 @@ def _df_transform(_scaler: SkMinMaxScaler, _df): scaler.fit( np.stack([self._scaler.data_min_[mask], self._scaler.data_max_[mask]]) ) - transformed_parts[device] = partition( - device(_df_transform)(scaler, part.data), part.backend - ) + transformed_parts[device] = part.apply_func(_df_transform, _scaler=scaler) new_df = df.copy() new_df.partitions = transformed_parts @@ -175,7 +173,7 @@ def _check_dataframe(df): ), f'Accepts HDataFrame/VDataFrame/MixDataFrame only but got {type(df)}' def _fit_horizontal( - self, partitions: List[PartitionBase], aggregator: Aggregator = None + self, partitions: List[Partition], aggregator: Aggregator = None ) -> SkStandardScaler: means = [part.mean(numeric_only=True) for part in partitions] cnts = [part.count(numeric_only=True) for part in partitions] @@ -274,7 +272,7 @@ def _sk_fit(with_mean, with_std, _df: pd.DataFrame): def _transform( self, scaler: SkStandardScaler, df: Union[HDataFrame, VDataFrame] ) -> Union[HDataFrame, VDataFrame]: - def _df_transform(_scaler: SkStandardScaler, _df: pd.DataFrame): + def _df_transform(_df: pd.DataFrame, _scaler: SkStandardScaler): _new_df = _df.copy() _new_df.iloc[:, :] = _scaler.transform(_df) return _new_df @@ -282,8 +280,8 @@ def _df_transform(_scaler: SkStandardScaler, _df: pd.DataFrame): transformed_parts = {} if isinstance(df, HDataFrame): for device, part in df.partitions.items(): - transformed_parts[device] = partition( - device(_df_transform)(scaler, part.data), part.backend + transformed_parts[device] = part.apply_func( + _df_transform, _scaler=scaler ) elif isinstance(df, VDataFrame): start_idx = 0 @@ -303,8 +301,8 @@ def _df_transform(_scaler: SkStandardScaler, _df: pd.DataFrame): else: part_scaler.var_ = None part_scaler.scale_ = None - transformed_parts[device] = partition( - device(_df_transform)(part_scaler, part.data), part.backend + transformed_parts[device] = part.apply_func( + _df_transform, _scaler=part_scaler ) start_idx = end_idx else: diff --git a/secretflow/preprocessing/transformer.py b/secretflow/preprocessing/transformer.py index 0a9c35d17..f5d42f258 100644 --- a/secretflow/preprocessing/transformer.py +++ b/secretflow/preprocessing/transformer.py @@ -21,7 +21,6 @@ from secretflow.data.horizontal import HDataFrame from secretflow.data.mix import MixDataFrame -from secretflow.data.base import partition from secretflow.data.vertical import VDataFrame from secretflow.preprocessing.base import _PreprocessBase @@ -81,37 +80,36 @@ def fit(self, df: Union[HDataFrame, VDataFrame, MixDataFrame]): def _transform( self, df: Union[HDataFrame, VDataFrame] ) -> Union[HDataFrame, VDataFrame]: - def _df_transform(transformer: SkFunctionTransformer, df: pd.DataFrame): - if isinstance(df, pd.DataFrame): + def _df_transform(_df: pd.DataFrame, transformer: SkFunctionTransformer): + transformed_df = transformer.transform(_df) + if isinstance(_df, pd.DataFrame): return pd.DataFrame( - data=transformer.transform(df), - columns=df.columns, + data=transformed_df, + columns=_df.columns, ) else: try: import polars as pl - if isinstance(df, pl.DataFrame): - transform_df = transformer.transform(df) - return ( - transform_df - if isinstance(transform_df, pl.DataFrame) - else pl.DataFrame( - data=transformer.transform(df), + if isinstance(_df, pl.DataFrame): + if isinstance(transformed_df, pl.DataFrame): + return transformed_df + else: + return pl.DataFrame( + data=transformer.transform(_df), schema={ - df.columns[i]: df.dtypes[i] - for i in range(len(df.columns)) + _df.columns[i]: _df.dtypes[i] + for i in range(len(_df.columns)) }, ) - ) except ImportError: pass - raise RuntimeError(f"Unknown df type {type(df)}") + raise RuntimeError(f"Unknown df type {type(_df)}") transformed_parts = {} for device, part in df.partitions.items(): - transformed_parts[device] = partition( - device(_df_transform)(self._transformer, part.data), part.backend + transformed_parts[device] = part.apply_func( + _df_transform, transformer=self._transformer ) new_df = df.copy() new_df.partitions = transformed_parts @@ -165,11 +163,10 @@ def _loground( else: try: import polars as pl - import polars.selectors as cs if isinstance(x, pl.DataFrame): x = x.with_columns( - cs.by_dtype(pl.INTEGER_DTYPES, pl.FLOAT_DTYPES) + pl.col(pl.NUMERIC_DTYPES) .add(_bias) .apply(np.log2) .round(_decimals) diff --git a/secretflow/protos/README.md b/secretflow/protos/README.md deleted file mode 100644 index 8ee288fc7..000000000 --- a/secretflow/protos/README.md +++ /dev/null @@ -1,8 +0,0 @@ -**NOTE: this folder is still experimental.** - - -Once you modified proto files, please run the following command at the project root to generate python file: - -``` -python setup.py generate_py_protobufs -``` diff --git a/secretflow/protos/component/README.md b/secretflow/protos/component/README.md deleted file mode 100644 index 185393657..000000000 --- a/secretflow/protos/component/README.md +++ /dev/null @@ -1,26 +0,0 @@ -**NOTE: component is still experimental.** - -This is a high level overview of data stuctures related to component. - -* cluster: First step, we should setup a SecretFlow cluster - - **SFClusterDesc**: Description of a SecretFlow cluster. Two cluster with the same **SFClusterDesc** could be regarded as the same since they have the same topology and security settings. - - **SFClusterConfig**: Config for setting up a SecretFlow cluster. It contains **SFClusterDesc** and other runtime settings for network and storage. - -* data: Data for SecretFlow Applications, e.g. SecretFlow/SCQL - - **SystemInfo**: Describe the application which could consume the data. - - **DistData** - - **DataRef**: **DataRef** refers to a piece of solid data in the SecretFlow cluster. A **DistData** contains a list of **DataRef**s - - meta: The meta would contains the extra to describe how **DataRef**s is organized. Now we support the following first-class Data. - - **VerticalTable**: Vertical table. - -* comp: Definitations for a component. - - **ComponentDef**: A component consists of name, attributes, inputs and outputs. - - **CompListDef**: A list of components. - - **AttributeDef**: Defines a attribute. We organize the attributes of a component as a tree. The leaves of the tree is called *Atomic Attribute*, which represent a solid field need to fill. The non-leaf nodes are *Union Attribute Group* and *Struct Attribute Group*. The children of a *Struct Attribute Group* node could be regarded as a group. While for children of a *Union Attribute Group*, user should select one child to fill. - - **AtomicAttrDesc**: Extra settings for a *Atomic Attribute* - - **UnionAttrGroupDesc** Extra settings for a *Union Attribute Group* - - **IoDef**: Defines an input/output of a component. - -* evaluation: A call to a component. - - **NodeEvalParam**: All the info to evaluate a component - id to the component, solid attributes and inputs, output names. - - **NodeEvalResult**: The result of evaluation - solid output. diff --git a/secretflow/protos/component/cluster_pb2.py b/secretflow/protos/component/cluster_pb2.py deleted file mode 100644 index 3223d6c57..000000000 --- a/secretflow/protos/component/cluster_pb2.py +++ /dev/null @@ -1,566 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by the protocol buffer compiler. DO NOT EDIT! -# source: secretflow/protos/component/cluster.proto - -from google.protobuf import descriptor as _descriptor -from google.protobuf import message as _message -from google.protobuf import reflection as _reflection -from google.protobuf import symbol_database as _symbol_database -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - - - - -DESCRIPTOR = _descriptor.FileDescriptor( - name='secretflow/protos/component/cluster.proto', - package='secretflow.component', - syntax='proto3', - serialized_options=b'\n\036org.secretflow.proto.component', - create_key=_descriptor._internal_create_key, - serialized_pb=b'\n)secretflow/protos/component/cluster.proto\x12\x14secretflow.component\"\xcf\x02\n\rSFClusterDesc\x12\x12\n\nsf_version\x18\x01 \x01(\t\x12\x12\n\npy_version\x18\x02 \x01(\t\x12\x0f\n\x07parties\x18\x03 \x03(\t\x12?\n\x07\x64\x65vices\x18\x04 \x03(\x0b\x32..secretflow.component.SFClusterDesc.DeviceDesc\x12H\n\x0eray_fed_config\x18\x05 \x01(\x0b\x32\x30.secretflow.component.SFClusterDesc.RayFedConfig\x1aI\n\nDeviceDesc\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\x12\x0f\n\x07parties\x18\x03 \x03(\t\x12\x0e\n\x06\x63onfig\x18\x04 \x01(\t\x1a/\n\x0cRayFedConfig\x12\x1f\n\x17\x63ross_silo_comm_backend\x18\x01 \x01(\t\"\x7f\n\rStorageConfig\x12\x0c\n\x04type\x18\x01 \x01(\t\x12\x43\n\x08local_fs\x18\x02 \x01(\x0b\x32\x31.secretflow.component.StorageConfig.LocalFSConfig\x1a\x1b\n\rLocalFSConfig\x12\n\n\x02wd\x18\x01 \x01(\t\"\x9f\x05\n\x0fSFClusterConfig\x12\x31\n\x04\x64\x65sc\x18\x01 \x01(\x0b\x32#.secretflow.component.SFClusterDesc\x12I\n\rpublic_config\x18\x02 \x01(\x0b\x32\x32.secretflow.component.SFClusterConfig.PublicConfig\x12K\n\x0eprivate_config\x18\x03 \x01(\x0b\x32\x33.secretflow.component.SFClusterConfig.PrivateConfig\x1aL\n\x0cRayFedConfig\x12\x0f\n\x07parties\x18\x01 \x03(\t\x12\x11\n\taddresses\x18\x02 \x03(\t\x12\x18\n\x10listen_addresses\x18\x03 \x03(\t\x1aW\n\tSPUConfig\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0f\n\x07parties\x18\x02 \x03(\t\x12\x11\n\taddresses\x18\x03 \x03(\t\x12\x18\n\x10listen_addresses\x18\x04 \x03(\t\x1a\xa0\x01\n\x0cPublicConfig\x12J\n\x0eray_fed_config\x18\x01 \x01(\x0b\x32\x32.secretflow.component.SFClusterConfig.RayFedConfig\x12\x44\n\x0bspu_configs\x18\x02 \x03(\x0b\x32/.secretflow.component.SFClusterConfig.SPUConfig\x1aw\n\rPrivateConfig\x12\x12\n\nself_party\x18\x01 \x01(\t\x12\x15\n\rray_head_addr\x18\x02 \x01(\t\x12;\n\x0estorage_config\x18\x03 \x01(\x0b\x32#.secretflow.component.StorageConfigB \n\x1eorg.secretflow.proto.componentb\x06proto3' -) - - - - -_SFCLUSTERDESC_DEVICEDESC = _descriptor.Descriptor( - name='DeviceDesc', - full_name='secretflow.component.SFClusterDesc.DeviceDesc', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='name', full_name='secretflow.component.SFClusterDesc.DeviceDesc.name', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='type', full_name='secretflow.component.SFClusterDesc.DeviceDesc.type', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='parties', full_name='secretflow.component.SFClusterDesc.DeviceDesc.parties', index=2, - number=3, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='config', full_name='secretflow.component.SFClusterDesc.DeviceDesc.config', index=3, - number=4, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=281, - serialized_end=354, -) - -_SFCLUSTERDESC_RAYFEDCONFIG = _descriptor.Descriptor( - name='RayFedConfig', - full_name='secretflow.component.SFClusterDesc.RayFedConfig', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='cross_silo_comm_backend', full_name='secretflow.component.SFClusterDesc.RayFedConfig.cross_silo_comm_backend', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=356, - serialized_end=403, -) - -_SFCLUSTERDESC = _descriptor.Descriptor( - name='SFClusterDesc', - full_name='secretflow.component.SFClusterDesc', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='sf_version', full_name='secretflow.component.SFClusterDesc.sf_version', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='py_version', full_name='secretflow.component.SFClusterDesc.py_version', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='parties', full_name='secretflow.component.SFClusterDesc.parties', index=2, - number=3, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='devices', full_name='secretflow.component.SFClusterDesc.devices', index=3, - number=4, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='ray_fed_config', full_name='secretflow.component.SFClusterDesc.ray_fed_config', index=4, - number=5, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[_SFCLUSTERDESC_DEVICEDESC, _SFCLUSTERDESC_RAYFEDCONFIG, ], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=68, - serialized_end=403, -) - - -_STORAGECONFIG_LOCALFSCONFIG = _descriptor.Descriptor( - name='LocalFSConfig', - full_name='secretflow.component.StorageConfig.LocalFSConfig', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='wd', full_name='secretflow.component.StorageConfig.LocalFSConfig.wd', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=505, - serialized_end=532, -) - -_STORAGECONFIG = _descriptor.Descriptor( - name='StorageConfig', - full_name='secretflow.component.StorageConfig', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='type', full_name='secretflow.component.StorageConfig.type', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='local_fs', full_name='secretflow.component.StorageConfig.local_fs', index=1, - number=2, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[_STORAGECONFIG_LOCALFSCONFIG, ], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=405, - serialized_end=532, -) - - -_SFCLUSTERCONFIG_RAYFEDCONFIG = _descriptor.Descriptor( - name='RayFedConfig', - full_name='secretflow.component.SFClusterConfig.RayFedConfig', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='parties', full_name='secretflow.component.SFClusterConfig.RayFedConfig.parties', index=0, - number=1, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='addresses', full_name='secretflow.component.SFClusterConfig.RayFedConfig.addresses', index=1, - number=2, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='listen_addresses', full_name='secretflow.component.SFClusterConfig.RayFedConfig.listen_addresses', index=2, - number=3, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=757, - serialized_end=833, -) - -_SFCLUSTERCONFIG_SPUCONFIG = _descriptor.Descriptor( - name='SPUConfig', - full_name='secretflow.component.SFClusterConfig.SPUConfig', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='name', full_name='secretflow.component.SFClusterConfig.SPUConfig.name', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='parties', full_name='secretflow.component.SFClusterConfig.SPUConfig.parties', index=1, - number=2, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='addresses', full_name='secretflow.component.SFClusterConfig.SPUConfig.addresses', index=2, - number=3, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='listen_addresses', full_name='secretflow.component.SFClusterConfig.SPUConfig.listen_addresses', index=3, - number=4, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=835, - serialized_end=922, -) - -_SFCLUSTERCONFIG_PUBLICCONFIG = _descriptor.Descriptor( - name='PublicConfig', - full_name='secretflow.component.SFClusterConfig.PublicConfig', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='ray_fed_config', full_name='secretflow.component.SFClusterConfig.PublicConfig.ray_fed_config', index=0, - number=1, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='spu_configs', full_name='secretflow.component.SFClusterConfig.PublicConfig.spu_configs', index=1, - number=2, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=925, - serialized_end=1085, -) - -_SFCLUSTERCONFIG_PRIVATECONFIG = _descriptor.Descriptor( - name='PrivateConfig', - full_name='secretflow.component.SFClusterConfig.PrivateConfig', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='self_party', full_name='secretflow.component.SFClusterConfig.PrivateConfig.self_party', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='ray_head_addr', full_name='secretflow.component.SFClusterConfig.PrivateConfig.ray_head_addr', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='storage_config', full_name='secretflow.component.SFClusterConfig.PrivateConfig.storage_config', index=2, - number=3, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1087, - serialized_end=1206, -) - -_SFCLUSTERCONFIG = _descriptor.Descriptor( - name='SFClusterConfig', - full_name='secretflow.component.SFClusterConfig', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='desc', full_name='secretflow.component.SFClusterConfig.desc', index=0, - number=1, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='public_config', full_name='secretflow.component.SFClusterConfig.public_config', index=1, - number=2, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='private_config', full_name='secretflow.component.SFClusterConfig.private_config', index=2, - number=3, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[_SFCLUSTERCONFIG_RAYFEDCONFIG, _SFCLUSTERCONFIG_SPUCONFIG, _SFCLUSTERCONFIG_PUBLICCONFIG, _SFCLUSTERCONFIG_PRIVATECONFIG, ], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=535, - serialized_end=1206, -) - -_SFCLUSTERDESC_DEVICEDESC.containing_type = _SFCLUSTERDESC -_SFCLUSTERDESC_RAYFEDCONFIG.containing_type = _SFCLUSTERDESC -_SFCLUSTERDESC.fields_by_name['devices'].message_type = _SFCLUSTERDESC_DEVICEDESC -_SFCLUSTERDESC.fields_by_name['ray_fed_config'].message_type = _SFCLUSTERDESC_RAYFEDCONFIG -_STORAGECONFIG_LOCALFSCONFIG.containing_type = _STORAGECONFIG -_STORAGECONFIG.fields_by_name['local_fs'].message_type = _STORAGECONFIG_LOCALFSCONFIG -_SFCLUSTERCONFIG_RAYFEDCONFIG.containing_type = _SFCLUSTERCONFIG -_SFCLUSTERCONFIG_SPUCONFIG.containing_type = _SFCLUSTERCONFIG -_SFCLUSTERCONFIG_PUBLICCONFIG.fields_by_name['ray_fed_config'].message_type = _SFCLUSTERCONFIG_RAYFEDCONFIG -_SFCLUSTERCONFIG_PUBLICCONFIG.fields_by_name['spu_configs'].message_type = _SFCLUSTERCONFIG_SPUCONFIG -_SFCLUSTERCONFIG_PUBLICCONFIG.containing_type = _SFCLUSTERCONFIG -_SFCLUSTERCONFIG_PRIVATECONFIG.fields_by_name['storage_config'].message_type = _STORAGECONFIG -_SFCLUSTERCONFIG_PRIVATECONFIG.containing_type = _SFCLUSTERCONFIG -_SFCLUSTERCONFIG.fields_by_name['desc'].message_type = _SFCLUSTERDESC -_SFCLUSTERCONFIG.fields_by_name['public_config'].message_type = _SFCLUSTERCONFIG_PUBLICCONFIG -_SFCLUSTERCONFIG.fields_by_name['private_config'].message_type = _SFCLUSTERCONFIG_PRIVATECONFIG -DESCRIPTOR.message_types_by_name['SFClusterDesc'] = _SFCLUSTERDESC -DESCRIPTOR.message_types_by_name['StorageConfig'] = _STORAGECONFIG -DESCRIPTOR.message_types_by_name['SFClusterConfig'] = _SFCLUSTERCONFIG -_sym_db.RegisterFileDescriptor(DESCRIPTOR) - -SFClusterDesc = _reflection.GeneratedProtocolMessageType('SFClusterDesc', (_message.Message,), { - - 'DeviceDesc' : _reflection.GeneratedProtocolMessageType('DeviceDesc', (_message.Message,), { - 'DESCRIPTOR' : _SFCLUSTERDESC_DEVICEDESC, - '__module__' : 'secretflow.protos.component.cluster_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.SFClusterDesc.DeviceDesc) - }) - , - - 'RayFedConfig' : _reflection.GeneratedProtocolMessageType('RayFedConfig', (_message.Message,), { - 'DESCRIPTOR' : _SFCLUSTERDESC_RAYFEDCONFIG, - '__module__' : 'secretflow.protos.component.cluster_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.SFClusterDesc.RayFedConfig) - }) - , - 'DESCRIPTOR' : _SFCLUSTERDESC, - '__module__' : 'secretflow.protos.component.cluster_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.SFClusterDesc) - }) -_sym_db.RegisterMessage(SFClusterDesc) -_sym_db.RegisterMessage(SFClusterDesc.DeviceDesc) -_sym_db.RegisterMessage(SFClusterDesc.RayFedConfig) - -StorageConfig = _reflection.GeneratedProtocolMessageType('StorageConfig', (_message.Message,), { - - 'LocalFSConfig' : _reflection.GeneratedProtocolMessageType('LocalFSConfig', (_message.Message,), { - 'DESCRIPTOR' : _STORAGECONFIG_LOCALFSCONFIG, - '__module__' : 'secretflow.protos.component.cluster_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.StorageConfig.LocalFSConfig) - }) - , - 'DESCRIPTOR' : _STORAGECONFIG, - '__module__' : 'secretflow.protos.component.cluster_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.StorageConfig) - }) -_sym_db.RegisterMessage(StorageConfig) -_sym_db.RegisterMessage(StorageConfig.LocalFSConfig) - -SFClusterConfig = _reflection.GeneratedProtocolMessageType('SFClusterConfig', (_message.Message,), { - - 'RayFedConfig' : _reflection.GeneratedProtocolMessageType('RayFedConfig', (_message.Message,), { - 'DESCRIPTOR' : _SFCLUSTERCONFIG_RAYFEDCONFIG, - '__module__' : 'secretflow.protos.component.cluster_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.SFClusterConfig.RayFedConfig) - }) - , - - 'SPUConfig' : _reflection.GeneratedProtocolMessageType('SPUConfig', (_message.Message,), { - 'DESCRIPTOR' : _SFCLUSTERCONFIG_SPUCONFIG, - '__module__' : 'secretflow.protos.component.cluster_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.SFClusterConfig.SPUConfig) - }) - , - - 'PublicConfig' : _reflection.GeneratedProtocolMessageType('PublicConfig', (_message.Message,), { - 'DESCRIPTOR' : _SFCLUSTERCONFIG_PUBLICCONFIG, - '__module__' : 'secretflow.protos.component.cluster_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.SFClusterConfig.PublicConfig) - }) - , - - 'PrivateConfig' : _reflection.GeneratedProtocolMessageType('PrivateConfig', (_message.Message,), { - 'DESCRIPTOR' : _SFCLUSTERCONFIG_PRIVATECONFIG, - '__module__' : 'secretflow.protos.component.cluster_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.SFClusterConfig.PrivateConfig) - }) - , - 'DESCRIPTOR' : _SFCLUSTERCONFIG, - '__module__' : 'secretflow.protos.component.cluster_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.SFClusterConfig) - }) -_sym_db.RegisterMessage(SFClusterConfig) -_sym_db.RegisterMessage(SFClusterConfig.RayFedConfig) -_sym_db.RegisterMessage(SFClusterConfig.SPUConfig) -_sym_db.RegisterMessage(SFClusterConfig.PublicConfig) -_sym_db.RegisterMessage(SFClusterConfig.PrivateConfig) - - -DESCRIPTOR._options = None -# @@protoc_insertion_point(module_scope) diff --git a/secretflow/protos/component/comp.proto b/secretflow/protos/component/comp.proto deleted file mode 100644 index 00223a6e9..000000000 --- a/secretflow/protos/component/comp.proto +++ /dev/null @@ -1,223 +0,0 @@ -// Copyright 2023 Ant Group Co., Ltd. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -syntax = "proto3"; - -package secretflow.component; - -option java_package = "org.secretflow.proto.component"; - -// Supported attribute types. -enum AttrType { - AT_UNDEFINED = 0; - - // Atomic types - - AT_FLOAT = 1; // FLOAT - AT_INT = 2; // INT - AT_STRING = 3; // STRING - AT_BOOL = 4; // BOOL - - // List types - - AT_FLOATS = 5; // FLOATS - AT_INTS = 6; // INTS - AT_STRINGS = 7; // STRINGS - AT_BOOLS = 8; // BOOLS - - // Special types. - - AT_STRUCT_GROUP = 9; - AT_UNION_GROUP = 10; - AT_SF_TABLE_COL = 11; -} - -// The value of an attribute -message Attribute { - float f = 1; // FLOAT - - // INT - // NOTE(junfeng): "is" is preserved by Python. Replaced with "i64". - int64 i64 = 2; - - string s = 3; // STRING - bool b = 4; // BOOL - - // lists - - repeated float fs = 5; // FLOATS - repeated int64 i64s = 6; // INTS - repeated string ss = 7; // STRINGS - repeated bool bs = 8; // BOOLS - - // Indicates the value is missing explicitly. - bool is_na = 9; -} - -// Describe an attribute. -message AttributeDef { - // Indicates the ancestors of a node, - // e.g. `[name_a, name_b, name_c]` means the path prefixes of current - // Attribute is `name_a/name_b/name_c/`. - // Only `^[a-zA-Z0-9_.-]*$` is allowed. - // `input` and `output` are reserved. - repeated string prefixes = 1; - - // Must be unique in the same level just like Linux file systems. - // Only `^[a-zA-Z0-9_.-]*$` is allowed. - // `input` and `output` are reserved. - string name = 2; - - string desc = 3; - - AttrType type = 4; - - // Extras for an atomic attribute. - // Including: `AT_FLOAT | AT_INT | AT_STRING | AT_BOOL | AT_FLOATS | AT_INTS | - // AT_STRINGS | AT_BOOLS`. - message AtomicAttrDesc { - // Only valid when type is `AT_FLOATS \| AT_INTS \| AT_STRINGS \| AT_BOOLS`. - int64 list_min_length_inclusive = 1; - // Only valid when type is `AT_FLOATS \| AT_INTS \| AT_STRINGS \| AT_BOOLS`. - int64 list_max_length_inclusive = 2; - - // If True, when Atmoic Attr is not provided or is_na, default_value would - // be used. Else, Atmoic Attr must be provided. - bool is_optional = 3; - - // A reasonable default for this attribute if the user does not supply a - // value. - Attribute default_value = 4; - - // Only valid when type is `AT_FLOAT \| AT_INT \| AT_STRING \| AT_FLOATS \| - // AT_INTS \| AT_STRINGS`. - // Please use list fields of AtomicParameter, i.e. `ss`, `i64s`, `fs`. - // If the attribute is a list, allowed_values is applied to each element. - Attribute allowed_values = 5; - - // Only valid when type is `AT_FLOAT \| AT_INT \| AT_FLOATS \| AT_INTS `. - // If the attribute is a list, lower_bound is applied to each element. - bool has_lower_bound = 6; - Attribute lower_bound = 7; - bool lower_bound_inclusive = 8; - - // Only valid when type is `AT_FLOAT \| AT_INT \| AT_FLOATS \| AT_INTS `. - // If the attribute is a list, upper_bound is applied to each element. - bool has_upper_bound = 9; - Attribute upper_bound = 10; - bool upper_bound_inclusive = 11; - } - - AtomicAttrDesc atomic = 5; - - // Extras for a union attribute group. - message UnionAttrGroupDesc { - // The default selected child. - string default_selection = 1; - } - - UnionAttrGroupDesc union = 6; -} - -// Define an input/output for component. -message IoDef { - // should be unique among all IOs of the component. - string name = 1; - - string desc = 2; - - // Must be one of DistData.type in data.proto - repeated string types = 3; - - // An extra attribute for a table. - // - // If provided in a IoDef, e.g. - // ```json - // { - // "name": "feature", - // "types": [ - // "int", - // "float" - // ], - // "col_min_cnt_inclusive": 1, - // "col_max_cnt": 3, - // "attrs": [ - // { - // "name": "bucket_size", - // "type": "AT_INT" - // } - // ] - // } - // ``` - // means after a user provide a table as IO, they should also specify - // cols as "feature": - // - col_min_cnt_inclusive is 1: At least 1 col to be selected. - // - col_max_cnt_inclusive is 3: At most 3 cols to be selected. - // And afterwards, user have to fill an int attribute called bucket_size for - // each selected cols. - message TableAttrDef { - // Must be unique among all attributes for the table. - string name = 1; - - string desc = 2; - - // Accepted col data types. - // Please check DistData.VerticalTable in data.proto. - repeated string types = 3; - - // inclusive - int64 col_min_cnt_inclusive = 4; - int64 col_max_cnt_inclusive = 5; - - // extra attribute for specified col. - repeated AttributeDef attrs = 6; - } - - // Only valid for tables. - // The attribute path for a TableAttrDef is `{input\|output}/{IoDef - // name}/{TableAttrDef name}`. - repeated TableAttrDef attrs = 4; -} - -// The definition of a comp. -message ComponentDef { - // Namespace of the comp. - string domain = 1; - - // Should be unique among all comps of the same domain. - string name = 2; - - string desc = 3; - - // Version of the comp. - string version = 4; - - repeated AttributeDef attrs = 5; - - repeated IoDef inputs = 6; - - repeated IoDef outputs = 7; -} - -// A list of components -message CompListDef { - string name = 1; - - string desc = 2; - - string version = 3; - - repeated ComponentDef comps = 4; -} diff --git a/secretflow/protos/component/comp_pb2.py b/secretflow/protos/component/comp_pb2.py deleted file mode 100644 index b80d01432..000000000 --- a/secretflow/protos/component/comp_pb2.py +++ /dev/null @@ -1,734 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by the protocol buffer compiler. DO NOT EDIT! -# source: secretflow/protos/component/comp.proto - -from google.protobuf.internal import enum_type_wrapper -from google.protobuf import descriptor as _descriptor -from google.protobuf import message as _message -from google.protobuf import reflection as _reflection -from google.protobuf import symbol_database as _symbol_database -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - - - - -DESCRIPTOR = _descriptor.FileDescriptor( - name='secretflow/protos/component/comp.proto', - package='secretflow.component', - syntax='proto3', - serialized_options=b'\n\036org.secretflow.proto.component', - create_key=_descriptor._internal_create_key, - serialized_pb=b'\n&secretflow/protos/component/comp.proto\x12\x14secretflow.component\"z\n\tAttribute\x12\t\n\x01\x66\x18\x01 \x01(\x02\x12\x0b\n\x03i64\x18\x02 \x01(\x03\x12\t\n\x01s\x18\x03 \x01(\t\x12\t\n\x01\x62\x18\x04 \x01(\x08\x12\n\n\x02\x66s\x18\x05 \x03(\x02\x12\x0c\n\x04i64s\x18\x06 \x03(\x03\x12\n\n\x02ss\x18\x07 \x03(\t\x12\n\n\x02\x62s\x18\x08 \x03(\x08\x12\r\n\x05is_na\x18\t \x01(\x08\"\xdf\x05\n\x0c\x41ttributeDef\x12\x10\n\x08prefixes\x18\x01 \x03(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04\x64\x65sc\x18\x03 \x01(\t\x12,\n\x04type\x18\x04 \x01(\x0e\x32\x1e.secretflow.component.AttrType\x12\x41\n\x06\x61tomic\x18\x05 \x01(\x0b\x32\x31.secretflow.component.AttributeDef.AtomicAttrDesc\x12\x44\n\x05union\x18\x06 \x01(\x0b\x32\x35.secretflow.component.AttributeDef.UnionAttrGroupDesc\x1a\xb8\x03\n\x0e\x41tomicAttrDesc\x12!\n\x19list_min_length_inclusive\x18\x01 \x01(\x03\x12!\n\x19list_max_length_inclusive\x18\x02 \x01(\x03\x12\x13\n\x0bis_optional\x18\x03 \x01(\x08\x12\x36\n\rdefault_value\x18\x04 \x01(\x0b\x32\x1f.secretflow.component.Attribute\x12\x37\n\x0e\x61llowed_values\x18\x05 \x01(\x0b\x32\x1f.secretflow.component.Attribute\x12\x17\n\x0fhas_lower_bound\x18\x06 \x01(\x08\x12\x34\n\x0blower_bound\x18\x07 \x01(\x0b\x32\x1f.secretflow.component.Attribute\x12\x1d\n\x15lower_bound_inclusive\x18\x08 \x01(\x08\x12\x17\n\x0fhas_upper_bound\x18\t \x01(\x08\x12\x34\n\x0bupper_bound\x18\n \x01(\x0b\x32\x1f.secretflow.component.Attribute\x12\x1d\n\x15upper_bound_inclusive\x18\x0b \x01(\x08\x1a/\n\x12UnionAttrGroupDesc\x12\x19\n\x11\x64\x65\x66\x61ult_selection\x18\x01 \x01(\t\"\x98\x02\n\x05IoDef\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04\x64\x65sc\x18\x02 \x01(\t\x12\r\n\x05types\x18\x03 \x03(\t\x12\x37\n\x05\x61ttrs\x18\x04 \x03(\x0b\x32(.secretflow.component.IoDef.TableAttrDef\x1a\xaa\x01\n\x0cTableAttrDef\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04\x64\x65sc\x18\x02 \x01(\t\x12\r\n\x05types\x18\x03 \x03(\t\x12\x1d\n\x15\x63ol_min_cnt_inclusive\x18\x04 \x01(\x03\x12\x1d\n\x15\x63ol_max_cnt_inclusive\x18\x05 \x01(\x03\x12\x31\n\x05\x61ttrs\x18\x06 \x03(\x0b\x32\".secretflow.component.AttributeDef\"\xd9\x01\n\x0c\x43omponentDef\x12\x0e\n\x06\x64omain\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04\x64\x65sc\x18\x03 \x01(\t\x12\x0f\n\x07version\x18\x04 \x01(\t\x12\x31\n\x05\x61ttrs\x18\x05 \x03(\x0b\x32\".secretflow.component.AttributeDef\x12+\n\x06inputs\x18\x06 \x03(\x0b\x32\x1b.secretflow.component.IoDef\x12,\n\x07outputs\x18\x07 \x03(\x0b\x32\x1b.secretflow.component.IoDef\"m\n\x0b\x43ompListDef\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04\x64\x65sc\x18\x02 \x01(\t\x12\x0f\n\x07version\x18\x03 \x01(\t\x12\x31\n\x05\x63omps\x18\x04 \x03(\x0b\x32\".secretflow.component.ComponentDef*\xca\x01\n\x08\x41ttrType\x12\x10\n\x0c\x41T_UNDEFINED\x10\x00\x12\x0c\n\x08\x41T_FLOAT\x10\x01\x12\n\n\x06\x41T_INT\x10\x02\x12\r\n\tAT_STRING\x10\x03\x12\x0b\n\x07\x41T_BOOL\x10\x04\x12\r\n\tAT_FLOATS\x10\x05\x12\x0b\n\x07\x41T_INTS\x10\x06\x12\x0e\n\nAT_STRINGS\x10\x07\x12\x0c\n\x08\x41T_BOOLS\x10\x08\x12\x13\n\x0f\x41T_STRUCT_GROUP\x10\t\x12\x12\n\x0e\x41T_UNION_GROUP\x10\n\x12\x13\n\x0f\x41T_SF_TABLE_COL\x10\x0b\x42 \n\x1eorg.secretflow.proto.componentb\x06proto3' -) - -_ATTRTYPE = _descriptor.EnumDescriptor( - name='AttrType', - full_name='secretflow.component.AttrType', - filename=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - values=[ - _descriptor.EnumValueDescriptor( - name='AT_UNDEFINED', index=0, number=0, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key), - _descriptor.EnumValueDescriptor( - name='AT_FLOAT', index=1, number=1, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key), - _descriptor.EnumValueDescriptor( - name='AT_INT', index=2, number=2, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key), - _descriptor.EnumValueDescriptor( - name='AT_STRING', index=3, number=3, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key), - _descriptor.EnumValueDescriptor( - name='AT_BOOL', index=4, number=4, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key), - _descriptor.EnumValueDescriptor( - name='AT_FLOATS', index=5, number=5, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key), - _descriptor.EnumValueDescriptor( - name='AT_INTS', index=6, number=6, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key), - _descriptor.EnumValueDescriptor( - name='AT_STRINGS', index=7, number=7, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key), - _descriptor.EnumValueDescriptor( - name='AT_BOOLS', index=8, number=8, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key), - _descriptor.EnumValueDescriptor( - name='AT_STRUCT_GROUP', index=9, number=9, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key), - _descriptor.EnumValueDescriptor( - name='AT_UNION_GROUP', index=10, number=10, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key), - _descriptor.EnumValueDescriptor( - name='AT_SF_TABLE_COL', index=11, number=11, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key), - ], - containing_type=None, - serialized_options=None, - serialized_start=1541, - serialized_end=1743, -) -_sym_db.RegisterEnumDescriptor(_ATTRTYPE) - -AttrType = enum_type_wrapper.EnumTypeWrapper(_ATTRTYPE) -AT_UNDEFINED = 0 -AT_FLOAT = 1 -AT_INT = 2 -AT_STRING = 3 -AT_BOOL = 4 -AT_FLOATS = 5 -AT_INTS = 6 -AT_STRINGS = 7 -AT_BOOLS = 8 -AT_STRUCT_GROUP = 9 -AT_UNION_GROUP = 10 -AT_SF_TABLE_COL = 11 - - - -_ATTRIBUTE = _descriptor.Descriptor( - name='Attribute', - full_name='secretflow.component.Attribute', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='f', full_name='secretflow.component.Attribute.f', index=0, - number=1, type=2, cpp_type=6, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='i64', full_name='secretflow.component.Attribute.i64', index=1, - number=2, type=3, cpp_type=2, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='s', full_name='secretflow.component.Attribute.s', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='b', full_name='secretflow.component.Attribute.b', index=3, - number=4, type=8, cpp_type=7, label=1, - has_default_value=False, default_value=False, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='fs', full_name='secretflow.component.Attribute.fs', index=4, - number=5, type=2, cpp_type=6, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='i64s', full_name='secretflow.component.Attribute.i64s', index=5, - number=6, type=3, cpp_type=2, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='ss', full_name='secretflow.component.Attribute.ss', index=6, - number=7, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='bs', full_name='secretflow.component.Attribute.bs', index=7, - number=8, type=8, cpp_type=7, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='is_na', full_name='secretflow.component.Attribute.is_na', index=8, - number=9, type=8, cpp_type=7, label=1, - has_default_value=False, default_value=False, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=64, - serialized_end=186, -) - - -_ATTRIBUTEDEF_ATOMICATTRDESC = _descriptor.Descriptor( - name='AtomicAttrDesc', - full_name='secretflow.component.AttributeDef.AtomicAttrDesc', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='list_min_length_inclusive', full_name='secretflow.component.AttributeDef.AtomicAttrDesc.list_min_length_inclusive', index=0, - number=1, type=3, cpp_type=2, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='list_max_length_inclusive', full_name='secretflow.component.AttributeDef.AtomicAttrDesc.list_max_length_inclusive', index=1, - number=2, type=3, cpp_type=2, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='is_optional', full_name='secretflow.component.AttributeDef.AtomicAttrDesc.is_optional', index=2, - number=3, type=8, cpp_type=7, label=1, - has_default_value=False, default_value=False, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='default_value', full_name='secretflow.component.AttributeDef.AtomicAttrDesc.default_value', index=3, - number=4, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='allowed_values', full_name='secretflow.component.AttributeDef.AtomicAttrDesc.allowed_values', index=4, - number=5, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='has_lower_bound', full_name='secretflow.component.AttributeDef.AtomicAttrDesc.has_lower_bound', index=5, - number=6, type=8, cpp_type=7, label=1, - has_default_value=False, default_value=False, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='lower_bound', full_name='secretflow.component.AttributeDef.AtomicAttrDesc.lower_bound', index=6, - number=7, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='lower_bound_inclusive', full_name='secretflow.component.AttributeDef.AtomicAttrDesc.lower_bound_inclusive', index=7, - number=8, type=8, cpp_type=7, label=1, - has_default_value=False, default_value=False, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='has_upper_bound', full_name='secretflow.component.AttributeDef.AtomicAttrDesc.has_upper_bound', index=8, - number=9, type=8, cpp_type=7, label=1, - has_default_value=False, default_value=False, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='upper_bound', full_name='secretflow.component.AttributeDef.AtomicAttrDesc.upper_bound', index=9, - number=10, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='upper_bound_inclusive', full_name='secretflow.component.AttributeDef.AtomicAttrDesc.upper_bound_inclusive', index=10, - number=11, type=8, cpp_type=7, label=1, - has_default_value=False, default_value=False, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=435, - serialized_end=875, -) - -_ATTRIBUTEDEF_UNIONATTRGROUPDESC = _descriptor.Descriptor( - name='UnionAttrGroupDesc', - full_name='secretflow.component.AttributeDef.UnionAttrGroupDesc', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='default_selection', full_name='secretflow.component.AttributeDef.UnionAttrGroupDesc.default_selection', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=877, - serialized_end=924, -) - -_ATTRIBUTEDEF = _descriptor.Descriptor( - name='AttributeDef', - full_name='secretflow.component.AttributeDef', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='prefixes', full_name='secretflow.component.AttributeDef.prefixes', index=0, - number=1, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='name', full_name='secretflow.component.AttributeDef.name', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='desc', full_name='secretflow.component.AttributeDef.desc', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='type', full_name='secretflow.component.AttributeDef.type', index=3, - number=4, type=14, cpp_type=8, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='atomic', full_name='secretflow.component.AttributeDef.atomic', index=4, - number=5, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='union', full_name='secretflow.component.AttributeDef.union', index=5, - number=6, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[_ATTRIBUTEDEF_ATOMICATTRDESC, _ATTRIBUTEDEF_UNIONATTRGROUPDESC, ], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=189, - serialized_end=924, -) - - -_IODEF_TABLEATTRDEF = _descriptor.Descriptor( - name='TableAttrDef', - full_name='secretflow.component.IoDef.TableAttrDef', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='name', full_name='secretflow.component.IoDef.TableAttrDef.name', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='desc', full_name='secretflow.component.IoDef.TableAttrDef.desc', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='types', full_name='secretflow.component.IoDef.TableAttrDef.types', index=2, - number=3, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='col_min_cnt_inclusive', full_name='secretflow.component.IoDef.TableAttrDef.col_min_cnt_inclusive', index=3, - number=4, type=3, cpp_type=2, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='col_max_cnt_inclusive', full_name='secretflow.component.IoDef.TableAttrDef.col_max_cnt_inclusive', index=4, - number=5, type=3, cpp_type=2, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='attrs', full_name='secretflow.component.IoDef.TableAttrDef.attrs', index=5, - number=6, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1037, - serialized_end=1207, -) - -_IODEF = _descriptor.Descriptor( - name='IoDef', - full_name='secretflow.component.IoDef', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='name', full_name='secretflow.component.IoDef.name', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='desc', full_name='secretflow.component.IoDef.desc', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='types', full_name='secretflow.component.IoDef.types', index=2, - number=3, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='attrs', full_name='secretflow.component.IoDef.attrs', index=3, - number=4, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[_IODEF_TABLEATTRDEF, ], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=927, - serialized_end=1207, -) - - -_COMPONENTDEF = _descriptor.Descriptor( - name='ComponentDef', - full_name='secretflow.component.ComponentDef', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='domain', full_name='secretflow.component.ComponentDef.domain', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='name', full_name='secretflow.component.ComponentDef.name', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='desc', full_name='secretflow.component.ComponentDef.desc', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='version', full_name='secretflow.component.ComponentDef.version', index=3, - number=4, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='attrs', full_name='secretflow.component.ComponentDef.attrs', index=4, - number=5, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='inputs', full_name='secretflow.component.ComponentDef.inputs', index=5, - number=6, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='outputs', full_name='secretflow.component.ComponentDef.outputs', index=6, - number=7, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1210, - serialized_end=1427, -) - - -_COMPLISTDEF = _descriptor.Descriptor( - name='CompListDef', - full_name='secretflow.component.CompListDef', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='name', full_name='secretflow.component.CompListDef.name', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='desc', full_name='secretflow.component.CompListDef.desc', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='version', full_name='secretflow.component.CompListDef.version', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='comps', full_name='secretflow.component.CompListDef.comps', index=3, - number=4, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1429, - serialized_end=1538, -) - -_ATTRIBUTEDEF_ATOMICATTRDESC.fields_by_name['default_value'].message_type = _ATTRIBUTE -_ATTRIBUTEDEF_ATOMICATTRDESC.fields_by_name['allowed_values'].message_type = _ATTRIBUTE -_ATTRIBUTEDEF_ATOMICATTRDESC.fields_by_name['lower_bound'].message_type = _ATTRIBUTE -_ATTRIBUTEDEF_ATOMICATTRDESC.fields_by_name['upper_bound'].message_type = _ATTRIBUTE -_ATTRIBUTEDEF_ATOMICATTRDESC.containing_type = _ATTRIBUTEDEF -_ATTRIBUTEDEF_UNIONATTRGROUPDESC.containing_type = _ATTRIBUTEDEF -_ATTRIBUTEDEF.fields_by_name['type'].enum_type = _ATTRTYPE -_ATTRIBUTEDEF.fields_by_name['atomic'].message_type = _ATTRIBUTEDEF_ATOMICATTRDESC -_ATTRIBUTEDEF.fields_by_name['union'].message_type = _ATTRIBUTEDEF_UNIONATTRGROUPDESC -_IODEF_TABLEATTRDEF.fields_by_name['attrs'].message_type = _ATTRIBUTEDEF -_IODEF_TABLEATTRDEF.containing_type = _IODEF -_IODEF.fields_by_name['attrs'].message_type = _IODEF_TABLEATTRDEF -_COMPONENTDEF.fields_by_name['attrs'].message_type = _ATTRIBUTEDEF -_COMPONENTDEF.fields_by_name['inputs'].message_type = _IODEF -_COMPONENTDEF.fields_by_name['outputs'].message_type = _IODEF -_COMPLISTDEF.fields_by_name['comps'].message_type = _COMPONENTDEF -DESCRIPTOR.message_types_by_name['Attribute'] = _ATTRIBUTE -DESCRIPTOR.message_types_by_name['AttributeDef'] = _ATTRIBUTEDEF -DESCRIPTOR.message_types_by_name['IoDef'] = _IODEF -DESCRIPTOR.message_types_by_name['ComponentDef'] = _COMPONENTDEF -DESCRIPTOR.message_types_by_name['CompListDef'] = _COMPLISTDEF -DESCRIPTOR.enum_types_by_name['AttrType'] = _ATTRTYPE -_sym_db.RegisterFileDescriptor(DESCRIPTOR) - -Attribute = _reflection.GeneratedProtocolMessageType('Attribute', (_message.Message,), { - 'DESCRIPTOR' : _ATTRIBUTE, - '__module__' : 'secretflow.protos.component.comp_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.Attribute) - }) -_sym_db.RegisterMessage(Attribute) - -AttributeDef = _reflection.GeneratedProtocolMessageType('AttributeDef', (_message.Message,), { - - 'AtomicAttrDesc' : _reflection.GeneratedProtocolMessageType('AtomicAttrDesc', (_message.Message,), { - 'DESCRIPTOR' : _ATTRIBUTEDEF_ATOMICATTRDESC, - '__module__' : 'secretflow.protos.component.comp_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.AttributeDef.AtomicAttrDesc) - }) - , - - 'UnionAttrGroupDesc' : _reflection.GeneratedProtocolMessageType('UnionAttrGroupDesc', (_message.Message,), { - 'DESCRIPTOR' : _ATTRIBUTEDEF_UNIONATTRGROUPDESC, - '__module__' : 'secretflow.protos.component.comp_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.AttributeDef.UnionAttrGroupDesc) - }) - , - 'DESCRIPTOR' : _ATTRIBUTEDEF, - '__module__' : 'secretflow.protos.component.comp_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.AttributeDef) - }) -_sym_db.RegisterMessage(AttributeDef) -_sym_db.RegisterMessage(AttributeDef.AtomicAttrDesc) -_sym_db.RegisterMessage(AttributeDef.UnionAttrGroupDesc) - -IoDef = _reflection.GeneratedProtocolMessageType('IoDef', (_message.Message,), { - - 'TableAttrDef' : _reflection.GeneratedProtocolMessageType('TableAttrDef', (_message.Message,), { - 'DESCRIPTOR' : _IODEF_TABLEATTRDEF, - '__module__' : 'secretflow.protos.component.comp_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.IoDef.TableAttrDef) - }) - , - 'DESCRIPTOR' : _IODEF, - '__module__' : 'secretflow.protos.component.comp_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.IoDef) - }) -_sym_db.RegisterMessage(IoDef) -_sym_db.RegisterMessage(IoDef.TableAttrDef) - -ComponentDef = _reflection.GeneratedProtocolMessageType('ComponentDef', (_message.Message,), { - 'DESCRIPTOR' : _COMPONENTDEF, - '__module__' : 'secretflow.protos.component.comp_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.ComponentDef) - }) -_sym_db.RegisterMessage(ComponentDef) - -CompListDef = _reflection.GeneratedProtocolMessageType('CompListDef', (_message.Message,), { - 'DESCRIPTOR' : _COMPLISTDEF, - '__module__' : 'secretflow.protos.component.comp_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.CompListDef) - }) -_sym_db.RegisterMessage(CompListDef) - - -DESCRIPTOR._options = None -# @@protoc_insertion_point(module_scope) diff --git a/secretflow/protos/component/data.proto b/secretflow/protos/component/data.proto deleted file mode 100644 index b0b8ab859..000000000 --- a/secretflow/protos/component/data.proto +++ /dev/null @@ -1,159 +0,0 @@ -// Copyright 2023 Ant Group Co., Ltd. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -syntax = "proto3"; - -package secretflow.component; - -import "google/protobuf/any.proto"; - -import "secretflow/protos/component/cluster.proto"; - -option java_package = "org.secretflow.proto.component"; - -// Describe the application related to data. -// - SCQL, GRM related meta information should be here. -// - You can add more field here, when another application is added. -message SystemInfo { - // The application name. - // Supported: `secretflow` - string app_name = 1; - - // For secretflow. - SFClusterDesc secretflow = 2; -} - -// The schema of a table. -// - A col must be one of `id | feature | label`. By default, it should be a -// feature. -// - All names must match the regexp `[A-Za-z0-9.][A-Za-z0-9_>./]*`. -// - All data type must be one of -// * int8 -// * int16 -// * int32 -// * int64 -// * uint8 -// * uint16 -// * uint32 -// * uint64 -// * float16 -// * float32 -// * float64 -// * bool -// * int -// * float -// * str -message TableSchema { - // Id column name(s). - // Optional, can be empty. - repeated string ids = 1; - - // Feature column name(s). - repeated string features = 2; - - // Label column name(s). - // Optional, can be empty. - repeated string labels = 3; - - // Id column data type(s). - // Len(id) should match len(id_types). - repeated string id_types = 4; - - // Feature column data type(s). - // Len(features) should match len(feature_types). - repeated string feature_types = 5; - - // Label column data type(s). - // Len(labels) should match len(label_types). - repeated string label_types = 6; -} - -// VerticalTable describes a vertical virtual table from multiple parties. -// > TODO: move this to secretflow/protos/builtin/ -// -// > Guide: if some type is only required to be handle inside a specific system, -// for instance woe.rule file in engine, we don't need to define a new type -// here. -message VerticalTable { - // The vertical partitioned slices' schema. - // Must match data_refs in the parent DistData message. - repeated TableSchema schemas = 1; - - // If -1, the number is unknown. - int64 num_lines = 2; -} - -// IndividualTable describes a table owned by a single party. -message IndividualTable { - TableSchema schema = 1; - - // If -1, the number is unknown. - int64 num_lines = 2; -} - -// Descibes public storage info for a collection of Device Objects. -message DeviceObjectCollection { - message DeviceObject { - // Supported: `spu \| pyu` - string type = 1; - // Index of data_ref in the parent DistData message. - repeated int32 data_ref_idxs = 2; - } - - repeated DeviceObject objs = 1; - - // Any public information. - string public_info = 2; -} - -// A public record for a general distributed data. -// -// The type of this distributed data, should be meaningful to components. -// -// The concrete data format (include public and private parts) is defined by -// other protos. -// -// Suggested names, i.e. -// - sf.table.vertical_table represent a secretflow vertical table -// - sf.model.* represent a secretflow models. -// - sf.rule.* represent a secretflow rules. -message DistData { - // The name of this distributed data. - string name = 1; - - string type = 2; - - // Describe the system information that used to generate this distributed - // data. - SystemInfo sys_info = 3; - - // Public information, known to all parties. - // i.e. VerticalTable - google.protobuf.Any meta = 4; - - // A reference to a data that is stored in the remote path. - message DataRef { - // The path information relative to StorageConfig of the party. - string uri = 1; - - // The owner party. - string party = 2; - - // The storage format, i.e. csv. - string format = 3; - } - - // Remote data references. - repeated DataRef data_refs = 5; -} diff --git a/secretflow/protos/component/data_pb2.py b/secretflow/protos/component/data_pb2.py deleted file mode 100644 index 70ca4c82f..000000000 --- a/secretflow/protos/component/data_pb2.py +++ /dev/null @@ -1,473 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by the protocol buffer compiler. DO NOT EDIT! -# source: secretflow/protos/component/data.proto - -from google.protobuf import descriptor as _descriptor -from google.protobuf import message as _message -from google.protobuf import reflection as _reflection -from google.protobuf import symbol_database as _symbol_database -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - - -from google.protobuf import any_pb2 as google_dot_protobuf_dot_any__pb2 -from secretflow.protos.component import cluster_pb2 as secretflow_dot_protos_dot_component_dot_cluster__pb2 - - -DESCRIPTOR = _descriptor.FileDescriptor( - name='secretflow/protos/component/data.proto', - package='secretflow.component', - syntax='proto3', - serialized_options=b'\n\036org.secretflow.proto.component', - create_key=_descriptor._internal_create_key, - serialized_pb=b'\n&secretflow/protos/component/data.proto\x12\x14secretflow.component\x1a\x19google/protobuf/any.proto\x1a)secretflow/protos/component/cluster.proto\"W\n\nSystemInfo\x12\x10\n\x08\x61pp_name\x18\x01 \x01(\t\x12\x37\n\nsecretflow\x18\x02 \x01(\x0b\x32#.secretflow.component.SFClusterDesc\"z\n\x0bTableSchema\x12\x0b\n\x03ids\x18\x01 \x03(\t\x12\x10\n\x08\x66\x65\x61tures\x18\x02 \x03(\t\x12\x0e\n\x06labels\x18\x03 \x03(\t\x12\x10\n\x08id_types\x18\x04 \x03(\t\x12\x15\n\rfeature_types\x18\x05 \x03(\t\x12\x13\n\x0blabel_types\x18\x06 \x03(\t\"V\n\rVerticalTable\x12\x32\n\x07schemas\x18\x01 \x03(\x0b\x32!.secretflow.component.TableSchema\x12\x11\n\tnum_lines\x18\x02 \x01(\x03\"W\n\x0fIndividualTable\x12\x31\n\x06schema\x18\x01 \x01(\x0b\x32!.secretflow.component.TableSchema\x12\x11\n\tnum_lines\x18\x02 \x01(\x03\"\xab\x01\n\x16\x44\x65viceObjectCollection\x12G\n\x04objs\x18\x01 \x03(\x0b\x32\x39.secretflow.component.DeviceObjectCollection.DeviceObject\x12\x13\n\x0bpublic_info\x18\x02 \x01(\t\x1a\x33\n\x0c\x44\x65viceObject\x12\x0c\n\x04type\x18\x01 \x01(\t\x12\x15\n\rdata_ref_idxs\x18\x02 \x03(\x05\"\xf0\x01\n\x08\x44istData\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\x12\x32\n\x08sys_info\x18\x03 \x01(\x0b\x32 .secretflow.component.SystemInfo\x12\"\n\x04meta\x18\x04 \x01(\x0b\x32\x14.google.protobuf.Any\x12\x39\n\tdata_refs\x18\x05 \x03(\x0b\x32&.secretflow.component.DistData.DataRef\x1a\x35\n\x07\x44\x61taRef\x12\x0b\n\x03uri\x18\x01 \x01(\t\x12\r\n\x05party\x18\x02 \x01(\t\x12\x0e\n\x06\x66ormat\x18\x03 \x01(\tB \n\x1eorg.secretflow.proto.componentb\x06proto3' - , - dependencies=[google_dot_protobuf_dot_any__pb2.DESCRIPTOR,secretflow_dot_protos_dot_component_dot_cluster__pb2.DESCRIPTOR,]) - - - - -_SYSTEMINFO = _descriptor.Descriptor( - name='SystemInfo', - full_name='secretflow.component.SystemInfo', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='app_name', full_name='secretflow.component.SystemInfo.app_name', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='secretflow', full_name='secretflow.component.SystemInfo.secretflow', index=1, - number=2, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=134, - serialized_end=221, -) - - -_TABLESCHEMA = _descriptor.Descriptor( - name='TableSchema', - full_name='secretflow.component.TableSchema', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='ids', full_name='secretflow.component.TableSchema.ids', index=0, - number=1, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='features', full_name='secretflow.component.TableSchema.features', index=1, - number=2, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='labels', full_name='secretflow.component.TableSchema.labels', index=2, - number=3, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='id_types', full_name='secretflow.component.TableSchema.id_types', index=3, - number=4, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='feature_types', full_name='secretflow.component.TableSchema.feature_types', index=4, - number=5, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='label_types', full_name='secretflow.component.TableSchema.label_types', index=5, - number=6, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=223, - serialized_end=345, -) - - -_VERTICALTABLE = _descriptor.Descriptor( - name='VerticalTable', - full_name='secretflow.component.VerticalTable', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='schemas', full_name='secretflow.component.VerticalTable.schemas', index=0, - number=1, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='num_lines', full_name='secretflow.component.VerticalTable.num_lines', index=1, - number=2, type=3, cpp_type=2, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=347, - serialized_end=433, -) - - -_INDIVIDUALTABLE = _descriptor.Descriptor( - name='IndividualTable', - full_name='secretflow.component.IndividualTable', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='schema', full_name='secretflow.component.IndividualTable.schema', index=0, - number=1, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='num_lines', full_name='secretflow.component.IndividualTable.num_lines', index=1, - number=2, type=3, cpp_type=2, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=435, - serialized_end=522, -) - - -_DEVICEOBJECTCOLLECTION_DEVICEOBJECT = _descriptor.Descriptor( - name='DeviceObject', - full_name='secretflow.component.DeviceObjectCollection.DeviceObject', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='type', full_name='secretflow.component.DeviceObjectCollection.DeviceObject.type', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='data_ref_idxs', full_name='secretflow.component.DeviceObjectCollection.DeviceObject.data_ref_idxs', index=1, - number=2, type=5, cpp_type=1, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=645, - serialized_end=696, -) - -_DEVICEOBJECTCOLLECTION = _descriptor.Descriptor( - name='DeviceObjectCollection', - full_name='secretflow.component.DeviceObjectCollection', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='objs', full_name='secretflow.component.DeviceObjectCollection.objs', index=0, - number=1, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='public_info', full_name='secretflow.component.DeviceObjectCollection.public_info', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[_DEVICEOBJECTCOLLECTION_DEVICEOBJECT, ], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=525, - serialized_end=696, -) - - -_DISTDATA_DATAREF = _descriptor.Descriptor( - name='DataRef', - full_name='secretflow.component.DistData.DataRef', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='uri', full_name='secretflow.component.DistData.DataRef.uri', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='party', full_name='secretflow.component.DistData.DataRef.party', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='format', full_name='secretflow.component.DistData.DataRef.format', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=886, - serialized_end=939, -) - -_DISTDATA = _descriptor.Descriptor( - name='DistData', - full_name='secretflow.component.DistData', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='name', full_name='secretflow.component.DistData.name', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='type', full_name='secretflow.component.DistData.type', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='sys_info', full_name='secretflow.component.DistData.sys_info', index=2, - number=3, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='meta', full_name='secretflow.component.DistData.meta', index=3, - number=4, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='data_refs', full_name='secretflow.component.DistData.data_refs', index=4, - number=5, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[_DISTDATA_DATAREF, ], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=699, - serialized_end=939, -) - -_SYSTEMINFO.fields_by_name['secretflow'].message_type = secretflow_dot_protos_dot_component_dot_cluster__pb2._SFCLUSTERDESC -_VERTICALTABLE.fields_by_name['schemas'].message_type = _TABLESCHEMA -_INDIVIDUALTABLE.fields_by_name['schema'].message_type = _TABLESCHEMA -_DEVICEOBJECTCOLLECTION_DEVICEOBJECT.containing_type = _DEVICEOBJECTCOLLECTION -_DEVICEOBJECTCOLLECTION.fields_by_name['objs'].message_type = _DEVICEOBJECTCOLLECTION_DEVICEOBJECT -_DISTDATA_DATAREF.containing_type = _DISTDATA -_DISTDATA.fields_by_name['sys_info'].message_type = _SYSTEMINFO -_DISTDATA.fields_by_name['meta'].message_type = google_dot_protobuf_dot_any__pb2._ANY -_DISTDATA.fields_by_name['data_refs'].message_type = _DISTDATA_DATAREF -DESCRIPTOR.message_types_by_name['SystemInfo'] = _SYSTEMINFO -DESCRIPTOR.message_types_by_name['TableSchema'] = _TABLESCHEMA -DESCRIPTOR.message_types_by_name['VerticalTable'] = _VERTICALTABLE -DESCRIPTOR.message_types_by_name['IndividualTable'] = _INDIVIDUALTABLE -DESCRIPTOR.message_types_by_name['DeviceObjectCollection'] = _DEVICEOBJECTCOLLECTION -DESCRIPTOR.message_types_by_name['DistData'] = _DISTDATA -_sym_db.RegisterFileDescriptor(DESCRIPTOR) - -SystemInfo = _reflection.GeneratedProtocolMessageType('SystemInfo', (_message.Message,), { - 'DESCRIPTOR' : _SYSTEMINFO, - '__module__' : 'secretflow.protos.component.data_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.SystemInfo) - }) -_sym_db.RegisterMessage(SystemInfo) - -TableSchema = _reflection.GeneratedProtocolMessageType('TableSchema', (_message.Message,), { - 'DESCRIPTOR' : _TABLESCHEMA, - '__module__' : 'secretflow.protos.component.data_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.TableSchema) - }) -_sym_db.RegisterMessage(TableSchema) - -VerticalTable = _reflection.GeneratedProtocolMessageType('VerticalTable', (_message.Message,), { - 'DESCRIPTOR' : _VERTICALTABLE, - '__module__' : 'secretflow.protos.component.data_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.VerticalTable) - }) -_sym_db.RegisterMessage(VerticalTable) - -IndividualTable = _reflection.GeneratedProtocolMessageType('IndividualTable', (_message.Message,), { - 'DESCRIPTOR' : _INDIVIDUALTABLE, - '__module__' : 'secretflow.protos.component.data_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.IndividualTable) - }) -_sym_db.RegisterMessage(IndividualTable) - -DeviceObjectCollection = _reflection.GeneratedProtocolMessageType('DeviceObjectCollection', (_message.Message,), { - - 'DeviceObject' : _reflection.GeneratedProtocolMessageType('DeviceObject', (_message.Message,), { - 'DESCRIPTOR' : _DEVICEOBJECTCOLLECTION_DEVICEOBJECT, - '__module__' : 'secretflow.protos.component.data_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.DeviceObjectCollection.DeviceObject) - }) - , - 'DESCRIPTOR' : _DEVICEOBJECTCOLLECTION, - '__module__' : 'secretflow.protos.component.data_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.DeviceObjectCollection) - }) -_sym_db.RegisterMessage(DeviceObjectCollection) -_sym_db.RegisterMessage(DeviceObjectCollection.DeviceObject) - -DistData = _reflection.GeneratedProtocolMessageType('DistData', (_message.Message,), { - - 'DataRef' : _reflection.GeneratedProtocolMessageType('DataRef', (_message.Message,), { - 'DESCRIPTOR' : _DISTDATA_DATAREF, - '__module__' : 'secretflow.protos.component.data_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.DistData.DataRef) - }) - , - 'DESCRIPTOR' : _DISTDATA, - '__module__' : 'secretflow.protos.component.data_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.DistData) - }) -_sym_db.RegisterMessage(DistData) -_sym_db.RegisterMessage(DistData.DataRef) - - -DESCRIPTOR._options = None -# @@protoc_insertion_point(module_scope) diff --git a/secretflow/protos/component/evaluation.proto b/secretflow/protos/component/evaluation.proto deleted file mode 100644 index 2c798d3a1..000000000 --- a/secretflow/protos/component/evaluation.proto +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2023 Ant Group Co., Ltd. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -syntax = "proto3"; - -package secretflow.component; - -import "secretflow/protos/component/comp.proto"; -import "secretflow/protos/component/data.proto"; - -option java_package = "org.secretflow.proto.component"; - -// Evaluate a node. -// - comp.evaluate(NodeEvalParam, SFClusterConfig) -> NodeEvalResult -// -// NodeEvalParam contains all the information to evaluate a component. -message NodeEvalParam { - // Domain of the component. - string domain = 1; - - // Name of the component. - string name = 2; - - // Version of the component. - string version = 3; - - // The path of attributes. - // The attribute path for a TableAttrDef is `{input\|output}/{IoDef - // name}/{TableAttrDef name}`. - repeated string attr_paths = 4; - - // The value of the attribute. - // Must match attr_paths. - repeated Attribute attrs = 5; - - // The input data, the order of inputs must match inputs in ComponentDef. - // NOTE: Names of DistData doesn't need to match those of inputs in - // ComponentDef definition. - repeated DistData inputs = 6; - - // The output data uris, the order of output_uris must match outputs in - // ComponentDef. - repeated string output_uris = 7; -} - -// NodeEvalResult contains outputs of a component evaluation. -message NodeEvalResult { - // Output data. - repeated DistData outputs = 1; -} diff --git a/secretflow/protos/component/evaluation_pb2.py b/secretflow/protos/component/evaluation_pb2.py deleted file mode 100644 index 2c3da084f..000000000 --- a/secretflow/protos/component/evaluation_pb2.py +++ /dev/null @@ -1,159 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by the protocol buffer compiler. DO NOT EDIT! -# source: secretflow/protos/component/evaluation.proto - -from google.protobuf import descriptor as _descriptor -from google.protobuf import message as _message -from google.protobuf import reflection as _reflection -from google.protobuf import symbol_database as _symbol_database -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - - -from secretflow.protos.component import comp_pb2 as secretflow_dot_protos_dot_component_dot_comp__pb2 -from secretflow.protos.component import data_pb2 as secretflow_dot_protos_dot_component_dot_data__pb2 - - -DESCRIPTOR = _descriptor.FileDescriptor( - name='secretflow/protos/component/evaluation.proto', - package='secretflow.component', - syntax='proto3', - serialized_options=b'\n\036org.secretflow.proto.component', - create_key=_descriptor._internal_create_key, - serialized_pb=b'\n,secretflow/protos/component/evaluation.proto\x12\x14secretflow.component\x1a&secretflow/protos/component/comp.proto\x1a&secretflow/protos/component/data.proto\"\xc7\x01\n\rNodeEvalParam\x12\x0e\n\x06\x64omain\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0f\n\x07version\x18\x03 \x01(\t\x12\x12\n\nattr_paths\x18\x04 \x03(\t\x12.\n\x05\x61ttrs\x18\x05 \x03(\x0b\x32\x1f.secretflow.component.Attribute\x12.\n\x06inputs\x18\x06 \x03(\x0b\x32\x1e.secretflow.component.DistData\x12\x13\n\x0boutput_uris\x18\x07 \x03(\t\"A\n\x0eNodeEvalResult\x12/\n\x07outputs\x18\x01 \x03(\x0b\x32\x1e.secretflow.component.DistDataB \n\x1eorg.secretflow.proto.componentb\x06proto3' - , - dependencies=[secretflow_dot_protos_dot_component_dot_comp__pb2.DESCRIPTOR,secretflow_dot_protos_dot_component_dot_data__pb2.DESCRIPTOR,]) - - - - -_NODEEVALPARAM = _descriptor.Descriptor( - name='NodeEvalParam', - full_name='secretflow.component.NodeEvalParam', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='domain', full_name='secretflow.component.NodeEvalParam.domain', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='name', full_name='secretflow.component.NodeEvalParam.name', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='version', full_name='secretflow.component.NodeEvalParam.version', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='attr_paths', full_name='secretflow.component.NodeEvalParam.attr_paths', index=3, - number=4, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='attrs', full_name='secretflow.component.NodeEvalParam.attrs', index=4, - number=5, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='inputs', full_name='secretflow.component.NodeEvalParam.inputs', index=5, - number=6, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='output_uris', full_name='secretflow.component.NodeEvalParam.output_uris', index=6, - number=7, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=151, - serialized_end=350, -) - - -_NODEEVALRESULT = _descriptor.Descriptor( - name='NodeEvalResult', - full_name='secretflow.component.NodeEvalResult', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='outputs', full_name='secretflow.component.NodeEvalResult.outputs', index=0, - number=1, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=352, - serialized_end=417, -) - -_NODEEVALPARAM.fields_by_name['attrs'].message_type = secretflow_dot_protos_dot_component_dot_comp__pb2._ATTRIBUTE -_NODEEVALPARAM.fields_by_name['inputs'].message_type = secretflow_dot_protos_dot_component_dot_data__pb2._DISTDATA -_NODEEVALRESULT.fields_by_name['outputs'].message_type = secretflow_dot_protos_dot_component_dot_data__pb2._DISTDATA -DESCRIPTOR.message_types_by_name['NodeEvalParam'] = _NODEEVALPARAM -DESCRIPTOR.message_types_by_name['NodeEvalResult'] = _NODEEVALRESULT -_sym_db.RegisterFileDescriptor(DESCRIPTOR) - -NodeEvalParam = _reflection.GeneratedProtocolMessageType('NodeEvalParam', (_message.Message,), { - 'DESCRIPTOR' : _NODEEVALPARAM, - '__module__' : 'secretflow.protos.component.evaluation_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.NodeEvalParam) - }) -_sym_db.RegisterMessage(NodeEvalParam) - -NodeEvalResult = _reflection.GeneratedProtocolMessageType('NodeEvalResult', (_message.Message,), { - 'DESCRIPTOR' : _NODEEVALRESULT, - '__module__' : 'secretflow.protos.component.evaluation_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.NodeEvalResult) - }) -_sym_db.RegisterMessage(NodeEvalResult) - - -DESCRIPTOR._options = None -# @@protoc_insertion_point(module_scope) diff --git a/secretflow/protos/component/report.proto b/secretflow/protos/component/report.proto deleted file mode 100644 index c0d3a0566..000000000 --- a/secretflow/protos/component/report.proto +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright 2023 Ant Group Co., Ltd. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -syntax = "proto3"; - -package secretflow.component; - -import "secretflow/protos/component/comp.proto"; - -option java_package = "org.secretflow.proto.component"; - -// Displays multiple read-only fields in groups. -message Descriptions { - message Item { - // Name of the field. - string name = 1; - - string desc = 2; - - // Must be one of bool/int/float/str - string type = 3; - - Attribute value = 4; - } - - // Name of the Descriptions. - string name = 1; - - string desc = 2; - - repeated Item items = 3; -} - -// Displays rows of data. -message Table { - message HeaderItem { - string name = 1; - - string desc = 2; - - // Must be one of bool/int/float/str - string type = 3; - } - - message Row { - string name = 1; - - string desc = 2; - - repeated Attribute items = 3; - } - - // Name of the Table. - string name = 1; - - string desc = 2; - - repeated HeaderItem headers = 3; - - repeated Row rows = 4; -} - -// A division or a section of a page. -message Div { - message Child { - // Supported: descriptions, table, div. - string type = 1; - - Descriptions descriptions = 2; - - Table table = 3; - - Div div = 4; - } - - // Name of the Div. - string name = 1; - - string desc = 2; - - repeated Child children = 3; -} - -// A page of a report. -message Tab { - // Name of the Tab. - string name = 1; - - string desc = 2; - - repeated Div divs = 3; -} - -message Report { - // Name of the Report. - string name = 1; - - string desc = 2; - - repeated Tab tabs = 3; - - int32 err_code = 4; - - // Structed error detail (JSON encoded message). - string err_detail = 5; -} diff --git a/secretflow/protos/component/report_pb2.py b/secretflow/protos/component/report_pb2.py deleted file mode 100644 index d0dcd3203..000000000 --- a/secretflow/protos/component/report_pb2.py +++ /dev/null @@ -1,565 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by the protocol buffer compiler. DO NOT EDIT! -# source: secretflow/protos/component/report.proto - -from google.protobuf import descriptor as _descriptor -from google.protobuf import message as _message -from google.protobuf import reflection as _reflection -from google.protobuf import symbol_database as _symbol_database -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - - -from secretflow.protos.component import comp_pb2 as secretflow_dot_protos_dot_component_dot_comp__pb2 - - -DESCRIPTOR = _descriptor.FileDescriptor( - name='secretflow/protos/component/report.proto', - package='secretflow.component', - syntax='proto3', - serialized_options=b'\n\036org.secretflow.proto.component', - create_key=_descriptor._internal_create_key, - serialized_pb=b'\n(secretflow/protos/component/report.proto\x12\x14secretflow.component\x1a&secretflow/protos/component/comp.proto\"\xc4\x01\n\x0c\x44\x65scriptions\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04\x64\x65sc\x18\x02 \x01(\t\x12\x36\n\x05items\x18\x03 \x03(\x0b\x32\'.secretflow.component.Descriptions.Item\x1a`\n\x04Item\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04\x64\x65sc\x18\x02 \x01(\t\x12\x0c\n\x04type\x18\x03 \x01(\t\x12.\n\x05value\x18\x04 \x01(\x0b\x32\x1f.secretflow.component.Attribute\"\x96\x02\n\x05Table\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04\x64\x65sc\x18\x02 \x01(\t\x12\x37\n\x07headers\x18\x03 \x03(\x0b\x32&.secretflow.component.Table.HeaderItem\x12-\n\x04rows\x18\x04 \x03(\x0b\x32\x1f.secretflow.component.Table.Row\x1a\x36\n\nHeaderItem\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04\x64\x65sc\x18\x02 \x01(\t\x12\x0c\n\x04type\x18\x03 \x01(\t\x1aQ\n\x03Row\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04\x64\x65sc\x18\x02 \x01(\t\x12.\n\x05items\x18\x03 \x03(\x0b\x32\x1f.secretflow.component.Attribute\"\xfa\x01\n\x03\x44iv\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04\x64\x65sc\x18\x02 \x01(\t\x12\x31\n\x08\x63hildren\x18\x03 \x03(\x0b\x32\x1f.secretflow.component.Div.Child\x1a\xa3\x01\n\x05\x43hild\x12\x0c\n\x04type\x18\x01 \x01(\t\x12\x38\n\x0c\x64\x65scriptions\x18\x02 \x01(\x0b\x32\".secretflow.component.Descriptions\x12*\n\x05table\x18\x03 \x01(\x0b\x32\x1b.secretflow.component.Table\x12&\n\x03\x64iv\x18\x04 \x01(\x0b\x32\x19.secretflow.component.Div\"J\n\x03Tab\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04\x64\x65sc\x18\x02 \x01(\t\x12\'\n\x04\x64ivs\x18\x03 \x03(\x0b\x32\x19.secretflow.component.Div\"s\n\x06Report\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04\x64\x65sc\x18\x02 \x01(\t\x12\'\n\x04tabs\x18\x03 \x03(\x0b\x32\x19.secretflow.component.Tab\x12\x10\n\x08\x65rr_code\x18\x04 \x01(\x05\x12\x12\n\nerr_detail\x18\x05 \x01(\tB \n\x1eorg.secretflow.proto.componentb\x06proto3' - , - dependencies=[secretflow_dot_protos_dot_component_dot_comp__pb2.DESCRIPTOR,]) - - - - -_DESCRIPTIONS_ITEM = _descriptor.Descriptor( - name='Item', - full_name='secretflow.component.Descriptions.Item', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='name', full_name='secretflow.component.Descriptions.Item.name', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='desc', full_name='secretflow.component.Descriptions.Item.desc', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='type', full_name='secretflow.component.Descriptions.Item.type', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='value', full_name='secretflow.component.Descriptions.Item.value', index=3, - number=4, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=207, - serialized_end=303, -) - -_DESCRIPTIONS = _descriptor.Descriptor( - name='Descriptions', - full_name='secretflow.component.Descriptions', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='name', full_name='secretflow.component.Descriptions.name', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='desc', full_name='secretflow.component.Descriptions.desc', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='items', full_name='secretflow.component.Descriptions.items', index=2, - number=3, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[_DESCRIPTIONS_ITEM, ], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=107, - serialized_end=303, -) - - -_TABLE_HEADERITEM = _descriptor.Descriptor( - name='HeaderItem', - full_name='secretflow.component.Table.HeaderItem', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='name', full_name='secretflow.component.Table.HeaderItem.name', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='desc', full_name='secretflow.component.Table.HeaderItem.desc', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='type', full_name='secretflow.component.Table.HeaderItem.type', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=447, - serialized_end=501, -) - -_TABLE_ROW = _descriptor.Descriptor( - name='Row', - full_name='secretflow.component.Table.Row', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='name', full_name='secretflow.component.Table.Row.name', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='desc', full_name='secretflow.component.Table.Row.desc', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='items', full_name='secretflow.component.Table.Row.items', index=2, - number=3, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=503, - serialized_end=584, -) - -_TABLE = _descriptor.Descriptor( - name='Table', - full_name='secretflow.component.Table', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='name', full_name='secretflow.component.Table.name', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='desc', full_name='secretflow.component.Table.desc', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='headers', full_name='secretflow.component.Table.headers', index=2, - number=3, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='rows', full_name='secretflow.component.Table.rows', index=3, - number=4, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[_TABLE_HEADERITEM, _TABLE_ROW, ], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=306, - serialized_end=584, -) - - -_DIV_CHILD = _descriptor.Descriptor( - name='Child', - full_name='secretflow.component.Div.Child', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='type', full_name='secretflow.component.Div.Child.type', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='descriptions', full_name='secretflow.component.Div.Child.descriptions', index=1, - number=2, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='table', full_name='secretflow.component.Div.Child.table', index=2, - number=3, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='div', full_name='secretflow.component.Div.Child.div', index=3, - number=4, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=674, - serialized_end=837, -) - -_DIV = _descriptor.Descriptor( - name='Div', - full_name='secretflow.component.Div', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='name', full_name='secretflow.component.Div.name', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='desc', full_name='secretflow.component.Div.desc', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='children', full_name='secretflow.component.Div.children', index=2, - number=3, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[_DIV_CHILD, ], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=587, - serialized_end=837, -) - - -_TAB = _descriptor.Descriptor( - name='Tab', - full_name='secretflow.component.Tab', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='name', full_name='secretflow.component.Tab.name', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='desc', full_name='secretflow.component.Tab.desc', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='divs', full_name='secretflow.component.Tab.divs', index=2, - number=3, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=839, - serialized_end=913, -) - - -_REPORT = _descriptor.Descriptor( - name='Report', - full_name='secretflow.component.Report', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='name', full_name='secretflow.component.Report.name', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='desc', full_name='secretflow.component.Report.desc', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='tabs', full_name='secretflow.component.Report.tabs', index=2, - number=3, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='err_code', full_name='secretflow.component.Report.err_code', index=3, - number=4, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='err_detail', full_name='secretflow.component.Report.err_detail', index=4, - number=5, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=915, - serialized_end=1030, -) - -_DESCRIPTIONS_ITEM.fields_by_name['value'].message_type = secretflow_dot_protos_dot_component_dot_comp__pb2._ATTRIBUTE -_DESCRIPTIONS_ITEM.containing_type = _DESCRIPTIONS -_DESCRIPTIONS.fields_by_name['items'].message_type = _DESCRIPTIONS_ITEM -_TABLE_HEADERITEM.containing_type = _TABLE -_TABLE_ROW.fields_by_name['items'].message_type = secretflow_dot_protos_dot_component_dot_comp__pb2._ATTRIBUTE -_TABLE_ROW.containing_type = _TABLE -_TABLE.fields_by_name['headers'].message_type = _TABLE_HEADERITEM -_TABLE.fields_by_name['rows'].message_type = _TABLE_ROW -_DIV_CHILD.fields_by_name['descriptions'].message_type = _DESCRIPTIONS -_DIV_CHILD.fields_by_name['table'].message_type = _TABLE -_DIV_CHILD.fields_by_name['div'].message_type = _DIV -_DIV_CHILD.containing_type = _DIV -_DIV.fields_by_name['children'].message_type = _DIV_CHILD -_TAB.fields_by_name['divs'].message_type = _DIV -_REPORT.fields_by_name['tabs'].message_type = _TAB -DESCRIPTOR.message_types_by_name['Descriptions'] = _DESCRIPTIONS -DESCRIPTOR.message_types_by_name['Table'] = _TABLE -DESCRIPTOR.message_types_by_name['Div'] = _DIV -DESCRIPTOR.message_types_by_name['Tab'] = _TAB -DESCRIPTOR.message_types_by_name['Report'] = _REPORT -_sym_db.RegisterFileDescriptor(DESCRIPTOR) - -Descriptions = _reflection.GeneratedProtocolMessageType('Descriptions', (_message.Message,), { - - 'Item' : _reflection.GeneratedProtocolMessageType('Item', (_message.Message,), { - 'DESCRIPTOR' : _DESCRIPTIONS_ITEM, - '__module__' : 'secretflow.protos.component.report_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.Descriptions.Item) - }) - , - 'DESCRIPTOR' : _DESCRIPTIONS, - '__module__' : 'secretflow.protos.component.report_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.Descriptions) - }) -_sym_db.RegisterMessage(Descriptions) -_sym_db.RegisterMessage(Descriptions.Item) - -Table = _reflection.GeneratedProtocolMessageType('Table', (_message.Message,), { - - 'HeaderItem' : _reflection.GeneratedProtocolMessageType('HeaderItem', (_message.Message,), { - 'DESCRIPTOR' : _TABLE_HEADERITEM, - '__module__' : 'secretflow.protos.component.report_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.Table.HeaderItem) - }) - , - - 'Row' : _reflection.GeneratedProtocolMessageType('Row', (_message.Message,), { - 'DESCRIPTOR' : _TABLE_ROW, - '__module__' : 'secretflow.protos.component.report_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.Table.Row) - }) - , - 'DESCRIPTOR' : _TABLE, - '__module__' : 'secretflow.protos.component.report_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.Table) - }) -_sym_db.RegisterMessage(Table) -_sym_db.RegisterMessage(Table.HeaderItem) -_sym_db.RegisterMessage(Table.Row) - -Div = _reflection.GeneratedProtocolMessageType('Div', (_message.Message,), { - - 'Child' : _reflection.GeneratedProtocolMessageType('Child', (_message.Message,), { - 'DESCRIPTOR' : _DIV_CHILD, - '__module__' : 'secretflow.protos.component.report_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.Div.Child) - }) - , - 'DESCRIPTOR' : _DIV, - '__module__' : 'secretflow.protos.component.report_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.Div) - }) -_sym_db.RegisterMessage(Div) -_sym_db.RegisterMessage(Div.Child) - -Tab = _reflection.GeneratedProtocolMessageType('Tab', (_message.Message,), { - 'DESCRIPTOR' : _TAB, - '__module__' : 'secretflow.protos.component.report_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.Tab) - }) -_sym_db.RegisterMessage(Tab) - -Report = _reflection.GeneratedProtocolMessageType('Report', (_message.Message,), { - 'DESCRIPTOR' : _REPORT, - '__module__' : 'secretflow.protos.component.report_pb2' - # @@protoc_insertion_point(class_scope:secretflow.component.Report) - }) -_sym_db.RegisterMessage(Report) - - -DESCRIPTOR._options = None -# @@protoc_insertion_point(module_scope) diff --git a/secretflow/protos/component/cluster.proto b/secretflow/protos/secretflow/spec/extend/cluster.proto similarity index 87% rename from secretflow/protos/component/cluster.proto rename to secretflow/protos/secretflow/spec/extend/cluster.proto index a01cc3136..035353e1c 100644 --- a/secretflow/protos/component/cluster.proto +++ b/secretflow/protos/secretflow/spec/extend/cluster.proto @@ -14,9 +14,9 @@ syntax = "proto3"; -package secretflow.component; +package secretflow.spec.extend; -option java_package = "org.secretflow.proto.component"; +option java_package = "org.secretflow.spec.extend"; // Intrinsic properties of a SecretFlow cluster, including: // @@ -57,6 +57,11 @@ message SFClusterDesc { // } // } // ``` + // Referrences: + // SPU: + // https://www.secretflow.org.cn/docs/spu/latest/en-US/reference/runtime_config#runtimeconfig + // HEU: + // https://www.secretflow.org.cn/docs/secretflow/latest/en-US/source/secretflow.device.device.device#secretflow.device.device.heu.HEU.__init__ message DeviceDesc { // Name of the device. string name = 1; @@ -84,21 +89,6 @@ message SFClusterDesc { RayFedConfig ray_fed_config = 5; } -// A StorageConfig specifies the root for all data for one party. -// - At this moment, only local_fs is supported. We would support OSS, database -// in future. -message StorageConfig { - // Supported: local_fs. - string type = 1; - - message LocalFSConfig { - // Working directory. - string wd = 1; - } - // local_fs config. - LocalFSConfig local_fs = 2; -} - // Runtime Config for a SecretFlow cluster. // Besides intrinsic SFClusterDesc, dynamic network configs are provided. message SFClusterConfig { @@ -136,8 +126,6 @@ message SFClusterConfig { string self_party = 1; string ray_head_addr = 2; - - StorageConfig storage_config = 3; } // Intrinsic properties. diff --git a/secretflow/protos/secretflow/spec/extend/data.proto b/secretflow/protos/secretflow/spec/extend/data.proto new file mode 100644 index 000000000..1bfa5219a --- /dev/null +++ b/secretflow/protos/secretflow/spec/extend/data.proto @@ -0,0 +1,34 @@ +// Copyright 2023 Ant Group Co., Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package secretflow.spec.extend; + +option java_package = "org.secretflow.spec.extend"; + +// Descibes public storage info for a collection of Device Objects. +message DeviceObjectCollection { + message DeviceObject { + // Supported: `spu \| pyu` + string type = 1; + // Index of data_ref in the parent DistData message. + repeated int32 data_ref_idxs = 2; + } + + repeated DeviceObject objs = 1; + + // Any public information. + string public_info = 2; +} diff --git a/secretflow/component/feature/__init__.py b/secretflow/spec/__init__.py similarity index 100% rename from secretflow/component/feature/__init__.py rename to secretflow/spec/__init__.py diff --git a/secretflow/data/partition/__init__.py b/secretflow/spec/extend/__init__.py similarity index 100% rename from secretflow/data/partition/__init__.py rename to secretflow/spec/extend/__init__.py diff --git a/secretflow/spec/extend/cluster_pb2.py b/secretflow/spec/extend/cluster_pb2.py new file mode 100644 index 000000000..c1af8ec1e --- /dev/null +++ b/secretflow/spec/extend/cluster_pb2.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: secretflow/spec/extend/cluster.proto +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n$secretflow/spec/extend/cluster.proto\x12\x16secretflow.spec.extend\"\xd3\x02\n\rSFClusterDesc\x12\x12\n\nsf_version\x18\x01 \x01(\t\x12\x12\n\npy_version\x18\x02 \x01(\t\x12\x0f\n\x07parties\x18\x03 \x03(\t\x12\x41\n\x07\x64\x65vices\x18\x04 \x03(\x0b\x32\x30.secretflow.spec.extend.SFClusterDesc.DeviceDesc\x12J\n\x0eray_fed_config\x18\x05 \x01(\x0b\x32\x32.secretflow.spec.extend.SFClusterDesc.RayFedConfig\x1aI\n\nDeviceDesc\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\x12\x0f\n\x07parties\x18\x03 \x03(\t\x12\x0e\n\x06\x63onfig\x18\x04 \x01(\t\x1a/\n\x0cRayFedConfig\x12\x1f\n\x17\x63ross_silo_comm_backend\x18\x01 \x01(\t\"\xec\x04\n\x0fSFClusterConfig\x12\x33\n\x04\x64\x65sc\x18\x01 \x01(\x0b\x32%.secretflow.spec.extend.SFClusterDesc\x12K\n\rpublic_config\x18\x02 \x01(\x0b\x32\x34.secretflow.spec.extend.SFClusterConfig.PublicConfig\x12M\n\x0eprivate_config\x18\x03 \x01(\x0b\x32\x35.secretflow.spec.extend.SFClusterConfig.PrivateConfig\x1aL\n\x0cRayFedConfig\x12\x0f\n\x07parties\x18\x01 \x03(\t\x12\x11\n\taddresses\x18\x02 \x03(\t\x12\x18\n\x10listen_addresses\x18\x03 \x03(\t\x1aW\n\tSPUConfig\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0f\n\x07parties\x18\x02 \x03(\t\x12\x11\n\taddresses\x18\x03 \x03(\t\x12\x18\n\x10listen_addresses\x18\x04 \x03(\t\x1a\xa4\x01\n\x0cPublicConfig\x12L\n\x0eray_fed_config\x18\x01 \x01(\x0b\x32\x34.secretflow.spec.extend.SFClusterConfig.RayFedConfig\x12\x46\n\x0bspu_configs\x18\x02 \x03(\x0b\x32\x31.secretflow.spec.extend.SFClusterConfig.SPUConfig\x1a:\n\rPrivateConfig\x12\x12\n\nself_party\x18\x01 \x01(\t\x12\x15\n\rray_head_addr\x18\x02 \x01(\tB\x1c\n\x1aorg.secretflow.spec.extendb\x06proto3') + + + +_SFCLUSTERDESC = DESCRIPTOR.message_types_by_name['SFClusterDesc'] +_SFCLUSTERDESC_DEVICEDESC = _SFCLUSTERDESC.nested_types_by_name['DeviceDesc'] +_SFCLUSTERDESC_RAYFEDCONFIG = _SFCLUSTERDESC.nested_types_by_name['RayFedConfig'] +_SFCLUSTERCONFIG = DESCRIPTOR.message_types_by_name['SFClusterConfig'] +_SFCLUSTERCONFIG_RAYFEDCONFIG = _SFCLUSTERCONFIG.nested_types_by_name['RayFedConfig'] +_SFCLUSTERCONFIG_SPUCONFIG = _SFCLUSTERCONFIG.nested_types_by_name['SPUConfig'] +_SFCLUSTERCONFIG_PUBLICCONFIG = _SFCLUSTERCONFIG.nested_types_by_name['PublicConfig'] +_SFCLUSTERCONFIG_PRIVATECONFIG = _SFCLUSTERCONFIG.nested_types_by_name['PrivateConfig'] +SFClusterDesc = _reflection.GeneratedProtocolMessageType('SFClusterDesc', (_message.Message,), { + + 'DeviceDesc' : _reflection.GeneratedProtocolMessageType('DeviceDesc', (_message.Message,), { + 'DESCRIPTOR' : _SFCLUSTERDESC_DEVICEDESC, + '__module__' : 'secretflow.spec.extend.cluster_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.extend.SFClusterDesc.DeviceDesc) + }) + , + + 'RayFedConfig' : _reflection.GeneratedProtocolMessageType('RayFedConfig', (_message.Message,), { + 'DESCRIPTOR' : _SFCLUSTERDESC_RAYFEDCONFIG, + '__module__' : 'secretflow.spec.extend.cluster_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.extend.SFClusterDesc.RayFedConfig) + }) + , + 'DESCRIPTOR' : _SFCLUSTERDESC, + '__module__' : 'secretflow.spec.extend.cluster_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.extend.SFClusterDesc) + }) +_sym_db.RegisterMessage(SFClusterDesc) +_sym_db.RegisterMessage(SFClusterDesc.DeviceDesc) +_sym_db.RegisterMessage(SFClusterDesc.RayFedConfig) + +SFClusterConfig = _reflection.GeneratedProtocolMessageType('SFClusterConfig', (_message.Message,), { + + 'RayFedConfig' : _reflection.GeneratedProtocolMessageType('RayFedConfig', (_message.Message,), { + 'DESCRIPTOR' : _SFCLUSTERCONFIG_RAYFEDCONFIG, + '__module__' : 'secretflow.spec.extend.cluster_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.extend.SFClusterConfig.RayFedConfig) + }) + , + + 'SPUConfig' : _reflection.GeneratedProtocolMessageType('SPUConfig', (_message.Message,), { + 'DESCRIPTOR' : _SFCLUSTERCONFIG_SPUCONFIG, + '__module__' : 'secretflow.spec.extend.cluster_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.extend.SFClusterConfig.SPUConfig) + }) + , + + 'PublicConfig' : _reflection.GeneratedProtocolMessageType('PublicConfig', (_message.Message,), { + 'DESCRIPTOR' : _SFCLUSTERCONFIG_PUBLICCONFIG, + '__module__' : 'secretflow.spec.extend.cluster_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.extend.SFClusterConfig.PublicConfig) + }) + , + + 'PrivateConfig' : _reflection.GeneratedProtocolMessageType('PrivateConfig', (_message.Message,), { + 'DESCRIPTOR' : _SFCLUSTERCONFIG_PRIVATECONFIG, + '__module__' : 'secretflow.spec.extend.cluster_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.extend.SFClusterConfig.PrivateConfig) + }) + , + 'DESCRIPTOR' : _SFCLUSTERCONFIG, + '__module__' : 'secretflow.spec.extend.cluster_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.extend.SFClusterConfig) + }) +_sym_db.RegisterMessage(SFClusterConfig) +_sym_db.RegisterMessage(SFClusterConfig.RayFedConfig) +_sym_db.RegisterMessage(SFClusterConfig.SPUConfig) +_sym_db.RegisterMessage(SFClusterConfig.PublicConfig) +_sym_db.RegisterMessage(SFClusterConfig.PrivateConfig) + +if _descriptor._USE_C_DESCRIPTORS == False: + + DESCRIPTOR._options = None + DESCRIPTOR._serialized_options = b'\n\032org.secretflow.spec.extend' + _SFCLUSTERDESC._serialized_start=65 + _SFCLUSTERDESC._serialized_end=404 + _SFCLUSTERDESC_DEVICEDESC._serialized_start=282 + _SFCLUSTERDESC_DEVICEDESC._serialized_end=355 + _SFCLUSTERDESC_RAYFEDCONFIG._serialized_start=357 + _SFCLUSTERDESC_RAYFEDCONFIG._serialized_end=404 + _SFCLUSTERCONFIG._serialized_start=407 + _SFCLUSTERCONFIG._serialized_end=1027 + _SFCLUSTERCONFIG_RAYFEDCONFIG._serialized_start=635 + _SFCLUSTERCONFIG_RAYFEDCONFIG._serialized_end=711 + _SFCLUSTERCONFIG_SPUCONFIG._serialized_start=713 + _SFCLUSTERCONFIG_SPUCONFIG._serialized_end=800 + _SFCLUSTERCONFIG_PUBLICCONFIG._serialized_start=803 + _SFCLUSTERCONFIG_PUBLICCONFIG._serialized_end=967 + _SFCLUSTERCONFIG_PRIVATECONFIG._serialized_start=969 + _SFCLUSTERCONFIG_PRIVATECONFIG._serialized_end=1027 +# @@protoc_insertion_point(module_scope) diff --git a/secretflow/spec/extend/data_pb2.py b/secretflow/spec/extend/data_pb2.py new file mode 100644 index 000000000..43692fa69 --- /dev/null +++ b/secretflow/spec/extend/data_pb2.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: secretflow/spec/extend/data.proto +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n!secretflow/spec/extend/data.proto\x12\x16secretflow.spec.extend\"\xad\x01\n\x16\x44\x65viceObjectCollection\x12I\n\x04objs\x18\x01 \x03(\x0b\x32;.secretflow.spec.extend.DeviceObjectCollection.DeviceObject\x12\x13\n\x0bpublic_info\x18\x02 \x01(\t\x1a\x33\n\x0c\x44\x65viceObject\x12\x0c\n\x04type\x18\x01 \x01(\t\x12\x15\n\rdata_ref_idxs\x18\x02 \x03(\x05\x42\x1c\n\x1aorg.secretflow.spec.extendb\x06proto3') + + + +_DEVICEOBJECTCOLLECTION = DESCRIPTOR.message_types_by_name['DeviceObjectCollection'] +_DEVICEOBJECTCOLLECTION_DEVICEOBJECT = _DEVICEOBJECTCOLLECTION.nested_types_by_name['DeviceObject'] +DeviceObjectCollection = _reflection.GeneratedProtocolMessageType('DeviceObjectCollection', (_message.Message,), { + + 'DeviceObject' : _reflection.GeneratedProtocolMessageType('DeviceObject', (_message.Message,), { + 'DESCRIPTOR' : _DEVICEOBJECTCOLLECTION_DEVICEOBJECT, + '__module__' : 'secretflow.spec.extend.data_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.extend.DeviceObjectCollection.DeviceObject) + }) + , + 'DESCRIPTOR' : _DEVICEOBJECTCOLLECTION, + '__module__' : 'secretflow.spec.extend.data_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.extend.DeviceObjectCollection) + }) +_sym_db.RegisterMessage(DeviceObjectCollection) +_sym_db.RegisterMessage(DeviceObjectCollection.DeviceObject) + +if _descriptor._USE_C_DESCRIPTORS == False: + + DESCRIPTOR._options = None + DESCRIPTOR._serialized_options = b'\n\032org.secretflow.spec.extend' + _DEVICEOBJECTCOLLECTION._serialized_start=62 + _DEVICEOBJECTCOLLECTION._serialized_end=235 + _DEVICEOBJECTCOLLECTION_DEVICEOBJECT._serialized_start=184 + _DEVICEOBJECTCOLLECTION_DEVICEOBJECT._serialized_end=235 +# @@protoc_insertion_point(module_scope) diff --git a/secretflow/protos/__init__.py b/secretflow/spec/v1/__init__.py similarity index 100% rename from secretflow/protos/__init__.py rename to secretflow/spec/v1/__init__.py diff --git a/secretflow/spec/v1/component_pb2.py b/secretflow/spec/v1/component_pb2.py new file mode 100644 index 000000000..020483139 --- /dev/null +++ b/secretflow/spec/v1/component_pb2.py @@ -0,0 +1,125 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: secretflow/spec/v1/component.proto +"""Generated protocol buffer code.""" +from google.protobuf.internal import enum_type_wrapper +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\"secretflow/spec/v1/component.proto\x12\x12secretflow.spec.v1\"z\n\tAttribute\x12\t\n\x01\x66\x18\x01 \x01(\x02\x12\x0b\n\x03i64\x18\x02 \x01(\x03\x12\t\n\x01s\x18\x03 \x01(\t\x12\t\n\x01\x62\x18\x04 \x01(\x08\x12\n\n\x02\x66s\x18\x05 \x03(\x02\x12\x0c\n\x04i64s\x18\x06 \x03(\x03\x12\n\n\x02ss\x18\x07 \x03(\t\x12\n\n\x02\x62s\x18\x08 \x03(\x08\x12\r\n\x05is_na\x18\t \x01(\x08\"\xd9\x05\n\x0c\x41ttributeDef\x12\x10\n\x08prefixes\x18\x01 \x03(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04\x64\x65sc\x18\x03 \x01(\t\x12*\n\x04type\x18\x04 \x01(\x0e\x32\x1c.secretflow.spec.v1.AttrType\x12?\n\x06\x61tomic\x18\x05 \x01(\x0b\x32/.secretflow.spec.v1.AttributeDef.AtomicAttrDesc\x12\x42\n\x05union\x18\x06 \x01(\x0b\x32\x33.secretflow.spec.v1.AttributeDef.UnionAttrGroupDesc\x1a\xb8\x03\n\x0e\x41tomicAttrDesc\x12!\n\x19list_min_length_inclusive\x18\x01 \x01(\x03\x12!\n\x19list_max_length_inclusive\x18\x02 \x01(\x03\x12\x13\n\x0bis_optional\x18\x03 \x01(\x08\x12\x34\n\rdefault_value\x18\x04 \x01(\x0b\x32\x1d.secretflow.spec.v1.Attribute\x12\x35\n\x0e\x61llowed_values\x18\x05 \x01(\x0b\x32\x1d.secretflow.spec.v1.Attribute\x12\x1b\n\x13lower_bound_enabled\x18\x06 \x01(\x08\x12\x32\n\x0blower_bound\x18\x07 \x01(\x0b\x32\x1d.secretflow.spec.v1.Attribute\x12\x1d\n\x15lower_bound_inclusive\x18\x08 \x01(\x08\x12\x1b\n\x13upper_bound_enabled\x18\t \x01(\x08\x12\x32\n\x0bupper_bound\x18\n \x01(\x0b\x32\x1d.secretflow.spec.v1.Attribute\x12\x1d\n\x15upper_bound_inclusive\x18\x0b \x01(\x08\x1a/\n\x12UnionAttrGroupDesc\x12\x19\n\x11\x64\x65\x66\x61ult_selection\x18\x01 \x01(\t\"\x9a\x02\n\x05IoDef\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04\x64\x65sc\x18\x02 \x01(\t\x12\r\n\x05types\x18\x03 \x03(\t\x12\x35\n\x05\x61ttrs\x18\x04 \x03(\x0b\x32&.secretflow.spec.v1.IoDef.TableAttrDef\x1a\xae\x01\n\x0cTableAttrDef\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04\x64\x65sc\x18\x02 \x01(\t\x12\r\n\x05types\x18\x03 \x03(\t\x12\x1d\n\x15\x63ol_min_cnt_inclusive\x18\x04 \x01(\x03\x12\x1d\n\x15\x63ol_max_cnt_inclusive\x18\x05 \x01(\x03\x12\x35\n\x0b\x65xtra_attrs\x18\x06 \x03(\x0b\x32 .secretflow.spec.v1.AttributeDef\"\xd3\x01\n\x0c\x43omponentDef\x12\x0e\n\x06\x64omain\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0c\n\x04\x64\x65sc\x18\x03 \x01(\t\x12\x0f\n\x07version\x18\x04 \x01(\t\x12/\n\x05\x61ttrs\x18\x05 \x03(\x0b\x32 .secretflow.spec.v1.AttributeDef\x12)\n\x06inputs\x18\x06 \x03(\x0b\x32\x19.secretflow.spec.v1.IoDef\x12*\n\x07outputs\x18\x07 \x03(\x0b\x32\x19.secretflow.spec.v1.IoDef\"k\n\x0b\x43ompListDef\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04\x64\x65sc\x18\x02 \x01(\t\x12\x0f\n\x07version\x18\x03 \x01(\t\x12/\n\x05\x63omps\x18\x04 \x03(\x0b\x32 .secretflow.spec.v1.ComponentDef*\xd3\x01\n\x08\x41ttrType\x12\x19\n\x15\x41TTR_TYPE_UNSPECIFIED\x10\x00\x12\x0c\n\x08\x41T_FLOAT\x10\x01\x12\n\n\x06\x41T_INT\x10\x02\x12\r\n\tAT_STRING\x10\x03\x12\x0b\n\x07\x41T_BOOL\x10\x04\x12\r\n\tAT_FLOATS\x10\x05\x12\x0b\n\x07\x41T_INTS\x10\x06\x12\x0e\n\nAT_STRINGS\x10\x07\x12\x0c\n\x08\x41T_BOOLS\x10\x08\x12\x13\n\x0f\x41T_STRUCT_GROUP\x10\t\x12\x12\n\x0e\x41T_UNION_GROUP\x10\n\x12\x13\n\x0f\x41T_SF_TABLE_COL\x10\x0b\x42*\n\x16\x63om.secretflow.spec.v1B\x0e\x43omponentProtoP\x01\x62\x06proto3') + +_ATTRTYPE = DESCRIPTOR.enum_types_by_name['AttrType'] +AttrType = enum_type_wrapper.EnumTypeWrapper(_ATTRTYPE) +ATTR_TYPE_UNSPECIFIED = 0 +AT_FLOAT = 1 +AT_INT = 2 +AT_STRING = 3 +AT_BOOL = 4 +AT_FLOATS = 5 +AT_INTS = 6 +AT_STRINGS = 7 +AT_BOOLS = 8 +AT_STRUCT_GROUP = 9 +AT_UNION_GROUP = 10 +AT_SF_TABLE_COL = 11 + + +_ATTRIBUTE = DESCRIPTOR.message_types_by_name['Attribute'] +_ATTRIBUTEDEF = DESCRIPTOR.message_types_by_name['AttributeDef'] +_ATTRIBUTEDEF_ATOMICATTRDESC = _ATTRIBUTEDEF.nested_types_by_name['AtomicAttrDesc'] +_ATTRIBUTEDEF_UNIONATTRGROUPDESC = _ATTRIBUTEDEF.nested_types_by_name['UnionAttrGroupDesc'] +_IODEF = DESCRIPTOR.message_types_by_name['IoDef'] +_IODEF_TABLEATTRDEF = _IODEF.nested_types_by_name['TableAttrDef'] +_COMPONENTDEF = DESCRIPTOR.message_types_by_name['ComponentDef'] +_COMPLISTDEF = DESCRIPTOR.message_types_by_name['CompListDef'] +Attribute = _reflection.GeneratedProtocolMessageType('Attribute', (_message.Message,), { + 'DESCRIPTOR' : _ATTRIBUTE, + '__module__' : 'secretflow.spec.v1.component_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.v1.Attribute) + }) +_sym_db.RegisterMessage(Attribute) + +AttributeDef = _reflection.GeneratedProtocolMessageType('AttributeDef', (_message.Message,), { + + 'AtomicAttrDesc' : _reflection.GeneratedProtocolMessageType('AtomicAttrDesc', (_message.Message,), { + 'DESCRIPTOR' : _ATTRIBUTEDEF_ATOMICATTRDESC, + '__module__' : 'secretflow.spec.v1.component_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.v1.AttributeDef.AtomicAttrDesc) + }) + , + + 'UnionAttrGroupDesc' : _reflection.GeneratedProtocolMessageType('UnionAttrGroupDesc', (_message.Message,), { + 'DESCRIPTOR' : _ATTRIBUTEDEF_UNIONATTRGROUPDESC, + '__module__' : 'secretflow.spec.v1.component_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.v1.AttributeDef.UnionAttrGroupDesc) + }) + , + 'DESCRIPTOR' : _ATTRIBUTEDEF, + '__module__' : 'secretflow.spec.v1.component_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.v1.AttributeDef) + }) +_sym_db.RegisterMessage(AttributeDef) +_sym_db.RegisterMessage(AttributeDef.AtomicAttrDesc) +_sym_db.RegisterMessage(AttributeDef.UnionAttrGroupDesc) + +IoDef = _reflection.GeneratedProtocolMessageType('IoDef', (_message.Message,), { + + 'TableAttrDef' : _reflection.GeneratedProtocolMessageType('TableAttrDef', (_message.Message,), { + 'DESCRIPTOR' : _IODEF_TABLEATTRDEF, + '__module__' : 'secretflow.spec.v1.component_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.v1.IoDef.TableAttrDef) + }) + , + 'DESCRIPTOR' : _IODEF, + '__module__' : 'secretflow.spec.v1.component_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.v1.IoDef) + }) +_sym_db.RegisterMessage(IoDef) +_sym_db.RegisterMessage(IoDef.TableAttrDef) + +ComponentDef = _reflection.GeneratedProtocolMessageType('ComponentDef', (_message.Message,), { + 'DESCRIPTOR' : _COMPONENTDEF, + '__module__' : 'secretflow.spec.v1.component_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.v1.ComponentDef) + }) +_sym_db.RegisterMessage(ComponentDef) + +CompListDef = _reflection.GeneratedProtocolMessageType('CompListDef', (_message.Message,), { + 'DESCRIPTOR' : _COMPLISTDEF, + '__module__' : 'secretflow.spec.v1.component_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.v1.CompListDef) + }) +_sym_db.RegisterMessage(CompListDef) + +if _descriptor._USE_C_DESCRIPTORS == False: + + DESCRIPTOR._options = None + DESCRIPTOR._serialized_options = b'\n\026com.secretflow.spec.v1B\016ComponentProtoP\001' + _ATTRTYPE._serialized_start=1523 + _ATTRTYPE._serialized_end=1734 + _ATTRIBUTE._serialized_start=58 + _ATTRIBUTE._serialized_end=180 + _ATTRIBUTEDEF._serialized_start=183 + _ATTRIBUTEDEF._serialized_end=912 + _ATTRIBUTEDEF_ATOMICATTRDESC._serialized_start=423 + _ATTRIBUTEDEF_ATOMICATTRDESC._serialized_end=863 + _ATTRIBUTEDEF_UNIONATTRGROUPDESC._serialized_start=865 + _ATTRIBUTEDEF_UNIONATTRGROUPDESC._serialized_end=912 + _IODEF._serialized_start=915 + _IODEF._serialized_end=1197 + _IODEF_TABLEATTRDEF._serialized_start=1023 + _IODEF_TABLEATTRDEF._serialized_end=1197 + _COMPONENTDEF._serialized_start=1200 + _COMPONENTDEF._serialized_end=1411 + _COMPLISTDEF._serialized_start=1413 + _COMPLISTDEF._serialized_end=1520 +# @@protoc_insertion_point(module_scope) diff --git a/secretflow/spec/v1/data_pb2.py b/secretflow/spec/v1/data_pb2.py new file mode 100644 index 000000000..816907e34 --- /dev/null +++ b/secretflow/spec/v1/data_pb2.py @@ -0,0 +1,108 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: secretflow/spec/v1/data.proto +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from google.protobuf import any_pb2 as google_dot_protobuf_dot_any__pb2 + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x1dsecretflow/spec/v1/data.proto\x12\x12secretflow.spec.v1\x1a\x19google/protobuf/any.proto\"A\n\nSystemInfo\x12\x0b\n\x03\x61pp\x18\x01 \x01(\t\x12&\n\x08\x61pp_meta\x18\x02 \x01(\x0b\x32\x14.google.protobuf.Any\"}\n\rStorageConfig\x12\x0c\n\x04type\x18\x01 \x01(\t\x12\x41\n\x08local_fs\x18\x02 \x01(\x0b\x32/.secretflow.spec.v1.StorageConfig.LocalFSConfig\x1a\x1b\n\rLocalFSConfig\x12\n\n\x02wd\x18\x01 \x01(\t\"\xef\x01\n\x08\x44istData\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\x12\x33\n\x0bsystem_info\x18\x03 \x01(\x0b\x32\x1e.secretflow.spec.v1.SystemInfo\x12\"\n\x04meta\x18\x04 \x01(\x0b\x32\x14.google.protobuf.Any\x12\x37\n\tdata_refs\x18\x05 \x03(\x0b\x32$.secretflow.spec.v1.DistData.DataRef\x1a\x35\n\x07\x44\x61taRef\x12\x0b\n\x03uri\x18\x01 \x01(\t\x12\r\n\x05party\x18\x02 \x01(\t\x12\x0e\n\x06\x66ormat\x18\x03 \x01(\t\"U\n\rVerticalTable\x12\x30\n\x07schemas\x18\x01 \x03(\x0b\x32\x1f.secretflow.spec.v1.TableSchema\x12\x12\n\nline_count\x18\x02 \x01(\x03\"V\n\x0fIndividualTable\x12/\n\x06schema\x18\x01 \x01(\x0b\x32\x1f.secretflow.spec.v1.TableSchema\x12\x12\n\nline_count\x18\x02 \x01(\x03\"z\n\x0bTableSchema\x12\x0b\n\x03ids\x18\x01 \x03(\t\x12\x10\n\x08\x66\x65\x61tures\x18\x02 \x03(\t\x12\x0e\n\x06labels\x18\x03 \x03(\t\x12\x10\n\x08id_types\x18\x04 \x03(\t\x12\x15\n\rfeature_types\x18\x05 \x03(\t\x12\x13\n\x0blabel_types\x18\x06 \x03(\tB%\n\x16\x63om.secretflow.spec.v1B\tDataProtoP\x01\x62\x06proto3') + + + +_SYSTEMINFO = DESCRIPTOR.message_types_by_name['SystemInfo'] +_STORAGECONFIG = DESCRIPTOR.message_types_by_name['StorageConfig'] +_STORAGECONFIG_LOCALFSCONFIG = _STORAGECONFIG.nested_types_by_name['LocalFSConfig'] +_DISTDATA = DESCRIPTOR.message_types_by_name['DistData'] +_DISTDATA_DATAREF = _DISTDATA.nested_types_by_name['DataRef'] +_VERTICALTABLE = DESCRIPTOR.message_types_by_name['VerticalTable'] +_INDIVIDUALTABLE = DESCRIPTOR.message_types_by_name['IndividualTable'] +_TABLESCHEMA = DESCRIPTOR.message_types_by_name['TableSchema'] +SystemInfo = _reflection.GeneratedProtocolMessageType('SystemInfo', (_message.Message,), { + 'DESCRIPTOR' : _SYSTEMINFO, + '__module__' : 'secretflow.spec.v1.data_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.v1.SystemInfo) + }) +_sym_db.RegisterMessage(SystemInfo) + +StorageConfig = _reflection.GeneratedProtocolMessageType('StorageConfig', (_message.Message,), { + + 'LocalFSConfig' : _reflection.GeneratedProtocolMessageType('LocalFSConfig', (_message.Message,), { + 'DESCRIPTOR' : _STORAGECONFIG_LOCALFSCONFIG, + '__module__' : 'secretflow.spec.v1.data_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.v1.StorageConfig.LocalFSConfig) + }) + , + 'DESCRIPTOR' : _STORAGECONFIG, + '__module__' : 'secretflow.spec.v1.data_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.v1.StorageConfig) + }) +_sym_db.RegisterMessage(StorageConfig) +_sym_db.RegisterMessage(StorageConfig.LocalFSConfig) + +DistData = _reflection.GeneratedProtocolMessageType('DistData', (_message.Message,), { + + 'DataRef' : _reflection.GeneratedProtocolMessageType('DataRef', (_message.Message,), { + 'DESCRIPTOR' : _DISTDATA_DATAREF, + '__module__' : 'secretflow.spec.v1.data_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.v1.DistData.DataRef) + }) + , + 'DESCRIPTOR' : _DISTDATA, + '__module__' : 'secretflow.spec.v1.data_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.v1.DistData) + }) +_sym_db.RegisterMessage(DistData) +_sym_db.RegisterMessage(DistData.DataRef) + +VerticalTable = _reflection.GeneratedProtocolMessageType('VerticalTable', (_message.Message,), { + 'DESCRIPTOR' : _VERTICALTABLE, + '__module__' : 'secretflow.spec.v1.data_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.v1.VerticalTable) + }) +_sym_db.RegisterMessage(VerticalTable) + +IndividualTable = _reflection.GeneratedProtocolMessageType('IndividualTable', (_message.Message,), { + 'DESCRIPTOR' : _INDIVIDUALTABLE, + '__module__' : 'secretflow.spec.v1.data_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.v1.IndividualTable) + }) +_sym_db.RegisterMessage(IndividualTable) + +TableSchema = _reflection.GeneratedProtocolMessageType('TableSchema', (_message.Message,), { + 'DESCRIPTOR' : _TABLESCHEMA, + '__module__' : 'secretflow.spec.v1.data_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.v1.TableSchema) + }) +_sym_db.RegisterMessage(TableSchema) + +if _descriptor._USE_C_DESCRIPTORS == False: + + DESCRIPTOR._options = None + DESCRIPTOR._serialized_options = b'\n\026com.secretflow.spec.v1B\tDataProtoP\001' + _SYSTEMINFO._serialized_start=80 + _SYSTEMINFO._serialized_end=145 + _STORAGECONFIG._serialized_start=147 + _STORAGECONFIG._serialized_end=272 + _STORAGECONFIG_LOCALFSCONFIG._serialized_start=245 + _STORAGECONFIG_LOCALFSCONFIG._serialized_end=272 + _DISTDATA._serialized_start=275 + _DISTDATA._serialized_end=514 + _DISTDATA_DATAREF._serialized_start=461 + _DISTDATA_DATAREF._serialized_end=514 + _VERTICALTABLE._serialized_start=516 + _VERTICALTABLE._serialized_end=601 + _INDIVIDUALTABLE._serialized_start=603 + _INDIVIDUALTABLE._serialized_end=689 + _TABLESCHEMA._serialized_start=691 + _TABLESCHEMA._serialized_end=813 +# @@protoc_insertion_point(module_scope) diff --git a/secretflow/spec/v1/evaluation_pb2.py b/secretflow/spec/v1/evaluation_pb2.py new file mode 100644 index 000000000..a45e9a6d6 --- /dev/null +++ b/secretflow/spec/v1/evaluation_pb2.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: secretflow/spec/v1/evaluation.proto +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from secretflow.spec.v1 import component_pb2 as secretflow_dot_spec_dot_v1_dot_component__pb2 +from secretflow.spec.v1 import data_pb2 as secretflow_dot_spec_dot_v1_dot_data__pb2 + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n#secretflow/spec/v1/evaluation.proto\x12\x12secretflow.spec.v1\x1a\"secretflow/spec/v1/component.proto\x1a\x1dsecretflow/spec/v1/data.proto\"\xc3\x01\n\rNodeEvalParam\x12\x0e\n\x06\x64omain\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0f\n\x07version\x18\x03 \x01(\t\x12\x12\n\nattr_paths\x18\x04 \x03(\t\x12,\n\x05\x61ttrs\x18\x05 \x03(\x0b\x32\x1d.secretflow.spec.v1.Attribute\x12,\n\x06inputs\x18\x06 \x03(\x0b\x32\x1c.secretflow.spec.v1.DistData\x12\x13\n\x0boutput_uris\x18\x07 \x03(\t\"?\n\x0eNodeEvalResult\x12-\n\x07outputs\x18\x01 \x03(\x0b\x32\x1c.secretflow.spec.v1.DistDataB+\n\x16\x63om.secretflow.spec.v1B\x0f\x45valuationProtoP\x01\x62\x06proto3') + + + +_NODEEVALPARAM = DESCRIPTOR.message_types_by_name['NodeEvalParam'] +_NODEEVALRESULT = DESCRIPTOR.message_types_by_name['NodeEvalResult'] +NodeEvalParam = _reflection.GeneratedProtocolMessageType('NodeEvalParam', (_message.Message,), { + 'DESCRIPTOR' : _NODEEVALPARAM, + '__module__' : 'secretflow.spec.v1.evaluation_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.v1.NodeEvalParam) + }) +_sym_db.RegisterMessage(NodeEvalParam) + +NodeEvalResult = _reflection.GeneratedProtocolMessageType('NodeEvalResult', (_message.Message,), { + 'DESCRIPTOR' : _NODEEVALRESULT, + '__module__' : 'secretflow.spec.v1.evaluation_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.v1.NodeEvalResult) + }) +_sym_db.RegisterMessage(NodeEvalResult) + +if _descriptor._USE_C_DESCRIPTORS == False: + + DESCRIPTOR._options = None + DESCRIPTOR._serialized_options = b'\n\026com.secretflow.spec.v1B\017EvaluationProtoP\001' + _NODEEVALPARAM._serialized_start=127 + _NODEEVALPARAM._serialized_end=322 + _NODEEVALRESULT._serialized_start=324 + _NODEEVALRESULT._serialized_end=387 +# @@protoc_insertion_point(module_scope) diff --git a/secretflow/spec/v1/report_pb2.py b/secretflow/spec/v1/report_pb2.py new file mode 100644 index 000000000..0741e791d --- /dev/null +++ b/secretflow/spec/v1/report_pb2.py @@ -0,0 +1,120 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: secretflow/spec/v1/report.proto +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from secretflow.spec.v1 import component_pb2 as secretflow_dot_spec_dot_v1_dot_component__pb2 + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x1fsecretflow/spec/v1/report.proto\x12\x12secretflow.spec.v1\x1a\"secretflow/spec/v1/component.proto\"\xc0\x01\n\x0c\x44\x65scriptions\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04\x64\x65sc\x18\x02 \x01(\t\x12\x34\n\x05items\x18\x03 \x03(\x0b\x32%.secretflow.spec.v1.Descriptions.Item\x1a^\n\x04Item\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04\x64\x65sc\x18\x02 \x01(\t\x12\x0c\n\x04type\x18\x03 \x01(\t\x12,\n\x05value\x18\x04 \x01(\x0b\x32\x1d.secretflow.spec.v1.Attribute\"\x90\x02\n\x05Table\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04\x64\x65sc\x18\x02 \x01(\t\x12\x35\n\x07headers\x18\x03 \x03(\x0b\x32$.secretflow.spec.v1.Table.HeaderItem\x12+\n\x04rows\x18\x04 \x03(\x0b\x32\x1d.secretflow.spec.v1.Table.Row\x1a\x36\n\nHeaderItem\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04\x64\x65sc\x18\x02 \x01(\t\x12\x0c\n\x04type\x18\x03 \x01(\t\x1aO\n\x03Row\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04\x64\x65sc\x18\x02 \x01(\t\x12,\n\x05items\x18\x03 \x03(\x0b\x32\x1d.secretflow.spec.v1.Attribute\"\xf2\x01\n\x03\x44iv\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04\x64\x65sc\x18\x02 \x01(\t\x12/\n\x08\x63hildren\x18\x03 \x03(\x0b\x32\x1d.secretflow.spec.v1.Div.Child\x1a\x9d\x01\n\x05\x43hild\x12\x0c\n\x04type\x18\x01 \x01(\t\x12\x36\n\x0c\x64\x65scriptions\x18\x02 \x01(\x0b\x32 .secretflow.spec.v1.Descriptions\x12(\n\x05table\x18\x03 \x01(\x0b\x32\x19.secretflow.spec.v1.Table\x12$\n\x03\x64iv\x18\x04 \x01(\x0b\x32\x17.secretflow.spec.v1.Div\"H\n\x03Tab\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04\x64\x65sc\x18\x02 \x01(\t\x12%\n\x04\x64ivs\x18\x03 \x03(\x0b\x32\x17.secretflow.spec.v1.Div\"q\n\x06Report\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04\x64\x65sc\x18\x02 \x01(\t\x12%\n\x04tabs\x18\x03 \x03(\x0b\x32\x17.secretflow.spec.v1.Tab\x12\x10\n\x08\x65rr_code\x18\x04 \x01(\x05\x12\x12\n\nerr_detail\x18\x05 \x01(\tB\'\n\x16\x63om.secretflow.spec.v1B\x0bReportProtoP\x01\x62\x06proto3') + + + +_DESCRIPTIONS = DESCRIPTOR.message_types_by_name['Descriptions'] +_DESCRIPTIONS_ITEM = _DESCRIPTIONS.nested_types_by_name['Item'] +_TABLE = DESCRIPTOR.message_types_by_name['Table'] +_TABLE_HEADERITEM = _TABLE.nested_types_by_name['HeaderItem'] +_TABLE_ROW = _TABLE.nested_types_by_name['Row'] +_DIV = DESCRIPTOR.message_types_by_name['Div'] +_DIV_CHILD = _DIV.nested_types_by_name['Child'] +_TAB = DESCRIPTOR.message_types_by_name['Tab'] +_REPORT = DESCRIPTOR.message_types_by_name['Report'] +Descriptions = _reflection.GeneratedProtocolMessageType('Descriptions', (_message.Message,), { + + 'Item' : _reflection.GeneratedProtocolMessageType('Item', (_message.Message,), { + 'DESCRIPTOR' : _DESCRIPTIONS_ITEM, + '__module__' : 'secretflow.spec.v1.report_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.v1.Descriptions.Item) + }) + , + 'DESCRIPTOR' : _DESCRIPTIONS, + '__module__' : 'secretflow.spec.v1.report_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.v1.Descriptions) + }) +_sym_db.RegisterMessage(Descriptions) +_sym_db.RegisterMessage(Descriptions.Item) + +Table = _reflection.GeneratedProtocolMessageType('Table', (_message.Message,), { + + 'HeaderItem' : _reflection.GeneratedProtocolMessageType('HeaderItem', (_message.Message,), { + 'DESCRIPTOR' : _TABLE_HEADERITEM, + '__module__' : 'secretflow.spec.v1.report_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.v1.Table.HeaderItem) + }) + , + + 'Row' : _reflection.GeneratedProtocolMessageType('Row', (_message.Message,), { + 'DESCRIPTOR' : _TABLE_ROW, + '__module__' : 'secretflow.spec.v1.report_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.v1.Table.Row) + }) + , + 'DESCRIPTOR' : _TABLE, + '__module__' : 'secretflow.spec.v1.report_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.v1.Table) + }) +_sym_db.RegisterMessage(Table) +_sym_db.RegisterMessage(Table.HeaderItem) +_sym_db.RegisterMessage(Table.Row) + +Div = _reflection.GeneratedProtocolMessageType('Div', (_message.Message,), { + + 'Child' : _reflection.GeneratedProtocolMessageType('Child', (_message.Message,), { + 'DESCRIPTOR' : _DIV_CHILD, + '__module__' : 'secretflow.spec.v1.report_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.v1.Div.Child) + }) + , + 'DESCRIPTOR' : _DIV, + '__module__' : 'secretflow.spec.v1.report_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.v1.Div) + }) +_sym_db.RegisterMessage(Div) +_sym_db.RegisterMessage(Div.Child) + +Tab = _reflection.GeneratedProtocolMessageType('Tab', (_message.Message,), { + 'DESCRIPTOR' : _TAB, + '__module__' : 'secretflow.spec.v1.report_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.v1.Tab) + }) +_sym_db.RegisterMessage(Tab) + +Report = _reflection.GeneratedProtocolMessageType('Report', (_message.Message,), { + 'DESCRIPTOR' : _REPORT, + '__module__' : 'secretflow.spec.v1.report_pb2' + # @@protoc_insertion_point(class_scope:secretflow.spec.v1.Report) + }) +_sym_db.RegisterMessage(Report) + +if _descriptor._USE_C_DESCRIPTORS == False: + + DESCRIPTOR._options = None + DESCRIPTOR._serialized_options = b'\n\026com.secretflow.spec.v1B\013ReportProtoP\001' + _DESCRIPTIONS._serialized_start=92 + _DESCRIPTIONS._serialized_end=284 + _DESCRIPTIONS_ITEM._serialized_start=190 + _DESCRIPTIONS_ITEM._serialized_end=284 + _TABLE._serialized_start=287 + _TABLE._serialized_end=559 + _TABLE_HEADERITEM._serialized_start=424 + _TABLE_HEADERITEM._serialized_end=478 + _TABLE_ROW._serialized_start=480 + _TABLE_ROW._serialized_end=559 + _DIV._serialized_start=562 + _DIV._serialized_end=804 + _DIV_CHILD._serialized_start=647 + _DIV_CHILD._serialized_end=804 + _TAB._serialized_start=806 + _TAB._serialized_end=878 + _REPORT._serialized_start=880 + _REPORT._serialized_end=993 +# @@protoc_insertion_point(module_scope) diff --git a/secretflow/stats/ss_pearsonr_v.py b/secretflow/stats/ss_pearsonr_v.py index 435ad164e..c11aca214 100644 --- a/secretflow/stats/ss_pearsonr_v.py +++ b/secretflow/stats/ss_pearsonr_v.py @@ -16,6 +16,7 @@ import jax.numpy as jnp import numpy as np +import pandas as pd import secretflow as sf from secretflow.data.vertical import VDataFrame @@ -49,20 +50,22 @@ def pearsonr(self, vdata: VDataFrame, standardize: bool = True): vdata : VDataFrame vertical slice dataset. standardize: bool - if need standardize dataset. dataset must be standardized + if you need standardize dataset. dataset must be standardized please keep standardize=True, unless dataset is already standardized. standardize purpose: - reduce the result number of matrix xtx, avoid overflow in secret sharing. - after standardize, the variance is 1 and the mean is 0, which can simplify the calculation. """ + if standardize: scaler = StandardScaler() vdata = scaler.fit_transform(vdata) obj_list = [d.data.to(self.spu_device) for d in vdata.partitions.values()] - rows = vdata.shape[0] def spu_xtx(objs: List[np.ndarray]): + if isinstance(objs, pd.DataFrame): + objs = objs.values data = jnp.concatenate(objs, axis=1) return jnp.matmul(data.transpose(), data) diff --git a/secretflow/utils/simulation/data/dataframe.py b/secretflow/utils/simulation/data/dataframe.py index 5a6a44a86..29caa210f 100644 --- a/secretflow/utils/simulation/data/dataframe.py +++ b/secretflow/utils/simulation/data/dataframe.py @@ -16,8 +16,8 @@ import pandas as pd +from secretflow.data import partition from secretflow.data.horizontal import HDataFrame -from secretflow.data.base import partition from secretflow.data.vertical import VDataFrame from secretflow.device import PYU from secretflow.security.aggregation.aggregator import Aggregator @@ -76,7 +76,8 @@ def create_df( assert parts, 'Parts should not be none or empty!' if isinstance(source, str): - df = pd.read_csv(source, engine="pyarrow") + # engin="pyarrow" will lead to stuck in production tests. + df = pd.read_csv(source) elif isinstance(source, pd.DataFrame): df = source elif isinstance(source, Callable): @@ -98,7 +99,7 @@ def create_df( return HDataFrame( partitions={ device: partition( - device(lambda _df: _df.iloc[index[0] : index[1], :])(df) + lambda _df: _df.iloc[index[0] : index[1], :], device=device, _df=df ) for device, index in indexes.items() }, @@ -109,7 +110,7 @@ def create_df( return VDataFrame( partitions={ device: partition( - device(lambda _df: _df.iloc[:, index[0] : index[1]])(df) + lambda _df: _df.iloc[:, index[0] : index[1]], device=device, _df=df ) for device, index in indexes.items() } diff --git a/setup.py b/setup.py index 158ad2c12..746b0be68 100644 --- a/setup.py +++ b/setup.py @@ -213,7 +213,6 @@ def long_description(): "tests.*", ) ), - setup_requires=["protobuf_distutils"], install_requires=install_requires, ext_modules=[ BazelExtension( @@ -227,11 +226,6 @@ def long_description(): dependency_links=dependency_links, options={ "bdist_wheel": {"plat_name": plat_name()}, - "generate_py_protobufs": { - "source_dir": "./secretflow/protos", - "proto_root_path": ".", - "output_dir": ".", - }, }, entry_points={ "console_scripts": [ diff --git a/submodules/spec b/submodules/spec new file mode 160000 index 000000000..25dd619f0 --- /dev/null +++ b/submodules/spec @@ -0,0 +1 @@ +Subproject commit 25dd619f07dd8ba9cb445fb610bafc6370f4702d diff --git a/tests/component/test_biclassification_eval.py b/tests/component/test_biclassification_eval.py index d4a20456b..758fb43d7 100644 --- a/tests/component/test_biclassification_eval.py +++ b/tests/component/test_biclassification_eval.py @@ -4,21 +4,21 @@ import numpy as np import pandas as pd from google.protobuf.json_format import MessageToJson +from secretflow.spec.v1.component_pb2 import Attribute +from secretflow.spec.v1.data_pb2 import ( + DistData, + IndividualTable, + TableSchema, + VerticalTable, +) +from secretflow.spec.v1.evaluation_pb2 import NodeEvalParam +from secretflow.spec.v1.report_pb2 import Report from sklearn.metrics import roc_auc_score from secretflow.component.data_utils import DistDataType from secretflow.component.ml.eval.biclassification_eval import ( biclassification_eval_comp, ) -from secretflow.protos.component.comp_pb2 import Attribute -from secretflow.protos.component.data_pb2 import ( - DistData, - IndividualTable, - TableSchema, - VerticalTable, -) -from secretflow.protos.component.evaluation_pb2 import NodeEvalParam -from secretflow.protos.component.report_pb2 import Report def test_biclassification_eval(comp_prod_sf_cluster_config): @@ -124,8 +124,9 @@ def test_biclassification_eval(comp_prod_sf_cluster_config): alice_true_path = "biclassification_eval/alice_true.csv" alice_pred_path = "biclassification_eval/alice_pred.csv" - self_party = comp_prod_sf_cluster_config.private_config.self_party - local_fs_wd = comp_prod_sf_cluster_config.private_config.storage_config.local_fs.wd + storage_config, sf_cluster_config = comp_prod_sf_cluster_config + self_party = sf_cluster_config.private_config.self_party + local_fs_wd = storage_config.local_fs.wd if self_party == "alice": os.makedirs(os.path.join(local_fs_wd, "biclassification_eval"), exist_ok=True) @@ -176,7 +177,9 @@ def test_biclassification_eval(comp_prod_sf_cluster_config): param.inputs[1].meta.Pack(meta) res = biclassification_eval_comp.eval( - param=param, cluster_config=comp_prod_sf_cluster_config + param=param, + storage_config=storage_config, + cluster_config=sf_cluster_config, ) comp_ret = Report() res.outputs[0].meta.Unpack(comp_ret) diff --git a/tests/component/test_component.py b/tests/component/test_component.py index 3392f647d..b4565e1ea 100644 --- a/tests/component/test_component.py +++ b/tests/component/test_component.py @@ -1,12 +1,12 @@ import math from google.protobuf.json_format import ParseDict +from secretflow.spec.v1.component_pb2 import AttributeDef, ComponentDef, IoDef +from secretflow.spec.v1.data_pb2 import DistData +from secretflow.spec.v1.evaluation_pb2 import NodeEvalParam from secretflow.component.component import Component, IoType, TableColParam from secretflow.component.data_utils import DistDataType -from secretflow.protos.component.comp_pb2 import AttributeDef, ComponentDef, IoDef -from secretflow.protos.component.data_pb2 import DistData -from secretflow.protos.component.evaluation_pb2 import NodeEvalParam def test_float_attr(): @@ -32,10 +32,10 @@ def test_float_attr(): "atomic": { "is_optional": True, "default_value": {"f": 0.75}, - "has_lower_bound": True, + "lower_bound_enabled": True, "lower_bound_inclusive": True, "lower_bound": {"f": 0.0}, - "has_upper_bound": True, + "upper_bound_enabled": True, "upper_bound_inclusive": True, "upper_bound": {"f": 1.0}, }, @@ -65,7 +65,7 @@ def test_int_attr(): "type": "AT_INT", "atomic": { "default_value": {"i64": 1}, - "has_lower_bound": True, + "lower_bound_enabled": True, "lower_bound_inclusive": True, "lower_bound": {"i64": 1}, }, @@ -229,10 +229,10 @@ def test_definition(): "atomic": { "is_optional": True, "default_value": {"f": 0.75}, - "has_lower_bound": True, + "lower_bound_enabled": True, "lower_bound_inclusive": True, "lower_bound": {"f": 0.0}, - "has_upper_bound": True, + "upper_bound_enabled": True, "upper_bound_inclusive": True, "upper_bound": {"f": 1.0}, }, diff --git a/tests/component/test_condition_filter.py b/tests/component/test_condition_filter.py index a2cac94a0..f637a03b6 100644 --- a/tests/component/test_condition_filter.py +++ b/tests/component/test_condition_filter.py @@ -2,12 +2,12 @@ import numpy as np import pandas as pd +from secretflow.spec.v1.component_pb2 import Attribute +from secretflow.spec.v1.data_pb2 import DistData, TableSchema, VerticalTable +from secretflow.spec.v1.evaluation_pb2 import NodeEvalParam from secretflow.component.data_utils import DistDataType, extract_distdata_info from secretflow.component.preprocessing.condition_filter import condition_filter_comp -from secretflow.protos.component.comp_pb2 import Attribute -from secretflow.protos.component.data_pb2 import DistData, TableSchema, VerticalTable -from secretflow.protos.component.evaluation_pb2 import NodeEvalParam from tests.conftest import TEST_STORAGE_ROOT @@ -17,8 +17,9 @@ def test_condition_filter(comp_prod_sf_cluster_config): train_output_path = "test_condition_filter/train.csv" test_output_path = "test_condition_filter/test.csv" - self_party = comp_prod_sf_cluster_config.private_config.self_party - local_fs_wd = comp_prod_sf_cluster_config.private_config.storage_config.local_fs.wd + storage_config, sf_cluster_config = comp_prod_sf_cluster_config + self_party = sf_cluster_config.private_config.self_party + local_fs_wd = storage_config.local_fs.wd if self_party == "alice": df_alice = pd.DataFrame( @@ -119,7 +120,11 @@ def test_condition_filter(comp_prod_sf_cluster_config): exist_ok=True, ) - res = condition_filter_comp.eval(param, comp_prod_sf_cluster_config) + res = condition_filter_comp.eval( + param=param, + storage_config=storage_config, + cluster_config=sf_cluster_config, + ) assert len(res.outputs) == 2 diff --git a/tests/component/test_eval_param_reader.py b/tests/component/test_eval_param_reader.py index ed06050b8..21bd12f62 100644 --- a/tests/component/test_eval_param_reader.py +++ b/tests/component/test_eval_param_reader.py @@ -1,4 +1,13 @@ import pytest +from secretflow.spec.v1.component_pb2 import ( + Attribute, + AttributeDef, + AttrType, + ComponentDef, + IoDef, +) +from secretflow.spec.v1.data_pb2 import DistData +from secretflow.spec.v1.evaluation_pb2 import NodeEvalParam from secretflow.component.eval_param_reader import ( EvalParamError, @@ -8,15 +17,6 @@ check_table_attr_col_cnt, check_upper_bound, ) -from secretflow.protos.component.comp_pb2 import ( - Attribute, - AttributeDef, - AttrType, - ComponentDef, - IoDef, -) -from secretflow.protos.component.data_pb2 import DistData -from secretflow.protos.component.evaluation_pb2 import NodeEvalParam def test_check_allowed_values(): @@ -55,7 +55,7 @@ def test_check_lower_bound(): d1 = AttributeDef( type=AttrType.AT_FLOAT, atomic=AttributeDef.AtomicAttrDesc( - has_lower_bound=True, lower_bound=Attribute(f=1.0) + lower_bound_enabled=True, lower_bound=Attribute(f=1.0) ), ) @@ -71,7 +71,7 @@ def test_check_lower_bound(): d2 = AttributeDef( type=AttrType.AT_FLOAT, atomic=AttributeDef.AtomicAttrDesc( - has_lower_bound=True, + lower_bound_enabled=True, lower_bound_inclusive=True, lower_bound=Attribute(f=1.0), ), @@ -84,7 +84,7 @@ def test_check_lower_bound(): d3 = AttributeDef( type=AttrType.AT_INT, atomic=AttributeDef.AtomicAttrDesc( - has_lower_bound=True, lower_bound=Attribute(i64=1) + lower_bound_enabled=True, lower_bound=Attribute(i64=1) ), ) @@ -100,7 +100,7 @@ def test_check_lower_bound(): d4 = AttributeDef( type=AttrType.AT_INT, atomic=AttributeDef.AtomicAttrDesc( - has_lower_bound=True, + lower_bound_enabled=True, lower_bound_inclusive=True, lower_bound=Attribute(i64=1), ), @@ -115,7 +115,7 @@ def test_check_upper_bound(): d1 = AttributeDef( type=AttrType.AT_FLOAT, atomic=AttributeDef.AtomicAttrDesc( - has_upper_bound=True, upper_bound=Attribute(f=1.0) + upper_bound_enabled=True, upper_bound=Attribute(f=1.0) ), ) @@ -131,7 +131,7 @@ def test_check_upper_bound(): d2 = AttributeDef( type=AttrType.AT_FLOAT, atomic=AttributeDef.AtomicAttrDesc( - has_upper_bound=True, + upper_bound_enabled=True, upper_bound_inclusive=True, upper_bound=Attribute(f=1.0), ), @@ -144,7 +144,7 @@ def test_check_upper_bound(): d3 = AttributeDef( type=AttrType.AT_INT, atomic=AttributeDef.AtomicAttrDesc( - has_upper_bound=True, upper_bound=Attribute(i64=1) + upper_bound_enabled=True, upper_bound=Attribute(i64=1) ), ) @@ -160,7 +160,7 @@ def test_check_upper_bound(): d4 = AttributeDef( type=AttrType.AT_INT, atomic=AttributeDef.AtomicAttrDesc( - has_upper_bound=True, + upper_bound_enabled=True, upper_bound_inclusive=True, upper_bound=Attribute(i64=1), ), diff --git a/tests/component/test_feature_filter.py b/tests/component/test_feature_filter.py index 324b56aa5..3d34bd0fe 100644 --- a/tests/component/test_feature_filter.py +++ b/tests/component/test_feature_filter.py @@ -1,13 +1,13 @@ import os import pandas as pd +from secretflow.spec.v1.component_pb2 import Attribute +from secretflow.spec.v1.data_pb2 import DistData, TableSchema, VerticalTable +from secretflow.spec.v1.evaluation_pb2 import NodeEvalParam from sklearn.datasets import load_breast_cancer from secretflow.component.data_utils import DistDataType from secretflow.component.preprocessing.feature_filter import feature_filter_comp -from secretflow.protos.component.comp_pb2 import Attribute -from secretflow.protos.component.data_pb2 import DistData, TableSchema, VerticalTable -from secretflow.protos.component.evaluation_pb2 import NodeEvalParam from tests.conftest import TEST_STORAGE_ROOT @@ -16,8 +16,9 @@ def test_feature_filter(comp_prod_sf_cluster_config): bob_input_path = "test_feature_filter/bob.csv" output_path = "test_feature_filter/out.csv" - self_party = comp_prod_sf_cluster_config.private_config.self_party - local_fs_wd = comp_prod_sf_cluster_config.private_config.storage_config.local_fs.wd + storage_config, sf_cluster_config = comp_prod_sf_cluster_config + self_party = sf_cluster_config.private_config.self_party + local_fs_wd = storage_config.local_fs.wd x = load_breast_cancer()["data"] if self_party == "alice": @@ -72,7 +73,11 @@ def test_feature_filter(comp_prod_sf_cluster_config): ], ) param.inputs[0].meta.Pack(meta) - res = feature_filter_comp.eval(param, comp_prod_sf_cluster_config) + res = feature_filter_comp.eval( + param=param, + storage_config=storage_config, + cluster_config=sf_cluster_config, + ) assert len(res.outputs) == 1 diff --git a/tests/component/test_i18n.py b/tests/component/test_i18n.py index c388417d2..bffa3d2eb 100644 --- a/tests/component/test_i18n.py +++ b/tests/component/test_i18n.py @@ -1,12 +1,13 @@ -from secretflow.component.entry import gen_key -from secretflow.component.i18n import ROOT, gettext -from secretflow.protos.component.comp_pb2 import ( +from secretflow.spec.v1.component_pb2 import ( AttributeDef, CompListDef, ComponentDef, IoDef, ) +from secretflow.component.entry import gen_key +from secretflow.component.i18n import ROOT, gettext + def test_gettext(): comp_list = CompListDef( @@ -28,7 +29,7 @@ def test_gettext(): IoDef.TableAttrDef( name="key", desc="key desc", - attrs=[ + extra_attrs=[ AttributeDef(name="key attr", desc="key attr desc") ], ) diff --git a/tests/component/test_prediction_bias_eval.py b/tests/component/test_prediction_bias_eval.py index cccaa55b9..c939bc528 100644 --- a/tests/component/test_prediction_bias_eval.py +++ b/tests/component/test_prediction_bias_eval.py @@ -3,18 +3,18 @@ import os import pandas as pd - -from secretflow.component.data_utils import DistDataType -from secretflow.component.ml.eval.prediction_bias_eval import prediction_bias_comp -from secretflow.protos.component.comp_pb2 import Attribute -from secretflow.protos.component.data_pb2 import ( +from secretflow.spec.v1.component_pb2 import Attribute +from secretflow.spec.v1.data_pb2 import ( DistData, IndividualTable, TableSchema, VerticalTable, ) -from secretflow.protos.component.evaluation_pb2 import NodeEvalParam -from secretflow.protos.component.report_pb2 import Report +from secretflow.spec.v1.evaluation_pb2 import NodeEvalParam +from secretflow.spec.v1.report_pb2 import Report + +from secretflow.component.data_utils import DistDataType +from secretflow.component.ml.eval.prediction_bias_eval import prediction_bias_comp def test_prediction_bias_eval(comp_prod_sf_cluster_config): @@ -32,8 +32,9 @@ def test_prediction_bias_eval(comp_prod_sf_cluster_config): alice_labels_path = "prediction_bias_eval/alice_labels.csv" alice_predict_path = "prediction_bias_eval/alice_predict.csv" - self_party = comp_prod_sf_cluster_config.private_config.self_party - local_fs_wd = comp_prod_sf_cluster_config.private_config.storage_config.local_fs.wd + storage_config, sf_cluster_config = comp_prod_sf_cluster_config + self_party = sf_cluster_config.private_config.self_party + local_fs_wd = storage_config.local_fs.wd if self_party == "alice": os.makedirs(os.path.join(local_fs_wd, "prediction_bias_eval"), exist_ok=True) @@ -89,7 +90,9 @@ def test_prediction_bias_eval(comp_prod_sf_cluster_config): param.inputs[1].meta.Pack(meta) res = prediction_bias_comp.eval( - param=param, cluster_config=comp_prod_sf_cluster_config + param=param, + storage_config=storage_config, + cluster_config=sf_cluster_config, ) comp_ret = Report() res.outputs[0].meta.Unpack(comp_ret) diff --git a/tests/component/test_psi.py b/tests/component/test_psi.py index ea243ffe2..3b090903e 100644 --- a/tests/component/test_psi.py +++ b/tests/component/test_psi.py @@ -1,17 +1,17 @@ import os import pandas as pd - -from secretflow.component.data_utils import DistDataType -from secretflow.component.preprocessing.psi import psi_comp -from secretflow.protos.component.comp_pb2 import Attribute -from secretflow.protos.component.data_pb2 import ( +from secretflow.spec.v1.component_pb2 import Attribute +from secretflow.spec.v1.data_pb2 import ( DistData, IndividualTable, TableSchema, VerticalTable, ) -from secretflow.protos.component.evaluation_pb2 import NodeEvalParam +from secretflow.spec.v1.evaluation_pb2 import NodeEvalParam + +from secretflow.component.data_utils import DistDataType +from secretflow.component.preprocessing.psi import psi_comp from tests.conftest import TEST_STORAGE_ROOT @@ -21,8 +21,9 @@ def test_psi(comp_prod_sf_cluster_config): output_path = "test_psi/psi_output.csv" output_path_2 = "test_psi/psi_output_2.csv" - self_party = comp_prod_sf_cluster_config.private_config.self_party - local_fs_wd = comp_prod_sf_cluster_config.private_config.storage_config.local_fs.wd + storage_config, sf_cluster_config = comp_prod_sf_cluster_config + self_party = sf_cluster_config.private_config.self_party + local_fs_wd = storage_config.local_fs.wd if self_party == "alice": da = pd.DataFrame( @@ -125,7 +126,7 @@ def test_psi(comp_prod_sf_cluster_config): id_types=["str"], ids=["id1"], ), - num_lines=-1, + line_count=-1, ), ) @@ -137,11 +138,15 @@ def test_psi(comp_prod_sf_cluster_config): id_types=["str"], ids=["id2"], ), - num_lines=-1, + line_count=-1, ), ) - res = psi_comp.eval(param, comp_prod_sf_cluster_config) + res = psi_comp.eval( + param=param, + storage_config=storage_config, + cluster_config=sf_cluster_config, + ) assert len(res.outputs) == 1 @@ -213,7 +218,7 @@ def test_psi(comp_prod_sf_cluster_config): id_types=["str"], ids=["id1"], ), - num_lines=-1, + line_count=-1, ), ) @@ -225,11 +230,15 @@ def test_psi(comp_prod_sf_cluster_config): id_types=["str"], ids=["id2"], ), - num_lines=-1, + line_count=-1, ), ) - res_2 = psi_comp.eval(param_2, comp_prod_sf_cluster_config) + res_2 = psi_comp.eval( + param=param_2, + storage_config=storage_config, + cluster_config=sf_cluster_config, + ) assert len(res_2.outputs) == 1 diff --git a/tests/component/test_sgb.py b/tests/component/test_sgb.py index 0dc342dee..92f9e705d 100644 --- a/tests/component/test_sgb.py +++ b/tests/component/test_sgb.py @@ -1,14 +1,14 @@ import os import pandas as pd +from secretflow.spec.v1.component_pb2 import Attribute +from secretflow.spec.v1.data_pb2 import DistData, TableSchema, VerticalTable +from secretflow.spec.v1.evaluation_pb2 import NodeEvalParam from sklearn.datasets import load_breast_cancer from sklearn.metrics import roc_auc_score from sklearn.preprocessing import StandardScaler from secretflow.component.ml.boost.sgb.sgb import sgb_predict_comp, sgb_train_comp -from secretflow.protos.component.comp_pb2 import Attribute -from secretflow.protos.component.data_pb2 import DistData, TableSchema, VerticalTable -from secretflow.protos.component.evaluation_pb2 import NodeEvalParam from tests.conftest import TEST_STORAGE_ROOT @@ -18,8 +18,9 @@ def test_sgb(comp_prod_sf_cluster_config): model_path = "test_sgb/model.sf" predict_path = "test_sgb/predict.csv" - self_party = comp_prod_sf_cluster_config.private_config.self_party - local_fs_wd = comp_prod_sf_cluster_config.private_config.storage_config.local_fs.wd + storage_config, sf_cluster_config = comp_prod_sf_cluster_config + self_party = sf_cluster_config.private_config.self_party + local_fs_wd = storage_config.local_fs.wd scaler = StandardScaler() ds = load_breast_cancer() @@ -100,7 +101,11 @@ def test_sgb(comp_prod_sf_cluster_config): ) train_param.inputs[0].meta.Pack(meta) - train_res = sgb_train_comp.eval(train_param, comp_prod_sf_cluster_config) + train_res = sgb_train_comp.eval( + param=train_param, + storage_config=storage_config, + cluster_config=sf_cluster_config, + ) predict_param = NodeEvalParam( domain="ml.predict", @@ -145,7 +150,11 @@ def test_sgb(comp_prod_sf_cluster_config): ) predict_param.inputs[1].meta.Pack(meta) - predict_res = sgb_predict_comp.eval(predict_param, comp_prod_sf_cluster_config) + predict_res = sgb_predict_comp.eval( + param=predict_param, + storage_config=storage_config, + cluster_config=sf_cluster_config, + ) assert len(predict_res.outputs) == 1 diff --git a/tests/component/test_ss_glm.py b/tests/component/test_ss_glm.py index f3e54658d..48e372600 100644 --- a/tests/component/test_ss_glm.py +++ b/tests/component/test_ss_glm.py @@ -1,14 +1,14 @@ import os import pandas as pd +from secretflow.spec.v1.component_pb2 import Attribute +from secretflow.spec.v1.data_pb2 import DistData, TableSchema, VerticalTable +from secretflow.spec.v1.evaluation_pb2 import NodeEvalParam from sklearn.datasets import load_breast_cancer from sklearn.metrics import roc_auc_score from sklearn.preprocessing import StandardScaler from secretflow.component.ml.linear.ss_glm import ss_glm_predict_comp, ss_glm_train_comp -from secretflow.protos.component.comp_pb2 import Attribute -from secretflow.protos.component.data_pb2 import DistData, TableSchema, VerticalTable -from secretflow.protos.component.evaluation_pb2 import NodeEvalParam from tests.conftest import TEST_STORAGE_ROOT @@ -18,8 +18,9 @@ def test_glm(comp_prod_sf_cluster_config): model_path = "test_glm/model.sf" predict_path = "test_glm/predict.csv" - self_party = comp_prod_sf_cluster_config.private_config.self_party - local_fs_wd = comp_prod_sf_cluster_config.private_config.storage_config.local_fs.wd + storage_config, sf_cluster_config = comp_prod_sf_cluster_config + self_party = sf_cluster_config.private_config.self_party + local_fs_wd = storage_config.local_fs.wd scaler = StandardScaler() ds = load_breast_cancer() @@ -92,7 +93,11 @@ def test_glm(comp_prod_sf_cluster_config): ) train_param.inputs[0].meta.Pack(meta) - train_res = ss_glm_train_comp.eval(train_param, comp_prod_sf_cluster_config) + train_res = ss_glm_train_comp.eval( + param=train_param, + storage_config=storage_config, + cluster_config=sf_cluster_config, + ) predict_param = NodeEvalParam( domain="ml.predict", @@ -137,7 +142,11 @@ def test_glm(comp_prod_sf_cluster_config): ) predict_param.inputs[1].meta.Pack(meta) - predict_res = ss_glm_predict_comp.eval(predict_param, comp_prod_sf_cluster_config) + predict_res = ss_glm_predict_comp.eval( + param=predict_param, + storage_config=storage_config, + cluster_config=sf_cluster_config, + ) assert len(predict_res.outputs) == 1 diff --git a/tests/component/test_ss_pearsonr.py b/tests/component/test_ss_pearsonr.py index 25f5ef49c..f7a752d1b 100644 --- a/tests/component/test_ss_pearsonr.py +++ b/tests/component/test_ss_pearsonr.py @@ -2,22 +2,23 @@ import os import pandas as pd +from secretflow.spec.v1.component_pb2 import Attribute +from secretflow.spec.v1.data_pb2 import DistData, TableSchema, VerticalTable +from secretflow.spec.v1.evaluation_pb2 import NodeEvalParam +from secretflow.spec.v1.report_pb2 import Report from sklearn.datasets import load_breast_cancer from secretflow.component.data_utils import DistDataType from secretflow.component.stats.ss_pearsonr import ss_pearsonr_comp -from secretflow.protos.component.comp_pb2 import Attribute -from secretflow.protos.component.data_pb2 import DistData, TableSchema, VerticalTable -from secretflow.protos.component.evaluation_pb2 import NodeEvalParam -from secretflow.protos.component.report_pb2 import Report def test_ss_pearsonr(comp_prod_sf_cluster_config): alice_input_path = "test_ss_pearsonr/alice.csv" bob_input_path = "test_ss_pearsonr/bob.csv" - self_party = comp_prod_sf_cluster_config.private_config.self_party - local_fs_wd = comp_prod_sf_cluster_config.private_config.storage_config.local_fs.wd + storage_config, sf_cluster_config = comp_prod_sf_cluster_config + self_party = sf_cluster_config.private_config.self_party + local_fs_wd = storage_config.local_fs.wd x = load_breast_cancer()["data"] if self_party == "alice": @@ -73,7 +74,11 @@ def test_ss_pearsonr(comp_prod_sf_cluster_config): ) param.inputs[0].meta.Pack(meta) - res = ss_pearsonr_comp.eval(param, comp_prod_sf_cluster_config) + res = ss_pearsonr_comp.eval( + param=param, + storage_config=storage_config, + cluster_config=sf_cluster_config, + ) assert len(res.outputs) == 1 diff --git a/tests/component/test_ss_pvalue.py b/tests/component/test_ss_pvalue.py index 469220eef..f6ac26b53 100644 --- a/tests/component/test_ss_pvalue.py +++ b/tests/component/test_ss_pvalue.py @@ -2,16 +2,16 @@ import os import pandas as pd +from secretflow.spec.v1.component_pb2 import Attribute +from secretflow.spec.v1.data_pb2 import DistData, TableSchema, VerticalTable +from secretflow.spec.v1.evaluation_pb2 import NodeEvalParam +from secretflow.spec.v1.report_pb2 import Report from sklearn.datasets import load_breast_cancer from sklearn.preprocessing import StandardScaler from secretflow.component.data_utils import DistDataType from secretflow.component.ml.eval.ss_pvalue import ss_pvalue_comp from secretflow.component.ml.linear.ss_sgd import ss_sgd_train_comp -from secretflow.protos.component.comp_pb2 import Attribute -from secretflow.protos.component.data_pb2 import DistData, TableSchema, VerticalTable -from secretflow.protos.component.evaluation_pb2 import NodeEvalParam -from secretflow.protos.component.report_pb2 import Report def test_ss_pvalue(comp_prod_sf_cluster_config): @@ -19,8 +19,9 @@ def test_ss_pvalue(comp_prod_sf_cluster_config): bob_input_path = "test_ss_pvalue/bob.csv" model_path = "test_ss_pvalue/model.sf" - self_party = comp_prod_sf_cluster_config.private_config.self_party - local_fs_wd = comp_prod_sf_cluster_config.private_config.storage_config.local_fs.wd + storage_config, sf_cluster_config = comp_prod_sf_cluster_config + self_party = sf_cluster_config.private_config.self_party + local_fs_wd = storage_config.local_fs.wd scaler = StandardScaler() ds = load_breast_cancer() @@ -101,7 +102,11 @@ def test_ss_pvalue(comp_prod_sf_cluster_config): ) train_param.inputs[0].meta.Pack(meta) - train_res = ss_sgd_train_comp.eval(train_param, comp_prod_sf_cluster_config) + train_res = ss_sgd_train_comp.eval( + param=train_param, + storage_config=storage_config, + cluster_config=sf_cluster_config, + ) pv_param = NodeEvalParam( domain="ml.eval", @@ -111,7 +116,11 @@ def test_ss_pvalue(comp_prod_sf_cluster_config): output_uris=["report"], ) - res = ss_pvalue_comp.eval(pv_param, comp_prod_sf_cluster_config) + res = ss_pvalue_comp.eval( + param=pv_param, + storage_config=storage_config, + cluster_config=sf_cluster_config, + ) assert len(res.outputs) == 1 diff --git a/tests/component/test_ss_sgd.py b/tests/component/test_ss_sgd.py index ee62f8682..abbc629c4 100644 --- a/tests/component/test_ss_sgd.py +++ b/tests/component/test_ss_sgd.py @@ -1,15 +1,15 @@ import os import pandas as pd +from secretflow.spec.v1.component_pb2 import Attribute +from secretflow.spec.v1.data_pb2 import DistData, TableSchema, VerticalTable +from secretflow.spec.v1.evaluation_pb2 import NodeEvalParam from sklearn.datasets import load_breast_cancer from sklearn.metrics import roc_auc_score from sklearn.preprocessing import StandardScaler from secretflow.component.data_utils import DistDataType from secretflow.component.ml.linear.ss_sgd import ss_sgd_predict_comp, ss_sgd_train_comp -from secretflow.protos.component.comp_pb2 import Attribute -from secretflow.protos.component.data_pb2 import DistData, TableSchema, VerticalTable -from secretflow.protos.component.evaluation_pb2 import NodeEvalParam from tests.conftest import TEST_STORAGE_ROOT @@ -19,8 +19,9 @@ def test_ss_sgd(comp_prod_sf_cluster_config): model_path = "test_ss_sgd/model.sf" predict_path = "test_ss_sgd/predict.csv" - self_party = comp_prod_sf_cluster_config.private_config.self_party - local_fs_wd = comp_prod_sf_cluster_config.private_config.storage_config.local_fs.wd + storage_config, sf_cluster_config = comp_prod_sf_cluster_config + self_party = sf_cluster_config.private_config.self_party + local_fs_wd = storage_config.local_fs.wd scaler = StandardScaler() ds = load_breast_cancer() @@ -101,7 +102,11 @@ def test_ss_sgd(comp_prod_sf_cluster_config): ) train_param.inputs[0].meta.Pack(meta) - train_res = ss_sgd_train_comp.eval(train_param, comp_prod_sf_cluster_config) + train_res = ss_sgd_train_comp.eval( + param=train_param, + storage_config=storage_config, + cluster_config=sf_cluster_config, + ) predict_param = NodeEvalParam( domain="ml.predict", @@ -148,7 +153,11 @@ def test_ss_sgd(comp_prod_sf_cluster_config): ) predict_param.inputs[1].meta.Pack(meta) - predict_res = ss_sgd_predict_comp.eval(predict_param, comp_prod_sf_cluster_config) + predict_res = ss_sgd_predict_comp.eval( + param=predict_param, + storage_config=storage_config, + cluster_config=sf_cluster_config, + ) assert len(predict_res.outputs) == 1 diff --git a/tests/component/test_ss_vif.py b/tests/component/test_ss_vif.py index 1e6aa2faa..1d9f9b006 100644 --- a/tests/component/test_ss_vif.py +++ b/tests/component/test_ss_vif.py @@ -2,22 +2,23 @@ import os import pandas as pd +from secretflow.spec.v1.component_pb2 import Attribute +from secretflow.spec.v1.data_pb2 import DistData, TableSchema, VerticalTable +from secretflow.spec.v1.evaluation_pb2 import NodeEvalParam +from secretflow.spec.v1.report_pb2 import Report from sklearn.datasets import load_breast_cancer from secretflow.component.data_utils import DistDataType from secretflow.component.stats.ss_vif import ss_vif_comp -from secretflow.protos.component.comp_pb2 import Attribute -from secretflow.protos.component.data_pb2 import DistData, TableSchema, VerticalTable -from secretflow.protos.component.evaluation_pb2 import NodeEvalParam -from secretflow.protos.component.report_pb2 import Report def test_ss_vif(comp_prod_sf_cluster_config): alice_input_path = "test_ss_vif/alice.csv" bob_input_path = "test_ss_vif/bob.csv" - self_party = comp_prod_sf_cluster_config.private_config.self_party - local_fs_wd = comp_prod_sf_cluster_config.private_config.storage_config.local_fs.wd + storage_config, sf_cluster_config = comp_prod_sf_cluster_config + self_party = sf_cluster_config.private_config.self_party + local_fs_wd = storage_config.local_fs.wd x = load_breast_cancer()["data"] if self_party == "alice": @@ -73,7 +74,11 @@ def test_ss_vif(comp_prod_sf_cluster_config): ) param.inputs[0].meta.Pack(meta) - res = ss_vif_comp.eval(param, comp_prod_sf_cluster_config) + res = ss_vif_comp.eval( + param=param, + storage_config=storage_config, + cluster_config=sf_cluster_config, + ) assert len(res.outputs) == 1 diff --git a/tests/component/test_ss_xgb.py b/tests/component/test_ss_xgb.py index 2c2c16892..03b6daa16 100644 --- a/tests/component/test_ss_xgb.py +++ b/tests/component/test_ss_xgb.py @@ -1,6 +1,9 @@ import os import pandas as pd +from secretflow.spec.v1.component_pb2 import Attribute +from secretflow.spec.v1.data_pb2 import DistData, TableSchema, VerticalTable +from secretflow.spec.v1.evaluation_pb2 import NodeEvalParam from sklearn.datasets import load_breast_cancer from sklearn.metrics import roc_auc_score from sklearn.preprocessing import StandardScaler @@ -9,9 +12,6 @@ ss_xgb_predict_comp, ss_xgb_train_comp, ) -from secretflow.protos.component.comp_pb2 import Attribute -from secretflow.protos.component.data_pb2 import DistData, TableSchema, VerticalTable -from secretflow.protos.component.evaluation_pb2 import NodeEvalParam from tests.conftest import TEST_STORAGE_ROOT @@ -21,8 +21,9 @@ def test_ss_xgb(comp_prod_sf_cluster_config): model_path = "test_ss_xgb/model.sf" predict_path = "test_ss_xgb/predict.csv" - self_party = comp_prod_sf_cluster_config.private_config.self_party - local_fs_wd = comp_prod_sf_cluster_config.private_config.storage_config.local_fs.wd + storage_config, sf_cluster_config = comp_prod_sf_cluster_config + self_party = sf_cluster_config.private_config.self_party + local_fs_wd = storage_config.local_fs.wd scaler = StandardScaler() ds = load_breast_cancer() @@ -101,7 +102,11 @@ def test_ss_xgb(comp_prod_sf_cluster_config): ) train_param.inputs[0].meta.Pack(meta) - train_res = ss_xgb_train_comp.eval(train_param, comp_prod_sf_cluster_config) + train_res = ss_xgb_train_comp.eval( + param=train_param, + storage_config=storage_config, + cluster_config=sf_cluster_config, + ) predict_param = NodeEvalParam( domain="ml.predict", @@ -146,7 +151,11 @@ def test_ss_xgb(comp_prod_sf_cluster_config): ) predict_param.inputs[1].meta.Pack(meta) - predict_res = ss_xgb_predict_comp.eval(predict_param, comp_prod_sf_cluster_config) + predict_res = ss_xgb_predict_comp.eval( + param=predict_param, + storage_config=storage_config, + cluster_config=sf_cluster_config, + ) assert len(predict_res.outputs) == 1 diff --git a/tests/component/test_table_statistics.py b/tests/component/test_table_statistics.py index 86e84d885..d01be9946 100644 --- a/tests/component/test_table_statistics.py +++ b/tests/component/test_table_statistics.py @@ -2,20 +2,20 @@ import os import pandas as pd +from secretflow.spec.v1.data_pb2 import ( + DistData, + IndividualTable, + TableSchema, + VerticalTable, +) +from secretflow.spec.v1.evaluation_pb2 import NodeEvalParam +from secretflow.spec.v1.report_pb2 import Report from secretflow.component.data_utils import DistDataType from secretflow.component.stats.table_statistics import ( gen_table_statistic_report, table_statistics_comp, ) -from secretflow.protos.component.data_pb2 import ( - DistData, - IndividualTable, - TableSchema, - VerticalTable, -) -from secretflow.protos.component.evaluation_pb2 import NodeEvalParam -from secretflow.protos.component.report_pb2 import Report from secretflow.stats.table_statistics import table_statistics @@ -27,8 +27,9 @@ def test_table_statistics_comp(comp_prod_sf_cluster_config): alice_input_path = "test_table_statistics/alice.csv" bob_input_path = "test_table_statistics/bob.csv" - self_party = comp_prod_sf_cluster_config.private_config.self_party - local_fs_wd = comp_prod_sf_cluster_config.private_config.storage_config.local_fs.wd + storage_config, sf_cluster_config = comp_prod_sf_cluster_config + self_party = sf_cluster_config.private_config.self_party + local_fs_wd = storage_config.local_fs.wd test_data = pd.DataFrame( {"a": [9, 6, 5, 5], "b": [5, 5, 6, 7], "c": [1, 1, 2, 4], "d": [11, 55, 1, 99]} @@ -77,7 +78,9 @@ def test_table_statistics_comp(comp_prod_sf_cluster_config): param.inputs[0].meta.Pack(meta) res = table_statistics_comp.eval( - param=param, cluster_config=comp_prod_sf_cluster_config + param=param, + storage_config=storage_config, + cluster_config=sf_cluster_config, ) comp_ret = Report() res.outputs[0].meta.Unpack(comp_ret) @@ -92,8 +95,9 @@ def test_table_statistics_individual_comp(comp_prod_sf_cluster_config): """ alice_input_path = "test_table_statistics/alice.csv" - self_party = comp_prod_sf_cluster_config.private_config.self_party - local_fs_wd = comp_prod_sf_cluster_config.private_config.storage_config.local_fs.wd + storage_config, sf_cluster_config = comp_prod_sf_cluster_config + self_party = sf_cluster_config.private_config.self_party + local_fs_wd = storage_config.local_fs.wd test_data = pd.DataFrame( {"a": [9, 6, 5, 5], "b": [5, 5, 6, 7], "c": [1, 1, 2, 4], "d": [11, 55, 1, 99]} @@ -131,7 +135,9 @@ def test_table_statistics_individual_comp(comp_prod_sf_cluster_config): param.inputs[0].meta.Pack(meta) res = table_statistics_comp.eval( - param=param, cluster_config=comp_prod_sf_cluster_config + param=param, + storage_config=storage_config, + cluster_config=sf_cluster_config, ) comp_ret = Report() diff --git a/tests/component/test_train_test_split.py b/tests/component/test_train_test_split.py index deede09ec..1eca289bf 100644 --- a/tests/component/test_train_test_split.py +++ b/tests/component/test_train_test_split.py @@ -1,12 +1,12 @@ import os import pandas as pd +from secretflow.spec.v1.component_pb2 import Attribute +from secretflow.spec.v1.data_pb2 import DistData, TableSchema, VerticalTable +from secretflow.spec.v1.evaluation_pb2 import NodeEvalParam from secretflow.component.data_utils import DistDataType, extract_distdata_info from secretflow.component.preprocessing.train_test_split import train_test_split_comp -from secretflow.protos.component.comp_pb2 import Attribute -from secretflow.protos.component.data_pb2 import DistData, TableSchema, VerticalTable -from secretflow.protos.component.evaluation_pb2 import NodeEvalParam from tests.conftest import TEST_STORAGE_ROOT @@ -16,8 +16,9 @@ def test_train_test_split(comp_prod_sf_cluster_config): train_output_path = "test_train_test_split/train.csv" test_output_path = "test_train_test_split/test.csv" - self_party = comp_prod_sf_cluster_config.private_config.self_party - local_fs_wd = comp_prod_sf_cluster_config.private_config.storage_config.local_fs.wd + storage_config, sf_cluster_config = comp_prod_sf_cluster_config + self_party = sf_cluster_config.private_config.self_party + local_fs_wd = storage_config.local_fs.wd if self_party == "alice": df_alice = pd.DataFrame( @@ -111,7 +112,11 @@ def test_train_test_split(comp_prod_sf_cluster_config): exist_ok=True, ) - res = train_test_split_comp.eval(param, comp_prod_sf_cluster_config) + res = train_test_split_comp.eval( + param=param, + storage_config=storage_config, + cluster_config=sf_cluster_config, + ) assert len(res.outputs) == 2 diff --git a/tests/component/test_vert_binning.py b/tests/component/test_vert_binning.py new file mode 100644 index 000000000..395a8c6da --- /dev/null +++ b/tests/component/test_vert_binning.py @@ -0,0 +1,156 @@ +import os + +import pandas as pd +from sklearn.datasets import load_breast_cancer + +from secretflow.component.data_utils import DistDataType, extract_distdata_info +from secretflow.component.preprocessing.vert_binning import ( + vert_binning_comp, + vert_bin_substitution_comp, +) +from secretflow.spec.v1.component_pb2 import Attribute +from secretflow.spec.v1.data_pb2 import DistData, TableSchema, VerticalTable +from secretflow.spec.v1.evaluation_pb2 import NodeEvalParam +from tests.conftest import TEST_STORAGE_ROOT + + +def test_vert_binning(comp_prod_sf_cluster_config): + alice_path = "test_vert_binning/x_alice.csv" + bob_path = "test_vert_binning/x_bob.csv" + rule_path = "test_vert_binning/bin_rule" + output_path = "test_vert_binning/vert.csv" + + storage_config, sf_cluster_config = comp_prod_sf_cluster_config + self_party = sf_cluster_config.private_config.self_party + local_fs_wd = storage_config.local_fs.wd + + ds = load_breast_cancer() + x, y = ds["data"], ds["target"] + if self_party == "alice": + os.makedirs( + os.path.join(local_fs_wd, "test_vert_binning"), + exist_ok=True, + ) + x = pd.DataFrame(x[:, :15], columns=[f"a{i}" for i in range(15)]) + y = pd.DataFrame(y, columns=["y"]) + ds = pd.concat([x, y], axis=1) + ds.to_csv(os.path.join(local_fs_wd, alice_path), index=False) + + elif self_party == "bob": + os.makedirs( + os.path.join(local_fs_wd, "test_vert_binning"), + exist_ok=True, + ) + + ds = pd.DataFrame(x[:, 15:], columns=[f"b{i}" for i in range(15)]) + ds.to_csv(os.path.join(local_fs_wd, bob_path), index=False) + + bin_param_01 = NodeEvalParam( + domain="feature", + name="vert_binning", + version="0.0.1", + attr_paths=[ + "input/input_data/feature_selects", + ], + attrs=[ + Attribute(ss=[f"a{i}" for i in range(12)] + [f"b{i}" for i in range(11)]), + ], + inputs=[ + DistData( + name="input_data", + type=str(DistDataType.VERTICAL_TABLE), + data_refs=[ + DistData.DataRef(uri=bob_path, party="bob", format="csv"), + DistData.DataRef(uri=alice_path, party="alice", format="csv"), + ], + ), + ], + output_uris=[rule_path], + ) + + bin_param_02 = NodeEvalParam( + domain="feature", + name="vert_binning", + version="0.0.1", + attr_paths=[ + "input/input_data/feature_selects", + ], + attrs=[ + Attribute(ss=[f"a{i}" for i in range(11)] + [f"b{i}" for i in range(12)]), + ], + inputs=[ + DistData( + name="input_data", + type=str(DistDataType.VERTICAL_TABLE), + data_refs=[ + DistData.DataRef(uri=bob_path, party="bob", format="csv"), + DistData.DataRef(uri=alice_path, party="alice", format="csv"), + ], + ), + ], + output_uris=[rule_path], + ) + + meta = VerticalTable( + schemas=[ + TableSchema( + feature_types=["float32"] * 15, + features=[f"b{i}" for i in range(15)], + ), + TableSchema( + feature_types=["float32"] * 15, + features=[f"a{i}" for i in range(15)], + label_types=["float32"], + labels=["y"], + ), + ], + ) + bin_param_01.inputs[0].meta.Pack(meta) + bin_param_02.inputs[0].meta.Pack(meta) + + bin_res = vert_binning_comp.eval( + param=bin_param_01, + storage_config=storage_config, + cluster_config=sf_cluster_config, + ) + bin_res = vert_binning_comp.eval( + param=bin_param_02, + storage_config=storage_config, + cluster_config=sf_cluster_config, + ) + + sub_param = NodeEvalParam( + domain="feature", + name="vert_bin_substitution", + version="0.0.1", + attr_paths=[], + attrs=[], + inputs=[ + bin_param_01.inputs[0], + bin_res.outputs[0], + ], + output_uris=[output_path], + ) + + sub_res = vert_bin_substitution_comp.eval( + param=sub_param, + storage_config=storage_config, + cluster_config=sf_cluster_config, + ) + + assert len(sub_res.outputs) == 1 + + output_info = extract_distdata_info(sub_res.outputs[0]) + + alice_out = pd.read_csv( + os.path.join(TEST_STORAGE_ROOT, "alice", output_info["alice"].uri) + ) + bob_out = pd.read_csv( + os.path.join(TEST_STORAGE_ROOT, "bob", output_info["bob"].uri) + ) + assert alice_out.shape[0] == bob_out.shape[0] + + # import logging + + # logging.warn(f"alice_out \n{alice_out}\n....\n") + # logging.warn(f"bob_out \n{bob_out}\n....\n") diff --git a/tests/component/test_woe_binning.py b/tests/component/test_woe_binning.py index 7b41af4fe..8a6fd0f74 100644 --- a/tests/component/test_woe_binning.py +++ b/tests/component/test_woe_binning.py @@ -4,24 +4,24 @@ from sklearn.datasets import load_breast_cancer from secretflow.component.data_utils import DistDataType, extract_distdata_info -from secretflow.component.feature.vert_woe_binning import ( - vert_woe_binning_comp, - vert_woe_substitution_comp, -) -from secretflow.protos.component.comp_pb2 import Attribute -from secretflow.protos.component.data_pb2 import DistData, TableSchema, VerticalTable -from secretflow.protos.component.evaluation_pb2 import NodeEvalParam +from secretflow.component.preprocessing.vert_woe_binning import vert_woe_binning_comp + +from secretflow.component.preprocessing.vert_binning import vert_bin_substitution_comp +from secretflow.spec.v1.component_pb2 import Attribute +from secretflow.spec.v1.data_pb2 import DistData, TableSchema, VerticalTable +from secretflow.spec.v1.evaluation_pb2 import NodeEvalParam from tests.conftest import TEST_STORAGE_ROOT def test_woe_binning(comp_prod_sf_cluster_config): alice_path = "test_woe_binning/x_alice.csv" bob_path = "test_woe_binning/x_bob.csv" - rule_path = "test_woe_binning/woe_rule" + rule_path = "test_woe_binning/bin_rule" output_path = "test_woe_binning/woe.csv" - self_party = comp_prod_sf_cluster_config.private_config.self_party - local_fs_wd = comp_prod_sf_cluster_config.private_config.storage_config.local_fs.wd + storage_config, sf_cluster_config = comp_prod_sf_cluster_config + self_party = sf_cluster_config.private_config.self_party + local_fs_wd = storage_config.local_fs.wd ds = load_breast_cancer() x, y = ds["data"], ds["target"] @@ -111,12 +111,20 @@ def test_woe_binning(comp_prod_sf_cluster_config): bin_param_01.inputs[0].meta.Pack(meta) bin_param_02.inputs[0].meta.Pack(meta) - bin_res = vert_woe_binning_comp.eval(bin_param_01, comp_prod_sf_cluster_config) - bin_res = vert_woe_binning_comp.eval(bin_param_02, comp_prod_sf_cluster_config) + bin_res = vert_woe_binning_comp.eval( + param=bin_param_01, + storage_config=storage_config, + cluster_config=sf_cluster_config, + ) + bin_res = vert_woe_binning_comp.eval( + param=bin_param_02, + storage_config=storage_config, + cluster_config=sf_cluster_config, + ) sub_param = NodeEvalParam( domain="feature", - name="vert_woe_substitution", + name="vert_bin_substitution", version="0.0.1", attr_paths=[], attrs=[], @@ -127,7 +135,11 @@ def test_woe_binning(comp_prod_sf_cluster_config): output_uris=[output_path], ) - sub_res = vert_woe_substitution_comp.eval(sub_param, comp_prod_sf_cluster_config) + sub_res = vert_bin_substitution_comp.eval( + param=sub_param, + storage_config=storage_config, + cluster_config=sf_cluster_config, + ) assert len(sub_res.outputs) == 1 diff --git a/tests/conftest.py b/tests/conftest.py index 288f064ab..fee047533 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,6 @@ import getpass import json +import logging import os import tempfile from dataclasses import dataclass @@ -7,14 +8,15 @@ import multiprocess import pytest import spu +from secretflow.spec.v1.data_pb2 import StorageConfig from xdist.scheduler import LoadScheduling import secretflow as sf import secretflow.distributed as sfd -from secretflow.protos.component.cluster_pb2 import ( +from secretflow.distributed.primitive import DISTRIBUTION_MODE +from secretflow.spec.extend.cluster_pb2 import ( SFClusterConfig, SFClusterDesc, - StorageConfig, ) from secretflow.utils.testing import unused_tcp_port from tests.cluster import cluster, set_self_party @@ -154,10 +156,36 @@ class DeviceInventory: spu2: sf.SPU = None # 3pc +@pytest.fixture(scope="module", params=[semi2k_cluster]) +def sf_memory_setup_devices(request): + devices = DeviceInventory() + sfd.set_distribution_mode(mode=DISTRIBUTION_MODE.DEBUG) + sf.shutdown() + sf.init( + ["alice", "bob", "carol", "davy", "spu"], + debug_mode=True, + ) + + devices.alice = sf.PYU("alice") + devices.bob = sf.PYU("bob") + devices.carol = sf.PYU("carol") + devices.davy = sf.PYU("davy") + + logging.warning(f"WARNING:The spu device is actually the pyu device in debug mode") + devices.spu = sf.PYU("spu") + cluster_def = sf.reveal(devices.alice(request.param)()) + + devices.heu = sf.HEU(heu_config, cluster_def["runtime_config"]["field"]) + + yield devices + del devices + sf.shutdown() + + @pytest.fixture(scope="module", params=[semi2k_cluster]) def sf_simulation_setup_devices(request): devices = DeviceInventory() - sfd.set_production(False) + sfd.set_distribution_mode(mode=DISTRIBUTION_MODE.SIMULATION) sf.shutdown() sf.init( ["alice", "bob", "carol", "davy"], @@ -197,7 +225,7 @@ def sf_party_for_4pc(request): @pytest.fixture(scope="module") def sf_production_setup_devices(request, sf_party_for_4pc): devices = DeviceInventory() - sfd.set_production(True) + sfd.set_distribution_mode(DISTRIBUTION_MODE.PRODUCTION) set_self_party(sf_party_for_4pc) sf.init( address="local", @@ -245,7 +273,7 @@ def sf_production_setup_devices(request, sf_party_for_4pc): @pytest.fixture(scope="module") def sf_production_setup_devices_aby3(request, sf_party_for_4pc): devices = DeviceInventory() - sfd.set_production(True) + sfd.set_distribution_mode(mode=DISTRIBUTION_MODE.PRODUCTION) set_self_party(sf_party_for_4pc) sf.init( address="local", @@ -338,7 +366,7 @@ def comp_prod_sf_cluster_config(request, sf_party_for_4pc): ) storage_path = prepare_storage_path(sf_party_for_4pc) - config = SFClusterConfig( + sf_config = SFClusterConfig( desc=desc, public_config=SFClusterConfig.PublicConfig( ray_fed_config=SFClusterConfig.RayFedConfig( @@ -364,11 +392,12 @@ def comp_prod_sf_cluster_config(request, sf_party_for_4pc): private_config=SFClusterConfig.PrivateConfig( self_party=sf_party_for_4pc, ray_head_addr="local", - storage_config=StorageConfig( - type="local_fs", - local_fs=StorageConfig.LocalFSConfig(wd=storage_path), - ), ), ) - yield config + storage_config = StorageConfig( + type="local_fs", + local_fs=StorageConfig.LocalFSConfig(wd=storage_path), + ) + + yield storage_config, sf_config diff --git a/tests/data/horizontal/test_io.py b/tests/data/horizontal/test_io.py index f87287ed4..e7ce35841 100644 --- a/tests/data/horizontal/test_io.py +++ b/tests/data/horizontal/test_io.py @@ -4,8 +4,8 @@ import pandas as pd from secretflow import reveal +from secretflow.data import partition from secretflow.data.horizontal import HDataFrame, read_csv -from secretflow.data.base import partition def cleartmp(paths): @@ -33,10 +33,10 @@ def test_read_csv_and_to_csv_should_ok(sf_production_setup_devices): df = HDataFrame( { sf_production_setup_devices.alice: partition( - sf_production_setup_devices.alice(lambda df: df)(df1), "pandas" + sf_production_setup_devices.alice(lambda df: df)(df1) ), sf_production_setup_devices.bob: partition( - sf_production_setup_devices.bob(lambda df: df)(df2), "pandas" + sf_production_setup_devices.bob(lambda df: df)(df2) ), } ) diff --git a/tests/data/mix/test_mixdataframe.py b/tests/data/mix/test_mixdataframe.py index 206f89472..64cf1c015 100644 --- a/tests/data/mix/test_mixdataframe.py +++ b/tests/data/mix/test_mixdataframe.py @@ -3,9 +3,9 @@ import pytest from secretflow import reveal +from secretflow.data import partition from secretflow.data.horizontal.dataframe import HDataFrame from secretflow.data.mix import MixDataFrame -from secretflow.data.base import partition from secretflow.data.vertical import VDataFrame from secretflow.security.aggregation import PlainAggregator from secretflow.security.compare import PlainComparator diff --git a/tests/data/test_ndarray.py b/tests/data/test_ndarray.py index 6f0d3d931..eef2a9101 100644 --- a/tests/data/test_ndarray.py +++ b/tests/data/test_ndarray.py @@ -7,6 +7,7 @@ import sklearn.metrics from secretflow import reveal +from secretflow.data import partition from secretflow.data.ndarray import ( histogram, load, @@ -18,7 +19,6 @@ subtract, tss, ) -from secretflow.data.base import partition from secretflow.data.split import train_test_split from secretflow.data.vertical import VDataFrame from secretflow.utils.errors import InvalidArgumentError diff --git a/tests/data/vertical/test_io.py b/tests/data/vertical/test_io.py index 8886191ba..9a9a8faad 100644 --- a/tests/data/vertical/test_io.py +++ b/tests/data/vertical/test_io.py @@ -6,7 +6,7 @@ import pytest from secretflow import reveal -from secretflow.data.base import partition +from secretflow.data import partition from secretflow.data.vertical import VDataFrame, read_csv diff --git a/tests/data/vertical/test_vdataframe.py b/tests/data/vertical/test_vdataframe.py index c0844fdd8..f55af8d11 100644 --- a/tests/data/vertical/test_vdataframe.py +++ b/tests/data/vertical/test_vdataframe.py @@ -3,7 +3,7 @@ import pytest from secretflow import reveal -from secretflow.data.base import partition +from secretflow.data import partition from secretflow.data.vertical import VDataFrame from secretflow.utils.errors import NotFoundError @@ -200,7 +200,7 @@ def test_skew_should_ok(prod_env_and_data): pd.testing.assert_series_equal(value[['b6']], expected_bob[['b6']]) -def test_quantle_should_ok(prod_env_and_data): +def test_quantile_should_ok(prod_env_and_data): env, data = prod_env_and_data # WHEN value = data['df'].quantile() @@ -260,7 +260,7 @@ def test_get_non_exist_items_should_error(prod_env_and_data): env, data = prod_env_and_data # WHEN and THEN with pytest.raises(NotFoundError, match='does not exist'): - data['df']['a1', 'non_exist'] + _ = data['df']['a1', 'non_exist'] def test_get_multi_items_should_ok(prod_env_and_data): diff --git a/tests/data/vertical/test_vdataframe_polars.py b/tests/data/vertical/test_vdataframe_polars.py new file mode 100644 index 000000000..b4fb1d09e --- /dev/null +++ b/tests/data/vertical/test_vdataframe_polars.py @@ -0,0 +1,461 @@ +import logging + +import numpy as np +import pandas as pd +import polars as pl +import pytest + +from secretflow import reveal +from secretflow.data import partition +from secretflow.data.vertical import VDataFrame +from secretflow.utils.errors import NotFoundError + + +@pytest.fixture(scope='function') +def prod_env_and_data(sf_production_setup_devices): + df_alice = pd.DataFrame( + { + 'a1': ['K5', 'K1', None, 'K6'], + 'a2': ['A5', 'A1', 'A2', 'A6'], + 'a3': [5, 1, 2, 6], + } + ) + + df_bob = pd.DataFrame( + { + 'b4': [10.2, 20.5, None, -0.4], + 'b5': ['B3', None, 'B9', 'B4'], + 'b6': [3, 1, 9, 4], + } + ) + vdf_alice_poalrs = pl.from_pandas(df_alice) + vdf_bob_polars = pl.from_pandas(df_bob) + df = VDataFrame( + { + sf_production_setup_devices.alice: partition( + data=sf_production_setup_devices.alice(lambda: vdf_alice_poalrs)(), + backend="polars", + ), + sf_production_setup_devices.bob: partition( + data=sf_production_setup_devices.bob(lambda: vdf_bob_polars)(), + backend="polars", + ), + } + ) + + yield sf_production_setup_devices, { + "df_alice": df_alice, + "df_bob": df_bob, + "df": df, + } + + +def test_columns_should_ok(prod_env_and_data): + env, data = prod_env_and_data + # WHEN + columns = data['df'].columns + # THEN + alice_columns = data['df_alice'].columns.to_list() + bob_columns = data['df_bob'].columns.to_list() + alice_columns.extend(bob_columns) + np.testing.assert_equal(columns, alice_columns) + + +def test_pow_should_ok(prod_env_and_data): + pass + + +def test_select_dtypes_should_ok(prod_env_and_data): + env, data = prod_env_and_data + # WHEN + value = data['df'].select_dtypes('number').mean() + + # THEN + expected_alice = data['df_alice'].select_dtypes('number').mean() + assert value['a3'] == expected_alice['a3'] + expected_bob = data['df_bob'].select_dtypes('number').mean() + pd.testing.assert_series_equal(value[['b4', 'b6']], expected_bob) + + +def test_subtract_should_ok(prod_env_and_data): + pass + + +def test_round_should_ok(prod_env_and_data): + env, data = prod_env_and_data + # WHEN + value = data['df'].round({'b4': 1}) + # THEN + value_alice = reveal(value.to_pandas().partitions[env.alice].data) + logging.warning(f"value = {value_alice}") + expected_alice = data['df_alice'] + pd.testing.assert_frame_equal(value_alice, expected_alice) + expected_bob = data['df_bob'].round({'b4': 1}) + value_bob = reveal(value.to_pandas().partitions[env.bob].data) + logging.warning(f"value bob = {value_bob}") + logging.warning(f"expect bob = {expected_bob}") + pd.testing.assert_frame_equal(value_bob, expected_bob) + + +def test_min_should_ok(prod_env_and_data): + env, data = prod_env_and_data + # WHEN + value = data['df'].min(numeric_only=True) + + # THEN + expected_alice = data['df_alice'].min(numeric_only=True) + assert value['a3'] == expected_alice['a3'] + expected_bob = data['df_bob'].min(numeric_only=True) + pd.testing.assert_series_equal(value[['b4', 'b6']], expected_bob) + + +def test_max_should_ok(prod_env_and_data): + env, data = prod_env_and_data + # WHEN + value = data['df'].max(numeric_only=True) + + # THEN + expected_alice = data['df_alice'].max(numeric_only=True) + assert value['a3'] == expected_alice['a3'] + expected_bob = data['df_bob'].max(numeric_only=True) + pd.testing.assert_series_equal(value[['b4', 'b6']], expected_bob) + + +def test_mean_should_ok(prod_env_and_data): + env, data = prod_env_and_data + # WHEN + value = data['df'].mean(numeric_only=True) + + # THEN + expected_alice = data['df_alice'].mean(numeric_only=True) + pd.testing.assert_series_equal(value[expected_alice.index], expected_alice) + expected_bob = data['df_bob'].mean(numeric_only=True) + pd.testing.assert_series_equal(value[expected_bob.index], expected_bob) + + +def test_var_should_ok(prod_env_and_data): + env, data = prod_env_and_data + # WHEN + value = data['df'].var(numeric_only=True) + # THEN + expected_alice = data['df_alice'].var(numeric_only=True) + assert value['a3'] == expected_alice['a3'] + expected_bob = data['df_bob'].var(numeric_only=True) + # TODO(zoupeicheng.zpc): Ray's pandas ignored series containing None + pd.testing.assert_series_equal(value[['b6']], expected_bob[['b6']]) + + +def test_std_should_ok(prod_env_and_data): + env, data = prod_env_and_data + # WHEN + value = data['df'].std(numeric_only=True) + # THEN + expected_alice = data['df_alice'].std(numeric_only=True) + assert value['a3'] == expected_alice['a3'] + expected_bob = data['df_bob'].std(numeric_only=True) + # TODO(zoupeicheng.zpc): Ray's pandas ignored series containing None + pd.testing.assert_series_equal(value[['b6']], expected_bob[['b6']]) + + +def test_sem_should_ok(prod_env_and_data): + pass + + +def test_skew_should_ok(prod_env_and_data): + pass + + +def test_quantile_should_ok(prod_env_and_data): + env, data = prod_env_and_data + # WHEN + value = data['df'].quantile(numeric_only=True) + # THEN + expected_alice = data['df_alice'].quantile() + assert value['a3'] == expected_alice['a3'] + expected_bob = data['df_bob'].quantile() + logging.warning(f"real = {value}") + logging.warning(f"expected = {expected_bob}") + + # TODO(zoupeicheng.zpc): Ray's pandas ignored series containing None + pd.testing.assert_series_equal(value[['b6']], expected_bob[['b6']]) + + +def test_count_should_ok(prod_env_and_data): + pass + + +def test_mode_should_ok(prod_env_and_data): + pass + + +def test_count_na_should_ok(prod_env_and_data): + pass + + +def test_get_single_item_should_ok(prod_env_and_data): + env, data = prod_env_and_data + # WHEN + value = data['df']['a1'] + + # THEN + expected_alice = data['df_alice'][['a1']] + pd.testing.assert_frame_equal( + reveal(value.to_pandas().partitions[env.alice].data), expected_alice + ) + + +def test_get_non_exist_items_should_error(prod_env_and_data): + env, data = prod_env_and_data + # WHEN and THEN + with pytest.raises(NotFoundError, match='does not exist'): + _ = data['df']['a1', 'non_exist'] + + +def test_get_multi_items_should_ok(prod_env_and_data): + env, data = prod_env_and_data + # WHEN + value = data['df'][['a1', 'b4']] + # THEN + expected_alice = data['df_alice'][['a1']] + pd.testing.assert_frame_equal( + reveal(value.to_pandas().partitions[env.alice].data), expected_alice + ) + expected_bob = data['df_bob'][['b4']] + pd.testing.assert_frame_equal( + reveal(value.to_pandas().partitions[env.bob].data), expected_bob + ) + + # WHEN + value = data['df'][['a1', 'a2', 'b5']] + # THEN + expected_alice = data['df_alice'][['a1', 'a2']] + pd.testing.assert_frame_equal( + reveal(value.to_pandas().partitions[env.alice].data), expected_alice + ) + expected_bob = data['df_bob'][['b5']] + pd.testing.assert_frame_equal( + reveal(value.to_pandas().partitions[env.bob].data), expected_bob + ) + + +def test_set_item_should_ok(prod_env_and_data): + env, data = prod_env_and_data + # Case 1: single item. + # WHEN + value = data['df'] + value['a1'] = 'test' + # THEN + expected_alice = data['df_alice'].copy(deep=True) + expected_alice['a1'] = 'test' + pd.testing.assert_frame_equal( + reveal(value.to_pandas().partitions[env.alice].data), expected_alice + ) + + # Case 2: multi items on different parties. + # WHEN + value = data['df'] + value[['a1', 'b4', 'b5']] = 'test' + # THEN + expected_alice = data['df_alice'].copy(deep=True) + expected_alice['a1'] = 'test' + pd.testing.assert_frame_equal( + reveal(value.to_pandas().partitions[env.alice].data), expected_alice + ) + expected_bob = data['df_bob'].copy(deep=True) + expected_bob[['b4', 'b5']] = 'test' + pd.testing.assert_frame_equal( + reveal(value.to_pandas().partitions[env.bob].data), expected_bob + ) + + +def test_set_item_on_partition_should_ok(prod_env_and_data): + env, data = prod_env_and_data + # WHEN + value = data['df'] + a2 = value['a2'] + value['a1'] = a2 + # THEN + expected_alice = data['df_alice'].copy(deep=True) + expected_alice['a1'] = expected_alice['a2'] + pd.testing.assert_frame_equal( + reveal(value.to_pandas().partitions[env.alice].data), expected_alice + ) + + +def test_set_item_on_non_exist_partition_should_error(prod_env_and_data): + env, data = prod_env_and_data + # WHEN + with pytest.raises( + AssertionError, + match='Device of the partition to assgin is not in this dataframe devices.', + ): + part = partition( + env.carol( + lambda: pd.DataFrame( + { + 'a1': ['K5', 'K1', None, 'K6'], + 'a2': ['A5', 'A1', 'A2', 'A6'], + 'a3': [5, 1, 2, 6], + } + ) + )() + ) + value = data['df'] + value['a1'] = part['a2'] + + +def test_set_item_on_vdataframe_should_ok(prod_env_and_data): + env, data = prod_env_and_data + # WHEN + value = data['df'] + value[['a1', 'b4']] = data['df'][['a2', 'b5']] + + # THEN + expected_alice = data['df_alice'].copy(deep=True) + expected_alice['a1'] = expected_alice['a2'] + pd.testing.assert_frame_equal( + reveal(value.to_pandas().partitions[env.alice].data), expected_alice + ) + + expected_bob = data['df_bob'].copy(deep=True) + expected_bob['b4'] = expected_bob['b5'] + pd.testing.assert_frame_equal( + reveal(value.to_pandas().partitions[env.bob].data), expected_bob + ) + + +def test_set_item_on_different_vdataframe_should_error(prod_env_and_data): + env, data = prod_env_and_data + with pytest.raises( + AssertionError, + match='Partitions to assgin is not same with this dataframe partitions.', + ): + df = VDataFrame( + { + env.alice: partition( + data=env.alice( + lambda: pd.DataFrame( + { + 'a1': ['K5', 'K1', None, 'K6'], + 'a2': ['A5', 'A1', 'A2', 'A6'], + 'a3': [5, 1, 2, 6], + } + ) + )() + ), + env.carol: partition( + data=env.carol( + lambda: pd.DataFrame( + { + 'b4': [10.2, 20.5, None, -0.4], + 'b5': ['B3', None, 'B9', 'B4'], + 'b6': [3, 1, 9, 4], + } + ) + )() + ), + } + ) + value = data['df'] + value[['a1', 'b4']] = df[['a2', 'b5']] + + +def test_drop(prod_env_and_data): + env, data = prod_env_and_data + # Case 1: not inplace. + # WHEN + value = data['df'].drop(columns='a1', inplace=False) + # THEN + pd.testing.assert_frame_equal( + reveal(value.to_pandas().partitions[env.alice].data), + data['df_alice'].drop(columns='a1', inplace=False), + ) + pd.testing.assert_frame_equal( + reveal(value.to_pandas().partitions[env.bob].data), data['df_bob'] + ) + + # Case 2: inplace. + # WHEN + value = data['df'].copy() + value.drop(columns=['a1', 'b4', 'b5'], inplace=True) + # THEN + pd.testing.assert_frame_equal( + reveal(value.to_pandas().partitions[env.alice].data), + data['df_alice'].drop(columns='a1', inplace=False), + ) + pd.testing.assert_frame_equal( + reveal(value.to_pandas().partitions[env.bob].data), + data['df_bob'].drop(columns=['b4', 'b5'], inplace=False), + ) + + +def test_replace_should_ok(prod_env_and_data): + pass + + +def test_fillna(prod_env_and_data): + env, data = prod_env_and_data + # Case 1: not inplace. + # WHEN + value = data['df'].fillna(value='test', inplace=False) + # THEN + # After processed by pandas, the empty place will be fill with string 'nan', whether polars is None, + # so we need to replace before testing the frame equal or not. + pd.testing.assert_frame_equal( + reveal(value.to_pandas().partitions[env.alice].data), + data['df_alice'].fillna(value='test', inplace=False).replace("nan", None), + ) + # polars' fillna (actually pl.DataFrame.fill_null) does not act same as pandas, col whose tpye diffrent + # with value will not be filled. + # Therefore, following check will not pass, since df_bob['b4'] is numeric type but value is str type. + # pd.testing.assert_frame_equal( + # reveal(value.to_pandas().partitions[env.bob].data), + # data['df_bob'].fillna(value='test', inplace=False).replace("nan", None), + # ) + + # Case 2: inplace. + # WHEN + value = data['df'].copy() + value.fillna(value='test', inplace=True) + # THEN + pd.testing.assert_frame_equal( + reveal(value.to_pandas().partitions[env.alice].data), + data['df_alice'].fillna(value='test', inplace=False).replace("nan", None), + ) + # pd.testing.assert_frame_equal( + # reveal(value.to_pandas().partitions[env.bob].data), + # data['df_bob'].fillna(value='test', inplace=False).replace("nan", None), + # ) + + +def test_astype_should_ok(prod_env_and_data): + env, data = prod_env_and_data + # GIVEN + df = data['df'][['a3', 'b4']].fillna(value=1, inplace=False) + + # Case 1: single dtype + # WHEN + new_df = df.astype(np.int32) + # THEN + pd.testing.assert_frame_equal( + reveal(new_df.to_pandas().partitions[env.alice].data), + data['df_alice'][['a3']].fillna(1).astype(np.int32), + ) + pd.testing.assert_frame_equal( + reveal(new_df.to_pandas().partitions[env.bob].data), + data['df_bob'][['b4']].fillna(1).astype(np.int32), + ) + + # Case 2: dtype dict. + # WHEN + dtype = {'a3': np.int32} + new_df = df.astype(dtype) + # THEN + pd.testing.assert_frame_equal( + reveal(new_df.to_pandas().partitions[env.alice].data), + data['df_alice'][['a3']].fillna(1).astype(dtype), + ) + pd.testing.assert_frame_equal( + reveal(new_df.to_pandas().partitions[env.bob].data), + data['df_bob'][['b4']].fillna(1), + ) diff --git a/tests/device/test_driver_brpc.py b/tests/device/test_driver_brpc.py index 1b26b733f..414e69bdf 100644 --- a/tests/device/test_driver_brpc.py +++ b/tests/device/test_driver_brpc.py @@ -8,7 +8,7 @@ import secretflow.distributed as sfd from secretflow.device.device.spu import SPUObject from secretflow.device.driver import reveal, to, wait - +from secretflow.distributed.primitive import DISTRIBUTION_MODE from tests.cluster import cluster, set_self_party from tests.conftest import DeviceInventory, semi2k_cluster @@ -16,7 +16,7 @@ @pytest.fixture(scope="module") def env(request, sf_party_for_4pc): devices = DeviceInventory() - sfd.set_production(True) + sfd.set_distribution_mode(mode=DISTRIBUTION_MODE.PRODUCTION) set_self_party(sf_party_for_4pc) sf.init( address='local', diff --git a/tests/device/test_teeu.py b/tests/device/test_teeu.py index 30ca1505f..148f8f67a 100644 --- a/tests/device/test_teeu.py +++ b/tests/device/test_teeu.py @@ -10,6 +10,7 @@ import secretflow.distributed as sfd from secretflow.device import global_state from secretflow.device.device.teeu import TEEU +from secretflow.distributed.primitive import DISTRIBUTION_MODE from secretflow.utils.testing import unused_tcp_port from tests.cluster import cluster, get_self_party, set_self_party @@ -26,7 +27,7 @@ class TeeuTestInventory: @pytest.fixture(scope="module") def teeu_production_setup_devices(request, sf_party_for_4pc): inventory = TeeuTestInventory() - sfd.set_production(True) + sfd.set_distribution_mode(mode=DISTRIBUTION_MODE.PRODUCTION) set_self_party(sf_party_for_4pc) self_party = get_self_party() if self_party in ('carol', 'davy'): diff --git a/tests/kuscia/test_kuscia.py b/tests/kuscia/test_kuscia.py index f8f491d9c..bd8531894 100644 --- a/tests/kuscia/test_kuscia.py +++ b/tests/kuscia/test_kuscia.py @@ -1,7 +1,3 @@ -from secretflow.kuscia.ray_config import RayConfig -from secretflow.kuscia.sf_config import compose_sf_cluster_config -from secretflow.kuscia.task_config import KusciaTaskConfig -from secretflow.protos.component.cluster_pb2 import SFClusterDesc, StorageConfig from kuscia.proto.api.v1alpha1.kusciatask.kuscia_task_pb2 import ( AllocatedPorts, ClusterDefine, @@ -10,6 +6,11 @@ Service, ) +from secretflow.kuscia.ray_config import RayConfig +from secretflow.kuscia.sf_config import get_sf_cluster_config +from secretflow.kuscia.task_config import KusciaTaskConfig +from secretflow.spec.extend.cluster_pb2 import SFClusterDesc + def test_load_configs(): kuscia_request_json = { @@ -32,7 +33,7 @@ def test_load_configs(): assert ray_config.ray_gcs_port == 8081 -def test_compose_sf_cluster_config(): +def test_get_sf_cluster_config(): sf_cluster_desc = SFClusterDesc( parties=["alice", "bob", "carol"], devices=[ @@ -44,11 +45,16 @@ def test_compose_sf_cluster_config(): parties=[ Party( name="alice", + services=[ + Service(port_name="fed", endpoints=["1.2.3.4"]), + Service(port_name="spu", endpoints=["1.2.3.4"]), + Service(port_name="global", endpoints=["0.0.0.0:1236"]), + ], ), Party( name="bob", services=[ - Service(port_name="fed", endpoints=["1.2.3.4"]), + Service(port_name="fed", endpoints=["1.2.3.5"]), Service(port_name="spu", endpoints=["1.2.3.5"]), ], ), @@ -63,28 +69,24 @@ def test_compose_sf_cluster_config(): ) kuscia_task_allocated_ports = AllocatedPorts( - ports=[Port(name="fed", port=1234), Port(name="spu", port=1235)] + ports=[ + Port(name="fed", port=1234), + Port(name="spu", port=1235), + ] ) - ray_config = RayConfig(ray_node_ip_address="0.0.0.0", ray_gcs_port=1236) - sf_storage_config = { - "alice": StorageConfig( - type="local_fs", local_fs=StorageConfig.LocalFSConfig(wd="/tmp/alice") - ) - } - - sf_cluster_config = compose_sf_cluster_config( - sf_cluster_desc, - "datamesh.local", - kuscia_task_cluster_def, - kuscia_task_allocated_ports, - ray_config, - sf_storage_config, + kuscia_config = KusciaTaskConfig( + task_id="task_id", + task_cluster_def=kuscia_task_cluster_def, + task_allocated_ports=kuscia_task_allocated_ports, + sf_cluster_desc=sf_cluster_desc, ) + sf_cluster_config = get_sf_cluster_config(kuscia_config) + assert list(sf_cluster_config.public_config.ray_fed_config.addresses) == [ "0.0.0.0:1234", - "1.2.3.4:80", + "1.2.3.5:80", "1.2.3.6:2345", ] @@ -95,4 +97,3 @@ def test_compose_sf_cluster_config(): assert sf_cluster_config.private_config.self_party == "alice" assert sf_cluster_config.private_config.ray_head_addr == "0.0.0.0:1236" - assert sf_cluster_config.private_config.storage_config.local_fs.wd == "/tmp/alice" diff --git a/tests/ml/linear/test_fl_lr_mix.py b/tests/ml/linear/test_fl_lr_mix.py index bb7579e78..f7991b6c8 100644 --- a/tests/ml/linear/test_fl_lr_mix.py +++ b/tests/ml/linear/test_fl_lr_mix.py @@ -10,10 +10,11 @@ import secretflow as sf import secretflow.distributed as sfd -from secretflow.data.base import partition +from secretflow.data import partition from secretflow.data.mix import MixDataFrame from secretflow.data.split import train_test_split from secretflow.data.vertical import VDataFrame +from secretflow.distributed.primitive import DISTRIBUTION_MODE from secretflow.ml.linear.fl_lr_mix import FlLogisticRegressionMix from secretflow.preprocessing.scaler import StandardScaler from secretflow.security.aggregation import SecureAggregator @@ -35,7 +36,7 @@ class DeviceInventory: @pytest.fixture(scope="module") def env(request, sf_party_for_4pc): devices = DeviceInventory() - sfd.set_production(True) + sfd.set_distribution_mode(mode=DISTRIBUTION_MODE.PRODUCTION) set_self_party(sf_party_for_4pc) sf.init( address='local', diff --git a/tests/ml/linear/test_fl_lr_v.py b/tests/ml/linear/test_fl_lr_v.py index 60afa96c2..4c477a828 100644 --- a/tests/ml/linear/test_fl_lr_v.py +++ b/tests/ml/linear/test_fl_lr_v.py @@ -8,8 +8,9 @@ import secretflow as sf import secretflow.distributed as sfd -from secretflow.data.base import partition +from secretflow.data import partition from secretflow.data.vertical import VDataFrame +from secretflow.distributed.primitive import DISTRIBUTION_MODE from secretflow.ml.linear.fl_lr_v import FlLogisticRegressionVertical from secretflow.preprocessing import StandardScaler from secretflow.security.aggregation.plain_aggregator import PlainAggregator @@ -29,7 +30,7 @@ class DeviceInventory: @pytest.fixture(scope="module") def env(request, sf_party_for_4pc): devices = DeviceInventory() - sfd.set_production(True) + sfd.set_distribution_mode(mode=DISTRIBUTION_MODE.PRODUCTION) set_self_party(sf_party_for_4pc) sf.init( address='local', diff --git a/tests/ml/linear/test_hess_lr.py b/tests/ml/linear/test_hess_lr.py index 031d69ac5..63ed25247 100644 --- a/tests/ml/linear/test_hess_lr.py +++ b/tests/ml/linear/test_hess_lr.py @@ -11,6 +11,7 @@ import secretflow as sf import secretflow.distributed as sfd from secretflow.data import FedNdarray, PartitionWay +from secretflow.distributed.primitive import DISTRIBUTION_MODE from secretflow.ml.linear.hess_sgd import HESSLogisticRegression from tests.cluster import cluster, set_self_party from tests.conftest import heu_config, semi2k_cluster @@ -30,7 +31,7 @@ class DeviceInventory: @pytest.fixture(scope="module") def env(request, sf_party_for_4pc): devices = DeviceInventory() - sfd.set_production(True) + sfd.set_distribution_mode(mode=DISTRIBUTION_MODE.PRODUCTION) set_self_party(sf_party_for_4pc) sf.init( address='local', diff --git a/tests/ml/nn/test_fl_model_tf.py b/tests/ml/nn/test_fl_model_tf.py index 34f136f44..5ce8163b3 100644 --- a/tests/ml/nn/test_fl_model_tf.py +++ b/tests/ml/nn/test_fl_model_tf.py @@ -587,3 +587,46 @@ def dataset_builder(x, stage="train"): backend="tensorflow", dataset_builder=data_builder_dict, ) + + +class TestFedModelMemoryDF: + def test_keras_model_for_iris(self, sf_memory_setup_devices): + """unittest ignore""" + aggregator = PlainAggregator(sf_memory_setup_devices.carol) + comparator = PlainComparator(sf_memory_setup_devices.carol) + hdf = load_iris( + parts=[sf_memory_setup_devices.alice, sf_memory_setup_devices.bob], + aggregator=aggregator, + comparator=comparator, + ) + + label = hdf['class'] + # do preprocess + encoder = OneHotEncoder() + label = encoder.fit_transform(label) + + data = hdf.drop(columns='class', inplace=False) + data = data.fillna(data.mean(numeric_only=True).to_dict()) + + # prepare model + n_features = 4 + n_classes = 3 + model = create_nn_model(n_features, n_classes, 8, 3) + + device_list = [ + sf_memory_setup_devices.alice, + sf_memory_setup_devices.bob, + ] + fed_model = FLModel( + device_list=device_list, + model=model, + aggregator=aggregator, + sampler="batch", + random_seed=1234, + ) + fed_model.fit(data, label, epochs=5, batch_size=16, aggregate_freq=3) + global_metric, _ = fed_model.evaluate(data, label, batch_size=16) + print(global_metric) + + # FIXME(fengjun.feng): This assert is failing. + # assert global_metric[1].result().numpy() > 0.7 diff --git a/tests/preprocessing/test_cond_filter_v.py b/tests/preprocessing/test_cond_filter_v.py index 0b2b4d1de..1ccb28bf9 100644 --- a/tests/preprocessing/test_cond_filter_v.py +++ b/tests/preprocessing/test_cond_filter_v.py @@ -2,7 +2,7 @@ import pandas as pd import pytest -from secretflow.data.base import partition +from secretflow.data import partition from secretflow.data.vertical.dataframe import VDataFrame from secretflow.preprocessing.cond_filter_v import ConditionFilter diff --git a/tests/preprocessing/test_discretizer.py b/tests/preprocessing/test_discretizer.py index 98249920d..c5feae5ee 100644 --- a/tests/preprocessing/test_discretizer.py +++ b/tests/preprocessing/test_discretizer.py @@ -4,7 +4,7 @@ from sklearn.preprocessing import KBinsDiscretizer as SkKBinsDiscretizer from secretflow import reveal -from secretflow.data.base import partition +from secretflow.data import partition from secretflow.data.horizontal.dataframe import HDataFrame from secretflow.data.mix.dataframe import MixDataFrame from secretflow.data.vertical.dataframe import VDataFrame diff --git a/tests/preprocessing/test_encoder.py b/tests/preprocessing/test_encoder.py index 8a1e02dc2..380484338 100644 --- a/tests/preprocessing/test_encoder.py +++ b/tests/preprocessing/test_encoder.py @@ -5,9 +5,10 @@ from sklearn.preprocessing import OneHotEncoder as SkOneHotEncoder from sklearn.utils.validation import column_or_1d -from secretflow.data.base import reveal, partition -from secretflow.data.horizontal.dataframe import HDataFrame -from secretflow.data.mix.dataframe import MixDataFrame +from secretflow import reveal +from secretflow.data import partition +from secretflow.data.horizontal import HDataFrame +from secretflow.data.mix import MixDataFrame from secretflow.data.vertical.dataframe import VDataFrame from secretflow.preprocessing.encoder import LabelEncoder, OneHotEncoder from secretflow.security.aggregation.plain_aggregator import PlainAggregator diff --git a/tests/preprocessing/test_encoder_polars.py b/tests/preprocessing/test_encoder_polars.py new file mode 100644 index 000000000..20acc3fb7 --- /dev/null +++ b/tests/preprocessing/test_encoder_polars.py @@ -0,0 +1,551 @@ +import numpy as np +import pandas as pd +import polars as pl +import pytest +from sklearn.preprocessing import LabelEncoder as SkLabelEncoder +from sklearn.preprocessing import OneHotEncoder as SkOneHotEncoder +from sklearn.utils.validation import column_or_1d + +from secretflow import reveal +from secretflow.data import partition +from secretflow.data.horizontal.dataframe import HDataFrame +from secretflow.data.mix.dataframe import MixDataFrame +from secretflow.data.vertical.dataframe import VDataFrame +from secretflow.preprocessing.encoder import LabelEncoder, OneHotEncoder +from secretflow.security.aggregation.plain_aggregator import PlainAggregator +from secretflow.security.compare.plain_comparator import PlainComparator +from secretflow.utils.simulation.datasets import load_iris + + +@pytest.fixture(scope='module') +def prod_env_and_label_encoder_data(sf_production_setup_devices): + hdf = load_iris( + parts=[sf_production_setup_devices.alice, sf_production_setup_devices.bob], + aggregator=PlainAggregator(sf_production_setup_devices.alice), + comparator=PlainComparator(sf_production_setup_devices.carol), + ) + hdf_alice = reveal(hdf.partitions[sf_production_setup_devices.alice].data) + hdf_bob = reveal(hdf.partitions[sf_production_setup_devices.bob].data) + + vdf_alice = pd.DataFrame( + { + 'a1': ['K5', 'K1', None, 'K6'], + 'a2': ['A5', 'A5', 'A2', 'A2'], + 'a3': [5, 1, 2, 6], + } + ) + + vdf_bob = pd.DataFrame( + { + 'b4': [10.2, 20.5, None, -0.4], + 'b5': ['B3', 'B2', 'B3', 'B4'], + 'b6': [3, 1, 9, 4], + } + ) + vdf_alice_poalrs = pl.from_pandas(vdf_alice) + vdf_bob_polars = pl.from_pandas(vdf_bob) + vdf = VDataFrame( + { + sf_production_setup_devices.alice: partition( + data=sf_production_setup_devices.alice(lambda: vdf_alice_poalrs)(), + backend="polars", + ), + sf_production_setup_devices.bob: partition( + data=sf_production_setup_devices.bob(lambda: vdf_bob_polars)(), + backend="polars", + ), + } + ) + + yield sf_production_setup_devices, { + 'hdf': hdf, + 'hdf_alice': hdf_alice, + 'hdf_bob': hdf_bob, + 'vdf_alice': vdf_alice, + 'vdf_bob': vdf_bob, + 'vdf': vdf, + } + + +class TestLabelEncoder: + def test_on_hdataframe_should_ok(self, prod_env_and_label_encoder_data): + env, data = prod_env_and_label_encoder_data + # GIVEN + encoder = LabelEncoder() + + # WHEN + value = encoder.fit_transform(data['hdf']['class']) + params = encoder.get_params() + + # THEN + assert params + sk_encoder = SkLabelEncoder() + sk_encoder.fit( + column_or_1d( + pd.concat([data['hdf_alice'][['class']], data['hdf_bob'][['class']]]) + ) + ) + expect_alice = sk_encoder.transform(column_or_1d(data['hdf_alice'][['class']]))[ + np.newaxis + ].T + np.testing.assert_equal(reveal(value.partitions[env.alice].data), expect_alice) + expect_bob = sk_encoder.transform(column_or_1d(data['hdf_bob'][['class']]))[ + np.newaxis + ].T + np.testing.assert_equal(reveal(value.partitions[env.bob].data), expect_bob) + assert value.dtypes['class'] == np.int64 + + def test_on_vdataframe_should_ok(self, prod_env_and_label_encoder_data): + env, data = prod_env_and_label_encoder_data + # GIVEN + encoder = LabelEncoder() + + # WHEN + value = encoder.fit_transform(data['vdf']['a2']) + params = encoder.get_params() + + # THEN + assert params + assert len(value.partitions) == 1 + sk_encoder = SkLabelEncoder() + expect_alice = sk_encoder.fit_transform( + column_or_1d(data['vdf_alice'][['a2']]) + )[np.newaxis].T + np.testing.assert_equal( + reveal(value.to_pandas().partitions[env.alice].data), expect_alice + ) + assert value.to_pandas().dtypes['a2'] == np.int64 + + def test_on_h_mixdataframe_should_ok(self, prod_env_and_label_encoder_data): + env, data = prod_env_and_label_encoder_data + # GIVEN + df = pd.DataFrame({'a1': ['A1', 'B1', None, 'D1', None, 'B4', 'C4', 'D4']}) + h_part0 = VDataFrame( + {env.alice: partition(data=env.alice(lambda: df.iloc[:4, :])())} + ) + h_part1 = VDataFrame( + {env.alice: partition(data=env.alice(lambda: df.iloc[4:, :])())} + ) + h_mix = MixDataFrame(partitions=[h_part0, h_part1]) + + encoder = LabelEncoder() + + # WHEN + value = encoder.fit_transform(h_mix) + params = encoder.get_params() + + # THEN + assert params + sk_encoder = SkLabelEncoder() + expected = sk_encoder.fit_transform(column_or_1d(df))[np.newaxis].T + np.testing.assert_equal( + pd.concat( + [ + reveal(value.partitions[0].partitions[env.alice].data), + reveal(value.partitions[1].partitions[env.alice].data), + ] + ), + expected, + ) + + def test_on_v_mixdataframe_should_ok(self, prod_env_and_label_encoder_data): + env, data = prod_env_and_label_encoder_data + # GIVEN + df = pd.DataFrame({'a1': ['A1', 'B1', None, 'D1', None, 'B4', 'C4', 'D4']}) + v_part0 = HDataFrame( + { + env.alice: partition(data=env.alice(lambda: df.iloc[:4, :])()), + env.bob: partition(data=env.bob(lambda: df.iloc[4:, :])()), + }, + aggregator=PlainAggregator(env.carol), + comparator=PlainComparator(env.carol), + ) + v_mix = MixDataFrame(partitions=[v_part0]) + + encoder = LabelEncoder() + + # WHEN + value = encoder.fit_transform(v_mix) + params = encoder.get_params() + + # THEN + assert params + sk_encoder = SkLabelEncoder() + expected = sk_encoder.fit_transform(column_or_1d(df))[np.newaxis].T + np.testing.assert_equal( + pd.concat( + [ + reveal(value.partitions[0].partitions[env.alice].data), + reveal(value.partitions[0].partitions[env.bob].data), + ] + ), + expected, + ) + + def test_on_hdataframe_more_than_one_column_should_error( + self, + prod_env_and_label_encoder_data, + ): + env, data = prod_env_and_label_encoder_data + with pytest.raises( + AssertionError, + match='DataFrame to encode should have one and only one column', + ): + encoder = LabelEncoder() + encoder.fit_transform(data['hdf']) + + def test_on_vdataframe_more_than_one_column_should_error( + self, + prod_env_and_label_encoder_data, + ): + env, data = prod_env_and_label_encoder_data + with pytest.raises( + AssertionError, + match='DataFrame to encode should have one and only one column', + ): + encoder = LabelEncoder() + encoder.fit_transform(data['vdf']) + + def test_should_error_when_not_dataframe(self, prod_env_and_label_encoder_data): + env, data = prod_env_and_label_encoder_data + encoder = LabelEncoder() + with pytest.raises( + AssertionError, match='Accepts HDataFrame/VDataFrame/MixDataFrame only' + ): + encoder.fit(['test']) + encoder.fit(data['vdf']['a2']) + with pytest.raises( + AssertionError, match='Accepts HDataFrame/VDataFrame/MixDataFrame only' + ): + encoder.transform(['test']) + + def test_transform_should_error_when_not_fit(self, prod_env_and_label_encoder_data): + env, data = prod_env_and_label_encoder_data + with pytest.raises(AssertionError, match='Encoder has not been fit yet.'): + LabelEncoder().transform('test') + + +@pytest.fixture(scope='module') +def prod_env_and_onehot_encoder_data(sf_production_setup_devices): + hdf = load_iris( + parts=[sf_production_setup_devices.alice, sf_production_setup_devices.bob], + aggregator=PlainAggregator(sf_production_setup_devices.alice), + comparator=PlainComparator(sf_production_setup_devices.alice), + ) + hdf_alice = reveal(hdf.partitions[sf_production_setup_devices.alice].data) + hdf_bob = reveal(hdf.partitions[sf_production_setup_devices.bob].data) + + vdf_alice = pd.DataFrame( + { + 'a1': ['K5', 'K1', None, 'K6'], + 'a2': ['A5', 'A5', 'A2', 'A2'], + 'a3': [5, 1, 2, 1], + } + ) + + vdf_bob = pd.DataFrame( + { + 'b4': [10.2, 20.5, None, -0.4], + 'b5': ['B3', 'B2', 'B3', 'B4'], + 'b6': [3, 1, 9, 4], + } + ) + + vdf = VDataFrame( + { + sf_production_setup_devices.alice: partition( + data=sf_production_setup_devices.alice( + lambda: pl.from_pandas(vdf_alice) + )(), + backend="polars", + ), + sf_production_setup_devices.bob: partition( + data=sf_production_setup_devices.bob(lambda: pl.from_pandas(vdf_bob))(), + backend="polars", + ), + } + ) + + yield sf_production_setup_devices, { + 'hdf': hdf, + 'hdf_alice': hdf_alice, + 'hdf_bob': hdf_bob, + 'vdf_alice': vdf_alice, + 'vdf_bob': vdf_bob, + 'vdf': vdf, + } + + +class TestOneHotEncoder: + def test_on_hdataframe_should_ok(self, prod_env_and_onehot_encoder_data): + env, data = prod_env_and_onehot_encoder_data + # GIVEN + encoder = OneHotEncoder() + + # WHEN + value = encoder.fit_transform(data['hdf']['class']) + params = encoder.get_params() + + # THEN + assert params + sk_encoder = SkOneHotEncoder() + sk_encoder.fit( + pd.concat([data['hdf_alice'][['class']], data['hdf_bob'][['class']]]) + ) + expect_alice = sk_encoder.transform(data['hdf_alice'][['class']]).toarray() + np.testing.assert_equal(reveal(value.partitions[env.alice].data), expect_alice) + expect_bob = sk_encoder.transform(data['hdf_bob'][['class']]).toarray() + np.testing.assert_equal(reveal(value.partitions[env.bob].data), expect_bob) + + def test_on_vdataframe_should_ok(self, prod_env_and_onehot_encoder_data): + env, data = prod_env_and_onehot_encoder_data + # GIVEN + encoder = OneHotEncoder() + + # WHEN + value = encoder.fit_transform(data['vdf'][['a1', 'a2', 'b5']]) + params = encoder.get_params() + + # THEN + assert params + sk_encoder = SkOneHotEncoder() + expect_alice = sk_encoder.fit_transform( + data['vdf_alice'][['a1', 'a2']] + ).toarray() + np.testing.assert_equal(reveal(value.partitions[env.alice].data), expect_alice) + expect_bob = sk_encoder.fit_transform(data['vdf_bob'][['b5']]).toarray() + np.testing.assert_equal(reveal(value.partitions[env.bob].data), expect_bob) + + def test_on_h_mixdataframe_should_ok(self, prod_env_and_onehot_encoder_data): + env, data = prod_env_and_onehot_encoder_data + # GIVEN + df_part0 = pd.DataFrame( + { + 'a1': ['A1', 'B1', None, 'D1', None, 'B4', 'C4', 'C4'], + 'a2': [5, 5, 2, 6, 15, None, 23, 6], + } + ) + + df_part1 = pd.DataFrame( + {'b4': [10.2, 20.5, None, -0.4, None, 0.5, None, -10.4]} + ) + h_part0 = VDataFrame( + { + env.alice: partition(data=env.alice(lambda: df_part0.iloc[:4, :])()), + env.bob: partition(data=env.bob(lambda: df_part1.iloc[:4, :])()), + } + ) + h_part1 = VDataFrame( + { + env.alice: partition(data=env.alice(lambda: df_part0.iloc[4:, :])()), + env.bob: partition(data=env.bob(lambda: df_part1.iloc[4:, :])()), + } + ) + h_mix = MixDataFrame(partitions=[h_part0, h_part1]) + + encoder = OneHotEncoder() + + # WHEN + value = encoder.fit_transform(h_mix) + params = encoder.get_params() + + # THEN + assert params + sk_encoder = SkOneHotEncoder() + expected = sk_encoder.fit_transform( + pd.concat([df_part0, df_part1], axis=1) + ).toarray() + np.testing.assert_equal( + pd.concat( + [ + pd.concat( + [ + reveal(value.partitions[0].partitions[env.alice].data), + reveal(value.partitions[1].partitions[env.alice].data), + ] + ), + pd.concat( + [ + reveal(value.partitions[0].partitions[env.bob].data), + reveal(value.partitions[1].partitions[env.bob].data), + ] + ), + ], + axis=1, + ), + expected, + ) + + def test_on_v_mixdataframe_should_ok(self, prod_env_and_onehot_encoder_data): + env, data = prod_env_and_onehot_encoder_data + # GIVEN + df_part0 = pd.DataFrame( + { + 'a1': ['A1', 'B1', None, 'D1', None, 'B4', 'C4', 'C4'], + 'a2': [5, 5, 2, 6, 15, None, 23, 6], + } + ) + + df_part1 = pd.DataFrame( + {'b4': [10.2, 20.5, None, -0.4, None, 0.5, None, -10.4]} + ) + v_part0 = HDataFrame( + { + env.alice: partition(data=env.alice(lambda: df_part0.iloc[:4, :])()), + env.bob: partition(data=env.bob(lambda: df_part0.iloc[4:, :])()), + }, + aggregator=PlainAggregator(env.carol), + comparator=PlainComparator(env.carol), + ) + v_part1 = HDataFrame( + { + env.alice: partition(data=env.alice(lambda: df_part1.iloc[:4, :])()), + env.bob: partition(data=env.bob(lambda: df_part1.iloc[4:, :])()), + }, + aggregator=PlainAggregator(env.carol), + comparator=PlainComparator(env.carol), + ) + v_mix = MixDataFrame(partitions=[v_part0, v_part1]) + + encoder = OneHotEncoder() + + # WHEN + value = encoder.fit_transform(v_mix) + params = encoder.get_params() + + # THEN + assert params + sk_encoder = SkOneHotEncoder() + expected = sk_encoder.fit_transform( + pd.concat([df_part0, df_part1], axis=1) + ).toarray() + np.testing.assert_equal( + pd.concat( + [ + pd.concat( + [ + reveal(value.partitions[0].partitions[env.alice].data), + reveal(value.partitions[0].partitions[env.bob].data), + ] + ), + pd.concat( + [ + reveal(value.partitions[1].partitions[env.alice].data), + reveal(value.partitions[1].partitions[env.bob].data), + ] + ), + ], + axis=1, + ), + expected, + ) + + def test_min_frequency_on_vdataframe_should_ok( + self, prod_env_and_onehot_encoder_data + ): + env, data = prod_env_and_onehot_encoder_data + # WHEN + selected_columns = ['a1', 'a2', 'b5'] + encoder = OneHotEncoder(min_frequency=2) + df = encoder.fit_transform(data['vdf'][selected_columns]) + params = encoder.get_params() + + # THEN + assert params + sk_encoder = SkOneHotEncoder(min_frequency=2) + expect_alice = pd.DataFrame( + sk_encoder.fit_transform(data['vdf_alice'][['a1', 'a2']]).toarray(), + columns=sk_encoder.get_feature_names_out(), + ) + assert set(expect_alice.columns).issubset(set(df.partitions[env.alice].columns)) + + alice_columns = expect_alice.columns + pd.testing.assert_frame_equal( + reveal(df.to_pandas().partitions[env.alice][alice_columns].data), + expect_alice, + ) + + expect_bob = pd.DataFrame( + sk_encoder.fit_transform(data['vdf_bob'][['b5']]).toarray(), + columns=sk_encoder.get_feature_names_out(), + ) + assert set(expect_bob.columns).issubset(set(df.partitions[env.bob].columns)) + + bob_columns = expect_bob.columns + pd.testing.assert_frame_equal( + reveal(df.to_pandas().partitions[env.bob][bob_columns].data), expect_bob + ) + + def test_max_categories_on_vdataframe_should_ok( + self, prod_env_and_onehot_encoder_data + ): + env, data = prod_env_and_onehot_encoder_data + # WHEN + selected_columns = ['a1', 'a2', 'b5'] + encoder = OneHotEncoder(max_categories=3) + df = encoder.fit_transform(data['vdf'][selected_columns]) + params = encoder.get_params() + + # THEN + assert params + sk_encoder = SkOneHotEncoder(max_categories=3) + expect_alice = pd.DataFrame( + sk_encoder.fit_transform(data['vdf_alice'][['a1', 'a2']]).toarray(), + columns=sk_encoder.get_feature_names_out(), + ) + assert set(expect_alice.columns).issubset(set(df.partitions[env.alice].columns)) + + alice_columns = expect_alice.columns + pd.testing.assert_frame_equal( + reveal(df.to_pandas().partitions[env.alice][alice_columns].data), + expect_alice, + ) + + expect_bob = pd.DataFrame( + sk_encoder.fit_transform(data['vdf_bob'][['b5']]).toarray(), + columns=sk_encoder.get_feature_names_out(), + ) + assert set(expect_bob.columns).issubset(set(df.partitions[env.bob].columns)) + + bob_columns = expect_bob.columns + pd.testing.assert_frame_equal( + reveal(df.to_pandas().partitions[env.bob][bob_columns].data), expect_bob + ) + + def test_should_error_on_hdataframe_with_args( + self, prod_env_and_onehot_encoder_data + ): + env, data = prod_env_and_onehot_encoder_data + encoder = OneHotEncoder(min_frequency=3) + with pytest.raises( + AssertionError, + match='Args min_frequency/max_categories are only supported in VDataFrame', + ): + encoder.fit_transform(data['hdf']) + + encoder = OneHotEncoder(max_categories=3) + with pytest.raises( + AssertionError, + match='Args min_frequency/max_categories are only supported in VDataFrame', + ): + encoder.fit_transform(data['hdf']) + + def test_should_error_when_not_dataframe(self, prod_env_and_onehot_encoder_data): + env, data = prod_env_and_onehot_encoder_data + encoder = OneHotEncoder() + with pytest.raises( + AssertionError, match='Accepts HDataFrame/VDataFrame/MixDataFrame only' + ): + encoder.fit(['test']) + encoder.fit(data['hdf']) + with pytest.raises( + AssertionError, match='Accepts HDataFrame/VDataFrame/MixDataFrame only' + ): + encoder.transform(['test']) + + def test_transform_should_error_when_not_fit( + self, prod_env_and_onehot_encoder_data + ): + env, data = prod_env_and_onehot_encoder_data + with pytest.raises(AssertionError, match='Encoder has not been fit yet.'): + OneHotEncoder().transform('test') diff --git a/tests/preprocessing/test_function_transformer.py b/tests/preprocessing/test_function_transformer.py index f7fc33887..05fde850d 100644 --- a/tests/preprocessing/test_function_transformer.py +++ b/tests/preprocessing/test_function_transformer.py @@ -6,7 +6,7 @@ from sklearn.preprocessing import FunctionTransformer as SkFunctionTransformer from secretflow import reveal -from secretflow.data.base import partition +from secretflow.data import partition from secretflow.data.horizontal.dataframe import HDataFrame from secretflow.data.mix.dataframe import MixDataFrame from secretflow.data.vertical.dataframe import VDataFrame diff --git a/tests/preprocessing/test_function_transformer_polars.py b/tests/preprocessing/test_function_transformer_polars.py new file mode 100644 index 000000000..673c4ed06 --- /dev/null +++ b/tests/preprocessing/test_function_transformer_polars.py @@ -0,0 +1,305 @@ +from functools import partial + +import numpy as np +import pandas as pd +import polars as pl +import pytest +from sklearn.preprocessing import FunctionTransformer as SkFunctionTransformer + +from secretflow import reveal +from secretflow.data import partition +from secretflow.data.horizontal.dataframe import HDataFrame +from secretflow.data.mix.dataframe import MixDataFrame +from secretflow.data.vertical.dataframe import VDataFrame +from secretflow.preprocessing import LogroundTransformer +from secretflow.preprocessing.transformer import _FunctionTransformer +from secretflow.security.aggregation.plain_aggregator import PlainAggregator +from secretflow.security.compare.plain_comparator import PlainComparator +from secretflow.utils.simulation.datasets import load_iris + + +@pytest.fixture(scope='module') +def prod_env_and_data(sf_production_setup_devices): + hdf = load_iris( + parts=[sf_production_setup_devices.alice, sf_production_setup_devices.bob], + aggregator=PlainAggregator(sf_production_setup_devices.alice), + comparator=PlainComparator(sf_production_setup_devices.carol), + ) + hdf_alice = reveal(hdf.partitions[sf_production_setup_devices.alice].data) + hdf_bob = reveal(hdf.partitions[sf_production_setup_devices.bob].data) + + vdf_alice = pd.DataFrame( + { + 'a1': ['K5', 'K1', None, 'K6'], + 'a2': ['A5', 'A1', 'A2', 'A6'], + 'a3': [5, 1, 2, 6], + } + ) + + vdf_bob = pd.DataFrame( + { + 'b4': [10.2, 20.5, None, -0.4], + 'b5': ['B3', None, 'B9', 'B4'], + 'b6': [3, 1, 9, 4], + } + ) + + vdf = VDataFrame( + { + sf_production_setup_devices.alice: partition( + data=sf_production_setup_devices.alice( + lambda: pl.from_pandas(vdf_alice) + )(), + backend="polars", + ), + sf_production_setup_devices.bob: partition( + data=sf_production_setup_devices.bob(lambda: pl.from_pandas(vdf_bob))(), + backend="polars", + ), + } + ) + + yield sf_production_setup_devices, { + 'hdf': hdf, + 'hdf_alice': hdf_alice, + 'hdf_bob': hdf_bob, + 'vdf_alice': vdf_alice, + 'vdf_bob': vdf_bob, + 'vdf': vdf, + } + + +def test_on_vdataframe_should_ok(prod_env_and_data): + env, data = prod_env_and_data + # GIVEN + transformer = _FunctionTransformer(partial(np.add, 1)) + + # WHEN + value = transformer.fit_transform(data['vdf'][['a3', 'b4', 'b6']]) + params = transformer.get_params() + + # THEN + assert params + sk_transformer = SkFunctionTransformer(partial(np.add, 1)) + expect_alice = sk_transformer.fit_transform(data['vdf_alice'][['a3']]) + pd.testing.assert_frame_equal( + reveal(value.to_pandas().partitions[env.alice].data), expect_alice + ) + + expect_bob = sk_transformer.fit_transform(data['vdf_bob'][['b4', 'b6']]) + pd.testing.assert_frame_equal( + reveal(value.to_pandas().partitions[env.bob].data), expect_bob + ) + + +def test_on_h_mixdataframe_should_ok(prod_env_and_data): + env, data = prod_env_and_data + # GIVEN + df_part0 = pd.DataFrame( + { + 'a1': ['A1', 'B1', None, 'D1', None, 'B4', 'C4', 'D4'], + 'a2': ['A2', 'B2', 'C2', 'D2', 'A5', 'B5', 'C5', 'D5'], + 'a3': [5, 1, 2, 6, 15, None, 23, 6], + } + ) + + df_part1 = pd.DataFrame( + { + 'b4': [10.2, 20.5, None, -0.4, None, 0.5, None, -10.4], + 'b5': ['B3', None, 'B9', 'B4', 'A3', None, 'C9', 'E4'], + 'b6': [3, 1, 9, 4, 31, 12, 9, 21], + } + ) + h_part0 = VDataFrame( + { + env.alice: partition(data=env.alice(lambda: df_part0.iloc[:4, :])()), + env.bob: partition(data=env.bob(lambda: df_part1.iloc[:4, :])()), + } + ) + h_part1 = VDataFrame( + { + env.alice: partition(data=env.alice(lambda: df_part0.iloc[4:, :])()), + env.bob: partition(data=env.bob(lambda: df_part1.iloc[4:, :])()), + } + ) + h_mix = MixDataFrame(partitions=[h_part0, h_part1]) + + transformer = _FunctionTransformer(partial(np.add, 1)) + + # WHEN + value = transformer.fit_transform(h_mix[['a3', 'b4', 'b6']]) + params = transformer.get_params() + + # THEN + assert params + sk_transformer = SkFunctionTransformer(partial(np.add, 1)) + expect_alice = sk_transformer.fit_transform(df_part0[['a3']]) + pd.testing.assert_frame_equal( + pd.concat( + [ + reveal(value.partitions[0].partitions[env.alice].data), + reveal(value.partitions[1].partitions[env.alice].data), + ] + ), + expect_alice, + ) + expect_bob = sk_transformer.fit_transform(df_part1[['b4', 'b6']]) + pd.testing.assert_frame_equal( + pd.concat( + [ + reveal(value.partitions[0].partitions[env.bob].data), + reveal(value.partitions[1].partitions[env.bob].data), + ] + ), + expect_bob, + ) + + +def test_on_v_mixdataframe_should_ok(prod_env_and_data): + env, data = prod_env_and_data + # GIVEN + df_part0 = pd.DataFrame( + { + 'a1': ['A1', 'B1', None, 'D1', None, 'B4', 'C4', 'D4'], + 'a2': ['A2', 'B2', 'C2', 'D2', 'A5', 'B5', 'C5', 'D5'], + 'a3': [5, 1, 2, 6, 15, None, 23, 6], + } + ) + + df_part1 = pd.DataFrame( + { + 'b4': [10.2, 20.5, None, -0.4, None, 0.5, None, -10.4], + 'b5': ['B3', None, 'B9', 'B4', 'A3', None, 'C9', 'E4'], + 'b6': [3, 1, 9, 4, 31, 12, 9, 21], + } + ) + v_part0 = HDataFrame( + { + env.alice: partition(data=env.alice(lambda: df_part0.iloc[:4, :])()), + env.bob: partition(data=env.bob(lambda: df_part0.iloc[4:, :])()), + }, + aggregator=PlainAggregator(env.carol), + comparator=PlainComparator(env.carol), + ) + v_part1 = HDataFrame( + { + env.alice: partition(data=env.alice(lambda: df_part1.iloc[:4, :])()), + env.bob: partition(data=env.bob(lambda: df_part1.iloc[4:, :])()), + }, + aggregator=PlainAggregator(env.carol), + comparator=PlainComparator(env.carol), + ) + v_mix = MixDataFrame(partitions=[v_part0, v_part1]) + + transformer = _FunctionTransformer(partial(np.add, 1)) + + # WHEN + value = transformer.fit_transform(v_mix[['a3', 'b4', 'b6']]) + params = transformer.get_params() + + # THEN + assert params + sk_transformer = SkFunctionTransformer(partial(np.add, 1)) + expect_alice = sk_transformer.fit_transform(df_part0[['a3']]) + pd.testing.assert_frame_equal( + pd.concat( + [ + reveal(value.partitions[0].partitions[env.alice].data), + reveal(value.partitions[0].partitions[env.bob].data), + ] + ), + expect_alice, + ) + expect_bob = sk_transformer.fit_transform(df_part1[['b4', 'b6']]) + pd.testing.assert_frame_equal( + pd.concat( + [ + reveal(value.partitions[1].partitions[env.alice].data), + reveal(value.partitions[1].partitions[env.bob].data), + ] + ), + expect_bob, + ) + + +def test_should_error_when_not_dataframe(prod_env_and_data): + env, data = prod_env_and_data + transformer = _FunctionTransformer(partial(np.add, 1)) + with pytest.raises( + AssertionError, match='Accepts HDataFrame/VDataFrame/MixDataFrame only' + ): + transformer.fit(['test']) + transformer.fit(data['vdf']['a3']) + with pytest.raises( + AssertionError, match='Accepts HDataFrame/VDataFrame/MixDataFrame only' + ): + transformer.transform('test') + + +def test_transform_should_ok_when_not_fit(prod_env_and_data): + env, data = prod_env_and_data + # GIVEN + selected_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'] + transformer = _FunctionTransformer(partial(np.add, 1)) + + # WHEN + value = transformer.transform(data['hdf'][selected_cols]) + + # THEN + assert value + + +def test_loground_on_vdataframe_should_ok(prod_env_and_data): + env, data = prod_env_and_data + # GIVEN + transformer = LogroundTransformer(decimals=2, bias=1) + + # WHEN + value = transformer.fit_transform(data['vdf'][['a3', 'b4', 'b6']]) + params = transformer.get_params() + + # THEN + assert params + + def loground(x: pd.DataFrame): + return x.add(1).apply(np.log2).round(2) + + sk_transformer = SkFunctionTransformer(loground) + expect_alice = sk_transformer.fit_transform(data['vdf_alice'][['a3']]) + pd.testing.assert_frame_equal( + reveal(value.to_pandas().partitions[env.alice].data), expect_alice + ) + + expect_bob = sk_transformer.fit_transform(data['vdf_bob'][['b4', 'b6']]) + pd.testing.assert_frame_equal( + reveal(value.to_pandas().partitions[env.bob].data), expect_bob + ) + + +def test_loground_on_hdataframe_should_ok(prod_env_and_data): + env, data = prod_env_and_data + # GIVEN + selected_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'] + transformer = LogroundTransformer(decimals=2, bias=1) + + # WHEN + value = transformer.fit_transform(data['hdf'][selected_cols]) + params = transformer.get_params() + + # THEN + assert params + + def loground(x: pd.DataFrame): + return x.add(1).apply(np.log2).round(2) + + sk_transformer = SkFunctionTransformer(loground) + sk_transformer.fit( + pd.concat([data['hdf_alice'][selected_cols], data['hdf_bob'][selected_cols]]) + ) + expect_alice = sk_transformer.transform(data['hdf_alice'][selected_cols]) + pd.testing.assert_frame_equal( + reveal(value.partitions[env.alice].data), + expect_alice, + ) + expect_bob = sk_transformer.transform(data['hdf_bob'][selected_cols]) + pd.testing.assert_frame_equal(reveal(value.partitions[env.bob].data), expect_bob) diff --git a/tests/preprocessing/test_minmax_scaler.py b/tests/preprocessing/test_minmax_scaler.py index bd0ad42f5..95f7faf26 100644 --- a/tests/preprocessing/test_minmax_scaler.py +++ b/tests/preprocessing/test_minmax_scaler.py @@ -4,7 +4,7 @@ from sklearn.preprocessing import MinMaxScaler as SkMinMaxScaler from secretflow import reveal -from secretflow.data.base import partition +from secretflow.data import partition from secretflow.data.horizontal.dataframe import HDataFrame from secretflow.data.mix.dataframe import MixDataFrame from secretflow.data.vertical.dataframe import VDataFrame diff --git a/tests/preprocessing/test_minmax_scaler_polars.py b/tests/preprocessing/test_minmax_scaler_polars.py new file mode 100644 index 000000000..e8ed19373 --- /dev/null +++ b/tests/preprocessing/test_minmax_scaler_polars.py @@ -0,0 +1,264 @@ +import numpy as np +import pandas as pd +import polars as pl +import pytest +from sklearn.preprocessing import MinMaxScaler as SkMinMaxScaler + +from secretflow import reveal +from secretflow.data import partition +from secretflow.data.horizontal.dataframe import HDataFrame +from secretflow.data.mix.dataframe import MixDataFrame +from secretflow.data.vertical.dataframe import VDataFrame +from secretflow.preprocessing.scaler import MinMaxScaler +from secretflow.security.aggregation.plain_aggregator import PlainAggregator +from secretflow.security.compare.plain_comparator import PlainComparator +from secretflow.utils.simulation.datasets import load_iris + + +@pytest.fixture(scope='module') +def prod_env_and_data(sf_production_setup_devices): + hdf = load_iris( + parts=[sf_production_setup_devices.alice, sf_production_setup_devices.bob], + aggregator=PlainAggregator(sf_production_setup_devices.alice), + comparator=PlainComparator(sf_production_setup_devices.carol), + ) + hdf_alice = reveal(hdf.partitions[sf_production_setup_devices.alice].data) + hdf_bob = reveal(hdf.partitions[sf_production_setup_devices.bob].data) + + vdf_alice = pd.DataFrame( + { + 'a1': ['K5', 'K1', None, 'K6'], + 'a2': ['A5', 'A1', 'A2', 'A6'], + 'a3': [5, 1, 2, 6], + } + ) + + vdf_bob = pd.DataFrame( + { + 'b4': [10.2, 20.5, None, -0.4], + 'b5': ['B3', None, 'B9', 'B4'], + 'b6': [3, 1, 9, 4], + } + ) + + vdf = VDataFrame( + { + sf_production_setup_devices.alice: partition( + data=sf_production_setup_devices.alice( + lambda: pl.from_pandas(vdf_alice) + )(), + backend="polars", + ), + sf_production_setup_devices.bob: partition( + data=sf_production_setup_devices.bob(lambda: pl.from_pandas(vdf_bob))(), + backend="polars", + ), + } + ) + + yield sf_production_setup_devices, { + 'hdf': hdf, + 'hdf_alice': hdf_alice, + 'hdf_bob': hdf_bob, + 'vdf_alice': vdf_alice, + 'vdf_bob': vdf_bob, + 'vdf': vdf, + } + + +def test_on_hdataframe_should_ok(prod_env_and_data): + env, data = prod_env_and_data + # GIVEN + selected_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'] + scaler = MinMaxScaler() + + # WHEN + value = scaler.fit_transform(data['hdf'][selected_cols]) + params = scaler.get_params() + + # THEN + assert params + sk_scaler = SkMinMaxScaler() + sk_scaler.fit( + pd.concat([data['hdf_alice'][selected_cols], data['hdf_bob'][selected_cols]]) + ) + expect_alice = sk_scaler.transform(data['hdf_alice'][selected_cols]) + np.testing.assert_almost_equal( + reveal(value.partitions[env.alice].data), + expect_alice, + ) + expect_bob = sk_scaler.transform(data['hdf_bob'][selected_cols]) + np.testing.assert_almost_equal(reveal(value.partitions[env.bob].data), expect_bob) + + +def test_on_vdataframe_should_ok(prod_env_and_data): + env, data = prod_env_and_data + # GIVEN + scaler = MinMaxScaler() + + # WHEN + value = scaler.fit_transform(data['vdf'][['a3', 'b4', 'b6']]) + params = scaler.get_params() + + # THEN + assert params + sk_scaler = SkMinMaxScaler() + expect_alice = sk_scaler.fit_transform(data['vdf_alice'][['a3']]) + np.testing.assert_equal( + reveal(value.to_pandas().partitions[env.alice].data), expect_alice + ) + + expect_bob = sk_scaler.fit_transform(data['vdf_bob'][['b4', 'b6']]) + np.testing.assert_equal( + reveal(value.to_pandas().partitions[env.bob].data), expect_bob + ) + + +def test_on_h_mixdataframe_should_ok(prod_env_and_data): + env, data = prod_env_and_data + # GIVEN + df_part0 = pd.DataFrame( + { + 'a1': ['A1', 'B1', None, 'D1', None, 'B4', 'C4', 'D4'], + 'a2': ['A2', 'B2', 'C2', 'D2', 'A5', 'B5', 'C5', 'D5'], + 'a3': [5, 1, 2, 6, 15, None, 23, 6], + } + ) + + df_part1 = pd.DataFrame( + { + 'b4': [10.2, 20.5, None, -0.4, None, 0.5, None, -10.4], + 'b5': ['B3', None, 'B9', 'B4', 'A3', None, 'C9', 'E4'], + 'b6': [3, 1, 9, 4, 31, 12, 9, 21], + } + ) + h_part0 = VDataFrame( + { + env.alice: partition(data=env.alice(lambda: df_part0.iloc[:4, :])()), + env.bob: partition(data=env.bob(lambda: df_part1.iloc[:4, :])()), + } + ) + h_part1 = VDataFrame( + { + env.alice: partition(data=env.alice(lambda: df_part0.iloc[4:, :])()), + env.bob: partition(data=env.bob(lambda: df_part1.iloc[4:, :])()), + } + ) + h_mix = MixDataFrame(partitions=[h_part0, h_part1]) + + scaler = MinMaxScaler() + + # WHEN + value = scaler.fit_transform(h_mix[['a3', 'b4', 'b6']]) + params = scaler.get_params() + + # THEN + assert params + sk_scaler = SkMinMaxScaler() + expect_alice = sk_scaler.fit_transform(df_part0[['a3']]) + np.testing.assert_equal( + pd.concat( + [ + reveal(value.partitions[0].partitions[env.alice].data), + reveal(value.partitions[1].partitions[env.alice].data), + ] + ), + expect_alice, + ) + expect_bob = sk_scaler.fit_transform(df_part1[['b4', 'b6']]) + np.testing.assert_equal( + pd.concat( + [ + reveal(value.partitions[0].partitions[env.bob].data), + reveal(value.partitions[1].partitions[env.bob].data), + ] + ), + expect_bob, + ) + + +def test_on_v_mixdataframe_should_ok(prod_env_and_data): + env, data = prod_env_and_data + # GIVEN + df_part0 = pd.DataFrame( + { + 'a1': ['A1', 'B1', None, 'D1', None, 'B4', 'C4', 'D4'], + 'a2': ['A2', 'B2', 'C2', 'D2', 'A5', 'B5', 'C5', 'D5'], + 'a3': [5, 1, 2, 6, 15, None, 23, 6], + } + ) + + df_part1 = pd.DataFrame( + { + 'b4': [10.2, 20.5, None, -0.4, None, 0.5, None, -10.4], + 'b5': ['B3', None, 'B9', 'B4', 'A3', None, 'C9', 'E4'], + 'b6': [3, 1, 9, 4, 31, 12, 9, 21], + } + ) + v_part0 = HDataFrame( + { + env.alice: partition(data=env.alice(lambda: df_part0.iloc[:4, :])()), + env.bob: partition(data=env.bob(lambda: df_part0.iloc[4:, :])()), + }, + aggregator=PlainAggregator(env.carol), + comparator=PlainComparator(env.carol), + ) + v_part1 = HDataFrame( + { + env.alice: partition(data=env.alice(lambda: df_part1.iloc[:4, :])()), + env.bob: partition(data=env.bob(lambda: df_part1.iloc[4:, :])()), + }, + aggregator=PlainAggregator(env.carol), + comparator=PlainComparator(env.carol), + ) + v_mix = MixDataFrame(partitions=[v_part0, v_part1]) + + scaler = MinMaxScaler() + + # WHEN + value = scaler.fit_transform(v_mix[['a3', 'b4', 'b6']]) + params = scaler.get_params() + + # THEN + assert params + sk_scaler = SkMinMaxScaler() + expect_alice = sk_scaler.fit_transform(df_part0[['a3']]) + np.testing.assert_equal( + pd.concat( + [ + reveal(value.partitions[0].partitions[env.alice].data), + reveal(value.partitions[0].partitions[env.bob].data), + ] + ), + expect_alice, + ) + expect_bob = sk_scaler.fit_transform(df_part1[['b4', 'b6']]) + np.testing.assert_almost_equal( + pd.concat( + [ + reveal(value.partitions[1].partitions[env.alice].data), + reveal(value.partitions[1].partitions[env.bob].data), + ] + ), + expect_bob, + ) + + +def test_should_error_when_not_dataframe(prod_env_and_data): + env, data = prod_env_and_data + scaler = MinMaxScaler() + with pytest.raises( + AssertionError, match='Accepts HDataFrame/VDataFrame/MixDataFrame only' + ): + scaler.fit(['test']) + scaler.fit(data['vdf']['a3']) + with pytest.raises( + AssertionError, match='Accepts HDataFrame/VDataFrame/MixDataFrame only' + ): + scaler.transform('test') + + +def test_transform_should_error_when_not_fit(prod_env_and_data): + env, data = prod_env_and_data + with pytest.raises(AssertionError, match='Scaler has not been fit yet.'): + MinMaxScaler().transform('test') diff --git a/tests/preprocessing/test_standard_scaler.py b/tests/preprocessing/test_standard_scaler.py index 2e3fc5e8c..772cea9c0 100644 --- a/tests/preprocessing/test_standard_scaler.py +++ b/tests/preprocessing/test_standard_scaler.py @@ -4,7 +4,7 @@ from sklearn.preprocessing import StandardScaler as SkStandardScaler from secretflow import reveal -from secretflow.data.base import partition +from secretflow.data import partition from secretflow.data.horizontal.dataframe import HDataFrame from secretflow.data.mix.dataframe import MixDataFrame from secretflow.data.vertical.dataframe import VDataFrame diff --git a/tests/preprocessing/test_vert_binning.py b/tests/preprocessing/test_vert_binning.py index 83b047259..456a222f2 100644 --- a/tests/preprocessing/test_vert_binning.py +++ b/tests/preprocessing/test_vert_binning.py @@ -5,9 +5,10 @@ import pytest import secretflow.distributed as sfd -from secretflow.data.base import partition +from secretflow.data import partition from secretflow.data.vertical.dataframe import VDataFrame from secretflow.device.driver import reveal +from secretflow.preprocessing.binning.vert_binning import VertBinning from secretflow.preprocessing.binning.vert_woe_binning import VertWoeBinning from secretflow.utils.simulation.datasets import dataset @@ -130,6 +131,7 @@ def test_binning_nan_chi(prod_env_and_data): label_name="y", chimerge_target_bins=4, ) + assert he_report.keys() == ss_report.keys() ss_alice = reveal(ss_report[env.alice]) he_alice = reveal(he_report[env.alice]) @@ -150,6 +152,8 @@ def test_binning_nan(prod_env_and_data): env, data = prod_env_and_data he_binning = VertWoeBinning(env.heu) ss_binning = VertWoeBinning(env.spu) + vert_binning = VertBinning() + he_report = he_binning.binning( data['v_nan_data'], bin_names={env.alice: ["f1", "f3", "f2"], env.bob: ["f1", "f3", "f2"]}, @@ -164,7 +168,17 @@ def test_binning_nan(prod_env_and_data): bin_names={env.alice: ["f1", "f3", "f2"], env.bob: ["f1", "f3", "f2"]}, label_name="y", ) + + vert_binning_report = vert_binning.binning( + data['v_nan_data'], + binning_method="eq_range", + bin_names={env.alice: ["f1", "f3", "f2"], env.bob: ["f1", "f3", "f2"]}, + ) + assert he_report.keys() == vert_binning_report.keys() assert he_report.keys() == ss_report.keys() + + print(reveal(vert_binning_report[env.alice])) + ss_alice = reveal(ss_report[env.alice]) he_alice = reveal(he_report[env.alice]) ss_bob = reveal(ss_report[env.bob]) @@ -201,6 +215,9 @@ def test_binning_normal(prod_env_and_data): env, data = prod_env_and_data he_binning = VertWoeBinning(env.heu) ss_binning = VertWoeBinning(env.spu) + + vert_binning = VertBinning() + he_report = he_binning.binning( data['v_float_data'], bin_names={env.alice: ["x1", "x2", "x3"], env.bob: ["x1", "x2", "x3"]}, @@ -211,6 +228,15 @@ def test_binning_normal(prod_env_and_data): bin_names={env.alice: ["x1", "x2", "x3"], env.bob: ["x1", "x2", "x3"]}, label_name="y", ) + + vert_binning_report = vert_binning.binning( + data['v_nan_data'], + binning_method="eq_range", + bin_names={env.alice: ["f1", "f3", "f2"], env.bob: ["f1", "f3", "f2"]}, + ) + assert he_report.keys() == vert_binning_report.keys() + + print(reveal(vert_binning_report[env.alice])) assert he_report.keys() == ss_report.keys() ss_alice = reveal(ss_report[env.alice]) he_alice = reveal(he_report[env.alice]) diff --git a/tests/preprocessing/test_vert_substitution.py b/tests/preprocessing/test_vert_substitution.py index 55636172d..0b981d3c6 100644 --- a/tests/preprocessing/test_vert_substitution.py +++ b/tests/preprocessing/test_vert_substitution.py @@ -4,11 +4,12 @@ import pandas as pd import pytest -from secretflow.data.base import partition +from secretflow.data import partition from secretflow.data.vertical.dataframe import VDataFrame from secretflow.device.driver import reveal +from secretflow.preprocessing.binning.vert_bin_substitution import VertBinSubstitution +from secretflow.preprocessing.binning.vert_binning import VertBinning from secretflow.preprocessing.binning.vert_woe_binning import VertWoeBinning -from secretflow.preprocessing.binning.vert_woe_substitution import VertWOESubstitution from secretflow.utils.simulation.datasets import dataset @@ -101,7 +102,7 @@ def prod_env_and_data(sf_production_setup_devices): def test_binning_nan(prod_env_and_data): env, data = prod_env_and_data ss_binning = VertWoeBinning(env.spu) - woe_rules = ss_binning.binning( + bin_rules = ss_binning.binning( data['v_nan_data'], binning_method="chimerge", bin_names={env.alice: ["f1", "f3", "f2"], env.bob: ["f1", "f3", "f2"]}, @@ -109,44 +110,97 @@ def test_binning_nan(prod_env_and_data): chimerge_target_bins=4, ) - woe_sub = VertWOESubstitution() - sub_data = woe_sub.substitution(data['v_nan_data'], woe_rules) + woe_sub = VertBinSubstitution() + sub_data = woe_sub.substitution(data['v_nan_data'], bin_rules) alice_data = reveal(sub_data.partitions[env.alice].data).drop("y", axis=1) bob_data = reveal(sub_data.partitions[env.bob].data) - rules = {v['name']: v for v in reveal(woe_rules[env.alice])["variables"]} + rules = {v['name']: v for v in reveal(bin_rules[env.alice])["variables"]} assert alice_data.equals(bob_data), str(alice_data) + "\n,,,,,,\n" + str(bob_data) f1_categories = list(set(alice_data['f1'])) - assert np.isin(rules['f1']['woes'], f1_categories).all(), ( - str(rules['f1']['woes']) + "\n,,,,,,\n" + str(f1_categories) + assert np.isin(rules['f1']['filling_values'], f1_categories).all(), ( + str(rules['f1']['filling_values']) + "\n,,,,,,\n" + str(f1_categories) ) - assert rules['f1']['else_woe'] in f1_categories + assert rules['f1']['else_filling_value'] in f1_categories f2_categories = list(set(alice_data['f2'])) - assert np.isin(f2_categories, rules['f2']['woes']).all() + assert np.isin(f2_categories, rules['f2']['filling_values']).all() f3_categories = list(set(alice_data['f3'])) - assert np.isin(rules['f3']['woes'], f3_categories).all() - assert rules['f3']['else_woe'] in f3_categories + assert np.isin(rules['f3']['filling_values'], f3_categories).all() + assert rules['f3']['else_filling_value'] in f3_categories + + +def test_binning_nan_vert_binning(prod_env_and_data): + env, data = prod_env_and_data + vert_binning = VertBinning() + + rules = vert_binning.binning( + data['v_nan_data'], + binning_method="eq_range", + bin_names={env.alice: ["f1", "f3", "f2"], env.bob: ["f1", "f3", "f2"]}, + ) + + bin_sub = VertBinSubstitution() + sub_data = bin_sub.substitution(data['v_nan_data'], rules) + alice_data = reveal(sub_data.partitions[env.alice].data).drop("y", axis=1) + bob_data = reveal(sub_data.partitions[env.bob].data) + rules = {v['name']: v for v in reveal(rules[env.alice])["variables"]} + + assert alice_data.equals(bob_data), str(alice_data) + "\n,,,,,,\n" + str(bob_data) + f1_categories = list(set(alice_data['f1'])) + assert np.isin(rules['f1']['filling_values'], f1_categories).all(), ( + str(rules['f1']['filling_values']) + "\n,,,,,,\n" + str(f1_categories) + ) + assert rules['f1']['else_filling_value'] == -1 + f2_categories = list(set(alice_data['f2'])) + assert np.isin(f2_categories, rules['f2']['filling_values']).all() + f3_categories = list(set(alice_data['f3'])) + assert np.isin(rules['f3']['filling_values'], f3_categories).all() + assert rules['f3']['else_filling_value'] == -1 + + +def test_binning_normal_vert_binning(prod_env_and_data): + env, data = prod_env_and_data + vert_binning = VertBinning() + + rules = vert_binning.binning( + data['v_float_data'], + binning_method="eq_range", + bin_names={env.alice: ["x1", "x2", "x3"], env.bob: ["x1", "x2", "x3"]}, + ) + sub = VertBinSubstitution() + sub_data = sub.substitution(data['v_float_data'], rules) + alice_data = reveal(sub_data.partitions[env.alice].data).drop("y", axis=1) + bob_data = reveal(sub_data.partitions[env.bob].data) + rules = {v['name']: v for v in reveal(rules[env.alice])["variables"]} + + assert alice_data.equals(bob_data), str(alice_data) + "\n,,,,,,\n" + str(bob_data) + f1_categories = list(set(alice_data['x1'])) + assert np.isin(rules['x1']['filling_values'], f1_categories).all() + f2_categories = list(set(alice_data['x2'])) + assert np.isin(f2_categories, rules['x2']['filling_values']).all() + f3_categories = list(set(alice_data['x3'])) + assert np.isin(rules['x3']['filling_values'], f3_categories).all() def test_binning_normal(prod_env_and_data): env, data = prod_env_and_data ss_binning = VertWoeBinning(env.spu) - woe_rules = ss_binning.binning( + bin_rules = ss_binning.binning( data['v_float_data'], bin_names={env.alice: ["x1", "x2", "x3"], env.bob: ["x1", "x2", "x3"]}, label_name="y", ) - woe_sub = VertWOESubstitution() - sub_data = woe_sub.substitution(data['v_float_data'], woe_rules) + woe_sub = VertBinSubstitution() + sub_data = woe_sub.substitution(data['v_float_data'], bin_rules) alice_data = reveal(sub_data.partitions[env.alice].data).drop("y", axis=1) bob_data = reveal(sub_data.partitions[env.bob].data) - rules = {v['name']: v for v in reveal(woe_rules[env.alice])["variables"]} + rules = {v['name']: v for v in reveal(bin_rules[env.alice])["variables"]} assert alice_data.equals(bob_data), str(alice_data) + "\n,,,,,,\n" + str(bob_data) f1_categories = list(set(alice_data['x1'])) - assert np.isin(rules['x1']['woes'], f1_categories).all() + assert np.isin(rules['x1']['filling_values'], f1_categories).all() f2_categories = list(set(alice_data['x2'])) - assert np.isin(f2_categories, rules['x2']['woes']).all() + assert np.isin(f2_categories, rules['x2']['filling_values']).all() f3_categories = list(set(alice_data['x3'])) - assert np.isin(rules['x3']['woes'], f3_categories).all() + assert np.isin(rules['x3']['filling_values'], f3_categories).all() diff --git a/tests/stats/test_biclassification_eval.py b/tests/stats/test_biclassification_eval.py index af5ebf026..dd01aef37 100644 --- a/tests/stats/test_biclassification_eval.py +++ b/tests/stats/test_biclassification_eval.py @@ -5,7 +5,7 @@ from secretflow import reveal from secretflow.data import FedNdarray, PartitionWay -from secretflow.data.base import partition +from secretflow.data import partition from secretflow.data.vertical import VDataFrame from secretflow.stats import BiClassificationEval diff --git a/tests/stats/test_prediction_bias.py b/tests/stats/test_prediction_bias.py index ece9c8364..a5fecc84d 100644 --- a/tests/stats/test_prediction_bias.py +++ b/tests/stats/test_prediction_bias.py @@ -6,7 +6,7 @@ from secretflow import reveal from secretflow.data import FedNdarray, PartitionWay -from secretflow.data.base import partition +from secretflow.data import partition from secretflow.data.vertical import VDataFrame from secretflow.stats import prediction_bias_eval from secretflow.stats.core.prediction_bias_core import PredictionBiasBucketMethod diff --git a/tests/stats/test_psi.py b/tests/stats/test_psi.py index 301ecf7da..0663614fe 100644 --- a/tests/stats/test_psi.py +++ b/tests/stats/test_psi.py @@ -5,7 +5,7 @@ from secretflow import reveal from secretflow.data import FedNdarray, PartitionWay -from secretflow.data.base import partition +from secretflow.data import partition from secretflow.data.vertical import VDataFrame from secretflow.stats import psi_eval from secretflow.stats.core.utils import equal_range diff --git a/tests/stats/test_ss_pvalue.py b/tests/stats/test_ss_pvalue.py index 341b92645..5e9fcf221 100644 --- a/tests/stats/test_ss_pvalue.py +++ b/tests/stats/test_ss_pvalue.py @@ -23,7 +23,7 @@ from sklearn import linear_model from sklearn.preprocessing import StandardScaler -from secretflow.data.base import partition +from secretflow.data import partition from secretflow.data.vertical import VDataFrame from secretflow.ml.linear import LinearModel, RegType, SSRegression from secretflow.stats import SSPValue diff --git a/tests/stats/test_ss_vif_v.py b/tests/stats/test_ss_vif_v.py index 1a67795e5..4a4c03ec2 100644 --- a/tests/stats/test_ss_vif_v.py +++ b/tests/stats/test_ss_vif_v.py @@ -3,7 +3,7 @@ import sklearn from statsmodels.stats.outliers_influence import variance_inflation_factor as vif -from secretflow.data.base import partition +from secretflow.data import partition from secretflow.data.vertical import VDataFrame from secretflow.device.driver import reveal from secretflow.stats import SSVertVIF diff --git a/tests/stats/test_table_statistics.py b/tests/stats/test_table_statistics.py index 3e9ddd970..b0c24c944 100644 --- a/tests/stats/test_table_statistics.py +++ b/tests/stats/test_table_statistics.py @@ -2,7 +2,7 @@ import pytest from sklearn.datasets import load_iris -from secretflow.data.base import partition +from secretflow.data import partition from secretflow.data.vertical.dataframe import VDataFrame from secretflow.stats import table_statistics