diff --git a/.github/workflows/build-test-graph.yml b/.github/workflows/build-test-graph.yml
index 015988b253..b99090fdf8 100644
--- a/.github/workflows/build-test-graph.yml
+++ b/.github/workflows/build-test-graph.yml
@@ -16,11 +16,15 @@ name: Vineyard Graph CI
 on:
   push:
+    branches:
+      - main
     paths:
-      - modules/graph
+      - 'modules/graph'
   pull_request:
+    branches:
+      - main
     paths:
-      - modules/graph
+      - 'modules/graph'
 
 concurrency:
   group: ${{ github.repository }}-${{ github.event.number || github.head_ref || github.sha }}-${{ github.workflow }}
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
index 8d3403053e..11c550fd8c 100644
--- a/.github/workflows/docs.yaml
+++ b/.github/workflows/docs.yaml
@@ -21,11 +21,6 @@ on:
       - main
       - docs
       - dev/docs
-  pull_request:
-    branches:
-      - main
-      - docs
-      - dev/docs
 
 concurrency:
   group: ${{ github.repository }}-${{ github.event.number || github.head_ref || github.sha }}-${{ github.workflow }}
diff --git a/docs/notes/cloud-native/vineyard-operator.rst b/docs/notes/cloud-native/vineyard-operator.rst
index b9eafe2572..8d09b7e9b9 100644
--- a/docs/notes/cloud-native/vineyard-operator.rst
+++ b/docs/notes/cloud-native/vineyard-operator.rst
@@ -412,11 +412,13 @@ available configurations.
          | memory
        - string
        - The requested memory of vineyard sidecar container.
+       - ""
      * - | vineyard.
          | cpu
        - string
        - The requested cpu of vineyard sidecar container.
+       - ""
      * - | metric.
          | enable
diff --git a/docs/notes/integration-orchestration.rst b/docs/notes/integration-orchestration.rst
index e9333780b2..9b354750ad 100644
--- a/docs/notes/integration-orchestration.rst
+++ b/docs/notes/integration-orchestration.rst
@@ -7,9 +7,12 @@ Workflow orchestration
    :hidden:
 
    integration/airflow.rst
+   integration/kedro.md
+
+Vineyard seamlessly integrates with workflow orchestration engines, e.g.,
+Apache Airflow and Kedro, enabling users to effortlessly incorporate Vineyard
+into their workflows for enhanced performance.
 
-Vineyard seamlessly integrates with the workflow orchestration engine, Apache Airflow,
-enabling users to effortlessly incorporate Vineyard into their workflows for enhanced performance.
 Moreover, the Airflow integration empowers users to work with large Python objects
 featuring complex data types (e.g., :code:`pandas.DataFrame`) at minimal cost, while
 eliminating the need for cumbersome :code:`pickle.dump/loads` operations.
@@ -24,3 +27,20 @@ eliminating the need for cumbersome :code:`pickle.dump/loads` operations.
       :classes: btn-block stretched-link
       ^^^^^^^^^^^^
       Airflow uses vineyard as the XCom backend to efficiently handle complex data in Python.
+
+The Kedro integration enables users to easily share large data objects across
+nodes in a pipeline, eliminating the high cost of (de)serialization and I/O
+incurred by alternatives like AWS S3 or Minio. It requires no intrusive changes
+to the pipeline code and provides a seamless user experience when scaling
+pipelines to Kubernetes.
+
+.. panels::
+   :header: text-center
+   :column: col-lg-12 p-2
+
+   .. link-button:: integration/kedro
+      :type: ref
+      :text: Kedro
+      :classes: btn-block stretched-link
+   ^^^^^^^^^^^^
+   Kedro uses vineyard as a :code:`DataSet` implementation for efficient intermediate data sharing.
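Both integrations build on the same vineyard client API: objects are put into a local
vineyardd instance's shared memory and fetched by ID from other processes. A minimal
sketch of that core put/get flow (the socket path is an assumption matching the local
setup used throughout these docs):

```python
import pandas as pd
import vineyard

# connect to a local vineyardd instance over its IPC socket
client = vineyard.connect('/tmp/vineyard.sock')

df = pd.DataFrame({'x': [1, 2, 3], 'y': [4.0, 5.0, 6.0]})
object_id = client.put(df)      # store the dataframe in shared memory
shared = client.get(object_id)  # any local process can resolve it by ID
```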
diff --git a/docs/notes/integration/kedro.md b/docs/notes/integration/kedro.md
new file mode 100644
index 0000000000..c0cb65e339
--- /dev/null
+++ b/docs/notes/integration/kedro.md
@@ -0,0 +1,270 @@
+Kedro Vineyard Plugin
+=====================
+
+The Kedro vineyard plugin contains components (e.g., `DataSet` and `Runner`)
+to share intermediate data among nodes in Kedro pipelines using vineyard.
+
+Kedro on Vineyard
+-----------------
+
+Vineyard works as the *DataSet* provider for kedro workers to allow transferring
+large-scale data objects between tasks that cannot be efficiently serialized and
+are not suitable for `pickle`, without involving external storage systems like
+AWS S3 (or Minio as an alternative). The Kedro vineyard plugin handles object migration
+as well when the required inputs are not located where the task is scheduled to execute.
+
+Requirements
+------------
+
+The following packages are needed to run Kedro on vineyard:
+
+- kedro >= 0.18
+- vineyard >= 0.14.5
+
+Configuration
+-------------
+
+1. Install required packages:
+
+       pip3 install vineyard-kedro
+
+2. Configure Vineyard locally:
+
+   The vineyard server can be easily launched locally with the following command:
+
+       python3 -m vineyard --socket=/tmp/vineyard.sock
+
+   See also our documentation about [Launching Vineyard][1].
+
+3. Configure the environment variable to tell the Kedro vineyard plugin how to
+   connect to the vineyardd server:
+
+       export VINEYARD_IPC_SOCKET=/tmp/vineyard.sock
+
+Usage
+-----
+
+After installing the dependencies and preparing the vineyard server, you can execute
+the Kedro workflows as usual and benefit from vineyard for intermediate data sharing.
+
+We take the [Iris example][2] as a demonstration:
+
+```bash
+$ kedro new --starter=pandas-iris
+```
+
+The nodes in this pipeline look like
+
+```python
+def split_data(
+    data: pd.DataFrame, parameters: Dict[str, Any]
+) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
+    data_train = data.sample(
+        frac=parameters["train_fraction"], random_state=parameters["random_state"]
+    )
+    data_test = data.drop(data_train.index)
+
+    X_train = data_train.drop(columns=parameters["target_column"])
+    X_test = data_test.drop(columns=parameters["target_column"])
+    y_train = data_train[parameters["target_column"]]
+    y_test = data_test[parameters["target_column"]]
+
+    return X_train, X_test, y_train, y_test
+
+
+def make_predictions(
+    X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series
+) -> pd.Series:
+    X_train_numpy = X_train.to_numpy()
+    X_test_numpy = X_test.to_numpy()
+
+    squared_distances = np.sum(
+        (X_train_numpy[:, None, :] - X_test_numpy[None, :, :]) ** 2, axis=-1
+    )
+    nearest_neighbour = squared_distances.argmin(axis=0)
+    y_pred = y_train.iloc[nearest_neighbour]
+    y_pred.index = X_test.index
+
+    return y_pred
+```
+
+You can see that the intermediate data between `split_data` and `make_predictions`
+consists of pandas dataframes and series.
+
+Try running the pipeline without vineyard:
+
+```bash
+$ cd iris
+$ kedro run
+[05/25/23 11:38:56] INFO     Kedro project iris                                     session.py:355
+[05/25/23 11:38:57] INFO     Loading data from 'example_iris_data' (CSVDataSet)...  data_catalog.py:343
+                    INFO     Loading data from 'parameters' (MemoryDataSet)...      data_catalog.py:343
+                    INFO     Running node: split: split_data([example_iris_data,parameters]) -> [X_train,X_test,y_train,y_test]  node.py:329
+                    INFO     Saving data to 'X_train' (MemoryDataSet)...            data_catalog.py:382
+                    INFO     Saving data to 'X_test' (MemoryDataSet)...             data_catalog.py:382
+                    INFO     Saving data to 'y_train' (MemoryDataSet)...            data_catalog.py:382
+                    INFO     Saving data to 'y_test' (MemoryDataSet)...             data_catalog.py:382
+                    INFO     Completed 1 out of 3 tasks                             sequential_runner.py:85
+                    INFO     Loading data from 'X_train' (MemoryDataSet)...         data_catalog.py:343
+                    INFO     Loading data from 'X_test' (MemoryDataSet)...          data_catalog.py:343
+                    INFO     Loading data from 'y_train' (MemoryDataSet)...         data_catalog.py:343
+                    INFO     Running node: make_predictions: make_predictions([X_train,X_test,y_train]) -> [y_pred]  node.py:329
+...
+```
+
+You can see that the intermediate data is shared through memory. When Kedro is deployed
+to a cluster, e.g., to [argo workflow][3], the `MemoryDataSet` is no longer applicable and
+you will need to set up an AWS S3 or Minio service and share the intermediate data as CSV
+files.
+
+```yaml
+X_train:
+  type: pandas.CSVDataSet
+  filepath: s3://testing/data/02_intermediate/X_train.csv
+  credentials: minio
+
+X_test:
+  type: pandas.CSVDataSet
+  filepath: s3://testing/data/02_intermediate/X_test.csv
+  credentials: minio
+
+y_train:
+  type: pandas.CSVDataSet
+  filepath: s3://testing/data/02_intermediate/y_train.csv
+  credentials: minio
+```
+
+Serializing pandas dataframes and series to CSV files becomes inefficient as data grows
+larger. With the kedro vineyard plugin, you can instead run the pipeline with vineyard as
+the intermediate data medium:
+
+```bash
+$ kedro run --runner vineyard.contrib.kedro.runner.SequentialRunner
+[05/25/23 11:45:34] INFO     Kedro project iris                                     session.py:355
+                    INFO     Loading data from 'example_iris_data' (CSVDataSet)...  data_catalog.py:343
+                    INFO     Loading data from 'parameters' (MemoryDataSet)...      data_catalog.py:343
+                    INFO     Running node: split: split_data([example_iris_data,parameters]) -> [X_train,X_test,y_train,y_test]  node.py:329
+                    INFO     Saving data to 'X_train' (VineyardDataSet)...          data_catalog.py:382
+                    INFO     Saving data to 'X_test' (VineyardDataSet)...           data_catalog.py:382
+                    INFO     Saving data to 'y_train' (VineyardDataSet)...          data_catalog.py:382
+                    INFO     Saving data to 'y_test' (VineyardDataSet)...           data_catalog.py:382
+                    INFO     Loading data from 'X_train' (VineyardDataSet)...       data_catalog.py:343
+                    INFO     Loading data from 'X_test' (VineyardDataSet)...        data_catalog.py:343
+                    INFO     Loading data from 'y_train' (VineyardDataSet)...       data_catalog.py:343
+                    INFO     Running node: make_predictions: make_predictions([X_train,X_test,y_train]) -> [y_pred]  node.py:329
+...
+```
+
+Without any modification to your pipeline code, the intermediate data is now shared
+with vineyard using the `VineyardDataSet` and no longer suffers from the overhead of
+(de)serialization and the I/O cost of external AWS S3 or Minio services.
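+
+If you need to read or write one of these datasets outside of a `kedro run`, e.g., for
+debugging, the dataset class can also be used directly. The following is a minimal sketch,
+assuming the `VineyardDataSet` constructor accepts the same `ds_name` field that appears
+in the generated catalog entries shown below:
+
+```python
+import pandas as pd
+
+from vineyard.contrib.kedro.io.dataset import VineyardDataSet
+
+# hypothetical direct usage: `ds_name` mirrors the catalog configuration field
+ds = VineyardDataSet(ds_name="X_train")
+ds.save(pd.DataFrame({"sepal_length": [5.1, 4.9]}))  # stores into vineyard
+X_train = ds.load()  # loads back from vineyard's shared memory
+```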
+
+Like `kedro catalog create`, the Kedro vineyard plugin provides a command-line interface to generate
+the catalog configuration for a given pipeline, which will rewrite the unspecified intermediate data
+to `VineyardDataSet`, e.g.,
+
+```bash
+$ kedro vineyard catalog create -p __default__
+```
+
+You will get
+
+```yaml
+X_test:
+  ds_name: X_test
+  type: vineyard.contrib.kedro.io.dataset.VineyardDataSet
+X_train:
+  ds_name: X_train
+  type: vineyard.contrib.kedro.io.dataset.VineyardDataSet
+y_pred:
+  ds_name: y_pred
+  type: vineyard.contrib.kedro.io.dataset.VineyardDataSet
+y_test:
+  ds_name: y_test
+  type: vineyard.contrib.kedro.io.dataset.VineyardDataSet
+y_train:
+  ds_name: y_train
+  type: vineyard.contrib.kedro.io.dataset.VineyardDataSet
+```
+
+Deploy to Kubernetes
+--------------------
+
+When the pipeline scales to Kubernetes, the interaction with the Kedro vineyard plugin is
+still simple and non-intrusive. The plugin provides tools to prepare the Docker image and
+generate the Argo workflow specification for the Kedro pipeline. Next, we'll demonstrate
+how to deploy pipelines to Kubernetes while leveraging Vineyard for efficient intermediate
+data sharing between tasks, step by step.
+
+1. Prepare the vineyard cluster (see also [Deploy on Kubernetes][5]):
+
+   ```bash
+   # export your kubeconfig path here
+   $ export KUBECONFIG=/path/to/your/kubeconfig
+
+   # install the vineyard operator
+   $ go run k8s/cmd/main.go deploy vineyard-cluster --create-namespace
+   ```
+
+2. Install the argo server:
+
+   ```bash
+   # install the argo server
+   $ kubectl create namespace argo
+   $ kubectl apply -n argo -f https://github.com/argoproj/argo-workflows/releases/download/v3.4.8/install.yaml
+   ```
+
+3. Generate the iris demo project from the official template:
+
+   ```bash
+   $ kedro new --starter=pandas-iris
+   ```
+
+4. Build the Docker image for this iris demo project:
+
+   ```bash
+   # go to the iris demo root directory
+   $ cd iris
+   $ kedro vineyard docker build
+   ```
+
+   A Docker image named `iris` will be built. The image needs to be pushed to your
+   image registry, or loaded into the kind/minikube cluster, to be available in
+   Kubernetes.
+
+   ```bash
+   $ docker images | grep iris
+   iris   latest   3c92da8241c6   About a minute ago   690MB
+   ```
+
+5. Next, generate the Argo workflow YAML file from the iris demo project:
+
+   ```bash
+   $ kedro vineyard argo generate -i iris
+
+   # check that the Argo workflow YAML file named `argo-iris.yml` is generated successfully
+   $ ls -l argo-iris.yml
+   -rw-rw-r-- 1 root root 3685 Jun 12 23:55 argo-iris.yml
+   ```
+
+6. Finally, submit the Argo workflow to Kubernetes:
+
+   ```bash
+   $ argo submit -n argo argo-iris.yml
+   ```
+
+   You can interact with the Argo workflow using the `argo` command-line tool, e.g.,
+
+   ```bash
+   $ argo list workflows -n argo
+   NAME         STATUS      AGE   DURATION   PRIORITY   MESSAGE
+   iris-sg6qf   Succeeded   18m   30s        0
+   ```
+
+We have prepared a benchmark to evaluate the performance gain brought by vineyard for
+data sharing when data scales; for more details, please refer to [this report][4].
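+
+As a rough intuition for where the gain comes from, you can measure the round-trip cost of
+moving a dataframe through vineyard versus `pickle` on a single machine. This is only a
+sketch: the socket path follows the local setup above, and the numbers will vary with your
+hardware and data size:
+
+```python
+import pickle
+import time
+
+import numpy as np
+import pandas as pd
+import vineyard
+
+client = vineyard.connect('/tmp/vineyard.sock')
+df = pd.DataFrame(np.random.rand(1_000_000, 4))
+
+start = time.time()
+_ = pickle.loads(pickle.dumps(df))  # serialize and deserialize the whole payload
+print('pickle round-trip:', time.time() - start)
+
+start = time.time()
+_ = client.get(client.put(df))  # put into and get back from shared memory
+print('vineyard round-trip:', time.time() - start)
+```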
+ +[1]: https://v6d.io/notes/getting-started.html#starting-vineyard-server +[2]: https://docs.kedro.org/en/stable/get_started/new_project.html +[3]: https://docs.kedro.org/en/stable/deployment/argo.html#how-to-run-your-kedro-pipeline-using-argo-workflows +[4]: https://v6d.io/tutorials/data-processing/accelerate-data-sharing-in-kedro.html +[5]: https://v6d.io/notes/cloud-native/deploy-kubernetes.html diff --git a/docs/notes/kedro-integration-performance.rst b/docs/notes/kedro-integration-performance.rst deleted file mode 100644 index a6e4d08e66..0000000000 --- a/docs/notes/kedro-integration-performance.rst +++ /dev/null @@ -1,356 +0,0 @@ -.. _kedro-integration-performance: - -Kedro Integration Performance Report -==================================== - -This is a performance report of kedro integration, here we will compare three -different data catalog of kedro benchmark project: vineyard(v0.15.3), AWS S3 and MinIO S3(the latest one). - -Create a kubernetes cluster ---------------------------- - -If you don't have a kubernetes on hand, you can use the `kind v0.20.0 `_ -to create a kubernetes cluster with 4 nodes(including a master node) as follows: - -.. code:: bash - - $ cat < and Base64 encoded - secretKey: and Base64 encoded - EOF - -4. Set the configurations of MinIO clusters. - -.. code:: bash - - $ cat << EOF > minio-default.yaml - data: - artifactRepository: | - s3: - bucket: minio-s3-benchmark-bucket - endpoint: {{MINIO}} - insecure: true - accessKeySecret: - name: my-minio-cred - key: accessKey - secretKeySecret: - name: my-minio-cred - key: secretKey - useSDKCreds: false - EOF - - # Get the endpoint of minio service - $ minioUrl=$(kubectl get endpoints -n minio-dev -o jsonpath='{.items[*].subsets[*].addresses[*].ip}'):9000 - - # Replace with actual minio url - $ sed -i "s/{{MINIO}}/${minioUrl}/g" ./minio-default.yaml - - # Apply to configmap in the argo namespace - $ kubectl -n argo patch configmap/workflow-controller-configmap --patch "$(cat ./minio-default.yaml)" - -5. Forward minio-artifacts service. - -.. code:: bash - - $ kubectl port-forward service/minio -n minio-dev 9000:9000 - -6. Download the minio client and install it. - -.. code:: bash - - $ wget https://dl.min.io/client/mc/release/linux-amd64/mc - $ chmod +x mc - $ sudo mv mc /usr/local/bin - -7. Configure the minio client. - -.. code:: bash - - $ mc alias set minio http://localhost:9000 - Enter Access Key: - Enter Secret Key: - -1. Create a bucket named `minio-s3-benchmark-bucket` on the MinIO cluster. - -.. code:: bash - - $ mc mb minio/minio-s3-benchmark-bucket - Bucket created successfully `minio/minio-s3-benchmark-bucket`. - - -Prepare the kedro benchmark project ------------------------------------ - -1. Go to the kedro benchmark project under vineyard root directory. - -.. code:: bash - - $ cd python/vineyard/contrib/kedro/benchmark - -2. Fulfill the credentials configurations of AWS S3. - -.. code:: bash - - $ cd aws-s3 - $ cat conf/local/credentials.yml - benchmark_aws_s3: - client_kwargs: - aws_access_key_id: Your AWS Access Key ID - aws_secret_access_key: Your AWS Secret Access Key - region_name: Your AWS Region Name - -2. Build the docker images of the kedro project for vineyard benchmark. - -.. 
code:: bash - - $ pushd vineyard - # build the docker images - $ make - # check the docker images - $ docker images | grep vineyard-benchmark - vineyard-benchmark-with-500m-data latest 982c6a376597 About a minute ago 1.66GB - vineyard-benchmark-with-100m-data latest e58ca1cada98 About a minute ago 1.25GB - vineyard-benchmark-with-10m-data latest f7c618b48913 About a minute ago 1.16GB - vineyard-benchmark-with-1m-data latest 8f9e74ff5116 About a minute ago 1.15GB - $ popd - -3. Build the docker images of the kedro project for aws s3 benchmark. - -.. code:: bash - - $ pushd aws-s3 - # build the docker images - $ make - # check the docker images - $ docker images | grep aws-s3-benchmark - aws-s3-benchmark-with-500m-data latest 877d8fc1ef78 3 minutes ago 1.42GB - aws-s3-benchmark-with-100m-data latest b8e15edda5cd 3 minutes ago 1.01GB - aws-s3-benchmark-with-10m-data latest c1a58ddb2888 3 minutes ago 915MB - aws-s3-benchmark-with-1m-data latest 9f27ac5ce9dd 3 minutes ago 907MB - $ popd - -4. Build the docker images of the kedro project for minio s3 benchmark. - -.. code:: bash - - $ pushd minio-s3 - - # build the docker images - $ make - - # check the docker images - $ docker images | grep minio-s3-benchmark - minio-s3-benchmark-with-500m-data latest 1c75300390cf 8 seconds ago 1.41GB - minio-s3-benchmark-with-100m-data latest f4aa093ddf36 11 seconds ago 1.01GB - minio-s3-benchmark-with-10m-data latest 8b068600e368 12 seconds ago 913MB - minio-s3-benchmark-with-1m-data latest b3eaf0a5898c 13 seconds ago 904MB - - $ popd - -5. Load the above images to the kind cluster. - -.. code:: bash - - # load the vineyard benchmark images - $ kind load docker-image vineyard-benchmark-with-1m-data && \ - kind load docker-image vineyard-benchmark-with-10m-data && \ - kind load docker-image vineyard-benchmark-with-100m-data && \ - kind load docker-image vineyard-benchmark-with-500m-data - # load the aws s3 benchmark images - $ kind load docker-image aws-s3-benchmark-with-1m-data && \ - kind load docker-image aws-s3-benchmark-with-10m-data && \ - kind load docker-image aws-s3-benchmark-with-100m-data && \ - kind load docker-image aws-s3-benchmark-with-500m-data - # load the minio s3 benchmark images - $ kind load docker-image minio-s3-benchmark-with-1m-data && \ - kind load docker-image minio-s3-benchmark-with-10m-data && \ - kind load docker-image minio-s3-benchmark-with-100m-data && \ - kind load docker-image minio-s3-benchmark-with-500m-data - - -Submit the benchmark workflow ------------------------------ - -1. Submit the vineyard benchmark workflow. - -.. code:: bash - - $ pushd vineyard - # 1M data - $ sed -i "s/vineyard-benchmark/vineyard-benchmark-with-1m-data/g" argo-vineyard-benchmark.yml && \ - argo submit -n vineyard-system --watch argo-vineyard-benchmark.yml - # 10M data - $ sed -i "s/vineyard-benchmark-with-1m-data/vineyard-benchmark-with-10m-data/g" argo-vineyard-benchmark.yml && \ - argo submit -n vineyard-system --watch argo-vineyard-benchmark.yml - # 100M data - $ sed -i "s/vineyard-benchmark-with-10m-data/vineyard-benchmark-with-100m-data/g" argo-vineyard-benchmark.yml && \ - argo submit -n vineyard-system --watch argo-vineyard-benchmark.yml - # 500M data - $ sed -i "s/vineyard-benchmark-with-100m-data/vineyard-benchmark-with-500m-data/g" argo-vineyard-benchmark.yml && \ - argo submit -n vineyard-system --watch argo-vineyard-benchmark.yml - $ popd - -2. Submit the aws s3 benchmark workflow. - -.. 
code:: bash - - $ pushd aws-s3 - # 1M data - $ sed -i "s/aws-s3-benchmark/aws-s3-benchmark-with-1m-data/g" argo-aws-s3-benchmark.yml && \ - argo submit -n s3 --watch argo-aws-s3-benchmark.yml - # 10M data - $ sed -i "s/aws-s3-benchmark-with-1m-data/aws-s3-benchmark-with-10m-data/g" argo-aws-s3-benchmark.yml && \ - argo submit -n s3 --watch argo-aws-s3-benchmark.yml - # 100M data - $ sed -i "s/aws-s3-benchmark-with-10m-data/aws-s3-benchmark-with-100m-data/g" argo-aws-s3-benchmark.yml && \ - argo submit -n s3 --watch argo-aws-s3-benchmark.yml - # 500M data - $ sed -i "s/aws-s3-benchmark-with-100m-data/aws-s3-benchmark-with-500m-data/g" argo-aws-s3-benchmark.yml && \ - argo submit -n s3 --watch argo-aws-s3-benchmark.yml - $ popd - -3. Submit the minio s3 benchmark workflow. - -.. code:: bash - - $ pushd minio-s3 - # 1M data - $ sed -i "s/minio-s3-benchmark/minio-s3-benchmark-with-1m-data/g" argo-minio-s3-benchmark.yml && \ - argo submit -n minio-dev --watch argo-minio-s3-benchmark.yml - # 10M data - $ sed -i "s/minio-s3-benchmark-with-1m-data/minio-s3-benchmark-with-10m-data/g" argo-minio-s3-benchmark.yml && \ - argo submit -n minio-dev --watch argo-minio-s3-benchmark.yml - # 100M data - $ sed -i "s/minio-s3-benchmark-with-10m-data/minio-s3-benchmark-with-100m-data/g" argo-minio-s3-benchmark.yml && \ - argo submit -n minio-dev --watch argo-minio-s3-benchmark.yml - # 500M data - $ sed -i "s/minio-s3-benchmark-with-100m-data/minio-s3-benchmark-with-500m-data/g" argo-minio-s3-benchmark.yml && \ - argo submit -n minio-dev --watch argo-minio-s3-benchmark.yml - -4. Record the time of each workflow. - - -Summary -------- - -After running the benchmark, we can get the following results: - -The data size is the size of input file, and the time is -the completion time of the argo workflow. - -| Data Size | Vineyard | AWS S3 | MinIO S3 | -| --------- | -------- | -------- | -------- | -| 1M | 30s | 50s | 30s | -| 10M | 30s | 63s | 30s | -| 100M | 60s | 144s | 64s | -| 500M | 91s | 457s | 177s | diff --git a/docs/tutorials/data-processing.rst b/docs/tutorials/data-processing.rst index 68356bb287..700226e66c 100644 --- a/docs/tutorials/data-processing.rst +++ b/docs/tutorials/data-processing.rst @@ -10,6 +10,7 @@ Data processing ./data-processing/using-objects-python.rst ./data-processing/python-sharedmemory.rst ./data-processing/distributed-learning.rst + ./data-processing/accelerate-data-sharing-in-kedro.rst In these comprehensive case studies, we demonstrate how to seamlessly integrate vineyard's capabilities with existing data-intensive tasks. By incorporating vineyard into complex @@ -45,3 +46,14 @@ improvements in both performance and ease of use. ^^^^^^^^^^^^ Discover how vineyard enhances distributed machine learning training workflows by seamlessly integrating with various computing engines for improved efficiency and elegance. + + --- + + .. link-button:: ./data-processing/accelerate-data-sharing-in-kedro + :type: ref + :text: Accelerate Data Sharing in Kedro + :classes: btn-block stretched-link + ^^^^^^^^^^^^ + Vineyard serves as the :code:`DataSet` backend for Kedro pipelines, enabling + efficient data sharing between tasks without intrusive code modification, even + when the pipeline is deployed to Kubernetes. 
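Besides the `kedro run --runner ...` command line shown in these docs, the vineyard runner
can also be invoked programmatically. The following is a minimal sketch, assuming a standard
Kedro 0.18 project layout and that the vineyard runner is constructible with its defaults:

```python
from pathlib import Path

from kedro.framework.session import KedroSession
from kedro.framework.startup import bootstrap_project

from vineyard.contrib.kedro.runner import SequentialRunner

# bootstrap the Kedro project rooted at the current directory
bootstrap_project(Path.cwd())
with KedroSession.create(project_path=Path.cwd()) as session:
    # run the default pipeline with vineyard-backed intermediate datasets
    session.run(runner=SequentialRunner())
```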
diff --git a/docs/tutorials/data-processing/accelerate-data-sharing-in-kedro.rst b/docs/tutorials/data-processing/accelerate-data-sharing-in-kedro.rst
new file mode 100644
index 0000000000..1137114da4
--- /dev/null
+++ b/docs/tutorials/data-processing/accelerate-data-sharing-in-kedro.rst
@@ -0,0 +1,310 @@
+.. _accelerate-data-sharing-in-kedro:
+
+Accelerate Data Sharing in Kedro
+================================
+
+This tutorial shows how Vineyard accelerates the sharing of intermediate data between
+tasks in Kedro pipelines using our `vineyard-kedro <https://pypi.org/project/vineyard-kedro/>`_
+plugin, when data scales and the pipeline is deployed on Kubernetes.
+
+Prepare the Kubernetes cluster
+------------------------------
+
+To deploy Kedro pipelines on Kubernetes, you must have a Kubernetes cluster.
+
+.. tip::
+
+    If you already have a K8s cluster, just skip this section and continue
+    with the deployment.
+
+We recommend `kind v0.20.0 <https://kind.sigs.k8s.io/>`_ to create a multi-node
+Kubernetes cluster on your local machine as follows:
+
+.. code:: bash
+
+    $ cat <<EOF | kind create cluster --config=-
+    kind: Cluster
+    apiVersion: kind.x-k8s.io/v1alpha4
+    nodes:
+    - role: control-plane
+    - role: worker
+    - role: worker
+    - role: worker
+    EOF
+
+- If you are working with Minio, you first need to expose the services
+  and then create the bucket:
+
+  - Forward minio-artifacts service:
+
+    .. code:: bash
+
+        $ kubectl port-forward service/minio -n minio-dev 9000:9000
+
+  - Install the minio client:
+
+    .. code:: bash
+
+        $ wget https://dl.min.io/client/mc/release/linux-amd64/mc
+        $ chmod +x mc
+        $ sudo mv mc /usr/local/bin
+
+  - Configure the minio client:
+
+    .. code:: bash
+
+        $ mc alias set minio http://localhost:9000
+        Enter Access Key:
+        Enter Secret Key:
+
+  - Finally, create the bucket :code:`minio-s3-benchmark-bucket`:
+
+    .. code:: bash
+
+        $ mc mb minio/minio-s3-benchmark-bucket
+        Bucket created successfully `minio/minio-s3-benchmark-bucket`.
+
+Prepare the Docker images
+-------------------------
+
+1. Vineyard has delivered `a benchmark project
+   <https://github.com/v6d-io/v6d/tree/main/python/vineyard/contrib/kedro/benchmark>`_
+   to test Kedro pipelines on Vineyard and S3:
+
+   .. code:: bash
+
+       $ cd python/vineyard/contrib/kedro/benchmark
+
+2. Configure the credentials for AWS S3:
+
+   .. code:: bash
+
+       $ cd aws-s3
+       $ cat conf/local/credentials.yml
+       benchmark_aws_s3:
+         client_kwargs:
+           aws_access_key_id: Your AWS/Minio Access Key ID
+           aws_secret_access_key: Your AWS/Minio Secret Access Key
+           region_name: Your AWS Region Name
+
+3. To deploy pipelines to Kubernetes, you first need to build the Docker images for the
+   benchmark project.
+
+   To show how vineyard accelerates data sharing as the dataset scales, Docker images
+   for different data sizes will be generated:
+
+   - For running Kedro on vineyard:
+
+     .. code:: bash
+
+         $ make -C vineyard/
+
+     You will see that Docker images for different data sizes are generated:
+
+     .. code:: bash
+
+         $ docker images | grep vineyard-benchmark
+         vineyard-benchmark-with-500m-data   latest   982c6a376597   About a minute ago   1.66GB
+         vineyard-benchmark-with-100m-data   latest   e58ca1cada98   About a minute ago   1.25GB
+         vineyard-benchmark-with-10m-data    latest   f7c618b48913   About a minute ago   1.16GB
+         vineyard-benchmark-with-1m-data     latest   8f9e74ff5116   About a minute ago   1.15GB
+
+   - Similarly, for running Kedro on AWS S3 or Minio:
+
+     .. code:: bash
+
+         # for AWS S3
+         $ make -C aws-s3/
+         # for Minio
+         $ make -C minio-s3/
+
+4. To make those images available to your Kubernetes cluster, they need to be
+   pushed to your registry (or loaded into the kind cluster if you set up your
+   Kubernetes cluster using kind):
+
+   - Push to registry:
+
+     .. code:: bash
+
+         # for vineyard
+         $ for sz in 1m 10m 100m 500m; do \
+             docker tag vineyard-benchmark-with-${sz}-data <registry>/vineyard-benchmark-with-${sz}-data; \
+             docker push <registry>/vineyard-benchmark-with-${sz}-data; \
+           done
+
+         # for AWS S3
+         $ for sz in 1m 10m 100m 500m; do \
+             docker tag aws-s3-benchmark-with-${sz}-data <registry>/aws-s3-benchmark-with-${sz}-data; \
+             docker push <registry>/aws-s3-benchmark-with-${sz}-data; \
+           done
+
+         # for Minio
+         $ for sz in 1m 10m 100m 500m; do \
+             docker tag minio-s3-benchmark-with-${sz}-data <registry>/minio-s3-benchmark-with-${sz}-data; \
+             docker push <registry>/minio-s3-benchmark-with-${sz}-data; \
+           done
+
+   - Load into the kind cluster:
+
+     .. code:: bash
+
+         # for vineyard
+         $ for sz in 1m 10m 100m 500m; do \
+             kind load docker-image vineyard-benchmark-with-${sz}-data; \
+           done
+
+         # for AWS S3
+         $ for sz in 1m 10m 100m 500m; do \
+             kind load docker-image aws-s3-benchmark-with-${sz}-data; \
+           done
+
+         # for Minio
+         $ for sz in 1m 10m 100m 500m; do \
+             kind load docker-image minio-s3-benchmark-with-${sz}-data; \
+           done
+
+Deploy the Kedro Pipelines
+--------------------------
+
+1. Deploy the Kedro pipeline with vineyard for intermediate data sharing:
+
+   .. code:: bash
+
+       $ pushd vineyard
+       $ kubectl create namespace vineyard
+
+       # generate a per-size spec from the pristine template, so that
+       # replacements do not accumulate across iterations
+       $ for sz in 1m 10m 100m 500m; do \
+           sed "s/vineyard-benchmark/vineyard-benchmark-with-${sz}-data/g" argo-vineyard-benchmark.yml > argo-vineyard-benchmark-${sz}.yml && \
+           argo submit -n vineyard --watch argo-vineyard-benchmark-${sz}.yml; \
+         done
+
+       $ popd
+
+2. Similarly, using AWS S3 or Minio for intermediate data sharing:
+
+   - Using AWS S3:
+
+     .. code:: bash
+
+         $ pushd aws-s3
+         $ kubectl create namespace aws-s3
+
+         $ for sz in 1m 10m 100m 500m; do \
+             sed "s/aws-s3-benchmark/aws-s3-benchmark-with-${sz}-data/g" argo-aws-s3-benchmark.yml > argo-aws-s3-benchmark-${sz}.yml && \
+             argo submit -n aws-s3 --watch argo-aws-s3-benchmark-${sz}.yml; \
+           done
+
+         $ popd
+
+   - Using Minio:
+
+     .. code:: bash
+
+         $ pushd minio-s3
+         $ kubectl create namespace minio-s3
+
+         $ for sz in 1m 10m 100m 500m; do \
+             sed "s/minio-s3-benchmark/minio-s3-benchmark-with-${sz}-data/g" argo-minio-s3-benchmark.yml > argo-minio-s3-benchmark-${sz}.yml && \
+             argo submit -n minio-s3 --watch argo-minio-s3-benchmark-${sz}.yml; \
+           done
+
+         $ popd
+
+Performance
+-----------
+
+After running the benchmark above on Kubernetes, we recorded the following end-to-end
+execution times under different settings:
+
+========= ========= ========= =========
+Data size Vineyard  AWS S3    Minio S3
+========= ========= ========= =========
+1M        30s       50s       30s
+10M       30s       63s       30s
+100M      60s       144s      64s
+500M      91s       457s      177s
+========= ========= ========= =========
+
+We make the following observations from the above comparison:
+
+- Vineyard significantly accelerates data sharing between tasks in Kedro pipelines, without
+  the need for any intrusive changes to the original Kedro pipelines;
+- As data scales, the benefit of Vineyard becomes more pronounced, since the cost of sharing
+  intermediate data dominates the end-to-end execution time;
+- Even compared with a local Minio deployment, Vineyard still outperforms it by a large margin,
+  thanks to its ability to avoid (de)serialization, file I/O and excessive memory copies.
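+
+The zero-copy path behind the last observation can be seen directly from the Python client.
+The following is a minimal sketch (the IPC socket path is an assumption matching a local
+deployment): the payload is written once into vineyard's shared memory and fetched back
+without being serialized:
+
+.. code:: python
+
+    import numpy as np
+    import vineyard
+
+    client = vineyard.connect('/tmp/vineyard.sock')
+
+    arr = np.random.rand(1_000_000)  # the payload to share
+    object_id = client.put(arr)      # written into vineyard's shared memory
+    shared = client.get(object_id)   # resolved from the same shared pages
+
+    assert shared.shape == arr.shape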
diff --git a/python/core.cc b/python/core.cc
index 09e4113d50..d7ed9932bc 100644
--- a/python/core.cc
+++ b/python/core.cc
@@ -657,10 +657,45 @@ void bind_blobs(py::module& mod) {
   py::class_<RemoteBlobWriter, std::shared_ptr<RemoteBlobWriter>>(
       mod, "RemoteBlobBuilder", py::buffer_protocol(), doc::RemoteBlobBuilder)
       .def(py::init<>(
-               [](const size_t size) -> std::unique_ptr<RemoteBlobWriter> {
-                 return std::make_unique<RemoteBlobWriter>(size);
+               [](const size_t size) -> std::shared_ptr<RemoteBlobWriter> {
+                 return std::make_shared<RemoteBlobWriter>(size);
                }),
            py::arg("size"))
+      .def_static(
+          "make",
+          [](const size_t size) -> std::shared_ptr<RemoteBlobWriter> {
+            return RemoteBlobWriter::Make(size);
+          },
+          "size"_a)
+      .def_static(
+          "wrap",
+          [](uintptr_t const data,
+             const size_t size) -> std::shared_ptr<RemoteBlobWriter> {
+            return RemoteBlobWriter::Wrap(
+                reinterpret_cast<const uint8_t*>(data), size);
+          },
+          "data"_a, "size"_a)
+      .def_static(
+          "wrap",
+          [](py::buffer const& buffer,
+             const size_t nbytes) -> std::shared_ptr<RemoteBlobWriter> {
+            // request() yields the underlying data pointer; buffer.ptr()
+            // would be the PyObject* itself.
+            py::buffer_info info = buffer.request();
+            return RemoteBlobWriter::Wrap(
+                reinterpret_cast<const uint8_t*>(info.ptr), nbytes);
+          },
+          "data"_a, "size"_a)
+      .def_static(
+          "wrap",
+          [](py::bytes const& bs) -> std::shared_ptr<RemoteBlobWriter> {
+            char* buffer = nullptr;
+            ssize_t length = 0;
+            if (PYBIND11_BYTES_AS_STRING_AND_SIZE(bs.ptr(), &buffer, &length)) {
+              py::pybind11_fail("Unable to extract bytes contents!");
+            }
+            return RemoteBlobWriter::Wrap(
+                reinterpret_cast<const uint8_t*>(buffer), length);
+          },
+          "data"_a)
       .def_property_readonly("size", &RemoteBlobWriter::size,
                              doc::RemoteBlobBuilder_size)
       .def("__len__", &RemoteBlobWriter::size, doc::RemoteBlobBuilder_size)
diff --git a/python/pybind11_docs.cc b/python/pybind11_docs.cc
index 3b15cd9255..99d90d0912 100644
--- a/python/pybind11_docs.cc
+++ b/python/pybind11_docs.cc
@@ -428,6 +428,41 @@ See Also:
     IPCClient.create_empty_blob
 )doc";
 
+const char* RemoteBlobBuilderMake = R"doc(
+Create a new remote blob builder with the given size.
+
+See Also:
+    RemoteBlobBuilder
+    RemoteBlobBuilder.wrap
+)doc";
+
+const char* RemoteBlobBuilderWrap = R"doc(
+Wrap a given pointer and its size as a remote blob builder to send to
+the remote vineyard instance.
+
+See Also:
+    RemoteBlobBuilder
+    RemoteBlobBuilder.make
+)doc";
+
+const char* RemoteBlobBuilderWrapBuffer = R"doc(
+Wrap an existing memory buffer as a remote blob builder to send to the remote
+vineyard instance.
+
+See Also:
+    RemoteBlobBuilder
+    RemoteBlobBuilder.make
+)doc";
+
+const char* RemoteBlobBuilderWrapBytes = R"doc(
+Wrap an existing bytes buffer as a remote blob builder to send to the remote
+vineyard instance.
+
+See Also:
+    RemoteBlobBuilder
+    RemoteBlobBuilder.make
+)doc";
+
 const char* RemoteBlobBuilder_size = R"doc(
 Size of this blob builder.
 )doc";
diff --git a/python/vineyard/_C.pyi b/python/vineyard/_C.pyi
index 500d876e84..552c30be97 100644
--- a/python/vineyard/_C.pyi
+++ b/python/vineyard/_C.pyi
@@ -376,7 +376,9 @@ def connect(
     *, username: str = "", password: str = ""
 ) -> Union[IPCClient, RPCClient]: ...
 @overload
-def connect(socket: str = "", username: str = "", password: str = "") -> IPCClient: ...
+def connect(
+    socket: str = "", *, username: str = "", password: str = ""
+) -> IPCClient: ...
 @overload
 def connect(
     host: str,
diff --git a/python/vineyard/contrib/kedro/README.md b/python/vineyard/contrib/kedro/README.md
index fb6fe45b8a..72e672eb32 100644
--- a/python/vineyard/contrib/kedro/README.md
+++ b/python/vineyard/contrib/kedro/README.md
@@ -6,17 +6,19 @@ intermediate data among nodes in Kedro pipelines using vineyard.
 Vineyard works as the *DataSet* provider for kedro workers to allow transferring
 large-scale data objects between tasks that cannot be efficiently serialized and
-is not suitable for :code:`pickle`, without involving external storage systems like
+are not suitable for `pickle`, without involving external storage systems like
 AWS S3 (or Minio as an alternative). The Kedro vineyard plugin handles object migration
 as well when the required inputs are not located where the task is scheduled to execute.
 
 Table of Contents
 -----------------
 
-- [Requirements](#requirements)
-- [Configuration](#configuration)
-- [Usage](#usage)
-- [Argo Workflow Integration](#argo-workflow-integration)
+- [Kedro Vineyard Plugin](#kedro-vineyard-plugin)
+  - [Table of Contents](#table-of-contents)
+  - [Requirements](#requirements)
+  - [Configuration](#configuration)
+  - [Usage](#usage)
+  - [Deploy to Kubernetes](#deploy-to-kubernetes)
 
 Requirements
 ------------
@@ -39,7 +41,7 @@ Configuration
 
        python3 -m vineyard --socket=/tmp/vineyard.sock
 
-   See also our documentation about [launching vineyard][1].
+   See also our documentation about [Launching Vineyard][1].
 
 3. Configure the environment variable to tell Kedro vineyard plugin how to connect to the
    vineyardd server:
@@ -138,8 +140,8 @@ y_train:
   credentials: minio
 ```
 
-Which might be inefficient for pickling pandas dataframes and series when data become larger. With the kedro
-vineyard plugin, you can run the pipeline with vineyard as the intermediate data medium by simply
+Serializing pandas dataframes and series to CSV files becomes inefficient as data grows
+larger. With the kedro vineyard plugin, you can instead run the pipeline with vineyard as
+the intermediate data medium:
 
 ```bash
 $ kedro run --runner vineyard.contrib.kedro.runner.SequentialRunner
@@ -158,13 +160,13 @@ $ kedro run --runner vineyard.contrib.kedro.runner.SequentialRunner
 ...
 ```
 
-without any modification to your pipeline code. You can see that the intermediate data is shared
+Without any modification to your pipeline code, you can see that the intermediate data is shared
 with vineyard using the `VineyardDataSet` and no longer suffers from the overhead of (de)serialization
 and the I/O cost between external AWS S3 or Minio services.
 
-Besides the runner, like `kedro catalog create`, the Kedro vineyard plugin provides a command-line
-interface to generate the catalog configuration for given pipeline, which will rewrite the unspecified
-intermediate data to `VineyardDataSet`, e.g.,
+Like `kedro catalog create`, the Kedro vineyard plugin provides a command-line interface to generate
+the catalog configuration for a given pipeline, which will rewrite the unspecified intermediate data
+to `VineyardDataSet`, e.g.,
 
 ```bash
 $ kedro vineyard catalog create -p __default__
@@ -190,86 +192,86 @@ y_train:
   type: vineyard.contrib.kedro.io.dataset.VineyardDataSet
 ```
 
-Argo Workflow Integration
--------------------------
+Deploy to Kubernetes
+--------------------
 
-The Kedro vineyard plugin also provides a tool to generate the Argo workflow YAML file. Next, we will
-show how to generate the Argo workflow YAML file and run the Argo workflow on Kubernetes.
+When the pipeline scales to Kubernetes, the interaction with the Kedro vineyard plugin is
+still simple and non-intrusive. The plugin provides tools to prepare the Docker image and
+generate the Argo workflow specification for the Kedro pipeline. Next, we'll demonstrate
+how to deploy pipelines to Kubernetes while leveraging Vineyard for efficient intermediate
+data sharing between tasks, step by step.
 
-Install vineyard operator as follows.
+1. Prepare the vineyard cluster (see also [Deploy on Kubernetes][5]):
 
-```bash
-# export your kubeconfig path here
-$ export KUBECONFIG=/path/to/your/kubeconfig
+   ```bash
+   # export your kubeconfig path here
+   $ export KUBECONFIG=/path/to/your/kubeconfig
 
-# install the vineyard operator
-$ go run k8s/cmd/main.go deploy vineyard-cluster --create-namespace
-```
+   # install the vineyard operator
+   $ go run k8s/cmd/main.go deploy vineyard-cluster --create-namespace
+   ```
 
-Install the argo server as follows.
+2. Install the argo server:
 
-```bash
-# install the argo server
-$ kubectl create namespace argo
-$ kubectl apply -n argo -f https://github.com/argoproj/argo-workflows/releases/download/v3.4.8/install.yaml
-```
+   ```bash
+   # install the argo server
+   $ kubectl create namespace argo
+   $ kubectl apply -n argo -f https://github.com/argoproj/argo-workflows/releases/download/v3.4.8/install.yaml
+   ```
 
-Generate the iris demo and
+3. Generate the iris demo project from the official template:
 
-```bash
-$ kedro new --starter=pandas-iris
-```
+   ```bash
+   $ kedro new --starter=pandas-iris
+   ```
 
-Build the docker image for the iris demo and input `N` if you
-encounter the usage analytics prompt.
+4. Build the Docker image for this iris demo project:
 
-```bash
-# go to the iris demo root directory
-$ cd iris
-$ kedro vineyard docker build
-As an open-source project, we collect usage analytics.
-We cannot see nor store information contained in a Kedro project.
-You can find out more by reading our privacy notice:
-https://github.com/kedro-org/kedro-plugins/tree/main/kedro-telemetry#privacy-notice
-Do you opt into usage analytics? [y/N]: N
-```
+   ```bash
+   # go to the iris demo root directory
+   $ cd iris
+   $ kedro vineyard docker build
+   ```
 
-You can see the docker image named `docker.io/library/iris` is built successfully, and then push the docker
-image to your docker registry or load the docker image to your Kubernetes cluster.
+   A Docker image named `iris` will be built. The image needs to be pushed to your
+   image registry, or loaded into the kind/minikube cluster, to be available in
+   Kubernetes.
 
-```bash
-$ docker images | grep iris
-iris latest 3c92da8241c6 About a minute ago 690MB
-```
+   ```bash
+   $ docker images | grep iris
+   iris   latest   3c92da8241c6   About a minute ago   690MB
+   ```
 
-Generate the Argo workflow YAML file.
+5. Next, generate the Argo workflow YAML file from the iris demo project:
 
-```bash
-$ kedro vineyard argo generate -i iris
-# check the generated Argo workflow YAML file, you can see the Argo workflow YAML file named `iris.yaml`
-# is generated successfully.
-$ ls -l argo-iris.yml
--rw-rw-r-- 1 gsbot gsbot 3685 Jun 12 23:55 argo-iris.yml
-```
+   ```bash
+   $ kedro vineyard argo generate -i iris
 
-Submit the Argo workflow YAML file to Kubernetes.
+   # check that the Argo workflow YAML file named `argo-iris.yml` is generated successfully
+   $ ls -l argo-iris.yml
+   -rw-rw-r-- 1 root root 3685 Jun 12 23:55 argo-iris.yml
+   ```
 
-```bash
-$ argo submit -n argo argo-iris.yml
-```
+6. Finally, submit the Argo workflow to Kubernetes:
 
-Check the Argo workflow status.
+   ```bash
+   $ argo submit -n argo argo-iris.yml
+   ```
+   You can interact with the Argo workflow using the `argo` command-line tool, e.g.,
 
-```bash
-$ argo list workflows -n argo
-NAME         STATUS      AGE   DURATION   PRIORITY   MESSAGE
-iris-sg6qf   Succeeded   18m   30s        0
-```
+   ```bash
+   $ argo list workflows -n argo
+   NAME         STATUS      AGE   DURATION   PRIORITY   MESSAGE
+   iris-sg6qf   Succeeded   18m   30s        0
+   ```
 
-For the performance regarding the vineyard as the intermediate data catalog, please refer to [this report][4].
+We have prepared a benchmark to evaluate the performance gain brought by vineyard for
+data sharing when data scales; for more details, please refer to [this report][4].
 
 [1]: https://v6d.io/notes/getting-started.html#starting-vineyard-server
 [2]: https://docs.kedro.org/en/stable/get_started/new_project.html
 [3]: https://docs.kedro.org/en/stable/deployment/argo.html#how-to-run-your-kedro-pipeline-using-argo-workflows
-[4]: https://v6d.io/notes/kedro-integration-performance.html
+[4]: https://v6d.io/tutorials/data-processing/accelerate-data-sharing-in-kedro.html
+[5]: https://v6d.io/notes/cloud-native/deploy-kubernetes.html
diff --git a/python/vineyard/core/tests/test_rpc_client.py b/python/vineyard/core/tests/test_rpc_client.py
index d844495d58..dd0996dc44 100644
--- a/python/vineyard/core/tests/test_rpc_client.py
+++ b/python/vineyard/core/tests/test_rpc_client.py
@@ -23,7 +23,8 @@
 import pytest
 
 import vineyard
-from vineyard._C import RemoteBlobBuilder
+from vineyard import RemoteBlobBuilder
+from vineyard import RPCClient
 from vineyard.core import default_builder_context
 from vineyard.core import default_resolver_context
 from vineyard.data import register_builtin_types
@@ -52,7 +53,7 @@ def test_remote_blob_create(vineyard_client, vineyard_endpoint):
 
 
 def test_remote_blob_get(vineyard_client, vineyard_endpoint):
-    vineyard_rpc_client = vineyard.connect(*vineyard_endpoint.split(':'))
+    vineyard_rpc_client: RPCClient = vineyard.connect(*vineyard_endpoint.split(':'))
 
     buffer_writer = vineyard_client.create_blob(len(payload))
     buffer_writer.copy(0, payload)
@@ -88,6 +89,7 @@ def test_remote_blob_create_and_get(vineyard_endpoint):
 def test_remote_blob_create_and_get_large_object(vineyard_endpoint):
     vineyard_rpc_client = vineyard.connect(*vineyard_endpoint.split(':'))
 
+    # allocate & copy
     buffer_writer = RemoteBlobBuilder(len(large_payload))
     buffer_writer.copy(0, large_payload)
     blob_id = vineyard_rpc_client.create_remote_blob(buffer_writer)
@@ -100,6 +102,18 @@ def test_remote_blob_create_and_get_large_object(vineyard_endpoint):
     assert remote_blob.size == len(large_payload)
     assert memoryview(remote_blob) == memoryview(large_payload)
 
+    # wrap
+    buffer_writer = RemoteBlobBuilder.wrap(large_payload)
+    blob_id = vineyard_rpc_client.create_remote_blob(buffer_writer)
+
+    # get as remote blob
+    remote_blob = vineyard_rpc_client.get_remote_blob(blob_id)
+
+    # check remote blob
+    assert remote_blob.id == blob_id
+    assert remote_blob.size == len(large_payload)
+    assert memoryview(remote_blob) == memoryview(large_payload)
+
 
 def test_remote_blob_error(vineyard_endpoint):
     vineyard_rpc_client = vineyard.connect(*vineyard_endpoint.split(':'))
diff --git a/src/client/ds/remote_blob.cc b/src/client/ds/remote_blob.cc
index 7c55103b15..0f3f10a20b 100644
--- a/src/client/ds/remote_blob.cc
+++ b/src/client/ds/remote_blob.cc
@@ -125,6 +125,17 @@ RemoteBlobWriter::RemoteBlobWriter(std::shared_ptr<Buffer> const& buffer)
 
 RemoteBlobWriter::~RemoteBlobWriter() {}
 
+std::shared_ptr<RemoteBlobWriter> RemoteBlobWriter::Make(const size_t size) {
+  return std::shared_ptr<RemoteBlobWriter>(new RemoteBlobWriter(size));
+}
+
+std::shared_ptr<RemoteBlobWriter> RemoteBlobWriter::Wrap(const uint8_t* data,
+                                                         const size_t size) {
+  auto buffer = std::dynamic_pointer_cast<Buffer>(
+      MutableBuffer::Wrap(const_cast<uint8_t*>(data), size));
+  return std::shared_ptr<RemoteBlobWriter>(new RemoteBlobWriter(buffer));
+}
+
 size_t RemoteBlobWriter::size() const { return buffer_ ? buffer_->size() : 0; }
 
 char* RemoteBlobWriter::data() {
diff --git a/src/client/ds/remote_blob.h b/src/client/ds/remote_blob.h
index a7f8b512fe..e79aa67ebf 100644
--- a/src/client/ds/remote_blob.h
+++ b/src/client/ds/remote_blob.h
@@ -146,6 +146,11 @@ class RemoteBlobWriter {
 
   ~RemoteBlobWriter();
 
+  static std::shared_ptr<RemoteBlobWriter> Make(const size_t size);
+
+  static std::shared_ptr<RemoteBlobWriter> Wrap(const uint8_t* data,
+                                                const size_t size);
+
   /**
    * @brief Get the size of the blob, i.e., the number of bytes of the data
    * payload in the blob.
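To make the new `RemoteBlobBuilder` bindings concrete, here is a usage sketch mirroring
the test above; the RPC endpoint is a placeholder, and `wrap` is the bytes overload added
in this patch:

```python
import vineyard

# connect to a remote vineyardd via RPC (host and port are placeholders)
rpc_client = vineyard.connect('vineyardd.example.com', 9600)

payload = b'abcdefgh1234567890uvwxyz'

# wrap the existing bytes buffer instead of allocating and copying it
builder = vineyard.RemoteBlobBuilder.wrap(payload)
blob_id = rpc_client.create_remote_blob(builder)

remote_blob = rpc_client.get_remote_blob(blob_id)
assert memoryview(remote_blob) == memoryview(payload)
```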