From e4838c9c53b639a9dfc688c0e8b6423689b376a9 Mon Sep 17 00:00:00 2001 From: Fred Li Date: Tue, 13 Jun 2023 11:01:40 -0700 Subject: [PATCH 01/40] Enable test for submit_spark_standalone_jobs --- sdk/python/readme.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sdk/python/readme.py b/sdk/python/readme.py index 5a0d41de43..ab11457096 100644 --- a/sdk/python/readme.py +++ b/sdk/python/readme.py @@ -22,7 +22,6 @@ "interactive_data_wrangling", "attach_manage_spark_pools", "submit_spark_pipeline_jobs", - "submit_spark_standalone_jobs", "submit_spark_standalone_jobs_managed_vnet", # mlflow SDK samples notebooks "mlflow_sdk_online_endpoints_progresive", From 3d92c07137d27ad95e3162807e13563430da8872 Mon Sep 17 00:00:00 2001 From: Fred Li Date: Tue, 13 Jun 2023 15:25:35 -0700 Subject: [PATCH 02/40] Generate workflow yaml --- ...obs-spark-submit_spark_standalone_jobs.yml | 75 +++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 .github/workflows/sdk-jobs-spark-submit_spark_standalone_jobs.yml diff --git a/.github/workflows/sdk-jobs-spark-submit_spark_standalone_jobs.yml b/.github/workflows/sdk-jobs-spark-submit_spark_standalone_jobs.yml new file mode 100644 index 0000000000..e0896b73d3 --- /dev/null +++ b/.github/workflows/sdk-jobs-spark-submit_spark_standalone_jobs.yml @@ -0,0 +1,75 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. + +name: sdk-jobs-spark-submit_spark_standalone_jobs +# This file is created by sdk/python/readme.py. +# Please do not edit directly. +on: + workflow_dispatch: + schedule: + - cron: "36 10/12 * * *" + pull_request: + branches: + - main + paths: + - sdk/python/jobs/spark/** + - .github/workflows/sdk-jobs-spark-submit_spark_standalone_jobs.yml + - sdk/python/dev-requirements.txt + - infra/bootstrapping/** + - sdk/python/setup.sh +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: check out repo + uses: actions/checkout@v2 + - name: setup python + uses: actions/setup-python@v2 + with: + python-version: "3.8" + - name: pip install notebook reqs + run: pip install -r sdk/python/dev-requirements.txt + - name: azure login + uses: azure/login@v1 + with: + creds: ${{secrets.AZUREML_CREDENTIALS}} + - name: bootstrap resources + run: | + echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}'; + bash bootstrap.sh + working-directory: infra/bootstrapping + continue-on-error: false + - name: setup SDK + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash setup.sh + working-directory: sdk/python + continue-on-error: true + - name: setup-cli + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash setup.sh + working-directory: cli + continue-on-error: true + - name: run jobs/spark/submit_spark_standalone_jobs.ipynb + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh" generate_workspace_config "../../.azureml/config.json"; + bash "${{ 
github.workspace }}/infra/bootstrapping/sdk_helpers.sh" replace_template_values "submit_spark_standalone_jobs.ipynb"; + [ -f "../../.azureml/config" ] && cat "../../.azureml/config"; + papermill -k python submit_spark_standalone_jobs.ipynb submit_spark_standalone_jobs.output.ipynb + working-directory: sdk/python/jobs/spark + - name: upload notebook's working folder as an artifact + if: ${{ always() }} + uses: actions/upload-artifact@v2 + with: + name: submit_spark_standalone_jobs + path: sdk/python/jobs/spark From f41b50c74f39b52ff61bda7a7b2abb43e6e4f650 Mon Sep 17 00:00:00 2001 From: Fred Li Date: Wed, 14 Jun 2023 16:11:52 -0700 Subject: [PATCH 03/40] update spark job files for automation test --- ...erverless-spark-pipeline-user-identity.yml | 50 +++++++++++++++++++ ...compute-attached-spark-system-identity.yml | 49 ++++++++++++++++++ ...s-compute-attached-spark-user-identity.yml | 49 ++++++++++++++++++ .../cli-resources-compute-attached-spark.yml | 49 ++++++++++++++++++ ...ached-spark-pipeline-default-identity.yml} | 0 ...ached-spark-pipeline-managed-identity.yml} | 0 ...attached-spark-pipeline-user-identity.yml} | 0 ...hed-spark-standalone-default-identity.yml} | 0 ...hed-spark-standalone-managed-identity.yml} | 0 ...tached-spark-standalone-user-identity.yml} | 0 ...rless-spark-pipeline-default-identity.yml} | 0 ...rless-spark-pipeline-managed-identity.yml} | 0 ...rverless-spark-pipeline-user-identity.yml} | 0 ...ess-spark-standalone-default-identity.yml} | 0 ...ess-spark-standalone-managed-identity.yml} | 0 ...erless-spark-standalone-user-identity.yml} | 0 ...component.yaml => spark-job-component.yml} | 0 .../spark/{storage_pe.yaml => storage_pe.yml} | 0 ...entity.yaml => user-assigned-identity.yml} | 0 cli/readme.py | 6 +-- ...aml => attached-spark-system-identity.yml} | 0 ....yaml => attached-spark-user-identity.yml} | 0 ...attached-spark.yaml => attached-spark.yml} | 0 .../spark/submit_spark_standalone_jobs.ipynb | 2 +- 24 files changed, 200 insertions(+), 5 deletions(-) create mode 100644 .github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml create mode 100644 .github/workflows/cli-resources-compute-attached-spark-system-identity.yml create mode 100644 .github/workflows/cli-resources-compute-attached-spark-user-identity.yml create mode 100644 .github/workflows/cli-resources-compute-attached-spark.yml rename cli/jobs/spark/{attached-spark-pipeline-default-identity.yaml => attached-spark-pipeline-default-identity.yml} (100%) rename cli/jobs/spark/{attached-spark-pipeline-managed-identity.yaml => attached-spark-pipeline-managed-identity.yml} (100%) rename cli/jobs/spark/{attached-spark-pipeline-user-identity.yaml => attached-spark-pipeline-user-identity.yml} (100%) rename cli/jobs/spark/{attached-spark-standalone-default-identity.yaml => attached-spark-standalone-default-identity.yml} (100%) rename cli/jobs/spark/{attached-spark-standalone-managed-identity.yaml => attached-spark-standalone-managed-identity.yml} (100%) rename cli/jobs/spark/{attached-spark-standalone-user-identity.yaml => attached-spark-standalone-user-identity.yml} (100%) rename cli/jobs/spark/{serverless-spark-pipeline-default-identity.yaml => serverless-spark-pipeline-default-identity.yml} (100%) rename cli/jobs/spark/{serverless-spark-pipeline-managed-identity.yaml => serverless-spark-pipeline-managed-identity.yml} (100%) rename cli/jobs/spark/{serverless-spark-pipeline-user-identity.yaml => serverless-spark-pipeline-user-identity.yml} (100%) rename 
cli/jobs/spark/{serverless-spark-standalone-default-identity.yaml => serverless-spark-standalone-default-identity.yml} (100%) rename cli/jobs/spark/{serverless-spark-standalone-managed-identity.yaml => serverless-spark-standalone-managed-identity.yml} (100%) rename cli/jobs/spark/{serverless-spark-standalone-user-identity.yaml => serverless-spark-standalone-user-identity.yml} (100%) rename cli/jobs/spark/{spark-job-component.yaml => spark-job-component.yml} (100%) rename cli/jobs/spark/{storage_pe.yaml => storage_pe.yml} (100%) rename cli/jobs/spark/{user-assigned-identity.yaml => user-assigned-identity.yml} (100%) rename cli/resources/compute/{attached-spark-system-identity.yaml => attached-spark-system-identity.yml} (100%) rename cli/resources/compute/{attached-spark-user-identity.yaml => attached-spark-user-identity.yml} (100%) rename cli/resources/compute/{attached-spark.yaml => attached-spark.yml} (100%) diff --git a/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml b/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml new file mode 100644 index 0000000000..f3c4517901 --- /dev/null +++ b/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml @@ -0,0 +1,50 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. + +name: cli-jobs-spark-serverless-spark-pipeline-user-identity +on: + workflow_dispatch: + schedule: + - cron: "44 8/12 * * *" + pull_request: + branches: + - main + paths: + - cli//** + - infra/bootstrapping/** + - .github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml + - cli/setup.sh +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: check out repo + uses: actions/checkout@v2 + - name: azure login + uses: azure/login@v1 + with: + creds: ${{secrets.AZUREML_CREDENTIALS}} + - name: bootstrap resources + run: | + echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}'; + bash bootstrap.sh + working-directory: infra/bootstrapping + continue-on-error: false + - name: setup-cli + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash setup.sh + working-directory: cli + continue-on-error: true + - name: run job + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash -x ./run-job.sh jobs/spark/serverless-spark-pipeline-user-identity.yml + working-directory: cli/ diff --git a/.github/workflows/cli-resources-compute-attached-spark-system-identity.yml b/.github/workflows/cli-resources-compute-attached-spark-system-identity.yml new file mode 100644 index 0000000000..bd6e9373d1 --- /dev/null +++ b/.github/workflows/cli-resources-compute-attached-spark-system-identity.yml @@ -0,0 +1,49 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. 
+ +name: cli-resources-compute-attached-spark-system-identity +on: + workflow_dispatch: + schedule: + - cron: "20 3/12 * * *" + pull_request: + branches: + - main + paths: + - cli/resources/compute/attached-spark-system-identity.yml + - infra/bootstrapping/** + - .github/workflows/cli-resources-compute-attached-spark-system-identity.yml + - cli/setup.sh +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: check out repo + uses: actions/checkout@v2 + - name: azure login + uses: azure/login@v1 + with: + creds: ${{secrets.AZUREML_CREDENTIALS}} + - name: bootstrap resources + run: | + bash bootstrap.sh + working-directory: infra + continue-on-error: false + - name: setup-cli + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash setup.sh + working-directory: cli + continue-on-error: true + - name: create asset + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + az ml compute create -f resources/compute/attached-spark-system-identity.yml + working-directory: cli diff --git a/.github/workflows/cli-resources-compute-attached-spark-user-identity.yml b/.github/workflows/cli-resources-compute-attached-spark-user-identity.yml new file mode 100644 index 0000000000..1b90b1c364 --- /dev/null +++ b/.github/workflows/cli-resources-compute-attached-spark-user-identity.yml @@ -0,0 +1,49 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. + +name: cli-resources-compute-attached-spark-user-identity +on: + workflow_dispatch: + schedule: + - cron: "39 4/12 * * *" + pull_request: + branches: + - main + paths: + - cli/resources/compute/attached-spark-user-identity.yml + - infra/bootstrapping/** + - .github/workflows/cli-resources-compute-attached-spark-user-identity.yml + - cli/setup.sh +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: check out repo + uses: actions/checkout@v2 + - name: azure login + uses: azure/login@v1 + with: + creds: ${{secrets.AZUREML_CREDENTIALS}} + - name: bootstrap resources + run: | + bash bootstrap.sh + working-directory: infra + continue-on-error: false + - name: setup-cli + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash setup.sh + working-directory: cli + continue-on-error: true + - name: create asset + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + az ml compute create -f resources/compute/attached-spark-user-identity.yml + working-directory: cli diff --git a/.github/workflows/cli-resources-compute-attached-spark.yml b/.github/workflows/cli-resources-compute-attached-spark.yml new file mode 100644 index 0000000000..f169dbbd58 --- /dev/null +++ b/.github/workflows/cli-resources-compute-attached-spark.yml @@ -0,0 +1,49 @@ +# This code is autogenerated. 
+# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. + +name: cli-resources-compute-attached-spark +on: + workflow_dispatch: + schedule: + - cron: "44 3/12 * * *" + pull_request: + branches: + - main + paths: + - cli/resources/compute/attached-spark.yml + - infra/bootstrapping/** + - .github/workflows/cli-resources-compute-attached-spark.yml + - cli/setup.sh +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: check out repo + uses: actions/checkout@v2 + - name: azure login + uses: azure/login@v1 + with: + creds: ${{secrets.AZUREML_CREDENTIALS}} + - name: bootstrap resources + run: | + bash bootstrap.sh + working-directory: infra + continue-on-error: false + - name: setup-cli + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash setup.sh + working-directory: cli + continue-on-error: true + - name: create asset + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + az ml compute create -f resources/compute/attached-spark.yml + working-directory: cli diff --git a/cli/jobs/spark/attached-spark-pipeline-default-identity.yaml b/cli/jobs/spark/attached-spark-pipeline-default-identity.yml similarity index 100% rename from cli/jobs/spark/attached-spark-pipeline-default-identity.yaml rename to cli/jobs/spark/attached-spark-pipeline-default-identity.yml diff --git a/cli/jobs/spark/attached-spark-pipeline-managed-identity.yaml b/cli/jobs/spark/attached-spark-pipeline-managed-identity.yml similarity index 100% rename from cli/jobs/spark/attached-spark-pipeline-managed-identity.yaml rename to cli/jobs/spark/attached-spark-pipeline-managed-identity.yml diff --git a/cli/jobs/spark/attached-spark-pipeline-user-identity.yaml b/cli/jobs/spark/attached-spark-pipeline-user-identity.yml similarity index 100% rename from cli/jobs/spark/attached-spark-pipeline-user-identity.yaml rename to cli/jobs/spark/attached-spark-pipeline-user-identity.yml diff --git a/cli/jobs/spark/attached-spark-standalone-default-identity.yaml b/cli/jobs/spark/attached-spark-standalone-default-identity.yml similarity index 100% rename from cli/jobs/spark/attached-spark-standalone-default-identity.yaml rename to cli/jobs/spark/attached-spark-standalone-default-identity.yml diff --git a/cli/jobs/spark/attached-spark-standalone-managed-identity.yaml b/cli/jobs/spark/attached-spark-standalone-managed-identity.yml similarity index 100% rename from cli/jobs/spark/attached-spark-standalone-managed-identity.yaml rename to cli/jobs/spark/attached-spark-standalone-managed-identity.yml diff --git a/cli/jobs/spark/attached-spark-standalone-user-identity.yaml b/cli/jobs/spark/attached-spark-standalone-user-identity.yml similarity index 100% rename from cli/jobs/spark/attached-spark-standalone-user-identity.yaml rename to cli/jobs/spark/attached-spark-standalone-user-identity.yml diff --git a/cli/jobs/spark/serverless-spark-pipeline-default-identity.yaml b/cli/jobs/spark/serverless-spark-pipeline-default-identity.yml similarity index 100% rename from cli/jobs/spark/serverless-spark-pipeline-default-identity.yaml rename to 
cli/jobs/spark/serverless-spark-pipeline-default-identity.yml diff --git a/cli/jobs/spark/serverless-spark-pipeline-managed-identity.yaml b/cli/jobs/spark/serverless-spark-pipeline-managed-identity.yml similarity index 100% rename from cli/jobs/spark/serverless-spark-pipeline-managed-identity.yaml rename to cli/jobs/spark/serverless-spark-pipeline-managed-identity.yml diff --git a/cli/jobs/spark/serverless-spark-pipeline-user-identity.yaml b/cli/jobs/spark/serverless-spark-pipeline-user-identity.yml similarity index 100% rename from cli/jobs/spark/serverless-spark-pipeline-user-identity.yaml rename to cli/jobs/spark/serverless-spark-pipeline-user-identity.yml diff --git a/cli/jobs/spark/serverless-spark-standalone-default-identity.yaml b/cli/jobs/spark/serverless-spark-standalone-default-identity.yml similarity index 100% rename from cli/jobs/spark/serverless-spark-standalone-default-identity.yaml rename to cli/jobs/spark/serverless-spark-standalone-default-identity.yml diff --git a/cli/jobs/spark/serverless-spark-standalone-managed-identity.yaml b/cli/jobs/spark/serverless-spark-standalone-managed-identity.yml similarity index 100% rename from cli/jobs/spark/serverless-spark-standalone-managed-identity.yaml rename to cli/jobs/spark/serverless-spark-standalone-managed-identity.yml diff --git a/cli/jobs/spark/serverless-spark-standalone-user-identity.yaml b/cli/jobs/spark/serverless-spark-standalone-user-identity.yml similarity index 100% rename from cli/jobs/spark/serverless-spark-standalone-user-identity.yaml rename to cli/jobs/spark/serverless-spark-standalone-user-identity.yml diff --git a/cli/jobs/spark/spark-job-component.yaml b/cli/jobs/spark/spark-job-component.yml similarity index 100% rename from cli/jobs/spark/spark-job-component.yaml rename to cli/jobs/spark/spark-job-component.yml diff --git a/cli/jobs/spark/storage_pe.yaml b/cli/jobs/spark/storage_pe.yml similarity index 100% rename from cli/jobs/spark/storage_pe.yaml rename to cli/jobs/spark/storage_pe.yml diff --git a/cli/jobs/spark/user-assigned-identity.yaml b/cli/jobs/spark/user-assigned-identity.yml similarity index 100% rename from cli/jobs/spark/user-assigned-identity.yaml rename to cli/jobs/spark/user-assigned-identity.yml diff --git a/cli/readme.py b/cli/readme.py index 1f6012fcc1..4e5dbf1944 100644 --- a/cli/readme.py +++ b/cli/readme.py @@ -9,7 +9,7 @@ import yaml # define constants -EXCLUDED_JOBS = ["java", "spark"] +EXCLUDED_JOBS = ["java"] # TODO: Re-include these below endpoints and deployments when the workflow generation code supports substituting vars in .yaml files. 
EXCLUDED_ENDPOINTS = [ "1-uai-create-endpoint", @@ -33,9 +33,6 @@ "instance", "connections", "compute/cluster-user-identity", - "compute/attached-spark", - "compute/attached-spark-system-identity", - "compute/attached-spark-user-identity", "registry", ] EXCLUDED_ASSETS = ["conda-yamls", "mlflow-models"] @@ -77,6 +74,7 @@ def main(args): jobs += sorted(glob.glob("jobs/basics/*.yml", recursive=False)) jobs += sorted(glob.glob("jobs/*/basics/**/*job*.yml", recursive=True)) jobs += sorted(glob.glob("jobs/pipelines/**/*pipeline*.yml", recursive=True)) + jobs += sorted(glob.glob("jobs/spark/*.yml", recursive=False)) jobs += sorted( glob.glob("jobs/automl-standalone-jobs/**/cli-automl-*.yml", recursive=True) ) diff --git a/cli/resources/compute/attached-spark-system-identity.yaml b/cli/resources/compute/attached-spark-system-identity.yml similarity index 100% rename from cli/resources/compute/attached-spark-system-identity.yaml rename to cli/resources/compute/attached-spark-system-identity.yml diff --git a/cli/resources/compute/attached-spark-user-identity.yaml b/cli/resources/compute/attached-spark-user-identity.yml similarity index 100% rename from cli/resources/compute/attached-spark-user-identity.yaml rename to cli/resources/compute/attached-spark-user-identity.yml diff --git a/cli/resources/compute/attached-spark.yaml b/cli/resources/compute/attached-spark.yml similarity index 100% rename from cli/resources/compute/attached-spark.yaml rename to cli/resources/compute/attached-spark.yml diff --git a/sdk/python/jobs/spark/submit_spark_standalone_jobs.ipynb b/sdk/python/jobs/spark/submit_spark_standalone_jobs.ipynb index 4a57bfed91..bb5a1c091a 100644 --- a/sdk/python/jobs/spark/submit_spark_standalone_jobs.ipynb +++ b/sdk/python/jobs/spark/submit_spark_standalone_jobs.ipynb @@ -126,7 +126,7 @@ " executor_cores=2,\n", " executor_memory=\"2g\",\n", " executor_instances=2,\n", - " compute=\"\",\n", + " compute=\"my-spark-pool\",\n", " inputs={\n", " \"titanic_data\": Input(\n", " type=\"uri_file\",\n", From 30992086b7c63755ed05768b9f6db33d232544c2 Mon Sep 17 00:00:00 2001 From: Fred Li Date: Thu, 15 Jun 2023 11:06:18 -0700 Subject: [PATCH 04/40] Add workflow for serverless spark with user identity job --- ...verless-spark-standalone-user-identity.yml | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 .github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml diff --git a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml new file mode 100644 index 0000000000..9a9b030e67 --- /dev/null +++ b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml @@ -0,0 +1,50 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. 
+ +name: cli-jobs-spark-serverless-spark-standalone-user-identity +on: + workflow_dispatch: + schedule: + - cron: "27 1/12 * * *" + pull_request: + branches: + - main + paths: + - cli/jobs/spark/** + - infra/bootstrapping/** + - .github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml + - cli/setup.sh +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: check out repo + uses: actions/checkout@v2 + - name: azure login + uses: azure/login@v1 + with: + creds: ${{secrets.AZUREML_CREDENTIALS}} + - name: bootstrap resources + run: | + echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}'; + bash bootstrap.sh + working-directory: infra/bootstrapping + continue-on-error: false + - name: setup-cli + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash setup.sh + working-directory: cli + continue-on-error: true + - name: run job + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash -x ../../run-job.sh serverless-spark-standalone-user-identity.yml + working-directory: cli/jobs/spark From e20a24b049d57c1ac15beec8606f06e27cb257e8 Mon Sep 17 00:00:00 2001 From: Fred Li Date: Thu, 15 Jun 2023 14:20:17 -0700 Subject: [PATCH 05/40] Add scripts to upload input data --- ...erverless-spark-pipeline-user-identity.yml | 11 ++-- ...less-spark-standalone-default-identity.yml | 53 +++++++++++++++++++ ...verless-spark-standalone-user-identity.yml | 3 ++ cli/upload-data-to-blob.sh | 3 ++ 4 files changed, 66 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml create mode 100644 cli/upload-data-to-blob.sh diff --git a/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml b/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml index f3c4517901..3956c06545 100644 --- a/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml +++ b/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml @@ -7,12 +7,12 @@ name: cli-jobs-spark-serverless-spark-pipeline-user-identity on: workflow_dispatch: schedule: - - cron: "44 8/12 * * *" + - cron: "56 7/12 * * *" pull_request: branches: - main paths: - - cli//** + - cli/jobs/spark/** - infra/bootstrapping/** - .github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml - cli/setup.sh @@ -42,9 +42,12 @@ jobs: bash setup.sh working-directory: cli continue-on-error: true + - name: upload data + run: | + bash upload-data-to-blob.sh - name: run job run: | source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; - bash -x ./run-job.sh jobs/spark/serverless-spark-pipeline-user-identity.yml - working-directory: cli/ + bash -x ../../run-job.sh serverless-spark-pipeline-user-identity.yml + working-directory: cli/jobs/spark diff --git a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml new file mode 100644 index 0000000000..0271de1585 --- /dev/null +++ 
b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml @@ -0,0 +1,53 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. + +name: cli-jobs-spark-serverless-spark-standalone-default-identity +on: + workflow_dispatch: + schedule: + - cron: "19 11/12 * * *" + pull_request: + branches: + - main + paths: + - cli/jobs/spark/** + - infra/bootstrapping/** + - .github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml + - cli/setup.sh +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: check out repo + uses: actions/checkout@v2 + - name: azure login + uses: azure/login@v1 + with: + creds: ${{secrets.AZUREML_CREDENTIALS}} + - name: bootstrap resources + run: | + echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}'; + bash bootstrap.sh + working-directory: infra/bootstrapping + continue-on-error: false + - name: setup-cli + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash setup.sh + working-directory: cli + continue-on-error: true + - name: upload data + run: | + bash upload-data-to-blob.sh + - name: run job + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash -x ../../run-job.sh serverless-spark-standalone-default-identity.yml + working-directory: cli/jobs/spark diff --git a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml index 2d67118621..46fedd01c5 100644 --- a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml +++ b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml @@ -42,6 +42,9 @@ jobs: bash setup.sh working-directory: cli continue-on-error: true + - name: upload data + run: | + bash upload-data-to-blob.sh - name: run job run: | source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; diff --git a/cli/upload-data-to-blob.sh b/cli/upload-data-to-blob.sh new file mode 100644 index 0000000000..ba81db05d6 --- /dev/null +++ b/cli/upload-data-to-blob.sh @@ -0,0 +1,3 @@ +# +az storage blob upload -c $AZUREML_DEFAULT_CONTAINER -n paths/data -f cli/jobs/spark/titanic.csv --account-name $AZURE_STORAGE_ACCOUNT +# \ No newline at end of file From 04d78f3b9a18c52963a6b0621e31bb0195ad62ef Mon Sep 17 00:00:00 2001 From: Fred Li Date: Thu, 15 Jun 2023 14:29:31 -0700 Subject: [PATCH 06/40] Update workflow to refer to the script --- .../cli-jobs-spark-serverless-spark-pipeline-user-identity.yml | 1 + ...i-jobs-spark-serverless-spark-standalone-default-identity.yml | 1 + .../cli-jobs-spark-serverless-spark-standalone-user-identity.yml | 1 + 3 files changed, 3 insertions(+) diff --git a/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml b/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml index 3956c06545..285a6723db 100644 --- a/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml +++ 
b/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml @@ -15,6 +15,7 @@ on: - cli/jobs/spark/** - infra/bootstrapping/** - .github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml + - cli/upload-data-to-blob.sh - cli/setup.sh concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} diff --git a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml index 0271de1585..7d98375eec 100644 --- a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml +++ b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml @@ -15,6 +15,7 @@ on: - cli/jobs/spark/** - infra/bootstrapping/** - .github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml + - cli/upload-data-to-blob.sh - cli/setup.sh concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} diff --git a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml index 2d67118621..46fedd01c5 100644 --- a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml +++ b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml @@ -15,6 +15,7 @@ on: - cli/jobs/spark/** - infra/bootstrapping/** - .github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml + - cli/upload-data-to-blob.sh - cli/setup.sh concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} From d452e9dba6f291670958d923158fc6116f3eb23f Mon Sep 17 00:00:00 2001 From: Fred Li Date: Thu, 15 Jun 2023 14:34:23 -0700 Subject: [PATCH 07/40] Update source file path --- cli/upload-data-to-blob.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cli/upload-data-to-blob.sh b/cli/upload-data-to-blob.sh index ba81db05d6..235c46920a 100644 --- a/cli/upload-data-to-blob.sh +++ b/cli/upload-data-to-blob.sh @@ -1,3 +1,3 @@ # -az storage blob upload -c $AZUREML_DEFAULT_CONTAINER -n paths/data -f cli/jobs/spark/titanic.csv --account-name $AZURE_STORAGE_ACCOUNT +az storage blob upload -c $AZUREML_DEFAULT_CONTAINER -n paths/data -f cli/jobs/spark/data/titanic.csv --account-name $AZURE_STORAGE_ACCOUNT # \ No newline at end of file From 09562748a5d00dbb702fd6a6500dca1a089af350 Mon Sep 17 00:00:00 2001 From: Fred Li Date: Thu, 15 Jun 2023 14:47:38 -0700 Subject: [PATCH 08/40] Update workflow with correct file path --- .../cli-jobs-spark-serverless-spark-pipeline-user-identity.yml | 3 +-- ...jobs-spark-serverless-spark-standalone-default-identity.yml | 3 +-- ...li-jobs-spark-serverless-spark-standalone-user-identity.yml | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml b/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml index 285a6723db..c5a1a0cdc3 100644 --- a/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml +++ b/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml @@ -15,7 +15,6 @@ on: - cli/jobs/spark/** - infra/bootstrapping/** - .github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml - - cli/upload-data-to-blob.sh - cli/setup.sh concurrency: group: ${{ github.workflow }}-${{ 
github.event.pull_request.number || github.ref }} @@ -45,7 +44,7 @@ jobs: continue-on-error: true - name: upload data run: | - bash upload-data-to-blob.sh + bash ../../upload-data-to-blob.sh - name: run job run: | source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; diff --git a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml index 7d98375eec..325a0fb637 100644 --- a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml +++ b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml @@ -15,7 +15,6 @@ on: - cli/jobs/spark/** - infra/bootstrapping/** - .github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml - - cli/upload-data-to-blob.sh - cli/setup.sh concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -45,7 +44,7 @@ jobs: continue-on-error: true - name: upload data run: | - bash upload-data-to-blob.sh + bash ../../upload-data-to-blob.sh - name: run job run: | source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; diff --git a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml index 46fedd01c5..bd7b2e2a67 100644 --- a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml +++ b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml @@ -15,7 +15,6 @@ on: - cli/jobs/spark/** - infra/bootstrapping/** - .github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml - - cli/upload-data-to-blob.sh - cli/setup.sh concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -45,7 +44,7 @@ jobs: continue-on-error: true - name: upload data run: | - bash upload-data-to-blob.sh + bash ../../upload-data-to-blob.sh - name: run job run: | source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; From f56be5671364ae466130028a6568cf90d913cf9d Mon Sep 17 00:00:00 2001 From: Fred Li Date: Thu, 15 Jun 2023 15:17:11 -0700 Subject: [PATCH 09/40] Update working directory --- .../cli-jobs-spark-serverless-spark-pipeline-user-identity.yml | 1 + ...i-jobs-spark-serverless-spark-standalone-default-identity.yml | 1 + .../cli-jobs-spark-serverless-spark-standalone-user-identity.yml | 1 + 3 files changed, 3 insertions(+) diff --git a/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml b/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml index c5a1a0cdc3..dcf7d54dd0 100644 --- a/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml +++ b/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml @@ -45,6 +45,7 @@ jobs: - name: upload data run: | bash ../../upload-data-to-blob.sh + working-directory: cli - name: run job run: | source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; diff --git a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml index 325a0fb637..b308d4c377 100644 --- a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml +++ b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml @@ -45,6 +45,7 @@ jobs: - name: upload data run: | bash 
../../upload-data-to-blob.sh + working-directory: cli - name: run job run: | source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; diff --git a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml index bd7b2e2a67..15e1dea9e0 100644 --- a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml +++ b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml @@ -45,6 +45,7 @@ jobs: - name: upload data run: | bash ../../upload-data-to-blob.sh + working-directory: cli - name: run job run: | source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; From f5f39751aef8154c8c1269bf8f180647a1dfe41d Mon Sep 17 00:00:00 2001 From: Fred Li Date: Thu, 15 Jun 2023 15:38:51 -0700 Subject: [PATCH 10/40] Update workflow --- .../cli-jobs-spark-serverless-spark-pipeline-user-identity.yml | 3 ++- ...jobs-spark-serverless-spark-standalone-default-identity.yml | 3 ++- ...li-jobs-spark-serverless-spark-standalone-user-identity.yml | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml b/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml index dcf7d54dd0..55e7962e26 100644 --- a/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml +++ b/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml @@ -15,6 +15,7 @@ on: - cli/jobs/spark/** - infra/bootstrapping/** - .github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml + - cli/jobs/spark/data/titanic.csv - cli/setup.sh concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -44,7 +45,7 @@ jobs: continue-on-error: true - name: upload data run: | - bash ../../upload-data-to-blob.sh + bash -x ../../upload-data-to-blob.sh titanic.csv working-directory: cli - name: run job run: | diff --git a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml index b308d4c377..757ffc3492 100644 --- a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml +++ b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml @@ -15,6 +15,7 @@ on: - cli/jobs/spark/** - infra/bootstrapping/** - .github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml + - cli/jobs/spark/data/titanic.csv - cli/setup.sh concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -44,7 +45,7 @@ jobs: continue-on-error: true - name: upload data run: | - bash ../../upload-data-to-blob.sh + bash -x ../../upload-data-to-blob.sh titanic.csv working-directory: cli - name: run job run: | diff --git a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml index 15e1dea9e0..3dc5376ae6 100644 --- a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml +++ b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml @@ -15,6 +15,7 @@ on: - cli/jobs/spark/** - infra/bootstrapping/** - .github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml + - cli/jobs/spark/data/titanic.csv - cli/setup.sh concurrency: group: ${{ github.workflow }}-${{ 
github.event.pull_request.number || github.ref }} @@ -44,7 +45,7 @@ jobs: continue-on-error: true - name: upload data run: | - bash ../../upload-data-to-blob.sh + bash -x ../../upload-data-to-blob.sh titanic.csv working-directory: cli - name: run job run: | From 70f6debe289fe70e86bce9bad5ab8c40dff838e1 Mon Sep 17 00:00:00 2001 From: Fred Li Date: Thu, 15 Jun 2023 19:06:11 -0700 Subject: [PATCH 11/40] Update the path --- .../cli-jobs-spark-serverless-spark-pipeline-user-identity.yml | 2 +- ...-jobs-spark-serverless-spark-standalone-default-identity.yml | 2 +- ...cli-jobs-spark-serverless-spark-standalone-user-identity.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml b/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml index 55e7962e26..b55164e0a0 100644 --- a/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml +++ b/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml @@ -45,7 +45,7 @@ jobs: continue-on-error: true - name: upload data run: | - bash -x ../../upload-data-to-blob.sh titanic.csv + bash -x upload-data-to-blob.sh jobs/spark/data/titanic.csv working-directory: cli - name: run job run: | diff --git a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml index 757ffc3492..6a159e47d6 100644 --- a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml +++ b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml @@ -45,7 +45,7 @@ jobs: continue-on-error: true - name: upload data run: | - bash -x ../../upload-data-to-blob.sh titanic.csv + bash -x upload-data-to-blob.sh jobs/spark/data/titanic.csv working-directory: cli - name: run job run: | diff --git a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml index 3dc5376ae6..dac2359d2f 100644 --- a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml +++ b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml @@ -45,7 +45,7 @@ jobs: continue-on-error: true - name: upload data run: | - bash -x ../../upload-data-to-blob.sh titanic.csv + bash -x upload-data-to-blob.sh jobs/spark/data/titanic.csv working-directory: cli - name: run job run: | From ec30c16b8df83a510ce2434e52e4789361080ed0 Mon Sep 17 00:00:00 2001 From: Fred Li Date: Thu, 15 Jun 2023 19:41:22 -0700 Subject: [PATCH 12/40] Update the script to upload data --- cli/upload-data-to-blob.sh | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/cli/upload-data-to-blob.sh b/cli/upload-data-to-blob.sh index 235c46920a..9b9306b2e8 100644 --- a/cli/upload-data-to-blob.sh +++ b/cli/upload-data-to-blob.sh @@ -1,3 +1,19 @@ -# -az storage blob upload -c $AZUREML_DEFAULT_CONTAINER -n paths/data -f cli/jobs/spark/data/titanic.csv --account-name $AZURE_STORAGE_ACCOUNT -# \ No newline at end of file +# +SUBSCRIPTION_ID=$(az account show --query id -o tsv) +LOCATION=$(az ml workspace show --query location -o tsv) +RESOURCE_GROUP=$(az group show --query name -o tsv) +WORKSPACE=$(az configure -l --query "[?name=='workspace'].value" -o tsv) +API_VERSION="2022-05-01" +TOKEN=$(az account get-access-token --query accessToken -o tsv) +# + +# +response=$(curl --location --request 
GET "https://management.azure.com/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.MachineLearningServices/workspaces/$WORKSPACE/datastores?api-version=$API_VERSION&isDefault=true" \ +--header "Authorization: Bearer $TOKEN") +AZUREML_DEFAULT_CONTAINER=$(echo $response | jq -r '.value[0].properties.containerName') +export AZURE_STORAGE_ACCOUNT=$(echo $response | jq -r '.value[0].properties.accountName') +# + +# +az storage blob upload -c $AZUREML_DEFAULT_CONTAINER -n paths/data -f $1 --account-name $AZURE_STORAGE_ACCOUNT +# \ No newline at end of file From ec9da6e3c9d872f7849d09d8b24e7b000f0bae7d Mon Sep 17 00:00:00 2001 From: Fred Li Date: Thu, 15 Jun 2023 19:48:09 -0700 Subject: [PATCH 13/40] Update the overwrite mode --- cli/upload-data-to-blob.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cli/upload-data-to-blob.sh b/cli/upload-data-to-blob.sh index 9b9306b2e8..816859fa22 100644 --- a/cli/upload-data-to-blob.sh +++ b/cli/upload-data-to-blob.sh @@ -15,5 +15,5 @@ export AZURE_STORAGE_ACCOUNT=$(echo $response | jq -r '.value[0].properties.acco # # -az storage blob upload -c $AZUREML_DEFAULT_CONTAINER -n paths/data -f $1 --account-name $AZURE_STORAGE_ACCOUNT +az storage blob upload -c $AZUREML_DEFAULT_CONTAINER -n paths/data -f $1 --account-name $AZURE_STORAGE_ACCOUNT --overwrite true # \ No newline at end of file From f08200b0f8a35a3c0b1576079fba5a4859ccf2a0 Mon Sep 17 00:00:00 2001 From: Fred Li Date: Thu, 15 Jun 2023 23:39:43 -0700 Subject: [PATCH 14/40] Update destination blob name --- cli/upload-data-to-blob.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cli/upload-data-to-blob.sh b/cli/upload-data-to-blob.sh index 816859fa22..d1e1a73116 100644 --- a/cli/upload-data-to-blob.sh +++ b/cli/upload-data-to-blob.sh @@ -15,5 +15,5 @@ export AZURE_STORAGE_ACCOUNT=$(echo $response | jq -r '.value[0].properties.acco # # -az storage blob upload -c $AZUREML_DEFAULT_CONTAINER -n paths/data -f $1 --account-name $AZURE_STORAGE_ACCOUNT --overwrite true +az storage blob upload -c $AZUREML_DEFAULT_CONTAINER -n paths/data/titanic.csv -f $1 --account-name $AZURE_STORAGE_ACCOUNT --overwrite true # \ No newline at end of file From 76fb70da0a7ee4420e6f9991f381e14114a98bf1 Mon Sep 17 00:00:00 2001 From: Fred Li Date: Fri, 16 Jun 2023 07:40:23 -0700 Subject: [PATCH 15/40] Use blob upload batch --- .../cli-jobs-spark-serverless-spark-pipeline-user-identity.yml | 2 +- ...-jobs-spark-serverless-spark-standalone-default-identity.yml | 2 +- ...cli-jobs-spark-serverless-spark-standalone-user-identity.yml | 2 +- cli/upload-data-to-blob.sh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml b/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml index b55164e0a0..d4bab3b999 100644 --- a/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml +++ b/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml @@ -45,7 +45,7 @@ jobs: continue-on-error: true - name: upload data run: | - bash -x upload-data-to-blob.sh jobs/spark/data/titanic.csv + bash -x upload-data-to-blob.sh jobs/spark/ working-directory: cli - name: run job run: | diff --git a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml index 6a159e47d6..bbff3d7c89 100644 --- 
a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml +++ b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml @@ -45,7 +45,7 @@ jobs: continue-on-error: true - name: upload data run: | - bash -x upload-data-to-blob.sh jobs/spark/data/titanic.csv + bash -x upload-data-to-blob.sh jobs/spark/ working-directory: cli - name: run job run: | diff --git a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml index dac2359d2f..fce4fbf974 100644 --- a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml +++ b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml @@ -45,7 +45,7 @@ jobs: continue-on-error: true - name: upload data run: | - bash -x upload-data-to-blob.sh jobs/spark/data/titanic.csv + bash -x upload-data-to-blob.sh jobs/spark/ working-directory: cli - name: run job run: | diff --git a/cli/upload-data-to-blob.sh b/cli/upload-data-to-blob.sh index d1e1a73116..c665625efa 100644 --- a/cli/upload-data-to-blob.sh +++ b/cli/upload-data-to-blob.sh @@ -15,5 +15,5 @@ export AZURE_STORAGE_ACCOUNT=$(echo $response | jq -r '.value[0].properties.acco # # -az storage blob upload -c $AZUREML_DEFAULT_CONTAINER -n paths/data/titanic.csv -f $1 --account-name $AZURE_STORAGE_ACCOUNT --overwrite true +az storage blob upload-batch -s $1 --pattern *.csv -d $AZUREML_DEFAULT_CONTAINER --account-name $AZURE_STORAGE_ACCOUNT --overwrite true # \ No newline at end of file From 4caf85122ed10f1cdadaa553d7f1c0e0fd71956c Mon Sep 17 00:00:00 2001 From: Fred Li Date: Fri, 16 Jun 2023 11:09:00 -0700 Subject: [PATCH 16/40] Add spark pipeline tests --- ...erless-spark-pipeline-default-identity.yml | 55 +++++++++++++++++++ ...erless-spark-pipeline-managed-identity.yml | 55 +++++++++++++++++++ ...less-spark-standalone-managed-identity.yml | 55 +++++++++++++++++++ ...erless-spark-pipeline-default-identity.yml | 2 +- ...erless-spark-pipeline-managed-identity.yml | 2 +- ...erverless-spark-pipeline-user-identity.yml | 2 +- 6 files changed, 168 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/cli-jobs-spark-serverless-spark-pipeline-default-identity.yml create mode 100644 .github/workflows/cli-jobs-spark-serverless-spark-pipeline-managed-identity.yml create mode 100644 .github/workflows/cli-jobs-spark-serverless-spark-standalone-managed-identity.yml diff --git a/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-default-identity.yml b/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-default-identity.yml new file mode 100644 index 0000000000..70cdba7b0c --- /dev/null +++ b/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-default-identity.yml @@ -0,0 +1,55 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. 
+ +name: cli-jobs-spark-serverless-spark-pipeline-default-identity +on: + workflow_dispatch: + schedule: + - cron: "33 10/12 * * *" + pull_request: + branches: + - main + paths: + - cli/jobs/spark/** + - infra/bootstrapping/** + - .github/workflows/cli-jobs-spark-serverless-spark-pipeline-default-identity.yml + - cli/jobs/spark/data/titanic.csv + - cli/setup.sh +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: check out repo + uses: actions/checkout@v2 + - name: azure login + uses: azure/login@v1 + with: + creds: ${{secrets.AZUREML_CREDENTIALS}} + - name: bootstrap resources + run: | + echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}'; + bash bootstrap.sh + working-directory: infra/bootstrapping + continue-on-error: false + - name: setup-cli + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash setup.sh + working-directory: cli + continue-on-error: true + - name: upload data + run: | + bash -x upload-data-to-blob.sh jobs/spark/ + working-directory: cli + - name: run job + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash -x ../../run-job.sh serverless-spark-pipeline-default-identity.yml + working-directory: cli/jobs/spark diff --git a/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-managed-identity.yml b/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-managed-identity.yml new file mode 100644 index 0000000000..a96b33255b --- /dev/null +++ b/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-managed-identity.yml @@ -0,0 +1,55 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. 
+ +name: cli-jobs-spark-serverless-spark-pipeline-managed-identity +on: + workflow_dispatch: + schedule: + - cron: "57 5/12 * * *" + pull_request: + branches: + - main + paths: + - cli/jobs/spark/** + - infra/bootstrapping/** + - .github/workflows/cli-jobs-spark-serverless-spark-pipeline-managed-identity.yml + - cli/jobs/spark/data/titanic.csv + - cli/setup.sh +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: check out repo + uses: actions/checkout@v2 + - name: azure login + uses: azure/login@v1 + with: + creds: ${{secrets.AZUREML_CREDENTIALS}} + - name: bootstrap resources + run: | + echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}'; + bash bootstrap.sh + working-directory: infra/bootstrapping + continue-on-error: false + - name: setup-cli + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash setup.sh + working-directory: cli + continue-on-error: true + - name: upload data + run: | + bash -x upload-data-to-blob.sh jobs/spark/ + working-directory: cli + - name: run job + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash -x ../../run-job.sh serverless-spark-pipeline-managed-identity.yml + working-directory: cli/jobs/spark diff --git a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-managed-identity.yml b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-managed-identity.yml new file mode 100644 index 0000000000..b76e136ba4 --- /dev/null +++ b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-managed-identity.yml @@ -0,0 +1,55 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. 
+ +name: cli-jobs-spark-serverless-spark-standalone-managed-identity +on: + workflow_dispatch: + schedule: + - cron: "46 0/12 * * *" + pull_request: + branches: + - main + paths: + - cli/jobs/spark/** + - infra/bootstrapping/** + - .github/workflows/cli-jobs-spark-serverless-spark-standalone-managed-identity.yml + - cli/jobs/spark/data/titanic.csv + - cli/setup.sh +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: check out repo + uses: actions/checkout@v2 + - name: azure login + uses: azure/login@v1 + with: + creds: ${{secrets.AZUREML_CREDENTIALS}} + - name: bootstrap resources + run: | + echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}'; + bash bootstrap.sh + working-directory: infra/bootstrapping + continue-on-error: false + - name: setup-cli + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash setup.sh + working-directory: cli + continue-on-error: true + - name: upload data + run: | + bash -x upload-data-to-blob.sh jobs/spark/ + working-directory: cli + - name: run job + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash -x ../../run-job.sh serverless-spark-standalone-managed-identity.yml + working-directory: cli/jobs/spark diff --git a/cli/jobs/spark/serverless-spark-pipeline-default-identity.yml b/cli/jobs/spark/serverless-spark-pipeline-default-identity.yml index 89bd887e8f..e72d9e4c94 100644 --- a/cli/jobs/spark/serverless-spark-pipeline-default-identity.yml +++ b/cli/jobs/spark/serverless-spark-pipeline-default-identity.yml @@ -7,7 +7,7 @@ description: Spark component for Titanic data in Pipeline jobs: spark_job: type: spark - component: ./spark-job-component.yaml + component: spark-job-component.yaml inputs: titanic_data: type: uri_file diff --git a/cli/jobs/spark/serverless-spark-pipeline-managed-identity.yml b/cli/jobs/spark/serverless-spark-pipeline-managed-identity.yml index 4807083944..c7f50f84d5 100644 --- a/cli/jobs/spark/serverless-spark-pipeline-managed-identity.yml +++ b/cli/jobs/spark/serverless-spark-pipeline-managed-identity.yml @@ -7,7 +7,7 @@ description: Spark component for Titanic data in Pipeline jobs: spark_job: type: spark - component: ./spark-job-component.yaml + component: spark-job-component.yaml inputs: titanic_data: type: uri_file diff --git a/cli/jobs/spark/serverless-spark-pipeline-user-identity.yml b/cli/jobs/spark/serverless-spark-pipeline-user-identity.yml index 585dacfad9..5e2ebf70a5 100644 --- a/cli/jobs/spark/serverless-spark-pipeline-user-identity.yml +++ b/cli/jobs/spark/serverless-spark-pipeline-user-identity.yml @@ -7,7 +7,7 @@ description: Spark component for Titanic data in Pipeline jobs: spark_job: type: spark - component: ./spark-job-component.yaml + component: spark-job-component.yaml inputs: titanic_data: type: uri_file From b5682d7c2dfffe9c2d853c509c9cebaf073ded91 Mon Sep 17 00:00:00 2001 From: Fred Li Date: Fri, 16 Jun 2023 14:57:09 -0700 Subject: [PATCH 17/40] Update spark component extension --- cli/jobs/spark/serverless-spark-pipeline-default-identity.yml | 2 +- cli/jobs/spark/serverless-spark-pipeline-managed-identity.yml | 2 +- cli/jobs/spark/serverless-spark-pipeline-user-identity.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff 
--git a/cli/jobs/spark/serverless-spark-pipeline-default-identity.yml b/cli/jobs/spark/serverless-spark-pipeline-default-identity.yml index e72d9e4c94..dc3b573c3f 100644 --- a/cli/jobs/spark/serverless-spark-pipeline-default-identity.yml +++ b/cli/jobs/spark/serverless-spark-pipeline-default-identity.yml @@ -7,7 +7,7 @@ description: Spark component for Titanic data in Pipeline jobs: spark_job: type: spark - component: spark-job-component.yaml + component: ./spark-job-component.yml inputs: titanic_data: type: uri_file diff --git a/cli/jobs/spark/serverless-spark-pipeline-managed-identity.yml b/cli/jobs/spark/serverless-spark-pipeline-managed-identity.yml index c7f50f84d5..4419b716af 100644 --- a/cli/jobs/spark/serverless-spark-pipeline-managed-identity.yml +++ b/cli/jobs/spark/serverless-spark-pipeline-managed-identity.yml @@ -7,7 +7,7 @@ description: Spark component for Titanic data in Pipeline jobs: spark_job: type: spark - component: spark-job-component.yaml + component: ./spark-job-component.yml inputs: titanic_data: type: uri_file diff --git a/cli/jobs/spark/serverless-spark-pipeline-user-identity.yml b/cli/jobs/spark/serverless-spark-pipeline-user-identity.yml index 5e2ebf70a5..f679ad6498 100644 --- a/cli/jobs/spark/serverless-spark-pipeline-user-identity.yml +++ b/cli/jobs/spark/serverless-spark-pipeline-user-identity.yml @@ -7,7 +7,7 @@ description: Spark component for Titanic data in Pipeline jobs: spark_job: type: spark - component: spark-job-component.yaml + component: ./spark-job-component.yml inputs: titanic_data: type: uri_file From 70e86c1991f4afb92ea9f15cf0d7bbbaf7169eb2 Mon Sep 17 00:00:00 2001 From: Fred Li Date: Mon, 19 Jun 2023 11:19:09 -0700 Subject: [PATCH 18/40] Add script to attach UAI --- ...verless-spark-pipeline-managed-identity.yml | 4 ++++ ...rless-spark-standalone-managed-identity.yml | 4 ++++ cli/jobs/spark/setup-identities.sh | 18 ++++++++++++++++++ cli/jobs/spark/user-assigned-identity.yml | 3 +-- 4 files changed, 27 insertions(+), 2 deletions(-) create mode 100644 cli/jobs/spark/setup-identities.sh diff --git a/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-managed-identity.yml b/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-managed-identity.yml index a96b33255b..50e8c458d2 100644 --- a/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-managed-identity.yml +++ b/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-managed-identity.yml @@ -47,6 +47,10 @@ jobs: run: | bash -x upload-data-to-blob.sh jobs/spark/ working-directory: cli + -name: setup identities + run: | + bash -x setup-identities.sh + working-directory: cli/jobs/spark - name: run job run: | source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; diff --git a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-managed-identity.yml b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-managed-identity.yml index b76e136ba4..19f149b543 100644 --- a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-managed-identity.yml +++ b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-managed-identity.yml @@ -47,6 +47,10 @@ jobs: run: | bash -x upload-data-to-blob.sh jobs/spark/ working-directory: cli + -name: setup identities + run: | + bash -x setup-identities.sh + working-directory: cli/jobs/spark - name: run job run: | source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; diff --git a/cli/jobs/spark/setup-identities.sh b/cli/jobs/spark/setup-identities.sh new file mode 100644 index
0000000000..b128eb05f6 --- /dev/null +++ b/cli/jobs/spark/setup-identities.sh @@ -0,0 +1,18 @@ +# +SUBSCRIPTION_ID=$(az account show --query id -o tsv) +LOCATION=$(az ml workspace show --query location -o tsv) +RESOURCE_GROUP=$(az group show --query name -o tsv) +AML_WORKSPACE_NAME=$(az configure -l --query "[?name=='workspace'].value" -o tsv) +API_VERSION="2022-05-01" +TOKEN=$(az account get-access-token --query accessToken -o tsv) + +AML_USER_MANAGED_ID = "${RESOURCE_GROUP}-uai" +# + +# +az identity create --name $AML_USER_MANAGED_ID --resource-group $RESOURCE_GROUP --location $LOCATION +# + +# +az ml workspace update --subscription --resource-group --name --file user-assigned-identity.yml +# \ No newline at end of file diff --git a/cli/jobs/spark/user-assigned-identity.yml b/cli/jobs/spark/user-assigned-identity.yml index 6c21e7dc09..95b5b78646 100644 --- a/cli/jobs/spark/user-assigned-identity.yml +++ b/cli/jobs/spark/user-assigned-identity.yml @@ -2,6 +2,5 @@ identity: type: "system_assigned,user_assigned" user_assigned_identities: - - resource_id: /subscriptions//providers/Microsoft.ManagedIdentity/userAssignedIdentities/ - tenant_id: 00x000xx-00x0-00xx-00xx-0x0xx000xx00 + '/subscriptions//resourceGroups//providers/Microsoft.ManagedIdentity/userAssignedIdentities/' : {} \ No newline at end of file From f3df676b743bb273a9c07b53901ff0ff7ad384cc Mon Sep 17 00:00:00 2001 From: Fred Li Date: Mon, 19 Jun 2023 11:29:07 -0700 Subject: [PATCH 19/40] Update property name in workflow --- ...-jobs-spark-serverless-spark-pipeline-managed-identity.yml | 4 +++- ...obs-spark-serverless-spark-standalone-managed-identity.yml | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-managed-identity.yml b/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-managed-identity.yml index 50e8c458d2..20e1214f5f 100644 --- a/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-managed-identity.yml +++ b/.github/workflows/cli-jobs-spark-serverless-spark-pipeline-managed-identity.yml @@ -47,10 +47,12 @@ jobs: run: | bash -x upload-data-to-blob.sh jobs/spark/ working-directory: cli - -name: setup identities + continue-on-error: true + - name: setup identities run: | bash -x setup-identities.sh working-directory: cli/jobs/spark + continue-on-error: true - name: run job run: | source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; diff --git a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-managed-identity.yml b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-managed-identity.yml index 19f149b543..a189d54e60 100644 --- a/.github/workflows/cli-jobs-spark-serverless-spark-standalone-managed-identity.yml +++ b/.github/workflows/cli-jobs-spark-serverless-spark-standalone-managed-identity.yml @@ -47,10 +47,12 @@ jobs: run: | bash -x upload-data-to-blob.sh jobs/spark/ working-directory: cli - -name: setup identities + continue-on-error: true + - name: setup identities run: | bash -x setup-identities.sh working-directory: cli/jobs/spark + continue-on-error: true - name: run job run: | source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; From ea7b996f3514b4a7d4ee50884addca9ba2f882a4 Mon Sep 17 00:00:00 2001 From: Fred Li Date: Mon, 19 Jun 2023 11:46:29 -0700 Subject: [PATCH 20/40] Update script parameters --- cli/jobs/spark/setup-identities.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cli/jobs/spark/setup-identities.sh b/cli/jobs/spark/setup-identities.sh 
index b128eb05f6..c0fa8411cf 100644 --- a/cli/jobs/spark/setup-identities.sh +++ b/cli/jobs/spark/setup-identities.sh @@ -14,5 +14,5 @@ az identity create --name $AML_USER_MANAGED_ID --resource-group $RESOURCE_GROUP # # -az ml workspace update --subscription --resource-group --name --file user-assigned-identity.yml +az ml workspace update --subscription $SUBSCRIPTION_ID --resource-group $RESOURCE_GROUP --name $AML_WORKSPACE_NAME --file user-assigned-identity.yml # \ No newline at end of file From 742509777e5552e7a5ec7983231c9fa6049399e3 Mon Sep 17 00:00:00 2001 From: Fred Li Date: Mon, 19 Jun 2023 12:42:10 -0700 Subject: [PATCH 21/40] Update assign uai script --- cli/jobs/spark/setup-identities.sh | 7 ++++++- cli/jobs/spark/user-assigned-identity.yml | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/cli/jobs/spark/setup-identities.sh b/cli/jobs/spark/setup-identities.sh index c0fa8411cf..639e60bbfa 100644 --- a/cli/jobs/spark/setup-identities.sh +++ b/cli/jobs/spark/setup-identities.sh @@ -7,12 +7,17 @@ API_VERSION="2022-05-01" TOKEN=$(az account get-access-token --query accessToken -o tsv) AML_USER_MANAGED_ID = "${RESOURCE_GROUP}-uai" +AML_RESOURCE_ID = "/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.ManagedIdentity/userAssignedIdentities/$AML_USER_MANAGED_ID" # # az identity create --name $AML_USER_MANAGED_ID --resource-group $RESOURCE_GROUP --location $LOCATION # +TEMP_UAI_FILE = "temp-user-assigned-identity.yml" +cp user-assigned-identity.yml $TEMP_UAI_FILE +sed -i "s/{{AML_RESOURCE_ID}}/$AML_RESOURCE_ID/g;" $TEMP_UAI_FILE + # -az ml workspace update --subscription $SUBSCRIPTION_ID --resource-group $RESOURCE_GROUP --name $AML_WORKSPACE_NAME --file user-assigned-identity.yml +az ml workspace update --subscription $SUBSCRIPTION_ID --resource-group $RESOURCE_GROUP --name $AML_WORKSPACE_NAME --file $TEMP_UAI_FILE # \ No newline at end of file diff --git a/cli/jobs/spark/user-assigned-identity.yml b/cli/jobs/spark/user-assigned-identity.yml index 95b5b78646..534ba7f53c 100644 --- a/cli/jobs/spark/user-assigned-identity.yml +++ b/cli/jobs/spark/user-assigned-identity.yml @@ -2,5 +2,5 @@ identity: type: "system_assigned,user_assigned" user_assigned_identities: - '/subscriptions//resourceGroups//providers/Microsoft.ManagedIdentity/userAssignedIdentities/' : {} + "{{AML_RESOURCE_ID}}" : {} \ No newline at end of file From 75e217f6a1d65542adb6fd31e0640d39c37c8e9e Mon Sep 17 00:00:00 2001 From: Fred Li Date: Mon, 19 Jun 2023 13:34:13 -0700 Subject: [PATCH 22/40] Format the script --- cli/jobs/spark/setup-identities.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cli/jobs/spark/setup-identities.sh b/cli/jobs/spark/setup-identities.sh index 639e60bbfa..dfd1a810b9 100644 --- a/cli/jobs/spark/setup-identities.sh +++ b/cli/jobs/spark/setup-identities.sh @@ -6,15 +6,15 @@ AML_WORKSPACE_NAME=$(az configure -l --query "[?name=='workspace'].value" -o tsv API_VERSION="2022-05-01" TOKEN=$(az account get-access-token --query accessToken -o tsv) -AML_USER_MANAGED_ID = "${RESOURCE_GROUP}-uai" -AML_RESOURCE_ID = "/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.ManagedIdentity/userAssignedIdentities/$AML_USER_MANAGED_ID" +AML_USER_MANAGED_ID=${RESOURCE_GROUP}-uai +AML_RESOURCE_ID="/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.ManagedIdentity/userAssignedIdentities/$AML_USER_MANAGED_ID" # # az identity create --name $AML_USER_MANAGED_ID --resource-group 
$RESOURCE_GROUP --location $LOCATION # -TEMP_UAI_FILE = "temp-user-assigned-identity.yml" +TEMP_UAI_FILE="temp-user-assigned-identity.yml" cp user-assigned-identity.yml $TEMP_UAI_FILE sed -i "s/{{AML_RESOURCE_ID}}/$AML_RESOURCE_ID/g;" $TEMP_UAI_FILE From 7e9cd2accdc3f18b70c473ccc3fe468c1e8b8ebe Mon Sep 17 00:00:00 2001 From: Fred Li Date: Mon, 19 Jun 2023 15:12:05 -0700 Subject: [PATCH 23/40] Update setup identities script --- cli/jobs/spark/setup-identities.sh | 5 +++-- cli/jobs/spark/user-assigned-identity.yml | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cli/jobs/spark/setup-identities.sh b/cli/jobs/spark/setup-identities.sh index dfd1a810b9..782ee671e7 100644 --- a/cli/jobs/spark/setup-identities.sh +++ b/cli/jobs/spark/setup-identities.sh @@ -7,7 +7,6 @@ API_VERSION="2022-05-01" TOKEN=$(az account get-access-token --query accessToken -o tsv) AML_USER_MANAGED_ID=${RESOURCE_GROUP}-uai -AML_RESOURCE_ID="/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.ManagedIdentity/userAssignedIdentities/$AML_USER_MANAGED_ID" # # @@ -16,7 +15,9 @@ az identity create --name $AML_USER_MANAGED_ID --resource-group $RESOURCE_GROUP TEMP_UAI_FILE="temp-user-assigned-identity.yml" cp user-assigned-identity.yml $TEMP_UAI_FILE -sed -i "s/{{AML_RESOURCE_ID}}/$AML_RESOURCE_ID/g;" $TEMP_UAI_FILE +sed -i "s/<SUBSCRIPTION_ID>/$SUBSCRIPTION_ID/g; + s/<RESOURCE_GROUP>/$RESOURCE_GROUP/g; + s/<AML_USER_MANAGED_ID>/$AML_USER_MANAGED_ID/g;" $TEMP_UAI_FILE # az ml workspace update --subscription $SUBSCRIPTION_ID --resource-group $RESOURCE_GROUP --name $AML_WORKSPACE_NAME --file $TEMP_UAI_FILE diff --git a/cli/jobs/spark/user-assigned-identity.yml b/cli/jobs/spark/user-assigned-identity.yml index 534ba7f53c..02b41187d5 100644 --- a/cli/jobs/spark/user-assigned-identity.yml +++ b/cli/jobs/spark/user-assigned-identity.yml @@ -2,5 +2,5 @@ identity: type: "system_assigned,user_assigned" user_assigned_identities: - "{{AML_RESOURCE_ID}}" : {} + "/subscriptions/<SUBSCRIPTION_ID>/resourceGroups/<RESOURCE_GROUP>/providers/Microsoft.ManagedIdentity/userAssignedIdentities/<AML_USER_MANAGED_ID>" : {} \ No newline at end of file From b9c11ad7d69315d07511d2155dff53bd24af8fff Mon Sep 17 00:00:00 2001 From: Fred Li Date: Mon, 19 Jun 2023 16:59:23 -0700 Subject: [PATCH 24/40] Update path to infra bootstrapping --- .../cli-resources-compute-attached-spark-system-identity.yml | 2 +- .../cli-resources-compute-attached-spark-user-identity.yml | 2 +- .github/workflows/cli-resources-compute-attached-spark.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cli-resources-compute-attached-spark-system-identity.yml b/.github/workflows/cli-resources-compute-attached-spark-system-identity.yml index bd6e9373d1..621aa36fae 100644 --- a/.github/workflows/cli-resources-compute-attached-spark-system-identity.yml +++ b/.github/workflows/cli-resources-compute-attached-spark-system-identity.yml @@ -31,7 +31,7 @@ jobs: creds: ${{secrets.AZUREML_CREDENTIALS}} - name: bootstrap resources run: | - bash bootstrap.sh + bash bootstrapping/bootstrap.sh working-directory: infra continue-on-error: false - name: setup-cli diff --git a/.github/workflows/cli-resources-compute-attached-spark-user-identity.yml b/.github/workflows/cli-resources-compute-attached-spark-user-identity.yml index 1b90b1c364..e7dc97cb89 100644 --- a/.github/workflows/cli-resources-compute-attached-spark-user-identity.yml +++ b/.github/workflows/cli-resources-compute-attached-spark-user-identity.yml @@ -31,7 +31,7 @@ jobs: creds: ${{secrets.AZUREML_CREDENTIALS}} - name: bootstrap resources run: | - bash
bootstrap.sh + bash bootstrapping/bootstrap.sh working-directory: infra continue-on-error: false - name: setup-cli diff --git a/.github/workflows/cli-resources-compute-attached-spark.yml b/.github/workflows/cli-resources-compute-attached-spark.yml index f169dbbd58..4daf48c156 100644 --- a/.github/workflows/cli-resources-compute-attached-spark.yml +++ b/.github/workflows/cli-resources-compute-attached-spark.yml @@ -31,7 +31,7 @@ jobs: creds: ${{secrets.AZUREML_CREDENTIALS}} - name: bootstrap resources run: | - bash bootstrap.sh + bash bootstrapping/bootstrap.sh working-directory: infra continue-on-error: false - name: setup-cli From 1b3f84888bfcf1bede08863d6c465d518a950e75 Mon Sep 17 00:00:00 2001 From: Fred Li Date: Tue, 20 Jun 2023 12:28:38 -0700 Subject: [PATCH 25/40] Enable automation test for attached spark job --- ...tached-spark-pipeline-default-identity.yml | 61 +++++++++++++++++ ...tached-spark-pipeline-managed-identity.yml | 66 +++++++++++++++++++ ...-attached-spark-pipeline-user-identity.yml | 61 +++++++++++++++++ ...ched-spark-standalone-default-identity.yml | 61 +++++++++++++++++ ...ched-spark-standalone-managed-identity.yml | 66 +++++++++++++++++++ ...ttached-spark-standalone-user-identity.yml | 61 +++++++++++++++++ ...ched-spark-standalone-default-identity.yml | 2 +- cli/jobs/spark/setup-attached-resources.sh | 50 ++++++++++++++ .../attached-spark-system-identity.yml | 4 +- .../compute/attached-spark-user-identity.yml | 6 +- cli/resources/compute/attached-spark.yml | 4 +- 11 files changed, 434 insertions(+), 8 deletions(-) create mode 100644 .github/workflows/cli-jobs-spark-attached-spark-pipeline-default-identity.yml create mode 100644 .github/workflows/cli-jobs-spark-attached-spark-pipeline-managed-identity.yml create mode 100644 .github/workflows/cli-jobs-spark-attached-spark-pipeline-user-identity.yml create mode 100644 .github/workflows/cli-jobs-spark-attached-spark-standalone-default-identity.yml create mode 100644 .github/workflows/cli-jobs-spark-attached-spark-standalone-managed-identity.yml create mode 100644 .github/workflows/cli-jobs-spark-attached-spark-standalone-user-identity.yml create mode 100644 cli/jobs/spark/setup-attached-resources.sh diff --git a/.github/workflows/cli-jobs-spark-attached-spark-pipeline-default-identity.yml b/.github/workflows/cli-jobs-spark-attached-spark-pipeline-default-identity.yml new file mode 100644 index 0000000000..c1f57d2aea --- /dev/null +++ b/.github/workflows/cli-jobs-spark-attached-spark-pipeline-default-identity.yml @@ -0,0 +1,61 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. 
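The placeholder substitution in setup-identities.sh above (and again in setup-attached-resources.sh below) follows one pattern: copy the checked-in YAML template, then sed each angle-bracket token into a live value. A minimal standalone sketch, assuming the template carries literal <SUBSCRIPTION_ID> and <RESOURCE_GROUP> tokens:

# sketch: render a YAML template by replacing <PLACEHOLDER> tokens with real values
SUBSCRIPTION_ID=$(az account show --query id -o tsv)
RESOURCE_GROUP=$(az group show --query name -o tsv)
cp user-assigned-identity.yml temp-user-assigned-identity.yml
sed -i "s/<SUBSCRIPTION_ID>/$SUBSCRIPTION_ID/g;
        s/<RESOURCE_GROUP>/$RESOURCE_GROUP/g;" temp-user-assigned-identity.yml

Note that `/` works as the sed delimiter here only because the substituted values contain no slashes; substituting a full ARM resource ID would need another delimiter, e.g. s|<ID>|$ID|g.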
+ +name: cli-jobs-spark-attached-spark-pipeline-default-identity +on: + workflow_dispatch: + schedule: + - cron: "30 9/12 * * *" + pull_request: + branches: + - main + paths: + - cli/jobs/spark/** + - infra/bootstrapping/** + - .github/workflows/cli-jobs-spark-attached-spark-pipeline-default-identity.yml + - cli/jobs/spark/data/titanic.csv + - cli/setup.sh +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: check out repo + uses: actions/checkout@v2 + - name: azure login + uses: azure/login@v1 + with: + creds: ${{secrets.AZUREML_CREDENTIALS}} + - name: bootstrap resources + run: | + echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}'; + bash bootstrap.sh + working-directory: infra/bootstrapping + continue-on-error: false + - name: setup-cli + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash setup.sh + working-directory: cli + continue-on-error: true + - name: upload data + run: | + bash -x upload-data-to-blob.sh jobs/spark/ + working-directory: cli + continue-on-error: true + - name: setup attached spark + working-directory: cli/jobs/spark + continue-on-error: true + run: | + bash -x setup-attached-resources.sh attached-spark.yml + - name: run job + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash -x ../../run-job.sh attached-spark-pipeline-default-identity.yml + working-directory: cli/jobs/spark diff --git a/.github/workflows/cli-jobs-spark-attached-spark-pipeline-managed-identity.yml b/.github/workflows/cli-jobs-spark-attached-spark-pipeline-managed-identity.yml new file mode 100644 index 0000000000..4baf84b449 --- /dev/null +++ b/.github/workflows/cli-jobs-spark-attached-spark-pipeline-managed-identity.yml @@ -0,0 +1,66 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. 
+ +name: cli-jobs-spark-attached-spark-pipeline-managed-identity +on: + workflow_dispatch: + schedule: + - cron: "43 7/12 * * *" + pull_request: + branches: + - main + paths: + - cli/jobs/spark/** + - infra/bootstrapping/** + - .github/workflows/cli-jobs-spark-attached-spark-pipeline-managed-identity.yml + - cli/jobs/spark/data/titanic.csv + - cli/setup.sh +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: check out repo + uses: actions/checkout@v2 + - name: azure login + uses: azure/login@v1 + with: + creds: ${{secrets.AZUREML_CREDENTIALS}} + - name: bootstrap resources + run: | + echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}'; + bash bootstrap.sh + working-directory: infra/bootstrapping + continue-on-error: false + - name: setup-cli + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash setup.sh + working-directory: cli + continue-on-error: true + - name: upload data + run: | + bash -x upload-data-to-blob.sh jobs/spark/ + working-directory: cli + continue-on-error: true + - name: setup identities + run: | + bash -x setup-identities.sh + working-directory: cli/jobs/spark + continue-on-error: true + - name: setup attached spark + working-directory: cli/jobs/spark + continue-on-error: true + run: | + bash -x setup-attached-resources.sh attached-spark-system-identity.yml + - name: run job + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash -x ../../run-job.sh attached-spark-pipeline-managed-identity.yml + working-directory: cli/jobs/spark diff --git a/.github/workflows/cli-jobs-spark-attached-spark-pipeline-user-identity.yml b/.github/workflows/cli-jobs-spark-attached-spark-pipeline-user-identity.yml new file mode 100644 index 0000000000..b8652c02a9 --- /dev/null +++ b/.github/workflows/cli-jobs-spark-attached-spark-pipeline-user-identity.yml @@ -0,0 +1,61 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. 
+ +name: cli-jobs-spark-attached-spark-pipeline-user-identity +on: + workflow_dispatch: + schedule: + - cron: "15 4/12 * * *" + pull_request: + branches: + - main + paths: + - cli/jobs/spark/** + - infra/bootstrapping/** + - .github/workflows/cli-jobs-spark-attached-spark-pipeline-user-identity.yml + - cli/jobs/spark/data/titanic.csv + - cli/setup.sh +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: check out repo + uses: actions/checkout@v2 + - name: azure login + uses: azure/login@v1 + with: + creds: ${{secrets.AZUREML_CREDENTIALS}} + - name: bootstrap resources + run: | + echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}'; + bash bootstrap.sh + working-directory: infra/bootstrapping + continue-on-error: false + - name: setup-cli + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash setup.sh + working-directory: cli + continue-on-error: true + - name: upload data + run: | + bash -x upload-data-to-blob.sh jobs/spark/ + working-directory: cli + continue-on-error: true + - name: setup attached spark + working-directory: cli/jobs/spark + continue-on-error: true + run: | + bash -x setup-attached-resources.sh attached-spark-user-identity.yml + - name: run job + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash -x ../../run-job.sh attached-spark-pipeline-user-identity.yml + working-directory: cli/jobs/spark diff --git a/.github/workflows/cli-jobs-spark-attached-spark-standalone-default-identity.yml b/.github/workflows/cli-jobs-spark-attached-spark-standalone-default-identity.yml new file mode 100644 index 0000000000..7fa5759a6c --- /dev/null +++ b/.github/workflows/cli-jobs-spark-attached-spark-standalone-default-identity.yml @@ -0,0 +1,61 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. 
+ +name: cli-jobs-spark-attached-spark-standalone-default-identity +on: + workflow_dispatch: + schedule: + - cron: "15 0/12 * * *" + pull_request: + branches: + - main + paths: + - cli/jobs/spark/** + - infra/bootstrapping/** + - .github/workflows/cli-jobs-spark-attached-spark-standalone-default-identity.yml + - cli/jobs/spark/data/titanic.csv + - cli/setup.sh +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: check out repo + uses: actions/checkout@v2 + - name: azure login + uses: azure/login@v1 + with: + creds: ${{secrets.AZUREML_CREDENTIALS}} + - name: bootstrap resources + run: | + echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}'; + bash bootstrap.sh + working-directory: infra/bootstrapping + continue-on-error: false + - name: setup-cli + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash setup.sh + working-directory: cli + continue-on-error: true + - name: upload data + run: | + bash -x upload-data-to-blob.sh jobs/spark/ + working-directory: cli + continue-on-error: true + - name: setup attached spark + working-directory: cli/jobs/spark + continue-on-error: true + run: | + bash -x setup-attached-resources.sh attached-spark.yml + - name: run job + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash -x ../../run-job.sh attached-spark-standalone-default-identity.yml + working-directory: cli/jobs/spark diff --git a/.github/workflows/cli-jobs-spark-attached-spark-standalone-managed-identity.yml b/.github/workflows/cli-jobs-spark-attached-spark-standalone-managed-identity.yml new file mode 100644 index 0000000000..8b6b215b53 --- /dev/null +++ b/.github/workflows/cli-jobs-spark-attached-spark-standalone-managed-identity.yml @@ -0,0 +1,66 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. 
+ +name: cli-jobs-spark-attached-spark-standalone-managed-identity +on: + workflow_dispatch: + schedule: + - cron: "16 1/12 * * *" + pull_request: + branches: + - main + paths: + - cli/jobs/spark/** + - infra/bootstrapping/** + - .github/workflows/cli-jobs-spark-attached-spark-standalone-managed-identity.yml + - cli/jobs/spark/data/titanic.csv + - cli/setup.sh +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: check out repo + uses: actions/checkout@v2 + - name: azure login + uses: azure/login@v1 + with: + creds: ${{secrets.AZUREML_CREDENTIALS}} + - name: bootstrap resources + run: | + echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}'; + bash bootstrap.sh + working-directory: infra/bootstrapping + continue-on-error: false + - name: setup-cli + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash setup.sh + working-directory: cli + continue-on-error: true + - name: upload data + run: | + bash -x upload-data-to-blob.sh jobs/spark/ + working-directory: cli + continue-on-error: true + - name: setup identities + run: | + bash -x setup-identities.sh + working-directory: cli/jobs/spark + continue-on-error: true + - name: setup attached spark + working-directory: cli/jobs/spark + continue-on-error: true + run: | + bash -x setup-attached-resources.sh attached-spark-system-identity.yml + - name: run job + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash -x ../../run-job.sh attached-spark-standalone-managed-identity.yml + working-directory: cli/jobs/spark diff --git a/.github/workflows/cli-jobs-spark-attached-spark-standalone-user-identity.yml b/.github/workflows/cli-jobs-spark-attached-spark-standalone-user-identity.yml new file mode 100644 index 0000000000..e113634dc8 --- /dev/null +++ b/.github/workflows/cli-jobs-spark-attached-spark-standalone-user-identity.yml @@ -0,0 +1,61 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. 
+ +name: cli-jobs-spark-attached-spark-standalone-user-identity +on: + workflow_dispatch: + schedule: + - cron: "7 1/12 * * *" + pull_request: + branches: + - main + paths: + - cli/jobs/spark/** + - infra/bootstrapping/** + - .github/workflows/cli-jobs-spark-attached-spark-standalone-user-identity.yml + - cli/jobs/spark/data/titanic.csv + - cli/setup.sh +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: check out repo + uses: actions/checkout@v2 + - name: azure login + uses: azure/login@v1 + with: + creds: ${{secrets.AZUREML_CREDENTIALS}} + - name: bootstrap resources + run: | + echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}'; + bash bootstrap.sh + working-directory: infra/bootstrapping + continue-on-error: false + - name: setup-cli + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash setup.sh + working-directory: cli + continue-on-error: true + - name: upload data + run: | + bash -x upload-data-to-blob.sh jobs/spark/ + working-directory: cli + continue-on-error: true + - name: setup attached spark + working-directory: cli/jobs/spark + continue-on-error: true + run: | + bash -x setup-attached-resources.sh attached-spark-user-identity.yml + - name: run job + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash -x ../../run-job.sh attached-spark-standalone-user-identity.yml + working-directory: cli/jobs/spark diff --git a/cli/jobs/spark/attached-spark-standalone-default-identity.yml b/cli/jobs/spark/attached-spark-standalone-default-identity.yml index f672bb447b..095803ef3e 100644 --- a/cli/jobs/spark/attached-spark-standalone-default-identity.yml +++ b/cli/jobs/spark/attached-spark-standalone-default-identity.yml @@ -29,4 +29,4 @@ args: >- --titanic_data ${{inputs.titanic_data}} --wrangled_data ${{outputs.wrangled_data}} -compute: yuachengcompute \ No newline at end of file +compute: mysparkcompute \ No newline at end of file diff --git a/cli/jobs/spark/setup-attached-resources.sh b/cli/jobs/spark/setup-attached-resources.sh new file mode 100644 index 0000000000..f5e8551f75 --- /dev/null +++ b/cli/jobs/spark/setup-attached-resources.sh @@ -0,0 +1,50 @@ +# +SUBSCRIPTION_ID=$(az account show --query id -o tsv) +LOCATION=$(az ml workspace show --query location -o tsv) +RESOURCE_GROUP=$(az group show --query name -o tsv) +AML_WORKSPACE_NAME=$(az configure -l --query "[?name=='workspace'].value" -o tsv) +API_VERSION="2022-05-01" +TOKEN=$(az account get-access-token --query accessToken -o tsv) +AML_USER_MANAGED_ID=${RESOURCE_GROUP}-uai +AML_USER_MANAGED_ID_OID=$(az identity show --resource-group $RESOURCE_GROUP -n $AML_USER_MANAGED_ID --query principalId -o tsv) +GEN2_STORAGE_NAME="gen2automationspark" +GEN2_FILE_SYSTEM="gen2filesystem" +SYNAPSE_WORKSPACE_NAME="automation-syws" +SQL_ADMIN_LOGIN_USER="automation" +SQL_ADMIN_LOGIN_PASSWORD="auto123!" 
+SPARK_POOL_NAME="automationpool" +# + +# +az storage account create --name $GEN2_STORAGE_NAME --resource-group $RESOURCE_GROUP --location $LOCATION --sku Standard_LRS --kind StorageV2 --enable-hierarchical-namespace true +az storage fs create -n $GEN2_FILE_SYSTEM --account-name $GEN2_STORAGE_NAME +az synapse workspace create --name $SYNAPSE_WORKSPACE_NAME --resource-group $RESOURCE_GROUP --storage-account $GEN2_STORAGE_NAME --file-system $GEN2_FILE_SYSTEM --sql-admin-login-user $SQL_ADMIN_LOGIN_USER --sql-admin-login-password $SQL_ADMIN_LOGIN_PASSWORD --location $LOCATION +az role assignment create --role "Storage Blob Data Owner" --assignee $AML_USER_MANAGED_ID_OID --scope /subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.Storage/storageAccounts/$GEN2_STORAGE_NAME/blobServices/default/containers/$GEN2_FILE_SYSTEM +az synapse spark pool create --name $SPARK_POOL_NAME --workspace-name $SYNAPSE_WORKSPACE_NAME --resource-group $RESOURCE_GROUP --spark-version 3.2 --node-count 3 --node-size Medium --min-node-count 3 --max-node-count 10 --enable-auto-scale true + +TEMP_UAI_FILE="temp-$1" +cp $1 $TEMP_UAI_FILE +sed -i "s//$SUBSCRIPTION_ID/g; + s//$RESOURCE_GROUP/g; + s//$SYNAPSE_WORKSPACE_NAME/g; + s//$SPARK_POOL_NAME/g; + s//$AML_USER_MANAGED_ID/g;" $TEMP_UAI_FILE + +az ml compute attach --file attached-spark.yml --subscription $SUBSCRIPTION_ID --resource-group $RESOURCE_GROUP --workspace-name $AML_WORKSPACE_NAME +# + + + + + + + +TEMP_UAI_FILE="temp-user-assigned-identity.yml" +cp user-assigned-identity.yml $TEMP_UAI_FILE +sed -i "s//$SUBSCRIPTION_ID/g; + s//$RESOURCE_GROUP/g; + s//$AML_USER_MANAGED_ID/g;" $TEMP_UAI_FILE + +# +az ml workspace update --subscription $SUBSCRIPTION_ID --resource-group $RESOURCE_GROUP --name $AML_WORKSPACE_NAME --file $TEMP_UAI_FILE +# \ No newline at end of file diff --git a/cli/resources/compute/attached-spark-system-identity.yml b/cli/resources/compute/attached-spark-system-identity.yml index 93442937bd..cacb1cebe8 100644 --- a/cli/resources/compute/attached-spark-system-identity.yml +++ b/cli/resources/compute/attached-spark-system-identity.yml @@ -1,9 +1,9 @@ # attached-spark-system-identity.yaml -name: my-spark-pool +name: mysparkcompute type: synapsespark -resource_id: /subscriptions//providers/Microsoft.Synapse/workspaces//bigDataPools/ +resource_id: /subscriptions//resourceGroups//providers/Microsoft.Synapse/workspaces//bigDataPools/ identity: type: system_assigned diff --git a/cli/resources/compute/attached-spark-user-identity.yml b/cli/resources/compute/attached-spark-user-identity.yml index ad1833a999..810f451ce8 100644 --- a/cli/resources/compute/attached-spark-user-identity.yml +++ b/cli/resources/compute/attached-spark-user-identity.yml @@ -1,5 +1,5 @@ -# attached-spark-user-identity.yaml -name: my-spark-pool +# attached-spark-user-identity.yml +name: mysparkcompute type: synapsespark @@ -8,4 +8,4 @@ resource_id: /subscriptions//pro identity: type: user_assigned user_assigned_identities: - - resource_id: /subscriptions//providers/Microsoft.ManagedIdentity/userAssignedIdentities/ + - resource_id: /subscriptions//resourceGroups//providers/Microsoft.ManagedIdentity/userAssignedIdentities/ diff --git a/cli/resources/compute/attached-spark.yml b/cli/resources/compute/attached-spark.yml index cb58f0cb25..6bcc0b8a47 100644 --- a/cli/resources/compute/attached-spark.yml +++ b/cli/resources/compute/attached-spark.yml @@ -1,6 +1,6 @@ # attached-spark.yaml -name: my-spark-pool +name: mysparkcompute type: synapsespark 
-resource_id: /subscriptions//providers/Microsoft.Synapse/workspaces//bigDataPools/ +resource_id: /subscriptions//resourceGroups//providers/Microsoft.Synapse/workspaces//bigDataPools/ From 319423ec559039d239b93165b8b97307d600a695 Mon Sep 17 00:00:00 2001 From: Fred Li Date: Tue, 20 Jun 2023 12:53:49 -0700 Subject: [PATCH 26/40] Update resource path --- ...tached-spark-pipeline-default-identity.yml | 4 +- ...tached-spark-pipeline-managed-identity.yml | 4 +- ...-attached-spark-pipeline-user-identity.yml | 4 +- ...ched-spark-standalone-default-identity.yml | 4 +- ...ched-spark-standalone-managed-identity.yml | 4 +- ...ttached-spark-standalone-user-identity.yml | 4 +- ...compute-attached-spark-system-identity.yml | 49 ------------ ...s-compute-attached-spark-user-identity.yml | 49 ------------ .../cli-resources-compute-attached-spark.yml | 49 ------------ ...obs-spark-submit_spark_standalone_jobs.yml | 75 ------------------- ...tached-spark-pipeline-default-identity.yml | 2 +- ...tached-spark-pipeline-managed-identity.yml | 2 +- .../attached-spark-pipeline-user-identity.yml | 2 +- 13 files changed, 15 insertions(+), 237 deletions(-) delete mode 100644 .github/workflows/cli-resources-compute-attached-spark-system-identity.yml delete mode 100644 .github/workflows/cli-resources-compute-attached-spark-user-identity.yml delete mode 100644 .github/workflows/cli-resources-compute-attached-spark.yml delete mode 100644 .github/workflows/sdk-jobs-spark-submit_spark_standalone_jobs.yml diff --git a/.github/workflows/cli-jobs-spark-attached-spark-pipeline-default-identity.yml b/.github/workflows/cli-jobs-spark-attached-spark-pipeline-default-identity.yml index c1f57d2aea..2b4acac2ce 100644 --- a/.github/workflows/cli-jobs-spark-attached-spark-pipeline-default-identity.yml +++ b/.github/workflows/cli-jobs-spark-attached-spark-pipeline-default-identity.yml @@ -49,10 +49,10 @@ jobs: working-directory: cli continue-on-error: true - name: setup attached spark - working-directory: cli/jobs/spark + working-directory: cli continue-on-error: true run: | - bash -x setup-attached-resources.sh attached-spark.yml + bash -x jobs/spark/setup-attached-resources.sh resources/compute/attached-spark.yml - name: run job run: | source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; diff --git a/.github/workflows/cli-jobs-spark-attached-spark-pipeline-managed-identity.yml b/.github/workflows/cli-jobs-spark-attached-spark-pipeline-managed-identity.yml index 4baf84b449..ea51c4839e 100644 --- a/.github/workflows/cli-jobs-spark-attached-spark-pipeline-managed-identity.yml +++ b/.github/workflows/cli-jobs-spark-attached-spark-pipeline-managed-identity.yml @@ -54,10 +54,10 @@ jobs: working-directory: cli/jobs/spark continue-on-error: true - name: setup attached spark - working-directory: cli/jobs/spark + working-directory: cli continue-on-error: true run: | - bash -x setup-attached-resources.sh attached-spark-system-identity.yml + bash -x jobs/spark/setup-attached-resources.sh resources/compute/attached-spark-system-identity.yml - name: run job run: | source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; diff --git a/.github/workflows/cli-jobs-spark-attached-spark-pipeline-user-identity.yml b/.github/workflows/cli-jobs-spark-attached-spark-pipeline-user-identity.yml index b8652c02a9..94e5cd154d 100644 --- a/.github/workflows/cli-jobs-spark-attached-spark-pipeline-user-identity.yml +++ b/.github/workflows/cli-jobs-spark-attached-spark-pipeline-user-identity.yml @@ -49,10 +49,10 @@ jobs: working-directory: 
cli continue-on-error: true - name: setup attached spark - working-directory: cli/jobs/spark + working-directory: cli continue-on-error: true run: | - bash -x setup-attached-resources.sh attached-spark-user-identity.yml + bash -x jobs/spark/setup-attached-resources.sh resources/compute/attached-spark-user-identity.yml - name: run job run: | source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; diff --git a/.github/workflows/cli-jobs-spark-attached-spark-standalone-default-identity.yml b/.github/workflows/cli-jobs-spark-attached-spark-standalone-default-identity.yml index 7fa5759a6c..645b9070dd 100644 --- a/.github/workflows/cli-jobs-spark-attached-spark-standalone-default-identity.yml +++ b/.github/workflows/cli-jobs-spark-attached-spark-standalone-default-identity.yml @@ -49,10 +49,10 @@ jobs: working-directory: cli continue-on-error: true - name: setup attached spark - working-directory: cli/jobs/spark + working-directory: cli continue-on-error: true run: | - bash -x setup-attached-resources.sh attached-spark.yml + bash -x jobs/spark/setup-attached-resources.sh resources/compute/attached-spark.yml - name: run job run: | source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; diff --git a/.github/workflows/cli-jobs-spark-attached-spark-standalone-managed-identity.yml b/.github/workflows/cli-jobs-spark-attached-spark-standalone-managed-identity.yml index 8b6b215b53..50fe2aae78 100644 --- a/.github/workflows/cli-jobs-spark-attached-spark-standalone-managed-identity.yml +++ b/.github/workflows/cli-jobs-spark-attached-spark-standalone-managed-identity.yml @@ -54,10 +54,10 @@ jobs: working-directory: cli/jobs/spark continue-on-error: true - name: setup attached spark - working-directory: cli/jobs/spark + working-directory: cli continue-on-error: true run: | - bash -x setup-attached-resources.sh attached-spark-system-identity.yml + bash -x jobs/spark/setup-attached-resources.sh resources/compute/attached-spark-system-identity.yml - name: run job run: | source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; diff --git a/.github/workflows/cli-jobs-spark-attached-spark-standalone-user-identity.yml b/.github/workflows/cli-jobs-spark-attached-spark-standalone-user-identity.yml index e113634dc8..bcd60d923d 100644 --- a/.github/workflows/cli-jobs-spark-attached-spark-standalone-user-identity.yml +++ b/.github/workflows/cli-jobs-spark-attached-spark-standalone-user-identity.yml @@ -49,10 +49,10 @@ jobs: working-directory: cli continue-on-error: true - name: setup attached spark - working-directory: cli/jobs/spark + working-directory: cli continue-on-error: true run: | - bash -x setup-attached-resources.sh attached-spark-user-identity.yml + bash -x jobs/spark/setup-attached-resources.sh resources/compute/attached-spark-user-identity.yml - name: run job run: | source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; diff --git a/.github/workflows/cli-resources-compute-attached-spark-system-identity.yml b/.github/workflows/cli-resources-compute-attached-spark-system-identity.yml deleted file mode 100644 index 621aa36fae..0000000000 --- a/.github/workflows/cli-resources-compute-attached-spark-system-identity.yml +++ /dev/null @@ -1,49 +0,0 @@ -# This code is autogenerated. -# Code is generated by running custom script: python3 readme.py -# Any manual changes to this file may cause incorrect behavior. -# Any manual changes will be overwritten if the code is regenerated. 
- -name: cli-resources-compute-attached-spark-system-identity -on: - workflow_dispatch: - schedule: - - cron: "20 3/12 * * *" - pull_request: - branches: - - main - paths: - - cli/resources/compute/attached-spark-system-identity.yml - - infra/bootstrapping/** - - .github/workflows/cli-resources-compute-attached-spark-system-identity.yml - - cli/setup.sh -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true -jobs: - build: - runs-on: ubuntu-latest - steps: - - name: check out repo - uses: actions/checkout@v2 - - name: azure login - uses: azure/login@v1 - with: - creds: ${{secrets.AZUREML_CREDENTIALS}} - - name: bootstrap resources - run: | - bash bootstrapping/bootstrap.sh - working-directory: infra - continue-on-error: false - - name: setup-cli - run: | - source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; - source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; - bash setup.sh - working-directory: cli - continue-on-error: true - - name: create asset - run: | - source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; - source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; - az ml compute create -f resources/compute/attached-spark-system-identity.yml - working-directory: cli diff --git a/.github/workflows/cli-resources-compute-attached-spark-user-identity.yml b/.github/workflows/cli-resources-compute-attached-spark-user-identity.yml deleted file mode 100644 index e7dc97cb89..0000000000 --- a/.github/workflows/cli-resources-compute-attached-spark-user-identity.yml +++ /dev/null @@ -1,49 +0,0 @@ -# This code is autogenerated. -# Code is generated by running custom script: python3 readme.py -# Any manual changes to this file may cause incorrect behavior. -# Any manual changes will be overwritten if the code is regenerated. - -name: cli-resources-compute-attached-spark-user-identity -on: - workflow_dispatch: - schedule: - - cron: "39 4/12 * * *" - pull_request: - branches: - - main - paths: - - cli/resources/compute/attached-spark-user-identity.yml - - infra/bootstrapping/** - - .github/workflows/cli-resources-compute-attached-spark-user-identity.yml - - cli/setup.sh -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true -jobs: - build: - runs-on: ubuntu-latest - steps: - - name: check out repo - uses: actions/checkout@v2 - - name: azure login - uses: azure/login@v1 - with: - creds: ${{secrets.AZUREML_CREDENTIALS}} - - name: bootstrap resources - run: | - bash bootstrapping/bootstrap.sh - working-directory: infra - continue-on-error: false - - name: setup-cli - run: | - source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; - source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; - bash setup.sh - working-directory: cli - continue-on-error: true - - name: create asset - run: | - source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; - source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; - az ml compute create -f resources/compute/attached-spark-user-identity.yml - working-directory: cli diff --git a/.github/workflows/cli-resources-compute-attached-spark.yml b/.github/workflows/cli-resources-compute-attached-spark.yml deleted file mode 100644 index 4daf48c156..0000000000 --- a/.github/workflows/cli-resources-compute-attached-spark.yml +++ /dev/null @@ -1,49 +0,0 @@ -# This code is autogenerated. 
-# Code is generated by running custom script: python3 readme.py -# Any manual changes to this file may cause incorrect behavior. -# Any manual changes will be overwritten if the code is regenerated. - -name: cli-resources-compute-attached-spark -on: - workflow_dispatch: - schedule: - - cron: "44 3/12 * * *" - pull_request: - branches: - - main - paths: - - cli/resources/compute/attached-spark.yml - - infra/bootstrapping/** - - .github/workflows/cli-resources-compute-attached-spark.yml - - cli/setup.sh -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true -jobs: - build: - runs-on: ubuntu-latest - steps: - - name: check out repo - uses: actions/checkout@v2 - - name: azure login - uses: azure/login@v1 - with: - creds: ${{secrets.AZUREML_CREDENTIALS}} - - name: bootstrap resources - run: | - bash bootstrapping/bootstrap.sh - working-directory: infra - continue-on-error: false - - name: setup-cli - run: | - source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; - source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; - bash setup.sh - working-directory: cli - continue-on-error: true - - name: create asset - run: | - source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; - source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; - az ml compute create -f resources/compute/attached-spark.yml - working-directory: cli diff --git a/.github/workflows/sdk-jobs-spark-submit_spark_standalone_jobs.yml b/.github/workflows/sdk-jobs-spark-submit_spark_standalone_jobs.yml deleted file mode 100644 index e0896b73d3..0000000000 --- a/.github/workflows/sdk-jobs-spark-submit_spark_standalone_jobs.yml +++ /dev/null @@ -1,75 +0,0 @@ -# This code is autogenerated. -# Code is generated by running custom script: python3 readme.py -# Any manual changes to this file may cause incorrect behavior. -# Any manual changes will be overwritten if the code is regenerated. - -name: sdk-jobs-spark-submit_spark_standalone_jobs -# This file is created by sdk/python/readme.py. -# Please do not edit directly. 
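The SDK workflow being deleted here drove its sample notebook with papermill; stripped of the bootstrap helpers, the core of that step is just the following, assuming dev-requirements.txt already pulled in papermill and ipykernel:

# sketch: run a notebook headlessly; a failing cell fails the command (and the CI step)
papermill -k python submit_spark_standalone_jobs.ipynb submit_spark_standalone_jobs.output.ipynb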
-on: - workflow_dispatch: - schedule: - - cron: "36 10/12 * * *" - pull_request: - branches: - - main - paths: - - sdk/python/jobs/spark/** - - .github/workflows/sdk-jobs-spark-submit_spark_standalone_jobs.yml - - sdk/python/dev-requirements.txt - - infra/bootstrapping/** - - sdk/python/setup.sh -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true -jobs: - build: - runs-on: ubuntu-latest - steps: - - name: check out repo - uses: actions/checkout@v2 - - name: setup python - uses: actions/setup-python@v2 - with: - python-version: "3.8" - - name: pip install notebook reqs - run: pip install -r sdk/python/dev-requirements.txt - - name: azure login - uses: azure/login@v1 - with: - creds: ${{secrets.AZUREML_CREDENTIALS}} - - name: bootstrap resources - run: | - echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}'; - bash bootstrap.sh - working-directory: infra/bootstrapping - continue-on-error: false - - name: setup SDK - run: | - source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; - source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; - bash setup.sh - working-directory: sdk/python - continue-on-error: true - - name: setup-cli - run: | - source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; - source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; - bash setup.sh - working-directory: cli - continue-on-error: true - - name: run jobs/spark/submit_spark_standalone_jobs.ipynb - run: | - source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; - source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; - bash "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh" generate_workspace_config "../../.azureml/config.json"; - bash "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh" replace_template_values "submit_spark_standalone_jobs.ipynb"; - [ -f "../../.azureml/config" ] && cat "../../.azureml/config"; - papermill -k python submit_spark_standalone_jobs.ipynb submit_spark_standalone_jobs.output.ipynb - working-directory: sdk/python/jobs/spark - - name: upload notebook's working folder as an artifact - if: ${{ always() }} - uses: actions/upload-artifact@v2 - with: - name: submit_spark_standalone_jobs - path: sdk/python/jobs/spark diff --git a/cli/jobs/spark/attached-spark-pipeline-default-identity.yml b/cli/jobs/spark/attached-spark-pipeline-default-identity.yml index 03a8d10c6d..c372588195 100644 --- a/cli/jobs/spark/attached-spark-pipeline-default-identity.yml +++ b/cli/jobs/spark/attached-spark-pipeline-default-identity.yml @@ -7,7 +7,7 @@ description: Spark component for Titanic data in Pipeline jobs: spark_job: type: spark - component: ./spark-job-component.yaml + component: ./spark-job-component.yml inputs: titanic_data: type: uri_file diff --git a/cli/jobs/spark/attached-spark-pipeline-managed-identity.yml b/cli/jobs/spark/attached-spark-pipeline-managed-identity.yml index 01958ec7f9..1e00a1321b 100644 --- a/cli/jobs/spark/attached-spark-pipeline-managed-identity.yml +++ b/cli/jobs/spark/attached-spark-pipeline-managed-identity.yml @@ -7,7 +7,7 @@ description: Spark component for Titanic data in Pipeline jobs: spark_job: type: spark - component: ./spark-job-component.yaml + component: ./spark-job-component.yml inputs: titanic_data: type: uri_file diff --git a/cli/jobs/spark/attached-spark-pipeline-user-identity.yml b/cli/jobs/spark/attached-spark-pipeline-user-identity.yml index 
673123b600..74d22dce2e 100644
--- a/cli/jobs/spark/attached-spark-pipeline-user-identity.yml
+++ b/cli/jobs/spark/attached-spark-pipeline-user-identity.yml
@@ -7,7 +7,7 @@ description: Spark component for Titanic data in Pipeline
 jobs:
   spark_job:
     type: spark
-    component: ./spark-job-component.yaml
+    component: ./spark-job-component.yml
     inputs:
       titanic_data:
         type: uri_file

From 0e0b4eca2cf21cf3e51883a11705fec8ff7a6390 Mon Sep 17 00:00:00 2001
From: Fred Li 
Date: Tue, 20 Jun 2023 13:15:41 -0700
Subject: [PATCH 27/40] Update setup attached resource script

---
 cli/jobs/spark/setup-attached-resources.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cli/jobs/spark/setup-attached-resources.sh b/cli/jobs/spark/setup-attached-resources.sh
index f5e8551f75..4fa2eebe04 100644
--- a/cli/jobs/spark/setup-attached-resources.sh
+++ b/cli/jobs/spark/setup-attached-resources.sh
@@ -22,15 +22,15 @@ az synapse workspace create --name $SYNAPSE_WORKSPACE_NAME --resource-group $RES
 az role assignment create --role "Storage Blob Data Owner" --assignee $AML_USER_MANAGED_ID_OID --scope /subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.Storage/storageAccounts/$GEN2_STORAGE_NAME/blobServices/default/containers/$GEN2_FILE_SYSTEM
 az synapse spark pool create --name $SPARK_POOL_NAME --workspace-name $SYNAPSE_WORKSPACE_NAME --resource-group $RESOURCE_GROUP --spark-version 3.2 --node-count 3 --node-size Medium --min-node-count 3 --max-node-count 10 --enable-auto-scale true
 
-TEMP_UAI_FILE="temp-$1"
+TEMP_COMPUTE_FILE="temp-compute-setup.yml"
 cp $1 $TEMP_UAI_FILE
 sed -i "s/<SUBSCRIPTION_ID>/$SUBSCRIPTION_ID/g;
     s/<RESOURCE_GROUP>/$RESOURCE_GROUP/g;
     s/<SYNAPSE_WORKSPACE_NAME>/$SYNAPSE_WORKSPACE_NAME/g;
     s/<SPARK_POOL_NAME>/$SPARK_POOL_NAME/g;
-    s/<AML_USER_MANAGED_ID>/$AML_USER_MANAGED_ID/g;" $TEMP_UAI_FILE
+    s/<AML_USER_MANAGED_ID>/$AML_USER_MANAGED_ID/g;" $TEMP_COMPUTE_FILE
 
-az ml compute attach --file attached-spark.yml --subscription $SUBSCRIPTION_ID --resource-group $RESOURCE_GROUP --workspace-name $AML_WORKSPACE_NAME
+az ml compute attach --file $TEMP_COMPUTE_FILE --subscription $SUBSCRIPTION_ID --resource-group $RESOURCE_GROUP --workspace-name $AML_WORKSPACE_NAME
 
 # 

From ddad2f63ea91434cf3d909eb8390212d7488bf29 Mon Sep 17 00:00:00 2001
From: Fred Li 
Date: Tue, 20 Jun 2023 13:23:12 -0700
Subject: [PATCH 28/40] Update script of setup resources

---
 cli/jobs/spark/setup-attached-resources.sh | 20 ++------------------
 1 file changed, 2 insertions(+), 18 deletions(-)

diff --git a/cli/jobs/spark/setup-attached-resources.sh b/cli/jobs/spark/setup-attached-resources.sh
index 4fa2eebe04..131ff7ab12 100644
--- a/cli/jobs/spark/setup-attached-resources.sh
+++ b/cli/jobs/spark/setup-attached-resources.sh
@@ -23,7 +23,7 @@ az role assignment create --role "Storage Blob Data Owner" --assignee $AML_USER_
 az synapse spark pool create --name $SPARK_POOL_NAME --workspace-name $SYNAPSE_WORKSPACE_NAME --resource-group $RESOURCE_GROUP --spark-version 3.2 --node-count 3 --node-size Medium --min-node-count 3 --max-node-count 10 --enable-auto-scale true
 
 TEMP_COMPUTE_FILE="temp-compute-setup.yml"
-cp $1 $TEMP_UAI_FILE
+cp $1 $TEMP_COMPUTE_FILE
 sed -i "s/<SUBSCRIPTION_ID>/$SUBSCRIPTION_ID/g;
     s/<RESOURCE_GROUP>/$RESOURCE_GROUP/g;
     s/<SYNAPSE_WORKSPACE_NAME>/$SYNAPSE_WORKSPACE_NAME/g;
@@ -31,20 +31,4 @@ sed -i "s/<SUBSCRIPTION_ID>/$SUBSCRIPTION_ID/g;
     s/<AML_USER_MANAGED_ID>/$AML_USER_MANAGED_ID/g;" $TEMP_COMPUTE_FILE
 
 az ml compute attach --file $TEMP_COMPUTE_FILE --subscription $SUBSCRIPTION_ID --resource-group $RESOURCE_GROUP --workspace-name $AML_WORKSPACE_NAME
-# 
-
-
-
-
-
-
-
-TEMP_UAI_FILE="temp-user-assigned-identity.yml"
-cp user-assigned-identity.yml $TEMP_UAI_FILE
-sed -i "s/<SUBSCRIPTION_ID>/$SUBSCRIPTION_ID/g;
-    s/<RESOURCE_GROUP>/$RESOURCE_GROUP/g;
-    s/<AML_USER_MANAGED_ID>/$AML_USER_MANAGED_ID/g;" $TEMP_UAI_FILE
-
-# 
-az ml workspace update --subscription $SUBSCRIPTION_ID --resource-group $RESOURCE_GROUP --name $AML_WORKSPACE_NAME --file $TEMP_UAI_FILE
-# 
\ No newline at end of file
+# 
\ No newline at end of file

From 7ee07f751e210fddf031fa00c96f65beb1a070d0 Mon Sep 17 00:00:00 2001
From: Fred Li 
Date: Tue, 20 Jun 2023 22:05:06 -0700
Subject: [PATCH 29/40] Update setup attached resource script2

---
 cli/jobs/spark/setup-attached-resources.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cli/jobs/spark/setup-attached-resources.sh b/cli/jobs/spark/setup-attached-resources.sh
index 131ff7ab12..aec02e4e0a 100644
--- a/cli/jobs/spark/setup-attached-resources.sh
+++ b/cli/jobs/spark/setup-attached-resources.sh
@@ -21,6 +21,7 @@ az storage fs create -n $GEN2_FILE_SYSTEM --account-name $GEN2_STORAGE_NAME
 az synapse workspace create --name $SYNAPSE_WORKSPACE_NAME --resource-group $RESOURCE_GROUP --storage-account $GEN2_STORAGE_NAME --file-system $GEN2_FILE_SYSTEM --sql-admin-login-user $SQL_ADMIN_LOGIN_USER --sql-admin-login-password $SQL_ADMIN_LOGIN_PASSWORD --location $LOCATION
 az role assignment create --role "Storage Blob Data Owner" --assignee $AML_USER_MANAGED_ID_OID --scope /subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.Storage/storageAccounts/$GEN2_STORAGE_NAME/blobServices/default/containers/$GEN2_FILE_SYSTEM
 az synapse spark pool create --name $SPARK_POOL_NAME --workspace-name $SYNAPSE_WORKSPACE_NAME --resource-group $RESOURCE_GROUP --spark-version 3.2 --node-count 3 --node-size Medium --min-node-count 3 --max-node-count 10 --enable-auto-scale true
+az synapse workspace firewall-rule create --name allowAll --workspace-name $SYNAPSE_WORKSPACE_NAME --resource-group $RESOURCE_GROUP --start-ip-address 0.0.0.0 --end-ip-address 255.255.255.255

From 5bce143d576d5e52bd9beccdbee8a5788269591b Mon Sep 17 00:00:00 2001
From: Fred Li 
Date: Wed, 21 Jun 2023 12:25:35 -0700
Subject: [PATCH 30/40] Add logic to assign identity role

---
 cli/jobs/spark/setup-attached-resources.sh | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/cli/jobs/spark/setup-attached-resources.sh b/cli/jobs/spark/setup-attached-resources.sh
index aec02e4e0a..1e6e4574c6 100644
--- a/cli/jobs/spark/setup-attached-resources.sh
+++ b/cli/jobs/spark/setup-attached-resources.sh
@@ -13,6 +13,8 @@ SYNAPSE_WORKSPACE_NAME="automation-syws"
 SQL_ADMIN_LOGIN_USER="automation"
 SQL_ADMIN_LOGIN_PASSWORD="auto123!"
 SPARK_POOL_NAME="automationpool"
+SPARK_POOL_ADMIN_ROLE_ID="6e4bf58a-b8e1-4cc3-bbf9-d73143322b78"
+COMPUTE_MANAGED_IDENTITY=$(az ml compute show --name mysparkcompute --resource-group $RESOURCE_GROUP --workspace-name $AML_WORKSPACE_NAME --query [*].identity.principal_id --out tsv)
 
 # 
 # 
@@ -23,6 +25,11 @@ az role assignment create --role "Storage Blob Data Owner" --assignee $AML_USER_
 az synapse spark pool create --name $SPARK_POOL_NAME --workspace-name $SYNAPSE_WORKSPACE_NAME --resource-group $RESOURCE_GROUP --spark-version 3.2 --node-count 3 --node-size Medium --min-node-count 3 --max-node-count 10 --enable-auto-scale true
 az synapse workspace firewall-rule create --name allowAll --workspace-name $SYNAPSE_WORKSPACE_NAME --resource-group $RESOURCE_GROUP --start-ip-address 0.0.0.0 --end-ip-address 255.255.255.255
 
+if [[! -z "$COMPUTE_MANAGED_IDENTITY"]]
+  then
+    az synapse role assignment create --workspace-name $SYNAPSE_WORKSPACE_NAME --role $SPARK_POOL_ADMIN_ROLE_ID --assignee $COMPUTE_MANAGED_IDENTITY
+
+
 TEMP_COMPUTE_FILE="temp-compute-setup.yml"
 cp $1 $TEMP_COMPUTE_FILE
 sed -i "s/<SUBSCRIPTION_ID>/$SUBSCRIPTION_ID/g;

From 0fb9cbb6dfc1867401f5faca91cb1fe3ea496a62 Mon Sep 17 00:00:00 2001
From: Fred Li 
Date: Wed, 21 Jun 2023 12:32:56 -0700
Subject: [PATCH 31/40] Format the empty check

---
 cli/jobs/spark/setup-attached-resources.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cli/jobs/spark/setup-attached-resources.sh b/cli/jobs/spark/setup-attached-resources.sh
index 1e6e4574c6..f56985cb34 100644
--- a/cli/jobs/spark/setup-attached-resources.sh
+++ b/cli/jobs/spark/setup-attached-resources.sh
@@ -26,9 +26,9 @@ az synapse spark pool create --name $SPARK_POOL_NAME --workspace-name $SYNAPSE_W
 az synapse workspace firewall-rule create --name allowAll --workspace-name $SYNAPSE_WORKSPACE_NAME --resource-group $RESOURCE_GROUP --start-ip-address 0.0.0.0 --end-ip-address 255.255.255.255
 
 if [[! -z "$COMPUTE_MANAGED_IDENTITY"]]
-  then
-    az synapse role assignment create --workspace-name $SYNAPSE_WORKSPACE_NAME --role $SPARK_POOL_ADMIN_ROLE_ID --assignee $COMPUTE_MANAGED_IDENTITY
-
+then
+  az synapse role assignment create --workspace-name $SYNAPSE_WORKSPACE_NAME --role $SPARK_POOL_ADMIN_ROLE_ID --assignee $COMPUTE_MANAGED_IDENTITY
+fi
 
 TEMP_COMPUTE_FILE="temp-compute-setup.yml"
 cp $1 $TEMP_COMPUTE_FILE

From 5b06d820d251a4cf22b4e8f29dbb6c7cb06e33e3 Mon Sep 17 00:00:00 2001
From: Fred Li 
Date: Wed, 21 Jun 2023 13:05:34 -0700
Subject: [PATCH 32/40] Check if identity is empty

---
 cli/jobs/spark/setup-attached-resources.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cli/jobs/spark/setup-attached-resources.sh b/cli/jobs/spark/setup-attached-resources.sh
index f56985cb34..7027bb440a 100644
--- a/cli/jobs/spark/setup-attached-resources.sh
+++ b/cli/jobs/spark/setup-attached-resources.sh
@@ -25,7 +25,7 @@ az role assignment create --role "Storage Blob Data Owner" --assignee $AML_USER_
 az synapse spark pool create --name $SPARK_POOL_NAME --workspace-name $SYNAPSE_WORKSPACE_NAME --resource-group $RESOURCE_GROUP --spark-version 3.2 --node-count 3 --node-size Medium --min-node-count 3 --max-node-count 10 --enable-auto-scale true
 az synapse workspace firewall-rule create --name allowAll --workspace-name $SYNAPSE_WORKSPACE_NAME --resource-group $RESOURCE_GROUP --start-ip-address 0.0.0.0 --end-ip-address 255.255.255.255
 
-if [[! -z "$COMPUTE_MANAGED_IDENTITY"]]
+if [[ ! -z "$COMPUTE_MANAGED_IDENTITY" ]]
 then
   az synapse role assignment create --workspace-name $SYNAPSE_WORKSPACE_NAME --role $SPARK_POOL_ADMIN_ROLE_ID --assignee $COMPUTE_MANAGED_IDENTITY
 fi

From 1b57095f765b145bf7ce9a2d4241f07f32c824ed Mon Sep 17 00:00:00 2001
From: Fred Li 
Date: Wed, 21 Jun 2023 13:16:00 -0700
Subject: [PATCH 33/40] Update to get compute properties

---
 cli/jobs/spark/setup-attached-resources.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cli/jobs/spark/setup-attached-resources.sh b/cli/jobs/spark/setup-attached-resources.sh
index 7027bb440a..9893ff040b 100644
--- a/cli/jobs/spark/setup-attached-resources.sh
+++ b/cli/jobs/spark/setup-attached-resources.sh
@@ -14,7 +14,7 @@ SQL_ADMIN_LOGIN_USER="automation"
 SQL_ADMIN_LOGIN_PASSWORD="auto123!"
SPARK_POOL_NAME="automationpool" SPARK_POOL_ADMIN_ROLE_ID="6e4bf58a-b8e1-4cc3-bbf9-d73143322b78" -COMPUTE_MANAGED_IDENTITY=$(az ml compute show --name mysparkcompute --resource-group $RESOURCE_GROUP --workspace-name $AML_WORKSPACE_NAME --query [*].identity.principal_id --out tsv) +COMPUTE_MANAGED_IDENTITY=$(az ml compute show --name mysparkcompute --resource-group $RESOURCE_GROUP --workspace-name $AML_WORKSPACE_NAME --query identity.principal_id --out tsv) # # From a2c5e6e0393412abfa418d92229ef9588e3f7958 Mon Sep 17 00:00:00 2001 From: Fred Li Date: Wed, 21 Jun 2023 13:36:01 -0700 Subject: [PATCH 34/40] update readme --- cli/readme.py | 54 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 50 insertions(+), 4 deletions(-) diff --git a/cli/readme.py b/cli/readme.py index 4e5dbf1944..a2f73808d6 100644 --- a/cli/readme.py +++ b/cli/readme.py @@ -9,7 +9,11 @@ import yaml # define constants -EXCLUDED_JOBS = ["java"] +EXCLUDED_JOBS = [ + "java", + "spark-job-component", + "storage_pe", + "user-assigned-identity"] # TODO: Re-include these below endpoints and deployments when the workflow generation code supports substituting vars in .yaml files. EXCLUDED_ENDPOINTS = [ "1-uai-create-endpoint", @@ -33,6 +37,9 @@ "instance", "connections", "compute/cluster-user-identity", + "compute/attached-spark", + "compute/attached-spark-system-identity", + "compute/attached-spark-user-identity", "registry", ] EXCLUDED_ASSETS = ["conda-yamls", "mlflow-models"] @@ -418,6 +425,7 @@ def write_job_workflow(job): filename, project_dir, hyphenated = parse_path(job) posix_project_dir = project_dir.replace(os.sep, "/") is_pipeline_sample = "jobs/pipelines" in job + is_spark_sample = "jobs/spark" in job creds = CREDENTIALS schedule_hour, schedule_minute = get_schedule_time(filename) # Duplicate name in working directory during checkout @@ -437,6 +445,8 @@ def write_job_workflow(job): - .github/workflows/cli-{hyphenated}.yml\n""" if is_pipeline_sample: workflow_yaml += " - cli/run-pipeline-jobs.sh\n" "" + if is_spark_sample: + workflow_yaml += " - cli/jobs/spark/data/titanic.csv\n" "" workflow_yaml += f""" - cli/setup.sh concurrency: group: {GITHUB_CONCURRENCY_GROUP} @@ -463,8 +473,10 @@ def write_job_workflow(job): source "{GITHUB_WORKSPACE}/infra/bootstrapping/init_environment.sh"; bash setup.sh working-directory: cli - continue-on-error: true - - name: run job + continue-on-error: true\n""" + if is_spark_sample: + workflow_yaml += get_spark_setup_workflow(job) + workflow_yaml += f""" - name: run job run: | source "{GITHUB_WORKSPACE}/infra/bootstrapping/sdk_helpers.sh"; source "{GITHUB_WORKSPACE}/infra/bootstrapping/init_environment.sh";\n""" @@ -699,7 +711,7 @@ def write_asset_workflow(asset): creds: {creds} - name: bootstrap resources run: | - bash bootstrap.sh + bash bootstrapping/bootstrap.sh working-directory: infra continue-on-error: false - name: setup-cli @@ -853,6 +865,40 @@ def get_endpoint_name(filename, hyphenated): endpoint_name = yaml.safe_load(f)["name"] return endpoint_name +def get_spark_setup_workflow(job): + is_attached = "attached-spark" in job + is_user_identity = "user-identity" in job + is_managed_identity = "managed-identity" in job + is_default_identity = "default-identity" in job + workflow = f""" - name: upload data + run: | + bash -x upload-data-to-blob.sh jobs/spark/ + working-directory: cli + continue-on-error: true\n""" + if is_managed_identity: + workflow += f""" - name: setup identities + run: | + bash -x setup-identities.sh + working-directory: cli/jobs/spark + 
continue-on-error: true\n""" + if is_attached: + workflow += f""" - name: setup attached spark + working-directory: cli + continue-on-error: true""" + if is_attached and is_user_identity: + workflow += f""" + run: | + bash -x jobs/spark/setup-attached-resources.sh resources/compute/attached-spark-user-identity.yml\n""" + if is_attached and is_managed_identity: + workflow += f""" + run: | + bash -x jobs/spark/setup-attached-resources.sh resources/compute/attached-spark-system-identity.yml\n""" + if is_attached and is_default_identity: + workflow += f""" + run: | + bash -x jobs/spark/setup-attached-resources.sh resources/compute/attached-spark.yml\n""" + + return workflow # run functions if __name__ == "__main__": From 9464d9b752c695194ec72ff4b490eb1ed97b187e Mon Sep 17 00:00:00 2001 From: Fred Li Date: Wed, 21 Jun 2023 13:39:38 -0700 Subject: [PATCH 35/40] Reformat the script --- cli/readme.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/cli/readme.py b/cli/readme.py index a2f73808d6..495a538cb9 100644 --- a/cli/readme.py +++ b/cli/readme.py @@ -9,11 +9,7 @@ import yaml # define constants -EXCLUDED_JOBS = [ - "java", - "spark-job-component", - "storage_pe", - "user-assigned-identity"] +EXCLUDED_JOBS = ["java", "spark-job-component", "storage_pe", "user-assigned-identity"] # TODO: Re-include these below endpoints and deployments when the workflow generation code supports substituting vars in .yaml files. EXCLUDED_ENDPOINTS = [ "1-uai-create-endpoint", @@ -475,7 +471,7 @@ def write_job_workflow(job): working-directory: cli continue-on-error: true\n""" if is_spark_sample: - workflow_yaml += get_spark_setup_workflow(job) + workflow_yaml += get_spark_setup_workflow(job) workflow_yaml += f""" - name: run job run: | source "{GITHUB_WORKSPACE}/infra/bootstrapping/sdk_helpers.sh"; @@ -865,6 +861,7 @@ def get_endpoint_name(filename, hyphenated): endpoint_name = yaml.safe_load(f)["name"] return endpoint_name + def get_spark_setup_workflow(job): is_attached = "attached-spark" in job is_user_identity = "user-identity" in job @@ -882,24 +879,25 @@ def get_spark_setup_workflow(job): working-directory: cli/jobs/spark continue-on-error: true\n""" if is_attached: - workflow += f""" - name: setup attached spark + workflow += f""" - name: setup attached spark working-directory: cli continue-on-error: true""" if is_attached and is_user_identity: - workflow += f""" + workflow += f""" run: | bash -x jobs/spark/setup-attached-resources.sh resources/compute/attached-spark-user-identity.yml\n""" if is_attached and is_managed_identity: - workflow += f""" + workflow += f""" run: | bash -x jobs/spark/setup-attached-resources.sh resources/compute/attached-spark-system-identity.yml\n""" if is_attached and is_default_identity: - workflow += f""" + workflow += f""" run: | bash -x jobs/spark/setup-attached-resources.sh resources/compute/attached-spark.yml\n""" return workflow + # run functions if __name__ == "__main__": # setup argparse From 805e744fb24446cbccaa885c47f64d2c2cade650 Mon Sep 17 00:00:00 2001 From: Fred Li Date: Thu, 22 Jun 2023 10:08:21 -0700 Subject: [PATCH 36/40] Update schema location and revert sdk notebook changes --- cli/jobs/spark/attached-spark-pipeline-default-identity.yml | 2 +- cli/jobs/spark/attached-spark-pipeline-managed-identity.yml | 2 +- cli/jobs/spark/attached-spark-pipeline-user-identity.yml | 2 +- cli/jobs/spark/attached-spark-standalone-default-identity.yml | 2 +- cli/jobs/spark/attached-spark-standalone-managed-identity.yml | 2 +- 
cli/jobs/spark/attached-spark-standalone-user-identity.yml | 2 +- cli/jobs/spark/serverless-spark-pipeline-default-identity.yml | 2 +- cli/jobs/spark/serverless-spark-pipeline-managed-identity.yml | 2 +- cli/jobs/spark/serverless-spark-pipeline-user-identity.yml | 2 +- cli/jobs/spark/serverless-spark-standalone-default-identity.yml | 2 +- cli/jobs/spark/serverless-spark-standalone-managed-identity.yml | 2 +- cli/jobs/spark/serverless-spark-standalone-user-identity.yml | 2 +- cli/jobs/spark/spark-job-component.yml | 2 +- sdk/python/jobs/spark/submit_spark_standalone_jobs.ipynb | 2 +- sdk/python/readme.py | 1 + 15 files changed, 15 insertions(+), 14 deletions(-) diff --git a/cli/jobs/spark/attached-spark-pipeline-default-identity.yml b/cli/jobs/spark/attached-spark-pipeline-default-identity.yml index c372588195..60c4e7517c 100644 --- a/cli/jobs/spark/attached-spark-pipeline-default-identity.yml +++ b/cli/jobs/spark/attached-spark-pipeline-default-identity.yml @@ -1,5 +1,5 @@ # attached-spark-pipeline-default-identity.yaml -$schema: http://azureml/sdk-2-0/PipelineJob.json +$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json type: pipeline display_name: Titanic-Spark-CLI-Pipeline-3 description: Spark component for Titanic data in Pipeline diff --git a/cli/jobs/spark/attached-spark-pipeline-managed-identity.yml b/cli/jobs/spark/attached-spark-pipeline-managed-identity.yml index 1e00a1321b..c1f1ce8ad4 100644 --- a/cli/jobs/spark/attached-spark-pipeline-managed-identity.yml +++ b/cli/jobs/spark/attached-spark-pipeline-managed-identity.yml @@ -1,5 +1,5 @@ # attached-spark-pipeline-managed-identity.yaml -$schema: http://azureml/sdk-2-0/PipelineJob.json +$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json type: pipeline display_name: Titanic-Spark-CLI-Pipeline-1 description: Spark component for Titanic data in Pipeline diff --git a/cli/jobs/spark/attached-spark-pipeline-user-identity.yml b/cli/jobs/spark/attached-spark-pipeline-user-identity.yml index 74d22dce2e..20802db261 100644 --- a/cli/jobs/spark/attached-spark-pipeline-user-identity.yml +++ b/cli/jobs/spark/attached-spark-pipeline-user-identity.yml @@ -1,5 +1,5 @@ # attached-spark-pipeline-user-identity.yaml -$schema: http://azureml/sdk-2-0/PipelineJob.json +$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json type: pipeline display_name: Titanic-Spark-CLI-Pipeline-2 description: Spark component for Titanic data in Pipeline diff --git a/cli/jobs/spark/attached-spark-standalone-default-identity.yml b/cli/jobs/spark/attached-spark-standalone-default-identity.yml index 095803ef3e..9c2f6206ba 100644 --- a/cli/jobs/spark/attached-spark-standalone-default-identity.yml +++ b/cli/jobs/spark/attached-spark-standalone-default-identity.yml @@ -1,5 +1,5 @@ # attached-spark-standalone-default-identity.yaml -$schema: http://azureml/sdk-2-0/SparkJob.json +$schema: https://azuremlschemas.azureedge.net/latest/sparkJob.schema.json type: spark code: ./src diff --git a/cli/jobs/spark/attached-spark-standalone-managed-identity.yml b/cli/jobs/spark/attached-spark-standalone-managed-identity.yml index c2c57a3627..faa5015573 100644 --- a/cli/jobs/spark/attached-spark-standalone-managed-identity.yml +++ b/cli/jobs/spark/attached-spark-standalone-managed-identity.yml @@ -1,5 +1,5 @@ # attached-spark-standalone-managed-identity.yaml -$schema: http://azureml/sdk-2-0/SparkJob.json +$schema: https://azuremlschemas.azureedge.net/latest/sparkJob.schema.json type: spark code: ./src diff --git 
a/cli/jobs/spark/attached-spark-standalone-user-identity.yml b/cli/jobs/spark/attached-spark-standalone-user-identity.yml index b35d9b135d..27580c75f2 100644 --- a/cli/jobs/spark/attached-spark-standalone-user-identity.yml +++ b/cli/jobs/spark/attached-spark-standalone-user-identity.yml @@ -1,5 +1,5 @@ # attached-spark-standalone-user-identity.yaml -$schema: http://azureml/sdk-2-0/SparkJob.json +$schema: https://azuremlschemas.azureedge.net/latest/sparkJob.schema.json type: spark code: ./src diff --git a/cli/jobs/spark/serverless-spark-pipeline-default-identity.yml b/cli/jobs/spark/serverless-spark-pipeline-default-identity.yml index dc3b573c3f..a334b5d824 100644 --- a/cli/jobs/spark/serverless-spark-pipeline-default-identity.yml +++ b/cli/jobs/spark/serverless-spark-pipeline-default-identity.yml @@ -1,5 +1,5 @@ # serverless-spark-pipeline-default-identity.yaml -$schema: http://azureml/sdk-2-0/PipelineJob.json +$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json type: pipeline display_name: Titanic-Spark-CLI-Pipeline-6 description: Spark component for Titanic data in Pipeline diff --git a/cli/jobs/spark/serverless-spark-pipeline-managed-identity.yml b/cli/jobs/spark/serverless-spark-pipeline-managed-identity.yml index 4419b716af..225d51ffc9 100644 --- a/cli/jobs/spark/serverless-spark-pipeline-managed-identity.yml +++ b/cli/jobs/spark/serverless-spark-pipeline-managed-identity.yml @@ -1,5 +1,5 @@ # serverless-spark-pipeline-managed-identity.yaml -$schema: http://azureml/sdk-2-0/PipelineJob.json +$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json type: pipeline display_name: Titanic-Spark-CLI-Pipeline-4 description: Spark component for Titanic data in Pipeline diff --git a/cli/jobs/spark/serverless-spark-pipeline-user-identity.yml b/cli/jobs/spark/serverless-spark-pipeline-user-identity.yml index f679ad6498..9c1dcd6ea8 100644 --- a/cli/jobs/spark/serverless-spark-pipeline-user-identity.yml +++ b/cli/jobs/spark/serverless-spark-pipeline-user-identity.yml @@ -1,5 +1,5 @@ # serverless-spark-pipeline-user-identity.yaml -$schema: http://azureml/sdk-2-0/PipelineJob.json +$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json type: pipeline display_name: Titanic-Spark-CLI-Pipeline-5 description: Spark component for Titanic data in Pipeline diff --git a/cli/jobs/spark/serverless-spark-standalone-default-identity.yml b/cli/jobs/spark/serverless-spark-standalone-default-identity.yml index 98fb3cc637..5f6683b1cf 100644 --- a/cli/jobs/spark/serverless-spark-standalone-default-identity.yml +++ b/cli/jobs/spark/serverless-spark-standalone-default-identity.yml @@ -1,5 +1,5 @@ # serverless-spark-standalone-default-identity.yaml -$schema: http://azureml/sdk-2-0/SparkJob.json +$schema: https://azuremlschemas.azureedge.net/latest/sparkJob.schema.json type: spark code: ./src diff --git a/cli/jobs/spark/serverless-spark-standalone-managed-identity.yml b/cli/jobs/spark/serverless-spark-standalone-managed-identity.yml index 4f9fddfdd4..f84f841a3b 100644 --- a/cli/jobs/spark/serverless-spark-standalone-managed-identity.yml +++ b/cli/jobs/spark/serverless-spark-standalone-managed-identity.yml @@ -1,5 +1,5 @@ # serverless-spark-standalone-managed-identity.yaml -$schema: http://azureml/sdk-2-0/SparkJob.json +$schema: https://azuremlschemas.azureedge.net/latest/sparkJob.schema.json type: spark code: ./src diff --git a/cli/jobs/spark/serverless-spark-standalone-user-identity.yml b/cli/jobs/spark/serverless-spark-standalone-user-identity.yml index 
bd58971e51..a4d448d2d6 100644 --- a/cli/jobs/spark/serverless-spark-standalone-user-identity.yml +++ b/cli/jobs/spark/serverless-spark-standalone-user-identity.yml @@ -1,5 +1,5 @@ # serverless-spark-standalone-user-identity.yaml -$schema: http://azureml/sdk-2-0/SparkJob.json +$schema: https://azuremlschemas.azureedge.net/latest/sparkJob.schema.json type: spark code: ./src diff --git a/cli/jobs/spark/spark-job-component.yml b/cli/jobs/spark/spark-job-component.yml index d7908e08c3..3dbee239f9 100644 --- a/cli/jobs/spark/spark-job-component.yml +++ b/cli/jobs/spark/spark-job-component.yml @@ -1,5 +1,5 @@ # spark-job-component.yaml -$schema: http://azureml/sdk-2-0/SparkComponent.json +$schema: https://azuremlschemas.azureedge.net/latest/sparkComponent.schema.json name: titanic_spark_component type: spark version: 1 diff --git a/sdk/python/jobs/spark/submit_spark_standalone_jobs.ipynb b/sdk/python/jobs/spark/submit_spark_standalone_jobs.ipynb index bb5a1c091a..4a57bfed91 100644 --- a/sdk/python/jobs/spark/submit_spark_standalone_jobs.ipynb +++ b/sdk/python/jobs/spark/submit_spark_standalone_jobs.ipynb @@ -126,7 +126,7 @@ " executor_cores=2,\n", " executor_memory=\"2g\",\n", " executor_instances=2,\n", - " compute=\"my-spark-pool\",\n", + " compute=\"\",\n", " inputs={\n", " \"titanic_data\": Input(\n", " type=\"uri_file\",\n", diff --git a/sdk/python/readme.py b/sdk/python/readme.py index ab11457096..5a0d41de43 100644 --- a/sdk/python/readme.py +++ b/sdk/python/readme.py @@ -22,6 +22,7 @@ "interactive_data_wrangling", "attach_manage_spark_pools", "submit_spark_pipeline_jobs", + "submit_spark_standalone_jobs", "submit_spark_standalone_jobs_managed_vnet", # mlflow SDK samples notebooks "mlflow_sdk_online_endpoints_progresive", From 9f50ca44353413426df54483ccd476ead86014c6 Mon Sep 17 00:00:00 2001 From: Fred Li Date: Thu, 22 Jun 2023 10:25:28 -0700 Subject: [PATCH 37/40] Attach pool first --- cli/jobs/spark/setup-attached-resources.sh | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/cli/jobs/spark/setup-attached-resources.sh b/cli/jobs/spark/setup-attached-resources.sh index 9893ff040b..0165830771 100644 --- a/cli/jobs/spark/setup-attached-resources.sh +++ b/cli/jobs/spark/setup-attached-resources.sh @@ -14,7 +14,7 @@ SQL_ADMIN_LOGIN_USER="automation" SQL_ADMIN_LOGIN_PASSWORD="auto123!" SPARK_POOL_NAME="automationpool" SPARK_POOL_ADMIN_ROLE_ID="6e4bf58a-b8e1-4cc3-bbf9-d73143322b78" -COMPUTE_MANAGED_IDENTITY=$(az ml compute show --name mysparkcompute --resource-group $RESOURCE_GROUP --workspace-name $AML_WORKSPACE_NAME --query identity.principal_id --out tsv) +ATTACHED_COMPUTE_NAME="mysparkcompute" # # @@ -25,11 +25,6 @@ az role assignment create --role "Storage Blob Data Owner" --assignee $AML_USER_ az synapse spark pool create --name $SPARK_POOL_NAME --workspace-name $SYNAPSE_WORKSPACE_NAME --resource-group $RESOURCE_GROUP --spark-version 3.2 --node-count 3 --node-size Medium --min-node-count 3 --max-node-count 10 --enable-auto-scale true az synapse workspace firewall-rule create --name allowAll --workspace-name $SYNAPSE_WORKSPACE_NAME --resource-group $RESOURCE_GROUP --start-ip-address 0.0.0.0 --end-ip-address 255.255.255.255 -if [[ ! 
-z "$COMPUTE_MANAGED_IDENTITY" ]]
-then
-  az synapse role assignment create --workspace-name $SYNAPSE_WORKSPACE_NAME --role $SPARK_POOL_ADMIN_ROLE_ID --assignee $COMPUTE_MANAGED_IDENTITY
-fi
-
 TEMP_COMPUTE_FILE="temp-compute-setup.yml"
 cp $1 $TEMP_COMPUTE_FILE
 sed -i "s/<SUBSCRIPTION_ID>/$SUBSCRIPTION_ID/g;
@@ -39,4 +34,12 @@ sed -i "s/<SUBSCRIPTION_ID>/$SUBSCRIPTION_ID/g;
     s/<AML_USER_MANAGED_ID>/$AML_USER_MANAGED_ID/g;" $TEMP_COMPUTE_FILE
 
 az ml compute attach --file $TEMP_COMPUTE_FILE --subscription $SUBSCRIPTION_ID --resource-group $RESOURCE_GROUP --workspace-name $AML_WORKSPACE_NAME
+
+COMPUTE_MANAGED_IDENTITY=$(az ml compute show --name $ATTACHED_COMPUTE_NAME --resource-group $RESOURCE_GROUP --workspace-name $AML_WORKSPACE_NAME --query identity.principal_id --out tsv)
+
+if [[ ! -z "$COMPUTE_MANAGED_IDENTITY" ]]
+then
+  az synapse role assignment create --workspace-name $SYNAPSE_WORKSPACE_NAME --role $SPARK_POOL_ADMIN_ROLE_ID --assignee $COMPUTE_MANAGED_IDENTITY
+fi
+
 # 
\ No newline at end of file

From bd06bf72708c92053178a0970bcf7292bd372c89 Mon Sep 17 00:00:00 2001
From: Fred Li 
Date: Thu, 6 Jul 2023 10:42:22 -0700
Subject: [PATCH 38/40] Rename resources and merge main

---
 cli/jobs/spark/setup-attached-resources.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cli/jobs/spark/setup-attached-resources.sh b/cli/jobs/spark/setup-attached-resources.sh
index 0165830771..b383d02652 100644
--- a/cli/jobs/spark/setup-attached-resources.sh
+++ b/cli/jobs/spark/setup-attached-resources.sh
@@ -7,9 +7,9 @@ API_VERSION="2022-05-01"
 TOKEN=$(az account get-access-token --query accessToken -o tsv)
 AML_USER_MANAGED_ID=${RESOURCE_GROUP}-uai
 AML_USER_MANAGED_ID_OID=$(az identity show --resource-group $RESOURCE_GROUP -n $AML_USER_MANAGED_ID --query principalId -o tsv)
-GEN2_STORAGE_NAME="gen2automationspark"
-GEN2_FILE_SYSTEM="gen2filesystem"
-SYNAPSE_WORKSPACE_NAME="automation-syws"
+GEN2_STORAGE_NAME=${RESOURCE_GROUP}gen2
+GEN2_FILE_SYSTEM=${RESOURCE_GROUP}file
+SYNAPSE_WORKSPACE_NAME=${AML_WORKSPACE_NAME}-syws
 SQL_ADMIN_LOGIN_USER="automation"
 SQL_ADMIN_LOGIN_PASSWORD="auto123!"
 SPARK_POOL_NAME="automationpool"

From 279a9eef72643678e7e69d99668b5992e41dd593 Mon Sep 17 00:00:00 2001
From: Fred Li 
Date: Thu, 6 Jul 2023 12:03:42 -0700
Subject: [PATCH 39/40] Update format in yml

---
 cli/resources/compute/attached-spark-user-identity.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cli/resources/compute/attached-spark-user-identity.yml b/cli/resources/compute/attached-spark-user-identity.yml
index 810f451ce8..3227aa76f1 100644
--- a/cli/resources/compute/attached-spark-user-identity.yml
+++ b/cli/resources/compute/attached-spark-user-identity.yml
@@ -3,7 +3,7 @@ name: mysparkcompute
 
 type: synapsespark
 
-resource_id: /subscriptions/<SUBSCRIPTION_ID>/providers/Microsoft.Synapse/workspaces/<SYNAPSE_WORKSPACE_NAME>/bigDataPools/<SPARK_POOL_NAME>
+resource_id: /subscriptions/<SUBSCRIPTION_ID>/resourceGroups/<RESOURCE_GROUP>/providers/Microsoft.Synapse/workspaces/<SYNAPSE_WORKSPACE_NAME>/bigDataPools/<SPARK_POOL_NAME>
 
 identity:
   type: user_assigned

From 01e1d380d37ce605811841319dc5d70a55c7f166 Mon Sep 17 00:00:00 2001
From: Fred Li 
Date: Thu, 6 Jul 2023 12:28:42 -0700
Subject: [PATCH 40/40] Add role assigment to uid

---
 cli/jobs/spark/setup-attached-resources.sh | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/cli/jobs/spark/setup-attached-resources.sh b/cli/jobs/spark/setup-attached-resources.sh
index b383d02652..9792c3f642 100644
--- a/cli/jobs/spark/setup-attached-resources.sh
+++ b/cli/jobs/spark/setup-attached-resources.sh
@@ -5,8 +5,7 @@ RESOURCE_GROUP=$(az group show --query name -o tsv)
 AML_WORKSPACE_NAME=$(az configure -l --query "[?name=='workspace'].value" -o tsv)
 API_VERSION="2022-05-01"
 TOKEN=$(az account get-access-token --query accessToken -o tsv)
-AML_USER_MANAGED_ID=${RESOURCE_GROUP}-uai
-AML_USER_MANAGED_ID_OID=$(az identity show --resource-group $RESOURCE_GROUP -n $AML_USER_MANAGED_ID --query principalId -o tsv)
+
 GEN2_STORAGE_NAME=${RESOURCE_GROUP}gen2
 GEN2_FILE_SYSTEM=${RESOURCE_GROUP}file
 SYNAPSE_WORKSPACE_NAME=${AML_WORKSPACE_NAME}-syws
@@ -17,6 +16,12 @@ SPARK_POOL_ADMIN_ROLE_ID="6e4bf58a-b8e1-4cc3-bbf9-d73143322b78"
 ATTACHED_COMPUTE_NAME="mysparkcompute"
 
 # 
+# 
+AML_USER_MANAGED_ID=${RESOURCE_GROUP}-uai
+az identity create --name $AML_USER_MANAGED_ID --resource-group $RESOURCE_GROUP --location $LOCATION
+AML_USER_MANAGED_ID_OID=$(az identity show --resource-group $RESOURCE_GROUP -n $AML_USER_MANAGED_ID --query principalId -o tsv)
+# 
+
 # 
 az storage account create --name $GEN2_STORAGE_NAME --resource-group $RESOURCE_GROUP --location $LOCATION --sku Standard_LRS --kind StorageV2 --enable-hierarchical-namespace true
 az storage fs create -n $GEN2_FILE_SYSTEM --account-name $GEN2_STORAGE_NAME
@@ -39,4 +44,5 @@ sed -i "s/<SUBSCRIPTION_ID>/$SUBSCRIPTION_ID/g;
     s/<AML_USER_MANAGED_ID>/$AML_USER_MANAGED_ID/g;" $TEMP_COMPUTE_FILE
 
 az ml compute attach --file $TEMP_COMPUTE_FILE --subscription $SUBSCRIPTION_ID --resource-group $RESOURCE_GROUP --workspace-name $AML_WORKSPACE_NAME
+az synapse role assignment create --workspace-name $SYNAPSE_WORKSPACE_NAME --role $SPARK_POOL_ADMIN_ROLE_ID --assignee $AML_USER_MANAGED_ID_OID
 
 COMPUTE_MANAGED_IDENTITY=$(az ml compute show --name $ATTACHED_COMPUTE_NAME --resource-group $RESOURCE_GROUP --workspace-name $AML_WORKSPACE_NAME --query identity.principal_id --out tsv)
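
For reference, the setup flow that PATCH 27 through PATCH 40 converge on reduces to one pattern: copy a compute YAML template, substitute its angle-bracket placeholders, then attach the compute to the workspace. The sketch below is a hedged reconstruction, not the repository's exact script: it assumes the template passed as $1 carries <SUBSCRIPTION_ID>-style tokens (the tokens the sed expressions above substitute) and that SUBSCRIPTION_ID, RESOURCE_GROUP, AML_WORKSPACE_NAME, SYNAPSE_WORKSPACE_NAME, SPARK_POOL_NAME, and AML_USER_MANAGED_ID are already set in the environment.

#!/usr/bin/env bash
# Sketch: substitute placeholders in a compute template, then attach it.
set -euo pipefail

TEMPLATE_FILE="$1"                          # e.g. resources/compute/attached-spark-user-identity.yml (assumed path)
TEMP_COMPUTE_FILE="temp-compute-setup.yml"  # scratch copy; the checked-in template stays untouched

cp "$TEMPLATE_FILE" "$TEMP_COMPUTE_FILE"

# One s/<PLACEHOLDER>/value/g expression per token; quoting the filename
# avoids the word splitting the unquoted arguments in the original allow.
sed -i "s/<SUBSCRIPTION_ID>/$SUBSCRIPTION_ID/g;
    s/<RESOURCE_GROUP>/$RESOURCE_GROUP/g;
    s/<SYNAPSE_WORKSPACE_NAME>/$SYNAPSE_WORKSPACE_NAME/g;
    s/<SPARK_POOL_NAME>/$SPARK_POOL_NAME/g;
    s/<AML_USER_MANAGED_ID>/$AML_USER_MANAGED_ID/g;" "$TEMP_COMPUTE_FILE"

az ml compute attach --file "$TEMP_COMPUTE_FILE" \
    --subscription "$SUBSCRIPTION_ID" \
    --resource-group "$RESOURCE_GROUP" \
    --workspace-name "$AML_WORKSPACE_NAME"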
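The query change in PATCH 33 is worth a note: az ml compute show returns a single JSON object, so the JMESPath projection [*].identity.principal_id (which iterates an array) yields nothing and leaves the variable empty, while identity.principal_id addresses the object directly. PATCH 37 then moves the lookup after az ml compute attach, since the identity only exists once the compute does. A hedged sketch of the resulting order, under the same variable assumptions as above plus ATTACHED_COMPUTE_NAME and SPARK_POOL_ADMIN_ROLE_ID:

# Sketch: attach first, then read the compute's managed identity and grant it
# the Synapse role referenced by SPARK_POOL_ADMIN_ROLE_ID only when present.
COMPUTE_MANAGED_IDENTITY=$(az ml compute show \
    --name "$ATTACHED_COMPUTE_NAME" \
    --resource-group "$RESOURCE_GROUP" \
    --workspace-name "$AML_WORKSPACE_NAME" \
    --query identity.principal_id --out tsv)

# [[ -n ... ]] is the positive form of the [[ ! -z ... ]] test that
# PATCH 31/32 repair; the spaces inside [[ ]] are mandatory bash syntax.
if [[ -n "$COMPUTE_MANAGED_IDENTITY" ]]; then
    az synapse role assignment create \
        --workspace-name "$SYNAPSE_WORKSPACE_NAME" \
        --role "$SPARK_POOL_ADMIN_ROLE_ID" \
        --assignee "$COMPUTE_MANAGED_IDENTITY"
fi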