#
# azure-pipelines-data-factory-accesstoken.yml
#
# Pipeline that deploys an Azure Data Factory pipeline with an Azure Databricks linked service
# that authenticates using an access token stored in Key Vault.
#
trigger:
- none

variables:
- template: vars.yml # Template reference

parameters:
- name: serviceConnection
  displayName: 'Azure Resource Manager service connection'
  type: string
  default: $(serviceConnection)
- name: pool
  displayName: 'Agent Pool'
  type: object
  default:
    vmImage: 'ubuntu-latest'
- name: databricksClusterNumWorkers
  displayName: 'Number of worker nodes in the Azure Databricks Cluster'
  type: number
  default: 1
- name: databricksClusterSparkVersion
  displayName: 'Azure Databricks Cluster Spark Version'
  type: string
  default: '13.3.x-scala2.12'
- name: pipelineDatabaseName
  displayName: 'Name of the Database used by the Pipeline'
  type: string
  default: 'pipeline'
- name: pipelineSourceData
  displayName: 'Path to the Source Dataset'
  type: string
  default: '/databricks-datasets/weather/high_temps'
- name: pipelineBronzeTableName
  displayName: 'Name of the Delta Bronze table'
  type: string
  default: 'bronze'
- name: pipelineSilverTableName
  displayName: 'Name of the Delta Silver table'
  type: string
  default: 'silver'
- name: armTemplatesLocation
  displayName: 'Base folder path containing the ARM templates'
  type: string
  default: '$(System.DefaultWorkingDirectory)/arm'
- name: dataFactoryLinkedServiceArmTemplate
  displayName: 'Name of the ARM template file that deploys the Databricks linked service'
  type: string
  default: 'azure-data-factory-linkedservice-databricks-accesstoken.json'
- name: dataFactoryPipelineArmTemplate
  displayName: 'Name of the ARM template file that deploys the Data Factory Pipeline'
  type: string
  default: 'azure-data-factory-pipeline.json'
- name: scriptsLocation
  displayName: 'Base folder path containing the scripts'
  type: string
  default: '$(System.DefaultWorkingDirectory)/scripts'

stages:
- stage: dataPipelineDeployment
  displayName: 'Deploy data pipeline'
  jobs:
  - job: deployDataPipeline
    displayName: 'Deploy data pipeline'
    pool: ${{ parameters.pool }}
    variables:
      databricksServiceName: "${{ variables.DATABRICKS_WORKSPACE_NAME }}_accesstoken"
      keyVaultServiceName: ${{ variables.KEY_VAULT_NAME }}
      secretNameAccessToken: 'dataServicePrincipalAccessToken'
      databricksClusterLogPath: 'dbfs:/cluster-logs'
    steps:
    - checkout: self

    # Get the Azure Location of the Resource Group
    - task: AzureCLI@2
      displayName: 'Get the Azure Location of ${{ variables.RESOURCE_GROUP_NAME }}'
      inputs:
        azureSubscription: ${{ parameters.serviceConnection }}
        scriptType: 'bash'
        scriptLocation: 'inlineScript'
        inlineScript: |
          # --output tsv returns the bare location name instead of a JSON-quoted string
          rg_location=$(az group show --name "${{ variables.RESOURCE_GROUP_NAME }}" --query location --output tsv)
          [ -n "${rg_location}" ] && echo "##vso[task.setvariable variable=resourceGroupLocation;issecret=false]${rg_location}" || exit 1
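    # The "##vso[task.setvariable ...]" logging command promotes the shell variable to a pipeline
    # variable, so later tasks can consume it as $(resourceGroupLocation).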

    # Get the Databricks workspace URL and AAD Access Token of the Azure DevOps Service Principal
    - template: 'templates/get-workspace-login.yml' # Template reference
      parameters:
        serviceConnection: ${{ parameters.serviceConnection }}
        resourceGroupName: ${{ variables.RESOURCE_GROUP_NAME }}
        databricksWorkspaceName: ${{ variables.DATABRICKS_WORKSPACE_NAME }}
        scriptsLocation: ${{ parameters.scriptsLocation }}
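    # Assumption: the template above publishes the databricksWorkspaceUrl and accessToken pipeline
    # variables that the remaining steps consume as $(databricksWorkspaceUrl) and $(accessToken).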

    # Add the newly generated Access Token to the Key Vault
    - task: AzureCLI@2
      displayName: 'Add the Access Token to Key Vault'
      inputs:
        azureSubscription: ${{ parameters.serviceConnection }}
        scriptType: 'bash'
        scriptPath: '${{ parameters.scriptsLocation }}/add_secret_to_key_vault.sh'
        arguments: '"${{ variables.KEY_VAULT_NAME }}"
          "$(secretNameAccessToken)"
          "$(accessToken)"'
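    # add_secret_to_key_vault.sh is not part of this file; a minimal sketch of what it is assumed
    # to wrap (illustrative only, positional arguments as passed above):
    #   az keyvault secret set --vault-name "$1" --name "$2" --value "$3"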

    # Get the data pipeline Service Principal Client Id and Secret (from the Azure DevOps Service Principal)
    - task: AzureCLI@2
      displayName: 'Get the running Pipeline Service Principal Client Id and Secret'
      inputs:
        azureSubscription: ${{ parameters.serviceConnection }}
        addSpnToEnvironment: true
        scriptType: 'bash'
        scriptLocation: 'inlineScript'
        inlineScript: |
          [ -n "${servicePrincipalId}" ] && echo "##vso[task.setvariable variable=dataServicePrincipalClientId;issecret=false]${servicePrincipalId}" || exit 1
          [ -n "${servicePrincipalKey}" ] && echo "##vso[task.setvariable variable=dataServicePrincipalClientSecret;issecret=true]${servicePrincipalKey}" || exit 1
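    # addSpnToEnvironment: true exposes the service connection credentials to the script as the
    # servicePrincipalId, servicePrincipalKey and tenantId environment variables, which the inline
    # script above republishes as pipeline variables.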

    # Add the data pipeline Service Principal Client Secret to the Databricks Secret Scope
    # This needs to be done until Key Vault backed Secret Scopes are supported with Service Principals
    - task: AzureCLI@2
      displayName: 'Add the Client Secret to Databricks Secret Scope'
      inputs:
        azureSubscription: '${{ parameters.serviceConnection }}'
        scriptType: 'bash'
        scriptPath: '${{ parameters.scriptsLocation }}/add_secret_to_secret_scope.sh'
        arguments: '"$(databricksWorkspaceUrl)"
          "$(accessToken)"
          "${{ variables.DATABRICKS_SECRET_SCOPE_NAME }}"
          "${{ variables.SECRET_NAME_CLIENT_SECRET }}"
          "$(dataServicePrincipalClientSecret)"'
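    # add_secret_to_secret_scope.sh is not part of this file; it presumably calls the Databricks
    # Secrets REST API, roughly (illustrative sketch only):
    #   curl -X POST "<workspace-url>/api/2.0/secrets/put" \
    #     -H "Authorization: Bearer <access-token>" \
    #     -d '{"scope": "<scope-name>", "key": "<secret-name>", "string_value": "<secret-value>"}'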

    # Get the Instance Pool ID
    - task: Bash@3
      displayName: 'Get the ID of ${{ variables.DATABRICKS_JOBS_POOL_NAME }}'
      inputs:
        targetType: 'filePath'
        filePath: '${{ parameters.scriptsLocation }}/get_instance_pool.sh'
        arguments: '"$(databricksWorkspaceUrl)" "$(accessToken)" "${{ variables.DATABRICKS_JOBS_POOL_NAME }}"'
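    # Assumption: get_instance_pool.sh looks the pool up by name via the Databricks Instance Pools
    # API (GET /api/2.0/instance-pools/list) and publishes its ID as the databricksPoolId variable
    # consumed by the linked service deployment below.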

    # Setup Python
    - template: 'templates/configure-python.yml' # Template reference

    # Deploy the Databricks data pipeline notebooks
    - template: 'templates/deploy-notebooks.yml' # Template reference
      parameters:
        databricksWorkspaceUrl: $(databricksWorkspaceUrl)
        accessToken: $(accessToken)
        notebooksSourceLocation: ${{ variables.NOTEBOOKS_PIPELINE_SOURCE_LOCATION }}
        notebooksWorkspaceFolder: ${{ variables.NOTEBOOKS_PIPELINE_WORKSPACE_FOLDER }}
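    # Assumption: configure-python.yml installs the Python tooling needed by deploy-notebooks.yml,
    # which in turn imports the notebooks from NOTEBOOKS_PIPELINE_SOURCE_LOCATION into the
    # NOTEBOOKS_PIPELINE_WORKSPACE_FOLDER workspace folder using the access token.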

    # Deploy the Databricks linked service with Access Token
    - task: AzureResourceManagerTemplateDeployment@3
      displayName: 'Deploy Azure Databricks linked service'
      inputs:
        deploymentScope: 'Resource Group'
        azureResourceManagerConnection: ${{ parameters.serviceConnection }}
        action: 'Create Or Update Resource Group'
        resourceGroupName: ${{ variables.RESOURCE_GROUP_NAME }}
        location: $(resourceGroupLocation)
        templateLocation: 'Linked artifact'
        csmFile: '${{ parameters.armTemplatesLocation }}/${{ parameters.dataFactoryLinkedServiceArmTemplate }}'
        overrideParameters: '-factoryName "${{ variables.DATA_FACTORY_NAME }}"
          -keyVaultServiceName "$(keyVaultServiceName)"
          -keyVaultSecretName "$(secretNameAccessToken)"
          -databricksServiceName "$(databricksServiceName)"
          -databricksWorkspaceUrl "$(databricksWorkspaceUrl)"
          -databricksWorkspaceName "${{ variables.DATABRICKS_WORKSPACE_NAME }}"
          -databricksPoolId "$(databricksPoolId)"
          -databricksClusterNumWorkers "${{ parameters.databricksClusterNumWorkers }}"
          -databricksClusterSparkVersion "${{ parameters.databricksClusterSparkVersion }}"
          -databricksClusterLogPath "$(databricksClusterLogPath)"
          -dataServicePrincipalClientId "$(dataServicePrincipalClientId)"
          -databricksSecretScopeName "${{ variables.DATABRICKS_SECRET_SCOPE_NAME }}"
          -secretNameClientSecret "${{ variables.SECRET_NAME_CLIENT_SECRET }}"'
        deploymentMode: 'Incremental'
        deploymentName: $(databricksServiceName)
        deploymentOutputs: 'armOutput'
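    # deploymentOutputs stores any ARM template outputs as a JSON string in the armOutput
    # pipeline variable; no later step in this pipeline reads it.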

    # Deploy the Azure Data Factory Pipeline
    - task: AzureResourceManagerTemplateDeployment@3
      displayName: 'Deploy Azure Data Factory Pipeline'
      inputs:
        deploymentScope: 'Resource Group'
        azureResourceManagerConnection: ${{ parameters.serviceConnection }}
        action: 'Create Or Update Resource Group'
        resourceGroupName: ${{ variables.RESOURCE_GROUP_NAME }}
        location: $(resourceGroupLocation)
        templateLocation: 'Linked artifact'
        csmFile: '${{ parameters.armTemplatesLocation }}/${{ parameters.dataFactoryPipelineArmTemplate }}'
        overrideParameters: '-factoryName "${{ variables.DATA_FACTORY_NAME }}"
          -pipelineName "${{ variables.DATA_FACTORY_PIPELINE_NAME }}"
          -databricksServiceName "$(databricksServiceName)"'
        deploymentMode: 'Incremental'
        deploymentName: ${{ variables.DATA_FACTORY_PIPELINE_NAME }}
        deploymentOutputs: 'armOutput'

- stage: dataPipelineRun
  displayName: 'Run Data Pipeline'
  dependsOn: dataPipelineDeployment
  jobs:
  - job: runDataPipeline
    displayName: 'Run Azure Data Factory Pipeline'
    pool: ${{ parameters.pool }}
    steps:
    # Invoke the Azure Data Factory Pipeline deployed in the previous stage
    - task: AzurePowerShell@5
      displayName: 'Invoke Azure Data Factory Pipeline'
      inputs:
        azureSubscription: ${{ parameters.serviceConnection }}
        ScriptType: 'InlineScript'
        Inline: |
          $parameters = @{
            "databaseName" = "${{ parameters.pipelineDatabaseName }}"
            "databaseLocation" = "abfss://${{ variables.PIPELINE_CONTAINER_NAME }}@${{ variables.STORAGE_ACCOUNT_NAME }}.dfs.core.windows.net/${{ parameters.pipelineDatabaseName }}.db"
            "sourcePath" = "${{ parameters.pipelineSourceData }}"
            "bronzeTableName" = "${{ parameters.pipelineBronzeTableName }}"
            "silverTableName" = "${{ parameters.pipelineSilverTableName }}"
          }
          Invoke-AzDataFactoryV2Pipeline -ResourceGroupName "${{ variables.RESOURCE_GROUP_NAME }}" `
            -DataFactoryName "${{ variables.DATA_FACTORY_NAME }}" `
            -PipelineName "${{ variables.DATA_FACTORY_PIPELINE_NAME }}" `
            -Parameter $parameters
        azurePowerShellVersion: 'LatestVersion'
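    # Note: Invoke-AzDataFactoryV2Pipeline only queues the run and returns its run ID; it does not
    # wait for the run to finish. The status could be polled with Get-AzDataFactoryV2PipelineRun if
    # a synchronous result were needed here.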