.gitignore
@@ -0,0 +1,8 @@
.databricks/
build/
dist/
__pycache__/
*.egg-info
.venv/
scratch/**
!scratch/README.md
.vscode/__builtins__.pyi
@@ -0,0 +1,3 @@
# Typings for Pylance in Visual Studio Code
# see https://github.com/microsoft/pyright/blob/main/docs/builtins.md
from databricks.sdk.runtime import *
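This one-line stub works together with the `python.analysis.stubPath` setting in `settings.json` below: Pylance reads it and treats every name re-exported from `databricks.sdk.runtime` (such as `dbutils`) as an ambient global, matching what a Databricks cluster injects at runtime. Outside a cluster the same names can be imported explicitly; a minimal sketch, assuming the `databricks-sdk` package is installed and workspace authentication is configured:

```
# Explicitly import the global that the stub declares for Pylance.
from databricks.sdk.runtime import dbutils

# List the workspace filesystem root; on a cluster `dbutils` is available
# without any import at all.
for entry in dbutils.fs.ls("/"):
    print(entry.path)
```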
.vscode/extensions.json
@@ -0,0 +1,7 @@
{
    "recommendations": [
        "databricks.databricks",
        "ms-python.vscode-pylance",
        "redhat.vscode-yaml"
    ]
}
.vscode/settings.json
@@ -0,0 +1,17 @@
{
    "python.analysis.stubPath": ".vscode",
    "databricks.python.envFile": "${workspaceFolder}/.env",
    "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\<codecell\\>|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])",
    "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------",
    "python.testing.pytestArgs": [
        "."
    ],
    "python.testing.unittestEnabled": false,
    "python.testing.pytestEnabled": true,
    "python.analysis.extraPaths": ["src"],
    "files.exclude": {
        "**/*.egg-info": true,
        "**/__pycache__": true,
        ".pytest_cache": true,
    },
}
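The two `cellMarker` settings above let the VS Code Jupyter extension split Databricks notebook source files into interactive-window cells. A minimal sketch of a Python file those regexes would recognize (a hypothetical file, not part of this commit):

```
# Databricks notebook source
# The line above marks this .py file as a Databricks notebook export;
# each "COMMAND" separator below starts a new interactive-window cell.

# COMMAND ----------

greeting = "running one cell at a time"
print(greeting)

# COMMAND ----------

print(greeting.upper())
```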
README.md
@@ -0,0 +1,42 @@
# marcin_project

The 'marcin_project' project was generated using the default-python template.

## Getting started

1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html

2. Authenticate to your Databricks workspace:
   ```
   $ databricks configure
   ```

3. To deploy a development copy of this project, type:
   ```
   $ databricks bundle deploy --target dev
   ```
   (Note that "dev" is the default target, so the `--target` parameter
   is optional here.)

   This deploys everything that's defined for this project.
   For example, the default template would deploy a job called
   `[dev yourname] marcin_project_job` to your workspace.
   You can find that job by opening your workspace and clicking on **Workflows**.

4. Similarly, to deploy a production copy, type:
   ```
   $ databricks bundle deploy --target prod
   ```

5. To run a job or pipeline, use the "run" command:
   ```
   $ databricks bundle run
   ```

6. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from
   https://docs.databricks.com/dev-tools/vscode-ext.html. Or read the "getting started" documentation for
   **Databricks Connect** for instructions on running the included Python code from a different IDE.

7. For documentation on the Databricks asset bundles format used
   for this project, and for CI/CD configuration, see
   https://docs.databricks.com/dev-tools/bundles/index.html.
databricks.yml
@@ -0,0 +1,41 @@
# This is a Databricks asset bundle definition for marcin_project.
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
bundle:
  name: marcin_project

include:
  - resources/*.yml

targets:
  # The 'dev' target, for development purposes. This target is the default.
  dev:
    # We use 'mode: development' to indicate this is a personal development copy:
    # - Deployed resources get prefixed with '[dev my_user_name]'
    # - Any job schedules and triggers are paused by default
    # - The 'development' mode is used for Delta Live Tables pipelines
    mode: development
    default: true
    workspace:
      host: https://adb-8870486534760962.2.azuredatabricks.net

  ## Optionally, there could be a 'staging' target here.
  ## (See Databricks docs on CI/CD at https://docs.databricks.com/dev-tools/bundles/index.html.)
  #
  # staging:
  #   workspace:
  #     host: https://adb-8870486534760962.2.azuredatabricks.net

  # The 'prod' target, used for production deployment.
  prod:
    # We use 'mode: production' to indicate this is a production deployment.
    # Doing so enables strict verification of the settings below.
    mode: production
    workspace:
      host: https://adb-8870486534760962.2.azuredatabricks.net
      # We only have a single deployment copy for production, so we use a shared path.
      root_path: /Shared/.bundle/prod/${bundle.name}
    run_as:
      # This runs as [email protected] in production. We could also use a service principal here
      # using service_principal_name (see https://docs.databricks.com/dev-tools/bundles/permissions.html).
      user_name: [email protected]
@@ -0,0 +1,22 @@
# Fixtures

This folder is reserved for fixtures, such as CSV files.

Below is an example of how to load fixtures as a data frame:

```
import pandas as pd
import os

def get_absolute_path(*relative_parts):
    if 'dbutils' in globals():
        base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get())  # type: ignore
        path = os.path.normpath(os.path.join(base_dir, *relative_parts))
        return path if path.startswith("/Workspace") else "/Workspace" + path
    else:
        return os.path.join(*relative_parts)

csv_file = get_absolute_path("..", "fixtures", "mycsv.csv")
df = pd.read_csv(csv_file)
display(df)
```
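Because the helper falls back to a plain relative path outside Databricks, the same fixture can also back a local test. A short sketch, assuming a `mycsv.csv` fixture actually exists and that the test file lives in a `tests/` folder next to `fixtures/`:

```
import os

import pandas as pd

def test_fixture_loads():
    # Resolve the fixture relative to this test file rather than the
    # current working directory, so the test passes from any invocation dir.
    fixture = os.path.join(os.path.dirname(__file__), "..", "fixtures", "mycsv.csv")
    df = pd.read_csv(fixture)
    assert not df.empty
```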
pytest.ini
@@ -0,0 +1,3 @@
[pytest]
testpaths = tests
pythonpath = src
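`pythonpath = src` (a built-in pytest option since pytest 7) prepends `src` to `sys.path` during collection, so tests can import the project package without installing it first. A minimal sketch, assuming the template's usual `src/marcin_project` package layout:

```
# tests/import_test.py (hypothetical): succeeds only because pytest.ini
# put `src` on sys.path before test collection.
import marcin_project

def test_package_importable():
    assert marcin_project.__name__ == "marcin_project"
```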
requirements-dev.txt
@@ -0,0 +1,26 @@
## requirements-dev.txt: dependencies for local development.
##
## For defining dependencies used by jobs in Databricks Workflows, see
## https://docs.databricks.com/dev-tools/bundles/library-dependencies.html

## pytest is the default package used for testing
pytest

## Dependencies for building wheel files
setuptools
wheel

## databricks-connect can be used to run parts of this project locally.
## See https://docs.databricks.com/dev-tools/databricks-connect.html.
##
## databricks-connect is automatically installed if you're using the Databricks
## extension for Visual Studio Code
## (https://docs.databricks.com/dev-tools/vscode-ext/dev-tasks/databricks-connect.html).
##
## To install databricks-connect manually, either follow the instructions
## at https://docs.databricks.com/dev-tools/databricks-connect.html
## to install the package system-wide, or use the pinned line below to install a
## version of databricks-connect that corresponds to the Databricks Runtime version
## used for this project.
#
databricks-connect>=13.3,<13.4
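With databricks-connect installed, code from this project executes against a remote cluster from a local Python process. A minimal sketch, assuming authentication has been set up (for example via `databricks configure`, as in the README) and that the workspace has the standard sample dataset:

```
from databricks.connect import DatabricksSession

# Build a Spark session whose queries run on the remote Databricks cluster.
spark = DatabricksSession.builder.getOrCreate()

# `samples.nyctaxi.trips` is the sample table shipped with Databricks
# workspaces; swap in any table you actually have.
df = spark.read.table("samples.nyctaxi.trips")
df.show(5)
```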
resources/marcin_project_job.yml
@@ -0,0 +1,48 @@
# The main job for marcin_project.
resources:
  jobs:
    marcin_project_job:
      name: marcin_project_job

      schedule:
        # Run every day at 8:37 AM
        quartz_cron_expression: '44 37 8 * * ?'
        timezone_id: Europe/Amsterdam

      email_notifications:
        on_failure:
          - [email protected]

      tasks:
        - task_key: notebook_task
          job_cluster_key: job_cluster
          notebook_task:
            notebook_path: ../src/notebook.ipynb

        - task_key: refresh_pipeline
          depends_on:
            - task_key: notebook_task
          pipeline_task:
            pipeline_id: ${resources.pipelines.marcin_project_pipeline.id}

        - task_key: main_task
          depends_on:
            - task_key: refresh_pipeline
          job_cluster_key: job_cluster
          python_wheel_task:
            package_name: marcin_project
            entry_point: main
          libraries:
            # By default we just include the .whl file generated for the marcin_project package.
            # See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
            # for more information on how to add other libraries.
            - whl: ../dist/*.whl

      job_clusters:
        - job_cluster_key: job_cluster
          new_cluster:
            spark_version: 13.3.x-scala2.12
            node_type_id: Standard_D3_v2
            autoscale:
              min_workers: 1
              max_workers: 4
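The `main_task` above invokes whatever function the wheel registers under the entry point name `main`. A sketch of what `src/marcin_project/main.py` might look like; only the package name and entry point name come from the job definition, while the body (and the sample table) is an assumption based on the default template:

```
from pyspark.sql import SparkSession

def get_taxis():
    # On the job cluster an active session already exists; getOrCreate reuses it.
    spark = SparkSession.builder.getOrCreate()
    return spark.read.table("samples.nyctaxi.trips")

def main():
    # The package metadata must register this function as the entry point
    # named `main`, which is what `python_wheel_task.entry_point: main` calls.
    get_taxis().show(5)

if __name__ == "__main__":
    main()
```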