diff --git a/acceptance/acceptance_test.go b/acceptance/acceptance_test.go index 2dfe2bce47..5afab5d276 100644 --- a/acceptance/acceptance_test.go +++ b/acceptance/acceptance_test.go @@ -8,6 +8,7 @@ import ( "os" "os/exec" "path/filepath" + "regexp" "runtime" "slices" "sort" @@ -393,6 +394,14 @@ func CopyDir(src, dst string, inputs, outputs map[string]bool) error { } func ListDir(t *testing.T, src string) ([]string, error) { + // Directories listed in the generated .gitignore are skipped during comparison; every pattern is anchored (^...$) so it must match the whole directory name, not a substring of it. + ignoredFolders := []string{ + "^\\.ruff_cache$", + "^\\.venv$", + "^.*\\.egg-info$", + "^__pycache__$", + } + var files []string err := filepath.Walk(src, func(path string, info os.FileInfo, err error) error { if err != nil { @@ -400,6 +409,12 @@ func ListDir(t *testing.T, src string) ([]string, error) { } if info.IsDir() { + for _, ignoredFolder := range ignoredFolders { + if matched, _ := regexp.MatchString(ignoredFolder, info.Name()); matched { + return filepath.SkipDir + } + } + return nil } diff --git a/acceptance/bundle/init/experimental-jobs-as-code/input.json b/acceptance/bundle/templates/experimental-jobs-as-code/input.json similarity index 100% rename from acceptance/bundle/init/experimental-jobs-as-code/input.json rename to acceptance/bundle/templates/experimental-jobs-as-code/input.json diff --git a/acceptance/bundle/init/experimental-jobs-as-code/output.txt b/acceptance/bundle/templates/experimental-jobs-as-code/output.txt similarity index 98% rename from acceptance/bundle/init/experimental-jobs-as-code/output.txt rename to acceptance/bundle/templates/experimental-jobs-as-code/output.txt index b20c894a6a..1aa8a94d55 100644 --- a/acceptance/bundle/init/experimental-jobs-as-code/output.txt +++ b/acceptance/bundle/templates/experimental-jobs-as-code/output.txt @@ -1,5 +1,5 @@ ->>> $CLI bundle init experimental-jobs-as-code --config-file ./input.json +>>> $CLI bundle init experimental-jobs-as-code --config-file ./input.json --output-dir output Welcome to (EXPERIMENTAL) "Jobs as code" 
template for Databricks Asset Bundles! Workspace to use (auto-detected, edit in 'my_jobs_as_code/databricks.yml'): $DATABRICKS_URL diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/.gitignore b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/.gitignore new file mode 100644 index 0000000000..0dab7f4995 --- /dev/null +++ b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/.gitignore @@ -0,0 +1,8 @@ +.databricks/ +build/ +dist/ +__pycache__/ +*.egg-info +.venv/ +scratch/** +!scratch/README.md diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/README.md b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/README.md new file mode 100644 index 0000000000..8c429c6e53 --- /dev/null +++ b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/README.md @@ -0,0 +1,58 @@ +# my_jobs_as_code + +The 'my_jobs_as_code' project was generated by using the "Jobs as code" template. + +## Prerequisites + +1. Install Databricks CLI 0.238 or later. + See [Install or update the Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/install.html). + +2. Install uv. See [Installing uv](https://docs.astral.sh/uv/getting-started/installation/). + We use uv to create a virtual environment and install the required dependencies. + +3. Authenticate to your Databricks workspace if you have not done so already: + ``` + $ databricks configure + ``` + +4. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from + https://docs.databricks.com/dev-tools/vscode-ext.html. Or read the "getting started" documentation for + **Databricks Connect** for instructions on running the included Python code from a different IDE. + +5. 
For documentation on the Databricks Asset Bundles format used + for this project, and for CI/CD configuration, see + https://docs.databricks.com/dev-tools/bundles/index.html. + +## Deploy and run jobs + +1. Create a new virtual environment and install the required dependencies: + ``` + $ uv sync + ``` + +2. To deploy the bundle to the development target: + ``` + $ databricks bundle deploy --target dev + ``` + + *(Note that "dev" is the default target, so the `--target` parameter is optional here.)* + + This deploys everything that's defined for this project. + For example, the default template would deploy a job called + `[dev yourname] my_jobs_as_code_job` to your workspace. + You can find that job by opening your workspace and clicking on **Workflows**. + +3. Similarly, to deploy a production copy, type: + ``` + $ databricks bundle deploy --target prod + ``` + + Note that the default job from the template has a schedule that runs every day + (defined in resources/my_jobs_as_code_job.py). The schedule + is paused when deploying in development mode (see [Databricks Asset Bundle deployment modes]( + https://docs.databricks.com/dev-tools/bundles/deployment-modes.html)). + +4. To run a job: + ``` + $ databricks bundle run + ``` diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/databricks.yml b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/databricks.yml new file mode 100644 index 0000000000..fd87aa381c --- /dev/null +++ b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/databricks.yml @@ -0,0 +1,48 @@ +# This is a Databricks asset bundle definition for my_jobs_as_code. +# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. +bundle: + name: my_jobs_as_code + uuid: + +experimental: + python: + # Activate virtual environment before loading resources defined in Python. 
+ # If disabled, defaults to using the Python interpreter available in the current shell. + venv_path: .venv + # Functions called to load resources defined in Python. See resources/__init__.py + resources: + - "resources:load_resources" + +artifacts: + default: + type: whl + path: . + # We use timestamp as Local version identifier (https://peps.python.org/pep-0440/#local-version-identifiers.) + # to ensure that changes to wheel package are picked up when used on all-purpose clusters + build: LOCAL_VERSION=$(date +%Y%m%d.%H%M%S) uv build + +include: + - resources/*.yml + +targets: + dev: + # The default target uses 'mode: development' to create a development copy. + # - Deployed resources get prefixed with '[dev my_user_name]' + # - Any job schedules and triggers are paused by default. + # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html. + mode: development + default: true + workspace: + host: $DATABRICKS_URL + + prod: + mode: production + workspace: + host: $DATABRICKS_URL + # We explicitly specify /Workspace/Users/$USERNAME to make sure we only have a single copy. + root_path: /Workspace/Users/$USERNAME/.bundle/${bundle.name}/${bundle.target} + permissions: + - user_name: $USERNAME + level: CAN_MANAGE + run_as: + user_name: $USERNAME diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/fixtures/.gitkeep b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/fixtures/.gitkeep new file mode 100644 index 0000000000..fa25d2745e --- /dev/null +++ b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/fixtures/.gitkeep @@ -0,0 +1,22 @@ +# Fixtures + +This folder is reserved for fixtures, such as CSV files. 
+ +Below is an example of how to load fixtures as a data frame: + +``` +import pandas as pd +import os + +def get_absolute_path(*relative_parts): + if 'dbutils' in globals(): + base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore + path = os.path.normpath(os.path.join(base_dir, *relative_parts)) + return path if path.startswith("/Workspace") else "/Workspace" + path + else: + return os.path.join(*relative_parts) + +csv_file = get_absolute_path("..", "fixtures", "mycsv.csv") +df = pd.read_csv(csv_file) +display(df) +``` diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/pyproject.toml b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/pyproject.toml new file mode 100644 index 0000000000..28240e3ec4 --- /dev/null +++ b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/pyproject.toml @@ -0,0 +1,49 @@ +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "my_jobs_as_code" +requires-python = ">=3.10" +description = "wheel file based on my_jobs_as_code" + +# Dependencies in case the output wheel file is used as a library dependency. +# For defining dependencies, when this package is used in Databricks, see: +# https://docs.databricks.com/dev-tools/bundles/library-dependencies.html +# +# Example: +# dependencies = [ +# "requests==x.y.z", +# ] +dependencies = [ +] + +# see setup.py +dynamic = ["version"] + +[project.entry-points.packages] +main = "my_jobs_as_code.main:main" + +[tool.setuptools] +py-modules = ["resources", "my_jobs_as_code"] + +[tool.uv] +## Dependencies for local development +dev-dependencies = [ + "databricks-bundles==0.7.0", + + ## Add code completion support for DLT + # "databricks-dlt", + + ## databricks-connect can be used to run parts of this project locally. 
+ ## See https://docs.databricks.com/dev-tools/databricks-connect.html. + ## + ## Uncomment line below to install a version of db-connect that corresponds to + ## the Databricks Runtime version used for this project. + # "databricks-connect>=15.4,<15.5", +] + +override-dependencies = [ + # pyspark package conflicts with 'databricks-connect' + "pyspark; sys_platform == 'never'", +] diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/resources/__init__.py b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/resources/__init__.py new file mode 100644 index 0000000000..fbcb9dc5f0 --- /dev/null +++ b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/resources/__init__.py @@ -0,0 +1,16 @@ +from databricks.bundles.core import ( + Bundle, + Resources, + load_resources_from_current_package_module, +) + + +def load_resources(bundle: Bundle) -> Resources: + """ + 'load_resources' function is referenced in databricks.yml and is responsible for loading + bundle resources defined in Python code. This function is called by Databricks CLI during + bundle deployment. After deployment, this function is not used. + """ + + # the default implementation loads all Python files in 'resources' directory + return load_resources_from_current_package_module() diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/resources/my_jobs_as_code_job.py b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/resources/my_jobs_as_code_job.py new file mode 100644 index 0000000000..4854d656f0 --- /dev/null +++ b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/resources/my_jobs_as_code_job.py @@ -0,0 +1,67 @@ +from databricks.bundles.jobs import Job + +""" +The main job for my_jobs_as_code. 
+""" + + +my_jobs_as_code_job = Job.from_dict( + { + "name": "my_jobs_as_code_job", + "trigger": { + # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger + "periodic": { + "interval": 1, + "unit": "DAYS", + }, + }, + "email_notifications": { + "on_failure": [ + "$USERNAME", + ], + }, + "tasks": [ + { + "task_key": "notebook_task", + "job_cluster_key": "job_cluster", + "notebook_task": { + "notebook_path": "src/notebook.ipynb", + }, + }, + { + "task_key": "main_task", + "depends_on": [ + { + "task_key": "notebook_task", + }, + ], + "job_cluster_key": "job_cluster", + "python_wheel_task": { + "package_name": "my_jobs_as_code", + "entry_point": "main", + }, + "libraries": [ + # By default we just include the .whl file generated for the my_jobs_as_code package. + # See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html + # for more information on how to add other libraries. + { + "whl": "dist/*.whl", + }, + ], + }, + ], + "job_clusters": [ + { + "job_cluster_key": "job_cluster", + "new_cluster": { + "spark_version": "15.4.x-scala2.12", + "node_type_id": "i3.xlarge", + "autoscale": { + "min_workers": 1, + "max_workers": 4, + }, + }, + }, + ], + } +) diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/scratch/README.md b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/scratch/README.md new file mode 100644 index 0000000000..e6cfb81b46 --- /dev/null +++ b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/scratch/README.md @@ -0,0 +1,4 @@ +# scratch + +This folder is reserved for personal, exploratory notebooks. +By default these are not committed to Git, as 'scratch' is listed in .gitignore. 
diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/setup.py b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/setup.py new file mode 100644 index 0000000000..ba284ba828 --- /dev/null +++ b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/setup.py @@ -0,0 +1,18 @@ +""" +setup.py configuration script describing how to build and package this project. + +This file is primarily used by the setuptools library and typically should not +be executed directly. See README.md for how to deploy, test, and run +the my_jobs_as_code project. +""" + +import os + +from setuptools import setup + +local_version = os.getenv("LOCAL_VERSION") +version = "0.0.1" + +setup( + version=f"{version}+{local_version}" if local_version else version, +) diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/src/my_jobs_as_code/__init__.py b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/src/my_jobs_as_code/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/src/my_jobs_as_code/main.py b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/src/my_jobs_as_code/main.py new file mode 100644 index 0000000000..5ae344c7e2 --- /dev/null +++ b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/src/my_jobs_as_code/main.py @@ -0,0 +1,25 @@ +from pyspark.sql import SparkSession, DataFrame + + +def get_taxis(spark: SparkSession) -> DataFrame: + return spark.read.table("samples.nyctaxi.trips") + + +# Create a new Databricks Connect session. If this fails, +# check that you have configured Databricks Connect correctly. +# See https://docs.databricks.com/dev-tools/databricks-connect.html. 
+def get_spark() -> SparkSession: + try: + from databricks.connect import DatabricksSession + + return DatabricksSession.builder.getOrCreate() + except ImportError: + return SparkSession.builder.getOrCreate() + + +def main(): + get_taxis(get_spark()).show(5) + + +if __name__ == "__main__": + main() diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/src/notebook.ipynb b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/src/notebook.ipynb new file mode 100644 index 0000000000..9bc3f15601 --- /dev/null +++ b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/src/notebook.ipynb @@ -0,0 +1,75 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Default notebook\n", + "\n", + "This default notebook is executed using Databricks Workflows as defined in resources/my_jobs_as_code.job.yml." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from my_jobs_as_code import main\n", + "\n", + "main.get_taxis(spark).show(10)" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "notebook", + "widgets": {} + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/tests/main_test.py b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/tests/main_test.py new file mode 100644 index 0000000000..13e100ee2e --- /dev/null +++ b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/tests/main_test.py @@ -0,0 +1,8 @@ +from my_jobs_as_code.main import get_taxis, get_spark + +# running tests requires installing databricks-connect, e.g. 
by uncommenting it in pyproject.toml + + +def test_main(): + taxis = get_taxis(get_spark()) + assert taxis.count() > 5 diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/uv.lock b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/uv.lock new file mode 100644 index 0000000000..14491a34d9 --- /dev/null +++ b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/uv.lock @@ -0,0 +1,28 @@ +version = 1 +requires-python = ">=3.10" + +[manifest] +overrides = [{ name = "pyspark", marker = "sys_platform == 'never'" }] + +[[package]] +name = "databricks-bundles" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2e/55/5ccd397771973d23044997ad2555523c107ce1346f5d16f4bffd4ab5a23d/databricks_bundles-0.7.0-py3-none-any.whl", hash = "sha256:9bc7de987c869380f17a1c8e4a684827105f57c1fb6cd01223818548010a33c4", size = 97620 }, +] + +[[package]] +name = "my-jobs-as-code" +version = "0.0.1" +source = { editable = "." 
} + +[package.dev-dependencies] +dev = [ + { name = "databricks-bundles" }, +] + +[package.metadata] + +[package.metadata.requires-dev] +dev = [{ name = "databricks-bundles", specifier = "==0.7.0" }] diff --git a/acceptance/bundle/init/experimental-jobs-as-code/script b/acceptance/bundle/templates/experimental-jobs-as-code/script similarity index 69% rename from acceptance/bundle/init/experimental-jobs-as-code/script rename to acceptance/bundle/templates/experimental-jobs-as-code/script index 867907d62f..15001c926a 100644 --- a/acceptance/bundle/init/experimental-jobs-as-code/script +++ b/acceptance/bundle/templates/experimental-jobs-as-code/script @@ -1,11 +1,11 @@ -trace $CLI bundle init experimental-jobs-as-code --config-file ./input.json +trace $CLI bundle init experimental-jobs-as-code --config-file ./input.json --output-dir output -cd my_jobs_as_code +cd output/my_jobs_as_code # silence uv output because it's non-deterministic uv sync 2> /dev/null # remove version constraint because it always creates a warning on dev builds -sed -i'' -e "s/databricks_cli_version: .*//" databricks.yml +sed -i.bak -e "/^ databricks_cli_version: .*$/d" databricks.yml && rm -f databricks.yml.bak # -i with an attached suffix is accepted by both GNU and BSD sed -trace $CLI bundle validate -t dev --output json | jq ".resources" +trace $CLI bundle validate -t dev --output json | jq ".resources" \ No newline at end of file diff --git a/acceptance/bundle/init/experimental-jobs-as-code/script.cleanup b/acceptance/bundle/templates/experimental-jobs-as-code/script.cleanup similarity index 100% rename from acceptance/bundle/init/experimental-jobs-as-code/script.cleanup rename to acceptance/bundle/templates/experimental-jobs-as-code/script.cleanup diff --git a/libs/template/templates/experimental-jobs-as-code/template/{{.project_name}}/databricks.yml.tmpl b/libs/template/templates/experimental-jobs-as-code/template/{{.project_name}}/databricks.yml.tmpl index 29a4690fad..758ec3f160 100644 --- 
a/libs/template/templates/experimental-jobs-as-code/template/{{.project_name}}/databricks.yml.tmpl +++ b/libs/template/templates/experimental-jobs-as-code/template/{{.project_name}}/databricks.yml.tmpl @@ -14,7 +14,7 @@ experimental: resources: - "resources:load_resources" -{{ if .include_python }} +{{ if .include_python -}} artifacts: default: type: whl @@ -23,7 +23,7 @@ artifacts: # to ensure that changes to wheel package are picked up when used on all-purpose clusters build: LOCAL_VERSION=$(date +%Y%m%d.%H%M%S) uv build -{{ end }} +{{ end -}} include: - resources/*.yml