Update tests and fix a few minor problems

databricks · Jan 17, 2025 · 5fa0e99 · 5fa0e99
1 parent 23505cd
commit 5fa0e99
Show file tree

Hide file tree

Showing 20 changed files with 448 additions and 7 deletions.
diff --git a/acceptance/acceptance_test.go b/acceptance/acceptance_test.go
@@ -8,6 +8,7 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
+	"regexp"
 	"runtime"
 	"slices"
 	"sort"
@@ -393,13 +394,27 @@ func CopyDir(src, dst string, inputs, outputs map[string]bool) error {
 }
 
 func ListDir(t *testing.T, src string) ([]string, error) {
+	// exclude folders in .gitignore from comparison
+	ignoredFolders := []string{
+		"\\.ruff_cache",
+		"\\.venv",
+		".*\\.egg-info",
+		"__pycache__",
+	}
+
 	var files []string
 	err := filepath.Walk(src, func(path string, info os.FileInfo, err error) error {
 		if err != nil {
 			return err
 		}
 
 		if info.IsDir() {
+			for _, ignoredFolder := range ignoredFolders {
+				if matched, _ := regexp.MatchString(ignoredFolder, info.Name()); matched {
+					return filepath.SkipDir
+				}
+			}
+
 			return nil
 		}
 

diff --git a/...init/experimental-jobs-as-code/input.json → ...ates/experimental-jobs-as-code/input.json b/...init/experimental-jobs-as-code/input.json → ...ates/experimental-jobs-as-code/input.json
diff --git a/...init/experimental-jobs-as-code/output.txt → ...ates/experimental-jobs-as-code/output.txt b/...init/experimental-jobs-as-code/output.txt → ...ates/experimental-jobs-as-code/output.txt
@@ -1,5 +1,5 @@
 
->>> $CLI bundle init experimental-jobs-as-code --config-file ./input.json
+>>> $CLI bundle init experimental-jobs-as-code --config-file ./input.json --output-dir output
 
 Welcome to (EXPERIMENTAL) "Jobs as code" template for Databricks Asset Bundles!
 Workspace to use (auto-detected, edit in 'my_jobs_as_code/databricks.yml'): $DATABRICKS_URL

diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/.gitignore b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/.gitignore
@@ -0,0 +1,8 @@
+.databricks/
+build/
+dist/
+__pycache__/
+*.egg-info
+.venv/
+scratch/**
+!scratch/README.md
diff --git a/...nce/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/README.md b/...nce/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/README.md
@@ -0,0 +1,58 @@
+# my_jobs_as_code
+
+The 'my_jobs_as_code' project was generated by using the "Jobs as code" template.
+
+## Prerequisites
+
+1. Install Databricks CLI 0.238 or later.
+   See [Install or update the Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/install.html).
+
+2. Install uv. See [Installing uv](https://docs.astral.sh/uv/getting-started/installation/).
+   We use uv to create a virtual environment and install the required dependencies.
+
+3. Authenticate to your Databricks workspace if you have not done so already:
+    ```
+    $ databricks configure
+    ```
+
+4. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from
+   https://docs.databricks.com/dev-tools/vscode-ext.html. Or read the "getting started" documentation for
+   **Databricks Connect** for instructions on running the included Python code from a different IDE.
+
+5. For documentation on the Databricks Asset Bundles format used
+   for this project, and for CI/CD configuration, see
+   https://docs.databricks.com/dev-tools/bundles/index.html.
+
+## Deploy and run jobs
+
+1. Create a new virtual environment and install the required dependencies:
+    ```
+    $ uv sync
+    ```
+
+2. To deploy the bundle to the development target:
+    ```
+    $ databricks bundle deploy --target dev
+    ```
+
+   *(Note that "dev" is the default target, so the `--target` parameter is optional here.)*
+
+   This deploys everything that's defined for this project.
+   For example, the default template would deploy a job called
+   `[dev yourname] my_jobs_as_code_job` to your workspace.
+   You can find that job by opening your workspace and clicking on **Workflows**.
+
+3. Similarly, to deploy a production copy, type:
+   ```
+   $ databricks bundle deploy --target prod
+   ```
+
+   Note that the default job from the template has a schedule that runs every day
+   (defined in resources/my_jobs_as_code_job.py). The schedule
+   is paused when deploying in development mode (see [Databricks Asset Bundle deployment modes](
+   https://docs.databricks.com/dev-tools/bundles/deployment-modes.html)).
+
+4. To run a job:
+   ```
+   $ databricks bundle run
+   ```
diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/databricks.yml b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/databricks.yml
@@ -0,0 +1,48 @@
+# This is a Databricks asset bundle definition for my_jobs_as_code.
+# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
+bundle:
+  name: my_jobs_as_code
+  uuid: <UUID>
+
+experimental:
+  python:
+    # Activate virtual environment before loading resources defined in Python.
+    # If disabled, defaults to using the Python interpreter available in the current shell.
+    venv_path: .venv
+    # Functions called to load resources defined in Python. See resources/__init__.py
+    resources:
+      - "resources:load_resources"
+
+artifacts:
+  default:
+    type: whl
+    path: .
+    # We use a timestamp as the local version identifier (https://peps.python.org/pep-0440/#local-version-identifiers)
+    # to ensure that changes to the wheel package are picked up when it is used on all-purpose clusters.
+    build: LOCAL_VERSION=$(date +%Y%m%d.%H%M%S) uv build
+
+include:
+  - resources/*.yml
+
+targets:
+  dev:
+    # The default target uses 'mode: development' to create a development copy.
+    # - Deployed resources get prefixed with '[dev my_user_name]'
+    # - Any job schedules and triggers are paused by default.
+    # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html.
+    mode: development
+    default: true
+    workspace:
+      host: $DATABRICKS_URL
+
+  prod:
+    mode: production
+    workspace:
+      host: $DATABRICKS_URL
+      # We explicitly specify /Workspace/Users/$USERNAME to make sure we only have a single copy.
+      root_path: /Workspace/Users/$USERNAME/.bundle/${bundle.name}/${bundle.target}
+    permissions:
+      - user_name: $USERNAME
+        level: CAN_MANAGE
+    run_as:
+      user_name: $USERNAME
diff --git a/...tance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/fixtures/.gitkeep b/...tance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/fixtures/.gitkeep
@@ -0,0 +1,22 @@
+# Fixtures
+
+This folder is reserved for fixtures, such as CSV files.
+
+Below is an example of how to load fixtures as a data frame:
+
+```
+import pandas as pd
+import os
+
+def get_absolute_path(*relative_parts):
+    if 'dbutils' in globals():
+        base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore
+        path = os.path.normpath(os.path.join(base_dir, *relative_parts))
+        return path if path.startswith("/Workspace") else "/Workspace" + path
+    else:
+        return os.path.join(*relative_parts)
+
+csv_file = get_absolute_path("..", "fixtures", "mycsv.csv")
+df = pd.read_csv(csv_file)
+display(df)
+```
diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/pyproject.toml b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/pyproject.toml
@@ -0,0 +1,49 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "my_jobs_as_code"
+requires-python = ">=3.10"
+description = "wheel file based on my_jobs_as_code"
+
+# Dependencies in case the output wheel file is used as a library dependency.
+# For defining dependencies, when this package is used in Databricks, see:
+# https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
+#
+# Example:
+# dependencies = [
+#     "requests==x.y.z",
+# ]
+dependencies = [
+]
+
+# see setup.py
+dynamic = ["version"]
+
+[project.entry-points.packages]
+main = "my_jobs_as_code.main:main"
+
+[tool.setuptools]
+py-modules = ["resources", "my_jobs_as_code"]
+
+[tool.uv]
+## Dependencies for local development
+dev-dependencies = [
+    "databricks-bundles==0.7.0",
+
+    ## Add code completion support for DLT
+    # "databricks-dlt",
+
+    ## databricks-connect can be used to run parts of this project locally.
+    ## See https://docs.databricks.com/dev-tools/databricks-connect.html.
+    ##
+    ## Uncomment line below to install a version of db-connect that corresponds to
+    ## the Databricks Runtime version used for this project.
+    # "databricks-connect>=15.4,<15.5",
+]
+
+override-dependencies = [
+    # pyspark package conflicts with 'databricks-connect'
+    "pyspark; sys_platform == 'never'",
+]
diff --git a/...e/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/resources/__init__.py b/...e/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/resources/__init__.py
@@ -0,0 +1,16 @@
+from databricks.bundles.core import (
+    Bundle,
+    Resources,
+    load_resources_from_current_package_module,
+)
+
+
+def load_resources(bundle: Bundle) -> Resources:
+    """
+    'load_resources' function is referenced in databricks.yml and is responsible for loading
+    bundle resources defined in Python code. This function is called by Databricks CLI during
+    bundle deployment. After deployment, this function is not used.
+    """
+
+    # The default implementation loads all Python files in the 'resources' directory.
+    return load_resources_from_current_package_module()
diff --git a/...mplates/experimental-jobs-as-code/output/my_jobs_as_code/resources/my_jobs_as_code_job.py b/...mplates/experimental-jobs-as-code/output/my_jobs_as_code/resources/my_jobs_as_code_job.py
@@ -0,0 +1,67 @@
+from databricks.bundles.jobs import Job
+
+"""
+The main job for my_jobs_as_code.
+"""
+
+
+my_jobs_as_code_job = Job.from_dict(
+    {
+        "name": "my_jobs_as_code_job",
+        "trigger": {
+            # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger
+            "periodic": {
+                "interval": 1,
+                "unit": "DAYS",
+            },
+        },
+        "email_notifications": {
+            "on_failure": [
+                "$USERNAME",
+            ],
+        },
+        "tasks": [
+            {
+                "task_key": "notebook_task",
+                "job_cluster_key": "job_cluster",
+                "notebook_task": {
+                    "notebook_path": "src/notebook.ipynb",
+                },
+            },
+            {
+                "task_key": "main_task",
+                "depends_on": [
+                    {
+                        "task_key": "notebook_task",
+                    },
+                ],
+                "job_cluster_key": "job_cluster",
+                "python_wheel_task": {
+                    "package_name": "my_jobs_as_code",
+                    "entry_point": "main",
+                },
+                "libraries": [
+                    # By default we just include the .whl file generated for the my_jobs_as_code package.
+                    # See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
+                    # for more information on how to add other libraries.
+                    {
+                        "whl": "dist/*.whl",
+                    },
+                ],
+            },
+        ],
+        "job_clusters": [
+            {
+                "job_cluster_key": "job_cluster",
+                "new_cluster": {
+                    "spark_version": "15.4.x-scala2.12",
+                    "node_type_id": "i3.xlarge",
+                    "autoscale": {
+                        "min_workers": 1,
+                        "max_workers": 4,
+                    },
+                },
+            },
+        ],
+    }
+)
diff --git a/...le/templates/experimental-jobs-as-code/output/my_jobs_as_code/scratch/README.md b/...le/templates/experimental-jobs-as-code/output/my_jobs_as_code/scratch/README.md
@@ -0,0 +1,4 @@
+# scratch
+
+This folder is reserved for personal, exploratory notebooks.
+By default these are not committed to Git, as 'scratch' is listed in .gitignore.
diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/setup.py b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/setup.py
@@ -0,0 +1,18 @@
+"""
+setup.py configuration script describing how to build and package this project.
+
+This file is primarily used by the setuptools library and typically should not
+be executed directly. See README.md for how to deploy, test, and run
+the my_jobs_as_code project.
+"""
+
+import os
+
+from setuptools import setup
+
+local_version = os.getenv("LOCAL_VERSION")
+version = "0.0.1"
+
+setup(
+    version=f"{version}+{local_version}" if local_version else version,
+)
diff --git a/...emplates/experimental-jobs-as-code/output/my_jobs_as_code/src/my_jobs_as_code/__init__.py b/...emplates/experimental-jobs-as-code/output/my_jobs_as_code/src/my_jobs_as_code/__init__.py
diff --git a/...le/templates/experimental-jobs-as-code/output/my_jobs_as_code/src/my_jobs_as_code/main.py b/...le/templates/experimental-jobs-as-code/output/my_jobs_as_code/src/my_jobs_as_code/main.py
@@ -0,0 +1,25 @@
+from pyspark.sql import SparkSession, DataFrame
+
+
+def get_taxis(spark: SparkSession) -> DataFrame:
+    return spark.read.table("samples.nyctaxi.trips")
+
+
+# Create a new Databricks Connect session. If this fails,
+# check that you have configured Databricks Connect correctly.
+# See https://docs.databricks.com/dev-tools/databricks-connect.html.
+def get_spark() -> SparkSession:
+    try:
+        from databricks.connect import DatabricksSession
+
+        return DatabricksSession.builder.getOrCreate()
+    except ImportError:
+        return SparkSession.builder.getOrCreate()
+
+
+def main():
+    get_taxis(get_spark()).show(5)
+
+
+if __name__ == "__main__":
+    main()