Skip to content

Commit

Permalink
Update tests and fix few minor problems
Browse files Browse the repository at this point in the history
  • Loading branch information
kanterov committed Jan 17, 2025
1 parent 23505cd commit 5fa0e99
Show file tree
Hide file tree
Showing 20 changed files with 448 additions and 7 deletions.
15 changes: 15 additions & 0 deletions acceptance/acceptance_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"os"
"os/exec"
"path/filepath"
"regexp"
"runtime"
"slices"
"sort"
Expand Down Expand Up @@ -393,13 +394,27 @@ func CopyDir(src, dst string, inputs, outputs map[string]bool) error {
}

func ListDir(t *testing.T, src string) ([]string, error) {
// exclude folders in .gitignore from comparison
ignoredFolders := []string{
"\\.ruff_cache",
"\\.venv",
".*\\.egg-info",
"__pycache__",
}

var files []string
err := filepath.Walk(src, func(path string, info os.FileInfo, err error) error {
if err != nil {
return err
}

if info.IsDir() {
for _, ignoredFolder := range ignoredFolders {
if matched, _ := regexp.MatchString(ignoredFolder, info.Name()); matched {
return filepath.SkipDir
}
}

return nil
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

>>> $CLI bundle init experimental-jobs-as-code --config-file ./input.json
>>> $CLI bundle init experimental-jobs-as-code --config-file ./input.json --output-dir output

Welcome to (EXPERIMENTAL) "Jobs as code" template for Databricks Asset Bundles!
Workspace to use (auto-detected, edit in 'my_jobs_as_code/databricks.yml'): $DATABRICKS_URL
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
.databricks/
build/
dist/
__pycache__/
*.egg-info
.venv/
scratch/**
!scratch/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# my_jobs_as_code

The 'my_jobs_as_code' project was generated by using the "Jobs as code" template.

## Prerequisites

1. Install the Databricks CLI, version 0.238 or later.
See [Install or update the Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/install.html).

2. Install uv. See [Installing uv](https://docs.astral.sh/uv/getting-started/installation/).
We use uv to create a virtual environment and install the required dependencies.

3. Authenticate to your Databricks workspace if you have not done so already:
```
$ databricks configure
```
4. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from
https://docs.databricks.com/dev-tools/vscode-ext.html. Or read the "getting started" documentation for
**Databricks Connect** for instructions on running the included Python code from a different IDE.
5. For documentation on the Databricks Asset Bundles format used
for this project, and for CI/CD configuration, see
https://docs.databricks.com/dev-tools/bundles/index.html.
## Deploy and run jobs
1. Create a new virtual environment and install the required dependencies:
```
$ uv sync
```
2. To deploy the bundle to the development target:
```
$ databricks bundle deploy --target dev
```
*(Note that "dev" is the default target, so the `--target` parameter is optional here.)*
This deploys everything that's defined for this project.
For example, the default template would deploy a job called
`[dev yourname] my_jobs_as_code_job` to your workspace.
You can find that job by opening your workspace and clicking on **Workflows**.
3. Similarly, to deploy a production copy, type:
```
$ databricks bundle deploy --target prod
```
Note that the default job from the template has a schedule that runs every day
(defined in resources/my_jobs_as_code_job.py). The schedule
is paused when deploying in development mode (see [Databricks Asset Bundle deployment modes](
https://docs.databricks.com/dev-tools/bundles/deployment-modes.html)).
4. To run a job:
```
$ databricks bundle run
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# This is a Databricks asset bundle definition for my_jobs_as_code.
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
bundle:
name: my_jobs_as_code
uuid: <UUID>

experimental:
python:
# Activate virtual environment before loading resources defined in Python.
# If disabled, defaults to using the Python interpreter available in the current shell.
venv_path: .venv
# Functions called to load resources defined in Python. See resources/__init__.py
resources:
- "resources:load_resources"

artifacts:
default:
type: whl
path: .
# We use a timestamp as the local version identifier (https://peps.python.org/pep-0440/#local-version-identifiers)
# to ensure that changes to the wheel package are picked up when it is used on all-purpose clusters
build: LOCAL_VERSION=$(date +%Y%m%d.%H%M%S) uv build

include:
- resources/*.yml

targets:
dev:
# The default target uses 'mode: development' to create a development copy.
# - Deployed resources get prefixed with '[dev my_user_name]'
# - Any job schedules and triggers are paused by default.
# See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html.
mode: development
default: true
workspace:
host: $DATABRICKS_URL

prod:
mode: production
workspace:
host: $DATABRICKS_URL
# We explicitly specify /Workspace/Users/$USERNAME to make sure we only have a single copy.
root_path: /Workspace/Users/$USERNAME/.bundle/${bundle.name}/${bundle.target}
permissions:
- user_name: $USERNAME
level: CAN_MANAGE
run_as:
user_name: $USERNAME
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Fixtures

This folder is reserved for fixtures, such as CSV files.

Below is an example of how to load fixtures as a data frame:

```
import pandas as pd
import os

def get_absolute_path(*relative_parts):
    """Build the path to a file from the given path components.

    When no ``dbutils`` global exists, the components are simply joined and
    returned as a relative path. When ``dbutils`` is available, the path is
    resolved against the directory of the current notebook and returned with
    a leading ``/Workspace`` prefix.
    """
    # No 'dbutils' global: nothing to resolve against, so stay relative.
    if 'dbutils' not in globals():
        return os.path.join(*relative_parts)

    notebook_path = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()  # type: ignore
    notebook_dir = os.path.dirname(notebook_path)
    resolved = os.path.normpath(os.path.join(notebook_dir, *relative_parts))

    # Only add the prefix when the resolved path does not already carry it.
    if resolved.startswith("/Workspace"):
        return resolved
    return "/Workspace" + resolved

csv_file = get_absolute_path("..", "fixtures", "mycsv.csv")
df = pd.read_csv(csv_file)
display(df)
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "my_jobs_as_code"
requires-python = ">=3.10"
description = "wheel file based on my_jobs_as_code"

# Dependencies in case the output wheel file is used as a library dependency.
# For defining dependencies, when this package is used in Databricks, see:
# https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
#
# Example:
# dependencies = [
# "requests==x.y.z",
# ]
dependencies = [
]

# see setup.py
dynamic = ["version"]

[project.entry-points.packages]
main = "my_jobs_as_code.main:main"

[tool.setuptools]
py-modules = ["resources", "my_jobs_as_code"]

[tool.uv]
## Dependencies for local development
dev-dependencies = [
"databricks-bundles==0.7.0",

## Add code completion support for DLT
# "databricks-dlt",

## databricks-connect can be used to run parts of this project locally.
## See https://docs.databricks.com/dev-tools/databricks-connect.html.
##
## Uncomment line below to install a version of db-connect that corresponds to
## the Databricks Runtime version used for this project.
# "databricks-connect>=15.4,<15.5",
]

override-dependencies = [
# pyspark package conflicts with 'databricks-connect'
"pyspark; sys_platform == 'never'",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from databricks.bundles.core import (
Bundle,
Resources,
load_resources_from_current_package_module,
)


def load_resources(bundle: Bundle) -> Resources:
    """
    Load the bundle resources that are defined in Python code.

    databricks.yml references this function by name, and the Databricks CLI
    calls it during bundle deployment. After deployment, this function is
    not used.
    """

    # Default behaviour: pick up every Python file in the 'resources' directory.
    # The 'bundle' argument is accepted but not consulted by this implementation.
    resources = load_resources_from_current_package_module()
    return resources
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from databricks.bundles.jobs import Job

"""
The main job for my_jobs_as_code.
"""


# Job resource built from a plain dictionary via Job.from_dict.
# The job has two tasks that share a single job cluster: 'notebook_task' runs
# first, and 'main_task' runs after it (see "depends_on" below).
my_jobs_as_code_job = Job.from_dict(
    {
        "name": "my_jobs_as_code_job",
        "trigger": {
            # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger
            "periodic": {
                "interval": 1,
                "unit": "DAYS",
            },
        },
        # Recipients notified when a run fails.
        # NOTE(review): "$USERNAME" looks like a placeholder that is substituted
        # elsewhere -- confirm before editing.
        "email_notifications": {
            "on_failure": [
                "$USERNAME",
            ],
        },
        "tasks": [
            # First task: runs the notebook at src/notebook.ipynb on the shared cluster.
            {
                "task_key": "notebook_task",
                "job_cluster_key": "job_cluster",
                "notebook_task": {
                    "notebook_path": "src/notebook.ipynb",
                },
            },
            # Second task: runs the "main" entry point of the my_jobs_as_code
            # wheel, and only after 'notebook_task'.
            {
                "task_key": "main_task",
                "depends_on": [
                    {
                        "task_key": "notebook_task",
                    },
                ],
                "job_cluster_key": "job_cluster",
                "python_wheel_task": {
                    "package_name": "my_jobs_as_code",
                    "entry_point": "main",
                },
                "libraries": [
                    # By default we just include the .whl file generated for the my_jobs_as_code package.
                    # See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
                    # for more information on how to add other libraries.
                    {
                        "whl": "dist/*.whl",
                    },
                ],
            },
        ],
        # Cluster definition referenced by both tasks through "job_cluster_key".
        "job_clusters": [
            {
                "job_cluster_key": "job_cluster",
                "new_cluster": {
                    "spark_version": "15.4.x-scala2.12",
                    "node_type_id": "i3.xlarge",
                    # Autoscaling bounds: between 1 and 4 workers.
                    "autoscale": {
                        "min_workers": 1,
                        "max_workers": 4,
                    },
                },
            },
        ],
    }
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# scratch

This folder is reserved for personal, exploratory notebooks.
By default these are not committed to Git, as 'scratch' is listed in .gitignore.
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
"""
setup.py configuration script describing how to build and package this project.
This file is primarily used by the setuptools library and typically should not
be executed directly. See README.md for how to deploy, test, and run
the my_jobs_as_code project.
"""

import os

from setuptools import setup

# Base (public) version of the package.
version = "0.0.1"

# Optional local version label read from the LOCAL_VERSION environment
# variable; None (or empty) when the variable is not set.
local_version = os.getenv("LOCAL_VERSION")

# Append the local label as "<version>+<local_version>" only when one was given.
if local_version:
    package_version = f"{version}+{local_version}"
else:
    package_version = version

setup(
    version=package_version,
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from pyspark.sql import SparkSession, DataFrame


def get_taxis(spark: SparkSession) -> DataFrame:
    """Return the ``samples.nyctaxi.trips`` table read through *spark*."""
    reader = spark.read
    return reader.table("samples.nyctaxi.trips")


def get_spark() -> SparkSession:
    """Create a new Databricks Connect session.

    If this fails, check that you have configured Databricks Connect
    correctly. See https://docs.databricks.com/dev-tools/databricks-connect.html.

    When an ImportError is raised while obtaining the Databricks Connect
    session, a regular ``SparkSession`` is created instead.
    """
    try:
        from databricks.connect import DatabricksSession

        session = DatabricksSession.builder.getOrCreate()
    except ImportError:
        # Fall back to a plain Spark session.
        session = SparkSession.builder.getOrCreate()
    return session


def main():
    """Show the first five rows of the taxi trips table."""
    spark = get_spark()
    trips = get_taxis(spark)
    trips.show(5)


if __name__ == "__main__":
    main()
Loading

0 comments on commit 5fa0e99

Please sign in to comment.