diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0dab7f4
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+.databricks/
+build/
+dist/
+__pycache__/
+*.egg-info
+.venv/
+scratch/**
+!scratch/README.md
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..26d3352
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..1ab831e
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,114 @@
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
diff --git a/.idea/marcin_project.iml b/.idea/marcin_project.iml
new file mode 100644
index 0000000..0c80114
--- /dev/null
+++ b/.idea/marcin_project.iml
@@ -0,0 +1,11 @@
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..6bce61c
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,7 @@
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..b28d604
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
diff --git a/.vscode/__builtins__.pyi b/.vscode/__builtins__.pyi
new file mode 100644
index 0000000..0edd518
--- /dev/null
+++ b/.vscode/__builtins__.pyi
@@ -0,0 +1,3 @@
+# Typings for Pylance in Visual Studio Code
+# see https://github.com/microsoft/pyright/blob/main/docs/builtins.md
+from databricks.sdk.runtime import *
diff --git a/.vscode/extensions.json b/.vscode/extensions.json
new file mode 100644
index 0000000..5d15eba
--- /dev/null
+++ b/.vscode/extensions.json
@@ -0,0 +1,7 @@
+{
+  "recommendations": [
+    "databricks.databricks",
+    "ms-python.vscode-pylance",
+    "redhat.vscode-yaml"
+  ]
+}
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..f19498d
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,17 @@
+{
+  "python.analysis.stubPath": ".vscode",
+  "databricks.python.envFile": "${workspaceFolder}/.env",
+  "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\<codecell\\>|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])",
+  "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------",
+  "python.testing.pytestArgs": [
+    "."
+  ],
+  "python.testing.unittestEnabled": false,
+  "python.testing.pytestEnabled": true,
+  "python.analysis.extraPaths": ["src"],
+  "files.exclude": {
+    "**/*.egg-info": true,
+    "**/__pycache__": true,
+    ".pytest_cache": true,
+  },
+}
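The `.vscode/__builtins__.pyi` stub above only teaches Pylance that notebook globals such as `dbutils` exist. Outside a notebook, the same names can be imported explicitly; a minimal sketch, assuming the `databricks-sdk` package is installed and a workspace profile is configured (neither is set up by this diff):

```
# Hypothetical local usage; assumes `pip install databricks-sdk` and ~/.databrickscfg.
from databricks.sdk.runtime import dbutils

# Lists the DBFS root of the configured workspace.
for entry in dbutils.fs.ls("/"):
    print(entry.path)
```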
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..1425a2c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,42 @@
+# marcin_project
+
+The 'marcin_project' project was generated by using the default-python template.
+
+## Getting started
+
+1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html
+
+2. Authenticate to your Databricks workspace:
+   ```
+   $ databricks configure
+   ```
+
+3. To deploy a development copy of this project, type:
+   ```
+   $ databricks bundle deploy --target dev
+   ```
+   (Note that "dev" is the default target, so the `--target` parameter
+   is optional here.)
+
+   This deploys everything that's defined for this project.
+   For example, the default template would deploy a job called
+   `[dev yourname] marcin_project_job` to your workspace.
+   You can find that job by opening your workspace and clicking on **Workflows**.
+
+4. Similarly, to deploy a production copy, type:
+   ```
+   $ databricks bundle deploy --target prod
+   ```
+
+5. To run a job or pipeline, use the "run" command:
+   ```
+   $ databricks bundle run
+   ```
+
+6. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from
+   https://docs.databricks.com/dev-tools/vscode-ext.html. Or read the "getting started" documentation for
+   **Databricks Connect** for instructions on running the included Python code from a different IDE.
+
+7. For documentation on the Databricks asset bundles format used
+   for this project, and for CI/CD configuration, see
+   https://docs.databricks.com/dev-tools/bundles/index.html.
diff --git a/databricks.yml b/databricks.yml
new file mode 100644
index 0000000..55408bc
--- /dev/null
+++ b/databricks.yml
@@ -0,0 +1,41 @@
+# This is a Databricks asset bundle definition for marcin_project.
+# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
+bundle:
+  name: marcin_project
+
+include:
+  - resources/*.yml
+
+targets:
+  # The 'dev' target, for development purposes. This target is the default.
+  dev:
+    # We use 'mode: development' to indicate this is a personal development copy:
+    # - Deployed resources get prefixed with '[dev my_user_name]'
+    # - Any job schedules and triggers are paused by default
+    # - The 'development' mode is used for Delta Live Tables pipelines
+    mode: development
+    default: true
+    workspace:
+      host: https://adb-8870486534760962.2.azuredatabricks.net
+
+  ## Optionally, there could be a 'staging' target here.
+  ## (See Databricks docs on CI/CD at https://docs.databricks.com/dev-tools/bundles/index.html.)
+  #
+  # staging:
+  #   workspace:
+  #     host: https://adb-8870486534760962.2.azuredatabricks.net
+
+  # The 'prod' target, used for production deployment.
+  prod:
+    # We use 'mode: production' to indicate this is a production deployment.
+    # Doing so enables strict verification of the settings below.
+    mode: production
+    workspace:
+      host: https://adb-8870486534760962.2.azuredatabricks.net
+      # We only have a single deployment copy for production, so we use a shared path.
+      root_path: /Shared/.bundle/prod/${bundle.name}
+    run_as:
+      # This runs as wmj1fe@bosch.com in production. We could also use a service principal here
+      # using service_principal_name (see https://docs.databricks.com/dev-tools/bundles/permissions.html).
+      user_name: wmj1fe@bosch.com
+      
\ No newline at end of file
diff --git a/fixtures/.gitkeep b/fixtures/.gitkeep
new file mode 100644
index 0000000..fa25d27
--- /dev/null
+++ b/fixtures/.gitkeep
@@ -0,0 +1,22 @@
+# Fixtures
+
+This folder is reserved for fixtures, such as CSV files.
+
+Below is an example of how to load fixtures as a data frame:
+
+```
+import pandas as pd
+import os
+
+def get_absolute_path(*relative_parts):
+    if 'dbutils' in globals():
+        base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get())  # type: ignore
+        path = os.path.normpath(os.path.join(base_dir, *relative_parts))
+        return path if path.startswith("/Workspace") else "/Workspace" + path
+    else:
+        return os.path.join(*relative_parts)
+
+csv_file = get_absolute_path("..", "fixtures", "mycsv.csv")
+df = pd.read_csv(csv_file)
+display(df)
+```
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..80432c2
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,3 @@
+[pytest]
+testpaths = tests
+pythonpath = src
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 0000000..8053b1e
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1,26 @@
+## requirements-dev.txt: dependencies for local development.
+##
+## For defining dependencies used by jobs in Databricks Workflows, see
+## https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
+
+## pytest is the default package used for testing
+pytest
+
+## Dependencies for building wheel files
+setuptools
+wheel
+
+## databricks-connect can be used to run parts of this project locally.
+## See https://docs.databricks.com/dev-tools/databricks-connect.html.
+##
+## databricks-connect is automatically installed if you're using the Databricks
+## extension for Visual Studio Code
+## (https://docs.databricks.com/dev-tools/vscode-ext/dev-tasks/databricks-connect.html).
+##
+## To install databricks-connect manually, either follow the instructions
+## at https://docs.databricks.com/dev-tools/databricks-connect.html
+## to install the package system-wide, or use the line below to install a
+## version of databricks-connect that corresponds to the Databricks Runtime
+## version used for this project.
+#
+databricks-connect>=13.3,<13.4
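The pinned `databricks-connect` dependency above is what allows running parts of this project locally. A minimal smoke test, assuming a Databricks configuration profile is already set up (this diff does not configure one):

```
from databricks.connect import DatabricksSession

# Builds a Spark Connect session against the configured workspace.
spark = DatabricksSession.builder.getOrCreate()

# The same sample table that src/marcin_project/main.py reads.
print(spark.read.table("samples.nyctaxi.trips").count())
```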
diff --git a/resources/marcin_project_job.yml b/resources/marcin_project_job.yml
new file mode 100644
index 0000000..94f67af
--- /dev/null
+++ b/resources/marcin_project_job.yml
@@ -0,0 +1,48 @@
+# The main job for marcin_project.
+resources:
+  jobs:
+    marcin_project_job:
+      name: marcin_project_job
+
+      schedule:
+        # Run every day at 8:37 AM
+        quartz_cron_expression: '44 37 8 * * ?'
+        timezone_id: Europe/Amsterdam
+
+      email_notifications:
+        on_failure:
+          - wmj1fe@bosch.com
+
+      tasks:
+        - task_key: notebook_task
+          job_cluster_key: job_cluster
+          notebook_task:
+            notebook_path: ../src/notebook.ipynb
+
+        - task_key: refresh_pipeline
+          depends_on:
+            - task_key: notebook_task
+          pipeline_task:
+            pipeline_id: ${resources.pipelines.marcin_project_pipeline.id}
+
+        - task_key: main_task
+          depends_on:
+            - task_key: refresh_pipeline
+          job_cluster_key: job_cluster
+          python_wheel_task:
+            package_name: marcin_project
+            entry_point: main
+          libraries:
+            # By default we just include the .whl file generated for the marcin_project package.
+            # See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
+            # for more information on how to add other libraries.
+            - whl: ../dist/*.whl
+
+      job_clusters:
+        - job_cluster_key: job_cluster
+          new_cluster:
+            spark_version: 13.3.x-scala2.12
+            node_type_id: Standard_D3_v2
+            autoscale:
+              min_workers: 1
+              max_workers: 4
diff --git a/resources/marcin_project_pipeline.yml b/resources/marcin_project_pipeline.yml
new file mode 100644
index 0000000..381cc20
--- /dev/null
+++ b/resources/marcin_project_pipeline.yml
@@ -0,0 +1,12 @@
+# The main pipeline for marcin_project
+resources:
+  pipelines:
+    marcin_project_pipeline:
+      name: marcin_project_pipeline
+      target: marcin_project_${bundle.environment}
+      libraries:
+        - notebook:
+            path: ../src/dlt_pipeline.ipynb
+
+      configuration:
+        bundle.sourcePath: /Workspace/${workspace.file_path}/src
diff --git a/scratch/README.md b/scratch/README.md
new file mode 100644
index 0000000..e6cfb81
--- /dev/null
+++ b/scratch/README.md
@@ -0,0 +1,4 @@
+# scratch
+
+This folder is reserved for personal, exploratory notebooks.
+By default these are not committed to Git, as 'scratch' is listed in .gitignore.
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..0f688f4
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,37 @@
+"""
+setup.py configuration script describing how to build and package this project.
+
+This file is primarily used by the setuptools library and typically should not
+be executed directly. See README.md for how to deploy, test, and run
+the marcin_project project.
+"""
+from setuptools import setup, find_packages
+
+import sys
+sys.path.append('./src')
+
+import datetime
+import marcin_project
+
+setup(
+    name="marcin_project",
+    # We use a timestamp as the local version identifier (https://peps.python.org/pep-0440/#local-version-identifiers)
+    # to ensure that changes to the wheel package are picked up when it is used on all-purpose clusters.
+    version=marcin_project.__version__ + "+" + datetime.datetime.utcnow().strftime("%Y%m%d.%H%M%S"),
+    url="https://databricks.com",
+    author="wmj1fe@bosch.com",
+    description="wheel file based on marcin_project/src",
+    packages=find_packages(where='./src'),
+    package_dir={'': 'src'},
+    entry_points={
+        "packages": [
+            "main=marcin_project.main:main"
+        ]
+    },
+    install_requires=[
+        # Dependencies in case the output wheel file is used as a library dependency.
+        # For defining dependencies, when this package is used in Databricks, see:
+        # https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
+        "setuptools"
+    ],
+)
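The `main` entry point declared in setup.py above (in the non-standard `packages` group) is what the job's `python_wheel_task` refers to. A hedged sketch of how such an entry point can be resolved once the wheel is installed; the exact lookup Databricks performs is not part of this diff:

```
from importlib.metadata import entry_points

# Group and name match the entry_points declaration in setup.py above.
(ep,) = entry_points(group="packages", name="main")
main_fn = ep.load()  # imports marcin_project.main and returns its main()
main_fn()
```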
diff --git a/src/dlt_pipeline.ipynb b/src/dlt_pipeline.ipynb
new file mode 100644
index 0000000..d094a3f
--- /dev/null
+++ b/src/dlt_pipeline.ipynb
@@ -0,0 +1,88 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "9a626959-61c8-4bba-84d2-2a4ecab1f7ec",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "# DLT pipeline\n",
+    "\n",
+    "This Delta Live Tables (DLT) definition is executed using a pipeline defined in resources/marcin_project_pipeline.yml."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 0,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "9198e987-5606-403d-9f6d-8f14e6a4017f",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Import DLT and src/marcin_project\n",
+    "import dlt\n",
+    "import sys\n",
+    "sys.path.append(spark.conf.get(\"bundle.sourcePath\", \".\"))\n",
+    "from pyspark.sql.functions import expr\n",
+    "from marcin_project import main"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 0,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "3fc19dba-61fd-4a89-8f8c-24fee63bfb14",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "@dlt.view\n",
+    "def taxi_raw():\n",
+    "  return main.get_taxis()\n",
+    "\n",
+    "@dlt.table\n",
+    "def filtered_taxis():\n",
+    "  return dlt.read(\"taxi_raw\").filter(expr(\"fare_amount < 30\"))"
+   ]
+  }
+ ],
+ "metadata": {
+  "application/vnd.databricks.v1+notebook": {
+   "dashboards": [],
+   "language": "python",
+   "notebookMetadata": {
+    "pythonIndentUnit": 2
+   },
+   "notebookName": "dlt_pipeline",
+   "widgets": {}
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/src/marcin_project/__init__.py b/src/marcin_project/__init__.py
new file mode 100644
index 0000000..f102a9c
--- /dev/null
+++ b/src/marcin_project/__init__.py
@@ -0,0 +1 @@
+__version__ = "0.0.1"
diff --git a/src/marcin_project/main.py b/src/marcin_project/main.py
new file mode 100644
index 0000000..48a80b0
--- /dev/null
+++ b/src/marcin_project/main.py
@@ -0,0 +1,11 @@
+from pyspark.sql import SparkSession
+
+def get_taxis():
+  spark = SparkSession.builder.getOrCreate()
+  return spark.read.table("samples.nyctaxi.trips")
+
+def main():
+  get_taxis().show(5)
+
+if __name__ == '__main__':
+  main()
diff --git a/src/notebook.ipynb b/src/notebook.ipynb
new file mode 100644
index 0000000..2b04eb7
--- /dev/null
+++ b/src/notebook.ipynb
@@ -0,0 +1,65 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "ee353e42-ff58-4955-9608-12865bd0950e",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "# Default notebook\n",
+    "\n",
+    "This default notebook is executed using Databricks Workflows as defined in resources/marcin_project_job.yml."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 0,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from marcin_project import main\n",
+    "\n",
+    "main.get_taxis().show(10)"
+   ]
+  }
+ ],
+ "metadata": {
+  "application/vnd.databricks.v1+notebook": {
+   "dashboards": [],
+   "language": "python",
+   "notebookMetadata": {
+    "pythonIndentUnit": 2
+   },
+   "notebookName": "notebook",
+   "widgets": {}
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/src/notebook_test.ipynb b/src/notebook_test.ipynb
new file mode 100644
index 0000000..2b04eb7
--- /dev/null
+++ b/src/notebook_test.ipynb
@@ -0,0 +1,65 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "ee353e42-ff58-4955-9608-12865bd0950e",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "# Default notebook\n",
+    "\n",
+    "This default notebook is executed using Databricks Workflows as defined in resources/marcin_project_job.yml."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 0,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from marcin_project import main\n",
+    "\n",
+    "main.get_taxis().show(10)"
+   ]
+  }
+ ],
+ "metadata": {
+  "application/vnd.databricks.v1+notebook": {
+   "dashboards": [],
+   "language": "python",
+   "notebookMetadata": {
+    "pythonIndentUnit": 2
+   },
+   "notebookName": "notebook",
+   "widgets": {}
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/tests/main_test.py b/tests/main_test.py
new file mode 100644
index 0000000..cb4c112
--- /dev/null
+++ b/tests/main_test.py
@@ -0,0 +1,14 @@
+from databricks.connect import DatabricksSession
+from pyspark.sql import SparkSession
+from marcin_project import main
+
+# Create a new Databricks Connect session. If this fails,
+# check that you have configured Databricks Connect correctly.
+# See https://docs.databricks.com/dev-tools/databricks-connect.html.
+
+SparkSession.builder = DatabricksSession.builder
+SparkSession.builder.getOrCreate()
+
+def test_main():
+    taxis = main.get_taxis()
+    assert taxis.count() > 5
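The test above needs a live workspace through Databricks Connect. A possible fully local variant, assuming plain `pyspark` is installed instead of the `databricks-connect` build (the two conflict in one environment); it mirrors the `fare_amount` filter from src/dlt_pipeline.ipynb rather than calling `main.get_taxis()`:

```
from pyspark.sql import SparkSession

def test_fare_filter_locally():
    # Plain local Spark session; no workspace required.
    spark = SparkSession.builder.master("local[1]").getOrCreate()
    df = spark.createDataFrame([(12.5,), (45.0,)], ["fare_amount"])
    # Same predicate as filtered_taxis() in src/dlt_pipeline.ipynb.
    assert df.filter("fare_amount < 30").count() == 1
```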