diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0dab7f4
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+.databricks/
+build/
+dist/
+__pycache__/
+*.egg-info
+.venv/
+scratch/**
+!scratch/README.md
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..26d3352
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..1ab831e
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,114 @@
+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+  </profile>
+</component>
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
\ No newline at end of file
diff --git a/.idea/marcin_project.iml b/.idea/marcin_project.iml
new file mode 100644
index 0000000..0c80114
--- /dev/null
+++ b/.idea/marcin_project.iml
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$">
+      <excludeFolder url="file://$MODULE_DIR$/.venv" />
+    </content>
+    <orderEntry type="jdk" jdkName="Python 3.11" jdkType="Python SDK" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..6bce61c
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="MarkdownSettingsMigration">
+    <option name="stateVersion" value="1" />
+  </component>
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.11" project-jdk-type="Python SDK" />
+</project>
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..b28d604
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/marcin_project.iml" filepath="$PROJECT_DIR$/.idea/marcin_project.iml" />
+    </modules>
+  </component>
+</project>
\ No newline at end of file
diff --git a/.vscode/__builtins__.pyi b/.vscode/__builtins__.pyi
new file mode 100644
index 0000000..0edd518
--- /dev/null
+++ b/.vscode/__builtins__.pyi
@@ -0,0 +1,3 @@
+# Typings for Pylance in Visual Studio Code
+# see https://github.com/microsoft/pyright/blob/main/docs/builtins.md
+from databricks.sdk.runtime import *
diff --git a/.vscode/extensions.json b/.vscode/extensions.json
new file mode 100644
index 0000000..5d15eba
--- /dev/null
+++ b/.vscode/extensions.json
@@ -0,0 +1,7 @@
+{
+ "recommendations": [
+ "databricks.databricks",
+ "ms-python.vscode-pylance",
+ "redhat.vscode-yaml"
+ ]
+}
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..f19498d
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,17 @@
+{
+ "python.analysis.stubPath": ".vscode",
+ "databricks.python.envFile": "${workspaceFolder}/.env",
+    "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\<codecell\\>|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])",
+ "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------",
+ "python.testing.pytestArgs": [
+ "."
+ ],
+ "python.testing.unittestEnabled": false,
+ "python.testing.pytestEnabled": true,
+ "python.analysis.extraPaths": ["src"],
+ "files.exclude": {
+ "**/*.egg-info": true,
+ "**/__pycache__": true,
+ ".pytest_cache": true,
+ },
+}
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..1425a2c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,42 @@
+# marcin_project
+
+The 'marcin_project' project was generated using the Databricks default-python template.
+
+## Getting started
+
+1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html
+
+2. Authenticate to your Databricks workspace:
+ ```
+ $ databricks configure
+ ```
+
+3. To deploy a development copy of this project, type:
+ ```
+ $ databricks bundle deploy --target dev
+ ```
+ (Note that "dev" is the default target, so the `--target` parameter
+ is optional here.)
+
+ This deploys everything that's defined for this project.
+ For example, the default template would deploy a job called
+ `[dev yourname] marcin_project_job` to your workspace.
+   You can find that job by opening your workspace and clicking on **Workflows**.
+
+4. Similarly, to deploy a production copy, type:
+ ```
+ $ databricks bundle deploy --target prod
+ ```
+
+5. To run a job or pipeline, use the "run" command:
+ ```
+ $ databricks bundle run
+ ```
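+
+   For example, to run the main job defined in resources/marcin_project_job.yml:
+   ```
+   $ databricks bundle run marcin_project_job
+   ```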
+
+6. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from
+ https://docs.databricks.com/dev-tools/vscode-ext.html. Or read the "getting started" documentation for
+ **Databricks Connect** for instructions on running the included Python code from a different IDE.
+
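+   As a minimal sketch (assuming `databricks-connect` is installed and
+   authentication to your workspace is configured), the included code can be
+   exercised from another IDE like this:
+   ```
+   from databricks.connect import DatabricksSession
+
+   spark = DatabricksSession.builder.getOrCreate()
+   spark.read.table("samples.nyctaxi.trips").show(5)
+   ```
+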
+7. For documentation on the Databricks asset bundles format used
+ for this project, and for CI/CD configuration, see
+ https://docs.databricks.com/dev-tools/bundles/index.html.
diff --git a/databricks.yml b/databricks.yml
new file mode 100644
index 0000000..55408bc
--- /dev/null
+++ b/databricks.yml
@@ -0,0 +1,41 @@
+# This is a Databricks asset bundle definition for marcin_project.
+# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
+bundle:
+ name: marcin_project
+
+include:
+ - resources/*.yml
+
+targets:
+ # The 'dev' target, for development purposes. This target is the default.
+ dev:
+ # We use 'mode: development' to indicate this is a personal development copy:
+ # - Deployed resources get prefixed with '[dev my_user_name]'
+ # - Any job schedules and triggers are paused by default
+ # - The 'development' mode is used for Delta Live Tables pipelines
+ mode: development
+ default: true
+ workspace:
+ host: https://adb-8870486534760962.2.azuredatabricks.net
+
+ ## Optionally, there could be a 'staging' target here.
+ ## (See Databricks docs on CI/CD at https://docs.databricks.com/dev-tools/bundles/index.html.)
+ #
+ # staging:
+ # workspace:
+ # host: https://adb-8870486534760962.2.azuredatabricks.net
+
+ # The 'prod' target, used for production deployment.
+ prod:
+ # We use 'mode: production' to indicate this is a production deployment.
+ # Doing so enables strict verification of the settings below.
+ mode: production
+ workspace:
+ host: https://adb-8870486534760962.2.azuredatabricks.net
+ # We only have a single deployment copy for production, so we use a shared path.
+ root_path: /Shared/.bundle/prod/${bundle.name}
+ run_as:
+ # This runs as wmj1fe@bosch.com in production. We could also use a service principal here
+ # using service_principal_name (see https://docs.databricks.com/dev-tools/bundles/permissions.html).
+ user_name: wmj1fe@bosch.com
+
\ No newline at end of file
diff --git a/fixtures/.gitkeep b/fixtures/.gitkeep
new file mode 100644
index 0000000..fa25d27
--- /dev/null
+++ b/fixtures/.gitkeep
@@ -0,0 +1,22 @@
+# Fixtures
+
+This folder is reserved for fixtures, such as CSV files.
+
+Below is an example of how to load fixtures as a data frame:
+
+```
+import pandas as pd
+import os
+
+def get_absolute_path(*relative_parts):
+ if 'dbutils' in globals():
+ base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore
+ path = os.path.normpath(os.path.join(base_dir, *relative_parts))
+ return path if path.startswith("/Workspace") else "/Workspace" + path
+ else:
+ return os.path.join(*relative_parts)
+
+csv_file = get_absolute_path("..", "fixtures", "mycsv.csv")
+df = pd.read_csv(csv_file)
+display(df)
+```
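+
+Note that `display` is only available in Databricks notebooks. When running this
+example locally, use something like `print(df)` instead.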
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..80432c2
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,3 @@
+[pytest]
+testpaths = tests
+pythonpath = src
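+# With this configuration, running 'pytest' from the project root collects the
+# tests under tests/ and imports the marcin_project package from src/.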
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 0000000..8053b1e
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1,26 @@
+## requirements-dev.txt: dependencies for local development.
+##
+## For defining dependencies used by jobs in Databricks Workflows, see
+## https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
+
+## pytest is the default package used for testing
+pytest
+
+## Dependencies for building wheel files
+setuptools
+wheel
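+## (These are used to build the wheel file for this project; see setup.py.)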
+
+## databricks-connect can be used to run parts of this project locally.
+## See https://docs.databricks.com/dev-tools/databricks-connect.html.
+##
+## databricks-connect is automatically installed if you're using Databricks
+## extension for Visual Studio Code
+## (https://docs.databricks.com/dev-tools/vscode-ext/dev-tasks/databricks-connect.html).
+##
+## To install databricks-connect manually, either follow the instructions
+## at https://docs.databricks.com/dev-tools/databricks-connect.html
+## to install the package system-wide, or use the pinned requirement below,
+## which installs a version of databricks-connect that corresponds to the
+## Databricks Runtime version used for this project (13.3 LTS).
+#
+databricks-connect>=13.3,<13.4
diff --git a/resources/marcin_project_job.yml b/resources/marcin_project_job.yml
new file mode 100644
index 0000000..94f67af
--- /dev/null
+++ b/resources/marcin_project_job.yml
@@ -0,0 +1,48 @@
+# The main job for marcin_project.
+resources:
+ jobs:
+ marcin_project_job:
+ name: marcin_project_job
+
+ schedule:
+ # Run every day at 8:37 AM
+ quartz_cron_expression: '44 37 8 * * ?'
+ timezone_id: Europe/Amsterdam
+
+ email_notifications:
+ on_failure:
+ - wmj1fe@bosch.com
+
+ tasks:
+ - task_key: notebook_task
+ job_cluster_key: job_cluster
+ notebook_task:
+ notebook_path: ../src/notebook.ipynb
+
+ - task_key: refresh_pipeline
+ depends_on:
+ - task_key: notebook_task
+ pipeline_task:
+ pipeline_id: ${resources.pipelines.marcin_project_pipeline.id}
+
+ - task_key: main_task
+ depends_on:
+ - task_key: refresh_pipeline
+ job_cluster_key: job_cluster
+ python_wheel_task:
+ package_name: marcin_project
+ entry_point: main
+ libraries:
+ # By default we just include the .whl file generated for the marcin_project package.
+ # See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
+ # for more information on how to add other libraries.
+ - whl: ../dist/*.whl
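+          # As an illustration only, an additional PyPI dependency could be
+          # listed here as well (hypothetical example):
+          # - pypi:
+          #     package: pyyaml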
+
+ job_clusters:
+ - job_cluster_key: job_cluster
+ new_cluster:
+ spark_version: 13.3.x-scala2.12
+ node_type_id: Standard_D3_v2
+ autoscale:
+ min_workers: 1
+ max_workers: 4
diff --git a/resources/marcin_project_pipeline.yml b/resources/marcin_project_pipeline.yml
new file mode 100644
index 0000000..381cc20
--- /dev/null
+++ b/resources/marcin_project_pipeline.yml
@@ -0,0 +1,12 @@
+# The main pipeline for marcin_project
+resources:
+ pipelines:
+ marcin_project_pipeline:
+ name: marcin_project_pipeline
+ target: marcin_project_${bundle.environment}
+ libraries:
+ - notebook:
+ path: ../src/dlt_pipeline.ipynb
+
+ configuration:
+ bundle.sourcePath: /Workspace/${workspace.file_path}/src
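+      # This value is read in src/dlt_pipeline.ipynb via
+      # spark.conf.get("bundle.sourcePath", ".") to put src/ on sys.path.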
diff --git a/scratch/README.md b/scratch/README.md
new file mode 100644
index 0000000..e6cfb81
--- /dev/null
+++ b/scratch/README.md
@@ -0,0 +1,4 @@
+# scratch
+
+This folder is reserved for personal, exploratory notebooks.
+By default these are not committed to Git, as 'scratch' is listed in .gitignore.
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..0f688f4
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,37 @@
+"""
+setup.py configuration script describing how to build and package this project.
+
+This file is primarily used by the setuptools library and typically should not
+be executed directly. See README.md for how to deploy, test, and run
+the marcin_project project.
+"""
+from setuptools import setup, find_packages
+
+import sys
+sys.path.append('./src')
+
+import datetime
+import marcin_project
+
+setup(
+ name="marcin_project",
+    # We use a timestamp as the local version identifier (see
+    # https://peps.python.org/pep-0440/#local-version-identifiers) to ensure that
+    # changes to the wheel package are picked up when it is used on all-purpose clusters.
+ version=marcin_project.__version__ + "+" + datetime.datetime.utcnow().strftime("%Y%m%d.%H%M%S"),
+ url="https://databricks.com",
+ author="wmj1fe@bosch.com",
+ description="wheel file based on marcin_project/src",
+ packages=find_packages(where='./src'),
+ package_dir={'': 'src'},
+ entry_points={
+        "console_scripts": [
+ "main=marcin_project.main:main"
+ ]
+ },
+ install_requires=[
+ # Dependencies in case the output wheel file is used as a library dependency.
+ # For defining dependencies, when this package is used in Databricks, see:
+ # https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
+ "setuptools"
+ ],
+)
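+
+# Note: the wheel can also be built manually with 'python setup.py bdist_wheel',
+# using the setuptools and wheel packages listed in requirements-dev.txt.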
diff --git a/src/dlt_pipeline.ipynb b/src/dlt_pipeline.ipynb
new file mode 100644
index 0000000..d094a3f
--- /dev/null
+++ b/src/dlt_pipeline.ipynb
@@ -0,0 +1,88 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "application/vnd.databricks.v1+cell": {
+ "cellMetadata": {},
+ "inputWidgets": {},
+ "nuid": "9a626959-61c8-4bba-84d2-2a4ecab1f7ec",
+ "showTitle": false,
+ "title": ""
+ }
+ },
+ "source": [
+ "# DLT pipeline\n",
+ "\n",
+ "This Delta Live Tables (DLT) definition is executed using a pipeline defined in resources/marcin_project_pipeline.yml."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "application/vnd.databricks.v1+cell": {
+ "cellMetadata": {},
+ "inputWidgets": {},
+ "nuid": "9198e987-5606-403d-9f6d-8f14e6a4017f",
+ "showTitle": false,
+ "title": ""
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Import DLT and src/marcin_project\n",
+ "import dlt\n",
+ "import sys\n",
+ "sys.path.append(spark.conf.get(\"bundle.sourcePath\", \".\"))\n",
+ "from pyspark.sql.functions import expr\n",
+ "from marcin_project import main"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "application/vnd.databricks.v1+cell": {
+ "cellMetadata": {},
+ "inputWidgets": {},
+ "nuid": "3fc19dba-61fd-4a89-8f8c-24fee63bfb14",
+ "showTitle": false,
+ "title": ""
+ }
+ },
+ "outputs": [],
+ "source": [
+ "@dlt.view\n",
+ "def taxi_raw():\n",
+ " return main.get_taxis()\n",
+ "\n",
+ "@dlt.table\n",
+ "def filtered_taxis():\n",
+ " return dlt.read(\"taxi_raw\").filter(expr(\"fare_amount < 30\"))"
+ ]
+ }
+ ],
+ "metadata": {
+ "application/vnd.databricks.v1+notebook": {
+ "dashboards": [],
+ "language": "python",
+ "notebookMetadata": {
+ "pythonIndentUnit": 2
+ },
+ "notebookName": "dlt_pipeline",
+ "widgets": {}
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.11.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/src/marcin_project/__init__.py b/src/marcin_project/__init__.py
new file mode 100644
index 0000000..f102a9c
--- /dev/null
+++ b/src/marcin_project/__init__.py
@@ -0,0 +1 @@
+__version__ = "0.0.1"
diff --git a/src/marcin_project/main.py b/src/marcin_project/main.py
new file mode 100644
index 0000000..48a80b0
--- /dev/null
+++ b/src/marcin_project/main.py
@@ -0,0 +1,11 @@
+from pyspark.sql import SparkSession
+
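+# Read the NYC taxi sample dataset that ships with Databricks workspaces.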
+def get_taxis():
+ spark = SparkSession.builder.getOrCreate()
+ return spark.read.table("samples.nyctaxi.trips")
+
+def main():
+ get_taxis().show(5)
+
+if __name__ == '__main__':
+ main()
diff --git a/src/notebook.ipynb b/src/notebook.ipynb
new file mode 100644
index 0000000..2b04eb7
--- /dev/null
+++ b/src/notebook.ipynb
@@ -0,0 +1,65 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "application/vnd.databricks.v1+cell": {
+ "cellMetadata": {},
+ "inputWidgets": {},
+ "nuid": "ee353e42-ff58-4955-9608-12865bd0950e",
+ "showTitle": false,
+ "title": ""
+ }
+ },
+ "source": [
+ "# Default notebook\n",
+ "\n",
+ "This default notebook is executed using Databricks Workflows as defined in resources/marcin_project_job.yml."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "application/vnd.databricks.v1+cell": {
+ "cellMetadata": {
+ "byteLimit": 2048000,
+ "rowLimit": 10000
+ },
+ "inputWidgets": {},
+ "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae",
+ "showTitle": false,
+ "title": ""
+ }
+ },
+ "outputs": [],
+ "source": [
+ "from marcin_project import main\n",
+ "\n",
+ "main.get_taxis().show(10)"
+ ]
+ }
+ ],
+ "metadata": {
+ "application/vnd.databricks.v1+notebook": {
+ "dashboards": [],
+ "language": "python",
+ "notebookMetadata": {
+ "pythonIndentUnit": 2
+ },
+ "notebookName": "notebook",
+ "widgets": {}
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.11.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/src/notebook_test.ipynb b/src/notebook_test.ipynb
new file mode 100644
index 0000000..2b04eb7
--- /dev/null
+++ b/src/notebook_test.ipynb
@@ -0,0 +1,65 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "application/vnd.databricks.v1+cell": {
+ "cellMetadata": {},
+ "inputWidgets": {},
+ "nuid": "ee353e42-ff58-4955-9608-12865bd0950e",
+ "showTitle": false,
+ "title": ""
+ }
+ },
+ "source": [
+ "# Default notebook\n",
+ "\n",
+ "This default notebook is executed using Databricks Workflows as defined in resources/marcin_project_job.yml."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "application/vnd.databricks.v1+cell": {
+ "cellMetadata": {
+ "byteLimit": 2048000,
+ "rowLimit": 10000
+ },
+ "inputWidgets": {},
+ "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae",
+ "showTitle": false,
+ "title": ""
+ }
+ },
+ "outputs": [],
+ "source": [
+ "from marcin_project import main\n",
+ "\n",
+ "main.get_taxis().show(10)"
+ ]
+ }
+ ],
+ "metadata": {
+ "application/vnd.databricks.v1+notebook": {
+ "dashboards": [],
+ "language": "python",
+ "notebookMetadata": {
+ "pythonIndentUnit": 2
+ },
+ "notebookName": "notebook",
+ "widgets": {}
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.11.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/tests/main_test.py b/tests/main_test.py
new file mode 100644
index 0000000..cb4c112
--- /dev/null
+++ b/tests/main_test.py
@@ -0,0 +1,14 @@
+from databricks.connect import DatabricksSession
+from pyspark.sql import SparkSession
+from marcin_project import main
+
+# Create a new Databricks Connect session. If this fails,
+# check that you have configured Databricks Connect correctly.
+# See https://docs.databricks.com/dev-tools/databricks-connect.html.
+
+SparkSession.builder = DatabricksSession.builder
+SparkSession.builder.getOrCreate()
+
+def test_main():
+ taxis = main.get_taxis()
+ assert taxis.count() > 5
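+
+# To run this test locally, configure Databricks Connect first, then run
+# 'pytest' from the project root (see pytest.ini for discovery settings).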