Add example for a job writing to a Unity Catalog volume #51

Merged: 13 commits, Dec 20, 2024
1 change: 1 addition & 0 deletions knowledge_base/save_job_result_to_volume/.gitignore
@@ -0,0 +1 @@
.databricks/
20 changes: 20 additions & 0 deletions knowledge_base/save_job_result_to_volume/README.md
@@ -0,0 +1,20 @@
# Save job result to volume

This example demonstrates how to define and use a Unity Catalog Volume in a Databricks Asset Bundle.

Specifically, we'll define a `top_ten_trips` job that computes the ten NYC taxi trips with the highest
fares and stores the result in a Unity Catalog Volume.

The bundle also defines the Volume and the associated Schema in which the results are stored.

## Prerequisites

* Databricks CLI v0.236.0 or above

## Usage

Update the `host` field under `workspace` in `databricks.yml` to the Databricks workspace you wish to deploy to.

Run `databricks bundle deploy` to deploy the job.

Run `databricks bundle run top_ten_trips` to run the job and store the results in the UC Volume.
12 changes: 12 additions & 0 deletions knowledge_base/save_job_result_to_volume/databricks.yml
@@ -0,0 +1,12 @@
bundle:
  name: save_job_result_to_volume

include:
  - resources/*.yml

workspace:
  host: https://e2-dogfood.staging.cloud.databricks.com

targets:
  dev:
    default: true
@@ -0,0 +1,9 @@
resources:
  volumes:
    my_volume:
      catalog_name: main
      name: my_volume
      # We use the ${resources.schemas...} interpolation syntax to force the creation
      # of the schema before the volume. Usage of the ${resources.schemas...} syntax
      # allows Databricks Asset Bundles to form a dependency graph between resources.
Contributor: Do we need to go into this here?

Contributor (author): We have had multiple SAs reach out to us asking how to sequence resource creation. Given that folks often use schemas with their volumes, it feels relevant to keep this bit here.

Contributor: Thanks. Linking the warning PR for posterity: databricks/cli#1989

      schema_name: ${resources.schemas.trips.name}
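
The comment in this file describes how `${resources...}` references let Databricks Asset Bundles form a dependency graph between resources. The sketch below illustrates that idea in plain Python: scan resource fields for `${resources.<kind>.<name>.<field>}` references and topologically sort them. This is illustrative only, not the actual Databricks CLI implementation; the `creation_order` helper and the flattened resource keys are hypothetical.

```python
import re
from graphlib import TopologicalSorter

def creation_order(resources: dict[str, dict]) -> list[str]:
    """Derive a creation order from ${resources...} references (illustrative only)."""
    ref = re.compile(r"\$\{resources\.(\w+)\.(\w+)\.\w+\}")
    graph = {}
    for name, fields in resources.items():
        deps = set()
        for value in fields.values():
            for kind, dep in ref.findall(str(value)):
                deps.add(f"{kind}.{dep}")
        graph[name] = deps
    # static_order() yields each node only after all of its dependencies.
    return list(TopologicalSorter(graph).static_order())

order = creation_order({
    "volumes.my_volume": {"schema_name": "${resources.schemas.trips.name}"},
    "schemas.trips": {"name": "trips"},
})
# The schema is ordered before the volume that references it.
```

Because the volume's `schema_name` interpolates a field of the schema resource, the schema node becomes a dependency of the volume node and is created first.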
@@ -0,0 +1,26 @@
resources:
  jobs:
    top_ten_trips:
      name: top_ten_trips

      trigger:
        # Run this job every day to keep the all-time top ten trips
        # stored in the UC Volume up to date.
        interval: 1
        unit: DAYS

      # No job cluster is configured. The job will run on serverless compute.
      # You can explicitly configure job compute here if your workspace does
      # not have serverless compute enabled.
      tasks:
        - task_key: top_ten_trips_task
          notebook_task:
            notebook_path: ../src/query.ipynb

      parameters:
        - name: schema_name
          default: ${resources.schemas.trips.name}
@@ -0,0 +1,5 @@
resources:
  schemas:
    trips:
      catalog_name: main
      name: ${workspace.current_user.short_name}_trips
61 changes: 61 additions & 0 deletions knowledge_base/save_job_result_to_volume/src/query.ipynb

Contributor (author): This should render fine in the GitHub repo files. For example: https://github.com/databricks/cli/blob/main/internal/testdata/notebooks/py1.ipynb

@@ -0,0 +1,61 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Run the SQL query to get the top 10 trips by fare amount and store the result in CSV format in a Volume.\n",
"schema_name = dbutils.widgets.get(\"schema_name\")\n",
"best_trips_df = spark.sql(\"SELECT * FROM samples.nyctaxi.trips order by fare_amount desc LIMIT 10\")\n",
"best_trips_df.write.format(\"csv\").option(\"header\", \"true\").save(f\"/Volumes/main/{schema_name}/my_volume/top_ten/trip_fares.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae",
"showTitle": false,
"title": ""
}
},
"outputs": [],
"source": [
"# Load the saved data from the Volume and display it.\n",
"df = spark.read.format(\"csv\").load(f\"/Volumes/main/{schema_name}/my_volume/top_ten/trip_fares.csv\")\n",
"\n",
"display(df)"
]
}
],
"metadata": {
"application/vnd.databricks.v1+notebook": {
"dashboards": [],
"language": "python",
"notebookMetadata": {
"pythonIndentUnit": 2
},
"notebookName": "notebook",
"widgets": {}
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
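
For readers skimming the JSON above, here is the notebook's core "top ten by fare" computation as a plain-Python sketch. The real notebook runs Spark SQL against `samples.nyctaxi.trips` and writes to the Volume path; the toy `trips` data and in-memory CSV buffer here are stand-ins for illustration only.

```python
import csv
import io

# Toy stand-in for samples.nyctaxi.trips (the real job queries it via Spark SQL).
fares = [12.5, 80.0, 3.2, 45.9, 99.1, 7.7, 60.3, 21.0, 150.4, 33.3, 5.5, 88.8]
trips = [{"trip_id": i, "fare_amount": f} for i, f in enumerate(fares)]

# Equivalent of: SELECT * FROM trips ORDER BY fare_amount DESC LIMIT 10
top_ten = sorted(trips, key=lambda t: t["fare_amount"], reverse=True)[:10]

# Write with a header row, mirroring .option("header", "true") in the notebook.
buf = io.StringIO()
writer = csv.DictWriter(buf, fieldnames=["trip_id", "fare_amount"])
writer.writeheader()
writer.writerows(top_ten)
```

The sort-then-slice mirrors what `ORDER BY fare_amount DESC LIMIT 10` does logically; Spark distributes the same operation across the cluster.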