Make profiler and generator dependent on UC and introduce run configurations (#68)

## Changes
* Made the profiler and generator dependent on UC/Databricks
* Introduced `RunConfig` so that multiple configurations can be defined in the config and the project (e.g. the profiler) can be run for different use cases (a rough sketch follows below)
* Refactored tests
* Improved the README
* Added new examples to the demos

### Tests
- [x] manually tested the whole project on Databricks, including demos
- [x] added unit tests
- [x] added integration tests
1 parent ed835bb · commit 61f1b7d · 22 changed files with 1,378 additions and 1,019 deletions.
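As a rough illustration of the run-configuration idea described in the changes above, a named run configuration could look like the sketch below. This is an assumption for illustration only; the field names and defaults are not taken from the actual `RunConfig` introduced in this commit.

```python
from dataclasses import dataclass


@dataclass
class RunConfig:
    """Sketch of a named run configuration; field names are illustrative assumptions."""

    name: str                        # key used to select this configuration, e.g. "default"
    input_location: str              # table or path the profiler reads from
    checks_file: str = "checks.yml"  # default filename for generated data quality rules
    profile_summary_stats_file: str = "profile_summary_stats.yml"  # profiler summary output


# Multiple run configurations in a single config allow running the project
# (e.g. the profiler) for different use cases.
run_configs = [
    RunConfig(name="default", input_location="main.demo.input"),
    RunConfig(name="ml_features", input_location="main.ml.features"),
]
```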
Changes to the DLT demo notebook:

```diff
@@ -1,5 +1,6 @@
 # Databricks notebook source
 # 1. Install DQX in the workspace as per the instructions here: https://github.com/databrickslabs/dqx?tab=readme-ov-file#installation
+# Use default filename for data quality rules.

 # 2. Install DQX in the cluster
 user_name = "[email protected]"  # cannot dynamically retrieve user name as "System-User" is always returned: spark.sql('select current_user() as user').collect()[0]['user']
@@ -27,7 +28,8 @@
 # COMMAND ----------

 import dlt
-from databricks.labs.dqx.engine import apply_checks_by_metadata, get_invalid, get_valid
+from databricks.labs.dqx.engine import DQEngine
+from databricks.sdk import WorkspaceClient

 # COMMAND ----------

@@ -39,7 +41,7 @@ def bronze():

 # COMMAND ----------

-# Define our Data Quality cheks
+# Define Data Quality checks
 import yaml


@@ -117,24 +119,26 @@ def bronze():

 # COMMAND ----------

+dq_engine = DQEngine(WorkspaceClient())
+
 # Read data from Bronze and apply checks
 @dlt.view
 def bronze_dq_check():
     df = dlt.read_stream("bronze")
-    return apply_checks_by_metadata(df, checks)
+    return dq_engine.apply_checks_by_metadata(df, checks)

 # COMMAND ----------

 # get rows without errors or warnings, and drop auxiliary columns
 @dlt.table
 def silver():
     df = dlt.read_stream("bronze_dq_check")
-    return get_valid(df)
+    return dq_engine.get_valid(df)

 # COMMAND ----------

 # get only rows with errors or warnings
 @dlt.table
 def quarantine():
     df = dlt.read_stream("bronze_dq_check")
-    return get_invalid(df)
+    return dq_engine.get_invalid(df)
```
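Outside of a DLT pipeline, the new engine object can be used in the same way. The sketch below is an assumed, minimal example: only `DQEngine`, `apply_checks_by_metadata`, `get_valid`, and `get_invalid` come from the diff above; the check definition and the input table name are illustrative assumptions.

```python
import yaml
from databricks.labs.dqx.engine import DQEngine
from databricks.sdk import WorkspaceClient

# Quality checks defined as metadata, mirroring the demo notebook's approach.
# The check function, criticality and column below are illustrative assumptions.
checks = yaml.safe_load(
    """
- criticality: error
  check:
    function: is_not_null
    arguments:
      col_name: id
"""
)

dq_engine = DQEngine(WorkspaceClient())

# `spark` is the ambient SparkSession in a Databricks notebook or job;
# the table name is an illustrative assumption.
df = spark.read.table("main.demo.bronze")

checked_df = dq_engine.apply_checks_by_metadata(df, checks)
valid_df = dq_engine.get_valid(checked_df)      # rows without errors or warnings
invalid_df = dq_engine.get_invalid(checked_df)  # rows with errors or warnings
```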
A new module is added introducing `DQEngineBase`, which verifies Unity Catalog access through the workspace client:

```python
import abc
from typing import final
from functools import cached_property
from databricks.sdk import WorkspaceClient


class DQEngineBase(abc.ABC):
    def __init__(self, workspace_client: WorkspaceClient):
        self._workspace_client = workspace_client

    @cached_property
    def ws(self) -> WorkspaceClient:
        """
        Cached property to verify and return the workspace client.
        """
        return self._verify_workspace_client(self._workspace_client)

    @staticmethod
    @final
    def _verify_workspace_client(ws: WorkspaceClient) -> WorkspaceClient:
        """
        Verifies the Databricks workspace client configuration.
        """
        # make sure Unity Catalog is accessible in the current Databricks workspace
        ws.catalogs.list()
        return ws
```
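For context, any engine built on `DQEngineBase` triggers the Unity Catalog check the first time `ws` is accessed and then reuses the cached client. The sketch below is a hypothetical usage pattern, not code from this commit; the `CatalogLister` class and its method are assumptions.

```python
from databricks.sdk import WorkspaceClient


class CatalogLister(DQEngineBase):
    """Hypothetical subclass used only to illustrate the base class behaviour."""

    def catalog_names(self) -> list[str]:
        # First access of `ws` runs `_verify_workspace_client`, which calls
        # `ws.catalogs.list()` to confirm Unity Catalog is reachable; the
        # verified client is then cached for subsequent calls.
        return [c.name for c in self.ws.catalogs.list()]


# Usage (requires authenticated access to a UC-enabled Databricks workspace):
lister = CatalogLister(WorkspaceClient())
print(lister.catalog_names())
```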