
Commit

feat(tableau): ability to force extraction of table/column level lineage from SQL queries (#9838)
alexs-101 authored Mar 21, 2024
1 parent 7a2d61d commit e6e5c09
Showing 9 changed files with 508 additions and 56 deletions.
278 changes: 226 additions & 52 deletions metadata-ingestion/src/datahub/ingestion/source/tableau.py

Large diffs are not rendered by default.

25 changes: 22 additions & 3 deletions metadata-ingestion/src/datahub/ingestion/source/tableau_common.py
@@ -2,7 +2,7 @@
import logging
from dataclasses import dataclass
from functools import lru_cache
-from typing import Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple

from pydantic.fields import Field
from tableauserverclient import Server
@@ -762,8 +762,19 @@ def make_upstream_class(


 def make_fine_grained_lineage_class(
-    parsed_result: Optional[SqlParsingResult], dataset_urn: str
+    parsed_result: Optional[SqlParsingResult],
+    dataset_urn: str,
+    out_columns: List[Dict[Any, Any]],
 ) -> List[FineGrainedLineage]:
+    # 1) Fine-grained lineage links are case sensitive.
+    # 2) Parsed-out columns are always lower-cased.
+    # 3) The corresponding Custom SQL output columns can be in any case (lower/upper/mixed).
+    #
+    # We need a map between (2) and (3) to use when building column-level lineage links (see below).
+    out_columns_map = {
+        col.get(c.NAME, "").lower(): col.get(c.NAME, "") for col in out_columns
+    }
+
     fine_grained_lineages: List[FineGrainedLineage] = []

     if parsed_result is None:
@@ -775,7 +786,15 @@ def make_fine_grained_lineage_class

     for cll_info in cll:
         downstream = (
-            [builder.make_schema_field_urn(dataset_urn, cll_info.downstream.column)]
+            [
+                builder.make_schema_field_urn(
+                    dataset_urn,
+                    out_columns_map.get(
+                        cll_info.downstream.column.lower(),
+                        cll_info.downstream.column,
+                    ),
+                )
+            ]
             if cll_info.downstream is not None
             and cll_info.downstream.column is not None
             else []
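To make the mapping concrete, here is a minimal sketch of the case-restoring lookup (assuming c.NAME resolves to the "name" key of Tableau's column metadata; the column names below are hypothetical):

# Hypothetical Tableau Custom SQL output columns, in mixed case.
out_columns = [{"name": "Customer_ID"}, {"name": "ORDER_TOTAL"}]

# Same construction as out_columns_map above, with c.NAME assumed to be "name".
out_columns_map = {
    col.get("name", "").lower(): col.get("name", "") for col in out_columns
}

# The SQL parser reports downstream columns lower-cased; the map restores the
# original casing so the resulting schema-field URNs match Tableau's columns.
parsed_column = "customer_id"
assert out_columns_map.get(parsed_column, parsed_column) == "Customer_ID"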
23 changes: 23 additions & 0 deletions metadata-ingestion/src/datahub/sql_parsing/sql_parsing_result_utils.py

@@ -0,0 +1,23 @@
from typing import Dict, Set

from datahub.sql_parsing.sqlglot_lineage import SqlParsingResult, Urn


def transform_parsing_result_to_in_tables_schemas(
parsing_result: SqlParsingResult,
) -> Dict[Urn, Set[str]]:
table_urn_to_schema_map: Dict[str, Set[str]] = (
{it: set() for it in parsing_result.in_tables}
if parsing_result.in_tables
else {}
)

if parsing_result.column_lineage:
for cli in parsing_result.column_lineage:
for upstream in cli.upstreams:
if upstream.table in table_urn_to_schema_map:
table_urn_to_schema_map[upstream.table].add(upstream.column)
else:
table_urn_to_schema_map[upstream.table] = {upstream.column}

return table_urn_to_schema_map
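For intuition, a small self-contained usage sketch of the helper above (the URNs are shortened; the unit tests further below pin down the same behavior):

from datahub.sql_parsing.sql_parsing_result_utils import (
    transform_parsing_result_to_in_tables_schemas,
)
from datahub.sql_parsing.sqlglot_lineage import (
    ColumnLineageInfo,
    ColumnRef,
    DownstreamColumnRef,
    SqlParsingResult,
)

result = SqlParsingResult(
    in_tables=["urn:table_a", "urn:table_b"],
    out_tables=[],
    column_lineage=[
        ColumnLineageInfo(
            downstream=DownstreamColumnRef(column="total"),
            upstreams=[ColumnRef(table="urn:table_a", column="amount")],
        ),
    ],
)

# Collapses column-level lineage into: input table -> set of referenced columns.
assert transform_parsing_result_to_in_tables_schemas(result) == {
    "urn:table_a": {"amount"},
    "urn:table_b": set(),
}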
metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py

@@ -529,6 +529,9 @@ def _schema_aware_fuzzy_column_resolve(

# Parse the column name out of the node name.
# Sqlglot calls .sql(), so we have to do the inverse.
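# A bare "*" (wildcard) node does not name a single column, so there is nothing to resolve; skip it.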
if node.name == "*":
continue

normalized_col = sqlglot.parse_one(node.name).this.name
if node.subfield:
normalized_col = f"{normalized_col}.{node.subfield}"
@@ -834,6 +837,7 @@ def _sqlglot_lineage_inner(
# Fetch schema info for the relevant tables.
table_name_urn_mapping: Dict[_TableName, str] = {}
table_name_schema_mapping: Dict[_TableName, SchemaInfo] = {}

for table in tables | modified:
# For select statements, qualification will be a no-op. For other statements, this
# is where the qualification actually happens.
@@ -1016,8 +1020,9 @@ def create_lineage_sql_parsed_result(
env: str,
default_schema: Optional[str] = None,
graph: Optional[DataHubGraph] = None,
+    schema_aware: bool = True,
 ) -> SqlParsingResult:
-    if graph:
+    if graph and schema_aware:
needs_close = False
schema_resolver = graph._make_schema_resolver(
platform=platform,
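With the new flag, callers can skip schema-aware resolution even when a graph is available. A sketch of such a call follows; the parameters before env are not shown in this diff, so their names here are assumptions, and all concrete values are illustrative:

from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result

# schema_aware=False bypasses the graph-backed schema resolver, so lineage is
# extracted from the SQL text alone; useful when warehouse schemas have not
# been ingested yet. Parameter names before env are assumed, values invented.
result = create_lineage_sql_parsed_result(
    query="SELECT o.id, o.amount FROM orders o",
    default_db="demo_db",
    platform="bigquery",
    platform_instance=None,
    env="PROD",
    graph=None,  # a DataHubGraph can be passed; it is not consulted when schema_aware=False
    schema_aware=False,
)
print(result.in_tables)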
6 changes: 6 additions & 0 deletions metadata-ingestion/src/datahub/sql_parsing/sqlglot_utils.py
@@ -16,6 +16,11 @@ def _get_dialect_str(platform: str) -> str:
return "tsql"
elif platform == "athena":
return "trino"
# TODO: define a Salesforce SOQL dialect.
# Temporary workaround: treat SOQL as the Databricks dialect.
# At least it allows parsing simple SQL queries and building lineage for them.
elif platform == "salesforce":
return "databricks"
elif platform in {"mysql", "mariadb"}:
# In sqlglot v20+, MySQL is now case-sensitive by default, which is the
# default behavior on Linux. However, MySQL's default case sensitivity
@@ -31,6 +36,7 @@ def _get_dialect_str(platform: str) -> str:
def get_dialect(platform: DialectOrStr) -> sqlglot.Dialect:
if isinstance(platform, sqlglot.Dialect):
return platform

return sqlglot.Dialect.get_or_raise(_get_dialect_str(platform))


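A quick illustration of the fallback (a sketch; the exact dialect object returned depends on the installed sqlglot version):

from datahub.sql_parsing.sqlglot_utils import get_dialect

# "salesforce" is temporarily routed to the Databricks dialect, so simple
# SOQL-style SELECT queries can still be parsed and used to build lineage.
dialect = get_dialect("salesforce")
print(dialect)  # sqlglot's Databricks dialect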
@@ -42870,6 +42870,38 @@
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,demo-custom-323403.bigquery_demo.order_items,PROD)",
"changeType": "UPSERT",
"aspectName": "status",
"aspect": {
"json": {
"removed": false
}
},
"systemMetadata": {
"lastObserved": 1638860400000,
"runId": "tableau-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,demo-custom-323403.bigquery_demo.sellers,PROD)",
"changeType": "UPSERT",
"aspectName": "status",
"aspect": {
"json": {
"removed": false
}
},
"systemMetadata": {
"lastObserved": 1638860400000,
"runId": "tableau-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:external,sample - superstore%2C %28new%29.xls.orders,PROD)",
@@ -319,6 +319,8 @@ def test_tableau_cll_ingest(pytestconfig, tmp_path, mock_datahub_graph):
new_pipeline_config: Dict[Any, Any] = {
**config_source_default,
"extract_lineage_from_unsupported_custom_sql_queries": True,
"force_extraction_of_lineage_from_custom_sql_queries": False,
"sql_parsing_disable_schema_awareness": False,
"extract_column_level_lineage": True,
}

@@ -834,6 +836,7 @@ def test_tableau_unsupported_csql(mock_datahub_graph):
"connectionType": "bigquery",
},
},
out_columns=[],
)

mcp = cast(MetadataChangeProposalClass, next(iter(lineage)).metadata)
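Taken together, the options exercised by these tests can be combined in a Tableau source config roughly as in the fragment below (a sketch only; connection settings and other required fields are omitted):

# Illustrative Tableau source config fragment. The force/disable flags are the
# ones introduced by this PR; as their names suggest, they force SQL lineage
# extraction and disable schema-aware parsing, respectively.
tableau_cll_config = {
    "extract_lineage_from_unsupported_custom_sql_queries": True,
    "force_extraction_of_lineage_from_custom_sql_queries": True,
    "sql_parsing_disable_schema_awareness": True,
    "extract_column_level_lineage": True,
}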
@@ -0,0 +1,67 @@
from datahub.sql_parsing.sql_parsing_result_utils import (
transform_parsing_result_to_in_tables_schemas,
)
from datahub.sql_parsing.sqlglot_lineage import (
ColumnLineageInfo,
ColumnRef,
DownstreamColumnRef,
SqlParsingResult,
)


def test_transform_parsing_result_to_in_tables_schemas__empty_parsing_result():
parsing_result = SqlParsingResult(in_tables=[], out_tables=[], column_lineage=None)

in_tables_schema = transform_parsing_result_to_in_tables_schemas(parsing_result)
assert not in_tables_schema


def test_transform_parsing_result_to_in_tables_schemas__in_tables_only():
parsing_result = SqlParsingResult(
in_tables=["table_urn1", "table_urn2", "table_urn3"],
out_tables=[],
column_lineage=None,
)

in_tables_schema = transform_parsing_result_to_in_tables_schemas(parsing_result)
assert in_tables_schema == {
"table_urn1": set(),
"table_urn2": set(),
"table_urn3": set(),
}


def test_transform_parsing_result_to_in_tables_schemas__in_tables_and_column_lineage():
parsing_result = SqlParsingResult(
in_tables=["table_urn1", "table_urn2", "table_urn3"],
out_tables=[],
column_lineage=[
ColumnLineageInfo(
downstream=DownstreamColumnRef(column="out_col1"),
upstreams=[
ColumnRef(table="table_urn1", column="col11"),
],
),
ColumnLineageInfo(
downstream=DownstreamColumnRef(column="out_col2"),
upstreams=[
ColumnRef(table="table_urn2", column="col21"),
ColumnRef(table="table_urn2", column="col22"),
],
),
ColumnLineageInfo(
downstream=DownstreamColumnRef(column="out_col3"),
upstreams=[
ColumnRef(table="table_urn1", column="col12"),
ColumnRef(table="table_urn2", column="col23"),
],
),
],
)

in_tables_schema = transform_parsing_result_to_in_tables_schemas(parsing_result)
assert in_tables_schema == {
"table_urn1": {"col11", "col12"},
"table_urn2": {"col21", "col22", "col23"},
"table_urn3": set(),
}
123 changes: 123 additions & 0 deletions metadata-ingestion/tests/unit/test_tableau_source.py
@@ -0,0 +1,123 @@
import pytest

from datahub.ingestion.source.tableau import TableauSource


def test_tableau_source_unescapes_lt():
res = TableauSource._clean_tableau_query_parameters(
"select * from t where c1 << 135"
)

assert res == "select * from t where c1 < 135"


def test_tableau_source_unescapes_gt():
res = TableauSource._clean_tableau_query_parameters(
"select * from t where c1 >> 135"
)

assert res == "select * from t where c1 > 135"


def test_tableau_source_unescapes_gte():
res = TableauSource._clean_tableau_query_parameters(
"select * from t where c1 >>= 135"
)

assert res == "select * from t where c1 >= 135"


def test_tableau_source_unescapes_lte():
res = TableauSource._clean_tableau_query_parameters(
"select * from t where c1 <<= 135"
)

assert res == "select * from t where c1 <= 135"


def test_tableau_source_doesnt_touch_not_escaped():
res = TableauSource._clean_tableau_query_parameters(
"select * from t where c1 < 135 and c2 > 15"
)

assert res == "select * from t where c1 < 135 and c2 > 15"


TABLEAU_PARAMS = [
"<Parameters.MyParam>",
"<Parameters.MyParam_1>",
"<Parameters.My Param _ 1>",
"<Parameters.My Param 1 !@\"',.#$%^:;&*()-_+={}|\\ /<>",
"<[Parameters].MyParam>",
"<[Parameters].MyParam_1>",
"<[Parameters].My Param _ 1>",
"<[Parameters].My Param 1 !@\"',.#$%^:;&*()-_+={}|\\ /<>",
"<Parameters.[MyParam]>",
"<Parameters.[MyParam_1]>",
"<Parameters.[My Param _ 1]>",
"<Parameters.[My Param 1 !@\"',.#$%^:;&*()-_+={}|\\ /<]>",
"<[Parameters].[MyParam]>",
"<[Parameters].[MyParam_1]>",
"<[Parameters].[My Param _ 1]>",
"<[Parameters].[My Param 1 !@\"',.#$%^:;&*()-_+={}|\\ /<]>",
"<Parameters.[My Param 1 !@\"',.#$%^:;&*()-_+={}|\\ /<>]>",
"<[Parameters].[My Param 1 !@\"',.#$%^:;&*()-_+={}|\\ /<>]>",
]


@pytest.mark.parametrize("p", TABLEAU_PARAMS)
def test_tableau_source_cleanups_tableau_parameters_in_equi_predicates(p):
assert (
TableauSource._clean_tableau_query_parameters(
f"select * from t where c1 = {p} and c2 = {p} and c3 = 7"
)
== "select * from t where c1 = 1 and c2 = 1 and c3 = 7"
)


@pytest.mark.parametrize("p", TABLEAU_PARAMS)
def test_tableau_source_cleanups_tableau_parameters_in_lt_gt_predicates(p):
assert (
TableauSource._clean_tableau_query_parameters(
f"select * from t where c1 << {p} and c2<<{p} and c3 >> {p} and c4>>{p} or {p} >> c1 and {p}>>c2 and {p} << c3 and {p}<<c4"
)
== "select * from t where c1 < 1 and c2<1 and c3 > 1 and c4>1 or 1 > c1 and 1>c2 and 1 < c3 and 1<c4"
)


@pytest.mark.parametrize("p", TABLEAU_PARAMS)
def test_tableau_source_cleanups_tableau_parameters_in_lte_gte_predicates(p):
assert (
TableauSource._clean_tableau_query_parameters(
f"select * from t where c1 <<= {p} and c2<<={p} and c3 >>= {p} and c4>>={p} or {p} >>= c1 and {p}>>=c2 and {p} <<= c3 and {p}<<=c4"
)
== "select * from t where c1 <= 1 and c2<=1 and c3 >= 1 and c4>=1 or 1 >= c1 and 1>=c2 and 1 <= c3 and 1<=c4"
)


@pytest.mark.parametrize("p", TABLEAU_PARAMS)
def test_tableau_source_cleanups_tableau_parameters_in_join_predicate(p):
assert (
TableauSource._clean_tableau_query_parameters(
f"select * from t1 inner join t2 on t1.id = t2.id and t2.c21 = {p} and t1.c11 = 123 + {p}"
)
== "select * from t1 inner join t2 on t1.id = t2.id and t2.c21 = 1 and t1.c11 = 123 + 1"
)


@pytest.mark.parametrize("p", TABLEAU_PARAMS)
def test_tableau_source_cleanups_tableau_parameters_in_complex_expressions(p):
assert (
TableauSource._clean_tableau_query_parameters(
f"select myudf1(c1, {p}, c2) / myudf2({p}) > ({p} + 3 * {p} * c5) * {p} - c4"
)
== "select myudf1(c1, 1, c2) / myudf2(1) > (1 + 3 * 1 * c5) * 1 - c4"
)


@pytest.mark.parametrize("p", TABLEAU_PARAMS)
def test_tableau_source_cleanups_tableau_parameters_in_udfs(p):
assert (
TableauSource._clean_tableau_query_parameters(f"select myudf({p}) from t")
== "select myudf(1) from t"
)
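The tests above fully pin down the expected behavior. For intuition, here is a re-implementation sketch; the real logic lives in TableauSource._clean_tableau_query_parameters and may differ in detail:

import re

def clean_tableau_query_parameters(query: str) -> str:
    # Replace Tableau parameter references such as <Parameters.MyParam> or
    # <[Parameters].[My Param]> with the literal "1" so the query parses as SQL.
    query = re.sub(r"<\[?[Pp]arameters\]?\.(\[[^\]]+\]|[^>]+)>", "1", query)
    # Un-escape doubled comparison operators, longest tokens first, so that
    # "<<=" becomes "<=" rather than "<" followed by "=".
    for escaped, plain in (("<<=", "<="), (">>=", ">="), ("<<", "<"), (">>", ">")):
        query = query.replace(escaped, plain)
    return query

assert (
    clean_tableau_query_parameters("select * from t where c1 << 135")
    == "select * from t where c1 < 135"
)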
