Skip to content

Commit

Permalink
feat(ingest): drop sql_metadata parser
Browse files Browse the repository at this point in the history
This appears to be dead code.
  • Loading branch information
hsheth2 committed Aug 31, 2023
1 parent ee06a65 commit 59f4c65
Show file tree
Hide file tree
Showing 3 changed files with 4 additions and 127 deletions.
1 change: 0 additions & 1 deletion metadata-ingestion/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,6 @@ def get_long_description():
# TODO: I doubt we need all three sql parsing libraries.
*sqllineage_lib,
*sqlglot_lib,
"sql_metadata",
"sqlalchemy-bigquery>=1.4.1",
"google-cloud-datacatalog-lineage==0.2.2",
},
Expand Down
56 changes: 0 additions & 56 deletions metadata-ingestion/src/datahub/utilities/sql_parser.py
Original file line number Diff line number Diff line change
@@ -1,71 +1,15 @@
import contextlib
import logging
import multiprocessing
import re
import traceback
from multiprocessing import Process, Queue
from typing import Any, List, Optional, Tuple

from datahub.utilities.sql_lineage_parser_impl import SqlLineageSQLParserImpl
from datahub.utilities.sql_parser_base import SQLParser

with contextlib.suppress(ImportError):
from sql_metadata import Parser as MetadataSQLParser
logger = logging.getLogger(__name__)


class MetadataSQLSQLParser(SQLParser):
    """SQLParser implementation that delegates to ``MetadataSQLParser``.

    The incoming query is lightly rewritten before it is handed to the
    underlying parser, to step around three constructs it is known to
    mishandle: "lateral flatten" clauses, columns named "date", and
    "encode" directives.
    """

    # Placeholder substituted for a whitespace-delimited "date" token before
    # parsing; get_columns() maps it back to "date".
    _DATE_SWAP_TOKEN = "__d_a_t_e"

    def __init__(self, sql_query: str, use_external_process: bool = True) -> None:
        """Rewrite ``sql_query`` as needed and build the underlying parser.

        Args:
            sql_query: The SQL text to analyse.
            use_external_process: Forwarded unchanged to ``SQLParser.__init__``.
        """
        super().__init__(sql_query, use_external_process)

        original_sql_query = sql_query

        # Keep only the text that precedes the first "lateral flatten";
        # partition() hands back the whole string when the marker is absent.
        sql_query = sql_query.partition("lateral flatten")[0]

        # Hide columns literally called "date" behind the swap token.
        sql_query = re.sub(r"\sdate\s", f" {self._DATE_SWAP_TOKEN} ", sql_query)

        # Strip "encode <word>" directives entirely.
        sql_query = re.sub(r"\sencode [a-zA-Z]*", "", sql_query)

        if sql_query != original_sql_query:
            logger.debug(f"rewrote original query {original_sql_query} as {sql_query}")

        self._parser = MetadataSQLParser(sql_query)

    def get_tables(self) -> List[str]:
        """Return the parser's table list, sorted so the order is deterministic."""
        tables = self._parser.tables
        tables.sort()
        return tables

    def get_columns(self) -> List[str]:
        """Return the selected column names, with aliases applied.

        Returns an empty list when the parser reports any "join" columns,
        since columns are not resolved for queries involving joins.
        """
        by_section = self._parser.columns_dict
        # Bail out early: no column parsing when joins are involved.
        if by_section.get("join", {}) != {}:
            return []

        aliases_by_section = self._parser.columns_aliases_dict
        selected = [
            name
            for name in by_section.get("select", {})
            if name != "NULL" and not isinstance(name, list)
        ]

        if aliases_by_section is not None:
            for alias in aliases_by_section.get("select", []):
                if alias not in self._parser.columns_aliases:
                    continue
                aliased_column = self._parser.columns_aliases[alias]
                # Substitute the alias wherever its source column is listed.
                selected = [
                    alias if name == aliased_column else name for name in selected
                ]

        # Undo the "date" renaming applied in __init__.
        return [
            "date" if name == self._DATE_SWAP_TOKEN else name for name in selected
        ]


def sql_lineage_parser_impl_func_wrapper(
queue: Optional[multiprocessing.Queue], sql_query: str, use_raw_names: bool = False
) -> Optional[Tuple[List[str], List[str], Any]]:
Expand Down
74 changes: 4 additions & 70 deletions metadata-ingestion/tests/unit/test_utilities.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from datahub.utilities.delayed_iter import delayed_iter
from datahub.utilities.sql_parser import MetadataSQLSQLParser, SqlLineageSQLParser
from datahub.utilities.sql_parser import SqlLineageSQLParser


def test_delayed_iter():
Expand Down Expand Up @@ -36,18 +36,10 @@ def maker(n):
]


def test_metadatasql_sql_parser_get_tables_from_simple_query():
    """MetadataSQLSQLParser reports both tables of a simple two-table join."""
    query = "SELECT foo.a, foo.b, bar.c FROM foo JOIN bar ON (foo.a == bar.b);"
    parser = MetadataSQLSQLParser(query)

    found_tables = parser.get_tables()
    found_tables.sort()

    assert found_tables == ["bar", "foo"]


# Checks that get_tables() for a simple two-table join yields ["bar", "foo"]
# once sorted.
def test_sqllineage_sql_parser_get_tables_from_simple_query():
sql_query = "SELECT foo.a, foo.b, bar.c FROM foo JOIN bar ON (foo.a == bar.b);"

# NOTE(review): the next two assignments look like the removed and added sides
# of a single diff line whose -/+ markers were lost; presumably only the
# SqlLineageSQLParser call remains after the commit -- confirm against the repo.
tables_list = MetadataSQLSQLParser(sql_query).get_tables()
tables_list = SqlLineageSQLParser(sql_query).get_tables()
# Sort before comparing so the assertion does not depend on parser ordering.
tables_list.sort()
assert tables_list == ["bar", "foo"]

Expand Down Expand Up @@ -121,15 +113,15 @@ def test_sqllineage_sql_parser_get_columns_from_simple_query():
assert columns_list == ["a", "b"]


# NOTE(review): the two consecutive `def` lines look like the old and new names
# of one test (a rename shown as a diff line pair with the -/+ markers lost);
# presumably only the test_sqllineage_* name remains -- confirm against the repo.
def test_metadatasql_sql_parser_get_columns_with_alias_and_count_star():
def test_sqllineage_sql_parser_get_columns_with_alias_and_count_star():
sql_query = "SELECT foo.a, foo.b, bar.c as test, count(*) as count FROM foo JOIN bar ON (foo.a == bar.b);"

# The body exercises SqlLineageSQLParser: aliased columns are expected under
# their alias names ("test", "count") alongside the plain columns "a" and "b".
columns_list = SqlLineageSQLParser(sql_query).get_columns()
columns_list.sort()
assert columns_list == ["a", "b", "count", "test"]


def test_metadatasql_sql_parser_get_columns_with_more_complex_join():
def test_sqllineage_sql_parser_get_columns_with_more_complex_join():
sql_query = """
INSERT
INTO
Expand Down Expand Up @@ -206,21 +198,6 @@ def test_sqllineage_sql_parser_get_columns_complex_query_with_union():
assert columns_list == ["c", "date", "e", "u", "x"]


def test_metadatasql_sql_parser_get_tables_from_templated_query():
    """A ``${...}`` templated table reference is reported without the wrapper."""
    templated_query = """
SELECT
country,
city,
timestamp,
measurement
FROM
${my_view.SQL_TABLE_NAME} AS my_view
"""
    parser = MetadataSQLSQLParser(templated_query)

    found_tables = parser.get_tables()
    found_tables.sort()

    assert found_tables == ["my_view.SQL_TABLE_NAME"]


def test_sqllineage_sql_parser_get_tables_from_templated_query():
sql_query = """
SELECT
Expand All @@ -236,21 +213,6 @@ def test_sqllineage_sql_parser_get_tables_from_templated_query():
assert tables_list == ["my_view.SQL_TABLE_NAME"]


def test_metadatasql_sql_parser_get_columns_from_templated_query():
    """All four selected columns are returned for a templated FROM clause."""
    templated_query = """
SELECT
country,
city,
timestamp,
measurement
FROM
${my_view.SQL_TABLE_NAME} AS my_view
"""
    parser = MetadataSQLSQLParser(templated_query)

    found_columns = parser.get_columns()
    found_columns.sort()

    assert found_columns == ["city", "country", "measurement", "timestamp"]


def test_sqllineage_sql_parser_get_columns_from_templated_query():
sql_query = """
SELECT
Expand All @@ -277,34 +239,6 @@ def test_sqllineage_sql_parser_with_weird_lookml_query():
assert columns_list == ["aliased_platform", "country", "date"]


def test_metadatasql_sql_parser_with_weird_lookml_query():
    """Columns are still extracted from a LookML-style query with inline types."""
    lookml_query = """
SELECT date DATE,
platform VARCHAR(20) AS aliased_platform,
country VARCHAR(20) FROM fragment_derived_view'
"""
    parser = MetadataSQLSQLParser(lookml_query)

    found_columns = parser.get_columns()
    found_columns.sort()

    # "date" must come back under its real name, and "platform" under its alias.
    assert found_columns == ["aliased_platform", "country", "date"]


def test_metadatasql_sql_parser_tables_from_redash_query():
    """Backtick-quoted, aliased tables in a multi-join query are all reported."""
    redash_query = """SELECT
name,
SUM(quantity * list_price * (1 - discount)) AS total,
YEAR(order_date) as order_year
FROM
`orders` o
INNER JOIN `order_items` i ON i.order_id = o.order_id
INNER JOIN `staffs` s ON s.staff_id = o.staff_id
GROUP BY
name,
year(order_date)"""
    parser = MetadataSQLSQLParser(redash_query)

    found_tables = parser.get_tables()
    found_tables.sort()

    assert found_tables == ["order_items", "orders", "staffs"]


def test_sqllineage_sql_parser_tables_from_redash_query():
sql_query = """SELECT
name,
Expand Down

0 comments on commit 59f4c65

Please sign in to comment.