diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 32e1cf926cc68..5fe4879ee5478 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -303,7 +303,6 @@ def get_long_description(): # TODO: I doubt we need all three sql parsing libraries. *sqllineage_lib, *sqlglot_lib, - "sql_metadata", "sqlalchemy-bigquery>=1.4.1", "google-cloud-datacatalog-lineage==0.2.2", }, diff --git a/metadata-ingestion/src/datahub/utilities/sql_parser.py b/metadata-ingestion/src/datahub/utilities/sql_parser.py index 6b1a94ba69657..61693b52b350f 100644 --- a/metadata-ingestion/src/datahub/utilities/sql_parser.py +++ b/metadata-ingestion/src/datahub/utilities/sql_parser.py @@ -1,7 +1,5 @@ -import contextlib import logging import multiprocessing -import re import traceback from multiprocessing import Process, Queue from typing import Any, List, Optional, Tuple @@ -9,63 +7,9 @@ from datahub.utilities.sql_lineage_parser_impl import SqlLineageSQLParserImpl from datahub.utilities.sql_parser_base import SQLParser -with contextlib.suppress(ImportError): - from sql_metadata import Parser as MetadataSQLParser logger = logging.getLogger(__name__) -class MetadataSQLSQLParser(SQLParser): - _DATE_SWAP_TOKEN = "__d_a_t_e" - - def __init__(self, sql_query: str, use_external_process: bool = True) -> None: - super().__init__(sql_query, use_external_process) - - original_sql_query = sql_query - - # MetadataSQLParser makes mistakes on lateral flatten queries, use the prefix - if "lateral flatten" in sql_query: - sql_query = sql_query[: sql_query.find("lateral flatten")] - - # MetadataSQLParser also makes mistakes on columns called "date", rename them - sql_query = re.sub(r"\sdate\s", f" {self._DATE_SWAP_TOKEN} ", sql_query) - - # MetadataSQLParser does not handle "encode" directives well. Remove them - sql_query = re.sub(r"\sencode [a-zA-Z]*", "", sql_query) - - if sql_query != original_sql_query: - logger.debug(f"rewrote original query {original_sql_query} as {sql_query}") - - self._parser = MetadataSQLParser(sql_query) - - def get_tables(self) -> List[str]: - result = self._parser.tables - # Sort tables to make the list deterministic - result.sort() - return result - - def get_columns(self) -> List[str]: - columns_dict = self._parser.columns_dict - # don't attempt to parse columns if there are joins involved - if columns_dict.get("join", {}) != {}: - return [] - - columns_alias_dict = self._parser.columns_aliases_dict - filtered_cols = [ - c - for c in columns_dict.get("select", {}) - if c != "NULL" and not isinstance(c, list) - ] - if columns_alias_dict is not None: - for col_alias in columns_alias_dict.get("select", []): - if col_alias in self._parser.columns_aliases: - col_name = self._parser.columns_aliases[col_alias] - filtered_cols = [ - col_alias if c == col_name else c for c in filtered_cols - ] - # swap back renamed date column - return ["date" if c == self._DATE_SWAP_TOKEN else c for c in filtered_cols] - - def sql_lineage_parser_impl_func_wrapper( queue: Optional[multiprocessing.Queue], sql_query: str, use_raw_names: bool = False ) -> Optional[Tuple[List[str], List[str], Any]]: diff --git a/metadata-ingestion/tests/unit/test_utilities.py b/metadata-ingestion/tests/unit/test_utilities.py index 32b5a6401ded6..368cedfe48040 100644 --- a/metadata-ingestion/tests/unit/test_utilities.py +++ b/metadata-ingestion/tests/unit/test_utilities.py @@ -1,5 +1,5 @@ from datahub.utilities.delayed_iter import delayed_iter -from datahub.utilities.sql_parser import MetadataSQLSQLParser, SqlLineageSQLParser +from datahub.utilities.sql_parser import SqlLineageSQLParser def test_delayed_iter(): @@ -36,18 +36,10 @@ def maker(n): ] -def test_metadatasql_sql_parser_get_tables_from_simple_query(): - sql_query = "SELECT foo.a, foo.b, bar.c FROM foo JOIN bar ON (foo.a == bar.b);" - - tables_list = MetadataSQLSQLParser(sql_query).get_tables() - tables_list.sort() - assert tables_list == ["bar", "foo"] - - def test_sqllineage_sql_parser_get_tables_from_simple_query(): sql_query = "SELECT foo.a, foo.b, bar.c FROM foo JOIN bar ON (foo.a == bar.b);" - tables_list = MetadataSQLSQLParser(sql_query).get_tables() + tables_list = SqlLineageSQLParser(sql_query).get_tables() tables_list.sort() assert tables_list == ["bar", "foo"] @@ -121,7 +113,7 @@ def test_sqllineage_sql_parser_get_columns_from_simple_query(): assert columns_list == ["a", "b"] -def test_metadatasql_sql_parser_get_columns_with_alias_and_count_star(): +def test_sqllineage_sql_parser_get_columns_with_alias_and_count_star(): sql_query = "SELECT foo.a, foo.b, bar.c as test, count(*) as count FROM foo JOIN bar ON (foo.a == bar.b);" columns_list = SqlLineageSQLParser(sql_query).get_columns() @@ -129,7 +121,7 @@ def test_metadatasql_sql_parser_get_columns_with_alias_and_count_star(): assert columns_list == ["a", "b", "count", "test"] -def test_metadatasql_sql_parser_get_columns_with_more_complex_join(): +def test_sqllineage_sql_parser_get_columns_with_more_complex_join(): sql_query = """ INSERT INTO @@ -206,21 +198,6 @@ def test_sqllineage_sql_parser_get_columns_complex_query_with_union(): assert columns_list == ["c", "date", "e", "u", "x"] -def test_metadatasql_sql_parser_get_tables_from_templated_query(): - sql_query = """ - SELECT - country, - city, - timestamp, - measurement - FROM - ${my_view.SQL_TABLE_NAME} AS my_view -""" - tables_list = MetadataSQLSQLParser(sql_query).get_tables() - tables_list.sort() - assert tables_list == ["my_view.SQL_TABLE_NAME"] - - def test_sqllineage_sql_parser_get_tables_from_templated_query(): sql_query = """ SELECT @@ -236,21 +213,6 @@ def test_sqllineage_sql_parser_get_tables_from_templated_query(): assert tables_list == ["my_view.SQL_TABLE_NAME"] -def test_metadatasql_sql_parser_get_columns_from_templated_query(): - sql_query = """ - SELECT - country, - city, - timestamp, - measurement - FROM - ${my_view.SQL_TABLE_NAME} AS my_view -""" - columns_list = MetadataSQLSQLParser(sql_query).get_columns() - columns_list.sort() - assert columns_list == ["city", "country", "measurement", "timestamp"] - - def test_sqllineage_sql_parser_get_columns_from_templated_query(): sql_query = """ SELECT @@ -277,34 +239,6 @@ def test_sqllineage_sql_parser_with_weird_lookml_query(): assert columns_list == ["aliased_platform", "country", "date"] -def test_metadatasql_sql_parser_with_weird_lookml_query(): - sql_query = """ - SELECT date DATE, - platform VARCHAR(20) AS aliased_platform, - country VARCHAR(20) FROM fragment_derived_view' - """ - columns_list = MetadataSQLSQLParser(sql_query).get_columns() - columns_list.sort() - assert columns_list == ["aliased_platform", "country", "date"] - - -def test_metadatasql_sql_parser_tables_from_redash_query(): - sql_query = """SELECT -name, -SUM(quantity * list_price * (1 - discount)) AS total, -YEAR(order_date) as order_year -FROM -`orders` o -INNER JOIN `order_items` i ON i.order_id = o.order_id -INNER JOIN `staffs` s ON s.staff_id = o.staff_id -GROUP BY -name, -year(order_date)""" - table_list = MetadataSQLSQLParser(sql_query).get_tables() - table_list.sort() - assert table_list == ["order_items", "orders", "staffs"] - - def test_sqllineage_sql_parser_tables_from_redash_query(): sql_query = """SELECT name,