From ea1273281e3a65ab4d94d002ee19f91907a3eb84 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 30 Oct 2023 20:57:59 -0700 Subject: [PATCH] feat(ingest): sql parser perf + asyncio fixes (#9119) --- metadata-ingestion/setup.py | 2 +- .../src/datahub/cli/docker_cli.py | 5 ++ .../src/datahub/upgrade/upgrade.py | 12 ++--- .../src/datahub/utilities/sqlglot_lineage.py | 5 +- .../goldens/test_select_from_union.json | 2 +- .../test_teradata_strange_operators.json | 46 +++++++++++++++++++ .../unit/sql_parsing/test_sqlglot_lineage.py | 14 ++++++ 7 files changed, 73 insertions(+), 13 deletions(-) create mode 100644 metadata-ingestion/tests/unit/sql_parsing/goldens/test_teradata_strange_operators.json diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index b1c5510efd923..151842bd84d0a 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -108,7 +108,7 @@ sqlglot_lib = { # Using an Acryl fork of sqlglot. # https://github.com/tobymao/sqlglot/compare/main...hsheth2:sqlglot:hsheth?expand=1 - "acryl-sqlglot==18.5.2.dev45", + "acryl-sqlglot==18.17.1.dev16", } sql_common = ( diff --git a/metadata-ingestion/src/datahub/cli/docker_cli.py b/metadata-ingestion/src/datahub/cli/docker_cli.py index 4afccfe711e34..77e3285d359ef 100644 --- a/metadata-ingestion/src/datahub/cli/docker_cli.py +++ b/metadata-ingestion/src/datahub/cli/docker_cli.py @@ -5,6 +5,7 @@ import os import pathlib import platform +import signal import subprocess import sys import tempfile @@ -770,6 +771,10 @@ def quickstart( # noqa: C901 logger.debug("docker compose up still running, sending SIGKILL") up_process.kill() up_process.wait() + else: + # If the docker process got a keyboard interrupt, raise one here. + if up_process.returncode in {128 + signal.SIGINT, -signal.SIGINT}: + raise KeyboardInterrupt # Check docker health every few seconds. status = check_docker_quickstart() diff --git a/metadata-ingestion/src/datahub/upgrade/upgrade.py b/metadata-ingestion/src/datahub/upgrade/upgrade.py index 30f19b8b84f35..acc7954ad25a6 100644 --- a/metadata-ingestion/src/datahub/upgrade/upgrade.py +++ b/metadata-ingestion/src/datahub/upgrade/upgrade.py @@ -1,6 +1,5 @@ import asyncio import contextlib -import functools import logging import sys from datetime import datetime, timedelta, timezone @@ -374,17 +373,14 @@ def check_upgrade(func: Callable[..., T]) -> Callable[..., T]: @wraps(func) def async_wrapper(*args: Any, **kwargs: Any) -> Any: async def run_inner_func(): - loop = asyncio.get_event_loop() - return await loop.run_in_executor( - None, functools.partial(func, *args, **kwargs) - ) + return func(*args, **kwargs) async def run_func_check_upgrade(): version_stats_future = asyncio.ensure_future(retrieve_version_stats()) - the_one_future = asyncio.ensure_future(run_inner_func()) - ret = await the_one_future + main_func_future = asyncio.ensure_future(run_inner_func()) + ret = await main_func_future - # the one future has returned + # the main future has returned # we check the other futures quickly try: version_stats = await asyncio.wait_for(version_stats_future, 0.5) diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py index 388388f9f4b38..6413275ac63a6 100644 --- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py @@ -106,6 +106,7 @@ def get_query_type_of_sql(expression: sqlglot.exp.Expression) -> QueryType: sqlglot.exp.Update: QueryType.UPDATE, sqlglot.exp.Delete: QueryType.DELETE, sqlglot.exp.Merge: QueryType.MERGE, + sqlglot.exp.Subqueryable: QueryType.SELECT, # unions, etc. are also selects } for cls, query_type in mapping.items(): @@ -820,10 +821,8 @@ def _extract_select_from_update( ) # Update statements always implicitly have the updated table in context. - # TODO: Retain table name alias. + # TODO: Retain table name alias, if one was present. if select_statement.args.get("from"): - # select_statement = sqlglot.parse_one(select_statement.sql(dialect=dialect)) - select_statement = select_statement.join( statement.this, append=True, join_kind="cross" ) diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_union.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_union.json index 902aa010c8afc..5d1d421f49a2a 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_union.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_union.json @@ -1,5 +1,5 @@ { - "query_type": "UNKNOWN", + "query_type": "SELECT", "in_tables": [ "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf10.orders,PROD)", "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf100.orders,PROD)" diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_teradata_strange_operators.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_teradata_strange_operators.json new file mode 100644 index 0000000000000..4b21a2512ccd1 --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_teradata_strange_operators.json @@ -0,0 +1,46 @@ +{ + "query_type": "SELECT", + "in_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:teradata,dbc.table1,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:teradata,dbc.table2,PROD)" + ], + "out_tables": [], + "column_lineage": [ + { + "downstream": { + "table": null, + "column": "col1", + "column_type": null, + "native_column_type": null + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:teradata,dbc.table1,PROD)", + "column": "col1" + }, + { + "table": "urn:li:dataset:(urn:li:dataPlatform:teradata,dbc.table2,PROD)", + "column": "col1" + } + ] + }, + { + "downstream": { + "table": null, + "column": "col2", + "column_type": null, + "native_column_type": null + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:teradata,dbc.table1,PROD)", + "column": "col2" + }, + { + "table": "urn:li:dataset:(urn:li:dataPlatform:teradata,dbc.table2,PROD)", + "column": "col2" + } + ] + } + ] +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py index 5559ebe1756a6..3b9fa0d55f18d 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py @@ -675,6 +675,20 @@ def test_teradata_default_normalization(): ) +def test_teradata_strange_operators(): + assert_sql_result( + """ +select col1, col2 from dbc.table1 +where col1 eq 'value1' +minus +select col1, col2 from dbc.table2 +""", + dialect="teradata", + default_schema="dbc", + expected_file=RESOURCE_DIR / "test_teradata_strange_operators.json", + ) + + def test_snowflake_update_hardcoded(): assert_sql_result( """