From 1a09cb2c2a37fa20e0dffda819f131b49f13c685 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 23 Aug 2024 16:42:32 -0400 Subject: [PATCH] fix(ingest/snowflake): propagate table list from main to query extractor (#11222) --- .../glossary/classification_mixin.py | 2 +- .../source/snowflake/snowflake_lineage_v2.py | 2 +- .../source/snowflake/snowflake_queries.py | 20 ++++++++++++++++--- .../source/snowflake/snowflake_v2.py | 1 + 4 files changed, 20 insertions(+), 5 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/glossary/classification_mixin.py b/metadata-ingestion/src/datahub/ingestion/glossary/classification_mixin.py index 3e5eb4347b474..1d381acbf3dbe 100644 --- a/metadata-ingestion/src/datahub/ingestion/glossary/classification_mixin.py +++ b/metadata-ingestion/src/datahub/ingestion/glossary/classification_mixin.py @@ -48,7 +48,7 @@ class ClassificationReportMixin: class ClassificationSourceConfigMixin(ConfigModel): classification: ClassificationConfig = Field( default=ClassificationConfig(), - description="For details, refer [Classification](../../../../metadata-ingestion/docs/dev_guides/classification.md).", + description="For details, refer to [Classification](../../../../metadata-ingestion/docs/dev_guides/classification.md).", ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index 151e9fb631620..6f9c9259b2784 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -353,7 +353,7 @@ def _populate_external_lineage_from_copy_history( def _process_external_lineage_result_row( cls, db_row: dict, - discovered_tables: Optional[List[str]], + discovered_tables: Optional[Collection[str]], identifiers: SnowflakeIdentifierBuilder, ) -> Optional[KnownLineageMapping]: # key is the down-stream table name diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index d5b8f98e40075..d96d389a4246a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -131,7 +131,7 @@ def __init__( self.report = SnowflakeQueriesExtractorReport() self.filters = filters self.identifiers = identifiers - self.discovered_tables = discovered_tables + self.discovered_tables = set(discovered_tables) if discovered_tables else None self._structured_report = structured_report @@ -175,10 +175,24 @@ def local_temp_path(self) -> pathlib.Path: return path def is_temp_table(self, name: str) -> bool: - return any( + if any( re.match(pattern, name, flags=re.IGNORECASE) for pattern in self.config.temporary_tables_pattern - ) + ): + return True + + # This is also a temp table if + # 1. this name would be allowed by the dataset patterns, and + # 2. we have a list of discovered tables, and + # 3. it's not in the discovered tables list + if ( + self.filters.is_dataset_pattern_allowed(name, SnowflakeObjectDomain.TABLE) + and self.discovered_tables + and name not in self.discovered_tables + ): + return True + + return False def is_allowed_table(self, name: str) -> bool: if self.discovered_tables and name not in self.discovered_tables: diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 1881e1da5be68..0c861b1334d9f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -526,6 +526,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: filters=self.filters, identifiers=self.identifiers, schema_resolver=schema_resolver, + discovered_tables=discovered_datasets, ) # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs