feat(ingest/snowflake): initialize schema resolver from datahub for l… #8903
Changes from all commits
@@ -301,14 +301,11 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config):
         # Caches tables for a single database. Consider moving to disk or S3 when possible.
         self.db_tables: Dict[str, List[SnowflakeTable]] = {}

-        self.sql_parser_schema_resolver = SchemaResolver(
-            platform=self.platform,
-            platform_instance=self.config.platform_instance,
-            env=self.config.env,
-        )
         self.view_definitions: FileBackedDict[str] = FileBackedDict()
         self.add_config_to_report()
+
+        self.sql_parser_schema_resolver = self._init_schema_resolver()

     @classmethod
     def create(cls, config_dict: dict, ctx: PipelineContext) -> "Source":
         config = SnowflakeV2Config.parse_obj(config_dict)
@@ -493,6 +490,24 @@ def query(query):

         return _report

+    def _init_schema_resolver(self) -> SchemaResolver:
+        if not self.config.include_technical_schema and self.config.parse_view_ddl:
+            if self.ctx.graph:
+                return self.ctx.graph.initialize_schema_resolver_from_datahub(
+                    platform=self.platform,
+                    platform_instance=self.config.platform_instance,
+                    env=self.config.env,
+                )
+            else:
+                logger.warning(
+                    "Failed to load schema info from DataHub as DataHubGraph is missing.",
+                )
+        return SchemaResolver(
+            platform=self.platform,
+            platform_instance=self.config.platform_instance,
+            env=self.config.env,
+        )
+
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [
             *super().get_workunit_processors(),
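For orientation, a rough sketch of what a graph-backed initializer along the lines of initialize_schema_resolver_from_datahub could do: pre-populate a SchemaResolver with the schemas of datasets already ingested into DataHub, so that view DDL parsing does not depend on re-extracting technical schema from Snowflake. This is an illustration only, not the actual DataHubGraph implementation; the filtering and aspect-handling details are assumptions.

from datahub.metadata.schema_classes import SchemaMetadataClass
from datahub.utilities.sqlglot_lineage import SchemaResolver

def initialize_schema_resolver_sketch(graph, platform: str, platform_instance, env: str) -> SchemaResolver:
    # Start from an empty resolver scoped to the same platform/instance/env.
    resolver = SchemaResolver(
        platform=platform, platform_instance=platform_instance, env=env
    )
    # Walk datasets already known to DataHub for this platform/env and register
    # their schemaMetadata aspects with the resolver. get_urns_by_filter and
    # get_aspect are existing DataHubGraph helpers, but the exact filters used
    # by the real method are an assumption here.
    for urn in graph.get_urns_by_filter(platform=platform, env=env):
        schema_metadata = graph.get_aspect(urn, SchemaMetadataClass)
        if schema_metadata is not None:
            resolver.add_schema_metadata(urn, schema_metadata)
    return resolver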
@@ -764,7 +779,7 @@ def _process_schema(
             )
             self.db_tables[schema_name] = tables

-            if self.config.include_technical_schema or self.config.parse_view_ddl:
+            if self.config.include_technical_schema:
                 for table in tables:
                     yield from self._process_table(table, schema_name, db_name)

@@ -776,7 +791,7 @@ def _process_schema(
                 if view.view_definition:
                     self.view_definitions[key] = view.view_definition

-            if self.config.include_technical_schema or self.config.parse_view_ddl:
+            if self.config.include_technical_schema:
                 for view in views:
                     yield from self._process_view(view, schema_name, db_name)

@@ -892,8 +907,6 @@ def _process_table(
                 yield from self._process_tag(tag)

             yield from self.gen_dataset_workunits(table, schema_name, db_name)
-        elif self.config.parse_view_ddl:
-            self.gen_schema_metadata(table, schema_name, db_name)

     def fetch_sample_data_for_classification(
         self, table: SnowflakeTable, schema_name: str, db_name: str, dataset_name: str
@@ -1004,8 +1017,6 @@ def _process_view(
                 yield from self._process_tag(tag)

             yield from self.gen_dataset_workunits(view, schema_name, db_name)
-        elif self.config.parse_view_ddl:
-            self.gen_schema_metadata(view, schema_name, db_name)

     def _process_tag(self, tag: SnowflakeTag) -> Iterable[MetadataWorkUnit]:
         tag_identifier = tag.identifier()

Review thread on the removed parse_view_ddl branches:

Reviewer: these changes confuse me a bit - are we dropping ...

Author: These conditions were part of the schema extraction process, so that even if schema ingestion is disabled but parse_view_ddl is enabled, the code to fetch and generate schema metadata from Snowflake would still run, in order for that schema to be used during view definition CLL extraction. The actual view definition parsing logic is in snowflake_lineage_v2.py, and the only thing it requires is that view_definitions is populated. That code is still intact.

Reviewer: ok, that broadly makes sense.
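To make the explanation above concrete, a minimal sketch of how the populated view_definitions can drive column-level lineage extraction. The function and variable names are illustrative, not the actual code in snowflake_lineage_v2.py; sqlglot_lineage is DataHub's SQL parsing entry point, and the resolver supplies upstream column information.

from typing import Dict, Iterable, Tuple
from datahub.utilities.sqlglot_lineage import SchemaResolver, SqlParsingResult, sqlglot_lineage

def parse_view_definitions(
    view_definitions: Dict[str, str],
    schema_resolver: SchemaResolver,
    default_db: str,
) -> Iterable[Tuple[str, SqlParsingResult]]:
    # Each stored view DDL is parsed against the schema resolver; the resolver
    # provides column names for upstream tables so the parser can emit
    # column-level lineage rather than only table-level edges.
    for view_identifier, ddl in view_definitions.items():
        result = sqlglot_lineage(ddl, schema_resolver=schema_resolver, default_db=default_db)
        yield view_identifier, result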
Review thread on initialize_schema_resolver_from_datahub:

Reviewer: instead of initialize_schema_resolver_from_datahub returning Tuple[SchemaResolver, set[urns]], can we change it so that it only returns a SchemaResolver, and in turn SchemaResolver has a method to get the set of loaded urns?

Author: makes sense. What would we call this method in SchemaResolver? get_urns?
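A sketch of the shape the reviewer is proposing (hypothetical, not what the PR implements as-is): the graph method returns only a SchemaResolver, and callers that need the set of already-loaded urns ask the resolver for them, for example via a get_urns method.

from typing import Dict, Set

class SchemaResolverProposalSketch:
    """Illustrative stand-in for SchemaResolver; _schema_cache is an assumed internal."""

    def __init__(self) -> None:
        self._schema_cache: Dict[str, dict] = {}  # urn -> registered schema info

    def get_urns(self) -> Set[str]:
        # get_urns is the method name floated in the thread above.
        return set(self._schema_cache.keys())

# Caller side after the proposed change (hypothetical):
#   schema_resolver = graph.initialize_schema_resolver_from_datahub(...)
#   discovered_urns = schema_resolver.get_urns()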