-
Notifications
You must be signed in to change notification settings - Fork 2.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat(ingest/snowflake): tables from snowflake shares as siblings #8531
Changes from all commits
47bb20d
752b2dd
daf87ef
8018e5a
f704d9c
69e3792
4ce37e5
b33cafd
ea1ba40
f19d1ed
89a06b1
4f6035a
6d4be65
cfd3924
78eba30
d1bc494
1a77245
c55e37a
afbe095
02585c4
b28ebb1
6cde852
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,158 @@ | ||
import logging | ||
from typing import Callable, Iterable, List | ||
|
||
from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance | ||
from datahub.emitter.mcp import MetadataChangeProposalWrapper | ||
from datahub.ingestion.api.workunit import MetadataWorkUnit | ||
from datahub.ingestion.source.snowflake.snowflake_config import ( | ||
DatabaseId, | ||
SnowflakeV2Config, | ||
) | ||
from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report | ||
from datahub.ingestion.source.snowflake.snowflake_schema import SnowflakeDatabase | ||
from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin | ||
from datahub.metadata.com.linkedin.pegasus2avro.common import Siblings | ||
from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( | ||
DatasetLineageType, | ||
Upstream, | ||
UpstreamLineage, | ||
) | ||
|
||
logger: logging.Logger = logging.getLogger(__name__) | ||
|
||
|
||
class SnowflakeSharesHandler(SnowflakeCommonMixin):
    """Emits metadata workunits for Snowflake databases that participate in
    inbound or outbound shares.

    For every table/view of a shared database this handler emits a Siblings
    aspect linking it to its counterpart(s) in the consumer/producer account;
    for inbound shares it additionally emits copy-type upstream lineage from
    the producer database.
    """

    def __init__(
        self,
        config: SnowflakeV2Config,
        report: SnowflakeV2Report,
        dataset_urn_builder: Callable[[str], str],
    ) -> None:
        self.config = config
        self.report = report
        self.logger = logger
        self.dataset_urn_builder = dataset_urn_builder

    def get_shares_workunits(
        self, databases: List[SnowflakeDatabase]
    ) -> Iterable[MetadataWorkUnit]:
        """Yield sibling (and, for inbound shares, lineage) workunits for all
        tables/views in the given databases that are part of a share.

        Also reports a warning for share-configured databases that were not
        among the ingested ``databases``.
        """
        inbounds = self.config.inbounds()
        outbounds = self.config.outbounds()
        # None of the databases are shared
        if not (inbounds or outbounds):
            return

        logger.debug("Checking databases for inbound or outbound shares.")
        for db in databases:
            is_inbound = db.name in inbounds
            is_outbound = db.name in outbounds

            if not (is_inbound or is_outbound):
                logger.debug(f"database {db.name} is not shared.")
                continue

            # An outbound database may be exposed through multiple shares
            # (several sibling databases); an inbound database always comes
            # from exactly one producer database.
            sibling_dbs = (
                list(outbounds[db.name]) if is_outbound else [inbounds[db.name]]
            )

            # This invariant is per-database, so verify it once here rather
            # than once per table inside the nested loops below.
            if is_inbound:
                assert len(sibling_dbs) == 1

            for schema in db.schemas:
                for table_name in schema.tables + schema.views:
                    # TODO: If this is outbound database,
                    # 1. attempt listing shares using `show shares` to identify name of share associated with this database (cache query result).
                    # 2. if corresponding share is listed, then run `show grants to share <share_name>` to identify exact tables, views included in share.
                    # 3. emit siblings only for the objects listed above.
                    # This will work only if the configured role has accountadmin role access OR is owner of share.
                    # Otherwise ghost nodes may be shown in "Composed Of" section for tables/views in original database which are not granted to share.
                    yield from self.gen_siblings(
                        db.name,
                        schema.name,
                        table_name,
                        is_outbound,
                        sibling_dbs,
                    )

                    if is_inbound:
                        # SnowflakeLineageExtractor is unaware of database->schema->table hierarchy
                        # hence this lineage code is not written in SnowflakeLineageExtractor
                        # also this is not governed by configs include_table_lineage and include_view_lineage
                        yield self.get_upstream_lineage_with_primary_sibling(
                            db.name, schema.name, table_name, sibling_dbs[0]
                        )

        self.report_missing_databases(
            databases, list(inbounds.keys()), list(outbounds.keys())
        )

    def report_missing_databases(
        self,
        databases: List[SnowflakeDatabase],
        inbounds: List[str],
        outbounds: List[str],
    ) -> None:
        """Warn about databases referenced in share config but not ingested.

        Siblings/lineage cannot be emitted for databases that were filtered
        out or are otherwise missing from the ingested set.
        """
        # Set for O(1) membership tests instead of scanning a list per lookup.
        db_names = {db.name for db in databases}
        missing_dbs = [db for db in inbounds + outbounds if db not in db_names]

        if missing_dbs:
            self.report_warning(
                "snowflake-shares",
                f"Databases {missing_dbs} were not ingested. Siblings/Lineage will not be set for these.",
            )

    def gen_siblings(
        self,
        database_name: str,
        schema_name: str,
        table_name: str,
        primary: bool,
        sibling_databases: List[DatabaseId],
    ) -> Iterable[MetadataWorkUnit]:
        """Yield a Siblings aspect linking this table/view to the same-named
        object in each sibling database.

        ``primary`` is True for the outbound (producer) side of the share.
        """
        if not sibling_databases:
            return
        dataset_identifier = self.get_dataset_identifier(
            table_name, schema_name, database_name
        )
        urn = self.dataset_urn_builder(dataset_identifier)

        sibling_urns = [
            make_dataset_urn_with_platform_instance(
                self.platform,
                self.get_dataset_identifier(
                    table_name, schema_name, sibling_db.database
                ),
                sibling_db.platform_instance,
            )
            for sibling_db in sibling_databases
        ]

        yield MetadataChangeProposalWrapper(
            entityUrn=urn,
            # Sorted for deterministic aspect content across runs.
            aspect=Siblings(primary=primary, siblings=sorted(sibling_urns)),
        ).as_workunit()

    def get_upstream_lineage_with_primary_sibling(
        self,
        database_name: str,
        schema_name: str,
        table_name: str,
        primary_sibling_db: DatabaseId,
    ) -> MetadataWorkUnit:
        """Return a COPY-type UpstreamLineage workunit pointing from this
        inbound-share table/view to its producer-side counterpart.
        """
        dataset_identifier = self.get_dataset_identifier(
            table_name, schema_name, database_name
        )
        urn = self.dataset_urn_builder(dataset_identifier)

        upstream_urn = make_dataset_urn_with_platform_instance(
            self.platform,
            self.get_dataset_identifier(
                table_name, schema_name, primary_sibling_db.database
            ),
            primary_sibling_db.platform_instance,
        )

        return MetadataChangeProposalWrapper(
            entityUrn=urn,
            aspect=UpstreamLineage(
                upstreams=[Upstream(dataset=upstream_urn, type=DatasetLineageType.COPY)]
            ),
        ).as_workunit()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Generally we raise ValueError in validators, not use assertions. Do you want to change that convention? For now at least, can you change to stay consistent?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I believe we allow ValueError, AssertionError, TypeError as a convention, as also mentioned here - https://datahubproject.io/docs/metadata-ingestion/developing/#coding
Sometimes asserts are more readable /briefer so I'd prefer them. In this case, I'm okay to change.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh nice. I like the assert syntax more, I think we're just hesitant because you can disable assertions with a certain flag. I don't feel strongly here, up to you