
feat(ingest/snowflake): tables from snowflake shares as siblings #8531

Merged

Commits (22):
47bb20d feat(ingest/snowflake): tables from snowflake shares as siblings (mayurinehate, Jul 26, 2023)
752b2dd add unit tests, docs, logging (mayurinehate, Jul 27, 2023)
daf87ef remove node itself from sibling, more TODO (mayurinehate, Jul 27, 2023)
8018e5a update tests, comments (mayurinehate, Jul 31, 2023)
f704d9c Merge branch 'master' into snowflake_shares_lineage_siblings (mayurinehate, Jul 31, 2023)
69e3792 revert push down filtering changes near query (mayurinehate, Aug 1, 2023)
4ce37e5 change config structure to allow same shares config across accounts (mayurinehate, Aug 1, 2023)
b33cafd fix (mayurinehate, Aug 1, 2023)
ea1ba40 Merge branch 'master' into snowflake_shares_lineage_siblings (mayurinehate, Aug 1, 2023)
f19d1ed fix indent (mayurinehate, Aug 1, 2023)
89a06b1 Merge branch 'master' into snowflake_shares_lineage_siblings (mayurinehate, Aug 4, 2023)
4f6035a Merge branch 'master' into snowflake_shares_lineage_siblings (mayurinehate, Aug 14, 2023)
6d4be65 Merge branch 'master' into snowflake_shares_lineage_siblings (mayurinehate, Aug 22, 2023)
cfd3924 Merge branch 'master' into snowflake_shares_lineage_siblings (mayurinehate, Aug 23, 2023)
78eba30 update doc content, refractor to avoid assers (mayurinehate, Aug 23, 2023)
d1bc494 Merge branch 'master' into snowflake_shares_lineage_siblings (mayurinehate, Aug 23, 2023)
1a77245 simplification refractor (mayurinehate, Aug 23, 2023)
c55e37a Merge remote-tracking branch 'refs/remotes/origin/snowflake_shares_li… (mayurinehate, Aug 23, 2023)
afbe095 Update snowflake_pre.md (mayurinehate, Aug 23, 2023)
02585c4 remove no-op (mayurinehate, Aug 23, 2023)
b28ebb1 Merge remote-tracking branch 'refs/remotes/origin/snowflake_shares_li… (mayurinehate, Aug 23, 2023)
6cde852 Merge branch 'master' into snowflake_shares_lineage_siblings (mayurinehate, Aug 23, 2023)
35 changes: 35 additions & 0 deletions metadata-ingestion/docs/sources/snowflake/snowflake_pre.md

@@ -99,6 +99,41 @@ The steps slightly differ based on which you decide to use.
including `client_id` and `client_secret`, plus your Okta user's `Username` and `Password`
* Note: the `username` and `password` config options are not nested under `oauth_config`

### Snowflake Shares
If you are using [Snowflake Shares](https://docs.snowflake.com/en/user-guide/data-sharing-provider) to share data across different snowflake accounts, and you have set up DataHub recipes for ingesting metadata from all these accounts, you may end up with multiple similar dataset entities corresponding to virtual versions of the same table in different snowflake accounts. The DataHub Snowflake connector can automatically link such tables together through Siblings and Lineage relationships, if you provide the information necessary to establish the relationship using the `shares` configuration in the recipe.

#### Example
- Snowflake account `account1` (ingested as platform_instance `instance1`) owns a database `db1`. A share `X` is created in `account1` that includes database `db1` along with schemas and tables inside it.
- Now, `X` is shared with snowflake account `account2` (ingested as platform_instance `instance2`). A database `db1_from_X` is created from inbound share `X` in `account2`.
- In this case, all tables and views included in share `X` will also be present in `instance2`.`db1_from_X`. You would need the following configurations in the snowflake recipes to set up the Siblings and Lineage relationships correctly.
- In the snowflake recipe of `account1`:

```yaml
account_id: account1
platform_instance: instance1
shares:
  X:
    platform_instance: instance1
    database_name: db1
    consumers:
      - platform_instance: instance2 # this is a list, as db1 can be shared with multiple snowflake accounts using X
        database_name: db1_from_X
```
- In the snowflake recipe of `account2`:

```yaml
account_id: account2
platform_instance: instance2
shares:
  X:
    platform_instance: instance1
    database_name: db1
    consumers:
      - platform_instance: instance2 # this is a list, as db1 can be shared with multiple snowflake accounts using X
        database_name: db1_from_X
```

Collaborator: If you have a lot of snowflake recipes, I could see how this could get tiresome to set up for every ingestion pipeline. Thoughts on having a config that could be the same for every recipe, e.g.

    shares:
        X:
            database: db1
            platform_instance: instance1
            outbounds:
                - database: db1_from_X
                  platform_instance: instance2

Collaborator Author (mayurinehate), Aug 1, 2023: This has crossed my mind too, and I'd really like "having a config that could be the same for every recipe". The only downside is that we require additional unused information - e.g. the share name "X" - and it is possible that some of the shares config will not be relevant for some account recipes, making validation and probable errors hard to find. I feel the ease of using the same shares config outweighs the downsides, so let me think more on this and update.

Collaborator Author (mayurinehate): I've updated the config to use a similar structure to your example, except using the term consumers instead of outbounds. Outbound is relative and can be a confusing term. Consumer has a specific meaning for snowflake shares and is hence unambiguous.

Collaborator: If we don't want to have them specify the share name, we can also do:

      shares:
        - platform_instance: instance1
          database_name: db1
          consumers:
            - platform_instance: instance2 # this is a list, as db1 can be shared with multiple snowflake accounts using X
              database_name: db1_from_X
        - platform_instance: ...

but maybe that's more confusing.

Collaborator Author (mayurinehate): yeah, I like the one with the share name better - more precise and readable. Also, shares do have unique names across accounts. This change primarily makes the shares configuration absolute and exhaustive for all accounts, so the configuration need not be thought of with respect to the particular account/recipe.

- If share `X` is shared with more snowflake accounts and databases are created from share `X` in those accounts, then additional entries need to be added to the `consumers` list for share `X`, one per snowflake account, as in the sketch below.
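
For instance, if share `X` were also consumed by a hypothetical third account (ingested as platform_instance `instance3`, with database `db1_from_X_3` created from the share), the entry would become:

```yaml
shares:
  X:
    platform_instance: instance1
    database_name: db1
    consumers:
      - platform_instance: instance2
        database_name: db1_from_X
      - platform_instance: instance3 # hypothetical third consumer of share X
        database_name: db1_from_X_3
```
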
### Caveats

- Some of the features are only available in the Snowflake Enterprise Edition. This doc has notes mentioning where this applies.
metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py
@@ -1,10 +1,11 @@
import logging
from dataclasses import dataclass
from enum import Enum
-from typing import Dict, List, Optional, cast
+from typing import Dict, List, Optional, Set, cast

from pydantic import Field, SecretStr, root_validator, validator

-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, ConfigModel
from datahub.configuration.pattern_utils import UUID_REGEX
from datahub.configuration.validate_field_removal import pydantic_removed_field
from datahub.configuration.validate_field_rename import pydantic_renamed_field
@@ -42,6 +43,27 @@ class TagOption(str, Enum):
    skip = "skip"


@dataclass(frozen=True)
class DatabaseId:
    database: str = Field(
        description="Database created from share in consumer account."
    )
    platform_instance: str = Field(
        description="Platform instance of consumer snowflake account."
    )


class SnowflakeShareConfig(ConfigModel):
    database: str = Field(description="Database from which share is created.")
    platform_instance: str = Field(
        description="Platform instance for snowflake account in which share is created."
    )

    consumers: Set[DatabaseId] = Field(
        description="List of databases created in consumer accounts."
    )
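
A side note on the shape above: `DatabaseId` must be hashable for `Set[DatabaseId]` to work, which is presumably why it is a frozen dataclass. A minimal standalone sketch of that behavior (plain pydantic v1 style, hypothetical names, not the actual DataHub config machinery):

```python
from dataclasses import dataclass
from typing import Set

from pydantic import BaseModel


@dataclass(frozen=True)
class Db:
    database: str
    platform_instance: str


class Share(BaseModel):
    consumers: Set[Db]


# Duplicate consumer entries collapse, because frozen dataclasses
# are hashable and compare equal field-by-field.
share = Share.parse_obj(
    {
        "consumers": [
            {"database": "db1_from_X", "platform_instance": "instance2"},
            {"database": "db1_from_X", "platform_instance": "instance2"},
        ]
    }
)
assert len(share.consumers) == 1
```
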


class SnowflakeV2Config(
    SnowflakeConfig,
    SnowflakeUsageConfig,

@@ -120,6 +142,13 @@ class SnowflakeV2Config(
        "upstreams_deny_pattern", "temporary_tables_pattern"
    )

    shares: Optional[Dict[str, SnowflakeShareConfig]] = Field(
        default=None,
        description="Required if current account owns or consumes snowflake share."
        " If specified, connector creates lineage and siblings relationship between current account's database tables and consumer/producer account's database tables."
        " Map of share name -> details of share.",
    )

    email_as_user_identifier: bool = Field(
        default=True,
        description="Format user urns as an email, if the snowflake user's email is set. If `email_domain` is provided, generates email addresses for snowflake users with unset emails, based on their username.",

@@ -197,3 +226,41 @@ def get_sql_alchemy_url(
    @property
    def parse_view_ddl(self) -> bool:
        return self.include_view_column_lineage

Collaborator: Generally we raise ValueError in validators, not use assertions. Do you want to change that convention? For now at least, can you change to stay consistent?

Collaborator Author (mayurinehate): I believe we allow ValueError, AssertionError, and TypeError as a convention, as also mentioned here - https://datahubproject.io/docs/metadata-ingestion/developing/#coding. Sometimes asserts are more readable/briefer, so I'd prefer them. In this case, I'm okay to change.

Collaborator: Oh nice. I like the assert syntax more; I think we're just hesitant because you can disable assertions with a certain flag. I don't feel strongly here, up to you.

    @validator("shares")
    def validate_shares(
        cls, shares: Optional[Dict[str, SnowflakeShareConfig]], values: Dict
    ) -> Optional[Dict[str, SnowflakeShareConfig]]:
        current_platform_instance = values.get("platform_instance")

        # Check: platform_instance should be present
        if shares:
            assert current_platform_instance is not None, (
                "Did you forget to set `platform_instance` for the current ingestion?"
                " It is advisable to use `platform_instance` when ingesting from multiple snowflake accounts."
            )

            databases_included_in_share: List[DatabaseId] = []
            databases_created_from_share: List[DatabaseId] = []

            for _, share_details in shares.items():
                shared_db = DatabaseId(
                    share_details.database, share_details.platform_instance
                )
                assert all(
                    consumer.platform_instance != share_details.platform_instance
                    for consumer in share_details.consumers
                ), "Share's platform_instance can not be same as consumer's platform instance. Self-sharing not supported in Snowflake."

                databases_included_in_share.append(shared_db)
                databases_created_from_share.extend(share_details.consumers)

            for db_from_share in databases_created_from_share:
                assert (
                    db_from_share not in databases_included_in_share
                ), "Database included in a share can not be present as consumer in any share."
                assert (
                    databases_created_from_share.count(db_from_share) == 1
                ), "Same database can not be present as consumer in more than one share."

        return shares

Collaborator (suggested change on the loop header): for _, share_details in shares.items(): → for share_details in shares.values():
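
As a side note, here is a minimal sketch of the ValueError-based variant discussed in the thread above (a hypothetical rewrite of the first check only; the code at this commit uses assertions):

```python
from typing import Any, Dict, Optional


def check_platform_instance(shares: Optional[Dict[str, Any]], values: Dict) -> None:
    # Hypothetical ValueError-style equivalent of the assert above;
    # not part of this PR as of this commit.
    if shares and values.get("platform_instance") is None:
        raise ValueError(
            "Did you forget to set `platform_instance` for the current ingestion? "
            "It is advisable to use `platform_instance` when ingesting from "
            "multiple snowflake accounts."
        )
```
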
metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py

@@ -261,6 +261,7 @@ def get_tables_for_database(
        for table in cur:
            if table["TABLE_SCHEMA"] not in tables:
                tables[table["TABLE_SCHEMA"]] = []

            tables[table["TABLE_SCHEMA"]].append(
                SnowflakeTable(
                    name=table["TABLE_NAME"],
metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py (new file)
@@ -0,0 +1,229 @@
import logging
from dataclasses import dataclass
from typing import Callable, Dict, Iterable, List, Optional

from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.snowflake.snowflake_config import (
    DatabaseId,
    SnowflakeShareConfig,
    SnowflakeV2Config,
)
from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report
from datahub.ingestion.source.snowflake.snowflake_schema import SnowflakeDatabase
from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin
from datahub.metadata.com.linkedin.pegasus2avro.common import Siblings
from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
    DatasetLineageType,
    Upstream,
    UpstreamLineage,
)

logger: logging.Logger = logging.getLogger(__name__)


@dataclass
class SharedDatabase:
    """
    Represents a shared database from the current platform instance.
    This is either created from an inbound share or included in an outbound share.
    """

    name: str
    created_from_share: bool

    # This will have exactly one entry if created_from_share = True
    shares: List[str]


Collaborator (on the docstring): Can you mention that this relies on the invariant that a snowflake database can't both be in a share and the consumer of a share?

Collaborator Author (mayurinehate): sure

Collaborator (on created_from_share): The name created_from_share is confusing to me, because I feel like all of these are created from "shares" lol. Could we just call it primary or inbound or something similar? Although inbound also doesn't really make sense to me... I think of it more as is_share_source.

Collaborator Author (mayurinehate), Aug 23, 2023: Well, technically some databases are created from a share while others are used to create a share, i.e. included in a share. I am okay with using "primary/is_share_source" = not (created_from_share/secondary).
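To make the invariant mentioned above concrete, here is a hedged sketch of a `shares` configuration that `validate_shares` should reject, since `db1_from_X` would appear both as a consumer of share `X` and as the source database of a hypothetical share `Y` (share `Y` and its databases are invented for illustration):

```yaml
shares:
  X:
    platform_instance: instance1
    database_name: db1
    consumers:
      - platform_instance: instance2
        database_name: db1_from_X
  Y: # invalid: chains a consumed database into another share
    platform_instance: instance2
    database_name: db1_from_X
    consumers:
      - platform_instance: instance3
        database_name: db1_from_X_again
```
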
class SnowflakeSharesHandler(SnowflakeCommonMixin):
    def __init__(
        self,
        config: SnowflakeV2Config,
        report: SnowflakeV2Report,
        dataset_urn_builder: Callable[[str], str],
    ) -> None:
        self.config = config
        self.report = report
        self.logger = logger
        self.dataset_urn_builder = dataset_urn_builder

Collaborator: Overall I thought the logic in this method, and in the users of the shared_databases dict it returns, was confusing: we basically have two different cases for inbound and outbound share info. I think this could be clearer if we separated the two. I also think this logic could be simplified if we did some preprocessing first. What do you think about, in the config file:

    class SnowflakeShareConfig(ConfigModel):
        # Add to this class
        @property
        def inbound(self) -> DatabaseId:
            return DatabaseId(share.database, share.platform_instance)

    # For below, add to SnowflakeV2Config
    @lru_cache(maxsize=1)  # would love to use @cached_property
    def inbound_to_consumers(self) -> Dict[DatabaseId, Set[DatabaseId]]:
        d = defaultdict(set)
        for share in self.shares:
            d[share.inbound].update(share.consumers)
        return d

    @lru_cache(maxsize=1)
    def outbound_to_inbound(self) -> Dict[DatabaseId, DatabaseId]:
        d = {}
        for share in self.shares:
            for outbound in share.consumers:
                d[outbound] = share.inbound

Could definitely have better naming, but once you have these, I think you can get rid of _get_shared_databases and do something like:

    key = DatabaseId(db, self.platform_instance)
    is_inbound = key in self.inbound_to_consumers
    is_outbound = key in self.outbound_to_inbound
    if not is_inbound and not is_outbound:
        continue
    sibling_databases = inbound_to_consumers[key] if is_inbound else [self.outbound_to_inbound[key]]

report_missing_databases will be a bit more complicated, but I don't think that's a dealbreaker.

Collaborator Author (mayurinehate): I see, let me check.

    def _get_shared_databases(
        self, shares: Dict[str, SnowflakeShareConfig], platform_instance: Optional[str]
    ) -> Dict[str, SharedDatabase]:
        # this is ensured in config validators
        assert platform_instance is not None

        shared_databases: Dict[str, SharedDatabase] = {}

        for share_name, share_details in shares.items():
            if share_details.platform_instance == platform_instance:
                if share_details.database not in shared_databases:
                    shared_databases[share_details.database] = SharedDatabase(
                        name=share_details.database,
                        created_from_share=False,
                        shares=[share_name],
                    )
                else:
                    shared_databases[share_details.database].shares.append(share_name)
            else:
                for consumer in share_details.consumers:
                    if consumer.platform_instance == platform_instance:
                        shared_databases[consumer.database] = SharedDatabase(
                            name=share_details.database,
                            created_from_share=True,
                            shares=[share_name],
                        )
                        break
                else:
                    self.report_warning(
                        f"Skipping Share, as it does not include current platform instance {platform_instance}",
                        share_name,
                    )

        return shared_databases

Collaborator (on the shares.append branch): Can the same platform instance and database really appear as inbound in shares multiple times?

Collaborator Author (mayurinehate): a corner case but yes.
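
For the docs example earlier (share `X`: `db1` in `instance1`, consumed as `db1_from_X` in `instance2`), the mapping this method returns differs per side. An illustrative sketch, following the code at this commit (it uses the `SharedDatabase` dataclass defined above):

```python
# Producer side (platform_instance="instance1"), keyed by the local db name:
producer_view = {
    "db1": SharedDatabase(name="db1", created_from_share=False, shares=["X"]),
}

# Consumer side (platform_instance="instance2"): note that, per the code at
# this commit, `name` holds the producer database name while the dict key is
# the local (consumer) database name:
consumer_view = {
    "db1_from_X": SharedDatabase(name="db1", created_from_share=True, shares=["X"]),
}
```
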

    def get_shares_workunits(
        self, databases: List[SnowflakeDatabase]
    ) -> Iterable[MetadataWorkUnit]:
        shared_databases = self._get_shared_databases(
            self.config.shares or {}, self.config.platform_instance
        )

        # None of the databases are shared
        if not shared_databases:
            return

        logger.debug("Checking databases for inbound or outbound shares.")
        for db in databases:
            if db.name not in shared_databases:
                logger.debug(f"database {db.name} is not shared.")
                continue

            sibling_dbs = self.get_sibling_databases(shared_databases[db.name])

            for schema in db.schemas:
                for table_name in schema.tables + schema.views:
                    # TODO: If this is an outbound database,
                    # 1. attempt listing shares using `show shares` to identify the name of the share associated with this database (cache query result).
                    # 2. if the corresponding share is listed, then run `show grants to share <share_name>` to identify the exact tables and views included in the share.
                    # 3. emit siblings only for the objects listed above.
                    # This will work only if the configured role has accountadmin role access OR is owner of the share.
                    # Otherwise ghost nodes may be shown in the "Composed Of" section for tables/views in the original database which are not granted to the share.
                    yield from self.gen_siblings(
                        db.name,
                        schema.name,
                        table_name,
                        not shared_databases[db.name].created_from_share,
                        sibling_dbs,
                    )

                    if shared_databases[db.name].created_from_share:
                        assert len(sibling_dbs) == 1
                        # SnowflakeLineageExtractor is unaware of the database->schema->table hierarchy,
                        # hence this lineage code is not written in SnowflakeLineageExtractor.
                        # Also, this is not governed by the configs include_table_lineage and include_view_lineage.
                        yield self.get_upstream_lineage_with_primary_sibling(
                            db.name, schema.name, table_name, sibling_dbs[0]
                        )

        self.report_missing_databases(databases, shared_databases)

Collaborator (on self.config.platform_instance above): self.config.shares can't be null here, right? (Otherwise the assert in _get_shared_databases could fail.) Perhaps instead of passing self.config into SnowflakeSharesHandler, we can pass the non-optional self.config.shares and self.config.platform_instance so we don't have to put any assertions in this class.

Collaborator Author (mayurinehate): self.config is also used in other places in SnowflakeCommonMixin - primarily in deciding whether to lowercase the urn - hence keeping self.config and refactoring a bit to avoid asserts.

Collaborator Author (mayurinehate), on the TODO above: It is not advisable to use a role with "accountadmin" access, hence this is not done. Also, this PR takes care to hide ghost nodes in the siblings relation, so this is not required.
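
To make the sibling emission concrete for the docs example, a small hedged sketch of the producer-side sibling urn (schema and table names invented; urn construction follows make_dataset_urn_with_platform_instance as used in gen_siblings below):

```python
from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance

# Hypothetical: the sibling urn emitted for instance1.db1.schema1.table1,
# pointing at the consumer-side copy in instance2. Lowercased identifiers
# assumed, per the connector's default urn handling.
sibling_urn = make_dataset_urn_with_platform_instance(
    platform="snowflake",
    name="db1_from_x.schema1.table1",
    platform_instance="instance2",
)
print(sibling_urn)
# urn:li:dataset:(urn:li:dataPlatform:snowflake,instance2.db1_from_x.schema1.table1,PROD)
```
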

    def get_sibling_databases(self, db: SharedDatabase) -> List[DatabaseId]:
        assert self.config.shares is not None
        sibling_dbs: List[DatabaseId] = []
        if db.created_from_share:
            share_details = self.config.shares[db.shares[0]]
            logger.debug(
                f"database {db.name} is created from inbound share {db.shares[0]}."
            )
            sibling_dbs = [
                DatabaseId(share_details.database, share_details.platform_instance)
            ]
        else:  # not created from share, but is in fact included in share
            logger.debug(
                f"database {db.name} is included as outbound share(s) {db.shares}."
            )
            sibling_dbs = [
                consumer
                for share_name in db.shares
                for consumer in self.config.shares[share_name].consumers
            ]

        return sibling_dbs

    def report_missing_databases(
        self,
        databases: List[SnowflakeDatabase],
        shared_databases: Dict[str, SharedDatabase],
    ) -> None:
        db_names = [db.name for db in databases]
        missing_dbs = [db for db in shared_databases.keys() if db not in db_names]

        if missing_dbs:
            self.report_warning(
                "snowflake-shares",
                f"Databases {missing_dbs} were not ingested. Siblings/Lineage will not be set for these.",
            )

    def gen_siblings(
        self,
        database_name: str,
        schema_name: str,
        table_name: str,
        primary: bool,
        sibling_databases: List[DatabaseId],
    ) -> Iterable[MetadataWorkUnit]:
        if not sibling_databases:
            return
        dataset_identifier = self.get_dataset_identifier(
            table_name, schema_name, database_name
        )
        urn = self.dataset_urn_builder(dataset_identifier)

        sibling_urns = [
            make_dataset_urn_with_platform_instance(
                self.platform,
                self.get_dataset_identifier(
                    table_name, schema_name, sibling_db.database
                ),
                sibling_db.platform_instance,
            )
            for sibling_db in sibling_databases
        ]

        yield MetadataChangeProposalWrapper(
            entityUrn=urn,
            aspect=Siblings(primary=primary, siblings=sorted(sibling_urns)),
        ).as_workunit()

    def get_upstream_lineage_with_primary_sibling(
        self,
        database_name: str,
        schema_name: str,
        table_name: str,
        primary_sibling_db: DatabaseId,
    ) -> MetadataWorkUnit:
        dataset_identifier = self.get_dataset_identifier(
            table_name, schema_name, database_name
        )
        urn = self.dataset_urn_builder(dataset_identifier)

        upstream_urn = make_dataset_urn_with_platform_instance(
            self.platform,
            self.get_dataset_identifier(
                table_name, schema_name, primary_sibling_db.database
            ),
            primary_sibling_db.platform_instance,
        )

        return MetadataChangeProposalWrapper(
            entityUrn=urn,
            aspect=UpstreamLineage(
                upstreams=[Upstream(dataset=upstream_urn, type=DatasetLineageType.COPY)]
            ),
        ).as_workunit()