From 68abf9c6a1f0ccb9ad144247805781587c40ceeb Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Wed, 23 Aug 2023 07:25:51 -0400 Subject: [PATCH 01/20] build(ingest): Bump pydantic pin (#8660) --- metadata-ingestion/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 4ff1d06bb8c22..62cb4f1abb8cf 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -454,7 +454,7 @@ def get_long_description(): "mypy==1.0.0", # pydantic 1.8.2 is incompatible with mypy 0.910. # See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910. - "pydantic>=1.9.0", + "pydantic>=1.10.0", *test_api_requirements, pytest_dep, "pytest-asyncio>=0.16.0", From 8141e2d64920f0511c531c493a3b61b5dc2ca026 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Wed, 23 Aug 2023 15:57:46 -0400 Subject: [PATCH 02/20] remove(ingest/snowflake): Remove legacy snowflake lineage (#8653) Co-authored-by: Tamas Nemeth Co-authored-by: Aseem Bansal --- .../source/snowflake/snowflake_config.py | 11 +- .../snowflake/snowflake_lineage_legacy.py | 664 ------------------ .../source/snowflake/snowflake_query.py | 29 - .../source/snowflake/snowflake_v2.py | 18 +- .../tests/integration/snowflake/common.py | 9 - .../integration/snowflake/test_snowflake.py | 2 - .../snowflake/test_snowflake_failures.py | 1 - .../test_snowflake_failures_legacy_lineage.py | 291 -------- .../test_snowflake_legacy_lineage.py | 207 ------ 9 files changed, 6 insertions(+), 1226 deletions(-) delete mode 100644 metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_legacy.py delete mode 100644 metadata-ingestion/tests/integration/snowflake/test_snowflake_failures_legacy_lineage.py delete mode 100644 metadata-ingestion/tests/integration/snowflake/test_snowflake_legacy_lineage.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index e8e80e172a9ce..7699d89ce9ac2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -91,13 +91,8 @@ class SnowflakeV2Config( description="Whether `schema_pattern` is matched against fully qualified schema name `.`.", ) - use_legacy_lineage_method: bool = Field( - default=False, - description=( - "Whether to use the legacy lineage computation method. " - "By default, uses new optimised lineage extraction method that requires less ingestion process memory. " - "Table-to-view and view-to-view column-level lineage are not supported with the legacy method." - ), + _use_legacy_lineage_method_removed = pydantic_removed_field( + "use_legacy_lineage_method" ) validate_upstreams_against_patterns: bool = Field( @@ -113,7 +108,7 @@ class SnowflakeV2Config( # This is required since access_history table does not capture whether the table was temporary table. temporary_tables_pattern: List[str] = Field( default=DEFAULT_TABLES_DENY_LIST, - description="[Advanced] Regex patterns for temporary tables to filter in lineage ingestion. Specify regex to match the entire table name in database.schema.table format. Defaults are to set in such a way to ignore the temporary staging tables created by known ETL tools. Not used if `use_legacy_lineage_method=True`", + description="[Advanced] Regex patterns for temporary tables to filter in lineage ingestion. 
Specify regex to match the entire table name in database.schema.table format. Defaults are to set in such a way to ignore the temporary staging tables created by known ETL tools.", ) rename_upstreams_deny_pattern_to_temporary_table_pattern = pydantic_renamed_field( diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_legacy.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_legacy.py deleted file mode 100644 index 832a072c619f8..0000000000000 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_legacy.py +++ /dev/null @@ -1,664 +0,0 @@ -import json -import logging -from collections import defaultdict -from dataclasses import dataclass, field -from typing import Any, Callable, Dict, FrozenSet, Iterable, List, Optional, Set - -from pydantic import Field -from pydantic.error_wrappers import ValidationError -from snowflake.connector import SnowflakeConnection - -import datahub.emitter.mce_builder as builder -from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.aws.s3_util import make_s3_urn -from datahub.ingestion.source.snowflake.constants import ( - LINEAGE_PERMISSION_ERROR, - SnowflakeEdition, - SnowflakeObjectDomain, -) -from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config -from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery -from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report -from datahub.ingestion.source.snowflake.snowflake_usage_v2 import ( - SnowflakeColumnReference, -) -from datahub.ingestion.source.snowflake.snowflake_utils import ( - SnowflakeCommonMixin, - SnowflakeConnectionMixin, - SnowflakePermissionError, - SnowflakeQueryMixin, -) -from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( - FineGrainedLineage, - FineGrainedLineageDownstreamType, - FineGrainedLineageUpstreamType, - UpstreamLineage, -) -from datahub.metadata.schema_classes import DatasetLineageTypeClass, UpstreamClass -from datahub.utilities.perf_timer import PerfTimer - -logger: logging.Logger = logging.getLogger(__name__) - - -class SnowflakeColumnWithLineage(SnowflakeColumnReference): - class Config: - # This is for backward compatibility and can be removed later - allow_population_by_field_name = True - - directSourceColumns: Optional[List[SnowflakeColumnReference]] = Field( - default=None, alias="directSources" - ) - - -@dataclass(frozen=True) -class SnowflakeColumnId: - columnName: str - objectName: str - objectDomain: Optional[str] = None - - -@dataclass(frozen=True) -class SnowflakeColumnFineGrainedLineage: - """ - Fie grained upstream of column, - which represents a transformation applied on input columns""" - - inputColumns: FrozenSet[SnowflakeColumnId] - # Transform function, query etc can be added here - - -@dataclass -class SnowflakeColumnUpstreams: - """All upstreams of a column""" - - upstreams: Set[SnowflakeColumnFineGrainedLineage] = field( - default_factory=set, init=False - ) - - def update_column_lineage( - self, directSourceColumns: List[SnowflakeColumnReference] - ) -> None: - input_columns = frozenset( - [ - SnowflakeColumnId( - upstream_col.columnName, - upstream_col.objectName, - upstream_col.objectDomain, - ) - for upstream_col in directSourceColumns - if upstream_col.objectName - ] - ) - if not input_columns: - return - upstream = SnowflakeColumnFineGrainedLineage(inputColumns=input_columns) - if 
upstream not in self.upstreams: - self.upstreams.add(upstream) - - -@dataclass -class SnowflakeUpstreamTable: - upstreamDataset: str - upstreamColumns: List[SnowflakeColumnReference] - downstreamColumns: List[SnowflakeColumnWithLineage] - - @classmethod - def from_dict( - cls, - dataset: str, - upstreams_columns_json: Optional[str], - downstream_columns_json: Optional[str], - ) -> "SnowflakeUpstreamTable": - try: - upstreams_columns_list = [] - downstream_columns_list = [] - if upstreams_columns_json is not None: - upstreams_columns_list = json.loads(upstreams_columns_json) - if downstream_columns_json is not None: - downstream_columns_list = json.loads(downstream_columns_json) - - table_with_upstreams = cls( - dataset, - [ - SnowflakeColumnReference.parse_obj(col) - for col in upstreams_columns_list - ], - [ - SnowflakeColumnWithLineage.parse_obj(col) - for col in downstream_columns_list - ], - ) - except ValidationError: - # Earlier versions of column lineage did not include columnName, only columnId - table_with_upstreams = cls(dataset, [], []) - return table_with_upstreams - - -@dataclass -class SnowflakeTableLineage: - # key: upstream table name - upstreamTables: Dict[str, SnowflakeUpstreamTable] = field( - default_factory=dict, init=False - ) - - # key: downstream column name - columnLineages: Dict[str, SnowflakeColumnUpstreams] = field( - default_factory=lambda: defaultdict(SnowflakeColumnUpstreams), init=False - ) - - def update_lineage( - self, table: SnowflakeUpstreamTable, include_column_lineage: bool = True - ) -> None: - if table.upstreamDataset not in self.upstreamTables.keys(): - self.upstreamTables[table.upstreamDataset] = table - - if include_column_lineage and table.downstreamColumns: - for col in table.downstreamColumns: - if col.directSourceColumns: - self.columnLineages[col.columnName].update_column_lineage( - col.directSourceColumns - ) - - -class SnowflakeLineageExtractor( - SnowflakeQueryMixin, SnowflakeConnectionMixin, SnowflakeCommonMixin -): - """ - Extracts Lineage from Snowflake. - Following lineage edges are considered. - - 1. "Table to View" lineage via `snowflake.account_usage.object_dependencies` view - 2. "S3 to Table" lineage via `show external tables` query. - 3. "View to Table" lineage via `snowflake.account_usage.access_history` view (requires Snowflake Enterprise Edition or above) - 4. "Table to Table" lineage via `snowflake.account_usage.access_history` view (requires Snowflake Enterprise Edition or above) - 5. "S3 to Table" lineage via `snowflake.account_usage.access_history` view (requires Snowflake Enterprise Edition or above) - - Edition Note - Snowflake Standard Edition does not have Access History Feature. So it does not support lineage extraction for edges 3, 4, 5 mentioned above. 
- """ - - def __init__( - self, - config: SnowflakeV2Config, - report: SnowflakeV2Report, - dataset_urn_builder: Callable[[str], str], - ) -> None: - self._lineage_map: Dict[str, SnowflakeTableLineage] = defaultdict( - SnowflakeTableLineage - ) - self._external_lineage_map: Dict[str, Set[str]] = defaultdict(set) - self.config = config - self.report = report - self.logger = logger - self.dataset_urn_builder = dataset_urn_builder - self.connection: Optional[SnowflakeConnection] = None - - # Kwargs used by new snowflake lineage extractor need to be ignored here - def get_workunits( - self, discovered_tables: List[str], discovered_views: List[str], **_kwargs: Any - ) -> Iterable[MetadataWorkUnit]: - self.connection = self.create_connection() - if self.connection is None: - return - - self._populate_table_lineage() - - if self.config.include_view_lineage: - if len(discovered_views) > 0: - self._populate_view_lineage() - else: - logger.info("No views found. Skipping View Lineage Extraction.") - - self._populate_external_lineage() - - if ( - len(self._lineage_map.keys()) == 0 - and len(self._external_lineage_map.keys()) == 0 - ): - logger.debug("No lineage found.") - return - - yield from self.get_table_upstream_workunits(discovered_tables) - yield from self.get_view_upstream_workunits(discovered_views) - - def _populate_table_lineage(self): - if self.report.edition == SnowflakeEdition.STANDARD: - logger.info( - "Snowflake Account is Standard Edition. Table to Table Lineage Feature is not supported." - ) # See Edition Note above for why - else: - with PerfTimer() as timer: - self._populate_lineage() - self.report.table_lineage_query_secs = timer.elapsed_seconds() - - def get_table_upstream_workunits(self, discovered_tables): - if self.config.include_table_lineage: - for dataset_name in discovered_tables: - upstream_lineage = self._get_upstream_lineage_info(dataset_name) - if upstream_lineage is not None: - yield MetadataChangeProposalWrapper( - entityUrn=self.dataset_urn_builder(dataset_name), - aspect=upstream_lineage, - ).as_workunit() - - def get_view_upstream_workunits(self, discovered_views): - if self.config.include_view_lineage: - for view_name in discovered_views: - upstream_lineage = self._get_upstream_lineage_info(view_name) - if upstream_lineage is not None: - yield MetadataChangeProposalWrapper( - entityUrn=self.dataset_urn_builder(view_name), - aspect=upstream_lineage, - ).as_workunit() - - def _get_upstream_lineage_info( - self, dataset_name: str - ) -> Optional[UpstreamLineage]: - lineage = self._lineage_map[dataset_name] - external_lineage = self._external_lineage_map[dataset_name] - if not (lineage.upstreamTables or lineage.columnLineages or external_lineage): - logger.debug(f"No lineage found for {dataset_name}") - return None - - upstream_tables: List[UpstreamClass] = [] - finegrained_lineages: List[FineGrainedLineage] = [] - - # Populate the table-lineage in aspect - self.update_upstream_tables_lineage(upstream_tables, lineage) - - # Populate the column-lineage in aspect - self.update_upstream_columns_lineage( - self.dataset_urn_builder(dataset_name), finegrained_lineages, lineage - ) - - # Populate the external-table-lineage(s3->snowflake) in aspect - self.update_external_tables_lineage(upstream_tables, external_lineage) - - if len(upstream_tables) > 0: - logger.debug( - f"Upstream lineage of '{dataset_name}': {[u.dataset for u in upstream_tables]}" - ) - if self.config.upstream_lineage_in_report: - self.report.upstream_lineage[dataset_name] = [ - u.dataset for u in 
upstream_tables - ] - return UpstreamLineage( - upstreams=upstream_tables, - fineGrainedLineages=sorted( - finegrained_lineages, key=lambda x: (x.downstreams, x.upstreams) - ) - or None, - ) - else: - return None - - def _populate_view_lineage(self) -> None: - with PerfTimer() as timer: - self._populate_view_upstream_lineage() - self.report.view_upstream_lineage_query_secs = timer.elapsed_seconds() - - if self.report.edition == SnowflakeEdition.STANDARD: - logger.info( - "Snowflake Account is Standard Edition. View to Table Lineage Feature is not supported." - ) # See Edition Note above for why - else: - with PerfTimer() as timer: - self._populate_view_downstream_lineage() - self.report.view_downstream_lineage_query_secs = timer.elapsed_seconds() - - def _populate_external_lineage(self) -> None: - with PerfTimer() as timer: - self.report.num_external_table_edges_scanned = 0 - - if self.report.edition == SnowflakeEdition.STANDARD: - logger.info( - "Snowflake Account is Standard Edition. External Lineage Feature via Access History is not supported." - ) # See Edition Note above for why - else: - self._populate_external_lineage_from_access_history() - - self._populate_external_lineage_from_show_query() - - logger.info( - f"Found {self.report.num_external_table_edges_scanned} external lineage edges." - ) - - self.report.external_lineage_queries_secs = timer.elapsed_seconds() - - # Handles the case for explicitly created external tables. - # NOTE: Snowflake does not log this information to the access_history table. - def _populate_external_lineage_from_show_query(self): - external_tables_query: str = SnowflakeQuery.show_external_tables() - try: - for db_row in self.query(external_tables_query): - key = self.get_dataset_identifier( - db_row["name"], db_row["schema_name"], db_row["database_name"] - ) - - if not self._is_dataset_pattern_allowed( - key, SnowflakeObjectDomain.TABLE - ): - continue - self._external_lineage_map[key].add(db_row["location"]) - logger.debug( - f"ExternalLineage[Table(Down)={key}]:External(Up)={self._external_lineage_map[key]} via show external tables" - ) - self.report.num_external_table_edges_scanned += 1 - except Exception as e: - logger.debug(e, exc_info=e) - self.report_warning( - "external_lineage", - f"Populating external table lineage from Snowflake failed due to error {e}.", - ) - - # Handles the case where a table is populated from an external location via copy. - # Eg: copy into category_english from 's3://acryl-snow-demo-olist/olist_raw_data/category_english'credentials=(aws_key_id='...' aws_secret_key='...') pattern='.*.csv'; - def _populate_external_lineage_from_access_history(self): - query: str = SnowflakeQuery.external_table_lineage_history( - start_time_millis=int(self.config.start_time.timestamp() * 1000) - if not self.config.ignore_start_time_lineage - else 0, - end_time_millis=int(self.config.end_time.timestamp() * 1000), - ) - - try: - for db_row in self.query(query): - self._process_external_lineage_result_row(db_row) - except Exception as e: - if isinstance(e, SnowflakePermissionError): - error_msg = "Failed to get external lineage. Please grant imported privileges on SNOWFLAKE database. 
" - self.warn_if_stateful_else_error(LINEAGE_PERMISSION_ERROR, error_msg) - else: - logger.debug(e, exc_info=e) - self.report_warning( - "external_lineage", - f"Populating table external lineage from Snowflake failed due to error {e}.", - ) - - def _process_external_lineage_result_row(self, db_row): - # key is the down-stream table name - key: str = self.get_dataset_identifier_from_qualified_name( - db_row["DOWNSTREAM_TABLE_NAME"] - ) - if not self._is_dataset_pattern_allowed(key, SnowflakeObjectDomain.TABLE): - return - - if db_row["UPSTREAM_LOCATIONS"] is not None: - external_locations = json.loads(db_row["UPSTREAM_LOCATIONS"]) - - for loc in external_locations: - if loc not in self._external_lineage_map[key]: - self._external_lineage_map[key].add(loc) - self.report.num_external_table_edges_scanned += 1 - - logger.debug( - f"ExternalLineage[Table(Down)={key}]:External(Up)={self._external_lineage_map[key]} via access_history" - ) - - def _populate_lineage(self) -> None: - query: str = SnowflakeQuery.table_to_table_lineage_history( - start_time_millis=int(self.config.start_time.timestamp() * 1000) - if not self.config.ignore_start_time_lineage - else 0, - end_time_millis=int(self.config.end_time.timestamp() * 1000), - include_column_lineage=self.config.include_column_lineage, - ) - self.report.num_table_to_table_edges_scanned = 0 - try: - for db_row in self.query(query): - self._process_table_lineage_row(db_row) - except Exception as e: - if isinstance(e, SnowflakePermissionError): - error_msg = "Failed to get table to table lineage. Please grant imported privileges on SNOWFLAKE database. " - self.warn_if_stateful_else_error(LINEAGE_PERMISSION_ERROR, error_msg) - else: - logger.debug(e, exc_info=e) - self.report_warning( - "table-lineage", - f"Extracting lineage from Snowflake failed due to error {e}.", - ) - logger.info( - f"A total of {self.report.num_table_to_table_edges_scanned} Table->Table edges found" - f" for {len(self._lineage_map)} downstream tables.", - ) - - def _process_table_lineage_row(self, db_row): - # key is the down-stream table name - key: str = self.get_dataset_identifier_from_qualified_name( - db_row["DOWNSTREAM_TABLE_NAME"] - ) - upstream_table_name = self.get_dataset_identifier_from_qualified_name( - db_row["UPSTREAM_TABLE_NAME"] - ) - if not self._is_dataset_pattern_allowed( - key, SnowflakeObjectDomain.TABLE - ) or not ( - self._is_dataset_pattern_allowed( - upstream_table_name, SnowflakeObjectDomain.TABLE, is_upstream=True - ) - ): - return - self._lineage_map[key].update_lineage( - # (, , ) - SnowflakeUpstreamTable.from_dict( - upstream_table_name, - db_row["UPSTREAM_TABLE_COLUMNS"], - db_row["DOWNSTREAM_TABLE_COLUMNS"], - ), - self.config.include_column_lineage, - ) - self.report.num_table_to_table_edges_scanned += 1 - logger.debug(f"Lineage[Table(Down)={key}]:Table(Up)={self._lineage_map[key]}") - - def _populate_view_upstream_lineage(self) -> None: - # NOTE: This query captures only the upstream lineage of a view (with no column lineage). - # For more details see: https://docs.snowflake.com/en/user-guide/object-dependencies.html#object-dependencies - # and also https://docs.snowflake.com/en/sql-reference/account-usage/access_history.html#usage-notes for current limitations on capturing the lineage for views. 
- view_upstream_lineage_query: str = SnowflakeQuery.view_dependencies() - - self.report.num_table_to_view_edges_scanned = 0 - - try: - for db_row in self.query(view_upstream_lineage_query): - self._process_view_upstream_lineage_row(db_row) - except Exception as e: - if isinstance(e, SnowflakePermissionError): - error_msg = "Failed to get table to view lineage. Please grant imported privileges on SNOWFLAKE database." - self.warn_if_stateful_else_error(LINEAGE_PERMISSION_ERROR, error_msg) - else: - logger.debug(e, exc_info=e) - self.report_warning( - "view-upstream-lineage", - f"Extracting the upstream view lineage from Snowflake failed due to error {e}.", - ) - logger.info( - f"A total of {self.report.num_table_to_view_edges_scanned} View upstream edges found." - ) - - def _process_view_upstream_lineage_row(self, db_row): - # Process UpstreamTable/View/ExternalTable/Materialized View->View edge. - view_upstream: str = self.get_dataset_identifier_from_qualified_name( - db_row["VIEW_UPSTREAM"] - ) - view_name: str = self.get_dataset_identifier_from_qualified_name( - db_row["DOWNSTREAM_VIEW"] - ) - - if not self._is_dataset_pattern_allowed( - dataset_name=view_name, - dataset_type=db_row["REFERENCING_OBJECT_DOMAIN"], - ) or not self._is_dataset_pattern_allowed( - view_upstream, db_row["REFERENCED_OBJECT_DOMAIN"], is_upstream=True - ): - return - # key is the downstream view name - self._lineage_map[view_name].update_lineage( - # (, , ) - SnowflakeUpstreamTable.from_dict(view_upstream, None, None), - self.config.include_column_lineage, - ) - self.report.num_table_to_view_edges_scanned += 1 - logger.debug( - f"Upstream->View: Lineage[View(Down)={view_name}]:Upstream={view_upstream}" - ) - - def _populate_view_downstream_lineage(self) -> None: - # This query captures the downstream table lineage for views. - # See https://docs.snowflake.com/en/sql-reference/account-usage/access_history.html#usage-notes for current limitations on capturing the lineage for views. - # Eg: For viewA->viewB->ViewC->TableD, snowflake does not yet log intermediate view logs, resulting in only the viewA->TableD edge. - view_lineage_query: str = SnowflakeQuery.view_lineage_history( - start_time_millis=int(self.config.start_time.timestamp() * 1000) - if not self.config.ignore_start_time_lineage - else 0, - end_time_millis=int(self.config.end_time.timestamp() * 1000), - include_column_lineage=self.config.include_column_lineage, - ) - - self.report.num_view_to_table_edges_scanned = 0 - - try: - for db_row in self.query(view_lineage_query): - self._process_view_downstream_lineage_row(db_row) - except Exception as e: - if isinstance(e, SnowflakePermissionError): - error_msg = "Failed to get view to table lineage. Please grant imported privileges on SNOWFLAKE database. " - self.warn_if_stateful_else_error(LINEAGE_PERMISSION_ERROR, error_msg) - else: - logger.debug(e, exc_info=e) - self.report_warning( - "view-downstream-lineage", - f"Extracting the view lineage from Snowflake failed due to error {e}.", - ) - - logger.info( - f"Found {self.report.num_view_to_table_edges_scanned} View->Table edges." 
- ) - - def _process_view_downstream_lineage_row(self, db_row): - view_name: str = self.get_dataset_identifier_from_qualified_name( - db_row["VIEW_NAME"] - ) - downstream_table: str = self.get_dataset_identifier_from_qualified_name( - db_row["DOWNSTREAM_TABLE_NAME"] - ) - if not self._is_dataset_pattern_allowed( - view_name, db_row["VIEW_DOMAIN"], is_upstream=True - ) or not self._is_dataset_pattern_allowed( - downstream_table, db_row["DOWNSTREAM_TABLE_DOMAIN"] - ): - return - - # Capture view->downstream table lineage. - self._lineage_map[downstream_table].update_lineage( - # (, , ) - SnowflakeUpstreamTable.from_dict( - view_name, - db_row["VIEW_COLUMNS"], - db_row["DOWNSTREAM_TABLE_COLUMNS"], - ), - self.config.include_column_lineage, - ) - self.report.num_view_to_table_edges_scanned += 1 - - logger.debug( - f"View->Table: Lineage[Table(Down)={downstream_table}]:View(Up)={self._lineage_map[downstream_table]}" - ) - - def update_upstream_tables_lineage( - self, upstream_tables: List[UpstreamClass], lineage: SnowflakeTableLineage - ) -> None: - for lineage_entry in sorted( - lineage.upstreamTables.values(), key=lambda x: x.upstreamDataset - ): - upstream_table_name = lineage_entry.upstreamDataset - upstream_table = UpstreamClass( - dataset=self.dataset_urn_builder(upstream_table_name), - type=DatasetLineageTypeClass.TRANSFORMED, - ) - upstream_tables.append(upstream_table) - - def update_upstream_columns_lineage( - self, - dataset_urn: str, - finegrained_lineages: List[FineGrainedLineage], - lineage: SnowflakeTableLineage, - ) -> None: - # For every column for which upstream lineage is available - for col, col_upstreams in lineage.columnLineages.items(): - # For every upstream of column - self.update_upstream_columns_lineage_of_column( - dataset_urn, col, finegrained_lineages, col_upstreams - ) - - def update_upstream_columns_lineage_of_column( - self, - dataset_urn: str, - col: str, - finegrained_lineages: List[FineGrainedLineage], - col_upstreams: SnowflakeColumnUpstreams, - ) -> None: - for fine_upstream in col_upstreams.upstreams: - finegrained_lineage_entry = self.build_finegrained_lineage( - dataset_urn, col, fine_upstream - ) - if finegrained_lineage_entry.upstreams: - finegrained_lineages.append(finegrained_lineage_entry) - - def build_finegrained_lineage( - self, - dataset_urn: str, - col: str, - fine_upstream: SnowflakeColumnFineGrainedLineage, - ) -> FineGrainedLineage: - fieldPath = col - - column_upstreams = self.build_finegrained_lineage_upstreams(fine_upstream) - finegrained_lineage_entry = FineGrainedLineage( - upstreamType=FineGrainedLineageUpstreamType.FIELD_SET, - # Sorting the list of upstream lineage events in order to avoid creating multiple aspects in backend - # even if the lineage is same but the order is different. 
- upstreams=sorted(column_upstreams), - downstreamType=FineGrainedLineageDownstreamType.FIELD, - downstreams=[ - builder.make_schema_field_urn( - dataset_urn, self.snowflake_identifier(fieldPath) - ) - ], - ) - - return finegrained_lineage_entry - - def build_finegrained_lineage_upstreams( - self, fine_upstream: SnowflakeColumnFineGrainedLineage - ) -> List[str]: - column_upstreams = [] - for upstream_col in fine_upstream.inputColumns: - if ( - upstream_col.objectName - and upstream_col.columnName - and self._is_dataset_pattern_allowed( - upstream_col.objectName, upstream_col.objectDomain, is_upstream=True - ) - ): - upstream_dataset_name = self.get_dataset_identifier_from_qualified_name( - upstream_col.objectName - ) - column_upstreams.append( - builder.make_schema_field_urn( - self.dataset_urn_builder(upstream_dataset_name), - self.snowflake_identifier(upstream_col.columnName), - ) - ) - return column_upstreams - - def update_external_tables_lineage( - self, upstream_tables: List[UpstreamClass], external_lineage: Set[str] - ) -> None: - for external_lineage_entry in sorted(external_lineage): - # For now, populate only for S3 - if external_lineage_entry.startswith("s3://"): - external_upstream_table = UpstreamClass( - dataset=make_s3_urn(external_lineage_entry, self.config.env), - type=DatasetLineageTypeClass.COPY, - ) - upstream_tables.append(external_upstream_table) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py index 587c71a98be67..0f89324f5efc6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py @@ -506,35 +506,6 @@ def view_dependencies_v2() -> str: def show_external_tables() -> str: return "show external tables in account" - # Note - This method should be removed once legacy lineage is removed - @staticmethod - def external_table_lineage_history( - start_time_millis: int, end_time_millis: int - ) -> str: - return f""" - WITH external_table_lineage_history AS ( - SELECT - r.value:"locations" AS upstream_locations, - w.value:"objectName"::varchar AS downstream_table_name, - w.value:"objectDomain"::varchar AS downstream_table_domain, - w.value:"columns" AS downstream_table_columns, - t.query_start_time AS query_start_time - FROM - (SELECT * from snowflake.account_usage.access_history) t, - lateral flatten(input => t.BASE_OBJECTS_ACCESSED) r, - lateral flatten(input => t.OBJECTS_MODIFIED) w - WHERE r.value:"locations" IS NOT NULL - AND w.value:"objectId" IS NOT NULL - AND t.query_start_time >= to_timestamp_ltz({start_time_millis}, 3) - AND t.query_start_time < to_timestamp_ltz({end_time_millis}, 3)) - SELECT - upstream_locations AS "UPSTREAM_LOCATIONS", - downstream_table_name AS "DOWNSTREAM_TABLE_NAME", - downstream_table_columns AS "DOWNSTREAM_TABLE_COLUMNS" - FROM external_table_lineage_history - WHERE downstream_table_domain = '{SnowflakeObjectDomain.TABLE.capitalize()}' - QUALIFY ROW_NUMBER() OVER (PARTITION BY downstream_table_name ORDER BY query_start_time DESC) = 1""" - @staticmethod def copy_lineage_history( start_time_millis: int, diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 7dd51d5b20e8e..40c4d32525a51 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ 
b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -51,9 +51,6 @@ SnowflakeV2Config, TagOption, ) -from datahub.ingestion.source.snowflake.snowflake_lineage_legacy import ( - SnowflakeLineageExtractor as SnowflakeLineageLegacyExtractor, -) from datahub.ingestion.source.snowflake.snowflake_lineage_v2 import ( SnowflakeLineageExtractor, ) @@ -240,19 +237,10 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): # For database, schema, tables, views, etc self.data_dictionary = SnowflakeDataDictionary() - self.lineage_extractor: Union[ - SnowflakeLineageExtractor, SnowflakeLineageLegacyExtractor - ] if config.include_table_lineage: - # For lineage - if self.config.use_legacy_lineage_method: - self.lineage_extractor = SnowflakeLineageLegacyExtractor( - config, self.report, dataset_urn_builder=self.gen_dataset_urn - ) - else: - self.lineage_extractor = SnowflakeLineageExtractor( - config, self.report, dataset_urn_builder=self.gen_dataset_urn - ) + self.lineage_extractor = SnowflakeLineageExtractor( + config, self.report, dataset_urn_builder=self.gen_dataset_urn + ) if config.include_usage_stats or config.include_operational_stats: self.usage_extractor = SnowflakeUsageExtractor( diff --git a/metadata-ingestion/tests/integration/snowflake/common.py b/metadata-ingestion/tests/integration/snowflake/common.py index 43f5e04fbc89f..81e307a78ae9e 100644 --- a/metadata-ingestion/tests/integration/snowflake/common.py +++ b/metadata-ingestion/tests/integration/snowflake/common.py @@ -434,11 +434,6 @@ def default_query_results( # noqa: C901 } for op_idx in range(1, num_ops + 1) ] - elif query == snowflake_query.SnowflakeQuery.external_table_lineage_history( - 1654473600000, - 1654586220000, - ): - return [] elif query in [ snowflake_query.SnowflakeQuery.view_dependencies(), ]: @@ -509,10 +504,6 @@ def default_query_results( # noqa: C901 } ] elif query in [ - snowflake_query.SnowflakeQuery.external_table_lineage_history( - 1654473600000, - 1654586220000, - ), snowflake_query.SnowflakeQuery.view_dependencies_v2(), snowflake_query.SnowflakeQuery.view_dependencies(), snowflake_query.SnowflakeQuery.show_external_tables(), diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py index 53b2bcb236cd9..6135b0b3b3274 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py @@ -121,7 +121,6 @@ def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph): include_table_lineage=True, include_view_lineage=True, include_usage_stats=True, - use_legacy_lineage_method=False, validate_upstreams_against_patterns=False, include_operational_stats=True, email_as_user_identifier=True, @@ -213,7 +212,6 @@ def test_snowflake_private_link(pytestconfig, tmp_path, mock_time, mock_datahub_ include_column_lineage=False, include_views=False, include_view_lineage=False, - use_legacy_lineage_method=False, include_usage_stats=False, include_operational_stats=False, start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace( diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py index 73a261bb3cb6e..4963e71ae4d96 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py @@ -55,7 +55,6 @@ def 
snowflake_pipeline_config(tmp_path): schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), include_view_lineage=False, include_usage_stats=False, - use_legacy_lineage_method=False, start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace( tzinfo=timezone.utc ), diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures_legacy_lineage.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures_legacy_lineage.py deleted file mode 100644 index a5993793e574d..0000000000000 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures_legacy_lineage.py +++ /dev/null @@ -1,291 +0,0 @@ -from datetime import datetime, timezone -from typing import cast -from unittest import mock - -from freezegun import freeze_time -from pytest import fixture - -from datahub.configuration.common import AllowDenyPattern, DynamicTypedConfig -from datahub.ingestion.run.pipeline import Pipeline -from datahub.ingestion.run.pipeline_config import PipelineConfig, SourceConfig -from datahub.ingestion.source.snowflake import snowflake_query -from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config -from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery -from tests.integration.snowflake.common import ( - FROZEN_TIME, - NUM_TABLES, - default_query_results, -) - - -def query_permission_error_override(fn, override_for_query, error_msg): - def my_function(query): - if query in override_for_query: - raise Exception(error_msg) - else: - return fn(query) - - return my_function - - -def query_permission_response_override(fn, override_for_query, response): - def my_function(query): - if query in override_for_query: - return response - else: - return fn(query) - - return my_function - - -@fixture(scope="function") -def snowflake_pipeline_legacy_lineage_config(tmp_path): - output_file = tmp_path / "snowflake_test_events_permission_error.json" - config = PipelineConfig( - source=SourceConfig( - type="snowflake", - config=SnowflakeV2Config( - account_id="ABC12345.ap-south-1.aws", - username="TST_USR", - password="TST_PWD", - role="TEST_ROLE", - warehouse="TEST_WAREHOUSE", - include_technical_schema=True, - match_fully_qualified_names=True, - schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), - include_view_lineage=False, - include_usage_stats=False, - use_legacy_lineage_method=True, - start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace( - tzinfo=timezone.utc - ), - end_time=datetime(2022, 6, 7, 7, 17, 0, 0).replace(tzinfo=timezone.utc), - ), - ), - sink=DynamicTypedConfig(type="file", config={"filename": str(output_file)}), - ) - return config - - -@freeze_time(FROZEN_TIME) -def test_snowflake_missing_role_access_causes_pipeline_failure( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - # Snowflake connection fails role not granted error - mock_connect.side_effect = Exception( - "250001 (08001): Failed to connect to DB: abc12345.ap-south-1.snowflakecomputing.com:443. Role 'TEST_ROLE' specified in the connect string is not granted to this user. Contact your local system administrator, or attempt to login with another role, e.g. 
PUBLIC" - ) - - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - assert "permission-error" in pipeline.source.get_report().failures.keys() - - -@freeze_time(FROZEN_TIME) -def test_snowflake_missing_warehouse_access_causes_pipeline_failure( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Current warehouse query leads to blank result - sf_cursor.execute.side_effect = query_permission_response_override( - default_query_results, - [SnowflakeQuery.current_warehouse()], - [(None,)], - ) - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - assert "permission-error" in pipeline.source.get_report().failures.keys() - - -@freeze_time(FROZEN_TIME) -def test_snowflake_no_databases_with_access_causes_pipeline_failure( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in listing databases - sf_cursor.execute.side_effect = query_permission_error_override( - default_query_results, - [SnowflakeQuery.get_databases("TEST_DB")], - "Database 'TEST_DB' does not exist or not authorized.", - ) - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - assert "permission-error" in pipeline.source.get_report().failures.keys() - - -@freeze_time(FROZEN_TIME) -def test_snowflake_no_tables_causes_pipeline_failure( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in listing databases - no_tables_fn = query_permission_response_override( - default_query_results, - [SnowflakeQuery.tables_for_schema("TEST_SCHEMA", "TEST_DB")], - [], - ) - sf_cursor.execute.side_effect = query_permission_response_override( - no_tables_fn, - [SnowflakeQuery.show_views_for_schema("TEST_SCHEMA", "TEST_DB")], - [], - ) - - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - assert "permission-error" in pipeline.source.get_report().failures.keys() - - -@freeze_time(FROZEN_TIME) -def test_snowflake_list_columns_error_causes_pipeline_warning( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in listing columns - sf_cursor.execute.side_effect = query_permission_error_override( - default_query_results, - [ - SnowflakeQuery.columns_for_table( - "TABLE_{}".format(tbl_idx), "TEST_SCHEMA", "TEST_DB" - ) - for tbl_idx in range(1, NUM_TABLES + 1) - ], - "Database 'TEST_DB' does not exist or not authorized.", - ) - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - pipeline.raise_from_status() # pipeline should not fail - assert ( - "Failed to get columns for table" - in pipeline.source.get_report().warnings.keys() - ) - - 
-@freeze_time(FROZEN_TIME) -def test_snowflake_list_primary_keys_error_causes_pipeline_warning( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in listing keys leads to warning - sf_cursor.execute.side_effect = query_permission_error_override( - default_query_results, - [SnowflakeQuery.show_primary_keys_for_schema("TEST_SCHEMA", "TEST_DB")], - "Insufficient privileges to operate on TEST_DB", - ) - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - pipeline.raise_from_status() # pipeline should not fail - assert ( - "Failed to get primary key for table" - in pipeline.source.get_report().warnings.keys() - ) - - -@freeze_time(FROZEN_TIME) -def test_snowflake_missing_snowflake_lineage_permission_causes_pipeline_failure( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in getting lineage - sf_cursor.execute.side_effect = query_permission_error_override( - default_query_results, - [ - snowflake_query.SnowflakeQuery.table_to_table_lineage_history( - 1654473600000, 1654586220000, True - ), - ], - "Database 'SNOWFLAKE' does not exist or not authorized.", - ) - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - assert ( - "lineage-permission-error" in pipeline.source.get_report().failures.keys() - ) - - -@freeze_time(FROZEN_TIME) -def test_snowflake_missing_snowflake_operations_permission_causes_pipeline_failure( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in getting access history date range - sf_cursor.execute.side_effect = query_permission_error_override( - default_query_results, - [snowflake_query.SnowflakeQuery.get_access_history_date_range()], - "Database 'SNOWFLAKE' does not exist or not authorized.", - ) - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - assert "usage-permission-error" in pipeline.source.get_report().failures.keys() - - -@freeze_time(FROZEN_TIME) -def test_snowflake_unexpected_snowflake_view_lineage_error_causes_pipeline_warning( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in getting view lineage - sf_cursor.execute.side_effect = query_permission_error_override( - default_query_results, - [snowflake_query.SnowflakeQuery.view_dependencies()], - "Unexpected Error", - ) - - snowflake_pipeline_config1 = snowflake_pipeline_legacy_lineage_config.copy() - cast( - SnowflakeV2Config, - cast(PipelineConfig, snowflake_pipeline_config1).source.config, - ).include_view_lineage = True - pipeline = Pipeline(snowflake_pipeline_config1) - pipeline.run() - pipeline.raise_from_status() # pipeline should not 
fail - assert "view-upstream-lineage" in pipeline.source.get_report().warnings.keys() diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_legacy_lineage.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_legacy_lineage.py deleted file mode 100644 index 59da7ddf695d8..0000000000000 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_legacy_lineage.py +++ /dev/null @@ -1,207 +0,0 @@ -import random -from datetime import datetime, timezone -from unittest import mock - -import pandas as pd -import pytest -from freezegun import freeze_time - -from datahub.configuration.common import AllowDenyPattern, DynamicTypedConfig -from datahub.ingestion.glossary.classifier import ( - ClassificationConfig, - DynamicTypedClassifierConfig, -) -from datahub.ingestion.glossary.datahub_classifier import ( - DataHubClassifierConfig, - InfoTypeConfig, - PredictionFactorsAndWeights, - ValuesFactorConfig, -) -from datahub.ingestion.run.pipeline import Pipeline -from datahub.ingestion.run.pipeline_config import PipelineConfig, SourceConfig -from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig -from datahub.ingestion.source.snowflake.snowflake_config import ( - SnowflakeV2Config, - TagOption, -) -from tests.integration.snowflake.common import FROZEN_TIME, default_query_results -from tests.integration.snowflake.test_snowflake import random_cloud_region, random_email -from tests.test_helpers import mce_helpers - - -@pytest.mark.integration -def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph): - test_resources_dir = pytestconfig.rootpath / "tests/integration/snowflake" - - # Run the metadata ingestion pipeline. - output_file = tmp_path / "snowflake_test_events.json" - golden_file = test_resources_dir / "snowflake_golden.json" - - with mock.patch("snowflake.connector.connect") as mock_connect, mock.patch( - "datahub.ingestion.source.snowflake.snowflake_v2.SnowflakeV2Source.get_sample_values_for_table" - ) as mock_sample_values: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - sf_cursor.execute.side_effect = default_query_results - - mock_sample_values.return_value = pd.DataFrame( - data={ - "col_1": [random.randint(1, 80) for i in range(20)], - "col_2": [random_email() for i in range(20)], - "col_3": [random_cloud_region() for i in range(20)], - } - ) - - datahub_classifier_config = DataHubClassifierConfig( - minimum_values_threshold=10, - confidence_level_threshold=0.58, - info_types_config={ - "Age": InfoTypeConfig( - Prediction_Factors_and_Weights=PredictionFactorsAndWeights( - Name=0, Values=1, Description=0, Datatype=0 - ) - ), - "CloudRegion": InfoTypeConfig( - Prediction_Factors_and_Weights=PredictionFactorsAndWeights( - Name=0, - Description=0, - Datatype=0, - Values=1, - ), - Values=ValuesFactorConfig( - prediction_type="regex", - regex=[ - r"(af|ap|ca|eu|me|sa|us)-(central|north|(north(?:east|west))|south|south(?:east|west)|east|west)-\d+" - ], - ), - ), - }, - ) - - pipeline = Pipeline( - config=PipelineConfig( - source=SourceConfig( - type="snowflake", - config=SnowflakeV2Config( - account_id="ABC12345.ap-south-1.aws", - username="TST_USR", - password="TST_PWD", - match_fully_qualified_names=True, - schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), - include_technical_schema=True, - include_table_lineage=True, - include_view_lineage=True, - include_usage_stats=True, - 
use_legacy_lineage_method=True, - validate_upstreams_against_patterns=False, - include_operational_stats=True, - start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace( - tzinfo=timezone.utc - ), - end_time=datetime(2022, 6, 7, 7, 17, 0, 0).replace( - tzinfo=timezone.utc - ), - classification=ClassificationConfig( - enabled=True, - classifiers=[ - DynamicTypedClassifierConfig( - type="datahub", config=datahub_classifier_config - ) - ], - ), - profiling=GEProfilingConfig( - enabled=True, - profile_if_updated_since_days=None, - profile_table_row_limit=None, - profile_table_size_limit=None, - profile_table_level_only=True, - ), - extract_tags=TagOption.without_lineage, - ), - ), - sink=DynamicTypedConfig( - type="file", config={"filename": str(output_file)} - ), - ) - ) - pipeline.run() - pipeline.pretty_print_summary() - pipeline.raise_from_status() - - # Verify the output. - - mce_helpers.check_golden_file( - pytestconfig, - output_path=output_file, - golden_path=golden_file, - ignore_paths=[ - r"root\[\d+\]\['aspect'\]\['json'\]\['timestampMillis'\]", - r"root\[\d+\]\['aspect'\]\['json'\]\['created'\]", - r"root\[\d+\]\['aspect'\]\['json'\]\['lastModified'\]", - r"root\[\d+\]\['aspect'\]\['json'\]\['fields'\]\[\d+\]\['glossaryTerms'\]\['auditStamp'\]\['time'\]", - r"root\[\d+\]\['systemMetadata'\]", - ], - ) - - -@freeze_time(FROZEN_TIME) -@pytest.mark.integration -def test_snowflake_private_link(pytestconfig, tmp_path, mock_time, mock_datahub_graph): - test_resources_dir = pytestconfig.rootpath / "tests/integration/snowflake" - - # Run the metadata ingestion pipeline. - output_file = tmp_path / "snowflake_privatelink_test_events.json" - golden_file = test_resources_dir / "snowflake_privatelink_golden.json" - - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - sf_cursor.execute.side_effect = default_query_results - - pipeline = Pipeline( - config=PipelineConfig( - source=SourceConfig( - type="snowflake", - config=SnowflakeV2Config( - account_id="ABC12345.ap-south-1.privatelink", - username="TST_USR", - password="TST_PWD", - schema_pattern=AllowDenyPattern(allow=["test_schema"]), - include_technical_schema=True, - include_table_lineage=True, - include_column_lineage=False, - include_views=False, - include_view_lineage=False, - use_legacy_lineage_method=True, - include_usage_stats=False, - include_operational_stats=False, - start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace( - tzinfo=timezone.utc - ), - end_time=datetime(2022, 6, 7, 7, 17, 0, 0).replace( - tzinfo=timezone.utc - ), - ), - ), - sink=DynamicTypedConfig( - type="file", config={"filename": str(output_file)} - ), - ) - ) - pipeline.run() - pipeline.pretty_print_summary() - pipeline.raise_from_status() - - # Verify the output. 
- - mce_helpers.check_golden_file( - pytestconfig, - output_path=output_file, - golden_path=golden_file, - ignore_paths=[], - ) From 01ae5d96da45a259122a547504265025624c0e11 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Wed, 23 Aug 2023 15:58:34 -0400 Subject: [PATCH 03/20] fix(ingest/ldap): Handle case when 'objectClass' not in attrs (#8658) --- metadata-ingestion/src/datahub/ingestion/source/ldap.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/ldap.py b/metadata-ingestion/src/datahub/ingestion/source/ldap.py index 497b49acb6505..e1d035a96d42f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ldap.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ldap.py @@ -271,10 +271,11 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: if dn is None: continue - if not attrs: + if not attrs or "objectClass" not in attrs: self.report.report_warning( "", - f"skipping {dn} because attrs is empty; check your permissions if this is unexpected", + f"skipping {dn} because attrs ({attrs}) does not contain expected data; " + f"check your permissions if this is unexpected", ) continue From 52f1e32a65e7137b2cdec23d76503adff93cac8a Mon Sep 17 00:00:00 2001 From: Chris Collins Date: Wed, 23 Aug 2023 19:29:30 -0400 Subject: [PATCH 04/20] fix(ui) Remove new Role entity from searchable entity types (#8655) Co-authored-by: Indy Prentice Co-authored-by: Aseem Bansal --- .../linkedin/datahub/graphql/resolvers/search/SearchUtils.java | 1 - 1 file changed, 1 deletion(-) diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchUtils.java index e40bbca56b416..fe5b79ba2ea3d 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchUtils.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchUtils.java @@ -73,7 +73,6 @@ private SearchUtils() { EntityType.CONTAINER, EntityType.DOMAIN, EntityType.DATA_PRODUCT, - EntityType.ROLE, EntityType.NOTEBOOK); From a4cb81cc437796ccc1babfe3fe707924cffec69f Mon Sep 17 00:00:00 2001 From: Chris Collins Date: Wed, 23 Aug 2023 19:30:14 -0400 Subject: [PATCH 05/20] fix(java) Use alias for name search sorting and fix missing mappings (#8648) --- .../src/app/search/context/constants.ts | 14 ++++++++++--- .../models/SearchableFieldSpecExtractor.java | 3 ++- .../annotation/SearchableAnnotation.java | 21 ++++++++++++++++++- .../indexbuilder/MappingsBuilder.java | 18 ++++++++++++++++ .../metadata/search/utils/ESUtils.java | 5 ++++- .../indexbuilder/MappingsBuilderTest.java | 7 ++++++- .../request/AggregationQueryBuilderTest.java | 12 +++++++---- .../pegasus/com/linkedin/chart/ChartInfo.pdl | 3 ++- .../container/ContainerProperties.pdl | 3 ++- .../com/linkedin/dashboard/DashboardInfo.pdl | 3 ++- .../com/linkedin/datajob/DataFlowInfo.pdl | 3 ++- .../com/linkedin/datajob/DataJobInfo.pdl | 3 ++- .../dataplatform/DataPlatformInfo.pdl | 3 ++- .../DataPlatformInstanceProperties.pdl | 3 ++- .../dataproduct/DataProductProperties.pdl | 3 ++- .../linkedin/dataset/DatasetProperties.pdl | 3 ++- .../com/linkedin/domain/DomainProperties.pdl | 3 ++- .../linkedin/glossary/GlossaryNodeInfo.pdl | 3 ++- .../linkedin/glossary/GlossaryTermInfo.pdl | 3 ++- .../com/linkedin/identity/CorpGroupInfo.pdl | 3 ++- .../com/linkedin/identity/CorpUserInfo.pdl | 3 ++- 
.../linkedin/metadata/key/MLFeatureKey.pdl | 3 ++- .../metadata/key/MLFeatureTableKey.pdl | 3 ++- .../metadata/key/MLModelDeploymentKey.pdl | 3 ++- .../linkedin/metadata/key/MLModelGroupKey.pdl | 3 ++- .../com/linkedin/metadata/key/MLModelKey.pdl | 3 ++- .../linkedin/metadata/key/MLPrimaryKeyKey.pdl | 3 ++- .../com/linkedin/notebook/NotebookInfo.pdl | 3 ++- .../com/linkedin/role/RoleProperties.pdl | 3 ++- .../com/linkedin/tag/TagProperties.pdl | 3 ++- .../com.linkedin.entity.aspects.snapshot.json | 12 +++++++++++ ...com.linkedin.entity.entities.snapshot.json | 17 +++++++++++++++ .../com.linkedin.entity.runs.snapshot.json | 12 +++++++++++ ...nkedin.operations.operations.snapshot.json | 12 +++++++++++ ...m.linkedin.platform.platform.snapshot.json | 17 +++++++++++++++ .../com/datahub/test/TestEntityInfo.pdl | 3 ++- 36 files changed, 187 insertions(+), 35 deletions(-) diff --git a/datahub-web-react/src/app/search/context/constants.ts b/datahub-web-react/src/app/search/context/constants.ts index 372230db023e9..5f841b8536e19 100644 --- a/datahub-web-react/src/app/search/context/constants.ts +++ b/datahub-web-react/src/app/search/context/constants.ts @@ -1,15 +1,23 @@ import { SortOrder } from '../../../types.generated'; export const RELEVANCE = 'relevance'; -export const NAME_FIELD = 'name'; +export const ENTITY_NAME_FIELD = '_entityName'; export const LAST_OPERATION_TIME_FIELD = 'lastOperationTime'; export const DEFAULT_SORT_OPTION = RELEVANCE; export const SORT_OPTIONS = { [RELEVANCE]: { label: 'Relevance', field: RELEVANCE, sortOrder: SortOrder.Descending }, - [`${NAME_FIELD}_${SortOrder.Ascending}`]: { label: 'A to Z', field: NAME_FIELD, sortOrder: SortOrder.Ascending }, - [`${NAME_FIELD}_${SortOrder.Descending}`]: { label: 'Z to A', field: NAME_FIELD, sortOrder: SortOrder.Descending }, + [`${ENTITY_NAME_FIELD}_${SortOrder.Ascending}`]: { + label: 'A to Z', + field: ENTITY_NAME_FIELD, + sortOrder: SortOrder.Ascending, + }, + [`${ENTITY_NAME_FIELD}_${SortOrder.Descending}`]: { + label: 'Z to A', + field: ENTITY_NAME_FIELD, + sortOrder: SortOrder.Descending, + }, [`${LAST_OPERATION_TIME_FIELD}_${SortOrder.Descending}`]: { label: 'Last Modified in Platform', field: LAST_OPERATION_TIME_FIELD, diff --git a/entity-registry/src/main/java/com/linkedin/metadata/models/SearchableFieldSpecExtractor.java b/entity-registry/src/main/java/com/linkedin/metadata/models/SearchableFieldSpecExtractor.java index 2ffd9283ed456..8f2f42cd69cae 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/models/SearchableFieldSpecExtractor.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/models/SearchableFieldSpecExtractor.java @@ -155,7 +155,8 @@ private void extractSearchableAnnotation(final Object annotationObj, final DataS annotation.getBoostScore(), annotation.getHasValuesFieldName(), annotation.getNumValuesFieldName(), - annotation.getWeightsPerFieldValue()); + annotation.getWeightsPerFieldValue(), + annotation.getFieldNameAliases()); } } log.debug("Searchable annotation for field: {} : {}", schemaPathSpec, annotation); diff --git a/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java b/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java index 3d3fbcf3ccaa6..d5e5044f95c23 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java @@ -4,7 +4,10 @@ import 
com.google.common.collect.ImmutableSet; import com.linkedin.data.schema.DataSchema; import com.linkedin.metadata.models.ModelValidationException; + +import java.util.ArrayList; import java.util.Arrays; +import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Set; @@ -19,6 +22,7 @@ @Value public class SearchableAnnotation { + public static final String FIELD_NAME_ALIASES = "fieldNameAliases"; public static final String ANNOTATION_NAME = "Searchable"; private static final Set DEFAULT_QUERY_FIELD_TYPES = ImmutableSet.of(FieldType.TEXT, FieldType.TEXT_PARTIAL, FieldType.WORD_GRAM, FieldType.URN, FieldType.URN_PARTIAL); @@ -47,6 +51,8 @@ public class SearchableAnnotation { Optional numValuesFieldName; // (Optional) Weights to apply to score for a given value Map weightsPerFieldValue; + // (Optional) Aliases for this given field that can be used for sorting etc. + List fieldNameAliases; public enum FieldType { KEYWORD, @@ -94,6 +100,7 @@ public static SearchableAnnotation fromPegasusAnnotationObject(@Nonnull final Ob final Optional numValuesFieldName = AnnotationUtils.getField(map, "numValuesFieldName", String.class); final Optional weightsPerFieldValueMap = AnnotationUtils.getField(map, "weightsPerFieldValue", Map.class).map(m -> (Map) m); + final List fieldNameAliases = getFieldNameAliases(map); final FieldType resolvedFieldType = getFieldType(fieldType, schemaDataType); return new SearchableAnnotation( @@ -108,7 +115,8 @@ public static SearchableAnnotation fromPegasusAnnotationObject(@Nonnull final Ob boostScore.orElse(1.0), hasValuesFieldName, numValuesFieldName, - weightsPerFieldValueMap.orElse(ImmutableMap.of())); + weightsPerFieldValueMap.orElse(ImmutableMap.of()), + fieldNameAliases); } private static FieldType getFieldType(Optional maybeFieldType, DataSchema.Type schemaDataType) { @@ -156,4 +164,15 @@ private static String capitalizeFirstLetter(String str) { return str.substring(0, 1).toUpperCase() + str.substring(1); } } + + private static List getFieldNameAliases(Map map) { + final List aliases = new ArrayList<>(); + final Optional fieldNameAliases = AnnotationUtils.getField(map, FIELD_NAME_ALIASES, List.class); + if (fieldNameAliases.isPresent()) { + for (Object alias : fieldNameAliases.get()) { + aliases.add((String) alias); + } + } + return aliases; + } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java index efa4e0c279a76..4bbff3915aca9 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java @@ -6,6 +6,7 @@ import com.linkedin.metadata.models.SearchableFieldSpec; import com.linkedin.metadata.models.annotation.SearchableAnnotation.FieldType; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -46,6 +47,10 @@ public static Map getPartialNgramConfigWithOverrides(Map getMappingsForField(@Nonnull final Searchable searchableFieldSpec.getSearchableAnnotation() .getNumValuesFieldName() .ifPresent(fieldName -> mappings.put(fieldName, ImmutableMap.of(TYPE, LONG))); + mappings.putAll(getMappingsForFieldNameAliases(searchableFieldSpec)); return mappings; } @@ -189,4 +195,16 @@ private static Map getMappingsForSearchScoreField( return 
ImmutableMap.of(searchScoreFieldSpec.getSearchScoreAnnotation().getFieldName(), ImmutableMap.of(TYPE, DOUBLE)); } + + private static Map getMappingsForFieldNameAliases(@Nonnull final SearchableFieldSpec searchableFieldSpec) { + Map mappings = new HashMap<>(); + List fieldNameAliases = searchableFieldSpec.getSearchableAnnotation().getFieldNameAliases(); + fieldNameAliases.forEach(alias -> { + Map aliasMappings = new HashMap<>(); + aliasMappings.put(TYPE, ALIAS); + aliasMappings.put(PATH, searchableFieldSpec.getSearchableAnnotation().getFieldName()); + mappings.put(alias, aliasMappings); + }); + return mappings; + } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java index 8a385e4ab2b54..5179f2be6d060 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java @@ -45,6 +45,7 @@ public class ESUtils { public static final int MAX_RESULT_SIZE = 10000; public static final String OPAQUE_ID_HEADER = "X-Opaque-Id"; public static final String HEADER_VALUE_DELIMITER = "|"; + public static final String KEYWORD_TYPE = "keyword"; // we use this to make sure we filter for editable & non-editable fields. Also expands out top-level properties // to field level properties @@ -174,6 +175,8 @@ public static QueryBuilder getQueryBuilderFromCriterion(@Nonnull final Criterion * If no sort criterion is provided then the default sorting criterion is chosen which is descending order of score * Furthermore to resolve conflicts, the results are further sorted by ascending order of urn * If the input sort criterion is urn itself, then no additional sort criterion is applied as there will be no conflicts. + * When sorting, set the unmappedType param to arbitrary "keyword" so we essentially ignore sorting where indices do not + * have the field we are sorting on. *
* * @param searchSourceBuilder {@link SearchSourceBuilder} that needs to be populated with sort order @@ -187,7 +190,7 @@ public static void buildSortOrder(@Nonnull SearchSourceBuilder searchSourceBuild final SortOrder esSortOrder = (sortCriterion.getOrder() == com.linkedin.metadata.query.filter.SortOrder.ASCENDING) ? SortOrder.ASC : SortOrder.DESC; - searchSourceBuilder.sort(new FieldSortBuilder(sortCriterion.getField()).order(esSortOrder)); + searchSourceBuilder.sort(new FieldSortBuilder(sortCriterion.getField()).order(esSortOrder).unmappedType(KEYWORD_TYPE)); } if (sortCriterion == null || !sortCriterion.getField().equals(DEFAULT_SEARCH_RESULTS_SORT_BY_FIELD)) { searchSourceBuilder.sort(new FieldSortBuilder(DEFAULT_SEARCH_RESULTS_SORT_BY_FIELD).order(SortOrder.ASC)); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java index 5a8f80f325dbd..0b33185549299 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java @@ -16,7 +16,7 @@ public void testMappingsBuilder() { Map result = MappingsBuilder.getMappings(TestEntitySpecBuilder.getSpec()); assertEquals(result.size(), 1); Map properties = (Map) result.get("properties"); - assertEquals(properties.size(), 18); + assertEquals(properties.size(), 19); assertEquals(properties.get("urn"), ImmutableMap.of("type", "keyword", "fields", ImmutableMap.of("delimited", @@ -66,6 +66,11 @@ public void testMappingsBuilder() { assertTrue(textFieldSubfields.containsKey("delimited")); assertTrue(textFieldSubfields.containsKey("keyword")); + // TEXT with addToFilters aliased under "_entityName" + Map textFieldAlias = (Map) properties.get("_entityName"); + assertEquals(textFieldAlias.get("type"), "alias"); + assertEquals(textFieldAlias.get("path"), "textFieldOverride"); + // TEXT_PARTIAL Map textArrayField = (Map) properties.get("textArrayField"); assertEquals(textArrayField.get("type"), "keyword"); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/AggregationQueryBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/AggregationQueryBuilderTest.java index 10b4ee42b1a71..36c8bb8f9a676 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/AggregationQueryBuilderTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/AggregationQueryBuilderTest.java @@ -31,7 +31,8 @@ public void testGetDefaultAggregationsHasFields() { 1.0, Optional.of("hasTest"), Optional.empty(), - Collections.emptyMap() + Collections.emptyMap(), + Collections.emptyList() ); SearchConfiguration config = new SearchConfiguration(); @@ -60,7 +61,8 @@ public void testGetDefaultAggregationsFields() { 1.0, Optional.empty(), Optional.empty(), - Collections.emptyMap() + Collections.emptyMap(), + Collections.emptyList() ); SearchConfiguration config = new SearchConfiguration(); @@ -89,7 +91,8 @@ public void testGetSpecificAggregationsHasFields() { 1.0, Optional.of("hasTest1"), Optional.empty(), - Collections.emptyMap() + Collections.emptyMap(), + Collections.emptyList() ); SearchableAnnotation annotation2 = new SearchableAnnotation( @@ -104,7 +107,8 @@ public void 
testGetSpecificAggregationsHasFields() { 1.0, Optional.empty(), Optional.empty(), - Collections.emptyMap() + Collections.emptyMap(), + Collections.emptyList() ); SearchConfiguration config = new SearchConfiguration(); diff --git a/metadata-models/src/main/pegasus/com/linkedin/chart/ChartInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/chart/ChartInfo.pdl index 5047c824e2617..9fea71003ae6e 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/chart/ChartInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/chart/ChartInfo.pdl @@ -21,7 +21,8 @@ record ChartInfo includes CustomProperties, ExternalReference { */ @Searchable = { "fieldType": "WORD_GRAM", - "enableAutocomplete": true + "enableAutocomplete": true, + "fieldNameAliases": [ "_entityName" ] } title: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/container/ContainerProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/container/ContainerProperties.pdl index 0b9c89ea30c90..526878cbe60d3 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/container/ContainerProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/container/ContainerProperties.pdl @@ -17,7 +17,8 @@ record ContainerProperties includes CustomProperties, ExternalReference { @Searchable = { "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl index 84b3065a08022..c436011eb58db 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl @@ -24,7 +24,8 @@ record DashboardInfo includes CustomProperties, ExternalReference { @Searchable = { "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } title: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/datajob/DataFlowInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/datajob/DataFlowInfo.pdl index 1303bfbc863ea..2ff3e8cd930af 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/datajob/DataFlowInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/datajob/DataFlowInfo.pdl @@ -19,7 +19,8 @@ record DataFlowInfo includes CustomProperties, ExternalReference { @Searchable = { "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl index 1e305816f96a2..250fb76003777 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl @@ -20,7 +20,8 @@ record DataJobInfo includes CustomProperties, ExternalReference { @Searchable = { "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataplatform/DataPlatformInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataplatform/DataPlatformInfo.pdl index 0be58d73dc79f..5dd35c7f49520 100644 --- 
a/metadata-models/src/main/pegasus/com/linkedin/dataplatform/DataPlatformInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataplatform/DataPlatformInfo.pdl @@ -17,7 +17,8 @@ record DataPlatformInfo { @Searchable = { "fieldType": "WORD_GRAM", "enableAutocomplete": false, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataplatforminstance/DataPlatformInstanceProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataplatforminstance/DataPlatformInstanceProperties.pdl index 1220741ee5726..b24e220ac3bcf 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataplatforminstance/DataPlatformInstanceProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataplatforminstance/DataPlatformInstanceProperties.pdl @@ -18,7 +18,8 @@ record DataPlatformInstanceProperties includes CustomProperties, ExternalReferen @Searchable = { "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: optional string diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataproduct/DataProductProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataproduct/DataProductProperties.pdl index c0a50a5e0e688..b2d26094fd0b7 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataproduct/DataProductProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataproduct/DataProductProperties.pdl @@ -15,7 +15,8 @@ record DataProductProperties includes CustomProperties, ExternalReference { @Searchable = { "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: optional string diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl index 49d0dcd58ee27..ad8705a29d4ed 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl @@ -19,7 +19,8 @@ record DatasetProperties includes CustomProperties, ExternalReference { @Searchable = { "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: optional string diff --git a/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl index a362d412a32b9..5c8c8a4912e4c 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl @@ -16,7 +16,8 @@ record DomainProperties { @Searchable = { "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryNodeInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryNodeInfo.pdl index 557b5e2a0f419..c3388d4f462d4 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryNodeInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryNodeInfo.pdl @@ -37,7 +37,8 @@ record GlossaryNodeInfo { "fieldName": "displayName", "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + 
"boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: optional string diff --git a/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryTermInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryTermInfo.pdl index 13e7af311fba1..e987a71be7131 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryTermInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryTermInfo.pdl @@ -25,7 +25,8 @@ record GlossaryTermInfo includes CustomProperties { @Searchable = { "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: optional string diff --git a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpGroupInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpGroupInfo.pdl index 8d764604237da..28b87476c61bd 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpGroupInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpGroupInfo.pdl @@ -21,7 +21,8 @@ record CorpGroupInfo { "fieldType": "TEXT_PARTIAL" "queryByDefault": true, "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } displayName: optional string diff --git a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserInfo.pdl index 6cb0e8fd6aa6d..382b120fa942a 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserInfo.pdl @@ -29,7 +29,8 @@ record CorpUserInfo includes CustomProperties { "fieldType": "WORD_GRAM", "queryByDefault": true, "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } displayName: optional string diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl index 0dcb194bccce0..050b954c89fb8 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl @@ -22,7 +22,8 @@ record MLFeatureKey { @Searchable = { "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 8.0 + "boostScore": 8.0, + "fieldNameAliases": [ "_entityName" ] } name: string } diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureTableKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureTableKey.pdl index 880daa4423573..175a7b0d31b00 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureTableKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureTableKey.pdl @@ -24,7 +24,8 @@ record MLFeatureTableKey { @Searchable = { "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 8.0 + "boostScore": 8.0, + "fieldNameAliases": [ "_entityName" ] } name: string } diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelDeploymentKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelDeploymentKey.pdl index 83ba35e0af601..daa1deceb5fc3 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelDeploymentKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelDeploymentKey.pdl @@ -21,7 +21,8 @@ record MLModelDeploymentKey { 
@Searchable = { "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelGroupKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelGroupKey.pdl index b1e2b7b7ede70..582a899633c2a 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelGroupKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelGroupKey.pdl @@ -21,7 +21,8 @@ record MLModelGroupKey { @Searchable = { "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl index 24fe89dcce654..f097bbda738a2 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl @@ -21,7 +21,8 @@ record MLModelKey { @Searchable = { "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLPrimaryKeyKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLPrimaryKeyKey.pdl index 7987f3a3345b7..ef812df206b46 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLPrimaryKeyKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLPrimaryKeyKey.pdl @@ -23,7 +23,8 @@ record MLPrimaryKeyKey { @Searchable = { "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 8.0 + "boostScore": 8.0, + "fieldNameAliases": [ "_entityName" ] } name: string } diff --git a/metadata-models/src/main/pegasus/com/linkedin/notebook/NotebookInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/notebook/NotebookInfo.pdl index 5df4daacffa49..8ec5f262890f3 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/notebook/NotebookInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/notebook/NotebookInfo.pdl @@ -20,7 +20,8 @@ record NotebookInfo includes CustomProperties, ExternalReference { @Searchable = { "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } title: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/role/RoleProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/role/RoleProperties.pdl index 84d8ecc379ec2..8422d3c49046c 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/role/RoleProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/role/RoleProperties.pdl @@ -16,7 +16,8 @@ record RoleProperties { @Searchable = { "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/tag/TagProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/tag/TagProperties.pdl index e808aef491749..9df47fac3928a 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/tag/TagProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/tag/TagProperties.pdl @@ -13,7 +13,8 @@ record TagProperties { @Searchable = { "fieldType": "WORD_GRAM", "enableAutocomplete": true, - 
"boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: string diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json index 7aeca546af3c9..e3beef5ac4871 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json @@ -341,6 +341,7 @@ "doc" : "Title of the chart", "Searchable" : { "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1279,6 +1280,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1405,6 +1407,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1464,6 +1467,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1865,6 +1869,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -2061,6 +2066,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldName" : "displayName", + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -2097,6 +2103,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -2161,6 +2168,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL", "queryByDefault" : true } @@ -2340,6 +2348,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL", "queryByDefault" : true } @@ -3217,6 +3226,7 @@ "Searchable" : { "boostScore" : 8.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } } ], @@ -3282,6 +3292,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -3867,6 +3878,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json index 83ecaf41022c4..e6198435bce6c 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json @@ -94,6 +94,7 @@ "doc" : "Title of the chart", "Searchable" : { "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1326,6 +1327,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1471,6 +1473,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1530,6 +1533,7 @@ "Searchable" : { 
"boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1922,6 +1926,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : false, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" }, "validate" : { @@ -2111,6 +2116,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -2437,6 +2443,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL", "queryByDefault" : true } @@ -2585,6 +2592,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL", "queryByDefault" : true } @@ -3704,6 +3712,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -4302,6 +4311,7 @@ "Searchable" : { "boostScore" : 8.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } } ], @@ -4390,6 +4400,7 @@ "Searchable" : { "boostScore" : 8.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } } ], @@ -4484,6 +4495,7 @@ "Searchable" : { "boostScore" : 8.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } } ], @@ -4590,6 +4602,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -4696,6 +4709,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -4796,6 +4810,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -4879,6 +4894,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -5096,6 +5112,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldName" : "displayName", + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json index b1489df3db55e..ffaefc8232e83 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json @@ -94,6 +94,7 @@ "doc" : "Title of the chart", "Searchable" : { "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1032,6 +1033,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1158,6 +1160,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1217,6 +1220,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1618,6 +1622,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : 
[ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1806,6 +1811,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldName" : "displayName", + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1842,6 +1848,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1906,6 +1913,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL", "queryByDefault" : true } @@ -2085,6 +2093,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL", "queryByDefault" : true } @@ -2962,6 +2971,7 @@ "Searchable" : { "boostScore" : 8.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } } ], @@ -3027,6 +3037,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -3612,6 +3623,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json index f4c2d16f84747..e385c7c30b21a 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json @@ -94,6 +94,7 @@ "doc" : "Title of the chart", "Searchable" : { "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1032,6 +1033,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1158,6 +1160,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1217,6 +1220,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1618,6 +1622,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1800,6 +1805,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldName" : "displayName", + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1836,6 +1842,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1900,6 +1907,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL", "queryByDefault" : true } @@ -2079,6 +2087,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL", "queryByDefault" : true } @@ -2956,6 +2965,7 @@ "Searchable" : { "boostScore" : 8.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } } ], @@ -3021,6 +3031,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + 
"fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -3606,6 +3617,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json index 2676c2687bd72..b85c84be23795 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json @@ -94,6 +94,7 @@ "doc" : "Title of the chart", "Searchable" : { "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1326,6 +1327,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1471,6 +1473,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1530,6 +1533,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1922,6 +1926,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : false, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" }, "validate" : { @@ -2111,6 +2116,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -2431,6 +2437,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL", "queryByDefault" : true } @@ -2579,6 +2586,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL", "queryByDefault" : true } @@ -3698,6 +3706,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -4296,6 +4305,7 @@ "Searchable" : { "boostScore" : 8.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } } ], @@ -4384,6 +4394,7 @@ "Searchable" : { "boostScore" : 8.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } } ], @@ -4478,6 +4489,7 @@ "Searchable" : { "boostScore" : 8.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } } ], @@ -4584,6 +4596,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -4690,6 +4703,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -4790,6 +4804,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -4873,6 +4888,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -5090,6 +5106,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldName" : "displayName", + "fieldNameAliases" : [ "_entityName" ], 
"fieldType" : "TEXT_PARTIAL" } }, { diff --git a/test-models/src/main/pegasus/com/datahub/test/TestEntityInfo.pdl b/test-models/src/main/pegasus/com/datahub/test/TestEntityInfo.pdl index cc579ba488174..6dff14133ee60 100644 --- a/test-models/src/main/pegasus/com/datahub/test/TestEntityInfo.pdl +++ b/test-models/src/main/pegasus/com/datahub/test/TestEntityInfo.pdl @@ -14,7 +14,8 @@ record TestEntityInfo includes CustomProperties { @Searchable = { "fieldName": "textFieldOverride", "fieldType": "TEXT", - "addToFilters": true + "addToFilters": true, + "fieldNameAliases": [ "_entityName" ] } textField: optional string From e12d910648267701e5146355e71f2658e83719f7 Mon Sep 17 00:00:00 2001 From: Chris Collins Date: Wed, 23 Aug 2023 21:30:27 -0400 Subject: [PATCH 06/20] feat(ui) Create page for managing home page posts (#8707) --- .../authorization/AuthorizationUtils.java | 26 ++- .../datahub/graphql/resolvers/MeResolver.java | 1 + .../resolvers/post/DeletePostResolver.java | 2 +- .../src/main/resources/app.graphql | 5 + datahub-web-react/src/Mocks.tsx | 2 + .../src/app/search/PostLinkCard.tsx | 25 ++- .../src/app/search/PostTextCard.tsx | 5 +- .../src/app/settings/SettingsPage.tsx | 9 + .../src/app/settings/posts/CreatePostForm.tsx | 91 ++++++++ .../app/settings/posts/CreatePostModal.tsx | 107 ++++++++++ .../src/app/settings/posts/ManagePosts.tsx | 40 ++++ .../src/app/settings/posts/PostItemMenu.tsx | 62 ++++++ .../src/app/settings/posts/PostsList.tsx | 200 ++++++++++++++++++ .../app/settings/posts/PostsListColumns.tsx | 26 +++ .../src/app/settings/posts/constants.ts | 13 ++ .../src/app/settings/posts/utils.ts | 77 +++++++ datahub-web-react/src/conf/Global.ts | 1 + datahub-web-react/src/graphql/me.graphql | 1 + datahub-web-react/src/graphql/post.graphql | 8 + .../war/src/main/resources/boot/policies.json | 4 + .../authorization/PoliciesConfig.java | 6 + 21 files changed, 699 insertions(+), 12 deletions(-) create mode 100644 datahub-web-react/src/app/settings/posts/CreatePostForm.tsx create mode 100644 datahub-web-react/src/app/settings/posts/CreatePostModal.tsx create mode 100644 datahub-web-react/src/app/settings/posts/ManagePosts.tsx create mode 100644 datahub-web-react/src/app/settings/posts/PostItemMenu.tsx create mode 100644 datahub-web-react/src/app/settings/posts/PostsList.tsx create mode 100644 datahub-web-react/src/app/settings/posts/PostsListColumns.tsx create mode 100644 datahub-web-react/src/app/settings/posts/constants.ts create mode 100644 datahub-web-react/src/app/settings/posts/utils.ts diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/authorization/AuthorizationUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/authorization/AuthorizationUtils.java index 94880c77d74bc..3089b8c8fc2db 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/authorization/AuthorizationUtils.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/authorization/AuthorizationUtils.java @@ -107,7 +107,31 @@ public static boolean canEditGroupMembers(@Nonnull String groupUrnStr, @Nonnull } public static boolean canCreateGlobalAnnouncements(@Nonnull QueryContext context) { - return isAuthorized(context, Optional.empty(), PoliciesConfig.CREATE_GLOBAL_ANNOUNCEMENTS_PRIVILEGE); + final DisjunctivePrivilegeGroup orPrivilegeGroups = new DisjunctivePrivilegeGroup( + ImmutableList.of( + new ConjunctivePrivilegeGroup(ImmutableList.of( + PoliciesConfig.CREATE_GLOBAL_ANNOUNCEMENTS_PRIVILEGE.getType())), + new 
ConjunctivePrivilegeGroup(ImmutableList.of( + PoliciesConfig.MANAGE_GLOBAL_ANNOUNCEMENTS_PRIVILEGE.getType())) + )); + + return AuthorizationUtils.isAuthorized( + context.getAuthorizer(), + context.getActorUrn(), + orPrivilegeGroups); + } + + public static boolean canManageGlobalAnnouncements(@Nonnull QueryContext context) { + final DisjunctivePrivilegeGroup orPrivilegeGroups = new DisjunctivePrivilegeGroup( + ImmutableList.of( + new ConjunctivePrivilegeGroup(ImmutableList.of( + PoliciesConfig.MANAGE_GLOBAL_ANNOUNCEMENTS_PRIVILEGE.getType())) + )); + + return AuthorizationUtils.isAuthorized( + context.getAuthorizer(), + context.getActorUrn(), + orPrivilegeGroups); } public static boolean canManageGlobalViews(@Nonnull QueryContext context) { diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/MeResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/MeResolver.java index d2a7b19857f95..02921b453e315 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/MeResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/MeResolver.java @@ -74,6 +74,7 @@ public CompletableFuture get(DataFetchingEnvironment environm platformPrivileges.setManageTags(AuthorizationUtils.canManageTags(context)); platformPrivileges.setManageGlobalViews(AuthorizationUtils.canManageGlobalViews(context)); platformPrivileges.setManageOwnershipTypes(AuthorizationUtils.canManageOwnershipTypes(context)); + platformPrivileges.setManageGlobalAnnouncements(AuthorizationUtils.canManageGlobalAnnouncements(context)); // Construct and return authenticated user object. final AuthenticatedUser authUser = new AuthenticatedUser(); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/post/DeletePostResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/post/DeletePostResolver.java index cd2a3dda70033..d3cd0126fb852 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/post/DeletePostResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/post/DeletePostResolver.java @@ -23,7 +23,7 @@ public class DeletePostResolver implements DataFetcher get(final DataFetchingEnvironment environment) throws Exception { final QueryContext context = environment.getContext(); - if (!AuthorizationUtils.canCreateGlobalAnnouncements(context)) { + if (!AuthorizationUtils.canManageGlobalAnnouncements(context)) { throw new AuthorizationException( "Unauthorized to delete posts. Please contact your DataHub administrator if this needs corrective action."); } diff --git a/datahub-graphql-core/src/main/resources/app.graphql b/datahub-graphql-core/src/main/resources/app.graphql index 37183bac13f0e..761242a6711c1 100644 --- a/datahub-graphql-core/src/main/resources/app.graphql +++ b/datahub-graphql-core/src/main/resources/app.graphql @@ -125,6 +125,11 @@ type PlatformPrivileges { Whether the user should be able to create, update, and delete ownership types. """ manageOwnershipTypes: Boolean! + + """ + Whether the user can create and delete posts pinned to the home page. + """ + manageGlobalAnnouncements: Boolean! 
} """ diff --git a/datahub-web-react/src/Mocks.tsx b/datahub-web-react/src/Mocks.tsx index dcefc7f70d785..b772341370050 100644 --- a/datahub-web-react/src/Mocks.tsx +++ b/datahub-web-react/src/Mocks.tsx @@ -3363,6 +3363,7 @@ export const mocks = [ generatePersonalAccessTokens: true, manageGlobalViews: true, manageOwnershipTypes: true, + manageGlobalAnnouncements: true, }, }, }, @@ -3609,4 +3610,5 @@ export const platformPrivileges: PlatformPrivileges = { createDomains: true, manageGlobalViews: true, manageOwnershipTypes: true, + manageGlobalAnnouncements: true, }; diff --git a/datahub-web-react/src/app/search/PostLinkCard.tsx b/datahub-web-react/src/app/search/PostLinkCard.tsx index 04308632c61c9..2111c0b25ad84 100644 --- a/datahub-web-react/src/app/search/PostLinkCard.tsx +++ b/datahub-web-react/src/app/search/PostLinkCard.tsx @@ -39,12 +39,17 @@ const TextContainer = styled.div` flex: 2; `; -const TextWrapper = styled.div` - text-align: left; +const FlexWrapper = styled.div<{ alignCenter?: boolean }>` display: flex; flex-direction: column; justify-content: center; flex: 2; + ${(props) => props.alignCenter && 'align-items: center;'} +`; + +const TextWrapper = styled.div` + display: flex; + flex-direction: column; `; const HeaderText = styled(Typography.Text)` @@ -74,19 +79,21 @@ export const PostLinkCard = ({ linkPost }: Props) => { const link = linkPost?.content?.link || ''; return ( - + {hasMedia && ( )} - - Link - - {linkPost?.content?.title} - - + + + Link + + {linkPost?.content?.title} + + + diff --git a/datahub-web-react/src/app/search/PostTextCard.tsx b/datahub-web-react/src/app/search/PostTextCard.tsx index 1bba55425fe0d..15b34e37fc01c 100644 --- a/datahub-web-react/src/app/search/PostTextCard.tsx +++ b/datahub-web-react/src/app/search/PostTextCard.tsx @@ -7,7 +7,6 @@ import { Post } from '../../types.generated'; const CardContainer = styled.div` display: flex; flex-direction: row; - min-height: 140px; border: 1px solid ${ANTD_GRAY[4]}; border-radius: 12px; box-shadow: ${(props) => props.theme.styles['box-shadow']}; @@ -15,6 +14,7 @@ const CardContainer = styled.div` box-shadow: ${(props) => props.theme.styles['box-shadow-hover']}; } white-space: unset; + padding-bottom: 4px; `; const TextContainer = styled.div` @@ -28,6 +28,9 @@ const TextContainer = styled.div` const TitleText = styled(Typography.Title)` word-break: break-word; min-height: 20px; + &&& { + margin-top: 8px; + } `; const HeaderText = styled(Typography.Text)` diff --git a/datahub-web-react/src/app/settings/SettingsPage.tsx b/datahub-web-react/src/app/settings/SettingsPage.tsx index bfec9b395cff2..339cc0cf44bac 100644 --- a/datahub-web-react/src/app/settings/SettingsPage.tsx +++ b/datahub-web-react/src/app/settings/SettingsPage.tsx @@ -7,6 +7,7 @@ import { ToolOutlined, FilterOutlined, TeamOutlined, + PushpinOutlined, } from '@ant-design/icons'; import { Redirect, Route, useHistory, useLocation, useRouteMatch, Switch } from 'react-router'; import styled from 'styled-components'; @@ -19,6 +20,7 @@ import { Preferences } from './Preferences'; import { ManageViews } from '../entity/view/ManageViews'; import { useUserContext } from '../context/useUserContext'; import { ManageOwnership } from '../entity/ownership/ManageOwnership'; +import ManagePosts from './posts/ManagePosts'; const PageContainer = styled.div` display: flex; @@ -62,6 +64,7 @@ const PATHS = [ { path: 'preferences', content: }, { path: 'views', content: }, { path: 'ownership', content: }, + { path: 'posts', content: }, ]; /** @@ -91,6 +94,7 @@ export const 
SettingsPage = () => { const showUsersGroups = (isIdentityManagementEnabled && me && me?.platformPrivileges?.manageIdentities) || false; const showViews = isViewsEnabled || false; const showOwnershipTypes = me && me?.platformPrivileges?.manageOwnershipTypes; + const showHomePagePosts = me && me?.platformPrivileges?.manageGlobalAnnouncements; return ( @@ -143,6 +147,11 @@ export const SettingsPage = () => { Ownership Types )} + {showHomePagePosts && ( + + Home Page Posts + + )} diff --git a/datahub-web-react/src/app/settings/posts/CreatePostForm.tsx b/datahub-web-react/src/app/settings/posts/CreatePostForm.tsx new file mode 100644 index 0000000000000..a8d6cfa64c9c1 --- /dev/null +++ b/datahub-web-react/src/app/settings/posts/CreatePostForm.tsx @@ -0,0 +1,91 @@ +import React, { useState } from 'react'; +import { Form, Input, Typography, FormInstance, Radio } from 'antd'; +import styled from 'styled-components'; +import { + DESCRIPTION_FIELD_NAME, + LINK_FIELD_NAME, + LOCATION_FIELD_NAME, + TITLE_FIELD_NAME, + TYPE_FIELD_NAME, +} from './constants'; +import { PostContentType } from '../../../types.generated'; + +const TopFormItem = styled(Form.Item)` + margin-bottom: 24px; +`; + +const SubFormItem = styled(Form.Item)` + margin-bottom: 0; +`; + +type Props = { + setCreateButtonEnabled: (isEnabled: boolean) => void; + form: FormInstance; +}; + +export default function CreatePostForm({ setCreateButtonEnabled, form }: Props) { + const [postType, setPostType] = useState(PostContentType.Text); + + return ( +
{ + setCreateButtonEnabled(!form.getFieldsError().some((field) => field.errors.length > 0)); + }} + > + Post Type}> + setPostType(e.target.value)} + value={postType} + defaultValue={postType} + optionType="button" + buttonStyle="solid" + > + Announcement + Link + + + + Title}> + The title for your new post. + + + + + {postType === PostContentType.Text && ( + Description}> + The main content for your new post. + + + + + )} + {postType === PostContentType.Link && ( + <> + Link URL}> + + Where users will be directed when they click this post. + + + + + + Image URL}> + + A URL to an image you want to display on your link post. + + + + + + + )} +
+ ); +} diff --git a/datahub-web-react/src/app/settings/posts/CreatePostModal.tsx b/datahub-web-react/src/app/settings/posts/CreatePostModal.tsx new file mode 100644 index 0000000000000..b4851ecb02969 --- /dev/null +++ b/datahub-web-react/src/app/settings/posts/CreatePostModal.tsx @@ -0,0 +1,107 @@ +import React, { useState } from 'react'; +import { Button, Form, message, Modal } from 'antd'; +import CreatePostForm from './CreatePostForm'; +import { + CREATE_POST_BUTTON_ID, + DESCRIPTION_FIELD_NAME, + LINK_FIELD_NAME, + LOCATION_FIELD_NAME, + TYPE_FIELD_NAME, + TITLE_FIELD_NAME, +} from './constants'; +import { useEnterKeyListener } from '../../shared/useEnterKeyListener'; +import { MediaType, PostContentType, PostType } from '../../../types.generated'; +import { useCreatePostMutation } from '../../../graphql/mutations.generated'; + +type Props = { + onClose: () => void; + onCreate: ( + contentType: string, + title: string, + description: string | undefined, + link: string | undefined, + location: string | undefined, + ) => void; +}; + +export default function CreatePostModal({ onClose, onCreate }: Props) { + const [createPostMutation] = useCreatePostMutation(); + const [createButtonEnabled, setCreateButtonEnabled] = useState(false); + const [form] = Form.useForm(); + const onCreatePost = () => { + const contentTypeValue = form.getFieldValue(TYPE_FIELD_NAME) ?? PostContentType.Text; + const mediaValue = + form.getFieldValue(TYPE_FIELD_NAME) && form.getFieldValue(LOCATION_FIELD_NAME) + ? { + type: MediaType.Image, + location: form.getFieldValue(LOCATION_FIELD_NAME) ?? null, + } + : null; + createPostMutation({ + variables: { + input: { + postType: PostType.HomePageAnnouncement, + content: { + contentType: contentTypeValue, + title: form.getFieldValue(TITLE_FIELD_NAME), + description: form.getFieldValue(DESCRIPTION_FIELD_NAME) ?? null, + link: form.getFieldValue(LINK_FIELD_NAME) ?? null, + media: mediaValue, + }, + }, + }, + }) + .then(({ errors }) => { + if (!errors) { + message.success({ + content: `Created Post!`, + duration: 3, + }); + onCreate( + form.getFieldValue(TYPE_FIELD_NAME) ?? PostContentType.Text, + form.getFieldValue(TITLE_FIELD_NAME), + form.getFieldValue(DESCRIPTION_FIELD_NAME), + form.getFieldValue(LINK_FIELD_NAME), + form.getFieldValue(LOCATION_FIELD_NAME), + ); + form.resetFields(); + } + }) + .catch((e) => { + message.destroy(); + message.error({ content: 'Failed to create Post! 
An unknown error occured.', duration: 3 }); + console.error('Failed to create Post:', e.message); + }); + onClose(); + }; + + // Handle the Enter press + useEnterKeyListener({ + querySelectorToExecuteClick: '#createPostButton', + }); + + return ( + + + + + } + > + + + ); +} diff --git a/datahub-web-react/src/app/settings/posts/ManagePosts.tsx b/datahub-web-react/src/app/settings/posts/ManagePosts.tsx new file mode 100644 index 0000000000000..e0f694c192c62 --- /dev/null +++ b/datahub-web-react/src/app/settings/posts/ManagePosts.tsx @@ -0,0 +1,40 @@ +import { Typography } from 'antd'; +import React from 'react'; +import styled from 'styled-components/macro'; +import { PostList } from './PostsList'; + +const PageContainer = styled.div` + padding-top: 20px; + width: 100%; + height: 100%; +`; + +const PageHeaderContainer = styled.div` + && { + padding-left: 24px; + } +`; + +const PageTitle = styled(Typography.Title)` + && { + margin-bottom: 12px; + } +`; + +const ListContainer = styled.div``; + +export default function ManagePosts() { + return ( + + + Home Page Posts + + View and manage pinned posts that appear to all users on the landing page. + + + + + + + ); +} diff --git a/datahub-web-react/src/app/settings/posts/PostItemMenu.tsx b/datahub-web-react/src/app/settings/posts/PostItemMenu.tsx new file mode 100644 index 0000000000000..e3fc424a47ef2 --- /dev/null +++ b/datahub-web-react/src/app/settings/posts/PostItemMenu.tsx @@ -0,0 +1,62 @@ +import React from 'react'; +import { DeleteOutlined } from '@ant-design/icons'; +import { Dropdown, Menu, message, Modal } from 'antd'; +import { MenuIcon } from '../../entity/shared/EntityDropdown/EntityDropdown'; +import { useDeletePostMutation } from '../../../graphql/post.generated'; + +type Props = { + urn: string; + title: string; + onDelete?: () => void; +}; + +export default function PostItemMenu({ title, urn, onDelete }: Props) { + const [deletePostMutation] = useDeletePostMutation(); + + const deletePost = () => { + deletePostMutation({ + variables: { + urn, + }, + }) + .then(({ errors }) => { + if (!errors) { + message.success('Deleted Post!'); + onDelete?.(); + } + }) + .catch(() => { + message.destroy(); + message.error({ content: `Failed to delete Post!: An unknown error occurred.`, duration: 3 }); + }); + }; + + const onConfirmDelete = () => { + Modal.confirm({ + title: `Delete Post '${title}'`, + content: `Are you sure you want to remove this Post?`, + onOk() { + deletePost(); + }, + onCancel() {}, + okText: 'Yes', + maskClosable: true, + closable: true, + }); + }; + + return ( + + +  Delete + + + } + > + + + ); +} diff --git a/datahub-web-react/src/app/settings/posts/PostsList.tsx b/datahub-web-react/src/app/settings/posts/PostsList.tsx new file mode 100644 index 0000000000000..5ae2be1547f9b --- /dev/null +++ b/datahub-web-react/src/app/settings/posts/PostsList.tsx @@ -0,0 +1,200 @@ +import React, { useEffect, useState } from 'react'; +import { Button, Empty, Pagination, Typography } from 'antd'; +import { useLocation } from 'react-router'; +import styled from 'styled-components'; +import * as QueryString from 'query-string'; +import { PlusOutlined } from '@ant-design/icons'; +import { AlignType } from 'rc-table/lib/interface'; +import CreatePostModal from './CreatePostModal'; +import { PostColumn, PostEntry, PostListMenuColumn } from './PostsListColumns'; +import { useEntityRegistry } from '../../useEntityRegistry'; +import { useListPostsQuery } from '../../../graphql/post.generated'; +import { scrollToTop } from 
'../../shared/searchUtils'; +import { addToListPostCache, removeFromListPostCache } from './utils'; +import { Message } from '../../shared/Message'; +import TabToolbar from '../../entity/shared/components/styled/TabToolbar'; +import { SearchBar } from '../../search/SearchBar'; +import { StyledTable } from '../../entity/shared/components/styled/StyledTable'; +import { POST_TYPE_TO_DISPLAY_TEXT } from './constants'; + +const PostsContainer = styled.div``; + +export const PostsPaginationContainer = styled.div` + display: flex; + justify-content: center; + padding: 12px; + padding-left: 16px; + border-bottom: 1px solid; + border-color: ${(props) => props.theme.styles['border-color-base']}; + display: flex; + justify-content: space-between; + align-items: center; +`; + +const PaginationInfo = styled(Typography.Text)` + padding: 0px; +`; + +const DEFAULT_PAGE_SIZE = 10; + +export const PostList = () => { + const entityRegistry = useEntityRegistry(); + const location = useLocation(); + const params = QueryString.parse(location.search, { arrayFormat: 'comma' }); + const paramsQuery = (params?.query as string) || undefined; + const [query, setQuery] = useState(undefined); + useEffect(() => setQuery(paramsQuery), [paramsQuery]); + + const [page, setPage] = useState(1); + const [isCreatingPost, setIsCreatingPost] = useState(false); + + const pageSize = DEFAULT_PAGE_SIZE; + const start = (page - 1) * pageSize; + + const { loading, error, data, client, refetch } = useListPostsQuery({ + variables: { + input: { + start, + count: pageSize, + query, + }, + }, + fetchPolicy: query && query.length > 0 ? 'no-cache' : 'cache-first', + }); + + const totalPosts = data?.listPosts?.total || 0; + const lastResultIndex = start + pageSize > totalPosts ? totalPosts : start + pageSize; + const posts = data?.listPosts?.posts || []; + + const onChangePage = (newPage: number) => { + scrollToTop(); + setPage(newPage); + }; + + const handleDelete = (urn: string) => { + removeFromListPostCache(client, urn, page, pageSize); + setTimeout(() => { + refetch?.(); + }, 2000); + }; + + const allColumns = [ + { + title: 'Title', + dataIndex: '', + key: 'title', + sorter: (sourceA, sourceB) => { + return sourceA.title.localeCompare(sourceB.title); + }, + render: (record: PostEntry) => PostColumn(record.title, 200), + width: '20%', + }, + { + title: 'Description', + dataIndex: '', + key: 'description', + render: (record: PostEntry) => PostColumn(record.description || ''), + }, + { + title: 'Type', + dataIndex: '', + key: 'type', + render: (record: PostEntry) => PostColumn(POST_TYPE_TO_DISPLAY_TEXT[record.contentType]), + style: { minWidth: 100 }, + width: '10%', + }, + { + title: '', + dataIndex: '', + width: '5%', + align: 'right' as AlignType, + key: 'menu', + render: PostListMenuColumn(handleDelete), + }, + ]; + + const tableData = posts.map((post) => { + return { + urn: post.urn, + title: post.content.title, + description: post.content.description, + contentType: post.content.contentType, + }; + }); + + return ( + <> + {!data && loading && } + {error && } + + + + null} + onQueryChange={(q) => setQuery(q && q.length > 0 ? q : undefined)} + entityRegistry={entityRegistry} + hideRecommendations + /> + + }} + /> + {totalPosts > pageSize && ( + + + + {lastResultIndex > 0 ? 
(page - 1) * pageSize + 1 : 0} - {lastResultIndex} + {' '} + of {totalPosts} + + + + + )} + {isCreatingPost && ( + setIsCreatingPost(false)} + onCreate={(urn, title, description) => { + addToListPostCache( + client, + { + urn, + properties: { + title, + description: description || null, + }, + }, + pageSize, + ); + setTimeout(() => refetch(), 2000); + }} + /> + )} + + + ); +}; diff --git a/datahub-web-react/src/app/settings/posts/PostsListColumns.tsx b/datahub-web-react/src/app/settings/posts/PostsListColumns.tsx new file mode 100644 index 0000000000000..38f910baf8f41 --- /dev/null +++ b/datahub-web-react/src/app/settings/posts/PostsListColumns.tsx @@ -0,0 +1,26 @@ +import React from 'react'; +// import { Typography } from 'antd'; +import styled from 'styled-components/macro'; +import { Maybe } from 'graphql/jsutils/Maybe'; +import PostItemMenu from './PostItemMenu'; + +export interface PostEntry { + title: string; + contentType: string; + description: Maybe; + urn: string; +} + +const PostText = styled.div<{ minWidth?: number }>` + ${(props) => props.minWidth !== undefined && `min-width: ${props.minWidth}px;`} +`; + +export function PostListMenuColumn(handleDelete: (urn: string) => void) { + return (record: PostEntry) => ( + handleDelete(record.urn)} /> + ); +} + +export function PostColumn(text: string, minWidth?: number) { + return {text}; +} diff --git a/datahub-web-react/src/app/settings/posts/constants.ts b/datahub-web-react/src/app/settings/posts/constants.ts new file mode 100644 index 0000000000000..5a164019fe2e5 --- /dev/null +++ b/datahub-web-react/src/app/settings/posts/constants.ts @@ -0,0 +1,13 @@ +import { PostContentType } from '../../../types.generated'; + +export const TITLE_FIELD_NAME = 'title'; +export const DESCRIPTION_FIELD_NAME = 'description'; +export const LINK_FIELD_NAME = 'link'; +export const LOCATION_FIELD_NAME = 'location'; +export const TYPE_FIELD_NAME = 'type'; +export const CREATE_POST_BUTTON_ID = 'createPostButton'; + +export const POST_TYPE_TO_DISPLAY_TEXT = { + [PostContentType.Link]: 'Link', + [PostContentType.Text]: 'Announcement', +}; diff --git a/datahub-web-react/src/app/settings/posts/utils.ts b/datahub-web-react/src/app/settings/posts/utils.ts new file mode 100644 index 0000000000000..ce48c7400738c --- /dev/null +++ b/datahub-web-react/src/app/settings/posts/utils.ts @@ -0,0 +1,77 @@ +import { ListPostsDocument, ListPostsQuery } from '../../../graphql/post.generated'; + +/** + * Add an entry to the list posts cache. + */ +export const addToListPostCache = (client, newPost, pageSize) => { + // Read the data from our cache for this query. + const currData: ListPostsQuery | null = client.readQuery({ + query: ListPostsDocument, + variables: { + input: { + start: 0, + count: pageSize, + }, + }, + }); + + // Add our new post into the existing list. + const newPosts = [newPost, ...(currData?.listPosts?.posts || [])]; + + // Write our data back to the cache. + client.writeQuery({ + query: ListPostsDocument, + variables: { + input: { + start: 0, + count: pageSize, + }, + }, + data: { + listPosts: { + start: 0, + count: (currData?.listPosts?.count || 0) + 1, + total: (currData?.listPosts?.total || 0) + 1, + posts: newPosts, + }, + }, + }); +}; + +/** + * Remove an entry from the list posts cache. + */ +export const removeFromListPostCache = (client, urn, page, pageSize) => { + // Read the data from our cache for this query. 
+ const currData: ListPostsQuery | null = client.readQuery({ + query: ListPostsDocument, + variables: { + input: { + start: (page - 1) * pageSize, + count: pageSize, + }, + }, + }); + + // Remove the post from the existing posts set. + const newPosts = [...(currData?.listPosts?.posts || []).filter((post) => post.urn !== urn)]; + + // Write our data back to the cache. + client.writeQuery({ + query: ListPostsDocument, + variables: { + input: { + start: (page - 1) * pageSize, + count: pageSize, + }, + }, + data: { + listPosts: { + start: currData?.listPosts?.start || 0, + count: (currData?.listPosts?.count || 1) - 1, + total: (currData?.listPosts?.total || 1) - 1, + posts: newPosts, + }, + }, + }); +}; diff --git a/datahub-web-react/src/conf/Global.ts b/datahub-web-react/src/conf/Global.ts index b16dd1eaace57..e1220b8c81b53 100644 --- a/datahub-web-react/src/conf/Global.ts +++ b/datahub-web-react/src/conf/Global.ts @@ -28,6 +28,7 @@ export enum PageRoutes { SETTINGS_VIEWS = '/settings/views', EMBED = '/embed', EMBED_LOOKUP = '/embed/lookup/:url', + SETTINGS_POSTS = '/settings/posts', } /** diff --git a/datahub-web-react/src/graphql/me.graphql b/datahub-web-react/src/graphql/me.graphql index 2c693c747af56..af850c9c3ce28 100644 --- a/datahub-web-react/src/graphql/me.graphql +++ b/datahub-web-react/src/graphql/me.graphql @@ -46,6 +46,7 @@ query getMe { createTags manageGlobalViews manageOwnershipTypes + manageGlobalAnnouncements } } } diff --git a/datahub-web-react/src/graphql/post.graphql b/datahub-web-react/src/graphql/post.graphql index c19f38fc7751c..ee092ad4fba90 100644 --- a/datahub-web-react/src/graphql/post.graphql +++ b/datahub-web-react/src/graphql/post.graphql @@ -20,3 +20,11 @@ query listPosts($input: ListPostsInput!) { } } } + +mutation createPost($input: CreatePostInput!) { + createPost(input: $input) +} + +mutation deletePost($urn: String!) 
{ + deletePost(urn: $urn) +} diff --git a/metadata-service/war/src/main/resources/boot/policies.json b/metadata-service/war/src/main/resources/boot/policies.json index 3fddf3456ecd7..3cda0269b79f1 100644 --- a/metadata-service/war/src/main/resources/boot/policies.json +++ b/metadata-service/war/src/main/resources/boot/policies.json @@ -19,6 +19,7 @@ "GENERATE_PERSONAL_ACCESS_TOKENS", "MANAGE_ACCESS_TOKENS", "MANAGE_DOMAINS", + "MANAGE_GLOBAL_ANNOUNCEMENTS", "MANAGE_TESTS", "MANAGE_GLOSSARIES", "MANAGE_USER_CREDENTIALS", @@ -102,6 +103,7 @@ "VIEW_ANALYTICS", "GENERATE_PERSONAL_ACCESS_TOKENS", "MANAGE_DOMAINS", + "MANAGE_GLOBAL_ANNOUNCEMENTS", "MANAGE_TESTS", "MANAGE_GLOSSARIES", "MANAGE_TAGS", @@ -190,6 +192,7 @@ "GENERATE_PERSONAL_ACCESS_TOKENS", "MANAGE_ACCESS_TOKENS", "MANAGE_DOMAINS", + "MANAGE_GLOBAL_ANNOUNCEMENTS", "MANAGE_TESTS", "MANAGE_GLOSSARIES", "MANAGE_USER_CREDENTIALS", @@ -283,6 +286,7 @@ "privileges":[ "GENERATE_PERSONAL_ACCESS_TOKENS", "MANAGE_DOMAINS", + "MANAGE_GLOBAL_ANNOUNCEMENTS", "MANAGE_GLOSSARIES", "MANAGE_TAGS" ], diff --git a/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java b/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java index c46d02a6eadf0..d515c1747bee4 100644 --- a/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java +++ b/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java @@ -64,6 +64,11 @@ public class PoliciesConfig { "Manage Domains", "Create and remove Asset Domains."); + public static final Privilege MANAGE_GLOBAL_ANNOUNCEMENTS_PRIVILEGE = Privilege.of( + "MANAGE_GLOBAL_ANNOUNCEMENTS", + "Manage Home Page Posts", + "Create and delete home page posts"); + public static final Privilege MANAGE_TESTS_PRIVILEGE = Privilege.of( "MANAGE_TESTS", "Manage Tests", @@ -113,6 +118,7 @@ public class PoliciesConfig { MANAGE_USERS_AND_GROUPS_PRIVILEGE, VIEW_ANALYTICS_PRIVILEGE, MANAGE_DOMAINS_PRIVILEGE, + MANAGE_GLOBAL_ANNOUNCEMENTS_PRIVILEGE, MANAGE_INGESTION_PRIVILEGE, MANAGE_SECRETS_PRIVILEGE, GENERATE_PERSONAL_ACCESS_TOKENS_PRIVILEGE, From a97548ce46174b20e17f48653ce299a076bcf289 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 23 Aug 2023 22:05:53 -0700 Subject: [PATCH 07/20] fix(ingest/powerbi): add sqlglot python dep (#8704) --- metadata-ingestion/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 62cb4f1abb8cf..59cdcee79f052 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -388,7 +388,7 @@ def get_long_description(): "trino": sql_common | trino, "starburst-trino-usage": sql_common | usage_common | trino, "nifi": {"requests", "packaging", "requests-gssapi"}, - "powerbi": microsoft_common | {"lark[regex]==1.1.4", "sqlparse"}, + "powerbi": microsoft_common | {"lark[regex]==1.1.4", "sqlparse"} | sqlglot_lib, "powerbi-report-server": powerbi_report_server, "vertica": sql_common | {"vertica-sqlalchemy-dialect[vertica-python]==0.0.8"}, "unity-catalog": databricks | sqllineage_lib, From 090f8af8e4e02ff3f75c55ce29fd9ead76c5dfdc Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 23 Aug 2023 22:06:28 -0700 Subject: [PATCH 08/20] ci(ingest): make ingestion caching rules correct (#8685) --- metadata-ingestion/build.gradle | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/metadata-ingestion/build.gradle b/metadata-ingestion/build.gradle index f636cf25c67f7..199ccc59c21e0 100644 
--- a/metadata-ingestion/build.gradle +++ b/metadata-ingestion/build.gradle @@ -21,11 +21,13 @@ task checkPythonVersion(type: Exec) { } task environmentSetup(type: Exec, dependsOn: checkPythonVersion) { + def sentinel_file = "${venv_name}/.venv_environment_sentinel" inputs.file file('setup.py') - outputs.dir("${venv_name}") + outputs.file(sentinel_file) commandLine 'bash', '-c', "${python_executable} -m venv ${venv_name} && " + - "${venv_name}/bin/python -m pip install --upgrade pip wheel 'setuptools>=63.0.0'" + "${venv_name}/bin/python -m pip install --upgrade pip wheel 'setuptools>=63.0.0' && " + + "touch ${sentinel_file}" } task runPreFlightScript(type: Exec, dependsOn: environmentSetup) { @@ -39,7 +41,6 @@ task runPreFlightScript(type: Exec, dependsOn: environmentSetup) { task installPackageOnly(type: Exec, dependsOn: runPreFlightScript) { def sentinel_file = "${venv_name}/.build_install_package_only_sentinel" inputs.file file('setup.py') - outputs.dir("${venv_name}") outputs.file(sentinel_file) commandLine 'bash', '-x', '-c', "${venv_name}/bin/pip install -e . &&" + @@ -47,9 +48,12 @@ task installPackageOnly(type: Exec, dependsOn: runPreFlightScript) { } task installPackage(type: Exec, dependsOn: installPackageOnly) { + def sentinel_file = "${venv_name}/.build_install_package_sentinel" inputs.file file('setup.py') - outputs.dir("${venv_name}") - commandLine 'bash', '-x', '-c', "${venv_name}/bin/pip install -e . ${extra_pip_requirements}" + outputs.file(sentinel_file) + commandLine 'bash', '-x', '-c', + "${venv_name}/bin/pip install -e . ${extra_pip_requirements} && " + + "touch ${sentinel_file}" } task codegen(type: Exec, dependsOn: [environmentSetup, installPackage, ':metadata-events:mxe-schemas:build']) { @@ -63,7 +67,6 @@ task install(dependsOn: [installPackage, codegen]) task installDev(type: Exec, dependsOn: [install]) { def sentinel_file = "${venv_name}/.build_install_dev_sentinel" inputs.file file('setup.py') - outputs.dir("${venv_name}") outputs.file(sentinel_file) commandLine 'bash', '-c', "source ${venv_name}/bin/activate && set -x && " + @@ -75,7 +78,6 @@ task installDev(type: Exec, dependsOn: [install]) { task installAll(type: Exec, dependsOn: [install]) { def sentinel_file = "${venv_name}/.build_install_all_sentinel" inputs.file file('setup.py') - outputs.dir("${venv_name}") outputs.file(sentinel_file) commandLine 'bash', '-c', "source ${venv_name}/bin/activate && set -x && " + From 9472636d0850766ebb12353cedc75754fe1b20bb Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Thu, 24 Aug 2023 13:09:57 +0530 Subject: [PATCH 09/20] fix(cleanup): cleanup of 1 sub-module (#8678) --- metadata-integration/java/datahub-client/build.gradle | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/metadata-integration/java/datahub-client/build.gradle b/metadata-integration/java/datahub-client/build.gradle index 025273fc9263e..82273427974af 100644 --- a/metadata-integration/java/datahub-client/build.gradle +++ b/metadata-integration/java/datahub-client/build.gradle @@ -235,3 +235,7 @@ sourceSets.main.java.srcDir "${generateOpenApiPojos.outputDir}/src/main/java" sourceSets.main.resources.srcDir "${generateOpenApiPojos.outputDir}/src/main/resources" checkstyleMain.exclude '**/generated/**' + +clean { + project.delete("$projectDir/generated") +} \ No newline at end of file From aab5b6af330c7b1a6e86818cf16e088a7c409a3b Mon Sep 17 00:00:00 2001 From: RyanHolstien Date: Thu, 24 Aug 2023 02:43:58 -0500 Subject: [PATCH 10/20] fix(policies): fix concurrent modification exception (#8681) --- 
.../java/com/datahub/authorization/DataHubAuthorizer.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java index 690528059b555..f653ccf72cf54 100644 --- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java @@ -250,11 +250,11 @@ private void addPoliciesToCache(final Map> cache private void addPolicyToCache(final Map> cache, final DataHubPolicyInfo policy) { final List privileges = policy.getPrivileges(); for (String privilege : privileges) { - List existingPolicies = cache.getOrDefault(privilege, new ArrayList<>()); + List existingPolicies = cache.containsKey(privilege) ? new ArrayList<>(cache.get(privilege)) : new ArrayList<>(); existingPolicies.add(policy); cache.put(privilege, existingPolicies); } - List existingPolicies = cache.getOrDefault(ALL, new ArrayList<>()); + List existingPolicies = cache.containsKey(ALL) ? new ArrayList<>(cache.get(ALL)) : new ArrayList<>(); existingPolicies.add(policy); cache.put(ALL, existingPolicies); } From 22c35f1a231b39aa8c1e83974608ac0a47ad7170 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Thu, 24 Aug 2023 05:16:06 -0400 Subject: [PATCH 11/20] fix(ingest/bigquery): Add config option to create DataPlatformInstance, default off (#8659) --- docs/how/updating-datahub.md | 3 +++ .../ingestion/source/bigquery_v2/bigquery.py | 4 +++- .../source/bigquery_v2/bigquery_config.py | 7 +++++++ .../integration/bigquery_v2/test_bigquery.py | 1 + .../tests/unit/test_bigquery_source.py | 17 +++++++++++++++-- 5 files changed, 29 insertions(+), 3 deletions(-) diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index 2b6fd5571cc9e..7ba516c82cf1b 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -15,6 +15,9 @@ This file documents any backwards-incompatible changes in DataHub and assists pe - #8300: Clickhouse source now inherited from TwoTierSQLAlchemy. In old way we have platform_instance -> container -> co container db (None) -> container schema and now we have platform_instance -> container database. - #8300: Added `uri_opts` argument; now we can add any options for clickhouse client. +- #8659: BigQuery ingestion no longer creates DataPlatformInstance aspects by default. + This will only affect users that were depending on this aspect for custom functionality, + and can be enabled via the `include_data_platform_instance` config option. 
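A minimal sketch of the opt-in described in the note above, assuming the `BigQueryV2Config` module path shown in the surrounding diffs; it mirrors the unit tests added at the end of this patch:

    from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config

    # Default after this change: the DataPlatformInstance aspect is no longer emitted.
    default_config = BigQueryV2Config.parse_obj({})
    assert default_config.include_data_platform_instance is False

    # Opt back in explicitly if downstream tooling depends on the aspect
    # (accepting the browse-path redundancy called out in the config description).
    opt_in_config = BigQueryV2Config.parse_obj({"include_data_platform_instance": True})
    assert opt_in_config.include_data_platform_instance is True
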
## 0.10.5 diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 7725d63ce0e1e..1446812c29216 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -429,7 +429,9 @@ def get_dataplatform_instance_aspect( ) -> MetadataWorkUnit: aspect = DataPlatformInstanceClass( platform=make_data_platform_urn(self.platform), - instance=make_dataplatform_instance_urn(self.platform, project_id), + instance=make_dataplatform_instance_urn(self.platform, project_id) + if self.config.include_data_platform_instance + else None, ) return MetadataChangeProposalWrapper( entityUrn=dataset_urn, aspect=aspect diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index e5730ee87daf4..0f2082c5e53bf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -81,6 +81,13 @@ class BigQueryV2Config( description="Whether to populate BigQuery Console url to Datasets/Tables", ) + include_data_platform_instance: bool = Field( + default=False, + description="Whether to create a DataPlatformInstance aspect, equal to the BigQuery project id." + " If enabled, will cause redundancy in the browse path for BigQuery entities in the UI," + " because the project id is represented as the top-level container.", + ) + debug_include_full_payloads: bool = Field( default=False, description="Include full payload into events. It is only for debugging and internal use.", diff --git a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py index 3bda6c5cce84b..cc3ee1f6ceaa4 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py +++ b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py @@ -61,6 +61,7 @@ def test_bigquery_v2_ingest( "project_ids": ["project-id-1"], "include_usage_statistics": False, "include_table_lineage": False, + "include_data_platform_instance": True, } pipeline_config_dict: Dict[str, Any] = { diff --git a/metadata-ingestion/tests/unit/test_bigquery_source.py b/metadata-ingestion/tests/unit/test_bigquery_source.py index fc8ca166b105a..47418d9a989bb 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_source.py +++ b/metadata-ingestion/tests/unit/test_bigquery_source.py @@ -138,13 +138,12 @@ def test_get_dataplatform_instance_aspect_returns_project_id(): f"urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,{project_id})" ) - config = BigQueryV2Config.parse_obj({}) + config = BigQueryV2Config.parse_obj({"include_data_platform_instance": True}) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) data_platform_instance = source.get_dataplatform_instance_aspect( "urn:li:test", project_id ) - metadata = data_platform_instance.get_metadata()["metadata"] assert data_platform_instance is not None @@ -152,6 +151,20 @@ def test_get_dataplatform_instance_aspect_returns_project_id(): assert metadata.aspect.instance == expected_instance +def test_get_dataplatform_instance_default_no_instance(): + config = BigQueryV2Config.parse_obj({}) + source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) + + data_platform_instance = 
source.get_dataplatform_instance_aspect( + "urn:li:test", "project_id" + ) + metadata = data_platform_instance.get_metadata()["metadata"] + + assert data_platform_instance is not None + assert metadata.aspectName == "dataPlatformInstance" + assert metadata.aspect.instance is None + + @patch("google.cloud.bigquery.client.Client") def test_get_projects_with_single_project_id(client_mock): config = BigQueryV2Config.parse_obj({"project_id": "test-3"}) From bcef25acd3cde72f4b7875732006d6fc5add8fdb Mon Sep 17 00:00:00 2001 From: Alexander Date: Thu, 24 Aug 2023 05:17:04 -0400 Subject: [PATCH 12/20] feat(ingest/looker): Record observed lineage timestamps for Looker and LookML sources (#7735) --- .../ingestion/source/looker/looker_common.py | 8 +++ .../ingestion/source/looker/lookml_source.py | 9 +++- .../looker/golden_looker_mces.json | 8 +-- .../looker/golden_test_allow_ingest.json | 4 +- ...olden_test_external_project_view_mces.json | 4 +- .../golden_test_independent_look_ingest.json | 8 +-- .../looker/golden_test_ingest.json | 4 +- .../looker/golden_test_ingest_joins.json | 16 +++--- .../golden_test_ingest_unaliased_joins.json | 12 ++--- .../looker_mces_golden_deleted_stateful.json | 4 +- .../looker/looker_mces_usage_history.json | 4 +- .../integration/lookml/expected_output.json | 52 +++++++++---------- .../lookml/lookml_mces_api_bigquery.json | 52 +++++++++---------- .../lookml/lookml_mces_api_hive2.json | 52 +++++++++---------- .../lookml/lookml_mces_badsql_parser.json | 40 +++++++------- .../lookml_mces_golden_deleted_stateful.json | 8 +-- .../lookml/lookml_mces_offline.json | 52 +++++++++---------- .../lookml_mces_offline_deny_pattern.json | 20 +++---- ...lookml_mces_offline_platform_instance.json | 52 +++++++++---------- .../lookml_mces_with_external_urls.json | 52 +++++++++---------- .../lookml/lookml_reachable_views.json | 12 ++--- .../refinement_include_order_golden.json | 16 +++--- .../lookml/refinements_ingestion_golden.json | 52 +++++++++---------- 23 files changed, 278 insertions(+), 263 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index d568ddcb02afa..40b90d216348c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -34,6 +34,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalSourceReport, ) +from datahub.metadata.com.linkedin.pegasus2avro.common import AuditStamp from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( DatasetLineageTypeClass, FineGrainedLineageDownstreamType, @@ -76,6 +77,8 @@ from datahub.utilities.lossy_collections import LossyList, LossySet from datahub.utilities.url_util import remove_port_from_url +CORPUSER_DATAHUB = "urn:li:corpuser:datahub" + if TYPE_CHECKING: from datahub.ingestion.source.looker.lookml_source import ( LookerViewFileLoader, @@ -786,6 +789,7 @@ def _to_metadata_events( # noqa: C901 if self.upstream_views is not None: assert self.project_name is not None upstreams = [] + observed_lineage_ts = datetime.datetime.now(tz=datetime.timezone.utc) for view_ref in sorted(self.upstream_views): view_urn = LookerViewId( project_name=view_ref.project @@ -799,6 +803,10 @@ def _to_metadata_events( # noqa: C901 UpstreamClass( dataset=view_urn, type=DatasetLineageTypeClass.VIEW, + auditStamp=AuditStamp( + time=int(observed_lineage_ts.timestamp() * 1000), + 
actor=CORPUSER_DATAHUB, + ), ) ) view_name_to_urn_map[view_ref.include] = view_urn diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py index 362b4e5530638..1a32afa2b7fdd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py @@ -6,7 +6,7 @@ import re import tempfile from dataclasses import dataclass, field as dataclass_field, replace -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone from typing import ( Any, ClassVar, @@ -50,6 +50,7 @@ from datahub.ingestion.source.common.subtypes import DatasetSubTypes from datahub.ingestion.source.git.git_import import GitClone from datahub.ingestion.source.looker.looker_common import ( + CORPUSER_DATAHUB, LookerCommonConfig, LookerExplore, LookerUtil, @@ -83,6 +84,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent from datahub.metadata.schema_classes import ( + AuditStampClass, DatasetPropertiesClass, FineGrainedLineageClass, FineGrainedLineageUpstreamTypeClass, @@ -1615,11 +1617,16 @@ def _get_upstream_lineage( # Generate the upstream + fine grained lineage objects. upstreams = [] + observed_lineage_ts = datetime.now(tz=timezone.utc) fine_grained_lineages: List[FineGrainedLineageClass] = [] for upstream_dataset_urn in upstream_dataset_urns: upstream = UpstreamClass( dataset=upstream_dataset_urn, type=DatasetLineageTypeClass.VIEW, + auditStamp=AuditStampClass( + time=int(observed_lineage_ts.timestamp() * 1000), + actor=CORPUSER_DATAHUB, + ), ) upstreams.append(upstream) diff --git a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json index 6167c63e6c9b8..dee85b40bb7a8 100644 --- a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json @@ -262,8 +262,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" @@ -412,8 +412,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json index e66ec4bb89d8c..72db36e63daf7 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json @@ -206,8 +206,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json index 
11e0760decae3..e5508bdb06b9e 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json @@ -206,8 +206,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,datahub-demo.view.faa_flights,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json index ddfd102cb15b0..91e13debfa028 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json @@ -279,8 +279,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" @@ -429,8 +429,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json index 54624986216b8..e93079119e4f4 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json @@ -206,8 +206,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json index 6cab0db8c33cf..a9c8efa7cdb98 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json @@ -206,32 +206,32 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_joined_view,PROD)", "type": "VIEW" }, { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_joined_view_original_name,PROD)", "type": "VIEW" }, { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view_has_no_fields,PROD)", "type": "VIEW" }, { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" diff --git 
a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json index 9a088a7a8baef..edd15624a14cd 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json @@ -206,24 +206,24 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_joined_view_original_name,PROD)", "type": "VIEW" }, { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" }, { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view_has_no_fields,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json index f8e2565e492e1..aebc89b609a08 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json @@ -206,8 +206,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json index 32d4f7bc64ab4..34bded3cf691e 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json @@ -158,8 +158,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/expected_output.json b/metadata-ingestion/tests/integration/lookml/expected_output.json index cdf520cc23a30..b53d5857f1d66 100644 --- a/metadata-ingestion/tests/integration/lookml/expected_output.json +++ b/metadata-ingestion/tests/integration/lookml/expected_output.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..my_table,PROD)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" @@ -478,8 +478,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": 
"urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.include_able,PROD)", "type": "VIEW" @@ -585,8 +585,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.events,PROD)", "type": "VIEW" @@ -692,8 +692,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.events,PROD)", "type": "VIEW" @@ -844,8 +844,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -951,8 +951,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.include_able,PROD)", "type": "VIEW" @@ -1058,8 +1058,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..fragment_derived_view,PROD)", "type": "VIEW" @@ -1240,8 +1240,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..order,PROD)", "type": "VIEW" @@ -1347,8 +1347,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.ecommerce.ability,PROD)", "type": "VIEW" @@ -1533,8 +1533,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..owners,PROD)", "type": "VIEW" @@ -1732,8 +1732,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -1971,8 +1971,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.flightstats.accidents,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json index 73edecbe62205..238f4c2580cdf 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.my_table,PROD)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": 
"urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" @@ -478,8 +478,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.looker_schema.include_able,PROD)", "type": "VIEW" @@ -585,8 +585,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.looker_schema.events,PROD)", "type": "VIEW" @@ -692,8 +692,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.looker_schema.events,PROD)", "type": "VIEW" @@ -844,8 +844,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -951,8 +951,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.looker_schema.include_able,PROD)", "type": "VIEW" @@ -1058,8 +1058,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.fragment_derived_view,PROD)", "type": "VIEW" @@ -1240,8 +1240,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.order,PROD)", "type": "VIEW" @@ -1347,8 +1347,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.ecommerce.ability,PROD)", "type": "VIEW" @@ -1533,8 +1533,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.owners,PROD)", "type": "VIEW" @@ -1732,8 +1732,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -1971,8 +1971,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.flightstats.accidents,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json index 9aa6a952c40b4..45d5d839e9d21 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json +++ 
b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.my_table,PROD)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" @@ -478,8 +478,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,looker_schema.include_able,PROD)", "type": "VIEW" @@ -585,8 +585,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,looker_schema.events,PROD)", "type": "VIEW" @@ -692,8 +692,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,looker_schema.events,PROD)", "type": "VIEW" @@ -844,8 +844,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -951,8 +951,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,looker_schema.include_able,PROD)", "type": "VIEW" @@ -1058,8 +1058,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.fragment_derived_view,PROD)", "type": "VIEW" @@ -1240,8 +1240,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.order,PROD)", "type": "VIEW" @@ -1347,8 +1347,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,ecommerce.ability,PROD)", "type": "VIEW" @@ -1533,8 +1533,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.owners,PROD)", "type": "VIEW" @@ -1732,8 +1732,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -1971,8 +1971,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": 
"urn:li:dataset:(urn:li:dataPlatform:hive,flightstats.accidents,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json index 6ce6d809ae8f5..187cedaefb6b2 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json @@ -450,8 +450,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" @@ -557,8 +557,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -664,8 +664,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -816,8 +816,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -923,8 +923,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" @@ -1123,8 +1123,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD)", "type": "VIEW" @@ -1230,8 +1230,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.ecommerce.ability,PROD)", "type": "VIEW" @@ -1416,8 +1416,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.owners,PROD)", "type": "VIEW" @@ -1615,8 +1615,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -1854,8 +1854,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.flightstats.accidents,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_golden_deleted_stateful.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_golden_deleted_stateful.json index 1016d4e211458..a323118666940 100644 --- 
a/metadata-ingestion/tests/integration/lookml/lookml_mces_golden_deleted_stateful.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_golden_deleted_stateful.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..my_table,PROD)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..owners,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json index fc91c97a53003..c2c879e38f37b 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" @@ -478,8 +478,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" @@ -585,8 +585,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -692,8 +692,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -844,8 +844,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -951,8 +951,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" @@ -1058,8 +1058,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.fragment_derived_view,PROD)", "type": "VIEW" @@ -1240,8 +1240,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD)", "type": "VIEW" @@ -1347,8 +1347,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.ecommerce.ability,PROD)", "type": "VIEW" @@ -1533,8 +1533,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.owners,PROD)", "type": "VIEW" @@ -1732,8 +1732,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -1971,8 +1971,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.flightstats.accidents,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_deny_pattern.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_deny_pattern.json index 8635a570c0621..c1ac54b0fb588 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_deny_pattern.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_deny_pattern.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" @@ -128,8 +128,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -235,8 +235,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -387,8 +387,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -494,8 +494,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json index 19168aa323142..f602ca37b3160 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - 
"actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" @@ -478,8 +478,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.looker_schema.include_able,DEV)", "type": "VIEW" @@ -585,8 +585,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.looker_schema.events,DEV)", "type": "VIEW" @@ -692,8 +692,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.looker_schema.events,DEV)", "type": "VIEW" @@ -844,8 +844,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.autodetect_sql_name_based_on_view_name,DEV)", "type": "VIEW" @@ -951,8 +951,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.looker_schema.include_able,DEV)", "type": "VIEW" @@ -1058,8 +1058,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.fragment_derived_view,DEV)", "type": "VIEW" @@ -1240,8 +1240,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.order,DEV)", "type": "VIEW" @@ -1347,8 +1347,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.ecommerce.ability,DEV)", "type": "VIEW" @@ -1533,8 +1533,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.owners,DEV)", "type": "VIEW" @@ -1732,8 +1732,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -1971,8 +1971,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": 
"urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.flightstats.accidents,DEV)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json index d4ced76a7475d..104bd365669e3 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD)", "type": "VIEW" @@ -261,8 +261,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" @@ -480,8 +480,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" @@ -588,8 +588,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -696,8 +696,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -849,8 +849,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -957,8 +957,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" @@ -1065,8 +1065,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.fragment_derived_view,PROD)", "type": "VIEW" @@ -1248,8 +1248,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD)", "type": "VIEW" @@ -1356,8 +1356,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.ecommerce.ability,PROD)", "type": "VIEW" @@ -1543,8 +1543,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": 
"urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.owners,PROD)", "type": "VIEW" @@ -1743,8 +1743,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -1983,8 +1983,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.flightstats.accidents,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json b/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json index 2bae6452145df..37a6c94c6952e 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.owners,DEV)", "type": "VIEW" @@ -459,8 +459,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,rs_warehouse.default_db.default_schema.my_table,DEV)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/refinement_include_order_golden.json b/metadata-ingestion/tests/integration/lookml/refinement_include_order_golden.json index a5c316f365d4b..49831ee554ab1 100644 --- a/metadata-ingestion/tests/integration/lookml/refinement_include_order_golden.json +++ b/metadata-ingestion/tests/integration/lookml/refinement_include_order_golden.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.public.book,PROD)", "type": "VIEW" @@ -303,8 +303,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.public.book,PROD)", "type": "VIEW" @@ -410,8 +410,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.public.order,PROD)", "type": "VIEW" @@ -607,8 +607,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.public.issue_history,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json 
b/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json index de303d50e7acd..dc5e1aa9096f8 100644 --- a/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json +++ b/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..my_table,PROD)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" @@ -478,8 +478,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.include_able,PROD)", "type": "VIEW" @@ -585,8 +585,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.events,PROD)", "type": "VIEW" @@ -692,8 +692,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.events,PROD)", "type": "VIEW" @@ -844,8 +844,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -951,8 +951,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.include_able,PROD)", "type": "VIEW" @@ -1058,8 +1058,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..fragment_derived_view,PROD)", "type": "VIEW" @@ -1240,8 +1240,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..order,PROD)", "type": "VIEW" @@ -1347,8 +1347,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.ecommerce.ability,PROD)", "type": "VIEW" @@ -1533,8 +1533,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..owners,PROD)", "type": "VIEW" @@ -1764,8 +1764,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -2003,8 +2003,8 @@ "upstreams": [ { "auditStamp": { - 
"time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.flightstats.accidents,PROD)", "type": "VIEW" From 43d48ddde470f646ebd74fda9600596be4774267 Mon Sep 17 00:00:00 2001 From: RChygir <30907647+RChygir@users.noreply.github.com> Date: Thu, 24 Aug 2023 12:18:03 +0300 Subject: [PATCH 13/20] feat(ingest/mssql): load jobs and stored procedures (#5363) --- .../docs/sources/mssql/mssql_pre.md | 14 + .../src/datahub/ingestion/source/sql/mssql.py | 278 -------- .../ingestion/source/sql/mssql/__init__.py | 1 + .../ingestion/source/sql/mssql/job_models.py | 239 +++++++ .../ingestion/source/sql/mssql/source.py | 665 ++++++++++++++++++ .../ingestion/source/sql/sql_common.py | 38 +- .../golden_mces_mssql_no_db_to_file.json | 186 +++++ .../golden_mces_mssql_no_db_with_filter.json | 186 +++++ .../golden_mces_mssql_to_file.json | 186 +++++ ...golden_mces_mssql_with_lower_case_urn.json | 186 +++++ .../integration/sql_server/setup/setup.sql | 34 +- .../integration/sql_server/test_sql_server.py | 5 + 12 files changed, 1726 insertions(+), 292 deletions(-) create mode 100644 metadata-ingestion/docs/sources/mssql/mssql_pre.md delete mode 100644 metadata-ingestion/src/datahub/ingestion/source/sql/mssql.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/sql/mssql/__init__.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py diff --git a/metadata-ingestion/docs/sources/mssql/mssql_pre.md b/metadata-ingestion/docs/sources/mssql/mssql_pre.md new file mode 100644 index 0000000000000..396581966e691 --- /dev/null +++ b/metadata-ingestion/docs/sources/mssql/mssql_pre.md @@ -0,0 +1,14 @@ +### Prerequisites + +If you want to ingest MSSQL Jobs and stored procedures (with code) the user credentials needs the proper privileges. + +Script for granting the privileges: +``` +USE MSDB +GRANT SELECT ON OBJECT::msdb.dbo.sysjobsteps TO 'USERNAME' +GRANT SELECT ON OBJECT::msdb.dbo.sysjobs TO 'USERNAME' + +USE 'DATA_DB_NAME' +GRANT VIEW DEFINITION TO 'USERNAME' +GRANT SELECT ON OBJECT::sys.sql_expression_dependencies TO 'USERNAME' +``` \ No newline at end of file diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql.py deleted file mode 100644 index a9afd40fd45b6..0000000000000 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql.py +++ /dev/null @@ -1,278 +0,0 @@ -import logging -import urllib.parse -from typing import Any, Dict, Iterable, List, Optional, Tuple - -import pydantic -import sqlalchemy.dialects.mssql - -# This import verifies that the dependencies are available. 
-import sqlalchemy_pytds # noqa: F401 -from pydantic.fields import Field -from sqlalchemy import create_engine, inspect -from sqlalchemy.engine.base import Connection -from sqlalchemy.engine.reflection import Inspector - -from datahub.configuration.common import AllowDenyPattern -from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.api.decorators import ( - SourceCapability, - SupportStatus, - capability, - config_class, - platform_name, - support_status, -) -from datahub.ingestion.source.sql.sql_common import ( - SQLAlchemySource, - register_custom_type, -) -from datahub.ingestion.source.sql.sql_config import ( - BasicSQLAlchemyConfig, - make_sqlalchemy_uri, -) -from datahub.metadata.schema_classes import BooleanTypeClass, UnionTypeClass - -logger: logging.Logger = logging.getLogger(__name__) - -register_custom_type(sqlalchemy.dialects.mssql.BIT, BooleanTypeClass) -register_custom_type(sqlalchemy.dialects.mssql.SQL_VARIANT, UnionTypeClass) - - -class SQLServerConfig(BasicSQLAlchemyConfig): - # defaults - host_port: str = Field(default="localhost:1433", description="MSSQL host URL.") - scheme: str = Field(default="mssql+pytds", description="", hidden_from_docs=True) - use_odbc: bool = Field( - default=False, - description="See https://docs.sqlalchemy.org/en/14/dialects/mssql.html#module-sqlalchemy.dialects.mssql.pyodbc.", - ) - uri_args: Dict[str, str] = Field( - default={}, - description="Arguments to URL-encode when connecting. See https://docs.microsoft.com/en-us/sql/connect/odbc/dsn-connection-string-attribute?view=sql-server-ver15.", - ) - database_pattern: AllowDenyPattern = Field( - default=AllowDenyPattern.allow_all(), - description="Regex patterns for databases to filter in ingestion.", - ) - database: Optional[str] = Field( - default=None, - description="database (catalog). If set to Null, all databases will be considered for ingestion.", - ) - convert_urns_to_lowercase: bool = Field( - default=False, - description="Enable to convert the SQL Server assets urns to lowercase", - ) - - @pydantic.validator("uri_args") - def passwords_match(cls, v, values, **kwargs): - if values["use_odbc"] and "driver" not in v: - raise ValueError("uri_args must contain a 'driver' option") - elif not values["use_odbc"] and v: - raise ValueError("uri_args is not supported when ODBC is disabled") - return v - - def get_sql_alchemy_url( - self, - uri_opts: Optional[Dict[str, Any]] = None, - current_db: Optional[str] = None, - ) -> str: - if self.use_odbc: - # Ensure that the import is available. 
- import pyodbc # noqa: F401 - - self.scheme = "mssql+pyodbc" - - uri: str = self.sqlalchemy_uri or make_sqlalchemy_uri( - self.scheme, # type: ignore - self.username, - self.password.get_secret_value() if self.password else None, - self.host_port, # type: ignore - current_db if current_db else self.database, - uri_opts=uri_opts, - ) - if self.use_odbc: - uri = f"{uri}?{urllib.parse.urlencode(self.uri_args)}" - return uri - - -@platform_name("Microsoft SQL Server", id="mssql") -@config_class(SQLServerConfig) -@support_status(SupportStatus.CERTIFIED) -@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") -@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field") -@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") -@capability(SourceCapability.DESCRIPTIONS, "Enabled by default") -@capability( - SourceCapability.USAGE_STATS, - "Not provided by this module, use `bigquery-usage` for that.", - supported=False, -) -@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") -class SQLServerSource(SQLAlchemySource): - """ - This plugin extracts the following: - - - Metadata for databases, schemas, views and tables - - Column types associated with each table/view - - Table, row, and column statistics via optional SQL profiling - - We have two options for the underlying library used to connect to SQL Server: (1) [python-tds](https://github.com/denisenkom/pytds) and (2) [pyodbc](https://github.com/mkleehammer/pyodbc). The TDS library is pure Python and hence easier to install, but only PyODBC supports encrypted connections. - """ - - def __init__(self, config: SQLServerConfig, ctx: PipelineContext): - super().__init__(config, ctx, "mssql") - # Cache the table and column descriptions - self.config: SQLServerConfig = config - self.current_database = None - self.table_descriptions: Dict[str, str] = {} - self.column_descriptions: Dict[str, str] = {} - for inspector in self.get_inspectors(): - db_name: str = self.get_db_name(inspector) - with inspector.engine.connect() as conn: - if self.config.use_odbc: - self._add_output_converters(conn) - self._populate_table_descriptions(conn, db_name) - self._populate_column_descriptions(conn, db_name) - - @staticmethod - def _add_output_converters(conn: Connection) -> None: - def handle_sql_variant_as_string(value): - return value.decode("utf-16le") - - # see https://stackoverflow.com/questions/45677374/pandas-pyodbc-odbc-sql-type-150-is-not-yet-supported - # and https://stackoverflow.com/questions/11671170/adding-output-converter-to-pyodbc-connection-in-sqlalchemy - try: - conn.connection.add_output_converter(-150, handle_sql_variant_as_string) - except AttributeError as e: - logger.debug( - f"Failed to mount output converter for MSSQL data type -150 due to {e}" - ) - - def _populate_table_descriptions(self, conn: Connection, db_name: str) -> None: - # see https://stackoverflow.com/questions/5953330/how-do-i-map-the-id-in-sys-extended-properties-to-an-object-name - # also see https://www.mssqltips.com/sqlservertip/5384/working-with-sql-server-extended-properties/ - table_metadata = conn.execute( - """ - SELECT - SCHEMA_NAME(T.SCHEMA_ID) AS schema_name, - T.NAME AS table_name, - EP.VALUE AS table_description - FROM sys.tables AS T - INNER JOIN sys.extended_properties AS EP - ON EP.MAJOR_ID = T.[OBJECT_ID] - AND EP.MINOR_ID = 0 - AND EP.NAME = 'MS_Description' - AND EP.CLASS = 1 - """ - ) - for row in table_metadata: - self.table_descriptions[ - 
f"{db_name}.{row['schema_name']}.{row['table_name']}" - ] = row["table_description"] - - def _populate_column_descriptions(self, conn: Connection, db_name: str) -> None: - column_metadata = conn.execute( - """ - SELECT - SCHEMA_NAME(T.SCHEMA_ID) AS schema_name, - T.NAME AS table_name, - C.NAME AS column_name , - EP.VALUE AS column_description - FROM sys.tables AS T - INNER JOIN sys.all_columns AS C - ON C.OBJECT_ID = T.[OBJECT_ID] - INNER JOIN sys.extended_properties AS EP - ON EP.MAJOR_ID = T.[OBJECT_ID] - AND EP.MINOR_ID = C.COLUMN_ID - AND EP.NAME = 'MS_Description' - AND EP.CLASS = 1 - """ - ) - for row in column_metadata: - self.column_descriptions[ - f"{db_name}.{row['schema_name']}.{row['table_name']}.{row['column_name']}" - ] = row["column_description"] - - @classmethod - def create(cls, config_dict: Dict, ctx: PipelineContext) -> "SQLServerSource": - config = SQLServerConfig.parse_obj(config_dict) - return cls(config, ctx) - - # override to get table descriptions - def get_table_properties( - self, inspector: Inspector, schema: str, table: str - ) -> Tuple[Optional[str], Dict[str, str], Optional[str]]: - description, properties, location_urn = super().get_table_properties( - inspector, schema, table - ) - # Update description if available. - db_name: str = self.get_db_name(inspector) - description = self.table_descriptions.get( - f"{db_name}.{schema}.{table}", description - ) - return description, properties, location_urn - - # override to get column descriptions - def _get_columns( - self, dataset_name: str, inspector: Inspector, schema: str, table: str - ) -> List[Dict]: - columns: List[Dict] = super()._get_columns( - dataset_name, inspector, schema, table - ) - # Update column description if available. - db_name: str = self.get_db_name(inspector) - for column in columns: - description: Optional[str] = self.column_descriptions.get( - f"{db_name}.{schema}.{table}.{column['name']}", - ) - if description: - column["comment"] = description - return columns - - def get_inspectors(self) -> Iterable[Inspector]: - # This method can be overridden in the case that you want to dynamically - # run on multiple databases. 
- url = self.config.get_sql_alchemy_url() - logger.debug(f"sql_alchemy_url={url}") - engine = create_engine(url, **self.config.options) - with engine.connect() as conn: - if self.config.database and self.config.database != "": - inspector = inspect(conn) - yield inspector - else: - databases = conn.execute( - "SELECT name FROM master.sys.databases WHERE name NOT IN \ - ('master', 'model', 'msdb', 'tempdb', 'Resource', \ - 'distribution' , 'reportserver', 'reportservertempdb'); " - ) - for db in databases: - if self.config.database_pattern.allowed(db["name"]): - url = self.config.get_sql_alchemy_url(current_db=db["name"]) - with create_engine( - url, **self.config.options - ).connect() as conn: - inspector = inspect(conn) - self.current_database = db["name"] - yield inspector - - def get_identifier( - self, *, schema: str, entity: str, inspector: Inspector, **kwargs: Any - ) -> str: - regular = f"{schema}.{entity}" - - qualified_table_name = regular - - if self.config.database: - if self.config.database_alias: - qualified_table_name = f"{self.config.database_alias}.{regular}" - else: - qualified_table_name = f"{self.config.database}.{regular}" - - if self.current_database: - qualified_table_name = f"{self.current_database}.{regular}" - - return ( - qualified_table_name.lower() - if self.config.convert_urns_to_lowercase - else qualified_table_name - ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/__init__.py new file mode 100644 index 0000000000000..8db89505a9cf6 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/__init__.py @@ -0,0 +1 @@ +from datahub.ingestion.source.sql.mssql.source import SQLServerConfig, SQLServerSource diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py new file mode 100644 index 0000000000000..8aeb5421891aa --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py @@ -0,0 +1,239 @@ +from dataclasses import dataclass, field +from typing import Dict, List, Optional, Union + +from datahub.emitter.mce_builder import make_data_flow_urn, make_data_job_urn +from datahub.metadata.schema_classes import ( + DataFlowInfoClass, + DataJobInfoClass, + DataJobInputOutputClass, +) + + +@dataclass +class ProcedureDependency: + db: str + schema: str + name: str + type: str + env: str + server: str + source: str = "mssql" + + +@dataclass +class ProcedureLineageStream: + dependencies: List[ProcedureDependency] + + @property + def as_property(self) -> Dict[str, str]: + return { + f"{dep.db}.{dep.schema}.{dep.name}": dep.type for dep in self.dependencies + } + + +@dataclass +class MSSQLJob: + db: str + platform_instance: str + name: str + env: str + source: str = "mssql" + type: str = "JOB" + + @property + def formatted_name(self) -> str: + return f"{self.formatted_platform_instance}.{self.name.replace(',', '-')}" + + @property + def full_type(self) -> str: + return f"({self.source},{self.formatted_name},{self.env})" + + @property + def orchestrator(self) -> str: + return self.source + + @property + def formatted_platform_instance(self) -> str: + return self.platform_instance.replace(".", "/") + + @property + def cluster(self) -> str: + return f"{self.env}" + + +@dataclass +class MSSQLProceduresContainer: + db: str + platform_instance: str + name: str + env: str + source: str = "mssql" + type: str = "JOB" + + @property + def 
formatted_name(self) -> str: + return f"{self.formatted_platform_instance}.{self.name.replace(',', '-')}" + + @property + def orchestrator(self) -> str: + return self.source + + @property + def formatted_platform_instance(self) -> str: + return self.platform_instance.replace(".", "/") + + @property + def cluster(self) -> str: + return f"{self.env}" + + @property + def full_type(self) -> str: + return f"({self.source},{self.name},{self.env})" + + +@dataclass +class ProcedureParameter: + name: str + type: str + + @property + def properties(self) -> Dict[str, str]: + return {"type": self.type} + + +@dataclass +class StoredProcedure: + db: str + schema: str + name: str + flow: Union[MSSQLJob, MSSQLProceduresContainer] + type: str = "STORED_PROCEDURE" + source: str = "mssql" + + @property + def full_type(self) -> str: + return self.source.upper() + "_" + self.type + + @property + def formatted_name(self) -> str: + return self.name.replace(",", "-") + + @property + def full_name(self) -> str: + return f"{self.db}.{self.schema}.{self.formatted_name}" + + @property + def escape_full_name(self) -> str: + return f"[{self.db}].[{self.schema}].[{self.formatted_name}]" + + +@dataclass +class JobStep: + job_name: str + step_name: str + flow: MSSQLJob + type: str = "JOB_STEP" + source: str = "mssql" + + @property + def formatted_step(self) -> str: + return self.step_name.replace(",", "-").replace(" ", "_").lower() + + @property + def formatted_name(self) -> str: + return self.job_name.replace(",", "-") + + @property + def full_type(self) -> str: + return self.source.upper() + "_" + self.type + + @property + def full_name(self) -> str: + return f"{self.formatted_name}.{self.formatted_name}" + + +@dataclass +class MSSQLDataJob: + entity: Union[StoredProcedure, JobStep] + type: str = "dataJob" + source: str = "mssql" + external_url: str = "" + description: Optional[str] = None + status: Optional[str] = None + incoming: List[str] = field(default_factory=list) + outgoing: List[str] = field(default_factory=list) + input_jobs: List[str] = field(default_factory=list) + job_properties: Dict[str, str] = field(default_factory=dict) + + @property + def urn(self) -> str: + return make_data_job_urn( + orchestrator=self.entity.flow.orchestrator, + flow_id=self.entity.flow.formatted_name, + job_id=self.entity.formatted_name, + cluster=self.entity.flow.cluster, + ) + + def add_property( + self, + name: str, + value: str, + ) -> None: + self.job_properties[name] = value + + @property + def valued_properties(self) -> Dict[str, str]: + if self.job_properties: + return {k: v for k, v in self.job_properties.items() if v is not None} + return self.job_properties + + @property + def as_datajob_input_output_aspect(self) -> DataJobInputOutputClass: + return DataJobInputOutputClass( + inputDatasets=sorted(self.incoming), + outputDatasets=sorted(self.outgoing), + inputDatajobs=sorted(self.input_jobs), + ) + + @property + def as_datajob_info_aspect(self) -> DataJobInfoClass: + return DataJobInfoClass( + name=self.entity.full_name, + type=self.entity.full_type, + description=self.description, + customProperties=self.valued_properties, + externalUrl=self.external_url, + status=self.status, + ) + + +@dataclass +class MSSQLDataFlow: + entity: Union[MSSQLJob, MSSQLProceduresContainer] + type: str = "dataFlow" + source: str = "mssql" + external_url: str = "" + flow_properties: Dict[str, str] = field(default_factory=dict) + + def add_property( + self, + name: str, + value: str, + ) -> None: + self.flow_properties[name] = value + + 
@property + def urn(self) -> str: + return make_data_flow_urn( + orchestrator=self.entity.orchestrator, + flow_id=self.entity.formatted_name, + cluster=self.entity.cluster, + ) + + @property + def as_dataflow_info_aspect(self) -> DataFlowInfoClass: + return DataFlowInfoClass( + name=self.entity.formatted_name, + customProperties=self.flow_properties, + externalUrl=self.external_url, + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py new file mode 100644 index 0000000000000..3c7701d93edeb --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py @@ -0,0 +1,665 @@ +import logging +import re +import urllib.parse +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union + +import pydantic +import sqlalchemy.dialects.mssql + +# This import verifies that the dependencies are available. +import sqlalchemy_pytds # noqa: F401 +from pydantic.fields import Field +from sqlalchemy import create_engine, inspect +from sqlalchemy.engine.base import Connection +from sqlalchemy.engine.reflection import Inspector +from sqlalchemy.exc import ProgrammingError, ResourceClosedError + +from datahub.configuration.common import AllowDenyPattern +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.sql.mssql.job_models import ( + JobStep, + MSSQLDataFlow, + MSSQLDataJob, + MSSQLJob, + MSSQLProceduresContainer, + ProcedureDependency, + ProcedureLineageStream, + ProcedureParameter, + StoredProcedure, +) +from datahub.ingestion.source.sql.sql_common import ( + SQLAlchemySource, + SqlWorkUnit, + register_custom_type, +) +from datahub.ingestion.source.sql.sql_config import ( + BasicSQLAlchemyConfig, + make_sqlalchemy_uri, +) +from datahub.metadata.schema_classes import BooleanTypeClass, UnionTypeClass + +logger: logging.Logger = logging.getLogger(__name__) + +register_custom_type(sqlalchemy.dialects.mssql.BIT, BooleanTypeClass) +register_custom_type(sqlalchemy.dialects.mssql.SQL_VARIANT, UnionTypeClass) + + +class SQLServerConfig(BasicSQLAlchemyConfig): + # defaults + host_port: str = Field(default="localhost:1433", description="MSSQL host URL.") + scheme: str = Field(default="mssql+pytds", description="", hidden_from_docs=True) + include_stored_procedures: bool = Field( + default=True, + description="Include ingest of stored procedures. Requires access to the 'sys' schema.", + ) + include_stored_procedures_code: bool = Field( + default=True, description="Include information about object code." + ) + include_jobs: bool = Field( + default=True, + description="Include ingest of MSSQL Jobs. Requires access to the 'msdb' and 'sys' schema.", + ) + include_descriptions: bool = Field( + default=True, description="Include table descriptions information." + ) + use_odbc: bool = Field( + default=False, + description="See https://docs.sqlalchemy.org/en/14/dialects/mssql.html#module-sqlalchemy.dialects.mssql.pyodbc.", + ) + uri_args: Dict[str, str] = Field( + default={}, + description="Arguments to URL-encode when connecting. 
See https://docs.microsoft.com/en-us/sql/connect/odbc/dsn-connection-string-attribute?view=sql-server-ver15.", + ) + database_pattern: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="Regex patterns for databases to filter in ingestion.", + ) + database: Optional[str] = Field( + default=None, + description="database (catalog). If set to Null, all databases will be considered for ingestion.", + ) + convert_urns_to_lowercase: bool = Field( + default=False, + description="Enable to convert the SQL Server assets urns to lowercase", + ) + + @pydantic.validator("uri_args") + def passwords_match(cls, v, values, **kwargs): + if values["use_odbc"] and "driver" not in v: + raise ValueError("uri_args must contain a 'driver' option") + elif not values["use_odbc"] and v: + raise ValueError("uri_args is not supported when ODBC is disabled") + return v + + def get_sql_alchemy_url( + self, + uri_opts: Optional[Dict[str, Any]] = None, + current_db: Optional[str] = None, + ) -> str: + if self.use_odbc: + # Ensure that the import is available. + import pyodbc # noqa: F401 + + self.scheme = "mssql+pyodbc" + + uri: str = self.sqlalchemy_uri or make_sqlalchemy_uri( + self.scheme, # type: ignore + self.username, + self.password.get_secret_value() if self.password else None, + self.host_port, # type: ignore + current_db if current_db else self.database, + uri_opts=uri_opts, + ) + if self.use_odbc: + uri = f"{uri}?{urllib.parse.urlencode(self.uri_args)}" + return uri + + @property + def host(self): + return self.platform_instance or self.host_port.split(":")[0] + + @property + def db(self): + return self.database_alias or self.database + + +@platform_name("Microsoft SQL Server", id="mssql") +@config_class(SQLServerConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") +@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field") +@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") +@capability(SourceCapability.DESCRIPTIONS, "Enabled by default") +@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") +class SQLServerSource(SQLAlchemySource): + """ + This plugin extracts the following: + - Metadata for databases, schemas, views and tables + - Column types associated with each table/view + - Table, row, and column statistics via optional SQL profiling + We have two options for the underlying library used to connect to SQL Server: (1) [python-tds](https://github.com/denisenkom/pytds) and (2) [pyodbc](https://github.com/mkleehammer/pyodbc). The TDS library is pure Python and hence easier to install, but only PyODBC supports encrypted connections. 
+ """ + + def __init__(self, config: SQLServerConfig, ctx: PipelineContext): + super().__init__(config, ctx, "mssql") + # Cache the table and column descriptions + self.config: SQLServerConfig = config + self.current_database = None + self.table_descriptions: Dict[str, str] = {} + self.column_descriptions: Dict[str, str] = {} + if self.config.include_descriptions: + for inspector in self.get_inspectors(): + db_name: str = self.get_db_name(inspector) + with inspector.engine.connect() as conn: + if self.config.use_odbc: + self._add_output_converters(conn) + self._populate_table_descriptions(conn, db_name) + self._populate_column_descriptions(conn, db_name) + + @staticmethod + def _add_output_converters(conn: Connection) -> None: + def handle_sql_variant_as_string(value): + try: + return value.decode("utf-16le") + except UnicodeDecodeError: + return value.decode("Windows-1251") + + # see https://stackoverflow.com/questions/45677374/pandas-pyodbc-odbc-sql-type-150-is-not-yet-supported + # and https://stackoverflow.com/questions/11671170/adding-output-converter-to-pyodbc-connection-in-sqlalchemy + try: + conn.connection.add_output_converter(-150, handle_sql_variant_as_string) + except AttributeError as e: + logger.debug( + f"Failed to mount output converter for MSSQL data type -150 due to {e}" + ) + + def _populate_table_descriptions(self, conn: Connection, db_name: str) -> None: + # see https://stackoverflow.com/questions/5953330/how-do-i-map-the-id-in-sys-extended-properties-to-an-object-name + # also see https://www.mssqltips.com/sqlservertip/5384/working-with-sql-server-extended-properties/ + table_metadata = conn.execute( + """ + SELECT + SCHEMA_NAME(T.SCHEMA_ID) AS schema_name, + T.NAME AS table_name, + EP.VALUE AS table_description + FROM sys.tables AS T + INNER JOIN sys.extended_properties AS EP + ON EP.MAJOR_ID = T.[OBJECT_ID] + AND EP.MINOR_ID = 0 + AND EP.NAME = 'MS_Description' + AND EP.CLASS = 1 + """ + ) + for row in table_metadata: + self.table_descriptions[ + f"{db_name}.{row['schema_name']}.{row['table_name']}" + ] = row["table_description"] + + def _populate_column_descriptions(self, conn: Connection, db_name: str) -> None: + column_metadata = conn.execute( + """ + SELECT + SCHEMA_NAME(T.SCHEMA_ID) AS schema_name, + T.NAME AS table_name, + C.NAME AS column_name , + EP.VALUE AS column_description + FROM sys.tables AS T + INNER JOIN sys.all_columns AS C + ON C.OBJECT_ID = T.[OBJECT_ID] + INNER JOIN sys.extended_properties AS EP + ON EP.MAJOR_ID = T.[OBJECT_ID] + AND EP.MINOR_ID = C.COLUMN_ID + AND EP.NAME = 'MS_Description' + AND EP.CLASS = 1 + """ + ) + for row in column_metadata: + self.column_descriptions[ + f"{db_name}.{row['schema_name']}.{row['table_name']}.{row['column_name']}" + ] = row["column_description"] + + @classmethod + def create(cls, config_dict: Dict, ctx: PipelineContext) -> "SQLServerSource": + config = SQLServerConfig.parse_obj(config_dict) + return cls(config, ctx) + + # override to get table descriptions + def get_table_properties( + self, inspector: Inspector, schema: str, table: str + ) -> Tuple[Optional[str], Dict[str, str], Optional[str]]: + description, properties, location_urn = super().get_table_properties( + inspector, schema, table + ) + # Update description if available. 
+ db_name: str = self.get_db_name(inspector) + description = self.table_descriptions.get( + f"{db_name}.{schema}.{table}", description + ) + return description, properties, location_urn + + # override to get column descriptions + def _get_columns( + self, dataset_name: str, inspector: Inspector, schema: str, table: str + ) -> List[Dict]: + columns: List[Dict] = super()._get_columns( + dataset_name, inspector, schema, table + ) + # Update column description if available. + db_name: str = self.get_db_name(inspector) + for column in columns: + description: Optional[str] = self.column_descriptions.get( + f"{db_name}.{schema}.{table}.{column['name']}", + ) + if description: + column["comment"] = description + return columns + + def get_database_level_workunits( + self, + inspector: Inspector, + database: str, + ) -> Iterable[MetadataWorkUnit]: + yield from super().get_database_level_workunits( + inspector=inspector, + database=database, + ) + if self.config.include_jobs: + try: + yield from self.loop_jobs(inspector, self.config) + except Exception as e: + self.report.report_failure( + "jobs", + f"Failed to list jobs due to error {e}", + ) + + def get_schema_level_workunits( + self, + inspector: Inspector, + schema: str, + database: str, + ) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]: + yield from super().get_schema_level_workunits( + inspector=inspector, + schema=schema, + database=database, + ) + if self.config.include_stored_procedures: + try: + yield from self.loop_stored_procedures(inspector, schema, self.config) + except Exception as e: + self.report.report_failure( + "jobs", + f"Failed to list jobs due to error {e}", + ) + + def _get_jobs(self, conn: Connection, db_name: str) -> Dict[str, Dict[str, Any]]: + jobs_data = conn.execute( + f""" + SELECT + job.job_id, + job.name, + job.description, + job.date_created, + job.date_modified, + steps.step_id, + steps.step_name, + steps.subsystem, + steps.command, + steps.database_name + FROM + msdb.dbo.sysjobs job + INNER JOIN + msdb.dbo.sysjobsteps steps + ON + job.job_id = steps.job_id + where database_name = '{db_name}' + """ + ) + jobs: Dict[str, Dict[str, Any]] = {} + for row in jobs_data: + step_data = dict( + job_id=row["job_id"], + job_name=row["name"], + description=row["description"], + date_created=row["date_created"], + date_modified=row["date_modified"], + step_id=row["step_id"], + step_name=row["step_name"], + subsystem=row["subsystem"], + command=row["command"], + ) + if row["name"] in jobs: + jobs[row["name"]][row["step_id"]] = step_data + else: + jobs[row["name"]] = {row["step_id"]: step_data} + return jobs + + def loop_jobs( + self, + inspector: Inspector, + sql_config: SQLServerConfig, + ) -> Iterable[MetadataWorkUnit]: + """ + Loop MS SQL jobs as dataFlow-s. 
+ :return: + """ + db_name = self.get_db_name(inspector) + with inspector.engine.connect() as conn: + jobs = self._get_jobs(conn, db_name) + for job_name, job_steps in jobs.items(): + job = MSSQLJob( + name=job_name, + env=sql_config.env, + db=db_name, + platform_instance=sql_config.host, + ) + data_flow = MSSQLDataFlow(entity=job) + yield from self.construct_flow_workunits(data_flow=data_flow) + yield from self.loop_job_steps(job, job_steps) + + def loop_job_steps( + self, job: MSSQLJob, job_steps: Dict[str, Any] + ) -> Iterable[MetadataWorkUnit]: + for step_id, step_data in job_steps.items(): + step = JobStep( + job_name=job.formatted_name, + step_name=step_data["step_name"], + flow=job, + ) + data_job = MSSQLDataJob(entity=step) + for data_name, data_value in step_data.items(): + data_job.add_property(name=data_name, value=str(data_value)) + yield from self.construct_job_workunits(data_job) + + def loop_stored_procedures( # noqa: C901 + self, + inspector: Inspector, + schema: str, + sql_config: SQLServerConfig, + ) -> Iterable[MetadataWorkUnit]: + """ + Loop schema data for get stored procedures as dataJob-s. + """ + db_name = self.get_db_name(inspector) + procedure_flow_name = f"{db_name}.{schema}.stored_procedures" + mssql_default_job = MSSQLProceduresContainer( + name=procedure_flow_name, + env=sql_config.env, + db=db_name, + platform_instance=sql_config.host, + ) + data_flow = MSSQLDataFlow(entity=mssql_default_job) + with inspector.engine.connect() as conn: + procedures_data_list = self._get_stored_procedures(conn, db_name, schema) + procedures = [ + StoredProcedure(flow=mssql_default_job, **procedure_data) + for procedure_data in procedures_data_list + ] + if procedures: + yield from self.construct_flow_workunits(data_flow=data_flow) + for procedure in procedures: + upstream = self._get_procedure_upstream(conn, procedure) + downstream = self._get_procedure_downstream(conn, procedure) + data_job = MSSQLDataJob( + entity=procedure, + ) + # TODO: because of this upstream and downstream are more dependencies, + # can't be used as DataJobInputOutput. + # Should be reorganized into lineage. 
+ data_job.add_property("procedure_depends_on", str(upstream.as_property)) + data_job.add_property( + "depending_on_procedure", str(downstream.as_property) + ) + procedure_definition, procedure_code = self._get_procedure_code( + conn, procedure + ) + if procedure_definition: + data_job.add_property("definition", procedure_definition) + if sql_config.include_stored_procedures_code and procedure_code: + data_job.add_property("code", procedure_code) + procedure_inputs = self._get_procedure_inputs(conn, procedure) + properties = self._get_procedure_properties(conn, procedure) + data_job.add_property( + "input parameters", str([param.name for param in procedure_inputs]) + ) + for param in procedure_inputs: + data_job.add_property( + f"parameter {param.name}", str(param.properties) + ) + for property_name, property_value in properties.items(): + data_job.add_property(property_name, str(property_value)) + yield from self.construct_job_workunits(data_job) + + @staticmethod + def _get_procedure_downstream( + conn: Connection, procedure: StoredProcedure + ) -> ProcedureLineageStream: + downstream_data = conn.execute( + f""" + SELECT DISTINCT OBJECT_SCHEMA_NAME ( referencing_id ) AS [schema], + OBJECT_NAME(referencing_id) AS [name], + o.type_desc AS [type] + FROM sys.sql_expression_dependencies AS sed + INNER JOIN sys.objects AS o ON sed.referencing_id = o.object_id + left join sys.objects o1 on sed.referenced_id = o1.object_id + WHERE referenced_id = OBJECT_ID(N'{procedure.escape_full_name}') + AND o.type_desc in ('TABLE_TYPE', 'VIEW', 'USER_TABLE') + """ + ) + downstream_dependencies = [] + for row in downstream_data: + downstream_dependencies.append( + ProcedureDependency( + db=procedure.db, + schema=row["schema"], + name=row["name"], + type=row["type"], + env=procedure.flow.env, + server=procedure.flow.platform_instance, + ) + ) + return ProcedureLineageStream(dependencies=downstream_dependencies) + + @staticmethod + def _get_procedure_upstream( + conn: Connection, procedure: StoredProcedure + ) -> ProcedureLineageStream: + upstream_data = conn.execute( + f""" + SELECT DISTINCT + coalesce(lower(referenced_database_name), db_name()) AS db, + referenced_schema_name AS [schema], + referenced_entity_name AS [name], + o1.type_desc AS [type] + FROM sys.sql_expression_dependencies AS sed + INNER JOIN sys.objects AS o ON sed.referencing_id = o.object_id + left join sys.objects o1 on sed.referenced_id = o1.object_id + WHERE referencing_id = OBJECT_ID(N'{procedure.escape_full_name}') + AND referenced_schema_name is not null + AND o1.type_desc in ('TABLE_TYPE', 'VIEW', 'SQL_STORED_PROCEDURE', 'USER_TABLE') + """ + ) + upstream_dependencies = [] + for row in upstream_data: + upstream_dependencies.append( + ProcedureDependency( + db=row["db"], + schema=row["schema"], + name=row["name"], + type=row["type"], + env=procedure.flow.env, + server=procedure.flow.platform_instance, + ) + ) + return ProcedureLineageStream(dependencies=upstream_dependencies) + + @staticmethod + def _get_procedure_inputs( + conn: Connection, procedure: StoredProcedure + ) -> List[ProcedureParameter]: + inputs_data = conn.execute( + f""" + SELECT + name, + type_name(user_type_id) AS 'type' + FROM sys.parameters + WHERE object_id = object_id('{procedure.escape_full_name}') + """ + ) + inputs_list = [] + for row in inputs_data: + inputs_list.append(ProcedureParameter(name=row["name"], type=row["type"])) + return inputs_list + + @staticmethod + def _get_procedure_code( + conn: Connection, procedure: StoredProcedure + ) -> 
Tuple[Optional[str], Optional[str]]: + query = f"EXEC [{procedure.db}].dbo.sp_helptext '{procedure.full_name}'" + try: + code_data = conn.execute(query) + except ProgrammingError: + logger.warning( + "Denied permission for read text from procedure '%s'", + procedure.full_name, + ) + return None, None + code_list = [] + code_slice_index = 0 + code_slice_text = "create procedure" + try: + for index, row in enumerate(code_data): + code_list.append(row["Text"]) + if code_slice_text in re.sub(" +", " ", row["Text"].lower()).strip(): + code_slice_index = index + definition = "\n".join(code_list[:code_slice_index]) + code = "\n".join(code_list[code_slice_index:]) + except ResourceClosedError: + logger.warning( + "Connection was closed from procedure '%s'", + procedure.full_name, + ) + return None, None + return definition, code + + @staticmethod + def _get_procedure_properties( + conn: Connection, procedure: StoredProcedure + ) -> Dict[str, Any]: + properties_data = conn.execute( + f""" + SELECT + create_date as date_created, + modify_date as date_modified + FROM sys.procedures + WHERE object_id = object_id('{procedure.full_name}') + """ + ) + properties = {} + for row in properties_data: + properties = dict( + date_created=row["date_created"], date_modified=row["date_modified"] + ) + return properties + + @staticmethod + def _get_stored_procedures( + conn: Connection, db_name: str, schema: str + ) -> List[Dict[str, str]]: + stored_procedures_data = conn.execute( + f""" + SELECT + pr.name as procedure_name, + s.name as schema_name + FROM + [{db_name}].[sys].[procedures] pr + INNER JOIN + [{db_name}].[sys].[schemas] s ON pr.schema_id = s.schema_id + where s.name = '{schema}' + """ + ) + procedures_list = [] + for row in stored_procedures_data: + procedures_list.append( + dict(db=db_name, schema=row["schema_name"], name=row["procedure_name"]) + ) + return procedures_list + + def construct_job_workunits( + self, + data_job: MSSQLDataJob, + ) -> Iterable[MetadataWorkUnit]: + yield MetadataChangeProposalWrapper( + entityUrn=data_job.urn, + aspect=data_job.as_datajob_info_aspect, + ).as_workunit() + + yield MetadataChangeProposalWrapper( + entityUrn=data_job.urn, + aspect=data_job.as_datajob_input_output_aspect, + ).as_workunit() + # TODO: Add SubType when it appear + + def construct_flow_workunits( + self, + data_flow: MSSQLDataFlow, + ) -> Iterable[MetadataWorkUnit]: + yield MetadataChangeProposalWrapper( + entityUrn=data_flow.urn, + aspect=data_flow.as_dataflow_info_aspect, + ).as_workunit() + # TODO: Add SubType when it appear + + def get_inspectors(self) -> Iterable[Inspector]: + # This method can be overridden in the case that you want to dynamically + # run on multiple databases. 
+ url = self.config.get_sql_alchemy_url() + logger.debug(f"sql_alchemy_url={url}") + engine = create_engine(url, **self.config.options) + with engine.connect() as conn: + if self.config.database and self.config.database != "": + inspector = inspect(conn) + yield inspector + else: + databases = conn.execute( + "SELECT name FROM master.sys.databases WHERE name NOT IN \ + ('master', 'model', 'msdb', 'tempdb', 'Resource', \ + 'distribution' , 'reportserver', 'reportservertempdb'); " + ) + for db in databases: + if self.config.database_pattern.allowed(db["name"]): + url = self.config.get_sql_alchemy_url(current_db=db["name"]) + with create_engine( + url, **self.config.options + ).connect() as conn: + inspector = inspect(conn) + self.current_database = db["name"] + yield inspector + + def get_identifier( + self, *, schema: str, entity: str, inspector: Inspector, **kwargs: Any + ) -> str: + regular = f"{schema}.{entity}" + qualified_table_name = regular + if self.config.database: + if self.config.database_alias: + qualified_table_name = f"{self.config.database_alias}.{regular}" + else: + qualified_table_name = f"{self.config.database}.{regular}" + if self.current_database: + qualified_table_name = f"{self.current_database}.{regular}" + return ( + qualified_table_name.lower() + if self.config.convert_urns_to_lowercase + else qualified_table_name + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py index 280f4f47adcdf..b5458a42192fc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py @@ -478,6 +478,27 @@ def add_table_to_schema_container( parent_container_key=schema_container_key, ) + def get_database_level_workunits( + self, + inspector: Inspector, + database: str, + ) -> Iterable[MetadataWorkUnit]: + yield from self.gen_database_containers(database=database) + + def get_schema_level_workunits( + self, + inspector: Inspector, + schema: str, + database: str, + ) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]: + yield from self.gen_schema_containers(schema=schema, database=database) + + if self.config.include_tables: + yield from self.loop_tables(inspector, schema, self.config) + + if self.config.include_views: + yield from self.loop_views(inspector, schema, self.config) + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), @@ -516,27 +537,20 @@ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit ) db_name = self.get_db_name(inspector) - yield from self.gen_database_containers( + yield from self.get_database_level_workunits( + inspector=inspector, database=db_name, ) for schema in self.get_allowed_schemas(inspector, db_name): self.add_information_for_schema(inspector, schema) - yield from self.gen_schema_containers( - database=db_name, + yield from self.get_schema_level_workunits( + inspector=inspector, schema=schema, - extra_properties=self.get_schema_properties( - inspector=inspector, schema=schema, database=db_name - ), + database=db_name, ) - if sql_config.include_tables: - yield from self.loop_tables(inspector, schema, sql_config) - - if sql_config.include_views: - yield from self.loop_views(inspector, schema, sql_config) - if profiler: profile_requests += list( self.loop_profiler_requests(inspector, schema, sql_config) diff --git 
a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json index be4ae9e047aea..67a563baa561c 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json @@ -66,6 +66,70 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.Weekly Demo Data Backup" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "job_id": "1df94c0f-15fd-4b68-8ca3-6053a0332362", + "job_name": "Weekly Demo Data Backup", + "description": "No description available.", + "date_created": "2023-03-10 16:27:54.970000", + "date_modified": "2023-03-10 16:27:55.097000", + "step_id": "1", + "step_name": "Set database to read only", + "subsystem": "TSQL", + "command": "ALTER DATABASE DemoData SET READ_ONLY" + }, + "externalUrl": "", + "name": "localhost.Weekly Demo Data Backup.localhost.Weekly Demo Data Backup", + "type": { + "string": "MSSQL_JOB_STEP" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5", @@ -1740,6 +1804,68 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.demodata.Foo.stored_procedures" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "procedure_depends_on": "{}", + "depending_on_procedure": "{}", + "code": "CREATE PROCEDURE Foo.DBs @ID INT\nAS\n SELECT @ID AS ThatDB;\n", + "input parameters": "['@ID']", + "parameter @ID": "{'type': 'int'}", + "date_created": "2023-03-10 16:27:54.907000", + "date_modified": "2023-03-10 16:27:54.907000" + }, + "externalUrl": "", + "name": "demodata.Foo.DBs", + "type": { + "string": "MSSQL_STORED_PROCEDURE" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.SalesReason,PROD)", @@ -3985,6 +4111,66 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:c6627af82d44de89492e1a9315ae9f4b", diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json index bc81ce9633432..ef6033dd91943 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json @@ -66,6 +66,70 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.Weekly Demo Data Backup" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "job_id": "1df94c0f-15fd-4b68-8ca3-6053a0332362", + "job_name": "Weekly Demo Data Backup", + "description": "No description available.", + "date_created": "2023-03-10 16:27:54.970000", + "date_modified": "2023-03-10 16:27:55.097000", + "step_id": "1", + "step_name": "Set database to read only", + "subsystem": "TSQL", + "command": "ALTER DATABASE DemoData SET READ_ONLY" + }, + "externalUrl": "", + "name": "localhost.Weekly Demo Data Backup.localhost.Weekly 
Demo Data Backup", + "type": { + "string": "MSSQL_JOB_STEP" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5", @@ -1740,6 +1804,68 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.demodata.Foo.stored_procedures" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "procedure_depends_on": "{}", + "depending_on_procedure": "{}", + "code": "CREATE PROCEDURE Foo.DBs @ID INT\nAS\n SELECT @ID AS ThatDB;\n", + "input parameters": "['@ID']", + "parameter @ID": "{'type': 'int'}", + "date_created": "2023-03-10 16:27:54.907000", + "date_modified": "2023-03-10 16:27:54.907000" + }, + "externalUrl": "", + "name": "demodata.Foo.DBs", + "type": { + "string": "MSSQL_STORED_PROCEDURE" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.SalesReason,PROD)", @@ -2053,6 +2179,66 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + 
"changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:3f157d8292fb473142f19e2250af537f", diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json index 8be2fe134dca1..8098accebb424 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json @@ -66,6 +66,70 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.Weekly Demo Data Backup" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "job_id": "1df94c0f-15fd-4b68-8ca3-6053a0332362", + "job_name": "Weekly Demo Data Backup", + "description": "No description available.", + "date_created": "2023-03-10 16:27:54.970000", + "date_modified": "2023-03-10 16:27:55.097000", + "step_id": "1", + "step_name": "Set database to read only", + "subsystem": "TSQL", + "command": "ALTER DATABASE DemoData SET READ_ONLY" + }, + "externalUrl": "", + "name": "localhost.Weekly Demo Data Backup.localhost.Weekly Demo Data Backup", + "type": { + "string": "MSSQL_JOB_STEP" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5", @@ -1740,6 +1804,68 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.demodata.Foo.stored_procedures" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "procedure_depends_on": "{}", + "depending_on_procedure": "{}", + "code": "CREATE PROCEDURE Foo.DBs @ID INT\nAS\n SELECT @ID AS ThatDB;\n", + "input parameters": "['@ID']", + "parameter @ID": "{'type': 'int'}", + "date_created": "2023-03-10 16:27:54.907000", + "date_modified": "2023-03-10 
16:27:54.907000" + }, + "externalUrl": "", + "name": "demodata.Foo.DBs", + "type": { + "string": "MSSQL_STORED_PROCEDURE" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoDataAlias.Foo.SalesReason,PROD)", @@ -2053,6 +2179,66 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:3f157d8292fb473142f19e2250af537f", diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json index ba2ab7330fded..d32002fb5648c 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json @@ -81,6 +81,70 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.Weekly Demo Data Backup" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "job_id": "b6a0c1e2-f90a-4c86-a226-bf7ca59ad79f", + "job_name": "Weekly Demo Data Backup", + "description": "No description available.", + "date_created": "2023-08-06 
21:01:05.157000", + "date_modified": "2023-08-06 21:01:05.283000", + "step_id": "1", + "step_name": "Set database to read only", + "subsystem": "TSQL", + "command": "ALTER DATABASE DemoData SET READ_ONLY" + }, + "externalUrl": "", + "name": "localhost.Weekly Demo Data Backup.localhost.Weekly Demo Data Backup", + "type": { + "string": "MSSQL_JOB_STEP" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:f1b4c0e379c4b2e2e09a8ecd6c1b6dec", @@ -1764,6 +1828,68 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.demodata.Foo.stored_procedures" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "procedure_depends_on": "{}", + "depending_on_procedure": "{}", + "code": "CREATE PROCEDURE Foo.DBs @ID INT\nAS\n SELECT @ID AS ThatDB;\n", + "input parameters": "['@ID']", + "parameter @ID": "{'type': 'int'}", + "date_created": "2023-08-06 21:01:05.093000", + "date_modified": "2023-08-06 21:01:05.093000" + }, + "externalUrl": "", + "name": "demodata.Foo.DBs", + "type": { + "string": "MSSQL_STORED_PROCEDURE" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:a6bea84fba7b05fb5d12630c8e6306ac", @@ -2072,5 +2198,65 @@ "lastObserved": 1615443388097, "runId": "mssql-test" } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + 
"aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/sql_server/setup/setup.sql b/metadata-ingestion/tests/integration/sql_server/setup/setup.sql index 612de3eb1583c..2ff46e249007a 100644 --- a/metadata-ingestion/tests/integration/sql_server/setup/setup.sql +++ b/metadata-ingestion/tests/integration/sql_server/setup/setup.sql @@ -44,6 +44,10 @@ CREATE TABLE Foo.SalesReason ) ; GO +CREATE PROCEDURE Foo.DBs @ID INT +AS + SELECT @ID AS ThatDB; +GO GO EXEC sys.sp_addextendedproperty @@ -59,5 +63,31 @@ EXEC sys.sp_addextendedproperty @value = N'Description for column LastName of table Persons of schema Foo.', @level0type = N'SCHEMA', @level0name = 'Foo', @level1type = N'TABLE', @level1name = 'Persons', -@level2type = N'COLUMN',@level2name = 'LastName'; -GO \ No newline at end of file +@level2type = N'COLUMN',@level2name = 'LastName'; +GO +USE msdb ; +GO +EXEC dbo.sp_add_job + @job_name = N'Weekly Demo Data Backup' ; +GO +EXEC sp_add_jobstep + @job_name = N'Weekly Demo Data Backup', + @step_name = N'Set database to read only', + @database_name = N'DemoData', + @subsystem = N'TSQL', + @command = N'ALTER DATABASE DemoData SET READ_ONLY', + @retry_attempts = 5, + @retry_interval = 5 ; +GO +EXEC dbo.sp_add_schedule + @schedule_name = N'RunOnce', + @freq_type = 1, + @active_start_time = 233000 ; +GO +EXEC sp_attach_schedule + @job_name = N'Weekly Demo Data Backup', + @schedule_name = N'RunOnce'; +GO +EXEC dbo.sp_add_jobserver + @job_name = N'Weekly Demo Data Backup' +GO diff --git a/metadata-ingestion/tests/integration/sql_server/test_sql_server.py b/metadata-ingestion/tests/integration/sql_server/test_sql_server.py index 3e7b75edd4878..099690fed34c2 100644 --- a/metadata-ingestion/tests/integration/sql_server/test_sql_server.py +++ b/metadata-ingestion/tests/integration/sql_server/test_sql_server.py @@ -50,4 +50,9 @@ def test_mssql_ingest(mssql_runner, pytestconfig, tmp_path, mock_time, config_fi output_path=tmp_path / "mssql_mces.json", golden_path=test_resources_dir / f"golden_files/golden_mces_{config_file.replace('yml','json')}", + ignore_paths=[ + r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['job_id'\]", + r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['date_created'\]", + r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['date_modified'\]", + ], ) From d6a935e3ca0ee64117aa80cf10f04f7e8cabcbc2 Mon Sep 17 00:00:00 2001 From: Adriano Vega Llobell Date: Thu, 24 Aug 2023 11:19:04 +0200 Subject: [PATCH 14/20] fix(ingestion/kafka-connect): update retrieval of database name in Debezium SQL Server (#8608) --- .../datahub/ingestion/source/kafka_connect.py | 68 +++++++++++++------ 1 file changed, 46 insertions(+), 22 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py index c8a4c7a6ab8fa..b3fa5e3401c07 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py @@ 
-626,12 +626,17 @@ def _extract_lineages(self): @dataclass class DebeziumSourceConnector: connector_manifest: ConnectorManifest + report: KafkaConnectSourceReport def __init__( - self, connector_manifest: ConnectorManifest, config: KafkaConnectSourceConfig + self, + connector_manifest: ConnectorManifest, + config: KafkaConnectSourceConfig, + report: KafkaConnectSourceReport, ) -> None: self.connector_manifest = connector_manifest self.config = config + self.report = report self._extract_lineages() @dataclass @@ -683,10 +688,19 @@ def get_parser( database_name=connector_manifest.config.get("database.dbname"), ) elif connector_class == "io.debezium.connector.sqlserver.SqlServerConnector": + database_name = connector_manifest.config.get( + "database.names" + ) or connector_manifest.config.get("database.dbname") + + if "," in str(database_name): + raise Exception( + f"Only one database is supported for Debezium's SQL Server connector. Found: {database_name}" + ) + parser = self.DebeziumParser( source_platform="mssql", server_name=self.get_server_name(connector_manifest), - database_name=connector_manifest.config.get("database.dbname"), + database_name=database_name, ) elif connector_class == "io.debezium.connector.db2.Db2Connector": parser = self.DebeziumParser( @@ -707,29 +721,37 @@ def get_parser( def _extract_lineages(self): lineages: List[KafkaConnectLineage] = list() - parser = self.get_parser(self.connector_manifest) - source_platform = parser.source_platform - server_name = parser.server_name - database_name = parser.database_name - topic_naming_pattern = r"({0})\.(\w+\.\w+)".format(server_name) - if not self.connector_manifest.topic_names: - return lineages + try: + parser = self.get_parser(self.connector_manifest) + source_platform = parser.source_platform + server_name = parser.server_name + database_name = parser.database_name + topic_naming_pattern = r"({0})\.(\w+\.\w+)".format(server_name) - for topic in self.connector_manifest.topic_names: - found = re.search(re.compile(topic_naming_pattern), topic) + if not self.connector_manifest.topic_names: + return lineages - if found: - table_name = get_dataset_name(database_name, found.group(2)) + for topic in self.connector_manifest.topic_names: + found = re.search(re.compile(topic_naming_pattern), topic) - lineage = KafkaConnectLineage( - source_dataset=table_name, - source_platform=source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - lineages.append(lineage) - self.connector_manifest.lineages = lineages + if found: + table_name = get_dataset_name(database_name, found.group(2)) + + lineage = KafkaConnectLineage( + source_dataset=table_name, + source_platform=source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + lineages.append(lineage) + self.connector_manifest.lineages = lineages + except Exception as e: + self.report.report_warning( + self.connector_manifest.name, f"Error resolving lineage: {e}" + ) + + return @dataclass @@ -1061,7 +1083,9 @@ def get_connectors_manifest(self) -> List[ConnectorManifest]: "io.debezium.connector" ): connector_manifest = DebeziumSourceConnector( - connector_manifest=connector_manifest, config=self.config + connector_manifest=connector_manifest, + config=self.config, + report=self.report, ).connector_manifest elif ( connector_manifest.config.get(CONNECTOR_CLASS, "") From e285da3e752f8cd7f7aa7243cc1c42499b8f3901 Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Thu, 24 Aug 2023 19:53:07 +0530 Subject: [PATCH 15/20] 
feat(ingest/snowflake): tables from snowflake shares as siblings (#8531) --- .../docs/sources/snowflake/snowflake_pre.md | 18 + .../source/snowflake/snowflake_config.py | 112 +++++- .../source/snowflake/snowflake_schema.py | 1 + .../source/snowflake/snowflake_shares.py | 158 ++++++++ .../source/snowflake/snowflake_v2.py | 107 +++--- .../tests/unit/test_snowflake_shares.py | 348 ++++++++++++++++++ 6 files changed, 696 insertions(+), 48 deletions(-) create mode 100644 metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py create mode 100644 metadata-ingestion/tests/unit/test_snowflake_shares.py diff --git a/metadata-ingestion/docs/sources/snowflake/snowflake_pre.md b/metadata-ingestion/docs/sources/snowflake/snowflake_pre.md index 9a381fb351aec..75bd579417a48 100644 --- a/metadata-ingestion/docs/sources/snowflake/snowflake_pre.md +++ b/metadata-ingestion/docs/sources/snowflake/snowflake_pre.md @@ -99,6 +99,24 @@ The steps slightly differ based on which you decide to use. including `client_id` and `client_secret`, plus your Okta user's `Username` and `Password` * Note: the `username` and `password` config options are not nested under `oauth_config` +### Snowflake Shares +If you are using [Snowflake Shares](https://docs.snowflake.com/en/user-guide/data-sharing-provider) to share data across different snowflake accounts, and you have set up DataHub recipes for ingesting metadata from all these accounts, you may end up with multiple similar dataset entities corresponding to virtual versions of the same table in different snowflake accounts. The DataHub Snowflake connector can automatically link such tables together through Siblings and Lineage relationships if the user provides the information necessary to establish the relationship using the `shares` configuration in the recipe. + +#### Example +- Snowflake account `account1` (ingested as platform_instance `instance1`) owns a database `db1`. A share `X` is created in `account1` that includes database `db1` along with schemas and tables inside it. +- Now, `X` is shared with snowflake account `account2` (ingested as platform_instance `instance2`). A database `db1_from_X` is created from inbound share `X` in `account2`. In this case, all tables and views included in share `X` will also be present in `instance2`.`db1_from_X`. +- This can be represented in the `shares` configuration section as + ```yaml + shares: + X: # name of the share + database_name: db1 + platform_instance: instance1 + consumers: # list of all databases created from share X + - database_name: db1_from_X + platform_instance: instance2 + + ``` +- If share `X` is shared with more snowflake accounts and a database is created from share `X` in those accounts, then additional entries need to be added to the `consumers` list for share `X`, one per snowflake account. The same `shares` config can then be copied across recipes of all accounts. ### Caveats - Some of the features are only available in the Snowflake Enterprise Edition. This doc has notes mentioning where this applies.
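For readers wiring this up end to end, the following is a minimal sketch of how the `shares` section from the example above might sit inside a full Snowflake recipe. It is illustrative only: the account, database, and instance names are taken from the example, the field names follow the documentation snippet verbatim, and the recipe-level `platform_instance` is included because the connector's config validation requires it whenever `shares` is configured.

```yaml
# Hypothetical recipe for the consumer account (account2 / instance2) from the example above.
source:
  type: snowflake
  config:
    account_id: account2          # illustrative account identifier
    platform_instance: instance2  # required when `shares` is configured
    shares:
      X:                          # name of the share, as in the documentation example
        database_name: db1
        platform_instance: instance1
        consumers:
          - database_name: db1_from_X
            platform_instance: instance2
```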
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 7699d89ce9ac2..a7d946e99d806 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -1,10 +1,12 @@ import logging +from collections import defaultdict +from dataclasses import dataclass from enum import Enum -from typing import Dict, List, Optional, cast +from typing import Dict, List, Optional, Set, cast from pydantic import Field, SecretStr, root_validator, validator -from datahub.configuration.common import AllowDenyPattern +from datahub.configuration.common import AllowDenyPattern, ConfigModel from datahub.configuration.pattern_utils import UUID_REGEX from datahub.configuration.validate_field_removal import pydantic_removed_field from datahub.configuration.validate_field_rename import pydantic_renamed_field @@ -42,6 +44,31 @@ class TagOption(str, Enum): skip = "skip" +@dataclass(frozen=True) +class DatabaseId: + database: str = Field( + description="Database created from share in consumer account." + ) + platform_instance: str = Field( + description="Platform instance of consumer snowflake account." + ) + + +class SnowflakeShareConfig(ConfigModel): + database: str = Field(description="Database from which share is created.") + platform_instance: str = Field( + description="Platform instance for snowflake account in which share is created." + ) + + consumers: Set[DatabaseId] = Field( + description="List of databases created in consumer accounts." + ) + + @property + def source_database(self) -> DatabaseId: + return DatabaseId(self.database, self.platform_instance) + + class SnowflakeV2Config( SnowflakeConfig, SnowflakeUsageConfig, @@ -115,6 +142,13 @@ class SnowflakeV2Config( "upstreams_deny_pattern", "temporary_tables_pattern" ) + shares: Optional[Dict[str, SnowflakeShareConfig]] = Field( + default=None, + description="Required if current account owns or consumes snowflake share." + " If specified, connector creates lineage and siblings relationship between current account's database tables and consumer/producer account's database tables." + " Map of share name -> details of share.", + ) + email_as_user_identifier: bool = Field( default=True, description="Format user urns as an email, if the snowflake user's email is set. If `email_domain` is provided, generates email addresses for snowflake users with unset emails, based on their username.", @@ -192,3 +226,77 @@ def get_sql_alchemy_url( @property def parse_view_ddl(self) -> bool: return self.include_view_column_lineage + + @validator("shares") + def validate_shares( + cls, shares: Optional[Dict[str, SnowflakeShareConfig]], values: Dict + ) -> Optional[Dict[str, SnowflakeShareConfig]]: + current_platform_instance = values.get("platform_instance") + + if shares: + # Check: platform_instance should be present + assert current_platform_instance is not None, ( + "Did you forget to set `platform_instance` for current ingestion ? " + "It is required to use `platform_instance` when ingesting from multiple snowflake accounts." 
+ ) + + databases_included_in_share: List[DatabaseId] = [] + databases_created_from_share: List[DatabaseId] = [] + + for share_details in shares.values(): + shared_db = DatabaseId( + share_details.database, share_details.platform_instance + ) + assert all( + consumer.platform_instance != share_details.platform_instance + for consumer in share_details.consumers + ), "Share's platform_instance can not be same as consumer's platform instance. Self-sharing not supported in Snowflake." + + databases_included_in_share.append(shared_db) + databases_created_from_share.extend(share_details.consumers) + + for db_from_share in databases_created_from_share: + assert ( + db_from_share not in databases_included_in_share + ), "Database included in a share can not be present as consumer in any share." + assert ( + databases_created_from_share.count(db_from_share) == 1 + ), "Same database can not be present as consumer in more than one share." + + return shares + + def outbounds(self) -> Dict[str, Set[DatabaseId]]: + """ + Returns mapping of + database included in current account's outbound share -> all databases created from this share in other accounts + """ + outbounds: Dict[str, Set[DatabaseId]] = defaultdict(set) + if self.shares: + for share_name, share_details in self.shares.items(): + if share_details.platform_instance == self.platform_instance: + logger.debug( + f"database {share_details.database} is included in outbound share(s) {share_name}." + ) + outbounds[share_details.database].update(share_details.consumers) + return outbounds + + def inbounds(self) -> Dict[str, DatabaseId]: + """ + Returns mapping of + database created from an current account's inbound share -> other-account database from which this share was created + """ + inbounds: Dict[str, DatabaseId] = {} + if self.shares: + for share_name, share_details in self.shares.items(): + for consumer in share_details.consumers: + if consumer.platform_instance == self.platform_instance: + logger.debug( + f"database {consumer.database} is created from inbound share {share_name}." 
+ ) + inbounds[consumer.database] = share_details.source_database + break + else: + logger.info( + f"Skipping Share {share_name}, as it does not include current platform instance {self.platform_instance}", + ) + return inbounds diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py index dab46645bffcc..e5b214ba35e4b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py @@ -261,6 +261,7 @@ def get_tables_for_database( for table in cur: if table["TABLE_SCHEMA"] not in tables: tables[table["TABLE_SCHEMA"]] = [] + tables[table["TABLE_SCHEMA"]].append( SnowflakeTable( name=table["TABLE_NAME"], diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py new file mode 100644 index 0000000000000..6f7520bbf1988 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py @@ -0,0 +1,158 @@ +import logging +from typing import Callable, Iterable, List + +from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.snowflake.snowflake_config import ( + DatabaseId, + SnowflakeV2Config, +) +from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report +from datahub.ingestion.source.snowflake.snowflake_schema import SnowflakeDatabase +from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin +from datahub.metadata.com.linkedin.pegasus2avro.common import Siblings +from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( + DatasetLineageType, + Upstream, + UpstreamLineage, +) + +logger: logging.Logger = logging.getLogger(__name__) + + +class SnowflakeSharesHandler(SnowflakeCommonMixin): + def __init__( + self, + config: SnowflakeV2Config, + report: SnowflakeV2Report, + dataset_urn_builder: Callable[[str], str], + ) -> None: + self.config = config + self.report = report + self.logger = logger + self.dataset_urn_builder = dataset_urn_builder + + def get_shares_workunits( + self, databases: List[SnowflakeDatabase] + ) -> Iterable[MetadataWorkUnit]: + inbounds = self.config.inbounds() + outbounds = self.config.outbounds() + # None of the databases are shared + if not (inbounds or outbounds): + return + + logger.debug("Checking databases for inbound or outbound shares.") + for db in databases: + is_inbound = db.name in inbounds + is_outbound = db.name in outbounds + + if not (is_inbound or is_outbound): + logger.debug(f"database {db.name} is not shared.") + continue + + sibling_dbs = ( + list(outbounds[db.name]) if is_outbound else [inbounds[db.name]] + ) + + for schema in db.schemas: + for table_name in schema.tables + schema.views: + # TODO: If this is outbound database, + # 1. attempt listing shares using `show shares` to identify name of share associated with this database (cache query result). + # 2. if corresponding share is listed, then run `show grants to share ` to identify exact tables, views included in share. + # 3. emit siblings only for the objects listed above. + # This will work only if the configured role has accountadmin role access OR is owner of share. 
+ # Otherwise ghost nodes may be shown in "Composed Of" section for tables/views in original database which are not granted to share. + yield from self.gen_siblings( + db.name, + schema.name, + table_name, + is_outbound, + sibling_dbs, + ) + + if is_inbound: + assert len(sibling_dbs) == 1 + # SnowflakeLineageExtractor is unaware of database->schema->table hierarchy + # hence this lineage code is not written in SnowflakeLineageExtractor + # also this is not governed by configs include_table_lineage and include_view_lineage + yield self.get_upstream_lineage_with_primary_sibling( + db.name, schema.name, table_name, sibling_dbs[0] + ) + + self.report_missing_databases( + databases, list(inbounds.keys()), list(outbounds.keys()) + ) + + def report_missing_databases( + self, + databases: List[SnowflakeDatabase], + inbounds: List[str], + outbounds: List[str], + ) -> None: + db_names = [db.name for db in databases] + missing_dbs = [db for db in inbounds + outbounds if db not in db_names] + + if missing_dbs: + self.report_warning( + "snowflake-shares", + f"Databases {missing_dbs} were not ingested. Siblings/Lineage will not be set for these.", + ) + + def gen_siblings( + self, + database_name: str, + schema_name: str, + table_name: str, + primary: bool, + sibling_databases: List[DatabaseId], + ) -> Iterable[MetadataWorkUnit]: + if not sibling_databases: + return + dataset_identifier = self.get_dataset_identifier( + table_name, schema_name, database_name + ) + urn = self.dataset_urn_builder(dataset_identifier) + + sibling_urns = [ + make_dataset_urn_with_platform_instance( + self.platform, + self.get_dataset_identifier( + table_name, schema_name, sibling_db.database + ), + sibling_db.platform_instance, + ) + for sibling_db in sibling_databases + ] + + yield MetadataChangeProposalWrapper( + entityUrn=urn, + aspect=Siblings(primary=primary, siblings=sorted(sibling_urns)), + ).as_workunit() + + def get_upstream_lineage_with_primary_sibling( + self, + database_name: str, + schema_name: str, + table_name: str, + primary_sibling_db: DatabaseId, + ) -> MetadataWorkUnit: + dataset_identifier = self.get_dataset_identifier( + table_name, schema_name, database_name + ) + urn = self.dataset_urn_builder(dataset_identifier) + + upstream_urn = make_dataset_urn_with_platform_instance( + self.platform, + self.get_dataset_identifier( + table_name, schema_name, primary_sibling_db.database + ), + primary_sibling_db.platform_instance, + ) + + return MetadataChangeProposalWrapper( + entityUrn=urn, + aspect=UpstreamLineage( + upstreams=[Upstream(dataset=upstream_urn, type=DatasetLineageType.COPY)] + ), + ).as_workunit() diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 40c4d32525a51..2cb4b37fdd696 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -68,6 +68,7 @@ SnowflakeTag, SnowflakeView, ) +from datahub.ingestion.source.snowflake.snowflake_shares import SnowflakeSharesHandler from datahub.ingestion.source.snowflake.snowflake_tag import SnowflakeTagExtractor from datahub.ingestion.source.snowflake.snowflake_usage_v2 import ( SnowflakeUsageExtractor, @@ -491,9 +492,16 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: return self.data_dictionary.set_connection(self.connection) - databases = self.get_databases() + databases: List[SnowflakeDatabase] = [] - if databases is 
None or len(databases) == 0: + for database in self.get_databases() or []: + self.report.report_entity_scanned(database.name, "database") + if not self.config.database_pattern.allowed(database.name): + self.report.report_dropped(f"{database.name}.*") + else: + databases.append(database) + + if len(databases) == 0: return for snowflake_db in databases: @@ -520,25 +528,22 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: # TODO: The checkpoint state for stale entity detection can be committed here. + if self.config.shares: + yield from SnowflakeSharesHandler( + self.config, self.report, self.gen_dataset_urn + ).get_shares_workunits(databases) + discovered_tables: List[str] = [ self.get_dataset_identifier(table_name, schema.name, db.name) for db in databases for schema in db.schemas for table_name in schema.tables - if self._is_dataset_pattern_allowed( - self.get_dataset_identifier(table_name, schema.name, db.name), - SnowflakeObjectDomain.TABLE, - ) ] discovered_views: List[str] = [ self.get_dataset_identifier(table_name, schema.name, db.name) for db in databases for schema in db.schemas for table_name in schema.views - if self._is_dataset_pattern_allowed( - self.get_dataset_identifier(table_name, schema.name, db.name), - SnowflakeObjectDomain.VIEW, - ) ] if len(discovered_tables) == 0 and len(discovered_views) == 0: @@ -642,11 +647,6 @@ def get_databases_from_ischema(self, databases): def _process_database( self, snowflake_db: SnowflakeDatabase ) -> Iterable[MetadataWorkUnit]: - self.report.report_entity_scanned(snowflake_db.name, "database") - if not self.config.database_pattern.allowed(snowflake_db.name): - self.report.report_dropped(f"{snowflake_db.name}.*") - return - db_name = snowflake_db.name try: @@ -692,11 +692,22 @@ def _process_database( if self.config.is_profiling_enabled() and self.db_tables: yield from self.profiler.get_workunits(snowflake_db, self.db_tables) - def fetch_schemas_for_database(self, snowflake_db, db_name): + def fetch_schemas_for_database( + self, snowflake_db: SnowflakeDatabase, db_name: str + ) -> None: + schemas: List[SnowflakeSchema] = [] try: - snowflake_db.schemas = self.data_dictionary.get_schemas_for_database( - db_name - ) + for schema in self.data_dictionary.get_schemas_for_database(db_name): + self.report.report_entity_scanned(schema.name, "schema") + if not is_schema_allowed( + self.config.schema_pattern, + schema.name, + db_name, + self.config.match_fully_qualified_names, + ): + self.report.report_dropped(f"{db_name}.{schema.name}.*") + else: + schemas.append(schema) except Exception as e: if isinstance(e, SnowflakePermissionError): error_msg = f"Failed to get schemas for database {db_name}. Please check permissions." @@ -712,25 +723,17 @@ def fetch_schemas_for_database(self, snowflake_db, db_name): db_name, ) - if not snowflake_db.schemas: + if not schemas: self.report_warning( "No schemas found in database. 
If schemas exist, please grant USAGE permissions on them.", db_name, ) + else: + snowflake_db.schemas = schemas def _process_schema( self, snowflake_schema: SnowflakeSchema, db_name: str ) -> Iterable[MetadataWorkUnit]: - self.report.report_entity_scanned(snowflake_schema.name, "schema") - if not is_schema_allowed( - self.config.schema_pattern, - snowflake_schema.name, - db_name, - self.config.match_fully_qualified_names, - ): - self.report.report_dropped(f"{db_name}.{snowflake_schema.name}.*") - return - schema_name = snowflake_schema.name if self.config.extract_tags != TagOption.skip: @@ -772,9 +775,20 @@ def _process_schema( f"{db_name}.{schema_name}", ) - def fetch_views_for_schema(self, snowflake_schema, db_name, schema_name): + def fetch_views_for_schema( + self, snowflake_schema: SnowflakeSchema, db_name: str, schema_name: str + ) -> List[SnowflakeView]: try: - views = self.get_views_for_schema(schema_name, db_name) + views: List[SnowflakeView] = [] + for view in self.get_views_for_schema(schema_name, db_name): + view_name = self.get_dataset_identifier(view.name, schema_name, db_name) + + self.report.report_entity_scanned(view_name, "view") + + if not self.config.view_pattern.allowed(view_name): + self.report.report_dropped(view_name) + else: + views.append(view) snowflake_schema.views = [view.name for view in views] return views except Exception as e: @@ -792,10 +806,22 @@ def fetch_views_for_schema(self, snowflake_schema, db_name, schema_name): "Failed to get views for schema", f"{db_name}.{schema_name}", ) + return [] - def fetch_tables_for_schema(self, snowflake_schema, db_name, schema_name): + def fetch_tables_for_schema( + self, snowflake_schema: SnowflakeSchema, db_name: str, schema_name: str + ) -> List[SnowflakeTable]: try: - tables = self.get_tables_for_schema(schema_name, db_name) + tables: List[SnowflakeTable] = [] + for table in self.get_tables_for_schema(schema_name, db_name): + table_identifier = self.get_dataset_identifier( + table.name, schema_name, db_name + ) + self.report.report_entity_scanned(table_identifier) + if not self.config.table_pattern.allowed(table_identifier): + self.report.report_dropped(table_identifier) + else: + tables.append(table) snowflake_schema.tables = [table.name for table in tables] return tables except Exception as e: @@ -812,6 +838,7 @@ def fetch_tables_for_schema(self, snowflake_schema, db_name, schema_name): "Failed to get tables for schema", f"{db_name}.{schema_name}", ) + return [] def _process_table( self, @@ -821,12 +848,6 @@ def _process_table( ) -> Iterable[MetadataWorkUnit]: table_identifier = self.get_dataset_identifier(table.name, schema_name, db_name) - self.report.report_entity_scanned(table_identifier) - - if not self.config.table_pattern.allowed(table_identifier): - self.report.report_dropped(table_identifier) - return - self.fetch_columns_for_table(table, schema_name, db_name, table_identifier) self.fetch_pk_for_table(table, schema_name, db_name, table_identifier) @@ -938,12 +959,6 @@ def _process_view( ) -> Iterable[MetadataWorkUnit]: view_name = self.get_dataset_identifier(view.name, schema_name, db_name) - self.report.report_entity_scanned(view_name, "view") - - if not self.config.view_pattern.allowed(view_name): - self.report.report_dropped(view_name) - return - try: view.columns = self.get_columns_for_table(view.name, schema_name, db_name) if self.config.extract_tags != TagOption.skip: diff --git a/metadata-ingestion/tests/unit/test_snowflake_shares.py b/metadata-ingestion/tests/unit/test_snowflake_shares.py new file 
mode 100644 index 0000000000000..7de86139baf39 --- /dev/null +++ b/metadata-ingestion/tests/unit/test_snowflake_shares.py @@ -0,0 +1,348 @@ +from typing import List + +import pytest + +from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.source.snowflake.snowflake_config import ( + DatabaseId, + SnowflakeShareConfig, + SnowflakeV2Config, +) +from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report +from datahub.ingestion.source.snowflake.snowflake_schema import ( + SnowflakeDatabase, + SnowflakeSchema, +) +from datahub.ingestion.source.snowflake.snowflake_shares import SnowflakeSharesHandler +from datahub.metadata.com.linkedin.pegasus2avro.common import Siblings +from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage +from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeProposal + + +@pytest.fixture(scope="module") +def snowflake_databases() -> List[SnowflakeDatabase]: + return [ + SnowflakeDatabase( + name="db1", + created=None, + comment=None, + last_altered=None, + schemas=[ + SnowflakeSchema( + name="schema11", + created=None, + comment=None, + last_altered=None, + tables=["table111", "table112"], + views=["view111"], + ), + SnowflakeSchema( + name="schema12", + created=None, + comment=None, + last_altered=None, + tables=["table121", "table122"], + views=["view121"], + ), + ], + ), + SnowflakeDatabase( + name="db2", + created=None, + comment=None, + last_altered=None, + schemas=[ + SnowflakeSchema( + name="schema21", + created=None, + comment=None, + last_altered=None, + tables=["table211", "table212"], + views=["view211"], + ), + SnowflakeSchema( + name="schema22", + created=None, + comment=None, + last_altered=None, + tables=["table221", "table222"], + views=["view221"], + ), + ], + ), + SnowflakeDatabase( + name="db3", + created=None, + comment=None, + last_altered=None, + schemas=[ + SnowflakeSchema( + name="schema31", + created=None, + comment=None, + last_altered=None, + tables=["table311", "table312"], + views=["view311"], + ) + ], + ), + ] + + +def make_snowflake_urn(table_name, instance_name=None): + return make_dataset_urn_with_platform_instance( + "snowflake", table_name, instance_name + ) + + +def test_snowflake_shares_workunit_no_shares( + snowflake_databases: List[SnowflakeDatabase], +) -> None: + config = SnowflakeV2Config(account_id="abc12345", platform_instance="instance1") + + report = SnowflakeV2Report() + shares_handler = SnowflakeSharesHandler( + config, report, lambda x: make_snowflake_urn(x) + ) + + wus = list(shares_handler.get_shares_workunits(snowflake_databases)) + + assert len(wus) == 0 + + +def test_same_database_inbound_and_outbound_invalid_config() -> None: + with pytest.raises( + ValueError, + match="Same database can not be present as consumer in more than one share", + ): + SnowflakeV2Config( + account_id="abc12345", + platform_instance="instance1", + shares={ + "share1": SnowflakeShareConfig( + database="db1", + platform_instance="instance2", + consumers=[ + DatabaseId(database="db1", platform_instance="instance1") + ], + ), + "share2": SnowflakeShareConfig( + database="db1", + platform_instance="instance3", + consumers=[ + DatabaseId(database="db1", platform_instance="instance1") + ], + ), + }, + ) + + with pytest.raises( + ValueError, + match="Database included in a share can not be present as consumer in any share", + ): + SnowflakeV2Config( + account_id="abc12345", + 
platform_instance="instance1", + shares={ + "share1": SnowflakeShareConfig( + database="db1", + platform_instance="instance2", + consumers=[ + DatabaseId(database="db1", platform_instance="instance1") + ], + ), + "share2": SnowflakeShareConfig( + database="db1", + platform_instance="instance1", + consumers=[ + DatabaseId(database="db1", platform_instance="instance3") + ], + ), + }, + ) + + with pytest.raises( + ValueError, + match="Database included in a share can not be present as consumer in any share", + ): + SnowflakeV2Config( + account_id="abc12345", + platform_instance="instance1", + shares={ + "share2": SnowflakeShareConfig( + database="db1", + platform_instance="instance1", + consumers=[ + DatabaseId(database="db1", platform_instance="instance3") + ], + ), + "share1": SnowflakeShareConfig( + database="db1", + platform_instance="instance2", + consumers=[ + DatabaseId(database="db1", platform_instance="instance1") + ], + ), + }, + ) + + +def test_snowflake_shares_workunit_inbound_share( + snowflake_databases: List[SnowflakeDatabase], +) -> None: + config = SnowflakeV2Config( + account_id="abc12345", + platform_instance="instance1", + shares={ + "share1": SnowflakeShareConfig( + database="db1", + platform_instance="instance2", + consumers=[DatabaseId(database="db1", platform_instance="instance1")], + ) + }, + ) + + report = SnowflakeV2Report() + shares_handler = SnowflakeSharesHandler( + config, report, lambda x: make_snowflake_urn(x, "instance1") + ) + + wus = list(shares_handler.get_shares_workunits(snowflake_databases)) + + # 2 schemas - 2 tables and 1 view in each schema making total 6 datasets + # Hence 6 Sibling and 6 upstreamLineage aspects + assert len(wus) == 12 + upstream_lineage_aspect_entity_urns = set() + sibling_aspect_entity_urns = set() + + for wu in wus: + assert isinstance( + wu.metadata, (MetadataChangeProposal, MetadataChangeProposalWrapper) + ) + if wu.metadata.aspectName == "upstreamLineage": + upstream_aspect = wu.get_aspect_of_type(UpstreamLineage) + assert upstream_aspect is not None + assert len(upstream_aspect.upstreams) == 1 + assert upstream_aspect.upstreams[0].dataset == wu.get_urn().replace( + "instance1.db1", "instance2.db1" + ) + upstream_lineage_aspect_entity_urns.add(wu.get_urn()) + else: + siblings_aspect = wu.get_aspect_of_type(Siblings) + assert siblings_aspect is not None + assert len(siblings_aspect.siblings) == 1 + assert siblings_aspect.siblings == [ + wu.get_urn().replace("instance1.db1", "instance2.db1") + ] + sibling_aspect_entity_urns.add(wu.get_urn()) + + assert upstream_lineage_aspect_entity_urns == sibling_aspect_entity_urns + + +def test_snowflake_shares_workunit_outbound_share( + snowflake_databases: List[SnowflakeDatabase], +) -> None: + config = SnowflakeV2Config( + account_id="abc12345", + platform_instance="instance1", + shares={ + "share2": SnowflakeShareConfig( + database="db2", + platform_instance="instance1", + consumers=[ + DatabaseId( + database="db2_from_share", platform_instance="instance2" + ), + DatabaseId(database="db2", platform_instance="instance3"), + ], + ) + }, + ) + + report = SnowflakeV2Report() + shares_handler = SnowflakeSharesHandler( + config, report, lambda x: make_snowflake_urn(x, "instance1") + ) + + wus = list(shares_handler.get_shares_workunits(snowflake_databases)) + + # 2 schemas - 2 tables and 1 view in each schema making total 6 datasets + # Hence 6 Sibling aspects + assert len(wus) == 6 + entity_urns = set() + + for wu in wus: + siblings_aspect = wu.get_aspect_of_type(Siblings) + assert 
siblings_aspect is not None + assert len(siblings_aspect.siblings) == 2 + assert siblings_aspect.siblings == [ + wu.get_urn().replace("instance1.db2", "instance2.db2_from_share"), + wu.get_urn().replace("instance1.db2", "instance3.db2"), + ] + entity_urns.add(wu.get_urn()) + + assert len((entity_urns)) == 6 + + +def test_snowflake_shares_workunit_inbound_and_outbound_share( + snowflake_databases: List[SnowflakeDatabase], +) -> None: + config = SnowflakeV2Config( + account_id="abc12345", + platform_instance="instance1", + shares={ + "share1": SnowflakeShareConfig( + database="db1", + platform_instance="instance2", + consumers=[DatabaseId(database="db1", platform_instance="instance1")], + ), + "share2": SnowflakeShareConfig( + database="db2", + platform_instance="instance1", + consumers=[ + DatabaseId( + database="db2_from_share", platform_instance="instance2" + ), + DatabaseId(database="db2", platform_instance="instance3"), + ], + ), + }, + ) + + report = SnowflakeV2Report() + shares_handler = SnowflakeSharesHandler( + config, report, lambda x: make_snowflake_urn(x, "instance1") + ) + + wus = list(shares_handler.get_shares_workunits(snowflake_databases)) + + # 6 Sibling and 6 upstreamLineage aspects for db1 tables + # 6 Sibling aspects for db2 tables + assert len(wus) == 18 + + for wu in wus: + assert isinstance( + wu.metadata, (MetadataChangeProposal, MetadataChangeProposalWrapper) + ) + if wu.metadata.aspectName == "upstreamLineage": + upstream_aspect = wu.get_aspect_of_type(UpstreamLineage) + assert upstream_aspect is not None + assert len(upstream_aspect.upstreams) == 1 + assert upstream_aspect.upstreams[0].dataset == wu.get_urn().replace( + "instance1.db1", "instance2.db1" + ) + else: + siblings_aspect = wu.get_aspect_of_type(Siblings) + assert siblings_aspect is not None + if "db1" in wu.get_urn(): + assert len(siblings_aspect.siblings) == 1 + assert siblings_aspect.siblings == [ + wu.get_urn().replace("instance1.db1", "instance2.db1") + ] + else: + assert len(siblings_aspect.siblings) == 2 + assert siblings_aspect.siblings == [ + wu.get_urn().replace("instance1.db2", "instance2.db2_from_share"), + wu.get_urn().replace("instance1.db2", "instance3.db2"), + ] From 6659ff26ef151adfe6d25e1c23db5cab7d42f8f9 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Thu, 24 Aug 2023 10:35:46 -0400 Subject: [PATCH 16/20] feat(ingest/sql-queries): Add sql queries source, SqlParsingBuilder, sqlglot_lineage performance optimizations (#8494) Co-authored-by: Harshal Sheth Co-authored-by: Tamas Nemeth --- metadata-ingestion/setup.py | 2 + .../datahub/emitter/sql_parsing_builder.py | 289 ++++++++++++++++++ .../src/datahub/ingestion/graph/client.py | 58 +++- .../datahub/ingestion/source/sql_queries.py | 223 ++++++++++++++ .../src/datahub/utilities/sqlglot_lineage.py | 35 ++- 5 files changed, 595 insertions(+), 12 deletions(-) create mode 100644 metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/sql_queries.py diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 59cdcee79f052..ded9186e08a22 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -376,6 +376,7 @@ def get_long_description(): "salesforce": {"simple-salesforce"}, "snowflake": snowflake_common | usage_common | sqlglot_lib, "sqlalchemy": sql_common, + "sql-queries": usage_common | sqlglot_lib, "superset": { "requests", "sqlalchemy", @@ -608,6 +609,7 @@ def get_long_description(): "demo-data = 
datahub.ingestion.source.demo_data.DemoDataSource", "unity-catalog = datahub.ingestion.source.unity.source:UnityCatalogSource", "gcs = datahub.ingestion.source.gcs.gcs_source:GCSSource", + "sql-queries = datahub.ingestion.source.sql_queries:SqlQueriesSource", ], "datahub.ingestion.transformer.plugins": [ "simple_remove_dataset_ownership = datahub.ingestion.transformer.remove_dataset_ownership:SimpleRemoveDatasetOwnership", diff --git a/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py b/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py new file mode 100644 index 0000000000000..071d590f270f8 --- /dev/null +++ b/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py @@ -0,0 +1,289 @@ +import logging +import time +from collections import defaultdict +from dataclasses import dataclass, field +from datetime import datetime +from typing import Collection, Dict, Iterable, List, Optional, Set + +from datahub.emitter.mce_builder import make_schema_field_urn +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.usage.usage_common import BaseUsageConfig, UsageAggregator +from datahub.metadata.schema_classes import ( + AuditStampClass, + DatasetLineageTypeClass, + FineGrainedLineageClass, + FineGrainedLineageDownstreamTypeClass, + FineGrainedLineageUpstreamTypeClass, + OperationClass, + OperationTypeClass, + UpstreamClass, + UpstreamLineageClass, +) +from datahub.utilities.sqlglot_lineage import ColumnLineageInfo, SqlParsingResult + +logger = logging.getLogger(__name__) + +# TODO: Use this over other sources' equivalent code, if possible + +DatasetUrn = str +FieldUrn = str +UserUrn = str + + +@dataclass +class LineageEdge: + """Stores information about a single lineage edge, from an upstream table to a downstream table.""" + + downstream_urn: DatasetUrn + upstream_urn: DatasetUrn + audit_stamp: Optional[datetime] + actor: Optional[UserUrn] + type: str = DatasetLineageTypeClass.TRANSFORMED + + # Maps downstream_col -> {upstream_col} + column_map: Dict[str, Set[str]] = field(default_factory=lambda: defaultdict(set)) + + def gen_upstream_aspect(self) -> UpstreamClass: + return UpstreamClass( + auditStamp=AuditStampClass( + time=int(self.audit_stamp.timestamp() * 1000), actor=self.actor or "" + ) + if self.audit_stamp + else None, + dataset=self.upstream_urn, + type=self.type, + ) + + def gen_fine_grained_lineage_aspects(self) -> Iterable[FineGrainedLineageClass]: + for downstream_col, upstream_cols in self.column_map.items(): + yield FineGrainedLineageClass( + upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET, + # Sort to avoid creating multiple aspects in backend with same lineage but different order + upstreams=sorted( + make_schema_field_urn(self.upstream_urn, col) + for col in upstream_cols + ), + downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD, + downstreams=[ + make_schema_field_urn(self.downstream_urn, downstream_col) + ], + ) + + +@dataclass +class SqlParsingBuilder: + # Open question: does it make sense to iterate over out_tables? When will we have multiple? 
+ + generate_lineage: bool = True + generate_usage_statistics: bool = True + generate_operations: bool = True + usage_config: Optional[BaseUsageConfig] = None + + # TODO: Make inner dict a FileBackedDict and make LineageEdge frozen + # Builds up a single LineageEdge for each upstream -> downstream pair + _lineage_map: Dict[DatasetUrn, Dict[DatasetUrn, LineageEdge]] = field( + default_factory=lambda: defaultdict(dict), init=False + ) + + # TODO: Replace with FileBackedDict approach like in BigQuery usage + _usage_aggregator: UsageAggregator[DatasetUrn] = field(init=False) + + def __post_init__(self) -> None: + if self.usage_config: + self._usage_aggregator = UsageAggregator(self.usage_config) + else: + logger.info("No usage config provided, not generating usage statistics") + self.generate_usage_statistics = False + + def process_sql_parsing_result( + self, + result: SqlParsingResult, + *, + query: str, + query_timestamp: Optional[datetime] = None, + is_view_ddl: bool = False, + user: Optional[UserUrn] = None, + custom_operation_type: Optional[str] = None, + include_urns: Optional[Set[DatasetUrn]] = None, + ) -> Iterable[MetadataWorkUnit]: + """Process a single query and yield any generated workunits. + + Args: + result: The result of parsing the query, or a mock result if parsing failed. + query: The SQL query to parse and process. + query_timestamp: When the query was run. + is_view_ddl: Whether the query is a DDL statement that creates a view. + user: The urn of the user who ran the query. + custom_operation_type: Platform-specific operation type, used if the operation type can't be parsed. + include_urns: If provided, only generate workunits for these urns. + """ + downstreams_to_ingest = result.out_tables + upstreams_to_ingest = result.in_tables + if include_urns: + logger.debug(f"Skipping urns {set(downstreams_to_ingest) - include_urns}") + downstreams_to_ingest = list(set(downstreams_to_ingest) & include_urns) + upstreams_to_ingest = list(set(upstreams_to_ingest) & include_urns) + + if self.generate_lineage: + for downstream_urn in downstreams_to_ingest: + _merge_lineage_data( + downstream_urn=downstream_urn, + upstream_urns=result.in_tables, + column_lineage=result.column_lineage, + upstream_edges=self._lineage_map[downstream_urn], + query_timestamp=query_timestamp, + is_view_ddl=is_view_ddl, + user=user, + ) + + if self.generate_usage_statistics and query_timestamp is not None: + upstream_fields = _compute_upstream_fields(result) + for upstream_urn in upstreams_to_ingest: + self._usage_aggregator.aggregate_event( + resource=upstream_urn, + start_time=query_timestamp, + query=query, + user=user, + fields=sorted(upstream_fields.get(upstream_urn, [])), + ) + + if self.generate_operations and query_timestamp is not None: + for downstream_urn in downstreams_to_ingest: + yield from _gen_operation_workunit( + result, + downstream_urn=downstream_urn, + query_timestamp=query_timestamp, + user=user, + custom_operation_type=custom_operation_type, + ) + + def add_lineage( + self, + downstream_urn: DatasetUrn, + upstream_urns: Collection[DatasetUrn], + timestamp: Optional[datetime] = None, + is_view_ddl: bool = False, + user: Optional[UserUrn] = None, + ) -> None: + """Manually add a single upstream -> downstream lineage edge, e.g. 
if sql parsing fails.""" + _merge_lineage_data( + downstream_urn=downstream_urn, + upstream_urns=upstream_urns, + column_lineage=None, + upstream_edges=self._lineage_map[downstream_urn], + query_timestamp=timestamp, + is_view_ddl=is_view_ddl, + user=user, + ) + + def gen_workunits(self) -> Iterable[MetadataWorkUnit]: + if self.generate_lineage: + yield from self._gen_lineage_workunits() + if self.generate_usage_statistics: + yield from self._gen_usage_statistics_workunits() + + def _gen_lineage_workunits(self) -> Iterable[MetadataWorkUnit]: + for downstream_urn in self._lineage_map: + upstreams: List[UpstreamClass] = [] + fine_upstreams: List[FineGrainedLineageClass] = [] + for upstream_urn, edge in self._lineage_map[downstream_urn].items(): + upstreams.append(edge.gen_upstream_aspect()) + fine_upstreams.extend(edge.gen_fine_grained_lineage_aspects()) + + upstream_lineage = UpstreamLineageClass( + upstreams=sorted(upstreams, key=lambda x: x.dataset), + fineGrainedLineages=sorted( + fine_upstreams, + key=lambda x: (x.downstreams, x.upstreams), + ) + or None, + ) + yield MetadataChangeProposalWrapper( + entityUrn=downstream_urn, aspect=upstream_lineage + ).as_workunit() + + def _gen_usage_statistics_workunits(self) -> Iterable[MetadataWorkUnit]: + yield from self._usage_aggregator.generate_workunits( + resource_urn_builder=lambda urn: urn, user_urn_builder=lambda urn: urn + ) + + +def _merge_lineage_data( + downstream_urn: DatasetUrn, + *, + upstream_urns: Collection[DatasetUrn], + column_lineage: Optional[List[ColumnLineageInfo]], + upstream_edges: Dict[DatasetUrn, LineageEdge], + query_timestamp: Optional[datetime], + is_view_ddl: bool, + user: Optional[UserUrn], +) -> None: + for upstream_urn in upstream_urns: + edge = upstream_edges.setdefault( + upstream_urn, + LineageEdge( + downstream_urn=downstream_urn, + upstream_urn=upstream_urn, + audit_stamp=query_timestamp, + actor=user, + type=DatasetLineageTypeClass.VIEW + if is_view_ddl + else DatasetLineageTypeClass.TRANSFORMED, + ), + ) + if query_timestamp and ( # Use the most recent query + edge.audit_stamp is None or query_timestamp > edge.audit_stamp + ): + edge.audit_stamp = query_timestamp + if user: + edge.actor = user + + # Note: Inefficient as we loop through all column_lineage entries for each downstream table + for cl in column_lineage or []: + if cl.downstream.table == downstream_urn: + for upstream_column_info in cl.upstreams: + if upstream_column_info.table not in upstream_urns: + continue + column_map = upstream_edges[upstream_column_info.table].column_map + column_map[cl.downstream.column].add(upstream_column_info.column) + + +def _compute_upstream_fields( + result: SqlParsingResult, +) -> Dict[DatasetUrn, Set[DatasetUrn]]: + upstream_fields: Dict[DatasetUrn, Set[DatasetUrn]] = defaultdict(set) + for cl in result.column_lineage or []: + for upstream in cl.upstreams: + upstream_fields[upstream.table].add(upstream.column) + return upstream_fields + + +def _gen_operation_workunit( + result: SqlParsingResult, + *, + downstream_urn: DatasetUrn, + query_timestamp: datetime, + user: Optional[UserUrn], + custom_operation_type: Optional[str], +) -> Iterable[MetadataWorkUnit]: + operation_type = result.query_type.to_operation_type() + # Filter out SELECT and other undesired statements + if operation_type is None: + return + elif operation_type == OperationTypeClass.UNKNOWN: + if custom_operation_type is None: + return + else: + operation_type = OperationTypeClass.CUSTOM + + aspect = OperationClass( + timestampMillis=int(time.time() * 
1000), + operationType=operation_type, + lastUpdatedTimestamp=int(query_timestamp.timestamp() * 1000), + actor=user, + customOperationType=custom_operation_type, + ) + yield MetadataChangeProposalWrapper( + entityUrn=downstream_urn, aspect=aspect + ).as_workunit() diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index 243c1848279c7..50ea69b6c13a9 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -7,7 +7,7 @@ from dataclasses import dataclass from datetime import datetime from json.decoder import JSONDecodeError -from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Type +from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Tuple, Type from avro.schema import RecordSchema from deprecated import deprecated @@ -38,6 +38,8 @@ SystemMetadataClass, TelemetryClientIdClass, ) +from datahub.utilities.perf_timer import PerfTimer +from datahub.utilities.urns.dataset_urn import DatasetUrn from datahub.utilities.urns.urn import Urn, guess_entity_type if TYPE_CHECKING: @@ -957,7 +959,11 @@ def delete_references_to_urn( @functools.lru_cache() def _make_schema_resolver( - self, platform: str, platform_instance: Optional[str], env: str + self, + platform: str, + platform_instance: Optional[str], + env: str, + include_graph: bool = True, ) -> "SchemaResolver": from datahub.utilities.sqlglot_lineage import SchemaResolver @@ -965,8 +971,50 @@ def _make_schema_resolver( platform=platform, platform_instance=platform_instance, env=env, - graph=self, + graph=self if include_graph else None, + ) + + def initialize_schema_resolver_from_datahub( + self, platform: str, platform_instance: Optional[str], env: str + ) -> Tuple["SchemaResolver", Set[str]]: + logger.info("Initializing schema resolver") + + # TODO: Filter on platform instance? + logger.info(f"Fetching urns for platform {platform}, env {env}") + with PerfTimer() as timer: + urns = set( + self.get_urns_by_filter( + entity_types=[DatasetUrn.ENTITY_TYPE], + platform=platform, + env=env, + batch_size=3000, + ) + ) + logger.info( + f"Fetched {len(urns)} urns in {timer.elapsed_seconds()} seconds" + ) + + schema_resolver = self._make_schema_resolver( + platform, platform_instance, env, include_graph=False ) + with PerfTimer() as timer: + count = 0 + for i, urn in enumerate(urns): + if i % 1000 == 0: + logger.debug(f"Loaded {i} schema metadata") + try: + schema_metadata = self.get_aspect(urn, SchemaMetadataClass) + if schema_metadata: + schema_resolver.add_schema_metadata(urn, schema_metadata) + count += 1 + except Exception: + logger.warning("Failed to load schema metadata", exc_info=True) + logger.info( + f"Loaded {count} schema metadata in {timer.elapsed_seconds()} seconds" + ) + + logger.info("Finished initializing schema resolver") + return schema_resolver, urns def parse_sql_lineage( self, @@ -982,9 +1030,7 @@ def parse_sql_lineage( # Cache the schema resolver to make bulk parsing faster. 
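# (A clarifying note on the caching mentioned above, based only on the code in this hunk:
# _make_schema_resolver is decorated with @functools.lru_cache, so repeated
# parse_sql_lineage calls with the same platform / platform_instance / env reuse a
# single SchemaResolver instance instead of rebuilding it for every query.)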
schema_resolver = self._make_schema_resolver( - platform=platform, - platform_instance=platform_instance, - env=env, + platform=platform, platform_instance=platform_instance, env=env ) return sqlglot_lineage( diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py b/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py new file mode 100644 index 0000000000000..2fcc93292c2ef --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py @@ -0,0 +1,223 @@ +import json +import logging +import os +from dataclasses import dataclass +from datetime import datetime, timezone +from functools import partial +from typing import Iterable, List, Optional, Set + +from pydantic import Field + +from datahub.configuration.source_common import ( + EnvConfigMixin, + PlatformInstanceConfigMixin, +) +from datahub.emitter.mce_builder import ( + make_dataset_urn_with_platform_instance, + make_user_urn, +) +from datahub.emitter.sql_parsing_builder import SqlParsingBuilder +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SupportStatus, + config_class, + platform_name, + support_status, +) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source, SourceReport +from datahub.ingestion.api.source_helpers import auto_workunit_reporter +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.graph.client import DataHubGraph +from datahub.ingestion.source.usage.usage_common import BaseUsageConfig +from datahub.utilities.sqlglot_lineage import SchemaResolver, sqlglot_lineage + +logger = logging.getLogger(__name__) + + +class SqlQueriesSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin): + query_file: str = Field(description="Path to file to ingest") + + platform: str = Field( + description="The platform for which to generate data, e.g. snowflake" + ) + + usage: BaseUsageConfig = Field( + description="The usage config to use when generating usage statistics", + default=BaseUsageConfig(), + ) + + use_schema_resolver: bool = Field( + description="Read SchemaMetadata aspects from DataHub to aid in SQL parsing. 
Turn off only for testing.", + default=True, + hidden_from_docs=True, + ) + default_db: Optional[str] = Field( + description="The default database to use for unqualified table names", + default=None, + ) + default_schema: Optional[str] = Field( + description="The default schema to use for unqualified table names", + default=None, + ) + + +class SqlQueriesSourceReport(SourceReport): + num_queries_parsed: int = 0 + num_table_parse_failures: int = 0 + num_column_parse_failures: int = 0 + + def compute_stats(self) -> None: + super().compute_stats() + self.table_failure_rate = ( + f"{self.num_table_parse_failures / self.num_queries_parsed:.4f}" + if self.num_queries_parsed + else "0" + ) + self.column_failure_rate = ( + f"{self.num_column_parse_failures / self.num_queries_parsed:.4f}" + if self.num_queries_parsed + else "0" + ) + + +@platform_name("SQL Queries") +@config_class(SqlQueriesSourceConfig) +@support_status(SupportStatus.TESTING) +class SqlQueriesSource(Source): + # TODO: Documentation + urns: Optional[Set[str]] + schema_resolver: SchemaResolver + builder: SqlParsingBuilder + + def __init__(self, ctx: PipelineContext, config: SqlQueriesSourceConfig): + if not ctx.graph: + raise ValueError( + "SqlQueriesSource needs a datahub_api from which to pull schema metadata" + ) + + self.graph: DataHubGraph = ctx.graph + self.ctx = ctx + self.config = config + self.report = SqlQueriesSourceReport() + + self.builder = SqlParsingBuilder(usage_config=self.config.usage) + + if self.config.use_schema_resolver: + schema_resolver, urns = self.graph.initialize_schema_resolver_from_datahub( + platform=self.config.platform, + platform_instance=self.config.platform_instance, + env=self.config.env, + ) + self.schema_resolver = schema_resolver + self.urns = urns + else: + self.schema_resolver = self.graph._make_schema_resolver( + platform=self.config.platform, + platform_instance=self.config.platform_instance, + env=self.config.env, + ) + self.urns = None + + @classmethod + def create(cls, config_dict: dict, ctx: PipelineContext) -> "SqlQueriesSource": + config = SqlQueriesSourceConfig.parse_obj(config_dict) + return cls(ctx, config) + + def get_report(self) -> SqlQueriesSourceReport: + return self.report + + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: + return [partial(auto_workunit_reporter, self.get_report())] + + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + logger.info(f"Parsing queries from {os.path.basename(self.config.query_file)}") + with open(self.config.query_file) as f: + for line in f: + try: + query_dict = json.loads(line, strict=False) + entry = QueryEntry.create(query_dict, config=self.config) + yield from self._process_query(entry) + except Exception as e: + logger.warning("Error processing query", exc_info=True) + self.report.report_warning("process-query", str(e)) + + logger.info("Generating workunits") + yield from self.builder.gen_workunits() + + def _process_query(self, entry: "QueryEntry") -> Iterable[MetadataWorkUnit]: + self.report.num_queries_parsed += 1 + if self.report.num_queries_parsed % 1000 == 0: + logger.info(f"Parsed {self.report.num_queries_parsed} queries") + + result = sqlglot_lineage( + sql=entry.query, + schema_resolver=self.schema_resolver, + default_db=self.config.default_db, + default_schema=self.config.default_schema, + ) + if result.debug_info.table_error: + logger.info(f"Error parsing table lineage, {result.debug_info.table_error}") + self.report.num_table_parse_failures += 1 + for downstream_urn in 
set(entry.downstream_tables): + self.builder.add_lineage( + downstream_urn=downstream_urn, + upstream_urns=entry.upstream_tables, + timestamp=entry.timestamp, + user=entry.user, + ) + return + elif result.debug_info.column_error: + logger.debug( + f"Error parsing column lineage, {result.debug_info.column_error}" + ) + self.report.num_column_parse_failures += 1 + + yield from self.builder.process_sql_parsing_result( + result, + query=entry.query, + query_timestamp=entry.timestamp, + user=entry.user, + custom_operation_type=entry.operation_type, + include_urns=self.urns, + ) + + +@dataclass +class QueryEntry: + query: str + timestamp: Optional[datetime] + user: Optional[str] + operation_type: Optional[str] + downstream_tables: List[str] + upstream_tables: List[str] + + @classmethod + def create( + cls, entry_dict: dict, *, config: SqlQueriesSourceConfig + ) -> "QueryEntry": + return cls( + query=entry_dict["query"], + timestamp=datetime.fromtimestamp(entry_dict["timestamp"], tz=timezone.utc) + if "timestamp" in entry_dict + else None, + user=make_user_urn(entry_dict["user"]) if "user" in entry_dict else None, + operation_type=entry_dict.get("operation_type"), + downstream_tables=[ + make_dataset_urn_with_platform_instance( + name=table, + platform=config.platform, + platform_instance=config.platform_instance, + env=config.env, + ) + for table in entry_dict.get("downstream_tables", []) + ], + upstream_tables=[ + make_dataset_urn_with_platform_instance( + name=table, + platform=config.platform, + platform_instance=config.platform_instance, + env=config.env, + ) + for table in entry_dict.get("upstream_tables", []) + ], + ) diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py index 6d028c4ac1b9e..534cac5cef2aa 100644 --- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py @@ -7,7 +7,6 @@ from collections import defaultdict from typing import Dict, List, Optional, Set, Tuple, Union -import pydantic import pydantic.dataclasses import sqlglot import sqlglot.errors @@ -23,7 +22,7 @@ from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.graph.client import DataHubGraph from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier -from datahub.metadata.schema_classes import SchemaMetadataClass +from datahub.metadata.schema_classes import OperationTypeClass, SchemaMetadataClass from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedDict from datahub.utilities.urns.dataset_urn import DatasetUrn @@ -34,6 +33,8 @@ # A lightweight table schema: column -> type mapping. SchemaInfo = Dict[str, str] +SQL_PARSE_RESULT_CACHE_SIZE = 1000 + class QueryType(enum.Enum): CREATE = "CREATE" @@ -45,6 +46,22 @@ class QueryType(enum.Enum): UNKNOWN = "UNKNOWN" + def to_operation_type(self) -> Optional[str]: + if self == QueryType.CREATE: + return OperationTypeClass.CREATE + elif self == QueryType.INSERT: + return OperationTypeClass.INSERT + elif self == QueryType.UPDATE: + return OperationTypeClass.UPDATE + elif self == QueryType.DELETE: + return OperationTypeClass.DELETE + elif self == QueryType.MERGE: + return OperationTypeClass.UPDATE + elif self == QueryType.SELECT: + return None + else: + return OperationTypeClass.UNKNOWN + def get_query_type_of_sql(expression: sqlglot.exp.Expression) -> QueryType: # UPGRADE: Once we use Python 3.10, replace this with a match expression. 
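A minimal sketch of how the new QueryType.to_operation_type() mapping behaves, using only the enum members and the OperationTypeClass constants imported in the hunk above (illustrative, not part of the diff):

    from datahub.metadata.schema_classes import OperationTypeClass
    from datahub.utilities.sqlglot_lineage import QueryType

    # DDL/DML statements map to their corresponding operation types.
    assert QueryType.CREATE.to_operation_type() == OperationTypeClass.CREATE
    assert QueryType.INSERT.to_operation_type() == OperationTypeClass.INSERT
    # MERGE is reported as an UPDATE operation.
    assert QueryType.MERGE.to_operation_type() == OperationTypeClass.UPDATE
    # SELECTs return None, so no operation aspect is emitted for them.
    assert QueryType.SELECT.to_operation_type() is None
    # Anything unrecognized falls back to UNKNOWN.
    assert QueryType.UNKNOWN.to_operation_type() == OperationTypeClass.UNKNOWN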
@@ -623,16 +640,21 @@ def _translate_internal_column_lineage( ) +def _get_dialect(platform: str) -> str: + # TODO: convert datahub platform names to sqlglot dialect + if platform == "presto-on-hive": + return "hive" + else: + return platform + + def _sqlglot_lineage_inner( sql: str, schema_resolver: SchemaResolver, default_db: Optional[str] = None, default_schema: Optional[str] = None, ) -> SqlParsingResult: - # TODO: convert datahub platform names to sqlglot dialect - # TODO: Pull the platform name from the schema resolver? - dialect = schema_resolver.platform - + dialect = _get_dialect(schema_resolver.platform) if dialect == "snowflake": # in snowflake, table identifiers must be uppercased to match sqlglot's behavior. if default_db: @@ -755,6 +777,7 @@ def _sqlglot_lineage_inner( ) +@functools.lru_cache(maxsize=SQL_PARSE_RESULT_CACHE_SIZE) def sqlglot_lineage( sql: str, schema_resolver: SchemaResolver, From a78e72caf81014434d404e4c5acd142728af3d20 Mon Sep 17 00:00:00 2001 From: Joshua Eilers Date: Thu, 24 Aug 2023 10:12:26 -0700 Subject: [PATCH 17/20] highlight matched fields in search results (#8651) --- .../datahub/graphql/GmsGraphQLEngine.java | 5 + .../resolvers/config/AppConfigResolver.java | 8 ++ .../graphql/types/mappers/MapperUtils.java | 21 ++- .../src/main/resources/app.graphql | 15 ++ .../src/main/resources/search.graphql | 5 + .../src/app/entity/EntityRegistry.tsx | 6 +- .../src/app/entity/chart/ChartEntity.tsx | 9 +- .../src/app/entity/chart/ChartSnippet.tsx | 53 ------- .../app/entity/dashboard/DashboardEntity.tsx | 10 +- .../src/app/entity/dataset/DatasetEntity.tsx | 5 +- .../entity/dataset/DatasetSearchSnippet.tsx | 39 ----- .../app/entity/dataset/search/highlights.ts | 7 - .../app/entity/dataset/shared/TagSummary.tsx | 38 ----- .../app/entity/dataset/shared/TermSummary.tsx | 36 ----- .../src/app/entity/group/preview/Preview.tsx | 12 +- .../app/entity/shared/__tests__/utils.test.ts | 37 ----- .../components/styled/StripMarkdownText.tsx | 6 +- .../shared/components/styled/StyledTag.tsx | 10 +- .../src/app/entity/shared/utils.ts | 44 +----- .../src/app/entity/user/preview/Preview.tsx | 11 +- .../src/app/preview/DefaultPreviewCard.tsx | 4 +- .../app/search/EntityGroupSearchResults.tsx | 98 ------------- .../src/app/search/context/SearchContext.tsx | 6 + .../search/context/SearchContextProvider.tsx | 3 +- .../search/context/SearchResultContext.tsx | 72 ++++++++++ .../app/search/matches/MatchedFieldList.tsx | 133 +++++++++++++++++ .../search/matches/SearchTextHighlighter.tsx | 42 ++++++ .../src/app/search/matches/constants.ts | 129 +++++++++++++++++ .../matches/matchedFieldPathsRenderer.tsx | 8 ++ .../matches/matchedInputFieldRenderer.tsx | 40 ++++++ .../src/app/search/matches/utils.test.ts | 110 ++++++++++++++ .../src/app/search/matches/utils.ts | 136 ++++++++++++++++++ .../src/app/shared/tags/tag/Tag.tsx | 3 + .../src/app/shared/tags/term/TermContent.tsx | 13 +- datahub-web-react/src/appConfigContext.tsx | 3 + .../src/conf/theme/theme_dark.config.json | 4 +- .../src/conf/theme/theme_light.config.json | 4 +- datahub-web-react/src/conf/theme/types.ts | 2 + datahub-web-react/src/graphql/app.graphql | 3 + datahub-web-react/src/graphql/search.graphql | 5 + .../metadata/search/utils/SearchUtils.java | 2 +- .../config/SearchResultVisualConfig.java | 11 ++ .../metadata/config/VisualConfiguration.java | 5 + .../src/main/resources/application.yml | 2 + 44 files changed, 840 insertions(+), 375 deletions(-) delete mode 100644 datahub-web-react/src/app/entity/chart/ChartSnippet.tsx 
delete mode 100644 datahub-web-react/src/app/entity/dataset/DatasetSearchSnippet.tsx delete mode 100644 datahub-web-react/src/app/entity/dataset/search/highlights.ts delete mode 100644 datahub-web-react/src/app/entity/dataset/shared/TagSummary.tsx delete mode 100644 datahub-web-react/src/app/entity/dataset/shared/TermSummary.tsx delete mode 100644 datahub-web-react/src/app/entity/shared/__tests__/utils.test.ts delete mode 100644 datahub-web-react/src/app/search/EntityGroupSearchResults.tsx create mode 100644 datahub-web-react/src/app/search/context/SearchResultContext.tsx create mode 100644 datahub-web-react/src/app/search/matches/MatchedFieldList.tsx create mode 100644 datahub-web-react/src/app/search/matches/SearchTextHighlighter.tsx create mode 100644 datahub-web-react/src/app/search/matches/constants.ts create mode 100644 datahub-web-react/src/app/search/matches/matchedFieldPathsRenderer.tsx create mode 100644 datahub-web-react/src/app/search/matches/matchedInputFieldRenderer.tsx create mode 100644 datahub-web-react/src/app/search/matches/utils.test.ts create mode 100644 datahub-web-react/src/app/search/matches/utils.ts create mode 100644 metadata-service/configuration/src/main/java/com/linkedin/metadata/config/SearchResultVisualConfig.java diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java index d6dd2de6d31e3..682710ad5d539 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java @@ -68,6 +68,7 @@ import com.linkedin.datahub.graphql.generated.ListQueriesResult; import com.linkedin.datahub.graphql.generated.ListTestsResult; import com.linkedin.datahub.graphql.generated.ListViewsResult; +import com.linkedin.datahub.graphql.generated.MatchedField; import com.linkedin.datahub.graphql.generated.MLFeature; import com.linkedin.datahub.graphql.generated.MLFeatureProperties; import com.linkedin.datahub.graphql.generated.MLFeatureTable; @@ -1008,6 +1009,10 @@ private void configureGenericEntityResolvers(final RuntimeWiring.Builder builder .dataFetcher("entity", new EntityTypeResolver(entityTypes, (env) -> ((SearchResult) env.getSource()).getEntity())) ) + .type("MatchedField", typeWiring -> typeWiring + .dataFetcher("entity", new EntityTypeResolver(entityTypes, + (env) -> ((MatchedField) env.getSource()).getEntity())) + ) .type("SearchAcrossLineageResult", typeWiring -> typeWiring .dataFetcher("entity", new EntityTypeResolver(entityTypes, (env) -> ((SearchAcrossLineageResult) env.getSource()).getEntity())) diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java index 2c55bc79fe501..90017f7b87997 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java @@ -18,6 +18,7 @@ import com.linkedin.datahub.graphql.generated.Privilege; import com.linkedin.datahub.graphql.generated.QueriesTabConfig; import com.linkedin.datahub.graphql.generated.ResourcePrivileges; +import com.linkedin.datahub.graphql.generated.SearchResultsVisualConfig; import com.linkedin.datahub.graphql.generated.TelemetryConfig; import 
com.linkedin.datahub.graphql.generated.TestsConfig; import com.linkedin.datahub.graphql.generated.ViewsConfig; @@ -144,6 +145,13 @@ public CompletableFuture get(final DataFetchingEnvironment environmen } visualConfig.setEntityProfiles(entityProfilesConfig); } + if (_visualConfiguration != null && _visualConfiguration.getSearchResult() != null) { + SearchResultsVisualConfig searchResultsVisualConfig = new SearchResultsVisualConfig(); + if (_visualConfiguration.getSearchResult().getEnableNameHighlight() != null) { + searchResultsVisualConfig.setEnableNameHighlight(_visualConfiguration.getSearchResult().getEnableNameHighlight()); + } + visualConfig.setSearchResult(searchResultsVisualConfig); + } appConfig.setVisualConfig(visualConfig); final TelemetryConfig telemetryConfig = new TelemetryConfig(); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/MapperUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/MapperUtils.java index 0b292a373ea40..2c9aa13934afc 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/MapperUtils.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/MapperUtils.java @@ -1,5 +1,6 @@ package com.linkedin.datahub.graphql.types.mappers; +import com.linkedin.common.urn.Urn; import com.linkedin.datahub.graphql.generated.AggregationMetadata; import com.linkedin.datahub.graphql.generated.FacetMetadata; import com.linkedin.datahub.graphql.generated.MatchedField; @@ -7,6 +8,10 @@ import com.linkedin.datahub.graphql.resolvers.EntityTypeMapper; import com.linkedin.datahub.graphql.types.common.mappers.UrnToEntityMapper; import com.linkedin.metadata.search.SearchEntity; +import com.linkedin.metadata.search.utils.SearchUtils; +import lombok.extern.slf4j.Slf4j; + +import java.net.URISyntaxException; import java.util.List; import java.util.Optional; import java.util.stream.Collectors; @@ -16,6 +21,7 @@ import static com.linkedin.metadata.utils.SearchUtil.*; +@Slf4j public class MapperUtils { private MapperUtils() { @@ -54,7 +60,20 @@ public static String convertFilterValue(String filterValue, List isEnti public static List getMatchedFieldEntry(List highlightMetadata) { return highlightMetadata.stream() - .map(field -> new MatchedField(field.getName(), field.getValue())) + .map(field -> { + MatchedField matchedField = new MatchedField(); + matchedField.setName(field.getName()); + matchedField.setValue(field.getValue()); + if (SearchUtils.isUrn(field.getValue())) { + try { + Urn urn = Urn.createFromString(field.getValue()); + matchedField.setEntity(UrnToEntityMapper.map(urn)); + } catch (URISyntaxException e) { + log.warn("Failed to create urn from MatchedField value: {}", field.getValue(), e); + } + } + return matchedField; + }) .collect(Collectors.toList()); } } diff --git a/datahub-graphql-core/src/main/resources/app.graphql b/datahub-graphql-core/src/main/resources/app.graphql index 761242a6711c1..dbee24b4bf6f7 100644 --- a/datahub-graphql-core/src/main/resources/app.graphql +++ b/datahub-graphql-core/src/main/resources/app.graphql @@ -221,6 +221,11 @@ type VisualConfig { Configuration for the queries tab """ entityProfiles: EntityProfilesConfig + + """ + Configuration for search results + """ + searchResult: SearchResultsVisualConfig } """ @@ -255,6 +260,16 @@ type EntityProfileConfig { defaultTab: String } +""" +Configuration for a search result +""" +type SearchResultsVisualConfig { + """ + Whether a search result should highlight 
the name/description if it was matched on those fields. + """ + enableNameHighlight: Boolean +} + """ Configurations related to tracking users in the app """ diff --git a/datahub-graphql-core/src/main/resources/search.graphql b/datahub-graphql-core/src/main/resources/search.graphql index fbea66f738955..85f33756ae744 100644 --- a/datahub-graphql-core/src/main/resources/search.graphql +++ b/datahub-graphql-core/src/main/resources/search.graphql @@ -665,6 +665,11 @@ type MatchedField { Value of the field that matched """ value: String! + + """ + Entity if the value is an urn + """ + entity: Entity } """ diff --git a/datahub-web-react/src/app/entity/EntityRegistry.tsx b/datahub-web-react/src/app/entity/EntityRegistry.tsx index a07fd02841197..56b085cf69f4a 100644 --- a/datahub-web-react/src/app/entity/EntityRegistry.tsx +++ b/datahub-web-react/src/app/entity/EntityRegistry.tsx @@ -1,5 +1,7 @@ +import React from 'react'; import { Entity as EntityInterface, EntityType, SearchResult } from '../../types.generated'; import { FetchedEntity } from '../lineage/types'; +import { SearchResultProvider } from '../search/context/SearchResultContext'; import { Entity, EntityCapabilityType, IconStyleType, PreviewType } from './Entity'; import { GLOSSARY_ENTITY_TYPES } from './shared/constants'; import { GenericEntityProperties } from './shared/types'; @@ -119,7 +121,9 @@ export default class EntityRegistry { renderSearchResult(type: EntityType, searchResult: SearchResult): JSX.Element { const entity = validatedGet(type, this.entityTypeToEntity); - return entity.renderSearch(searchResult); + return ( + {entity.renderSearch(searchResult)} + ); } renderBrowse(type: EntityType, data: T): JSX.Element { diff --git a/datahub-web-react/src/app/entity/chart/ChartEntity.tsx b/datahub-web-react/src/app/entity/chart/ChartEntity.tsx index b5ebcbef80379..0f1b6dbf3d660 100644 --- a/datahub-web-react/src/app/entity/chart/ChartEntity.tsx +++ b/datahub-web-react/src/app/entity/chart/ChartEntity.tsx @@ -19,13 +19,14 @@ import { EntityMenuItems } from '../shared/EntityDropdown/EntityDropdown'; import { LineageTab } from '../shared/tabs/Lineage/LineageTab'; import { ChartStatsSummarySubHeader } from './profile/stats/ChartStatsSummarySubHeader'; import { InputFieldsTab } from '../shared/tabs/Entity/InputFieldsTab'; -import { ChartSnippet } from './ChartSnippet'; import { EmbedTab } from '../shared/tabs/Embed/EmbedTab'; import { capitalizeFirstLetterOnly } from '../../shared/textUtil'; import DataProductSection from '../shared/containers/profile/sidebar/DataProduct/DataProductSection'; import { getDataProduct } from '../shared/utils'; import EmbeddedProfile from '../shared/embed/EmbeddedProfile'; import { LOOKER_URN } from '../../ingest/source/builder/constants'; +import { MatchedFieldList } from '../../search/matches/MatchedFieldList'; +import { matchedInputFieldRenderer } from '../../search/matches/matchedInputFieldRenderer'; /** * Definition of the DataHub Chart entity. 
@@ -203,7 +204,11 @@ export class ChartEntity implements Entity { lastUpdatedMs={data.properties?.lastModified?.time} createdMs={data.properties?.created?.time} externalUrl={data.properties?.externalUrl} - snippet={} + snippet={ + matchedInputFieldRenderer(matchedField, data)} + /> + } degree={(result as any).degree} paths={(result as any).paths} /> diff --git a/datahub-web-react/src/app/entity/chart/ChartSnippet.tsx b/datahub-web-react/src/app/entity/chart/ChartSnippet.tsx deleted file mode 100644 index 27982d3037207..0000000000000 --- a/datahub-web-react/src/app/entity/chart/ChartSnippet.tsx +++ /dev/null @@ -1,53 +0,0 @@ -import React from 'react'; - -import { Typography } from 'antd'; -import { InputFields, MatchedField, Maybe } from '../../../types.generated'; -import TagTermGroup from '../../shared/tags/TagTermGroup'; -import { FIELDS_TO_HIGHLIGHT } from '../dataset/search/highlights'; -import { getMatchPrioritizingPrimary } from '../shared/utils'; - -type Props = { - matchedFields: MatchedField[]; - inputFields: Maybe | undefined; - isMatchingDashboard?: boolean; -}; - -const LABEL_INDEX_NAME = 'fieldLabels'; -const TYPE_PROPERTY_KEY_NAME = 'type'; - -export const ChartSnippet = ({ matchedFields, inputFields, isMatchingDashboard = false }: Props) => { - const matchedField = getMatchPrioritizingPrimary(matchedFields, 'fieldLabels'); - - if (matchedField?.name === LABEL_INDEX_NAME) { - const matchedSchemaField = inputFields?.fields?.find( - (field) => field?.schemaField?.label === matchedField.value, - ); - const matchedGlossaryTerm = matchedSchemaField?.schemaField?.glossaryTerms?.terms?.find( - (term) => term?.term?.name === matchedField.value, - ); - - if (matchedGlossaryTerm) { - let termType = 'term'; - const typeProperty = matchedGlossaryTerm.term.properties?.customProperties?.find( - (property) => property.key === TYPE_PROPERTY_KEY_NAME, - ); - if (typeProperty) { - termType = typeProperty.value || termType; - } - - return ( - - Matches {termType} {' '} - {isMatchingDashboard && 'on a contained Chart'} - - ); - } - } - - return matchedField ? ( - - Matches {FIELDS_TO_HIGHLIGHT.get(matchedField.name)} {matchedField.value}{' '} - {isMatchingDashboard && 'on a contained Chart'} - - ) : null; -}; diff --git a/datahub-web-react/src/app/entity/dashboard/DashboardEntity.tsx b/datahub-web-react/src/app/entity/dashboard/DashboardEntity.tsx index a64e437265262..0a36d0e5f1bfa 100644 --- a/datahub-web-react/src/app/entity/dashboard/DashboardEntity.tsx +++ b/datahub-web-react/src/app/entity/dashboard/DashboardEntity.tsx @@ -24,12 +24,13 @@ import { EntityMenuItems } from '../shared/EntityDropdown/EntityDropdown'; import { LineageTab } from '../shared/tabs/Lineage/LineageTab'; import { capitalizeFirstLetterOnly } from '../../shared/textUtil'; import { DashboardStatsSummarySubHeader } from './profile/DashboardStatsSummarySubHeader'; -import { ChartSnippet } from '../chart/ChartSnippet'; import { EmbedTab } from '../shared/tabs/Embed/EmbedTab'; import EmbeddedProfile from '../shared/embed/EmbeddedProfile'; import DataProductSection from '../shared/containers/profile/sidebar/DataProduct/DataProductSection'; import { getDataProduct } from '../shared/utils'; import { LOOKER_URN } from '../../ingest/source/builder/constants'; +import { MatchedFieldList } from '../../search/matches/MatchedFieldList'; +import { matchedInputFieldRenderer } from '../../search/matches/matchedInputFieldRenderer'; /** * Definition of the DataHub Dashboard entity. 
@@ -227,10 +228,9 @@ export class DashboardEntity implements Entity { lastUpdatedMs={data.properties?.lastModified?.time} createdMs={data.properties?.created?.time} snippet={ - matchedInputFieldRenderer(matchedField, data)} + matchSuffix="on a contained chart" /> } subtype={data.subTypes?.typeNames?.[0]} diff --git a/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx b/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx index cb4239872045f..ed3904bcf4e2d 100644 --- a/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx +++ b/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx @@ -25,11 +25,12 @@ import { OperationsTab } from './profile/OperationsTab'; import { EntityMenuItems } from '../shared/EntityDropdown/EntityDropdown'; import { SidebarSiblingsSection } from '../shared/containers/profile/sidebar/SidebarSiblingsSection'; import { DatasetStatsSummarySubHeader } from './profile/stats/stats/DatasetStatsSummarySubHeader'; -import { DatasetSearchSnippet } from './DatasetSearchSnippet'; +import { MatchedFieldList } from '../../search/matches/MatchedFieldList'; import { EmbedTab } from '../shared/tabs/Embed/EmbedTab'; import EmbeddedProfile from '../shared/embed/EmbeddedProfile'; import DataProductSection from '../shared/containers/profile/sidebar/DataProduct/DataProductSection'; import { getDataProduct } from '../shared/utils'; +import { matchedFieldPathsRenderer } from '../../search/matches/matchedFieldPathsRenderer'; const SUBTYPES = { VIEW: 'view', @@ -290,7 +291,7 @@ export class DatasetEntity implements Entity { subtype={data.subTypes?.typeNames?.[0]} container={data.container} parentContainers={data.parentContainers} - snippet={} + snippet={} insights={result.insights} externalUrl={data.properties?.externalUrl} statsSummary={data.statsSummary} diff --git a/datahub-web-react/src/app/entity/dataset/DatasetSearchSnippet.tsx b/datahub-web-react/src/app/entity/dataset/DatasetSearchSnippet.tsx deleted file mode 100644 index e4f88eb0fbbfa..0000000000000 --- a/datahub-web-react/src/app/entity/dataset/DatasetSearchSnippet.tsx +++ /dev/null @@ -1,39 +0,0 @@ -import React from 'react'; - -import { Typography } from 'antd'; -import { MatchedField } from '../../../types.generated'; -import { TagSummary } from './shared/TagSummary'; -import { TermSummary } from './shared/TermSummary'; -import { FIELDS_TO_HIGHLIGHT } from './search/highlights'; -import { getMatchPrioritizingPrimary } from '../shared/utils'; -import { downgradeV2FieldPath } from './profile/schema/utils/utils'; - -type Props = { - matchedFields: MatchedField[]; -}; - -const LABEL_INDEX_NAME = 'fieldLabels'; - -export const DatasetSearchSnippet = ({ matchedFields }: Props) => { - const matchedField = getMatchPrioritizingPrimary(matchedFields, LABEL_INDEX_NAME); - - let snippet: React.ReactNode; - - if (matchedField) { - if (matchedField.value.includes('urn:li:tag')) { - snippet = ; - } else if (matchedField.value.includes('urn:li:glossaryTerm')) { - snippet = ; - } else if (matchedField.name === 'fieldPaths') { - snippet = {downgradeV2FieldPath(matchedField.value)}; - } else { - snippet = {matchedField.value}; - } - } - - return matchedField ? 
( - - Matches {FIELDS_TO_HIGHLIGHT.get(matchedField.name)} {snippet}{' '} - - ) : null; -}; diff --git a/datahub-web-react/src/app/entity/dataset/search/highlights.ts b/datahub-web-react/src/app/entity/dataset/search/highlights.ts deleted file mode 100644 index 64505e0709c7b..0000000000000 --- a/datahub-web-react/src/app/entity/dataset/search/highlights.ts +++ /dev/null @@ -1,7 +0,0 @@ -export const FIELDS_TO_HIGHLIGHT = new Map(); -FIELDS_TO_HIGHLIGHT.set('fieldPaths', 'column'); -FIELDS_TO_HIGHLIGHT.set('fieldDescriptions', 'column description'); -FIELDS_TO_HIGHLIGHT.set('fieldTags', 'column tag'); -FIELDS_TO_HIGHLIGHT.set('editedFieldDescriptions', 'column description'); -FIELDS_TO_HIGHLIGHT.set('editedFieldTags', 'column tag'); -FIELDS_TO_HIGHLIGHT.set('fieldLabels', 'label'); diff --git a/datahub-web-react/src/app/entity/dataset/shared/TagSummary.tsx b/datahub-web-react/src/app/entity/dataset/shared/TagSummary.tsx deleted file mode 100644 index 106cc298fb58c..0000000000000 --- a/datahub-web-react/src/app/entity/dataset/shared/TagSummary.tsx +++ /dev/null @@ -1,38 +0,0 @@ -import React from 'react'; -import styled from 'styled-components'; -import { useGetTagQuery } from '../../../../graphql/tag.generated'; -import { EntityType, Tag } from '../../../../types.generated'; -import { HoverEntityTooltip } from '../../../recommendations/renderer/component/HoverEntityTooltip'; -import { useEntityRegistry } from '../../../useEntityRegistry'; -import { StyledTag } from '../../shared/components/styled/StyledTag'; - -const TagLink = styled.span` - display: inline-block; -`; - -type Props = { - urn: string; -}; - -export const TagSummary = ({ urn }: Props) => { - const entityRegistry = useEntityRegistry(); - const { data } = useGetTagQuery({ variables: { urn } }); - return ( - <> - {data && ( - - - - {entityRegistry.getDisplayName(EntityType.Tag, data?.tag)} - - - - )} - - ); -}; diff --git a/datahub-web-react/src/app/entity/dataset/shared/TermSummary.tsx b/datahub-web-react/src/app/entity/dataset/shared/TermSummary.tsx deleted file mode 100644 index cc1274693a342..0000000000000 --- a/datahub-web-react/src/app/entity/dataset/shared/TermSummary.tsx +++ /dev/null @@ -1,36 +0,0 @@ -import React from 'react'; -import { Tag } from 'antd'; -import { BookOutlined } from '@ant-design/icons'; -import styled from 'styled-components'; -import { useGetGlossaryTermQuery } from '../../../../graphql/glossaryTerm.generated'; -import { HoverEntityTooltip } from '../../../recommendations/renderer/component/HoverEntityTooltip'; -import { EntityType, GlossaryTerm } from '../../../../types.generated'; -import { useEntityRegistry } from '../../../useEntityRegistry'; - -const TermLink = styled.span` - display: inline-block; -`; - -type Props = { - urn: string; -}; - -export const TermSummary = ({ urn }: Props) => { - const entityRegistry = useEntityRegistry(); - const { data } = useGetGlossaryTermQuery({ variables: { urn } }); - - return ( - <> - {data && ( - - - - - {entityRegistry.getDisplayName(EntityType.GlossaryTerm, data?.glossaryTerm)} - - - - )} - - ); -}; diff --git a/datahub-web-react/src/app/entity/group/preview/Preview.tsx b/datahub-web-react/src/app/entity/group/preview/Preview.tsx index dc83f6fe4f840..67449b9a481f0 100644 --- a/datahub-web-react/src/app/entity/group/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/group/preview/Preview.tsx @@ -8,6 +8,7 @@ import { useEntityRegistry } from '../../../useEntityRegistry'; import { ANTD_GRAY } from '../../shared/constants'; import { IconStyleType } 
from '../../Entity'; import NoMarkdownViewer from '../../shared/components/styled/StripMarkdownText'; +import SearchTextHighlighter from '../../../search/matches/SearchTextHighlighter'; const PreviewContainer = styled.div` margin-bottom: 4px; @@ -87,7 +88,9 @@ export const Preview = ({ {entityRegistry.getEntityName(EntityType.CorpGroup)} - {name || urn} + + {name ? : urn} + {membersCount} members @@ -96,7 +99,12 @@ export const Preview = ({ {description && description.length > 0 && ( - {description} + } + > + {description} + )} diff --git a/datahub-web-react/src/app/entity/shared/__tests__/utils.test.ts b/datahub-web-react/src/app/entity/shared/__tests__/utils.test.ts deleted file mode 100644 index 86dec46528b49..0000000000000 --- a/datahub-web-react/src/app/entity/shared/__tests__/utils.test.ts +++ /dev/null @@ -1,37 +0,0 @@ -import { getMatchPrioritizingPrimary } from '../utils'; - -const MOCK_MATCHED_FIELDS = [ - { - name: 'fieldPaths', - value: 'rain', - }, - { - name: 'description', - value: 'rainbow', - }, - { - name: 'fieldPaths', - value: 'rainbow', - }, - { - name: 'fieldPaths', - value: 'rainbows', - }, -]; - -describe('utils', () => { - describe('getMatchPrioritizingPrimary', () => { - it('prioritizes exact match', () => { - global.window.location.search = 'query=rainbow'; - const match = getMatchPrioritizingPrimary(MOCK_MATCHED_FIELDS, 'fieldPaths'); - expect(match?.value).toEqual('rainbow'); - expect(match?.name).toEqual('fieldPaths'); - }); - it('will accept first contains match', () => { - global.window.location.search = 'query=bow'; - const match = getMatchPrioritizingPrimary(MOCK_MATCHED_FIELDS, 'fieldPaths'); - expect(match?.value).toEqual('rainbow'); - expect(match?.name).toEqual('fieldPaths'); - }); - }); -}); diff --git a/datahub-web-react/src/app/entity/shared/components/styled/StripMarkdownText.tsx b/datahub-web-react/src/app/entity/shared/components/styled/StripMarkdownText.tsx index 59293c2b0eee5..212813ffcb643 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/StripMarkdownText.tsx +++ b/datahub-web-react/src/app/entity/shared/components/styled/StripMarkdownText.tsx @@ -17,6 +17,7 @@ export type Props = { suffix?: JSX.Element; limit?: number; shouldWrap?: boolean; + customRender?: (text: string) => JSX.Element; }; export const removeMarkdown = (text: string) => { @@ -29,7 +30,7 @@ export const removeMarkdown = (text: string) => { .replace(/^•/, ''); // remove first • }; -export default function NoMarkdownViewer({ children, readMore, suffix, limit, shouldWrap }: Props) { +export default function NoMarkdownViewer({ children, customRender, readMore, suffix, limit, shouldWrap }: Props) { let plainText = removeMarkdown(children || ''); if (limit) { @@ -44,7 +45,8 @@ export default function NoMarkdownViewer({ children, readMore, suffix, limit, sh return ( - {plainText} {showReadMore && <>{readMore}} {suffix} + {customRender ? 
customRender(plainText) : plainText} + {showReadMore && <>{readMore}} {suffix} ); } diff --git a/datahub-web-react/src/app/entity/shared/components/styled/StyledTag.tsx b/datahub-web-react/src/app/entity/shared/components/styled/StyledTag.tsx index c1a23811fdd7e..08087bfd79b8e 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/StyledTag.tsx +++ b/datahub-web-react/src/app/entity/shared/components/styled/StyledTag.tsx @@ -6,7 +6,15 @@ export const generateColor = new ColorHash({ saturation: 0.9, }); -export const StyledTag = styled(Tag)<{ $color: any; $colorHash?: string; fontSize?: number }>` +export const StyledTag = styled(Tag)<{ $color: any; $colorHash?: string; fontSize?: number; highlightTag?: boolean }>` + &&& { + ${(props) => + props.highlightTag && + ` + background: ${props.theme.styles['highlight-color']}; + border: 1px solid ${props.theme.styles['highlight-border-color']}; + `} + } ${(props) => props.fontSize && `font-size: ${props.fontSize}px;`} ${(props) => props.$colorHash && diff --git a/datahub-web-react/src/app/entity/shared/utils.ts b/datahub-web-react/src/app/entity/shared/utils.ts index 7ec604785d1ff..a158cc9b7c119 100644 --- a/datahub-web-react/src/app/entity/shared/utils.ts +++ b/datahub-web-react/src/app/entity/shared/utils.ts @@ -1,9 +1,7 @@ -import * as QueryString from 'query-string'; import { Maybe } from 'graphql/jsutils/Maybe'; -import { Entity, EntityType, MatchedField, EntityRelationshipsResult, DataProduct } from '../../../types.generated'; +import { Entity, EntityType, EntityRelationshipsResult, DataProduct } from '../../../types.generated'; import { capitalizeFirstLetterOnly } from '../../shared/textUtil'; -import { FIELDS_TO_HIGHLIGHT } from '../dataset/search/highlights'; import { GenericEntityProperties } from './types'; export function dictToQueryStringParams(params: Record) { @@ -87,46 +85,6 @@ export const isListSubset = (l1, l2): boolean => { return l1.every((result) => l2.indexOf(result) >= 0); }; -function normalize(value: string) { - return value.trim().toLowerCase(); -} - -function fromQueryGetBestMatch(selectedMatchedFields: MatchedField[], rawQuery: string) { - const query = normalize(rawQuery); - // first lets see if there's an exact match between a field value and the query - const exactMatch = selectedMatchedFields.find((field) => normalize(field.value) === query); - if (exactMatch) { - return exactMatch; - } - - // if no exact match exists, we'll see if the entire query is contained in any of the values - const containedMatch = selectedMatchedFields.find((field) => normalize(field.value).includes(query)); - if (containedMatch) { - return containedMatch; - } - - // otherwise, just return whichever is first - return selectedMatchedFields[0]; -} - -export const getMatchPrioritizingPrimary = ( - matchedFields: MatchedField[], - primaryField: string, -): MatchedField | undefined => { - const { location } = window; - const params = QueryString.parse(location.search, { arrayFormat: 'comma' }); - const query: string = decodeURIComponent(params.query ? 
(params.query as string) : ''); - - const primaryMatches = matchedFields.filter((field) => field.name === primaryField); - if (primaryMatches.length > 0) { - return fromQueryGetBestMatch(primaryMatches, query); - } - - const matchesThatShouldBeShownOnFE = matchedFields.filter((field) => FIELDS_TO_HIGHLIGHT.has(field.name)); - - return fromQueryGetBestMatch(matchesThatShouldBeShownOnFE, query); -}; - function getGraphqlErrorCode(e) { if (e.graphQLErrors && e.graphQLErrors.length) { const firstError = e.graphQLErrors[0]; diff --git a/datahub-web-react/src/app/entity/user/preview/Preview.tsx b/datahub-web-react/src/app/entity/user/preview/Preview.tsx index 01f68d9065523..8893d4ab86786 100644 --- a/datahub-web-react/src/app/entity/user/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/user/preview/Preview.tsx @@ -7,6 +7,7 @@ import { useEntityRegistry } from '../../../useEntityRegistry'; import { ANTD_GRAY } from '../../shared/constants'; import { IconStyleType } from '../../Entity'; import { CustomAvatar } from '../../../shared/avatar'; +import SearchTextHighlighter from '../../../search/matches/SearchTextHighlighter'; const PreviewContainer = styled.div` display: flex; @@ -80,11 +81,17 @@ export const Preview = ({ {entityRegistry.getEntityName(EntityType.CorpUser)} - {name || urn} + + {name ? : urn} + - {title && {title}} + {title && ( + + + + )} diff --git a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx index 36713cfb7ffcf..5c7500f0bcf79 100644 --- a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx +++ b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx @@ -34,6 +34,7 @@ import ExternalUrlButton from '../entity/shared/ExternalUrlButton'; import EntityPaths from './EntityPaths/EntityPaths'; import { DataProductLink } from '../shared/tags/DataProductLink'; import { EntityHealth } from '../entity/shared/containers/profile/header/EntityHealth'; +import SearchTextHighlighter from '../search/matches/SearchTextHighlighter'; import { getUniqueOwners } from './utils'; const PreviewContainer = styled.div` @@ -289,7 +290,7 @@ export default function DefaultPreviewCard({ ) : ( - {name || ' '} + )} @@ -336,6 +337,7 @@ export default function DefaultPreviewCard({ ) : undefined } + customRender={(text) => } > {description} diff --git a/datahub-web-react/src/app/search/EntityGroupSearchResults.tsx b/datahub-web-react/src/app/search/EntityGroupSearchResults.tsx deleted file mode 100644 index 9b577048145c5..0000000000000 --- a/datahub-web-react/src/app/search/EntityGroupSearchResults.tsx +++ /dev/null @@ -1,98 +0,0 @@ -import { ArrowRightOutlined } from '@ant-design/icons'; -import { Button, Card, Divider, List, Space, Typography } from 'antd'; -import { ListProps } from 'antd/lib/list'; -import * as React from 'react'; -import { useHistory } from 'react-router-dom'; -import styled from 'styled-components'; -import { EntityType, SearchResult } from '../../types.generated'; -import { IconStyleType } from '../entity/Entity'; -import { useEntityRegistry } from '../useEntityRegistry'; -import { navigateToSearchUrl } from './utils/navigateToSearchUrl'; -import analytics, { EventType } from '../analytics'; - -const styles = { - header: { marginBottom: 20 }, - resultHeaderCardBody: { padding: '16px 24px' }, - resultHeaderCard: { right: '52px', top: '-40px', position: 'absolute' }, - seeAllButton: { fontSize: 18 }, - resultsContainer: { width: '100%', padding: '40px 132px' }, -}; - -const ResultList = styled(List)` - &&& { 
- width: 100%; - border-color: ${(props) => props.theme.styles['border-color-base']}; - margin-top: 8px; - padding: 16px 48px; - box-shadow: ${(props) => props.theme.styles['box-shadow']}; - } -`; - -interface Props { - type: EntityType; - query: string; - searchResults: Array; -} - -export const EntityGroupSearchResults = ({ type, query, searchResults }: Props) => { - const history = useHistory(); - const entityRegistry = useEntityRegistry(); - - const onResultClick = (result: SearchResult, index: number) => { - analytics.event({ - type: EventType.SearchResultClickEvent, - query, - entityUrn: result.entity.urn, - entityType: result.entity.type, - index, - total: searchResults.length, - }); - }; - - return ( - - >> - header={ - - {entityRegistry.getCollectionName(type)} - - {entityRegistry.getIcon(type, 36, IconStyleType.ACCENT)} - - - } - footer={ - searchResults.length > 0 && ( - - ) - } - dataSource={searchResults as SearchResult[]} - split={false} - renderItem={(searchResult, index) => ( - <> - onResultClick(searchResult, index)}> - {entityRegistry.renderSearchResult(type, searchResult)} - - {index < searchResults.length - 1 && } - - )} - bordered - /> - - ); -}; diff --git a/datahub-web-react/src/app/search/context/SearchContext.tsx b/datahub-web-react/src/app/search/context/SearchContext.tsx index ec9a0c895e876..656c57b0b22d0 100644 --- a/datahub-web-react/src/app/search/context/SearchContext.tsx +++ b/datahub-web-react/src/app/search/context/SearchContext.tsx @@ -1,11 +1,13 @@ import React, { useContext } from 'react'; export type SearchContextType = { + query: string | undefined; selectedSortOption: string | undefined; setSelectedSortOption: (sortOption: string) => void; }; export const DEFAULT_CONTEXT = { + query: undefined, selectedSortOption: undefined, setSelectedSortOption: (_: string) => null, }; @@ -21,3 +23,7 @@ export function useSearchContext() { export function useSelectedSortOption() { return useSearchContext().selectedSortOption; } + +export function useSearchQuery() { + return useSearchContext().query; +} diff --git a/datahub-web-react/src/app/search/context/SearchContextProvider.tsx b/datahub-web-react/src/app/search/context/SearchContextProvider.tsx index bfb65c1d74d3e..5ad9667ab1fc0 100644 --- a/datahub-web-react/src/app/search/context/SearchContextProvider.tsx +++ b/datahub-web-react/src/app/search/context/SearchContextProvider.tsx @@ -8,6 +8,7 @@ export default function SearchContextProvider({ children }: { children: React.Re const history = useHistory(); const location = useLocation(); const params = useMemo(() => QueryString.parse(location.search, { arrayFormat: 'comma' }), [location.search]); + const query = (params.query ? 
decodeURIComponent(params.query as string) : undefined) as string | undefined; const selectedSortOption = params.sortOption as string | undefined; function setSelectedSortOption(selectedOption: string) { @@ -15,7 +16,7 @@ export default function SearchContextProvider({ children }: { children: React.Re } return ( - + {children} ); diff --git a/datahub-web-react/src/app/search/context/SearchResultContext.tsx b/datahub-web-react/src/app/search/context/SearchResultContext.tsx new file mode 100644 index 0000000000000..68adead005149 --- /dev/null +++ b/datahub-web-react/src/app/search/context/SearchResultContext.tsx @@ -0,0 +1,72 @@ +import React, { ReactNode, createContext, useContext, useMemo } from 'react'; +import { SearchResult } from '../../../types.generated'; +import { + getMatchedFieldsByUrn, + getMatchedFieldNames, + getMatchedFieldsByNames, + shouldShowInMatchedFieldList, + getMatchedFieldLabel, + getMatchesPrioritized, +} from '../matches/utils'; +import { MatchedFieldName } from '../matches/constants'; + +type SearchResultContextValue = { + searchResult: SearchResult; +} | null; + +const SearchResultContext = createContext(null); + +type Props = { + children: ReactNode; + searchResult: SearchResult; +}; + +export const SearchResultProvider = ({ children, searchResult }: Props) => { + const value = useMemo( + () => ({ + searchResult, + }), + [searchResult], + ); + return {children}; +}; + +const useSearchResultContext = () => { + return useContext(SearchResultContext); +}; + +export const useSearchResult = () => { + return useSearchResultContext()?.searchResult; +}; + +export const useEntityType = () => { + return useSearchResultContext()?.searchResult.entity.type; +}; + +export const useMatchedFields = () => { + return useSearchResult()?.matchedFields ?? []; +}; + +export const useMatchedFieldsForList = (primaryField: MatchedFieldName) => { + const entityType = useEntityType(); + const matchedFields = useMatchedFields(); + const showableFields = matchedFields.filter((field) => shouldShowInMatchedFieldList(entityType, field)); + return entityType ? 
getMatchesPrioritized(entityType, showableFields, primaryField) : []; +}; + +export const useMatchedFieldsByGroup = (fieldName: MatchedFieldName) => { + const entityType = useEntityType(); + const matchedFields = useMatchedFields(); + const matchedFieldNames = getMatchedFieldNames(entityType, fieldName); + return getMatchedFieldsByNames(matchedFields, matchedFieldNames); +}; + +export const useHasMatchedFieldByUrn = (urn: string, fieldName: MatchedFieldName) => { + const matchedFields = useMatchedFieldsByGroup(fieldName); + return getMatchedFieldsByUrn(matchedFields, urn).length > 0; +}; + +export const useMatchedFieldLabel = (fieldName: string) => { + const entityType = useEntityType(); + return getMatchedFieldLabel(entityType, fieldName); +}; diff --git a/datahub-web-react/src/app/search/matches/MatchedFieldList.tsx b/datahub-web-react/src/app/search/matches/MatchedFieldList.tsx new file mode 100644 index 0000000000000..0bfe000dea366 --- /dev/null +++ b/datahub-web-react/src/app/search/matches/MatchedFieldList.tsx @@ -0,0 +1,133 @@ +import React from 'react'; + +import { Tooltip, Typography } from 'antd'; +import styled from 'styled-components'; +import { useMatchedFieldLabel, useMatchedFieldsForList } from '../context/SearchResultContext'; +import { MatchedField } from '../../../types.generated'; +import { ANTD_GRAY_V2 } from '../../entity/shared/constants'; +import { useSearchQuery } from '../context/SearchContext'; +import { MatchesGroupedByFieldName } from './constants'; +import { useEntityRegistry } from '../../useEntityRegistry'; +import { getDescriptionSlice, isDescriptionField, isHighlightableEntityField } from './utils'; + +const MatchesContainer = styled.div` + display: flex; + flex-wrap: wrap; + gap: 8px; +`; + +const MatchText = styled(Typography.Text)` + color: ${ANTD_GRAY_V2[8]}; + background: ${(props) => props.theme.styles['highlight-color']}; + border-radius: 4px; + padding: 2px 4px 2px 4px; + padding-right: 4px; +`; + +const MATCH_GROUP_LIMIT = 3; +const TOOLTIP_MATCH_GROUP_LIMIT = 10; + +type CustomFieldRenderer = (field: MatchedField) => JSX.Element | null; + +type Props = { + customFieldRenderer?: CustomFieldRenderer; + matchSuffix?: string; +}; + +const RenderedField = ({ + customFieldRenderer, + field, +}: { + customFieldRenderer?: CustomFieldRenderer; + field: MatchedField; +}) => { + const entityRegistry = useEntityRegistry(); + const query = useSearchQuery()?.trim().toLowerCase(); + const customRenderedField = customFieldRenderer?.(field); + if (customRenderedField) return {customRenderedField}; + if (isHighlightableEntityField(field)) { + return field.entity ? 
<>{entityRegistry.getDisplayName(field.entity.type, field.entity)} : <>; + } + if (isDescriptionField(field) && query) return {getDescriptionSlice(field.value, query)}; + return {field.value}; +}; + +const MatchedFieldsList = ({ + groupedMatch, + limit, + tooltip, + matchSuffix = '', + customFieldRenderer, +}: { + groupedMatch: MatchesGroupedByFieldName; + limit: number; + tooltip?: JSX.Element; + matchSuffix?: string; + customFieldRenderer?: CustomFieldRenderer; +}) => { + const label = useMatchedFieldLabel(groupedMatch.fieldName); + const count = groupedMatch.matchedFields.length; + const moreCount = Math.max(count - limit, 0); + const andMore = ( + <> + {' '} + & more + + ); + return ( + <> + Matches {count > 1 && `${count} `} + {label} + {count > 1 && 's'}{' '} + {groupedMatch.matchedFields.slice(0, limit).map((field, index) => ( + <> + {index > 0 && ', '} + <> + + + + ))} + {moreCount > 0 && + (tooltip ? ( + + {andMore} + + ) : ( + <>{andMore} + ))}{' '} + {matchSuffix} + + ); +}; + +export const MatchedFieldList = ({ customFieldRenderer, matchSuffix = '' }: Props) => { + const groupedMatches = useMatchedFieldsForList('fieldLabels'); + + return ( + <> + {groupedMatches.length > 0 ? ( + + {groupedMatches.map((groupedMatch) => { + return ( + + + } + /> + + ); + })} + + ) : null} + + ); +}; diff --git a/datahub-web-react/src/app/search/matches/SearchTextHighlighter.tsx b/datahub-web-react/src/app/search/matches/SearchTextHighlighter.tsx new file mode 100644 index 0000000000000..d8da1088ea89d --- /dev/null +++ b/datahub-web-react/src/app/search/matches/SearchTextHighlighter.tsx @@ -0,0 +1,42 @@ +import React from 'react'; +import Highlight from 'react-highlighter'; +import styled from 'styled-components'; +import { useMatchedFieldsByGroup } from '../context/SearchResultContext'; +import { useSearchQuery } from '../context/SearchContext'; +import { MatchedFieldName } from './constants'; +import { useAppConfig } from '../../useAppConfig'; + +type Props = { + field: MatchedFieldName; + text: string; + enableFullHighlight?: boolean; +}; + +const HIGHLIGHT_ALL_PATTERN = /.*/; + +const StyledHighlight = styled(Highlight).attrs((props) => ({ + matchStyle: { background: props.theme.styles['highlight-color'] }, +}))``; + +const SearchTextHighlighter = ({ field, text, enableFullHighlight = false }: Props) => { + const appConfig = useAppConfig(); + const enableNameHighlight = appConfig.config.visualConfig.searchResult?.enableNameHighlight; + const matchedFields = useMatchedFieldsByGroup(field); + const hasMatchedField = !!matchedFields?.length; + const normalizedSearchQuery = useSearchQuery()?.trim().toLowerCase(); + const normalizedText = text.trim().toLowerCase(); + const hasSubstring = hasMatchedField && !!normalizedSearchQuery && normalizedText.includes(normalizedSearchQuery); + const pattern = enableFullHighlight ? HIGHLIGHT_ALL_PATTERN : undefined; + + return ( + <> + {enableNameHighlight && hasMatchedField ? 
( + {text} + ) : ( + text + )} + + ); +}; + +export default SearchTextHighlighter; diff --git a/datahub-web-react/src/app/search/matches/constants.ts b/datahub-web-react/src/app/search/matches/constants.ts new file mode 100644 index 0000000000000..25ca82eef9597 --- /dev/null +++ b/datahub-web-react/src/app/search/matches/constants.ts @@ -0,0 +1,129 @@ +import { EntityType, MatchedField } from '../../../types.generated'; + +export type MatchedFieldName = + | 'urn' + | 'name' + | 'displayName' + | 'title' + | 'description' + | 'editedDescription' + | 'editedFieldDescriptions' + | 'fieldDescriptions' + | 'tags' + | 'fieldTags' + | 'editedFieldTags' + | 'glossaryTerms' + | 'fieldGlossaryTerms' + | 'editedFieldGlossaryTerms' + | 'fieldLabels' + | 'fieldPaths'; + +export type MatchedFieldConfig = { + name: MatchedFieldName; + groupInto?: MatchedFieldName; + label: string; + showInMatchedFieldList?: boolean; +}; + +const DEFAULT_MATCHED_FIELD_CONFIG: Array = [ + { + name: 'urn', + label: 'urn', + }, + { + name: 'title', + label: 'title', + }, + { + name: 'displayName', + groupInto: 'name', + label: 'display name', + }, + { + name: 'name', + groupInto: 'name', + label: 'name', + }, + { + name: 'editedDescription', + groupInto: 'description', + label: 'description', + }, + { + name: 'description', + groupInto: 'description', + label: 'description', + }, + { + name: 'editedFieldDescriptions', + groupInto: 'fieldDescriptions', + label: 'column description', + showInMatchedFieldList: true, + }, + { + name: 'fieldDescriptions', + groupInto: 'fieldDescriptions', + label: 'column description', + showInMatchedFieldList: true, + }, + { + name: 'tags', + label: 'tag', + }, + { + name: 'editedFieldTags', + groupInto: 'fieldTags', + label: 'column tag', + showInMatchedFieldList: true, + }, + { + name: 'fieldTags', + groupInto: 'fieldTags', + label: 'column tag', + showInMatchedFieldList: true, + }, + { + name: 'glossaryTerms', + label: 'term', + }, + { + name: 'editedFieldGlossaryTerms', + groupInto: 'fieldGlossaryTerms', + label: 'column term', + showInMatchedFieldList: true, + }, + { + name: 'fieldGlossaryTerms', + groupInto: 'fieldGlossaryTerms', + label: 'column term', + showInMatchedFieldList: true, + }, + { + name: 'fieldLabels', + label: 'label', + showInMatchedFieldList: true, + }, + { + name: 'fieldPaths', + label: 'column', + showInMatchedFieldList: true, + }, +]; + +export const CHART_DASHBOARD_FIELD_CONFIG: Array = DEFAULT_MATCHED_FIELD_CONFIG.map((config) => { + if (config.name === 'title') return { ...config, groupInto: 'name' }; + return config; +}); + +export const MATCHED_FIELD_CONFIG = { + [EntityType.Chart]: CHART_DASHBOARD_FIELD_CONFIG, + [EntityType.Dashboard]: CHART_DASHBOARD_FIELD_CONFIG, + DEFAULT: DEFAULT_MATCHED_FIELD_CONFIG, +} as const; + +export type MatchesGroupedByFieldName = { + fieldName: string; + matchedFields: Array; +}; + +export const HIGHLIGHTABLE_ENTITY_TYPES = [EntityType.Tag, EntityType.GlossaryTerm]; diff --git a/datahub-web-react/src/app/search/matches/matchedFieldPathsRenderer.tsx b/datahub-web-react/src/app/search/matches/matchedFieldPathsRenderer.tsx new file mode 100644 index 0000000000000..0a33530552864 --- /dev/null +++ b/datahub-web-react/src/app/search/matches/matchedFieldPathsRenderer.tsx @@ -0,0 +1,8 @@ +import React from 'react'; + +import { MatchedField } from '../../../types.generated'; +import { downgradeV2FieldPath } from '../../entity/dataset/profile/schema/utils/utils'; + +export const matchedFieldPathsRenderer = (matchedField: MatchedField) => { + 
return matchedField?.name === 'fieldPaths' ? {downgradeV2FieldPath(matchedField.value)} : null; +}; diff --git a/datahub-web-react/src/app/search/matches/matchedInputFieldRenderer.tsx b/datahub-web-react/src/app/search/matches/matchedInputFieldRenderer.tsx new file mode 100644 index 0000000000000..25634c9e8b80e --- /dev/null +++ b/datahub-web-react/src/app/search/matches/matchedInputFieldRenderer.tsx @@ -0,0 +1,40 @@ +import React from 'react'; + +import { Chart, Dashboard, EntityType, GlossaryTerm, MatchedField } from '../../../types.generated'; +import { useEntityRegistry } from '../../useEntityRegistry'; + +const LABEL_INDEX_NAME = 'fieldLabels'; +const TYPE_PROPERTY_KEY_NAME = 'type'; + +const TermName = ({ term }: { term: GlossaryTerm }) => { + const entityRegistry = useEntityRegistry(); + return <>{entityRegistry.getDisplayName(EntityType.GlossaryTerm, term)}; +}; + +export const matchedInputFieldRenderer = (matchedField: MatchedField, entity: Chart | Dashboard) => { + if (matchedField?.name === LABEL_INDEX_NAME) { + const matchedSchemaField = entity.inputFields?.fields?.find( + (field) => field?.schemaField?.label === matchedField.value, + ); + const matchedGlossaryTerm = matchedSchemaField?.schemaField?.glossaryTerms?.terms?.find( + (term) => term?.term?.name === matchedField.value, + ); + + if (matchedGlossaryTerm) { + let termType = 'term'; + const typeProperty = matchedGlossaryTerm.term.properties?.customProperties?.find( + (property) => property.key === TYPE_PROPERTY_KEY_NAME, + ); + if (typeProperty) { + termType = typeProperty.value || termType; + } + + return ( + <> + {termType} + + ); + } + } + return null; +}; diff --git a/datahub-web-react/src/app/search/matches/utils.test.ts b/datahub-web-react/src/app/search/matches/utils.test.ts new file mode 100644 index 0000000000000..8b5ed27f5c2ad --- /dev/null +++ b/datahub-web-react/src/app/search/matches/utils.test.ts @@ -0,0 +1,110 @@ +import { EntityType } from '../../../types.generated'; +import { getMatchesPrioritized } from './utils'; + +const mapping = new Map(); +mapping.set('fieldPaths', 'column'); +mapping.set('fieldDescriptions', 'column description'); +mapping.set('fieldTags', 'column tag'); + +const MOCK_MATCHED_FIELDS = [ + { + name: 'fieldPaths', + value: 'rain', + }, + { + name: 'fieldDescriptions', + value: 'rainbow', + }, + { + name: 'fieldPaths', + value: 'rainbow', + }, + { + name: 'fieldPaths', + value: 'rainbows', + }, +]; + +const MOCK_MATCHED_DESCRIPTION_FIELDS = [ + { + name: 'editedDescription', + value: 'edited description value', + }, + { + name: 'description', + value: 'description value', + }, + { + name: 'fieldDescriptions', + value: 'field descriptions value', + }, + { + name: 'editedFieldDescriptions', + value: 'edited field descriptions value', + }, +]; + +describe('utils', () => { + describe('getMatchPrioritizingPrimary', () => { + it('prioritizes exact match', () => { + global.window.location.search = 'query=rainbow'; + const groupedMatches = getMatchesPrioritized(EntityType.Dataset, MOCK_MATCHED_FIELDS, 'fieldPaths'); + expect(groupedMatches).toEqual([ + { + fieldName: 'fieldPaths', + matchedFields: [ + { name: 'fieldPaths', value: 'rainbow' }, + { name: 'fieldPaths', value: 'rainbows' }, + { name: 'fieldPaths', value: 'rain' }, + ], + }, + { + fieldName: 'fieldDescriptions', + matchedFields: [{ name: 'fieldDescriptions', value: 'rainbow' }], + }, + ]); + }); + it('will accept first contains match', () => { + global.window.location.search = 'query=bow'; + const groupedMatches = 
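The two renderers above follow the same contract: take a `MatchedField`, return JSX when they know how to present it (`fieldPaths` values get downgraded from v2 paths; `fieldLabels` values on charts and dashboards are resolved back to the glossary term attached to the matching input field), and return `null` otherwise. A hedged sketch of how a caller such as the matched-field list likely dispatches to them, with the raw value as a fallback; the exact call site is not visible in this excerpt:

```typescript
import React from 'react';
import { Chart, Dashboard, MatchedField } from '../../../types.generated';
import { matchedFieldPathsRenderer } from './matchedFieldPathsRenderer';
import { matchedInputFieldRenderer } from './matchedInputFieldRenderer';

// Try each custom renderer; fall back to the raw matched value when none applies.
export const renderMatchedValue = (field: MatchedField, entity?: Chart | Dashboard): React.ReactNode => {
    const custom =
        matchedFieldPathsRenderer(field) ?? (entity ? matchedInputFieldRenderer(field, entity) : null);
    return custom ?? field.value;
};
```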
getMatchesPrioritized(EntityType.Dataset, MOCK_MATCHED_FIELDS, 'fieldPaths'); + expect(groupedMatches).toEqual([ + { + fieldName: 'fieldPaths', + matchedFields: [ + { name: 'fieldPaths', value: 'rainbow' }, + { name: 'fieldPaths', value: 'rainbows' }, + { name: 'fieldPaths', value: 'rain' }, + ], + }, + { + fieldName: 'fieldDescriptions', + matchedFields: [{ name: 'fieldDescriptions', value: 'rainbow' }], + }, + ]); + }); + it('will group by field name', () => { + global.window.location.search = ''; + const groupedMatches = getMatchesPrioritized( + EntityType.Dataset, + MOCK_MATCHED_DESCRIPTION_FIELDS, + 'fieldPaths', + ); + expect(groupedMatches).toEqual([ + { + fieldName: 'description', + matchedFields: [ + { name: 'editedDescription', value: 'edited description value' }, + { name: 'description', value: 'description value' }, + ], + }, + { + fieldName: 'fieldDescriptions', + matchedFields: [ + { name: 'fieldDescriptions', value: 'field descriptions value' }, + { name: 'editedFieldDescriptions', value: 'edited field descriptions value' }, + ], + }, + ]); + }); + }); +}); diff --git a/datahub-web-react/src/app/search/matches/utils.ts b/datahub-web-react/src/app/search/matches/utils.ts new file mode 100644 index 0000000000000..78c62f7eef458 --- /dev/null +++ b/datahub-web-react/src/app/search/matches/utils.ts @@ -0,0 +1,136 @@ +import * as QueryString from 'query-string'; +import { EntityType, MatchedField } from '../../../types.generated'; +import { + HIGHLIGHTABLE_ENTITY_TYPES, + MATCHED_FIELD_CONFIG, + MatchedFieldConfig, + MatchedFieldName, + MatchesGroupedByFieldName, +} from './constants'; + +const getFieldConfigsByEntityType = (entityType: EntityType | undefined): Array => { + return entityType && entityType in MATCHED_FIELD_CONFIG + ? MATCHED_FIELD_CONFIG[entityType] + : MATCHED_FIELD_CONFIG.DEFAULT; +}; + +export const shouldShowInMatchedFieldList = (entityType: EntityType | undefined, field: MatchedField): boolean => { + const configs = getFieldConfigsByEntityType(entityType); + return configs.some((config) => config.name === field.name && config.showInMatchedFieldList); +}; + +export const getMatchedFieldLabel = (entityType: EntityType | undefined, fieldName: string): string => { + const configs = getFieldConfigsByEntityType(entityType); + return configs.find((config) => config.name === fieldName)?.label ?? 
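The helpers that follow in utils.ts are what the unit tests above exercise. `shouldShowInMatchedFieldList` and `getMatchedFieldLabel` are the display-side pair: the first filters matches down to fields the config wants surfaced, the second turns a raw field name into the label from the table. A hedged sketch of combining them into the short "Matches ..." strings a result card might show; the wrapper function here is assumed, only the two helpers come from this patch:

```typescript
import { EntityType, MatchedField } from '../../../types.generated';
import { getMatchedFieldLabel, shouldShowInMatchedFieldList } from './utils';

// Build human-readable summaries for the matches the config flags for display.
export const describeMatches = (entityType: EntityType, fields: MatchedField[]): string[] =>
    fields
        .filter((field) => shouldShowInMatchedFieldList(entityType, field))
        .map((field) => `Matches ${getMatchedFieldLabel(entityType, field.name)} '${field.value}'`);
```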
''; +}; + +export const getGroupedFieldName = ( + entityType: EntityType | undefined, + fieldName: string, +): MatchedFieldName | undefined => { + const configs = getFieldConfigsByEntityType(entityType); + const fieldConfig = configs.find((config) => config.name === fieldName); + return fieldConfig?.groupInto; +}; + +export const getMatchedFieldNames = ( + entityType: EntityType | undefined, + fieldName: MatchedFieldName, +): Array => { + return getFieldConfigsByEntityType(entityType) + .filter((config) => fieldName === config.groupInto || fieldName === config.name) + .map((field) => field.name); +}; + +export const getMatchedFieldsByNames = (fields: Array, names: Array): Array => { + return fields.filter((field) => names.includes(field.name)); +}; + +export const getMatchedFieldsByUrn = (fields: Array, urn: string): Array => { + return fields.filter((field) => field.value === urn); +}; + +function normalize(value: string) { + return value.trim().toLowerCase(); +} + +function fromQueryGetBestMatch( + selectedMatchedFields: MatchedField[], + rawQuery: string, + prioritizedField: string, +): Array { + const query = normalize(rawQuery); + const priorityMatches: Array = selectedMatchedFields.filter( + (field) => field.name === prioritizedField, + ); + const nonPriorityMatches: Array = selectedMatchedFields.filter( + (field) => field.name !== prioritizedField, + ); + const exactMatches: Array = []; + const containedMatches: Array = []; + const rest: Array = []; + + [...priorityMatches, ...nonPriorityMatches].forEach((field) => { + const normalizedValue = normalize(field.value); + if (normalizedValue === query) exactMatches.push(field); + else if (normalizedValue.includes(query)) containedMatches.push(field); + else rest.push(field); + }); + + return [...exactMatches, ...containedMatches, ...rest]; +} + +const getMatchesGroupedByFieldName = ( + entityType: EntityType, + matchedFields: Array, +): Array => { + const fieldNameToMatches = new Map>(); + const fieldNames: Array = []; + matchedFields.forEach((field) => { + const groupedFieldName = getGroupedFieldName(entityType, field.name) || field.name; + const matchesInMap = fieldNameToMatches.get(groupedFieldName); + if (matchesInMap) { + matchesInMap.push(field); + } else { + fieldNameToMatches.set(groupedFieldName, [field]); + fieldNames.push(groupedFieldName); + } + }); + return fieldNames.map((fieldName) => ({ + fieldName, + matchedFields: fieldNameToMatches.get(fieldName) ?? [], + })); +}; + +export const getMatchesPrioritized = ( + entityType: EntityType, + matchedFields: MatchedField[], + prioritizedField: string, +): Array => { + const { location } = window; + const params = QueryString.parse(location.search, { arrayFormat: 'comma' }); + const query: string = decodeURIComponent(params.query ? 
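`getMatchesPrioritized`, completed just below, is the main entry point: it reads the current query from `window.location`, orders matches so the prioritized field comes first and exact hits sort before substring hits before everything else, then groups the result by the config's `groupInto` names. A hedged usage sketch from a result card's point of view; the sample matches are illustrative:

```typescript
import { EntityType, MatchedField } from '../../../types.generated';
import { getMatchesPrioritized } from './utils';

// Illustrative input: a column-name hit and a description hit on the same dataset.
const sampleMatches: MatchedField[] = [
    { name: 'fieldPaths', value: 'rain' } as MatchedField,
    { name: 'description', value: 'hourly rainfall totals' } as MatchedField,
];

const grouped = getMatchesPrioritized(EntityType.Dataset, sampleMatches, 'fieldPaths');
// => usually [{ fieldName: 'fieldPaths', ... }, { fieldName: 'description', ... }],
//    unless the description is the closer match to the current search query.
```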
(params.query as string) : ''); + const matches = fromQueryGetBestMatch(matchedFields, query, prioritizedField); + return getMatchesGroupedByFieldName(entityType, matches); +}; + +export const isHighlightableEntityField = (field: MatchedField) => + !!field.entity && HIGHLIGHTABLE_ENTITY_TYPES.includes(field.entity.type); + +export const isDescriptionField = (field: MatchedField) => field.name.toLowerCase().includes('description'); + +const SURROUNDING_DESCRIPTION_CHARS = 10; +const MAX_DESCRIPTION_CHARS = 50; + +export const getDescriptionSlice = (text: string, target: string) => { + const queryIndex = text.indexOf(target); + const start = Math.max(0, queryIndex - SURROUNDING_DESCRIPTION_CHARS); + const end = Math.min( + start + MAX_DESCRIPTION_CHARS, + text.length, + queryIndex + target.length + SURROUNDING_DESCRIPTION_CHARS, + ); + const startEllipsis = start > 0 ? '...' : ''; + const endEllipsis = end < text.length ? '...' : ''; + return `${startEllipsis}${text.slice(start, end)}${endEllipsis}`; +}; diff --git a/datahub-web-react/src/app/shared/tags/tag/Tag.tsx b/datahub-web-react/src/app/shared/tags/tag/Tag.tsx index 2288238091776..ed2460b6eea3c 100644 --- a/datahub-web-react/src/app/shared/tags/tag/Tag.tsx +++ b/datahub-web-react/src/app/shared/tags/tag/Tag.tsx @@ -8,6 +8,7 @@ import { StyledTag } from '../../../entity/shared/components/styled/StyledTag'; import { HoverEntityTooltip } from '../../../recommendations/renderer/component/HoverEntityTooltip'; import { useEntityRegistry } from '../../../useEntityRegistry'; import { TagProfileDrawer } from '../TagProfileDrawer'; +import { useHasMatchedFieldByUrn } from '../../../search/context/SearchResultContext'; const TagLink = styled.span` display: inline-block; @@ -41,6 +42,7 @@ export default function Tag({ }: Props) { const entityRegistry = useEntityRegistry(); const [removeTagMutation] = useRemoveTagMutation(); + const highlightTag = useHasMatchedFieldByUrn(tag.tag.urn, 'tags'); const [tagProfileDrawerVisible, setTagProfileDrawerVisible] = useState(false); const [addTagUrn, setAddTagUrn] = useState(''); @@ -110,6 +112,7 @@ export default function Tag({ removeTag(tag); }} fontSize={fontSize} + highlightTag={highlightTag} > ` +const StyledTag = styled(Tag)<{ fontSize?: number; highlightTerm?: boolean }>` + &&& { + ${(props) => + props.highlightTerm && + ` + background: ${props.theme.styles['highlight-color']}; + border: 1px solid ${props.theme.styles['highlight-border-color']}; + `} + } ${(props) => props.fontSize && `font-size: ${props.fontSize}px;`} `; @@ -38,6 +47,7 @@ export default function TermContent({ }: Props) { const entityRegistry = useEntityRegistry(); const [removeTermMutation] = useRemoveTermMutation(); + const highlightTerm = useHasMatchedFieldByUrn(term.term.urn, 'glossaryTerms'); const removeTerm = (termToRemove: GlossaryTermAssociation) => { onOpenModal?.(); @@ -85,6 +95,7 @@ export default function TermContent({ removeTerm(term); }} fontSize={fontSize} + highlightTerm={highlightTerm} > diff --git a/datahub-web-react/src/appConfigContext.tsx b/datahub-web-react/src/appConfigContext.tsx index 3b34b108ecc93..807a17c4fd6a4 100644 --- a/datahub-web-react/src/appConfigContext.tsx +++ b/datahub-web-react/src/appConfigContext.tsx @@ -27,6 +27,9 @@ export const DEFAULT_APP_CONFIG = { entityProfile: { domainDefaultTab: null, }, + searchResult: { + enableNameHighlight: false, + }, }, authConfig: { tokenAuthEnabled: false, diff --git a/datahub-web-react/src/conf/theme/theme_dark.config.json 
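`getDescriptionSlice` keeps long descriptions readable in the matches section: it takes a small window around the first occurrence of the matched text and adds ellipses only on the sides that were actually trimmed, using the two constants defined above it. A hedged worked example (the sample text is illustrative):

```typescript
import { getDescriptionSlice } from './utils';

// 10 characters of context on each side, at most 50 characters of sliced text.
const snippet = getDescriptionSlice('table of hourly rainfall measurements', 'rainfall');
// snippet === '...of hourly rainfall measureme...'
```

Callers would pair this with `isDescriptionField` to decide when to show the trimmed snippet instead of the raw value.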
b/datahub-web-react/src/conf/theme/theme_dark.config.json index b648f3d997f21..9746c3ddde5f3 100644 --- a/datahub-web-react/src/conf/theme/theme_dark.config.json +++ b/datahub-web-react/src/conf/theme/theme_dark.config.json @@ -17,7 +17,9 @@ "disabled-color": "fade(white, 25%)", "steps-nav-arrow-color": "fade(white, 25%)", "homepage-background-upper-fade": "#FFFFFF", - "homepage-background-lower-fade": "#333E4C" + "homepage-background-lower-fade": "#333E4C", + "highlight-color": "#E6F4FF", + "highlight-border-color": "#BAE0FF" }, "assets": { "logoUrl": "/assets/logo.png" diff --git a/datahub-web-react/src/conf/theme/theme_light.config.json b/datahub-web-react/src/conf/theme/theme_light.config.json index e842fdb1bb8aa..906c04e38a1ba 100644 --- a/datahub-web-react/src/conf/theme/theme_light.config.json +++ b/datahub-web-react/src/conf/theme/theme_light.config.json @@ -20,7 +20,9 @@ "homepage-background-lower-fade": "#FFFFFF", "homepage-text-color": "#434343", "box-shadow": "0px 0px 30px 0px rgb(239 239 239)", - "box-shadow-hover": "0px 1px 0px 0.5px rgb(239 239 239)" + "box-shadow-hover": "0px 1px 0px 0.5px rgb(239 239 239)", + "highlight-color": "#E6F4FF", + "highlight-border-color": "#BAE0FF" }, "assets": { "logoUrl": "/assets/logo.png" diff --git a/datahub-web-react/src/conf/theme/types.ts b/datahub-web-react/src/conf/theme/types.ts index 98140cbbd553d..7d78230092700 100644 --- a/datahub-web-react/src/conf/theme/types.ts +++ b/datahub-web-react/src/conf/theme/types.ts @@ -18,6 +18,8 @@ export type Theme = { 'homepage-background-lower-fade': string; 'box-shadow': string; 'box-shadow-hover': string; + 'highlight-color': string; + 'highlight-border-color': string; }; assets: { logoUrl: string; diff --git a/datahub-web-react/src/graphql/app.graphql b/datahub-web-react/src/graphql/app.graphql index 4b1295f1024a2..bf15e5f757f8f 100644 --- a/datahub-web-react/src/graphql/app.graphql +++ b/datahub-web-react/src/graphql/app.graphql @@ -45,6 +45,9 @@ query appConfig { defaultTab } } + searchResult { + enableNameHighlight + } } telemetryConfig { enableThirdPartyLogging diff --git a/datahub-web-react/src/graphql/search.graphql b/datahub-web-react/src/graphql/search.graphql index 172a6d957e287..7d6d7ef109e16 100644 --- a/datahub-web-react/src/graphql/search.graphql +++ b/datahub-web-react/src/graphql/search.graphql @@ -832,6 +832,11 @@ fragment searchResults on SearchResults { matchedFields { name value + entity { + urn + type + ...entityDisplayNameFields + } } insights { text diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/SearchUtils.java b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/SearchUtils.java index 35a322d37b2fd..8b56ae0beb3f1 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/SearchUtils.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/SearchUtils.java @@ -78,7 +78,7 @@ public static Map getRequestMap(@Nullable Filter requestParams) return criterionArray.stream().collect(Collectors.toMap(Criterion::getField, Criterion::getValue)); } - static boolean isUrn(@Nonnull String value) { + public static boolean isUrn(@Nonnull String value) { // TODO(https://github.com/datahub-project/datahub-gma/issues/51): This method is a bit of a hack to support searching for // URNs that have commas in them, while also using commas a delimiter for search. We should stop supporting commas // as delimiter, and then we can stop using this hack. 
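With `entity { urn type }` now returned for each matched field (search.graphql above) and `isUrn` exposed publicly, match values that are urns can be resolved to real tags and glossary terms, which is what `HIGHLIGHTABLE_ENTITY_TYPES`, `getMatchedFieldsByUrn`, and the new `highlight-color` / `highlight-border-color` theme tokens are for. A hedged sketch of the urn check that the Tag and Term chips use via `useHasMatchedFieldByUrn`; the real hook lives in SearchResultContext and is not shown in this excerpt, so the shape below is an approximation built only from the utils in this patch:

```typescript
import { MatchedField } from '../../../types.generated';
import { getMatchedFieldNames, getMatchedFieldsByNames, getMatchedFieldsByUrn } from '../matches/utils';

// Was this specific tag/term what the search matched? If so, the chip gets the
// highlight background and border from the theme tokens added in this patch.
export const hasMatchedFieldByUrn = (
    matchedFields: MatchedField[],
    urn: string,
    fieldName: 'tags' | 'glossaryTerms',
): boolean => {
    const names = getMatchedFieldNames(undefined, fieldName); // config entries for this logical field
    const candidates = getMatchedFieldsByNames(matchedFields, names);
    return getMatchedFieldsByUrn(candidates, urn).length > 0;
};
```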
diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/SearchResultVisualConfig.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/SearchResultVisualConfig.java new file mode 100644 index 0000000000000..7094bbd710f75 --- /dev/null +++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/SearchResultVisualConfig.java @@ -0,0 +1,11 @@ +package com.linkedin.metadata.config; + +import lombok.Data; + +@Data +public class SearchResultVisualConfig { + /** + * The default tab to show first on a Domain entity profile. Defaults to React code sorting if not present. + */ + public Boolean enableNameHighlight; +} diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/VisualConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/VisualConfiguration.java index d1c357186e1ae..14ac2406c2256 100644 --- a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/VisualConfiguration.java +++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/VisualConfiguration.java @@ -22,4 +22,9 @@ public class VisualConfiguration { * Queries tab related configurations */ public EntityProfileConfig entityProfile; + + /** + * Search result related configurations + */ + public SearchResultVisualConfig searchResult; } diff --git a/metadata-service/configuration/src/main/resources/application.yml b/metadata-service/configuration/src/main/resources/application.yml index 82cf9e8fdc8a7..d21442d0bf5c8 100644 --- a/metadata-service/configuration/src/main/resources/application.yml +++ b/metadata-service/configuration/src/main/resources/application.yml @@ -111,6 +111,8 @@ visualConfig: entityProfile: # we only support default tab for domains right now. In order to implement for other entities, update React code domainDefaultTab: ${DOMAIN_DEFAULT_TAB:} # set to DOCUMENTATION_TAB to show documentation tab first + searchResult: + enableNameHighlight: ${SEARCH_RESULT_NAME_HIGHLIGHT_ENABLED:true} # Enables visual highlighting on search result names/descriptions. # Storage Layer From d15f080a457d49eff2a4f5df7bc3b54fb9b4e473 Mon Sep 17 00:00:00 2001 From: Joshua Eilers Date: Thu, 24 Aug 2023 11:12:54 -0700 Subject: [PATCH 18/20] Add links to glossary term cards without counts (#8705) --- .../entity/glossaryTerm/preview/Preview.tsx | 5 +++ .../profile/GlossaryRelatedEntity.tsx | 2 +- .../src/app/entity/glossaryTerm/utils.ts | 4 ++ .../app/entity/shared/ExternalUrlButton.tsx | 34 ++--------------- .../src/app/entity/shared/UrlButton.tsx | 37 +++++++++++++++++++ .../src/app/preview/DefaultPreviewCard.tsx | 3 ++ 6 files changed, 54 insertions(+), 31 deletions(-) create mode 100644 datahub-web-react/src/app/entity/shared/UrlButton.tsx diff --git a/datahub-web-react/src/app/entity/glossaryTerm/preview/Preview.tsx b/datahub-web-react/src/app/entity/glossaryTerm/preview/Preview.tsx index 26d3cf456ab7a..b6802e37652cb 100644 --- a/datahub-web-react/src/app/entity/glossaryTerm/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/glossaryTerm/preview/Preview.tsx @@ -4,6 +4,8 @@ import { Deprecation, Domain, EntityType, Owner, ParentNodesResult } from '../.. 
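The server-side half of the highlight flag: `SEARCH_RESULT_NAME_HIGHLIGHT_ENABLED` feeds `visualConfig.searchResult.enableNameHighlight` in application.yml (default true), which is surfaced through the appConfig GraphQL query and lands in the React `DEFAULT_APP_CONFIG` (default false until the query resolves). A hedged sketch of reading it on the client; the `useAppConfig` hook name follows existing datahub-web-react conventions and is an assumption here, not something added by this patch:

```typescript
import { useAppConfig } from '../../useAppConfig';

// False until the appConfig query returns; after that the server value (default true,
// controlled by SEARCH_RESULT_NAME_HIGHLIGHT_ENABLED) decides whether names and
// descriptions in search results are highlighted.
export const useNameHighlightEnabled = (): boolean => {
    const appConfig = useAppConfig();
    return appConfig.config?.visualConfig?.searchResult?.enableNameHighlight ?? false;
};
```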
import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; import { useEntityRegistry } from '../../../useEntityRegistry'; import { IconStyleType, PreviewType } from '../../Entity'; +import UrlButton from '../../shared/UrlButton'; +import { getRelatedEntitiesUrl } from '../utils'; export const Preview = ({ urn, @@ -39,6 +41,9 @@ export const Preview = ({ deprecation={deprecation} parentNodes={parentNodes} domain={domain} + entityTitleSuffix={ + View Related Entities + } /> ); }; diff --git a/datahub-web-react/src/app/entity/glossaryTerm/profile/GlossaryRelatedEntity.tsx b/datahub-web-react/src/app/entity/glossaryTerm/profile/GlossaryRelatedEntity.tsx index d0e8de0928b48..098e97e526fd8 100644 --- a/datahub-web-react/src/app/entity/glossaryTerm/profile/GlossaryRelatedEntity.tsx +++ b/datahub-web-react/src/app/entity/glossaryTerm/profile/GlossaryRelatedEntity.tsx @@ -5,7 +5,7 @@ import { EmbeddedListSearchSection } from '../../shared/components/styled/search import { useEntityData } from '../../shared/EntityContext'; export default function GlossaryRelatedEntity() { - const { entityData }: any = useEntityData(); + const { entityData } = useEntityData(); const entityUrn = entityData?.urn; diff --git a/datahub-web-react/src/app/entity/glossaryTerm/utils.ts b/datahub-web-react/src/app/entity/glossaryTerm/utils.ts index 3a2a3d35a8126..cbfa76fa34866 100644 --- a/datahub-web-react/src/app/entity/glossaryTerm/utils.ts +++ b/datahub-web-react/src/app/entity/glossaryTerm/utils.ts @@ -6,3 +6,7 @@ export function sortGlossaryTerms(entityRegistry: EntityRegistry, nodeA?: Entity const nodeBName = entityRegistry.getDisplayName(EntityType.GlossaryTerm, nodeB) || ''; return nodeAName.localeCompare(nodeBName); } + +export function getRelatedEntitiesUrl(entityRegistry: EntityRegistry, urn: string) { + return `${entityRegistry.getEntityUrl(EntityType.GlossaryTerm, urn)}/${encodeURIComponent('Related Entities')}`; +} diff --git a/datahub-web-react/src/app/entity/shared/ExternalUrlButton.tsx b/datahub-web-react/src/app/entity/shared/ExternalUrlButton.tsx index 9677af0776604..dce74c02cdb34 100644 --- a/datahub-web-react/src/app/entity/shared/ExternalUrlButton.tsx +++ b/datahub-web-react/src/app/entity/shared/ExternalUrlButton.tsx @@ -1,28 +1,11 @@ -import { ArrowRightOutlined } from '@ant-design/icons'; -import { Button } from 'antd'; import React from 'react'; -import styled from 'styled-components/macro'; import { EntityType } from '../../../types.generated'; import analytics, { EventType, EntityActionType } from '../../analytics'; +import UrlButton from './UrlButton'; const GITHUB_LINK = 'github.com'; const GITHUB = 'GitHub'; -const ExternalUrlWrapper = styled.span` - font-size: 12px; -`; - -const StyledButton = styled(Button)` - > :hover { - text-decoration: underline; - } - &&& { - padding-bottom: 0px; - } - padding-left: 12px; - padding-right: 12px; -`; - interface Props { externalUrl: string; platformName?: string; @@ -46,17 +29,8 @@ export default function ExternalUrlButton({ externalUrl, platformName, entityTyp } return ( - - - {displayedName ? `View in ${displayedName}` : 'View link'}{' '} - - - + + {displayedName ? 
`View in ${displayedName}` : 'View link'} + ); } diff --git a/datahub-web-react/src/app/entity/shared/UrlButton.tsx b/datahub-web-react/src/app/entity/shared/UrlButton.tsx new file mode 100644 index 0000000000000..a6f6da4a60ad5 --- /dev/null +++ b/datahub-web-react/src/app/entity/shared/UrlButton.tsx @@ -0,0 +1,37 @@ +import React, { ReactNode } from 'react'; +import { ArrowRightOutlined } from '@ant-design/icons'; +import { Button } from 'antd'; +import styled from 'styled-components/macro'; + +const UrlButtonContainer = styled.span` + font-size: 12px; +`; + +const StyledButton = styled(Button)` + > :hover { + text-decoration: underline; + } + &&& { + padding-bottom: 0px; + } + padding-left: 12px; + padding-right: 12px; +`; + +interface Props { + href: string; + children: ReactNode; + onClick?: () => void; +} + +const NOOP = () => {}; + +export default function UrlButton({ href, children, onClick = NOOP }: Props) { + return ( + + + {children} + + + ); +} diff --git a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx index 5c7500f0bcf79..0d0a32f7750a8 100644 --- a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx +++ b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx @@ -174,6 +174,7 @@ interface Props { deprecation?: Deprecation | null; topUsers?: Array | null; externalUrl?: string | null; + entityTitleSuffix?: React.ReactNode; subHeader?: React.ReactNode; snippet?: React.ReactNode; insights?: Array | null; @@ -226,6 +227,7 @@ export default function DefaultPreviewCard({ titleSizePx, dataTestID, externalUrl, + entityTitleSuffix, onClick, degree, parentContainers, @@ -306,6 +308,7 @@ export default function DefaultPreviewCard({ entityType={type} /> )} + {entityTitleSuffix} {degree !== undefined && degree !== null && ( Date: Thu, 24 Aug 2023 13:44:43 -0700 Subject: [PATCH 19/20] fix non sibling document links (#8724) --- .../Documentation/components/LinkList.tsx | 4 +- .../e2e/mutations/edit_documentation.js | 150 ++++++++++-------- 2 files changed, 90 insertions(+), 64 deletions(-) diff --git a/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx b/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx index 1aef497ced57b..bcce994c3f0f8 100644 --- a/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx +++ b/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx @@ -33,7 +33,7 @@ type LinkListProps = { }; export const LinkList = ({ refetch }: LinkListProps) => { - const { entityData } = useEntityData(); + const { urn: entityUrn, entityData } = useEntityData(); const entityRegistry = useEntityRegistry(); const [removeLinkMutation] = useRemoveLinkMutation(); const links = entityData?.institutionalMemory?.elements || []; @@ -41,7 +41,7 @@ export const LinkList = ({ refetch }: LinkListProps) => { const handleDeleteLink = async (metadata: InstitutionalMemoryMetadata) => { try { await removeLinkMutation({ - variables: { input: { linkUrl: metadata.url, resourceUrn: metadata.associatedUrn } }, + variables: { input: { linkUrl: metadata.url, resourceUrn: metadata.associatedUrn || entityUrn } }, }); message.success({ content: 'Link Removed', duration: 2 }); } catch (e: unknown) { diff --git a/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js b/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js index 1f40cdf602062..e4e5a39ce1100 100644 --- 
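The extraction in PATCH 18: the arrow-icon button markup moves into the shared `UrlButton`, so `ExternalUrlButton` keeps only the platform-name and analytics logic, and the glossary term preview can pass a "View Related Entities" link through the new `entityTitleSuffix` slot on `DefaultPreviewCard` (built with `getRelatedEntitiesUrl`). A hedged sketch of reusing the shared button for some other outbound link; the docs URL and click handler below are illustrative, not part of this change:

```typescript
import React from 'react';
import UrlButton from './UrlButton';

// Any other preview-card link can now reuse the same styled button.
export const DocsLinkButton = ({ docsUrl }: { docsUrl: string }) => (
    <UrlButton href={docsUrl} onClick={() => console.debug('docs link clicked')}>
        View Docs
    </UrlButton>
);
```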
a/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js +++ b/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js @@ -4,68 +4,94 @@ const wrong_url = "https://www.linkedincom"; const correct_url = "https://www.linkedin.com"; describe("edit documentation and link to dataset", () => { + it("open test dataset page, edit documentation", () => { + //edit documentation and verify changes saved + cy.loginWithCredentials(); + cy.visit( + "/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)/Schema" + ); + cy.get("[role='tab']").contains("Documentation").click(); + cy.waitTextVisible("my hive dataset"); + cy.waitTextVisible("Sample doc"); + cy.clickOptionWithText("Edit"); + cy.focused().clear(); + cy.focused().type(documentation_edited); + cy.get("button").contains("Save").click(); + cy.waitTextVisible("Description Updated"); + cy.waitTextVisible(documentation_edited); + //return documentation to original state + cy.clickOptionWithText("Edit"); + cy.focused().clear().wait(1000); + cy.focused().type("my hive dataset"); + cy.get("button").contains("Save").click(); + cy.waitTextVisible("Description Updated"); + cy.waitTextVisible("my hive dataset"); + }); - it("open test dataset page, edit documentation", () => { - //edit documentation and verify changes saved - cy.loginWithCredentials(); - cy.visit("/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)/Schema"); - cy.get("[role='tab']").contains("Documentation").click(); - cy.waitTextVisible("my hive dataset"); - cy.waitTextVisible("Sample doc"); - cy.clickOptionWithText("Edit"); - cy.focused().clear(); - cy.focused().type(documentation_edited); - cy.get("button").contains("Save").click(); - cy.waitTextVisible("Description Updated"); - cy.waitTextVisible(documentation_edited); - //return documentation to original state - cy.clickOptionWithText("Edit"); - cy.focused().clear().wait(1000); - cy.focused().type("my hive dataset"); - cy.get("button").contains("Save").click(); - cy.waitTextVisible("Description Updated"); - cy.waitTextVisible("my hive dataset"); - }); + it("open test dataset page, remove and add dataset link", () => { + cy.loginWithCredentials(); + cy.visit( + "/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)/Schema" + ); + cy.get("[role='tab']").contains("Documentation").click(); + cy.contains("Sample doc").trigger("mouseover", { force: true }); + cy.get('[data-icon="delete"]').click(); + cy.waitTextVisible("Link Removed"); + cy.get("button").contains("Add Link").click(); + cy.get("#addLinkForm_url").type(wrong_url); + cy.waitTextVisible("This field must be a valid url."); + cy.focused().clear(); + cy.waitTextVisible("A URL is required."); + cy.focused().type(correct_url); + cy.ensureTextNotPresent("This field must be a valid url."); + cy.get("#addLinkForm_label").type("Sample doc"); + cy.get('[role="dialog"] button').contains("Add").click(); + cy.waitTextVisible("Link Added"); + cy.get("[role='tab']").contains("Documentation").click(); + cy.get(`[href='${correct_url}']`).should("be.visible"); + }); - it("open test dataset page, remove and add dataset link", () => { - cy.loginWithCredentials(); - cy.visit("/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)/Schema"); - cy.get("[role='tab']").contains("Documentation").click(); - cy.contains("Sample doc").trigger("mouseover", { force: true }); - cy.get('[data-icon="delete"]').click(); - cy.waitTextVisible("Link Removed"); - 
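Most of the Cypress changes above and below are a Prettier-style reflow of the existing dataset spec, plus one genuinely new case: adding and removing a link on a domain page, matching the LinkList fallback fix in this same patch. The add-link steps are now duplicated across specs; a hedged sketch of a shared helper that is not part of this patch but would keep them in sync (selectors copied from the spec):

```typescript
// Hypothetical shared Cypress helper; cy.waitTextVisible is the custom command
// already used throughout these specs.
const addLinkViaModal = (url: string, label: string) => {
    cy.get('button').contains('Add Link').click();
    cy.get('#addLinkForm_url').type(url);
    cy.get('#addLinkForm_label').type(label);
    cy.get('[role="dialog"] button').contains('Add').click();
    cy.waitTextVisible('Link Added');
};
```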
cy.get("button").contains("Add Link").click(); - cy.get("#addLinkForm_url").type(wrong_url); - cy.waitTextVisible("This field must be a valid url."); - cy.focused().clear(); - cy.waitTextVisible("A URL is required."); - cy.focused().type(correct_url); - cy.ensureTextNotPresent("This field must be a valid url."); - cy.get("#addLinkForm_label").type("Sample doc"); - cy.get('[role="dialog"] button').contains("Add").click(); - cy.waitTextVisible("Link Added"); - cy.get("[role='tab']").contains("Documentation").click(); - cy.get(`[href='${correct_url}']`).should("be.visible"); - }); + it("open test domain page, remove and add dataset link", () => { + cy.loginWithCredentials(); + cy.visit("/domain/urn:li:domain:marketing/Entities"); + cy.get("[role='tab']").contains("Documentation").click(); + cy.get("button").contains("Add Link").click(); + cy.get("#addLinkForm_url").type(wrong_url); + cy.waitTextVisible("This field must be a valid url."); + cy.focused().clear(); + cy.waitTextVisible("A URL is required."); + cy.focused().type(correct_url); + cy.ensureTextNotPresent("This field must be a valid url."); + cy.get("#addLinkForm_label").type("Sample doc"); + cy.get('[role="dialog"] button').contains("Add").click(); + cy.waitTextVisible("Link Added"); + cy.get("[role='tab']").contains("Documentation").click(); + cy.get(`[href='${correct_url}']`).should("be.visible"); + cy.contains("Sample doc").trigger("mouseover", { force: true }); + cy.get('[data-icon="delete"]').click(); + cy.waitTextVisible("Link Removed"); + }); - it("edit field documentation", () => { - cy.loginWithCredentials(); - cy.visit("/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)/Schema"); - cy.get("tbody [data-icon='edit']").first().click({ force: true }); - cy.waitTextVisible("Update description"); - cy.waitTextVisible("Foo field description has changed"); - cy.focused().clear().wait(1000); - cy.focused().type(documentation_edited); - cy.get("button").contains("Update").click(); - cy.waitTextVisible("Updated!"); - cy.waitTextVisible(documentation_edited); - cy.waitTextVisible("(edited)"); - cy.get("tbody [data-icon='edit']").first().click({ force: true }); - cy.focused().clear().wait(1000); - cy.focused().type("Foo field description has changed"); - cy.get("button").contains("Update").click(); - cy.waitTextVisible("Updated!"); - cy.waitTextVisible("Foo field description has changed"); - cy.waitTextVisible("(edited)"); - }); -}); \ No newline at end of file + it("edit field documentation", () => { + cy.loginWithCredentials(); + cy.visit( + "/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)/Schema" + ); + cy.get("tbody [data-icon='edit']").first().click({ force: true }); + cy.waitTextVisible("Update description"); + cy.waitTextVisible("Foo field description has changed"); + cy.focused().clear().wait(1000); + cy.focused().type(documentation_edited); + cy.get("button").contains("Update").click(); + cy.waitTextVisible("Updated!"); + cy.waitTextVisible(documentation_edited); + cy.waitTextVisible("(edited)"); + cy.get("tbody [data-icon='edit']").first().click({ force: true }); + cy.focused().clear().wait(1000); + cy.focused().type("Foo field description has changed"); + cy.get("button").contains("Update").click(); + cy.waitTextVisible("Updated!"); + cy.waitTextVisible("Foo field description has changed"); + cy.waitTextVisible("(edited)"); + }); +}); From 86481262c2987f3c4e827d6c5cb72808c6d7fb5b Mon Sep 17 00:00:00 2001 From: John Joyce Date: Thu, 24 Aug 2023 13:55:54 -0700 
Subject: [PATCH 20/20] refactor(policies): Rename edit all privilege to edit entity (#8722) --- docs/authorization/access-policies-guide.md | 23 +++++++++++++++++-- .../authorization/PoliciesConfig.java | 4 ++-- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/docs/authorization/access-policies-guide.md b/docs/authorization/access-policies-guide.md index 5820e513a83e3..1eabb64d2878f 100644 --- a/docs/authorization/access-policies-guide.md +++ b/docs/authorization/access-policies-guide.md @@ -110,10 +110,13 @@ In the second step, we can simply select the Privileges that this Platform Polic | Manage Tags | Allow the actor to create and remove any Tags | | Manage Public Views | Allow the actor to create, edit, and remove any public (shared) Views. | | Manage Ownership Types | Allow the actor to create, edit, and remove any Ownership Types. | +| Manage Platform Settings | (Acryl DataHub only) Allow the actor to manage global integrations and notification settings | +| Manage Monitors | (Acryl DataHub only) Allow the actor to create, remove, start, or stop any entity assertion monitors | | Restore Indices API[^1] | Allow the actor to restore indices for a set of entities via API | | Enable/Disable Writeability API[^1] | Allow the actor to enable or disable GMS writeability for use in data migrations | | Apply Retention API[^1] | Allow the actor to apply aspect retention via API | + [^1]: Only active if REST_API_AUTHORIZATION_ENABLED environment flag is enabled #### Step 3: Choose Policy Actors @@ -204,8 +207,15 @@ The common Metadata Privileges, which span across entity types, include: | Edit Status | Allow actor to edit the status of an entity (soft deleted or not). | | Edit Domain | Allow actor to edit the Domain of an entity. | | Edit Deprecation | Allow actor to edit the Deprecation status of an entity. | -| Edit Assertions | Allow actor to add and remove assertions from an entity. | -| Edit All | Allow actor to edit any information about an entity. Super user privileges. Controls the ability to ingest using API when REST API Authorization is enabled. | +| Edit Lineage | Allow actor to edit custom lineage edges for the entity. | +| Edit Data Product | Allow actor to edit the data product that an entity is part of | +| Propose Tags | (Acryl DataHub only) Allow actor to propose new Tags for the entity. | +| Propose Glossary Terms | (Acryl DataHub only) Allow actor to propose new Glossary Terms for the entity. | +| Propose Documentation | (Acryl DataHub only) Allow actor to propose new Documentation for the entity. | +| Manage Tag Proposals | (Acryl DataHub only) Allow actor to accept or reject proposed Tags for the entity. | +| Manage Glossary Terms Proposals | (Acryl DataHub only) Allow actor to accept or reject proposed Glossary Terms for the entity. | +| Manage Documentation Proposals | (Acryl DataHub only) Allow actor to accept or reject proposed Documentation for the entity | +| Edit Entity | Allow actor to edit any information about an entity. Super user privileges. Controls the ability to ingest using API when REST API Authorization is enabled. | | Get Timeline API[^1] | Allow actor to get the timeline of an entity via API. | | Get Entity API[^1] | Allow actor to get an entity via API. | | Get Timeseries Aspect API[^1] | Allow actor to get a timeseries aspect via API. 
| @@ -225,10 +235,19 @@ The common Metadata Privileges, which span across entity types, include: | Dataset | Edit Dataset Queries | Allow actor to edit the Highlighted Queries on the Queries tab of the dataset. | | Dataset | View Dataset Usage | Allow actor to access usage metadata about a dataset both in the UI and in the GraphQL API. This includes example queries, number of queries, etc. Also applies to REST APIs when REST API Authorization is enabled. | | Dataset | View Dataset Profile | Allow actor to access a dataset's profile both in the UI and in the GraphQL API. This includes snapshot statistics like #rows, #columns, null percentage per field, etc. | +| Dataset | Edit Assertions | Allow actor to change the assertions associated with a dataset. | +| Dataset | Edit Incidents | (Acryl DataHub only) Allow actor to change the incidents associated with a dataset. | +| Dataset | Edit Monitors | (Acryl DataHub only) Allow actor to change the assertion monitors associated with a dataset. | | Tag | Edit Tag Color | Allow actor to change the color of a Tag. | | Group | Edit Group Members | Allow actor to add and remove members to a group. | +| Group | Edit Contact Information | Allow actor to change email, slack handle associated with the group. | +| Group | Manage Group Subscriptions | (Acryl DataHub only) Allow actor to subscribe the group to entities. | +| Group | Manage Group Notifications | (Acryl DataHub only) Allow actor to change notification settings for the group. | | User | Edit User Profile | Allow actor to change the user's profile including display name, bio, title, profile image, etc. | | User + Group | Edit Contact Information | Allow actor to change the contact information such as email & chat handles. | +| Term Group | Manage Direct Glossary Children | Allow actor to change the direct child Term Groups or Terms of the group. | +| Term Group | Manage All Glossary Children | Allow actor to change any direct or indirect child Term Groups or Terms of the group. | + > **Still have questions about Privileges?** Let us know in [Slack](https://slack.datahubproject.io)! diff --git a/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java b/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java index d515c1747bee4..0b0d462f079bf 100644 --- a/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java +++ b/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java @@ -198,8 +198,8 @@ public class PoliciesConfig { public static final Privilege EDIT_ENTITY_PRIVILEGE = Privilege.of( "EDIT_ENTITY", - "Edit All", - "The ability to edit any information about an entity. Super user privileges."); + "Edit Entity", + "The ability to edit any information about an entity. Super user privileges for the entity."); public static final Privilege DELETE_ENTITY_PRIVILEGE = Privilege.of( "DELETE_ENTITY",
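The practical takeaway from PATCH 20 is that only the display name shown in the policy builder changes ("Edit All" becomes "Edit Entity"); the privilege id `EDIT_ENTITY` that policies actually reference is untouched, so existing policies keep working. A hedged illustration with a simplified policy shape (field names here are approximate, not a literal API payload):

```typescript
// Simplified, illustrative policy object: the privileges array references the stable
// id, which this patch does not change; only its human-readable name was renamed.
const examplePolicy = {
    name: 'Editors can edit entity metadata',
    type: 'METADATA',
    state: 'ACTIVE',
    privileges: ['EDIT_ENTITY'],
    actors: { users: [], groups: ['urn:li:corpGroup:editors'], allUsers: false },
};
```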