-
Notifications
You must be signed in to change notification settings - Fork 2.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat(ingest/tableau): Allow parsing of database name from fullName #8981
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,14 @@ | ||
import html | ||
import logging | ||
from dataclasses import dataclass | ||
from functools import lru_cache | ||
from typing import Dict, List, Optional, Tuple | ||
|
||
from pydantic.fields import Field | ||
|
||
import datahub.emitter.mce_builder as builder | ||
from datahub.configuration.common import ConfigModel | ||
from datahub.ingestion.source import tableau_constant as tc | ||
from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( | ||
DatasetLineageType, | ||
FineGrainedLineage, | ||
|
@@ -31,6 +34,8 @@ | |
) | ||
from datahub.utilities.sqlglot_lineage import ColumnLineageInfo, SqlParsingResult | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class TableauLineageOverrides(ConfigModel): | ||
platform_override_map: Optional[Dict[str, str]] = Field( | ||
|
@@ -537,12 +542,12 @@ def get_fully_qualified_table_name( | |
platform: str, | ||
upstream_db: str, | ||
schema: str, | ||
full_name: str, | ||
table_name: str, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This was never passed full_name...:
|
||
) -> str: | ||
if platform == "athena": | ||
upstream_db = "" | ||
database_name = f"{upstream_db}." if upstream_db else "" | ||
final_name = full_name.replace("[", "").replace("]", "") | ||
final_name = table_name.replace("[", "").replace("]", "") | ||
|
||
schema_name = f"{schema}." if schema else "" | ||
|
||
|
@@ -573,17 +578,123 @@ def get_fully_qualified_table_name( | |
return fully_qualified_table_name | ||
|
||
|
||
def get_platform_instance( | ||
platform: str, platform_instance_map: Optional[Dict[str, str]] | ||
) -> Optional[str]: | ||
if platform_instance_map is not None and platform in platform_instance_map.keys(): | ||
return platform_instance_map[platform] | ||
@dataclass | ||
class TableauUpstreamReference: | ||
database: Optional[str] | ||
schema: Optional[str] | ||
table: str | ||
|
||
connection_type: str | ||
|
||
@classmethod | ||
def create( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Meant to replicate logic in |
||
cls, d: dict, default_schema_map: Optional[Dict[str, str]] = None | ||
) -> "TableauUpstreamReference": | ||
# Values directly from `table` object from Tableau | ||
database = t_database = d.get(tc.DATABASE, {}).get(tc.NAME) | ||
schema = t_schema = d.get(tc.SCHEMA) | ||
table = t_table = d.get(tc.NAME) or "" | ||
t_full_name = d.get(tc.FULL_NAME) | ||
t_connection_type = d[tc.CONNECTION_TYPE] # required to generate urn | ||
t_id = d[tc.ID] | ||
|
||
parsed_full_name = cls.parse_full_name(t_full_name) | ||
if parsed_full_name and len(parsed_full_name) == 3: | ||
database, schema, table = parsed_full_name | ||
elif parsed_full_name and len(parsed_full_name) == 2: | ||
schema, table = parsed_full_name | ||
else: | ||
logger.debug( | ||
f"Upstream urn generation ({t_id}):" | ||
f" Did not parse full name {t_full_name}: unexpected number of values", | ||
) | ||
|
||
if not schema and default_schema_map and database in default_schema_map: | ||
schema = default_schema_map[database] | ||
|
||
if database != t_database: | ||
logger.debug( | ||
f"Upstream urn generation ({t_id}):" | ||
f" replacing database {t_database} with {database} from full name {t_full_name}" | ||
) | ||
if schema != t_schema: | ||
logger.debug( | ||
f"Upstream urn generation ({t_id}):" | ||
f" replacing schema {t_schema} with {schema} from full name {t_full_name}" | ||
) | ||
if table != t_table: | ||
logger.debug( | ||
f"Upstream urn generation ({t_id}):" | ||
f" replacing table {t_table} with {table} from full name {t_full_name}" | ||
) | ||
|
||
# TODO: See if we can remove this -- made for redshift | ||
if ( | ||
schema | ||
and t_table | ||
and t_full_name | ||
and t_table == t_full_name | ||
and schema in t_table | ||
): | ||
logger.debug( | ||
f"Omitting schema for upstream table {t_id}, schema included in table name" | ||
) | ||
schema = "" | ||
|
||
return cls( | ||
database=database, | ||
schema=schema, | ||
table=table, | ||
connection_type=t_connection_type, | ||
) | ||
|
||
@staticmethod | ||
def parse_full_name(full_name: Optional[str]) -> Optional[List[str]]: | ||
# fullName is observed to be in formats: | ||
# [database].[schema].[table] | ||
# [schema].[table] | ||
# [table] | ||
# table | ||
# schema | ||
|
||
# TODO: Validate the startswith check. Currently required for our integration tests | ||
if full_name is None or not full_name.startswith("["): | ||
return None | ||
|
||
return full_name.replace("[", "").replace("]", "").split(".") | ||
|
||
def make_dataset_urn( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Basically copied from |
||
self, | ||
env: str, | ||
platform_instance_map: Optional[Dict[str, str]], | ||
lineage_overrides: Optional[TableauLineageOverrides] = None, | ||
) -> str: | ||
( | ||
upstream_db, | ||
platform_instance, | ||
platform, | ||
original_platform, | ||
) = get_overridden_info( | ||
connection_type=self.connection_type, | ||
upstream_db=self.database, | ||
lineage_overrides=lineage_overrides, | ||
platform_instance_map=platform_instance_map, | ||
) | ||
|
||
table_name = get_fully_qualified_table_name( | ||
original_platform, | ||
upstream_db or "", | ||
self.schema, | ||
self.table, | ||
) | ||
|
||
return None | ||
return builder.make_dataset_urn_with_platform_instance( | ||
platform, table_name, platform_instance, env | ||
) | ||
|
||
|
||
def get_overridden_info( | ||
connection_type: str, | ||
connection_type: Optional[str], | ||
upstream_db: Optional[str], | ||
platform_instance_map: Optional[Dict[str, str]], | ||
lineage_overrides: Optional[TableauLineageOverrides] = None, | ||
|
@@ -605,43 +716,16 @@ def get_overridden_info( | |
): | ||
upstream_db = lineage_overrides.database_override_map[upstream_db] | ||
|
||
platform_instance = get_platform_instance(original_platform, platform_instance_map) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Removed this method in favor of this one line statement |
||
platform_instance = ( | ||
platform_instance_map.get(original_platform) if platform_instance_map else None | ||
) | ||
|
||
if original_platform in ("athena", "hive", "mysql"): # Two tier databases | ||
upstream_db = None | ||
|
||
return upstream_db, platform_instance, platform, original_platform | ||
|
||
|
||
def make_table_urn( | ||
env: str, | ||
upstream_db: Optional[str], | ||
connection_type: str, | ||
schema: str, | ||
full_name: str, | ||
platform_instance_map: Optional[Dict[str, str]], | ||
lineage_overrides: Optional[TableauLineageOverrides] = None, | ||
) -> str: | ||
|
||
upstream_db, platform_instance, platform, original_platform = get_overridden_info( | ||
connection_type=connection_type, | ||
upstream_db=upstream_db, | ||
lineage_overrides=lineage_overrides, | ||
platform_instance_map=platform_instance_map, | ||
) | ||
|
||
table_name = get_fully_qualified_table_name( | ||
original_platform, | ||
upstream_db if upstream_db is not None else "", | ||
schema, | ||
full_name, | ||
) | ||
|
||
return builder.make_dataset_urn_with_platform_instance( | ||
platform, table_name, platform_instance, env | ||
) | ||
|
||
|
||
def make_description_from_params(description, formula): | ||
""" | ||
Generate column description | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I lose the lru_cache capabilities, but I don't think that matters... these are such simple and quick operations