diff --git a/metadata-ingestion/src/datahub/ingestion/source/metabase.py b/metadata-ingestion/src/datahub/ingestion/source/metabase.py index 54c5888ee3312c..4bb71151120b6f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/metabase.py +++ b/metadata-ingestion/src/datahub/ingestion/source/metabase.py @@ -1,6 +1,6 @@ from datetime import datetime, timezone from functools import lru_cache -from typing import Dict, Iterable, List, Optional, Union +from typing import Dict, Iterable, List, Optional, Tuple, Union import dateutil.parser as dp import pydantic @@ -43,6 +43,8 @@ ) from datahub.utilities import config_clean +DATASOURCE_URN_RECURSION_LIMIT = 5 + class MetabaseConfig(DatasetLineageProviderConfigBase): # See the Metabase /api/session endpoint for details @@ -327,18 +329,43 @@ def emit_card_mces(self) -> Iterable[MetadataWorkUnit]: ) return None - def construct_card_from_api_data(self, card_data: dict) -> Optional[ChartSnapshot]: - card_id = card_data.get("id", "") + def get_card_details_by_id(self, card_id: Union[int, str]) -> dict: + """ + Method will attempt to get detailed information on card + from Metabase API by card ID and return this info as dict. + If information can't be retrieved, an empty dict is returned + to unify return value of failed call with successful call of the method. + :param Union[int, str] card_id: ID of card (question) in Metabase + :param int datasource_id: Numeric datasource ID received from Metabase API + :return: dict with info or empty dict + """ card_url = f"{self.config.connect_uri}/api/card/{card_id}" try: card_response = self.session.get(card_url) card_response.raise_for_status() - card_details = card_response.json() + return card_response.json() except HTTPError as http_error: self.report.report_failure( key=f"metabase-card-{card_id}", reason=f"Unable to retrieve Card info. " f"Reason: {str(http_error)}", ) + return {} + + def construct_card_from_api_data(self, card_data: dict) -> Optional[ChartSnapshot]: + card_id = card_data.get("id") + if card_id is None: + self.report.report_failure( + key="metabase-card", + reason=f"Unable to get Card id from card data {str(card_data)}", + ) + return None + + card_details = self.get_card_details_by_id(card_id) + if not card_details: + self.report.report_failure( + key=f"metabase-card-{card_id}", + reason="Unable to construct Card due to empty card details", + ) return None chart_urn = builder.make_chart_urn(self.platform, card_id) @@ -357,7 +384,7 @@ def construct_card_from_api_data(self, card_data: dict) -> Optional[ChartSnapsho lastModified=AuditStamp(time=modified_ts, actor=modified_actor), ) - chart_type = self._get_chart_type(card_id, card_details.get("display")) + chart_type = self._get_chart_type(card_id, card_details.get("display", "")) description = card_details.get("description") or "" title = card_details.get("name") or "" datasource_urn = self.get_datasource_urn(card_details) @@ -448,7 +475,16 @@ def construct_card_custom_properties(self, card_details: dict) -> Dict: return custom_properties - def get_datasource_urn(self, card_details: dict) -> Optional[List]: + def get_datasource_urn( + self, card_details: dict, recursion_depth: int = 0 + ) -> Optional[List]: + if recursion_depth > DATASOURCE_URN_RECURSION_LIMIT: + self.report.report_failure( + key=f"metabase-card-{card_details.get('id')}", + reason="Unable to retrieve Card info. Reason: source table recursion depth exceeded", + ) + return None + ( platform, database_name, @@ -462,9 +498,19 @@ def get_datasource_urn(self, card_details: dict) -> Optional[List]: source_table_id = ( card_details.get("dataset_query", {}) .get("query", {}) - .get("source-table") + .get("source-table", "") ) - if source_table_id is not None: + if str(source_table_id).startswith("card__"): + # question is built not directly from table in DB but from results of other question in Metabase + # trying to get source table from source question. Recursion depth is limited + return self.get_datasource_urn( + card_details=self.get_card_details_by_id( + source_table_id.replace("card__", "") + ), + recursion_depth=recursion_depth + 1, + ) + elif source_table_id != "": + # the question is built directly from table in DB schema_name, table_name = self.get_source_table_from_id(source_table_id) if table_name: source_tables.add( @@ -520,7 +566,7 @@ def get_datasource_urn(self, card_details: dict) -> Optional[List]: return dataset_urn @lru_cache(maxsize=None) - def get_source_table_from_id(self, table_id): + def get_source_table_from_id(self, table_id) -> Tuple[Optional[str], Optional[str]]: try: dataset_response = self.session.get( f"{self.config.connect_uri}/api/table/{table_id}" @@ -542,8 +588,8 @@ def get_source_table_from_id(self, table_id): @lru_cache(maxsize=None) def get_platform_instance( - self, platform: Union[str, None] = None, datasource_id: Union[int, None] = None - ) -> Union[str, None]: + self, platform: Optional[str] = None, datasource_id: Optional[int] = None + ) -> Optional[str]: """ Method will attempt to detect `platform_instance` by checking `database_id_to_instance_map` and `platform_instance_map` mappings. @@ -571,7 +617,9 @@ def get_platform_instance( return platform_instance @lru_cache(maxsize=None) - def get_datasource_from_id(self, datasource_id): + def get_datasource_from_id( + self, datasource_id + ) -> Tuple[Optional[str],Optional[str],Optional[str],Optional[str]]: try: dataset_response = self.session.get( f"{self.config.connect_uri}/api/database/{datasource_id}" @@ -583,7 +631,7 @@ def get_datasource_from_id(self, datasource_id): key=f"metabase-datasource-{datasource_id}", reason=f"Unable to retrieve Datasource. " f"Reason: {str(http_error)}", ) - return None, None + return None, None, None, None # Map engine names to what datahub expects in # https://github.com/datahub-project/datahub/blob/master/metadata-service/war/src/main/resources/boot/data_platforms.json diff --git a/metadata-ingestion/tests/integration/metabase/metabase_mces_golden.json b/metadata-ingestion/tests/integration/metabase/metabase_mces_golden.json index 6e57dfaae0ce00..0ba6afbd04fc9b 100644 --- a/metadata-ingestion/tests/integration/metabase/metabase_mces_golden.json +++ b/metadata-ingestion/tests/integration/metabase/metabase_mces_golden.json @@ -115,6 +115,61 @@ "runId": "metabase-test" } }, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.ChartSnapshot": { + "urn": "urn:li:chart:(metabase,3)", + "aspects": [ + { + "com.linkedin.pegasus2avro.chart.ChartInfo": { + "customProperties": { + "Metrics": "Distinct values of order_number, Sum of nominal_total", + "Filters": "['time-interval', ['field', 'completed_at', {'base-type': 'type/DateTimeWithTZ'}], -8, 'day', {'include-current': False}]", + "Dimensions": "completed_at" + }, + "title": "Question with data from other question", + "description": "", + "lastModified": { + "created": { + "time": 1685628119636, + "actor": "urn:li:corpuser:john.doe@example.com" + }, + "lastModified": { + "time": 1685628119636, + "actor": "urn:li:corpuser:john.doe@example.com" + } + }, + "chartUrl": "http://localhost:3000/card/3", + "inputs": [ + { + "string": "urn:li:dataset:(urn:li:dataPlatform:bigquery,acryl-data.public.payment,PROD)" + } + ], + "type": "TABLE" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:admin@metabase.com", + "type": "DATAOWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1636614000000, + "runId": "metabase-test" + } +}, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DashboardSnapshot": { @@ -195,6 +250,21 @@ "runId": "metabase-test" } }, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(metabase,3)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1636614000000, + "runId": "metabase-test" + } +}, { "entityType": "dashboard", "entityUrn": "urn:li:dashboard:(metabase,1)", diff --git a/metadata-ingestion/tests/integration/metabase/setup/card.json b/metadata-ingestion/tests/integration/metabase/setup/card.json index 439edbf60014fa..83bff66e6c9f3e 100644 --- a/metadata-ingestion/tests/integration/metabase/setup/card.json +++ b/metadata-ingestion/tests/integration/metabase/setup/card.json @@ -304,4 +304,196 @@ "favorite": false, "created_at": "2021-12-13T17:48:37.102", "public_uuid": null -}] \ No newline at end of file +}, { + "description": null, + "archived": false, + "collection_position": null, + "table_id": null, + "result_metadata": [ + { + "name": "completed_at", + "display_name": "completed_at", + "base_type": "type/Date", + "special_type": null, + "field_ref": [ + "field", + "completed_at", + { + "base-type": "type/DateTimeWithTZ", + "temporal-unit": "day" + } + ], + "unit": "day", + "fingerprint": { + "global": { + "distinct-count": 1916, + "nil%": 0.0385 + } + } + }, + { + "name": "count", + "display_name": "Distinct values of order_number", + "base_type": "type/BigInteger", + "special_type": "type/Quantity", + "field_ref": [ + "aggregation", + 0 + ], + "fingerprint": { + "global": { + "distinct-count": 8, + "nil%": 0.0 + }, + "type": { + "type/Number": { + "min": 44098.0, + "q1": 46911.0, + "q3": 51276.0, + "max": 52228.0, + "sd": 2797.3306887357558, + "avg": 48557.125 + } + } + } + }, + { + "name": "sum", + "display_name": "Sum of nominal_total", + "base_type": "type/Float", + "special_type": null, + "field_ref": [ + "aggregation", + 1 + ], + "fingerprint": { + "global": { + "distinct-count": 8, + "nil%": 0.0 + }, + "type": { + "type/Number": { + "min": 1.256807007034278E8, + "q1": 1.277180884245776E8, + "q3": 1.4257821803491282E8, + "max": 1.4887777502074698E8, + "sd": 8966928.163419789, + "avg": 1.3526486656272435E8 + } + } + } + } + ], + "creator": { + "email": "john.doe@example.com", + "first_name": "John", + "last_login": "2023-08-03T09:33:25.157021Z", + "is_qbnewb": false, + "is_superuser": false, + "id": 1, + "last_name": "Doe", + "date_joined": "2020-07-13T07:29:31.805765Z", + "common_name": "John Doe" + }, + "can_write": true, + "database_id": 2, + "enable_embedding": false, + "collection_id": 1135, + "query_type": "query", + "name": "Question with data from other question", + "last_query_start": null, + "dashboard_count": 1, + "average_query_time": null, + "creator_id": 31337, + "moderation_reviews": [], + "updated_at": "2023-06-01T14:01:59.592811Z", + "made_public_by_id": null, + "embedding_params": null, + "cache_ttl": null, + "dataset_query": { + "database": 2, + "query": { + "source-table": "card__1", + "filter": [ + "time-interval", + [ + "field", + "completed_at", + { + "base-type": "type/DateTimeWithTZ" + } + ], + -8, + "day", + { + "include-current": false + } + ], + "aggregation": [ + [ + "distinct", + [ + "field", + "order_number", + { + "base-type": "type/Text" + } + ] + ], + [ + "sum", + [ + "field", + "nominal_total", + { + "base-type": "type/Float" + } + ] + ] + ], + "breakout": [ + [ + "field", + "completed_at", + { + "base-type": "type/DateTimeWithTZ", + "temporal-unit": "day" + } + ] + ] + }, + "type": "query" + }, + "id": 3, + "parameter_mappings": null, + "display": "table", + "entity_id": null, + "collection_preview": true, + "last-edit-info": { + "id": 1, + "email": "john.doe@example.com", + "first_name": "John", + "last_name": "Doe", + "timestamp": "2023-06-01T14:01:59.636581Z" + }, + "visualization_settings": {}, + "collection": { + "authority_level": null, + "description": null, + "archived": false, + "slug": "group", + "color": "#509EE3", + "name": "Group", + "personal_owner_id": null, + "id": 1135, + "entity_id": null, + "location": "/3/373/", + "namespace": null, + "created_at": "2020-07-17T19:28:39.513365Z" + }, + "parameters": null, + "dataset": false, + "created_at": "2020-07-17T19:28:39.513365Z", + "parameter_usage_count": 0, + "public_uuid": null +}] diff --git a/metadata-ingestion/tests/integration/metabase/setup/card_3.json b/metadata-ingestion/tests/integration/metabase/setup/card_3.json new file mode 100644 index 00000000000000..3f928cd2e8f696 --- /dev/null +++ b/metadata-ingestion/tests/integration/metabase/setup/card_3.json @@ -0,0 +1,193 @@ +{ + "description": null, + "archived": false, + "collection_position": null, + "table_id": null, + "result_metadata": [ + { + "name": "completed_at", + "display_name": "completed_at", + "base_type": "type/Date", + "special_type": null, + "field_ref": [ + "field", + "completed_at", + { + "base-type": "type/DateTimeWithTZ", + "temporal-unit": "day" + } + ], + "unit": "day", + "fingerprint": { + "global": { + "distinct-count": 1916, + "nil%": 0.0385 + } + } + }, + { + "name": "count", + "display_name": "Distinct values of order_number", + "base_type": "type/BigInteger", + "special_type": "type/Quantity", + "field_ref": [ + "aggregation", + 0 + ], + "fingerprint": { + "global": { + "distinct-count": 8, + "nil%": 0.0 + }, + "type": { + "type/Number": { + "min": 44098.0, + "q1": 46911.0, + "q3": 51276.0, + "max": 52228.0, + "sd": 2797.3306887357558, + "avg": 48557.125 + } + } + } + }, + { + "name": "sum", + "display_name": "Sum of nominal_total", + "base_type": "type/Float", + "special_type": null, + "field_ref": [ + "aggregation", + 1 + ], + "fingerprint": { + "global": { + "distinct-count": 8, + "nil%": 0.0 + }, + "type": { + "type/Number": { + "min": 1.256807007034278E8, + "q1": 1.277180884245776E8, + "q3": 1.4257821803491282E8, + "max": 1.4887777502074698E8, + "sd": 8966928.163419789, + "avg": 1.3526486656272435E8 + } + } + } + } + ], + "creator": { + "email": "john.doe@example.com", + "first_name": "John", + "last_login": "2023-08-03T09:33:25.157021Z", + "is_qbnewb": false, + "is_superuser": false, + "id": 1, + "last_name": "Doe", + "date_joined": "2020-07-13T07:29:31.805765Z", + "common_name": "John Doe" + }, + "can_write": true, + "database_id": 2, + "enable_embedding": false, + "collection_id": 1135, + "query_type": "query", + "name": "Question with data from other question", + "last_query_start": null, + "dashboard_count": 1, + "average_query_time": null, + "creator_id": 1, + "moderation_reviews": [], + "updated_at": "2023-06-01T14:01:59.592811Z", + "made_public_by_id": null, + "embedding_params": null, + "cache_ttl": null, + "dataset_query": { + "database": 2, + "query": { + "source-table": "card__1", + "filter": [ + "time-interval", + [ + "field", + "completed_at", + { + "base-type": "type/DateTimeWithTZ" + } + ], + -8, + "day", + { + "include-current": false + } + ], + "aggregation": [ + [ + "distinct", + [ + "field", + "order_number", + { + "base-type": "type/Text" + } + ] + ], + [ + "sum", + [ + "field", + "nominal_total", + { + "base-type": "type/Float" + } + ] + ] + ], + "breakout": [ + [ + "field", + "completed_at", + { + "base-type": "type/DateTimeWithTZ", + "temporal-unit": "day" + } + ] + ] + }, + "type": "query" + }, + "id": 3, + "parameter_mappings": null, + "display": "table", + "entity_id": null, + "collection_preview": true, + "last-edit-info": { + "id": 1, + "email": "john.doe@example.com", + "first_name": "John", + "last_name": "Doe", + "timestamp": "2023-06-01T14:01:59.636581Z" + }, + "visualization_settings": {}, + "collection": { + "authority_level": null, + "description": null, + "archived": false, + "slug": "group", + "color": "#509EE3", + "name": "Group", + "personal_owner_id": null, + "id": 1135, + "entity_id": null, + "location": "/3/373/", + "namespace": null, + "created_at": "2020-07-17T19:28:39.513365Z" + }, + "parameters": null, + "dataset": false, + "created_at": "2020-07-17T19:28:39.513365Z", + "parameter_usage_count": 0, + "public_uuid": null +} diff --git a/metadata-ingestion/tests/integration/metabase/test_metabase.py b/metadata-ingestion/tests/integration/metabase/test_metabase.py index 5f5c8efedbfebe..24d254fc8469ee 100644 --- a/metadata-ingestion/tests/integration/metabase/test_metabase.py +++ b/metadata-ingestion/tests/integration/metabase/test_metabase.py @@ -23,6 +23,7 @@ "http://localhost:3000/api/card/1": "card_1.json", "http://localhost:3000/api/card/2": "card_2.json", "http://localhost:3000/api/table/21": "table_21.json", + "http://localhost:3000/api/card/3": "card_3.json", } RESPONSE_ERROR_LIST = ["http://localhost:3000/api/dashboard"]