From bf5f6f57f4c40d610a106238103170fce78376cb Mon Sep 17 00:00:00 2001 From: zheyu001 <89370467+zheyu001@users.noreply.github.com> Date: Wed, 2 Aug 2023 12:23:27 +0800 Subject: [PATCH] feat(ingest/presto-on-hive): enable partition key for presto-on-hive (#8380) --- .../dataset/mappers/SchemaFieldMapper.java | 1 + .../src/main/resources/entity.graphql | 5 +++++ datahub-web-react/src/Mocks.tsx | 2 ++ .../schema/utils/schemaTitleRenderer.tsx | 2 ++ .../components/PartitioningKeyLabel.tsx | 21 +++++++++++++++++++ .../src/graphql/fragments.graphql | 1 + .../src/graphql/versionedDataset.graphql | 1 + .../ingestion/source/sql/presto_on_hive.py | 14 +++++++++++++ .../presto_on_hive_mces_golden_1.json | 1 + .../presto_on_hive_mces_golden_2.json | 1 + .../presto_on_hive_mces_golden_3.json | 1 + .../presto_on_hive_mces_golden_4.json | 1 + 12 files changed, 51 insertions(+) create mode 100644 datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/components/PartitioningKeyLabel.tsx diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataset/mappers/SchemaFieldMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataset/mappers/SchemaFieldMapper.java index b472688085fa6..f05a1adb6b443 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataset/mappers/SchemaFieldMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataset/mappers/SchemaFieldMapper.java @@ -34,6 +34,7 @@ public SchemaField apply(@Nonnull final com.linkedin.schema.SchemaField input, @ result.setGlossaryTerms(GlossaryTermsMapper.map(input.getGlossaryTerms(), entityUrn)); } result.setIsPartOfKey(input.isIsPartOfKey()); + result.setIsPartitioningKey(input.isIsPartitioningKey()); return result; } diff --git a/datahub-graphql-core/src/main/resources/entity.graphql b/datahub-graphql-core/src/main/resources/entity.graphql index d4745af33e0ce..b1f9d57300177 100644 --- a/datahub-graphql-core/src/main/resources/entity.graphql +++ b/datahub-graphql-core/src/main/resources/entity.graphql @@ -2872,6 +2872,11 @@ type SchemaField { Whether the field is part of a key schema """ isPartOfKey: Boolean + + """ + Whether the field is part of a partitioning key schema + """ + isPartitioningKey: Boolean } """ diff --git a/datahub-web-react/src/Mocks.tsx b/datahub-web-react/src/Mocks.tsx index be853361574ab..dcefc7f70d785 100644 --- a/datahub-web-react/src/Mocks.tsx +++ b/datahub-web-react/src/Mocks.tsx @@ -549,6 +549,7 @@ export const dataset3WithSchema = { type: SchemaFieldDataType.String, nativeDataType: 'varchar(100)', isPartOfKey: false, + isPartitioningKey: false, jsonPath: null, globalTags: null, glossaryTerms: null, @@ -563,6 +564,7 @@ export const dataset3WithSchema = { type: SchemaFieldDataType.String, nativeDataType: 'boolean', isPartOfKey: false, + isPartitioningKey: false, jsonPath: null, globalTags: null, glossaryTerms: null, diff --git a/datahub-web-react/src/app/entity/dataset/profile/schema/utils/schemaTitleRenderer.tsx b/datahub-web-react/src/app/entity/dataset/profile/schema/utils/schemaTitleRenderer.tsx index bd587e300d057..3d03b6306454d 100644 --- a/datahub-web-react/src/app/entity/dataset/profile/schema/utils/schemaTitleRenderer.tsx +++ b/datahub-web-react/src/app/entity/dataset/profile/schema/utils/schemaTitleRenderer.tsx @@ -7,6 +7,7 @@ import { ExtendedSchemaFields } from './types'; import TypeLabel from '../../../../shared/tabs/Dataset/Schema/components/TypeLabel'; import { ForeignKeyConstraint, SchemaMetadata } from '../../../../../../types.generated'; import PrimaryKeyLabel from '../../../../shared/tabs/Dataset/Schema/components/PrimaryKeyLabel'; +import PartitioningKeyLabel from '../../../../shared/tabs/Dataset/Schema/components/PartitioningKeyLabel'; import NullableLabel from '../../../../shared/tabs/Dataset/Schema/components/NullableLabel'; import ForeignKeyLabel from '../../../../shared/tabs/Dataset/Schema/components/ForeignKeyLabel'; @@ -62,6 +63,7 @@ export default function useSchemaTitleRenderer( {(schemaMetadata?.primaryKeys?.includes(fieldPath) || record.isPartOfKey) && } + {record.isPartitioningKey && } {record.nullable && } {schemaMetadata?.foreignKeys ?.filter( diff --git a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/components/PartitioningKeyLabel.tsx b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/components/PartitioningKeyLabel.tsx new file mode 100644 index 0000000000000..dbf259aa4cdc5 --- /dev/null +++ b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/components/PartitioningKeyLabel.tsx @@ -0,0 +1,21 @@ +import React from 'react'; +import { Badge } from 'antd'; +import styled from 'styled-components'; +import { blue } from '@ant-design/colors'; +import { ANTD_GRAY } from '../../../../constants'; + +const PartitioningKeyBadge = styled(Badge)` + margin-left: 4px; + &&& .ant-badge-count { + background-color: ${ANTD_GRAY[1]}; + color: ${blue[5]}; + border: 1px solid ${blue[2]}; + font-size: 12px; + font-weight: 400; + height: 22px; + } +`; + +export default function PartitioningKeyLabel() { + return ; +} diff --git a/datahub-web-react/src/graphql/fragments.graphql b/datahub-web-react/src/graphql/fragments.graphql index 219722ad1645a..c3ac2139e687b 100644 --- a/datahub-web-react/src/graphql/fragments.graphql +++ b/datahub-web-react/src/graphql/fragments.graphql @@ -678,6 +678,7 @@ fragment schemaFieldFields on SchemaField { nativeDataType recursive isPartOfKey + isPartitioningKey globalTags { ...globalTagsFields } diff --git a/datahub-web-react/src/graphql/versionedDataset.graphql b/datahub-web-react/src/graphql/versionedDataset.graphql index d61139927e14b..bebfb102f8e1b 100644 --- a/datahub-web-react/src/graphql/versionedDataset.graphql +++ b/datahub-web-react/src/graphql/versionedDataset.graphql @@ -10,6 +10,7 @@ query getVersionedDataset($urn: String!, $versionStamp: String) { nativeDataType recursive isPartOfKey + isPartitioningKey } lastObserved } diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/presto_on_hive.py b/metadata-ingestion/src/datahub/ingestion/source/sql/presto_on_hive.py index a54cb9d50e2ae..1f3092888054e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/presto_on_hive.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/presto_on_hive.py @@ -524,6 +524,8 @@ def loop_tables( # add table schema fields schema_fields = self.get_schema_fields(dataset_name, columns) + self._set_partition_key(columns, schema_fields) + schema_metadata = get_schema_metadata( self.report, dataset_name, @@ -888,6 +890,18 @@ def get_schema_fields_for_column( default_nullable=True, ) + def _set_partition_key(self, columns, schema_fields): + if len(columns) > 0: + partition_key_names = set() + for column in columns: + if column["is_partition_col"]: + partition_key_names.add(column["col_name"]) + + for schema_field in schema_fields: + name = schema_field.fieldPath.split(".")[-1] + if name in partition_key_names: + schema_field.isPartitioningKey = True + class SQLAlchemyClient: def __init__(self, config: SQLAlchemyConfig): diff --git a/metadata-ingestion/tests/integration/presto-on-hive/presto_on_hive_mces_golden_1.json b/metadata-ingestion/tests/integration/presto-on-hive/presto_on_hive_mces_golden_1.json index 0953b6738fdcb..45d13229b2d85 100644 --- a/metadata-ingestion/tests/integration/presto-on-hive/presto_on_hive_mces_golden_1.json +++ b/metadata-ingestion/tests/integration/presto-on-hive/presto_on_hive_mces_golden_1.json @@ -1277,6 +1277,7 @@ "nativeDataType": "string", "recursive": false, "isPartOfKey": false, + "isPartitioningKey": true, "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, { diff --git a/metadata-ingestion/tests/integration/presto-on-hive/presto_on_hive_mces_golden_2.json b/metadata-ingestion/tests/integration/presto-on-hive/presto_on_hive_mces_golden_2.json index e5ac05f979368..4ec71eb8c39c6 100644 --- a/metadata-ingestion/tests/integration/presto-on-hive/presto_on_hive_mces_golden_2.json +++ b/metadata-ingestion/tests/integration/presto-on-hive/presto_on_hive_mces_golden_2.json @@ -1277,6 +1277,7 @@ "nativeDataType": "string", "recursive": false, "isPartOfKey": false, + "isPartitioningKey": true, "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, { diff --git a/metadata-ingestion/tests/integration/presto-on-hive/presto_on_hive_mces_golden_3.json b/metadata-ingestion/tests/integration/presto-on-hive/presto_on_hive_mces_golden_3.json index a9cf64c0a43b4..824524782a8e3 100644 --- a/metadata-ingestion/tests/integration/presto-on-hive/presto_on_hive_mces_golden_3.json +++ b/metadata-ingestion/tests/integration/presto-on-hive/presto_on_hive_mces_golden_3.json @@ -1277,6 +1277,7 @@ "nativeDataType": "string", "recursive": false, "isPartOfKey": false, + "isPartitioningKey": true, "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, { diff --git a/metadata-ingestion/tests/integration/presto-on-hive/presto_on_hive_mces_golden_4.json b/metadata-ingestion/tests/integration/presto-on-hive/presto_on_hive_mces_golden_4.json index a050f418371c6..3f2980457daa4 100644 --- a/metadata-ingestion/tests/integration/presto-on-hive/presto_on_hive_mces_golden_4.json +++ b/metadata-ingestion/tests/integration/presto-on-hive/presto_on_hive_mces_golden_4.json @@ -1277,6 +1277,7 @@ "nativeDataType": "string", "recursive": false, "isPartOfKey": false, + "isPartitioningKey": true, "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, {