diff --git a/build.gradle b/build.gradle index 54802917d05a5..7c5deb4783943 100644 --- a/build.gradle +++ b/build.gradle @@ -19,7 +19,7 @@ buildscript { ext.logbackClassic = '1.2.12' ext.hadoop3Version = '3.3.5' ext.kafkaVersion = '2.3.0' - ext.hazelcastVersion = '5.3.1' + ext.hazelcastVersion = '5.3.6' ext.ebeanVersion = '12.16.1' ext.docker_registry = 'linkedin' @@ -53,7 +53,7 @@ project.ext.spec = [ 'pegasus' : [ 'd2' : 'com.linkedin.pegasus:d2:' + pegasusVersion, 'data' : 'com.linkedin.pegasus:data:' + pegasusVersion, - 'dataAvro1_6' : 'com.linkedin.pegasus:data-avro-1_6:' + pegasusVersion, + 'dataAvro': 'com.linkedin.pegasus:data-avro:' + pegasusVersion, 'generator': 'com.linkedin.pegasus:generator:' + pegasusVersion, 'restliCommon' : 'com.linkedin.pegasus:restli-common:' + pegasusVersion, 'restliClient' : 'com.linkedin.pegasus:restli-client:' + pegasusVersion, @@ -71,22 +71,22 @@ project.ext.externalDependency = [ 'assertJ': 'org.assertj:assertj-core:3.11.1', 'avro': 'org.apache.avro:avro:1.11.3', 'avroCompiler': 'org.apache.avro:avro-compiler:1.11.3', - 'awsGlueSchemaRegistrySerde': 'software.amazon.glue:schema-registry-serde:1.1.10', - 'awsMskIamAuth': 'software.amazon.msk:aws-msk-iam-auth:1.1.1', - 'awsSecretsManagerJdbc': 'com.amazonaws.secretsmanager:aws-secretsmanager-jdbc:1.0.8', - 'awsPostgresIamAuth': 'software.amazon.jdbc:aws-advanced-jdbc-wrapper:1.0.0', + 'awsGlueSchemaRegistrySerde': 'software.amazon.glue:schema-registry-serde:1.1.17', + 'awsMskIamAuth': 'software.amazon.msk:aws-msk-iam-auth:1.1.9', + 'awsSecretsManagerJdbc': 'com.amazonaws.secretsmanager:aws-secretsmanager-jdbc:1.0.13', + 'awsPostgresIamAuth': 'software.amazon.jdbc:aws-advanced-jdbc-wrapper:1.0.2', 'awsRds':'software.amazon.awssdk:rds:2.18.24', - 'cacheApi' : 'javax.cache:cache-api:1.1.0', + 'cacheApi': 'javax.cache:cache-api:1.1.0', 'commonsCli': 'commons-cli:commons-cli:1.5.0', 'commonsIo': 'commons-io:commons-io:2.4', 'commonsLang': 'commons-lang:commons-lang:2.6', 'commonsText': 'org.apache.commons:commons-text:1.10.0', 'commonsCollections': 'commons-collections:commons-collections:3.2.2', - 'data' : 'com.linkedin.pegasus:data:' + pegasusVersion, + 'caffeine': 'com.github.ben-manes.caffeine:caffeine:3.1.8', 'datastaxOssNativeProtocol': 'com.datastax.oss:native-protocol:1.5.1', 'datastaxOssCore': 'com.datastax.oss:java-driver-core:4.14.1', 'datastaxOssQueryBuilder': 'com.datastax.oss:java-driver-query-builder:4.14.1', - 'dgraph4j' : 'io.dgraph:dgraph4j:21.03.1', + 'dgraph4j' : 'io.dgraph:dgraph4j:21.12.0', 'dropwizardMetricsCore': 'io.dropwizard.metrics:metrics-core:4.2.3', 'dropwizardMetricsJmx': 'io.dropwizard.metrics:metrics-jmx:4.2.3', 'ebean': 'io.ebean:ebean:' + ebeanVersion, @@ -131,7 +131,7 @@ project.ext.externalDependency = [ 'jsonPatch': 'com.github.java-json-tools:json-patch:1.13', 'jsonSimple': 'com.googlecode.json-simple:json-simple:1.1.1', 'jsonSmart': 'net.minidev:json-smart:2.4.9', - 'json': 'org.json:json:20230227', + 'json': 'org.json:json:20231013', 'junit': 'junit:junit:4.13.2', 'junitJupiterApi': "org.junit.jupiter:junit-jupiter-api:$junitJupiterVersion", 'junitJupiterParams': "org.junit.jupiter:junit-jupiter-params:$junitJupiterVersion", @@ -140,7 +140,7 @@ project.ext.externalDependency = [ 'kafkaAvroSerde': 'io.confluent:kafka-streams-avro-serde:5.5.1', 'kafkaAvroSerializer': 'io.confluent:kafka-avro-serializer:5.1.4', 'kafkaClients': "org.apache.kafka:kafka-clients:$kafkaVersion", - 'snappy': 'org.xerial.snappy:snappy-java:1.1.10.3', + 'snappy': 
'org.xerial.snappy:snappy-java:1.1.10.4', 'logbackClassic': "ch.qos.logback:logback-classic:$logbackClassic", 'slf4jApi': "org.slf4j:slf4j-api:$slf4jVersion", 'log4jCore': "org.apache.logging.log4j:log4j-core:$log4jVersion", @@ -164,6 +164,7 @@ project.ext.externalDependency = [ 'opentelemetryAnnotations': 'io.opentelemetry:opentelemetry-extension-annotations:' + openTelemetryVersion, 'opentracingJdbc':'io.opentracing.contrib:opentracing-jdbc:0.2.15', 'parquet': 'org.apache.parquet:parquet-avro:1.12.3', + 'parquetHadoop': 'org.apache.parquet:parquet-hadoop:1.13.1', 'picocli': 'info.picocli:picocli:4.5.0', 'playCache': "com.typesafe.play:play-cache_2.12:$playVersion", 'playWs': 'com.typesafe.play:play-ahc-ws-standalone_2.12:2.1.10', @@ -178,6 +179,7 @@ project.ext.externalDependency = [ 'playPac4j': 'org.pac4j:play-pac4j_2.12:9.0.2', 'postgresql': 'org.postgresql:postgresql:42.3.8', 'protobuf': 'com.google.protobuf:protobuf-java:3.19.6', + 'grpcProtobuf': 'io.grpc:grpc-protobuf:1.53.0', 'rangerCommons': 'org.apache.ranger:ranger-plugins-common:2.3.0', 'reflections': 'org.reflections:reflections:0.9.9', 'resilience4j': 'io.github.resilience4j:resilience4j-retry:1.7.1', @@ -201,7 +203,7 @@ project.ext.externalDependency = [ 'springBootStarterJetty': "org.springframework.boot:spring-boot-starter-jetty:$springBootVersion", 'springBootStarterCache': "org.springframework.boot:spring-boot-starter-cache:$springBootVersion", 'springBootStarterValidation': "org.springframework.boot:spring-boot-starter-validation:$springBootVersion", - 'springKafka': 'org.springframework.kafka:spring-kafka:2.8.11', + 'springKafka': 'org.springframework.kafka:spring-kafka:2.9.13', 'springActuator': "org.springframework.boot:spring-boot-starter-actuator:$springBootVersion", 'swaggerAnnotations': 'io.swagger.core.v3:swagger-annotations:2.2.15', 'swaggerCli': 'io.swagger.codegen.v3:swagger-codegen-cli:3.0.46', @@ -263,7 +265,7 @@ subprojects { plugins.withType(JavaPlugin) { dependencies { constraints { - implementation('io.netty:netty-all:4.1.86.Final') + implementation('io.netty:netty-all:4.1.100.Final') implementation('org.apache.commons:commons-compress:1.21') implementation('org.apache.velocity:velocity-engine-core:2.3') implementation('org.hibernate:hibernate-validator:6.0.20.Final') diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataproduct/ListDataProductAssetsResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataproduct/ListDataProductAssetsResolver.java index e727ebe185838..831d449bef9ef 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataproduct/ListDataProductAssetsResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataproduct/ListDataProductAssetsResolver.java @@ -79,11 +79,11 @@ public CompletableFuture get(DataFetchingEnvironment environment) } // 2. Get list of entities that we should query based on filters or assets from aspect. - List entitiesToQuery = assetUrns.stream().map(Urn::getEntityType).collect(Collectors.toList()); + List entitiesToQuery = assetUrns.stream().map(Urn::getEntityType).distinct().collect(Collectors.toList()); final List inputEntityTypes = (input.getTypes() == null || input.getTypes().isEmpty()) ? 
ImmutableList.of() : input.getTypes(); - final List inputEntityNames = inputEntityTypes.stream().map(EntityTypeMapper::getName).collect(Collectors.toList()); + final List inputEntityNames = inputEntityTypes.stream().map(EntityTypeMapper::getName).distinct().collect(Collectors.toList()); final List finalEntityNames = inputEntityNames.size() > 0 ? inputEntityNames : entitiesToQuery; diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/MapperUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/MapperUtils.java index 5ba32b0c2a77c..2a615b24eaac2 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/MapperUtils.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/MapperUtils.java @@ -70,7 +70,7 @@ public static List getMatchedFieldEntry(List { window.clearTimeout(timerRef.current); timerRef.current = window.setTimeout(() => { @@ -81,6 +61,26 @@ function DomainSearch() { }, 250); }; + const renderLoadingIndicator = () => ( + + + + ); + + const renderSearchResults = () => ( + + {searchResults?.map((result) => ( + setIsSearchBarFocused(false)} + /> + ))} + + ); + return ( setIsSearchBarFocused(false)}> @@ -102,39 +102,8 @@ function DomainSearch() { entityRegistry={entityRegistry} onFocus={() => setIsSearchBarFocused(true)} /> - {isSearchBarFocused && searchResults && !!searchResults.length && ( - - {searchResults.map((result) => { - return ( - setIsSearchBarFocused(false)} - > - - {result.entity.type === EntityType.Domain ? ( - - ) : ( - entityRegistry.getIcon(result.entity.type, 12, IconStyleType.ACCENT) - )} - -
- - - {entityRegistry.getDisplayName(result.entity.type, result.entity)} - -
-
- ); - })} -
- )} + {loading && renderLoadingIndicator()} + {!loading && isSearchBarFocused && !!searchResults?.length && renderSearchResults()}
); diff --git a/datahub-web-react/src/app/domain/DomainSearchResultItem.tsx b/datahub-web-react/src/app/domain/DomainSearchResultItem.tsx new file mode 100644 index 0000000000000..dc33ea173e0ae --- /dev/null +++ b/datahub-web-react/src/app/domain/DomainSearchResultItem.tsx @@ -0,0 +1,68 @@ +// Create a new component called SearchResultItem.js +import React from 'react'; +import { Link } from 'react-router-dom'; +import Highlight from 'react-highlighter'; +import styled from 'styled-components/macro'; +import { Entity, EntityType } from '../../types.generated'; +import { IconStyleType } from '../entity/Entity'; +import { ANTD_GRAY } from '../entity/shared/constants'; +import DomainIcon from './DomainIcon'; +import ParentEntities from '../search/filters/ParentEntities'; +import { getParentDomains } from './utils'; +import EntityRegistry from '../entity/EntityRegistry'; + +type Props = { + entity: Entity; + entityRegistry: EntityRegistry; + query: string; + onResultClick: () => void; +}; + +const SearchResult = styled(Link)` + color: #262626; + display: flex; + align-items: center; + gap: 8px; + height: 100%; + padding: 6px 8px; + width: 100%; + &:hover { + background-color: ${ANTD_GRAY[3]}; + color: #262626; + } +`; + +const IconWrapper = styled.span``; + +const highlightMatchStyle = { + fontWeight: 'bold', + background: 'none', + padding: 0, +}; + +function DomainSearchResultItem({ entity, entityRegistry, query, onResultClick }: Props) { + return ( + + + {entity.type === EntityType.Domain ? ( + + ) : ( + entityRegistry.getIcon(entity.type, 12, IconStyleType.ACCENT) + )} + +
+ + + {entityRegistry.getDisplayName(entity.type, entity)} + +
+
+ ); +} + +export default DomainSearchResultItem; diff --git a/datahub-web-react/src/app/domain/EmptyDomainDescription.tsx b/datahub-web-react/src/app/domain/EmptyDomainDescription.tsx new file mode 100644 index 0000000000000..6a5f304e565be --- /dev/null +++ b/datahub-web-react/src/app/domain/EmptyDomainDescription.tsx @@ -0,0 +1,39 @@ +import { Typography } from 'antd'; +import React from 'react'; +import styled from 'styled-components/macro'; +import { ANTD_GRAY } from '../entity/shared/constants'; + +const StyledParagraph = styled(Typography.Paragraph)` + text-align: justify; + text-justify: inter-word; + margin: 40px 0; + font-size: 15px; +`; + +function EmptyDomainDescription() { + return ( + <> + + Welcome to your Data Domains! It looks like this space + is ready to be transformed into a well-organized data universe. Start by creating your first domain - a + high-level category for your data assets. + + + Create Nested Domains: Want to dive deeper? You can + also create nested domains to add granularity and structure. Just like nesting Russian dolls, its all + about refining your organization. + + + Build Data Products: Once your domains are set, go a + step further! Organize your data assets into data products to realize a data mesh architecture. Data + products empower you to treat data as a product, making it more accessible and manageable. + + + Ready to embark on this data adventure? Click the Create Domain button to begin shaping your data + landscape! + + + ); +} + +export default EmptyDomainDescription; diff --git a/datahub-web-react/src/app/domain/EmptyDomainsSection.tsx b/datahub-web-react/src/app/domain/EmptyDomainsSection.tsx new file mode 100644 index 0000000000000..f232d259c20da --- /dev/null +++ b/datahub-web-react/src/app/domain/EmptyDomainsSection.tsx @@ -0,0 +1,69 @@ +import { PlusOutlined } from '@ant-design/icons'; +import { Button, Empty, Typography } from 'antd'; +import React from 'react'; +import styled from 'styled-components/macro'; +import { ANTD_GRAY } from '../entity/shared/constants'; + +const EmptyDomainContainer = styled.div` + display: flex; + justify-content: center; + align-items: center; +`; + +const StyledEmpty = styled(Empty)` + width: 35vw; + @media screen and (max-width: 1300px) { + width: 50vw; + } + @media screen and (max-width: 896px) { + overflow-y: auto; + max-height: 75vh; + &::-webkit-scrollbar { + width: 5px; + background: #d6d6d6; + } + } + padding: 60px 40px; + .ant-empty-image { + display: none; + } +`; + +const StyledButton = styled(Button)` + margin: 18px 8px 0 0; +`; + +const IconContainer = styled.span` + color: ${ANTD_GRAY[7]}; + font-size: 40px; +`; + +interface Props { + title?: string; + setIsCreatingDomain: React.Dispatch>; + description?: React.ReactNode; + icon?: React.ReactNode; +} + +function EmptyDomainsSection(props: Props) { + const { title, description, setIsCreatingDomain, icon } = props; + return ( + + + {icon} + {title} + {description} + + } + > + setIsCreatingDomain(true)}> + Create Domain + + + + ); +} + +export default EmptyDomainsSection; diff --git a/datahub-web-react/src/app/domain/nestedDomains/ManageDomainsPageV2.tsx b/datahub-web-react/src/app/domain/nestedDomains/ManageDomainsPageV2.tsx index b69f0c5458b5d..f5fc0cba2d8ec 100644 --- a/datahub-web-react/src/app/domain/nestedDomains/ManageDomainsPageV2.tsx +++ b/datahub-web-react/src/app/domain/nestedDomains/ManageDomainsPageV2.tsx @@ -51,7 +51,7 @@ export default function ManageDomainsPageV2() { New Domain - + {isCreatingDomain && ( 
setIsCreatingDomain(false)} diff --git a/datahub-web-react/src/app/domain/nestedDomains/RootDomains.tsx b/datahub-web-react/src/app/domain/nestedDomains/RootDomains.tsx index 757119919e336..75c38cd4951ef 100644 --- a/datahub-web-react/src/app/domain/nestedDomains/RootDomains.tsx +++ b/datahub-web-react/src/app/domain/nestedDomains/RootDomains.tsx @@ -1,17 +1,23 @@ import React from 'react'; import styled from 'styled-components'; +import { ReadOutlined } from '@ant-design/icons'; import { Message } from '../../shared/Message'; import { ResultWrapper } from '../../search/SearchResultList'; import { useEntityRegistry } from '../../useEntityRegistry'; import { EntityType } from '../../../types.generated'; import useListDomains from '../useListDomains'; +import EmptyDomainsSection from '../EmptyDomainsSection'; +import EmptyDomainDescription from '../EmptyDomainDescription'; const DomainsWrapper = styled.div` overflow: auto; padding: 0 28px 16px 28px; `; -export default function RootDomains() { +interface Props { + setIsCreatingDomain: React.Dispatch>; +} +export default function RootDomains({ setIsCreatingDomain }: Props) { const entityRegistry = useEntityRegistry(); const { loading, error, data, sortedDomains } = useListDomains({}); @@ -19,6 +25,14 @@ export default function RootDomains() { <> {!data && loading && } {error && } + {!loading && (!data || !data?.listDomains?.domains?.length) && ( + } + title="Organize your data" + description={} + setIsCreatingDomain={setIsCreatingDomain} + /> + )} {sortedDomains?.map((domain) => ( diff --git a/datahub-web-react/src/app/domain/nestedDomains/domainNavigator/DomainNavigator.tsx b/datahub-web-react/src/app/domain/nestedDomains/domainNavigator/DomainNavigator.tsx index 0fbcffb9a260c..8decc2840a379 100644 --- a/datahub-web-react/src/app/domain/nestedDomains/domainNavigator/DomainNavigator.tsx +++ b/datahub-web-react/src/app/domain/nestedDomains/domainNavigator/DomainNavigator.tsx @@ -1,9 +1,10 @@ -import { Alert } from 'antd'; +import { Alert, Empty } from 'antd'; import React from 'react'; import styled from 'styled-components'; import useListDomains from '../../useListDomains'; import DomainNode from './DomainNode'; import { Domain } from '../../../../types.generated'; +import { ANTD_GRAY } from '../../../entity/shared/constants'; const NavigatorWrapper = styled.div` font-size: 14px; @@ -19,19 +20,28 @@ interface Props { export default function DomainNavigator({ domainUrnToHide, selectDomainOverride }: Props) { const { sortedDomains, error } = useListDomains({}); + const noDomainsFound: boolean = !sortedDomains || sortedDomains.length === 0; return ( {error && } - {sortedDomains?.map((domain) => ( - - ))} + )} + {!noDomainsFound && + sortedDomains?.map((domain) => ( + + ))} ); } diff --git a/datahub-web-react/src/app/entity/container/ContainerEntity.tsx b/datahub-web-react/src/app/entity/container/ContainerEntity.tsx index 9aecf6900f634..6c683a27295bd 100644 --- a/datahub-web-react/src/app/entity/container/ContainerEntity.tsx +++ b/datahub-web-react/src/app/entity/container/ContainerEntity.tsx @@ -167,6 +167,7 @@ export class ContainerEntity implements Entity { getOverridePropertiesFromEntity = (data: Container) => { return { name: this.displayName(data), + externalUrl: data.properties?.externalUrl, entityCount: data.entities?.total, }; }; diff --git a/datahub-web-react/src/app/entity/shared/containers/profile/sidebar/Domain/SetDomainModal.tsx b/datahub-web-react/src/app/entity/shared/containers/profile/sidebar/Domain/SetDomainModal.tsx index 
405442e8d7f50..3d9a7d7f08425 100644 --- a/datahub-web-react/src/app/entity/shared/containers/profile/sidebar/Domain/SetDomainModal.tsx +++ b/datahub-web-react/src/app/entity/shared/containers/profile/sidebar/Domain/SetDomainModal.tsx @@ -1,6 +1,8 @@ import React, { useRef, useState } from 'react'; -import { Button, Form, message, Modal, Select } from 'antd'; +import { Button, Form, message, Modal, Select, Empty } from 'antd'; +import { LoadingOutlined } from '@ant-design/icons'; +import styled from 'styled-components/macro'; import { useGetSearchResultsLazyQuery } from '../../../../../../../graphql/search.generated'; import { Domain, Entity, EntityType } from '../../../../../../../types.generated'; import { useBatchSetDomainMutation } from '../../../../../../../graphql/mutations.generated'; @@ -12,6 +14,7 @@ import { tagRender } from '../tagRenderer'; import { BrowserWrapper } from '../../../../../../shared/tags/AddTagsTermsModal'; import DomainNavigator from '../../../../../../domain/nestedDomains/domainNavigator/DomainNavigator'; import ClickOutside from '../../../../../../shared/ClickOutside'; +import { ANTD_GRAY } from '../../../../constants'; type Props = { urns: string[]; @@ -28,6 +31,18 @@ type SelectedDomain = { urn: string; }; +const LoadingWrapper = styled.div` + padding: 8px; + display: flex; + justify-content: center; + + svg { + height: 15px; + width: 15px; + color: ${ANTD_GRAY[8]}; + } +`; + export const SetDomainModal = ({ urns, onCloseModal, refetch, defaultValue, onOkOverride, titleOverride }: Props) => { const entityRegistry = useEntityRegistry(); const [isFocusedOnInput, setIsFocusedOnInput] = useState(false); @@ -41,7 +56,7 @@ export const SetDomainModal = ({ urns, onCloseModal, refetch, defaultValue, onOk } : undefined, ); - const [domainSearch, { data: domainSearchData }] = useGetSearchResultsLazyQuery(); + const [domainSearch, { data: domainSearchData, loading }] = useGetSearchResultsLazyQuery(); const domainSearchResults = domainSearchData?.search?.searchResults?.map((searchResult) => searchResult.entity) || []; const [batchSetDomainMutation] = useBatchSetDomainMutation(); @@ -206,8 +221,23 @@ export const SetDomainModal = ({ urns, onCloseModal, refetch, defaultValue, onOk onBlur={handleBlur} onFocus={() => setIsFocusedOnInput(true)} dropdownStyle={isShowingDomainNavigator ? { display: 'none' } : {}} + notFoundContent={ + + } > - {domainSearchOptions} + {loading ? 
( + + + + + + ) : ( + domainSearchOptions + )} diff --git a/datahub-web-react/src/app/identity/group/GroupList.tsx b/datahub-web-react/src/app/identity/group/GroupList.tsx index db9901a53b26b..5ef77b4dfc8a8 100644 --- a/datahub-web-react/src/app/identity/group/GroupList.tsx +++ b/datahub-web-react/src/app/identity/group/GroupList.tsx @@ -92,7 +92,10 @@ export const GroupList = () => { fontSize: 12, }} onSearch={() => null} - onQueryChange={(q) => setQuery(q)} + onQueryChange={(q) => { + setPage(1); + setQuery(q); + }} entityRegistry={entityRegistry} hideRecommendations /> diff --git a/datahub-web-react/src/app/identity/user/UserList.tsx b/datahub-web-react/src/app/identity/user/UserList.tsx index 55ef27b8458fa..e50005b08377e 100644 --- a/datahub-web-react/src/app/identity/user/UserList.tsx +++ b/datahub-web-react/src/app/identity/user/UserList.tsx @@ -135,7 +135,10 @@ export const UserList = () => { fontSize: 12, }} onSearch={() => null} - onQueryChange={(q) => setQuery(q)} + onQueryChange={(q) => { + setPage(1); + setQuery(q); + }} entityRegistry={entityRegistry} hideRecommendations /> diff --git a/datahub-web-react/src/app/ingest/secret/SecretsList.tsx b/datahub-web-react/src/app/ingest/secret/SecretsList.tsx index 8e5b601e2a809..2728fff0ccba3 100644 --- a/datahub-web-react/src/app/ingest/secret/SecretsList.tsx +++ b/datahub-web-react/src/app/ingest/secret/SecretsList.tsx @@ -54,10 +54,10 @@ export const SecretsList = () => { input: { start, count: pageSize, - query: query && query.length > 0 ? query : undefined, + query: (query?.length && query) || undefined, }, }, - fetchPolicy: query && query.length > 0 ? 'no-cache' : 'cache-first', + fetchPolicy: (query?.length || 0) > 0 ? 'no-cache' : 'cache-first', }); const totalSecrets = data?.listSecrets?.total || 0; @@ -197,7 +197,10 @@ export const SecretsList = () => { fontSize: 12, }} onSearch={() => null} - onQueryChange={(q) => setQuery(q)} + onQueryChange={(q) => { + setPage(1); + setQuery(q); + }} entityRegistry={entityRegistry} hideRecommendations /> diff --git a/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx b/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx index 13af19b0b6ac2..6188845694f9e 100644 --- a/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx +++ b/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx @@ -107,10 +107,10 @@ export const IngestionSourceList = () => { input: { start, count: pageSize, - query, + query: (query?.length && query) || undefined, }, }, - fetchPolicy: 'cache-first', + fetchPolicy: (query?.length || 0) > 0 ? 
'no-cache' : 'cache-first', }); const [createIngestionSource] = useCreateIngestionSourceMutation(); const [updateIngestionSource] = useUpdateIngestionSourceMutation(); @@ -399,7 +399,10 @@ export const IngestionSourceList = () => { fontSize: 12, }} onSearch={() => null} - onQueryChange={(q) => setQuery(q)} + onQueryChange={(q) => { + setPage(1); + setQuery(q); + }} entityRegistry={entityRegistry} hideRecommendations /> diff --git a/datahub-web-react/src/app/permissions/policy/ManagePolicies.tsx b/datahub-web-react/src/app/permissions/policy/ManagePolicies.tsx index 08327d40a7165..49b0ec922fd57 100644 --- a/datahub-web-react/src/app/permissions/policy/ManagePolicies.tsx +++ b/datahub-web-react/src/app/permissions/policy/ManagePolicies.tsx @@ -166,7 +166,6 @@ export const ManagePolicies = () => { data: policiesData, refetch: policiesRefetch, } = useListPoliciesQuery({ - fetchPolicy: 'no-cache', variables: { input: { start, @@ -174,6 +173,7 @@ export const ManagePolicies = () => { query, }, }, + fetchPolicy: (query?.length || 0) > 0 ? 'no-cache' : 'cache-first', }); // Any time a policy is removed, edited, or created, refetch the list. @@ -476,7 +476,10 @@ export const ManagePolicies = () => { fontSize: 12, }} onSearch={() => null} - onQueryChange={(q) => setQuery(q)} + onQueryChange={(q) => { + setPage(1); + setQuery(q); + }} entityRegistry={entityRegistry} hideRecommendations /> diff --git a/datahub-web-react/src/app/permissions/roles/ManageRoles.tsx b/datahub-web-react/src/app/permissions/roles/ManageRoles.tsx index ccdfb7002c67d..011109e2eb915 100644 --- a/datahub-web-react/src/app/permissions/roles/ManageRoles.tsx +++ b/datahub-web-react/src/app/permissions/roles/ManageRoles.tsx @@ -72,7 +72,6 @@ export const ManageRoles = () => { data: rolesData, refetch: rolesRefetch, } = useListRolesQuery({ - fetchPolicy: 'cache-first', variables: { input: { start, @@ -80,6 +79,7 @@ export const ManageRoles = () => { query, }, }, + fetchPolicy: (query?.length || 0) > 0 ? 
'no-cache' : 'cache-first', }); const totalRoles = rolesData?.listRoles?.total || 0; @@ -238,7 +238,10 @@ export const ManageRoles = () => { fontSize: 12, }} onSearch={() => null} - onQueryChange={(q) => setQuery(q)} + onQueryChange={(q) => { + setPage(1); + setQuery(q); + }} entityRegistry={entityRegistry} /> {isBatchAddRolesModalVisible && ( diff --git a/datahub-web-react/src/app/search/SearchBar.tsx b/datahub-web-react/src/app/search/SearchBar.tsx index a23ead83caf54..15457c006c61b 100644 --- a/datahub-web-react/src/app/search/SearchBar.tsx +++ b/datahub-web-react/src/app/search/SearchBar.tsx @@ -45,7 +45,7 @@ const StyledSearchBar = styled(Input)` border: 2px solid transparent; &:focus-within { - border: 1.5px solid ${REDESIGN_COLORS.BLUE}; + border: 2px solid ${REDESIGN_COLORS.BLUE}; } } > .ant-input::placeholder { diff --git a/datahub-web-react/src/setupProxy.js b/datahub-web-react/src/setupProxy.js index 478c015705a13..165e394a507f3 100644 --- a/datahub-web-react/src/setupProxy.js +++ b/datahub-web-react/src/setupProxy.js @@ -2,6 +2,8 @@ const logInFilter = function (pathname, req) { return pathname.match('^/logIn') && req.method === 'POST'; }; +const proxyTarget = process.env.REACT_APP_PROXY_TARGET || 'http://localhost:9002'; + if (process.env.REACT_APP_MOCK === 'true' || process.env.REACT_APP_MOCK === 'cy') { // no proxy needed, MirageJS will intercept all http requests module.exports = function () {}; @@ -13,21 +15,21 @@ if (process.env.REACT_APP_MOCK === 'true' || process.env.REACT_APP_MOCK === 'cy' app.use( '/logIn', createProxyMiddleware(logInFilter, { - target: 'http://localhost:9002', + target: proxyTarget, changeOrigin: true, }), ); app.use( '/authenticate', createProxyMiddleware({ - target: 'http://localhost:9002', + target: proxyTarget, changeOrigin: true, }), ); app.use( '/api/v2/graphql', createProxyMiddleware({ - target: 'http://localhost:9002', + target: proxyTarget, changeOrigin: true, }), ); diff --git a/docker/datahub-frontend/Dockerfile b/docker/datahub-frontend/Dockerfile index 9c13e73078042..aaace5ae38ca3 100644 --- a/docker/datahub-frontend/Dockerfile +++ b/docker/datahub-frontend/Dockerfile @@ -9,7 +9,7 @@ RUN addgroup -S datahub && adduser -S datahub -G datahub # PFP-260: Upgrade Sqlite to >=3.28.0-r0 to fix https://security.snyk.io/vuln/SNYK-ALPINE39-SQLITE-449762 RUN apk --no-cache --update-cache --available upgrade \ && apk --no-cache add curl sqlite libc6-compat java-snappy \ - && apk --no-cache add openjdk11-jre --repository=http://dl-cdn.alpinelinux.org/alpine/edge/community \ + && apk --no-cache add openjdk11-jre-headless --repository=http://dl-cdn.alpinelinux.org/alpine/edge/community \ && apk --no-cache add jattach --repository http://dl-cdn.alpinelinux.org/alpine/edge/community/ ENV LD_LIBRARY_PATH="/lib:/lib64" diff --git a/docker/datahub-gms/Dockerfile b/docker/datahub-gms/Dockerfile index e271188a703cc..c5696bbd2d1d2 100644 --- a/docker/datahub-gms/Dockerfile +++ b/docker/datahub-gms/Dockerfile @@ -19,7 +19,7 @@ ENV JMX_VERSION=0.18.0 # PFP-260: Upgrade Sqlite to >=3.28.0-r0 to fix https://security.snyk.io/vuln/SNYK-ALPINE39-SQLITE-449762 RUN apk --no-cache --update-cache --available upgrade \ && apk --no-cache add curl bash coreutils gcompat sqlite libc6-compat java-snappy \ - && apk --no-cache add openjdk11-jre --repository=http://dl-cdn.alpinelinux.org/alpine/edge/community \ + && apk --no-cache add openjdk11-jre-headless --repository=http://dl-cdn.alpinelinux.org/alpine/edge/community \ && apk --no-cache add jattach --repository 
http://dl-cdn.alpinelinux.org/alpine/edge/community/ \ && curl -sS https://repo1.maven.org/maven2/org/eclipse/jetty/jetty-runner/9.4.46.v20220331/jetty-runner-9.4.46.v20220331.jar --output jetty-runner.jar \ && curl -sS https://repo1.maven.org/maven2/org/eclipse/jetty/jetty-jmx/9.4.46.v20220331/jetty-jmx-9.4.46.v20220331.jar --output jetty-jmx.jar \ diff --git a/docker/datahub-mae-consumer/Dockerfile b/docker/datahub-mae-consumer/Dockerfile index ec3da4de71d15..07af7c66a7783 100644 --- a/docker/datahub-mae-consumer/Dockerfile +++ b/docker/datahub-mae-consumer/Dockerfile @@ -19,7 +19,7 @@ ENV JMX_VERSION=0.18.0 # PFP-260: Upgrade Sqlite to >=3.28.0-r0 to fix https://security.snyk.io/vuln/SNYK-ALPINE39-SQLITE-449762 RUN apk --no-cache --update-cache --available upgrade \ && apk --no-cache add curl bash coreutils sqlite libc6-compat java-snappy \ - && apk --no-cache add openjdk11-jre --repository=http://dl-cdn.alpinelinux.org/alpine/edge/community \ + && apk --no-cache add openjdk11-jre-headless --repository=http://dl-cdn.alpinelinux.org/alpine/edge/community \ && apk --no-cache add jattach --repository http://dl-cdn.alpinelinux.org/alpine/edge/community/ \ && wget --no-verbose https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/download/v1.24.0/opentelemetry-javaagent.jar \ && wget --no-verbose https://repo1.maven.org/maven2/io/prometheus/jmx/jmx_prometheus_javaagent/${JMX_VERSION}/jmx_prometheus_javaagent-${JMX_VERSION}.jar -O jmx_prometheus_javaagent.jar \ diff --git a/docker/datahub-mce-consumer/Dockerfile b/docker/datahub-mce-consumer/Dockerfile index f9c47f77a98f5..97861d6be3141 100644 --- a/docker/datahub-mce-consumer/Dockerfile +++ b/docker/datahub-mce-consumer/Dockerfile @@ -19,7 +19,7 @@ ENV JMX_VERSION=0.18.0 # PFP-260: Upgrade Sqlite to >=3.28.0-r0 to fix https://security.snyk.io/vuln/SNYK-ALPINE39-SQLITE-449762 RUN apk --no-cache --update-cache --available upgrade \ && apk --no-cache add curl bash sqlite libc6-compat java-snappy \ - && apk --no-cache add openjdk11-jre --repository=http://dl-cdn.alpinelinux.org/alpine/edge/community \ + && apk --no-cache add openjdk11-jre-headless --repository=http://dl-cdn.alpinelinux.org/alpine/edge/community \ && apk --no-cache add jattach --repository http://dl-cdn.alpinelinux.org/alpine/edge/community/ \ && wget --no-verbose https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/download/v1.24.0/opentelemetry-javaagent.jar \ && wget --no-verbose https://repo1.maven.org/maven2/io/prometheus/jmx/jmx_prometheus_javaagent/${JMX_VERSION}/jmx_prometheus_javaagent-${JMX_VERSION}.jar -O jmx_prometheus_javaagent.jar \ diff --git a/docker/datahub-upgrade/Dockerfile b/docker/datahub-upgrade/Dockerfile index f08e7268e4018..fa8e65009662b 100644 --- a/docker/datahub-upgrade/Dockerfile +++ b/docker/datahub-upgrade/Dockerfile @@ -19,7 +19,7 @@ ENV JMX_VERSION=0.18.0 # PFP-260: Upgrade Sqlite to >=3.28.0-r0 to fix https://security.snyk.io/vuln/SNYK-ALPINE39-SQLITE-449762 RUN apk --no-cache --update-cache --available upgrade \ && apk --no-cache add curl bash coreutils gcompat sqlite libc6-compat java-snappy \ - && apk --no-cache add openjdk11-jre --repository=http://dl-cdn.alpinelinux.org/alpine/edge/community \ + && apk --no-cache add openjdk11-jre-headless --repository=http://dl-cdn.alpinelinux.org/alpine/edge/community \ && curl -sS https://repo1.maven.org/maven2/org/eclipse/jetty/jetty-runner/9.4.46.v20220331/jetty-runner-9.4.46.v20220331.jar --output jetty-runner.jar \ && curl -sS 
https://repo1.maven.org/maven2/org/eclipse/jetty/jetty-jmx/9.4.46.v20220331/jetty-jmx-9.4.46.v20220331.jar --output jetty-jmx.jar \ && curl -sS https://repo1.maven.org/maven2/org/eclipse/jetty/jetty-util/9.4.46.v20220331/jetty-util-9.4.46.v20220331.jar --output jetty-util.jar \ diff --git a/docker/kafka-setup/Dockerfile b/docker/kafka-setup/Dockerfile index a9c75521fead1..e7f084739a576 100644 --- a/docker/kafka-setup/Dockerfile +++ b/docker/kafka-setup/Dockerfile @@ -18,7 +18,7 @@ ENV SCALA_VERSION 2.13 LABEL name="kafka" version=${KAFKA_VERSION} RUN apk add --no-cache bash coreutils -RUN apk --no-cache add openjdk11-jre --repository=http://dl-cdn.alpinelinux.org/alpine/edge/community +RUN apk --no-cache add openjdk11-jre-headless --repository=http://dl-cdn.alpinelinux.org/alpine/edge/community RUN apk add --no-cache -t .build-deps git curl ca-certificates jq gcc musl-dev libffi-dev zip RUN mkdir -p /opt \ diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index bb1de002de1f1..21c4cef2e848b 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -6,7 +6,8 @@ This file documents any backwards-incompatible changes in DataHub and assists pe ### Breaking Changes -* Updating MySQL version for quickstarts to 8.2, may cause quickstart issues for existing instances. +- Updating MySQL version for quickstarts to 8.2, may cause quickstart issues for existing instances. +- #9244: The `redshift-legacy` and `redshift-legacy-usage` sources, which have been deprecated for >6 months, have been removed. The new `redshift` source is a superset of the functionality provided by those legacy sources. ### Potential Downtime @@ -70,6 +71,9 @@ qualified dataset name, i.e. `.`. We attempt to supp pattern format by prepending `.*\\.` to dataset patterns lacking a period, so in most cases this should not cause any issues. However, if you have a complex dataset pattern, we recommend you manually convert it to the fully qualified format to avoid any potential issues. +- #9110 - The Unity Catalog source will now generate urns based on `env` properly. If you have +been setting `env` in your recipe to something besides `PROD`, we will now generate urns +with that new env variable, invalidating your existing urns. ### Potential Downtime diff --git a/docs/lineage/airflow.md b/docs/lineage/airflow.md index 3a13aefa834a4..32da518d6c04c 100644 --- a/docs/lineage/airflow.md +++ b/docs/lineage/airflow.md @@ -37,12 +37,24 @@ pip install 'acryl-datahub-airflow-plugin[plugin-v2]' ### Configuration -Set up a DataHub connection in Airflow. +Set up a DataHub connection in Airflow, either via command line or the Airflow UI. + +#### Command Line ```shell airflow connections add --conn-type 'datahub-rest' 'datahub_rest_default' --conn-host 'http://datahub-gms:8080' --conn-password '' ``` +#### Airflow UI + +On the Airflow UI, go to Admin -> Connections and click the "+" symbol to create a new connection. Select "DataHub REST Server" from the dropdown for "Connection Type" and enter the appropriate values. + +
+ +#### Optional Configurations + No additional configuration is required to use the plugin. However, there are some optional configuration parameters that can be set in the `airflow.cfg` file. ```ini title="airflow.cfg" diff --git a/metadata-events/mxe-registration/build.gradle b/metadata-events/mxe-registration/build.gradle index 032870d93329f..2842dd935c7ee 100644 --- a/metadata-events/mxe-registration/build.gradle +++ b/metadata-events/mxe-registration/build.gradle @@ -7,7 +7,7 @@ configurations { dependencies { implementation project(':metadata-events:mxe-avro') implementation project(':metadata-models') - implementation spec.product.pegasus.dataAvro1_6 + implementation spec.product.pegasus.dataAvro testImplementation project(':test-models') testImplementation project(path: ':test-models', configuration: 'testDataTemplate') diff --git a/metadata-events/mxe-utils-avro/build.gradle b/metadata-events/mxe-utils-avro/build.gradle index a7bf287ab224d..3493797ab4f97 100644 --- a/metadata-events/mxe-utils-avro/build.gradle +++ b/metadata-events/mxe-utils-avro/build.gradle @@ -3,7 +3,7 @@ apply plugin: 'java-library' dependencies { api project(':metadata-events:mxe-avro') api project(':metadata-models') - api spec.product.pegasus.dataAvro1_6 + api spec.product.pegasus.dataAvro testImplementation externalDependency.testng testImplementation project(':test-models') diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index ebe180703051f..2b002164a49b9 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -213,11 +213,14 @@ # - 0.6.12 adds support for Spark Thrift Server # - 0.6.13 adds a small fix for Databricks # - 0.6.14 uses pure-sasl instead of sasl so it builds on Python 3.11 - "acryl-pyhive[hive_pure_sasl]==0.6.14", + # - 0.6.15 adds support for thrift > 0.14 (cherry-picked from https://github.com/apache/thrift/pull/2491) + "acryl-pyhive[hive_pure_sasl]==0.6.15", # As per https://github.com/datahub-project/datahub/issues/8405 - # and https://github.com/dropbox/PyHive/issues/417, new versions - # of thrift break PyHive's hive+http transport. - "thrift<0.14.0", + # and https://github.com/dropbox/PyHive/issues/417, version 0.14.0 + # of thrift broke PyHive's hive+http transport. + # Fixed by https://github.com/apache/thrift/pull/2491 in version 0.17.0 + # which is unfortunately not on PyPi. + # Instead, we put the fix in our PyHive fork, so no thrift pin is needed. 
} microsoft_common = {"msal==1.22.0"} @@ -366,8 +369,6 @@ | usage_common | {"redshift-connector"} | sqlglot_lib, - "redshift-legacy": sql_common | redshift_common | sqlglot_lib, - "redshift-usage-legacy": sql_common | redshift_common | sqlglot_lib | usage_common, "s3": {*s3_base, *data_lake_profiling}, "gcs": {*s3_base, *data_lake_profiling}, "sagemaker": aws_common, @@ -510,8 +511,6 @@ "presto", "redash", "redshift", - "redshift-legacy", - "redshift-usage-legacy", "s3", "snowflake", "tableau", @@ -608,8 +607,6 @@ "postgres = datahub.ingestion.source.sql.postgres:PostgresSource", "redash = datahub.ingestion.source.redash:RedashSource", "redshift = datahub.ingestion.source.redshift.redshift:RedshiftSource", - "redshift-legacy = datahub.ingestion.source.sql.redshift:RedshiftSource", - "redshift-usage-legacy = datahub.ingestion.source.usage.redshift_usage:RedshiftUsageSource", "snowflake = datahub.ingestion.source.snowflake.snowflake_v2:SnowflakeV2Source", "superset = datahub.ingestion.source.superset:SupersetSource", "tableau = datahub.ingestion.source.tableau:TableauSource", diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/aws_common.py b/metadata-ingestion/src/datahub/ingestion/source/aws/aws_common.py index d61975694f541..0fb211a5d7b16 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/aws_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/aws_common.py @@ -1,8 +1,8 @@ -from typing import TYPE_CHECKING, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union import boto3 from boto3.session import Session -from botocore.config import Config +from botocore.config import DEFAULT_TIMEOUT, Config from botocore.utils import fix_s3_host from pydantic.fields import Field @@ -104,6 +104,16 @@ class AwsConnectionConfig(ConfigModel): description="A set of proxy configs to use with AWS. See the [botocore.config](https://botocore.amazonaws.com/v1/documentation/api/latest/reference/config.html) docs for details.", ) + read_timeout: float = Field( + default=DEFAULT_TIMEOUT, + description="The timeout for reading from the connection (in seconds).", + ) + + aws_advanced_config: Dict[str, Any] = Field( + default_factory=dict, + description="Advanced AWS configuration options. 
These are passed directly to [botocore.config.Config](https://botocore.amazonaws.com/v1/documentation/api/latest/reference/config.html).", + ) + def _normalized_aws_roles(self) -> List[AwsAssumeRoleConfig]: if not self.aws_role: return [] @@ -167,13 +177,20 @@ def get_credentials(self) -> Dict[str, str]: } return {} + def _aws_config(self) -> Config: + return Config( + proxies=self.aws_proxy, + read_timeout=self.read_timeout, + **self.aws_advanced_config, + ) + def get_s3_client( self, verify_ssl: Optional[Union[bool, str]] = None ) -> "S3Client": return self.get_session().client( "s3", endpoint_url=self.aws_endpoint_url, - config=Config(proxies=self.aws_proxy), + config=self._aws_config(), verify=verify_ssl, ) @@ -183,7 +200,7 @@ def get_s3_resource( resource = self.get_session().resource( "s3", endpoint_url=self.aws_endpoint_url, - config=Config(proxies=self.aws_proxy), + config=self._aws_config(), verify=verify_ssl, ) # according to: https://stackoverflow.com/questions/32618216/override-s3-endpoint-using-boto3-configuration-file @@ -195,10 +212,10 @@ def get_s3_resource( return resource def get_glue_client(self) -> "GlueClient": - return self.get_session().client("glue") + return self.get_session().client("glue", config=self._aws_config()) def get_sagemaker_client(self) -> "SageMakerClient": - return self.get_session().client("sagemaker") + return self.get_session().client("sagemaker", config=self._aws_config()) class AwsSourceConfig(EnvConfigMixin, AwsConnectionConfig): diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py index 0b57c41131714..a96171caf9835 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py @@ -1,6 +1,6 @@ from datetime import datetime -from datahub.ingestion.source.sql.redshift import redshift_datetime_format +redshift_datetime_format = "%Y-%m-%d %H:%M:%S" class RedshiftQuery: diff --git a/metadata-ingestion/src/datahub/ingestion/source/source_registry.py b/metadata-ingestion/src/datahub/ingestion/source/source_registry.py index 37f088bcd7b50..c3fbab3f9a012 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/source_registry.py +++ b/metadata-ingestion/src/datahub/ingestion/source/source_registry.py @@ -1,6 +1,3 @@ -import warnings - -from datahub.configuration.common import ConfigurationWarning from datahub.ingestion.api.registry import PluginRegistry from datahub.ingestion.api.source import Source @@ -8,15 +5,7 @@ source_registry.register_from_entrypoint("datahub.ingestion.source.plugins") # Deprecations. -source_registry.register_alias( - "redshift-usage", - "redshift-usage-legacy", - lambda: warnings.warn( - "source type redshift-usage is deprecated, use redshift source instead as usage was merged into the main source", - ConfigurationWarning, - stacklevel=3, - ), -) +# source_registry.register_alias(, , ) # The MSSQL source has two possible sets of dependencies. 
We alias # the second to the first so that we maintain the 1:1 mapping between diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/redshift.py b/metadata-ingestion/src/datahub/ingestion/source/sql/redshift.py deleted file mode 100644 index 33d517c8589e9..0000000000000 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/redshift.py +++ /dev/null @@ -1,1198 +0,0 @@ -import logging -from collections import defaultdict -from dataclasses import dataclass, field -from enum import Enum -from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union -from urllib.parse import urlparse - -# These imports verify that the dependencies are available. -import psycopg2 # noqa: F401 -import sqlalchemy -import sqlalchemy_redshift # noqa: F401 -from pydantic.fields import Field -from sqlalchemy import create_engine, inspect -from sqlalchemy.engine import Connection, reflection -from sqlalchemy.engine.reflection import Inspector -from sqlalchemy_redshift.dialect import RedshiftDialect, RelationKey -from sqllineage.runner import LineageRunner - -import datahub.emitter.mce_builder as builder -from datahub.configuration import ConfigModel -from datahub.configuration.source_common import DatasetLineageProviderConfigBase -from datahub.configuration.time_window_config import BaseTimeWindowConfig -from datahub.emitter import mce_builder -from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.api.decorators import ( - SourceCapability, - SupportStatus, - capability, - config_class, - platform_name, - support_status, -) -from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.aws.s3_util import strip_s3_prefix -from datahub.ingestion.source.data_lake_common.path_spec import PathSpec -from datahub.ingestion.source.sql.postgres import BasePostgresConfig -from datahub.ingestion.source.sql.sql_common import ( - SQLAlchemySource, - SQLSourceReport, - SqlWorkUnit, -) - -# TRICKY: it's necessary to import the Postgres source because -# that module has some side effects that we care about here. 
-from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage -from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot -from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent -from datahub.metadata.schema_classes import ( - ChangeTypeClass, - DatasetLineageTypeClass, - DatasetPropertiesClass, - DatasetSnapshotClass, - UpstreamClass, -) - -logger: logging.Logger = logging.getLogger(__name__) - - -class LineageMode(Enum): - SQL_BASED = "sql_based" - STL_SCAN_BASED = "stl_scan_based" - MIXED = "mixed" - - -class LineageCollectorType(Enum): - QUERY_SCAN = "query_scan" - QUERY_SQL_PARSER = "query_sql_parser" - VIEW = "view" - NON_BINDING_VIEW = "non-binding-view" - COPY = "copy" - UNLOAD = "unload" - - -class LineageDatasetPlatform(Enum): - S3 = "s3" - REDSHIFT = "redshift" - - -@dataclass(frozen=True, eq=True) -class LineageDataset: - platform: LineageDatasetPlatform - path: str - - -@dataclass -class LineageItem: - dataset: LineageDataset - upstreams: Set[LineageDataset] - collector_type: LineageCollectorType - dataset_lineage_type: str = field(init=False) - query_parser_failed_sqls: List[str] - - def __post_init__(self): - if self.collector_type == LineageCollectorType.COPY: - self.dataset_lineage_type = DatasetLineageTypeClass.COPY - elif self.collector_type in [ - LineageCollectorType.VIEW, - LineageCollectorType.NON_BINDING_VIEW, - ]: - self.dataset_lineage_type = DatasetLineageTypeClass.VIEW - else: - self.dataset_lineage_type = DatasetLineageTypeClass.TRANSFORMED - - -class S3LineageProviderConfig(ConfigModel): - """ - Any source that produces s3 lineage from/to Datasets should inherit this class. - """ - - path_specs: List[PathSpec] = Field( - description="List of PathSpec. See below the details about PathSpec" - ) - - -class DatasetS3LineageProviderConfigBase(ConfigModel): - """ - Any source that produces s3 lineage from/to Datasets should inherit this class. - """ - - s3_lineage_config: Optional[S3LineageProviderConfig] = Field( - default=None, description="Common config for S3 lineage generation" - ) - - -class RedshiftConfig( - BasePostgresConfig, - BaseTimeWindowConfig, - DatasetLineageProviderConfigBase, - DatasetS3LineageProviderConfigBase, -): - def get_identifier(self, schema: str, table: str) -> str: - regular = f"{schema}.{table}" - if self.database_alias: - return f"{self.database_alias}.{regular}" - if self.database: - return f"{self.database}.{regular}" - return regular - - # Although Amazon Redshift is compatible with Postgres's wire format, - # we actually want to use the sqlalchemy-redshift package and dialect - # because it has better caching behavior. In particular, it queries - # the full table, column, and constraint information in a single larger - # query, and then simply pulls out the relevant information as needed. - # Because of this behavior, it uses dramatically fewer round trips for - # large Redshift warehouses. As an example, see this query for the columns: - # https://github.com/sqlalchemy-redshift/sqlalchemy-redshift/blob/60b4db04c1d26071c291aeea52f1dcb5dd8b0eb0/sqlalchemy_redshift/dialect.py#L745. 
- scheme: str = Field( - default="redshift+psycopg2", - description="", - hidden_from_docs=True, - ) - - default_schema: str = Field( - default="public", - description="The default schema to use if the sql parser fails to parse the schema with `sql_based` lineage collector", - ) - - include_table_lineage: Optional[bool] = Field( - default=True, description="Whether table lineage should be ingested." - ) - include_copy_lineage: Optional[bool] = Field( - default=True, - description="Whether lineage should be collected from copy commands", - ) - include_unload_lineage: Optional[bool] = Field( - default=True, - description="Whether lineage should be collected from unload commands", - ) - capture_lineage_query_parser_failures: Optional[bool] = Field( - default=False, - description="Whether to capture lineage query parser errors with dataset properties for debuggings", - ) - - table_lineage_mode: Optional[LineageMode] = Field( - default=LineageMode.STL_SCAN_BASED, - description="Which table lineage collector mode to use. Available modes are: [stl_scan_based, sql_based, mixed]", - ) - - -# reflection.cache uses eval and other magic to partially rewrite the function. -# mypy can't handle it, so we ignore it for now. -@reflection.cache # type: ignore -def _get_all_table_comments(self, connection, **kw): - COMMENT_SQL = """ - SELECT n.nspname as schema, - c.relname as table_name, - pgd.description as table_comment - FROM pg_catalog.pg_class c - LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace - LEFT JOIN pg_catalog.pg_description pgd ON pgd.objsubid = 0 AND pgd.objoid = c.oid - WHERE c.relkind in ('r', 'v', 'm', 'f', 'p') - AND pgd.description IS NOT NULL - ORDER BY "schema", "table_name"; - """ - - all_table_comments: Dict[RelationKey, str] = {} - - result = connection.execute(COMMENT_SQL) - for table in result: - key = RelationKey(table.table_name, table.schema, connection) - all_table_comments[key] = table.table_comment - - return all_table_comments - - -@reflection.cache # type: ignore -def get_table_comment(self, connection, table_name, schema=None, **kw): - all_table_comments = self._get_all_table_comments(connection, **kw) - key = RelationKey(table_name, schema, connection) - if key not in all_table_comments.keys(): - key = key.unquoted() - return {"text": all_table_comments.get(key)} - - -# gets all the relations for internal schemas and external schemas -# by UNION of internal schemas (excluding namespaces starting with pg_) -# and external schemas -@reflection.cache # type: ignore -def _get_all_relation_info(self, connection, **kw): - result = connection.execute( - """ - SELECT c.relkind, - n.oid AS "schema_oid", - n.nspname AS "schema", - c.oid AS "rel_oid", - c.relname, - CASE c.reldiststyle - WHEN 0 THEN 'EVEN' - WHEN 1 THEN 'KEY' - WHEN 8 THEN 'ALL' - END AS "diststyle", - c.relowner AS "owner_id", - u.usename AS "owner_name", - TRIM(TRAILING ';' FROM pg_catalog.pg_get_viewdef (c.oid,TRUE)) AS "view_definition", - pg_catalog.array_to_string(c.relacl,'\n') AS "privileges" - FROM pg_catalog.pg_class c - LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace - JOIN pg_catalog.pg_user u ON u.usesysid = c.relowner - WHERE c.relkind IN ('r','v','m','S','f') - AND n.nspname !~ '^pg_' - AND n.nspname != 'information_schema' - UNION - SELECT 'r' AS "relkind", - NULL AS "schema_oid", - schemaname AS "schema", - NULL AS "rel_oid", - tablename AS "relname", - NULL AS "diststyle", - NULL AS "owner_id", - NULL AS "owner_name", - NULL AS "view_definition", - NULL AS "privileges" - 
FROM pg_catalog.svv_external_tables - ORDER BY "schema", - "relname";""" - ) - relations = {} - for rel in result: - key = RelationKey(rel.relname, rel.schema, connection) - relations[key] = rel - return relations - - -# workaround to get external tables -# Rewriting some external table types to match redshift type based on -# this redshift-sqlalchemy pull request: -# https://github.com/sqlalchemy-redshift/sqlalchemy-redshift/pull/163/files -# The mapping of external types to redshift types: -# (https://docs.aws.amazon.com/redshift/latest/dg/r_CREATE_EXTERNAL_TABLE.html): -# External type -> Redshift type -# int -> integer -# decimal -> numeric -# char -> character -# float -> real -# double -> float -@reflection.cache # type: ignore -def _get_schema_column_info(self, connection, schema=None, **kw): - schema_clause = "AND schema = '{schema}'".format(schema=schema) if schema else "" - all_columns = defaultdict(list) - - with connection.connect() as cc: - result = cc.execute( - """ - SELECT - n.nspname as "schema", - c.relname as "table_name", - att.attname as "name", - format_encoding(att.attencodingtype::integer) as "encode", - format_type(att.atttypid, att.atttypmod) as "type", - att.attisdistkey as "distkey", - att.attsortkeyord as "sortkey", - att.attnotnull as "notnull", - pg_catalog.col_description(att.attrelid, att.attnum) - as "comment", - adsrc, - attnum, - pg_catalog.format_type(att.atttypid, att.atttypmod), - pg_catalog.pg_get_expr(ad.adbin, ad.adrelid) AS DEFAULT, - n.oid as "schema_oid", - c.oid as "table_oid" - FROM pg_catalog.pg_class c - LEFT JOIN pg_catalog.pg_namespace n - ON n.oid = c.relnamespace - JOIN pg_catalog.pg_attribute att - ON att.attrelid = c.oid - LEFT JOIN pg_catalog.pg_attrdef ad - ON (att.attrelid, att.attnum) = (ad.adrelid, ad.adnum) - WHERE n.nspname !~ '^pg_' - AND att.attnum > 0 - AND NOT att.attisdropped - {schema_clause} - UNION - SELECT - view_schema as "schema", - view_name as "table_name", - col_name as "name", - null as "encode", - col_type as "type", - null as "distkey", - 0 as "sortkey", - null as "notnull", - null as "comment", - null as "adsrc", - null as "attnum", - col_type as "format_type", - null as "default", - null as "schema_oid", - null as "table_oid" - FROM pg_get_late_binding_view_cols() cols( - view_schema name, - view_name name, - col_name name, - col_type varchar, - col_num int) - WHERE 1 {schema_clause} - UNION - SELECT - schemaname as "schema", - tablename as "table_name", - columnname as "name", - null as "encode", - -- Spectrum represents data types differently. - -- Standardize, so we can infer types. 
- CASE - WHEN external_type = 'int' THEN 'integer' - ELSE - regexp_replace( - replace( - replace( - replace( - replace( - replace( - replace(external_type, 'decimal', 'numeric'), - 'varchar', 'character varying'), - 'string', 'character varying'), - 'char(', 'character('), - 'float', 'real'), - 'double', 'float'), - '^array<(.*)>$', '$1[]', 1, 'p') - END AS "type", - null as "distkey", - 0 as "sortkey", - null as "notnull", - null as "comment", - null as "adsrc", - null as "attnum", - CASE - WHEN external_type = 'int' THEN 'integer' - ELSE - regexp_replace( - replace( - replace( - replace( - replace( - replace( - replace(external_type, 'decimal', 'numeric'), - 'varchar', 'character varying'), - 'string', 'character varying'), - 'char(', 'character('), - 'float', 'real'), - 'double', 'float'), - '^array<(.*)>$', '$1[]', 1, 'p') - END AS "format_type", - null as "default", - null as "schema_oid", - null as "table_oid" - FROM SVV_EXTERNAL_COLUMNS - WHERE 1 {schema_clause} - ORDER BY "schema", "table_name", "attnum" - """.format( - schema_clause=schema_clause - ) - ) - for col in result: - key = RelationKey(col.table_name, col.schema, connection) - all_columns[key].append(col) - return dict(all_columns) - - -def _get_external_db_mapping(connection): - # SQL query to get mapping of external schemas in redshift to its external database. - return connection.execute( - """ - select * from svv_external_schemas - """ - ) - - -# This monkey-patching enables us to batch fetch the table descriptions, rather than -# fetching them one at a time. -RedshiftDialect._get_all_table_comments = _get_all_table_comments -RedshiftDialect.get_table_comment = get_table_comment -RedshiftDialect._get_all_relation_info = _get_all_relation_info -RedshiftDialect._get_schema_column_info = _get_schema_column_info - -redshift_datetime_format = "%Y-%m-%d %H:%M:%S" - - -@dataclass -class RedshiftReport(SQLSourceReport): - # https://forums.aws.amazon.com/ann.jspa?annID=9105 - saas_version: str = "" - upstream_lineage: Dict[str, List[str]] = field(default_factory=dict) - - -@platform_name("Redshift") -@config_class(RedshiftConfig) -@support_status(SupportStatus.CERTIFIED) -@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") -@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field") -@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") -@capability(SourceCapability.DESCRIPTIONS, "Enabled by default") -@capability(SourceCapability.LINEAGE_COARSE, "Optionally enabled via configuration") -@capability( - SourceCapability.USAGE_STATS, - "Not provided by this module, use `redshift-usage` for that.", - supported=False, -) -@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") -class RedshiftSource(SQLAlchemySource): - """ - This plugin extracts the following: - - - Metadata for databases, schemas, views and tables - - Column types associated with each table - - Also supports PostGIS extensions - - Table, row, and column statistics via optional SQL profiling - - Table lineage - - :::tip - - You can also get fine-grained usage statistics for Redshift using the `redshift-usage` source described below. - - ::: - - ### Prerequisites - - This source needs to access system tables that require extra permissions. 
- To grant these permissions, please alter your DataHub Redshift user as follows: - ```sql - ALTER USER datahub_user WITH SYSLOG ACCESS UNRESTRICTED; - GRANT SELECT ON pg_catalog.svv_table_info to datahub_user; - GRANT SELECT ON pg_catalog.svl_user_info to datahub_user; - ``` - :::note - - Giving a user unrestricted access to system tables gives the user visibility to data generated by other users. For example, STL_QUERY and STL_QUERYTEXT contain the full text of INSERT, UPDATE, and DELETE statements. - - ::: - - ### Lineage - - There are multiple lineage collector implementations as Redshift does not support table lineage out of the box. - - #### stl_scan_based - The stl_scan based collector uses Redshift's [stl_insert](https://docs.aws.amazon.com/redshift/latest/dg/r_STL_INSERT.html) and [stl_scan](https://docs.aws.amazon.com/redshift/latest/dg/r_STL_SCAN.html) system tables to - discover lineage between tables. - Pros: - - Fast - - Reliable - - Cons: - - Does not work with Spectrum/external tables because those scans do not show up in the stl_scan table. - - If a table depends on a view, the view won't be listed as a dependency; instead, the table will be connected directly to the view's dependencies. - - #### sql_based - The sql_based collector uses Redshift's [stl_insert](https://docs.aws.amazon.com/redshift/latest/dg/r_STL_INSERT.html) to discover all the insert queries - and uses SQL parsing to discover the dependencies. - - Pros: - - Works with Spectrum tables - - Views are connected properly if a table depends on it - - Cons: - - Slow. - - Less reliable, as the query parser can fail on certain queries - - #### mixed - Uses both collectors above, first applying the sql_based collector and then the stl_scan_based one. - - Pros: - - Works with Spectrum tables - - Views are connected properly if a table depends on it - - A bit more reliable than sql_based alone - - Cons: - - Slow - - May be incorrect at times, as the query parser can fail on certain queries - - :::note - - The Redshift stl tables used for data lineage only retain approximately two to five days of log history. This means you cannot extract lineage from queries issued outside that window.
- - ::: - - """ - - eskind_to_platform = {1: "glue", 2: "hive", 3: "postgres", 4: "redshift"} - - def __init__(self, config: RedshiftConfig, ctx: PipelineContext): - super().__init__(config, ctx, "redshift") - self.catalog_metadata: Dict = {} - self.config: RedshiftConfig = config - self._lineage_map: Optional[Dict[str, LineageItem]] = None - self._all_tables_set: Optional[Set[str]] = None - self.report: RedshiftReport = RedshiftReport() - - @classmethod - def create(cls, config_dict, ctx): - config = RedshiftConfig.parse_obj(config_dict) - return cls(config, ctx) - - def get_catalog_metadata(self, conn: Connection) -> None: - try: - catalog_metadata = _get_external_db_mapping(conn) - except Exception as e: - self.error(logger, "external-svv_external_schemas", f"Error was {e}") - return - - db_name = self.get_db_name() - - external_schema_mapping = {} - for rel in catalog_metadata: - if rel.eskind != 1: - logger.debug( - f"Skipping {rel.schemaname} for mapping to external database as currently we only " - f"support glue" - ) - continue - external_schema_mapping[rel.schemaname] = { - "eskind": rel.eskind, - "external_database": rel.databasename, - "esoptions": rel.esoptions, - "esoid": rel.esoid, - "esowner": rel.esowner, - } - self.catalog_metadata[db_name] = external_schema_mapping - - def get_inspectors(self) -> Iterable[Inspector]: - # This method can be overridden in the case that you want to dynamically - # run on multiple databases. - engine = self.get_metadata_engine() - with engine.connect() as conn: - self.get_catalog_metadata(conn) - inspector = inspect(conn) - yield inspector - - def get_metadata_engine(self) -> sqlalchemy.engine.Engine: - url = self.config.get_sql_alchemy_url() - logger.debug(f"sql_alchemy_url={url}") - return create_engine(url, **self.config.options) - - def inspect_version(self) -> Any: - db_engine = self.get_metadata_engine() - logger.info("Checking current version") - for db_row in db_engine.execute("select version()"): - self.report.saas_version = db_row[0] - - def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]: - try: - self.inspect_version() - except Exception as e: - self.report.report_failure("version", f"Error: {e}") - return - - for wu in super().get_workunits_internal(): - yield wu - if ( - isinstance(wu, SqlWorkUnit) - and isinstance(wu.metadata, MetadataChangeEvent) - and isinstance(wu.metadata.proposedSnapshot, DatasetSnapshot) - ): - lineage_mcp = None - lineage_properties_aspect: Optional[DatasetPropertiesClass] = None - - dataset_snapshot: DatasetSnapshotClass = wu.metadata.proposedSnapshot - assert dataset_snapshot - - if self.config.include_table_lineage: - lineage_mcp, lineage_properties_aspect = self.get_lineage_mcp( - wu.metadata.proposedSnapshot.urn - ) - - if lineage_mcp is not None: - yield lineage_mcp.as_workunit() - - if lineage_properties_aspect: - aspects = dataset_snapshot.aspects - if aspects is None: - aspects = [] - - dataset_properties_aspect: Optional[DatasetPropertiesClass] = None - - for aspect in aspects: - if isinstance(aspect, DatasetPropertiesClass): - dataset_properties_aspect = aspect - - if dataset_properties_aspect is None: - dataset_properties_aspect = DatasetPropertiesClass() - aspects.append(dataset_properties_aspect) - - custom_properties = ( - { - **dataset_properties_aspect.customProperties, - **lineage_properties_aspect.customProperties, - } - if dataset_properties_aspect.customProperties - else lineage_properties_aspect.customProperties - ) - 
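The merge that concludes just below simply overlays the lineage-derived custom properties on whatever `customProperties` the dataset already carries. As a standalone sketch of that dict-unpacking merge (values are made up):

```python
from typing import Dict

def merge_custom_properties(
    existing: Dict[str, str], lineage_props: Dict[str, str]
) -> Dict[str, str]:
    # Lineage-derived keys win on conflict, mirroring the unpacking merge above;
    # neither input dict is mutated.
    return {**existing, **lineage_props} if existing else dict(lineage_props)

assert merge_custom_properties(
    {"owner_team": "data-eng"},
    {"lineage_sql_parser_failed_queries": "select ..."},
) == {
    "owner_team": "data-eng",
    "lineage_sql_parser_failed_queries": "select ...",
}
```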
dataset_properties_aspect.customProperties = custom_properties - dataset_snapshot.aspects = aspects - - dataset_snapshot.aspects.append(dataset_properties_aspect) - - def _get_all_tables(self) -> Set[str]: - all_tables_query: str = """ - select - table_schema as schemaname, - table_name as tablename - from - pg_catalog.svv_tables - where - table_type = 'BASE TABLE' - and table_schema not in ('information_schema', 'pg_catalog', 'pg_internal') - union - select - distinct schemaname, - tablename - from - svv_external_tables - union - SELECT - n.nspname AS schemaname - ,c.relname AS tablename - FROM - pg_catalog.pg_class AS c - INNER JOIN - pg_catalog.pg_namespace AS n - ON c.relnamespace = n.oid - WHERE relkind = 'v' - and - n.nspname not in ('pg_catalog', 'information_schema') - - """ - db_name = self.get_db_name() - all_tables_set = set() - - engine = self.get_metadata_engine() - for db_row in engine.execute(all_tables_query): - all_tables_set.add( - f'{db_name}.{db_row["schemaname"]}.{db_row["tablename"]}' - ) - - return all_tables_set - - def _get_sources_from_query(self, db_name: str, query: str) -> List[LineageDataset]: - sources = list() - - parser = LineageRunner(query) - - for table in parser.source_tables: - source_schema, source_table = str(table).split(".") - if source_schema == "": - source_schema = str(self.config.default_schema) - - source = LineageDataset( - platform=LineageDatasetPlatform.REDSHIFT, - path=f"{db_name}.{source_schema}.{source_table}", - ) - sources.append(source) - - return sources - - def get_db_name(self, inspector: Optional[Inspector] = None) -> str: - db_name = self.config.database - db_alias = self.config.database_alias - if db_alias: - db_name = db_alias - assert db_name - return db_name - - def _get_s3_path(self, path: str) -> str: - if self.config.s3_lineage_config: - for path_spec in self.config.s3_lineage_config.path_specs: - if path_spec.allowed(path): - table_name, table_path = path_spec.extract_table_name_and_path(path) - return table_path - return path - - def _build_s3_path_from_row(self, db_row): - path = db_row["filename"].strip() - if urlparse(path).scheme != "s3": - raise ValueError( - f"Only s3 source supported with copy/unload. The source was: {path}" - ) - return strip_s3_prefix(self._get_s3_path(path)) - - def _populate_lineage_map( - self, query: str, lineage_type: LineageCollectorType - ) -> None: - """ - This method generate table level lineage based with the given query. - The query should return the following columns: target_schema, target_table, source_table, source_schema - source_table and source_schema can be omitted if the sql_field is set because then it assumes the source_table - and source_schema will be extracted from the sql_field by sql parsing. - - :param query: The query to run to extract lineage. - :type query: str - :param lineage_type: The way the lineage should be processed - :type lineage_type: LineageType - return: The method does not return with anything as it directly modify the self._lineage_map property. 
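The `_get_sources_from_query` helper above delegates SQL parsing to sqllineage's `LineageRunner`. A minimal sketch of that step, mirroring the call pattern in the removed source (the statement, schema fallback, and database name are made up):

```python
from sqllineage.runner import LineageRunner

# Fabricated statement purely for illustration.
query = "INSERT INTO analytics.daily_orders SELECT * FROM staging.orders"
parser = LineageRunner(query)

for table in parser.source_tables:
    source_schema, source_table = str(table).split(".")
    if source_schema == "":
        source_schema = "public"  # stand-in for config.default_schema
    # The lineage path is then qualified as <database>.<schema>.<table>.
    print(f"dev.{source_schema}.{source_table}")  # "dev" stands in for the db name
```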
- :rtype: None - """ - assert self._lineage_map is not None - - if not self._all_tables_set: - self._all_tables_set = self._get_all_tables() - - engine = self.get_metadata_engine() - - db_name = self.get_db_name() - - try: - for db_row in engine.execute(query): - if lineage_type != LineageCollectorType.UNLOAD: - if not self.config.schema_pattern.allowed( - db_row["target_schema"] - ) or not self.config.table_pattern.allowed(db_row["target_table"]): - continue - - # Target - if lineage_type == LineageCollectorType.UNLOAD: - try: - target_platform = LineageDatasetPlatform.S3 - # Following call requires 'filename' key in db_row - target_path = self._build_s3_path_from_row(db_row) - except ValueError as e: - self.warn(logger, "non-s3-lineage", str(e)) - continue - else: - target_platform = LineageDatasetPlatform.REDSHIFT - target_path = ( - f'{db_name}.{db_row["target_schema"]}.{db_row["target_table"]}' - ) - - target = LineageItem( - dataset=LineageDataset(platform=target_platform, path=target_path), - upstreams=set(), - collector_type=lineage_type, - query_parser_failed_sqls=list(), - ) - - # Source - sources: List[LineageDataset] = list() - if lineage_type in { - lineage_type.QUERY_SQL_PARSER, - lineage_type.NON_BINDING_VIEW, - }: - try: - sources = self._get_sources_from_query( - db_name=db_name, query=db_row["ddl"] - ) - except Exception as e: - target.query_parser_failed_sqls.append(db_row["ddl"]) - self.warn( - logger, - "parsing-query", - f'Error parsing query {db_row["ddl"]} for getting lineage .' - f"\nError was {e}.", - ) - else: - if lineage_type == lineage_type.COPY: - try: - platform = LineageDatasetPlatform.S3 - # Following call requires 'filename' key in db_row - path = self._build_s3_path_from_row(db_row) - except ValueError as e: - self.warn(logger, "non-s3-lineage", str(e)) - continue - else: - platform = LineageDatasetPlatform.REDSHIFT - path = f'{db_name}.{db_row["source_schema"]}.{db_row["source_table"]}' - - sources = [ - LineageDataset( - platform=platform, - path=path, - ) - ] - - for source in sources: - # Filtering out tables which does not exist in Redshift - # It was deleted in the meantime or query parser did not capture well the table name - if ( - source.platform == LineageDatasetPlatform.REDSHIFT - and source.path not in self._all_tables_set - ): - self.warn( - logger, "missing-table", f"{source.path} missing table" - ) - continue - - target.upstreams.add(source) - - # Merging downstreams if dataset already exists and has downstreams - if target.dataset.path in self._lineage_map: - self._lineage_map[ - target.dataset.path - ].upstreams = self._lineage_map[ - target.dataset.path - ].upstreams.union( - target.upstreams - ) - - else: - self._lineage_map[target.dataset.path] = target - - logger.info( - f"Lineage[{target}]:{self._lineage_map[target.dataset.path]}" - ) - - except Exception as e: - self.warn(logger, f"extract-{lineage_type.name}", f"Error was {e}") - - def _populate_lineage(self) -> None: - stl_scan_based_lineage_query: str = """ - select - distinct cluster, - target_schema, - target_table, - username as username, - source_schema, - source_table - from - ( - select - distinct tbl as target_table_id, - sti.schema as target_schema, - sti.table as target_table, - sti.database as cluster, - query, - starttime - from - stl_insert - join SVV_TABLE_INFO sti on - sti.table_id = tbl - where starttime >= '{start_time}' - and starttime < '{end_time}' - and cluster = '{db_name}' - ) as target_tables - join ( ( - select - sui.usename as username, - ss.tbl as 
source_table_id, - sti.schema as source_schema, - sti.table as source_table, - scan_type, - sq.query as query - from - ( - select - distinct userid, - query, - tbl, - type as scan_type - from - stl_scan - ) ss - join SVV_TABLE_INFO sti on - sti.table_id = ss.tbl - left join stl_query sq on - ss.query = sq.query - left join svl_user_info sui on - sq.userid = sui.usesysid - where - sui.usename <> 'rdsdb') - ) as source_tables - using (query) - where - scan_type in (1, 2, 3) - order by cluster, target_schema, target_table, starttime asc - """.format( - # We need the original database name for filtering - db_name=self.config.database, - start_time=self.config.start_time.strftime(redshift_datetime_format), - end_time=self.config.end_time.strftime(redshift_datetime_format), - ) - view_lineage_query = """ - select - distinct - srcnsp.nspname as source_schema - , - srcobj.relname as source_table - , - tgtnsp.nspname as target_schema - , - tgtobj.relname as target_table - from - pg_catalog.pg_class as srcobj - inner join - pg_catalog.pg_depend as srcdep - on - srcobj.oid = srcdep.refobjid - inner join - pg_catalog.pg_depend as tgtdep - on - srcdep.objid = tgtdep.objid - join - pg_catalog.pg_class as tgtobj - on - tgtdep.refobjid = tgtobj.oid - and srcobj.oid <> tgtobj.oid - left outer join - pg_catalog.pg_namespace as srcnsp - on - srcobj.relnamespace = srcnsp.oid - left outer join - pg_catalog.pg_namespace tgtnsp - on - tgtobj.relnamespace = tgtnsp.oid - where - tgtdep.deptype = 'i' - --dependency_internal - and tgtobj.relkind = 'v' - --i=index, v=view, s=sequence - and tgtnsp.nspname not in ('pg_catalog', 'information_schema') - order by target_schema, target_table asc - """ - - list_late_binding_views_query = """ - SELECT - n.nspname AS target_schema - ,c.relname AS target_table - , COALESCE(pg_get_viewdef(c.oid, TRUE), '') AS ddl - FROM - pg_catalog.pg_class AS c - INNER JOIN - pg_catalog.pg_namespace AS n - ON c.relnamespace = n.oid - WHERE relkind = 'v' - and ddl ilike '%%with no schema binding%%' - and - n.nspname not in ('pg_catalog', 'information_schema') - """ - - list_insert_create_queries_sql = """ - select - distinct cluster, - target_schema, - target_table, - username, - querytxt as ddl - from - ( - select - distinct tbl as target_table_id, - sti.schema as target_schema, - sti.table as target_table, - sti.database as cluster, - sui.usename as username, - querytxt, - si.starttime as starttime - from - stl_insert as si - join SVV_TABLE_INFO sti on - sti.table_id = tbl - left join svl_user_info sui on - si.userid = sui.usesysid - left join stl_query sq on - si.query = sq.query - left join stl_load_commits slc on - slc.query = si.query - where - sui.usename <> 'rdsdb' - and sq.aborted = 0 - and slc.query IS NULL - and cluster = '{db_name}' - and si.starttime >= '{start_time}' - and si.starttime < '{end_time}' - ) as target_tables - order by cluster, target_schema, target_table, starttime asc - """.format( - # We need the original database name for filtering - db_name=self.config.database, - start_time=self.config.start_time.strftime(redshift_datetime_format), - end_time=self.config.end_time.strftime(redshift_datetime_format), - ) - - list_copy_commands_sql = """ - select - distinct - "schema" as target_schema, - "table" as target_table, - filename - from - stl_insert as si - join stl_load_commits as c on - si.query = c.query - join SVV_TABLE_INFO sti on - sti.table_id = tbl - where - database = '{db_name}' - and si.starttime >= '{start_time}' - and si.starttime < '{end_time}' - order by 
target_schema, target_table, starttime asc - """.format( - # We need the original database name for filtering - db_name=self.config.database, - start_time=self.config.start_time.strftime(redshift_datetime_format), - end_time=self.config.end_time.strftime(redshift_datetime_format), - ) - - list_unload_commands_sql = """ - select - distinct - sti.database as cluster, - sti.schema as source_schema, - sti."table" as source_table, - unl.path as filename - from - stl_unload_log unl - join stl_scan sc on - sc.query = unl.query and - sc.starttime >= '{start_time}' and - sc.endtime < '{end_time}' - join SVV_TABLE_INFO sti on - sti.table_id = sc.tbl - where - unl.start_time >= '{start_time}' and - unl.end_time < '{end_time}' and - sti.database = '{db_name}' - and sc.type in (1, 2, 3) - order by cluster, source_schema, source_table, filename, unl.start_time asc - """.format( - # We need the original database name for filtering - db_name=self.config.database, - start_time=self.config.start_time.strftime(redshift_datetime_format), - end_time=self.config.end_time.strftime(redshift_datetime_format), - ) - - if not self._lineage_map: - self._lineage_map = defaultdict() - - if self.config.table_lineage_mode == LineageMode.STL_SCAN_BASED: - # Populate table level lineage by getting upstream tables from stl_scan redshift table - self._populate_lineage_map( - query=stl_scan_based_lineage_query, - lineage_type=LineageCollectorType.QUERY_SCAN, - ) - elif self.config.table_lineage_mode == LineageMode.SQL_BASED: - # Populate table level lineage by parsing table creating sqls - self._populate_lineage_map( - query=list_insert_create_queries_sql, - lineage_type=LineageCollectorType.QUERY_SQL_PARSER, - ) - elif self.config.table_lineage_mode == LineageMode.MIXED: - # Populate table level lineage by parsing table creating sqls - self._populate_lineage_map( - query=list_insert_create_queries_sql, - lineage_type=LineageCollectorType.QUERY_SQL_PARSER, - ) - # Populate table level lineage by getting upstream tables from stl_scan redshift table - self._populate_lineage_map( - query=stl_scan_based_lineage_query, - lineage_type=LineageCollectorType.QUERY_SCAN, - ) - - if self.config.include_views: - # Populate table level lineage for views - self._populate_lineage_map( - query=view_lineage_query, lineage_type=LineageCollectorType.VIEW - ) - - # Populate table level lineage for late binding views - self._populate_lineage_map( - query=list_late_binding_views_query, - lineage_type=LineageCollectorType.NON_BINDING_VIEW, - ) - if self.config.include_copy_lineage: - self._populate_lineage_map( - query=list_copy_commands_sql, lineage_type=LineageCollectorType.COPY - ) - if self.config.include_unload_lineage: - self._populate_lineage_map( - query=list_unload_commands_sql, lineage_type=LineageCollectorType.UNLOAD - ) - - def get_lineage_mcp( - self, dataset_urn: str - ) -> Tuple[ - Optional[MetadataChangeProposalWrapper], Optional[DatasetPropertiesClass] - ]: - dataset_key = mce_builder.dataset_urn_to_key(dataset_urn) - if dataset_key is None: - return None, None - - if self._lineage_map is None: - logger.debug("Populating lineage") - self._populate_lineage() - assert self._lineage_map is not None - - upstream_lineage: List[UpstreamClass] = [] - custom_properties: Dict[str, str] = {} - - if dataset_key.name in self._lineage_map: - item = self._lineage_map[dataset_key.name] - if ( - self.config.capture_lineage_query_parser_failures - and item.query_parser_failed_sqls - ): - custom_properties["lineage_sql_parser_failed_queries"] = 
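The dispatch above maps the `table_lineage_mode` config value onto these collectors, with the copy/unload queries gated by their own flags. A hedged sketch of a recipe that exercises those options through the standard pipeline API (connection details are placeholders, and the field names follow the now-removed `RedshiftConfig`):

```python
from datahub.ingestion.run.pipeline import Pipeline

# Placeholder connection values; only the lineage-related options matter here.
pipeline = Pipeline.create(
    {
        "source": {
            "type": "redshift",
            "config": {
                "host_port": "my-cluster.example.com:5439",
                "database": "dev",
                "username": "datahub_user",
                "password": "...",
                "include_table_lineage": True,
                "table_lineage_mode": "mixed",  # assumed enum values: stl_scan_based / sql_based / mixed
                "include_copy_lineage": True,
                "include_unload_lineage": True,
            },
        },
        "sink": {"type": "datahub-rest", "config": {"server": "http://localhost:8080"}},
    }
)
pipeline.run()
pipeline.raise_from_status()
```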
",".join( - item.query_parser_failed_sqls - ) - for upstream in item.upstreams: - upstream_table = UpstreamClass( - dataset=builder.make_dataset_urn_with_platform_instance( - upstream.platform.value, - upstream.path, - platform_instance=self.config.platform_instance_map.get( - upstream.platform.value - ) - if self.config.platform_instance_map - else None, - env=self.config.env, - ), - type=item.dataset_lineage_type, - ) - upstream_lineage.append(upstream_table) - - dataset_params = dataset_key.name.split(".") - db_name = dataset_params[0] - schemaname = dataset_params[1] - tablename = dataset_params[2] - if db_name in self.catalog_metadata: - if schemaname in self.catalog_metadata[db_name]: - external_db_params = self.catalog_metadata[db_name][schemaname] - upstream_platform = self.eskind_to_platform[ - external_db_params["eskind"] - ] - catalog_upstream = UpstreamClass( - mce_builder.make_dataset_urn_with_platform_instance( - upstream_platform, - "{database}.{table}".format( - database=external_db_params["external_database"], - table=tablename, - ), - platform_instance=self.config.platform_instance_map.get( - upstream_platform - ) - if self.config.platform_instance_map - else None, - env=self.config.env, - ), - DatasetLineageTypeClass.COPY, - ) - upstream_lineage.append(catalog_upstream) - - properties = None - if custom_properties: - properties = DatasetPropertiesClass(customProperties=custom_properties) - - if upstream_lineage: - self.report.upstream_lineage[dataset_urn] = [ - u.dataset for u in upstream_lineage - ] - else: - return None, properties - - mcp = MetadataChangeProposalWrapper( - entityType="dataset", - changeType=ChangeTypeClass.UPSERT, - entityUrn=dataset_urn, - aspectName="upstreamLineage", - aspect=UpstreamLineage(upstreams=upstream_lineage), - ) - - return mcp, properties diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/config.py b/metadata-ingestion/src/datahub/ingestion/source/unity/config.py index 7073830318abe..4e3deedddbc43 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/config.py @@ -1,7 +1,7 @@ import logging import os from datetime import datetime, timedelta, timezone -from typing import Any, Dict, Optional +from typing import Any, Dict, List, Optional import pydantic from pydantic import Field @@ -132,6 +132,14 @@ class UnityCatalogSourceConfig( _metastore_id_pattern_removed = pydantic_removed_field("metastore_id_pattern") + catalogs: Optional[List[str]] = pydantic.Field( + default=None, + description=( + "Fixed list of catalogs to ingest." + " If not specified, catalogs will be ingested based on `catalog_pattern`." + ), + ) + catalog_pattern: AllowDenyPattern = Field( default=AllowDenyPattern.allow_all(), description="Regex patterns for catalogs to filter in ingestion. 
Specify regex to match the full `metastore.catalog` name.", diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py index 3fb77ce512ed2..375c76db8e971 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py @@ -112,6 +112,15 @@ def catalogs(self, metastore: Optional[Metastore]) -> Iterable[Catalog]: for catalog in response: yield self._create_catalog(metastore, catalog) + def catalog( + self, catalog_name: str, metastore: Optional[Metastore] + ) -> Optional[Catalog]: + response = self._workspace_client.catalogs.get(catalog_name) + if not response: + logger.info(f"Catalog {catalog_name} not found") + return None + return self._create_catalog(metastore, response) + def schemas(self, catalog: Catalog) -> Iterable[Schema]: response = self._workspace_client.schemas.list(catalog_name=catalog.name) if not response: diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py index b63cf65d55dc8..44b5bbbcb0ceb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py @@ -188,9 +188,10 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: - self.report.report_ingestion_stage_start("Start warehouse") + self.report.report_ingestion_stage_start("Ingestion Setup") wait_on_warehouse = None if self.config.is_profiling_enabled(): + self.report.report_ingestion_stage_start("Start warehouse") # Can take several minutes, so start now and wait later wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse() if wait_on_warehouse is None: @@ -200,8 +201,9 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: ) return - self.report.report_ingestion_stage_start("Ingest service principals") - self.build_service_principal_map() + if self.config.include_ownership: + self.report.report_ingestion_stage_start("Ingest service principals") + self.build_service_principal_map() if self.config.include_notebooks: self.report.report_ingestion_stage_start("Ingest notebooks") yield from self.process_notebooks() @@ -317,7 +319,7 @@ def process_metastores(self) -> Iterable[MetadataWorkUnit]: def process_catalogs( self, metastore: Optional[Metastore] ) -> Iterable[MetadataWorkUnit]: - for catalog in self.unity_catalog_api_proxy.catalogs(metastore=metastore): + for catalog in self._get_catalogs(metastore): if not self.config.catalog_pattern.allowed(catalog.id): self.report.catalogs.dropped(catalog.id) continue @@ -327,6 +329,17 @@ def process_catalogs( self.report.catalogs.processed(catalog.id) + def _get_catalogs(self, metastore: Optional[Metastore]) -> Iterable[Catalog]: + if self.config.catalogs: + for catalog_name in self.config.catalogs: + catalog = self.unity_catalog_api_proxy.catalog( + catalog_name, metastore=metastore + ) + if catalog: + yield catalog + else: + yield from self.unity_catalog_api_proxy.catalogs(metastore=metastore) + def process_schemas(self, catalog: Catalog) -> Iterable[MetadataWorkUnit]: for schema in self.unity_catalog_api_proxy.schemas(catalog=catalog): if not self.config.schema_pattern.allowed(schema.id): @@ -509,6 +522,7 @@ def gen_dataset_urn(self, table_ref: TableReference) -> str: platform=self.platform, 
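The new `_get_catalogs` above lets an explicit `catalogs` list short-circuit the listing call, while the existing `catalog_pattern` filtering still applies downstream. A small standalone sketch of that selection logic (the fetch callables are stand-ins for the proxy methods):

```python
from typing import Iterable, List, Optional


def select_catalogs(
    configured: Optional[List[str]],
    fetch_one,   # stand-in for proxy.catalog(name, metastore=...)
    fetch_all,   # stand-in for proxy.catalogs(metastore=...)
) -> Iterable:
    # Mirrors _get_catalogs: an explicit list means one lookup per name
    # (skipping anything not found); otherwise list everything and let
    # catalog_pattern filtering happen later in process_catalogs.
    if configured:
        for name in configured:
            catalog = fetch_one(name)
            if catalog:
                yield catalog
    else:
        yield from fetch_all()
```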
platform_instance=self.platform_instance_name, name=str(table_ref), + env=self.config.env, ) def gen_notebook_urn(self, notebook: Union[Notebook, NotebookId]) -> str: @@ -576,6 +590,7 @@ def gen_schema_key(self, schema: Schema) -> ContainerKey: instance=self.config.platform_instance, catalog=schema.catalog.name, metastore=schema.catalog.metastore.name, + env=self.config.env, ) else: return UnitySchemaKey( @@ -583,6 +598,7 @@ def gen_schema_key(self, schema: Schema) -> ContainerKey: platform=self.platform, instance=self.config.platform_instance, catalog=schema.catalog.name, + env=self.config.env, ) def gen_metastore_key(self, metastore: Metastore) -> MetastoreKey: @@ -590,6 +606,7 @@ def gen_metastore_key(self, metastore: Metastore) -> MetastoreKey: metastore=metastore.name, platform=self.platform, instance=self.config.platform_instance, + env=self.config.env, ) def gen_catalog_key(self, catalog: Catalog) -> ContainerKey: @@ -600,12 +617,14 @@ def gen_catalog_key(self, catalog: Catalog) -> ContainerKey: metastore=catalog.metastore.name, platform=self.platform, instance=self.config.platform_instance, + env=self.config.env, ) else: return CatalogKey( catalog=catalog.name, platform=self.platform, instance=self.config.platform_instance, + env=self.config.env, ) def _gen_domain_urn(self, dataset_name: str) -> Optional[str]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/redshift_usage.py b/metadata-ingestion/src/datahub/ingestion/source/usage/redshift_usage.py deleted file mode 100644 index 691eaa8211054..0000000000000 --- a/metadata-ingestion/src/datahub/ingestion/source/usage/redshift_usage.py +++ /dev/null @@ -1,397 +0,0 @@ -import collections -import dataclasses -import logging -import time -from datetime import datetime -from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set - -from pydantic.fields import Field -from pydantic.main import BaseModel -from sqlalchemy import create_engine -from sqlalchemy.engine import Engine - -import datahub.emitter.mce_builder as builder -from datahub.configuration.source_common import EnvConfigMixin -from datahub.configuration.time_window_config import get_time_bucket -from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.api.decorators import ( - SourceCapability, - SupportStatus, - capability, - config_class, - platform_name, - support_status, -) -from datahub.ingestion.api.source import Source, SourceReport -from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.sql.redshift import RedshiftConfig -from datahub.ingestion.source.usage.usage_common import ( - BaseUsageConfig, - GenericAggregatedDataset, -) -from datahub.metadata.schema_classes import OperationClass, OperationTypeClass - -logger = logging.getLogger(__name__) - -if TYPE_CHECKING: - try: - from sqlalchemy.engine import Row # type: ignore - except ImportError: - # See https://github.com/python/mypy/issues/1153. 
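The additions above thread `env` into the dataset URNs and container keys, so non-PROD ingestion no longer collapses into the PROD namespace. A small sketch of the effect on the dataset URN (the table name is hypothetical):

```python
from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance

urn = make_dataset_urn_with_platform_instance(
    platform="databricks",
    name="metastore.main.default.quickstart_table",  # made-up table reference
    platform_instance=None,
    env="DEV",
)
print(urn)
# urn:li:dataset:(urn:li:dataPlatform:databricks,metastore.main.default.quickstart_table,DEV)
```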
- from sqlalchemy.engine.result import RowProxy as Row # type: ignore - -REDSHIFT_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S" - - -# Add this join to the sql query for more metrics on completed queries -# LEFT JOIN svl_query_metrics_summary sqms ON ss.query = sqms.query -# Reference: https://docs.aws.amazon.com/redshift/latest/dg/r_SVL_QUERY_METRICS_SUMMARY.html - -# this sql query joins stl_scan over table info, -# querytext, and user info to get usage stats -# using non-LEFT joins here to limit the results to -# queries run by the user on user-defined tables. -REDSHIFT_USAGE_QUERY_TEMPLATE: str = """ -SELECT DISTINCT ss.userid as userid, - ss.query as query, - sui.usename as username, - ss.tbl as tbl, - sq.querytxt as querytxt, - sti.database as database, - sti.schema as schema, - sti.table as table, - sq.starttime as starttime, - sq.endtime as endtime -FROM stl_scan ss - JOIN svv_table_info sti ON ss.tbl = sti.table_id - JOIN stl_query sq ON ss.query = sq.query - JOIN svl_user_info sui ON sq.userid = sui.usesysid -WHERE ss.starttime >= '{start_time}' -AND ss.starttime < '{end_time}' -AND sti.database = '{database}' -AND sq.aborted = 0 -ORDER BY ss.endtime DESC; -""".strip() - -REDSHIFT_OPERATION_ASPECT_QUERY_TEMPLATE: str = """ - (SELECT - DISTINCT si.userid AS userid, - si.query AS query, - si.rows AS rows, - sui.usename AS username, - si.tbl AS tbl, - sq.querytxt AS querytxt, - sti.database AS database, - sti.schema AS schema, - sti.table AS table, - sq.starttime AS starttime, - sq.endtime AS endtime, - 'insert' AS operation_type - FROM - stl_insert si - JOIN svv_table_info sti ON si.tbl = sti.table_id - JOIN stl_query sq ON si.query = sq.query - JOIN svl_user_info sui ON sq.userid = sui.usesysid - WHERE - si.starttime >= '{start_time}' - AND si.starttime < '{end_time}' - AND si.rows > 0 - AND sq.aborted = 0) -UNION - (SELECT - DISTINCT sd.userid AS userid, - sd.query AS query, - sd.rows AS ROWS, - sui.usename AS username, - sd.tbl AS tbl, - sq.querytxt AS querytxt, - sti.database AS database, - sti.schema AS schema, - sti.table AS table, - sq.starttime AS starttime, - sq.endtime AS endtime, - 'delete' AS operation_type - FROM - stl_delete sd - JOIN svv_table_info sti ON sd.tbl = sti.table_id - JOIN stl_query sq ON sd.query = sq.query - JOIN svl_user_info sui ON sq.userid = sui.usesysid - WHERE - sd.starttime >= '{start_time}' - AND sd.starttime < '{end_time}' - AND sd.rows > 0 - AND sq.aborted = 0) -ORDER BY - endtime DESC -""".strip() - -RedshiftTableRef = str -AggregatedDataset = GenericAggregatedDataset[RedshiftTableRef] -AggregatedAccessEvents = Dict[datetime, Dict[RedshiftTableRef, AggregatedDataset]] - - -class RedshiftAccessEvent(BaseModel): - userid: int - username: str - query: int - tbl: int - text: Optional[str] = Field(None, alias="querytxt") - database: str - schema_: str = Field(alias="schema") - table: str - operation_type: Optional[str] = None - starttime: datetime - endtime: datetime - - -class RedshiftUsageConfig(RedshiftConfig, BaseUsageConfig, EnvConfigMixin): - email_domain: str = Field( - description="Email domain of your organisation so users can be displayed on UI appropriately." - ) - options: Dict = Field( - default={}, - description="Any options specified here will be passed to SQLAlchemy's create_engine as kwargs." 
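Each row returned by the usage query above is parsed into the `RedshiftAccessEvent` model defined in this file, with pydantic aliases mapping `querytxt` to `text` and `schema` to `schema_`. A hedged sketch with a fabricated row (assuming pydantic v1 alias population, as the model relies on):

```python
from datetime import datetime

# Fabricated row shaped like the usage query output; keys use the SQL column
# names, which are the pydantic aliases.
row = {
    "userid": 100,
    "username": "analyst",
    "query": 12345,
    "tbl": 67890,
    "querytxt": "SELECT * FROM public.orders",
    "database": "dev",
    "schema": "public",
    "table": "orders",
    "starttime": datetime(2023, 10, 1, 12, 0, 0),
    "endtime": datetime(2023, 10, 1, 12, 0, 5),
}

event = RedshiftAccessEvent(**row)  # model defined above in this file
assert event.schema_ == "public" and event.text.startswith("SELECT")
```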
- "See https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine for details.", - ) - - def get_sql_alchemy_url(self): - return super().get_sql_alchemy_url() - - -@dataclasses.dataclass -class RedshiftUsageSourceReport(SourceReport): - filtered: Set[str] = dataclasses.field(default_factory=set) - num_usage_workunits_emitted: Optional[int] = None - num_operational_stats_workunits_emitted: Optional[int] = None - - def report_dropped(self, key: str) -> None: - self.filtered.add(key) - - -@platform_name("Redshift") -@config_class(RedshiftUsageConfig) -@support_status(SupportStatus.CERTIFIED) -@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") -class RedshiftUsageSource(Source): - """ - This plugin extracts usage statistics for datasets in Amazon Redshift. - - Note: Usage information is computed by querying the following system tables - - 1. stl_scan - 2. svv_table_info - 3. stl_query - 4. svl_user_info - - To grant access this plugin for all system tables, please alter your datahub Redshift user the following way: - ```sql - ALTER USER datahub_user WITH SYSLOG ACCESS UNRESTRICTED; - ``` - This plugin has the below functionalities - - 1. For a specific dataset this plugin ingests the following statistics - - 1. top n queries. - 2. top users. - 2. Aggregation of these statistics into buckets, by day or hour granularity. - - :::note - - This source only does usage statistics. To get the tables, views, and schemas in your Redshift warehouse, ingest using the `redshift` source described above. - - ::: - - :::note - - Redshift system tables have some latency in getting data from queries. In addition, these tables only maintain logs for 2-5 days. You can find more information from the official documentation [here](https://aws.amazon.com/premiumsupport/knowledge-center/logs-redshift-database-cluster/). - - ::: - - """ - - def __init__(self, config: RedshiftUsageConfig, ctx: PipelineContext): - super().__init__(ctx) - self.config: RedshiftUsageConfig = config - self.report: RedshiftUsageSourceReport = RedshiftUsageSourceReport() - - @classmethod - def create(cls, config_dict: Dict, ctx: PipelineContext) -> "RedshiftUsageSource": - config = RedshiftUsageConfig.parse_obj(config_dict) - return cls(config, ctx) - - def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: - """Gets Redshift usage stats as work units""" - engine: Engine = self._make_sql_engine() - if self.config.include_operational_stats: - # Generate operation aspect workunits - yield from self._gen_operation_aspect_workunits(engine) - - # Generate aggregate events - query: str = REDSHIFT_USAGE_QUERY_TEMPLATE.format( - start_time=self.config.start_time.strftime(REDSHIFT_DATETIME_FORMAT), - end_time=self.config.end_time.strftime(REDSHIFT_DATETIME_FORMAT), - database=self.config.database, - ) - access_events_iterable: Iterable[ - RedshiftAccessEvent - ] = self._gen_access_events_from_history_query(query, engine) - - aggregated_events: AggregatedAccessEvents = self._aggregate_access_events( - access_events_iterable - ) - # Generate usage workunits from aggregated events. 
- self.report.num_usage_workunits_emitted = 0 - for time_bucket in aggregated_events.values(): - for aggregate in time_bucket.values(): - yield self._make_usage_stat(aggregate) - self.report.num_usage_workunits_emitted += 1 - - def _gen_operation_aspect_workunits( - self, engine: Engine - ) -> Iterable[MetadataWorkUnit]: - # Generate access events - query: str = REDSHIFT_OPERATION_ASPECT_QUERY_TEMPLATE.format( - start_time=self.config.start_time.strftime(REDSHIFT_DATETIME_FORMAT), - end_time=self.config.end_time.strftime(REDSHIFT_DATETIME_FORMAT), - ) - access_events_iterable: Iterable[ - RedshiftAccessEvent - ] = self._gen_access_events_from_history_query(query, engine) - - # Generate operation aspect work units from the access events - yield from self._gen_operation_aspect_workunits_from_access_events( - access_events_iterable - ) - - def _make_sql_engine(self) -> Engine: - url: str = self.config.get_sql_alchemy_url() - logger.debug(f"sql_alchemy_url = {url}") - return create_engine(url, **self.config.options) - - def _should_process_row(self, row: "Row") -> bool: - # Check for mandatory proerties being present first. - missing_props: List[str] = [ - prop - for prop in ["database", "schema", "table", "username"] - if not row[prop] - ] - if missing_props: - logging.info( - f"Access event parameter(s):[{','.join(missing_props)}] missing. Skipping ...." - ) - return False - # Check schema/table allow/deny patterns - full_table_name: str = f"{row['database']}.{row['schema']}.{row['table']}" - if not self.config.schema_pattern.allowed(row["schema"]): - logger.debug(f"Filtering out {full_table_name} due to schema_pattern.") - self.report.report_dropped(full_table_name) - return False - if not self.config.table_pattern.allowed(full_table_name): - logger.debug(f"Filtering out {full_table_name} due to table_pattern.") - self.report.report_dropped(full_table_name) - return False - # Passed all checks. - return True - - def _gen_access_events_from_history_query( - self, query: str, engine: Engine - ) -> Iterable[RedshiftAccessEvent]: - results = engine.execute(query) - for row in results: - if not self._should_process_row(row): - continue - row = row._asdict() - access_event = RedshiftAccessEvent(**dict(row.items())) - # Replace database name with the alias name if one is provided in the config. 
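`_should_process_row` above drops rows whose schema or fully qualified table name is excluded by the configured allow/deny patterns. A small sketch of that pattern API with made-up patterns and table names:

```python
from datahub.configuration.common import AllowDenyPattern

schema_pattern = AllowDenyPattern(deny=["information_schema", "pg_catalog"])
table_pattern = AllowDenyPattern(allow=["dev\\.public\\..*"])

full_table_name = "dev.public.orders"
if schema_pattern.allowed("public") and table_pattern.allowed(full_table_name):
    print(f"keep {full_table_name}")
else:
    print(f"drop {full_table_name}")  # dropped rows are also reported as filtered
```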
- if self.config.database_alias: - access_event.database = self.config.database_alias - yield access_event - - def _gen_operation_aspect_workunits_from_access_events( - self, - events_iterable: Iterable[RedshiftAccessEvent], - ) -> Iterable[MetadataWorkUnit]: - self.report.num_operational_stats_workunits_emitted = 0 - for event in events_iterable: - if not ( - event.database - and event.username - and event.schema_ - and event.table - and event.endtime - and event.operation_type - ): - continue - - assert event.operation_type in ["insert", "delete"] - - resource: str = f"{event.database}.{event.schema_}.{event.table}" - reported_time: int = int(time.time() * 1000) - last_updated_timestamp: int = int(event.endtime.timestamp() * 1000) - user_email: str = event.username - operation_aspect = OperationClass( - timestampMillis=reported_time, - lastUpdatedTimestamp=last_updated_timestamp, - actor=builder.make_user_urn(user_email.split("@")[0]), - operationType=( - OperationTypeClass.INSERT - if event.operation_type == "insert" - else OperationTypeClass.DELETE - ), - ) - yield MetadataChangeProposalWrapper( - entityUrn=builder.make_dataset_urn_with_platform_instance( - "redshift", - resource.lower(), - self.config.platform_instance, - self.config.env, - ), - aspect=operation_aspect, - ).as_workunit() - self.report.num_operational_stats_workunits_emitted += 1 - - def _aggregate_access_events( - self, events_iterable: Iterable[RedshiftAccessEvent] - ) -> AggregatedAccessEvents: - datasets: AggregatedAccessEvents = collections.defaultdict(dict) - for event in events_iterable: - floored_ts: datetime = get_time_bucket( - event.starttime, self.config.bucket_duration - ) - resource: str = f"{event.database}.{event.schema_}.{event.table}" - # Get a reference to the bucket value(or initialize not yet in dict) and update it. 
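The loop above turns each qualifying insert/delete row into an Operation aspect keyed by the statement's end time. A minimal sketch of building one such proposal with made-up values (the classes and URN helpers are those already imported by this module):

```python
import time

import datahub.emitter.mce_builder as builder
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import OperationClass, OperationTypeClass

operation = OperationClass(
    timestampMillis=int(time.time() * 1000),     # reporting time
    lastUpdatedTimestamp=1696161605000,          # made-up statement end time, ms
    actor=builder.make_user_urn("analyst"),
    operationType=OperationTypeClass.INSERT,
)

mcp = MetadataChangeProposalWrapper(
    entityUrn=builder.make_dataset_urn("redshift", "dev.public.orders"),
    aspect=operation,
)
```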
- agg_bucket: AggregatedDataset = datasets[floored_ts].setdefault( - resource, - AggregatedDataset( - bucket_start_time=floored_ts, - resource=resource, - ), - ) - # current limitation in user stats UI, we need to provide email to show users - user_email: str = f"{event.username if event.username else 'unknown'}" - if "@" not in user_email: - user_email += f"@{self.config.email_domain}" - logger.info(f"user_email: {user_email}") - agg_bucket.add_read_entry( - user_email, - event.text, - [], # TODO: not currently supported by redshift; find column level changes - user_email_pattern=self.config.user_email_pattern, - ) - return datasets - - def _make_usage_stat(self, agg: AggregatedDataset) -> MetadataWorkUnit: - return agg.make_usage_workunit( - self.config.bucket_duration, - lambda resource: builder.make_dataset_urn_with_platform_instance( - "redshift", - resource.lower(), - self.config.platform_instance, - self.config.env, - ), - self.config.top_n_queries, - self.config.format_sql_queries, - self.config.include_top_n_queries, - self.config.queries_character_limit, - ) - - def get_report(self) -> RedshiftUsageSourceReport: - return self.report diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py index 2e9a15063661e..ccc4e115729a2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py @@ -59,7 +59,7 @@ class BaseSnowflakeConfig(ConfigModel): ) private_key: Optional[str] = pydantic.Field( default=None, - description="Private key in a form of '-----BEGIN PRIVATE KEY-----\\nprivate-key\\n-----END PRIVATE KEY-----\\n' if using key pair authentication. Encrypted version of private key will be in a form of '-----BEGIN ENCRYPTED PRIVATE KEY-----\\nencrypted-private-key\\n-----END ECNCRYPTED PRIVATE KEY-----\\n' See: https://docs.snowflake.com/en/user-guide/key-pair-auth.html", + description="Private key in a form of '-----BEGIN PRIVATE KEY-----\\nprivate-key\\n-----END PRIVATE KEY-----\\n' if using key pair authentication. Encrypted version of private key will be in a form of '-----BEGIN ENCRYPTED PRIVATE KEY-----\\nencrypted-private-key\\n-----END ENCRYPTED PRIVATE KEY-----\\n' See: https://docs.snowflake.com/en/user-guide/key-pair-auth.html", ) private_key_path: Optional[str] = pydantic.Field( diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py index d1209f3ec7b75..efe2d26aae3d9 100644 --- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py @@ -360,8 +360,12 @@ def get_urn_for_table(self, table: _TableName, lower: bool = False) -> str: table_name = ".".join( filter(None, [table.database, table.db_schema, table.table]) ) + + platform_instance = self.platform_instance + if lower: table_name = table_name.lower() + platform_instance = platform_instance.lower() if platform_instance else None if self.platform == "bigquery": # Normalize shard numbers and other BigQuery weirdness. 
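The `sqlglot_lineage.py` change above lowercases the platform instance together with the table name whenever `lower=True`, so case-variant SQL resolves to one URN instead of two. A small sketch of the effect (the instance and table names are made up):

```python
from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance

table_name = "Dev.Public.Orders"
platform_instance = "MyWarehouse"  # made-up platform instance

lower = True
if lower:
    table_name = table_name.lower()
    platform_instance = platform_instance.lower() if platform_instance else None

urn = make_dataset_urn_with_platform_instance(
    platform="redshift",
    platform_instance=platform_instance,
    env="PROD",
    name=table_name,
)
# ...,mywarehouse.dev.public.orders,... rather than a mixed-case variant
```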
@@ -372,7 +376,7 @@ def get_urn_for_table(self, table: _TableName, lower: bool = False) -> str: urn = make_dataset_urn_with_platform_instance( platform=self.platform, - platform_instance=self.platform_instance, + platform_instance=platform_instance, env=self.env, name=table_name, ) diff --git a/metadata-ingestion/tests/integration/unity/unity_catalog_mces_golden.json b/metadata-ingestion/tests/integration/unity/unity_catalog_mces_golden.json index 0c14096345d7e..2e92215d70b99 100644 --- a/metadata-ingestion/tests/integration/unity/unity_catalog_mces_golden.json +++ b/metadata-ingestion/tests/integration/unity/unity_catalog_mces_golden.json @@ -8,6 +8,7 @@ "json": { "customProperties": { "platform": "databricks", + "env": "PROD", "metastore": "acryl metastore" }, "externalUrl": "https://dummy.cloud.databricks.com/explore/data", @@ -16,7 +17,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -31,7 +33,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -46,7 +49,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -63,7 +67,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -87,7 +92,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -102,7 +108,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -114,6 +121,7 @@ "json": { "customProperties": { "platform": "databricks", + "env": "PROD", "metastore": "acryl metastore", "catalog": "main" }, @@ -124,7 +132,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -139,7 +148,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -154,7 +164,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -171,7 +182,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -195,7 +207,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -210,7 +223,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -230,7 +244,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -242,6 +257,7 @@ "json": { "customProperties": { "platform": "databricks", + "env": "PROD", "metastore": "acryl metastore", "catalog": "main", "unity_schema": "default" @@ -253,7 +269,8 @@ }, 
"systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -268,7 +285,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -283,7 +301,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -300,7 +319,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -324,7 +344,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -339,7 +360,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -363,7 +385,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -378,7 +401,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -420,7 +444,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -437,7 +462,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -494,7 +520,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -516,7 +543,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -540,7 +568,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -568,7 +597,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -580,6 +610,7 @@ "json": { "customProperties": { "platform": "databricks", + "env": "PROD", "metastore": "acryl metastore", "catalog": "main", "unity_schema": "information_schema" @@ -591,7 +622,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -606,7 +638,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -621,7 +654,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -638,7 +672,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -662,7 +697,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": 
"unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -677,7 +713,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -701,7 +738,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -716,7 +754,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -758,7 +797,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -775,7 +815,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -832,7 +873,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -854,7 +896,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -878,7 +921,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -906,7 +950,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -918,6 +963,7 @@ "json": { "customProperties": { "platform": "databricks", + "env": "PROD", "metastore": "acryl metastore", "catalog": "main", "unity_schema": "quickstart_schema" @@ -929,7 +975,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -944,7 +991,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -959,7 +1007,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -976,7 +1025,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1000,7 +1050,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1015,7 +1066,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1039,7 +1091,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1054,7 +1107,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1096,7 +1150,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1113,7 +1168,8 
@@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1170,7 +1226,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1192,7 +1249,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1216,7 +1274,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1244,7 +1303,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1256,6 +1316,7 @@ "json": { "customProperties": { "platform": "databricks", + "env": "PROD", "metastore": "acryl metastore", "catalog": "quickstart_catalog" }, @@ -1266,7 +1327,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1281,7 +1343,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1296,7 +1359,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1313,7 +1377,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1337,7 +1402,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1352,7 +1418,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1372,7 +1439,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1384,6 +1452,7 @@ "json": { "customProperties": { "platform": "databricks", + "env": "PROD", "metastore": "acryl metastore", "catalog": "quickstart_catalog", "unity_schema": "default" @@ -1395,7 +1464,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1410,7 +1480,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1425,7 +1496,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1442,7 +1514,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1466,7 +1539,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1481,7 +1555,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": 
"unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1505,7 +1580,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1520,7 +1596,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1562,7 +1639,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1579,7 +1657,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1636,7 +1715,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1658,7 +1738,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1682,7 +1763,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1710,7 +1792,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1722,6 +1805,7 @@ "json": { "customProperties": { "platform": "databricks", + "env": "PROD", "metastore": "acryl metastore", "catalog": "quickstart_catalog", "unity_schema": "information_schema" @@ -1733,7 +1817,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1748,7 +1833,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1763,7 +1849,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1780,7 +1867,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1804,7 +1892,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1819,7 +1908,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1843,7 +1933,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1858,7 +1949,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1900,7 +1992,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1917,7 +2010,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": 
"unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1974,7 +2068,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1996,7 +2091,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2020,7 +2116,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2048,7 +2145,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2060,6 +2158,7 @@ "json": { "customProperties": { "platform": "databricks", + "env": "PROD", "metastore": "acryl metastore", "catalog": "quickstart_catalog", "unity_schema": "quickstart_schema" @@ -2071,7 +2170,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2086,7 +2186,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2101,7 +2202,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2118,7 +2220,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2142,7 +2245,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2157,7 +2261,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2181,7 +2286,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2196,7 +2302,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2238,7 +2345,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2255,7 +2363,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2312,7 +2421,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2334,7 +2444,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2358,7 +2469,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2386,7 +2498,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": 
"no-run-id-provided" } }, { @@ -2398,6 +2511,7 @@ "json": { "customProperties": { "platform": "databricks", + "env": "PROD", "metastore": "acryl metastore", "catalog": "system" }, @@ -2408,7 +2522,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2423,7 +2538,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2438,7 +2554,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2455,7 +2572,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2479,7 +2597,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2494,7 +2613,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2514,7 +2634,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2526,6 +2647,7 @@ "json": { "customProperties": { "platform": "databricks", + "env": "PROD", "metastore": "acryl metastore", "catalog": "system", "unity_schema": "default" @@ -2537,7 +2659,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2552,7 +2675,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2567,7 +2691,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2584,7 +2709,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2608,7 +2734,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2623,7 +2750,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2647,7 +2775,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2662,7 +2791,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2704,7 +2834,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2721,7 +2852,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2778,7 +2910,8 @@ }, "systemMetadata": { "lastObserved": 
1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2800,7 +2933,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2824,7 +2958,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2852,7 +2987,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2864,6 +3000,7 @@ "json": { "customProperties": { "platform": "databricks", + "env": "PROD", "metastore": "acryl metastore", "catalog": "system", "unity_schema": "information_schema" @@ -2875,7 +3012,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2890,7 +3028,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2905,7 +3044,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2922,7 +3062,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2946,7 +3087,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2961,7 +3103,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2985,7 +3128,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3000,7 +3144,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3042,7 +3187,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3059,7 +3205,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3116,7 +3263,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3138,7 +3286,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3162,7 +3311,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3190,7 +3340,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3202,6 +3353,7 @@ "json": { "customProperties": { "platform": "databricks", + "env": "PROD", "metastore": "acryl 
metastore", "catalog": "system", "unity_schema": "quickstart_schema" @@ -3213,7 +3365,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3228,7 +3381,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3243,7 +3397,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3260,7 +3415,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3284,7 +3440,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3299,7 +3456,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3323,7 +3481,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3338,7 +3497,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3380,7 +3540,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3397,7 +3558,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3454,7 +3616,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3476,7 +3639,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3500,7 +3664,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3528,7 +3693,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3543,7 +3709,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3558,7 +3725,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3573,7 +3741,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3588,7 +3757,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3603,7 +3773,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": 
"no-run-id-provided" } }, { @@ -3618,7 +3789,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3633,7 +3805,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3648,7 +3821,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3663,7 +3837,8 @@ }, "systemMetadata": { "lastObserved": 1638860400000, - "runId": "unity-catalog-test" + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_schemaresolver.py b/metadata-ingestion/tests/unit/sql_parsing/test_schemaresolver.py new file mode 100644 index 0000000000000..3fd5d72b4d41a --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/test_schemaresolver.py @@ -0,0 +1,33 @@ +from datahub.utilities.sqlglot_lineage import SchemaResolver, _TableName + + +def test_get_urn_for_table_lowercase(): + schema_resolver = SchemaResolver( + platform="mssql", + platform_instance="Uppercased-Instance", + env="PROD", + graph=None, + ) + + table = _TableName(database="Database", db_schema="DataSet", table="Table") + + assert ( + schema_resolver.get_urn_for_table(table=table, lower=True) + == "urn:li:dataset:(urn:li:dataPlatform:mssql,uppercased-instance.database.dataset.table,PROD)" + ) + + +def test_get_urn_for_table_not_lower_should_keep_capital_letters(): + schema_resolver = SchemaResolver( + platform="mssql", + platform_instance="Uppercased-Instance", + env="PROD", + graph=None, + ) + + table = _TableName(database="Database", db_schema="DataSet", table="Table") + + assert ( + schema_resolver.get_urn_for_table(table=table, lower=False) + == "urn:li:dataset:(urn:li:dataPlatform:mssql,Uppercased-Instance.Database.DataSet.Table,PROD)" + ) diff --git a/metadata-io/build.gradle b/metadata-io/build.gradle index 4b36f533476f7..48f80f06d07c2 100644 --- a/metadata-io/build.gradle +++ b/metadata-io/build.gradle @@ -22,13 +22,18 @@ dependencies { implementation externalDependency.guava implementation externalDependency.reflections implementation externalDependency.jsonPatch - api externalDependency.dgraph4j exclude group: 'com.google.guava', module: 'guava' + api(externalDependency.dgraph4j) { + exclude group: 'com.google.guava', module: 'guava' + exclude group: 'io.grpc', module: 'grpc-protobuf' + } implementation externalDependency.slf4jApi runtimeOnly externalDependency.logbackClassic compileOnly externalDependency.lombok implementation externalDependency.commonsCollections api externalDependency.datastaxOssNativeProtocol - api externalDependency.datastaxOssCore + api(externalDependency.datastaxOssCore) { + exclude group: 'com.fasterxml.jackson.core' + } api externalDependency.datastaxOssQueryBuilder api externalDependency.elasticSearchRest api externalDependency.elasticSearchJava @@ -101,6 +106,9 @@ dependencies { implementation(externalDependency.snappy) { because("previous versions are vulnerable to CVE-2023-34453 through CVE-2023-34455") } + implementation(externalDependency.grpcProtobuf) { + because("CVE-2023-1428, CVE-2023-32731") + } } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java 
b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java index 024cf2b0abec2..9b43642d7621c 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java @@ -142,11 +142,11 @@ public AutoCompleteResult autoComplete(@Nonnull String entityName, @Nonnull Stri @Nonnull @Override - public Map aggregateByValue(@Nullable String entityName, @Nonnull String field, + public Map aggregateByValue(@Nullable List entityNames, @Nonnull String field, @Nullable Filter requestParams, int limit) { - log.debug("Aggregating by value: {}, field: {}, requestParams: {}, limit: {}", entityName, field, requestParams, - limit); - return esSearchDAO.aggregateByValue(entityName, field, requestParams, limit); + log.debug("Aggregating by value: {}, field: {}, requestParams: {}, limit: {}", entityNames.toString(), field, + requestParams, limit); + return esSearchDAO.aggregateByValue(entityNames, field, requestParams, limit); } @Nonnull diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java index 290e8c60deb00..960a5b38826b1 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java @@ -31,6 +31,7 @@ import java.util.Map; import java.util.Optional; import java.util.stream.Collectors; +import java.util.stream.Stream; import javax.annotation.Nonnull; import javax.annotation.Nullable; import lombok.RequiredArgsConstructor; @@ -263,17 +264,16 @@ public AutoCompleteResult autoComplete(@Nonnull String entityName, @Nonnull Stri * @return */ @Nonnull - public Map aggregateByValue(@Nullable String entityName, @Nonnull String field, + public Map aggregateByValue(@Nullable List entityNames, @Nonnull String field, @Nullable Filter requestParams, int limit) { final SearchRequest searchRequest = SearchRequestHandler.getAggregationRequest(field, transformFilterForEntities(requestParams, indexConvention), limit); - String indexName; - if (entityName == null) { - indexName = indexConvention.getAllEntityIndicesPattern(); + if (entityNames == null) { + String indexName = indexConvention.getAllEntityIndicesPattern(); + searchRequest.indices(indexName); } else { - EntitySpec entitySpec = entityRegistry.getEntitySpec(entityName); - indexName = indexConvention.getIndexName(entitySpec); + Stream stream = entityNames.stream().map(entityRegistry::getEntitySpec).map(indexConvention::getIndexName); + searchRequest.indices(stream.toArray(String[]::new)); } - searchRequest.indices(indexName); try (Timer.Context ignored = MetricUtils.timer(this.getClass(), "aggregateByValue_search").time()) { final SearchResponse searchResponse = client.search(searchRequest, RequestOptions.DEFAULT); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/TestEntityTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/search/TestEntityTestBase.java index d358c03c612d0..a4c359b3595c2 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/TestEntityTestBase.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/TestEntityTestBase.java @@ -3,6 +3,7 @@ import com.datahub.test.Snapshot; import com.fasterxml.jackson.databind.node.JsonNodeFactory; import 
com.fasterxml.jackson.databind.node.ObjectNode; +import com.google.common.collect.ImmutableList; import com.linkedin.common.urn.TestEntityUrn; import com.linkedin.common.urn.Urn; import com.linkedin.data.schema.annotation.PathSpecBasedSchemaAnnotationVisitor; @@ -99,7 +100,7 @@ public void testElasticSearchServiceStructuredQuery() throws Exception { BrowseResult browseResult = _elasticSearchService.browse(ENTITY_NAME, "", null, 0, 10); assertEquals(browseResult.getMetadata().getTotalNumEntities().longValue(), 0); assertEquals(_elasticSearchService.docCount(ENTITY_NAME), 0); - assertEquals(_elasticSearchService.aggregateByValue(ENTITY_NAME, "textField", null, 10).size(), 0); + assertEquals(_elasticSearchService.aggregateByValue(ImmutableList.of(ENTITY_NAME), "textField", null, 10).size(), 0); Urn urn = new TestEntityUrn("test", "urn1", "VALUE_1"); ObjectNode document = JsonNodeFactory.instance.objectNode(); @@ -124,7 +125,7 @@ public void testElasticSearchServiceStructuredQuery() throws Exception { assertEquals(browseResult.getMetadata().getTotalNumEntities().longValue(), 1); assertEquals(browseResult.getGroups().get(0).getName(), "b"); assertEquals(_elasticSearchService.docCount(ENTITY_NAME), 1); - assertEquals(_elasticSearchService.aggregateByValue(ENTITY_NAME, "textFieldOverride", null, 10), + assertEquals(_elasticSearchService.aggregateByValue(ImmutableList.of(ENTITY_NAME), "textFieldOverride", null, 10), ImmutableMap.of("textFieldOverride", 1L)); Urn urn2 = new TestEntityUrn("test2", "urn2", "VALUE_2"); @@ -147,7 +148,7 @@ public void testElasticSearchServiceStructuredQuery() throws Exception { assertEquals(browseResult.getMetadata().getTotalNumEntities().longValue(), 1); assertEquals(browseResult.getGroups().get(0).getName(), "b"); assertEquals(_elasticSearchService.docCount(ENTITY_NAME), 2); - assertEquals(_elasticSearchService.aggregateByValue(ENTITY_NAME, "textFieldOverride", null, 10), + assertEquals(_elasticSearchService.aggregateByValue(ImmutableList.of(ENTITY_NAME), "textFieldOverride", null, 10), ImmutableMap.of("textFieldOverride", 1L, "textFieldOverride2", 1L)); _elasticSearchService.deleteDocument(ENTITY_NAME, urn.toString()); @@ -158,7 +159,7 @@ public void testElasticSearchServiceStructuredQuery() throws Exception { browseResult = _elasticSearchService.browse(ENTITY_NAME, "", null, 0, 10); assertEquals(browseResult.getMetadata().getTotalNumEntities().longValue(), 0); assertEquals(_elasticSearchService.docCount(ENTITY_NAME), 0); - assertEquals(_elasticSearchService.aggregateByValue(ENTITY_NAME, "textField", null, 10).size(), 0); + assertEquals(_elasticSearchService.aggregateByValue(ImmutableList.of(ENTITY_NAME), "textField", null, 10).size(), 0); } @Test @@ -181,7 +182,7 @@ public void testElasticSearchServiceFulltext() throws Exception { assertEquals(searchResult.getEntities().get(0).getEntity(), urn); assertEquals(_elasticSearchService.docCount(ENTITY_NAME), 1); - assertEquals(_elasticSearchService.aggregateByValue(ENTITY_NAME, "textFieldOverride", null, 10), + assertEquals(_elasticSearchService.aggregateByValue(ImmutableList.of(ENTITY_NAME), "textFieldOverride", null, 10), ImmutableMap.of("textFieldOverride", 1L)); Urn urn2 = new TestEntityUrn("test2", "urn2", "VALUE_2"); @@ -198,7 +199,7 @@ public void testElasticSearchServiceFulltext() throws Exception { assertEquals(searchResult.getEntities().get(0).getEntity(), urn2); assertEquals(_elasticSearchService.docCount(ENTITY_NAME), 2); - assertEquals(_elasticSearchService.aggregateByValue(ENTITY_NAME, "textFieldOverride", 
null, 10), + assertEquals(_elasticSearchService.aggregateByValue(ImmutableList.of(ENTITY_NAME), "textFieldOverride", null, 10), ImmutableMap.of("textFieldOverride", 1L, "textFieldOverride2", 1L)); _elasticSearchService.deleteDocument(ENTITY_NAME, urn.toString()); @@ -208,6 +209,6 @@ public void testElasticSearchServiceFulltext() throws Exception { assertEquals(searchResult.getNumEntities().intValue(), 0); assertEquals(_elasticSearchService.docCount(ENTITY_NAME), 0); - assertEquals(_elasticSearchService.aggregateByValue(ENTITY_NAME, "textField", null, 10).size(), 0); + assertEquals(_elasticSearchService.aggregateByValue(ImmutableList.of(ENTITY_NAME), "textField", null, 10).size(), 0); } } diff --git a/metadata-service/configuration/src/main/resources/application.yml b/metadata-service/configuration/src/main/resources/application.yml index 40674e13e647f..571cb66c84aa8 100644 --- a/metadata-service/configuration/src/main/resources/application.yml +++ b/metadata-service/configuration/src/main/resources/application.yml @@ -332,7 +332,8 @@ entityClient: usageClient: retryInterval: ${USAGE_CLIENT_RETRY_INTERVAL:2} - numRetries: ${USAGE_CLIENT_NUM_RETRIES:3} + numRetries: ${USAGE_CLIENT_NUM_RETRIES:0} + timeoutMs: ${USAGE_CLIENT_TIMEOUT_MS:3000} cache: primary: diff --git a/metadata-service/factories/build.gradle b/metadata-service/factories/build.gradle index 2e99def17c3c5..86644e3b034da 100644 --- a/metadata-service/factories/build.gradle +++ b/metadata-service/factories/build.gradle @@ -63,4 +63,5 @@ dependencies { configurations.all{ exclude group: "commons-io", module:"commons-io" exclude group: "jline", module:"jline" + exclude group: 'software.amazon.awssdk', module: 'third-party-jackson-core' } diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/usage/UsageClientFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/usage/UsageClientFactory.java index e83cbc82d8067..d2bd89de8767a 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/usage/UsageClientFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/usage/UsageClientFactory.java @@ -5,6 +5,7 @@ import com.linkedin.metadata.spring.YamlPropertySourceFactory; import com.linkedin.metadata.restli.DefaultRestliClientFactory; import com.linkedin.parseq.retry.backoff.ExponentialBackoff; +import com.linkedin.r2.transport.http.client.HttpClientFactory; import com.linkedin.restli.client.Client; import com.linkedin.usage.UsageClient; import org.springframework.beans.factory.annotation.Autowired; @@ -14,6 +15,9 @@ import org.springframework.context.annotation.Configuration; import org.springframework.context.annotation.PropertySource; +import java.util.HashMap; +import java.util.Map; + @Configuration @PropertySource(value = "classpath:/application.yml", factory = YamlPropertySourceFactory.class) @@ -34,16 +38,22 @@ public class UsageClientFactory { @Value("${usageClient.retryInterval:2}") private int retryInterval; - @Value("${usageClient.numRetries:3}") + @Value("${usageClient.numRetries:0}") private int numRetries; + @Value("${usageClient.timeoutMs:3000}") + private long timeoutMs; + @Autowired @Qualifier("configurationProvider") private ConfigurationProvider configurationProvider; @Bean("usageClient") public UsageClient getUsageClient(@Qualifier("systemAuthentication") final Authentication systemAuthentication) { - Client restClient = DefaultRestliClientFactory.getRestLiClient(gmsHost, gmsPort, gmsUseSSL, gmsSslProtocol); + Map params = 
new HashMap<>(); + params.put(HttpClientFactory.HTTP_REQUEST_TIMEOUT, String.valueOf(timeoutMs)); + + Client restClient = DefaultRestliClientFactory.getRestLiClient(gmsHost, gmsPort, gmsUseSSL, gmsSslProtocol, params); return new UsageClient(restClient, new ExponentialBackoff(retryInterval), numRetries, systemAuthentication, configurationProvider.getCache().getClient().getUsageClient()); } diff --git a/metadata-service/restli-api/build.gradle b/metadata-service/restli-api/build.gradle index f182d11b6baeb..352738d01f8da 100644 --- a/metadata-service/restli-api/build.gradle +++ b/metadata-service/restli-api/build.gradle @@ -13,5 +13,8 @@ dependencies { restClientCompile(externalDependency.zookeeper) { because("CVE-2023-44981") } + restClientCompile(externalDependency.grpcProtobuf) { + because("CVE-2023-1428, CVE-2023-32731") + } } } \ No newline at end of file diff --git a/metadata-service/restli-client/build.gradle b/metadata-service/restli-client/build.gradle index b1b778b45c0b5..7cad1981ad911 100644 --- a/metadata-service/restli-client/build.gradle +++ b/metadata-service/restli-client/build.gradle @@ -9,6 +9,7 @@ dependencies { api project(':metadata-utils') implementation project(':metadata-service:configuration') + implementation externalDependency.caffeine implementation externalDependency.slf4jApi compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/common/client/ClientCache.java b/metadata-service/restli-client/src/main/java/com/linkedin/common/client/ClientCache.java index 8aa0984be57b9..79d473d1b0090 100644 --- a/metadata-service/restli-client/src/main/java/com/linkedin/common/client/ClientCache.java +++ b/metadata-service/restli-client/src/main/java/com/linkedin/common/client/ClientCache.java @@ -14,8 +14,8 @@ import lombok.extern.slf4j.Slf4j; import org.checkerframework.checker.nullness.qual.Nullable; -import java.util.List; import java.util.Map; +import java.util.Set; import java.util.concurrent.ScheduledThreadPoolExecutor; import java.util.concurrent.TimeUnit; import java.util.function.BiFunction; @@ -63,15 +63,15 @@ private ClientCache build() { public ClientCache build(Class metricClazz) { // loads data from entity client - CacheLoader loader = new CacheLoader<>() { + CacheLoader loader = new CacheLoader() { @Override public V load(@NonNull K key) { - return loadAll(List.of(key)).get(key); + return loadAll(Set.of(key)).get(key); } @Override @NonNull - public Map loadAll(@NonNull Iterable keys) { + public Map loadAll(@NonNull Set keys) { return loadFunction.apply(keys); } }; diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClientCache.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClientCache.java index 6006f3a9a87f6..8e103cff283ea 100644 --- a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClientCache.java +++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClientCache.java @@ -81,16 +81,14 @@ private EntityClientCacheBuilder cache(LoadingCache cache) public EntityClientCache build(Class metricClazz) { // estimate size Weigher weighByEstimatedSize = (key, value) -> - value.getValue().data().values().parallelStream() - .mapToInt(o -> o.toString().getBytes().length) - .sum(); + value.getValue().data().toString().getBytes().length; // batch loads data from entity client (restli or java) Function, Map> loader = (Iterable keys) -> { Map> 
keysByEntity = StreamSupport.stream(keys.spliterator(), true) .collect(Collectors.groupingBy(Key::getEntityName, Collectors.toSet())); - Map results = keysByEntity.entrySet().parallelStream() + Map results = keysByEntity.entrySet().stream() .flatMap(entry -> { Set urns = entry.getValue().stream() .map(Key::getUrn) diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/usage/UsageClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/usage/UsageClient.java index d2b8499615e8d..850847bfd262a 100644 --- a/metadata-service/restli-client/src/main/java/com/linkedin/usage/UsageClient.java +++ b/metadata-service/restli-client/src/main/java/com/linkedin/usage/UsageClient.java @@ -9,6 +9,7 @@ import com.linkedin.parseq.retry.backoff.BackoffPolicy; import com.linkedin.r2.RemoteInvocationException; import com.linkedin.restli.client.Client; + import java.net.URISyntaxException; import javax.annotation.Nonnull; @@ -51,10 +52,12 @@ public UsageQueryResult getUsageStats(@Nonnull String resource, @Nonnull UsageTi private UsageQueryResult getUsageStats(@Nonnull String resource, @Nonnull UsageTimeRange range, @Nonnull Authentication authentication) throws RemoteInvocationException, URISyntaxException { - final UsageStatsDoQueryRangeRequestBuilder requestBuilder = USAGE_STATS_REQUEST_BUILDERS.actionQueryRange() - .resourceParam(resource) - .durationParam(WindowDuration.DAY) - .rangeFromEndParam(range); + + final UsageStatsDoQueryRangeRequestBuilder requestBuilder = USAGE_STATS_REQUEST_BUILDERS + .actionQueryRange() + .resourceParam(resource) + .durationParam(WindowDuration.DAY) + .rangeFromEndParam(range); return sendClientRequest(requestBuilder, authentication).getEntity(); } } diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/usage/UsageClientCache.java b/metadata-service/restli-client/src/main/java/com/linkedin/usage/UsageClientCache.java index a04c1e90fb4a3..10a1ebb6dcccb 100644 --- a/metadata-service/restli-client/src/main/java/com/linkedin/usage/UsageClientCache.java +++ b/metadata-service/restli-client/src/main/java/com/linkedin/usage/UsageClientCache.java @@ -42,9 +42,7 @@ private UsageClientCacheBuilder cache(LoadingCache cache) public UsageClientCache build() { // estimate size Weigher weighByEstimatedSize = (key, value) -> - value.data().values().parallelStream() - .mapToInt(o -> o.toString().getBytes().length) - .sum(); + value.data().toString().getBytes().length; // batch loads data from usage client Function, Map> loader = (Iterable keys) -> diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/usage/UsageStats.java.latest b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/usage/UsageStats.java.latest deleted file mode 100644 index 91f74c12e6aad..0000000000000 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/usage/UsageStats.java.latest +++ /dev/null @@ -1,316 +0,0 @@ -package com.linkedin.metadata.resources.usage; - -import com.linkedin.common.WindowDuration; -import com.linkedin.common.urn.Urn; -import com.linkedin.data.template.SetMode; -import com.linkedin.data.template.StringArray; -import com.linkedin.data.template.StringArrayArray; -import com.linkedin.metadata.query.Condition; -import com.linkedin.metadata.query.Criterion; -import com.linkedin.metadata.query.CriterionArray; -import com.linkedin.metadata.query.Filter; -import com.linkedin.metadata.timeseries.elastic.ElasticSearchTimeseriesAspectService; -import 
com.linkedin.metadata.usage.UsageService; -import com.linkedin.timeseries.AggregationSpec; -import com.linkedin.timeseries.AggregationType; -import com.linkedin.timeseries.CalendarInterval; -import com.linkedin.timeseries.DateGroupingBucket; -import com.linkedin.timeseries.GenericTable; -import com.linkedin.metadata.restli.RestliUtils; -import com.linkedin.parseq.Task; -import com.linkedin.restli.server.annotations.Action; -import com.linkedin.restli.server.annotations.ActionParam; -import com.linkedin.restli.server.annotations.RestLiSimpleResource; -import com.linkedin.restli.server.resources.SimpleResourceTemplate; -import com.linkedin.timeseries.GroupingBucket; -import com.linkedin.timeseries.MetricAggregation; -import com.linkedin.timeseries.StringGroupingBucket; -import com.linkedin.usage.FieldUsageCounts; -import com.linkedin.usage.FieldUsageCountsArray; -import com.linkedin.usage.UsageAggregation; -import com.linkedin.usage.UsageAggregationArray; -import com.linkedin.usage.UsageAggregationMetrics; -import com.linkedin.usage.UsageQueryResult; -import com.linkedin.usage.UsageQueryResultAggregations; -import com.linkedin.usage.UsageTimeRange; -import com.linkedin.usage.UserUsageCounts; -import com.linkedin.usage.UserUsageCountsArray; -import com.linkedin.util.Pair; -import java.net.URISyntaxException; -import java.util.ArrayList; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.annotation.Nonnull; -import javax.inject.Inject; -import javax.inject.Named; -import java.time.Instant; -import java.util.HashMap; -import java.util.Map; -import java.util.Optional; -import java.util.stream.Collectors; - - -/** - * Rest.li entry point: /usageStats - */ -@RestLiSimpleResource(name = "usageStats", namespace = "com.linkedin.usage") -public class UsageStats extends SimpleResourceTemplate { - private static final String ACTION_BATCH_INGEST = "batchIngest"; - private static final String PARAM_BUCKETS = "buckets"; - - private static final String ACTION_QUERY = "query"; - private static final String PARAM_RESOURCE = "resource"; - private static final String PARAM_DURATION = "duration"; - private static final String PARAM_START_TIME = "startTime"; - private static final String PARAM_END_TIME = "endTime"; - private static final String PARAM_MAX_BUCKETS = "maxBuckets"; - - private static final String ACTION_QUERY_RANGE = "queryRange"; - private static final String PARAM_RANGE = "rangeFromEnd"; - private static final String USAGE_STATS_ENTITY_NAME = "dataset"; - private static final String USAGE_STATS_ASPECT_NAME = "datasetUsageStatistics"; - private static final String ES_FIELD_TIMESTAMP = "timestampMillis"; - private final Logger _logger = LoggerFactory.getLogger(UsageStats.class.getName()); - @Inject - @Named("usageService") - private UsageService _usageService; - @Inject - @Named("elasticSearchTimeseriesAspectService") - private ElasticSearchTimeseriesAspectService _elasticSearchTimeseriesAspectService; - - @Action(name = ACTION_BATCH_INGEST) - @Nonnull - public Task batchIngest(@ActionParam(PARAM_BUCKETS) @Nonnull UsageAggregation[] buckets) { - _logger.info("Ingesting {} usage stats aggregations", buckets.length); - return RestliUtils.toTask(() -> { - for (UsageAggregation agg : buckets) { - this.ingest(agg); - } - return null; - }); - } - - private CalendarInterval windowToInterval(@Nonnull WindowDuration duration) { - switch (duration) { - case HOUR: - return CalendarInterval.HOUR; - case DAY: - return CalendarInterval.DAY; - case WEEK: - return CalendarInterval.WEEK; - 
case MONTH: - return CalendarInterval.MONTH; - case YEAR: - return CalendarInterval.YEAR; - default: - throw new IllegalArgumentException("Unsupported duration value" + duration); - } - } - - private UsageAggregationArray getBuckets(@Nonnull String resource, @Nonnull WindowDuration duration, Long startTime, Long endTime) { - // Populate the filter - Filter filter = new Filter(); - ArrayList criteria = new ArrayList<>(); - Criterion hasUrnCriterion = new Criterion().setField("urn").setCondition(Condition.EQUAL).setValue(resource); - criteria.add(hasUrnCriterion); - if (startTime != null) { - Criterion startTimeCriterion = new Criterion().setField(ES_FIELD_TIMESTAMP) - .setCondition(Condition.GREATER_THAN_OR_EQUAL_TO) - .setValue(startTime.toString()); - criteria.add(startTimeCriterion); - } - if (endTime != null) { - Criterion endTimeCriterion = new Criterion().setField(ES_FIELD_TIMESTAMP) - .setCondition(Condition.LESS_THAN_OR_EQUAL_TO) - .setValue(endTime.toString()); - criteria.add(endTimeCriterion); - } - filter.setCriteria(new CriterionArray(criteria)); - // Populate the aggregation specs - ArrayList aggregationSpecs = new ArrayList<>(); - aggregationSpecs.add(new AggregationSpec().setAggregationType(AggregationType.LATEST).setMemberName("uniqueUserCount")); - aggregationSpecs.add(new AggregationSpec().setAggregationType(AggregationType.LATEST).setMemberName("totalSqlQueries")); - aggregationSpecs.add(new AggregationSpec().setAggregationType(AggregationType.LATEST).setMemberName("topSqlQueries")); - /* - aggregationSpecs.add(new AggregationSpec().setAggregationType(AggregationType.SUM).setMemberName("totalSqlQueries")); - aggregationSpecs.add(new AggregationSpec().setAggregationType(AggregationType.SUM).setMemberName("userCounts.count")); - aggregationSpecs.add(new AggregationSpec().setAggregationType(AggregationType.SUM).setMemberName("fieldCounts.count")); - */ - - // Populate the Grouping buckets - ArrayList groupingBuckets = new ArrayList<>(); - // ts bucket - GroupingBucket timestampBucket = new GroupingBucket(); - timestampBucket.setDateGroupingBucket( - new DateGroupingBucket().setKey(ES_FIELD_TIMESTAMP).setGranularity(windowToInterval(duration))); - groupingBuckets.add(timestampBucket); - /* - // user counts bucket - GroupingBucket userGroupsBucket = new GroupingBucket(); - userGroupsBucket.setStringGroupingBucket( new StringGroupingBucket().setKey("userCounts.user") ); - groupingBuckets.add(userGroupsBucket); - // field counts bucket - GroupingBucket fieldCountGroupBucket = new GroupingBucket(); - fieldCountGroupBucket.setStringGroupingBucket(new StringGroupingBucket().setKey("fieldCounts.fieldName")); - groupingBuckets.add(fieldCountGroupBucket); - */ - - GenericTable result = - _elasticSearchTimeseriesAspectService.getAggregatedStats(USAGE_STATS_ENTITY_NAME, USAGE_STATS_ASPECT_NAME, - (AggregationSpec[]) aggregationSpecs.toArray(), filter, (GroupingBucket[]) groupingBuckets.toArray()); - UsageAggregationArray buckets = new UsageAggregationArray(); - for(StringArray row: result.getRows()) { - UsageAggregation usageAggregation = new UsageAggregation(); - usageAggregation.setBucket(Long.valueOf(row.get(0))); - usageAggregation.setDuration(duration); - try { - usageAggregation.setResource(new Urn(resource)); - } catch (URISyntaxException e) { - throw new IllegalArgumentException("Invalid resource" + e); - } - UsageAggregationMetrics usageAggregationMetrics = new UsageAggregationMetrics(); - usageAggregationMetrics.setUniqueUserCount(Integer.valueOf(row.get(1))); - 
usageAggregationMetrics.setTotalSqlQueries(Integer.valueOf(row.get(2))); - //usageAggregationMetrics.setTopSqlQueries(row.get(3)); - usageAggregation.setMetrics(usageAggregationMetrics); - } - return buckets; - } - - private UsageQueryResultAggregations getAggregations(String resource, WindowDuration duration, Long startTime, Long endTime) { - // TODO: make the aggregation computation logic reusable - UsageQueryResultAggregations aggregations = new UsageQueryResultAggregations(); - - /* - // Compute aggregations for users and unique user count. - { - Map, Integer> userAgg = new HashMap<>(); - buckets.forEach((bucket) -> { - Optional.ofNullable(bucket.getMetrics().getUsers()).ifPresent(usersUsageCounts -> { - usersUsageCounts.forEach((userCount -> { - Pair key = new Pair<>(userCount.getUser(), userCount.getUserEmail()); - int count = userAgg.getOrDefault(key, 0); - count += userCount.getCount(); - userAgg.put(key, count); - })); - }); - }); - - if (!userAgg.isEmpty()) { - UserUsageCountsArray users = new UserUsageCountsArray(); - users.addAll(userAgg.entrySet() - .stream() - .map((mapping) -> new UserUsageCounts().setUser(mapping.getKey().getFirst(), SetMode.REMOVE_IF_NULL) - .setUserEmail(mapping.getKey().getSecond(), SetMode.REMOVE_IF_NULL) - .setCount(mapping.getValue())) - .collect(Collectors.toList())); - aggregations.setUsers(users); - aggregations.setUniqueUserCount(userAgg.size()); - } - } - - // Compute aggregation for total query count. - { - Integer totalQueryCount = null; - - for (UsageAggregation bucket : buckets) { - if (bucket.getMetrics().getTotalSqlQueries() != null) { - if (totalQueryCount == null) { - totalQueryCount = 0; - } - totalQueryCount += bucket.getMetrics().getTotalSqlQueries(); - } - } - - if (totalQueryCount != null) { - aggregations.setTotalSqlQueries(totalQueryCount); - } - } - - // Compute aggregations for field usage counts. 
- { - Map fieldAgg = new HashMap<>(); - buckets.forEach((bucket) -> { - Optional.ofNullable(bucket.getMetrics().getFields()).ifPresent(fieldUsageCounts -> { - fieldUsageCounts.forEach((fieldCount -> { - String key = fieldCount.getFieldName(); - int count = fieldAgg.getOrDefault(key, 0); - count += fieldCount.getCount(); - fieldAgg.put(key, count); - })); - }); - }); - - if (!fieldAgg.isEmpty()) { - FieldUsageCountsArray fields = new FieldUsageCountsArray(); - fields.addAll(fieldAgg.entrySet() - .stream() - .map((mapping) -> new FieldUsageCounts().setFieldName(mapping.getKey()).setCount(mapping.getValue())) - .collect(Collectors.toList())); - aggregations.setFields(fields); - } - } - */ - return aggregations; - } - - @Action(name = ACTION_QUERY) - @Nonnull - public Task query(@ActionParam(PARAM_RESOURCE) @Nonnull String resource, - @ActionParam(PARAM_DURATION) @Nonnull WindowDuration duration, - @ActionParam(PARAM_START_TIME) @com.linkedin.restli.server.annotations.Optional Long startTime, - @ActionParam(PARAM_END_TIME) @com.linkedin.restli.server.annotations.Optional Long endTime, - @ActionParam(PARAM_MAX_BUCKETS) @com.linkedin.restli.server.annotations.Optional Integer maxBuckets) { - _logger.info("Attempting to query usage stats"); - return RestliUtils.toTask(() -> { - UsageAggregationArray buckets = getBuckets(resource, duration, startTime, endTime); - UsageQueryResultAggregations aggregations = getAggregations(resource, duration, startTime, endTime); - return new UsageQueryResult().setBuckets(buckets).setAggregations(aggregations); - }); - } - - - @Action(name = ACTION_QUERY_RANGE) - @Nonnull - public Task queryRange(@ActionParam(PARAM_RESOURCE) @Nonnull String resource, - @ActionParam(PARAM_DURATION) @Nonnull WindowDuration duration, @ActionParam(PARAM_RANGE) UsageTimeRange range) { - final long now = Instant.now().toEpochMilli(); - return this.query(resource, duration, convertRangeToStartTime(range, now), now, null); - } - - private void ingest(@Nonnull UsageAggregation bucket) { - // TODO attempt to resolve users into emails - _usageService.upsertDocument(bucket); - } - - @Nonnull - Long convertRangeToStartTime(@Nonnull UsageTimeRange range, long currentEpochMillis) { - // TRICKY: since start_time must be before the bucket's start, we actually - // need to subtract extra from the current time to ensure that we get precisely - // what we're looking for. Note that start_time and end_time are both inclusive, - // so we must also do an off-by-one adjustment. - final long oneHourMillis = 60 * 60 * 1000; - final long oneDayMillis = 24 * oneHourMillis; - - if (range == UsageTimeRange.HOUR) { - return currentEpochMillis - (2 * oneHourMillis + 1); - } else if (range == UsageTimeRange.DAY) { - return currentEpochMillis - (2 * oneDayMillis + 1); - } else if (range == UsageTimeRange.WEEK) { - return currentEpochMillis - (8 * oneDayMillis + 1); - } else if (range == UsageTimeRange.MONTH) { - // Assuming month is last 30 days. - return currentEpochMillis - (31 * oneDayMillis + 1); - } else if (range == UsageTimeRange.QUARTER) { - // Assuming a quarter is 91 days. 
- return currentEpochMillis - (92 * oneDayMillis + 1); - } else if (range == UsageTimeRange.YEAR) { - return currentEpochMillis - (366 * oneDayMillis + 1); - } else if (range == UsageTimeRange.ALL) { - return 0L; - } else { - throw new IllegalArgumentException("invalid UsageTimeRange enum state: " + range.name()); - } - } -} diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/recommendation/candidatesource/EntitySearchAggregationSource.java b/metadata-service/services/src/main/java/com/linkedin/metadata/recommendation/candidatesource/EntitySearchAggregationSource.java index 9fb0c18f1b621..e1ebc6d5e97be 100644 --- a/metadata-service/services/src/main/java/com/linkedin/metadata/recommendation/candidatesource/EntitySearchAggregationSource.java +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/recommendation/candidatesource/EntitySearchAggregationSource.java @@ -82,7 +82,7 @@ protected boolean isValidCandidate(T candidate) { public List getRecommendations(@Nonnull Urn userUrn, @Nullable RecommendationRequestContext requestContext) { Map aggregationResult = - _entitySearchService.aggregateByValue(null, getSearchFieldName(), null, getMaxContent()); + _entitySearchService.aggregateByValue(getEntityNames(), getSearchFieldName(), null, getMaxContent()); if (aggregationResult.isEmpty()) { return Collections.emptyList(); @@ -116,6 +116,11 @@ public List getRecommendations(@Nonnull Urn userUrn, .collect(Collectors.toList()); } + protected List getEntityNames() { + // By default, no list is applied which means searching across entities. + return null; + } + // Get top K entries with the most count private List> getTopKValues(Map countMap) { final PriorityQueue> queue = diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/recommendation/candidatesource/TopPlatformsSource.java b/metadata-service/services/src/main/java/com/linkedin/metadata/recommendation/candidatesource/TopPlatformsSource.java index f81a91be0660a..9562440889f63 100644 --- a/metadata-service/services/src/main/java/com/linkedin/metadata/recommendation/candidatesource/TopPlatformsSource.java +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/recommendation/candidatesource/TopPlatformsSource.java @@ -1,15 +1,16 @@ package com.linkedin.metadata.recommendation.candidatesource; -import com.google.common.collect.ImmutableSet; +import com.google.common.collect.ImmutableList; import com.linkedin.common.urn.Urn; import com.linkedin.data.template.RecordTemplate; import com.linkedin.dataplatform.DataPlatformInfo; +import com.linkedin.metadata.Constants; import com.linkedin.metadata.entity.EntityService; import com.linkedin.metadata.recommendation.RecommendationRenderType; import com.linkedin.metadata.recommendation.RecommendationRequestContext; import com.linkedin.metadata.recommendation.ScenarioType; import com.linkedin.metadata.search.EntitySearchService; -import java.util.Set; +import java.util.List; import javax.annotation.Nonnull; import lombok.extern.slf4j.Slf4j; @@ -18,12 +19,24 @@ public class TopPlatformsSource extends EntitySearchAggregationSource { /** - * TODO: Remove this once we permit specifying set of entities in aggregation API (filter out assertions) + * Set of entities that we want to consider for defining the top platform sources. 
+ * This must match SearchUtils.SEARCHABLE_ENTITY_TYPES */ - private static final Set FILTERED_DATA_PLATFORM_URNS = ImmutableSet.of( - "urn:li:dataPlatform:great-expectations" + private static final List SEARCHABLE_ENTITY_TYPES = ImmutableList.of( + Constants.DATASET_ENTITY_NAME, + Constants.DASHBOARD_ENTITY_NAME, + Constants.CHART_ENTITY_NAME, + Constants.ML_MODEL_ENTITY_NAME, + Constants.ML_MODEL_GROUP_ENTITY_NAME, + Constants.ML_FEATURE_TABLE_ENTITY_NAME, + Constants.ML_FEATURE_ENTITY_NAME, + Constants.ML_PRIMARY_KEY_ENTITY_NAME, + Constants.DATA_FLOW_ENTITY_NAME, + Constants.DATA_JOB_ENTITY_NAME, + Constants.TAG_ENTITY_NAME, + Constants.CONTAINER_ENTITY_NAME, + Constants.NOTEBOOK_ENTITY_NAME ); - private final EntityService _entityService; private static final String PLATFORM = "platform"; @@ -52,6 +65,10 @@ public boolean isEligible(@Nonnull Urn userUrn, @Nonnull RecommendationRequestCo return requestContext.getScenario() == ScenarioType.HOME; } + protected List getEntityNames() { + return SEARCHABLE_ENTITY_TYPES; + } + @Override protected String getSearchFieldName() { return PLATFORM; @@ -69,9 +86,6 @@ protected boolean isValueUrn() { @Override protected boolean isValidCandidateUrn(Urn urn) { - if (FILTERED_DATA_PLATFORM_URNS.contains(urn.toString())) { - return false; - } RecordTemplate dataPlatformInfo = _entityService.getLatestAspect(urn, "dataPlatformInfo"); if (dataPlatformInfo == null) { return false; diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java b/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java index cbfeeaef860d3..9cd865bd888e2 100644 --- a/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java @@ -131,15 +131,15 @@ AutoCompleteResult autoComplete(@Nonnull String entityName, @Nonnull String quer /** * Returns number of documents per field value given the field and filters * - * @param entityName name of the entity, if empty aggregate over all entities + * @param entityNames list of name of entities to aggregate across, if empty aggregate over all entities * @param field the field name for aggregate * @param requestParams filters to apply before aggregating * @param limit the number of aggregations to return * @return */ @Nonnull - Map aggregateByValue(@Nullable String entityName, @Nonnull String field, @Nullable Filter requestParams, - int limit); + Map aggregateByValue(@Nullable List entityNames, @Nonnull String field, + @Nullable Filter requestParams, int limit); /** * Gets a list of groups/entities that match given browse request. 
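Note on the aggregateByValue change above: the interface now accepts a nullable list of entity names instead of a single entity name (null still means "aggregate across all entity indices"), and TopPlatformsSource overrides getEntityNames() to scope the aggregation. The following is a minimal illustrative sketch, not part of this patch: the PlatformCountExample class and its wiring are hypothetical, while EntitySearchService, aggregateByValue, the Constants entity names, and the "platform" field are taken from the hunks above.

import com.google.common.collect.ImmutableList;
import com.linkedin.metadata.Constants;
import com.linkedin.metadata.search.EntitySearchService;

import java.util.List;
import java.util.Map;

// Hypothetical caller used only to illustrate the new List-based aggregateByValue signature.
public class PlatformCountExample {

  private final EntitySearchService entitySearchService;

  public PlatformCountExample(EntitySearchService entitySearchService) {
    this.entitySearchService = entitySearchService;
  }

  // Aggregate the "platform" field over a restricted set of entity types,
  // mirroring what TopPlatformsSource now does via getEntityNames().
  public Map<String, Long> topPlatformsForAssets() {
    List<String> entityNames = ImmutableList.of(
        Constants.DATASET_ENTITY_NAME,
        Constants.DASHBOARD_ENTITY_NAME,
        Constants.CHART_ENTITY_NAME);
    return entitySearchService.aggregateByValue(entityNames, "platform", null, 10);
  }

  // Passing null keeps the previous behaviour of aggregating across all entity indices.
  public Map<String, Long> topPlatformsAcrossAllEntities() {
    return entitySearchService.aggregateByValue(null, "platform", null, 10);
  }
}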
diff --git a/metadata-utils/src/main/java/com/linkedin/metadata/restli/DefaultRestliClientFactory.java b/metadata-utils/src/main/java/com/linkedin/metadata/restli/DefaultRestliClientFactory.java index 436c7ae5d77b5..2d4e355a93e53 100644 --- a/metadata-utils/src/main/java/com/linkedin/metadata/restli/DefaultRestliClientFactory.java +++ b/metadata-utils/src/main/java/com/linkedin/metadata/restli/DefaultRestliClientFactory.java @@ -44,18 +44,34 @@ public static RestClient getRestLiD2Client(@Nonnull String restLiClientD2ZkHost, @Nonnull public static RestClient getRestLiClient(@Nonnull String restLiServerHost, int restLiServerPort, boolean useSSL, @Nullable String sslProtocol) { + return getRestLiClient(restLiServerHost, restLiServerPort, useSSL, sslProtocol, null); + } + + @Nonnull + public static RestClient getRestLiClient(@Nonnull String restLiServerHost, int restLiServerPort, boolean useSSL, + @Nullable String sslProtocol, @Nullable Map params) { return getRestLiClient( URI.create(String.format("%s://%s:%s", useSSL ? "https" : "http", restLiServerHost, restLiServerPort)), - sslProtocol); + sslProtocol, + params); } @Nonnull public static RestClient getRestLiClient(@Nonnull URI gmsUri, @Nullable String sslProtocol) { + return getRestLiClient(gmsUri, sslProtocol, null); + } + + @Nonnull + public static RestClient getRestLiClient(@Nonnull URI gmsUri, @Nullable String sslProtocol, + @Nullable Map inputParams) { if (StringUtils.isBlank(gmsUri.getHost()) || gmsUri.getPort() <= 0) { throw new InvalidParameterException("Invalid restli server host name or port!"); } Map params = new HashMap<>(); + if (inputParams != null) { + params.putAll(inputParams); + } if ("https".equals(gmsUri.getScheme())) { try {