diff --git a/.gitignore b/.gitignore index 858f560f0b842..b6edbccf71125 100644 --- a/.gitignore +++ b/.gitignore @@ -69,6 +69,7 @@ metadata-ingestion/generated/** # docs docs/generated/ +docs-website/versioned_docs/ tmp* temp/** diff --git a/README.md b/README.md index d2208cf6ced49..951dcebad6498 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,11 @@ Please follow the [DataHub Quickstart Guide](https://datahubproject.io/docs/quic If you're looking to build & modify datahub please take a look at our [Development Guide](https://datahubproject.io/docs/developers). -[![DataHub Demo GIF](docs/imgs/entity.png)](https://demo.datahubproject.io/) +


## Source Code and Repositories diff --git a/build.gradle b/build.gradle index ae54de07cb81c..3958f502b3b32 100644 --- a/build.gradle +++ b/build.gradle @@ -9,8 +9,7 @@ buildscript { ext.neo4jVersion = '4.4.9' ext.testContainersVersion = '1.17.4' ext.elasticsearchVersion = '7.10.2' - // TODO: Change to final release version once it's out ETA Mid-April - ext.jacksonVersion = '2.15.0-rc2' + ext.jacksonVersion = '2.15.2' ext.jettyVersion = '9.4.46.v20220331' ext.playVersion = '2.8.18' ext.log4jVersion = '2.19.0' diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java index d6dd2de6d31e3..682710ad5d539 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java @@ -68,6 +68,7 @@ import com.linkedin.datahub.graphql.generated.ListQueriesResult; import com.linkedin.datahub.graphql.generated.ListTestsResult; import com.linkedin.datahub.graphql.generated.ListViewsResult; +import com.linkedin.datahub.graphql.generated.MatchedField; import com.linkedin.datahub.graphql.generated.MLFeature; import com.linkedin.datahub.graphql.generated.MLFeatureProperties; import com.linkedin.datahub.graphql.generated.MLFeatureTable; @@ -1008,6 +1009,10 @@ private void configureGenericEntityResolvers(final RuntimeWiring.Builder builder .dataFetcher("entity", new EntityTypeResolver(entityTypes, (env) -> ((SearchResult) env.getSource()).getEntity())) ) + .type("MatchedField", typeWiring -> typeWiring + .dataFetcher("entity", new EntityTypeResolver(entityTypes, + (env) -> ((MatchedField) env.getSource()).getEntity())) + ) .type("SearchAcrossLineageResult", typeWiring -> typeWiring .dataFetcher("entity", new EntityTypeResolver(entityTypes, (env) -> ((SearchAcrossLineageResult) env.getSource()).getEntity())) diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java index 2c55bc79fe501..90017f7b87997 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java @@ -18,6 +18,7 @@ import com.linkedin.datahub.graphql.generated.Privilege; import com.linkedin.datahub.graphql.generated.QueriesTabConfig; import com.linkedin.datahub.graphql.generated.ResourcePrivileges; +import com.linkedin.datahub.graphql.generated.SearchResultsVisualConfig; import com.linkedin.datahub.graphql.generated.TelemetryConfig; import com.linkedin.datahub.graphql.generated.TestsConfig; import com.linkedin.datahub.graphql.generated.ViewsConfig; @@ -144,6 +145,13 @@ public CompletableFuture get(final DataFetchingEnvironment environmen } visualConfig.setEntityProfiles(entityProfilesConfig); } + if (_visualConfiguration != null && _visualConfiguration.getSearchResult() != null) { + SearchResultsVisualConfig searchResultsVisualConfig = new SearchResultsVisualConfig(); + if (_visualConfiguration.getSearchResult().getEnableNameHighlight() != null) { + searchResultsVisualConfig.setEnableNameHighlight(_visualConfiguration.getSearchResult().getEnableNameHighlight()); + } + visualConfig.setSearchResult(searchResultsVisualConfig); + } appConfig.setVisualConfig(visualConfig); final 
TelemetryConfig telemetryConfig = new TelemetryConfig(); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/SearchFlagsInputMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/SearchFlagsInputMapper.java index 6435d6ee4c8e5..f3ac008734339 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/SearchFlagsInputMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/SearchFlagsInputMapper.java @@ -39,6 +39,9 @@ public com.linkedin.metadata.query.SearchFlags apply(@Nonnull final SearchFlags if (searchFlags.getSkipAggregates() != null) { result.setSkipAggregates(searchFlags.getSkipAggregates()); } + if (searchFlags.getGetSuggestions() != null) { + result.setGetSuggestions(searchFlags.getGetSuggestions()); + } return result; } } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/MapperUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/MapperUtils.java index 0b292a373ea40..5ba32b0c2a77c 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/MapperUtils.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/MapperUtils.java @@ -1,12 +1,18 @@ package com.linkedin.datahub.graphql.types.mappers; +import com.linkedin.common.urn.Urn; import com.linkedin.datahub.graphql.generated.AggregationMetadata; import com.linkedin.datahub.graphql.generated.FacetMetadata; import com.linkedin.datahub.graphql.generated.MatchedField; import com.linkedin.datahub.graphql.generated.SearchResult; +import com.linkedin.datahub.graphql.generated.SearchSuggestion; import com.linkedin.datahub.graphql.resolvers.EntityTypeMapper; import com.linkedin.datahub.graphql.types.common.mappers.UrnToEntityMapper; import com.linkedin.metadata.search.SearchEntity; +import com.linkedin.metadata.search.utils.SearchUtils; +import lombok.extern.slf4j.Slf4j; + +import java.net.URISyntaxException; import java.util.List; import java.util.Optional; import java.util.stream.Collectors; @@ -16,6 +22,7 @@ import static com.linkedin.metadata.utils.SearchUtil.*; +@Slf4j public class MapperUtils { private MapperUtils() { @@ -54,7 +61,24 @@ public static String convertFilterValue(String filterValue, List isEnti public static List getMatchedFieldEntry(List highlightMetadata) { return highlightMetadata.stream() - .map(field -> new MatchedField(field.getName(), field.getValue())) + .map(field -> { + MatchedField matchedField = new MatchedField(); + matchedField.setName(field.getName()); + matchedField.setValue(field.getValue()); + if (SearchUtils.isUrn(field.getValue())) { + try { + Urn urn = Urn.createFromString(field.getValue()); + matchedField.setEntity(UrnToEntityMapper.map(urn)); + } catch (URISyntaxException e) { + log.warn("Failed to create urn from MatchedField value: {}", field.getValue(), e); + } + } + return matchedField; + }) .collect(Collectors.toList()); } + + public static SearchSuggestion mapSearchSuggestion(com.linkedin.metadata.search.SearchSuggestion suggestion) { + return new SearchSuggestion(suggestion.getText(), suggestion.getScore(), Math.toIntExact(suggestion.getFrequency())); + } } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/UrnSearchResultsMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/UrnSearchResultsMapper.java 
index 9f750820e3093..b16e2f10d1df7 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/UrnSearchResultsMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/UrnSearchResultsMapper.java @@ -27,6 +27,7 @@ public SearchResults apply(com.linkedin.metadata.search.SearchResult input) { final SearchResultMetadata searchResultMetadata = input.getMetadata(); result.setSearchResults(input.getEntities().stream().map(MapperUtils::mapResult).collect(Collectors.toList())); result.setFacets(searchResultMetadata.getAggregations().stream().map(MapperUtils::mapFacet).collect(Collectors.toList())); + result.setSuggestions(searchResultMetadata.getSuggestions().stream().map(MapperUtils::mapSearchSuggestion).collect(Collectors.toList())); return result; } diff --git a/datahub-graphql-core/src/main/resources/app.graphql b/datahub-graphql-core/src/main/resources/app.graphql index 761242a6711c1..dbee24b4bf6f7 100644 --- a/datahub-graphql-core/src/main/resources/app.graphql +++ b/datahub-graphql-core/src/main/resources/app.graphql @@ -221,6 +221,11 @@ type VisualConfig { Configuration for the queries tab """ entityProfiles: EntityProfilesConfig + + """ + Configuration for search results + """ + searchResult: SearchResultsVisualConfig } """ @@ -255,6 +260,16 @@ type EntityProfileConfig { defaultTab: String } +""" +Configuration for a search result +""" +type SearchResultsVisualConfig { + """ + Whether a search result should highlight the name/description if it was matched on those fields. + """ + enableNameHighlight: Boolean +} + """ Configurations related to tracking users in the app """ diff --git a/datahub-graphql-core/src/main/resources/search.graphql b/datahub-graphql-core/src/main/resources/search.graphql index fbea66f738955..4cabdb04afe77 100644 --- a/datahub-graphql-core/src/main/resources/search.graphql +++ b/datahub-graphql-core/src/main/resources/search.graphql @@ -138,6 +138,11 @@ input SearchFlags { Whether to skip aggregates/facets """ skipAggregates: Boolean + + """ + Whether to request for search suggestions on the _entityName virtualized field + """ + getSuggestions: Boolean } """ @@ -483,6 +488,11 @@ type SearchResults { Candidate facet aggregations used for search filtering """ facets: [FacetMetadata!] + + """ + Search suggestions based on the query provided for alternate query texts + """ + suggestions: [SearchSuggestion!] } """ @@ -665,6 +675,11 @@ type MatchedField { Value of the field that matched """ value: String! + + """ + Entity if the value is an urn + """ + entity: Entity } """ @@ -722,6 +737,31 @@ type AggregationMetadata { entity: Entity } +""" +A suggestion for an alternate search query given an original query compared to all +of the entity names in our search index. +""" +type SearchSuggestion { + """ + The suggested text based on the provided query text compared to + the entity name field in the search index. + """ + text: String! + + """ + The "edit distance" for this suggestion. The closer this number is to 1, the + closer the suggested text is to the original text. The closer it is to 0, the + further from the original text it is. 
+ """ + score: Float + + """ + The number of entities that would match on the name field given the suggested text + """ + frequency: Int +} + + """ Input for performing an auto completion query against a single Metadata Entity """ diff --git a/datahub-web-react/README.md b/datahub-web-react/README.md index 6c91b169af858..8bf592b11a0ae 100644 --- a/datahub-web-react/README.md +++ b/datahub-web-react/README.md @@ -126,7 +126,9 @@ for functional configurability should reside. to render a view associated with a particular entity type (user, dataset, etc.). -![entity-registry](./entity-registry.png) +


**graphql** - The React App talks to the `dathub-frontend` server using GraphQL. This module is where the *queries* issued against the server are defined. Once defined, running `yarn run generate` will code-gen TypeScript objects to make invoking diff --git a/datahub-web-react/src/Mocks.tsx b/datahub-web-react/src/Mocks.tsx index b772341370050..a2e14308e8cee 100644 --- a/datahub-web-react/src/Mocks.tsx +++ b/datahub-web-react/src/Mocks.tsx @@ -1973,6 +1973,7 @@ export const mocks = [ count: 10, filters: [], orFilters: [], + searchFlags: { getSuggestions: true }, }, }, }, @@ -2033,6 +2034,7 @@ export const mocks = [ ], }, ], + suggestions: [], }, } as GetSearchResultsQuery, }, @@ -2059,6 +2061,7 @@ export const mocks = [ ], }, ], + searchFlags: { getSuggestions: true }, }, }, }, @@ -2112,6 +2115,7 @@ export const mocks = [ ], }, ], + suggestions: [], }, } as GetSearchResultsQuery, }, @@ -2230,6 +2234,7 @@ export const mocks = [ ], }, ], + searchFlags: { getSuggestions: true }, }, }, }, @@ -2251,6 +2256,7 @@ export const mocks = [ insights: [], }, ], + suggestions: [], facets: [ { field: 'origin', @@ -2772,6 +2778,7 @@ export const mocks = [ ], }, ], + searchFlags: { getSuggestions: true }, }, }, }, @@ -2794,6 +2801,7 @@ export const mocks = [ insights: [], }, ], + suggestions: [], facets: [ { __typename: 'FacetMetadata', @@ -2886,6 +2894,7 @@ export const mocks = [ ], }, ], + searchFlags: { getSuggestions: true }, }, }, }, @@ -2908,6 +2917,7 @@ export const mocks = [ }, ], facets: [], + suggestions: [], }, } as GetSearchResultsForMultipleQuery, }, @@ -2934,6 +2944,7 @@ export const mocks = [ ], }, ], + searchFlags: { getSuggestions: true }, }, }, }, @@ -2955,6 +2966,7 @@ export const mocks = [ insights: [], }, ], + suggestions: [], facets: [ { field: 'origin', @@ -3007,6 +3019,7 @@ export const mocks = [ ], }, ], + searchFlags: { getSuggestions: true }, }, }, }, @@ -3028,6 +3041,7 @@ export const mocks = [ insights: [], }, ], + suggestions: [], facets: [ { field: 'origin', @@ -3084,6 +3098,7 @@ export const mocks = [ ], }, ], + searchFlags: { getSuggestions: true }, }, }, }, @@ -3113,6 +3128,7 @@ export const mocks = [ insights: [], }, ], + suggestions: [], facets: [ { field: 'origin', @@ -3175,6 +3191,7 @@ export const mocks = [ ], }, ], + searchFlags: { getSuggestions: true }, }, }, }, @@ -3196,6 +3213,7 @@ export const mocks = [ insights: [], }, ], + suggestions: [], facets: [ { field: 'origin', @@ -3258,6 +3276,7 @@ export const mocks = [ ], }, ], + searchFlags: { getSuggestions: true }, }, }, }, @@ -3279,6 +3298,7 @@ export const mocks = [ insights: [], }, ], + suggestions: [], facets: [ { field: 'origin', @@ -3451,6 +3471,7 @@ export const mocks = [ count: 10, filters: [], orFilters: [], + searchFlags: { getSuggestions: true }, }, }, }, @@ -3462,6 +3483,7 @@ export const mocks = [ total: 0, searchResults: [], facets: [], + suggestions: [], }, }, }, diff --git a/datahub-web-react/src/app/entity/EntityRegistry.tsx b/datahub-web-react/src/app/entity/EntityRegistry.tsx index a07fd02841197..56b085cf69f4a 100644 --- a/datahub-web-react/src/app/entity/EntityRegistry.tsx +++ b/datahub-web-react/src/app/entity/EntityRegistry.tsx @@ -1,5 +1,7 @@ +import React from 'react'; import { Entity as EntityInterface, EntityType, SearchResult } from '../../types.generated'; import { FetchedEntity } from '../lineage/types'; +import { SearchResultProvider } from '../search/context/SearchResultContext'; import { Entity, EntityCapabilityType, IconStyleType, PreviewType } from './Entity'; import { 
GLOSSARY_ENTITY_TYPES } from './shared/constants'; import { GenericEntityProperties } from './shared/types'; @@ -119,7 +121,9 @@ export default class EntityRegistry { renderSearchResult(type: EntityType, searchResult: SearchResult): JSX.Element { const entity = validatedGet(type, this.entityTypeToEntity); - return entity.renderSearch(searchResult); + return ( + {entity.renderSearch(searchResult)} + ); } renderBrowse(type: EntityType, data: T): JSX.Element { diff --git a/datahub-web-react/src/app/entity/chart/ChartEntity.tsx b/datahub-web-react/src/app/entity/chart/ChartEntity.tsx index b5ebcbef80379..0f1b6dbf3d660 100644 --- a/datahub-web-react/src/app/entity/chart/ChartEntity.tsx +++ b/datahub-web-react/src/app/entity/chart/ChartEntity.tsx @@ -19,13 +19,14 @@ import { EntityMenuItems } from '../shared/EntityDropdown/EntityDropdown'; import { LineageTab } from '../shared/tabs/Lineage/LineageTab'; import { ChartStatsSummarySubHeader } from './profile/stats/ChartStatsSummarySubHeader'; import { InputFieldsTab } from '../shared/tabs/Entity/InputFieldsTab'; -import { ChartSnippet } from './ChartSnippet'; import { EmbedTab } from '../shared/tabs/Embed/EmbedTab'; import { capitalizeFirstLetterOnly } from '../../shared/textUtil'; import DataProductSection from '../shared/containers/profile/sidebar/DataProduct/DataProductSection'; import { getDataProduct } from '../shared/utils'; import EmbeddedProfile from '../shared/embed/EmbeddedProfile'; import { LOOKER_URN } from '../../ingest/source/builder/constants'; +import { MatchedFieldList } from '../../search/matches/MatchedFieldList'; +import { matchedInputFieldRenderer } from '../../search/matches/matchedInputFieldRenderer'; /** * Definition of the DataHub Chart entity. @@ -203,7 +204,11 @@ export class ChartEntity implements Entity { lastUpdatedMs={data.properties?.lastModified?.time} createdMs={data.properties?.created?.time} externalUrl={data.properties?.externalUrl} - snippet={} + snippet={ + matchedInputFieldRenderer(matchedField, data)} + /> + } degree={(result as any).degree} paths={(result as any).paths} /> diff --git a/datahub-web-react/src/app/entity/chart/ChartSnippet.tsx b/datahub-web-react/src/app/entity/chart/ChartSnippet.tsx deleted file mode 100644 index 27982d3037207..0000000000000 --- a/datahub-web-react/src/app/entity/chart/ChartSnippet.tsx +++ /dev/null @@ -1,53 +0,0 @@ -import React from 'react'; - -import { Typography } from 'antd'; -import { InputFields, MatchedField, Maybe } from '../../../types.generated'; -import TagTermGroup from '../../shared/tags/TagTermGroup'; -import { FIELDS_TO_HIGHLIGHT } from '../dataset/search/highlights'; -import { getMatchPrioritizingPrimary } from '../shared/utils'; - -type Props = { - matchedFields: MatchedField[]; - inputFields: Maybe | undefined; - isMatchingDashboard?: boolean; -}; - -const LABEL_INDEX_NAME = 'fieldLabels'; -const TYPE_PROPERTY_KEY_NAME = 'type'; - -export const ChartSnippet = ({ matchedFields, inputFields, isMatchingDashboard = false }: Props) => { - const matchedField = getMatchPrioritizingPrimary(matchedFields, 'fieldLabels'); - - if (matchedField?.name === LABEL_INDEX_NAME) { - const matchedSchemaField = inputFields?.fields?.find( - (field) => field?.schemaField?.label === matchedField.value, - ); - const matchedGlossaryTerm = matchedSchemaField?.schemaField?.glossaryTerms?.terms?.find( - (term) => term?.term?.name === matchedField.value, - ); - - if (matchedGlossaryTerm) { - let termType = 'term'; - const typeProperty = 
matchedGlossaryTerm.term.properties?.customProperties?.find( - (property) => property.key === TYPE_PROPERTY_KEY_NAME, - ); - if (typeProperty) { - termType = typeProperty.value || termType; - } - - return ( - - Matches {termType} {' '} - {isMatchingDashboard && 'on a contained Chart'} - - ); - } - } - - return matchedField ? ( - - Matches {FIELDS_TO_HIGHLIGHT.get(matchedField.name)} {matchedField.value}{' '} - {isMatchingDashboard && 'on a contained Chart'} - - ) : null; -}; diff --git a/datahub-web-react/src/app/entity/dashboard/DashboardEntity.tsx b/datahub-web-react/src/app/entity/dashboard/DashboardEntity.tsx index a64e437265262..0a36d0e5f1bfa 100644 --- a/datahub-web-react/src/app/entity/dashboard/DashboardEntity.tsx +++ b/datahub-web-react/src/app/entity/dashboard/DashboardEntity.tsx @@ -24,12 +24,13 @@ import { EntityMenuItems } from '../shared/EntityDropdown/EntityDropdown'; import { LineageTab } from '../shared/tabs/Lineage/LineageTab'; import { capitalizeFirstLetterOnly } from '../../shared/textUtil'; import { DashboardStatsSummarySubHeader } from './profile/DashboardStatsSummarySubHeader'; -import { ChartSnippet } from '../chart/ChartSnippet'; import { EmbedTab } from '../shared/tabs/Embed/EmbedTab'; import EmbeddedProfile from '../shared/embed/EmbeddedProfile'; import DataProductSection from '../shared/containers/profile/sidebar/DataProduct/DataProductSection'; import { getDataProduct } from '../shared/utils'; import { LOOKER_URN } from '../../ingest/source/builder/constants'; +import { MatchedFieldList } from '../../search/matches/MatchedFieldList'; +import { matchedInputFieldRenderer } from '../../search/matches/matchedInputFieldRenderer'; /** * Definition of the DataHub Dashboard entity. @@ -227,10 +228,9 @@ export class DashboardEntity implements Entity { lastUpdatedMs={data.properties?.lastModified?.time} createdMs={data.properties?.created?.time} snippet={ - matchedInputFieldRenderer(matchedField, data)} + matchSuffix="on a contained chart" /> } subtype={data.subTypes?.typeNames?.[0]} diff --git a/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx b/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx index cb4239872045f..ed3904bcf4e2d 100644 --- a/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx +++ b/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx @@ -25,11 +25,12 @@ import { OperationsTab } from './profile/OperationsTab'; import { EntityMenuItems } from '../shared/EntityDropdown/EntityDropdown'; import { SidebarSiblingsSection } from '../shared/containers/profile/sidebar/SidebarSiblingsSection'; import { DatasetStatsSummarySubHeader } from './profile/stats/stats/DatasetStatsSummarySubHeader'; -import { DatasetSearchSnippet } from './DatasetSearchSnippet'; +import { MatchedFieldList } from '../../search/matches/MatchedFieldList'; import { EmbedTab } from '../shared/tabs/Embed/EmbedTab'; import EmbeddedProfile from '../shared/embed/EmbeddedProfile'; import DataProductSection from '../shared/containers/profile/sidebar/DataProduct/DataProductSection'; import { getDataProduct } from '../shared/utils'; +import { matchedFieldPathsRenderer } from '../../search/matches/matchedFieldPathsRenderer'; const SUBTYPES = { VIEW: 'view', @@ -290,7 +291,7 @@ export class DatasetEntity implements Entity { subtype={data.subTypes?.typeNames?.[0]} container={data.container} parentContainers={data.parentContainers} - snippet={} + snippet={} insights={result.insights} externalUrl={data.properties?.externalUrl} statsSummary={data.statsSummary} diff --git 
a/datahub-web-react/src/app/entity/dataset/DatasetSearchSnippet.tsx b/datahub-web-react/src/app/entity/dataset/DatasetSearchSnippet.tsx deleted file mode 100644 index e4f88eb0fbbfa..0000000000000 --- a/datahub-web-react/src/app/entity/dataset/DatasetSearchSnippet.tsx +++ /dev/null @@ -1,39 +0,0 @@ -import React from 'react'; - -import { Typography } from 'antd'; -import { MatchedField } from '../../../types.generated'; -import { TagSummary } from './shared/TagSummary'; -import { TermSummary } from './shared/TermSummary'; -import { FIELDS_TO_HIGHLIGHT } from './search/highlights'; -import { getMatchPrioritizingPrimary } from '../shared/utils'; -import { downgradeV2FieldPath } from './profile/schema/utils/utils'; - -type Props = { - matchedFields: MatchedField[]; -}; - -const LABEL_INDEX_NAME = 'fieldLabels'; - -export const DatasetSearchSnippet = ({ matchedFields }: Props) => { - const matchedField = getMatchPrioritizingPrimary(matchedFields, LABEL_INDEX_NAME); - - let snippet: React.ReactNode; - - if (matchedField) { - if (matchedField.value.includes('urn:li:tag')) { - snippet = ; - } else if (matchedField.value.includes('urn:li:glossaryTerm')) { - snippet = ; - } else if (matchedField.name === 'fieldPaths') { - snippet = {downgradeV2FieldPath(matchedField.value)}; - } else { - snippet = {matchedField.value}; - } - } - - return matchedField ? ( - - Matches {FIELDS_TO_HIGHLIGHT.get(matchedField.name)} {snippet}{' '} - - ) : null; -}; diff --git a/datahub-web-react/src/app/entity/dataset/search/highlights.ts b/datahub-web-react/src/app/entity/dataset/search/highlights.ts deleted file mode 100644 index 64505e0709c7b..0000000000000 --- a/datahub-web-react/src/app/entity/dataset/search/highlights.ts +++ /dev/null @@ -1,7 +0,0 @@ -export const FIELDS_TO_HIGHLIGHT = new Map(); -FIELDS_TO_HIGHLIGHT.set('fieldPaths', 'column'); -FIELDS_TO_HIGHLIGHT.set('fieldDescriptions', 'column description'); -FIELDS_TO_HIGHLIGHT.set('fieldTags', 'column tag'); -FIELDS_TO_HIGHLIGHT.set('editedFieldDescriptions', 'column description'); -FIELDS_TO_HIGHLIGHT.set('editedFieldTags', 'column tag'); -FIELDS_TO_HIGHLIGHT.set('fieldLabels', 'label'); diff --git a/datahub-web-react/src/app/entity/dataset/shared/TagSummary.tsx b/datahub-web-react/src/app/entity/dataset/shared/TagSummary.tsx deleted file mode 100644 index 106cc298fb58c..0000000000000 --- a/datahub-web-react/src/app/entity/dataset/shared/TagSummary.tsx +++ /dev/null @@ -1,38 +0,0 @@ -import React from 'react'; -import styled from 'styled-components'; -import { useGetTagQuery } from '../../../../graphql/tag.generated'; -import { EntityType, Tag } from '../../../../types.generated'; -import { HoverEntityTooltip } from '../../../recommendations/renderer/component/HoverEntityTooltip'; -import { useEntityRegistry } from '../../../useEntityRegistry'; -import { StyledTag } from '../../shared/components/styled/StyledTag'; - -const TagLink = styled.span` - display: inline-block; -`; - -type Props = { - urn: string; -}; - -export const TagSummary = ({ urn }: Props) => { - const entityRegistry = useEntityRegistry(); - const { data } = useGetTagQuery({ variables: { urn } }); - return ( - <> - {data && ( - - - - {entityRegistry.getDisplayName(EntityType.Tag, data?.tag)} - - - - )} - - ); -}; diff --git a/datahub-web-react/src/app/entity/dataset/shared/TermSummary.tsx b/datahub-web-react/src/app/entity/dataset/shared/TermSummary.tsx deleted file mode 100644 index cc1274693a342..0000000000000 --- a/datahub-web-react/src/app/entity/dataset/shared/TermSummary.tsx +++ 
/dev/null @@ -1,36 +0,0 @@ -import React from 'react'; -import { Tag } from 'antd'; -import { BookOutlined } from '@ant-design/icons'; -import styled from 'styled-components'; -import { useGetGlossaryTermQuery } from '../../../../graphql/glossaryTerm.generated'; -import { HoverEntityTooltip } from '../../../recommendations/renderer/component/HoverEntityTooltip'; -import { EntityType, GlossaryTerm } from '../../../../types.generated'; -import { useEntityRegistry } from '../../../useEntityRegistry'; - -const TermLink = styled.span` - display: inline-block; -`; - -type Props = { - urn: string; -}; - -export const TermSummary = ({ urn }: Props) => { - const entityRegistry = useEntityRegistry(); - const { data } = useGetGlossaryTermQuery({ variables: { urn } }); - - return ( - <> - {data && ( - - - - - {entityRegistry.getDisplayName(EntityType.GlossaryTerm, data?.glossaryTerm)} - - - - )} - - ); -}; diff --git a/datahub-web-react/src/app/entity/glossaryTerm/preview/Preview.tsx b/datahub-web-react/src/app/entity/glossaryTerm/preview/Preview.tsx index 26d3cf456ab7a..b6802e37652cb 100644 --- a/datahub-web-react/src/app/entity/glossaryTerm/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/glossaryTerm/preview/Preview.tsx @@ -4,6 +4,8 @@ import { Deprecation, Domain, EntityType, Owner, ParentNodesResult } from '../.. import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; import { useEntityRegistry } from '../../../useEntityRegistry'; import { IconStyleType, PreviewType } from '../../Entity'; +import UrlButton from '../../shared/UrlButton'; +import { getRelatedEntitiesUrl } from '../utils'; export const Preview = ({ urn, @@ -39,6 +41,9 @@ export const Preview = ({ deprecation={deprecation} parentNodes={parentNodes} domain={domain} + entityTitleSuffix={ + View Related Entities + } /> ); }; diff --git a/datahub-web-react/src/app/entity/glossaryTerm/profile/GlossaryRelatedEntity.tsx b/datahub-web-react/src/app/entity/glossaryTerm/profile/GlossaryRelatedEntity.tsx index d0e8de0928b48..098e97e526fd8 100644 --- a/datahub-web-react/src/app/entity/glossaryTerm/profile/GlossaryRelatedEntity.tsx +++ b/datahub-web-react/src/app/entity/glossaryTerm/profile/GlossaryRelatedEntity.tsx @@ -5,7 +5,7 @@ import { EmbeddedListSearchSection } from '../../shared/components/styled/search import { useEntityData } from '../../shared/EntityContext'; export default function GlossaryRelatedEntity() { - const { entityData }: any = useEntityData(); + const { entityData } = useEntityData(); const entityUrn = entityData?.urn; diff --git a/datahub-web-react/src/app/entity/glossaryTerm/utils.ts b/datahub-web-react/src/app/entity/glossaryTerm/utils.ts index 3a2a3d35a8126..cbfa76fa34866 100644 --- a/datahub-web-react/src/app/entity/glossaryTerm/utils.ts +++ b/datahub-web-react/src/app/entity/glossaryTerm/utils.ts @@ -6,3 +6,7 @@ export function sortGlossaryTerms(entityRegistry: EntityRegistry, nodeA?: Entity const nodeBName = entityRegistry.getDisplayName(EntityType.GlossaryTerm, nodeB) || ''; return nodeAName.localeCompare(nodeBName); } + +export function getRelatedEntitiesUrl(entityRegistry: EntityRegistry, urn: string) { + return `${entityRegistry.getEntityUrl(EntityType.GlossaryTerm, urn)}/${encodeURIComponent('Related Entities')}`; +} diff --git a/datahub-web-react/src/app/entity/group/preview/Preview.tsx b/datahub-web-react/src/app/entity/group/preview/Preview.tsx index dc83f6fe4f840..67449b9a481f0 100644 --- a/datahub-web-react/src/app/entity/group/preview/Preview.tsx +++ 
b/datahub-web-react/src/app/entity/group/preview/Preview.tsx @@ -8,6 +8,7 @@ import { useEntityRegistry } from '../../../useEntityRegistry'; import { ANTD_GRAY } from '../../shared/constants'; import { IconStyleType } from '../../Entity'; import NoMarkdownViewer from '../../shared/components/styled/StripMarkdownText'; +import SearchTextHighlighter from '../../../search/matches/SearchTextHighlighter'; const PreviewContainer = styled.div` margin-bottom: 4px; @@ -87,7 +88,9 @@ export const Preview = ({ {entityRegistry.getEntityName(EntityType.CorpGroup)} - {name || urn} + + {name ? : urn} + {membersCount} members @@ -96,7 +99,12 @@ export const Preview = ({ {description && description.length > 0 && ( - {description} + } + > + {description} + )} diff --git a/datahub-web-react/src/app/entity/shared/ExternalUrlButton.tsx b/datahub-web-react/src/app/entity/shared/ExternalUrlButton.tsx index 9677af0776604..dce74c02cdb34 100644 --- a/datahub-web-react/src/app/entity/shared/ExternalUrlButton.tsx +++ b/datahub-web-react/src/app/entity/shared/ExternalUrlButton.tsx @@ -1,28 +1,11 @@ -import { ArrowRightOutlined } from '@ant-design/icons'; -import { Button } from 'antd'; import React from 'react'; -import styled from 'styled-components/macro'; import { EntityType } from '../../../types.generated'; import analytics, { EventType, EntityActionType } from '../../analytics'; +import UrlButton from './UrlButton'; const GITHUB_LINK = 'github.com'; const GITHUB = 'GitHub'; -const ExternalUrlWrapper = styled.span` - font-size: 12px; -`; - -const StyledButton = styled(Button)` - > :hover { - text-decoration: underline; - } - &&& { - padding-bottom: 0px; - } - padding-left: 12px; - padding-right: 12px; -`; - interface Props { externalUrl: string; platformName?: string; @@ -46,17 +29,8 @@ export default function ExternalUrlButton({ externalUrl, platformName, entityTyp } return ( - - - {displayedName ? `View in ${displayedName}` : 'View link'}{' '} - - - + + {displayedName ? 
`View in ${displayedName}` : 'View link'} + ); } diff --git a/datahub-web-react/src/app/entity/shared/UrlButton.tsx b/datahub-web-react/src/app/entity/shared/UrlButton.tsx new file mode 100644 index 0000000000000..a6f6da4a60ad5 --- /dev/null +++ b/datahub-web-react/src/app/entity/shared/UrlButton.tsx @@ -0,0 +1,37 @@ +import React, { ReactNode } from 'react'; +import { ArrowRightOutlined } from '@ant-design/icons'; +import { Button } from 'antd'; +import styled from 'styled-components/macro'; + +const UrlButtonContainer = styled.span` + font-size: 12px; +`; + +const StyledButton = styled(Button)` + > :hover { + text-decoration: underline; + } + &&& { + padding-bottom: 0px; + } + padding-left: 12px; + padding-right: 12px; +`; + +interface Props { + href: string; + children: ReactNode; + onClick?: () => void; +} + +const NOOP = () => {}; + +export default function UrlButton({ href, children, onClick = NOOP }: Props) { + return ( + + + {children} + + + ); +} diff --git a/datahub-web-react/src/app/entity/shared/__tests__/utils.test.ts b/datahub-web-react/src/app/entity/shared/__tests__/utils.test.ts deleted file mode 100644 index 86dec46528b49..0000000000000 --- a/datahub-web-react/src/app/entity/shared/__tests__/utils.test.ts +++ /dev/null @@ -1,37 +0,0 @@ -import { getMatchPrioritizingPrimary } from '../utils'; - -const MOCK_MATCHED_FIELDS = [ - { - name: 'fieldPaths', - value: 'rain', - }, - { - name: 'description', - value: 'rainbow', - }, - { - name: 'fieldPaths', - value: 'rainbow', - }, - { - name: 'fieldPaths', - value: 'rainbows', - }, -]; - -describe('utils', () => { - describe('getMatchPrioritizingPrimary', () => { - it('prioritizes exact match', () => { - global.window.location.search = 'query=rainbow'; - const match = getMatchPrioritizingPrimary(MOCK_MATCHED_FIELDS, 'fieldPaths'); - expect(match?.value).toEqual('rainbow'); - expect(match?.name).toEqual('fieldPaths'); - }); - it('will accept first contains match', () => { - global.window.location.search = 'query=bow'; - const match = getMatchPrioritizingPrimary(MOCK_MATCHED_FIELDS, 'fieldPaths'); - expect(match?.value).toEqual('rainbow'); - expect(match?.name).toEqual('fieldPaths'); - }); - }); -}); diff --git a/datahub-web-react/src/app/entity/shared/components/styled/StripMarkdownText.tsx b/datahub-web-react/src/app/entity/shared/components/styled/StripMarkdownText.tsx index 59293c2b0eee5..212813ffcb643 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/StripMarkdownText.tsx +++ b/datahub-web-react/src/app/entity/shared/components/styled/StripMarkdownText.tsx @@ -17,6 +17,7 @@ export type Props = { suffix?: JSX.Element; limit?: number; shouldWrap?: boolean; + customRender?: (text: string) => JSX.Element; }; export const removeMarkdown = (text: string) => { @@ -29,7 +30,7 @@ export const removeMarkdown = (text: string) => { .replace(/^•/, ''); // remove first • }; -export default function NoMarkdownViewer({ children, readMore, suffix, limit, shouldWrap }: Props) { +export default function NoMarkdownViewer({ children, customRender, readMore, suffix, limit, shouldWrap }: Props) { let plainText = removeMarkdown(children || ''); if (limit) { @@ -44,7 +45,8 @@ export default function NoMarkdownViewer({ children, readMore, suffix, limit, sh return ( - {plainText} {showReadMore && <>{readMore}} {suffix} + {customRender ? 
customRender(plainText) : plainText} + {showReadMore && <>{readMore}} {suffix} ); } diff --git a/datahub-web-react/src/app/entity/shared/components/styled/StyledTag.tsx b/datahub-web-react/src/app/entity/shared/components/styled/StyledTag.tsx index c1a23811fdd7e..08087bfd79b8e 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/StyledTag.tsx +++ b/datahub-web-react/src/app/entity/shared/components/styled/StyledTag.tsx @@ -6,7 +6,15 @@ export const generateColor = new ColorHash({ saturation: 0.9, }); -export const StyledTag = styled(Tag)<{ $color: any; $colorHash?: string; fontSize?: number }>` +export const StyledTag = styled(Tag)<{ $color: any; $colorHash?: string; fontSize?: number; highlightTag?: boolean }>` + &&& { + ${(props) => + props.highlightTag && + ` + background: ${props.theme.styles['highlight-color']}; + border: 1px solid ${props.theme.styles['highlight-border-color']}; + `} + } ${(props) => props.fontSize && `font-size: ${props.fontSize}px;`} ${(props) => props.$colorHash && diff --git a/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx b/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx index 1aef497ced57b..bcce994c3f0f8 100644 --- a/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx +++ b/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx @@ -33,7 +33,7 @@ type LinkListProps = { }; export const LinkList = ({ refetch }: LinkListProps) => { - const { entityData } = useEntityData(); + const { urn: entityUrn, entityData } = useEntityData(); const entityRegistry = useEntityRegistry(); const [removeLinkMutation] = useRemoveLinkMutation(); const links = entityData?.institutionalMemory?.elements || []; @@ -41,7 +41,7 @@ export const LinkList = ({ refetch }: LinkListProps) => { const handleDeleteLink = async (metadata: InstitutionalMemoryMetadata) => { try { await removeLinkMutation({ - variables: { input: { linkUrl: metadata.url, resourceUrn: metadata.associatedUrn } }, + variables: { input: { linkUrl: metadata.url, resourceUrn: metadata.associatedUrn || entityUrn } }, }); message.success({ content: 'Link Removed', duration: 2 }); } catch (e: unknown) { diff --git a/datahub-web-react/src/app/entity/shared/utils.ts b/datahub-web-react/src/app/entity/shared/utils.ts index 712b5f61f002a..3f0c1ddae24e0 100644 --- a/datahub-web-react/src/app/entity/shared/utils.ts +++ b/datahub-web-react/src/app/entity/shared/utils.ts @@ -1,9 +1,7 @@ -import * as QueryString from 'query-string'; import { Maybe } from 'graphql/jsutils/Maybe'; -import { Entity, EntityType, MatchedField, EntityRelationshipsResult, DataProduct } from '../../../types.generated'; +import { Entity, EntityType, EntityRelationshipsResult, DataProduct } from '../../../types.generated'; import { capitalizeFirstLetterOnly } from '../../shared/textUtil'; -import { FIELDS_TO_HIGHLIGHT } from '../dataset/search/highlights'; import { GenericEntityProperties } from './types'; export function dictToQueryStringParams(params: Record) { @@ -87,46 +85,6 @@ export const isListSubset = (l1, l2): boolean => { return l1.every((result) => l2.indexOf(result) >= 0); }; -function normalize(value: string) { - return value.trim().toLowerCase(); -} - -function fromQueryGetBestMatch(selectedMatchedFields: MatchedField[], rawQuery: string) { - const query = normalize(rawQuery); - // first lets see if there's an exact match between a field value and the query - const exactMatch = 
selectedMatchedFields.find((field) => normalize(field.value) === query); - if (exactMatch) { - return exactMatch; - } - - // if no exact match exists, we'll see if the entire query is contained in any of the values - const containedMatch = selectedMatchedFields.find((field) => normalize(field.value).includes(query)); - if (containedMatch) { - return containedMatch; - } - - // otherwise, just return whichever is first - return selectedMatchedFields[0]; -} - -export const getMatchPrioritizingPrimary = ( - matchedFields: MatchedField[], - primaryField: string, -): MatchedField | undefined => { - const { location } = window; - const params = QueryString.parse(location.search, { arrayFormat: 'comma' }); - const query: string = decodeURIComponent(params.query ? (params.query as string) : ''); - - const primaryMatches = matchedFields.filter((field) => field.name === primaryField); - if (primaryMatches.length > 0) { - return fromQueryGetBestMatch(primaryMatches, query); - } - - const matchesThatShouldBeShownOnFE = matchedFields.filter((field) => FIELDS_TO_HIGHLIGHT.has(field.name)); - - return fromQueryGetBestMatch(matchesThatShouldBeShownOnFE, query); -}; - function getGraphqlErrorCode(e) { if (e.graphQLErrors && e.graphQLErrors.length) { const firstError = e.graphQLErrors[0]; diff --git a/datahub-web-react/src/app/entity/user/preview/Preview.tsx b/datahub-web-react/src/app/entity/user/preview/Preview.tsx index 01f68d9065523..8893d4ab86786 100644 --- a/datahub-web-react/src/app/entity/user/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/user/preview/Preview.tsx @@ -7,6 +7,7 @@ import { useEntityRegistry } from '../../../useEntityRegistry'; import { ANTD_GRAY } from '../../shared/constants'; import { IconStyleType } from '../../Entity'; import { CustomAvatar } from '../../../shared/avatar'; +import SearchTextHighlighter from '../../../search/matches/SearchTextHighlighter'; const PreviewContainer = styled.div` display: flex; @@ -80,11 +81,17 @@ export const Preview = ({ {entityRegistry.getEntityName(EntityType.CorpUser)} - {name || urn} + + {name ? 
: urn} + - {title && {title}} + {title && ( + + + + )} diff --git a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx index 36713cfb7ffcf..0d0a32f7750a8 100644 --- a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx +++ b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx @@ -34,6 +34,7 @@ import ExternalUrlButton from '../entity/shared/ExternalUrlButton'; import EntityPaths from './EntityPaths/EntityPaths'; import { DataProductLink } from '../shared/tags/DataProductLink'; import { EntityHealth } from '../entity/shared/containers/profile/header/EntityHealth'; +import SearchTextHighlighter from '../search/matches/SearchTextHighlighter'; import { getUniqueOwners } from './utils'; const PreviewContainer = styled.div` @@ -173,6 +174,7 @@ interface Props { deprecation?: Deprecation | null; topUsers?: Array | null; externalUrl?: string | null; + entityTitleSuffix?: React.ReactNode; subHeader?: React.ReactNode; snippet?: React.ReactNode; insights?: Array | null; @@ -225,6 +227,7 @@ export default function DefaultPreviewCard({ titleSizePx, dataTestID, externalUrl, + entityTitleSuffix, onClick, degree, parentContainers, @@ -289,7 +292,7 @@ export default function DefaultPreviewCard({ ) : ( - {name || ' '} + )} @@ -305,6 +308,7 @@ export default function DefaultPreviewCard({ entityType={type} /> )} + {entityTitleSuffix} {degree !== undefined && degree !== null && ( ) : undefined } + customRender={(text) => } > {description} diff --git a/datahub-web-react/src/app/search/EmptySearchResults.tsx b/datahub-web-react/src/app/search/EmptySearchResults.tsx new file mode 100644 index 0000000000000..cde61f746d35b --- /dev/null +++ b/datahub-web-react/src/app/search/EmptySearchResults.tsx @@ -0,0 +1,90 @@ +import { RocketOutlined } from '@ant-design/icons'; +import { useHistory } from 'react-router'; +import { Button } from 'antd'; +import React, { useCallback } from 'react'; +import styled from 'styled-components'; +import { ANTD_GRAY_V2 } from '../entity/shared/constants'; +import { navigateToSearchUrl } from './utils/navigateToSearchUrl'; +import analytics, { EventType } from '../analytics'; +import { SuggestedText } from './suggestions/SearchQuerySugggester'; +import useGetSearchQueryInputs from './useGetSearchQueryInputs'; +import { FacetFilterInput, SearchSuggestion } from '../../types.generated'; +import { useUserContext } from '../context/useUserContext'; + +const NoDataContainer = styled.div` + margin: 40px auto; + font-size: 16px; + color: ${ANTD_GRAY_V2[8]}; +`; + +const Section = styled.div` + margin-bottom: 16px; +`; + +function getRefineSearchText(filters: FacetFilterInput[], viewUrn?: string | null) { + let text = ''; + if (filters.length && viewUrn) { + text = 'clearing all filters and selected view'; + } else if (filters.length) { + text = 'clearing all filters'; + } else if (viewUrn) { + text = 'clearing the selected view'; + } + + return text; +} + +interface Props { + suggestions: SearchSuggestion[]; +} + +export default function EmptySearchResults({ suggestions }: Props) { + const { query, filters, viewUrn } = useGetSearchQueryInputs(); + const history = useHistory(); + const userContext = useUserContext(); + const suggestText = suggestions.length > 0 ? 
suggestions[0].text : ''; + const refineSearchText = getRefineSearchText(filters, viewUrn); + + const onClickExploreAll = useCallback(() => { + analytics.event({ type: EventType.SearchResultsExploreAllClickEvent }); + navigateToSearchUrl({ query: '*', history }); + }, [history]); + + const searchForSuggestion = () => { + navigateToSearchUrl({ query: suggestText, history }); + }; + + const clearFiltersAndView = () => { + navigateToSearchUrl({ query, history }); + userContext.updateLocalState({ + ...userContext.localState, + selectedViewUrn: undefined, + }); + }; + + return ( + +
No results found for "{query}"
+ {refineSearchText && ( + <> + Try {refineSearchText}{' '} + {suggestText && ( + <> + or searching for {suggestText} + + )} + + )} + {!refineSearchText && suggestText && ( + <> + Did you mean {suggestText} + + )} + {!refineSearchText && !suggestText && ( + + )} +
+ ); +} diff --git a/datahub-web-react/src/app/search/EntityGroupSearchResults.tsx b/datahub-web-react/src/app/search/EntityGroupSearchResults.tsx deleted file mode 100644 index 9b577048145c5..0000000000000 --- a/datahub-web-react/src/app/search/EntityGroupSearchResults.tsx +++ /dev/null @@ -1,98 +0,0 @@ -import { ArrowRightOutlined } from '@ant-design/icons'; -import { Button, Card, Divider, List, Space, Typography } from 'antd'; -import { ListProps } from 'antd/lib/list'; -import * as React from 'react'; -import { useHistory } from 'react-router-dom'; -import styled from 'styled-components'; -import { EntityType, SearchResult } from '../../types.generated'; -import { IconStyleType } from '../entity/Entity'; -import { useEntityRegistry } from '../useEntityRegistry'; -import { navigateToSearchUrl } from './utils/navigateToSearchUrl'; -import analytics, { EventType } from '../analytics'; - -const styles = { - header: { marginBottom: 20 }, - resultHeaderCardBody: { padding: '16px 24px' }, - resultHeaderCard: { right: '52px', top: '-40px', position: 'absolute' }, - seeAllButton: { fontSize: 18 }, - resultsContainer: { width: '100%', padding: '40px 132px' }, -}; - -const ResultList = styled(List)` - &&& { - width: 100%; - border-color: ${(props) => props.theme.styles['border-color-base']}; - margin-top: 8px; - padding: 16px 48px; - box-shadow: ${(props) => props.theme.styles['box-shadow']}; - } -`; - -interface Props { - type: EntityType; - query: string; - searchResults: Array; -} - -export const EntityGroupSearchResults = ({ type, query, searchResults }: Props) => { - const history = useHistory(); - const entityRegistry = useEntityRegistry(); - - const onResultClick = (result: SearchResult, index: number) => { - analytics.event({ - type: EventType.SearchResultClickEvent, - query, - entityUrn: result.entity.urn, - entityType: result.entity.type, - index, - total: searchResults.length, - }); - }; - - return ( - - >> - header={ - - {entityRegistry.getCollectionName(type)} - - {entityRegistry.getIcon(type, 36, IconStyleType.ACCENT)} - - - } - footer={ - searchResults.length > 0 && ( - - ) - } - dataSource={searchResults as SearchResult[]} - split={false} - renderItem={(searchResult, index) => ( - <> - onResultClick(searchResult, index)}> - {entityRegistry.renderSearchResult(type, searchResult)} - - {index < searchResults.length - 1 && } - - )} - bordered - /> - - ); -}; diff --git a/datahub-web-react/src/app/search/SearchPage.tsx b/datahub-web-react/src/app/search/SearchPage.tsx index ce353640d8179..6387f0ef8c05e 100644 --- a/datahub-web-react/src/app/search/SearchPage.tsx +++ b/datahub-web-react/src/app/search/SearchPage.tsx @@ -59,6 +59,7 @@ export const SearchPage = () => { orFilters, viewUrn, sortInput, + searchFlags: { getSuggestions: true }, }, }, }); @@ -235,6 +236,7 @@ export const SearchPage = () => { error={error} searchResponse={data?.searchAcrossEntities} facets={data?.searchAcrossEntities?.facets} + suggestions={data?.searchAcrossEntities?.suggestions || []} selectedFilters={filters} loading={loading} onChangeFilters={onChangeFilters} diff --git a/datahub-web-react/src/app/search/SearchResultList.tsx b/datahub-web-react/src/app/search/SearchResultList.tsx index 6e2d5c923c6e2..386b22f34602b 100644 --- a/datahub-web-react/src/app/search/SearchResultList.tsx +++ b/datahub-web-react/src/app/search/SearchResultList.tsx @@ -1,18 +1,16 @@ -import React, { useCallback } from 'react'; -import { Button, Checkbox, Divider, Empty, List, ListProps } from 'antd'; +import React from 'react'; 
+import { Checkbox, Divider, List, ListProps } from 'antd'; import styled from 'styled-components'; -import { useHistory } from 'react-router'; -import { RocketOutlined } from '@ant-design/icons'; -import { navigateToSearchUrl } from './utils/navigateToSearchUrl'; import { ANTD_GRAY } from '../entity/shared/constants'; import { SEPARATE_SIBLINGS_URL_PARAM } from '../entity/shared/siblingUtils'; import { CompactEntityNameList } from '../recommendations/renderer/component/CompactEntityNameList'; import { useEntityRegistry } from '../useEntityRegistry'; -import { SearchResult } from '../../types.generated'; +import { SearchResult, SearchSuggestion } from '../../types.generated'; import analytics, { EventType } from '../analytics'; import { EntityAndType } from '../entity/shared/types'; import { useIsSearchV2 } from './useSearchAndBrowseVersion'; import { CombinedSearchResult } from './utils/combineSiblingsInSearchResults'; +import EmptySearchResults from './EmptySearchResults'; const ResultList = styled(List)` &&& { @@ -28,13 +26,6 @@ const StyledCheckbox = styled(Checkbox)` margin-right: 12px; `; -const NoDataContainer = styled.div` - > div { - margin-top: 28px; - margin-bottom: 28px; - } -`; - const ThinDivider = styled(Divider)` margin-top: 16px; margin-bottom: 16px; @@ -70,6 +61,7 @@ type Props = { isSelectMode: boolean; selectedEntities: EntityAndType[]; setSelectedEntities: (entities: EntityAndType[]) => any; + suggestions: SearchSuggestion[]; }; export const SearchResultList = ({ @@ -79,17 +71,12 @@ export const SearchResultList = ({ isSelectMode, selectedEntities, setSelectedEntities, + suggestions, }: Props) => { - const history = useHistory(); const entityRegistry = useEntityRegistry(); const selectedEntityUrns = selectedEntities.map((entity) => entity.urn); const showSearchFiltersV2 = useIsSearchV2(); - const onClickExploreAll = useCallback(() => { - analytics.event({ type: EventType.SearchResultsExploreAllClickEvent }); - navigateToSearchUrl({ query: '*', history }); - }, [history]); - const onClickResult = (result: SearchResult, index: number) => { analytics.event({ type: EventType.SearchResultClickEvent, @@ -118,19 +105,7 @@ export const SearchResultList = ({ id="search-result-list" dataSource={searchResults} split={false} - locale={{ - emptyText: ( - - - - - ), - }} + locale={{ emptyText: }} renderItem={(item, index) => ( ` display: flex; @@ -131,6 +132,7 @@ interface Props { setNumResultsPerPage: (numResults: number) => void; isSelectMode: boolean; selectedEntities: EntityAndType[]; + suggestions: SearchSuggestion[]; setSelectedEntities: (entities: EntityAndType[]) => void; setIsSelectMode: (showSelectMode: boolean) => any; onChangeSelectAll: (selected: boolean) => void; @@ -155,6 +157,7 @@ export const SearchResults = ({ setNumResultsPerPage, isSelectMode, selectedEntities, + suggestions, setIsSelectMode, setSelectedEntities, onChangeSelectAll, @@ -238,6 +241,7 @@ export const SearchResults = ({ {(error && ) || (!loading && ( + {totalResults > 0 && } - - SearchCfg.RESULTS_PER_PAGE} - onShowSizeChange={(_currNum, newNum) => setNumResultsPerPage(newNum)} - pageSizeOptions={['10', '20', '50', '100']} - /> - + {totalResults > 0 && ( + + SearchCfg.RESULTS_PER_PAGE} + onShowSizeChange={(_currNum, newNum) => setNumResultsPerPage(newNum)} + pageSizeOptions={['10', '20', '50', '100']} + /> + + )} {authenticatedUserUrn && ( void; }; export const DEFAULT_CONTEXT = { + query: undefined, selectedSortOption: undefined, setSelectedSortOption: (_: string) => null, }; @@ -21,3 +23,7 @@ export 
function useSearchContext() { export function useSelectedSortOption() { return useSearchContext().selectedSortOption; } + +export function useSearchQuery() { + return useSearchContext().query; +} diff --git a/datahub-web-react/src/app/search/context/SearchContextProvider.tsx b/datahub-web-react/src/app/search/context/SearchContextProvider.tsx index bfb65c1d74d3e..5ad9667ab1fc0 100644 --- a/datahub-web-react/src/app/search/context/SearchContextProvider.tsx +++ b/datahub-web-react/src/app/search/context/SearchContextProvider.tsx @@ -8,6 +8,7 @@ export default function SearchContextProvider({ children }: { children: React.Re const history = useHistory(); const location = useLocation(); const params = useMemo(() => QueryString.parse(location.search, { arrayFormat: 'comma' }), [location.search]); + const query = (params.query ? decodeURIComponent(params.query as string) : undefined) as string | undefined; const selectedSortOption = params.sortOption as string | undefined; function setSelectedSortOption(selectedOption: string) { @@ -15,7 +16,7 @@ export default function SearchContextProvider({ children }: { children: React.Re } return ( - + {children} ); diff --git a/datahub-web-react/src/app/search/context/SearchResultContext.tsx b/datahub-web-react/src/app/search/context/SearchResultContext.tsx new file mode 100644 index 0000000000000..68adead005149 --- /dev/null +++ b/datahub-web-react/src/app/search/context/SearchResultContext.tsx @@ -0,0 +1,72 @@ +import React, { ReactNode, createContext, useContext, useMemo } from 'react'; +import { SearchResult } from '../../../types.generated'; +import { + getMatchedFieldsByUrn, + getMatchedFieldNames, + getMatchedFieldsByNames, + shouldShowInMatchedFieldList, + getMatchedFieldLabel, + getMatchesPrioritized, +} from '../matches/utils'; +import { MatchedFieldName } from '../matches/constants'; + +type SearchResultContextValue = { + searchResult: SearchResult; +} | null; + +const SearchResultContext = createContext(null); + +type Props = { + children: ReactNode; + searchResult: SearchResult; +}; + +export const SearchResultProvider = ({ children, searchResult }: Props) => { + const value = useMemo( + () => ({ + searchResult, + }), + [searchResult], + ); + return {children}; +}; + +const useSearchResultContext = () => { + return useContext(SearchResultContext); +}; + +export const useSearchResult = () => { + return useSearchResultContext()?.searchResult; +}; + +export const useEntityType = () => { + return useSearchResultContext()?.searchResult.entity.type; +}; + +export const useMatchedFields = () => { + return useSearchResult()?.matchedFields ?? []; +}; + +export const useMatchedFieldsForList = (primaryField: MatchedFieldName) => { + const entityType = useEntityType(); + const matchedFields = useMatchedFields(); + const showableFields = matchedFields.filter((field) => shouldShowInMatchedFieldList(entityType, field)); + return entityType ? 
getMatchesPrioritized(entityType, showableFields, primaryField) : []; +}; + +export const useMatchedFieldsByGroup = (fieldName: MatchedFieldName) => { + const entityType = useEntityType(); + const matchedFields = useMatchedFields(); + const matchedFieldNames = getMatchedFieldNames(entityType, fieldName); + return getMatchedFieldsByNames(matchedFields, matchedFieldNames); +}; + +export const useHasMatchedFieldByUrn = (urn: string, fieldName: MatchedFieldName) => { + const matchedFields = useMatchedFieldsByGroup(fieldName); + return getMatchedFieldsByUrn(matchedFields, urn).length > 0; +}; + +export const useMatchedFieldLabel = (fieldName: string) => { + const entityType = useEntityType(); + return getMatchedFieldLabel(entityType, fieldName); +}; diff --git a/datahub-web-react/src/app/search/matches/MatchedFieldList.tsx b/datahub-web-react/src/app/search/matches/MatchedFieldList.tsx new file mode 100644 index 0000000000000..0bfe000dea366 --- /dev/null +++ b/datahub-web-react/src/app/search/matches/MatchedFieldList.tsx @@ -0,0 +1,133 @@ +import React from 'react'; + +import { Tooltip, Typography } from 'antd'; +import styled from 'styled-components'; +import { useMatchedFieldLabel, useMatchedFieldsForList } from '../context/SearchResultContext'; +import { MatchedField } from '../../../types.generated'; +import { ANTD_GRAY_V2 } from '../../entity/shared/constants'; +import { useSearchQuery } from '../context/SearchContext'; +import { MatchesGroupedByFieldName } from './constants'; +import { useEntityRegistry } from '../../useEntityRegistry'; +import { getDescriptionSlice, isDescriptionField, isHighlightableEntityField } from './utils'; + +const MatchesContainer = styled.div` + display: flex; + flex-wrap: wrap; + gap: 8px; +`; + +const MatchText = styled(Typography.Text)` + color: ${ANTD_GRAY_V2[8]}; + background: ${(props) => props.theme.styles['highlight-color']}; + border-radius: 4px; + padding: 2px 4px 2px 4px; + padding-right: 4px; +`; + +const MATCH_GROUP_LIMIT = 3; +const TOOLTIP_MATCH_GROUP_LIMIT = 10; + +type CustomFieldRenderer = (field: MatchedField) => JSX.Element | null; + +type Props = { + customFieldRenderer?: CustomFieldRenderer; + matchSuffix?: string; +}; + +const RenderedField = ({ + customFieldRenderer, + field, +}: { + customFieldRenderer?: CustomFieldRenderer; + field: MatchedField; +}) => { + const entityRegistry = useEntityRegistry(); + const query = useSearchQuery()?.trim().toLowerCase(); + const customRenderedField = customFieldRenderer?.(field); + if (customRenderedField) return {customRenderedField}; + if (isHighlightableEntityField(field)) { + return field.entity ? 
<>{entityRegistry.getDisplayName(field.entity.type, field.entity)} : <>; + } + if (isDescriptionField(field) && query) return {getDescriptionSlice(field.value, query)}; + return {field.value}; +}; + +const MatchedFieldsList = ({ + groupedMatch, + limit, + tooltip, + matchSuffix = '', + customFieldRenderer, +}: { + groupedMatch: MatchesGroupedByFieldName; + limit: number; + tooltip?: JSX.Element; + matchSuffix?: string; + customFieldRenderer?: CustomFieldRenderer; +}) => { + const label = useMatchedFieldLabel(groupedMatch.fieldName); + const count = groupedMatch.matchedFields.length; + const moreCount = Math.max(count - limit, 0); + const andMore = ( + <> + {' '} + & more + + ); + return ( + <> + Matches {count > 1 && `${count} `} + {label} + {count > 1 && 's'}{' '} + {groupedMatch.matchedFields.slice(0, limit).map((field, index) => ( + <> + {index > 0 && ', '} + <> + + + + ))} + {moreCount > 0 && + (tooltip ? ( + + {andMore} + + ) : ( + <>{andMore} + ))}{' '} + {matchSuffix} + + ); +}; + +export const MatchedFieldList = ({ customFieldRenderer, matchSuffix = '' }: Props) => { + const groupedMatches = useMatchedFieldsForList('fieldLabels'); + + return ( + <> + {groupedMatches.length > 0 ? ( + + {groupedMatches.map((groupedMatch) => { + return ( + + + } + /> + + ); + })} + + ) : null} + + ); +}; diff --git a/datahub-web-react/src/app/search/matches/SearchTextHighlighter.tsx b/datahub-web-react/src/app/search/matches/SearchTextHighlighter.tsx new file mode 100644 index 0000000000000..d8da1088ea89d --- /dev/null +++ b/datahub-web-react/src/app/search/matches/SearchTextHighlighter.tsx @@ -0,0 +1,42 @@ +import React from 'react'; +import Highlight from 'react-highlighter'; +import styled from 'styled-components'; +import { useMatchedFieldsByGroup } from '../context/SearchResultContext'; +import { useSearchQuery } from '../context/SearchContext'; +import { MatchedFieldName } from './constants'; +import { useAppConfig } from '../../useAppConfig'; + +type Props = { + field: MatchedFieldName; + text: string; + enableFullHighlight?: boolean; +}; + +const HIGHLIGHT_ALL_PATTERN = /.*/; + +const StyledHighlight = styled(Highlight).attrs((props) => ({ + matchStyle: { background: props.theme.styles['highlight-color'] }, +}))``; + +const SearchTextHighlighter = ({ field, text, enableFullHighlight = false }: Props) => { + const appConfig = useAppConfig(); + const enableNameHighlight = appConfig.config.visualConfig.searchResult?.enableNameHighlight; + const matchedFields = useMatchedFieldsByGroup(field); + const hasMatchedField = !!matchedFields?.length; + const normalizedSearchQuery = useSearchQuery()?.trim().toLowerCase(); + const normalizedText = text.trim().toLowerCase(); + const hasSubstring = hasMatchedField && !!normalizedSearchQuery && normalizedText.includes(normalizedSearchQuery); + const pattern = enableFullHighlight ? HIGHLIGHT_ALL_PATTERN : undefined; + + return ( + <> + {enableNameHighlight && hasMatchedField ? 
( + {text} + ) : ( + text + )} + + ); +}; + +export default SearchTextHighlighter; diff --git a/datahub-web-react/src/app/search/matches/constants.ts b/datahub-web-react/src/app/search/matches/constants.ts new file mode 100644 index 0000000000000..25ca82eef9597 --- /dev/null +++ b/datahub-web-react/src/app/search/matches/constants.ts @@ -0,0 +1,129 @@ +import { EntityType, MatchedField } from '../../../types.generated'; + +export type MatchedFieldName = + | 'urn' + | 'name' + | 'displayName' + | 'title' + | 'description' + | 'editedDescription' + | 'editedFieldDescriptions' + | 'fieldDescriptions' + | 'tags' + | 'fieldTags' + | 'editedFieldTags' + | 'glossaryTerms' + | 'fieldGlossaryTerms' + | 'editedFieldGlossaryTerms' + | 'fieldLabels' + | 'fieldPaths'; + +export type MatchedFieldConfig = { + name: MatchedFieldName; + groupInto?: MatchedFieldName; + label: string; + showInMatchedFieldList?: boolean; +}; + +const DEFAULT_MATCHED_FIELD_CONFIG: Array = [ + { + name: 'urn', + label: 'urn', + }, + { + name: 'title', + label: 'title', + }, + { + name: 'displayName', + groupInto: 'name', + label: 'display name', + }, + { + name: 'name', + groupInto: 'name', + label: 'name', + }, + { + name: 'editedDescription', + groupInto: 'description', + label: 'description', + }, + { + name: 'description', + groupInto: 'description', + label: 'description', + }, + { + name: 'editedFieldDescriptions', + groupInto: 'fieldDescriptions', + label: 'column description', + showInMatchedFieldList: true, + }, + { + name: 'fieldDescriptions', + groupInto: 'fieldDescriptions', + label: 'column description', + showInMatchedFieldList: true, + }, + { + name: 'tags', + label: 'tag', + }, + { + name: 'editedFieldTags', + groupInto: 'fieldTags', + label: 'column tag', + showInMatchedFieldList: true, + }, + { + name: 'fieldTags', + groupInto: 'fieldTags', + label: 'column tag', + showInMatchedFieldList: true, + }, + { + name: 'glossaryTerms', + label: 'term', + }, + { + name: 'editedFieldGlossaryTerms', + groupInto: 'fieldGlossaryTerms', + label: 'column term', + showInMatchedFieldList: true, + }, + { + name: 'fieldGlossaryTerms', + groupInto: 'fieldGlossaryTerms', + label: 'column term', + showInMatchedFieldList: true, + }, + { + name: 'fieldLabels', + label: 'label', + showInMatchedFieldList: true, + }, + { + name: 'fieldPaths', + label: 'column', + showInMatchedFieldList: true, + }, +]; + +export const CHART_DASHBOARD_FIELD_CONFIG: Array = DEFAULT_MATCHED_FIELD_CONFIG.map((config) => { + if (config.name === 'title') return { ...config, groupInto: 'name' }; + return config; +}); + +export const MATCHED_FIELD_CONFIG = { + [EntityType.Chart]: CHART_DASHBOARD_FIELD_CONFIG, + [EntityType.Dashboard]: CHART_DASHBOARD_FIELD_CONFIG, + DEFAULT: DEFAULT_MATCHED_FIELD_CONFIG, +} as const; + +export type MatchesGroupedByFieldName = { + fieldName: string; + matchedFields: Array; +}; + +export const HIGHLIGHTABLE_ENTITY_TYPES = [EntityType.Tag, EntityType.GlossaryTerm]; diff --git a/datahub-web-react/src/app/search/matches/matchedFieldPathsRenderer.tsx b/datahub-web-react/src/app/search/matches/matchedFieldPathsRenderer.tsx new file mode 100644 index 0000000000000..0a33530552864 --- /dev/null +++ b/datahub-web-react/src/app/search/matches/matchedFieldPathsRenderer.tsx @@ -0,0 +1,8 @@ +import React from 'react'; + +import { MatchedField } from '../../../types.generated'; +import { downgradeV2FieldPath } from '../../entity/dataset/profile/schema/utils/utils'; + +export const matchedFieldPathsRenderer = (matchedField: MatchedField) => { + 
return matchedField?.name === 'fieldPaths' ? {downgradeV2FieldPath(matchedField.value)} : null; +}; diff --git a/datahub-web-react/src/app/search/matches/matchedInputFieldRenderer.tsx b/datahub-web-react/src/app/search/matches/matchedInputFieldRenderer.tsx new file mode 100644 index 0000000000000..25634c9e8b80e --- /dev/null +++ b/datahub-web-react/src/app/search/matches/matchedInputFieldRenderer.tsx @@ -0,0 +1,40 @@ +import React from 'react'; + +import { Chart, Dashboard, EntityType, GlossaryTerm, MatchedField } from '../../../types.generated'; +import { useEntityRegistry } from '../../useEntityRegistry'; + +const LABEL_INDEX_NAME = 'fieldLabels'; +const TYPE_PROPERTY_KEY_NAME = 'type'; + +const TermName = ({ term }: { term: GlossaryTerm }) => { + const entityRegistry = useEntityRegistry(); + return <>{entityRegistry.getDisplayName(EntityType.GlossaryTerm, term)}; +}; + +export const matchedInputFieldRenderer = (matchedField: MatchedField, entity: Chart | Dashboard) => { + if (matchedField?.name === LABEL_INDEX_NAME) { + const matchedSchemaField = entity.inputFields?.fields?.find( + (field) => field?.schemaField?.label === matchedField.value, + ); + const matchedGlossaryTerm = matchedSchemaField?.schemaField?.glossaryTerms?.terms?.find( + (term) => term?.term?.name === matchedField.value, + ); + + if (matchedGlossaryTerm) { + let termType = 'term'; + const typeProperty = matchedGlossaryTerm.term.properties?.customProperties?.find( + (property) => property.key === TYPE_PROPERTY_KEY_NAME, + ); + if (typeProperty) { + termType = typeProperty.value || termType; + } + + return ( + <> + {termType} + + ); + } + } + return null; +}; diff --git a/datahub-web-react/src/app/search/matches/utils.test.ts b/datahub-web-react/src/app/search/matches/utils.test.ts new file mode 100644 index 0000000000000..8b5ed27f5c2ad --- /dev/null +++ b/datahub-web-react/src/app/search/matches/utils.test.ts @@ -0,0 +1,110 @@ +import { EntityType } from '../../../types.generated'; +import { getMatchesPrioritized } from './utils'; + +const mapping = new Map(); +mapping.set('fieldPaths', 'column'); +mapping.set('fieldDescriptions', 'column description'); +mapping.set('fieldTags', 'column tag'); + +const MOCK_MATCHED_FIELDS = [ + { + name: 'fieldPaths', + value: 'rain', + }, + { + name: 'fieldDescriptions', + value: 'rainbow', + }, + { + name: 'fieldPaths', + value: 'rainbow', + }, + { + name: 'fieldPaths', + value: 'rainbows', + }, +]; + +const MOCK_MATCHED_DESCRIPTION_FIELDS = [ + { + name: 'editedDescription', + value: 'edited description value', + }, + { + name: 'description', + value: 'description value', + }, + { + name: 'fieldDescriptions', + value: 'field descriptions value', + }, + { + name: 'editedFieldDescriptions', + value: 'edited field descriptions value', + }, +]; + +describe('utils', () => { + describe('getMatchPrioritizingPrimary', () => { + it('prioritizes exact match', () => { + global.window.location.search = 'query=rainbow'; + const groupedMatches = getMatchesPrioritized(EntityType.Dataset, MOCK_MATCHED_FIELDS, 'fieldPaths'); + expect(groupedMatches).toEqual([ + { + fieldName: 'fieldPaths', + matchedFields: [ + { name: 'fieldPaths', value: 'rainbow' }, + { name: 'fieldPaths', value: 'rainbows' }, + { name: 'fieldPaths', value: 'rain' }, + ], + }, + { + fieldName: 'fieldDescriptions', + matchedFields: [{ name: 'fieldDescriptions', value: 'rainbow' }], + }, + ]); + }); + it('will accept first contains match', () => { + global.window.location.search = 'query=bow'; + const groupedMatches = 
getMatchesPrioritized(EntityType.Dataset, MOCK_MATCHED_FIELDS, 'fieldPaths'); + expect(groupedMatches).toEqual([ + { + fieldName: 'fieldPaths', + matchedFields: [ + { name: 'fieldPaths', value: 'rainbow' }, + { name: 'fieldPaths', value: 'rainbows' }, + { name: 'fieldPaths', value: 'rain' }, + ], + }, + { + fieldName: 'fieldDescriptions', + matchedFields: [{ name: 'fieldDescriptions', value: 'rainbow' }], + }, + ]); + }); + it('will group by field name', () => { + global.window.location.search = ''; + const groupedMatches = getMatchesPrioritized( + EntityType.Dataset, + MOCK_MATCHED_DESCRIPTION_FIELDS, + 'fieldPaths', + ); + expect(groupedMatches).toEqual([ + { + fieldName: 'description', + matchedFields: [ + { name: 'editedDescription', value: 'edited description value' }, + { name: 'description', value: 'description value' }, + ], + }, + { + fieldName: 'fieldDescriptions', + matchedFields: [ + { name: 'fieldDescriptions', value: 'field descriptions value' }, + { name: 'editedFieldDescriptions', value: 'edited field descriptions value' }, + ], + }, + ]); + }); + }); +}); diff --git a/datahub-web-react/src/app/search/matches/utils.ts b/datahub-web-react/src/app/search/matches/utils.ts new file mode 100644 index 0000000000000..78c62f7eef458 --- /dev/null +++ b/datahub-web-react/src/app/search/matches/utils.ts @@ -0,0 +1,136 @@ +import * as QueryString from 'query-string'; +import { EntityType, MatchedField } from '../../../types.generated'; +import { + HIGHLIGHTABLE_ENTITY_TYPES, + MATCHED_FIELD_CONFIG, + MatchedFieldConfig, + MatchedFieldName, + MatchesGroupedByFieldName, +} from './constants'; + +const getFieldConfigsByEntityType = (entityType: EntityType | undefined): Array => { + return entityType && entityType in MATCHED_FIELD_CONFIG + ? MATCHED_FIELD_CONFIG[entityType] + : MATCHED_FIELD_CONFIG.DEFAULT; +}; + +export const shouldShowInMatchedFieldList = (entityType: EntityType | undefined, field: MatchedField): boolean => { + const configs = getFieldConfigsByEntityType(entityType); + return configs.some((config) => config.name === field.name && config.showInMatchedFieldList); +}; + +export const getMatchedFieldLabel = (entityType: EntityType | undefined, fieldName: string): string => { + const configs = getFieldConfigsByEntityType(entityType); + return configs.find((config) => config.name === fieldName)?.label ?? 
''; +}; + +export const getGroupedFieldName = ( + entityType: EntityType | undefined, + fieldName: string, +): MatchedFieldName | undefined => { + const configs = getFieldConfigsByEntityType(entityType); + const fieldConfig = configs.find((config) => config.name === fieldName); + return fieldConfig?.groupInto; +}; + +export const getMatchedFieldNames = ( + entityType: EntityType | undefined, + fieldName: MatchedFieldName, +): Array => { + return getFieldConfigsByEntityType(entityType) + .filter((config) => fieldName === config.groupInto || fieldName === config.name) + .map((field) => field.name); +}; + +export const getMatchedFieldsByNames = (fields: Array, names: Array): Array => { + return fields.filter((field) => names.includes(field.name)); +}; + +export const getMatchedFieldsByUrn = (fields: Array, urn: string): Array => { + return fields.filter((field) => field.value === urn); +}; + +function normalize(value: string) { + return value.trim().toLowerCase(); +} + +function fromQueryGetBestMatch( + selectedMatchedFields: MatchedField[], + rawQuery: string, + prioritizedField: string, +): Array { + const query = normalize(rawQuery); + const priorityMatches: Array = selectedMatchedFields.filter( + (field) => field.name === prioritizedField, + ); + const nonPriorityMatches: Array = selectedMatchedFields.filter( + (field) => field.name !== prioritizedField, + ); + const exactMatches: Array = []; + const containedMatches: Array = []; + const rest: Array = []; + + [...priorityMatches, ...nonPriorityMatches].forEach((field) => { + const normalizedValue = normalize(field.value); + if (normalizedValue === query) exactMatches.push(field); + else if (normalizedValue.includes(query)) containedMatches.push(field); + else rest.push(field); + }); + + return [...exactMatches, ...containedMatches, ...rest]; +} + +const getMatchesGroupedByFieldName = ( + entityType: EntityType, + matchedFields: Array, +): Array => { + const fieldNameToMatches = new Map>(); + const fieldNames: Array = []; + matchedFields.forEach((field) => { + const groupedFieldName = getGroupedFieldName(entityType, field.name) || field.name; + const matchesInMap = fieldNameToMatches.get(groupedFieldName); + if (matchesInMap) { + matchesInMap.push(field); + } else { + fieldNameToMatches.set(groupedFieldName, [field]); + fieldNames.push(groupedFieldName); + } + }); + return fieldNames.map((fieldName) => ({ + fieldName, + matchedFields: fieldNameToMatches.get(fieldName) ?? [], + })); +}; + +export const getMatchesPrioritized = ( + entityType: EntityType, + matchedFields: MatchedField[], + prioritizedField: string, +): Array => { + const { location } = window; + const params = QueryString.parse(location.search, { arrayFormat: 'comma' }); + const query: string = decodeURIComponent(params.query ? 
(params.query as string) : ''); + const matches = fromQueryGetBestMatch(matchedFields, query, prioritizedField); + return getMatchesGroupedByFieldName(entityType, matches); +}; + +export const isHighlightableEntityField = (field: MatchedField) => + !!field.entity && HIGHLIGHTABLE_ENTITY_TYPES.includes(field.entity.type); + +export const isDescriptionField = (field: MatchedField) => field.name.toLowerCase().includes('description'); + +const SURROUNDING_DESCRIPTION_CHARS = 10; +const MAX_DESCRIPTION_CHARS = 50; + +export const getDescriptionSlice = (text: string, target: string) => { + const queryIndex = text.indexOf(target); + const start = Math.max(0, queryIndex - SURROUNDING_DESCRIPTION_CHARS); + const end = Math.min( + start + MAX_DESCRIPTION_CHARS, + text.length, + queryIndex + target.length + SURROUNDING_DESCRIPTION_CHARS, + ); + const startEllipsis = start > 0 ? '...' : ''; + const endEllipsis = end < text.length ? '...' : ''; + return `${startEllipsis}${text.slice(start, end)}${endEllipsis}`; +}; diff --git a/datahub-web-react/src/app/search/suggestions/SearchQuerySugggester.tsx b/datahub-web-react/src/app/search/suggestions/SearchQuerySugggester.tsx new file mode 100644 index 0000000000000..9dbd67883bf64 --- /dev/null +++ b/datahub-web-react/src/app/search/suggestions/SearchQuerySugggester.tsx @@ -0,0 +1,39 @@ +import styled from 'styled-components'; +import React from 'react'; +import { useHistory } from 'react-router'; +import { SearchSuggestion } from '../../../types.generated'; +import { navigateToSearchUrl } from '../utils/navigateToSearchUrl'; +import { ANTD_GRAY_V2 } from '../../entity/shared/constants'; + +const TextWrapper = styled.div` + font-size: 14px; + color: ${ANTD_GRAY_V2[8]}; + margin: 16px 0 -8px 32px; +`; + +export const SuggestedText = styled.span` + color: ${(props) => props.theme.styles['primary-color']}; + text-decoration: underline ${(props) => props.theme.styles['primary-color']}; + cursor: pointer; +`; + +interface Props { + suggestions: SearchSuggestion[]; +} + +export default function SearchQuerySuggester({ suggestions }: Props) { + const history = useHistory(); + + if (suggestions.length === 0) return null; + const suggestText = suggestions[0].text; + + function searchForSuggestion() { + navigateToSearchUrl({ query: suggestText, history }); + } + + return ( + + Did you mean {suggestText} + + ); +} diff --git a/datahub-web-react/src/app/shared/tags/tag/Tag.tsx b/datahub-web-react/src/app/shared/tags/tag/Tag.tsx index 2288238091776..ed2460b6eea3c 100644 --- a/datahub-web-react/src/app/shared/tags/tag/Tag.tsx +++ b/datahub-web-react/src/app/shared/tags/tag/Tag.tsx @@ -8,6 +8,7 @@ import { StyledTag } from '../../../entity/shared/components/styled/StyledTag'; import { HoverEntityTooltip } from '../../../recommendations/renderer/component/HoverEntityTooltip'; import { useEntityRegistry } from '../../../useEntityRegistry'; import { TagProfileDrawer } from '../TagProfileDrawer'; +import { useHasMatchedFieldByUrn } from '../../../search/context/SearchResultContext'; const TagLink = styled.span` display: inline-block; @@ -41,6 +42,7 @@ export default function Tag({ }: Props) { const entityRegistry = useEntityRegistry(); const [removeTagMutation] = useRemoveTagMutation(); + const highlightTag = useHasMatchedFieldByUrn(tag.tag.urn, 'tags'); const [tagProfileDrawerVisible, setTagProfileDrawerVisible] = useState(false); const [addTagUrn, setAddTagUrn] = useState(''); @@ -110,6 +112,7 @@ export default function Tag({ removeTag(tag); }} fontSize={fontSize} + 
highlightTag={highlightTag} > ` +const StyledTag = styled(Tag)<{ fontSize?: number; highlightTerm?: boolean }>` + &&& { + ${(props) => + props.highlightTerm && + ` + background: ${props.theme.styles['highlight-color']}; + border: 1px solid ${props.theme.styles['highlight-border-color']}; + `} + } ${(props) => props.fontSize && `font-size: ${props.fontSize}px;`} `; @@ -38,6 +47,7 @@ export default function TermContent({ }: Props) { const entityRegistry = useEntityRegistry(); const [removeTermMutation] = useRemoveTermMutation(); + const highlightTerm = useHasMatchedFieldByUrn(term.term.urn, 'glossaryTerms'); const removeTerm = (termToRemove: GlossaryTermAssociation) => { onOpenModal?.(); @@ -85,6 +95,7 @@ export default function TermContent({ removeTerm(term); }} fontSize={fontSize} + highlightTerm={highlightTerm} > diff --git a/datahub-web-react/src/appConfigContext.tsx b/datahub-web-react/src/appConfigContext.tsx index 3b34b108ecc93..807a17c4fd6a4 100644 --- a/datahub-web-react/src/appConfigContext.tsx +++ b/datahub-web-react/src/appConfigContext.tsx @@ -27,6 +27,9 @@ export const DEFAULT_APP_CONFIG = { entityProfile: { domainDefaultTab: null, }, + searchResult: { + enableNameHighlight: false, + }, }, authConfig: { tokenAuthEnabled: false, diff --git a/datahub-web-react/src/conf/theme/theme_dark.config.json b/datahub-web-react/src/conf/theme/theme_dark.config.json index b648f3d997f21..9746c3ddde5f3 100644 --- a/datahub-web-react/src/conf/theme/theme_dark.config.json +++ b/datahub-web-react/src/conf/theme/theme_dark.config.json @@ -17,7 +17,9 @@ "disabled-color": "fade(white, 25%)", "steps-nav-arrow-color": "fade(white, 25%)", "homepage-background-upper-fade": "#FFFFFF", - "homepage-background-lower-fade": "#333E4C" + "homepage-background-lower-fade": "#333E4C", + "highlight-color": "#E6F4FF", + "highlight-border-color": "#BAE0FF" }, "assets": { "logoUrl": "/assets/logo.png" diff --git a/datahub-web-react/src/conf/theme/theme_light.config.json b/datahub-web-react/src/conf/theme/theme_light.config.json index e842fdb1bb8aa..906c04e38a1ba 100644 --- a/datahub-web-react/src/conf/theme/theme_light.config.json +++ b/datahub-web-react/src/conf/theme/theme_light.config.json @@ -20,7 +20,9 @@ "homepage-background-lower-fade": "#FFFFFF", "homepage-text-color": "#434343", "box-shadow": "0px 0px 30px 0px rgb(239 239 239)", - "box-shadow-hover": "0px 1px 0px 0.5px rgb(239 239 239)" + "box-shadow-hover": "0px 1px 0px 0.5px rgb(239 239 239)", + "highlight-color": "#E6F4FF", + "highlight-border-color": "#BAE0FF" }, "assets": { "logoUrl": "/assets/logo.png" diff --git a/datahub-web-react/src/conf/theme/types.ts b/datahub-web-react/src/conf/theme/types.ts index 98140cbbd553d..7d78230092700 100644 --- a/datahub-web-react/src/conf/theme/types.ts +++ b/datahub-web-react/src/conf/theme/types.ts @@ -18,6 +18,8 @@ export type Theme = { 'homepage-background-lower-fade': string; 'box-shadow': string; 'box-shadow-hover': string; + 'highlight-color': string; + 'highlight-border-color': string; }; assets: { logoUrl: string; diff --git a/datahub-web-react/src/graphql/app.graphql b/datahub-web-react/src/graphql/app.graphql index 4b1295f1024a2..bf15e5f757f8f 100644 --- a/datahub-web-react/src/graphql/app.graphql +++ b/datahub-web-react/src/graphql/app.graphql @@ -45,6 +45,9 @@ query appConfig { defaultTab } } + searchResult { + enableNameHighlight + } } telemetryConfig { enableThirdPartyLogging diff --git a/datahub-web-react/src/graphql/search.graphql b/datahub-web-react/src/graphql/search.graphql index 
172a6d957e287..7cd868d7cd2b2 100644 --- a/datahub-web-react/src/graphql/search.graphql +++ b/datahub-web-react/src/graphql/search.graphql @@ -832,6 +832,11 @@ fragment searchResults on SearchResults { matchedFields { name value + entity { + urn + type + ...entityDisplayNameFields + } } insights { text @@ -841,6 +846,11 @@ fragment searchResults on SearchResults { facets { ...facetFields } + suggestions { + text + frequency + score + } } fragment schemaFieldEntityFields on SchemaFieldEntity { diff --git a/docker/airflow/local_airflow.md b/docker/airflow/local_airflow.md index d0a2b18cff2d2..55a64f5c122c5 100644 --- a/docker/airflow/local_airflow.md +++ b/docker/airflow/local_airflow.md @@ -138,25 +138,57 @@ Successfully added `conn_id`=datahub_rest_default : datahub_rest://:@http://data Navigate the Airflow UI to find the sample Airflow dag we just brought in -![Find the DAG](../../docs/imgs/airflow/find_the_dag.png) + +
+ By default, Airflow loads all DAG-s in paused status. Unpause the sample DAG to use it. -![Paused DAG](../../docs/imgs/airflow/paused_dag.png) -![Unpaused DAG](../../docs/imgs/airflow/unpaused_dag.png) + +
+ Then trigger the DAG to run. -![Trigger the DAG](../../docs/imgs/airflow/trigger_dag.png) + +
+ After the DAG runs successfully, go over to your DataHub instance to see the Pipeline and navigate its lineage. -![DataHub Pipeline View](../../docs/imgs/airflow/datahub_pipeline_view.png) -![DataHub Pipeline Entity](../../docs/imgs/airflow/datahub_pipeline_entity.png) +
-![DataHub Task View](../../docs/imgs/airflow/datahub_task_view.png) -![DataHub Lineage View](../../docs/imgs/airflow/datahub_lineage_view.png) + +
+ ## Troubleshooting @@ -164,9 +196,17 @@ Most issues are related to connectivity between Airflow and DataHub. Here is how you can debug them. -![Find the Task Log](../../docs/imgs/airflow/finding_failed_log.png) -![Inspect the Log](../../docs/imgs/airflow/connection_error.png) +
+ In this case, clearly the connection `datahub-rest` has not been registered. Looks like we forgot to register the connection with Airflow! Let's execute Step 4 to register the datahub connection with Airflow. @@ -175,4 +215,8 @@ In case the connection was registered successfully but you are still seeing `Fai After re-running the DAG, we see success! -![Pipeline Success](../../docs/imgs/airflow/successful_run.png) + +
+ diff --git a/docker/kafka-setup/Dockerfile b/docker/kafka-setup/Dockerfile index 5707234b85f57..a9c75521fead1 100644 --- a/docker/kafka-setup/Dockerfile +++ b/docker/kafka-setup/Dockerfile @@ -15,9 +15,6 @@ FROM python:3-alpine ENV KAFKA_VERSION 3.4.1 ENV SCALA_VERSION 2.13 -# Set the classpath for JARs required by `cub` -ENV CUB_CLASSPATH='"/usr/share/java/cp-base-new/*"' - LABEL name="kafka" version=${KAFKA_VERSION} RUN apk add --no-cache bash coreutils @@ -31,10 +28,6 @@ RUN mkdir -p /opt \ && mv /opt/kafka_${SCALA_VERSION}-${KAFKA_VERSION} /opt/kafka \ && adduser -DH -s /sbin/nologin kafka \ && chown -R kafka: /opt/kafka \ - && echo "===> Installing python packages ..." \ - && pip install --no-cache-dir --upgrade pip wheel setuptools \ - && pip install jinja2 requests \ - && pip install "Cython<3.0" "PyYAML<6" --no-build-isolation \ && rm -rf /tmp/* \ && apk del --purge .build-deps diff --git a/docs-website/build.gradle b/docs-website/build.gradle index 12f37033efc2f..851c10d9ea97f 100644 --- a/docs-website/build.gradle +++ b/docs-website/build.gradle @@ -77,7 +77,12 @@ task yarnGenerate(type: YarnTask, dependsOn: [yarnInstall, args = ['run', 'generate'] } -task yarnStart(type: YarnTask, dependsOn: [yarnInstall, yarnGenerate]) { +task downloadHistoricalVersions(type: Exec) { + workingDir '.' + commandLine 'python3', 'download_historical_versions.py' +} + +task yarnStart(type: YarnTask, dependsOn: [yarnInstall, yarnGenerate, downloadHistoricalVersions]) { args = ['run', 'start'] } task fastReload(type: YarnTask) { @@ -105,7 +110,7 @@ task serve(type: YarnTask, dependsOn: [yarnInstall] ) { } -task yarnBuild(type: YarnTask, dependsOn: [yarnLint, yarnGenerate]) { +task yarnBuild(type: YarnTask, dependsOn: [yarnLint, yarnGenerate, downloadHistoricalVersions]) { inputs.files(projectMdFiles) inputs.file("package.json").withPathSensitivity(PathSensitivity.RELATIVE) inputs.dir("src").withPathSensitivity(PathSensitivity.RELATIVE) diff --git a/docs-website/docusaurus.config.js b/docs-website/docusaurus.config.js index c10c178424b53..df69e8513fbfc 100644 --- a/docs-website/docusaurus.config.js +++ b/docs-website/docusaurus.config.js @@ -69,6 +69,11 @@ module.exports = { label: "Roadmap", position: "right", }, + { + type: 'docsVersionDropdown', + position: 'right', + dropdownActiveClassDisabled: true, + }, { href: "https://slack.datahubproject.io", "aria-label": "Slack", diff --git a/docs-website/download_historical_versions.py b/docs-website/download_historical_versions.py new file mode 100644 index 0000000000000..a005445cb1497 --- /dev/null +++ b/docs-website/download_historical_versions.py @@ -0,0 +1,60 @@ +import os +import tarfile +import urllib.request +import json + +repo_url = "https://api.github.com/repos/datahub-project/static-assets" + + +def download_file(url, destination): + with urllib.request.urlopen(url) as response: + with open(destination, "wb") as f: + while True: + chunk = response.read(8192) + if not chunk: + break + f.write(chunk) + + +def fetch_tar_urls(repo_url, folder_path): + api_url = f"{repo_url}/contents/{folder_path}" + response = urllib.request.urlopen(api_url) + data = response.read().decode('utf-8') + tar_urls = [ + file["download_url"] for file in json.loads(data) if file["name"].endswith(".tar.gz") + ] + print(tar_urls) + return tar_urls + + +def main(): + folder_path = "versioned_docs" + destination_dir = "versioned_docs" + if not os.path.exists(destination_dir): + os.makedirs(destination_dir) + + tar_urls = fetch_tar_urls(repo_url, folder_path) + + for url in 
tar_urls: + filename = os.path.basename(url) + destination_path = os.path.join(destination_dir, filename) + + version = '.'.join(filename.split('.')[:3]) + extracted_path = os.path.join(destination_dir, version) + print("extracted_path", extracted_path) + if os.path.exists(extracted_path): + print(f"{extracted_path} already exists, skipping downloads") + continue + try: + download_file(url, destination_path) + print(f"Downloaded {filename} to {destination_dir}") + with tarfile.open(destination_path, "r:gz") as tar: + tar.extractall() + os.remove(destination_path) + except urllib.error.URLError as e: + print(f"Error while downloading {filename}: {e}") + continue + + +if __name__ == "__main__": + main() diff --git a/docs-website/src/pages/docs/_components/SearchBar/index.jsx b/docs-website/src/pages/docs/_components/SearchBar/index.jsx index 37f8a5c252aee..054c041d8a9e5 100644 --- a/docs-website/src/pages/docs/_components/SearchBar/index.jsx +++ b/docs-website/src/pages/docs/_components/SearchBar/index.jsx @@ -303,11 +303,16 @@ function SearchBar() { strokeLinejoin="round" > - - {docsSearchVersionsHelpers.versioningEnabled && } - -
{!!searchResultState.totalResults && documentsFoundPlural(searchResultState.totalResults)}
+ {docsSearchVersionsHelpers.versioningEnabled && ( + + )} +
+ {!!searchResultState.totalResults && + documentsFoundPlural(searchResultState.totalResults)} +
{searchResultState.items.length > 0 ? (
@@ -369,4 +374,4 @@ function SearchBar() { ); } -export default SearchBar; +export default SearchBar; \ No newline at end of file diff --git a/docs-website/src/pages/docs/_components/SearchBar/search.module.scss b/docs-website/src/pages/docs/_components/SearchBar/search.module.scss index 17e5f22490664..30a2973384ba6 100644 --- a/docs-website/src/pages/docs/_components/SearchBar/search.module.scss +++ b/docs-website/src/pages/docs/_components/SearchBar/search.module.scss @@ -21,13 +21,21 @@ height: 1.5rem; } +.searchQueryInput { + padding: 0.8rem 0.8rem 0.8rem 3rem; +} + +.searchVersionInput { + padding: 0.8rem 2rem 0.8rem 2rem; + text-align: center; +} + .searchQueryInput, .searchVersionInput { border-radius: 1000em; border-style: solid; border-color: transparent; font: var(--ifm-font-size-base) var(--ifm-font-family-base); - padding: 0.8rem 0.8rem 0.8rem 3rem; width: 100%; background: var(--docsearch-searchbox-background); color: var(--docsearch-text-color); @@ -93,6 +101,7 @@ @media only screen and (max-width: 996px) { .searchVersionColumn { max-width: 40% !important; + margin: auto; } .searchResultsColumn { @@ -113,9 +122,15 @@ .searchVersionColumn { max-width: 100% !important; padding-left: var(--ifm-spacing-horizontal) !important; + margin: auto; } } +.searchVersionColumn { + margin: auto; +} + + .loadingSpinner { width: 3rem; height: 3rem; diff --git a/docs-website/versioned_sidebars/version-0.10.5-sidebars.json b/docs-website/versioned_sidebars/version-0.10.5-sidebars.json new file mode 100644 index 0000000000000..67179075fc994 --- /dev/null +++ b/docs-website/versioned_sidebars/version-0.10.5-sidebars.json @@ -0,0 +1,594 @@ +{ + "overviewSidebar": [ + { + "label": "Getting Started", + "type": "category", + "collapsed": true, + "items": [ + { + "type": "doc", + "label": "Introduction", + "id": "docs/features" + }, + { + "type": "doc", + "label": "Quickstart", + "id": "docs/quickstart" + }, + { + "type": "link", + "label": "Demo", + "href": "https://demo.datahubproject.io/" + }, + "docs/what-is-datahub/datahub-concepts", + "docs/saas" + ] + }, + { + "Integrations": [ + { + "type": "doc", + "label": "Introduction", + "id": "metadata-ingestion/README" + }, + { + "Quickstart Guides": [ + { + "BigQuery": [ + "docs/quick-ingestion-guides/bigquery/overview", + "docs/quick-ingestion-guides/bigquery/setup", + "docs/quick-ingestion-guides/bigquery/configuration" + ] + }, + { + "Redshift": [ + "docs/quick-ingestion-guides/redshift/overview", + "docs/quick-ingestion-guides/redshift/setup", + "docs/quick-ingestion-guides/redshift/configuration" + ] + }, + { + "Snowflake": [ + "docs/quick-ingestion-guides/snowflake/overview", + "docs/quick-ingestion-guides/snowflake/setup", + "docs/quick-ingestion-guides/snowflake/configuration" + ] + }, + { + "Tableau": [ + "docs/quick-ingestion-guides/tableau/overview", + "docs/quick-ingestion-guides/tableau/setup", + "docs/quick-ingestion-guides/tableau/configuration" + ] + }, + { + "PowerBI": [ + "docs/quick-ingestion-guides/powerbi/overview", + "docs/quick-ingestion-guides/powerbi/setup", + "docs/quick-ingestion-guides/powerbi/configuration" + ] + } + ] + }, + { + "Sources": [ + { + "type": "doc", + "id": "docs/lineage/airflow", + "label": "Airflow" + }, + "metadata-integration/java/spark-lineage/README", + "metadata-ingestion/integration_docs/great-expectations", + "metadata-integration/java/datahub-protobuf/README", + { + "type": "autogenerated", + "dirName": "docs/generated/ingestion/sources" + } + ] + }, + { + "Sinks": [ + { + "type": "autogenerated", + 
"dirName": "metadata-ingestion/sink_docs" + } + ] + }, + { + "Transformers": [ + "metadata-ingestion/docs/transformer/intro", + "metadata-ingestion/docs/transformer/dataset_transformer" + ] + }, + { + "Advanced Guides": [ + { + "Scheduling Ingestion": [ + "metadata-ingestion/schedule_docs/intro", + "metadata-ingestion/schedule_docs/cron", + "metadata-ingestion/schedule_docs/airflow", + "metadata-ingestion/schedule_docs/kubernetes" + ] + }, + "docs/platform-instances", + "metadata-ingestion/docs/dev_guides/stateful", + "metadata-ingestion/docs/dev_guides/classification", + "metadata-ingestion/docs/dev_guides/add_stateful_ingestion_to_source", + "metadata-ingestion/docs/dev_guides/sql_profiles" + ] + } + ] + }, + { + "Deployment": [ + "docs/deploy/aws", + "docs/deploy/gcp", + "docker/README", + "docs/deploy/kubernetes", + "docs/deploy/environment-vars", + { + "Authentication": [ + "docs/authentication/README", + "docs/authentication/concepts", + "docs/authentication/changing-default-credentials", + "docs/authentication/guides/add-users", + { + "Frontend Authentication": [ + "docs/authentication/guides/jaas", + { + "OIDC Authentication": [ + "docs/authentication/guides/sso/configure-oidc-react", + "docs/authentication/guides/sso/configure-oidc-react-google", + "docs/authentication/guides/sso/configure-oidc-react-okta", + "docs/authentication/guides/sso/configure-oidc-react-azure" + ] + } + ] + }, + "docs/authentication/introducing-metadata-service-authentication", + "docs/authentication/personal-access-tokens" + ] + }, + { + "Authorization": [ + "docs/authorization/README", + "docs/authorization/roles", + "docs/authorization/policies", + "docs/authorization/groups" + ] + }, + { + "Advanced Guides": [ + "docs/how/delete-metadata", + "docs/how/configuring-authorization-with-apache-ranger", + "docs/how/backup-datahub", + "docs/how/restore-indices", + "docs/advanced/db-retention", + "docs/advanced/monitoring", + "docs/how/extract-container-logs", + "docs/deploy/telemetry", + "docs/how/kafka-config", + "docs/deploy/confluent-cloud", + "docs/advanced/no-code-upgrade", + "docs/how/jattach-guide" + ] + }, + "docs/how/updating-datahub" + ] + }, + { + "API": [ + "docs/api/datahub-apis", + { + "GraphQL API": [ + { + "label": "Overview", + "type": "doc", + "id": "docs/api/graphql/overview" + }, + { + "Reference": [ + { + "type": "doc", + "label": "Queries", + "id": "graphql/queries" + }, + { + "type": "doc", + "label": "Mutations", + "id": "graphql/mutations" + }, + { + "type": "doc", + "label": "Objects", + "id": "graphql/objects" + }, + { + "type": "doc", + "label": "Inputs", + "id": "graphql/inputObjects" + }, + { + "type": "doc", + "label": "Interfaces", + "id": "graphql/interfaces" + }, + { + "type": "doc", + "label": "Unions", + "id": "graphql/unions" + }, + { + "type": "doc", + "label": "Enums", + "id": "graphql/enums" + }, + { + "type": "doc", + "label": "Scalars", + "id": "graphql/scalars" + } + ] + }, + { + "Guides": [ + { + "type": "doc", + "label": "How To Set Up GraphQL", + "id": "docs/api/graphql/how-to-set-up-graphql" + }, + { + "type": "doc", + "label": "Getting Started With GraphQL", + "id": "docs/api/graphql/getting-started" + }, + { + "type": "doc", + "label": "Access Token Management", + "id": "docs/api/graphql/token-management" + } + ] + } + ] + }, + { + "type": "doc", + "label": "OpenAPI", + "id": "docs/api/openapi/openapi-usage-guide" + }, + "docs/dev-guides/timeline", + { + "Rest.li API": [ + { + "type": "doc", + "label": "Rest.li API Guide", + "id": 
"docs/api/restli/restli-overview" + }, + { + "type": "doc", + "label": "Restore Indices", + "id": "docs/api/restli/restore-indices" + }, + { + "type": "doc", + "label": "Get Index Sizes", + "id": "docs/api/restli/get-index-sizes" + }, + { + "type": "doc", + "label": "Truncate Timeseries Aspect", + "id": "docs/api/restli/truncate-time-series-aspect" + }, + { + "type": "doc", + "label": "Get ElasticSearch Task Status Endpoint", + "id": "docs/api/restli/get-elastic-task-status" + }, + { + "type": "doc", + "label": "Evaluate Tests", + "id": "docs/api/restli/evaluate-tests" + }, + { + "type": "doc", + "label": "Aspect Versioning and Rest.li Modeling", + "id": "docs/advanced/aspect-versioning" + } + ] + }, + { + "Python SDK": [ + "metadata-ingestion/as-a-library", + { + "Python SDK Reference": [ + { + "type": "autogenerated", + "dirName": "python-sdk" + } + ] + } + ] + }, + "metadata-integration/java/as-a-library", + { + "API and SDK Guides": [ + "docs/advanced/patch", + "docs/api/tutorials/datasets", + "docs/api/tutorials/lineage", + "docs/api/tutorials/tags", + "docs/api/tutorials/terms", + "docs/api/tutorials/owners", + "docs/api/tutorials/domains", + "docs/api/tutorials/deprecation", + "docs/api/tutorials/descriptions", + "docs/api/tutorials/custom-properties", + "docs/api/tutorials/ml" + ] + }, + { + "type": "category", + "label": "DataHub CLI", + "link": { + "type": "doc", + "id": "docs/cli" + }, + "items": [ + "docs/datahub_lite" + ] + }, + { + "type": "category", + "label": "Datahub Actions", + "link": { + "type": "doc", + "id": "docs/act-on-metadata" + }, + "items": [ + "docs/actions/README", + "docs/actions/quickstart", + "docs/actions/concepts", + { + "Sources": [ + { + "type": "autogenerated", + "dirName": "docs/actions/sources" + } + ] + }, + { + "Events": [ + { + "type": "autogenerated", + "dirName": "docs/actions/events" + } + ] + }, + { + "Actions": [ + { + "type": "autogenerated", + "dirName": "docs/actions/actions" + } + ] + }, + { + "Guides": [ + { + "type": "autogenerated", + "dirName": "docs/actions/guides" + } + ] + } + ] + } + ] + }, + { + "Features": [ + "docs/ui-ingestion", + "docs/how/search", + "docs/schema-history", + "docs/domains", + "docs/dataproducts", + "docs/glossary/business-glossary", + "docs/tags", + "docs/ownership/ownership-types", + "docs/browse", + "docs/authorization/access-policies-guide", + "docs/features/dataset-usage-and-query-history", + "docs/posts", + "docs/sync-status", + "docs/lineage/lineage-feature-guide", + { + "type": "doc", + "id": "docs/tests/metadata-tests", + "className": "saasOnly" + }, + "docs/act-on-metadata/impact-analysis", + { + "Observability": [ + "docs/managed-datahub/observe/freshness-assertions" + ] + } + ] + }, + { + "Develop": [ + { + "DataHub Metadata Model": [ + "docs/modeling/metadata-model", + "docs/modeling/extending-the-metadata-model", + "docs/what/mxe", + { + "Entities": [ + { + "type": "autogenerated", + "dirName": "docs/generated/metamodel/entities" + } + ] + } + ] + }, + { + "Architecture": [ + "docs/architecture/architecture", + "docs/components", + "docs/architecture/metadata-ingestion", + "docs/architecture/metadata-serving", + "docs/architecture/docker-containers" + ] + }, + { + "Developing on DataHub": [ + "docs/developers", + "docs/docker/development", + "metadata-ingestion/developing", + "docs/api/graphql/graphql-endpoint-development", + { + "Modules": [ + "datahub-web-react/README", + "datahub-frontend/README", + "datahub-graphql-core/README", + "metadata-service/README", + 
"metadata-jobs/mae-consumer-job/README", + "metadata-jobs/mce-consumer-job/README" + ] + } + ] + }, + "docs/plugins", + { + "Troubleshooting": [ + "docs/troubleshooting/quickstart", + "docs/troubleshooting/build", + "docs/troubleshooting/general" + ] + }, + { + "Advanced": [ + "metadata-ingestion/docs/dev_guides/reporting_telemetry", + "docs/advanced/mcp-mcl", + "docker/datahub-upgrade/README", + "docs/advanced/no-code-modeling", + "datahub-web-react/src/app/analytics/README", + "docs/how/migrating-graph-service-implementation", + "docs/advanced/field-path-spec-v2", + "metadata-ingestion/adding-source", + "docs/how/add-custom-ingestion-source", + "docs/how/add-custom-data-platform", + "docs/advanced/browse-paths-upgrade", + "docs/browseV2/browse-paths-v2" + ] + } + ] + }, + { + "Community": [ + "docs/slack", + "docs/townhalls", + "docs/townhall-history", + "docs/CODE_OF_CONDUCT", + "docs/CONTRIBUTING", + "docs/links", + "docs/rfc" + ] + }, + { + "Managed DataHub": [ + "docs/managed-datahub/managed-datahub-overview", + "docs/managed-datahub/welcome-acryl", + { + "type": "doc", + "id": "docs/managed-datahub/saas-slack-setup", + "className": "saasOnly" + }, + { + "type": "doc", + "id": "docs/managed-datahub/approval-workflows", + "className": "saasOnly" + }, + { + "Metadata Ingestion With Acryl": [ + "docs/managed-datahub/metadata-ingestion-with-acryl/ingestion" + ] + }, + { + "DataHub API": [ + { + "type": "doc", + "id": "docs/managed-datahub/datahub-api/entity-events-api", + "className": "saasOnly" + }, + { + "GraphQL API": [ + "docs/managed-datahub/datahub-api/graphql-api/getting-started", + { + "type": "doc", + "id": "docs/managed-datahub/datahub-api/graphql-api/incidents-api-beta", + "className": "saasOnly" + } + ] + } + ] + }, + { + "Integrations": [ + { + "type": "doc", + "id": "docs/managed-datahub/integrations/aws-privatelink", + "className": "saasOnly" + }, + { + "type": "doc", + "id": "docs/managed-datahub/integrations/oidc-sso-integration", + "className": "saasOnly" + } + ] + }, + { + "Operator Guide": [ + { + "type": "doc", + "id": "docs/managed-datahub/operator-guide/setting-up-remote-ingestion-executor-on-aws", + "className": "saasOnly" + }, + { + "type": "doc", + "id": "docs/managed-datahub/operator-guide/setting-up-events-api-on-aws-eventbridge", + "className": "saasOnly" + } + ] + }, + { + "type": "doc", + "id": "docs/managed-datahub/chrome-extension", + "className": "saasOnly" + }, + { + "Managed DataHub Release History": [ + "docs/managed-datahub/release-notes/v_0_2_10", + "docs/managed-datahub/release-notes/v_0_2_9", + "docs/managed-datahub/release-notes/v_0_2_8", + "docs/managed-datahub/release-notes/v_0_2_7", + "docs/managed-datahub/release-notes/v_0_2_6", + "docs/managed-datahub/release-notes/v_0_2_5", + "docs/managed-datahub/release-notes/v_0_2_4", + "docs/managed-datahub/release-notes/v_0_2_3", + "docs/managed-datahub/release-notes/v_0_2_2", + "docs/managed-datahub/release-notes/v_0_2_1", + "docs/managed-datahub/release-notes/v_0_2_0", + "docs/managed-datahub/release-notes/v_0_1_73", + "docs/managed-datahub/release-notes/v_0_1_72", + "docs/managed-datahub/release-notes/v_0_1_70", + "docs/managed-datahub/release-notes/v_0_1_69" + ] + } + ] + }, + { + "Release History": [ + "releases" + ] + } + ] +} diff --git a/docs-website/versions.json b/docs-website/versions.json new file mode 100644 index 0000000000000..0b79ac9498e06 --- /dev/null +++ b/docs-website/versions.json @@ -0,0 +1,3 @@ +[ + "0.10.5" +] diff --git a/docs/actions/concepts.md b/docs/actions/concepts.md 
index 381f2551d2237..5b05a0c586a5d 100644 --- a/docs/actions/concepts.md +++ b/docs/actions/concepts.md @@ -40,7 +40,11 @@ The Actions Framework consists of a few core concepts-- Each of these will be described in detail below. -![](imgs/actions.png) + +
+ **In the Actions Framework, Events flow continuously from left-to-right.** ### Pipelines diff --git a/docs/advanced/no-code-modeling.md b/docs/advanced/no-code-modeling.md index 9c8f6761a62bc..d76b776d3dddb 100644 --- a/docs/advanced/no-code-modeling.md +++ b/docs/advanced/no-code-modeling.md @@ -159,11 +159,19 @@ along with simplifying the number of raw data models that need defined, includin From an architectural PoV, we will move from a before that looks something like this: -![no-code-before](../imgs/no-code-before.png) + +
+ to an after that looks like this -![no-code-after](../imgs/no-code-after.png) + +
+ That is, a move away from patterns of strong-typing-everywhere to a more generic + flexible world. diff --git a/docs/api/graphql/how-to-set-up-graphql.md b/docs/api/graphql/how-to-set-up-graphql.md index 562e8edb9f5d9..584bf34ad3f92 100644 --- a/docs/api/graphql/how-to-set-up-graphql.md +++ b/docs/api/graphql/how-to-set-up-graphql.md @@ -62,7 +62,11 @@ Postman is a popular API client that provides a graphical user interface for sen Within Postman, you can create a `POST` request and set the request URL to the `/api/graphql` endpoint. In the request body, select the `GraphQL` option and enter your GraphQL query in the request body. -![postman-graphql](../../imgs/apis/postman-graphql.png) + +
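If you prefer to script the call rather than use Postman, the same request can be sent with any HTTP client. The sketch below is only an illustration, not part of the original guide: it assumes a locally deployed DataHub whose GraphQL endpoint is `http://localhost:8080/api/graphql`, the Python `requests` package, and a personal access token if Metadata Service Authentication is enabled. The query reads back part of the `appConfig` object, but any GraphQL query or mutation is posted the same way.

```python
import requests

GRAPHQL_URL = "http://localhost:8080/api/graphql"  # adjust for your deployment

# Only needed when Metadata Service Authentication is enabled.
HEADERS = {"Authorization": "Bearer <your-personal-access-token>"}

# A small read-only query; the POST body is just {"query": "..."}.
QUERY = """
query {
  appConfig {
    visualConfig {
      searchResult {
        enableNameHighlight
      }
    }
  }
}
"""

response = requests.post(GRAPHQL_URL, json={"query": QUERY}, headers=HEADERS)
response.raise_for_status()
print(response.json()["data"]["appConfig"]["visualConfig"])
```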
+ Please refer to [Querying with GraphQL](https://learning.postman.com/docs/sending-requests/graphql/graphql/) in the Postman documentation for more information. diff --git a/docs/api/tutorials/custom-properties.md b/docs/api/tutorials/custom-properties.md index dbc07bfaa712e..fe0d7e62dcde8 100644 --- a/docs/api/tutorials/custom-properties.md +++ b/docs/api/tutorials/custom-properties.md @@ -34,7 +34,11 @@ In this example, we will add some custom properties `cluster_name` and `retentio After you have ingested sample data, the dataset `fct_users_deleted` should have a custom properties section with `encoding` set to `utf-8`. -![dataset-properties-before](../../imgs/apis/tutorials/dataset-properties-before.png) + +
+ ```shell datahub get --urn "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD)" --aspect datasetProperties @@ -80,7 +84,11 @@ The following code adds custom properties `cluster_name` and `retention_time` to You can now see the two new properties are added to `fct_users_deleted` and the previous property `encoding` is unchanged. -![dataset-properties-added](../../imgs/apis/tutorials/dataset-properties-added.png) + +
+ We can also verify this operation by programmatically checking the `datasetProperties` aspect after running this code using the `datahub` cli. @@ -130,7 +138,11 @@ The following code shows you how can add and remove custom properties in the sam You can now see the `cluster_name` property is added to `fct_users_deleted` and the `retention_time` property is removed. -![dataset-properties-added-removed](../../imgs/apis/tutorials/dataset-properties-added-removed.png) + +
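The same check can also be scripted with the Python SDK instead of the `datahub` CLI. This is a rough sketch rather than the tutorial's own code: it assumes the `acryl-datahub` package is installed and that GMS is reachable at `http://localhost:8080`.

```python
from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig
from datahub.metadata.schema_classes import DatasetPropertiesClass

# Assumed local GMS endpoint; adjust for your deployment.
graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))

dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD)"

# Read the datasetProperties aspect back and inspect its custom properties.
properties = graph.get_aspect(entity_urn=dataset_urn, aspect_type=DatasetPropertiesClass)
print(properties.customProperties if properties else {})
```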
+ We can also verify this operation programmatically by checking the `datasetProperties` aspect using the `datahub` cli. @@ -179,7 +191,11 @@ The following code replaces the current custom properties with a new properties You can now see the `cluster_name` and `retention_time` properties are added to `fct_users_deleted` but the previous `encoding` property is no longer present. -![dataset-properties-replaced](../../imgs/apis/tutorials/dataset-properties-replaced.png) + +
+ We can also verify this operation programmatically by checking the `datasetProperties` aspect using the `datahub` cli. diff --git a/docs/api/tutorials/datasets.md b/docs/api/tutorials/datasets.md index 62b30e97c8020..7c6d4a88d4190 100644 --- a/docs/api/tutorials/datasets.md +++ b/docs/api/tutorials/datasets.md @@ -42,7 +42,11 @@ For detailed steps, please refer to [Datahub Quickstart Guide](/docs/quickstart. You can now see `realestate_db.sales` dataset has been created. -![dataset-created](../../imgs/apis/tutorials/dataset-created.png) + +
+ ## Delete Dataset @@ -110,4 +114,8 @@ Expected Response: The dataset `fct_users_deleted` has now been deleted, so if you search for a hive dataset named `fct_users_delete`, you will no longer be able to see it. -![dataset-deleted](../../imgs/apis/tutorials/dataset-deleted.png) + +
+ diff --git a/docs/api/tutorials/deprecation.md b/docs/api/tutorials/deprecation.md index 6a8f7c8a1d2be..73e73f5224cbc 100644 --- a/docs/api/tutorials/deprecation.md +++ b/docs/api/tutorials/deprecation.md @@ -155,4 +155,8 @@ Expected Response: You can now see the dataset `fct_users_created` has been marked as `Deprecated.` -![tag-removed](../../imgs/apis/tutorials/deprecation-updated.png) + +
+ diff --git a/docs/api/tutorials/descriptions.md b/docs/api/tutorials/descriptions.md index 46f42b7a05be6..27c57309ba76a 100644 --- a/docs/api/tutorials/descriptions.md +++ b/docs/api/tutorials/descriptions.md @@ -275,7 +275,11 @@ Expected Response: You can now see the description is added to `fct_users_deleted`. -![dataset-description-added](../../imgs/apis/tutorials/dataset-description-added.png) + +
+ ## Add Description on Column @@ -357,4 +361,8 @@ Expected Response: You can now see column description is added to `user_name` column of `fct_users_deleted`. -![column-description-added](../../imgs/apis/tutorials/column-description-added.png) + +
+ diff --git a/docs/api/tutorials/domains.md b/docs/api/tutorials/domains.md index c8c47f85c570f..617864d233b7a 100644 --- a/docs/api/tutorials/domains.md +++ b/docs/api/tutorials/domains.md @@ -74,7 +74,11 @@ Expected Response: You can now see `Marketing` domain has been created under `Govern > Domains`. -![domain-created](../../imgs/apis/tutorials/domain-created.png) + +
+ ## Read Domains @@ -209,7 +213,11 @@ Expected Response: You can now see `Marketing` domain has been added to the dataset. -![domain-added](../../imgs/apis/tutorials/domain-added.png) + +
+ ## Remove Domains @@ -259,4 +267,8 @@ curl --location --request POST 'http://localhost:8080/api/graphql' \ You can now see a domain `Marketing` has been removed from the `fct_users_created` dataset. -![domain-removed](../../imgs/apis/tutorials/domain-removed.png) + +
+ diff --git a/docs/api/tutorials/lineage.md b/docs/api/tutorials/lineage.md index e37986af7bbbd..ce23a4d274e8e 100644 --- a/docs/api/tutorials/lineage.md +++ b/docs/api/tutorials/lineage.md @@ -112,7 +112,11 @@ Expected Response: You can now see the lineage between `fct_users_deleted` and `logging_events`. -![lineage-added](../../imgs/apis/tutorials/lineage-added.png) + +
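If you want to emit the same dataset-level edge from a script, a rough sketch with the Python SDK is shown below. It is not the tutorial's own snippet: it assumes the `acryl-datahub` package, a GMS instance at `http://localhost:8080`, and the upstream/downstream direction shown is illustrative only.

```python
from datahub.emitter.mce_builder import make_dataset_urn, make_lineage_mce
from datahub.emitter.rest_emitter import DatahubRestEmitter

# Assumed local GMS endpoint; adjust for your deployment.
emitter = DatahubRestEmitter(gms_server="http://localhost:8080")

# Dataset-level lineage only: logging_events (upstream) -> fct_users_deleted (downstream).
lineage_mce = make_lineage_mce(
    upstream_urns=[make_dataset_urn(platform="hive", name="logging_events")],
    downstream_urn=make_dataset_urn(platform="hive", name="fct_users_deleted"),
)
emitter.emit(lineage_mce)
```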
+ ## Add Column-level Lineage @@ -130,7 +134,11 @@ You can now see the lineage between `fct_users_deleted` and `logging_events`. You can now see the column-level lineage between datasets. Note that you have to enable `Show Columns` to be able to see the column-level lineage. -![column-level-lineage-added](../../imgs/apis/tutorials/column-level-lineage-added.png) + +
+ ## Read Lineage diff --git a/docs/api/tutorials/ml.md b/docs/api/tutorials/ml.md index b16f2669b30c7..cb77556d48ebf 100644 --- a/docs/api/tutorials/ml.md +++ b/docs/api/tutorials/ml.md @@ -94,9 +94,17 @@ Please note that an MlModelGroup serves as a container for all the runs of a sin You can search the entities in DataHub UI. -![feature-table-created](../../imgs/apis/tutorials/feature-table-created.png) -![model-group-created](../../imgs/apis/tutorials/model-group-created.png) +
+ ## Read ML Entities @@ -499,6 +507,14 @@ Expected Response: (Note that this entity does not exist in the sample ingestion You can access the `Features` or `Group` Tab of each entity to view the added entities. -![feature-added-to-model](../../imgs/apis/tutorials/feature-added-to-model.png) -![model-group-added-to-model](../../imgs/apis/tutorials/model-group-added-to-model.png) +
+ diff --git a/docs/api/tutorials/owners.md b/docs/api/tutorials/owners.md index 3c7a46b136d76..5bc3b95cb5631 100644 --- a/docs/api/tutorials/owners.md +++ b/docs/api/tutorials/owners.md @@ -77,7 +77,11 @@ Update succeeded for urn urn:li:corpuser:datahub. ### Expected Outcomes of Upserting User You can see the user `The bar` has been created and the user `Datahub` has been updated under `Settings > Access > Users & Groups` -![user-upserted](../../imgs/apis/tutorials/user-upserted.png) + +
+ ## Upsert Group @@ -125,7 +129,11 @@ Update succeeded for group urn:li:corpGroup:foogroup@acryl.io. ### Expected Outcomes of Upserting Group You can see the group `Foo Group` has been created under `Settings > Access > Users & Groups` -![group-upserted](../../imgs/apis/tutorials/group-upserted.png) + +
+ ## Read Owners @@ -272,7 +280,11 @@ curl --location --request POST 'http://localhost:8080/api/graphql' \ You can now see `bfoo` has been added as an owner to the `fct_users_created` dataset. -![ownership-added](../../imgs/apis/tutorials/owner-added.png) + +
+ ## Remove Owners @@ -340,4 +352,8 @@ curl --location --request POST 'http://localhost:8080/api/graphql' \ You can now see `John Doe` has been removed as an owner from the `fct_users_created` dataset. -![ownership-removed](../../imgs/apis/tutorials/owner-removed.png) + +
+ diff --git a/docs/api/tutorials/tags.md b/docs/api/tutorials/tags.md index 2f80a833136c1..b2234bf00bcb9 100644 --- a/docs/api/tutorials/tags.md +++ b/docs/api/tutorials/tags.md @@ -91,7 +91,11 @@ Expected Response: You can now see the new tag `Deprecated` has been created. -![tag-created](../../imgs/apis/tutorials/tag-created.png) + +
+ We can also verify this operation by programmatically searching `Deprecated` tag after running this code using the `datahub` cli. @@ -307,7 +311,11 @@ Expected Response: You can now see `Deprecated` tag has been added to `user_name` column. -![tag-added](../../imgs/apis/tutorials/tag-added.png) + +
+ We can also verify this operation programmatically by checking the `globalTags` aspect using the `datahub` cli. @@ -359,7 +367,11 @@ curl --location --request POST 'http://localhost:8080/api/graphql' \ You can now see `Deprecated` tag has been removed from the `user_name` column. -![tag-removed](../../imgs/apis/tutorials/tag-removed.png) + +
+ We can also verify this operation programmatically by checking the `globalTags` aspect using the `datahub` cli. diff --git a/docs/api/tutorials/terms.md index 207e14ea4afe8..99acf77d26ab0 100644 --- a/docs/api/tutorials/terms.md +++ b/docs/api/tutorials/terms.md @@ -95,7 +95,11 @@ Expected Response: You can now see the new term `Rate of Return` has been created. -![term-created](../../imgs/apis/tutorials/term-created.png) + +
+ We can also verify this operation by programmatically searching `Rate of Return` term after running this code using the `datahub` cli. @@ -289,7 +293,11 @@ Expected Response: You can now see `Rate of Return` term has been added to `user_name` column. -![term-added](../../imgs/apis/tutorials/term-added.png) + +
+ ## Remove Terms @@ -361,4 +369,8 @@ curl --location --request POST 'http://localhost:8080/api/graphql' \ You can now see `Rate of Return` term has been removed from the `user_name` column. -![term-removed](../../imgs/apis/tutorials/term-removed.png) + +
+ diff --git a/docs/architecture/architecture.md b/docs/architecture/architecture.md index 6b76b995cc427..6a9c1860d71b0 100644 --- a/docs/architecture/architecture.md +++ b/docs/architecture/architecture.md @@ -10,8 +10,16 @@ disparate tools & systems. The figures below describe the high-level architecture of DataHub. -![datahub-architecture](../imgs/datahub-architecture.png) -![Acryl DataHub System Architecture ](../managed-datahub/imgs/saas/DataHub-Architecture.png) + +
+ For a more detailed look at the components that make up the Architecture, check out [Components](../components.md). diff --git a/docs/architecture/metadata-ingestion.md b/docs/architecture/metadata-ingestion.md index 2b60383319c68..abf8fc24d1385 100644 --- a/docs/architecture/metadata-ingestion.md +++ b/docs/architecture/metadata-ingestion.md @@ -6,7 +6,11 @@ title: "Ingestion Framework" DataHub supports an extremely flexible ingestion architecture that can support push, pull, asynchronous and synchronous models. The figure below describes all the options possible for connecting your favorite system to DataHub. -![Ingestion Architecture](../imgs/ingestion-architecture.png) + +
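To make the push model concrete, here is a minimal, illustrative sketch (not taken from the documentation itself): it assumes the `acryl-datahub` Python package and a GMS instance at `http://localhost:8080`; the dataset urn and description are placeholders.

```python
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import DatasetPropertiesClass

# Assumed local GMS endpoint; adjust for your deployment.
emitter = DatahubRestEmitter(gms_server="http://localhost:8080")

# Push a single aspect for a single dataset; a pipeline, scheduler, or service
# can make the same call whenever its metadata changes (the push path above).
mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD)",
    aspect=DatasetPropertiesClass(description="Example dataset emitted via the push model"),
)
emitter.emit(mcp)
```

Each such emit produces a Metadata Change Proposal, which the next section describes in more detail.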
+ ## Metadata Change Proposal: The Center Piece diff --git a/docs/architecture/metadata-serving.md b/docs/architecture/metadata-serving.md index ada41179af4e0..57194f49d5ea4 100644 --- a/docs/architecture/metadata-serving.md +++ b/docs/architecture/metadata-serving.md @@ -6,7 +6,11 @@ title: "Serving Tier" The figure below shows the high-level system diagram for DataHub's Serving Tier. -![datahub-serving](../imgs/datahub-serving.png) + +
+ The primary component is called [the Metadata Service](../../metadata-service) and exposes a REST API and a GraphQL API for performing CRUD operations on metadata. The service also exposes search and graph query API-s to support secondary-index style queries, full-text search queries as well as relationship queries like lineage. In addition, the [datahub-frontend](../../datahub-frontend) service expose a GraphQL API on top of the metadata graph. diff --git a/docs/authentication/concepts.md b/docs/authentication/concepts.md index 715e94c7e0380..0940f86a805f1 100644 --- a/docs/authentication/concepts.md +++ b/docs/authentication/concepts.md @@ -11,7 +11,11 @@ We introduced a few important concepts to the Metadata Service to make authentic In following sections, we'll take a closer look at each individually. -![](../imgs/metadata-service-auth.png) + +
+ *High level overview of Metadata Service Authentication* ## What is an Actor? diff --git a/docs/authentication/guides/sso/configure-oidc-react-azure.md b/docs/authentication/guides/sso/configure-oidc-react-azure.md index d185957967882..177387327c0e8 100644 --- a/docs/authentication/guides/sso/configure-oidc-react-azure.md +++ b/docs/authentication/guides/sso/configure-oidc-react-azure.md @@ -32,7 +32,11 @@ Azure supports more than one redirect URI, so both can be configured at the same At this point, your app registration should look like the following: -![azure-setup-app-registration](img/azure-setup-app-registration.png) + +
+ e. Click **Register**. @@ -40,7 +44,11 @@ e. Click **Register**. Once registration is done, you will land on the app registration **Overview** tab. On the left-side navigation bar, click on **Authentication** under **Manage** and add extra redirect URIs if need be (if you want to support both local testing and Azure deployments). -![azure-setup-authentication](img/azure-setup-authentication.png) + +
+ Click **Save**. @@ -51,7 +59,11 @@ Select **Client secrets**, then **New client secret**. Type in a meaningful des **IMPORTANT:** Copy the `value` of your newly create secret since Azure will never display its value afterwards. -![azure-setup-certificates-secrets](img/azure-setup-certificates-secrets.png) + +
+ ### 4. Configure API permissions @@ -66,7 +78,11 @@ Click on **Add a permission**, then from the **Microsoft APIs** tab select **Mic At this point, you should be looking at a screen like the following: -![azure-setup-api-permissions](img/azure-setup-api-permissions.png) + +
+ ### 5. Obtain Application (Client) ID diff --git a/docs/authentication/guides/sso/configure-oidc-react-google.md b/docs/authentication/guides/sso/configure-oidc-react-google.md index 474538097aae2..af62185e6e787 100644 --- a/docs/authentication/guides/sso/configure-oidc-react-google.md +++ b/docs/authentication/guides/sso/configure-oidc-react-google.md @@ -31,7 +31,11 @@ Note that in order to complete this step you should be logged into a Google acco c. Fill out the details in the App Information & Domain sections. Make sure the 'Application Home Page' provided matches where DataHub is deployed at your organization. -![google-setup-1](img/google-setup-1.png) + +
+ Once you've completed this, **Save & Continue**. @@ -70,7 +74,11 @@ f. You will now receive a pair of values, a client id and a client secret. Bookm At this point, you should be looking at a screen like the following: -![google-setup-2](img/google-setup-2.png) + +
+ Success! diff --git a/docs/authentication/guides/sso/configure-oidc-react-okta.md b/docs/authentication/guides/sso/configure-oidc-react-okta.md index cfede999f1e70..320b887a28f16 100644 --- a/docs/authentication/guides/sso/configure-oidc-react-okta.md +++ b/docs/authentication/guides/sso/configure-oidc-react-okta.md @@ -69,8 +69,16 @@ for example, `https://dev-33231928.okta.com/.well-known/openid-configuration`. At this point, you should be looking at a screen like the following: -![okta-setup-1](img/okta-setup-1.png) -![okta-setup-2](img/okta-setup-2.png) + +
+ Success! @@ -96,7 +104,11 @@ Replacing the placeholders above with the client id & client secret received fro > > By default, we assume that the groups will appear in a claim named "groups". This can be customized using the `AUTH_OIDC_GROUPS_CLAIM` container configuration. > -> ![okta-setup-2](img/okta-setup-groups-claim.png) +> +
+ ### 5. Restart `datahub-frontend-react` docker container diff --git a/docs/authentication/guides/sso/img/azure-setup-api-permissions.png b/docs/authentication/guides/sso/img/azure-setup-api-permissions.png deleted file mode 100755 index 4964b7d48ffec..0000000000000 Binary files a/docs/authentication/guides/sso/img/azure-setup-api-permissions.png and /dev/null differ diff --git a/docs/authentication/guides/sso/img/azure-setup-app-registration.png b/docs/authentication/guides/sso/img/azure-setup-app-registration.png deleted file mode 100755 index ffb23a7e3ddec..0000000000000 Binary files a/docs/authentication/guides/sso/img/azure-setup-app-registration.png and /dev/null differ diff --git a/docs/authentication/guides/sso/img/azure-setup-authentication.png b/docs/authentication/guides/sso/img/azure-setup-authentication.png deleted file mode 100755 index 2d27ec88fb40b..0000000000000 Binary files a/docs/authentication/guides/sso/img/azure-setup-authentication.png and /dev/null differ diff --git a/docs/authentication/guides/sso/img/azure-setup-certificates-secrets.png b/docs/authentication/guides/sso/img/azure-setup-certificates-secrets.png deleted file mode 100755 index db6585d84d8ee..0000000000000 Binary files a/docs/authentication/guides/sso/img/azure-setup-certificates-secrets.png and /dev/null differ diff --git a/docs/authentication/guides/sso/img/google-setup-1.png b/docs/authentication/guides/sso/img/google-setup-1.png deleted file mode 100644 index 88c674146f1e4..0000000000000 Binary files a/docs/authentication/guides/sso/img/google-setup-1.png and /dev/null differ diff --git a/docs/authentication/guides/sso/img/google-setup-2.png b/docs/authentication/guides/sso/img/google-setup-2.png deleted file mode 100644 index 850512b891d5f..0000000000000 Binary files a/docs/authentication/guides/sso/img/google-setup-2.png and /dev/null differ diff --git a/docs/authentication/guides/sso/img/okta-setup-1.png b/docs/authentication/guides/sso/img/okta-setup-1.png deleted file mode 100644 index 3949f18657c5e..0000000000000 Binary files a/docs/authentication/guides/sso/img/okta-setup-1.png and /dev/null differ diff --git a/docs/authentication/guides/sso/img/okta-setup-2.png b/docs/authentication/guides/sso/img/okta-setup-2.png deleted file mode 100644 index fa6ea4d991894..0000000000000 Binary files a/docs/authentication/guides/sso/img/okta-setup-2.png and /dev/null differ diff --git a/docs/authentication/guides/sso/img/okta-setup-groups-claim.png b/docs/authentication/guides/sso/img/okta-setup-groups-claim.png deleted file mode 100644 index ed35426685e46..0000000000000 Binary files a/docs/authentication/guides/sso/img/okta-setup-groups-claim.png and /dev/null differ diff --git a/docs/authentication/personal-access-tokens.md b/docs/authentication/personal-access-tokens.md index 0188aab49444e..dc57a989a4e0c 100644 --- a/docs/authentication/personal-access-tokens.md +++ b/docs/authentication/personal-access-tokens.md @@ -71,7 +71,11 @@ curl 'http://localhost:8080/entities/urn:li:corpuser:datahub' -H 'Authorization: Since authorization happens at the GMS level, this means that ingestion is also protected behind access tokens, to use them simply add a `token` to the sink config property as seen below: -![](../imgs/ingestion-with-token.png) + +
+ :::note diff --git a/docs/authorization/access-policies-guide.md b/docs/authorization/access-policies-guide.md index 5820e513a83e3..1eabb64d2878f 100644 --- a/docs/authorization/access-policies-guide.md +++ b/docs/authorization/access-policies-guide.md @@ -110,10 +110,13 @@ In the second step, we can simply select the Privileges that this Platform Polic | Manage Tags | Allow the actor to create and remove any Tags | | Manage Public Views | Allow the actor to create, edit, and remove any public (shared) Views. | | Manage Ownership Types | Allow the actor to create, edit, and remove any Ownership Types. | +| Manage Platform Settings | (Acryl DataHub only) Allow the actor to manage global integrations and notification settings | +| Manage Monitors | (Acryl DataHub only) Allow the actor to create, remove, start, or stop any entity assertion monitors | | Restore Indices API[^1] | Allow the actor to restore indices for a set of entities via API | | Enable/Disable Writeability API[^1] | Allow the actor to enable or disable GMS writeability for use in data migrations | | Apply Retention API[^1] | Allow the actor to apply aspect retention via API | + [^1]: Only active if REST_API_AUTHORIZATION_ENABLED environment flag is enabled #### Step 3: Choose Policy Actors @@ -204,8 +207,15 @@ The common Metadata Privileges, which span across entity types, include: | Edit Status | Allow actor to edit the status of an entity (soft deleted or not). | | Edit Domain | Allow actor to edit the Domain of an entity. | | Edit Deprecation | Allow actor to edit the Deprecation status of an entity. | -| Edit Assertions | Allow actor to add and remove assertions from an entity. | -| Edit All | Allow actor to edit any information about an entity. Super user privileges. Controls the ability to ingest using API when REST API Authorization is enabled. | +| Edit Lineage | Allow actor to edit custom lineage edges for the entity. | +| Edit Data Product | Allow actor to edit the data product that an entity is part of | +| Propose Tags | (Acryl DataHub only) Allow actor to propose new Tags for the entity. | +| Propose Glossary Terms | (Acryl DataHub only) Allow actor to propose new Glossary Terms for the entity. | +| Propose Documentation | (Acryl DataHub only) Allow actor to propose new Documentation for the entity. | +| Manage Tag Proposals | (Acryl DataHub only) Allow actor to accept or reject proposed Tags for the entity. | +| Manage Glossary Terms Proposals | (Acryl DataHub only) Allow actor to accept or reject proposed Glossary Terms for the entity. | +| Manage Documentation Proposals | (Acryl DataHub only) Allow actor to accept or reject proposed Documentation for the entity | +| Edit Entity | Allow actor to edit any information about an entity. Super user privileges. Controls the ability to ingest using API when REST API Authorization is enabled. | | Get Timeline API[^1] | Allow actor to get the timeline of an entity via API. | | Get Entity API[^1] | Allow actor to get an entity via API. | | Get Timeseries Aspect API[^1] | Allow actor to get a timeseries aspect via API. | @@ -225,10 +235,19 @@ The common Metadata Privileges, which span across entity types, include: | Dataset | Edit Dataset Queries | Allow actor to edit the Highlighted Queries on the Queries tab of the dataset. | | Dataset | View Dataset Usage | Allow actor to access usage metadata about a dataset both in the UI and in the GraphQL API. This includes example queries, number of queries, etc. Also applies to REST APIs when REST API Authorization is enabled. 
| | Dataset | View Dataset Profile | Allow actor to access a dataset's profile both in the UI and in the GraphQL API. This includes snapshot statistics like #rows, #columns, null percentage per field, etc. | +| Dataset | Edit Assertions | Allow actor to change the assertions associated with a dataset. | +| Dataset | Edit Incidents | (Acryl DataHub only) Allow actor to change the incidents associated with a dataset. | +| Dataset | Edit Monitors | (Acryl DataHub only) Allow actor to change the assertion monitors associated with a dataset. | | Tag | Edit Tag Color | Allow actor to change the color of a Tag. | | Group | Edit Group Members | Allow actor to add and remove members to a group. | +| Group | Edit Contact Information | Allow actor to change email, slack handle associated with the group. | +| Group | Manage Group Subscriptions | (Acryl DataHub only) Allow actor to subscribe the group to entities. | +| Group | Manage Group Notifications | (Acryl DataHub only) Allow actor to change notification settings for the group. | | User | Edit User Profile | Allow actor to change the user's profile including display name, bio, title, profile image, etc. | | User + Group | Edit Contact Information | Allow actor to change the contact information such as email & chat handles. | +| Term Group | Manage Direct Glossary Children | Allow actor to change the direct child Term Groups or Terms of the group. | +| Term Group | Manage All Glossary Children | Allow actor to change any direct or indirect child Term Groups or Terms of the group. | + > **Still have questions about Privileges?** Let us know in [Slack](https://slack.datahubproject.io)! diff --git a/docs/components.md b/docs/components.md index ef76729bb37fb..b59dabcf999cc 100644 --- a/docs/components.md +++ b/docs/components.md @@ -6,7 +6,11 @@ title: "Components" The DataHub platform consists of the components shown in the following diagram. -![DataHub Component Overview](./imgs/datahub-components.png) + +
+ ## Metadata Store diff --git a/docs/demo/DataHub-UIOverview.pdf b/docs/demo/DataHub-UIOverview.pdf deleted file mode 100644 index cd6106e84ac23..0000000000000 Binary files a/docs/demo/DataHub-UIOverview.pdf and /dev/null differ diff --git a/docs/demo/DataHub_-_Powering_LinkedIn_Metadata.pdf b/docs/demo/DataHub_-_Powering_LinkedIn_Metadata.pdf deleted file mode 100644 index 71498045f9b5b..0000000000000 Binary files a/docs/demo/DataHub_-_Powering_LinkedIn_Metadata.pdf and /dev/null differ diff --git a/docs/demo/Data_Discoverability_at_SpotHero.pdf b/docs/demo/Data_Discoverability_at_SpotHero.pdf deleted file mode 100644 index 83e37d8606428..0000000000000 Binary files a/docs/demo/Data_Discoverability_at_SpotHero.pdf and /dev/null differ diff --git a/docs/demo/Datahub_-_Strongly_Consistent_Secondary_Indexing.pdf b/docs/demo/Datahub_-_Strongly_Consistent_Secondary_Indexing.pdf deleted file mode 100644 index 2d6a33a464650..0000000000000 Binary files a/docs/demo/Datahub_-_Strongly_Consistent_Secondary_Indexing.pdf and /dev/null differ diff --git a/docs/demo/Datahub_at_Grofers.pdf b/docs/demo/Datahub_at_Grofers.pdf deleted file mode 100644 index c29cece9e250a..0000000000000 Binary files a/docs/demo/Datahub_at_Grofers.pdf and /dev/null differ diff --git a/docs/demo/Designing_the_next_generation_of_metadata_events_for_scale.pdf b/docs/demo/Designing_the_next_generation_of_metadata_events_for_scale.pdf deleted file mode 100644 index 0d067eef28d03..0000000000000 Binary files a/docs/demo/Designing_the_next_generation_of_metadata_events_for_scale.pdf and /dev/null differ diff --git a/docs/demo/Metadata_Use-Cases_at_LinkedIn_-_Lightning_Talk.pdf b/docs/demo/Metadata_Use-Cases_at_LinkedIn_-_Lightning_Talk.pdf deleted file mode 100644 index 382754f863c8a..0000000000000 Binary files a/docs/demo/Metadata_Use-Cases_at_LinkedIn_-_Lightning_Talk.pdf and /dev/null differ diff --git a/docs/demo/Saxo Bank Data Workbench.pdf b/docs/demo/Saxo Bank Data Workbench.pdf deleted file mode 100644 index c43480d32b8f2..0000000000000 Binary files a/docs/demo/Saxo Bank Data Workbench.pdf and /dev/null differ diff --git a/docs/demo/Taming the Data Beast Using DataHub.pdf b/docs/demo/Taming the Data Beast Using DataHub.pdf deleted file mode 100644 index d0062465d9220..0000000000000 Binary files a/docs/demo/Taming the Data Beast Using DataHub.pdf and /dev/null differ diff --git a/docs/demo/Town_Hall_Presentation_-_12-2020_-_UI_Development_Part_2.pdf b/docs/demo/Town_Hall_Presentation_-_12-2020_-_UI_Development_Part_2.pdf deleted file mode 100644 index fb7bd2b693e87..0000000000000 Binary files a/docs/demo/Town_Hall_Presentation_-_12-2020_-_UI_Development_Part_2.pdf and /dev/null differ diff --git a/docs/demo/ViasatMetadataJourney.pdf b/docs/demo/ViasatMetadataJourney.pdf deleted file mode 100644 index ccffd18a06d18..0000000000000 Binary files a/docs/demo/ViasatMetadataJourney.pdf and /dev/null differ diff --git a/docs/deploy/aws.md b/docs/deploy/aws.md index 7b01ffa02a744..228fcb51d1a28 100644 --- a/docs/deploy/aws.md +++ b/docs/deploy/aws.md @@ -201,7 +201,11 @@ Provision a MySQL database in AWS RDS that shares the VPC with the kubernetes cl the VPC of the kubernetes cluster. Once the database is provisioned, you should be able to see the following page. Take a note of the endpoint marked by the red box. -![AWS RDS](../imgs/aws/aws-rds.png) + +
+ First, add the DB password to kubernetes by running the following. @@ -234,7 +238,11 @@ Provision an elasticsearch domain running elasticsearch version 7.10 or above th cluster or has VPC peering set up between the VPC of the kubernetes cluster. Once the domain is provisioned, you should be able to see the following page. Take a note of the endpoint marked by the red box. -![AWS Elasticsearch Service](../imgs/aws/aws-elasticsearch.png) + +
+ Update the elasticsearch settings under global in the values.yaml as follows. @@ -330,7 +338,11 @@ Provision an MSK cluster that shares the VPC with the kubernetes cluster or has the kubernetes cluster. Once the domain is provisioned, click on the “View client information” button in the ‘Cluster Summary” section. You should see a page like below. Take a note of the endpoints marked by the red boxes. -![AWS MSK](../imgs/aws/aws-msk.png) + +
+ Update the kafka settings under global in the values.yaml as follows. diff --git a/docs/deploy/confluent-cloud.md b/docs/deploy/confluent-cloud.md index d93ffcceaecee..794b55d4686bf 100644 --- a/docs/deploy/confluent-cloud.md +++ b/docs/deploy/confluent-cloud.md @@ -24,7 +24,11 @@ decommissioned. To create the topics, navigate to your **Cluster** and click "Create Topic". Feel free to tweak the default topic configurations to match your preferences. -![CreateTopic](../imgs/confluent-create-topic.png) + +
+ ## Step 2: Configure DataHub Container to use Confluent Cloud Topics @@ -140,12 +144,20 @@ and another for the user info used for connecting to the schema registry. You'll select "Clients" -> "Configure new Java Client". You should see a page like the following: -![Config](../imgs/confluent-cloud-config.png) + +
+ You'll want to generate both a Kafka Cluster API Key & a Schema Registry key. Once you do so,you should see the config automatically populate with your new secrets: -![Config](../imgs/confluent-cloud-config-2.png) + +
+ You'll need to copy the values of `sasl.jaas.config` and `basic.auth.user.info` for the next step. diff --git a/docs/deploy/gcp.md b/docs/deploy/gcp.md index 3713d69f90636..0cd3d92a8f3cd 100644 --- a/docs/deploy/gcp.md +++ b/docs/deploy/gcp.md @@ -65,16 +65,28 @@ the GKE page on [GCP website](https://console.cloud.google.com/kubernetes/discov Once all deploy is successful, you should see a page like below in the "Services & Ingress" tab on the left. -![Services and Ingress](../imgs/gcp/services_ingress.png) + +
+ Tick the checkbox for datahub-datahub-frontend and click "CREATE INGRESS" button. You should land on the following page. -![Ingress1](../imgs/gcp/ingress1.png) + +
+ Type in an arbitrary name for the ingress and click on the second step "Host and path rules". You should land on the following page. -![Ingress2](../imgs/gcp/ingress2.png) + +
+ Select "datahub-datahub-frontend" in the dropdown menu for backends, and then click on "ADD HOST AND PATH RULE" button. In the second row that got created, add in the host name of choice (here gcp.datahubproject.io) and select @@ -83,14 +95,22 @@ In the second row that got created, add in the host name of choice (here gcp.dat This step adds the rule allowing requests from the host name of choice to get routed to datahub-frontend service. Click on step 3 "Frontend configuration". You should land on the following page. -![Ingress3](../imgs/gcp/ingress3.png) + +
+ Choose HTTPS in the dropdown menu for protocol. To enable SSL, you need to add a certificate. If you do not have one, you can click "CREATE A NEW CERTIFICATE" and input the host name of choice. GCP will create a certificate for you. Now press "CREATE" button on the left to create ingress! After around 5 minutes, you should see the following. -![Ingress Ready](../imgs/gcp/ingress_ready.png) + +
+ In your domain provider, add an A record for the host name set above using the IP address on the ingress page (noted with the red box). Once DNS updates, you should be able to access DataHub through the host name!! @@ -98,5 +118,9 @@ with the red box). Once DNS updates, you should be able to access DataHub throug Note, ignore the warning icon next to ingress. It takes about ten minutes for ingress to check that the backend service is ready and show a check mark as follows. However, ingress is fully functional once you see the above page. -![Ingress Final](../imgs/gcp/ingress_final.png) + +
+ diff --git a/docs/dev-guides/timeline.md b/docs/dev-guides/timeline.md index 966e659b90991..829aef1d3eefa 100644 --- a/docs/dev-guides/timeline.md +++ b/docs/dev-guides/timeline.md @@ -14,7 +14,11 @@ The Timeline API is available in server versions `0.8.28` and higher. The `cli` ## Entity Timeline Conceptually For the visually inclined, here is a conceptual diagram that illustrates how to think about the entity timeline with categorical changes overlaid on it. -![../imgs/timeline/timeline-conceptually.png](../imgs/timeline/timeline-conceptually.png) + +
+ ## Change Event Each modification is modeled as a @@ -228,8 +232,16 @@ http://localhost:8080/openapi/timeline/v1/urn%3Ali%3Adataset%3A%28urn%3Ali%3Adat The API is browse-able via the UI through through the dropdown. Here are a few screenshots showing how to navigate to it. You can try out the API and send example requests. -![../imgs/timeline/dropdown-apis.png](../imgs/timeline/dropdown-apis.png) -![../imgs/timeline/swagger-ui.png](../imgs/timeline/swagger-ui.png) + +
+ # Future Work diff --git a/docs/docker/development.md b/docs/docker/development.md index 2153aa9dc613f..91a303744a03b 100644 --- a/docs/docker/development.md +++ b/docs/docker/development.md @@ -92,7 +92,11 @@ Environment variables control the debugging ports for GMS and the frontend. The screenshot shows an example configuration for IntelliJ using the default GMS debugging port of 5001. -![](../imgs/development/intellij-remote-debug.png) + +
+ ## Tips for People New To Docker diff --git a/docs/glossary/business-glossary.md b/docs/glossary/business-glossary.md index faab6f12fc55e..e10cbed30b913 100644 --- a/docs/glossary/business-glossary.md +++ b/docs/glossary/business-glossary.md @@ -31,59 +31,103 @@ In order to view a Business Glossary, users must have the Platform Privilege cal Once granted this privilege, you can access your Glossary by clicking the dropdown at the top of the page called **Govern** and then click **Glossary**: -![](../imgs/glossary/glossary-button.png) + +
+ You are now at the root of your Glossary and should see all Terms and Term Groups with no parents assigned to them. You should also notice a hierarchy navigator on the left where you can easily check out the structure of your Glossary! -![](../imgs/glossary/root-glossary.png) + +
+ ## Creating a Term or Term Group There are two ways to create Terms and Term Groups through the UI. First, you can create directly from the Glossary home page by clicking the menu dots on the top right and selecting your desired option: -![](../imgs/glossary/root-glossary-create.png) + +
+ You can also create Terms or Term Groups directly from a Term Group's page. In order to do that you need to click the menu dots on the top right and select what you want: -![](../imgs/glossary/create-from-node.png) + +
+ Note that the modal that pops up will automatically set the current Term Group you are in as the **Parent**. You can easily change this by selecting the input and navigating through your Glossary to find your desired Term Group. In addition, you could start typing the name of a Term Group to see it appear by searching. You can also leave this input blank in order to create a Term or Term Group with no parent. -![](../imgs/glossary/create-modal.png) + +
+ ## Editing a Term or Term Group In order to edit a Term or Term Group, you first need to go the page of the Term or Term group you want to edit. Then simply click the edit icon right next to the name to open up an inline editor. Change the text and it will save when you click outside or hit Enter. -![](../imgs/glossary/edit-term.png) + +
+ ## Moving a Term or Term Group Once a Term or Term Group has been created, you can always move it to be under a different Term Group parent. In order to do this, click the menu dots on the top right of either entity and select **Move**. -![](../imgs/glossary/move-term-button.png) + +
+ This will open a modal where you can navigate through your Glossary to find your desired Term Group. -![](../imgs/glossary/move-term-modal.png) + +
+ ## Deleting a Term or Term Group In order to delete a Term or Term Group, you need to go to the entity page of what you want to delete then click the menu dots on the top right. From here you can select **Delete** followed by confirming through a separate modal. **Note**: at the moment we only support deleting Term Groups that do not have any children. Until cascade deleting is supported, you will have to delete all children first, then delete the Term Group. -![](../imgs/glossary/delete-button.png) + +
+ ## Adding a Term to an Entity Once you've defined your Glossary, you can begin attaching terms to data assets. To add a Glossary Term to an asset, go to the entity page of your asset and find the **Add Terms** button on the right sidebar. -![](../imgs/glossary/add-term-to-entity.png) + +
+ In the modal that pops up you can select the Term you care about in one of two ways: - Search for the Term by name in the input - Navigate through the Glossary dropdown that appears after clicking into the input -![](../imgs/glossary/add-term-modal.png) + +
+ ## Privileges diff --git a/docs/how/configuring-authorization-with-apache-ranger.md b/docs/how/configuring-authorization-with-apache-ranger.md index 26d3be6d358b2..46f9432e6c18a 100644 --- a/docs/how/configuring-authorization-with-apache-ranger.md +++ b/docs/how/configuring-authorization-with-apache-ranger.md @@ -67,7 +67,11 @@ Now, you should have the DataHub plugin registered with Apache Ranger. Next, we' **DATAHUB** plugin and **ranger_datahub** service is shown in below screenshot:
- ![Privacera Portal DATAHUB screenshot](../imgs/apache-ranger/datahub-plugin.png) + +
+ 4. Create a new policy under service **ranger_datahub** - this will be used to control DataHub authorization. 5. Create a test user & assign them to a policy. We'll use the `datahub` user, which is the default root user inside DataHub. @@ -80,7 +84,11 @@ Now, you should have the DataHub plugin registered with Apache Ranger. Next, we' DataHub platform access policy screenshot:
- ![Privacera Portal DATAHUB screenshot](../imgs/apache-ranger/datahub-platform-access-policy.png) + +
+ Once we've created our first policy, we can set up DataHub to start authorizing requests using Ranger policies. @@ -178,7 +186,11 @@ then follow the below sections to undo the configuration steps you have performe **ranger_datahub** service is shown in below screenshot:
- ![Privacera Portal DATAHUB screenshot](../imgs/apache-ranger/datahub-plugin.png) + +
+ 2. Delete **datahub** plugin: Execute below curl command to delete **datahub** plugin Replace variables with corresponding values in curl command diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index 2b6fd5571cc9e..7ba516c82cf1b 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -15,6 +15,9 @@ This file documents any backwards-incompatible changes in DataHub and assists pe - #8300: Clickhouse source now inherited from TwoTierSQLAlchemy. In old way we have platform_instance -> container -> co container db (None) -> container schema and now we have platform_instance -> container database. - #8300: Added `uri_opts` argument; now we can add any options for clickhouse client. +- #8659: BigQuery ingestion no longer creates DataPlatformInstance aspects by default. + This will only affect users that were depending on this aspect for custom functionality, + and can be enabled via the `include_data_platform_instance` config option. ## 0.10.5 diff --git a/docs/imgs/add-schema-tag.png b/docs/imgs/add-schema-tag.png deleted file mode 100644 index b6fd273389c90..0000000000000 Binary files a/docs/imgs/add-schema-tag.png and /dev/null differ diff --git a/docs/imgs/add-tag-search.png b/docs/imgs/add-tag-search.png deleted file mode 100644 index a129f5eba4271..0000000000000 Binary files a/docs/imgs/add-tag-search.png and /dev/null differ diff --git a/docs/imgs/add-tag.png b/docs/imgs/add-tag.png deleted file mode 100644 index 386b4cdcd9911..0000000000000 Binary files a/docs/imgs/add-tag.png and /dev/null differ diff --git a/docs/imgs/added-tag.png b/docs/imgs/added-tag.png deleted file mode 100644 index 96ae48318a35a..0000000000000 Binary files a/docs/imgs/added-tag.png and /dev/null differ diff --git a/docs/imgs/airflow/connection_error.png b/docs/imgs/airflow/connection_error.png deleted file mode 100644 index c2f3344b8cc45..0000000000000 Binary files a/docs/imgs/airflow/connection_error.png and /dev/null differ diff --git a/docs/imgs/airflow/datahub_lineage_view.png b/docs/imgs/airflow/datahub_lineage_view.png deleted file mode 100644 index c7c774c203d2f..0000000000000 Binary files a/docs/imgs/airflow/datahub_lineage_view.png and /dev/null differ diff --git a/docs/imgs/airflow/datahub_pipeline_entity.png b/docs/imgs/airflow/datahub_pipeline_entity.png deleted file mode 100644 index 715baefd784ca..0000000000000 Binary files a/docs/imgs/airflow/datahub_pipeline_entity.png and /dev/null differ diff --git a/docs/imgs/airflow/datahub_pipeline_view.png b/docs/imgs/airflow/datahub_pipeline_view.png deleted file mode 100644 index 5b3afd13c4ce6..0000000000000 Binary files a/docs/imgs/airflow/datahub_pipeline_view.png and /dev/null differ diff --git a/docs/imgs/airflow/datahub_task_view.png b/docs/imgs/airflow/datahub_task_view.png deleted file mode 100644 index 66b3487d87319..0000000000000 Binary files a/docs/imgs/airflow/datahub_task_view.png and /dev/null differ diff --git a/docs/imgs/airflow/entity_page_screenshot.png b/docs/imgs/airflow/entity_page_screenshot.png deleted file mode 100644 index a782969a1f17b..0000000000000 Binary files a/docs/imgs/airflow/entity_page_screenshot.png and /dev/null differ diff --git a/docs/imgs/airflow/find_the_dag.png b/docs/imgs/airflow/find_the_dag.png deleted file mode 100644 index 37cda041e4b75..0000000000000 Binary files a/docs/imgs/airflow/find_the_dag.png and /dev/null differ diff --git a/docs/imgs/airflow/finding_failed_log.png b/docs/imgs/airflow/finding_failed_log.png deleted file mode 100644 index 
96552ba1e1983..0000000000000 Binary files a/docs/imgs/airflow/finding_failed_log.png and /dev/null differ diff --git a/docs/imgs/airflow/paused_dag.png b/docs/imgs/airflow/paused_dag.png deleted file mode 100644 index c314de5d38d75..0000000000000 Binary files a/docs/imgs/airflow/paused_dag.png and /dev/null differ diff --git a/docs/imgs/airflow/successful_run.png b/docs/imgs/airflow/successful_run.png deleted file mode 100644 index b997cc7210ff6..0000000000000 Binary files a/docs/imgs/airflow/successful_run.png and /dev/null differ diff --git a/docs/imgs/airflow/trigger_dag.png b/docs/imgs/airflow/trigger_dag.png deleted file mode 100644 index a44999c929d4e..0000000000000 Binary files a/docs/imgs/airflow/trigger_dag.png and /dev/null differ diff --git a/docs/imgs/airflow/unpaused_dag.png b/docs/imgs/airflow/unpaused_dag.png deleted file mode 100644 index 8462562f31d97..0000000000000 Binary files a/docs/imgs/airflow/unpaused_dag.png and /dev/null differ diff --git a/docs/imgs/apache-ranger/datahub-platform-access-policy.png b/docs/imgs/apache-ranger/datahub-platform-access-policy.png deleted file mode 100644 index 7e3ff6fd372a9..0000000000000 Binary files a/docs/imgs/apache-ranger/datahub-platform-access-policy.png and /dev/null differ diff --git a/docs/imgs/apache-ranger/datahub-plugin.png b/docs/imgs/apache-ranger/datahub-plugin.png deleted file mode 100644 index 5dd044c014657..0000000000000 Binary files a/docs/imgs/apache-ranger/datahub-plugin.png and /dev/null differ diff --git a/docs/imgs/apis/postman-graphql.png b/docs/imgs/apis/postman-graphql.png deleted file mode 100644 index 1cffd226fdf77..0000000000000 Binary files a/docs/imgs/apis/postman-graphql.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/column-description-added.png b/docs/imgs/apis/tutorials/column-description-added.png deleted file mode 100644 index ed8cbd3bf5622..0000000000000 Binary files a/docs/imgs/apis/tutorials/column-description-added.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/column-level-lineage-added.png b/docs/imgs/apis/tutorials/column-level-lineage-added.png deleted file mode 100644 index 6092436e0a6a8..0000000000000 Binary files a/docs/imgs/apis/tutorials/column-level-lineage-added.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/custom-properties-added.png b/docs/imgs/apis/tutorials/custom-properties-added.png deleted file mode 100644 index a7e85d875045c..0000000000000 Binary files a/docs/imgs/apis/tutorials/custom-properties-added.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/datahub-main-ui.png b/docs/imgs/apis/tutorials/datahub-main-ui.png deleted file mode 100644 index b058e2683a851..0000000000000 Binary files a/docs/imgs/apis/tutorials/datahub-main-ui.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/dataset-created.png b/docs/imgs/apis/tutorials/dataset-created.png deleted file mode 100644 index 086dd8b7c9b16..0000000000000 Binary files a/docs/imgs/apis/tutorials/dataset-created.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/dataset-deleted.png b/docs/imgs/apis/tutorials/dataset-deleted.png deleted file mode 100644 index d94ad7e85195f..0000000000000 Binary files a/docs/imgs/apis/tutorials/dataset-deleted.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/dataset-description-added.png b/docs/imgs/apis/tutorials/dataset-description-added.png deleted file mode 100644 index 41aa9f109115b..0000000000000 Binary files a/docs/imgs/apis/tutorials/dataset-description-added.png and /dev/null differ diff 
--git a/docs/imgs/apis/tutorials/dataset-properties-added-removed.png b/docs/imgs/apis/tutorials/dataset-properties-added-removed.png deleted file mode 100644 index 9eb0284776f13..0000000000000 Binary files a/docs/imgs/apis/tutorials/dataset-properties-added-removed.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/dataset-properties-added.png b/docs/imgs/apis/tutorials/dataset-properties-added.png deleted file mode 100644 index e0d2acbb66eb5..0000000000000 Binary files a/docs/imgs/apis/tutorials/dataset-properties-added.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/dataset-properties-before.png b/docs/imgs/apis/tutorials/dataset-properties-before.png deleted file mode 100644 index b4915121a8c65..0000000000000 Binary files a/docs/imgs/apis/tutorials/dataset-properties-before.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/dataset-properties-replaced.png b/docs/imgs/apis/tutorials/dataset-properties-replaced.png deleted file mode 100644 index 8624689c20ada..0000000000000 Binary files a/docs/imgs/apis/tutorials/dataset-properties-replaced.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/deprecation-updated.png b/docs/imgs/apis/tutorials/deprecation-updated.png deleted file mode 100644 index 06fedf746f694..0000000000000 Binary files a/docs/imgs/apis/tutorials/deprecation-updated.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/domain-added.png b/docs/imgs/apis/tutorials/domain-added.png deleted file mode 100644 index cb2002ec9ab4d..0000000000000 Binary files a/docs/imgs/apis/tutorials/domain-added.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/domain-created.png b/docs/imgs/apis/tutorials/domain-created.png deleted file mode 100644 index cafab2a5e8d5c..0000000000000 Binary files a/docs/imgs/apis/tutorials/domain-created.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/domain-removed.png b/docs/imgs/apis/tutorials/domain-removed.png deleted file mode 100644 index 1b21172be11d2..0000000000000 Binary files a/docs/imgs/apis/tutorials/domain-removed.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/feature-added-to-model.png b/docs/imgs/apis/tutorials/feature-added-to-model.png deleted file mode 100644 index 311506e4b2783..0000000000000 Binary files a/docs/imgs/apis/tutorials/feature-added-to-model.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/feature-table-created.png b/docs/imgs/apis/tutorials/feature-table-created.png deleted file mode 100644 index 0541cbe572435..0000000000000 Binary files a/docs/imgs/apis/tutorials/feature-table-created.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/group-upserted.png b/docs/imgs/apis/tutorials/group-upserted.png deleted file mode 100644 index 5283f6273f02a..0000000000000 Binary files a/docs/imgs/apis/tutorials/group-upserted.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/lineage-added.png b/docs/imgs/apis/tutorials/lineage-added.png deleted file mode 100644 index b381498bad5ac..0000000000000 Binary files a/docs/imgs/apis/tutorials/lineage-added.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/model-group-added-to-model.png b/docs/imgs/apis/tutorials/model-group-added-to-model.png deleted file mode 100644 index 360b7fbb2d922..0000000000000 Binary files a/docs/imgs/apis/tutorials/model-group-added-to-model.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/model-group-created.png b/docs/imgs/apis/tutorials/model-group-created.png deleted file mode 100644 index 
2e0fdcea4803f..0000000000000 Binary files a/docs/imgs/apis/tutorials/model-group-created.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/owner-added.png b/docs/imgs/apis/tutorials/owner-added.png deleted file mode 100644 index 6508c231cfb4b..0000000000000 Binary files a/docs/imgs/apis/tutorials/owner-added.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/owner-removed.png b/docs/imgs/apis/tutorials/owner-removed.png deleted file mode 100644 index a7b6567888caf..0000000000000 Binary files a/docs/imgs/apis/tutorials/owner-removed.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/sample-ingestion.png b/docs/imgs/apis/tutorials/sample-ingestion.png deleted file mode 100644 index 40aa046904841..0000000000000 Binary files a/docs/imgs/apis/tutorials/sample-ingestion.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/tag-added.png b/docs/imgs/apis/tutorials/tag-added.png deleted file mode 100644 index fd99a04f6cceb..0000000000000 Binary files a/docs/imgs/apis/tutorials/tag-added.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/tag-created.png b/docs/imgs/apis/tutorials/tag-created.png deleted file mode 100644 index 99e3fea8a14e1..0000000000000 Binary files a/docs/imgs/apis/tutorials/tag-created.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/tag-removed.png b/docs/imgs/apis/tutorials/tag-removed.png deleted file mode 100644 index 31a267549843e..0000000000000 Binary files a/docs/imgs/apis/tutorials/tag-removed.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/term-added.png b/docs/imgs/apis/tutorials/term-added.png deleted file mode 100644 index 62e285a92e7af..0000000000000 Binary files a/docs/imgs/apis/tutorials/term-added.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/term-created.png b/docs/imgs/apis/tutorials/term-created.png deleted file mode 100644 index deff0179b155e..0000000000000 Binary files a/docs/imgs/apis/tutorials/term-created.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/term-removed.png b/docs/imgs/apis/tutorials/term-removed.png deleted file mode 100644 index dbf9f35f09339..0000000000000 Binary files a/docs/imgs/apis/tutorials/term-removed.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/user-upserted.png b/docs/imgs/apis/tutorials/user-upserted.png deleted file mode 100644 index 38c5bbb9ad828..0000000000000 Binary files a/docs/imgs/apis/tutorials/user-upserted.png and /dev/null differ diff --git a/docs/imgs/aws/aws-elasticsearch.png b/docs/imgs/aws/aws-elasticsearch.png deleted file mode 100644 index e16d5eee26fd8..0000000000000 Binary files a/docs/imgs/aws/aws-elasticsearch.png and /dev/null differ diff --git a/docs/imgs/aws/aws-msk.png b/docs/imgs/aws/aws-msk.png deleted file mode 100644 index 96a3173747007..0000000000000 Binary files a/docs/imgs/aws/aws-msk.png and /dev/null differ diff --git a/docs/imgs/aws/aws-rds.png b/docs/imgs/aws/aws-rds.png deleted file mode 100644 index ab329952c7756..0000000000000 Binary files a/docs/imgs/aws/aws-rds.png and /dev/null differ diff --git a/docs/imgs/browse-domains.png b/docs/imgs/browse-domains.png deleted file mode 100644 index 41444470517d2..0000000000000 Binary files a/docs/imgs/browse-domains.png and /dev/null differ diff --git a/docs/imgs/cancelled-ingestion.png b/docs/imgs/cancelled-ingestion.png deleted file mode 100644 index 0c4af7b66a8ff..0000000000000 Binary files a/docs/imgs/cancelled-ingestion.png and /dev/null differ diff --git a/docs/imgs/confluent-cloud-config-2.png 
b/docs/imgs/confluent-cloud-config-2.png deleted file mode 100644 index 543101154f42c..0000000000000 Binary files a/docs/imgs/confluent-cloud-config-2.png and /dev/null differ diff --git a/docs/imgs/confluent-cloud-config.png b/docs/imgs/confluent-cloud-config.png deleted file mode 100644 index a2490eab5c6a7..0000000000000 Binary files a/docs/imgs/confluent-cloud-config.png and /dev/null differ diff --git a/docs/imgs/confluent-create-topic.png b/docs/imgs/confluent-create-topic.png deleted file mode 100644 index 1972bb3770388..0000000000000 Binary files a/docs/imgs/confluent-create-topic.png and /dev/null differ diff --git a/docs/imgs/create-domain.png b/docs/imgs/create-domain.png deleted file mode 100644 index 1db2090fca6b8..0000000000000 Binary files a/docs/imgs/create-domain.png and /dev/null differ diff --git a/docs/imgs/create-new-ingestion-source-button.png b/docs/imgs/create-new-ingestion-source-button.png deleted file mode 100644 index c425f0837c51d..0000000000000 Binary files a/docs/imgs/create-new-ingestion-source-button.png and /dev/null differ diff --git a/docs/imgs/create-secret.png b/docs/imgs/create-secret.png deleted file mode 100644 index a0cc63e3b4892..0000000000000 Binary files a/docs/imgs/create-secret.png and /dev/null differ diff --git a/docs/imgs/custom-ingestion-cli-version.png b/docs/imgs/custom-ingestion-cli-version.png deleted file mode 100644 index 43d4736684abb..0000000000000 Binary files a/docs/imgs/custom-ingestion-cli-version.png and /dev/null differ diff --git a/docs/imgs/datahub-architecture.png b/docs/imgs/datahub-architecture.png deleted file mode 100644 index 236f939f74198..0000000000000 Binary files a/docs/imgs/datahub-architecture.png and /dev/null differ diff --git a/docs/imgs/datahub-architecture.svg b/docs/imgs/datahub-architecture.svg deleted file mode 100644 index 842194a5e377c..0000000000000 --- a/docs/imgs/datahub-architecture.svg +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/docs/imgs/datahub-components.png b/docs/imgs/datahub-components.png deleted file mode 100644 index 8b7d0e5330275..0000000000000 Binary files a/docs/imgs/datahub-components.png and /dev/null differ diff --git a/docs/imgs/datahub-logo-color-mark.svg b/docs/imgs/datahub-logo-color-mark.svg deleted file mode 100644 index a984092952bae..0000000000000 --- a/docs/imgs/datahub-logo-color-mark.svg +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/docs/imgs/datahub-metadata-ingestion-framework.png b/docs/imgs/datahub-metadata-ingestion-framework.png deleted file mode 100644 index 1319329710906..0000000000000 Binary files a/docs/imgs/datahub-metadata-ingestion-framework.png and /dev/null differ diff --git a/docs/imgs/datahub-metadata-model.png b/docs/imgs/datahub-metadata-model.png deleted file mode 100644 index 59449cd0d4ef5..0000000000000 Binary files a/docs/imgs/datahub-metadata-model.png and /dev/null differ diff --git a/docs/imgs/datahub-sequence-diagram.png b/docs/imgs/datahub-sequence-diagram.png deleted file mode 100644 index b5a8f8a9c25ce..0000000000000 Binary files a/docs/imgs/datahub-sequence-diagram.png and /dev/null differ diff --git a/docs/imgs/datahub-serving.png b/docs/imgs/datahub-serving.png deleted file mode 100644 index 67a2f8eb3f085..0000000000000 Binary files a/docs/imgs/datahub-serving.png and /dev/null differ diff --git a/docs/imgs/development/intellij-remote-debug.png b/docs/imgs/development/intellij-remote-debug.png deleted file mode 100644 index 32a41a75d1dc3..0000000000000 Binary files 
a/docs/imgs/development/intellij-remote-debug.png and /dev/null differ diff --git a/docs/imgs/domain-entities.png b/docs/imgs/domain-entities.png deleted file mode 100644 index 5766d051fa209..0000000000000 Binary files a/docs/imgs/domain-entities.png and /dev/null differ diff --git a/docs/imgs/domains-tab.png b/docs/imgs/domains-tab.png deleted file mode 100644 index 20be5b103fdca..0000000000000 Binary files a/docs/imgs/domains-tab.png and /dev/null differ diff --git a/docs/imgs/entity-registry-diagram.png b/docs/imgs/entity-registry-diagram.png deleted file mode 100644 index 08cb5edd8e13f..0000000000000 Binary files a/docs/imgs/entity-registry-diagram.png and /dev/null differ diff --git a/docs/imgs/entity.png b/docs/imgs/entity.png deleted file mode 100644 index cfe9eb38b2921..0000000000000 Binary files a/docs/imgs/entity.png and /dev/null differ diff --git a/docs/imgs/example-mysql-recipe.png b/docs/imgs/example-mysql-recipe.png deleted file mode 100644 index 9cb2cbb169a56..0000000000000 Binary files a/docs/imgs/example-mysql-recipe.png and /dev/null differ diff --git a/docs/imgs/failed-ingestion.png b/docs/imgs/failed-ingestion.png deleted file mode 100644 index 4f9de8eb002d2..0000000000000 Binary files a/docs/imgs/failed-ingestion.png and /dev/null differ diff --git a/docs/imgs/feature-create-new-tag.gif b/docs/imgs/feature-create-new-tag.gif deleted file mode 100644 index 57b8ad852dd5b..0000000000000 Binary files a/docs/imgs/feature-create-new-tag.gif and /dev/null differ diff --git a/docs/imgs/feature-datahub-analytics.png b/docs/imgs/feature-datahub-analytics.png deleted file mode 100644 index 7fe66b84682f9..0000000000000 Binary files a/docs/imgs/feature-datahub-analytics.png and /dev/null differ diff --git a/docs/imgs/feature-rich-documentation.gif b/docs/imgs/feature-rich-documentation.gif deleted file mode 100644 index 48ad795670022..0000000000000 Binary files a/docs/imgs/feature-rich-documentation.gif and /dev/null differ diff --git a/docs/imgs/feature-tag-browse.gif b/docs/imgs/feature-tag-browse.gif deleted file mode 100644 index e70a30db7d3ba..0000000000000 Binary files a/docs/imgs/feature-tag-browse.gif and /dev/null differ diff --git a/docs/imgs/feature-validation-timeseries.png b/docs/imgs/feature-validation-timeseries.png deleted file mode 100644 index 28ce1daec5f32..0000000000000 Binary files a/docs/imgs/feature-validation-timeseries.png and /dev/null differ diff --git a/docs/imgs/feature-view-entitiy-details-via-lineage-vis.gif b/docs/imgs/feature-view-entitiy-details-via-lineage-vis.gif deleted file mode 100644 index aad77df373574..0000000000000 Binary files a/docs/imgs/feature-view-entitiy-details-via-lineage-vis.gif and /dev/null differ diff --git a/docs/imgs/gcp/ingress1.png b/docs/imgs/gcp/ingress1.png deleted file mode 100644 index 4cb49834af5b6..0000000000000 Binary files a/docs/imgs/gcp/ingress1.png and /dev/null differ diff --git a/docs/imgs/gcp/ingress2.png b/docs/imgs/gcp/ingress2.png deleted file mode 100644 index cdf2446b0e923..0000000000000 Binary files a/docs/imgs/gcp/ingress2.png and /dev/null differ diff --git a/docs/imgs/gcp/ingress3.png b/docs/imgs/gcp/ingress3.png deleted file mode 100644 index cc3745ad97f5b..0000000000000 Binary files a/docs/imgs/gcp/ingress3.png and /dev/null differ diff --git a/docs/imgs/gcp/ingress_final.png b/docs/imgs/gcp/ingress_final.png deleted file mode 100644 index a30ca744c49f7..0000000000000 Binary files a/docs/imgs/gcp/ingress_final.png and /dev/null differ diff --git a/docs/imgs/gcp/ingress_ready.png 
b/docs/imgs/gcp/ingress_ready.png deleted file mode 100644 index d14016e420fd3..0000000000000 Binary files a/docs/imgs/gcp/ingress_ready.png and /dev/null differ diff --git a/docs/imgs/gcp/services_ingress.png b/docs/imgs/gcp/services_ingress.png deleted file mode 100644 index 1d9ff2b313715..0000000000000 Binary files a/docs/imgs/gcp/services_ingress.png and /dev/null differ diff --git a/docs/imgs/glossary/add-term-modal.png b/docs/imgs/glossary/add-term-modal.png deleted file mode 100644 index e32a9cb8d648c..0000000000000 Binary files a/docs/imgs/glossary/add-term-modal.png and /dev/null differ diff --git a/docs/imgs/glossary/add-term-to-entity.png b/docs/imgs/glossary/add-term-to-entity.png deleted file mode 100644 index 7487a68c0d755..0000000000000 Binary files a/docs/imgs/glossary/add-term-to-entity.png and /dev/null differ diff --git a/docs/imgs/glossary/create-from-node.png b/docs/imgs/glossary/create-from-node.png deleted file mode 100644 index 70638d083343c..0000000000000 Binary files a/docs/imgs/glossary/create-from-node.png and /dev/null differ diff --git a/docs/imgs/glossary/create-modal.png b/docs/imgs/glossary/create-modal.png deleted file mode 100644 index e84fb5a36e2d4..0000000000000 Binary files a/docs/imgs/glossary/create-modal.png and /dev/null differ diff --git a/docs/imgs/glossary/delete-button.png b/docs/imgs/glossary/delete-button.png deleted file mode 100644 index 3e0cc2a5b0a54..0000000000000 Binary files a/docs/imgs/glossary/delete-button.png and /dev/null differ diff --git a/docs/imgs/glossary/edit-term.png b/docs/imgs/glossary/edit-term.png deleted file mode 100644 index 62b0e425c8c4f..0000000000000 Binary files a/docs/imgs/glossary/edit-term.png and /dev/null differ diff --git a/docs/imgs/glossary/glossary-button.png b/docs/imgs/glossary/glossary-button.png deleted file mode 100644 index e4b8fd2393587..0000000000000 Binary files a/docs/imgs/glossary/glossary-button.png and /dev/null differ diff --git a/docs/imgs/glossary/move-term-button.png b/docs/imgs/glossary/move-term-button.png deleted file mode 100644 index df03c820340ef..0000000000000 Binary files a/docs/imgs/glossary/move-term-button.png and /dev/null differ diff --git a/docs/imgs/glossary/move-term-modal.png b/docs/imgs/glossary/move-term-modal.png deleted file mode 100644 index 0fda501911b2b..0000000000000 Binary files a/docs/imgs/glossary/move-term-modal.png and /dev/null differ diff --git a/docs/imgs/glossary/root-glossary-create.png b/docs/imgs/glossary/root-glossary-create.png deleted file mode 100644 index c91f397eb6213..0000000000000 Binary files a/docs/imgs/glossary/root-glossary-create.png and /dev/null differ diff --git a/docs/imgs/glossary/root-glossary.png b/docs/imgs/glossary/root-glossary.png deleted file mode 100644 index 1296c16b0dc3d..0000000000000 Binary files a/docs/imgs/glossary/root-glossary.png and /dev/null differ diff --git a/docs/imgs/ingestion-architecture.png b/docs/imgs/ingestion-architecture.png deleted file mode 100644 index fc7bc74acacfa..0000000000000 Binary files a/docs/imgs/ingestion-architecture.png and /dev/null differ diff --git a/docs/imgs/ingestion-logs.png b/docs/imgs/ingestion-logs.png deleted file mode 100644 index 42211be7379d6..0000000000000 Binary files a/docs/imgs/ingestion-logs.png and /dev/null differ diff --git a/docs/imgs/ingestion-privileges.png b/docs/imgs/ingestion-privileges.png deleted file mode 100644 index 8e23868309676..0000000000000 Binary files a/docs/imgs/ingestion-privileges.png and /dev/null differ diff --git a/docs/imgs/ingestion-tab.png 
b/docs/imgs/ingestion-tab.png deleted file mode 100644 index 046068c63bdb7..0000000000000 Binary files a/docs/imgs/ingestion-tab.png and /dev/null differ diff --git a/docs/imgs/ingestion-with-token.png b/docs/imgs/ingestion-with-token.png deleted file mode 100644 index 5e1a2cce036f7..0000000000000 Binary files a/docs/imgs/ingestion-with-token.png and /dev/null differ diff --git a/docs/imgs/invite-users-button.png b/docs/imgs/invite-users-button.png deleted file mode 100644 index a5d07a1c1e7e7..0000000000000 Binary files a/docs/imgs/invite-users-button.png and /dev/null differ diff --git a/docs/imgs/invite-users-popup.png b/docs/imgs/invite-users-popup.png deleted file mode 100644 index 621b1521eae75..0000000000000 Binary files a/docs/imgs/invite-users-popup.png and /dev/null differ diff --git a/docs/imgs/lineage.png b/docs/imgs/lineage.png deleted file mode 100644 index 7488c1e04c31b..0000000000000 Binary files a/docs/imgs/lineage.png and /dev/null differ diff --git a/docs/imgs/list-domains.png b/docs/imgs/list-domains.png deleted file mode 100644 index 98a28130f8c99..0000000000000 Binary files a/docs/imgs/list-domains.png and /dev/null differ diff --git a/docs/imgs/locust-example.png b/docs/imgs/locust-example.png deleted file mode 100644 index bbae3e0ca19d0..0000000000000 Binary files a/docs/imgs/locust-example.png and /dev/null differ diff --git a/docs/imgs/metadata-model-chart.png b/docs/imgs/metadata-model-chart.png deleted file mode 100644 index 2fb7483654906..0000000000000 Binary files a/docs/imgs/metadata-model-chart.png and /dev/null differ diff --git a/docs/imgs/metadata-model-to-fork-or-not-to.png b/docs/imgs/metadata-model-to-fork-or-not-to.png deleted file mode 100644 index f9d89d555196d..0000000000000 Binary files a/docs/imgs/metadata-model-to-fork-or-not-to.png and /dev/null differ diff --git a/docs/imgs/metadata-modeling.png b/docs/imgs/metadata-modeling.png deleted file mode 100644 index cbad7613e04e4..0000000000000 Binary files a/docs/imgs/metadata-modeling.png and /dev/null differ diff --git a/docs/imgs/metadata-service-auth.png b/docs/imgs/metadata-service-auth.png deleted file mode 100644 index 15a3ac51876c2..0000000000000 Binary files a/docs/imgs/metadata-service-auth.png and /dev/null differ diff --git a/docs/imgs/metadata-serving.png b/docs/imgs/metadata-serving.png deleted file mode 100644 index 54b928a0cff52..0000000000000 Binary files a/docs/imgs/metadata-serving.png and /dev/null differ diff --git a/docs/imgs/metadata.png b/docs/imgs/metadata.png deleted file mode 100644 index 45bb0cdce12e9..0000000000000 Binary files a/docs/imgs/metadata.png and /dev/null differ diff --git a/docs/imgs/name-ingestion-source.png b/docs/imgs/name-ingestion-source.png deleted file mode 100644 index bde1208248473..0000000000000 Binary files a/docs/imgs/name-ingestion-source.png and /dev/null differ diff --git a/docs/imgs/no-code-after.png b/docs/imgs/no-code-after.png deleted file mode 100644 index c0eee88625ace..0000000000000 Binary files a/docs/imgs/no-code-after.png and /dev/null differ diff --git a/docs/imgs/no-code-before.png b/docs/imgs/no-code-before.png deleted file mode 100644 index 50315578b1804..0000000000000 Binary files a/docs/imgs/no-code-before.png and /dev/null differ diff --git a/docs/imgs/platform-instances-for-ingestion.png b/docs/imgs/platform-instances-for-ingestion.png deleted file mode 100644 index 740249a805fb8..0000000000000 Binary files a/docs/imgs/platform-instances-for-ingestion.png and /dev/null differ diff --git 
a/docs/imgs/quickstart-ingestion-config.png b/docs/imgs/quickstart-ingestion-config.png deleted file mode 100644 index de51777ccddc3..0000000000000 Binary files a/docs/imgs/quickstart-ingestion-config.png and /dev/null differ diff --git a/docs/imgs/reset-credentials-screen.png b/docs/imgs/reset-credentials-screen.png deleted file mode 100644 index 4b680837b77ab..0000000000000 Binary files a/docs/imgs/reset-credentials-screen.png and /dev/null differ diff --git a/docs/imgs/reset-user-password-button.png b/docs/imgs/reset-user-password-button.png deleted file mode 100644 index 5b1f3ee153d07..0000000000000 Binary files a/docs/imgs/reset-user-password-button.png and /dev/null differ diff --git a/docs/imgs/reset-user-password-popup.png b/docs/imgs/reset-user-password-popup.png deleted file mode 100644 index ac2456dde4d4d..0000000000000 Binary files a/docs/imgs/reset-user-password-popup.png and /dev/null differ diff --git a/docs/imgs/running-ingestion.png b/docs/imgs/running-ingestion.png deleted file mode 100644 index a03fb444a029e..0000000000000 Binary files a/docs/imgs/running-ingestion.png and /dev/null differ diff --git a/docs/imgs/s3-ingestion/10_outputs.png b/docs/imgs/s3-ingestion/10_outputs.png deleted file mode 100644 index e0d1ed3376ade..0000000000000 Binary files a/docs/imgs/s3-ingestion/10_outputs.png and /dev/null differ diff --git a/docs/imgs/s3-ingestion/1_crawler-info.png b/docs/imgs/s3-ingestion/1_crawler-info.png deleted file mode 100644 index 1288247392047..0000000000000 Binary files a/docs/imgs/s3-ingestion/1_crawler-info.png and /dev/null differ diff --git a/docs/imgs/s3-ingestion/2_crawler-type.png b/docs/imgs/s3-ingestion/2_crawler-type.png deleted file mode 100644 index 4898438417913..0000000000000 Binary files a/docs/imgs/s3-ingestion/2_crawler-type.png and /dev/null differ diff --git a/docs/imgs/s3-ingestion/3_data-store.png b/docs/imgs/s3-ingestion/3_data-store.png deleted file mode 100644 index d29e4b1be05d6..0000000000000 Binary files a/docs/imgs/s3-ingestion/3_data-store.png and /dev/null differ diff --git a/docs/imgs/s3-ingestion/4_data-store-2.png b/docs/imgs/s3-ingestion/4_data-store-2.png deleted file mode 100644 index c0a6f140bedb2..0000000000000 Binary files a/docs/imgs/s3-ingestion/4_data-store-2.png and /dev/null differ diff --git a/docs/imgs/s3-ingestion/5_iam.png b/docs/imgs/s3-ingestion/5_iam.png deleted file mode 100644 index 73a631cb74f56..0000000000000 Binary files a/docs/imgs/s3-ingestion/5_iam.png and /dev/null differ diff --git a/docs/imgs/s3-ingestion/6_schedule.png b/docs/imgs/s3-ingestion/6_schedule.png deleted file mode 100644 index c5df59348fbc6..0000000000000 Binary files a/docs/imgs/s3-ingestion/6_schedule.png and /dev/null differ diff --git a/docs/imgs/s3-ingestion/7_output.png b/docs/imgs/s3-ingestion/7_output.png deleted file mode 100644 index 6201fa40bcfb3..0000000000000 Binary files a/docs/imgs/s3-ingestion/7_output.png and /dev/null differ diff --git a/docs/imgs/s3-ingestion/8_review.png b/docs/imgs/s3-ingestion/8_review.png deleted file mode 100644 index 2d27e79c2128b..0000000000000 Binary files a/docs/imgs/s3-ingestion/8_review.png and /dev/null differ diff --git a/docs/imgs/s3-ingestion/9_run.png b/docs/imgs/s3-ingestion/9_run.png deleted file mode 100644 index 2b0644f6ad038..0000000000000 Binary files a/docs/imgs/s3-ingestion/9_run.png and /dev/null differ diff --git a/docs/imgs/schedule-ingestion.png b/docs/imgs/schedule-ingestion.png deleted file mode 100644 index 0e6ec8e268c32..0000000000000 Binary files 
a/docs/imgs/schedule-ingestion.png and /dev/null differ diff --git a/docs/imgs/schema-blame-blame-activated.png b/docs/imgs/schema-blame-blame-activated.png deleted file mode 100644 index 363466c39aedf..0000000000000 Binary files a/docs/imgs/schema-blame-blame-activated.png and /dev/null differ diff --git a/docs/imgs/schema-history-audit-activated.png b/docs/imgs/schema-history-audit-activated.png deleted file mode 100644 index f59676b9b8a8f..0000000000000 Binary files a/docs/imgs/schema-history-audit-activated.png and /dev/null differ diff --git a/docs/imgs/schema-history-latest-version.png b/docs/imgs/schema-history-latest-version.png deleted file mode 100644 index 0a54df4d520d5..0000000000000 Binary files a/docs/imgs/schema-history-latest-version.png and /dev/null differ diff --git a/docs/imgs/schema-history-older-version.png b/docs/imgs/schema-history-older-version.png deleted file mode 100644 index 8d295f176104f..0000000000000 Binary files a/docs/imgs/schema-history-older-version.png and /dev/null differ diff --git a/docs/imgs/search-by-domain.png b/docs/imgs/search-by-domain.png deleted file mode 100644 index 4b92e58959187..0000000000000 Binary files a/docs/imgs/search-by-domain.png and /dev/null differ diff --git a/docs/imgs/search-domain.png b/docs/imgs/search-domain.png deleted file mode 100644 index b1359e07d5fc2..0000000000000 Binary files a/docs/imgs/search-domain.png and /dev/null differ diff --git a/docs/imgs/search-tag.png b/docs/imgs/search-tag.png deleted file mode 100644 index cf4b6b629d1e2..0000000000000 Binary files a/docs/imgs/search-tag.png and /dev/null differ diff --git a/docs/imgs/select-platform-template.png b/docs/imgs/select-platform-template.png deleted file mode 100644 index 4f78e2b7309ed..0000000000000 Binary files a/docs/imgs/select-platform-template.png and /dev/null differ diff --git a/docs/imgs/set-domain-id.png b/docs/imgs/set-domain-id.png deleted file mode 100644 index 3e1dde4ae51ee..0000000000000 Binary files a/docs/imgs/set-domain-id.png and /dev/null differ diff --git a/docs/imgs/set-domain.png b/docs/imgs/set-domain.png deleted file mode 100644 index 1c4460e747835..0000000000000 Binary files a/docs/imgs/set-domain.png and /dev/null differ diff --git a/docs/imgs/successful-ingestion.png b/docs/imgs/successful-ingestion.png deleted file mode 100644 index fa8dbdff7501e..0000000000000 Binary files a/docs/imgs/successful-ingestion.png and /dev/null differ diff --git a/docs/imgs/timeline/dropdown-apis.png b/docs/imgs/timeline/dropdown-apis.png deleted file mode 100644 index f7aba08bbc061..0000000000000 Binary files a/docs/imgs/timeline/dropdown-apis.png and /dev/null differ diff --git a/docs/imgs/timeline/swagger-ui.png b/docs/imgs/timeline/swagger-ui.png deleted file mode 100644 index e52a57e8ca670..0000000000000 Binary files a/docs/imgs/timeline/swagger-ui.png and /dev/null differ diff --git a/docs/imgs/timeline/timeline-conceptually.png b/docs/imgs/timeline/timeline-conceptually.png deleted file mode 100644 index 70bd843bf8aed..0000000000000 Binary files a/docs/imgs/timeline/timeline-conceptually.png and /dev/null differ diff --git a/docs/imgs/user-sign-up-screen.png b/docs/imgs/user-sign-up-screen.png deleted file mode 100644 index 88c2589203bd1..0000000000000 Binary files a/docs/imgs/user-sign-up-screen.png and /dev/null differ diff --git a/docs/links.md b/docs/links.md index f175262b9b5d9..45ba391e557cd 100644 --- a/docs/links.md +++ b/docs/links.md @@ -39,7 +39,7 @@ * [Creating Notebook-based Dynamic 
Dashboards](https://towardsdatascience.com/creating-notebook-based-dynamic-dashboards-91f936adc6f3) ## Talks & Presentations -* [DataHub: Powering LinkedIn's Metadata](demo/DataHub_-_Powering_LinkedIn_Metadata.pdf) @ [Budapest Data Forum 2020](https://budapestdata.hu/2020/en/) +* [DataHub: Powering LinkedIn's Metadata](https://github.com/acryldata/static-assets-test/raw/master/imgs/demo/DataHub_-_Powering_LinkedIn_Metadata.pdf) @ [Budapest Data Forum 2020](https://budapestdata.hu/2020/en/) * [Taming the Data Beast Using DataHub](https://www.youtube.com/watch?v=bo4OhiPro7Y) @ [Data Engineering Melbourne Meetup November 2020](https://www.meetup.com/Data-Engineering-Melbourne/events/kgnvlrybcpbjc/) * [Metadata Management And Integration At LinkedIn With DataHub](https://www.dataengineeringpodcast.com/datahub-metadata-management-episode-147/) @ [Data Engineering Podcast](https://www.dataengineeringpodcast.com) * [The evolution of metadata: LinkedIn’s story](https://speakerdeck.com/shirshanka/the-evolution-of-metadata-linkedins-journey-strata-nyc-2019) @ [Strata Data Conference 2019](https://conferences.oreilly.com/strata/strata-ny-2019.html) diff --git a/docs/managed-datahub/chrome-extension.md b/docs/managed-datahub/chrome-extension.md index a614327c7fd29..c6840f4e8e221 100644 --- a/docs/managed-datahub/chrome-extension.md +++ b/docs/managed-datahub/chrome-extension.md @@ -10,7 +10,11 @@ import FeatureAvailability from '@site/src/components/FeatureAvailability'; In order to use the Acryl DataHub Chrome extension, you need to download it onto your browser from the Chrome web store [here](https://chrome.google.com/webstore/detail/datahub-chrome-extension/aoenebhmfokhglijmoacfjcnebdpchfj). -![](imgs/saas/chrome-store-extension-screenshot.png) + +

+ +

+ Simply click "Add to Chrome" then "Add extension" on the ensuing popup. @@ -20,11 +24,19 @@ Once you have your extension installed, you'll need to configure it to work with 1. Click the extension button on the right of your browser's address bar to view all of your installed extensions. Click on the newly installed DataHub extension. -![](imgs/saas/extension_open_popup.png) + +

+ +

+ 2. Fill in your DataHub domain and click "Continue" in the extension popup that appears. -![](imgs/saas/extension_enter_domain.png) + +

+ +

+ If your organization uses standard SaaS domains for Looker, you should be ready to go! @@ -34,11 +46,19 @@ Some organizations have custom SaaS domains for Looker and some Acryl DataHub de 1. Click on the extension button and select your DataHub extension to open the popup again. Now click the settings icon in order to open the configurations page. -![](imgs/saas/extension_open_options_page.png) + +

+ +

+ 2. Fill out and save any custom configurations you have in the **TOOL CONFIGURATIONS** section. Here you can configure a custom domain, a Platform Instance associated with that domain, and the Environment set on your DataHub assets. If you don't have a custom domain but do have a custom Platform Instance or Environment, feel free to leave the domain field empty. -![](imgs/saas/extension_custom_configs.png) + +

+ +

+ ## Using the Extension @@ -52,7 +72,11 @@ Once you have everything configured on your extension, it's time to use it! 4. Click the Acryl DataHub extension button on the bottom right of your page to open a drawer where you can now see additional information about this asset right from your DataHub instance. -![](imgs/saas/extension_view_in_looker.png) + +

+ +

+ ## Advanced: Self-Hosted DataHub diff --git a/docs/managed-datahub/datahub-api/graphql-api/getting-started.md b/docs/managed-datahub/datahub-api/graphql-api/getting-started.md index 3c57b0a21d96e..57d46f05c4e0c 100644 --- a/docs/managed-datahub/datahub-api/graphql-api/getting-started.md +++ b/docs/managed-datahub/datahub-api/graphql-api/getting-started.md @@ -10,7 +10,11 @@ For a full reference to the Queries & Mutations available for consumption, check ### Connecting to the API -![](../../imgs/saas/image-(3).png) + +

+ +

+ When you generate the token, you will see an example `curl` command which you can use to connect to the GraphQL API. diff --git a/docs/managed-datahub/datahub-api/graphql-api/incidents-api-beta.md index 89bacb2009e49..bfd8e8f2dae1b 100644 --- a/docs/managed-datahub/datahub-api/graphql-api/incidents-api-beta.md +++ b/docs/managed-datahub/datahub-api/graphql-api/incidents-api-beta.md @@ -404,7 +404,11 @@ You can configure Acryl to send slack notifications to a specific channel when i These notifications are also able to tag the immediate asset's owners, along with the owners of downstream assets consuming it. -![](../../imgs/saas/Screen-Shot-2022-03-22-at-6.46.41-PM.png) + +

+ +

+ To do so, simply follow the [Slack Integration Guide](docs/managed-datahub/saas-slack-setup.md) and contact your Acryl customer success team to enable the feature! diff --git a/docs/managed-datahub/imgs/saas/DataHub-Architecture.png b/docs/managed-datahub/imgs/saas/DataHub-Architecture.png deleted file mode 100644 index 95b3ab0b06ad6..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/DataHub-Architecture.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-01-13-at-7.45.56-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-01-13-at-7.45.56-PM.png deleted file mode 100644 index 721989a6c37e1..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-01-13-at-7.45.56-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-01-24-at-4.35.17-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-01-24-at-4.35.17-PM.png deleted file mode 100644 index dffac92f257c7..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-01-24-at-4.35.17-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-01-24-at-4.37.22-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-01-24-at-4.37.22-PM.png deleted file mode 100644 index ff0c29de1fbad..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-01-24-at-4.37.22-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-07-at-10.23.31-AM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-07-at-10.23.31-AM.png deleted file mode 100644 index 070bfd9f6b897..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-07-at-10.23.31-AM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-22-at-6.43.25-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-22-at-6.43.25-PM.png deleted file mode 100644 index b4bb4e2ba60ed..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-22-at-6.43.25-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-22-at-6.44.15-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-22-at-6.44.15-PM.png deleted file mode 100644 index b0397afd1b3a4..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-22-at-6.44.15-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-22-at-6.46.41-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-22-at-6.46.41-PM.png deleted file mode 100644 index 9258badb6f088..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-22-at-6.46.41-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-4.52.55-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-4.52.55-PM.png deleted file mode 100644 index 386b4cdcd9911..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-4.52.55-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-4.56.50-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-4.56.50-PM.png deleted file mode 100644 index a129f5eba4271..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-4.56.50-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-4.58.46-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-4.58.46-PM.png 
deleted file mode 100644 index 96ae48318a35a..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-4.58.46-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-5.01.16-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-5.01.16-PM.png deleted file mode 100644 index b6fd273389c90..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-5.01.16-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-5.03.36-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-5.03.36-PM.png deleted file mode 100644 index 0acd4e75bc6d2..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-5.03.36-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-13-at-2.34.24-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-13-at-2.34.24-PM.png deleted file mode 100644 index 364b9292cfaab..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-13-at-2.34.24-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-13-at-7.56.16-AM-(1).png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-13-at-7.56.16-AM-(1).png deleted file mode 100644 index 6a12dc545ec62..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-13-at-7.56.16-AM-(1).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-13-at-7.56.16-AM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-13-at-7.56.16-AM.png deleted file mode 100644 index 6a12dc545ec62..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-13-at-7.56.16-AM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-13-at-8.02.55-AM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-13-at-8.02.55-AM.png deleted file mode 100644 index 83645e00d724a..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-13-at-8.02.55-AM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-24-at-11.02.47-AM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-24-at-11.02.47-AM.png deleted file mode 100644 index a2f239ce847e0..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-24-at-11.02.47-AM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-24-at-12.59.38-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-24-at-12.59.38-PM.png deleted file mode 100644 index e31d4b089d929..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-24-at-12.59.38-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.21.42-AM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.21.42-AM.png deleted file mode 100644 index c003581c9d1b6..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.21.42-AM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.22.23-AM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.22.23-AM.png deleted file mode 100644 index 660dd121dd0a4..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.22.23-AM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.23.08-AM.png 
b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.23.08-AM.png deleted file mode 100644 index 07e3c71dba262..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.23.08-AM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.47.57-AM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.47.57-AM.png deleted file mode 100644 index 579e7f62af708..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.47.57-AM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-29-at-6.07.25-PM-(1).png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-29-at-6.07.25-PM-(1).png deleted file mode 100644 index f85f4d5c79bfb..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-29-at-6.07.25-PM-(1).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-29-at-6.07.25-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-29-at-6.07.25-PM.png deleted file mode 100644 index f85f4d5c79bfb..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-29-at-6.07.25-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-4.16.52-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-4.16.52-PM.png deleted file mode 100644 index cb8b7470cd957..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-4.16.52-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-4.23.32-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-4.23.32-PM.png deleted file mode 100644 index 1de51e33d87c2..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-4.23.32-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-5.12.47-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-5.12.47-PM.png deleted file mode 100644 index df687dabe345c..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-5.12.47-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-5.12.56-PM-(1).png b/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-5.12.56-PM-(1).png deleted file mode 100644 index a8d9ee37c7a55..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-5.12.56-PM-(1).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-5.12.56-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-5.12.56-PM.png deleted file mode 100644 index a8d9ee37c7a55..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-5.12.56-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Untitled(1).png b/docs/managed-datahub/imgs/saas/Untitled(1).png deleted file mode 100644 index 87846e7897f6e..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Untitled(1).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Untitled-(2)-(1).png b/docs/managed-datahub/imgs/saas/Untitled-(2)-(1).png deleted file mode 100644 index 7715bf4a51fbe..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Untitled-(2)-(1).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Untitled-(2).png b/docs/managed-datahub/imgs/saas/Untitled-(2).png deleted file mode 100644 index 
a01a1af370442..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Untitled-(2).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Untitled-(3).png b/docs/managed-datahub/imgs/saas/Untitled-(3).png deleted file mode 100644 index 02d84b326896c..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Untitled-(3).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Untitled-(4).png b/docs/managed-datahub/imgs/saas/Untitled-(4).png deleted file mode 100644 index a01a1af370442..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Untitled-(4).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Untitled.png b/docs/managed-datahub/imgs/saas/Untitled.png deleted file mode 100644 index a01a1af370442..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Untitled.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/chrome-store-extension-screenshot.png b/docs/managed-datahub/imgs/saas/chrome-store-extension-screenshot.png deleted file mode 100644 index e00a4d57f32dd..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/chrome-store-extension-screenshot.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/extension_custom_configs.png b/docs/managed-datahub/imgs/saas/extension_custom_configs.png deleted file mode 100644 index b3d70dfac00ff..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/extension_custom_configs.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/extension_developer_mode.png b/docs/managed-datahub/imgs/saas/extension_developer_mode.png deleted file mode 100644 index e740d15912e17..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/extension_developer_mode.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/extension_enter_domain.png b/docs/managed-datahub/imgs/saas/extension_enter_domain.png deleted file mode 100644 index 3304fa168beaf..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/extension_enter_domain.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/extension_load_unpacked.png b/docs/managed-datahub/imgs/saas/extension_load_unpacked.png deleted file mode 100644 index 8f56705cd9176..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/extension_load_unpacked.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/extension_open_options_page.png b/docs/managed-datahub/imgs/saas/extension_open_options_page.png deleted file mode 100644 index c1366d5673b59..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/extension_open_options_page.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/extension_open_popup.png b/docs/managed-datahub/imgs/saas/extension_open_popup.png deleted file mode 100644 index 216056b847fb5..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/extension_open_popup.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/extension_view_in_looker.png b/docs/managed-datahub/imgs/saas/extension_view_in_looker.png deleted file mode 100644 index bf854b3e840f7..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/extension_view_in_looker.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/home-(1).png b/docs/managed-datahub/imgs/saas/home-(1).png deleted file mode 100644 index 88cf2017dd7e7..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/home-(1).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/home.png b/docs/managed-datahub/imgs/saas/home.png deleted 
file mode 100644 index 8ad63deec75c9..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/home.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(1).png b/docs/managed-datahub/imgs/saas/image-(1).png deleted file mode 100644 index c1a249125fcf7..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(1).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(10).png b/docs/managed-datahub/imgs/saas/image-(10).png deleted file mode 100644 index a580fdc3d6730..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(10).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(11).png b/docs/managed-datahub/imgs/saas/image-(11).png deleted file mode 100644 index ee95eb4384272..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(11).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(12).png b/docs/managed-datahub/imgs/saas/image-(12).png deleted file mode 100644 index bbd8e6a66cf85..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(12).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(13).png b/docs/managed-datahub/imgs/saas/image-(13).png deleted file mode 100644 index bbd8e6a66cf85..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(13).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(14).png b/docs/managed-datahub/imgs/saas/image-(14).png deleted file mode 100644 index a580fdc3d6730..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(14).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(15).png b/docs/managed-datahub/imgs/saas/image-(15).png deleted file mode 100644 index f282e2d92c1a1..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(15).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(16).png b/docs/managed-datahub/imgs/saas/image-(16).png deleted file mode 100644 index 1340c77bd648c..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(16).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(17).png b/docs/managed-datahub/imgs/saas/image-(17).png deleted file mode 100644 index 6eee2fb2d821f..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(17).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(2).png b/docs/managed-datahub/imgs/saas/image-(2).png deleted file mode 100644 index cf475edd7b95d..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(2).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(3).png b/docs/managed-datahub/imgs/saas/image-(3).png deleted file mode 100644 index b08818ff3e97c..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(3).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(4).png b/docs/managed-datahub/imgs/saas/image-(4).png deleted file mode 100644 index a580fdc3d6730..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(4).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(5).png b/docs/managed-datahub/imgs/saas/image-(5).png deleted file mode 100644 index 48438c6001e4f..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(5).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(6).png b/docs/managed-datahub/imgs/saas/image-(6).png deleted file mode 100644 index 54e569e853f24..0000000000000 Binary files 
a/docs/managed-datahub/imgs/saas/image-(6).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(7).png b/docs/managed-datahub/imgs/saas/image-(7).png deleted file mode 100644 index 6e89e5881cfa7..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(7).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(8).png b/docs/managed-datahub/imgs/saas/image-(8).png deleted file mode 100644 index ee0a3c89d58fa..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(8).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(9).png b/docs/managed-datahub/imgs/saas/image-(9).png deleted file mode 100644 index 301ca98593ef9..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(9).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image.png b/docs/managed-datahub/imgs/saas/image.png deleted file mode 100644 index a1cfc3e74c5dd..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/settings.png b/docs/managed-datahub/imgs/saas/settings.png deleted file mode 100644 index ca99984abbbc9..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/settings.png and /dev/null differ diff --git a/docs/managed-datahub/integrations/oidc-sso-integration.md b/docs/managed-datahub/integrations/oidc-sso-integration.md index 6a9e085186b44..c0f5069d849fa 100644 --- a/docs/managed-datahub/integrations/oidc-sso-integration.md +++ b/docs/managed-datahub/integrations/oidc-sso-integration.md @@ -42,4 +42,8 @@ To enable the OIDC integration, start by navigating to **Settings > Platform > S 4. If there are any advanced settings you would like to configure, click on the **Advanced** button. These come with defaults, so only input settings here if there is something you need changed from the default configuration. 5. Click **Update** to save your settings. -![](../imgs/saas/image-(10).png) + +

+ +

+ diff --git a/docs/managed-datahub/metadata-ingestion-with-acryl/ingestion.md b/docs/managed-datahub/metadata-ingestion-with-acryl/ingestion.md index 95ca6e5e33e16..e225fd8b014c8 100644 --- a/docs/managed-datahub/metadata-ingestion-with-acryl/ingestion.md +++ b/docs/managed-datahub/metadata-ingestion-with-acryl/ingestion.md @@ -56,9 +56,17 @@ In Acryl DataHub deployments, you _must_ use a sink of type `datahub-rest`, whic 2. **token**: a unique API key used to authenticate requests to your instance's REST API The token can be retrieved by logging in as admin. You can go to Settings page and generate a Personal Access Token with your desired expiration date. -![](../imgs/saas/home-(1).png) -![](../imgs/saas/settings.png) +
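As a rough illustrative sketch (not the official example), a `datahub-rest` sink configured with such a token might look like the following; the domain `your-company.acryl.io` and the secret name `DATAHUB_ACCESS_TOKEN` are hypothetical placeholders, and the `/gms` path suffix is described just below:

```yaml
# Hypothetical sketch of a datahub-rest sink for an Acryl-managed instance.
# Replace the domain and token reference with values from your own deployment.
sink:
  type: datahub-rest
  config:
    server: "https://your-company.acryl.io/gms"   # Acryl domain suffixed with /gms
    token: "${DATAHUB_ACCESS_TOKEN}"              # Personal Access Token generated in Settings
```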

+ +

+ + + +

+ +

+ To configure your instance of DataHub as the destination for ingestion, set the "server" field of your recipe to point to your Acryl instance's domain suffixed by the path `/gms`, as shown below. A complete example of a DataHub recipe file, which reads from MySQL and writes into a DataHub instance: diff --git a/docs/managed-datahub/operator-guide/setting-up-remote-ingestion-executor-on-aws.md b/docs/managed-datahub/operator-guide/setting-up-remote-ingestion-executor-on-aws.md index d389ec97d0550..6c6cce51ea098 100644 --- a/docs/managed-datahub/operator-guide/setting-up-remote-ingestion-executor-on-aws.md +++ b/docs/managed-datahub/operator-guide/setting-up-remote-ingestion-executor-on-aws.md @@ -17,11 +17,19 @@ Acryl DataHub comes packaged with an Acryl-managed ingestion executor, which is For example, if an ingestion source is not publicly accessible via the internet, e.g. hosted privately within a specific AWS account, then the Acryl executor will be unable to extract metadata from it. -![Option 1: Acryl-hosted ingestion runner](../imgs/saas/image-(12).png) + +

+ +

+ To accommodate these cases, Acryl supports configuring a remote ingestion executor which can be deployed inside of your AWS account. This setup allows you to continue leveraging the Acryl DataHub console to create, schedule, and run metadata ingestion, all while retaining network and credential isolation. -![Option 2: Customer-hosted ingestion runner](../imgs/saas/image-(6).png) + +

+ +

+ ## Deploying a Remote Ingestion Executor 1. **Provide AWS Account Id**: Provide the Acryl Team with the id of the AWS account in which the remote executor will be hosted. This will be used to grant access to private Acryl containers and create a unique SQS queue which your remote agent will subscribe to. The account id can be provided to your Acryl representative via Email or [One Time Secret](https://onetimesecret.com/). @@ -40,23 +48,39 @@ To accommodate these cases, Acryl supports configuring a remote ingestion execut Note that the only external secret provider that is currently supported is AWS Secrets Manager. -![](../imgs/saas/Screen-Shot-2023-01-19-at-5.12.47-PM.png) -![](../imgs/saas/Screen-Shot-2023-01-19-at-5.12.56-PM.png) +

+ +

+ + + +

+ +

+ 3. **Test the Executor:** To test your remote executor: 1. Create a new Ingestion Source by clicking '**Create new Source**' in the '**Ingestion**' tab of the DataHub console. Configure your Ingestion Recipe as though you were running it from inside of your environment. 2. When working with "secret" fields (passwords, keys, etc.), you can refer to any "self-managed" secrets by name: `${SECRET_NAME}:` (see the recipe sketch after this list) - ![Using a secret called BQ_DEPLOY_KEY which is managed in AWS secrets manager](../imgs/saas/Screen-Shot-2023-01-19-at-4.16.52-PM.png) + +

+ +

+ 3. In the 'Finish Up' step, click '**Advanced**'. 4. Update the '**Executor Id**' form field to be '**remote**'. This indicates that you'd like to use the remote executor. 5. Click '**Done**'. Now, simply click '**Execute**' to test out the remote executor. If your remote executor is configured properly, you should promptly see the ingestion task state change to 'Running'. -![](../imgs/saas/Screen-Shot-2022-03-07-at-10.23.31-AM.png) + +
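The following is a rough sketch of how such a secret reference might appear inside a recipe's source configuration. The source type and field layout are illustrative only; `BQ_DEPLOY_KEY` is the self-managed secret name mentioned above:

```yaml
# Hypothetical sketch only: referencing a self-managed secret by name in a recipe.
# The remote executor resolves ${BQ_DEPLOY_KEY} from your configured secret store
# (e.g. AWS Secrets Manager) at runtime.
source:
  type: bigquery            # illustrative source type
  config:
    credential:
      private_key: "${BQ_DEPLOY_KEY}"
```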

+ +

+ ## Updating a Remote Ingestion Executor In order to update the executor, ie. to deploy a new container version, you'll need to update the CloudFormation Stack to re-deploy the CloudFormation template with a new set of parameters. ### Steps - AWS Console @@ -66,7 +90,11 @@ In order to update the executor, ie. to deploy a new container version, you'll n 4. Select **Replace Current Template** 5. Select **Upload a template file** 6. Upload a copy of the Acryl Remote Executor [CloudFormation Template](https://raw.githubusercontent.com/acryldata/datahub-cloudformation/master/Ingestion/templates/python.ecs.template.yaml) -![](../imgs/saas/Screen-Shot-2023-01-19-at-4.23.32-PM.png) + +

+ +

+ 7. Click **Next** 8. Change parameters based on your modifications (e.g. ImageTag, etc) 9. Click **Next** diff --git a/docs/modeling/extending-the-metadata-model.md b/docs/modeling/extending-the-metadata-model.md index f47630f44e772..98f70f6d933e4 100644 --- a/docs/modeling/extending-the-metadata-model.md +++ b/docs/modeling/extending-the-metadata-model.md @@ -11,7 +11,11 @@ these two concepts prior to making changes. ## To fork or not to fork? An important question that will arise once you've decided to extend the metadata model is whether you need to fork the main repo or not. Use the diagram below to understand how to make this decision. -![Metadata Model To Fork or Not](../imgs/metadata-model-to-fork-or-not-to.png) + +

+ +

+ The green lines represent pathways that will lead to lesser friction for you to maintain your code long term. The red lines represent higher risk of conflicts in the future. We are working hard to move the majority of model extension use-cases to no-code / low-code pathways to ensure that you can extend the core metadata model without having to maintain a custom fork of DataHub. diff --git a/docs/modeling/metadata-model.md b/docs/modeling/metadata-model.md index 704fce1412329..037c9c7108a6e 100644 --- a/docs/modeling/metadata-model.md +++ b/docs/modeling/metadata-model.md @@ -30,7 +30,11 @@ Conceptually, metadata is modeled using the following abstractions Here is an example graph consisting of 3 types of entity (CorpUser, Chart, Dashboard), 2 types of relationship (OwnedBy, Contains), and 3 types of metadata aspect (Ownership, ChartInfo, and DashboardInfo). -![metadata-modeling](../imgs/metadata-model-chart.png) + +

+ +

+ ## The Core Entities @@ -73,7 +77,11 @@ to the YAML configuration, instead of creating new Snapshot / Aspect files. ## Exploring DataHub's Metadata Model To explore the current DataHub metadata model, you can inspect this high-level picture that shows the different entities and the edges connecting them, which represent how they relate to one another. -![Metadata Model Graph](../imgs/datahub-metadata-model.png) + +

+ +

+ To navigate the aspect model for specific entities and explore relationships using the `foreign-key` concept, you can view them in our demo environment or navigate the auto-generated docs in the **Metadata Modeling/Entities** section on the left. diff --git a/docs/platform-instances.md b/docs/platform-instances.md index c6bfe3315de98..0f4515aedae54 100644 --- a/docs/platform-instances.md +++ b/docs/platform-instances.md @@ -1,44 +1,48 @@ -# Working With Platform Instances - -DataHub's metadata model for Datasets supports a three-part key currently: -- Data Platform (e.g. urn:li:dataPlatform:mysql) -- Name (e.g. db.schema.name) -- Env or Fabric (e.g. DEV, PROD, etc.) - -This naming scheme unfortunately does not allow for easy representation of the multiplicity of platforms (or technologies) that might be deployed at an organization within the same environment or fabric. For example, an organization might have multiple Redshift instances in Production and would want to see all the data assets located in those instances inside the DataHub metadata repository. - -As part of the `v0.8.24+` releases, we are unlocking the first phase of supporting Platform Instances in the metadata model. This is done via two main additions: -- The `dataPlatformInstance` aspect that has been added to Datasets which allows datasets to be associated to an instance of a platform -- Enhancements to all ingestion sources that allow them to attach a platform instance to the recipe that changes the generated urns to go from `urn:li:dataset:(urn:li:dataPlatform:,,ENV)` format to `urn:li:dataset:(urn:li:dataPlatform:,,ENV)` format. Sources that produce lineage to datasets in other platforms (e.g. Looker, Superset etc) also have specific configuration additions that allow the recipe author to specify the mapping between a platform and the instance name that it should be mapped to. - -![./imgs/platform-instances-for-ingestion.png](./imgs/platform-instances-for-ingestion.png) - -## Naming Platform Instances - -When configuring a platform instance, choose an instance name that is understandable and will be stable for the foreseeable future. e.g. `core_warehouse` or `finance_redshift` are allowed names, as are pure guids like `a37dc708-c512-4fe4-9829-401cd60ed789`. Remember that whatever instance name you choose, you will need to specify it in more than one recipe to ensure that the identifiers produced by different sources will line up. - -## Enabling Platform Instances - -Read the Ingestion source specific guides for how to enable platform instances in each of them. -The general pattern is to add an additional optional configuration parameter called `platform_instance`. - -e.g. here is how you would configure a recipe to ingest a mysql instance that you want to call `core_finance` -```yaml -source: - type: mysql - config: - # Coordinates - host_port: localhost:3306 - platform_instance: core_finance - database: dbname - - # Credentials - username: root - password: example - -sink: - # sink configs -``` - - -## +# Working With Platform Instances + +DataHub's metadata model for Datasets supports a three-part key currently: +- Data Platform (e.g. urn:li:dataPlatform:mysql) +- Name (e.g. db.schema.name) +- Env or Fabric (e.g. DEV, PROD, etc.) + +This naming scheme unfortunately does not allow for easy representation of the multiplicity of platforms (or technologies) that might be deployed at an organization within the same environment or fabric. 
For example, an organization might have multiple Redshift instances in Production and would want to see all the data assets located in those instances inside the DataHub metadata repository. + +As part of the `v0.8.24+` releases, we are unlocking the first phase of supporting Platform Instances in the metadata model. This is done via two main additions: +- The `dataPlatformInstance` aspect that has been added to Datasets which allows datasets to be associated to an instance of a platform +- Enhancements to all ingestion sources that allow them to attach a platform instance to the recipe that changes the generated urns to go from `urn:li:dataset:(urn:li:dataPlatform:,,ENV)` format to `urn:li:dataset:(urn:li:dataPlatform:,,ENV)` format. Sources that produce lineage to datasets in other platforms (e.g. Looker, Superset etc) also have specific configuration additions that allow the recipe author to specify the mapping between a platform and the instance name that it should be mapped to. + + +

+ +

+ + +## Naming Platform Instances + +When configuring a platform instance, choose an instance name that is understandable and will be stable for the foreseeable future. e.g. `core_warehouse` or `finance_redshift` are allowed names, as are pure guids like `a37dc708-c512-4fe4-9829-401cd60ed789`. Remember that whatever instance name you choose, you will need to specify it in more than one recipe to ensure that the identifiers produced by different sources will line up. + +## Enabling Platform Instances + +Read the Ingestion source specific guides for how to enable platform instances in each of them. +The general pattern is to add an additional optional configuration parameter called `platform_instance`. + +e.g. here is how you would configure a recipe to ingest a mysql instance that you want to call `core_finance` +```yaml +source: + type: mysql + config: + # Coordinates + host_port: localhost:3306 + platform_instance: core_finance + database: dbname + + # Credentials + username: root + password: example + +sink: + # sink configs +``` + + +## diff --git a/docs/schema-history.md b/docs/schema-history.md index 9fc9ec1af52bb..120d041960186 100644 --- a/docs/schema-history.md +++ b/docs/schema-history.md @@ -23,20 +23,32 @@ must have the **View Entity Page** privilege, or be assigned to **any** DataHub You can view the Schema History for a Dataset by navigating to that Dataset's Schema Tab. As long as that Dataset has more than one version, you can view what a Dataset looked like at any given version by using the version selector. Here's an example from DataHub's official Demo environment with the -[Snowflake pets dataset](https://demo.datahubproject.io/dataset/urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.adoption.pets,PROD)/Schema?is_lineage_mode=false). +Snowflake pets dataset. + + +

+ +

-![](./imgs/schema-history-latest-version.png) If you click on an older version in the selector, you'll be able to see what the schema looked like back then. Notice the changes here to the glossary terms for the `status` field, and to the descriptions for the `created_at` and `updated_at` fields. -![](./imgs/schema-history-older-version.png) + +

+ +

+ In addition to this, you can also toggle the Audit view that shows you when the most recent changes were made to each field. You can active this by clicking on the Audit icon you see above the top right of the table. -![](./imgs/schema-history-audit-activated.png) + +

+ +

+ You can see here that some of these fields were added at the oldest dataset version, while some were added only at this latest version. Some fields were even modified and had a type change at the latest version! diff --git a/docs/townhall-history.md b/docs/townhall-history.md index 1da490ca6fa69..e235a70c5d7b9 100644 --- a/docs/townhall-history.md +++ b/docs/townhall-history.md @@ -343,8 +343,7 @@ Agenda - Announcements - 2 mins - Community Updates ([video](https://youtu.be/r862MZTLAJ0?t=99)) - 10 mins -- Use-Case: DataHub at Viasat ([slides](demo/ViasatMetadataJourney.pdf),[video](https://youtu.be/2SrDAJnzkjE)) by [Anna Kepler](https://www.linkedin.com/in/akepler) - 15 mins -- Tech Deep Dive: GraphQL + React RFCs readout and discussion ([slides](https://docs.google.com/presentation/d/e/2PACX-1vRtnINnpi6PvFw7-5iW8PSQoT9Kdf1O_0YW7QAr1_mSdJMNftYFTVCjKL-e3fpe8t6IGkha8UpdmoOI/pub?start=false&loop=false&delayms=3000) ,[video](https://www.youtube.com/watch?v=PrBaFrb7pqA)) by [John Joyce](https://www.linkedin.com/in/john-joyce-759883aa) and [Arun Vasudevan](https://www.linkedin.com/in/arun-vasudevan-55117368/) - 15 mins +- Use-Case: DataHub at Viasat ([slides](https://github.com/acryldata/static-assets-test/raw/master/imgs/demo/ViasatMetadataJourney.pdf),[video](https://youtu.be/2SrDAJnzkjE)) by [Anna Kepler](https://www.linkedin.com/in/akepler) - 15 mins- Tech Deep Dive: GraphQL + React RFCs readout and discussion ([slides](https://docs.google.com/presentation/d/e/2PACX-1vRtnINnpi6PvFw7-5iW8PSQoT9Kdf1O_0YW7QAr1_mSdJMNftYFTVCjKL-e3fpe8t6IGkha8UpdmoOI/pub?start=false&loop=false&delayms=3000) ,[video](https://www.youtube.com/watch?v=PrBaFrb7pqA)) by [John Joyce](https://www.linkedin.com/in/john-joyce-759883aa) and [Arun Vasudevan](https://www.linkedin.com/in/arun-vasudevan-55117368/) - 15 mins - General Q&A from sign up sheet, slack, and participants - 15 mins - Closing remarks - 3 mins - General Q&A from sign up sheet, slack, and participants - 15 mins @@ -356,8 +355,8 @@ Agenda Agenda - Quick intro - 5 mins -- [Why did Grofers choose DataHub for their data catalog?](demo/Datahub_at_Grofers.pdf) by [Shubham Gupta](https://www.linkedin.com/in/shubhamg931/) - 15 minutes -- [DataHub UI development - Part 2](demo/Town_Hall_Presentation_-_12-2020_-_UI_Development_Part_2.pdf) by [Charlie Tran](https://www.linkedin.com/in/charlie-tran/) (LinkedIn) - 20 minutes +- [Why did Grofers choose DataHub for their data catalog?](https://github.com/acryldata/static-assets-test/raw/master/imgs/demo/Datahub_at_Grofers.pdf) by [Shubham Gupta](https://www.linkedin.com/in/shubhamg931/) - 15 minutes +- [DataHub UI development - Part 2](https://github.com/acryldata/static-assets-test/raw/master/imgs/demo/Town_Hall_Presentation_-_12-2020_-_UI_Development_Part_2.pdf) by [Charlie Tran](https://www.linkedin.com/in/charlie-tran/) (LinkedIn) - 20 minutes - General Q&A from sign up sheet, slack, and participants - 15 mins - Closing remarks - 5 minutes @@ -368,9 +367,9 @@ Agenda Agenda - Quick intro - 5 mins -- [Lightning talk on Metadata use-cases at LinkedIn](demo/Metadata_Use-Cases_at_LinkedIn_-_Lightning_Talk.pdf) by [Shirshanka Das](https://www.linkedin.com/in/shirshankadas/) (LinkedIn) - 5 mins -- [Strongly Consistent Secondary Index (SCSI) in GMA](demo/Datahub_-_Strongly_Consistent_Secondary_Indexing.pdf), an upcoming feature by [Jyoti Wadhwani](https://www.linkedin.com/in/jyotiwadhwani/) (LinkedIn) - 15 minutes -- [DataHub UI overview](demo/DataHub-UIOverview.pdf) by [Ignacio 
Bona](https://www.linkedin.com/in/ignaciobona) (LinkedIn) - 20 minutes +- [Lightning talk on Metadata use-cases at LinkedIn](https://github.com/acryldata/static-assets-test/raw/master/imgs/demo/Metadata_Use-Cases_at_LinkedIn_-_Lightning_Talk.pdf) by [Shirshanka Das](https://www.linkedin.com/in/shirshankadas/) (LinkedIn) - 5 mins +- [Strongly Consistent Secondary Index (SCSI) in GMA](https://github.com/acryldata/static-assets-test/raw/master/imgs/demo/Datahub_-_Strongly_Consistent_Secondary_Indexing.pdf), an upcoming feature by [Jyoti Wadhwani](https://www.linkedin.com/in/jyotiwadhwani/) (LinkedIn) - 15 minutes +- [DataHub UI overview](https://github.com/acryldata/static-assets-test/raw/master/imgs/demo/DataHub-UIOverview.pdf) by [Ignacio Bona](https://www.linkedin.com/in/ignaciobona) (LinkedIn) - 20 minutes - General Q&A from sign up sheet, slack, and participants - 10 mins - Closing remarks - 5 minutes @@ -382,8 +381,8 @@ Agenda Agenda - Quick intro - 5 mins -- [Data Discoverability at SpotHero](demo/Data_Discoverability_at_SpotHero.pdf) by [Maggie Hays](https://www.linkedin.com/in/maggie-hays/) (SpotHero) - 20 mins -- [Designing the next generation of metadata events for scale](demo/Designing_the_next_generation_of_metadata_events_for_scale.pdf) by [Chris Lee](https://www.linkedin.com/in/chrisleecmu/) (LinkedIn) - 15 mins +- [Data Discoverability at SpotHero](https://github.com/acryldata/static-assets-test/raw/master/imgs/demo/Data_Discoverability_at_SpotHero.pdf) by [Maggie Hays](https://www.linkedin.com/in/maggie-hays/) (SpotHero) - 20 mins +- [Designing the next generation of metadata events for scale](https://github.com/acryldata/static-assets-test/raw/master/imgs/demo/Designing_the_next_generation_of_metadata_events_for_scale.pdf) by [Chris Lee](https://www.linkedin.com/in/chrisleecmu/) (LinkedIn) - 15 mins - General Q&A from sign up sheet, slack, and participants - 15 mins - Closing remarks - 5 mins diff --git a/docs/ui-ingestion.md b/docs/ui-ingestion.md index 4435f66e514f3..2ecb1e634c79f 100644 --- a/docs/ui-ingestion.md +++ b/docs/ui-ingestion.md @@ -14,11 +14,19 @@ This document will describe the steps required to configure, schedule, and execu To view & manage UI-based metadata ingestion, you must have the `Manage Metadata Ingestion` & `Manage Secrets` privileges assigned to your account. These can be granted by a [Platform Policy](authorization/policies.md). -![](./imgs/ingestion-privileges.png) + +

+ +

+ Once you have these privileges, you can begin to manage ingestion by navigating to the 'Ingestion' tab in DataHub. -![](./imgs/ingestion-tab.png) + +

+ +

+ On this page, you'll see a list of active **Ingestion Sources**. An Ingestion Source is a unique source of metadata ingested into DataHub from an external source like Snowflake, Redshift, or BigQuery. @@ -33,7 +41,11 @@ your first **Ingestion Source**. Before ingesting any metadata, you need to create a new Ingestion Source. Start by clicking **+ Create new source**. -![](./imgs/create-new-ingestion-source-button.png) + +

+ +

+ #### Step 1: Select a Platform Template @@ -41,7 +53,11 @@ In the first step, select a **Recipe Template** corresponding to the source type a variety of natively supported integrations, from Snowflake to Postgres to Kafka. Select `Custom` to construct an ingestion recipe from scratch. -![](./imgs/select-platform-template.png) + +

+ +

+ Next, you'll configure an ingestion **Recipe**, which defines _how_ and _what_ to extract from the source system. @@ -68,7 +84,11 @@ used by DataHub to extract metadata from a 3rd party system. It most often consi A sample of a full recipe configured to ingest metadata from MySQL can be found in the image below. -![](./imgs/example-mysql-recipe.png) + +
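As a textual companion to that image, a minimal sketch of such a recipe might look like the following; the host, database, and credentials are hypothetical placeholders:

```yaml
# Hypothetical MySQL recipe sketch: extract metadata from MySQL and push it to DataHub.
source:
  type: mysql
  config:
    host_port: "mysql.internal.example.com:3306"   # placeholder host
    database: orders                               # placeholder database
    username: datahub_reader
    password: "${MYSQL_PASSWORD}"                  # credentials can be stored as Secrets (see below)
sink:
  type: datahub-rest
  config:
    server: "http://localhost:8080"                # your DataHub GMS endpoint
```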

+ +

+ Detailed configuration examples & documentation for each source type can be found on the [DataHub Docs](https://datahubproject.io/docs/metadata-ingestion/) website. @@ -80,7 +100,11 @@ that are encrypted and stored within DataHub's storage layer. To create a secret, first navigate to the 'Secrets' tab. Then click `+ Create new secret`. -![](./imgs/create-secret.png) + +

+ +

+ _Creating a Secret to store the username for a MySQL database_ @@ -123,7 +147,11 @@ Secret values are not persisted to disk beyond execution time, and are never tra Next, you can optionally configure a schedule on which to execute your new Ingestion Source. This enables you to schedule metadata extraction on a monthly, weekly, daily, or hourly cadence depending on the needs of your organization. Schedules are defined using CRON format. -![](./imgs/schedule-ingestion.png) + +
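For instance, the 9:15am Los Angeles schedule shown below could be expressed roughly like this; the field names are illustrative, and the CRON string and timezone are what the UI actually captures:

```yaml
# Hypothetical sketch of a CRON-based schedule.
# CRON fields: minute hour day-of-month month day-of-week
schedule:
  interval: "15 9 * * *"            # 9:15 AM every day
  timezone: "America/Los_Angeles"
```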

+ +

+ _An Ingestion Source that is executed at 9:15am every day, Los Angeles time_ @@ -136,7 +164,11 @@ you can always come back and change this. Finally, give your Ingestion Source a name. -![](./imgs/name-ingestion-source.png) + +

+ +

+ Once you're happy with your configurations, click 'Done' to save your changes. @@ -149,7 +181,11 @@ with the server. However, you can override the default package version using the To do so, simply click 'Advanced', then change the 'CLI Version' text box to contain the exact version of the DataHub CLI you'd like to use. -![](./imgs/custom-ingestion-cli-version.png) + +

+ +

+ _Pinning the CLI version to version `0.8.23.2`_ Once you're happy with your changes, simply click 'Done' to save. @@ -200,11 +236,19 @@ Once you've created your Ingestion Source, you can run it by clicking 'Execute'. you should see the 'Last Status' column of the ingestion source change from `N/A` to `Running`. This means that the request to execute ingestion has been successfully picked up by the DataHub ingestion executor. -![](./imgs/running-ingestion.png) + +

+ +

+ If ingestion has executed successfully, you should see its state shown in green as `Succeeded`. -![](./imgs/successful-ingestion.png) + +

+ +

+ ### Cancelling an Ingestion Run @@ -212,14 +256,22 @@ If ingestion has executed successfully, you should see it's state shown in green If your ingestion run is hanging, there may be a bug in the ingestion source, or another persistent issue like exponential timeouts. In these situations, you can cancel ingestion by clicking **Cancel** on the problematic run. -![](./imgs/cancelled-ingestion.png) + +

+ +

+ Once cancelled, you can view the output of the ingestion run by clicking **Details**. ### Debugging a Failed Ingestion Run -![](./imgs/failed-ingestion.png) + +

+ +

+ A variety of things can cause an ingestion run to fail. Common reasons for failure include: @@ -235,12 +287,20 @@ A variety of things can cause an ingestion run to fail. Common reasons for failu 4. **Authentication**: If you've enabled [Metadata Service Authentication](authentication/introducing-metadata-service-authentication.md), you'll need to provide a Personal Access Token in your Recipe Configuration. To do so, set the 'token' field of the sink configuration to contain a Personal Access Token: - ![](./imgs/ingestion-with-token.png) + +

+ +

+ The output of each run is captured and available to view in the UI for easier debugging. To view output logs, click **DETAILS** on the corresponding ingestion run. -![](./imgs/ingestion-logs.png) + +

+ +

+ ## FAQ @@ -250,7 +310,11 @@ If not due to one of the reasons outlined above, this may be because the executo to reach DataHub's backend using the default configurations. Try changing your ingestion recipe to make the `sink.config.server` variable point to the Docker DNS name for the `datahub-gms` pod: -![](./imgs/quickstart-ingestion-config.png) + +
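A rough sketch of what that adjusted sink might look like; the port below is the quickstart default and is an assumption here:

```yaml
# Hypothetical sink override for the Docker quickstart:
# address the GMS container by its Docker DNS name rather than localhost.
sink:
  type: datahub-rest
  config:
    server: "http://datahub-gms:8080"   # assumes the default quickstart port
```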

+ +

+ ### I see 'N/A' when I try to run ingestion. What do I do? diff --git a/docs/what/relationship.md index 1908bbd6ce75f..dcfe093a1b124 100644 --- a/docs/what/relationship.md +++ b/docs/what/relationship.md @@ -2,7 +2,11 @@ A relationship is a named association between exactly two [entities](entity.md), a source and a destination. -![metadata-modeling](../imgs/metadata-modeling.png) + +

+ +

+ From the above graph, a `Group` entity can be linked to a `User` entity via a `HasMember` relationship. Note that the name of the relationship reflects the direction, i.e. pointing from `Group` to `User`. diff --git a/metadata-ingestion/adding-source.md b/metadata-ingestion/adding-source.md index 50e6a1cd5fcc6..e4fc950a7cdbd 100644 --- a/metadata-ingestion/adding-source.md +++ b/metadata-ingestion/adding-source.md @@ -44,7 +44,11 @@ class LookerAPIConfig(ConfigModel): ``` generates the following documentation: -![Generated Config Documentation](./docs/images/generated_config_docs.png) + +

+ +

+ :::note Inline markdown or code snippets are not yet supported for field level documentation. diff --git a/metadata-ingestion/build.gradle b/metadata-ingestion/build.gradle index f636cf25c67f7..199ccc59c21e0 100644 --- a/metadata-ingestion/build.gradle +++ b/metadata-ingestion/build.gradle @@ -21,11 +21,13 @@ task checkPythonVersion(type: Exec) { } task environmentSetup(type: Exec, dependsOn: checkPythonVersion) { + def sentinel_file = "${venv_name}/.venv_environment_sentinel" inputs.file file('setup.py') - outputs.dir("${venv_name}") + outputs.file(sentinel_file) commandLine 'bash', '-c', "${python_executable} -m venv ${venv_name} && " + - "${venv_name}/bin/python -m pip install --upgrade pip wheel 'setuptools>=63.0.0'" + "${venv_name}/bin/python -m pip install --upgrade pip wheel 'setuptools>=63.0.0' && " + + "touch ${sentinel_file}" } task runPreFlightScript(type: Exec, dependsOn: environmentSetup) { @@ -39,7 +41,6 @@ task runPreFlightScript(type: Exec, dependsOn: environmentSetup) { task installPackageOnly(type: Exec, dependsOn: runPreFlightScript) { def sentinel_file = "${venv_name}/.build_install_package_only_sentinel" inputs.file file('setup.py') - outputs.dir("${venv_name}") outputs.file(sentinel_file) commandLine 'bash', '-x', '-c', "${venv_name}/bin/pip install -e . &&" + @@ -47,9 +48,12 @@ task installPackageOnly(type: Exec, dependsOn: runPreFlightScript) { } task installPackage(type: Exec, dependsOn: installPackageOnly) { + def sentinel_file = "${venv_name}/.build_install_package_sentinel" inputs.file file('setup.py') - outputs.dir("${venv_name}") - commandLine 'bash', '-x', '-c', "${venv_name}/bin/pip install -e . ${extra_pip_requirements}" + outputs.file(sentinel_file) + commandLine 'bash', '-x', '-c', + "${venv_name}/bin/pip install -e . ${extra_pip_requirements} && " + + "touch ${sentinel_file}" } task codegen(type: Exec, dependsOn: [environmentSetup, installPackage, ':metadata-events:mxe-schemas:build']) { @@ -63,7 +67,6 @@ task install(dependsOn: [installPackage, codegen]) task installDev(type: Exec, dependsOn: [install]) { def sentinel_file = "${venv_name}/.build_install_dev_sentinel" inputs.file file('setup.py') - outputs.dir("${venv_name}") outputs.file(sentinel_file) commandLine 'bash', '-c', "source ${venv_name}/bin/activate && set -x && " + @@ -75,7 +78,6 @@ task installDev(type: Exec, dependsOn: [install]) { task installAll(type: Exec, dependsOn: [install]) { def sentinel_file = "${venv_name}/.build_install_all_sentinel" inputs.file file('setup.py') - outputs.dir("${venv_name}") outputs.file(sentinel_file) commandLine 'bash', '-c', "source ${venv_name}/bin/activate && set -x && " + diff --git a/metadata-ingestion/developing.md b/metadata-ingestion/developing.md index 67041d23a21b1..5d49b9a866a3d 100644 --- a/metadata-ingestion/developing.md +++ b/metadata-ingestion/developing.md @@ -74,7 +74,9 @@ The syntax for installing plugins is slightly different in development. For exam ## Architecture -![metadata ingestion framework layout](../docs/imgs/datahub-metadata-ingestion-framework.png) +


The architecture of this metadata ingestion framework is heavily inspired by [Apache Gobblin](https://gobblin.apache.org/) (also originally a LinkedIn project!). We have a standardized format - the MetadataChangeEvent - and sources and sinks which respectively produce and consume these objects. The sources pull metadata from a variety of data systems, while the sinks are primarily for moving this metadata into DataHub. diff --git a/metadata-ingestion/docs/dev_guides/stateful.md b/metadata-ingestion/docs/dev_guides/stateful.md index eccacbb416714..b3a409e965c62 100644 --- a/metadata-ingestion/docs/dev_guides/stateful.md +++ b/metadata-ingestion/docs/dev_guides/stateful.md @@ -38,7 +38,9 @@ Following is the list of current use-cases powered by stateful ingestion in data Stateful ingestion can be used to automatically soft-delete the tables and views that are seen in a previous run but absent in the current run (they are either deleted or no longer desired). -![Stale Metadata Deletion](./stale_metadata_deletion.png) +

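To make the soft-delete behaviour described above concrete, here is a minimal recipe sketch with stateful ingestion enabled. The source type, pipeline name, and server URL are placeholders, and the option names should be verified against the configuration reference for your source:

```python
# Recipe (as a Python dict; the YAML form is equivalent) enabling stateful
# ingestion so that tables/views seen in a previous run but missing from the
# current run are soft-deleted.
recipe = {
    "pipeline_name": "my_snowflake_ingestion",  # a stable pipeline name is required for stateful ingestion
    "source": {
        "type": "snowflake",  # placeholder; any source supporting stateful ingestion works
        "config": {
            # ... connection options elided ...
            "stateful_ingestion": {
                "enabled": True,
                "remove_stale_metadata": True,
            },
        },
    },
    "sink": {
        "type": "datahub-rest",
        "config": {"server": "http://localhost:8080"},
    },
}
```

A dict like this can be passed to `Pipeline.create(...)` from `datahub.ingestion.run.pipeline`, or written as YAML and run with `datahub ingest`.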

#### Supported sources * All sql based sources. diff --git a/metadata-ingestion/docs/sources/azure-ad/azure-ad.md b/metadata-ingestion/docs/sources/azure-ad/azure-ad.md index 8b375fbee4f33..d2677d7e4fc7a 100644 --- a/metadata-ingestion/docs/sources/azure-ad/azure-ad.md +++ b/metadata-ingestion/docs/sources/azure-ad/azure-ad.md @@ -5,6 +5,15 @@ to read your organization's Users and Groups. The following permissions are requ - `GroupMember.Read.All` - `User.Read.All` -You can add a permission by navigating to the permissions tab in your DataHub application on the Azure AD portal. ![Azure AD API Permissions](./azure_ad_api_permissions.png) +You can add a permission by navigating to the permissions tab in your DataHub application on the Azure AD portal. +


-You can view the necessary endpoints to configure by clicking on the Endpoints button in the Overview tab. ![Azure AD Endpoints](./azure_ad_endpoints.png) + +You can view the necessary endpoints to configure by clicking on the Endpoints button in the Overview tab. + + +


diff --git a/metadata-ingestion/docs/sources/databricks/README.md b/metadata-ingestion/docs/sources/databricks/README.md index 01aee3236e01c..b380a892c22b9 100644 --- a/metadata-ingestion/docs/sources/databricks/README.md +++ b/metadata-ingestion/docs/sources/databricks/README.md @@ -15,8 +15,11 @@ To complete the picture, we recommend adding push-based ingestion from your Spar ## Watch the DataHub Talk at the Data and AI Summit 2022 For a deeper look at how to think about DataHub within and across your Databricks ecosystem, watch the recording of our talk at the Data and AI Summit 2022. - -[![IMAGE_ALT](../../images/databricks/data_and_ai_summit_2022.png)](https://www.youtube.com/watch?v=SCP0PR3t7dc) +


diff --git a/metadata-ingestion/docs/sources/looker/looker_datahub_permission_set.png b/metadata-ingestion/docs/sources/looker/looker_datahub_permission_set.png deleted file mode 100644 index 7227dc04fb8a0..0000000000000 Binary files a/metadata-ingestion/docs/sources/looker/looker_datahub_permission_set.png and /dev/null differ diff --git a/metadata-ingestion/docs/sources/looker/looker_pre.md b/metadata-ingestion/docs/sources/looker/looker_pre.md index ad7fff9c0daaf..6798103d66e99 100644 --- a/metadata-ingestion/docs/sources/looker/looker_pre.md +++ b/metadata-ingestion/docs/sources/looker/looker_pre.md @@ -19,7 +19,10 @@ see_user_dashboards see_users ``` Here is an example permission set after configuration. -![Looker DataHub Permission Set](./looker_datahub_permission_set.png) + +


#### Get an API key diff --git a/metadata-ingestion/docs/sources/mssql/mssql_pre.md b/metadata-ingestion/docs/sources/mssql/mssql_pre.md new file mode 100644 index 0000000000000..396581966e691 --- /dev/null +++ b/metadata-ingestion/docs/sources/mssql/mssql_pre.md @@ -0,0 +1,14 @@ +### Prerequisites + +If you want to ingest MSSQL Jobs and stored procedures (with code), the user credentials need the proper privileges. + +Script for granting the privileges: +``` +USE MSDB +GRANT SELECT ON OBJECT::msdb.dbo.sysjobsteps TO 'USERNAME' +GRANT SELECT ON OBJECT::msdb.dbo.sysjobs TO 'USERNAME' + +USE 'DATA_DB_NAME' +GRANT VIEW DEFINITION TO 'USERNAME' +GRANT SELECT ON OBJECT::sys.sql_expression_dependencies TO 'USERNAME' +``` \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/snowflake/snowflake_pre.md b/metadata-ingestion/docs/sources/snowflake/snowflake_pre.md index 9a381fb351aec..75bd579417a48 100644 --- a/metadata-ingestion/docs/sources/snowflake/snowflake_pre.md +++ b/metadata-ingestion/docs/sources/snowflake/snowflake_pre.md @@ -99,6 +99,24 @@ The steps slightly differ based on which you decide to use. including `client_id` and `client_secret`, plus your Okta user's `Username` and `Password` * Note: the `username` and `password` config options are not nested under `oauth_config` +### Snowflake Shares +If you are using [Snowflake Shares](https://docs.snowflake.com/en/user-guide/data-sharing-provider) to share data across different snowflake accounts, and you have set up DataHub recipes for ingesting metadata from all these accounts, you may end up with multiple similar dataset entities corresponding to virtual versions of the same table in different snowflake accounts. The DataHub Snowflake connector can automatically link such tables together through Siblings and Lineage relationships if the user provides the information necessary to establish the relationship via the `shares` configuration in the recipe. + +#### Example +- Snowflake account `account1` (ingested as platform_instance `instance1`) owns a database `db1`. A share `X` is created in `account1` that includes database `db1` along with schemas and tables inside it. +- Now, `X` is shared with snowflake account `account2` (ingested as platform_instance `instance2`). A database `db1_from_X` is created from inbound share `X` in `account2`. In this case, all tables and views included in share `X` will also be present in `instance2`.`db1_from_X`. +- This can be represented in the `shares` configuration section as + ```yaml + shares: + X: # name of the share + database_name: db1 + platform_instance: instance1 + consumers: # list of all databases created from share X + - database_name: db1_from_X + platform_instance: instance2 + + ``` +- If share `X` is shared with more snowflake accounts and databases are created from share `X` in those accounts, then additional entries need to be added to the `consumers` list for share `X`, one per snowflake account. The same `shares` config can then be copied across recipes of all accounts (a Python sketch of the same configuration is shown below). ### Caveats - Some of the features are only available in the Snowflake Enterprise Edition. This doc has notes mentioning where this applies.
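For readers following the new configuration classes added in this change (`SnowflakeShareConfig` and `DatabaseId` in `snowflake_config.py`), the `shares` example above corresponds roughly to the following Python sketch; the share, database, and platform instance names mirror the hypothetical example:

```python
from datahub.ingestion.source.snowflake.snowflake_config import (
    DatabaseId,
    SnowflakeShareConfig,
)

# Share X: db1 owned by platform_instance "instance1" is consumed as
# db1_from_X in platform_instance "instance2".
shares = {
    "X": SnowflakeShareConfig(
        database="db1",
        platform_instance="instance1",
        consumers=[
            DatabaseId(database="db1_from_X", platform_instance="instance2"),
        ],
    )
}

# shares["X"].source_database -> DatabaseId(database="db1", platform_instance="instance1")
```

In a recipe this structure is normally supplied as YAML under `source.config.shares`, as in the example above.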
diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 59cdcee79f052..ded9186e08a22 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -376,6 +376,7 @@ def get_long_description(): "salesforce": {"simple-salesforce"}, "snowflake": snowflake_common | usage_common | sqlglot_lib, "sqlalchemy": sql_common, + "sql-queries": usage_common | sqlglot_lib, "superset": { "requests", "sqlalchemy", @@ -608,6 +609,7 @@ def get_long_description(): "demo-data = datahub.ingestion.source.demo_data.DemoDataSource", "unity-catalog = datahub.ingestion.source.unity.source:UnityCatalogSource", "gcs = datahub.ingestion.source.gcs.gcs_source:GCSSource", + "sql-queries = datahub.ingestion.source.sql_queries:SqlQueriesSource", ], "datahub.ingestion.transformer.plugins": [ "simple_remove_dataset_ownership = datahub.ingestion.transformer.remove_dataset_ownership:SimpleRemoveDatasetOwnership", diff --git a/metadata-ingestion/src/datahub/emitter/rest_emitter.py b/metadata-ingestion/src/datahub/emitter/rest_emitter.py index cf4d46cf18ba8..acb5763280905 100644 --- a/metadata-ingestion/src/datahub/emitter/rest_emitter.py +++ b/metadata-ingestion/src/datahub/emitter/rest_emitter.py @@ -31,13 +31,14 @@ ) _DEFAULT_RETRY_STATUS_CODES = [ # Additional status codes to retry on 429, + 500, 502, 503, 504, ] _DEFAULT_RETRY_METHODS = ["HEAD", "GET", "POST", "PUT", "DELETE", "OPTIONS", "TRACE"] _DEFAULT_RETRY_MAX_TIMES = int( - os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "3") + os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4") ) diff --git a/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py b/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py new file mode 100644 index 0000000000000..071d590f270f8 --- /dev/null +++ b/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py @@ -0,0 +1,289 @@ +import logging +import time +from collections import defaultdict +from dataclasses import dataclass, field +from datetime import datetime +from typing import Collection, Dict, Iterable, List, Optional, Set + +from datahub.emitter.mce_builder import make_schema_field_urn +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.usage.usage_common import BaseUsageConfig, UsageAggregator +from datahub.metadata.schema_classes import ( + AuditStampClass, + DatasetLineageTypeClass, + FineGrainedLineageClass, + FineGrainedLineageDownstreamTypeClass, + FineGrainedLineageUpstreamTypeClass, + OperationClass, + OperationTypeClass, + UpstreamClass, + UpstreamLineageClass, +) +from datahub.utilities.sqlglot_lineage import ColumnLineageInfo, SqlParsingResult + +logger = logging.getLogger(__name__) + +# TODO: Use this over other sources' equivalent code, if possible + +DatasetUrn = str +FieldUrn = str +UserUrn = str + + +@dataclass +class LineageEdge: + """Stores information about a single lineage edge, from an upstream table to a downstream table.""" + + downstream_urn: DatasetUrn + upstream_urn: DatasetUrn + audit_stamp: Optional[datetime] + actor: Optional[UserUrn] + type: str = DatasetLineageTypeClass.TRANSFORMED + + # Maps downstream_col -> {upstream_col} + column_map: Dict[str, Set[str]] = field(default_factory=lambda: defaultdict(set)) + + def gen_upstream_aspect(self) -> UpstreamClass: + return UpstreamClass( + auditStamp=AuditStampClass( + time=int(self.audit_stamp.timestamp() * 1000), actor=self.actor or "" + ) + if self.audit_stamp + else None, + 
dataset=self.upstream_urn, + type=self.type, + ) + + def gen_fine_grained_lineage_aspects(self) -> Iterable[FineGrainedLineageClass]: + for downstream_col, upstream_cols in self.column_map.items(): + yield FineGrainedLineageClass( + upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET, + # Sort to avoid creating multiple aspects in backend with same lineage but different order + upstreams=sorted( + make_schema_field_urn(self.upstream_urn, col) + for col in upstream_cols + ), + downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD, + downstreams=[ + make_schema_field_urn(self.downstream_urn, downstream_col) + ], + ) + + +@dataclass +class SqlParsingBuilder: + # Open question: does it make sense to iterate over out_tables? When will we have multiple? + + generate_lineage: bool = True + generate_usage_statistics: bool = True + generate_operations: bool = True + usage_config: Optional[BaseUsageConfig] = None + + # TODO: Make inner dict a FileBackedDict and make LineageEdge frozen + # Builds up a single LineageEdge for each upstream -> downstream pair + _lineage_map: Dict[DatasetUrn, Dict[DatasetUrn, LineageEdge]] = field( + default_factory=lambda: defaultdict(dict), init=False + ) + + # TODO: Replace with FileBackedDict approach like in BigQuery usage + _usage_aggregator: UsageAggregator[DatasetUrn] = field(init=False) + + def __post_init__(self) -> None: + if self.usage_config: + self._usage_aggregator = UsageAggregator(self.usage_config) + else: + logger.info("No usage config provided, not generating usage statistics") + self.generate_usage_statistics = False + + def process_sql_parsing_result( + self, + result: SqlParsingResult, + *, + query: str, + query_timestamp: Optional[datetime] = None, + is_view_ddl: bool = False, + user: Optional[UserUrn] = None, + custom_operation_type: Optional[str] = None, + include_urns: Optional[Set[DatasetUrn]] = None, + ) -> Iterable[MetadataWorkUnit]: + """Process a single query and yield any generated workunits. + + Args: + result: The result of parsing the query, or a mock result if parsing failed. + query: The SQL query to parse and process. + query_timestamp: When the query was run. + is_view_ddl: Whether the query is a DDL statement that creates a view. + user: The urn of the user who ran the query. + custom_operation_type: Platform-specific operation type, used if the operation type can't be parsed. + include_urns: If provided, only generate workunits for these urns. 
+ """ + downstreams_to_ingest = result.out_tables + upstreams_to_ingest = result.in_tables + if include_urns: + logger.debug(f"Skipping urns {set(downstreams_to_ingest) - include_urns}") + downstreams_to_ingest = list(set(downstreams_to_ingest) & include_urns) + upstreams_to_ingest = list(set(upstreams_to_ingest) & include_urns) + + if self.generate_lineage: + for downstream_urn in downstreams_to_ingest: + _merge_lineage_data( + downstream_urn=downstream_urn, + upstream_urns=result.in_tables, + column_lineage=result.column_lineage, + upstream_edges=self._lineage_map[downstream_urn], + query_timestamp=query_timestamp, + is_view_ddl=is_view_ddl, + user=user, + ) + + if self.generate_usage_statistics and query_timestamp is not None: + upstream_fields = _compute_upstream_fields(result) + for upstream_urn in upstreams_to_ingest: + self._usage_aggregator.aggregate_event( + resource=upstream_urn, + start_time=query_timestamp, + query=query, + user=user, + fields=sorted(upstream_fields.get(upstream_urn, [])), + ) + + if self.generate_operations and query_timestamp is not None: + for downstream_urn in downstreams_to_ingest: + yield from _gen_operation_workunit( + result, + downstream_urn=downstream_urn, + query_timestamp=query_timestamp, + user=user, + custom_operation_type=custom_operation_type, + ) + + def add_lineage( + self, + downstream_urn: DatasetUrn, + upstream_urns: Collection[DatasetUrn], + timestamp: Optional[datetime] = None, + is_view_ddl: bool = False, + user: Optional[UserUrn] = None, + ) -> None: + """Manually add a single upstream -> downstream lineage edge, e.g. if sql parsing fails.""" + _merge_lineage_data( + downstream_urn=downstream_urn, + upstream_urns=upstream_urns, + column_lineage=None, + upstream_edges=self._lineage_map[downstream_urn], + query_timestamp=timestamp, + is_view_ddl=is_view_ddl, + user=user, + ) + + def gen_workunits(self) -> Iterable[MetadataWorkUnit]: + if self.generate_lineage: + yield from self._gen_lineage_workunits() + if self.generate_usage_statistics: + yield from self._gen_usage_statistics_workunits() + + def _gen_lineage_workunits(self) -> Iterable[MetadataWorkUnit]: + for downstream_urn in self._lineage_map: + upstreams: List[UpstreamClass] = [] + fine_upstreams: List[FineGrainedLineageClass] = [] + for upstream_urn, edge in self._lineage_map[downstream_urn].items(): + upstreams.append(edge.gen_upstream_aspect()) + fine_upstreams.extend(edge.gen_fine_grained_lineage_aspects()) + + upstream_lineage = UpstreamLineageClass( + upstreams=sorted(upstreams, key=lambda x: x.dataset), + fineGrainedLineages=sorted( + fine_upstreams, + key=lambda x: (x.downstreams, x.upstreams), + ) + or None, + ) + yield MetadataChangeProposalWrapper( + entityUrn=downstream_urn, aspect=upstream_lineage + ).as_workunit() + + def _gen_usage_statistics_workunits(self) -> Iterable[MetadataWorkUnit]: + yield from self._usage_aggregator.generate_workunits( + resource_urn_builder=lambda urn: urn, user_urn_builder=lambda urn: urn + ) + + +def _merge_lineage_data( + downstream_urn: DatasetUrn, + *, + upstream_urns: Collection[DatasetUrn], + column_lineage: Optional[List[ColumnLineageInfo]], + upstream_edges: Dict[DatasetUrn, LineageEdge], + query_timestamp: Optional[datetime], + is_view_ddl: bool, + user: Optional[UserUrn], +) -> None: + for upstream_urn in upstream_urns: + edge = upstream_edges.setdefault( + upstream_urn, + LineageEdge( + downstream_urn=downstream_urn, + upstream_urn=upstream_urn, + audit_stamp=query_timestamp, + actor=user, + type=DatasetLineageTypeClass.VIEW + if 
is_view_ddl + else DatasetLineageTypeClass.TRANSFORMED, + ), + ) + if query_timestamp and ( # Use the most recent query + edge.audit_stamp is None or query_timestamp > edge.audit_stamp + ): + edge.audit_stamp = query_timestamp + if user: + edge.actor = user + + # Note: Inefficient as we loop through all column_lineage entries for each downstream table + for cl in column_lineage or []: + if cl.downstream.table == downstream_urn: + for upstream_column_info in cl.upstreams: + if upstream_column_info.table not in upstream_urns: + continue + column_map = upstream_edges[upstream_column_info.table].column_map + column_map[cl.downstream.column].add(upstream_column_info.column) + + +def _compute_upstream_fields( + result: SqlParsingResult, +) -> Dict[DatasetUrn, Set[DatasetUrn]]: + upstream_fields: Dict[DatasetUrn, Set[DatasetUrn]] = defaultdict(set) + for cl in result.column_lineage or []: + for upstream in cl.upstreams: + upstream_fields[upstream.table].add(upstream.column) + return upstream_fields + + +def _gen_operation_workunit( + result: SqlParsingResult, + *, + downstream_urn: DatasetUrn, + query_timestamp: datetime, + user: Optional[UserUrn], + custom_operation_type: Optional[str], +) -> Iterable[MetadataWorkUnit]: + operation_type = result.query_type.to_operation_type() + # Filter out SELECT and other undesired statements + if operation_type is None: + return + elif operation_type == OperationTypeClass.UNKNOWN: + if custom_operation_type is None: + return + else: + operation_type = OperationTypeClass.CUSTOM + + aspect = OperationClass( + timestampMillis=int(time.time() * 1000), + operationType=operation_type, + lastUpdatedTimestamp=int(query_timestamp.timestamp() * 1000), + actor=user, + customOperationType=custom_operation_type, + ) + yield MetadataChangeProposalWrapper( + entityUrn=downstream_urn, aspect=aspect + ).as_workunit() diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index 243c1848279c7..50ea69b6c13a9 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -7,7 +7,7 @@ from dataclasses import dataclass from datetime import datetime from json.decoder import JSONDecodeError -from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Type +from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Tuple, Type from avro.schema import RecordSchema from deprecated import deprecated @@ -38,6 +38,8 @@ SystemMetadataClass, TelemetryClientIdClass, ) +from datahub.utilities.perf_timer import PerfTimer +from datahub.utilities.urns.dataset_urn import DatasetUrn from datahub.utilities.urns.urn import Urn, guess_entity_type if TYPE_CHECKING: @@ -957,7 +959,11 @@ def delete_references_to_urn( @functools.lru_cache() def _make_schema_resolver( - self, platform: str, platform_instance: Optional[str], env: str + self, + platform: str, + platform_instance: Optional[str], + env: str, + include_graph: bool = True, ) -> "SchemaResolver": from datahub.utilities.sqlglot_lineage import SchemaResolver @@ -965,8 +971,50 @@ def _make_schema_resolver( platform=platform, platform_instance=platform_instance, env=env, - graph=self, + graph=self if include_graph else None, + ) + + def initialize_schema_resolver_from_datahub( + self, platform: str, platform_instance: Optional[str], env: str + ) -> Tuple["SchemaResolver", Set[str]]: + logger.info("Initializing schema resolver") + + # TODO: Filter on platform instance? 
+ logger.info(f"Fetching urns for platform {platform}, env {env}") + with PerfTimer() as timer: + urns = set( + self.get_urns_by_filter( + entity_types=[DatasetUrn.ENTITY_TYPE], + platform=platform, + env=env, + batch_size=3000, + ) + ) + logger.info( + f"Fetched {len(urns)} urns in {timer.elapsed_seconds()} seconds" + ) + + schema_resolver = self._make_schema_resolver( + platform, platform_instance, env, include_graph=False ) + with PerfTimer() as timer: + count = 0 + for i, urn in enumerate(urns): + if i % 1000 == 0: + logger.debug(f"Loaded {i} schema metadata") + try: + schema_metadata = self.get_aspect(urn, SchemaMetadataClass) + if schema_metadata: + schema_resolver.add_schema_metadata(urn, schema_metadata) + count += 1 + except Exception: + logger.warning("Failed to load schema metadata", exc_info=True) + logger.info( + f"Loaded {count} schema metadata in {timer.elapsed_seconds()} seconds" + ) + + logger.info("Finished initializing schema resolver") + return schema_resolver, urns def parse_sql_lineage( self, @@ -982,9 +1030,7 @@ def parse_sql_lineage( # Cache the schema resolver to make bulk parsing faster. schema_resolver = self._make_schema_resolver( - platform=platform, - platform_instance=platform_instance, - env=env, + platform=platform, platform_instance=platform_instance, env=env ) return sqlglot_lineage( diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 7725d63ce0e1e..1446812c29216 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -429,7 +429,9 @@ def get_dataplatform_instance_aspect( ) -> MetadataWorkUnit: aspect = DataPlatformInstanceClass( platform=make_data_platform_urn(self.platform), - instance=make_dataplatform_instance_urn(self.platform, project_id), + instance=make_dataplatform_instance_urn(self.platform, project_id) + if self.config.include_data_platform_instance + else None, ) return MetadataChangeProposalWrapper( entityUrn=dataset_urn, aspect=aspect diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index e5730ee87daf4..0f2082c5e53bf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -81,6 +81,13 @@ class BigQueryV2Config( description="Whether to populate BigQuery Console url to Datasets/Tables", ) + include_data_platform_instance: bool = Field( + default=False, + description="Whether to create a DataPlatformInstance aspect, equal to the BigQuery project id." + " If enabled, will cause redundancy in the browse path for BigQuery entities in the UI," + " because the project id is represented as the top-level container.", + ) + debug_include_full_payloads: bool = Field( default=False, description="Include full payload into events. 
It is only for debugging and internal use.", diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py index c8a4c7a6ab8fa..b3fa5e3401c07 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py @@ -626,12 +626,17 @@ def _extract_lineages(self): @dataclass class DebeziumSourceConnector: connector_manifest: ConnectorManifest + report: KafkaConnectSourceReport def __init__( - self, connector_manifest: ConnectorManifest, config: KafkaConnectSourceConfig + self, + connector_manifest: ConnectorManifest, + config: KafkaConnectSourceConfig, + report: KafkaConnectSourceReport, ) -> None: self.connector_manifest = connector_manifest self.config = config + self.report = report self._extract_lineages() @dataclass @@ -683,10 +688,19 @@ def get_parser( database_name=connector_manifest.config.get("database.dbname"), ) elif connector_class == "io.debezium.connector.sqlserver.SqlServerConnector": + database_name = connector_manifest.config.get( + "database.names" + ) or connector_manifest.config.get("database.dbname") + + if "," in str(database_name): + raise Exception( + f"Only one database is supported for Debezium's SQL Server connector. Found: {database_name}" + ) + parser = self.DebeziumParser( source_platform="mssql", server_name=self.get_server_name(connector_manifest), - database_name=connector_manifest.config.get("database.dbname"), + database_name=database_name, ) elif connector_class == "io.debezium.connector.db2.Db2Connector": parser = self.DebeziumParser( @@ -707,29 +721,37 @@ def get_parser( def _extract_lineages(self): lineages: List[KafkaConnectLineage] = list() - parser = self.get_parser(self.connector_manifest) - source_platform = parser.source_platform - server_name = parser.server_name - database_name = parser.database_name - topic_naming_pattern = r"({0})\.(\w+\.\w+)".format(server_name) - if not self.connector_manifest.topic_names: - return lineages + try: + parser = self.get_parser(self.connector_manifest) + source_platform = parser.source_platform + server_name = parser.server_name + database_name = parser.database_name + topic_naming_pattern = r"({0})\.(\w+\.\w+)".format(server_name) - for topic in self.connector_manifest.topic_names: - found = re.search(re.compile(topic_naming_pattern), topic) + if not self.connector_manifest.topic_names: + return lineages - if found: - table_name = get_dataset_name(database_name, found.group(2)) + for topic in self.connector_manifest.topic_names: + found = re.search(re.compile(topic_naming_pattern), topic) - lineage = KafkaConnectLineage( - source_dataset=table_name, - source_platform=source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - lineages.append(lineage) - self.connector_manifest.lineages = lineages + if found: + table_name = get_dataset_name(database_name, found.group(2)) + + lineage = KafkaConnectLineage( + source_dataset=table_name, + source_platform=source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + lineages.append(lineage) + self.connector_manifest.lineages = lineages + except Exception as e: + self.report.report_warning( + self.connector_manifest.name, f"Error resolving lineage: {e}" + ) + + return @dataclass @@ -1061,7 +1083,9 @@ def get_connectors_manifest(self) -> List[ConnectorManifest]: "io.debezium.connector" ): connector_manifest = DebeziumSourceConnector( - connector_manifest=connector_manifest, 
config=self.config + connector_manifest=connector_manifest, + config=self.config, + report=self.report, ).connector_manifest elif ( connector_manifest.config.get(CONNECTOR_CLASS, "") diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index d568ddcb02afa..40b90d216348c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -34,6 +34,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalSourceReport, ) +from datahub.metadata.com.linkedin.pegasus2avro.common import AuditStamp from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( DatasetLineageTypeClass, FineGrainedLineageDownstreamType, @@ -76,6 +77,8 @@ from datahub.utilities.lossy_collections import LossyList, LossySet from datahub.utilities.url_util import remove_port_from_url +CORPUSER_DATAHUB = "urn:li:corpuser:datahub" + if TYPE_CHECKING: from datahub.ingestion.source.looker.lookml_source import ( LookerViewFileLoader, @@ -786,6 +789,7 @@ def _to_metadata_events( # noqa: C901 if self.upstream_views is not None: assert self.project_name is not None upstreams = [] + observed_lineage_ts = datetime.datetime.now(tz=datetime.timezone.utc) for view_ref in sorted(self.upstream_views): view_urn = LookerViewId( project_name=view_ref.project @@ -799,6 +803,10 @@ def _to_metadata_events( # noqa: C901 UpstreamClass( dataset=view_urn, type=DatasetLineageTypeClass.VIEW, + auditStamp=AuditStamp( + time=int(observed_lineage_ts.timestamp() * 1000), + actor=CORPUSER_DATAHUB, + ), ) ) view_name_to_urn_map[view_ref.include] = view_urn diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py index 362b4e5530638..1a32afa2b7fdd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py @@ -6,7 +6,7 @@ import re import tempfile from dataclasses import dataclass, field as dataclass_field, replace -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone from typing import ( Any, ClassVar, @@ -50,6 +50,7 @@ from datahub.ingestion.source.common.subtypes import DatasetSubTypes from datahub.ingestion.source.git.git_import import GitClone from datahub.ingestion.source.looker.looker_common import ( + CORPUSER_DATAHUB, LookerCommonConfig, LookerExplore, LookerUtil, @@ -83,6 +84,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent from datahub.metadata.schema_classes import ( + AuditStampClass, DatasetPropertiesClass, FineGrainedLineageClass, FineGrainedLineageUpstreamTypeClass, @@ -1615,11 +1617,16 @@ def _get_upstream_lineage( # Generate the upstream + fine grained lineage objects. 
upstreams = [] + observed_lineage_ts = datetime.now(tz=timezone.utc) fine_grained_lineages: List[FineGrainedLineageClass] = [] for upstream_dataset_urn in upstream_dataset_urns: upstream = UpstreamClass( dataset=upstream_dataset_urn, type=DatasetLineageTypeClass.VIEW, + auditStamp=AuditStampClass( + time=int(observed_lineage_ts.timestamp() * 1000), + actor=CORPUSER_DATAHUB, + ), ) upstreams.append(upstream) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 7699d89ce9ac2..a7d946e99d806 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -1,10 +1,12 @@ import logging +from collections import defaultdict +from dataclasses import dataclass from enum import Enum -from typing import Dict, List, Optional, cast +from typing import Dict, List, Optional, Set, cast from pydantic import Field, SecretStr, root_validator, validator -from datahub.configuration.common import AllowDenyPattern +from datahub.configuration.common import AllowDenyPattern, ConfigModel from datahub.configuration.pattern_utils import UUID_REGEX from datahub.configuration.validate_field_removal import pydantic_removed_field from datahub.configuration.validate_field_rename import pydantic_renamed_field @@ -42,6 +44,31 @@ class TagOption(str, Enum): skip = "skip" +@dataclass(frozen=True) +class DatabaseId: + database: str = Field( + description="Database created from share in consumer account." + ) + platform_instance: str = Field( + description="Platform instance of consumer snowflake account." + ) + + +class SnowflakeShareConfig(ConfigModel): + database: str = Field(description="Database from which share is created.") + platform_instance: str = Field( + description="Platform instance for snowflake account in which share is created." + ) + + consumers: Set[DatabaseId] = Field( + description="List of databases created in consumer accounts." + ) + + @property + def source_database(self) -> DatabaseId: + return DatabaseId(self.database, self.platform_instance) + + class SnowflakeV2Config( SnowflakeConfig, SnowflakeUsageConfig, @@ -115,6 +142,13 @@ class SnowflakeV2Config( "upstreams_deny_pattern", "temporary_tables_pattern" ) + shares: Optional[Dict[str, SnowflakeShareConfig]] = Field( + default=None, + description="Required if current account owns or consumes snowflake share." + " If specified, connector creates lineage and siblings relationship between current account's database tables and consumer/producer account's database tables." + " Map of share name -> details of share.", + ) + email_as_user_identifier: bool = Field( default=True, description="Format user urns as an email, if the snowflake user's email is set. If `email_domain` is provided, generates email addresses for snowflake users with unset emails, based on their username.", @@ -192,3 +226,77 @@ def get_sql_alchemy_url( @property def parse_view_ddl(self) -> bool: return self.include_view_column_lineage + + @validator("shares") + def validate_shares( + cls, shares: Optional[Dict[str, SnowflakeShareConfig]], values: Dict + ) -> Optional[Dict[str, SnowflakeShareConfig]]: + current_platform_instance = values.get("platform_instance") + + if shares: + # Check: platform_instance should be present + assert current_platform_instance is not None, ( + "Did you forget to set `platform_instance` for current ingestion ? 
" + "It is required to use `platform_instance` when ingesting from multiple snowflake accounts." + ) + + databases_included_in_share: List[DatabaseId] = [] + databases_created_from_share: List[DatabaseId] = [] + + for share_details in shares.values(): + shared_db = DatabaseId( + share_details.database, share_details.platform_instance + ) + assert all( + consumer.platform_instance != share_details.platform_instance + for consumer in share_details.consumers + ), "Share's platform_instance can not be same as consumer's platform instance. Self-sharing not supported in Snowflake." + + databases_included_in_share.append(shared_db) + databases_created_from_share.extend(share_details.consumers) + + for db_from_share in databases_created_from_share: + assert ( + db_from_share not in databases_included_in_share + ), "Database included in a share can not be present as consumer in any share." + assert ( + databases_created_from_share.count(db_from_share) == 1 + ), "Same database can not be present as consumer in more than one share." + + return shares + + def outbounds(self) -> Dict[str, Set[DatabaseId]]: + """ + Returns mapping of + database included in current account's outbound share -> all databases created from this share in other accounts + """ + outbounds: Dict[str, Set[DatabaseId]] = defaultdict(set) + if self.shares: + for share_name, share_details in self.shares.items(): + if share_details.platform_instance == self.platform_instance: + logger.debug( + f"database {share_details.database} is included in outbound share(s) {share_name}." + ) + outbounds[share_details.database].update(share_details.consumers) + return outbounds + + def inbounds(self) -> Dict[str, DatabaseId]: + """ + Returns mapping of + database created from an current account's inbound share -> other-account database from which this share was created + """ + inbounds: Dict[str, DatabaseId] = {} + if self.shares: + for share_name, share_details in self.shares.items(): + for consumer in share_details.consumers: + if consumer.platform_instance == self.platform_instance: + logger.debug( + f"database {consumer.database} is created from inbound share {share_name}." 
+ ) + inbounds[consumer.database] = share_details.source_database + break + else: + logger.info( + f"Skipping Share {share_name}, as it does not include current platform instance {self.platform_instance}", + ) + return inbounds diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py index dab46645bffcc..e5b214ba35e4b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py @@ -261,6 +261,7 @@ def get_tables_for_database( for table in cur: if table["TABLE_SCHEMA"] not in tables: tables[table["TABLE_SCHEMA"]] = [] + tables[table["TABLE_SCHEMA"]].append( SnowflakeTable( name=table["TABLE_NAME"], diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py new file mode 100644 index 0000000000000..6f7520bbf1988 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py @@ -0,0 +1,158 @@ +import logging +from typing import Callable, Iterable, List + +from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.snowflake.snowflake_config import ( + DatabaseId, + SnowflakeV2Config, +) +from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report +from datahub.ingestion.source.snowflake.snowflake_schema import SnowflakeDatabase +from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin +from datahub.metadata.com.linkedin.pegasus2avro.common import Siblings +from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( + DatasetLineageType, + Upstream, + UpstreamLineage, +) + +logger: logging.Logger = logging.getLogger(__name__) + + +class SnowflakeSharesHandler(SnowflakeCommonMixin): + def __init__( + self, + config: SnowflakeV2Config, + report: SnowflakeV2Report, + dataset_urn_builder: Callable[[str], str], + ) -> None: + self.config = config + self.report = report + self.logger = logger + self.dataset_urn_builder = dataset_urn_builder + + def get_shares_workunits( + self, databases: List[SnowflakeDatabase] + ) -> Iterable[MetadataWorkUnit]: + inbounds = self.config.inbounds() + outbounds = self.config.outbounds() + # None of the databases are shared + if not (inbounds or outbounds): + return + + logger.debug("Checking databases for inbound or outbound shares.") + for db in databases: + is_inbound = db.name in inbounds + is_outbound = db.name in outbounds + + if not (is_inbound or is_outbound): + logger.debug(f"database {db.name} is not shared.") + continue + + sibling_dbs = ( + list(outbounds[db.name]) if is_outbound else [inbounds[db.name]] + ) + + for schema in db.schemas: + for table_name in schema.tables + schema.views: + # TODO: If this is outbound database, + # 1. attempt listing shares using `show shares` to identify name of share associated with this database (cache query result). + # 2. if corresponding share is listed, then run `show grants to share ` to identify exact tables, views included in share. + # 3. emit siblings only for the objects listed above. + # This will work only if the configured role has accountadmin role access OR is owner of share. 
+ # Otherwise ghost nodes may be shown in "Composed Of" section for tables/views in original database which are not granted to share. + yield from self.gen_siblings( + db.name, + schema.name, + table_name, + is_outbound, + sibling_dbs, + ) + + if is_inbound: + assert len(sibling_dbs) == 1 + # SnowflakeLineageExtractor is unaware of database->schema->table hierarchy + # hence this lineage code is not written in SnowflakeLineageExtractor + # also this is not governed by configs include_table_lineage and include_view_lineage + yield self.get_upstream_lineage_with_primary_sibling( + db.name, schema.name, table_name, sibling_dbs[0] + ) + + self.report_missing_databases( + databases, list(inbounds.keys()), list(outbounds.keys()) + ) + + def report_missing_databases( + self, + databases: List[SnowflakeDatabase], + inbounds: List[str], + outbounds: List[str], + ) -> None: + db_names = [db.name for db in databases] + missing_dbs = [db for db in inbounds + outbounds if db not in db_names] + + if missing_dbs: + self.report_warning( + "snowflake-shares", + f"Databases {missing_dbs} were not ingested. Siblings/Lineage will not be set for these.", + ) + + def gen_siblings( + self, + database_name: str, + schema_name: str, + table_name: str, + primary: bool, + sibling_databases: List[DatabaseId], + ) -> Iterable[MetadataWorkUnit]: + if not sibling_databases: + return + dataset_identifier = self.get_dataset_identifier( + table_name, schema_name, database_name + ) + urn = self.dataset_urn_builder(dataset_identifier) + + sibling_urns = [ + make_dataset_urn_with_platform_instance( + self.platform, + self.get_dataset_identifier( + table_name, schema_name, sibling_db.database + ), + sibling_db.platform_instance, + ) + for sibling_db in sibling_databases + ] + + yield MetadataChangeProposalWrapper( + entityUrn=urn, + aspect=Siblings(primary=primary, siblings=sorted(sibling_urns)), + ).as_workunit() + + def get_upstream_lineage_with_primary_sibling( + self, + database_name: str, + schema_name: str, + table_name: str, + primary_sibling_db: DatabaseId, + ) -> MetadataWorkUnit: + dataset_identifier = self.get_dataset_identifier( + table_name, schema_name, database_name + ) + urn = self.dataset_urn_builder(dataset_identifier) + + upstream_urn = make_dataset_urn_with_platform_instance( + self.platform, + self.get_dataset_identifier( + table_name, schema_name, primary_sibling_db.database + ), + primary_sibling_db.platform_instance, + ) + + return MetadataChangeProposalWrapper( + entityUrn=urn, + aspect=UpstreamLineage( + upstreams=[Upstream(dataset=upstream_urn, type=DatasetLineageType.COPY)] + ), + ).as_workunit() diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 40c4d32525a51..2cb4b37fdd696 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -68,6 +68,7 @@ SnowflakeTag, SnowflakeView, ) +from datahub.ingestion.source.snowflake.snowflake_shares import SnowflakeSharesHandler from datahub.ingestion.source.snowflake.snowflake_tag import SnowflakeTagExtractor from datahub.ingestion.source.snowflake.snowflake_usage_v2 import ( SnowflakeUsageExtractor, @@ -491,9 +492,16 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: return self.data_dictionary.set_connection(self.connection) - databases = self.get_databases() + databases: List[SnowflakeDatabase] = [] - if databases is 
None or len(databases) == 0: + for database in self.get_databases() or []: + self.report.report_entity_scanned(database.name, "database") + if not self.config.database_pattern.allowed(database.name): + self.report.report_dropped(f"{database.name}.*") + else: + databases.append(database) + + if len(databases) == 0: return for snowflake_db in databases: @@ -520,25 +528,22 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: # TODO: The checkpoint state for stale entity detection can be committed here. + if self.config.shares: + yield from SnowflakeSharesHandler( + self.config, self.report, self.gen_dataset_urn + ).get_shares_workunits(databases) + discovered_tables: List[str] = [ self.get_dataset_identifier(table_name, schema.name, db.name) for db in databases for schema in db.schemas for table_name in schema.tables - if self._is_dataset_pattern_allowed( - self.get_dataset_identifier(table_name, schema.name, db.name), - SnowflakeObjectDomain.TABLE, - ) ] discovered_views: List[str] = [ self.get_dataset_identifier(table_name, schema.name, db.name) for db in databases for schema in db.schemas for table_name in schema.views - if self._is_dataset_pattern_allowed( - self.get_dataset_identifier(table_name, schema.name, db.name), - SnowflakeObjectDomain.VIEW, - ) ] if len(discovered_tables) == 0 and len(discovered_views) == 0: @@ -642,11 +647,6 @@ def get_databases_from_ischema(self, databases): def _process_database( self, snowflake_db: SnowflakeDatabase ) -> Iterable[MetadataWorkUnit]: - self.report.report_entity_scanned(snowflake_db.name, "database") - if not self.config.database_pattern.allowed(snowflake_db.name): - self.report.report_dropped(f"{snowflake_db.name}.*") - return - db_name = snowflake_db.name try: @@ -692,11 +692,22 @@ def _process_database( if self.config.is_profiling_enabled() and self.db_tables: yield from self.profiler.get_workunits(snowflake_db, self.db_tables) - def fetch_schemas_for_database(self, snowflake_db, db_name): + def fetch_schemas_for_database( + self, snowflake_db: SnowflakeDatabase, db_name: str + ) -> None: + schemas: List[SnowflakeSchema] = [] try: - snowflake_db.schemas = self.data_dictionary.get_schemas_for_database( - db_name - ) + for schema in self.data_dictionary.get_schemas_for_database(db_name): + self.report.report_entity_scanned(schema.name, "schema") + if not is_schema_allowed( + self.config.schema_pattern, + schema.name, + db_name, + self.config.match_fully_qualified_names, + ): + self.report.report_dropped(f"{db_name}.{schema.name}.*") + else: + schemas.append(schema) except Exception as e: if isinstance(e, SnowflakePermissionError): error_msg = f"Failed to get schemas for database {db_name}. Please check permissions." @@ -712,25 +723,17 @@ def fetch_schemas_for_database(self, snowflake_db, db_name): db_name, ) - if not snowflake_db.schemas: + if not schemas: self.report_warning( "No schemas found in database. 
If schemas exist, please grant USAGE permissions on them.", db_name, ) + else: + snowflake_db.schemas = schemas def _process_schema( self, snowflake_schema: SnowflakeSchema, db_name: str ) -> Iterable[MetadataWorkUnit]: - self.report.report_entity_scanned(snowflake_schema.name, "schema") - if not is_schema_allowed( - self.config.schema_pattern, - snowflake_schema.name, - db_name, - self.config.match_fully_qualified_names, - ): - self.report.report_dropped(f"{db_name}.{snowflake_schema.name}.*") - return - schema_name = snowflake_schema.name if self.config.extract_tags != TagOption.skip: @@ -772,9 +775,20 @@ def _process_schema( f"{db_name}.{schema_name}", ) - def fetch_views_for_schema(self, snowflake_schema, db_name, schema_name): + def fetch_views_for_schema( + self, snowflake_schema: SnowflakeSchema, db_name: str, schema_name: str + ) -> List[SnowflakeView]: try: - views = self.get_views_for_schema(schema_name, db_name) + views: List[SnowflakeView] = [] + for view in self.get_views_for_schema(schema_name, db_name): + view_name = self.get_dataset_identifier(view.name, schema_name, db_name) + + self.report.report_entity_scanned(view_name, "view") + + if not self.config.view_pattern.allowed(view_name): + self.report.report_dropped(view_name) + else: + views.append(view) snowflake_schema.views = [view.name for view in views] return views except Exception as e: @@ -792,10 +806,22 @@ def fetch_views_for_schema(self, snowflake_schema, db_name, schema_name): "Failed to get views for schema", f"{db_name}.{schema_name}", ) + return [] - def fetch_tables_for_schema(self, snowflake_schema, db_name, schema_name): + def fetch_tables_for_schema( + self, snowflake_schema: SnowflakeSchema, db_name: str, schema_name: str + ) -> List[SnowflakeTable]: try: - tables = self.get_tables_for_schema(schema_name, db_name) + tables: List[SnowflakeTable] = [] + for table in self.get_tables_for_schema(schema_name, db_name): + table_identifier = self.get_dataset_identifier( + table.name, schema_name, db_name + ) + self.report.report_entity_scanned(table_identifier) + if not self.config.table_pattern.allowed(table_identifier): + self.report.report_dropped(table_identifier) + else: + tables.append(table) snowflake_schema.tables = [table.name for table in tables] return tables except Exception as e: @@ -812,6 +838,7 @@ def fetch_tables_for_schema(self, snowflake_schema, db_name, schema_name): "Failed to get tables for schema", f"{db_name}.{schema_name}", ) + return [] def _process_table( self, @@ -821,12 +848,6 @@ def _process_table( ) -> Iterable[MetadataWorkUnit]: table_identifier = self.get_dataset_identifier(table.name, schema_name, db_name) - self.report.report_entity_scanned(table_identifier) - - if not self.config.table_pattern.allowed(table_identifier): - self.report.report_dropped(table_identifier) - return - self.fetch_columns_for_table(table, schema_name, db_name, table_identifier) self.fetch_pk_for_table(table, schema_name, db_name, table_identifier) @@ -938,12 +959,6 @@ def _process_view( ) -> Iterable[MetadataWorkUnit]: view_name = self.get_dataset_identifier(view.name, schema_name, db_name) - self.report.report_entity_scanned(view_name, "view") - - if not self.config.view_pattern.allowed(view_name): - self.report.report_dropped(view_name) - return - try: view.columns = self.get_columns_for_table(view.name, schema_name, db_name) if self.config.extract_tags != TagOption.skip: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql.py 
b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql.py deleted file mode 100644 index a9afd40fd45b6..0000000000000 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql.py +++ /dev/null @@ -1,278 +0,0 @@ -import logging -import urllib.parse -from typing import Any, Dict, Iterable, List, Optional, Tuple - -import pydantic -import sqlalchemy.dialects.mssql - -# This import verifies that the dependencies are available. -import sqlalchemy_pytds # noqa: F401 -from pydantic.fields import Field -from sqlalchemy import create_engine, inspect -from sqlalchemy.engine.base import Connection -from sqlalchemy.engine.reflection import Inspector - -from datahub.configuration.common import AllowDenyPattern -from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.api.decorators import ( - SourceCapability, - SupportStatus, - capability, - config_class, - platform_name, - support_status, -) -from datahub.ingestion.source.sql.sql_common import ( - SQLAlchemySource, - register_custom_type, -) -from datahub.ingestion.source.sql.sql_config import ( - BasicSQLAlchemyConfig, - make_sqlalchemy_uri, -) -from datahub.metadata.schema_classes import BooleanTypeClass, UnionTypeClass - -logger: logging.Logger = logging.getLogger(__name__) - -register_custom_type(sqlalchemy.dialects.mssql.BIT, BooleanTypeClass) -register_custom_type(sqlalchemy.dialects.mssql.SQL_VARIANT, UnionTypeClass) - - -class SQLServerConfig(BasicSQLAlchemyConfig): - # defaults - host_port: str = Field(default="localhost:1433", description="MSSQL host URL.") - scheme: str = Field(default="mssql+pytds", description="", hidden_from_docs=True) - use_odbc: bool = Field( - default=False, - description="See https://docs.sqlalchemy.org/en/14/dialects/mssql.html#module-sqlalchemy.dialects.mssql.pyodbc.", - ) - uri_args: Dict[str, str] = Field( - default={}, - description="Arguments to URL-encode when connecting. See https://docs.microsoft.com/en-us/sql/connect/odbc/dsn-connection-string-attribute?view=sql-server-ver15.", - ) - database_pattern: AllowDenyPattern = Field( - default=AllowDenyPattern.allow_all(), - description="Regex patterns for databases to filter in ingestion.", - ) - database: Optional[str] = Field( - default=None, - description="database (catalog). If set to Null, all databases will be considered for ingestion.", - ) - convert_urns_to_lowercase: bool = Field( - default=False, - description="Enable to convert the SQL Server assets urns to lowercase", - ) - - @pydantic.validator("uri_args") - def passwords_match(cls, v, values, **kwargs): - if values["use_odbc"] and "driver" not in v: - raise ValueError("uri_args must contain a 'driver' option") - elif not values["use_odbc"] and v: - raise ValueError("uri_args is not supported when ODBC is disabled") - return v - - def get_sql_alchemy_url( - self, - uri_opts: Optional[Dict[str, Any]] = None, - current_db: Optional[str] = None, - ) -> str: - if self.use_odbc: - # Ensure that the import is available. 
- import pyodbc # noqa: F401 - - self.scheme = "mssql+pyodbc" - - uri: str = self.sqlalchemy_uri or make_sqlalchemy_uri( - self.scheme, # type: ignore - self.username, - self.password.get_secret_value() if self.password else None, - self.host_port, # type: ignore - current_db if current_db else self.database, - uri_opts=uri_opts, - ) - if self.use_odbc: - uri = f"{uri}?{urllib.parse.urlencode(self.uri_args)}" - return uri - - -@platform_name("Microsoft SQL Server", id="mssql") -@config_class(SQLServerConfig) -@support_status(SupportStatus.CERTIFIED) -@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") -@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field") -@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") -@capability(SourceCapability.DESCRIPTIONS, "Enabled by default") -@capability( - SourceCapability.USAGE_STATS, - "Not provided by this module, use `bigquery-usage` for that.", - supported=False, -) -@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") -class SQLServerSource(SQLAlchemySource): - """ - This plugin extracts the following: - - - Metadata for databases, schemas, views and tables - - Column types associated with each table/view - - Table, row, and column statistics via optional SQL profiling - - We have two options for the underlying library used to connect to SQL Server: (1) [python-tds](https://github.com/denisenkom/pytds) and (2) [pyodbc](https://github.com/mkleehammer/pyodbc). The TDS library is pure Python and hence easier to install, but only PyODBC supports encrypted connections. - """ - - def __init__(self, config: SQLServerConfig, ctx: PipelineContext): - super().__init__(config, ctx, "mssql") - # Cache the table and column descriptions - self.config: SQLServerConfig = config - self.current_database = None - self.table_descriptions: Dict[str, str] = {} - self.column_descriptions: Dict[str, str] = {} - for inspector in self.get_inspectors(): - db_name: str = self.get_db_name(inspector) - with inspector.engine.connect() as conn: - if self.config.use_odbc: - self._add_output_converters(conn) - self._populate_table_descriptions(conn, db_name) - self._populate_column_descriptions(conn, db_name) - - @staticmethod - def _add_output_converters(conn: Connection) -> None: - def handle_sql_variant_as_string(value): - return value.decode("utf-16le") - - # see https://stackoverflow.com/questions/45677374/pandas-pyodbc-odbc-sql-type-150-is-not-yet-supported - # and https://stackoverflow.com/questions/11671170/adding-output-converter-to-pyodbc-connection-in-sqlalchemy - try: - conn.connection.add_output_converter(-150, handle_sql_variant_as_string) - except AttributeError as e: - logger.debug( - f"Failed to mount output converter for MSSQL data type -150 due to {e}" - ) - - def _populate_table_descriptions(self, conn: Connection, db_name: str) -> None: - # see https://stackoverflow.com/questions/5953330/how-do-i-map-the-id-in-sys-extended-properties-to-an-object-name - # also see https://www.mssqltips.com/sqlservertip/5384/working-with-sql-server-extended-properties/ - table_metadata = conn.execute( - """ - SELECT - SCHEMA_NAME(T.SCHEMA_ID) AS schema_name, - T.NAME AS table_name, - EP.VALUE AS table_description - FROM sys.tables AS T - INNER JOIN sys.extended_properties AS EP - ON EP.MAJOR_ID = T.[OBJECT_ID] - AND EP.MINOR_ID = 0 - AND EP.NAME = 'MS_Description' - AND EP.CLASS = 1 - """ - ) - for row in table_metadata: - self.table_descriptions[ - 
f"{db_name}.{row['schema_name']}.{row['table_name']}" - ] = row["table_description"] - - def _populate_column_descriptions(self, conn: Connection, db_name: str) -> None: - column_metadata = conn.execute( - """ - SELECT - SCHEMA_NAME(T.SCHEMA_ID) AS schema_name, - T.NAME AS table_name, - C.NAME AS column_name , - EP.VALUE AS column_description - FROM sys.tables AS T - INNER JOIN sys.all_columns AS C - ON C.OBJECT_ID = T.[OBJECT_ID] - INNER JOIN sys.extended_properties AS EP - ON EP.MAJOR_ID = T.[OBJECT_ID] - AND EP.MINOR_ID = C.COLUMN_ID - AND EP.NAME = 'MS_Description' - AND EP.CLASS = 1 - """ - ) - for row in column_metadata: - self.column_descriptions[ - f"{db_name}.{row['schema_name']}.{row['table_name']}.{row['column_name']}" - ] = row["column_description"] - - @classmethod - def create(cls, config_dict: Dict, ctx: PipelineContext) -> "SQLServerSource": - config = SQLServerConfig.parse_obj(config_dict) - return cls(config, ctx) - - # override to get table descriptions - def get_table_properties( - self, inspector: Inspector, schema: str, table: str - ) -> Tuple[Optional[str], Dict[str, str], Optional[str]]: - description, properties, location_urn = super().get_table_properties( - inspector, schema, table - ) - # Update description if available. - db_name: str = self.get_db_name(inspector) - description = self.table_descriptions.get( - f"{db_name}.{schema}.{table}", description - ) - return description, properties, location_urn - - # override to get column descriptions - def _get_columns( - self, dataset_name: str, inspector: Inspector, schema: str, table: str - ) -> List[Dict]: - columns: List[Dict] = super()._get_columns( - dataset_name, inspector, schema, table - ) - # Update column description if available. - db_name: str = self.get_db_name(inspector) - for column in columns: - description: Optional[str] = self.column_descriptions.get( - f"{db_name}.{schema}.{table}.{column['name']}", - ) - if description: - column["comment"] = description - return columns - - def get_inspectors(self) -> Iterable[Inspector]: - # This method can be overridden in the case that you want to dynamically - # run on multiple databases. 
- url = self.config.get_sql_alchemy_url() - logger.debug(f"sql_alchemy_url={url}") - engine = create_engine(url, **self.config.options) - with engine.connect() as conn: - if self.config.database and self.config.database != "": - inspector = inspect(conn) - yield inspector - else: - databases = conn.execute( - "SELECT name FROM master.sys.databases WHERE name NOT IN \ - ('master', 'model', 'msdb', 'tempdb', 'Resource', \ - 'distribution' , 'reportserver', 'reportservertempdb'); " - ) - for db in databases: - if self.config.database_pattern.allowed(db["name"]): - url = self.config.get_sql_alchemy_url(current_db=db["name"]) - with create_engine( - url, **self.config.options - ).connect() as conn: - inspector = inspect(conn) - self.current_database = db["name"] - yield inspector - - def get_identifier( - self, *, schema: str, entity: str, inspector: Inspector, **kwargs: Any - ) -> str: - regular = f"{schema}.{entity}" - - qualified_table_name = regular - - if self.config.database: - if self.config.database_alias: - qualified_table_name = f"{self.config.database_alias}.{regular}" - else: - qualified_table_name = f"{self.config.database}.{regular}" - - if self.current_database: - qualified_table_name = f"{self.current_database}.{regular}" - - return ( - qualified_table_name.lower() - if self.config.convert_urns_to_lowercase - else qualified_table_name - ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/__init__.py new file mode 100644 index 0000000000000..8db89505a9cf6 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/__init__.py @@ -0,0 +1 @@ +from datahub.ingestion.source.sql.mssql.source import SQLServerConfig, SQLServerSource diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py new file mode 100644 index 0000000000000..8aeb5421891aa --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py @@ -0,0 +1,239 @@ +from dataclasses import dataclass, field +from typing import Dict, List, Optional, Union + +from datahub.emitter.mce_builder import make_data_flow_urn, make_data_job_urn +from datahub.metadata.schema_classes import ( + DataFlowInfoClass, + DataJobInfoClass, + DataJobInputOutputClass, +) + + +@dataclass +class ProcedureDependency: + db: str + schema: str + name: str + type: str + env: str + server: str + source: str = "mssql" + + +@dataclass +class ProcedureLineageStream: + dependencies: List[ProcedureDependency] + + @property + def as_property(self) -> Dict[str, str]: + return { + f"{dep.db}.{dep.schema}.{dep.name}": dep.type for dep in self.dependencies + } + + +@dataclass +class MSSQLJob: + db: str + platform_instance: str + name: str + env: str + source: str = "mssql" + type: str = "JOB" + + @property + def formatted_name(self) -> str: + return f"{self.formatted_platform_instance}.{self.name.replace(',', '-')}" + + @property + def full_type(self) -> str: + return f"({self.source},{self.formatted_name},{self.env})" + + @property + def orchestrator(self) -> str: + return self.source + + @property + def formatted_platform_instance(self) -> str: + return self.platform_instance.replace(".", "/") + + @property + def cluster(self) -> str: + return f"{self.env}" + + +@dataclass +class MSSQLProceduresContainer: + db: str + platform_instance: str + name: str + env: str + source: str = "mssql" + type: str = "JOB" + + @property + def 
formatted_name(self) -> str: + return f"{self.formatted_platform_instance}.{self.name.replace(',', '-')}" + + @property + def orchestrator(self) -> str: + return self.source + + @property + def formatted_platform_instance(self) -> str: + return self.platform_instance.replace(".", "/") + + @property + def cluster(self) -> str: + return f"{self.env}" + + @property + def full_type(self) -> str: + return f"({self.source},{self.name},{self.env})" + + +@dataclass +class ProcedureParameter: + name: str + type: str + + @property + def properties(self) -> Dict[str, str]: + return {"type": self.type} + + +@dataclass +class StoredProcedure: + db: str + schema: str + name: str + flow: Union[MSSQLJob, MSSQLProceduresContainer] + type: str = "STORED_PROCEDURE" + source: str = "mssql" + + @property + def full_type(self) -> str: + return self.source.upper() + "_" + self.type + + @property + def formatted_name(self) -> str: + return self.name.replace(",", "-") + + @property + def full_name(self) -> str: + return f"{self.db}.{self.schema}.{self.formatted_name}" + + @property + def escape_full_name(self) -> str: + return f"[{self.db}].[{self.schema}].[{self.formatted_name}]" + + +@dataclass +class JobStep: + job_name: str + step_name: str + flow: MSSQLJob + type: str = "JOB_STEP" + source: str = "mssql" + + @property + def formatted_step(self) -> str: + return self.step_name.replace(",", "-").replace(" ", "_").lower() + + @property + def formatted_name(self) -> str: + return self.job_name.replace(",", "-") + + @property + def full_type(self) -> str: + return self.source.upper() + "_" + self.type + + @property + def full_name(self) -> str: + return f"{self.formatted_name}.{self.formatted_name}" + + +@dataclass +class MSSQLDataJob: + entity: Union[StoredProcedure, JobStep] + type: str = "dataJob" + source: str = "mssql" + external_url: str = "" + description: Optional[str] = None + status: Optional[str] = None + incoming: List[str] = field(default_factory=list) + outgoing: List[str] = field(default_factory=list) + input_jobs: List[str] = field(default_factory=list) + job_properties: Dict[str, str] = field(default_factory=dict) + + @property + def urn(self) -> str: + return make_data_job_urn( + orchestrator=self.entity.flow.orchestrator, + flow_id=self.entity.flow.formatted_name, + job_id=self.entity.formatted_name, + cluster=self.entity.flow.cluster, + ) + + def add_property( + self, + name: str, + value: str, + ) -> None: + self.job_properties[name] = value + + @property + def valued_properties(self) -> Dict[str, str]: + if self.job_properties: + return {k: v for k, v in self.job_properties.items() if v is not None} + return self.job_properties + + @property + def as_datajob_input_output_aspect(self) -> DataJobInputOutputClass: + return DataJobInputOutputClass( + inputDatasets=sorted(self.incoming), + outputDatasets=sorted(self.outgoing), + inputDatajobs=sorted(self.input_jobs), + ) + + @property + def as_datajob_info_aspect(self) -> DataJobInfoClass: + return DataJobInfoClass( + name=self.entity.full_name, + type=self.entity.full_type, + description=self.description, + customProperties=self.valued_properties, + externalUrl=self.external_url, + status=self.status, + ) + + +@dataclass +class MSSQLDataFlow: + entity: Union[MSSQLJob, MSSQLProceduresContainer] + type: str = "dataFlow" + source: str = "mssql" + external_url: str = "" + flow_properties: Dict[str, str] = field(default_factory=dict) + + def add_property( + self, + name: str, + value: str, + ) -> None: + self.flow_properties[name] = value + + 
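To make the relationships between the job model dataclasses defined above (MSSQLProceduresContainer, StoredProcedure, MSSQLDataJob) easier to follow, here is a minimal sketch of how they are meant to compose once this patch is applied; every literal value (database, instance, and procedure names) is a hypothetical placeholder, not something taken from the change itself.

```python
# Sketch only, not part of the patch: composing the job models added above.
# All literal values below are illustrative assumptions.
from datahub.ingestion.source.sql.mssql.job_models import (
    MSSQLDataJob,
    MSSQLProceduresContainer,
    StoredProcedure,
)

# Hypothetical container ("flow") for the stored procedures of one schema.
container = MSSQLProceduresContainer(
    db="DemoDB",
    platform_instance="sqlserver01",
    name="DemoDB.dbo.stored_procedures",
    env="PROD",
)

# Hypothetical stored procedure attached to that flow.
proc = StoredProcedure(db="DemoDB", schema="dbo", name="usp_load_sales", flow=container)

data_job = MSSQLDataJob(entity=proc)
data_job.add_property("definition", "-- procedure header text would go here")

# The DataJob URN is derived from the flow's orchestrator, formatted name, and cluster.
print(data_job.urn)
# The info aspect carries the procedure's full name, type, and custom properties.
print(data_job.as_datajob_info_aspect.type)  # "MSSQL_STORED_PROCEDURE"
```

Properties accumulated via add_property end up in the DataJobInfo aspect's customProperties, which is how the source attaches definitions, parameters, and dependency summaries to each procedure.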
@property + def urn(self) -> str: + return make_data_flow_urn( + orchestrator=self.entity.orchestrator, + flow_id=self.entity.formatted_name, + cluster=self.entity.cluster, + ) + + @property + def as_dataflow_info_aspect(self) -> DataFlowInfoClass: + return DataFlowInfoClass( + name=self.entity.formatted_name, + customProperties=self.flow_properties, + externalUrl=self.external_url, + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py new file mode 100644 index 0000000000000..3c7701d93edeb --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py @@ -0,0 +1,665 @@ +import logging +import re +import urllib.parse +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union + +import pydantic +import sqlalchemy.dialects.mssql + +# This import verifies that the dependencies are available. +import sqlalchemy_pytds # noqa: F401 +from pydantic.fields import Field +from sqlalchemy import create_engine, inspect +from sqlalchemy.engine.base import Connection +from sqlalchemy.engine.reflection import Inspector +from sqlalchemy.exc import ProgrammingError, ResourceClosedError + +from datahub.configuration.common import AllowDenyPattern +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.sql.mssql.job_models import ( + JobStep, + MSSQLDataFlow, + MSSQLDataJob, + MSSQLJob, + MSSQLProceduresContainer, + ProcedureDependency, + ProcedureLineageStream, + ProcedureParameter, + StoredProcedure, +) +from datahub.ingestion.source.sql.sql_common import ( + SQLAlchemySource, + SqlWorkUnit, + register_custom_type, +) +from datahub.ingestion.source.sql.sql_config import ( + BasicSQLAlchemyConfig, + make_sqlalchemy_uri, +) +from datahub.metadata.schema_classes import BooleanTypeClass, UnionTypeClass + +logger: logging.Logger = logging.getLogger(__name__) + +register_custom_type(sqlalchemy.dialects.mssql.BIT, BooleanTypeClass) +register_custom_type(sqlalchemy.dialects.mssql.SQL_VARIANT, UnionTypeClass) + + +class SQLServerConfig(BasicSQLAlchemyConfig): + # defaults + host_port: str = Field(default="localhost:1433", description="MSSQL host URL.") + scheme: str = Field(default="mssql+pytds", description="", hidden_from_docs=True) + include_stored_procedures: bool = Field( + default=True, + description="Include ingest of stored procedures. Requires access to the 'sys' schema.", + ) + include_stored_procedures_code: bool = Field( + default=True, description="Include information about object code." + ) + include_jobs: bool = Field( + default=True, + description="Include ingest of MSSQL Jobs. Requires access to the 'msdb' and 'sys' schema.", + ) + include_descriptions: bool = Field( + default=True, description="Include table descriptions information." + ) + use_odbc: bool = Field( + default=False, + description="See https://docs.sqlalchemy.org/en/14/dialects/mssql.html#module-sqlalchemy.dialects.mssql.pyodbc.", + ) + uri_args: Dict[str, str] = Field( + default={}, + description="Arguments to URL-encode when connecting. 
See https://docs.microsoft.com/en-us/sql/connect/odbc/dsn-connection-string-attribute?view=sql-server-ver15.", + ) + database_pattern: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="Regex patterns for databases to filter in ingestion.", + ) + database: Optional[str] = Field( + default=None, + description="database (catalog). If set to Null, all databases will be considered for ingestion.", + ) + convert_urns_to_lowercase: bool = Field( + default=False, + description="Enable to convert the SQL Server assets urns to lowercase", + ) + + @pydantic.validator("uri_args") + def passwords_match(cls, v, values, **kwargs): + if values["use_odbc"] and "driver" not in v: + raise ValueError("uri_args must contain a 'driver' option") + elif not values["use_odbc"] and v: + raise ValueError("uri_args is not supported when ODBC is disabled") + return v + + def get_sql_alchemy_url( + self, + uri_opts: Optional[Dict[str, Any]] = None, + current_db: Optional[str] = None, + ) -> str: + if self.use_odbc: + # Ensure that the import is available. + import pyodbc # noqa: F401 + + self.scheme = "mssql+pyodbc" + + uri: str = self.sqlalchemy_uri or make_sqlalchemy_uri( + self.scheme, # type: ignore + self.username, + self.password.get_secret_value() if self.password else None, + self.host_port, # type: ignore + current_db if current_db else self.database, + uri_opts=uri_opts, + ) + if self.use_odbc: + uri = f"{uri}?{urllib.parse.urlencode(self.uri_args)}" + return uri + + @property + def host(self): + return self.platform_instance or self.host_port.split(":")[0] + + @property + def db(self): + return self.database_alias or self.database + + +@platform_name("Microsoft SQL Server", id="mssql") +@config_class(SQLServerConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") +@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field") +@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") +@capability(SourceCapability.DESCRIPTIONS, "Enabled by default") +@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") +class SQLServerSource(SQLAlchemySource): + """ + This plugin extracts the following: + - Metadata for databases, schemas, views and tables + - Column types associated with each table/view + - Table, row, and column statistics via optional SQL profiling + We have two options for the underlying library used to connect to SQL Server: (1) [python-tds](https://github.com/denisenkom/pytds) and (2) [pyodbc](https://github.com/mkleehammer/pyodbc). The TDS library is pure Python and hence easier to install, but only PyODBC supports encrypted connections. 
+ """ + + def __init__(self, config: SQLServerConfig, ctx: PipelineContext): + super().__init__(config, ctx, "mssql") + # Cache the table and column descriptions + self.config: SQLServerConfig = config + self.current_database = None + self.table_descriptions: Dict[str, str] = {} + self.column_descriptions: Dict[str, str] = {} + if self.config.include_descriptions: + for inspector in self.get_inspectors(): + db_name: str = self.get_db_name(inspector) + with inspector.engine.connect() as conn: + if self.config.use_odbc: + self._add_output_converters(conn) + self._populate_table_descriptions(conn, db_name) + self._populate_column_descriptions(conn, db_name) + + @staticmethod + def _add_output_converters(conn: Connection) -> None: + def handle_sql_variant_as_string(value): + try: + return value.decode("utf-16le") + except UnicodeDecodeError: + return value.decode("Windows-1251") + + # see https://stackoverflow.com/questions/45677374/pandas-pyodbc-odbc-sql-type-150-is-not-yet-supported + # and https://stackoverflow.com/questions/11671170/adding-output-converter-to-pyodbc-connection-in-sqlalchemy + try: + conn.connection.add_output_converter(-150, handle_sql_variant_as_string) + except AttributeError as e: + logger.debug( + f"Failed to mount output converter for MSSQL data type -150 due to {e}" + ) + + def _populate_table_descriptions(self, conn: Connection, db_name: str) -> None: + # see https://stackoverflow.com/questions/5953330/how-do-i-map-the-id-in-sys-extended-properties-to-an-object-name + # also see https://www.mssqltips.com/sqlservertip/5384/working-with-sql-server-extended-properties/ + table_metadata = conn.execute( + """ + SELECT + SCHEMA_NAME(T.SCHEMA_ID) AS schema_name, + T.NAME AS table_name, + EP.VALUE AS table_description + FROM sys.tables AS T + INNER JOIN sys.extended_properties AS EP + ON EP.MAJOR_ID = T.[OBJECT_ID] + AND EP.MINOR_ID = 0 + AND EP.NAME = 'MS_Description' + AND EP.CLASS = 1 + """ + ) + for row in table_metadata: + self.table_descriptions[ + f"{db_name}.{row['schema_name']}.{row['table_name']}" + ] = row["table_description"] + + def _populate_column_descriptions(self, conn: Connection, db_name: str) -> None: + column_metadata = conn.execute( + """ + SELECT + SCHEMA_NAME(T.SCHEMA_ID) AS schema_name, + T.NAME AS table_name, + C.NAME AS column_name , + EP.VALUE AS column_description + FROM sys.tables AS T + INNER JOIN sys.all_columns AS C + ON C.OBJECT_ID = T.[OBJECT_ID] + INNER JOIN sys.extended_properties AS EP + ON EP.MAJOR_ID = T.[OBJECT_ID] + AND EP.MINOR_ID = C.COLUMN_ID + AND EP.NAME = 'MS_Description' + AND EP.CLASS = 1 + """ + ) + for row in column_metadata: + self.column_descriptions[ + f"{db_name}.{row['schema_name']}.{row['table_name']}.{row['column_name']}" + ] = row["column_description"] + + @classmethod + def create(cls, config_dict: Dict, ctx: PipelineContext) -> "SQLServerSource": + config = SQLServerConfig.parse_obj(config_dict) + return cls(config, ctx) + + # override to get table descriptions + def get_table_properties( + self, inspector: Inspector, schema: str, table: str + ) -> Tuple[Optional[str], Dict[str, str], Optional[str]]: + description, properties, location_urn = super().get_table_properties( + inspector, schema, table + ) + # Update description if available. 
+ db_name: str = self.get_db_name(inspector) + description = self.table_descriptions.get( + f"{db_name}.{schema}.{table}", description + ) + return description, properties, location_urn + + # override to get column descriptions + def _get_columns( + self, dataset_name: str, inspector: Inspector, schema: str, table: str + ) -> List[Dict]: + columns: List[Dict] = super()._get_columns( + dataset_name, inspector, schema, table + ) + # Update column description if available. + db_name: str = self.get_db_name(inspector) + for column in columns: + description: Optional[str] = self.column_descriptions.get( + f"{db_name}.{schema}.{table}.{column['name']}", + ) + if description: + column["comment"] = description + return columns + + def get_database_level_workunits( + self, + inspector: Inspector, + database: str, + ) -> Iterable[MetadataWorkUnit]: + yield from super().get_database_level_workunits( + inspector=inspector, + database=database, + ) + if self.config.include_jobs: + try: + yield from self.loop_jobs(inspector, self.config) + except Exception as e: + self.report.report_failure( + "jobs", + f"Failed to list jobs due to error {e}", + ) + + def get_schema_level_workunits( + self, + inspector: Inspector, + schema: str, + database: str, + ) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]: + yield from super().get_schema_level_workunits( + inspector=inspector, + schema=schema, + database=database, + ) + if self.config.include_stored_procedures: + try: + yield from self.loop_stored_procedures(inspector, schema, self.config) + except Exception as e: + self.report.report_failure( + "jobs", + f"Failed to list jobs due to error {e}", + ) + + def _get_jobs(self, conn: Connection, db_name: str) -> Dict[str, Dict[str, Any]]: + jobs_data = conn.execute( + f""" + SELECT + job.job_id, + job.name, + job.description, + job.date_created, + job.date_modified, + steps.step_id, + steps.step_name, + steps.subsystem, + steps.command, + steps.database_name + FROM + msdb.dbo.sysjobs job + INNER JOIN + msdb.dbo.sysjobsteps steps + ON + job.job_id = steps.job_id + where database_name = '{db_name}' + """ + ) + jobs: Dict[str, Dict[str, Any]] = {} + for row in jobs_data: + step_data = dict( + job_id=row["job_id"], + job_name=row["name"], + description=row["description"], + date_created=row["date_created"], + date_modified=row["date_modified"], + step_id=row["step_id"], + step_name=row["step_name"], + subsystem=row["subsystem"], + command=row["command"], + ) + if row["name"] in jobs: + jobs[row["name"]][row["step_id"]] = step_data + else: + jobs[row["name"]] = {row["step_id"]: step_data} + return jobs + + def loop_jobs( + self, + inspector: Inspector, + sql_config: SQLServerConfig, + ) -> Iterable[MetadataWorkUnit]: + """ + Loop MS SQL jobs as dataFlow-s. 
+ :return: + """ + db_name = self.get_db_name(inspector) + with inspector.engine.connect() as conn: + jobs = self._get_jobs(conn, db_name) + for job_name, job_steps in jobs.items(): + job = MSSQLJob( + name=job_name, + env=sql_config.env, + db=db_name, + platform_instance=sql_config.host, + ) + data_flow = MSSQLDataFlow(entity=job) + yield from self.construct_flow_workunits(data_flow=data_flow) + yield from self.loop_job_steps(job, job_steps) + + def loop_job_steps( + self, job: MSSQLJob, job_steps: Dict[str, Any] + ) -> Iterable[MetadataWorkUnit]: + for step_id, step_data in job_steps.items(): + step = JobStep( + job_name=job.formatted_name, + step_name=step_data["step_name"], + flow=job, + ) + data_job = MSSQLDataJob(entity=step) + for data_name, data_value in step_data.items(): + data_job.add_property(name=data_name, value=str(data_value)) + yield from self.construct_job_workunits(data_job) + + def loop_stored_procedures( # noqa: C901 + self, + inspector: Inspector, + schema: str, + sql_config: SQLServerConfig, + ) -> Iterable[MetadataWorkUnit]: + """ + Loop schema data for get stored procedures as dataJob-s. + """ + db_name = self.get_db_name(inspector) + procedure_flow_name = f"{db_name}.{schema}.stored_procedures" + mssql_default_job = MSSQLProceduresContainer( + name=procedure_flow_name, + env=sql_config.env, + db=db_name, + platform_instance=sql_config.host, + ) + data_flow = MSSQLDataFlow(entity=mssql_default_job) + with inspector.engine.connect() as conn: + procedures_data_list = self._get_stored_procedures(conn, db_name, schema) + procedures = [ + StoredProcedure(flow=mssql_default_job, **procedure_data) + for procedure_data in procedures_data_list + ] + if procedures: + yield from self.construct_flow_workunits(data_flow=data_flow) + for procedure in procedures: + upstream = self._get_procedure_upstream(conn, procedure) + downstream = self._get_procedure_downstream(conn, procedure) + data_job = MSSQLDataJob( + entity=procedure, + ) + # TODO: because of this upstream and downstream are more dependencies, + # can't be used as DataJobInputOutput. + # Should be reorganized into lineage. 
+ data_job.add_property("procedure_depends_on", str(upstream.as_property)) + data_job.add_property( + "depending_on_procedure", str(downstream.as_property) + ) + procedure_definition, procedure_code = self._get_procedure_code( + conn, procedure + ) + if procedure_definition: + data_job.add_property("definition", procedure_definition) + if sql_config.include_stored_procedures_code and procedure_code: + data_job.add_property("code", procedure_code) + procedure_inputs = self._get_procedure_inputs(conn, procedure) + properties = self._get_procedure_properties(conn, procedure) + data_job.add_property( + "input parameters", str([param.name for param in procedure_inputs]) + ) + for param in procedure_inputs: + data_job.add_property( + f"parameter {param.name}", str(param.properties) + ) + for property_name, property_value in properties.items(): + data_job.add_property(property_name, str(property_value)) + yield from self.construct_job_workunits(data_job) + + @staticmethod + def _get_procedure_downstream( + conn: Connection, procedure: StoredProcedure + ) -> ProcedureLineageStream: + downstream_data = conn.execute( + f""" + SELECT DISTINCT OBJECT_SCHEMA_NAME ( referencing_id ) AS [schema], + OBJECT_NAME(referencing_id) AS [name], + o.type_desc AS [type] + FROM sys.sql_expression_dependencies AS sed + INNER JOIN sys.objects AS o ON sed.referencing_id = o.object_id + left join sys.objects o1 on sed.referenced_id = o1.object_id + WHERE referenced_id = OBJECT_ID(N'{procedure.escape_full_name}') + AND o.type_desc in ('TABLE_TYPE', 'VIEW', 'USER_TABLE') + """ + ) + downstream_dependencies = [] + for row in downstream_data: + downstream_dependencies.append( + ProcedureDependency( + db=procedure.db, + schema=row["schema"], + name=row["name"], + type=row["type"], + env=procedure.flow.env, + server=procedure.flow.platform_instance, + ) + ) + return ProcedureLineageStream(dependencies=downstream_dependencies) + + @staticmethod + def _get_procedure_upstream( + conn: Connection, procedure: StoredProcedure + ) -> ProcedureLineageStream: + upstream_data = conn.execute( + f""" + SELECT DISTINCT + coalesce(lower(referenced_database_name), db_name()) AS db, + referenced_schema_name AS [schema], + referenced_entity_name AS [name], + o1.type_desc AS [type] + FROM sys.sql_expression_dependencies AS sed + INNER JOIN sys.objects AS o ON sed.referencing_id = o.object_id + left join sys.objects o1 on sed.referenced_id = o1.object_id + WHERE referencing_id = OBJECT_ID(N'{procedure.escape_full_name}') + AND referenced_schema_name is not null + AND o1.type_desc in ('TABLE_TYPE', 'VIEW', 'SQL_STORED_PROCEDURE', 'USER_TABLE') + """ + ) + upstream_dependencies = [] + for row in upstream_data: + upstream_dependencies.append( + ProcedureDependency( + db=row["db"], + schema=row["schema"], + name=row["name"], + type=row["type"], + env=procedure.flow.env, + server=procedure.flow.platform_instance, + ) + ) + return ProcedureLineageStream(dependencies=upstream_dependencies) + + @staticmethod + def _get_procedure_inputs( + conn: Connection, procedure: StoredProcedure + ) -> List[ProcedureParameter]: + inputs_data = conn.execute( + f""" + SELECT + name, + type_name(user_type_id) AS 'type' + FROM sys.parameters + WHERE object_id = object_id('{procedure.escape_full_name}') + """ + ) + inputs_list = [] + for row in inputs_data: + inputs_list.append(ProcedureParameter(name=row["name"], type=row["type"])) + return inputs_list + + @staticmethod + def _get_procedure_code( + conn: Connection, procedure: StoredProcedure + ) -> 
Tuple[Optional[str], Optional[str]]: + query = f"EXEC [{procedure.db}].dbo.sp_helptext '{procedure.full_name}'" + try: + code_data = conn.execute(query) + except ProgrammingError: + logger.warning( + "Denied permission for read text from procedure '%s'", + procedure.full_name, + ) + return None, None + code_list = [] + code_slice_index = 0 + code_slice_text = "create procedure" + try: + for index, row in enumerate(code_data): + code_list.append(row["Text"]) + if code_slice_text in re.sub(" +", " ", row["Text"].lower()).strip(): + code_slice_index = index + definition = "\n".join(code_list[:code_slice_index]) + code = "\n".join(code_list[code_slice_index:]) + except ResourceClosedError: + logger.warning( + "Connection was closed from procedure '%s'", + procedure.full_name, + ) + return None, None + return definition, code + + @staticmethod + def _get_procedure_properties( + conn: Connection, procedure: StoredProcedure + ) -> Dict[str, Any]: + properties_data = conn.execute( + f""" + SELECT + create_date as date_created, + modify_date as date_modified + FROM sys.procedures + WHERE object_id = object_id('{procedure.full_name}') + """ + ) + properties = {} + for row in properties_data: + properties = dict( + date_created=row["date_created"], date_modified=row["date_modified"] + ) + return properties + + @staticmethod + def _get_stored_procedures( + conn: Connection, db_name: str, schema: str + ) -> List[Dict[str, str]]: + stored_procedures_data = conn.execute( + f""" + SELECT + pr.name as procedure_name, + s.name as schema_name + FROM + [{db_name}].[sys].[procedures] pr + INNER JOIN + [{db_name}].[sys].[schemas] s ON pr.schema_id = s.schema_id + where s.name = '{schema}' + """ + ) + procedures_list = [] + for row in stored_procedures_data: + procedures_list.append( + dict(db=db_name, schema=row["schema_name"], name=row["procedure_name"]) + ) + return procedures_list + + def construct_job_workunits( + self, + data_job: MSSQLDataJob, + ) -> Iterable[MetadataWorkUnit]: + yield MetadataChangeProposalWrapper( + entityUrn=data_job.urn, + aspect=data_job.as_datajob_info_aspect, + ).as_workunit() + + yield MetadataChangeProposalWrapper( + entityUrn=data_job.urn, + aspect=data_job.as_datajob_input_output_aspect, + ).as_workunit() + # TODO: Add SubType when it appear + + def construct_flow_workunits( + self, + data_flow: MSSQLDataFlow, + ) -> Iterable[MetadataWorkUnit]: + yield MetadataChangeProposalWrapper( + entityUrn=data_flow.urn, + aspect=data_flow.as_dataflow_info_aspect, + ).as_workunit() + # TODO: Add SubType when it appear + + def get_inspectors(self) -> Iterable[Inspector]: + # This method can be overridden in the case that you want to dynamically + # run on multiple databases. 
+ url = self.config.get_sql_alchemy_url() + logger.debug(f"sql_alchemy_url={url}") + engine = create_engine(url, **self.config.options) + with engine.connect() as conn: + if self.config.database and self.config.database != "": + inspector = inspect(conn) + yield inspector + else: + databases = conn.execute( + "SELECT name FROM master.sys.databases WHERE name NOT IN \ + ('master', 'model', 'msdb', 'tempdb', 'Resource', \ + 'distribution' , 'reportserver', 'reportservertempdb'); " + ) + for db in databases: + if self.config.database_pattern.allowed(db["name"]): + url = self.config.get_sql_alchemy_url(current_db=db["name"]) + with create_engine( + url, **self.config.options + ).connect() as conn: + inspector = inspect(conn) + self.current_database = db["name"] + yield inspector + + def get_identifier( + self, *, schema: str, entity: str, inspector: Inspector, **kwargs: Any + ) -> str: + regular = f"{schema}.{entity}" + qualified_table_name = regular + if self.config.database: + if self.config.database_alias: + qualified_table_name = f"{self.config.database_alias}.{regular}" + else: + qualified_table_name = f"{self.config.database}.{regular}" + if self.current_database: + qualified_table_name = f"{self.current_database}.{regular}" + return ( + qualified_table_name.lower() + if self.config.convert_urns_to_lowercase + else qualified_table_name + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py index 280f4f47adcdf..b5458a42192fc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py @@ -478,6 +478,27 @@ def add_table_to_schema_container( parent_container_key=schema_container_key, ) + def get_database_level_workunits( + self, + inspector: Inspector, + database: str, + ) -> Iterable[MetadataWorkUnit]: + yield from self.gen_database_containers(database=database) + + def get_schema_level_workunits( + self, + inspector: Inspector, + schema: str, + database: str, + ) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]: + yield from self.gen_schema_containers(schema=schema, database=database) + + if self.config.include_tables: + yield from self.loop_tables(inspector, schema, self.config) + + if self.config.include_views: + yield from self.loop_views(inspector, schema, self.config) + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), @@ -516,27 +537,20 @@ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit ) db_name = self.get_db_name(inspector) - yield from self.gen_database_containers( + yield from self.get_database_level_workunits( + inspector=inspector, database=db_name, ) for schema in self.get_allowed_schemas(inspector, db_name): self.add_information_for_schema(inspector, schema) - yield from self.gen_schema_containers( - database=db_name, + yield from self.get_schema_level_workunits( + inspector=inspector, schema=schema, - extra_properties=self.get_schema_properties( - inspector=inspector, schema=schema, database=db_name - ), + database=db_name, ) - if sql_config.include_tables: - yield from self.loop_tables(inspector, schema, sql_config) - - if sql_config.include_views: - yield from self.loop_views(inspector, schema, sql_config) - if profiler: profile_requests += list( self.loop_profiler_requests(inspector, schema, sql_config) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py 
b/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py new file mode 100644 index 0000000000000..2fcc93292c2ef --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py @@ -0,0 +1,223 @@ +import json +import logging +import os +from dataclasses import dataclass +from datetime import datetime, timezone +from functools import partial +from typing import Iterable, List, Optional, Set + +from pydantic import Field + +from datahub.configuration.source_common import ( + EnvConfigMixin, + PlatformInstanceConfigMixin, +) +from datahub.emitter.mce_builder import ( + make_dataset_urn_with_platform_instance, + make_user_urn, +) +from datahub.emitter.sql_parsing_builder import SqlParsingBuilder +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SupportStatus, + config_class, + platform_name, + support_status, +) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source, SourceReport +from datahub.ingestion.api.source_helpers import auto_workunit_reporter +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.graph.client import DataHubGraph +from datahub.ingestion.source.usage.usage_common import BaseUsageConfig +from datahub.utilities.sqlglot_lineage import SchemaResolver, sqlglot_lineage + +logger = logging.getLogger(__name__) + + +class SqlQueriesSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin): + query_file: str = Field(description="Path to file to ingest") + + platform: str = Field( + description="The platform for which to generate data, e.g. snowflake" + ) + + usage: BaseUsageConfig = Field( + description="The usage config to use when generating usage statistics", + default=BaseUsageConfig(), + ) + + use_schema_resolver: bool = Field( + description="Read SchemaMetadata aspects from DataHub to aid in SQL parsing. 
Turn off only for testing.", + default=True, + hidden_from_docs=True, + ) + default_db: Optional[str] = Field( + description="The default database to use for unqualified table names", + default=None, + ) + default_schema: Optional[str] = Field( + description="The default schema to use for unqualified table names", + default=None, + ) + + +class SqlQueriesSourceReport(SourceReport): + num_queries_parsed: int = 0 + num_table_parse_failures: int = 0 + num_column_parse_failures: int = 0 + + def compute_stats(self) -> None: + super().compute_stats() + self.table_failure_rate = ( + f"{self.num_table_parse_failures / self.num_queries_parsed:.4f}" + if self.num_queries_parsed + else "0" + ) + self.column_failure_rate = ( + f"{self.num_column_parse_failures / self.num_queries_parsed:.4f}" + if self.num_queries_parsed + else "0" + ) + + +@platform_name("SQL Queries") +@config_class(SqlQueriesSourceConfig) +@support_status(SupportStatus.TESTING) +class SqlQueriesSource(Source): + # TODO: Documentation + urns: Optional[Set[str]] + schema_resolver: SchemaResolver + builder: SqlParsingBuilder + + def __init__(self, ctx: PipelineContext, config: SqlQueriesSourceConfig): + if not ctx.graph: + raise ValueError( + "SqlQueriesSource needs a datahub_api from which to pull schema metadata" + ) + + self.graph: DataHubGraph = ctx.graph + self.ctx = ctx + self.config = config + self.report = SqlQueriesSourceReport() + + self.builder = SqlParsingBuilder(usage_config=self.config.usage) + + if self.config.use_schema_resolver: + schema_resolver, urns = self.graph.initialize_schema_resolver_from_datahub( + platform=self.config.platform, + platform_instance=self.config.platform_instance, + env=self.config.env, + ) + self.schema_resolver = schema_resolver + self.urns = urns + else: + self.schema_resolver = self.graph._make_schema_resolver( + platform=self.config.platform, + platform_instance=self.config.platform_instance, + env=self.config.env, + ) + self.urns = None + + @classmethod + def create(cls, config_dict: dict, ctx: PipelineContext) -> "SqlQueriesSource": + config = SqlQueriesSourceConfig.parse_obj(config_dict) + return cls(ctx, config) + + def get_report(self) -> SqlQueriesSourceReport: + return self.report + + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: + return [partial(auto_workunit_reporter, self.get_report())] + + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + logger.info(f"Parsing queries from {os.path.basename(self.config.query_file)}") + with open(self.config.query_file) as f: + for line in f: + try: + query_dict = json.loads(line, strict=False) + entry = QueryEntry.create(query_dict, config=self.config) + yield from self._process_query(entry) + except Exception as e: + logger.warning("Error processing query", exc_info=True) + self.report.report_warning("process-query", str(e)) + + logger.info("Generating workunits") + yield from self.builder.gen_workunits() + + def _process_query(self, entry: "QueryEntry") -> Iterable[MetadataWorkUnit]: + self.report.num_queries_parsed += 1 + if self.report.num_queries_parsed % 1000 == 0: + logger.info(f"Parsed {self.report.num_queries_parsed} queries") + + result = sqlglot_lineage( + sql=entry.query, + schema_resolver=self.schema_resolver, + default_db=self.config.default_db, + default_schema=self.config.default_schema, + ) + if result.debug_info.table_error: + logger.info(f"Error parsing table lineage, {result.debug_info.table_error}") + self.report.num_table_parse_failures += 1 + for downstream_urn in 
set(entry.downstream_tables): + self.builder.add_lineage( + downstream_urn=downstream_urn, + upstream_urns=entry.upstream_tables, + timestamp=entry.timestamp, + user=entry.user, + ) + return + elif result.debug_info.column_error: + logger.debug( + f"Error parsing column lineage, {result.debug_info.column_error}" + ) + self.report.num_column_parse_failures += 1 + + yield from self.builder.process_sql_parsing_result( + result, + query=entry.query, + query_timestamp=entry.timestamp, + user=entry.user, + custom_operation_type=entry.operation_type, + include_urns=self.urns, + ) + + +@dataclass +class QueryEntry: + query: str + timestamp: Optional[datetime] + user: Optional[str] + operation_type: Optional[str] + downstream_tables: List[str] + upstream_tables: List[str] + + @classmethod + def create( + cls, entry_dict: dict, *, config: SqlQueriesSourceConfig + ) -> "QueryEntry": + return cls( + query=entry_dict["query"], + timestamp=datetime.fromtimestamp(entry_dict["timestamp"], tz=timezone.utc) + if "timestamp" in entry_dict + else None, + user=make_user_urn(entry_dict["user"]) if "user" in entry_dict else None, + operation_type=entry_dict.get("operation_type"), + downstream_tables=[ + make_dataset_urn_with_platform_instance( + name=table, + platform=config.platform, + platform_instance=config.platform_instance, + env=config.env, + ) + for table in entry_dict.get("downstream_tables", []) + ], + upstream_tables=[ + make_dataset_urn_with_platform_instance( + name=table, + platform=config.platform, + platform_instance=config.platform_instance, + env=config.env, + ) + for table in entry_dict.get("upstream_tables", []) + ], + ) diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py index 6d028c4ac1b9e..534cac5cef2aa 100644 --- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py @@ -7,7 +7,6 @@ from collections import defaultdict from typing import Dict, List, Optional, Set, Tuple, Union -import pydantic import pydantic.dataclasses import sqlglot import sqlglot.errors @@ -23,7 +22,7 @@ from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.graph.client import DataHubGraph from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier -from datahub.metadata.schema_classes import SchemaMetadataClass +from datahub.metadata.schema_classes import OperationTypeClass, SchemaMetadataClass from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedDict from datahub.utilities.urns.dataset_urn import DatasetUrn @@ -34,6 +33,8 @@ # A lightweight table schema: column -> type mapping. SchemaInfo = Dict[str, str] +SQL_PARSE_RESULT_CACHE_SIZE = 1000 + class QueryType(enum.Enum): CREATE = "CREATE" @@ -45,6 +46,22 @@ class QueryType(enum.Enum): UNKNOWN = "UNKNOWN" + def to_operation_type(self) -> Optional[str]: + if self == QueryType.CREATE: + return OperationTypeClass.CREATE + elif self == QueryType.INSERT: + return OperationTypeClass.INSERT + elif self == QueryType.UPDATE: + return OperationTypeClass.UPDATE + elif self == QueryType.DELETE: + return OperationTypeClass.DELETE + elif self == QueryType.MERGE: + return OperationTypeClass.UPDATE + elif self == QueryType.SELECT: + return None + else: + return OperationTypeClass.UNKNOWN + def get_query_type_of_sql(expression: sqlglot.exp.Expression) -> QueryType: # UPGRADE: Once we use Python 3.10, replace this with a match expression. 
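Before the next hunk, it may help to see what the new `QueryType.to_operation_type()` mapping yields. The following is a small sketch under the assumption that the patched module is importable; it simply restates the mapping added above.

```python
# Sketch only: exercises the QueryType -> OperationTypeClass mapping added above.
from datahub.metadata.schema_classes import OperationTypeClass
from datahub.utilities.sqlglot_lineage import QueryType

assert QueryType.CREATE.to_operation_type() == OperationTypeClass.CREATE
assert QueryType.MERGE.to_operation_type() == OperationTypeClass.UPDATE  # MERGE is reported as an UPDATE
assert QueryType.SELECT.to_operation_type() is None  # reads emit no operation aspect
assert QueryType.UNKNOWN.to_operation_type() == OperationTypeClass.UNKNOWN
```

Note that MERGE deliberately collapses to UPDATE and SELECT to None, so pure reads never produce operation aspects.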
@@ -623,16 +640,21 @@ def _translate_internal_column_lineage( ) +def _get_dialect(platform: str) -> str: + # TODO: convert datahub platform names to sqlglot dialect + if platform == "presto-on-hive": + return "hive" + else: + return platform + + def _sqlglot_lineage_inner( sql: str, schema_resolver: SchemaResolver, default_db: Optional[str] = None, default_schema: Optional[str] = None, ) -> SqlParsingResult: - # TODO: convert datahub platform names to sqlglot dialect - # TODO: Pull the platform name from the schema resolver? - dialect = schema_resolver.platform - + dialect = _get_dialect(schema_resolver.platform) if dialect == "snowflake": # in snowflake, table identifiers must be uppercased to match sqlglot's behavior. if default_db: @@ -755,6 +777,7 @@ def _sqlglot_lineage_inner( ) +@functools.lru_cache(maxsize=SQL_PARSE_RESULT_CACHE_SIZE) def sqlglot_lineage( sql: str, schema_resolver: SchemaResolver, diff --git a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py index 3bda6c5cce84b..cc3ee1f6ceaa4 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py +++ b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py @@ -61,6 +61,7 @@ def test_bigquery_v2_ingest( "project_ids": ["project-id-1"], "include_usage_statistics": False, "include_table_lineage": False, + "include_data_platform_instance": True, } pipeline_config_dict: Dict[str, Any] = { diff --git a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json index 6167c63e6c9b8..dee85b40bb7a8 100644 --- a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json @@ -262,8 +262,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" @@ -412,8 +412,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json index e66ec4bb89d8c..72db36e63daf7 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json @@ -206,8 +206,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json index 11e0760decae3..e5508bdb06b9e 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json @@ -206,8 +206,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + 
"time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,datahub-demo.view.faa_flights,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json index ddfd102cb15b0..91e13debfa028 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json @@ -279,8 +279,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" @@ -429,8 +429,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json index 54624986216b8..e93079119e4f4 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json @@ -206,8 +206,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json index 6cab0db8c33cf..a9c8efa7cdb98 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json @@ -206,32 +206,32 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_joined_view,PROD)", "type": "VIEW" }, { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_joined_view_original_name,PROD)", "type": "VIEW" }, { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view_has_no_fields,PROD)", "type": "VIEW" }, { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json index 9a088a7a8baef..edd15624a14cd 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json +++ 
b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json @@ -206,24 +206,24 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_joined_view_original_name,PROD)", "type": "VIEW" }, { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" }, { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view_has_no_fields,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json index f8e2565e492e1..aebc89b609a08 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json @@ -206,8 +206,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json index 32d4f7bc64ab4..34bded3cf691e 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json @@ -158,8 +158,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/expected_output.json b/metadata-ingestion/tests/integration/lookml/expected_output.json index cdf520cc23a30..b53d5857f1d66 100644 --- a/metadata-ingestion/tests/integration/lookml/expected_output.json +++ b/metadata-ingestion/tests/integration/lookml/expected_output.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..my_table,PROD)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" @@ -478,8 +478,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.include_able,PROD)", "type": "VIEW" @@ -585,8 +585,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": 
"urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.events,PROD)", "type": "VIEW" @@ -692,8 +692,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.events,PROD)", "type": "VIEW" @@ -844,8 +844,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -951,8 +951,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.include_able,PROD)", "type": "VIEW" @@ -1058,8 +1058,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..fragment_derived_view,PROD)", "type": "VIEW" @@ -1240,8 +1240,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..order,PROD)", "type": "VIEW" @@ -1347,8 +1347,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.ecommerce.ability,PROD)", "type": "VIEW" @@ -1533,8 +1533,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..owners,PROD)", "type": "VIEW" @@ -1732,8 +1732,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -1971,8 +1971,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.flightstats.accidents,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json index 73edecbe62205..238f4c2580cdf 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.my_table,PROD)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" @@ -478,8 +478,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": 
"urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.looker_schema.include_able,PROD)", "type": "VIEW" @@ -585,8 +585,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.looker_schema.events,PROD)", "type": "VIEW" @@ -692,8 +692,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.looker_schema.events,PROD)", "type": "VIEW" @@ -844,8 +844,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -951,8 +951,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.looker_schema.include_able,PROD)", "type": "VIEW" @@ -1058,8 +1058,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.fragment_derived_view,PROD)", "type": "VIEW" @@ -1240,8 +1240,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.order,PROD)", "type": "VIEW" @@ -1347,8 +1347,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.ecommerce.ability,PROD)", "type": "VIEW" @@ -1533,8 +1533,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.owners,PROD)", "type": "VIEW" @@ -1732,8 +1732,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -1971,8 +1971,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.flightstats.accidents,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json index 9aa6a952c40b4..45d5d839e9d21 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": 
"urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.my_table,PROD)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" @@ -478,8 +478,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,looker_schema.include_able,PROD)", "type": "VIEW" @@ -585,8 +585,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,looker_schema.events,PROD)", "type": "VIEW" @@ -692,8 +692,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,looker_schema.events,PROD)", "type": "VIEW" @@ -844,8 +844,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -951,8 +951,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,looker_schema.include_able,PROD)", "type": "VIEW" @@ -1058,8 +1058,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.fragment_derived_view,PROD)", "type": "VIEW" @@ -1240,8 +1240,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.order,PROD)", "type": "VIEW" @@ -1347,8 +1347,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,ecommerce.ability,PROD)", "type": "VIEW" @@ -1533,8 +1533,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.owners,PROD)", "type": "VIEW" @@ -1732,8 +1732,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -1971,8 +1971,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,flightstats.accidents,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json index 
6ce6d809ae8f5..187cedaefb6b2 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json @@ -450,8 +450,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" @@ -557,8 +557,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -664,8 +664,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -816,8 +816,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -923,8 +923,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" @@ -1123,8 +1123,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD)", "type": "VIEW" @@ -1230,8 +1230,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.ecommerce.ability,PROD)", "type": "VIEW" @@ -1416,8 +1416,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.owners,PROD)", "type": "VIEW" @@ -1615,8 +1615,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -1854,8 +1854,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.flightstats.accidents,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_golden_deleted_stateful.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_golden_deleted_stateful.json index 1016d4e211458..a323118666940 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_golden_deleted_stateful.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_golden_deleted_stateful.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": 
"urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..my_table,PROD)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..owners,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json index fc91c97a53003..c2c879e38f37b 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" @@ -478,8 +478,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" @@ -585,8 +585,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -692,8 +692,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -844,8 +844,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -951,8 +951,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" @@ -1058,8 +1058,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.fragment_derived_view,PROD)", "type": "VIEW" @@ -1240,8 +1240,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD)", "type": "VIEW" @@ -1347,8 +1347,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.ecommerce.ability,PROD)", "type": "VIEW" @@ -1533,8 +1533,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.owners,PROD)", "type": "VIEW" @@ -1732,8 +1732,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -1971,8 +1971,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.flightstats.accidents,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_deny_pattern.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_deny_pattern.json index 8635a570c0621..c1ac54b0fb588 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_deny_pattern.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_deny_pattern.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" @@ -128,8 +128,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -235,8 +235,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -387,8 +387,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -494,8 +494,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json index 19168aa323142..f602ca37b3160 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - 
"time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" @@ -478,8 +478,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.looker_schema.include_able,DEV)", "type": "VIEW" @@ -585,8 +585,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.looker_schema.events,DEV)", "type": "VIEW" @@ -692,8 +692,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.looker_schema.events,DEV)", "type": "VIEW" @@ -844,8 +844,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.autodetect_sql_name_based_on_view_name,DEV)", "type": "VIEW" @@ -951,8 +951,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.looker_schema.include_able,DEV)", "type": "VIEW" @@ -1058,8 +1058,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.fragment_derived_view,DEV)", "type": "VIEW" @@ -1240,8 +1240,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.order,DEV)", "type": "VIEW" @@ -1347,8 +1347,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.ecommerce.ability,DEV)", "type": "VIEW" @@ -1533,8 +1533,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.owners,DEV)", "type": "VIEW" @@ -1732,8 +1732,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -1971,8 +1971,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.flightstats.accidents,DEV)", "type": "VIEW" diff --git 
a/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json index d4ced76a7475d..104bd365669e3 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD)", "type": "VIEW" @@ -261,8 +261,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" @@ -480,8 +480,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" @@ -588,8 +588,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -696,8 +696,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -849,8 +849,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -957,8 +957,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" @@ -1065,8 +1065,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.fragment_derived_view,PROD)", "type": "VIEW" @@ -1248,8 +1248,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD)", "type": "VIEW" @@ -1356,8 +1356,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.ecommerce.ability,PROD)", "type": "VIEW" @@ -1543,8 +1543,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.owners,PROD)", "type": "VIEW" @@ -1743,8 +1743,8 @@ 
"upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -1983,8 +1983,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.flightstats.accidents,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json b/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json index 2bae6452145df..37a6c94c6952e 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.owners,DEV)", "type": "VIEW" @@ -459,8 +459,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,rs_warehouse.default_db.default_schema.my_table,DEV)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/refinement_include_order_golden.json b/metadata-ingestion/tests/integration/lookml/refinement_include_order_golden.json index a5c316f365d4b..49831ee554ab1 100644 --- a/metadata-ingestion/tests/integration/lookml/refinement_include_order_golden.json +++ b/metadata-ingestion/tests/integration/lookml/refinement_include_order_golden.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.public.book,PROD)", "type": "VIEW" @@ -303,8 +303,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.public.book,PROD)", "type": "VIEW" @@ -410,8 +410,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.public.order,PROD)", "type": "VIEW" @@ -607,8 +607,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.public.issue_history,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json b/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json index de303d50e7acd..dc5e1aa9096f8 100644 --- a/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json +++ 
b/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..my_table,PROD)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" @@ -478,8 +478,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.include_able,PROD)", "type": "VIEW" @@ -585,8 +585,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.events,PROD)", "type": "VIEW" @@ -692,8 +692,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.events,PROD)", "type": "VIEW" @@ -844,8 +844,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -951,8 +951,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.include_able,PROD)", "type": "VIEW" @@ -1058,8 +1058,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..fragment_derived_view,PROD)", "type": "VIEW" @@ -1240,8 +1240,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..order,PROD)", "type": "VIEW" @@ -1347,8 +1347,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.ecommerce.ability,PROD)", "type": "VIEW" @@ -1533,8 +1533,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..owners,PROD)", "type": "VIEW" @@ -1764,8 +1764,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -2003,8 +2003,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.flightstats.accidents,PROD)", "type": 
"VIEW" diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json index be4ae9e047aea..67a563baa561c 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json @@ -66,6 +66,70 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.Weekly Demo Data Backup" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "job_id": "1df94c0f-15fd-4b68-8ca3-6053a0332362", + "job_name": "Weekly Demo Data Backup", + "description": "No description available.", + "date_created": "2023-03-10 16:27:54.970000", + "date_modified": "2023-03-10 16:27:55.097000", + "step_id": "1", + "step_name": "Set database to read only", + "subsystem": "TSQL", + "command": "ALTER DATABASE DemoData SET READ_ONLY" + }, + "externalUrl": "", + "name": "localhost.Weekly Demo Data Backup.localhost.Weekly Demo Data Backup", + "type": { + "string": "MSSQL_JOB_STEP" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5", @@ -1740,6 +1804,68 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.demodata.Foo.stored_procedures" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "procedure_depends_on": "{}", + "depending_on_procedure": "{}", + "code": "CREATE PROCEDURE Foo.DBs @ID INT\nAS\n SELECT @ID AS ThatDB;\n", + "input parameters": "['@ID']", + "parameter @ID": "{'type': 'int'}", + "date_created": "2023-03-10 16:27:54.907000", + "date_modified": "2023-03-10 16:27:54.907000" + }, + "externalUrl": "", + "name": "demodata.Foo.DBs", + "type": { + "string": "MSSQL_STORED_PROCEDURE" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + 
"entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.SalesReason,PROD)", @@ -3985,6 +4111,66 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:c6627af82d44de89492e1a9315ae9f4b", diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json index bc81ce9633432..ef6033dd91943 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json @@ -66,6 +66,70 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.Weekly Demo Data Backup" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "job_id": "1df94c0f-15fd-4b68-8ca3-6053a0332362", + "job_name": "Weekly Demo Data Backup", + "description": "No description available.", + "date_created": "2023-03-10 16:27:54.970000", + "date_modified": "2023-03-10 16:27:55.097000", + "step_id": "1", + "step_name": "Set database to read only", + "subsystem": "TSQL", + "command": "ALTER DATABASE DemoData SET READ_ONLY" + }, + "externalUrl": "", + "name": "localhost.Weekly Demo Data 
Backup.localhost.Weekly Demo Data Backup", + "type": { + "string": "MSSQL_JOB_STEP" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5", @@ -1740,6 +1804,68 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.demodata.Foo.stored_procedures" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "procedure_depends_on": "{}", + "depending_on_procedure": "{}", + "code": "CREATE PROCEDURE Foo.DBs @ID INT\nAS\n SELECT @ID AS ThatDB;\n", + "input parameters": "['@ID']", + "parameter @ID": "{'type': 'int'}", + "date_created": "2023-03-10 16:27:54.907000", + "date_modified": "2023-03-10 16:27:54.907000" + }, + "externalUrl": "", + "name": "demodata.Foo.DBs", + "type": { + "string": "MSSQL_STORED_PROCEDURE" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.SalesReason,PROD)", @@ -2053,6 +2179,66 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:3f157d8292fb473142f19e2250af537f", diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json index 8be2fe134dca1..8098accebb424 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json @@ -66,6 +66,70 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.Weekly Demo Data Backup" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "job_id": "1df94c0f-15fd-4b68-8ca3-6053a0332362", + "job_name": "Weekly Demo Data Backup", + "description": "No description available.", + "date_created": "2023-03-10 16:27:54.970000", + "date_modified": "2023-03-10 16:27:55.097000", + "step_id": "1", + "step_name": "Set database to read only", + "subsystem": "TSQL", + "command": "ALTER DATABASE DemoData SET READ_ONLY" + }, + "externalUrl": "", + "name": "localhost.Weekly Demo Data Backup.localhost.Weekly Demo Data Backup", + "type": { + "string": "MSSQL_JOB_STEP" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5", @@ -1740,6 +1804,68 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.demodata.Foo.stored_procedures" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "procedure_depends_on": "{}", + "depending_on_procedure": "{}", + "code": "CREATE PROCEDURE Foo.DBs @ID INT\nAS\n SELECT @ID AS ThatDB;\n", + "input parameters": "['@ID']", + "parameter @ID": "{'type': 
'int'}", + "date_created": "2023-03-10 16:27:54.907000", + "date_modified": "2023-03-10 16:27:54.907000" + }, + "externalUrl": "", + "name": "demodata.Foo.DBs", + "type": { + "string": "MSSQL_STORED_PROCEDURE" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoDataAlias.Foo.SalesReason,PROD)", @@ -2053,6 +2179,66 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:3f157d8292fb473142f19e2250af537f", diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json index ba2ab7330fded..d32002fb5648c 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json @@ -81,6 +81,70 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.Weekly Demo Data Backup" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "job_id": "b6a0c1e2-f90a-4c86-a226-bf7ca59ad79f", + "job_name": "Weekly Demo Data Backup", + 
"description": "No description available.", + "date_created": "2023-08-06 21:01:05.157000", + "date_modified": "2023-08-06 21:01:05.283000", + "step_id": "1", + "step_name": "Set database to read only", + "subsystem": "TSQL", + "command": "ALTER DATABASE DemoData SET READ_ONLY" + }, + "externalUrl": "", + "name": "localhost.Weekly Demo Data Backup.localhost.Weekly Demo Data Backup", + "type": { + "string": "MSSQL_JOB_STEP" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:f1b4c0e379c4b2e2e09a8ecd6c1b6dec", @@ -1764,6 +1828,68 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.demodata.Foo.stored_procedures" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "procedure_depends_on": "{}", + "depending_on_procedure": "{}", + "code": "CREATE PROCEDURE Foo.DBs @ID INT\nAS\n SELECT @ID AS ThatDB;\n", + "input parameters": "['@ID']", + "parameter @ID": "{'type': 'int'}", + "date_created": "2023-08-06 21:01:05.093000", + "date_modified": "2023-08-06 21:01:05.093000" + }, + "externalUrl": "", + "name": "demodata.Foo.DBs", + "type": { + "string": "MSSQL_STORED_PROCEDURE" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:a6bea84fba7b05fb5d12630c8e6306ac", @@ -2072,5 +2198,65 @@ "lastObserved": 1615443388097, "runId": "mssql-test" } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data 
Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/sql_server/setup/setup.sql b/metadata-ingestion/tests/integration/sql_server/setup/setup.sql index 612de3eb1583c..2ff46e249007a 100644 --- a/metadata-ingestion/tests/integration/sql_server/setup/setup.sql +++ b/metadata-ingestion/tests/integration/sql_server/setup/setup.sql @@ -44,6 +44,10 @@ CREATE TABLE Foo.SalesReason ) ; GO +CREATE PROCEDURE Foo.DBs @ID INT +AS + SELECT @ID AS ThatDB; +GO GO EXEC sys.sp_addextendedproperty @@ -59,5 +63,31 @@ EXEC sys.sp_addextendedproperty @value = N'Description for column LastName of table Persons of schema Foo.', @level0type = N'SCHEMA', @level0name = 'Foo', @level1type = N'TABLE', @level1name = 'Persons', -@level2type = N'COLUMN',@level2name = 'LastName'; -GO \ No newline at end of file +@level2type = N'COLUMN',@level2name = 'LastName'; +GO +USE msdb ; +GO +EXEC dbo.sp_add_job + @job_name = N'Weekly Demo Data Backup' ; +GO +EXEC sp_add_jobstep + @job_name = N'Weekly Demo Data Backup', + @step_name = N'Set database to read only', + @database_name = N'DemoData', + @subsystem = N'TSQL', + @command = N'ALTER DATABASE DemoData SET READ_ONLY', + @retry_attempts = 5, + @retry_interval = 5 ; +GO +EXEC dbo.sp_add_schedule + @schedule_name = N'RunOnce', + @freq_type = 1, + @active_start_time = 233000 ; +GO +EXEC sp_attach_schedule + @job_name = N'Weekly Demo Data Backup', + @schedule_name = N'RunOnce'; +GO +EXEC dbo.sp_add_jobserver + @job_name = N'Weekly Demo Data Backup' +GO diff --git a/metadata-ingestion/tests/integration/sql_server/test_sql_server.py b/metadata-ingestion/tests/integration/sql_server/test_sql_server.py index 3e7b75edd4878..099690fed34c2 100644 --- a/metadata-ingestion/tests/integration/sql_server/test_sql_server.py +++ b/metadata-ingestion/tests/integration/sql_server/test_sql_server.py @@ -50,4 +50,9 @@ def test_mssql_ingest(mssql_runner, pytestconfig, tmp_path, mock_time, config_fi output_path=tmp_path / "mssql_mces.json", golden_path=test_resources_dir / f"golden_files/golden_mces_{config_file.replace('yml','json')}", + ignore_paths=[ + r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['job_id'\]", + r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['date_created'\]", + r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['date_modified'\]", + ], ) diff --git a/metadata-ingestion/tests/unit/test_bigquery_source.py b/metadata-ingestion/tests/unit/test_bigquery_source.py index fc8ca166b105a..47418d9a989bb 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_source.py +++ b/metadata-ingestion/tests/unit/test_bigquery_source.py @@ -138,13 +138,12 @@ def test_get_dataplatform_instance_aspect_returns_project_id(): f"urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,{project_id})" ) - config = BigQueryV2Config.parse_obj({}) + config = BigQueryV2Config.parse_obj({"include_data_platform_instance": True}) source = BigqueryV2Source(config=config, 
ctx=PipelineContext(run_id="test")) data_platform_instance = source.get_dataplatform_instance_aspect( "urn:li:test", project_id ) - metadata = data_platform_instance.get_metadata()["metadata"] assert data_platform_instance is not None @@ -152,6 +151,20 @@ def test_get_dataplatform_instance_aspect_returns_project_id(): assert metadata.aspect.instance == expected_instance +def test_get_dataplatform_instance_default_no_instance(): + config = BigQueryV2Config.parse_obj({}) + source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) + + data_platform_instance = source.get_dataplatform_instance_aspect( + "urn:li:test", "project_id" + ) + metadata = data_platform_instance.get_metadata()["metadata"] + + assert data_platform_instance is not None + assert metadata.aspectName == "dataPlatformInstance" + assert metadata.aspect.instance is None + + @patch("google.cloud.bigquery.client.Client") def test_get_projects_with_single_project_id(client_mock): config = BigQueryV2Config.parse_obj({"project_id": "test-3"}) diff --git a/metadata-ingestion/tests/unit/test_snowflake_shares.py b/metadata-ingestion/tests/unit/test_snowflake_shares.py new file mode 100644 index 0000000000000..7de86139baf39 --- /dev/null +++ b/metadata-ingestion/tests/unit/test_snowflake_shares.py @@ -0,0 +1,348 @@ +from typing import List + +import pytest + +from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.source.snowflake.snowflake_config import ( + DatabaseId, + SnowflakeShareConfig, + SnowflakeV2Config, +) +from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report +from datahub.ingestion.source.snowflake.snowflake_schema import ( + SnowflakeDatabase, + SnowflakeSchema, +) +from datahub.ingestion.source.snowflake.snowflake_shares import SnowflakeSharesHandler +from datahub.metadata.com.linkedin.pegasus2avro.common import Siblings +from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage +from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeProposal + + +@pytest.fixture(scope="module") +def snowflake_databases() -> List[SnowflakeDatabase]: + return [ + SnowflakeDatabase( + name="db1", + created=None, + comment=None, + last_altered=None, + schemas=[ + SnowflakeSchema( + name="schema11", + created=None, + comment=None, + last_altered=None, + tables=["table111", "table112"], + views=["view111"], + ), + SnowflakeSchema( + name="schema12", + created=None, + comment=None, + last_altered=None, + tables=["table121", "table122"], + views=["view121"], + ), + ], + ), + SnowflakeDatabase( + name="db2", + created=None, + comment=None, + last_altered=None, + schemas=[ + SnowflakeSchema( + name="schema21", + created=None, + comment=None, + last_altered=None, + tables=["table211", "table212"], + views=["view211"], + ), + SnowflakeSchema( + name="schema22", + created=None, + comment=None, + last_altered=None, + tables=["table221", "table222"], + views=["view221"], + ), + ], + ), + SnowflakeDatabase( + name="db3", + created=None, + comment=None, + last_altered=None, + schemas=[ + SnowflakeSchema( + name="schema31", + created=None, + comment=None, + last_altered=None, + tables=["table311", "table312"], + views=["view311"], + ) + ], + ), + ] + + +def make_snowflake_urn(table_name, instance_name=None): + return make_dataset_urn_with_platform_instance( + "snowflake", table_name, instance_name + ) + + +def test_snowflake_shares_workunit_no_shares( + 
snowflake_databases: List[SnowflakeDatabase], +) -> None: + config = SnowflakeV2Config(account_id="abc12345", platform_instance="instance1") + + report = SnowflakeV2Report() + shares_handler = SnowflakeSharesHandler( + config, report, lambda x: make_snowflake_urn(x) + ) + + wus = list(shares_handler.get_shares_workunits(snowflake_databases)) + + assert len(wus) == 0 + + +def test_same_database_inbound_and_outbound_invalid_config() -> None: + with pytest.raises( + ValueError, + match="Same database can not be present as consumer in more than one share", + ): + SnowflakeV2Config( + account_id="abc12345", + platform_instance="instance1", + shares={ + "share1": SnowflakeShareConfig( + database="db1", + platform_instance="instance2", + consumers=[ + DatabaseId(database="db1", platform_instance="instance1") + ], + ), + "share2": SnowflakeShareConfig( + database="db1", + platform_instance="instance3", + consumers=[ + DatabaseId(database="db1", platform_instance="instance1") + ], + ), + }, + ) + + with pytest.raises( + ValueError, + match="Database included in a share can not be present as consumer in any share", + ): + SnowflakeV2Config( + account_id="abc12345", + platform_instance="instance1", + shares={ + "share1": SnowflakeShareConfig( + database="db1", + platform_instance="instance2", + consumers=[ + DatabaseId(database="db1", platform_instance="instance1") + ], + ), + "share2": SnowflakeShareConfig( + database="db1", + platform_instance="instance1", + consumers=[ + DatabaseId(database="db1", platform_instance="instance3") + ], + ), + }, + ) + + with pytest.raises( + ValueError, + match="Database included in a share can not be present as consumer in any share", + ): + SnowflakeV2Config( + account_id="abc12345", + platform_instance="instance1", + shares={ + "share2": SnowflakeShareConfig( + database="db1", + platform_instance="instance1", + consumers=[ + DatabaseId(database="db1", platform_instance="instance3") + ], + ), + "share1": SnowflakeShareConfig( + database="db1", + platform_instance="instance2", + consumers=[ + DatabaseId(database="db1", platform_instance="instance1") + ], + ), + }, + ) + + +def test_snowflake_shares_workunit_inbound_share( + snowflake_databases: List[SnowflakeDatabase], +) -> None: + config = SnowflakeV2Config( + account_id="abc12345", + platform_instance="instance1", + shares={ + "share1": SnowflakeShareConfig( + database="db1", + platform_instance="instance2", + consumers=[DatabaseId(database="db1", platform_instance="instance1")], + ) + }, + ) + + report = SnowflakeV2Report() + shares_handler = SnowflakeSharesHandler( + config, report, lambda x: make_snowflake_urn(x, "instance1") + ) + + wus = list(shares_handler.get_shares_workunits(snowflake_databases)) + + # 2 schemas - 2 tables and 1 view in each schema making total 6 datasets + # Hence 6 Sibling and 6 upstreamLineage aspects + assert len(wus) == 12 + upstream_lineage_aspect_entity_urns = set() + sibling_aspect_entity_urns = set() + + for wu in wus: + assert isinstance( + wu.metadata, (MetadataChangeProposal, MetadataChangeProposalWrapper) + ) + if wu.metadata.aspectName == "upstreamLineage": + upstream_aspect = wu.get_aspect_of_type(UpstreamLineage) + assert upstream_aspect is not None + assert len(upstream_aspect.upstreams) == 1 + assert upstream_aspect.upstreams[0].dataset == wu.get_urn().replace( + "instance1.db1", "instance2.db1" + ) + upstream_lineage_aspect_entity_urns.add(wu.get_urn()) + else: + siblings_aspect = wu.get_aspect_of_type(Siblings) + assert siblings_aspect is not None + assert 
len(siblings_aspect.siblings) == 1 + assert siblings_aspect.siblings == [ + wu.get_urn().replace("instance1.db1", "instance2.db1") + ] + sibling_aspect_entity_urns.add(wu.get_urn()) + + assert upstream_lineage_aspect_entity_urns == sibling_aspect_entity_urns + + +def test_snowflake_shares_workunit_outbound_share( + snowflake_databases: List[SnowflakeDatabase], +) -> None: + config = SnowflakeV2Config( + account_id="abc12345", + platform_instance="instance1", + shares={ + "share2": SnowflakeShareConfig( + database="db2", + platform_instance="instance1", + consumers=[ + DatabaseId( + database="db2_from_share", platform_instance="instance2" + ), + DatabaseId(database="db2", platform_instance="instance3"), + ], + ) + }, + ) + + report = SnowflakeV2Report() + shares_handler = SnowflakeSharesHandler( + config, report, lambda x: make_snowflake_urn(x, "instance1") + ) + + wus = list(shares_handler.get_shares_workunits(snowflake_databases)) + + # 2 schemas - 2 tables and 1 view in each schema making total 6 datasets + # Hence 6 Sibling aspects + assert len(wus) == 6 + entity_urns = set() + + for wu in wus: + siblings_aspect = wu.get_aspect_of_type(Siblings) + assert siblings_aspect is not None + assert len(siblings_aspect.siblings) == 2 + assert siblings_aspect.siblings == [ + wu.get_urn().replace("instance1.db2", "instance2.db2_from_share"), + wu.get_urn().replace("instance1.db2", "instance3.db2"), + ] + entity_urns.add(wu.get_urn()) + + assert len((entity_urns)) == 6 + + +def test_snowflake_shares_workunit_inbound_and_outbound_share( + snowflake_databases: List[SnowflakeDatabase], +) -> None: + config = SnowflakeV2Config( + account_id="abc12345", + platform_instance="instance1", + shares={ + "share1": SnowflakeShareConfig( + database="db1", + platform_instance="instance2", + consumers=[DatabaseId(database="db1", platform_instance="instance1")], + ), + "share2": SnowflakeShareConfig( + database="db2", + platform_instance="instance1", + consumers=[ + DatabaseId( + database="db2_from_share", platform_instance="instance2" + ), + DatabaseId(database="db2", platform_instance="instance3"), + ], + ), + }, + ) + + report = SnowflakeV2Report() + shares_handler = SnowflakeSharesHandler( + config, report, lambda x: make_snowflake_urn(x, "instance1") + ) + + wus = list(shares_handler.get_shares_workunits(snowflake_databases)) + + # 6 Sibling and 6 upstreamLineage aspects for db1 tables + # 6 Sibling aspects for db2 tables + assert len(wus) == 18 + + for wu in wus: + assert isinstance( + wu.metadata, (MetadataChangeProposal, MetadataChangeProposalWrapper) + ) + if wu.metadata.aspectName == "upstreamLineage": + upstream_aspect = wu.get_aspect_of_type(UpstreamLineage) + assert upstream_aspect is not None + assert len(upstream_aspect.upstreams) == 1 + assert upstream_aspect.upstreams[0].dataset == wu.get_urn().replace( + "instance1.db1", "instance2.db1" + ) + else: + siblings_aspect = wu.get_aspect_of_type(Siblings) + assert siblings_aspect is not None + if "db1" in wu.get_urn(): + assert len(siblings_aspect.siblings) == 1 + assert siblings_aspect.siblings == [ + wu.get_urn().replace("instance1.db1", "instance2.db1") + ] + else: + assert len(siblings_aspect.siblings) == 2 + assert siblings_aspect.siblings == [ + wu.get_urn().replace("instance1.db2", "instance2.db2_from_share"), + wu.get_urn().replace("instance1.db2", "instance3.db2"), + ] diff --git a/metadata-integration/java/datahub-client/build.gradle b/metadata-integration/java/datahub-client/build.gradle index 025273fc9263e..82273427974af 100644 --- 
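For readers skimming the new Snowflake share tests above, the sketch below is an editorial illustration (not part of the patch) of the `shares` configuration shape those tests exercise for the inbound case; the account id, instance names, and database names are the same placeholder values used in the fixtures.

```python
# Illustrative only: mirrors the inbound-share scenario tested above, where
# instance2.db1 is the producer database and instance1.db1 is the local
# consumer copy created from the share.
from datahub.ingestion.source.snowflake.snowflake_config import (
    DatabaseId,
    SnowflakeShareConfig,
    SnowflakeV2Config,
)

config = SnowflakeV2Config(
    account_id="abc12345",          # placeholder account
    platform_instance="instance1",  # the instance being ingested
    shares={
        "share1": SnowflakeShareConfig(
            database="db1",
            platform_instance="instance2",  # where the shared database really lives
            consumers=[DatabaseId(database="db1", platform_instance="instance1")],
        )
    },
)

# For every table and view in db1, SnowflakeSharesHandler emits a Siblings
# aspect plus an upstreamLineage aspect pointing at the matching
# instance2.db1 dataset, which is exactly what the assertions above check.
```

In an ingestion recipe the same structure would be expressed under the Snowflake source's `shares` option, keyed by share name.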
a/metadata-integration/java/datahub-client/build.gradle +++ b/metadata-integration/java/datahub-client/build.gradle @@ -235,3 +235,7 @@ sourceSets.main.java.srcDir "${generateOpenApiPojos.outputDir}/src/main/java" sourceSets.main.resources.srcDir "${generateOpenApiPojos.outputDir}/src/main/resources" checkstyleMain.exclude '**/generated/**' + +clean { + project.delete("$projectDir/generated") +} \ No newline at end of file diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java index bd1e6037ec0c5..5973f77da28aa 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java @@ -28,6 +28,8 @@ import com.linkedin.metadata.search.SearchEntityArray; import com.linkedin.metadata.search.SearchResult; import com.linkedin.metadata.search.SearchResultMetadata; +import com.linkedin.metadata.search.SearchSuggestion; +import com.linkedin.metadata.search.SearchSuggestionArray; import com.linkedin.metadata.search.features.Features; import com.linkedin.metadata.search.utils.ESUtils; import com.linkedin.metadata.utils.SearchUtil; @@ -68,7 +70,9 @@ import org.elasticsearch.search.builder.SearchSourceBuilder; import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder; import org.elasticsearch.search.fetch.subphase.highlight.HighlightField; +import org.elasticsearch.search.suggest.term.TermSuggestion; +import static com.linkedin.metadata.search.utils.ESUtils.NAME_SUGGESTION; import static com.linkedin.metadata.search.utils.ESUtils.toFacetField; import static com.linkedin.metadata.search.utils.SearchUtils.applyDefaultSearchFlags; import static com.linkedin.metadata.utils.SearchUtil.*; @@ -199,6 +203,11 @@ public SearchRequest getSearchRequest(@Nonnull String input, @Nullable Filter fi searchSourceBuilder.highlighter(_highlights); } ESUtils.buildSortOrder(searchSourceBuilder, sortCriterion); + + if (finalSearchFlags.isGetSuggestions()) { + ESUtils.buildNameSuggestions(searchSourceBuilder, input); + } + searchRequest.source(searchSourceBuilder); log.debug("Search request is: " + searchRequest.toString()); @@ -471,6 +480,9 @@ private SearchResultMetadata extractSearchResultMetadata(@Nonnull SearchResponse final List aggregationMetadataList = extractAggregationMetadata(searchResponse, filter); searchResultMetadata.setAggregations(new AggregationMetadataArray(aggregationMetadataList)); + final List searchSuggestions = extractSearchSuggestions(searchResponse); + searchResultMetadata.setSuggestions(new SearchSuggestionArray(searchSuggestions)); + return searchResultMetadata; } @@ -517,6 +529,23 @@ public static Map extractTermAggregations(@Nonnull SearchResponse return extractTermAggregations((ParsedTerms) aggregation, aggregationName.equals("_entityType")); } + private List extractSearchSuggestions(@Nonnull SearchResponse searchResponse) { + final List searchSuggestions = new ArrayList<>(); + if (searchResponse.getSuggest() != null) { + TermSuggestion termSuggestion = searchResponse.getSuggest().getSuggestion(NAME_SUGGESTION); + if (termSuggestion != null && termSuggestion.getEntries().size() > 0) { + termSuggestion.getEntries().get(0).getOptions().forEach(suggestOption -> { + SearchSuggestion searchSuggestion = new SearchSuggestion(); + 
searchSuggestion.setText(String.valueOf(suggestOption.getText())); + searchSuggestion.setFrequency(suggestOption.getFreq()); + searchSuggestion.setScore(suggestOption.getScore()); + searchSuggestions.add(searchSuggestion); + }); + } + } + return searchSuggestions; + } + /** * Adds nested sub-aggregation values to the aggregated results * @param aggs The aggregations to traverse. Could be null (base case) diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java index 5179f2be6d060..741eb5568d2ea 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java @@ -27,6 +27,10 @@ import org.elasticsearch.search.sort.FieldSortBuilder; import org.elasticsearch.search.sort.ScoreSortBuilder; import org.elasticsearch.search.sort.SortOrder; +import org.elasticsearch.search.suggest.SuggestBuilder; +import org.elasticsearch.search.suggest.SuggestBuilders; +import org.elasticsearch.search.suggest.SuggestionBuilder; +import org.elasticsearch.search.suggest.term.TermSuggestionBuilder; import static com.linkedin.metadata.search.elasticsearch.query.request.SearchFieldConfig.KEYWORD_FIELDS; import static com.linkedin.metadata.search.elasticsearch.query.request.SearchFieldConfig.PATH_HIERARCHY_FIELDS; @@ -46,6 +50,8 @@ public class ESUtils { public static final String OPAQUE_ID_HEADER = "X-Opaque-Id"; public static final String HEADER_VALUE_DELIMITER = "|"; public static final String KEYWORD_TYPE = "keyword"; + public static final String ENTITY_NAME_FIELD = "_entityName"; + public static final String NAME_SUGGESTION = "nameSuggestion"; // we use this to make sure we filter for editable & non-editable fields. Also expands out top-level properties // to field level properties @@ -197,6 +203,17 @@ public static void buildSortOrder(@Nonnull SearchSourceBuilder searchSourceBuild } } + /** + * Populates source field of search query with the suggestions query so that we get search suggestions back. + * Right now we are only supporting suggestions based on the virtual _entityName field alias. + */ + public static void buildNameSuggestions(@Nonnull SearchSourceBuilder searchSourceBuilder, @Nullable String textInput) { + SuggestionBuilder builder = SuggestBuilders.termSuggestion(ENTITY_NAME_FIELD).text(textInput); + SuggestBuilder suggestBuilder = new SuggestBuilder(); + suggestBuilder.addSuggestion(NAME_SUGGESTION, builder); + searchSourceBuilder.suggest(suggestBuilder); + } + /** * Escapes the Elasticsearch reserved characters in the given input string. * diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/SearchUtils.java b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/SearchUtils.java index 35a322d37b2fd..8b56ae0beb3f1 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/SearchUtils.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/SearchUtils.java @@ -78,7 +78,7 @@ public static Map getRequestMap(@Nullable Filter requestParams) return criterionArray.stream().collect(Collectors.toMap(Criterion::getField, Criterion::getValue)); } - static boolean isUrn(@Nonnull String value) { + public static boolean isUrn(@Nonnull String value) { // TODO(https://github.com/datahub-project/datahub-gma/issues/51): This method is a bit of a hack to support searching for // URNs that have commas in them, while also using commas a delimiter for search. 
We should stop supporting commas // as delimiter, and then we can stop using this hack. diff --git a/metadata-io/src/test/java/com/linkedin/metadata/ESSampleDataFixture.java b/metadata-io/src/test/java/com/linkedin/metadata/ESSampleDataFixture.java index 20501225ef787..ef9992db1fb25 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/ESSampleDataFixture.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/ESSampleDataFixture.java @@ -46,6 +46,7 @@ import java.util.Map; import static com.linkedin.metadata.Constants.*; +import static com.linkedin.metadata.ESTestConfiguration.REFRESH_INTERVAL_SECONDS; import static org.mockito.ArgumentMatchers.anySet; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -225,6 +226,7 @@ public SearchService searchServiceHelper( .bulkProcessor(_bulkProcessor) .fixtureName(fixtureName) .targetIndexPrefix(prefix) + .refreshIntervalSeconds(REFRESH_INTERVAL_SECONDS) .build() .read(); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/ESSearchLineageFixture.java b/metadata-io/src/test/java/com/linkedin/metadata/ESSearchLineageFixture.java index ee3be08d82a1f..ade7435bf6652 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/ESSearchLineageFixture.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/ESSearchLineageFixture.java @@ -48,6 +48,7 @@ import java.util.Map; import static com.linkedin.metadata.Constants.*; +import static com.linkedin.metadata.ESTestConfiguration.REFRESH_INTERVAL_SECONDS; @TestConfiguration @@ -154,6 +155,7 @@ protected LineageSearchService lineageSearchService( .bulkProcessor(_bulkProcessor) .fixtureName(fixtureName) .targetIndexPrefix(prefix) + .refreshIntervalSeconds(REFRESH_INTERVAL_SECONDS) .build() .read(); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java b/metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java index 673474c96cc51..327447341badf 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java @@ -36,7 +36,7 @@ @TestConfiguration public class ESTestConfiguration { private static final int HTTP_PORT = 9200; - private static final int REFRESH_INTERVAL_SECONDS = 5; + public static final int REFRESH_INTERVAL_SECONDS = 5; public static void syncAfterWrite(ESBulkProcessor bulkProcessor) throws InterruptedException { bulkProcessor.flush(); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/ESTestFixtureUtils.java b/metadata-io/src/test/java/com/linkedin/metadata/ESTestFixtureUtils.java index 1f0b7b24397ca..914c5be9f5b09 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/ESTestFixtureUtils.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/ESTestFixtureUtils.java @@ -15,6 +15,7 @@ import java.io.IOException; import java.util.Set; +import static com.linkedin.metadata.ESTestConfiguration.REFRESH_INTERVAL_SECONDS; import static com.linkedin.metadata.ESTestUtils.environmentRestClientBuilder; @TestConfiguration @@ -111,6 +112,7 @@ private void reindexTestFixtureData() throws IOException { FixtureReader reader = FixtureReader.builder() .bulkProcessor(bulkProcessor) .fixtureName("long_tail") + .refreshIntervalSeconds(REFRESH_INTERVAL_SECONDS) .build(); reader.read(); diff --git a/metadata-io/src/test/java/io/datahub/test/fixtures/elasticsearch/FixtureReader.java b/metadata-io/src/test/java/io/datahub/test/fixtures/elasticsearch/FixtureReader.java index 2b37d86f058db..a0c551b28b507 100644 
--- a/metadata-io/src/test/java/io/datahub/test/fixtures/elasticsearch/FixtureReader.java +++ b/metadata-io/src/test/java/io/datahub/test/fixtures/elasticsearch/FixtureReader.java @@ -36,6 +36,8 @@ public class FixtureReader { @Builder.Default private String targetIndexPrefix = ""; + private long refreshIntervalSeconds; + public Set read() throws IOException { try (Stream files = Files.list(Paths.get(String.format("%s/%s", inputBase, fixtureName)))) { return files.map(file -> { @@ -64,7 +66,7 @@ public Set read() throws IOException { } finally { bulkProcessor.flush(); try { - Thread.sleep(1000); + Thread.sleep(1000 * refreshIntervalSeconds); } catch (InterruptedException ignored) { } } diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/query/SearchFlags.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/query/SearchFlags.pdl index 05a94b8fabc4b..be1a30c7f082c 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/query/SearchFlags.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/query/SearchFlags.pdl @@ -28,4 +28,9 @@ record SearchFlags { * Whether to skip aggregates/facets */ skipAggregates:optional boolean = false + + /** + * Whether to request for search suggestions on the _entityName virtualized field + */ + getSuggestions:optional boolean = false } diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/SearchResultMetadata.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/SearchResultMetadata.pdl index 718d80ba4cb36..60f1b568f586a 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/SearchResultMetadata.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/SearchResultMetadata.pdl @@ -12,4 +12,9 @@ record SearchResultMetadata { */ aggregations: array[AggregationMetadata] = [] + /** + * A list of search query suggestions based on the given query + */ + suggestions: array[SearchSuggestion] = [] + } \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/SearchSuggestion.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/SearchSuggestion.pdl new file mode 100644 index 0000000000000..7776ec54fe03e --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/SearchSuggestion.pdl @@ -0,0 +1,24 @@ +namespace com.linkedin.metadata.search + +/** + * The model for the search result + */ +record SearchSuggestion { + + /** + * The suggestion text for this search query + */ + text: string + + /** + * The score for how close this suggestion is to the original search query. + * The closer to 1 means it is closer to the original query and 0 is further away. 
+ */ + score: float + + /** + * How many matches there are with the suggested text for the given field + */ + frequency: long + +} \ No newline at end of file diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java index 690528059b555..f653ccf72cf54 100644 --- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java @@ -250,11 +250,11 @@ private void addPoliciesToCache(final Map> cache private void addPolicyToCache(final Map> cache, final DataHubPolicyInfo policy) { final List privileges = policy.getPrivileges(); for (String privilege : privileges) { - List existingPolicies = cache.getOrDefault(privilege, new ArrayList<>()); + List existingPolicies = cache.containsKey(privilege) ? new ArrayList<>(cache.get(privilege)) : new ArrayList<>(); existingPolicies.add(policy); cache.put(privilege, existingPolicies); } - List existingPolicies = cache.getOrDefault(ALL, new ArrayList<>()); + List existingPolicies = cache.containsKey(ALL) ? new ArrayList<>(cache.get(ALL)) : new ArrayList<>(); existingPolicies.add(policy); cache.put(ALL, existingPolicies); } diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/SearchResultVisualConfig.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/SearchResultVisualConfig.java new file mode 100644 index 0000000000000..7094bbd710f75 --- /dev/null +++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/SearchResultVisualConfig.java @@ -0,0 +1,11 @@ +package com.linkedin.metadata.config; + +import lombok.Data; + +@Data +public class SearchResultVisualConfig { + /** + * Whether to enable visual highlighting on search result names/descriptions. Defaults to true if unset. + */ + public Boolean enableNameHighlight; +} diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/VisualConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/VisualConfiguration.java index d1c357186e1ae..14ac2406c2256 100644 --- a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/VisualConfiguration.java +++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/VisualConfiguration.java @@ -22,4 +22,9 @@ public class VisualConfiguration { * Queries tab related configurations */ public EntityProfileConfig entityProfile; + + /** + * Search result related configurations + */ + public SearchResultVisualConfig searchResult; } diff --git a/metadata-service/configuration/src/main/resources/application.yml b/metadata-service/configuration/src/main/resources/application.yml index 82cf9e8fdc8a7..d21442d0bf5c8 100644 --- a/metadata-service/configuration/src/main/resources/application.yml +++ b/metadata-service/configuration/src/main/resources/application.yml @@ -111,6 +111,8 @@ visualConfig: entityProfile: # we only support default tab for domains right now. In order to implement for other entities, update React code domainDefaultTab: ${DOMAIN_DEFAULT_TAB:} # set to DOCUMENTATION_TAB to show documentation tab first + searchResult: + enableNameHighlight: ${SEARCH_RESULT_NAME_HIGHLIGHT_ENABLED:true} # Enables visual highlighting on search result names/descriptions.
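To make the suggestion plumbing above easier to follow, here is a rough sketch (editorial, not part of the patch) of the suggest clause that ESUtils.buildNameSuggestions adds to the query body when the new getSuggestions search flag is set, and of the values that extractSearchSuggestions then copies into SearchResultMetadata. The field and suggestion names come from the constants introduced above; the query text and numbers are invented for illustration.

```python
# Approximate Elasticsearch request fragment produced by buildNameSuggestions:
# a term suggester on the _entityName field alias (ENTITY_NAME_FIELD),
# registered under the "nameSuggestion" key (NAME_SUGGESTION).
suggest_clause = {
    "suggest": {
        "nameSuggestion": {
            "text": "datasett",                 # hypothetical misspelled query
            "term": {"field": "_entityName"},
        }
    }
}

# Each option Elasticsearch returns becomes one SearchSuggestion record with
# text (the suggested term), frequency (how many documents match it), and
# score (closeness to the original query, 1 being closest).
example_suggestions = [
    {"text": "dataset", "frequency": 42, "score": 0.857},  # illustrative values
]
```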
# Storage Layer diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json index af6f8aaac84a8..a883625665a96 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json @@ -5728,6 +5728,12 @@ "doc" : "Whether to skip aggregates/facets", "default" : false, "optional" : true + }, { + "name" : "getSuggestions", + "type" : "boolean", + "doc" : "Whether to request for search suggestions on the _entityName virtualized field", + "default" : false, + "optional" : true } ] }, { "type" : "enum", @@ -6099,6 +6105,31 @@ }, "doc" : "A list of search result metadata such as aggregations", "default" : [ ] + }, { + "name" : "suggestions", + "type" : { + "type" : "array", + "items" : { + "type" : "record", + "name" : "SearchSuggestion", + "doc" : "The model for the search result", + "fields" : [ { + "name" : "text", + "type" : "string", + "doc" : "The suggestion text for this search query" + }, { + "name" : "score", + "type" : "float", + "doc" : "The score for how close this suggestion is to the original search query.\nThe closer to 1 means it is closer to the original query and 0 is further away." + }, { + "name" : "frequency", + "type" : "long", + "doc" : "How many matches there are with the suggested text for the given field" + } ] + } + }, + "doc" : "A list of search query suggestions based on the given query", + "default" : [ ] } ] }, "doc" : "Metadata specific to the browse result of the queried path" @@ -6205,7 +6236,7 @@ "type" : "int", "doc" : "The total number of entities directly under searched path" } ] - }, "com.linkedin.metadata.search.SearchResultMetadata", "com.linkedin.metadata.snapshot.ChartSnapshot", "com.linkedin.metadata.snapshot.CorpGroupSnapshot", "com.linkedin.metadata.snapshot.CorpUserSnapshot", "com.linkedin.metadata.snapshot.DashboardSnapshot", "com.linkedin.metadata.snapshot.DataFlowSnapshot", "com.linkedin.metadata.snapshot.DataHubPolicySnapshot", "com.linkedin.metadata.snapshot.DataHubRetentionSnapshot", "com.linkedin.metadata.snapshot.DataJobSnapshot", "com.linkedin.metadata.snapshot.DataPlatformSnapshot", "com.linkedin.metadata.snapshot.DataProcessSnapshot", "com.linkedin.metadata.snapshot.DatasetSnapshot", "com.linkedin.metadata.snapshot.GlossaryNodeSnapshot", "com.linkedin.metadata.snapshot.GlossaryTermSnapshot", "com.linkedin.metadata.snapshot.MLFeatureSnapshot", "com.linkedin.metadata.snapshot.MLFeatureTableSnapshot", "com.linkedin.metadata.snapshot.MLModelDeploymentSnapshot", "com.linkedin.metadata.snapshot.MLModelGroupSnapshot", "com.linkedin.metadata.snapshot.MLModelSnapshot", "com.linkedin.metadata.snapshot.MLPrimaryKeySnapshot", "com.linkedin.metadata.snapshot.SchemaFieldSnapshot", "com.linkedin.metadata.snapshot.Snapshot", "com.linkedin.metadata.snapshot.TagSnapshot", "com.linkedin.ml.metadata.BaseData", "com.linkedin.ml.metadata.CaveatDetails", "com.linkedin.ml.metadata.CaveatsAndRecommendations", "com.linkedin.ml.metadata.DeploymentStatus", "com.linkedin.ml.metadata.EthicalConsiderations", "com.linkedin.ml.metadata.EvaluationData", "com.linkedin.ml.metadata.HyperParameterValueType", "com.linkedin.ml.metadata.IntendedUse", "com.linkedin.ml.metadata.IntendedUserType", "com.linkedin.ml.metadata.MLFeatureProperties", "com.linkedin.ml.metadata.MLFeatureTableProperties", 
"com.linkedin.ml.metadata.MLHyperParam", "com.linkedin.ml.metadata.MLMetric", "com.linkedin.ml.metadata.MLModelDeploymentProperties", "com.linkedin.ml.metadata.MLModelFactorPrompts", "com.linkedin.ml.metadata.MLModelFactors", "com.linkedin.ml.metadata.MLModelGroupProperties", "com.linkedin.ml.metadata.MLModelProperties", "com.linkedin.ml.metadata.MLPrimaryKeyProperties", "com.linkedin.ml.metadata.Metrics", "com.linkedin.ml.metadata.QuantitativeAnalyses", "com.linkedin.ml.metadata.ResultsType", "com.linkedin.ml.metadata.SourceCode", "com.linkedin.ml.metadata.SourceCodeUrl", "com.linkedin.ml.metadata.SourceCodeUrlType", "com.linkedin.ml.metadata.TrainingData", { + }, "com.linkedin.metadata.search.SearchResultMetadata", "com.linkedin.metadata.search.SearchSuggestion", "com.linkedin.metadata.snapshot.ChartSnapshot", "com.linkedin.metadata.snapshot.CorpGroupSnapshot", "com.linkedin.metadata.snapshot.CorpUserSnapshot", "com.linkedin.metadata.snapshot.DashboardSnapshot", "com.linkedin.metadata.snapshot.DataFlowSnapshot", "com.linkedin.metadata.snapshot.DataHubPolicySnapshot", "com.linkedin.metadata.snapshot.DataHubRetentionSnapshot", "com.linkedin.metadata.snapshot.DataJobSnapshot", "com.linkedin.metadata.snapshot.DataPlatformSnapshot", "com.linkedin.metadata.snapshot.DataProcessSnapshot", "com.linkedin.metadata.snapshot.DatasetSnapshot", "com.linkedin.metadata.snapshot.GlossaryNodeSnapshot", "com.linkedin.metadata.snapshot.GlossaryTermSnapshot", "com.linkedin.metadata.snapshot.MLFeatureSnapshot", "com.linkedin.metadata.snapshot.MLFeatureTableSnapshot", "com.linkedin.metadata.snapshot.MLModelDeploymentSnapshot", "com.linkedin.metadata.snapshot.MLModelGroupSnapshot", "com.linkedin.metadata.snapshot.MLModelSnapshot", "com.linkedin.metadata.snapshot.MLPrimaryKeySnapshot", "com.linkedin.metadata.snapshot.SchemaFieldSnapshot", "com.linkedin.metadata.snapshot.Snapshot", "com.linkedin.metadata.snapshot.TagSnapshot", "com.linkedin.ml.metadata.BaseData", "com.linkedin.ml.metadata.CaveatDetails", "com.linkedin.ml.metadata.CaveatsAndRecommendations", "com.linkedin.ml.metadata.DeploymentStatus", "com.linkedin.ml.metadata.EthicalConsiderations", "com.linkedin.ml.metadata.EvaluationData", "com.linkedin.ml.metadata.HyperParameterValueType", "com.linkedin.ml.metadata.IntendedUse", "com.linkedin.ml.metadata.IntendedUserType", "com.linkedin.ml.metadata.MLFeatureProperties", "com.linkedin.ml.metadata.MLFeatureTableProperties", "com.linkedin.ml.metadata.MLHyperParam", "com.linkedin.ml.metadata.MLMetric", "com.linkedin.ml.metadata.MLModelDeploymentProperties", "com.linkedin.ml.metadata.MLModelFactorPrompts", "com.linkedin.ml.metadata.MLModelFactors", "com.linkedin.ml.metadata.MLModelGroupProperties", "com.linkedin.ml.metadata.MLModelProperties", "com.linkedin.ml.metadata.MLPrimaryKeyProperties", "com.linkedin.ml.metadata.Metrics", "com.linkedin.ml.metadata.QuantitativeAnalyses", "com.linkedin.ml.metadata.ResultsType", "com.linkedin.ml.metadata.SourceCode", "com.linkedin.ml.metadata.SourceCodeUrl", "com.linkedin.ml.metadata.SourceCodeUrlType", "com.linkedin.ml.metadata.TrainingData", { "type" : "record", "name" : "SystemMetadata", "namespace" : "com.linkedin.mxe", diff --git a/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java b/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java index d7237da73d287..1afdf520cd6ac 100644 --- a/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java +++ 
b/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java @@ -198,8 +198,8 @@ public class PoliciesConfig { public static final Privilege EDIT_ENTITY_PRIVILEGE = Privilege.of( "EDIT_ENTITY", - "Edit All", - "The ability to edit any information about an entity. Super user privileges."); + "Edit Entity", + "The ability to edit any information about an entity. Super user privileges for the entity."); public static final Privilege DELETE_ENTITY_PRIVILEGE = Privilege.of( "DELETE_ENTITY", diff --git a/perf-test/README.md b/perf-test/README.md index 24fb064d3e28a..191833361eae9 100644 --- a/perf-test/README.md +++ b/perf-test/README.md @@ -58,7 +58,9 @@ locust -f perf-test/locustfiles/ingest.py This will set up the web interface in http://localhost:8089 (unless the port is already taken). Once you click into it, you should see the following -![Locust Example](../docs/imgs/locust-example.png) +

+ [Locust web UI example screenshot]
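As background for the walkthrough that follows, a locustfile is just a Python module defining user classes and their tasks. The skeleton below is an editorial illustration of that shape only; it is not the contents of the real perf-test/locustfiles/ingest.py, and the /config request is a placeholder.

```python
# Minimal Locust skeleton (illustrative only; the real ingest.py defines the
# actual ingestion tasks run against GMS).
from locust import HttpUser, between, task


class GmsUser(HttpUser):
    # Each simulated user waits 1-5 seconds between tasks.
    wait_time = between(1, 5)

    @task
    def read_config(self):
        # Placeholder request; the "Host" you point Locust at (for example
        # http://localhost:8080 for a local GMS) is prepended automatically.
        self.client.get("/config")
```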

Input the number of users you would like to spawn and the spawn rate. Point the host to the deployed DataHub GMS ( locally, it should be http://localhost:8080). Click on the "Start swarming" button to start the load test. diff --git a/smoke-test/tests/cypress/cypress/e2e/login/login.js b/smoke-test/tests/cypress/cypress/e2e/login/login.js index 74d04aa56d0d0..f86741b5afe01 100644 --- a/smoke-test/tests/cypress/cypress/e2e/login/login.js +++ b/smoke-test/tests/cypress/cypress/e2e/login/login.js @@ -4,6 +4,6 @@ describe('login', () => { cy.get('input[data-testid=username]').type(Cypress.env('ADMIN_USERNAME')); cy.get('input[data-testid=password]').type(Cypress.env('ADMIN_PASSWORD')); cy.contains('Sign In').click(); - cy.contains('Welcome back, Data Hub'); + cy.contains('Welcome back, DataHub'); }); }) diff --git a/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js b/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js index 1f40cdf602062..e4e5a39ce1100 100644 --- a/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js +++ b/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js @@ -4,68 +4,94 @@ const wrong_url = "https://www.linkedincom"; const correct_url = "https://www.linkedin.com"; describe("edit documentation and link to dataset", () => { + it("open test dataset page, edit documentation", () => { + //edit documentation and verify changes saved + cy.loginWithCredentials(); + cy.visit( + "/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)/Schema" + ); + cy.get("[role='tab']").contains("Documentation").click(); + cy.waitTextVisible("my hive dataset"); + cy.waitTextVisible("Sample doc"); + cy.clickOptionWithText("Edit"); + cy.focused().clear(); + cy.focused().type(documentation_edited); + cy.get("button").contains("Save").click(); + cy.waitTextVisible("Description Updated"); + cy.waitTextVisible(documentation_edited); + //return documentation to original state + cy.clickOptionWithText("Edit"); + cy.focused().clear().wait(1000); + cy.focused().type("my hive dataset"); + cy.get("button").contains("Save").click(); + cy.waitTextVisible("Description Updated"); + cy.waitTextVisible("my hive dataset"); + }); - it("open test dataset page, edit documentation", () => { - //edit documentation and verify changes saved - cy.loginWithCredentials(); - cy.visit("/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)/Schema"); - cy.get("[role='tab']").contains("Documentation").click(); - cy.waitTextVisible("my hive dataset"); - cy.waitTextVisible("Sample doc"); - cy.clickOptionWithText("Edit"); - cy.focused().clear(); - cy.focused().type(documentation_edited); - cy.get("button").contains("Save").click(); - cy.waitTextVisible("Description Updated"); - cy.waitTextVisible(documentation_edited); - //return documentation to original state - cy.clickOptionWithText("Edit"); - cy.focused().clear().wait(1000); - cy.focused().type("my hive dataset"); - cy.get("button").contains("Save").click(); - cy.waitTextVisible("Description Updated"); - cy.waitTextVisible("my hive dataset"); - }); + it("open test dataset page, remove and add dataset link", () => { + cy.loginWithCredentials(); + cy.visit( + "/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)/Schema" + ); + cy.get("[role='tab']").contains("Documentation").click(); + cy.contains("Sample doc").trigger("mouseover", { force: true }); + cy.get('[data-icon="delete"]').click(); + cy.waitTextVisible("Link Removed"); + 
cy.get("button").contains("Add Link").click(); + cy.get("#addLinkForm_url").type(wrong_url); + cy.waitTextVisible("This field must be a valid url."); + cy.focused().clear(); + cy.waitTextVisible("A URL is required."); + cy.focused().type(correct_url); + cy.ensureTextNotPresent("This field must be a valid url."); + cy.get("#addLinkForm_label").type("Sample doc"); + cy.get('[role="dialog"] button').contains("Add").click(); + cy.waitTextVisible("Link Added"); + cy.get("[role='tab']").contains("Documentation").click(); + cy.get(`[href='${correct_url}']`).should("be.visible"); + }); - it("open test dataset page, remove and add dataset link", () => { - cy.loginWithCredentials(); - cy.visit("/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)/Schema"); - cy.get("[role='tab']").contains("Documentation").click(); - cy.contains("Sample doc").trigger("mouseover", { force: true }); - cy.get('[data-icon="delete"]').click(); - cy.waitTextVisible("Link Removed"); - cy.get("button").contains("Add Link").click(); - cy.get("#addLinkForm_url").type(wrong_url); - cy.waitTextVisible("This field must be a valid url."); - cy.focused().clear(); - cy.waitTextVisible("A URL is required."); - cy.focused().type(correct_url); - cy.ensureTextNotPresent("This field must be a valid url."); - cy.get("#addLinkForm_label").type("Sample doc"); - cy.get('[role="dialog"] button').contains("Add").click(); - cy.waitTextVisible("Link Added"); - cy.get("[role='tab']").contains("Documentation").click(); - cy.get(`[href='${correct_url}']`).should("be.visible"); - }); + it("open test domain page, remove and add dataset link", () => { + cy.loginWithCredentials(); + cy.visit("/domain/urn:li:domain:marketing/Entities"); + cy.get("[role='tab']").contains("Documentation").click(); + cy.get("button").contains("Add Link").click(); + cy.get("#addLinkForm_url").type(wrong_url); + cy.waitTextVisible("This field must be a valid url."); + cy.focused().clear(); + cy.waitTextVisible("A URL is required."); + cy.focused().type(correct_url); + cy.ensureTextNotPresent("This field must be a valid url."); + cy.get("#addLinkForm_label").type("Sample doc"); + cy.get('[role="dialog"] button').contains("Add").click(); + cy.waitTextVisible("Link Added"); + cy.get("[role='tab']").contains("Documentation").click(); + cy.get(`[href='${correct_url}']`).should("be.visible"); + cy.contains("Sample doc").trigger("mouseover", { force: true }); + cy.get('[data-icon="delete"]').click(); + cy.waitTextVisible("Link Removed"); + }); - it("edit field documentation", () => { - cy.loginWithCredentials(); - cy.visit("/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)/Schema"); - cy.get("tbody [data-icon='edit']").first().click({ force: true }); - cy.waitTextVisible("Update description"); - cy.waitTextVisible("Foo field description has changed"); - cy.focused().clear().wait(1000); - cy.focused().type(documentation_edited); - cy.get("button").contains("Update").click(); - cy.waitTextVisible("Updated!"); - cy.waitTextVisible(documentation_edited); - cy.waitTextVisible("(edited)"); - cy.get("tbody [data-icon='edit']").first().click({ force: true }); - cy.focused().clear().wait(1000); - cy.focused().type("Foo field description has changed"); - cy.get("button").contains("Update").click(); - cy.waitTextVisible("Updated!"); - cy.waitTextVisible("Foo field description has changed"); - cy.waitTextVisible("(edited)"); - }); -}); \ No newline at end of file + it("edit field documentation", () => { + cy.loginWithCredentials(); + 
cy.visit( + "/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)/Schema" + ); + cy.get("tbody [data-icon='edit']").first().click({ force: true }); + cy.waitTextVisible("Update description"); + cy.waitTextVisible("Foo field description has changed"); + cy.focused().clear().wait(1000); + cy.focused().type(documentation_edited); + cy.get("button").contains("Update").click(); + cy.waitTextVisible("Updated!"); + cy.waitTextVisible(documentation_edited); + cy.waitTextVisible("(edited)"); + cy.get("tbody [data-icon='edit']").first().click({ force: true }); + cy.focused().clear().wait(1000); + cy.focused().type("Foo field description has changed"); + cy.get("button").contains("Update").click(); + cy.waitTextVisible("Updated!"); + cy.waitTextVisible("Foo field description has changed"); + cy.waitTextVisible("(edited)"); + }); +}); diff --git a/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js b/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js index 353570c0d955b..9559435ff01c8 100644 --- a/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js +++ b/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js @@ -64,8 +64,7 @@ describe("create and manage group", () => { }); it("update group info", () => { - var expected_name = Cypress.env('ADMIN_USERNAME') == "datahub" ? "Data Hub" : Cypress.env('ADMIN_USERNAME'); - + var expected_name = Cypress.env('ADMIN_USERNAME'); cy.loginWithCredentials(); cy.visit("/settings/identities/groups"); cy.clickOptionWithText(group_name);