From 1d06d38b681be03732111e5d2a6a908dac6a5977 Mon Sep 17 00:00:00 2001 From: Shirshanka Das Date: Mon, 29 Jan 2024 16:03:27 -0800 Subject: [PATCH] feat(platform): add support for via nodes (#9733) --- .../datahub/graphql/GmsGraphQLEngine.java | 3 +- .../graphql/resolvers/BatchLoadUtils.java | 5 +- .../search/SearchAcrossLineageResolver.java | 52 +- .../resolvers/search/SearchResolver.java | 13 +- .../mappers/GroupingCriterionInputMapper.java | 29 + .../mappers/SearchFlagsInputMapper.java | 13 + .../common/mappers/UrnToEntityMapper.java | 6 + .../UrnSearchAcrossLineageResultsMapper.java | 1 + .../graphql/types/query/QueryType.java | 3 + .../src/main/resources/entity.graphql | 5 + .../src/main/resources/search.graphql | 49 + .../SearchAcrossLineageResolverTest.java | 20 +- .../resolvers/search/SearchResolverTest.java | 61 +- .../ReindexDataJobViaNodesCLLConfig.java | 15 + .../upgrade/config/SystemUpdateConfig.java | 11 +- .../datahub/upgrade/system/SystemUpdate.java | 6 +- .../system/via/ReindexDataJobViaNodesCLL.java | 34 + .../via/ReindexDataJobViaNodesCLLStep.java | 84 ++ .../annotation/RelationshipAnnotation.java | 45 +- .../src/datahub/ingestion/graph/client.py | 6 +- .../metadata/entity/EntityServiceImpl.java | 2 +- .../graph/dgraph/DgraphGraphService.java | 2 +- .../graph/elastic/ESGraphQueryDAO.java | 453 ++++++-- .../graph/elastic/ESGraphWriteDAO.java | 23 + .../elastic/ElasticSearchGraphService.java | 39 +- .../GraphRelationshipMappingsBuilder.java | 17 +- .../graph/neo4j/Neo4jGraphService.java | 3 +- .../metadata/search/LineageSearchService.java | 45 +- .../metadata/search/utils/SearchUtils.java | 24 + .../service/UpdateIndicesService.java | 46 +- .../metadata/graph/GraphServiceTestBase.java | 61 +- .../graph/dgraph/DgraphGraphServiceTest.java | 2 +- .../search/SearchGraphServiceTestBase.java | 21 +- .../fixtures/LineageDataFixtureTestBase.java | 25 +- .../search/utils/SearchUtilsTest.java | 175 ++-- .../linkedin/dataset/FineGrainedLineage.pdl | 7 +- .../pegasus/com/linkedin/dataset/Upstream.pdl | 8 + .../metadata/graph/LineageRelationship.pdl | 9 + .../metadata/query/GroupingCriterion.pdl | 21 + .../linkedin/metadata/query/GroupingSpec.pdl | 15 + .../linkedin/metadata/query/SearchFlags.pdl | 5 + .../metadata/search/LineageSearchEntity.pdl | 5 + .../com/linkedin/query/QueryProperties.pdl | 5 + .../search/GraphQueryConfiguration.java | 4 + .../src/main/resources/application.yml | 3 +- .../linkedin/metadata/boot/BootstrapStep.java | 11 + .../com.linkedin.entity.aspects.snapshot.json | 13 +- ...com.linkedin.entity.entities.snapshot.json | 58 +- .../com.linkedin.entity.runs.snapshot.json | 13 +- ...nkedin.lineage.relationships.snapshot.json | 13 +- ...nkedin.operations.operations.snapshot.json | 13 +- ...m.linkedin.platform.platform.snapshot.json | 13 +- .../linkedin/entity/client/EntityClient.java | 1 + .../com/linkedin/metadata/graph/Edge.java | 27 + .../metadata/graph/GraphIndexUtils.java | 35 +- .../metadata/graph/RelatedEntities.java | 8 +- .../metadata/graph/RelatedEntity.java | 13 + smoke-test/requirements.txt | 3 +- smoke-test/tests/lineage/__init__.py | 0 smoke-test/tests/lineage/test_lineage.py | 991 ++++++++++++++++++ 60 files changed, 2401 insertions(+), 292 deletions(-) create mode 100644 datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/GroupingCriterionInputMapper.java create mode 100644 datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/ReindexDataJobViaNodesCLLConfig.java create mode 100644 
datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/via/ReindexDataJobViaNodesCLL.java create mode 100644 datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/via/ReindexDataJobViaNodesCLLStep.java create mode 100644 metadata-models/src/main/pegasus/com/linkedin/metadata/query/GroupingCriterion.pdl create mode 100644 metadata-models/src/main/pegasus/com/linkedin/metadata/query/GroupingSpec.pdl create mode 100644 smoke-test/tests/lineage/__init__.py create mode 100644 smoke-test/tests/lineage/test_lineage.py diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java index 4b5bbdb6e15ec..41f48e0a7dc3e 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java @@ -878,7 +878,8 @@ private void configureQueryResolvers(final RuntimeWiring.Builder builder) { "scrollAcrossEntities", new ScrollAcrossEntitiesResolver(this.entityClient, this.viewService)) .dataFetcher( - "searchAcrossLineage", new SearchAcrossLineageResolver(this.entityClient)) + "searchAcrossLineage", + new SearchAcrossLineageResolver(this.entityClient, this.entityRegistry)) .dataFetcher( "scrollAcrossLineage", new ScrollAcrossLineageResolver(this.entityClient)) .dataFetcher( diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/BatchLoadUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/BatchLoadUtils.java index 5ab07701c15a2..3126f25546f65 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/BatchLoadUtils.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/BatchLoadUtils.java @@ -28,8 +28,9 @@ public static CompletableFuture> batchLoadEntitiesOfSameType( .filter(entity -> entities.get(0).getClass().isAssignableFrom(entity.objectClass())) .collect(Collectors.toList())); - final DataLoader loader = dataLoaderRegistry.getDataLoader(filteredEntity.name()); - List keyList = new ArrayList(); + final DataLoader loader = + dataLoaderRegistry.getDataLoader(filteredEntity.name()); + List keyList = new ArrayList(); for (Entity entity : entities) { keyList.add(filteredEntity.getKeyProvider().apply(entity)); } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchAcrossLineageResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchAcrossLineageResolver.java index 2dc5032f2a4eb..1a8b7734c093e 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchAcrossLineageResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchAcrossLineageResolver.java @@ -2,7 +2,9 @@ import static com.linkedin.datahub.graphql.resolvers.ResolverUtils.*; import static com.linkedin.datahub.graphql.resolvers.search.SearchUtils.*; +import static com.linkedin.metadata.Constants.QUERY_ENTITY_NAME; +import com.google.common.collect.ImmutableSet; import com.linkedin.common.urn.Urn; import com.linkedin.datahub.graphql.generated.EntityType; import com.linkedin.datahub.graphql.generated.FacetFilterInput; @@ -14,31 +16,63 @@ import com.linkedin.datahub.graphql.types.entitytype.EntityTypeMapper; import com.linkedin.datahub.graphql.types.mappers.UrnSearchAcrossLineageResultsMapper; 
import com.linkedin.entity.client.EntityClient; +import com.linkedin.metadata.models.EntitySpec; +import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.query.filter.Filter; +import com.linkedin.metadata.search.LineageSearchResult; import com.linkedin.r2.RemoteInvocationException; +import graphql.VisibleForTesting; import graphql.schema.DataFetcher; import graphql.schema.DataFetchingEnvironment; import java.net.URISyntaxException; import java.util.ArrayList; import java.util.List; +import java.util.Set; import java.util.concurrent.CompletableFuture; import java.util.stream.Collectors; import javax.annotation.Nullable; -import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; /** Resolver responsible for resolving 'searchAcrossEntities' field of the Query type */ @Slf4j -@RequiredArgsConstructor public class SearchAcrossLineageResolver implements DataFetcher> { private static final int DEFAULT_START = 0; private static final int DEFAULT_COUNT = 10; + private static final Set TRANSIENT_ENTITIES = ImmutableSet.of(QUERY_ENTITY_NAME); + private final EntityClient _entityClient; + private final EntityRegistry _entityRegistry; + + @VisibleForTesting final Set _allEntities; + private final List _allowedEntities; + + public SearchAcrossLineageResolver(EntityClient entityClient, EntityRegistry entityRegistry) { + this._entityClient = entityClient; + this._entityRegistry = entityRegistry; + this._allEntities = + entityRegistry.getEntitySpecs().values().stream() + .map(EntitySpec::getName) + .collect(Collectors.toSet()); + + this._allowedEntities = + this._allEntities.stream() + .filter(e -> !TRANSIENT_ENTITIES.contains(e)) + .collect(Collectors.toList()); + } + + private List getEntityNamesFromInput(List inputTypes) { + if (inputTypes != null && !inputTypes.isEmpty()) { + return inputTypes.stream().map(EntityTypeMapper::getName).collect(Collectors.toList()); + } else { + return this._allowedEntities; + } + } + @Override public CompletableFuture get(DataFetchingEnvironment environment) throws URISyntaxException { @@ -50,12 +84,7 @@ public CompletableFuture get(DataFetchingEnvironment final LineageDirection lineageDirection = input.getDirection(); - List entityTypes = - (input.getTypes() == null || input.getTypes().isEmpty()) - ? 
SEARCHABLE_ENTITY_TYPES - : input.getTypes(); - List entityNames = - entityTypes.stream().map(EntityTypeMapper::getName).collect(Collectors.toList()); + List entityNames = getEntityNamesFromInput(input.getTypes()); // escape forward slash since it is a reserved character in Elasticsearch final String sanitizedQuery = @@ -99,8 +128,7 @@ public CompletableFuture get(DataFetchingEnvironment } else { searchFlags = new SearchFlags().setFulltext(true).setSkipHighlighting(true); } - - return UrnSearchAcrossLineageResultsMapper.map( + LineageSearchResult salResults = _entityClient.searchAcrossLineage( urn, resolvedDirection, @@ -114,7 +142,9 @@ public CompletableFuture get(DataFetchingEnvironment startTimeMillis, endTimeMillis, searchFlags, - ResolverUtils.getAuthentication(environment))); + getAuthentication(environment)); + + return UrnSearchAcrossLineageResultsMapper.map(salResults); } catch (RemoteInvocationException e) { log.error( "Failed to execute search across relationships: source urn {}, direction {}, entity types {}, query {}, filters: {}, start: {}, count: {}", diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchResolver.java index bc177c600beee..7428207034f5d 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchResolver.java @@ -1,6 +1,7 @@ package com.linkedin.datahub.graphql.resolvers.search; import static com.linkedin.datahub.graphql.resolvers.ResolverUtils.bindArgument; +import static com.linkedin.metadata.Constants.*; import static com.linkedin.metadata.search.utils.SearchUtils.applyDefaultSearchFlags; import com.linkedin.datahub.graphql.generated.SearchInput; @@ -10,6 +11,9 @@ import com.linkedin.datahub.graphql.types.entitytype.EntityTypeMapper; import com.linkedin.datahub.graphql.types.mappers.UrnSearchResultsMapper; import com.linkedin.entity.client.EntityClient; +import com.linkedin.metadata.query.GroupingCriterion; +import com.linkedin.metadata.query.GroupingCriterionArray; +import com.linkedin.metadata.query.GroupingSpec; import com.linkedin.metadata.query.SearchFlags; import graphql.schema.DataFetcher; import graphql.schema.DataFetchingEnvironment; @@ -28,7 +32,14 @@ public class SearchResolver implements DataFetcher { + + public static final GroupingCriterionInputMapper INSTANCE = new GroupingCriterionInputMapper(); + + public static com.linkedin.metadata.query.GroupingCriterion map( + @Nonnull final GroupingCriterion groupingCriterion) { + return INSTANCE.apply(groupingCriterion); + } + + @Override + public com.linkedin.metadata.query.GroupingCriterion apply(GroupingCriterion input) { + return new com.linkedin.metadata.query.GroupingCriterion() + .setBaseEntityType( + input.getBaseEntityType() != null + ? 
EntityTypeMapper.getName(input.getBaseEntityType()) + : null, + SetMode.REMOVE_OPTIONAL_IF_NULL) + .setGroupingEntityType(EntityTypeMapper.getName(input.getGroupingEntityType())); + } +} diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/SearchFlagsInputMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/SearchFlagsInputMapper.java index e2d29d0297449..faede5cf9bb1b 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/SearchFlagsInputMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/SearchFlagsInputMapper.java @@ -2,6 +2,9 @@ import com.linkedin.datahub.graphql.generated.SearchFlags; import com.linkedin.datahub.graphql.types.mappers.ModelMapper; +import com.linkedin.metadata.query.GroupingCriterionArray; +import com.linkedin.metadata.query.GroupingSpec; +import java.util.stream.Collectors; import javax.annotation.Nonnull; /** @@ -42,6 +45,16 @@ public com.linkedin.metadata.query.SearchFlags apply(@Nonnull final SearchFlags if (searchFlags.getGetSuggestions() != null) { result.setGetSuggestions(searchFlags.getGetSuggestions()); } + if (searchFlags.getGroupingSpec() != null + && searchFlags.getGroupingSpec().getGroupingCriteria() != null) { + result.setGroupingSpec( + new GroupingSpec() + .setGroupingCriteria( + new GroupingCriterionArray( + searchFlags.getGroupingSpec().getGroupingCriteria().stream() + .map(GroupingCriterionInputMapper::map) + .collect(Collectors.toList())))); + } return result; } } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/UrnToEntityMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/UrnToEntityMapper.java index 18a082fee95f1..3ca018ea6f5c7 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/UrnToEntityMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/UrnToEntityMapper.java @@ -30,6 +30,7 @@ import com.linkedin.datahub.graphql.generated.MLPrimaryKey; import com.linkedin.datahub.graphql.generated.Notebook; import com.linkedin.datahub.graphql.generated.OwnershipTypeEntity; +import com.linkedin.datahub.graphql.generated.QueryEntity; import com.linkedin.datahub.graphql.generated.Role; import com.linkedin.datahub.graphql.generated.SchemaFieldEntity; import com.linkedin.datahub.graphql.generated.StructuredPropertyEntity; @@ -198,6 +199,11 @@ public Entity apply(Urn input) { ((StructuredPropertyEntity) partialEntity).setUrn(input.toString()); ((StructuredPropertyEntity) partialEntity).setType(EntityType.STRUCTURED_PROPERTY); } + if (input.getEntityType().equals(QUERY_ENTITY_NAME)) { + partialEntity = new QueryEntity(); + ((QueryEntity) partialEntity).setUrn(input.toString()); + ((QueryEntity) partialEntity).setType(EntityType.QUERY); + } return partialEntity; } } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/UrnSearchAcrossLineageResultsMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/UrnSearchAcrossLineageResultsMapper.java index 642fe90cf2aed..970789facf699 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/UrnSearchAcrossLineageResultsMapper.java +++ 
b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/UrnSearchAcrossLineageResultsMapper.java @@ -62,6 +62,7 @@ private SearchAcrossLineageResult mapResult(LineageSearchEntity searchEntity) { .setMatchedFields(getMatchedFieldEntry(searchEntity.getMatchedFields())) .setPaths(searchEntity.getPaths().stream().map(this::mapPath).collect(Collectors.toList())) .setDegree(searchEntity.getDegree()) + .setDegrees(searchEntity.getDegrees().stream().collect(Collectors.toList())) .build(); } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/query/QueryType.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/query/QueryType.java index 0c1fd33e38110..087c93a97e314 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/query/QueryType.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/query/QueryType.java @@ -21,7 +21,9 @@ import java.util.stream.Collectors; import javax.annotation.Nonnull; import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +@Slf4j @RequiredArgsConstructor public class QueryType implements com.linkedin.datahub.graphql.types.EntityType { @@ -50,6 +52,7 @@ public List> batchLoad( final List viewUrns = urns.stream().map(UrnUtils::getUrn).collect(Collectors.toList()); try { + log.debug("Fetching query entities: {}", viewUrns); final Map entities = _entityClient.batchGetV2( QUERY_ENTITY_NAME, diff --git a/datahub-graphql-core/src/main/resources/entity.graphql b/datahub-graphql-core/src/main/resources/entity.graphql index 3ea1b38d3db0d..0074dc3fcb44c 100644 --- a/datahub-graphql-core/src/main/resources/entity.graphql +++ b/datahub-graphql-core/src/main/resources/entity.graphql @@ -10948,6 +10948,11 @@ enum QuerySource { The query was provided manually, e.g. from the UI. """ MANUAL + + """ + The query was extracted by the system, e.g. from a dashboard. + """ + SYSTEM } """ diff --git a/datahub-graphql-core/src/main/resources/search.graphql b/datahub-graphql-core/src/main/resources/search.graphql index 8896dd02b5ad3..2b921601058fb 100644 --- a/datahub-graphql-core/src/main/resources/search.graphql +++ b/datahub-graphql-core/src/main/resources/search.graphql @@ -143,6 +143,15 @@ input SearchFlags { Whether to request for search suggestions on the _entityName virtualized field """ getSuggestions: Boolean + + """ + Additional grouping specifications to apply to the search results + Grouping specifications will control how search results are grouped together + in the response. This is currently being used to group schema fields (columns) + as datasets, and in the future will be used to group other entities as well. + Note: This is an experimental feature and is subject to change. + """ + groupingSpec: GroupingSpec } """ @@ -278,6 +287,7 @@ input ScrollAcrossEntitiesInput { searchFlags: SearchFlags } + """ Input arguments for a search query over the results of a multi-hop graph query """ @@ -669,6 +679,12 @@ type SearchAcrossLineageResult { Degree of relationship (number of hops to get to entity) """ degree: Int! + + """ + Degrees of relationship (for entities discoverable at multiple degrees) + """ + degrees: [Int!] + } """ @@ -1303,4 +1319,37 @@ input SortCriterion { The order in which we will be sorting """ sortOrder: SortOrder! +} + +""" +A grouping specification for search results. +""" +input GroupingSpec { + + """ + A list of grouping criteria for grouping search results. + There is no implied order in the grouping criteria. 
+ """ + groupingCriteria: [GroupingCriterion!] + +} + +""" +A single grouping criterion for grouping search results +""" +input GroupingCriterion { + + """ + The base entity type that needs to be grouped + e.g. schemaField + Omitting this field will result in all base entities being grouped into the groupingEntityType. + """ + baseEntityType: EntityType + + """ + The type of entity being grouped into + e.g. dataset, domain, etc. + """ + groupingEntityType: EntityType! + } \ No newline at end of file diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/search/SearchAcrossLineageResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/search/SearchAcrossLineageResolverTest.java index 273f7156c12a8..a50591b7fc399 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/search/SearchAcrossLineageResolverTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/search/SearchAcrossLineageResolverTest.java @@ -14,6 +14,8 @@ import com.linkedin.datahub.graphql.generated.SearchAcrossLineageResult; import com.linkedin.datahub.graphql.generated.SearchAcrossLineageResults; import com.linkedin.entity.client.EntityClient; +import com.linkedin.metadata.models.registry.ConfigEntityRegistry; +import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.search.AggregationMetadataArray; import com.linkedin.metadata.search.LineageSearchEntity; @@ -22,6 +24,7 @@ import com.linkedin.metadata.search.MatchedFieldArray; import com.linkedin.metadata.search.SearchResultMetadata; import graphql.schema.DataFetchingEnvironment; +import java.io.InputStream; import java.util.Collections; import java.util.List; import org.testng.annotations.BeforeMethod; @@ -43,13 +46,28 @@ public class SearchAcrossLineageResolverTest { private Authentication _authentication; private SearchAcrossLineageResolver _resolver; + private EntityRegistry _entityRegistry; + @BeforeMethod public void setupTest() { _entityClient = mock(EntityClient.class); _dataFetchingEnvironment = mock(DataFetchingEnvironment.class); _authentication = mock(Authentication.class); - _resolver = new SearchAcrossLineageResolver(_entityClient); + _entityRegistry = mock(EntityRegistry.class); + _resolver = new SearchAcrossLineageResolver(_entityClient, _entityRegistry); + } + + @Test + public void testAllEntitiesInitialization() { + InputStream inputStream = ClassLoader.getSystemResourceAsStream("entity-registry.yml"); + EntityRegistry entityRegistry = new ConfigEntityRegistry(inputStream); + SearchAcrossLineageResolver resolver = + new SearchAcrossLineageResolver(_entityClient, entityRegistry); + assertTrue(resolver._allEntities.contains("dataset")); + assertTrue(resolver._allEntities.contains("dataFlow")); + // Test for case sensitivity + assertFalse(resolver._allEntities.contains("dataflow")); } @Test diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/search/SearchResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/search/SearchResolverTest.java index 24724cb8e23ad..9716799628a45 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/search/SearchResolverTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/search/SearchResolverTest.java @@ -1,14 +1,17 @@ package com.linkedin.datahub.graphql.resolvers.search; import 
static com.linkedin.datahub.graphql.TestUtils.getMockAllowContext; +import static com.linkedin.metadata.Constants.*; import com.datahub.authentication.Authentication; +import com.linkedin.data.template.SetMode; import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.generated.EntityType; import com.linkedin.datahub.graphql.generated.SearchFlags; import com.linkedin.datahub.graphql.generated.SearchInput; import com.linkedin.entity.client.EntityClient; import com.linkedin.metadata.Constants; +import com.linkedin.metadata.query.GroupingCriterionArray; import com.linkedin.metadata.query.filter.Filter; import com.linkedin.metadata.query.filter.SortCriterion; import com.linkedin.metadata.search.SearchEntityArray; @@ -19,6 +22,22 @@ import org.testng.annotations.Test; public class SearchResolverTest { + + private com.linkedin.metadata.query.SearchFlags setConvertSchemaFieldsToDatasets( + com.linkedin.metadata.query.SearchFlags flags, boolean value) { + if (value) { + return flags.setGroupingSpec( + new com.linkedin.metadata.query.GroupingSpec() + .setGroupingCriteria( + new GroupingCriterionArray( + new com.linkedin.metadata.query.GroupingCriterion() + .setBaseEntityType(SCHEMA_FIELD_ENTITY_NAME) + .setGroupingEntityType(DATASET_ENTITY_NAME)))); + } else { + return flags.setGroupingSpec(null, SetMode.REMOVE_IF_NULL); + } + } + @Test public void testDefaultSearchFlags() throws Exception { EntityClient mockClient = initMockSearchEntityClient(); @@ -40,12 +59,14 @@ public void testDefaultSearchFlags() throws Exception { null, 0, 10, - new com.linkedin.metadata.query.SearchFlags() - .setFulltext(true) - .setSkipAggregates(false) - .setSkipHighlighting(true) // empty/wildcard - .setMaxAggValues(20) - .setSkipCache(false)); + setConvertSchemaFieldsToDatasets( + new com.linkedin.metadata.query.SearchFlags() + .setFulltext(true) + .setSkipAggregates(false) + .setSkipHighlighting(true) // empty/wildcard + .setMaxAggValues(20) + .setSkipCache(false), + true)); } @Test @@ -77,12 +98,14 @@ public void testOverrideSearchFlags() throws Exception { null, 1, 11, - new com.linkedin.metadata.query.SearchFlags() - .setFulltext(false) - .setSkipAggregates(true) - .setSkipHighlighting(true) - .setMaxAggValues(10) - .setSkipCache(true)); + setConvertSchemaFieldsToDatasets( + new com.linkedin.metadata.query.SearchFlags() + .setFulltext(false) + .setSkipAggregates(true) + .setSkipHighlighting(true) + .setMaxAggValues(10) + .setSkipCache(true), + false)); } @Test @@ -107,12 +130,14 @@ public void testNonWildCardSearchFlags() throws Exception { null, 0, 10, - new com.linkedin.metadata.query.SearchFlags() - .setFulltext(true) - .setSkipAggregates(false) - .setSkipHighlighting(false) // empty/wildcard - .setMaxAggValues(20) - .setSkipCache(false)); + setConvertSchemaFieldsToDatasets( + new com.linkedin.metadata.query.SearchFlags() + .setFulltext(true) + .setSkipAggregates(false) + .setSkipHighlighting(false) // empty/wildcard + .setMaxAggValues(20) + .setSkipCache(false), + true)); } private EntityClient initMockSearchEntityClient() throws Exception { diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/ReindexDataJobViaNodesCLLConfig.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/ReindexDataJobViaNodesCLLConfig.java new file mode 100644 index 0000000000000..06311e1853874 --- /dev/null +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/ReindexDataJobViaNodesCLLConfig.java @@ -0,0 +1,15 @@ +package 
com.linkedin.datahub.upgrade.config; + +import com.linkedin.datahub.upgrade.system.via.ReindexDataJobViaNodesCLL; +import com.linkedin.metadata.entity.EntityService; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; + +@Configuration +public class ReindexDataJobViaNodesCLLConfig { + + @Bean + public ReindexDataJobViaNodesCLL _reindexDataJobViaNodesCLL(EntityService entityService) { + return new ReindexDataJobViaNodesCLL(entityService); + } +} diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java index 3b63d81486eb4..177d4b531ba86 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java @@ -4,6 +4,7 @@ import com.linkedin.datahub.upgrade.system.elasticsearch.BuildIndices; import com.linkedin.datahub.upgrade.system.elasticsearch.CleanIndices; import com.linkedin.datahub.upgrade.system.entity.steps.BackfillBrowsePathsV2; +import com.linkedin.datahub.upgrade.system.via.ReindexDataJobViaNodesCLL; import com.linkedin.gms.factory.common.TopicConventionFactory; import com.linkedin.gms.factory.config.ConfigurationProvider; import com.linkedin.gms.factory.kafka.DataHubKafkaProducerFactory; @@ -34,11 +35,17 @@ public SystemUpdate systemUpdate( @Qualifier("duheKafkaEventProducer") final KafkaEventProducer kafkaEventProducer, final GitVersion gitVersion, @Qualifier("revision") String revision, - final BackfillBrowsePathsV2 backfillBrowsePathsV2) { + final BackfillBrowsePathsV2 backfillBrowsePathsV2, + final ReindexDataJobViaNodesCLL reindexDataJobViaNodesCLL) { String version = String.format("%s-%s", gitVersion.getVersion(), revision); return new SystemUpdate( - buildIndices, cleanIndices, kafkaEventProducer, version, backfillBrowsePathsV2); + buildIndices, + cleanIndices, + kafkaEventProducer, + version, + backfillBrowsePathsV2, + reindexDataJobViaNodesCLL); } @Value("#{systemEnvironment['DATAHUB_REVISION'] ?: '0'}") diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/SystemUpdate.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/SystemUpdate.java index aba751bff8177..ed9c8ddda45c8 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/SystemUpdate.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/SystemUpdate.java @@ -7,6 +7,7 @@ import com.linkedin.datahub.upgrade.system.elasticsearch.CleanIndices; import com.linkedin.datahub.upgrade.system.elasticsearch.steps.DataHubStartupStep; import com.linkedin.datahub.upgrade.system.entity.steps.BackfillBrowsePathsV2; +import com.linkedin.datahub.upgrade.system.via.ReindexDataJobViaNodesCLL; import com.linkedin.metadata.dao.producer.KafkaEventProducer; import java.util.List; import java.util.stream.Collectors; @@ -24,11 +25,12 @@ public SystemUpdate( final CleanIndices cleanIndicesJob, final KafkaEventProducer kafkaEventProducer, final String version, - final BackfillBrowsePathsV2 backfillBrowsePathsV2) { + final BackfillBrowsePathsV2 backfillBrowsePathsV2, + final ReindexDataJobViaNodesCLL upgradeViaNodeCll) { _preStartupUpgrades = List.of(buildIndicesJob); _steps = List.of(new DataHubStartupStep(kafkaEventProducer, version)); - _postStartupUpgrades = List.of(cleanIndicesJob, backfillBrowsePathsV2); + 
_postStartupUpgrades = List.of(cleanIndicesJob, backfillBrowsePathsV2, upgradeViaNodeCll); } @Override diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/via/ReindexDataJobViaNodesCLL.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/via/ReindexDataJobViaNodesCLL.java new file mode 100644 index 0000000000000..41179a50c4b54 --- /dev/null +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/via/ReindexDataJobViaNodesCLL.java @@ -0,0 +1,34 @@ +package com.linkedin.datahub.upgrade.system.via; + +import static com.linkedin.metadata.Constants.*; + +import com.google.common.collect.ImmutableList; +import com.linkedin.datahub.upgrade.Upgrade; +import com.linkedin.datahub.upgrade.UpgradeStep; +import com.linkedin.metadata.entity.EntityService; +import java.util.List; +import lombok.extern.slf4j.Slf4j; + +/** + * A job that reindexes all datajob inputoutput aspects as part of the via node upgrade. This is + * required to index column-level lineage correctly using via nodes. + */ +@Slf4j +public class ReindexDataJobViaNodesCLL implements Upgrade { + + private final List _steps; + + public ReindexDataJobViaNodesCLL(EntityService entityService) { + _steps = ImmutableList.of(new ReindexDataJobViaNodesCLLStep(entityService)); + } + + @Override + public String id() { + return this.getClass().getName(); + } + + @Override + public List steps() { + return _steps; + } +} diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/via/ReindexDataJobViaNodesCLLStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/via/ReindexDataJobViaNodesCLLStep.java new file mode 100644 index 0000000000000..70afbc3d205b2 --- /dev/null +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/via/ReindexDataJobViaNodesCLLStep.java @@ -0,0 +1,84 @@ +package com.linkedin.datahub.upgrade.system.via; + +import static com.linkedin.metadata.Constants.*; + +import com.linkedin.common.urn.Urn; +import com.linkedin.datahub.upgrade.UpgradeContext; +import com.linkedin.datahub.upgrade.UpgradeStep; +import com.linkedin.datahub.upgrade.UpgradeStepResult; +import com.linkedin.datahub.upgrade.impl.DefaultUpgradeStepResult; +import com.linkedin.metadata.boot.BootstrapStep; +import com.linkedin.metadata.entity.EntityService; +import com.linkedin.metadata.entity.restoreindices.RestoreIndicesArgs; +import com.linkedin.metadata.entity.restoreindices.RestoreIndicesResult; +import java.net.URISyntaxException; +import java.util.function.Function; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +public class ReindexDataJobViaNodesCLLStep implements UpgradeStep { + + private static final String UPGRADE_ID = "via-node-cll-reindex-datajob"; + private static final Urn UPGRADE_ID_URN = BootstrapStep.getUpgradeUrn(UPGRADE_ID); + + private static final Integer BATCH_SIZE = 5000; + + private final EntityService _entityService; + + public ReindexDataJobViaNodesCLLStep(EntityService entityService) { + _entityService = entityService; + } + + @Override + public Function executable() { + return (context) -> { + RestoreIndicesArgs args = + new RestoreIndicesArgs() + .setAspectName(DATA_JOB_INPUT_OUTPUT_ASPECT_NAME) + .setUrnLike("urn:li:" + DATA_JOB_ENTITY_NAME + ":%"); + RestoreIndicesResult result = + _entityService.restoreIndices(args, x -> context.report().addLine((String) x)); + context.report().addLine("Rows migrated: " + result.rowsMigrated); + context.report().addLine("Rows ignored: " + result.ignored); + try { + 
BootstrapStep.setUpgradeResult(UPGRADE_ID_URN, _entityService); + context.report().addLine("State updated: " + UPGRADE_ID_URN); + } catch (URISyntaxException e) { + throw new RuntimeException(e); + } + return new DefaultUpgradeStepResult(id(), UpgradeStepResult.Result.SUCCEEDED); + }; + } + + @Override + public String id() { + return UPGRADE_ID; + } + + /** + * Returns whether the upgrade should proceed if the step fails after exceeding the maximum + * retries. + */ + @Override + public boolean isOptional() { + return false; + } + + @Override + /** + * Returns whether the upgrade should be skipped. Uses previous run history or the environment + * variable SKIP_REINDEX_DATA_JOB_INPUT_OUTPUT to determine whether to skip. + */ + public boolean skip(UpgradeContext context) { + boolean previouslyRun = _entityService.exists(UPGRADE_ID_URN, true); + boolean envFlagRecommendsSkip = + Boolean.parseBoolean(System.getenv("SKIP_REINDEX_DATA_JOB_INPUT_OUTPUT")); + if (previouslyRun) { + log.info("{} was already run. Skipping.", id()); + } + if (envFlagRecommendsSkip) { + log.info("Environment variable SKIP_REINDEX_DATA_JOB_INPUT_OUTPUT is set to true. Skipping."); + } + return (previouslyRun || envFlagRecommendsSkip); + } +} diff --git a/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/RelationshipAnnotation.java b/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/RelationshipAnnotation.java index a22ef56d60006..630e7951c0311 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/RelationshipAnnotation.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/RelationshipAnnotation.java @@ -6,10 +6,12 @@ import java.util.Map; import java.util.Optional; import javax.annotation.Nonnull; +import lombok.AllArgsConstructor; import lombok.Value; /** Simple object representation of the @Relationship annotation metadata. 
*/ @Value +@AllArgsConstructor public class RelationshipAnnotation { public static final String ANNOTATION_NAME = "Relationship"; @@ -23,6 +25,8 @@ public class RelationshipAnnotation { private static final String UPDATED_ACTOR = "updatedActor"; private static final String PROPERTIES = "properties"; + private static final String VIA = "via"; + String name; List validDestinationTypes; boolean isUpstream; @@ -32,6 +36,7 @@ public class RelationshipAnnotation { String updatedOn; String updatedActor; String properties; + String via; @Nonnull public static RelationshipAnnotation fromPegasusAnnotationObject( @@ -78,6 +83,7 @@ public static RelationshipAnnotation fromPegasusAnnotationObject( final Optional updatedActor = AnnotationUtils.getField(map, UPDATED_ACTOR, String.class); final Optional properties = AnnotationUtils.getField(map, PROPERTIES, String.class); + final Optional via = AnnotationUtils.getField(map, VIA, String.class); return new RelationshipAnnotation( name.get(), @@ -88,6 +94,43 @@ public static RelationshipAnnotation fromPegasusAnnotationObject( createdActor.orElse(null), updatedOn.orElse(null), updatedActor.orElse(null), - properties.orElse(null)); + properties.orElse(null), + via.orElse(null)); + } + + /** + * Constructor for backwards compatibility + * + * @param name + * @param entityTypes + * @param isUpstream + * @param isLineage + * @param createdOn + * @param createdActor + * @param updatedOn + * @param updatedActor + * @param properties + */ + public RelationshipAnnotation( + String name, + List validDestinationTypes, + boolean isUpstream, + boolean isLineage, + String createdOn, + String createdActor, + String updatedOn, + String updatedActor, + String properties) { + this( + name, + validDestinationTypes, + isUpstream, + isLineage, + createdOn, + createdActor, + updatedOn, + updatedActor, + properties, + null); } } diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index 5c24b06dde999..d64f756dddc13 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -83,6 +83,7 @@ class DatahubClientConfig(ConfigModel): class RelatedEntity: urn: str relationship_type: str + via: Optional[str] = None def _graphql_entity_type(entity_type: str) -> str: @@ -833,6 +834,7 @@ def get_related_entities( yield RelatedEntity( urn=related_entity["urn"], relationship_type=related_entity["relationshipType"], + via=related_entity.get("via"), ) done = response.get("count", 0) == 0 or response.get("count", 0) < len( response.get("entities", []) @@ -840,9 +842,9 @@ def get_related_entities( start = start + response.get("count", 0) def exists(self, entity_urn: str) -> bool: - entity_urn_parsed: Urn = Urn.create_from_string(entity_urn) + entity_urn_parsed: Urn = Urn.from_string(entity_urn) try: - key_aspect_class = KEY_ASPECTS.get(entity_urn_parsed.get_type()) + key_aspect_class = KEY_ASPECTS.get(entity_urn_parsed.entity_type) if key_aspect_class: result = self.get_aspect(entity_urn, key_aspect_class) return result is not None diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java index b3b11d200ec0d..e6e69c96c1542 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java @@ -1948,6 +1948,7 @@ public 
RollbackRunResult deleteUrn(Urn urn) { */ @Override public Set exists(@Nonnull final Collection urns, boolean includeSoftDeleted) { + final Set dbKeys = urns.stream() .map( @@ -1960,7 +1961,6 @@ public Set exists(@Nonnull final Collection urns, boolean includeSoftD .getName(), ASPECT_LATEST_VERSION)) .collect(Collectors.toSet()); - final Map aspects = _aspectDao.batchGet(dbKeys); final Set existingUrnStrings = aspects.values().stream() diff --git a/metadata-io/src/main/java/com/linkedin/metadata/graph/dgraph/DgraphGraphService.java b/metadata-io/src/main/java/com/linkedin/metadata/graph/dgraph/DgraphGraphService.java index 24e272dee7a25..3bcaf6a08f4e5 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/graph/dgraph/DgraphGraphService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/graph/dgraph/DgraphGraphService.java @@ -653,7 +653,7 @@ protected static List getRelatedEntitiesFromResponseData( }) // for undirected we get duplicate relationships .distinct() - .map(relationship -> new RelatedEntity(relationship, urn)); + .map(relationship -> new RelatedEntity(relationship, urn, null)); } return Stream.empty(); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphQueryDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphQueryDAO.java index 3051319aa54cf..270615aa0e356 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphQueryDAO.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphQueryDAO.java @@ -1,6 +1,7 @@ package com.linkedin.metadata.graph.elastic; import static com.linkedin.metadata.graph.elastic.ElasticSearchGraphService.*; +import static com.linkedin.metadata.graph.elastic.GraphRelationshipMappingsBuilder.*; import com.codahale.metrics.Timer; import com.datahub.util.exception.ESQueryException; @@ -11,6 +12,7 @@ import com.linkedin.common.UrnArrayArray; import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; +import com.linkedin.data.template.IntegerArray; import com.linkedin.metadata.config.search.GraphQueryConfiguration; import com.linkedin.metadata.graph.GraphFilters; import com.linkedin.metadata.graph.LineageDirection; @@ -34,7 +36,7 @@ import java.util.Collections; import java.util.HashMap; import java.util.HashSet; -import java.util.LinkedList; +import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; @@ -182,6 +184,24 @@ public static BoolQueryBuilder buildQuery( @Nullable final Filter destinationEntityFilter, @Nonnull final List relationshipTypes, @Nonnull final RelationshipFilter relationshipFilter) { + return buildQuery( + sourceTypes, + sourceEntityFilter, + destinationTypes, + destinationEntityFilter, + relationshipTypes, + relationshipFilter, + null); + } + + public static BoolQueryBuilder buildQuery( + @Nullable final List sourceTypes, + @Nonnull final Filter sourceEntityFilter, + @Nullable final List destinationTypes, + @Nonnull final Filter destinationEntityFilter, + @Nonnull final List relationshipTypes, + @Nonnull final RelationshipFilter relationshipFilter, + @Nullable final String lifecycleOwner) { BoolQueryBuilder finalQuery = QueryBuilders.boolQuery(); final RelationshipDirection relationshipDirection = relationshipFilter.getDirection(); @@ -221,6 +241,9 @@ public static BoolQueryBuilder buildQuery( if (relationshipFilter.getOr() != null) { addFilterToQueryBuilder(new Filter().setOr(relationshipFilter.getOr()), null, finalQuery); } + if (lifecycleOwner != null) { + 
finalQuery.filter(QueryBuilders.termQuery(EDGE_FIELD_LIFECYCLE_OWNER, lifecycleOwner)); + } return finalQuery; } @@ -235,14 +258,16 @@ public LineageResponse getLineage( int maxHops, @Nullable Long startTimeMillis, @Nullable Long endTimeMillis) { - List result = new ArrayList<>(); + Map result = new HashMap<>(); long currentTime = System.currentTimeMillis(); long remainingTime = graphQueryConfiguration.getTimeoutSeconds() * 1000; + boolean exploreMultiplePaths = graphQueryConfiguration.isEnableMultiPathSearch(); long timeoutTime = currentTime + remainingTime; // Do a Level-order BFS Set visitedEntities = ConcurrentHashMap.newKeySet(); visitedEntities.add(entityUrn); + Set viaEntities = ConcurrentHashMap.newKeySet(); Map existingPaths = new HashMap<>(); List currentLevel = ImmutableList.of(entityUrn); @@ -267,12 +292,23 @@ public LineageResponse getLineage( direction, graphFilters, visitedEntities, + viaEntities, i + 1, + maxHops - (i + 1), remainingTime, existingPaths, startTimeMillis, - endTimeMillis); - result.addAll(oneHopRelationships); + endTimeMillis, + exploreMultiplePaths); + for (LineageRelationship oneHopRelnship : oneHopRelationships) { + if (result.containsKey(oneHopRelnship.getEntity())) { + result.put( + oneHopRelnship.getEntity(), + mergeLineageRelationships(result.get(oneHopRelnship.getEntity()), oneHopRelnship)); + } else { + result.put(oneHopRelnship.getEntity(), oneHopRelnship); + } + } currentLevel = oneHopRelationships.stream() .map(LineageRelationship::getEntity) @@ -280,7 +316,8 @@ public LineageResponse getLineage( currentTime = System.currentTimeMillis(); remainingTime = timeoutTime - currentTime; } - LineageResponse response = new LineageResponse(result.size(), result); + List resultList = new ArrayList<>(result.values()); + LineageResponse response = new LineageResponse(resultList.size(), resultList); List subList; if (offset >= response.getTotal()) { @@ -295,6 +332,39 @@ public LineageResponse getLineage( return new LineageResponse(response.getTotal(), subList); } + /** + * Merges two lineage relationship objects. The merged relationship object will have the minimum + * degree of the two relationships, and the union of the paths. In addition, the merged + * relationship object will have the union of the degrees in the new degrees field. 
+ * + * @param existingRelationship + * @param newRelationship + * @return the merged relationship object + */ + private LineageRelationship mergeLineageRelationships( + final LineageRelationship existingRelationship, final LineageRelationship newRelationship) { + try { + LineageRelationship copyRelationship = existingRelationship.copy(); + copyRelationship.setDegree( + Math.min(existingRelationship.getDegree(), newRelationship.getDegree())); + Set degrees = new HashSet<>(); + if (copyRelationship.hasDegrees()) { + degrees = copyRelationship.getDegrees().stream().collect(Collectors.toSet()); + } + degrees.add(newRelationship.getDegree()); + copyRelationship.setDegrees(new IntegerArray(degrees)); + UrnArrayArray copyPaths = + new UrnArrayArray( + existingRelationship.getPaths().size() + newRelationship.getPaths().size()); + copyPaths.addAll(existingRelationship.getPaths()); + copyPaths.addAll(newRelationship.getPaths()); + copyRelationship.setPaths(copyPaths); + return copyRelationship; + } catch (CloneNotSupportedException e) { + throw new RuntimeException("Failed to clone lineage relationship", e); + } + } + // Get 1-hop lineage relationships asynchronously in batches with timeout @WithSpan public List getLineageRelationshipsInBatches( @@ -302,11 +372,14 @@ public List getLineageRelationshipsInBatches( @Nonnull LineageDirection direction, GraphFilters graphFilters, Set visitedEntities, + Set viaEntities, int numHops, + int remainingHops, long remainingTime, Map existingPaths, @Nullable Long startTimeMillis, - @Nullable Long endTimeMillis) { + @Nullable Long endTimeMillis, + boolean exploreMultiplePaths) { List> batches = Lists.partition(entityUrns, graphQueryConfiguration.getBatchSize()); return ConcurrencyUtils.getAllCompleted( batches.stream() @@ -319,10 +392,13 @@ public List getLineageRelationshipsInBatches( direction, graphFilters, visitedEntities, + viaEntities, numHops, + remainingHops, existingPaths, startTimeMillis, - endTimeMillis))) + endTimeMillis, + exploreMultiplePaths))) .collect(Collectors.toList()), remainingTime, TimeUnit.MILLISECONDS) @@ -338,10 +414,13 @@ private List getLineageRelationships( @Nonnull LineageDirection direction, GraphFilters graphFilters, Set visitedEntities, + Set viaEntities, int numHops, + int remainingHops, Map existingPaths, @Nullable Long startTimeMillis, - @Nullable Long endTimeMillis) { + @Nullable Long endTimeMillis, + boolean exploreMultiplePaths) { Map> urnsPerEntityType = entityUrns.stream().collect(Collectors.groupingBy(Urn::getEntityType)); Map> edgesPerEntityType = @@ -365,7 +444,15 @@ private List getLineageRelationships( entry.getValue().stream().map(edgeInfo -> Pair.of(entry.getKey(), edgeInfo))) .collect(Collectors.toSet()); return extractRelationships( - entityUrnSet, response, validEdges, visitedEntities, numHops, existingPaths); + entityUrnSet, + response, + validEdges, + visitedEntities, + viaEntities, + numHops, + remainingHops, + existingPaths, + exploreMultiplePaths); } @VisibleForTesting @@ -408,7 +495,6 @@ public static QueryBuilder getLineageQuery( return finalQuery; } - // Get search query for given list of edges and source urns @VisibleForTesting public static QueryBuilder getLineageQueryForEntityType( @Nonnull List urns, @@ -464,27 +550,88 @@ public static void addEdgeToPaths( @Nonnull final Map existingPaths, @Nonnull final Urn parentUrn, @Nonnull final Urn childUrn) { + addEdgeToPaths(existingPaths, parentUrn, null, childUrn); + } + + /** + * Utility method to log paths to the debug log. 
+ * + * @param paths + * @param message + */ + private static void logPaths(UrnArrayArray paths, String message) { + if (log.isDebugEnabled()) { + log.debug("xxxxxxxxxx"); + log.debug(message); + log.debug("---------"); + if (paths != null) { + paths.forEach(path -> log.debug("{}", path)); + } else { + log.debug("EMPTY"); + } + log.debug("xxxxxxxxxx"); + } + } + + private static boolean containsCycle(final UrnArray path) { + Set urnSet = path.stream().collect(Collectors.toUnmodifiableSet()); + // path contains a cycle if any urn is repeated twice + return (path.size() != urnSet.size()); + } + + public static boolean addEdgeToPaths( + @Nonnull final Map existingPaths, + @Nonnull final Urn parentUrn, + final Urn viaUrn, + @Nonnull final Urn childUrn) { + boolean edgeAdded = false; // Collect all full-paths to this child node. This is what will be returned. UrnArrayArray pathsToParent = existingPaths.get(parentUrn); - if (pathsToParent != null && pathsToParent.size() > 0) { + logPaths(pathsToParent, String.format("Paths to Parent: %s, Child: %s", parentUrn, childUrn)); + logPaths(existingPaths.get(childUrn), String.format("Existing Paths to Child: %s", childUrn)); + if (pathsToParent != null && !pathsToParent.isEmpty()) { // If there are existing paths to this parent node, then we attempt // to append the child to each of the existing paths (lengthen it). // We then store this as a separate, unique path associated with the child. - for (final UrnArray pathToParent : pathsToParent) { + for (UrnArray pathToParent : pathsToParent) { + if (containsCycle(pathToParent)) { + log.debug("Skipping extending path {} because it contains a cycle", pathToParent); + continue; + } UrnArray pathToChild = clonePath(pathToParent); + if (viaUrn != null) { + pathToChild.add(viaUrn); + } pathToChild.add(childUrn); // Save these paths to the global structure for easy access on future iterations. existingPaths.putIfAbsent(childUrn, new UrnArrayArray()); - existingPaths.get(childUrn).add(pathToChild); + UrnArrayArray existingPathsToChild = existingPaths.get(childUrn); + boolean dupExists = false; + for (UrnArray existingPathToChild : existingPathsToChild) { + if (existingPathToChild.equals(pathToChild)) { + dupExists = true; + } + } + if (!dupExists) { + existingPathsToChild.add(pathToChild); + edgeAdded = true; + } } } else { // No existing paths to this parent urn. Let's create a new path to the child! UrnArray pathToChild = new UrnArray(); - pathToChild.addAll(ImmutableList.of(parentUrn, childUrn)); + if (viaUrn == null) { + pathToChild.addAll(ImmutableList.of(parentUrn, childUrn)); + } else { + pathToChild.addAll(ImmutableList.of(parentUrn, viaUrn, childUrn)); + } // Save these paths to the global structure for easy access on future iterations. 
existingPaths.putIfAbsent(childUrn, new UrnArrayArray()); existingPaths.get(childUrn).add(pathToChild); + edgeAdded = true; } + logPaths(existingPaths.get(childUrn), String.format("New paths to Child: %s", childUrn)); + return edgeAdded; } // Given set of edges and the search response, extract all valid edges that originate from the @@ -495,101 +642,198 @@ private static List extractRelationships( @Nonnull SearchResponse searchResponse, Set> validEdges, Set visitedEntities, + Set viaEntities, int numHops, - Map existingPaths) { - final List result = new LinkedList<>(); - final SearchHit[] hits = searchResponse.getHits().getHits(); - for (SearchHit hit : hits) { - final Map document = hit.getSourceAsMap(); - final Urn sourceUrn = - UrnUtils.getUrn(((Map) document.get(SOURCE)).get("urn").toString()); - final Urn destinationUrn = - UrnUtils.getUrn(((Map) document.get(DESTINATION)).get("urn").toString()); - final String type = document.get(RELATIONSHIP_TYPE).toString(); - final Number createdOnNumber = (Number) document.getOrDefault(CREATED_ON, null); - final Long createdOn = createdOnNumber != null ? createdOnNumber.longValue() : null; - final Number updatedOnNumber = (Number) document.getOrDefault(UPDATED_ON, null); - final Long updatedOn = updatedOnNumber != null ? updatedOnNumber.longValue() : null; - final String createdActorString = (String) document.getOrDefault(CREATED_ACTOR, null); - final Urn createdActor = - createdActorString == null ? null : UrnUtils.getUrn(createdActorString); - final String updatedActorString = (String) document.getOrDefault(UPDATED_ACTOR, null); - final Urn updatedActor = - updatedActorString == null ? null : UrnUtils.getUrn(updatedActorString); - final Map properties; - if (document.containsKey(PROPERTIES) && document.get(PROPERTIES) instanceof Map) { - properties = (Map) document.get(PROPERTIES); - } else { - properties = Collections.emptyMap(); - } - boolean isManual = properties.containsKey(SOURCE) && properties.get(SOURCE).equals("UI"); - - // Potential outgoing edge - if (entityUrns.contains(sourceUrn)) { - // Skip if already visited - // Skip if edge is not a valid outgoing edge - // TODO: Verify if this honors multiple paths to the same node. - if (!visitedEntities.contains(destinationUrn) - && validEdges.contains( - Pair.of( - sourceUrn.getEntityType(), - new EdgeInfo( - type, - RelationshipDirection.OUTGOING, - destinationUrn.getEntityType().toLowerCase())))) { - visitedEntities.add(destinationUrn); - // Append the edge to a set of unique graph paths. 
- addEdgeToPaths(existingPaths, sourceUrn, destinationUrn); - final LineageRelationship relationship = - createLineageRelationship( - type, - destinationUrn, - numHops, - existingPaths.getOrDefault( + int remainingHops, + Map existingPaths, + boolean exploreMultiplePaths) { + try { + Map lineageRelationshipMap = new HashMap<>(); + final SearchHit[] hits = searchResponse.getHits().getHits(); + log.debug("numHits: {}, numHops {}, remainingHops {}", hits.length, numHops, remainingHops); + int index = -1; + for (SearchHit hit : hits) { + index++; + final Map document = hit.getSourceAsMap(); + log.debug("{}: hit: {}", index, document); + final Urn sourceUrn = + UrnUtils.getUrn(((Map) document.get(SOURCE)).get("urn").toString()); + final Urn destinationUrn = + UrnUtils.getUrn( + ((Map) document.get(DESTINATION)).get("urn").toString()); + final String type = document.get(RELATIONSHIP_TYPE).toString(); + if (sourceUrn.equals(destinationUrn)) { + log.debug("Skipping a self-edge of type {} on {}", type, sourceUrn); + continue; + } + final Number createdOnNumber = (Number) document.getOrDefault(CREATED_ON, null); + final Long createdOn = createdOnNumber != null ? createdOnNumber.longValue() : null; + final Number updatedOnNumber = (Number) document.getOrDefault(UPDATED_ON, null); + final Long updatedOn = updatedOnNumber != null ? updatedOnNumber.longValue() : null; + final String createdActorString = (String) document.getOrDefault(CREATED_ACTOR, null); + final Urn createdActor = + createdActorString == null ? null : UrnUtils.getUrn(createdActorString); + final String updatedActorString = (String) document.getOrDefault(UPDATED_ACTOR, null); + final Urn updatedActor = + updatedActorString == null ? null : UrnUtils.getUrn(updatedActorString); + final Map properties; + if (document.containsKey(PROPERTIES) && document.get(PROPERTIES) instanceof Map) { + properties = (Map) document.get(PROPERTIES); + } else { + properties = Collections.emptyMap(); + } + boolean isManual = properties.containsKey(SOURCE) && properties.get(SOURCE).equals("UI"); + Urn viaEntity = null; + String viaContent = (String) document.getOrDefault(EDGE_FIELD_VIA, null); + if (viaContent != null) { + try { + viaEntity = Urn.createFromString(viaContent); + } catch (Exception e) { + log.warn( + "Failed to parse urn from via entity {}, will swallow exception and continue...", + viaContent); + } + } + log.debug("{}: viaEntity {}", index, viaEntity); + + // Potential outgoing edge + if (entityUrns.contains(sourceUrn)) { + log.debug("{}: entity urns contains source urn {}", index, sourceUrn); + // Skip if already visited or if we're exploring multiple paths + // Skip if edge is not a valid outgoing edge + if ((exploreMultiplePaths || !visitedEntities.contains(destinationUrn)) + && validEdges.contains( + Pair.of( + sourceUrn.getEntityType(), + new EdgeInfo( + type, + RelationshipDirection.OUTGOING, + destinationUrn.getEntityType().toLowerCase())))) { + + if (visitedEntities.contains(destinationUrn)) { + log.debug("Found a second path to the same urn {}", destinationUrn); + } + // Append the edge to a set of unique graph paths. + if (addEdgeToPaths(existingPaths, sourceUrn, viaEntity, destinationUrn)) { + final LineageRelationship relationship = + createLineageRelationship( + type, destinationUrn, - new UrnArrayArray()), // Fetch the paths to the next level entity. 
- createdOn, - createdActor, - updatedOn, - updatedActor, - isManual); - result.add(relationship); + numHops, + existingPaths.getOrDefault(destinationUrn, new UrnArrayArray()), + // Fetch the paths to the next level entity. + createdOn, + createdActor, + updatedOn, + updatedActor, + isManual); + log.debug("Adding relationship {} to urn {}", relationship, destinationUrn); + lineageRelationshipMap.put(relationship.getEntity(), relationship); + if ((viaEntity != null) && (!viaEntities.contains(viaEntity))) { + UrnArrayArray viaPaths = getViaPaths(existingPaths, destinationUrn, viaEntity); + LineageRelationship viaRelationship = + createLineageRelationship( + type, + viaEntity, + numHops, + viaPaths, + createdOn, + createdActor, + updatedOn, + updatedActor, + isManual); + viaEntities.add(viaEntity); + lineageRelationshipMap.put(viaRelationship.getEntity(), viaRelationship); + log.debug("Adding via entity {} with paths {}", viaEntity, viaPaths); + } + } + visitedEntities.add(destinationUrn); + } } - } - // Potential incoming edge - if (entityUrns.contains(destinationUrn)) { - // Skip if already visited - // Skip if edge is not a valid outgoing edge - // TODO: Verify if this honors multiple paths to the same node. - if (!visitedEntities.contains(sourceUrn) - && validEdges.contains( - Pair.of( - destinationUrn.getEntityType(), - new EdgeInfo( + // Potential incoming edge + if (entityUrns.contains(destinationUrn)) { + // Skip if already visited or if we're exploring multiple paths + // Skip if edge is not a valid outgoing edge + log.debug("entity urns contains destination urn {}", destinationUrn); + if ((exploreMultiplePaths || !visitedEntities.contains(sourceUrn)) + && validEdges.contains( + Pair.of( + destinationUrn.getEntityType(), + new EdgeInfo( + type, + RelationshipDirection.INCOMING, + sourceUrn.getEntityType().toLowerCase())))) { + if (visitedEntities.contains(sourceUrn)) { + log.debug("Found a second path to the same urn {}", sourceUrn); + } + visitedEntities.add(sourceUrn); + // Append the edge to a set of unique graph paths. + if (addEdgeToPaths(existingPaths, destinationUrn, viaEntity, sourceUrn)) { + log.debug("Adding incoming edge: {}, {}, {}", destinationUrn, viaEntity, sourceUrn); + final LineageRelationship relationship = + createLineageRelationship( + type, + sourceUrn, + numHops, + existingPaths.getOrDefault(sourceUrn, new UrnArrayArray()), + // Fetch the paths to the next level entity. + createdOn, + createdActor, + updatedOn, + updatedActor, + isManual); + log.debug("Adding relationship {} to urn {}", relationship, sourceUrn); + lineageRelationshipMap.put(relationship.getEntity(), relationship); + if ((viaEntity != null) && (!viaEntities.contains(viaEntity))) { + UrnArrayArray viaPaths = getViaPaths(existingPaths, sourceUrn, viaEntity); + viaEntities.add(viaEntity); + LineageRelationship viaRelationship = + createLineageRelationship( type, - RelationshipDirection.INCOMING, - sourceUrn.getEntityType().toLowerCase())))) { - visitedEntities.add(sourceUrn); - // Append the edge to a set of unique graph paths. - addEdgeToPaths(existingPaths, destinationUrn, sourceUrn); - final LineageRelationship relationship = - createLineageRelationship( - type, - sourceUrn, - numHops, - existingPaths.getOrDefault( - sourceUrn, new UrnArrayArray()), // Fetch the paths to the next level entity. 
- createdOn, - createdActor, - updatedOn, - updatedActor, - isManual); - result.add(relationship); + viaEntity, + numHops, + viaPaths, + createdOn, + createdActor, + updatedOn, + updatedActor, + isManual); + lineageRelationshipMap.put(viaRelationship.getEntity(), viaRelationship); + log.debug("Adding via relationship {} to urn {}", viaRelationship, viaEntity); + } + } + } + } + } + List result = new ArrayList<>(lineageRelationshipMap.values()); + log.debug("Number of lineage relationships in list: {}", result.size()); + log.debug("Result: {}", result); + return result; + } catch (Exception e) { + // This exception handler merely exists to log the exception at an appropriate point and + // rethrow + log.error("Caught exception", e); + throw e; + } + } + + private static UrnArrayArray getViaPaths( + Map existingPaths, Urn destinationUrn, Urn viaEntity) { + UrnArrayArray destinationPaths = + existingPaths.getOrDefault(destinationUrn, new UrnArrayArray()); + UrnArrayArray viaPaths = new UrnArrayArray(); + for (UrnArray destPath : destinationPaths) { + UrnArray viaPath = new UrnArray(); + for (Urn urn : destPath) { + viaPath.add(urn); + if (urn.equals(viaEntity)) { + break; } } + viaPaths.add(viaPath); } - return result; + return viaPaths; } private static LineageRelationship createLineageRelationship( @@ -607,6 +851,7 @@ private static LineageRelationship createLineageRelationship( .setType(type) .setEntity(entityUrn) .setDegree(numHops) + .setDegrees(new IntegerArray(ImmutableList.of(numHops))) .setPaths(paths); if (createdOn != null) { relationship.setCreatedOn(createdOn); @@ -658,8 +903,10 @@ private static QueryBuilder buildEntityTypesFilter( } private static QueryBuilder buildUrnFilters(@Nonnull List urns, @Nonnull String prefix) { - return QueryBuilders.termsQuery( - prefix + ".urn", urns.stream().map(Object::toString).collect(Collectors.toList())); + // dedup urns while preserving order + LinkedHashSet urnSet = new LinkedHashSet<>(); + urns.forEach(urn -> urnSet.add(urn.toString())); + return QueryBuilders.termsQuery(prefix + ".urn", urnSet); } private static QueryBuilder buildEdgeFilters(@Nonnull List edgeInfos) { diff --git a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphWriteDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphWriteDAO.java index 5d722a034fafc..ddbd00f90ef68 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphWriteDAO.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphWriteDAO.java @@ -75,4 +75,27 @@ public BulkByScrollResponse deleteByQuery( .deleteByQuery(finalQuery, indexConvention.getIndexName(INDEX_NAME)) .orElse(null); } + + public BulkByScrollResponse deleteByQuery( + @Nullable final String sourceType, + @Nonnull final Filter sourceEntityFilter, + @Nullable final String destinationType, + @Nonnull final Filter destinationEntityFilter, + @Nonnull final List relationshipTypes, + @Nonnull final RelationshipFilter relationshipFilter, + String lifecycleOwner) { + BoolQueryBuilder finalQuery = + buildQuery( + sourceType == null ? ImmutableList.of() : ImmutableList.of(sourceType), + sourceEntityFilter, + destinationType == null ? 
ImmutableList.of() : ImmutableList.of(destinationType), + destinationEntityFilter, + relationshipTypes, + relationshipFilter, + lifecycleOwner); + + return bulkProcessor + .deleteByQuery(finalQuery, indexConvention.getIndexName(INDEX_NAME)) + .orElse(null); + } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java index 67590ffd6e7c1..90f46190ac18e 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java @@ -1,5 +1,7 @@ package com.linkedin.metadata.graph.elastic; +import static com.linkedin.metadata.graph.elastic.GraphRelationshipMappingsBuilder.*; + import com.fasterxml.jackson.databind.node.JsonNodeFactory; import com.fasterxml.jackson.databind.node.ObjectNode; import com.google.common.annotations.VisibleForTesting; @@ -81,9 +83,9 @@ private String toDocument(@Nonnull final Edge edge) { destinationObject.put("urn", edge.getDestination().toString()); destinationObject.put("entityType", edge.getDestination().getEntityType()); - searchDocument.set("source", sourceObject); - searchDocument.set("destination", destinationObject); - searchDocument.put("relationshipType", edge.getRelationshipType()); + searchDocument.set(EDGE_FIELD_SOURCE, sourceObject); + searchDocument.set(EDGE_FIELD_DESTINATION, destinationObject); + searchDocument.put(EDGE_FIELD_RELNSHIP_TYPE, edge.getRelationshipType()); if (edge.getCreatedOn() != null) { searchDocument.put("createdOn", edge.getCreatedOn()); } @@ -108,8 +110,15 @@ private String toDocument(@Nonnull final Edge edge) { entry.getKey(), entry.getValue())); } } - searchDocument.set("properties", propertiesObject); + searchDocument.set(EDGE_FIELD_PROPERTIES, propertiesObject); + } + if (edge.getLifecycleOwner() != null) { + searchDocument.put(EDGE_FIELD_LIFECYCLE_OWNER, edge.getLifecycleOwner().toString()); + } + if (edge.getVia() != null) { + searchDocument.put(EDGE_FIELD_VIA, edge.getVia().toString()); } + log.debug("Search doc for write {}", searchDocument); return searchDocument.toString(); } @@ -192,8 +201,8 @@ public RelatedEntitiesResult findRelatedEntities( final List relationships = searchHitsToRelatedEntities(response.getHits().getHits(), relationshipDirection).stream() .map(RelatedEntities::asRelatedEntity) + .filter(Objects::nonNull) .collect(Collectors.toList()); - return new RelatedEntitiesResult(offset, relationships.size(), totalCount, relationships); } @@ -277,6 +286,10 @@ public void removeNode(@Nonnull final Urn urn) { _graphWriteDAO.deleteByQuery( null, urnFilter, null, emptyFilter, relationshipTypes, incomingFilter); + // Delete all edges where this entity is a lifecycle owner + _graphWriteDAO.deleteByQuery( + null, emptyFilter, null, emptyFilter, relationshipTypes, incomingFilter, urn.toString()); + return; } @@ -394,15 +407,15 @@ private static List searchHitsToRelatedEntities( return Arrays.stream(searchHits) .map( hit -> { + final Map hitMap = hit.getSourceAsMap(); final String destinationUrnStr = - ((HashMap) - hit.getSourceAsMap().getOrDefault("destination", EMPTY_HASH)) + ((Map) hitMap.getOrDefault(EDGE_FIELD_DESTINATION, EMPTY_HASH)) .getOrDefault("urn", null); final String sourceUrnStr = - ((HashMap) - hit.getSourceAsMap().getOrDefault("source", EMPTY_HASH)) + ((Map) hitMap.getOrDefault(EDGE_FIELD_SOURCE, EMPTY_HASH)) .getOrDefault("urn", 
null); - final String relationshipType = (String) hit.getSourceAsMap().get("relationshipType"); + final String relationshipType = (String) hitMap.get(EDGE_FIELD_RELNSHIP_TYPE); + String viaEntity = (String) hitMap.get(EDGE_FIELD_VIA); if (destinationUrnStr == null || sourceUrnStr == null || relationshipType == null) { log.error( @@ -414,7 +427,11 @@ private static List searchHitsToRelatedEntities( } return new RelatedEntities( - relationshipType, sourceUrnStr, destinationUrnStr, relationshipDirection); + relationshipType, + sourceUrnStr, + destinationUrnStr, + relationshipDirection, + viaEntity); }) .filter(Objects::nonNull) .collect(Collectors.toList()); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/GraphRelationshipMappingsBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/GraphRelationshipMappingsBuilder.java index 21f2bf6c89204..ab4eaa1b99392 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/GraphRelationshipMappingsBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/GraphRelationshipMappingsBuilder.java @@ -7,16 +7,23 @@ @Slf4j public class GraphRelationshipMappingsBuilder { + public static final String EDGE_FIELD_SOURCE = "source"; + public static final String EDGE_FIELD_DESTINATION = "destination"; + public static final String EDGE_FIELD_RELNSHIP_TYPE = "relationshipType"; + public static final String EDGE_FIELD_PROPERTIES = "properties"; + public static final String EDGE_FIELD_VIA = "via"; + public static final String EDGE_FIELD_LIFECYCLE_OWNER = "lifecycleOwner"; private GraphRelationshipMappingsBuilder() {} public static Map getMappings() { Map mappings = new HashMap<>(); - mappings.put("source", getMappingsForEntity()); - mappings.put("destination", getMappingsForEntity()); - mappings.put("relationshipType", getMappingsForKeyword()); - mappings.put("properties", getMappingsForEdgeProperties()); - + mappings.put(EDGE_FIELD_SOURCE, getMappingsForEntity()); + mappings.put(EDGE_FIELD_DESTINATION, getMappingsForEntity()); + mappings.put(EDGE_FIELD_RELNSHIP_TYPE, getMappingsForKeyword()); + mappings.put(EDGE_FIELD_PROPERTIES, getMappingsForEdgeProperties()); + mappings.put(EDGE_FIELD_LIFECYCLE_OWNER, getMappingsForKeyword()); + mappings.put(EDGE_FIELD_VIA, getMappingsForKeyword()); return ImmutableMap.of("properties", mappings); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphService.java b/metadata-io/src/main/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphService.java index a1f73a134ec8e..11acc138d4dba 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphService.java @@ -534,7 +534,8 @@ record -> .get(0) .asNode() .get("urn") - .asString())); // Urn TODO: Validate this works against Neo4j. + .asString(), // Urn TODO: Validate this works against Neo4j. 
+ null)); final int totalCount = runQuery(countStatement).single().get(0).asInt(); return new RelatedEntitiesResult(offset, relatedEntities.size(), totalCount, relatedEntities); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/LineageSearchService.java b/metadata-io/src/main/java/com/linkedin/metadata/search/LineageSearchService.java index f6358e4aeb207..cf9279414a394 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/LineageSearchService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/LineageSearchService.java @@ -19,6 +19,9 @@ import com.linkedin.metadata.graph.LineageRelationship; import com.linkedin.metadata.graph.LineageRelationshipArray; import com.linkedin.metadata.query.FreshnessStats; +import com.linkedin.metadata.query.GroupingCriterion; +import com.linkedin.metadata.query.GroupingCriterionArray; +import com.linkedin.metadata.query.GroupingSpec; import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.query.filter.ConjunctiveCriterion; import com.linkedin.metadata.query.filter.Criterion; @@ -55,13 +58,22 @@ @RequiredArgsConstructor @Slf4j public class LineageSearchService { + private static final SearchFlags DEFAULT_SERVICE_SEARCH_FLAGS = new SearchFlags() .setFulltext(false) .setMaxAggValues(20) .setSkipCache(false) .setSkipAggregates(false) - .setSkipHighlighting(true); + .setSkipHighlighting(true) + .setGroupingSpec( + new GroupingSpec() + .setGroupingCriteria( + new GroupingCriterionArray( + new GroupingCriterion() // Convert schema fields to datasets by default to + // maintain backwards compatibility + .setBaseEntityType(SCHEMA_FIELD_ENTITY_NAME) + .setGroupingEntityType(DATASET_ENTITY_NAME)))); private final SearchService _searchService; private final GraphService _graphService; @Nullable private final Cache cache; @@ -206,14 +218,18 @@ public LineageSearchResult searchAcrossLineage( } } - // set schemaField relationship entity to be its reference urn - LineageRelationshipArray updatedRelationships = convertSchemaFieldRelationships(lineageResult); - lineageResult.setRelationships(updatedRelationships); + if (SearchUtils.convertSchemaFieldToDataset(searchFlags)) { + // set schemaField relationship entity to be its reference urn + LineageRelationshipArray updatedRelationships = + convertSchemaFieldRelationships(lineageResult); + lineageResult.setRelationships(updatedRelationships); + } // Filter hopped result based on the set of entities to return and inputFilters before sending // to search List lineageRelationships = filterRelationships(lineageResult, new HashSet<>(entities), inputFilters); + log.debug("Lineage relationships found: {}", lineageRelationships); String lineageGraphInfo = String.format( @@ -247,7 +263,9 @@ public LineageSearchResult searchAcrossLineage( lineageRelationships, input, reducedFilters, sortCriterion, from, size, finalFlags); if (!lineageSearchResult.getEntities().isEmpty()) { log.debug( - "Lineage entity result: {}", lineageSearchResult.getEntities().get(0).toString()); + "Lineage entity results number -> {}; first -> {}", + lineageSearchResult.getNumEntities(), + lineageSearchResult.getEntities().get(0).toString()); } numEntities = lineageSearchResult.getNumEntities(); return lineageSearchResult; @@ -470,9 +488,17 @@ private Map generateUrnToRelationshipMap( if (existingRelationship == null) { urnToRelationship.put(relationship.getEntity(), relationship); } else { - UrnArrayArray paths = existingRelationship.getPaths(); - paths.addAll(relationship.getPaths()); - 
existingRelationship.setPaths(paths); + UrnArrayArray newPaths = + new UrnArrayArray( + existingRelationship.getPaths().size() + relationship.getPaths().size()); + log.debug( + "Found {} paths for {}, will add to existing paths: {}", + relationship.getPaths().size(), + relationship.getEntity(), + existingRelationship.getPaths().size()); + newPaths.addAll(existingRelationship.getPaths()); + newPaths.addAll(relationship.getPaths()); + existingRelationship.setPaths(newPaths); } } return urnToRelationship; @@ -665,6 +691,9 @@ private LineageSearchEntity buildLineageSearchEntity( if (lineageRelationship != null) { entity.setPaths(lineageRelationship.getPaths()); entity.setDegree(lineageRelationship.getDegree()); + if (lineageRelationship.hasDegrees()) { + entity.setDegrees(lineageRelationship.getDegrees()); + } } return entity; } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/SearchUtils.java b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/SearchUtils.java index b8cf0626b7251..13ccfd7f972af 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/SearchUtils.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/SearchUtils.java @@ -1,5 +1,7 @@ package com.linkedin.metadata.search.utils; +import static com.linkedin.metadata.Constants.*; + import com.linkedin.common.UrnArray; import com.linkedin.common.urn.Urn; import com.linkedin.data.template.LongMap; @@ -196,6 +198,28 @@ public static SearchFlags applyDefaultSearchFlags( if (!finalSearchFlags.hasSkipCache() || finalSearchFlags.isSkipCache() == null) { finalSearchFlags.setSkipCache(defaultFlags.isSkipCache()); } + if ((!finalSearchFlags.hasGroupingSpec() || finalSearchFlags.getGroupingSpec() == null) + && (defaultFlags.getGroupingSpec() != null)) { + finalSearchFlags.setGroupingSpec(defaultFlags.getGroupingSpec()); + } return finalSearchFlags; } + + /** + * Returns true if the search flags contain a grouping spec that requires conversion of schema + * field entity to dataset entity. + * + * @param searchFlags the search flags + * @return true if the search flags contain a grouping spec that requires conversion of schema + * field entity to dataset entity. 
+ */ + public static boolean convertSchemaFieldToDataset(@Nullable SearchFlags searchFlags) { + return (searchFlags != null) + && (searchFlags.getGroupingSpec() != null) + && (searchFlags.getGroupingSpec().getGroupingCriteria().stream() + .anyMatch( + grouping -> + grouping.getBaseEntityType().equals(SCHEMA_FIELD_ENTITY_NAME) + && grouping.getGroupingEntityType().equals(DATASET_ENTITY_NAME))); + } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java b/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java index ee2d794471f6b..ed633b063afb2 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java @@ -14,7 +14,9 @@ import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; import com.linkedin.data.template.RecordTemplate; +import com.linkedin.datajob.DataJobInputOutput; import com.linkedin.dataset.FineGrainedLineage; +import com.linkedin.dataset.FineGrainedLineageArray; import com.linkedin.dataset.UpstreamLineage; import com.linkedin.events.metadata.ChangeType; import com.linkedin.metadata.Constants; @@ -275,21 +277,36 @@ private void handleDeleteChangeEvent(@Nonnull final MCLBatchItem event) { // TODO: remove this method once we implement sourceOverride when creating graph edges private void updateFineGrainedEdgesAndRelationships( - RecordTemplate aspect, + Urn entity, + FineGrainedLineageArray fineGrainedLineageArray, List edgesToAdd, HashMap> urnToRelationshipTypesBeingAdded) { - UpstreamLineage upstreamLineage = new UpstreamLineage(aspect.data()); - if (upstreamLineage.getFineGrainedLineages() != null) { - for (FineGrainedLineage fineGrainedLineage : upstreamLineage.getFineGrainedLineages()) { + if (fineGrainedLineageArray != null) { + for (FineGrainedLineage fineGrainedLineage : fineGrainedLineageArray) { if (!fineGrainedLineage.hasDownstreams() || !fineGrainedLineage.hasUpstreams()) { break; } + // Fine grained lineage array is present either on datajob (datajob input/output) or dataset + // We set the datajob as the viaEntity in scenario 1, and the query (if present) as the + // viaEntity in scenario 2 + Urn viaEntity = + entity.getEntityType().equals("dataJob") ? entity : fineGrainedLineage.getQuery(); // for every downstream, create an edge with each of the upstreams for (Urn downstream : fineGrainedLineage.getDownstreams()) { for (Urn upstream : fineGrainedLineage.getUpstreams()) { // TODO: add edges uniformly across aspects edgesToAdd.add( - new Edge(downstream, upstream, DOWNSTREAM_OF, null, null, null, null, null)); + new Edge( + downstream, + upstream, + DOWNSTREAM_OF, + null, + null, + null, + null, + null, + entity, + viaEntity)); Set relationshipTypes = urnToRelationshipTypesBeingAdded.getOrDefault(downstream, new HashSet<>()); relationshipTypes.add(DOWNSTREAM_OF); @@ -357,12 +374,23 @@ private Pair, HashMap>> getEdgesAndRelationshipTypes // inputFields // since @Relationship only links between the parent entity urn and something else. 
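For illustration, a standalone sketch of the edge shape the updateFineGrainedEdgesAndRelationships block above produces: the aspect's entity becomes the lifecycleOwner, and the detected query (or the data job itself) becomes the via node. The URNs here are hypothetical, "DownstreamOf" stands in for the DOWNSTREAM_OF constant, and the ten-argument constructor is the all-args constructor implied by the delegating constructor added to Edge.java later in this patch.

import com.linkedin.common.urn.Urn;
import com.linkedin.common.urn.UrnUtils;
import com.linkedin.metadata.graph.Edge;

class ViaEdgeSketch {
  public static void main(String[] args) {
    // Hypothetical dataset-to-dataset lineage detected from a query
    Urn sink = UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:hive,db.sink,PROD)");
    Urn source = UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:hive,db.source,PROD)");
    Urn query = UrnUtils.getUrn("urn:li:query:q1");
    Edge edge =
        new Edge(
            sink, // edge source: the downstream side of DownstreamOf
            source, // edge destination: the upstream side
            "DownstreamOf",
            null, null, null, null, null, // audit stamps and properties left unset
            sink, // lifecycleOwner: the entity whose aspect created this edge
            query); // via: the query the lineage was derived from
    System.out.println(edge.getVia()); // urn:li:query:q1
  }
}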
if (aspectSpec.getName().equals(Constants.UPSTREAM_LINEAGE_ASPECT_NAME)) { - updateFineGrainedEdgesAndRelationships(aspect, edgesToAdd, urnToRelationshipTypesBeingAdded); - } - if (aspectSpec.getName().equals(Constants.INPUT_FIELDS_ASPECT_NAME)) { + UpstreamLineage upstreamLineage = new UpstreamLineage(aspect.data()); + updateFineGrainedEdgesAndRelationships( + urn, + upstreamLineage.getFineGrainedLineages(), + edgesToAdd, + urnToRelationshipTypesBeingAdded); + } else if (aspectSpec.getName().equals(Constants.INPUT_FIELDS_ASPECT_NAME)) { final InputFields inputFields = new InputFields(aspect.data()); updateInputFieldEdgesAndRelationships( urn, inputFields, edgesToAdd, urnToRelationshipTypesBeingAdded); + } else if (aspectSpec.getName().equals(Constants.DATA_JOB_INPUT_OUTPUT_ASPECT_NAME)) { + DataJobInputOutput dataJobInputOutput = new DataJobInputOutput(aspect.data()); + updateFineGrainedEdgesAndRelationships( + urn, + dataJobInputOutput.getFineGrainedLineages(), + edgesToAdd, + urnToRelationshipTypesBeingAdded); } Map> extractedFields = @@ -394,7 +422,7 @@ private void updateGraphService( edgeAndRelationTypes.getSecond(); log.debug("Here's the relationship types found {}", urnToRelationshipTypesBeingAdded); - if (urnToRelationshipTypesBeingAdded.size() > 0) { + if (!urnToRelationshipTypesBeingAdded.isEmpty()) { for (Map.Entry> entry : urnToRelationshipTypesBeingAdded.entrySet()) { _graphService.removeEdgesFromNode( entry.getKey(), diff --git a/metadata-io/src/test/java/com/linkedin/metadata/graph/GraphServiceTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/graph/GraphServiceTestBase.java index 3a51344d5779d..2de61c8ed31bb 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/graph/GraphServiceTestBase.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/graph/GraphServiceTestBase.java @@ -14,6 +14,9 @@ import com.linkedin.common.urn.DataJobUrn; import com.linkedin.common.urn.Urn; import com.linkedin.data.schema.annotation.PathSpecBasedSchemaAnnotationVisitor; +import com.linkedin.metadata.config.search.GraphQueryConfiguration; +import com.linkedin.metadata.graph.dgraph.DgraphGraphService; +import com.linkedin.metadata.graph.neo4j.Neo4jGraphService; import com.linkedin.metadata.query.filter.Filter; import com.linkedin.metadata.query.filter.RelationshipDirection; import com.linkedin.metadata.query.filter.RelationshipFilter; @@ -36,6 +39,8 @@ import java.util.stream.IntStream; import javax.annotation.Nonnull; import javax.annotation.Nullable; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; import org.testng.Assert; import org.testng.annotations.BeforeMethod; @@ -198,6 +203,19 @@ public void testStaticUrns() { @Nonnull protected abstract GraphService getGraphService() throws Exception; + /** + * Graph services that support multi-path search should override this method to provide a + * multi-path search enabled GraphService instance. + * + * @param enableMultiPathSearch + * @return + * @throws Exception + */ + @Nonnull + protected GraphService getGraphService(boolean enableMultiPathSearch) throws Exception { + return getGraphService(); + } + /** * Allows the specific GraphService test implementation to wait for GraphService writes to be * synced / become available to reads. 
@@ -235,7 +253,12 @@ protected GraphService getPopulatedGraphService() throws Exception {
 }

 protected GraphService getLineagePopulatedGraphService() throws Exception {
- GraphService service = getGraphService();
+ return getLineagePopulatedGraphService(
+ GraphQueryConfiguration.testDefaults.isEnableMultiPathSearch());
+ }
+
+ protected GraphService getLineagePopulatedGraphService(boolean multiPathSearch) throws Exception {
+ GraphService service = getGraphService(multiPathSearch);
 List edges =
 Arrays.asList(
@@ -1821,9 +1844,16 @@ public void run() {
 assertEquals(throwables.size(), 0);
 }

- @Test
- public void testPopulatedGraphServiceGetLineageMultihop() throws Exception {
- GraphService service = getLineagePopulatedGraphService();
+ @ParameterizedTest
+ @ValueSource(booleans = {true, false})
+ public void testPopulatedGraphServiceGetLineageMultihop(boolean attemptMultiPathAlgo)
+ throws Exception {
+
+ GraphService service = getLineagePopulatedGraphService(attemptMultiPathAlgo);
+ // Implementations other than Neo4J and DGraph explore more of the graph to discover nodes at
+ // multiple hops
+ boolean expandedGraphAlgoEnabled =
+ (!((service instanceof Neo4jGraphService) || (service instanceof DgraphGraphService)));

 EntityLineageResult upstreamLineage =
 service.getLineage(datasetOneUrn, LineageDirection.UPSTREAM, 0, 1000, 2);
@@ -1838,16 +1868,23 @@ public void testPopulatedGraphServiceGetLineageMultihop() throws Exception {
 Map relationships =
 downstreamLineage.getRelationships().stream()
 .collect(Collectors.toMap(LineageRelationship::getEntity, Function.identity()));
+ Set entities = relationships.keySet().stream().collect(Collectors.toUnmodifiableSet());
+ assertEquals(entities.size(), 5);
 assertTrue(relationships.containsKey(datasetTwoUrn));
- assertEquals(relationships.get(datasetTwoUrn).getDegree().intValue(), 1);
+ assertEquals(relationships.get(datasetTwoUrn).getDegree(), 1);
 assertTrue(relationships.containsKey(datasetThreeUrn));
- assertEquals(relationships.get(datasetThreeUrn).getDegree().intValue(), 2);
+ assertEquals(relationships.get(datasetThreeUrn).getDegree(), 2);
 assertTrue(relationships.containsKey(datasetFourUrn));
- assertEquals(relationships.get(datasetFourUrn).getDegree().intValue(), 2);
+ assertEquals(relationships.get(datasetFourUrn).getDegree(), 2);
 assertTrue(relationships.containsKey(dataJobOneUrn));
- assertEquals(relationships.get(dataJobOneUrn).getDegree().intValue(), 1);
+ assertEquals(relationships.get(dataJobOneUrn).getDegree(), 1);
+ // dataJobOne is present both at degree 1 and degree 2
+ if (expandedGraphAlgoEnabled && attemptMultiPathAlgo) {
+ assertTrue(relationships.get(dataJobOneUrn).getDegrees().contains(Integer.valueOf(1)));
+ assertTrue(relationships.get(dataJobOneUrn).getDegrees().contains(Integer.valueOf(2)));
+ }
 assertTrue(relationships.containsKey(dataJobTwoUrn));
- assertEquals(relationships.get(dataJobTwoUrn).getDegree().intValue(), 1);
+ assertEquals(relationships.get(dataJobTwoUrn).getDegree(), 1);

 upstreamLineage = service.getLineage(datasetThreeUrn, LineageDirection.UPSTREAM, 0, 1000, 2);
 assertEquals(upstreamLineage.getTotal().intValue(), 3);
@@ -1856,11 +1893,11 @@ public void testPopulatedGraphServiceGetLineageMultihop() throws Exception {
 upstreamLineage.getRelationships().stream()
 .collect(Collectors.toMap(LineageRelationship::getEntity, Function.identity()));
 assertTrue(relationships.containsKey(datasetOneUrn));
- assertEquals(relationships.get(datasetOneUrn).getDegree().intValue(), 2);
+ assertEquals(relationships.get(datasetOneUrn).getDegree(), 2);
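The degree assertions above rely on the contract spelled out in LineageRelationship.pdl later in this patch: the deprecated scalar degree is populated with min(degrees) for backward compatibility. A tiny self-contained sketch of that relationship, with made-up hop counts:

import java.util.List;

class DegreeBackCompatSketch {
  public static void main(String[] args) {
    // With multi-path search enabled, one entity can be reached at several depths
    List<Integer> degrees = List.of(1, 2);
    // Older clients still read the deprecated scalar, filled with the minimum depth
    int degree = degrees.stream().mapToInt(Integer::intValue).min().orElse(1);
    System.out.println(degree); // prints 1
  }
}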
assertTrue(relationships.containsKey(datasetTwoUrn)); - assertEquals(relationships.get(datasetTwoUrn).getDegree().intValue(), 1); + assertEquals(relationships.get(datasetTwoUrn).getDegree(), 1); assertTrue(relationships.containsKey(dataJobOneUrn)); - assertEquals(relationships.get(dataJobOneUrn).getDegree().intValue(), 1); + assertEquals(relationships.get(dataJobOneUrn).getDegree(), 1); downstreamLineage = service.getLineage(datasetThreeUrn, LineageDirection.DOWNSTREAM, 0, 1000, 2); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/graph/dgraph/DgraphGraphServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/graph/dgraph/DgraphGraphServiceTest.java index 40b8e83b56d03..1ccf018a74c3a 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/graph/dgraph/DgraphGraphServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/graph/dgraph/DgraphGraphServiceTest.java @@ -820,7 +820,7 @@ public void testGetDestinationUrnsFromResponseData() { } @Override - public void testPopulatedGraphServiceGetLineageMultihop() { + public void testPopulatedGraphServiceGetLineageMultihop(boolean attemptMultiHop) { // TODO: Remove this overridden method once the multihop for dGraph is implemented! } } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/graph/search/SearchGraphServiceTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/graph/search/SearchGraphServiceTestBase.java index bd500cd469100..7f0e4294e0cbf 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/graph/search/SearchGraphServiceTestBase.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/graph/search/SearchGraphServiceTestBase.java @@ -57,12 +57,14 @@ public abstract class SearchGraphServiceTestBase extends GraphServiceTestBase { private final IndexConvention _indexConvention = new IndexConventionImpl(null); private final String _indexName = _indexConvention.getIndexName(INDEX_NAME); private ElasticSearchGraphService _client; + private boolean _enableMultiPathSearch = + GraphQueryConfiguration.testDefaults.isEnableMultiPathSearch(); private static final String TAG_RELATIONSHIP = "SchemaFieldTaggedWith"; @BeforeClass public void setup() { - _client = buildService(); + _client = buildService(_enableMultiPathSearch); _client.configure(); } @@ -73,8 +75,10 @@ public void wipe() throws Exception { } @Nonnull - private ElasticSearchGraphService buildService() { + private ElasticSearchGraphService buildService(boolean enableMultiPathSearch) { LineageRegistry lineageRegistry = new LineageRegistry(SnapshotEntityRegistry.getInstance()); + GraphQueryConfiguration configuration = GraphQueryConfiguration.testDefaults; + configuration.setEnableMultiPathSearch(enableMultiPathSearch); ESGraphQueryDAO readDAO = new ESGraphQueryDAO( getSearchClient(), @@ -93,10 +97,21 @@ private ElasticSearchGraphService buildService() { @Override @Nonnull - protected GraphService getGraphService() { + protected GraphService getGraphService(boolean enableMultiPathSearch) { + if (enableMultiPathSearch != _enableMultiPathSearch) { + _enableMultiPathSearch = enableMultiPathSearch; + _client = buildService(enableMultiPathSearch); + _client.configure(); + } return _client; } + @Override + @Nonnull + protected GraphService getGraphService() { + return getGraphService(GraphQueryConfiguration.testDefaults.isEnableMultiPathSearch()); + } + @Override protected void syncAfterWrite() throws Exception { SearchTestUtils.syncAfterWrite(getBulkProcessor()); diff --git 
a/metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/LineageDataFixtureTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/LineageDataFixtureTestBase.java index 44fe5ea8ac9ae..59942f76744da 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/LineageDataFixtureTestBase.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/LineageDataFixtureTestBase.java @@ -10,7 +10,9 @@ import com.linkedin.metadata.search.LineageSearchService; import com.linkedin.metadata.search.SearchResult; import com.linkedin.metadata.search.SearchService; +import com.linkedin.util.Pair; import java.net.URISyntaxException; +import java.util.stream.Stream; import javax.annotation.Nonnull; import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; import org.testng.annotations.Test; @@ -49,16 +51,17 @@ public void testDatasetLineage() throws URISyntaxException { Urn.createFromString( "urn:li:dataset:(urn:li:dataPlatform:9cf8c96,e3859789eed1cef55288b44f016ee08290d9fd08973e565c112d8,PROD)"); - // 1 hops - LineageSearchResult lineageResult = lineage(getLineageService(), testUrn, 1); - assertEquals(lineageResult.getEntities().size(), 10); - - // 2 hops - lineageResult = lineage(getLineageService(), testUrn, 2); - assertEquals(lineageResult.getEntities().size(), 5); - - // 3 hops - lineageResult = lineage(getLineageService(), testUrn, 3); - assertEquals(lineageResult.getEntities().size(), 12); + Stream> hopsExpectedResultsStream = + Stream.of( + Pair.of(1, 10), // Hop 1 -> 10 results + Pair.of(2, 5), // Hop 2 -> 5 results + Pair.of(3, 12) // Hop 3 -> 12 results + ); + hopsExpectedResultsStream.forEach( + hopsExpectedResults -> { + LineageSearchResult lineageResult = + lineage(getLineageService(), testUrn, hopsExpectedResults.getFirst()); + assertEquals(lineageResult.getEntities().size(), hopsExpectedResults.getSecond()); + }); } } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/utils/SearchUtilsTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/utils/SearchUtilsTest.java index 5ea58e3416205..f4e8224254530 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/utils/SearchUtilsTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/utils/SearchUtilsTest.java @@ -1,22 +1,46 @@ package com.linkedin.metadata.search.utils; +import static com.linkedin.metadata.Constants.*; import static org.testng.Assert.assertEquals; +import com.linkedin.data.template.SetMode; +import com.linkedin.metadata.query.GroupingCriterion; +import com.linkedin.metadata.query.GroupingCriterionArray; +import com.linkedin.metadata.query.GroupingSpec; import com.linkedin.metadata.query.SearchFlags; import java.util.Set; import org.testng.annotations.Test; public class SearchUtilsTest { - @Test - public void testApplyDefaultSearchFlags() { - SearchFlags defaultFlags = + private SearchFlags getDefaultSearchFlags() { + return setConvertSchemaFieldsToDatasets( new SearchFlags() .setFulltext(true) .setSkipCache(true) .setSkipAggregates(true) .setMaxAggValues(1) - .setSkipHighlighting(true); + .setSkipHighlighting(true), + true); + } + + private SearchFlags setConvertSchemaFieldsToDatasets(SearchFlags flags, boolean value) { + if (value) { + return flags.setGroupingSpec( + new GroupingSpec() + .setGroupingCriteria( + new GroupingCriterionArray( + new GroupingCriterion() + .setBaseEntityType(SCHEMA_FIELD_ENTITY_NAME) + .setGroupingEntityType(DATASET_ENTITY_NAME)))); + } else { + return 
flags.setGroupingSpec(null, SetMode.REMOVE_IF_NULL); + } + } + + @Test + public void testApplyDefaultSearchFlags() { + SearchFlags defaultFlags = getDefaultSearchFlags(); assertEquals( SearchUtils.applyDefaultSearchFlags(null, "not empty", defaultFlags), @@ -33,12 +57,14 @@ public void testApplyDefaultSearchFlags() { .setSkipHighlighting(false), "not empty", defaultFlags), - new SearchFlags() - .setFulltext(false) - .setSkipAggregates(false) - .setSkipCache(false) - .setMaxAggValues(2) - .setSkipHighlighting(false), + setConvertSchemaFieldsToDatasets( + new SearchFlags() + .setFulltext(false) + .setSkipAggregates(false) + .setSkipCache(false) + .setMaxAggValues(2) + .setSkipHighlighting(false), + SearchUtils.convertSchemaFieldToDataset(defaultFlags)), "Expected no default values"); assertEquals( @@ -51,12 +77,14 @@ public void testApplyDefaultSearchFlags() { .setSkipHighlighting(false), null, defaultFlags), - new SearchFlags() - .setFulltext(false) - .setSkipAggregates(false) - .setSkipCache(false) - .setMaxAggValues(2) - .setSkipHighlighting(true), + setConvertSchemaFieldsToDatasets( + new SearchFlags() + .setFulltext(false) + .setSkipAggregates(false) + .setSkipCache(false) + .setMaxAggValues(2) + .setSkipHighlighting(true), + SearchUtils.convertSchemaFieldToDataset(defaultFlags)), "Expected skip highlight due to query null query"); for (String query : Set.of("*", "")) { assertEquals( @@ -69,94 +97,105 @@ public void testApplyDefaultSearchFlags() { .setSkipHighlighting(false), query, defaultFlags), - new SearchFlags() - .setFulltext(false) - .setSkipAggregates(false) - .setSkipCache(false) - .setMaxAggValues(2) - .setSkipHighlighting(true), + setConvertSchemaFieldsToDatasets( + new SearchFlags() + .setFulltext(false) + .setSkipAggregates(false) + .setSkipCache(false) + .setMaxAggValues(2) + .setSkipHighlighting(true), + SearchUtils.convertSchemaFieldToDataset(defaultFlags)), String.format("Expected skip highlight due to query string `%s`", query)); } assertEquals( SearchUtils.applyDefaultSearchFlags( new SearchFlags().setFulltext(false), "not empty", defaultFlags), - new SearchFlags() - .setFulltext(false) - .setSkipAggregates(true) - .setSkipCache(true) - .setMaxAggValues(1) - .setSkipHighlighting(true), + setConvertSchemaFieldsToDatasets( + new SearchFlags() + .setFulltext(false) + .setSkipAggregates(true) + .setSkipCache(true) + .setMaxAggValues(1) + .setSkipHighlighting(true), + SearchUtils.convertSchemaFieldToDataset(defaultFlags)), "Expected all default values except fulltext"); assertEquals( SearchUtils.applyDefaultSearchFlags( new SearchFlags().setSkipCache(false), "not empty", defaultFlags), - new SearchFlags() - .setFulltext(true) - .setSkipAggregates(true) - .setSkipCache(false) - .setMaxAggValues(1) - .setSkipHighlighting(true), + setConvertSchemaFieldsToDatasets( + new SearchFlags() + .setFulltext(true) + .setSkipAggregates(true) + .setSkipCache(false) + .setMaxAggValues(1) + .setSkipHighlighting(true), + SearchUtils.convertSchemaFieldToDataset(defaultFlags)), "Expected all default values except skipCache"); assertEquals( SearchUtils.applyDefaultSearchFlags( new SearchFlags().setSkipAggregates(false), "not empty", defaultFlags), - new SearchFlags() - .setFulltext(true) - .setSkipAggregates(false) - .setSkipCache(true) - .setMaxAggValues(1) - .setSkipHighlighting(true), + setConvertSchemaFieldsToDatasets( + new SearchFlags() + .setFulltext(true) + .setSkipAggregates(false) + .setSkipCache(true) + .setMaxAggValues(1) + .setSkipHighlighting(true), + 
SearchUtils.convertSchemaFieldToDataset(defaultFlags)), "Expected all default values except skipAggregates"); assertEquals( SearchUtils.applyDefaultSearchFlags( new SearchFlags().setMaxAggValues(2), "not empty", defaultFlags), - new SearchFlags() - .setFulltext(true) - .setSkipAggregates(true) - .setSkipCache(true) - .setMaxAggValues(2) - .setSkipHighlighting(true), + setConvertSchemaFieldsToDatasets( + new SearchFlags() + .setFulltext(true) + .setSkipAggregates(true) + .setSkipCache(true) + .setMaxAggValues(2) + .setSkipHighlighting(true), + SearchUtils.convertSchemaFieldToDataset(defaultFlags)), "Expected all default values except maxAggValues"); assertEquals( SearchUtils.applyDefaultSearchFlags( new SearchFlags().setSkipHighlighting(false), "not empty", defaultFlags), - new SearchFlags() - .setFulltext(true) - .setSkipAggregates(true) - .setSkipCache(true) - .setMaxAggValues(1) - .setSkipHighlighting(false), + setConvertSchemaFieldsToDatasets( + new SearchFlags() + .setFulltext(true) + .setSkipAggregates(true) + .setSkipCache(true) + .setMaxAggValues(1) + .setSkipHighlighting(false), + SearchUtils.convertSchemaFieldToDataset(defaultFlags)), "Expected all default values except skipHighlighting"); } @Test public void testImmutableDefaults() throws CloneNotSupportedException { - SearchFlags defaultFlags = - new SearchFlags() - .setFulltext(true) - .setSkipCache(true) - .setSkipAggregates(true) - .setMaxAggValues(1) - .setSkipHighlighting(true); + SearchFlags defaultFlags = getDefaultSearchFlags(); + SearchFlags copyFlags = defaultFlags.copy(); assertEquals( SearchUtils.applyDefaultSearchFlags( + setConvertSchemaFieldsToDatasets( + new SearchFlags() + .setFulltext(false) + .setSkipCache(false) + .setSkipAggregates(false) + .setMaxAggValues(2) + .setSkipHighlighting(false), + SearchUtils.convertSchemaFieldToDataset(defaultFlags)), + "not empty", + defaultFlags), + setConvertSchemaFieldsToDatasets( new SearchFlags() .setFulltext(false) - .setSkipCache(false) .setSkipAggregates(false) + .setSkipCache(false) .setMaxAggValues(2) .setSkipHighlighting(false), - "not empty", - defaultFlags), - new SearchFlags() - .setFulltext(false) - .setSkipAggregates(false) - .setSkipCache(false) - .setMaxAggValues(2) - .setSkipHighlighting(false), + SearchUtils.convertSchemaFieldToDataset(defaultFlags)), "Expected no default values"); assertEquals(defaultFlags, copyFlags, "Expected defaults to be unmodified"); diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataset/FineGrainedLineage.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataset/FineGrainedLineage.pdl index ce72d7c04a3f6..3aa76cc27250c 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataset/FineGrainedLineage.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataset/FineGrainedLineage.pdl @@ -42,6 +42,9 @@ record FineGrainedLineage { // Other information e.g. user who created this lineage etc. can added here. - // It may be useful to add a "query" field here, but the semantics are tricky. - // To be considered in a future iteration when required. + /** + * The query that was used to generate this lineage. + * Present only if the lineage was generated from a detected query. 
+ */ + query: optional Urn } \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataset/Upstream.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataset/Upstream.pdl index c4a9fa1727162..b4c98e4f34724 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataset/Upstream.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataset/Upstream.pdl @@ -2,6 +2,8 @@ namespace com.linkedin.dataset import com.linkedin.common.AuditStamp import com.linkedin.common.DatasetUrn +import com.linkedin.common.Urn + /** * Upstream lineage information about a dataset including the source reporting the lineage @@ -33,6 +35,7 @@ record Upstream { "updatedOn": "upstreams/*/auditStamp/time" "updatedActor": "upstreams/*/auditStamp/actor" "properties": "upstreams/*/properties" + "via": "upstreams/*/query" } @Searchable = { "fieldName": "upstreams", @@ -50,4 +53,9 @@ record Upstream { * A generic properties bag that allows us to store specific information on this graph edge. */ properties: optional map[string, string] + + /** + * If the lineage is generated by a query, a reference to the query + */ + query: optional Urn } \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/graph/LineageRelationship.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/graph/LineageRelationship.pdl index ad4bd27b4cdae..c25a1cee7db47 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/graph/LineageRelationship.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/graph/LineageRelationship.pdl @@ -31,7 +31,9 @@ record LineageRelationship { /** * Degree of relationship (number of hops to get to entity) + * Deprecated by degrees. degree field is populated by min(degrees) for backward compatibility. */ + @deprecated degree: int = 1 /** @@ -58,4 +60,11 @@ record LineageRelationship { * Whether this lineage edge is a manual edge. */ isManual: optional boolean + + /** + * The different depths at which this entity is discovered in the lineage graph. + * Marked as optional to maintain backward compatibility, but is filled out by implementations. + * Replaces the deprecated field "degree". + **/ + degrees: optional array[int] } diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/query/GroupingCriterion.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/query/GroupingCriterion.pdl new file mode 100644 index 0000000000000..da0a1c2fd3514 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/query/GroupingCriterion.pdl @@ -0,0 +1,21 @@ +namespace com.linkedin.metadata.query + +/** +* +**/ + +record GroupingCriterion { + + /** + * The type of the entity to be grouped. + * e.g. schemaField + * Omitting this field will result in all base entities being grouped. + */ + baseEntityType: optional string + + /** + * The type of the entity to be grouped into. + * e.g. dataset, domain, etc. + */ + groupingEntityType: string +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/query/GroupingSpec.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/query/GroupingSpec.pdl new file mode 100644 index 0000000000000..c4c8a6c0e6bd9 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/query/GroupingSpec.pdl @@ -0,0 +1,15 @@ +namespace com.linkedin.metadata.query + +/** + * A set of directives to control how results are grouped. 
+ * The underlying generic groupings are nested to allow for further evolution of the grouping spec. + */ + +record GroupingSpec { + + /** + * A list of generic directives to group results by. + **/ + groupingCriteria: array[GroupingCriterion] = [] + +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/query/SearchFlags.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/query/SearchFlags.pdl index be1a30c7f082c..67f41ea175b51 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/query/SearchFlags.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/query/SearchFlags.pdl @@ -33,4 +33,9 @@ record SearchFlags { * Whether to request for search suggestions on the _entityName virtualized field */ getSuggestions:optional boolean = false + + /** + * Instructions for grouping results before returning + */ + groupingSpec: optional GroupingSpec } diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/LineageSearchEntity.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/LineageSearchEntity.pdl index 2e81a63319ae9..e99115893712d 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/LineageSearchEntity.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/LineageSearchEntity.pdl @@ -22,6 +22,11 @@ record LineageSearchEntity includes SearchEntity { /** * Degree of relationship (number of hops to get to entity) */ + @deprecated degree: int = 1 + /** + * The degrees of separation (number of hops) between the source and this entity + */ + degrees: array[int] = [] } \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl index 9587775dbed3a..1f4929b878de6 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl @@ -74,4 +74,9 @@ record QueryProperties { } } lastModified: AuditStamp + + /** + * The urn of the DataPlatform where the Query was executed. + */ + dataPlatform: optional Urn } diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/GraphQueryConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/GraphQueryConfiguration.java index 6f3e1cb278f5f..4da50f47e2feb 100644 --- a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/GraphQueryConfiguration.java +++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/GraphQueryConfiguration.java @@ -8,6 +8,9 @@ public class GraphQueryConfiguration { private long timeoutSeconds; private int batchSize; private int maxResult; + // When set to true, the graph walk (typically in search-across-lineage or scroll-across-lineage) + // will return all paths between the source and destination nodes within the hops limit. 
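A short usage sketch of this flag, going through the testDefaults instance and the Lombok-generated accessors that this patch's test code already calls (setEnableMultiPathSearch / isEnableMultiPathSearch):

import com.linkedin.metadata.config.search.GraphQueryConfiguration;

class MultiPathToggleSketch {
  public static void main(String[] args) {
    GraphQueryConfiguration cfg = GraphQueryConfiguration.testDefaults;
    cfg.setEnableMultiPathSearch(false); // fall back to single-path graph walks
    System.out.println(cfg.isEnableMultiPathSearch()); // prints false
  }
}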
+ private boolean enableMultiPathSearch; public static GraphQueryConfiguration testDefaults; @@ -16,5 +19,6 @@ public class GraphQueryConfiguration { testDefaults.setBatchSize(1000); testDefaults.setTimeoutSeconds(10); testDefaults.setMaxResult(10000); + testDefaults.setEnableMultiPathSearch(true); } } diff --git a/metadata-service/configuration/src/main/resources/application.yml b/metadata-service/configuration/src/main/resources/application.yml index 2b202d513c9bf..a7222f2adc3c6 100644 --- a/metadata-service/configuration/src/main/resources/application.yml +++ b/metadata-service/configuration/src/main/resources/application.yml @@ -226,6 +226,7 @@ elasticsearch: timeoutSeconds: ${ELASTICSEARCH_SEARCH_GRAPH_TIMEOUT_SECONDS:50} # graph dao timeout seconds batchSize: ${ELASTICSEARCH_SEARCH_GRAPH_BATCH_SIZE:1000} # graph dao batch size maxResult: ${ELASTICSEARCH_SEARCH_GRAPH_MAX_RESULT:10000} # graph dao max result size + enableMultiPathSearch: ${ELASTICSEARCH_SEARCH_GRAPH_MULTI_PATH_SEARCH:true} # TODO: Kafka topic convention kafka: @@ -394,4 +395,4 @@ springdoc.api-docs.groups.enabled: true forms: hook: - enabled: {$FORMS_HOOK_ENABLED:true} \ No newline at end of file + enabled: { $FORMS_HOOK_ENABLED:true } diff --git a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/BootstrapStep.java b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/BootstrapStep.java index 7ff91affdf765..a79bdacfc55e9 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/BootstrapStep.java +++ b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/BootstrapStep.java @@ -48,6 +48,17 @@ static void setUpgradeResult(Urn urn, EntityService entityService) throws URI final DataHubUpgradeResult upgradeResult = new DataHubUpgradeResult().setTimestampMs(System.currentTimeMillis()); + // Workaround because entity service does not auto-generate the key aspect for us + final MetadataChangeProposal keyProposal = new MetadataChangeProposal(); + final DataHubUpgradeKey upgradeKey = new DataHubUpgradeKey().setId(urn.getId()); + keyProposal.setEntityUrn(urn); + keyProposal.setEntityType(Constants.DATA_HUB_UPGRADE_ENTITY_NAME); + keyProposal.setAspectName(Constants.DATA_HUB_UPGRADE_KEY_ASPECT_NAME); + keyProposal.setAspect(GenericRecordUtils.serializeAspect(upgradeKey)); + keyProposal.setChangeType(ChangeType.UPSERT); + entityService.ingestProposal(keyProposal, auditStamp, false); + + // Ingest the upgrade result final MetadataChangeProposal upgradeProposal = new MetadataChangeProposal(); upgradeProposal.setEntityUrn(urn); upgradeProposal.setEntityType(Constants.DATA_HUB_UPGRADE_ENTITY_NAME); diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json index ee45b8921143a..fe16d24e3475a 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json @@ -1818,6 +1818,11 @@ "type" : "float", "doc" : "The confidence in this lineage between 0 (low confidence) and 1 (high confidence)", "default" : 1.0 + }, { + "name" : "query", + "type" : "com.linkedin.common.Urn", + "doc" : "The query that was used to generate this lineage. 
\nPresent only if the lineage was generated from a detected query.", + "optional" : true } ] } }, @@ -1986,7 +1991,8 @@ "name" : "DownstreamOf", "properties" : "upstreams/*/properties", "updatedActor" : "upstreams/*/auditStamp/actor", - "updatedOn" : "upstreams/*/auditStamp/time" + "updatedOn" : "upstreams/*/auditStamp/time", + "via" : "upstreams/*/query" }, "Searchable" : { "fieldName" : "upstreams", @@ -2005,6 +2011,11 @@ }, "doc" : "A generic properties bag that allows us to store specific information on this graph edge.", "optional" : true + }, { + "name" : "query", + "type" : "com.linkedin.common.Urn", + "doc" : "If the lineage is generated by a query, a reference to the query", + "optional" : true } ] }, { "type" : "record", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json index 505f44c52d583..55fed125936eb 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json @@ -1873,6 +1873,11 @@ "type" : "float", "doc" : "The confidence in this lineage between 0 (low confidence) and 1 (high confidence)", "default" : 1.0 + }, { + "name" : "query", + "type" : "com.linkedin.common.Urn", + "doc" : "The query that was used to generate this lineage. \nPresent only if the lineage was generated from a detected query.", + "optional" : true } ] } }, @@ -2258,7 +2263,8 @@ "name" : "DownstreamOf", "properties" : "upstreams/*/properties", "updatedActor" : "upstreams/*/auditStamp/actor", - "updatedOn" : "upstreams/*/auditStamp/time" + "updatedOn" : "upstreams/*/auditStamp/time", + "via" : "upstreams/*/query" }, "Searchable" : { "fieldName" : "upstreams", @@ -2277,6 +2283,11 @@ }, "doc" : "A generic properties bag that allows us to store specific information on this graph edge.", "optional" : true + }, { + "name" : "query", + "type" : "com.linkedin.common.Urn", + "doc" : "If the lineage is generated by a query, a reference to the query", + "optional" : true } ] }, { "type" : "record", @@ -5653,6 +5664,35 @@ "doc" : "Specific entities to recommend" } ] }, "com.linkedin.metadata.query.FreshnessStats", { + "type" : "record", + "name" : "GroupingCriterion", + "namespace" : "com.linkedin.metadata.query", + "doc" : "\n", + "fields" : [ { + "name" : "baseEntityType", + "type" : "string", + "doc" : "The type of the entity to be grouped.\ne.g. schemaField\nOmitting this field will result in all base entities being grouped.", + "optional" : true + }, { + "name" : "groupingEntityType", + "type" : "string", + "doc" : "The type of the entity to be grouped into.\ne.g. dataset, domain, etc." 
+ } ] + }, { + "type" : "record", + "name" : "GroupingSpec", + "namespace" : "com.linkedin.metadata.query", + "doc" : "A set of directives to control how results are grouped.\nThe underlying generic groupings are nested to allow for further evolution of the grouping spec.", + "fields" : [ { + "name" : "groupingCriteria", + "type" : { + "type" : "array", + "items" : "GroupingCriterion" + }, + "doc" : "A list of generic directives to group results by.\n", + "default" : [ ] + } ] + }, { "type" : "record", "name" : "ListResult", "namespace" : "com.linkedin.metadata.query", @@ -5740,6 +5780,11 @@ "doc" : "Whether to request for search suggestions on the _entityName virtualized field", "default" : false, "optional" : true + }, { + "name" : "groupingSpec", + "type" : "GroupingSpec", + "doc" : "Instructions for grouping results before returning", + "optional" : true } ] }, { "type" : "enum", @@ -6092,7 +6137,16 @@ "name" : "degree", "type" : "int", "doc" : "Degree of relationship (number of hops to get to entity)", - "default" : 1 + "default" : 1, + "deprecated" : true + }, { + "name" : "degrees", + "type" : { + "type" : "array", + "items" : "int" + }, + "doc" : "The degrees of separation (number of hops) between the source and this entity ", + "default" : [ ] } ] } }, diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json index e8c15d1b4ca04..f9f1999923ec0 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json @@ -1560,6 +1560,11 @@ "type" : "float", "doc" : "The confidence in this lineage between 0 (low confidence) and 1 (high confidence)", "default" : 1.0 + }, { + "name" : "query", + "type" : "com.linkedin.common.Urn", + "doc" : "The query that was used to generate this lineage. \nPresent only if the lineage was generated from a detected query.", + "optional" : true } ] } }, @@ -1728,7 +1733,8 @@ "name" : "DownstreamOf", "properties" : "upstreams/*/properties", "updatedActor" : "upstreams/*/auditStamp/actor", - "updatedOn" : "upstreams/*/auditStamp/time" + "updatedOn" : "upstreams/*/auditStamp/time", + "via" : "upstreams/*/query" }, "Searchable" : { "fieldName" : "upstreams", @@ -1747,6 +1753,11 @@ }, "doc" : "A generic properties bag that allows us to store specific information on this graph edge.", "optional" : true + }, { + "name" : "query", + "type" : "com.linkedin.common.Urn", + "doc" : "If the lineage is generated by a query, a reference to the query", + "optional" : true } ] }, { "type" : "record", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.lineage.relationships.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.lineage.relationships.snapshot.json index 9aa40edd0b118..056ca0e4da206 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.lineage.relationships.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.lineage.relationships.snapshot.json @@ -141,8 +141,9 @@ }, { "name" : "degree", "type" : "int", - "doc" : "Degree of relationship (number of hops to get to entity)", - "default" : 1 + "doc" : "Degree of relationship (number of hops to get to entity)\nDeprecated by degrees. 
degree field is populated by min(degrees) for backward compatibility.", + "default" : 1, + "deprecated" : true }, { "name" : "createdOn", "type" : "long", @@ -168,6 +169,14 @@ "type" : "boolean", "doc" : "Whether this lineage edge is a manual edge.", "optional" : true + }, { + "name" : "degrees", + "type" : { + "type" : "array", + "items" : "int" + }, + "doc" : "The different depths at which this entity is discovered in the lineage graph.\nMarked as optional to maintain backward compatibility, but is filled out by implementations. \nReplaces the deprecated field \"degree\".\n", + "optional" : true } ] } }, diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json index 67f70d40e010c..88dad7e49152a 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json @@ -1560,6 +1560,11 @@ "type" : "float", "doc" : "The confidence in this lineage between 0 (low confidence) and 1 (high confidence)", "default" : 1.0 + }, { + "name" : "query", + "type" : "com.linkedin.common.Urn", + "doc" : "The query that was used to generate this lineage. \nPresent only if the lineage was generated from a detected query.", + "optional" : true } ] } }, @@ -1728,7 +1733,8 @@ "name" : "DownstreamOf", "properties" : "upstreams/*/properties", "updatedActor" : "upstreams/*/auditStamp/actor", - "updatedOn" : "upstreams/*/auditStamp/time" + "updatedOn" : "upstreams/*/auditStamp/time", + "via" : "upstreams/*/query" }, "Searchable" : { "fieldName" : "upstreams", @@ -1747,6 +1753,11 @@ }, "doc" : "A generic properties bag that allows us to store specific information on this graph edge.", "optional" : true + }, { + "name" : "query", + "type" : "com.linkedin.common.Urn", + "doc" : "If the lineage is generated by a query, a reference to the query", + "optional" : true } ] }, { "type" : "record", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json index 4c8cd1f20d476..4d34126cd59fc 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json @@ -1873,6 +1873,11 @@ "type" : "float", "doc" : "The confidence in this lineage between 0 (low confidence) and 1 (high confidence)", "default" : 1.0 + }, { + "name" : "query", + "type" : "com.linkedin.common.Urn", + "doc" : "The query that was used to generate this lineage. 
\nPresent only if the lineage was generated from a detected query.", + "optional" : true } ] } }, @@ -2258,7 +2263,8 @@ "name" : "DownstreamOf", "properties" : "upstreams/*/properties", "updatedActor" : "upstreams/*/auditStamp/actor", - "updatedOn" : "upstreams/*/auditStamp/time" + "updatedOn" : "upstreams/*/auditStamp/time", + "via" : "upstreams/*/query" }, "Searchable" : { "fieldName" : "upstreams", @@ -2277,6 +2283,11 @@ }, "doc" : "A generic properties bag that allows us to store specific information on this graph edge.", "optional" : true + }, { + "name" : "query", + "type" : "com.linkedin.common.Urn", + "doc" : "If the lineage is generated by a query, a reference to the query", + "optional" : true } ] }, { "type" : "record", diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java index 2f470dca01f2a..b1b24ac97f0b8 100644 --- a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java +++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java @@ -381,6 +381,7 @@ public LineageSearchResult searchAcrossLineage( * @param endTimeMillis end time to filter to * @param startTimeMillis start time to filter from * @param searchFlags configuration flags for the search request + * @param authentication the authentication context of the caller * @return a {@link SearchResult} that contains a list of matched documents and related search * result metadata */ diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/graph/Edge.java b/metadata-service/services/src/main/java/com/linkedin/metadata/graph/Edge.java index d27b0ed303972..458b23317c6c8 100644 --- a/metadata-service/services/src/main/java/com/linkedin/metadata/graph/Edge.java +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/graph/Edge.java @@ -17,4 +17,31 @@ public class Edge { @EqualsAndHashCode.Exclude private Long updatedOn; @EqualsAndHashCode.Exclude private Urn updatedActor; @EqualsAndHashCode.Exclude private Map<String, Object> properties; + // The entity that owns the lifecycle of this edge + @EqualsAndHashCode.Exclude private Urn lifecycleOwner; + // An entity through which the edge between source and destination is created + @EqualsAndHashCode.Include private Urn via; + + // For backwards compatibility + public Edge( + Urn source, + Urn destination, + String relationshipType, + Long createdOn, + Urn createdActor, + Long updatedOn, + Urn updatedActor, + Map<String, Object> properties) { + this( + source, + destination, + relationshipType, + createdOn, + createdActor, + updatedOn, + updatedActor, + properties, + null, + null); + } } diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/graph/GraphIndexUtils.java b/metadata-service/services/src/main/java/com/linkedin/metadata/graph/GraphIndexUtils.java index 2afe907399745..8a08835ab6896 100644 --- a/metadata-service/services/src/main/java/com/linkedin/metadata/graph/GraphIndexUtils.java +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/graph/GraphIndexUtils.java @@ -53,6 +53,17 @@ private static List<Map<String, Object>> getPropertiesList( return (List<Map<String, Object>>) value; } + @Nullable + private static List<Urn> getViaList( + @Nullable final String path, @Nonnull final RecordTemplate aspect) { + if (path == null) { + return null; + } + final PathSpec viaPathSpec = new PathSpec(path.split("/")); + final Object value = RecordUtils.getNullableFieldValue(aspect, viaPathSpec); + return
(List<Urn>) value; + } + @Nullable private static boolean isValueListValid( @Nullable final List<?> entryList, final int valueListSize) { @@ -94,6 +105,15 @@ private static Map<String, Object> getProperties( return null; } + @Nullable + private static Urn getVia( + @Nullable final List<Urn> viaList, final int index, final int valueListSize) { + if (isValueListValid(viaList, valueListSize)) { + return viaList.get(index); + } + return null; + } + /** * Used to create new edges for the graph db, adding all the metadata associated with each edge * based on the aspect. Returns a list of Edges to be consumed by the graph service. @@ -116,12 +136,14 @@ public static List<Edge> extractGraphEdges( extractedFieldsEntry.getKey().getRelationshipAnnotation().getUpdatedActor(); final String propertiesPath = extractedFieldsEntry.getKey().getRelationshipAnnotation().getProperties(); + final String viaNodePath = extractedFieldsEntry.getKey().getRelationshipAnnotation().getVia(); final List<Long> createdOnList = getTimestampList(createdOnPath, aspect); final List<Urn> createdActorList = getActorList(createdActorPath, aspect); final List<Long> updatedOnList = getTimestampList(updatedOnPath, aspect); final List<Urn> updatedActorList = getActorList(updatedActorPath, aspect); final List<Map<String, Object>> propertiesList = getPropertiesList(propertiesPath, aspect); + final List<Urn> viaList = getViaList(viaNodePath, aspect); int index = 0; for (Object fieldValue : extractedFieldsEntry.getValue()) { @@ -146,6 +168,11 @@ public static List<Edge> extractGraphEdges( ? getProperties(propertiesList, index, extractedFieldsEntry.getValue().size()) : null; + Urn viaNode = + viaNodePath != null + ? getVia(viaList, index, extractedFieldsEntry.getValue().size()) + : null; + SystemMetadata systemMetadata; if (isNewAspectVersion) { systemMetadata = event.hasSystemMetadata() ? event.getSystemMetadata() : null; @@ -177,7 +204,9 @@ public static List<Edge> extractGraphEdges( createdActor, updatedOn, updatedActor, - properties)); + properties, + null, + viaNode)); } catch (URISyntaxException e) { log.error("Invalid destination urn: {}", fieldValue, e); } @@ -198,6 +227,8 @@ public static Edge mergeEdges(@Nonnull final Edge oldEdge, @Nonnull final Edge n null, newEdge.getUpdatedOn(), newEdge.getUpdatedActor(), - newEdge.getProperties()); + newEdge.getProperties(), + oldEdge.getLifecycleOwner(), + oldEdge.getVia()); } } diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/graph/RelatedEntities.java b/metadata-service/services/src/main/java/com/linkedin/metadata/graph/RelatedEntities.java index 0c6f8a0d65d5c..3c54e987fec35 100644 --- a/metadata-service/services/src/main/java/com/linkedin/metadata/graph/RelatedEntities.java +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/graph/RelatedEntities.java @@ -17,15 +17,17 @@ public RelatedEntities( @Nonnull String relationshipType, @Nonnull String sourceUrn, @Nonnull String destinationUrn, - @Nonnull RelationshipDirection relationshipDirection) { + @Nonnull RelationshipDirection relationshipDirection, + String viaEntity) { super( relationshipType, - relationshipDirection == RelationshipDirection.OUTGOING ? destinationUrn : sourceUrn); + relationshipDirection == RelationshipDirection.OUTGOING ?
destinationUrn : sourceUrn, + viaEntity); this.sourceUrn = sourceUrn; this.destinationUrn = destinationUrn; } public RelatedEntity asRelatedEntity() { - return new RelatedEntity(relationshipType, urn); + return new RelatedEntity(relationshipType, urn, via); } } diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/graph/RelatedEntity.java b/metadata-service/services/src/main/java/com/linkedin/metadata/graph/RelatedEntity.java index be1b55655f671..39c455a3fbd74 100644 --- a/metadata-service/services/src/main/java/com/linkedin/metadata/graph/RelatedEntity.java +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/graph/RelatedEntity.java @@ -11,4 +11,17 @@ public class RelatedEntity { /** Urn associated with the related entity. */ String urn; + + /** Urn associated with an entity through which this relationship is established */ + String via; + + /** + * Constructor for backwards compatibility + * + * @param relationshipType type of the relationship + * @param urn urn of the related entity + */ + public RelatedEntity(String relationshipType, String urn) { + this(relationshipType, urn, null); + } } diff --git a/smoke-test/requirements.txt b/smoke-test/requirements.txt index 59d5bc2600400..e37de9caddc69 100644 --- a/smoke-test/requirements.txt +++ b/smoke-test/requirements.txt @@ -6,4 +6,5 @@ tenacity slack-sdk==3.18.1 aiohttp joblib -pytest-xdist \ No newline at end of file +pytest-xdist +networkx \ No newline at end of file diff --git a/smoke-test/tests/lineage/__init__.py b/smoke-test/tests/lineage/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/smoke-test/tests/lineage/test_lineage.py b/smoke-test/tests/lineage/test_lineage.py new file mode 100644 index 0000000000000..52d61d666c7d9 --- /dev/null +++ b/smoke-test/tests/lineage/test_lineage.py @@ -0,0 +1,991 @@ +import logging +import time +from enum import Enum +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union + +import datahub.emitter.mce_builder as builder +import networkx as nx +import pytest +from datahub.cli.cli_utils import get_url_and_token +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.graph.client import ( + DatahubClientConfig, + DataHubGraph, + get_default_graph, +) +from datahub.metadata.schema_classes import ( + AuditStampClass, + ChangeAuditStampsClass, + ChartInfoClass, + DataFlowInfoClass, + DataJobInfoClass, + DataJobInputOutputClass, + DatasetLineageTypeClass, + DatasetPropertiesClass, + EdgeClass, +) +from datahub.metadata.schema_classes import ( + FineGrainedLineageClass as FineGrainedLineage, +) +from datahub.metadata.schema_classes import ( + FineGrainedLineageDownstreamTypeClass as FineGrainedLineageDownstreamType, +) +from datahub.metadata.schema_classes import ( + FineGrainedLineageUpstreamTypeClass as FineGrainedLineageUpstreamType, +) +from datahub.metadata.schema_classes import ( + OtherSchemaClass, + QueryLanguageClass, + QueryPropertiesClass, + QuerySourceClass, + QueryStatementClass, + SchemaFieldClass, + SchemaFieldDataTypeClass, + SchemaMetadataClass, + StringTypeClass, + UpstreamClass, + UpstreamLineageClass, +) +from datahub.utilities.urns.dataset_urn import DatasetUrn +from datahub.utilities.urns.urn import Urn +from pydantic import BaseModel, validator +from tests.utils import ingest_file_via_rest, wait_for_writes_to_sync + +logger = logging.getLogger(__name__) + + +class DeleteAgent: + def delete_entity(self, urn: str) -> None: + pass + + +class DataHubGraphDeleteAgent(DeleteAgent): + def __init__(self, graph:
DataHubGraph): + self.graph = graph + + def delete_entity(self, urn: str) -> None: + self.graph.delete_entity(urn, hard=True) + + +class DataHubConsoleDeleteAgent(DeleteAgent): + def delete_entity(self, urn: str) -> None: + print(f"Would delete {urn}") + + +class DataHubConsoleEmitter: + def emit_mcp(self, mcp: MetadataChangeProposalWrapper) -> None: + print(mcp) + + +INFINITE_HOPS: int = -1 + + +@pytest.mark.dependency(depends=["wait_for_healthchecks"]) +def ingest_tableau_cll_via_rest(wait_for_healthchecks) -> Iterable[None]: + ingest_file_via_rest( + "tests/lineage/tableau_cll_mcps.json", + ) + yield + + +def search_across_lineage( + graph: DataHubGraph, + main_entity: str, + hops: int = INFINITE_HOPS, + direction: str = "UPSTREAM", + convert_schema_fields_to_datasets: bool = True, +): + def _explain_sal_result(result: dict) -> str: + explain = "" + entities = [ + x["entity"]["urn"] for x in result["searchAcrossLineage"]["searchResults"] + ] + number_of_results = len(entities) + explain += f"Number of results: {number_of_results}\n" + explain += "Entities: " + try: + for e in entities: + explain += f"\t{e.replace('urn:li:','')}\n" + for entity in entities: + paths = [ + x["paths"][0]["path"] + for x in result["searchAcrossLineage"]["searchResults"] + if x["entity"]["urn"] == entity + ] + explain += f"Paths for entity {entity}: " + for path in paths: + explain += ( + "\t" + + " -> ".join( + [ + x["urn"] + .replace("urn:li:schemaField", "field") + .replace("urn:li:dataset", "dataset") + .replace("urn:li:dataPlatform", "platform") + for x in path + ] + ) + + "\n" + ) + except Exception: + # The explanation is best-effort; fall through with whatever was built. + pass + return explain + + variable: dict[str, Any] = { + "input": ( + { + "urn": main_entity, + "query": "*", + "direction": direction, + "searchFlags": { + "groupingSpec": { + "groupingCriteria": [ + { + "baseEntityType": "SCHEMA_FIELD", + "groupingEntityType": "DATASET", + }, + ] + }, + "skipCache": True, + }, + } + if convert_schema_fields_to_datasets + else { + "urn": main_entity, + "query": "*", + "direction": direction, + "searchFlags": { + "skipCache": True, + }, + } + ) + } + if hops != INFINITE_HOPS: + variable["input"].update( + { + "orFilters": [ + { + "and": [ + { + "field": "degree", + "condition": "EQUAL", + "values": ["{}".format(hops)], + "negated": False, + } + ] + } + ] + } + ) + result = graph.execute_graphql( + """ + query($input: SearchAcrossLineageInput!)
{ + searchAcrossLineage(input: $input) + { + searchResults { + entity { + urn + } + paths { + path { + urn + } + } + } + } + } + """, + variables=variable, + ) + print(f"Query -> Entity {main_entity} with hops {hops} and direction {direction}") + print(result) + print(_explain_sal_result(result)) + return result + + +class Direction(Enum): + UPSTREAM = "UPSTREAM" + DOWNSTREAM = "DOWNSTREAM" + + def opposite(self): + if self == Direction.UPSTREAM: + return Direction.DOWNSTREAM + else: + return Direction.UPSTREAM + + +class Path(BaseModel): + path: List[str] + + def add_node(self, node: str) -> None: + self.path.append(node) + + def __hash__(self) -> int: + return ".".join(self.path).__hash__() + + +class LineageExpectation(BaseModel): + direction: Direction + main_entity: str + hops: int + impacted_entities: Dict[str, List[Path]] + + +class ImpactQuery(BaseModel): + main_entity: str + hops: int + direction: Direction + upconvert_schema_fields_to_datasets: bool + + def __hash__(self) -> int: + raw_string = ( + f"{self.main_entity}{self.hops}{self.direction}" + + f"{self.upconvert_schema_fields_to_datasets}" + ) + return raw_string.__hash__() + + +class ScenarioExpectation: + """ + This class stores the expectations for the lineage of a scenario. It is used + to store the pre-materialized expectations for all datasets and schema + fields across all hops and directions possible. This makes it easy to check + that the results of a lineage query match the expectations. + """ + + def __init__(self): + self._graph = nx.DiGraph() + + def __simplify(self, urn_or_list: Union[str, List[str]]) -> str: + if isinstance(urn_or_list, list): + return ",".join([self.__simplify(x) for x in urn_or_list]) + else: + return ( + urn_or_list.replace("urn:li:schemaField", "F") + .replace("urn:li:dataset", "D") + .replace("urn:li:dataPlatform", "P") + .replace("urn:li:query", "Q") + ) + + def extend_impacted_entities( + self, + direction: Direction, + parent_entity: str, + child_entity: str, + path_extension: Optional[List[str]] = None, + ) -> None: + via_node = path_extension[0] if path_extension else None + if via_node: + self._graph.add_edge(parent_entity, child_entity, via=via_node) + else: + self._graph.add_edge(parent_entity, child_entity) + + def generate_query_expectation_pairs( + self, max_hops: int + ) -> Iterable[Tuple[ImpactQuery, LineageExpectation]]: + upconvert_options = [ + True + ] # TODO: Add False once search-across-lineage supports returning schema fields + for main_entity in self._graph.nodes(): + for direction in [Direction.UPSTREAM, Direction.DOWNSTREAM]: + for upconvert_schema_fields_to_datasets in upconvert_options: + possible_hops = [h for h in range(1, max_hops)] + [INFINITE_HOPS] + for hops in possible_hops: + query = ImpactQuery( + main_entity=main_entity, + hops=hops, + direction=direction, + upconvert_schema_fields_to_datasets=upconvert_schema_fields_to_datasets, + ) + yield query, self.get_expectation_for_query(query) + + def get_expectation_for_query(self, query: ImpactQuery) -> LineageExpectation: + graph_to_walk = ( + self._graph + if query.direction == Direction.DOWNSTREAM + else self._graph.reverse() + ) + entity_paths = nx.shortest_path(graph_to_walk, source=query.main_entity) + lineage_expectation = LineageExpectation( + direction=query.direction, + main_entity=query.main_entity, + hops=query.hops, + impacted_entities={}, + ) + for entity, paths in entity_paths.items(): + if entity == query.main_entity: + continue + if query.hops != INFINITE_HOPS and len(paths) != ( + query.hops 
+ 1 + ): # +1 because the path includes the main entity + print( + f"Skipping {entity} because it is less than or more than {query.hops} hops away" + ) + continue + path_graph = nx.path_graph(paths) + expanded_path: List[str] = [] + via_entity = None + for ea in path_graph.edges(): + expanded_path.append(ea[0]) + if "via" in graph_to_walk.edges[ea[0], ea[1]]: + via_entity = graph_to_walk.edges[ea[0], ea[1]]["via"] + expanded_path.append(via_entity) + if via_entity and not via_entity.startswith( + "urn:li:query" + ): # Transient nodes like queries are not included as impacted entities + if via_entity not in lineage_expectation.impacted_entities: + lineage_expectation.impacted_entities[via_entity] = [] + via_path = Path(path=[x for x in expanded_path]) + if via_path not in lineage_expectation.impacted_entities[via_entity]: + lineage_expectation.impacted_entities[via_entity].append( + Path(path=[x for x in expanded_path]) + ) + + expanded_path.append(paths[-1]) + if entity not in lineage_expectation.impacted_entities: + lineage_expectation.impacted_entities[entity] = [] + lineage_expectation.impacted_entities[entity].append( + Path(path=expanded_path) + ) + + if query.upconvert_schema_fields_to_datasets: + entries_to_add: Dict[str, List[Path]] = {} + entries_to_remove = [] + for impacted_entity in lineage_expectation.impacted_entities: + if impacted_entity.startswith("urn:li:schemaField"): + impacted_dataset_entity = Urn.create_from_string( + impacted_entity + ).entity_ids[0] + if impacted_dataset_entity in entries_to_add: + entries_to_add[impacted_dataset_entity].extend( + lineage_expectation.impacted_entities[impacted_entity] + ) + else: + entries_to_add[ + impacted_dataset_entity + ] = lineage_expectation.impacted_entities[impacted_entity] + entries_to_remove.append(impacted_entity) + for impacted_entity in entries_to_remove: + del lineage_expectation.impacted_entities[impacted_entity] + lineage_expectation.impacted_entities.update(entries_to_add) + return lineage_expectation + + +class Scenario(BaseModel): + class Config: + arbitrary_types_allowed = True + + class LineageStyle(Enum): + DATASET_QUERY_DATASET = "DATASET_QUERY_DATASET" + DATASET_JOB_DATASET = "DATASET_JOB_DATASET" + + lineage_style: LineageStyle + default_platform: str = "mysql" + default_transformation_platform: str = "airflow" + hop_platform_map: Dict[int, str] = {} + hop_transformation_map: Dict[int, str] = {} + num_hops: int = 1 + default_datasets_at_each_hop: int = 2 + default_dataset_fanin: int = 2 # Number of datasets that feed into a transformation + default_column_fanin: int = 2 # Number of columns that feed into a transformation + default_dataset_fanout: int = ( + 1 # Number of datasets that a transformation feeds into + ) + default_column_fanout: int = 1 # Number of columns that a transformation feeds into + # num_upstream_datasets: int = 2 + # num_downstream_datasets: int = 1 + default_dataset_prefix: str = "librarydb." 
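+    # Dataset names are derived from these knobs; e.g. with the defaults above, hop 0, index 1 resolves to +    #   urn:li:dataset:(urn:li:dataPlatform:mysql,librarydb.layer_0.1,PROD) +    # unless hop_dataset_prefix_map below overrides the prefix for that dataset index.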
+ hop_dataset_prefix_map: Dict[int, str] = {} + query_id: str = "guid-guid-guid" + query_string: str = "SELECT * FROM foo" + transformation_job: str = "job1" + transformation_flow: str = "flow1" + _generated_urns: Set[str] = set() + expectations: Optional[ScenarioExpectation] = None + + @validator("expectations", pre=True, always=True) + def expectations_validator( + cls, v: Optional[ScenarioExpectation] + ) -> ScenarioExpectation: + if v is None: + return ScenarioExpectation() + else: + return v + + def get_column_name(self, column_index: int) -> str: + return f"column_{column_index}" + + def set_upstream_dataset_prefix(self, dataset): + self.upstream_dataset_prefix = dataset + + def set_downstream_dataset_prefix(self, dataset): + self.downstream_dataset_prefix = dataset + + def set_transformation_query(self, query: str) -> None: + self.transformation_query = query + + def set_transformation_job(self, job: str) -> None: + self.transformation_job = job + + def set_transformation_flow(self, flow: str) -> None: + self.transformation_flow = flow + + def get_transformation_job_urn(self, hop_index: int) -> str: + return builder.make_data_job_urn( + orchestrator=self.default_transformation_platform, + flow_id=f"layer_{hop_index}_{self.transformation_flow}", + job_id=self.transformation_job, + cluster="PROD", + ) + + def get_transformation_query_urn(self, hop_index: int = 0) -> str: + return f"urn:li:query:{self.query_id}_{hop_index}" # TODO - add hop index to query id + + def get_transformation_flow_urn(self, hop_index: int) -> str: + return builder.make_data_flow_urn( + orchestrator=self.default_transformation_platform, + flow_id=f"layer_{hop_index}_{self.transformation_flow}", + cluster="PROD", + ) + + def get_upstream_dataset_urns(self, hop_index: int) -> List[str]: + return [ + self.get_dataset_urn(hop_index=hop_index, index=i) + for i in range(self.default_dataset_fanin) + ] + + def get_dataset_urn(self, hop_index: int, index: int) -> str: + platform = self.hop_platform_map.get(hop_index, self.default_platform) + prefix = self.hop_dataset_prefix_map.get( + index, f"{self.default_dataset_prefix}layer_{hop_index}." 
+ ) + return builder.make_dataset_urn(platform, f"{prefix}{index}") + + def get_column_urn( + self, hop_index: int, dataset_index: int, column_index: int = 0 + ) -> str: + return builder.make_schema_field_urn( + self.get_dataset_urn(hop_index, dataset_index), + self.get_column_name(column_index), + ) + + def get_upstream_column_urn( + self, hop_index: int, dataset_index: int, column_index: int = 0 + ) -> str: + return builder.make_schema_field_urn( + self.get_dataset_urn(hop_index, dataset_index), + self.get_column_name(column_index), + ) + + def get_downstream_column_urn( + self, hop_index: int, dataset_index: int, column_index: int = 0 + ) -> str: + return builder.make_schema_field_urn( + self.get_dataset_urn(hop_index + 1, dataset_index), + self.get_column_name(column_index), + ) + + def get_downstream_dataset_urns(self, hop_index: int) -> List[str]: + return [ + self.get_dataset_urn(hop_index + 1, i) + for i in range(self.default_dataset_fanout) + ] + + def get_lineage_mcps(self) -> Iterable[MetadataChangeProposalWrapper]: + for hop_index in range(0, self.num_hops): + yield from self.get_lineage_mcps_for_hop(hop_index) + + def get_lineage_mcps_for_hop( + self, hop_index: int + ) -> Iterable[MetadataChangeProposalWrapper]: + if self.lineage_style == Scenario.LineageStyle.DATASET_JOB_DATASET: + fine_grained_lineage = FineGrainedLineage( + upstreamType=FineGrainedLineageUpstreamType.FIELD_SET, + upstreams=[ + self.get_upstream_column_urn(hop_index, dataset_index, 0) + for dataset_index in range(self.default_dataset_fanin) + ], + downstreamType=FineGrainedLineageDownstreamType.FIELD, + downstreams=[ + self.get_downstream_column_urn(hop_index, dataset_index, 0) + for dataset_index in range(self.default_dataset_fanout) + ], + ) + datajob_io = DataJobInputOutputClass( + inputDatasets=self.get_upstream_dataset_urns(hop_index), + outputDatasets=self.get_downstream_dataset_urns(hop_index), + inputDatajobs=[], # not supporting job -> job lineage for now + fineGrainedLineages=[fine_grained_lineage], + ) + yield MetadataChangeProposalWrapper( + entityUrn=self.get_transformation_job_urn(hop_index), + aspect=datajob_io, + ) + + # Add field level expectations + for upstream_field_urn in fine_grained_lineage.upstreams or []: + for downstream_field_urn in fine_grained_lineage.downstreams or []: + self.expectations.extend_impacted_entities( + Direction.DOWNSTREAM, + upstream_field_urn, + downstream_field_urn, + path_extension=[ + self.get_transformation_job_urn(hop_index), + downstream_field_urn, + ], + ) + + # Add table level expectations + for upstream_dataset_urn in datajob_io.inputDatasets: + # No path extension, because we don't use via nodes for dataset -> dataset edges + self.expectations.extend_impacted_entities( + Direction.DOWNSTREAM, + upstream_dataset_urn, + self.get_transformation_job_urn(hop_index), + ) + for downstream_dataset_urn in datajob_io.outputDatasets: + self.expectations.extend_impacted_entities( + Direction.DOWNSTREAM, + self.get_transformation_job_urn(hop_index), + downstream_dataset_urn, + ) + + if self.lineage_style == Scenario.LineageStyle.DATASET_QUERY_DATASET: + # we emit upstream lineage from the downstream dataset + for downstream_dataset_index in range(self.default_dataset_fanout): + mcp_entity_urn = self.get_dataset_urn( + hop_index + 1, downstream_dataset_index + ) + fine_grained_lineages = [ + FineGrainedLineage( + upstreamType=FineGrainedLineageUpstreamType.FIELD_SET, + upstreams=[ + self.get_upstream_column_urn( + hop_index, d_i, upstream_col_index + ) + for d_i in 
range(self.default_dataset_fanin) + ], + downstreamType=FineGrainedLineageDownstreamType.FIELD, + downstreams=[ + self.get_downstream_column_urn( + hop_index, + downstream_dataset_index, + downstream_col_index, + ) + for downstream_col_index in range( + self.default_column_fanout + ) + ], + query=self.get_transformation_query_urn(hop_index), + ) + for upstream_col_index in range(self.default_column_fanin) + ] + upstream_lineage = UpstreamLineageClass( + upstreams=[ + UpstreamClass( + dataset=self.get_dataset_urn(hop_index, i), + type=DatasetLineageTypeClass.TRANSFORMED, + query=self.get_transformation_query_urn(hop_index), + ) + for i in range(self.default_dataset_fanin) + ], + fineGrainedLineages=fine_grained_lineages, + ) + for fine_grained_lineage in fine_grained_lineages: + # Add field level expectations + for upstream_field_urn in fine_grained_lineage.upstreams or []: + for downstream_field_urn in ( + fine_grained_lineage.downstreams or [] + ): + self.expectations.extend_impacted_entities( + Direction.DOWNSTREAM, + upstream_field_urn, + downstream_field_urn, + path_extension=[ + self.get_transformation_query_urn(hop_index), + downstream_field_urn, + ], + ) + + # Add table level expectations + for upstream_dataset in upstream_lineage.upstreams: + self.expectations.extend_impacted_entities( + Direction.DOWNSTREAM, + upstream_dataset.dataset, + mcp_entity_urn, + path_extension=[ + self.get_transformation_query_urn(hop_index), + mcp_entity_urn, + ], + ) + + yield MetadataChangeProposalWrapper( + entityUrn=mcp_entity_urn, + aspect=upstream_lineage, + ) + + def get_entity_mcps(self) -> Iterable[MetadataChangeProposalWrapper]: + for hop_index in range( + 0, self.num_hops + 1 + ): # we generate entities with last hop inclusive + for mcp in self.get_entity_mcps_for_hop(hop_index): + assert mcp.entityUrn + self._generated_urns.add(mcp.entityUrn) + yield mcp + + def get_entity_mcps_for_hop( + self, hop_index: int + ) -> Iterable[MetadataChangeProposalWrapper]: + if self.lineage_style == Scenario.LineageStyle.DATASET_JOB_DATASET: + # Construct the DataJobInfo aspect with the job -> flow lineage. + dataflow_urn = self.get_transformation_flow_urn(hop_index) + + dataflow_info = DataFlowInfoClass( + name=self.transformation_flow.title() + " Flow" + ) + + dataflow_info_mcp = MetadataChangeProposalWrapper( + entityUrn=dataflow_urn, + aspect=dataflow_info, + ) + yield dataflow_info_mcp + + datajob_info = DataJobInfoClass( + name=self.transformation_job.title() + " Job", + type="AIRFLOW", + flowUrn=dataflow_urn, + ) + + # Construct a MetadataChangeProposalWrapper object with the DataJobInfo aspect. + # NOTE: This will overwrite all of the existing dataJobInfo aspect information associated with this job. 
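+            # The wrapper infers entityType and aspectName from the URN and aspect, and the change type +            # defaults to UPSERT; e.g. the construction below is roughly equivalent to (illustrative): +            #   MetadataChangeProposalWrapper( +            #       entityUrn="urn:li:dataJob:(urn:li:dataFlow:(airflow,layer_0_flow1,PROD),job1)", +            #       aspect=datajob_info, +            #   )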
+ datajob_info_mcp = MetadataChangeProposalWrapper( + entityUrn=self.get_transformation_job_urn(hop_index), + aspect=datajob_info, + ) + yield datajob_info_mcp + + if self.lineage_style == Scenario.LineageStyle.DATASET_QUERY_DATASET: + query_urn = self.get_transformation_query_urn(hop_index=hop_index) + + fake_auditstamp = AuditStampClass( + time=int(time.time() * 1000), + actor="urn:li:corpuser:datahub", + ) + + query_properties = QueryPropertiesClass( + statement=QueryStatementClass( + value=self.query_string, + language=QueryLanguageClass.SQL, + ), + source=QuerySourceClass.SYSTEM, + created=fake_auditstamp, + lastModified=fake_auditstamp, + ) + + query_info_mcp = MetadataChangeProposalWrapper( + entityUrn=query_urn, + aspect=query_properties, + ) + yield query_info_mcp + # Generate schema and properties mcps for all datasets + for dataset_index in range(self.default_datasets_at_each_hop): + dataset_urn = DatasetUrn.from_string( + self.get_dataset_urn(hop_index, dataset_index) + ) + yield from MetadataChangeProposalWrapper.construct_many( + entityUrn=str(dataset_urn), + aspects=[ + SchemaMetadataClass( + schemaName=str(dataset_urn), + platform=builder.make_data_platform_urn(dataset_urn.platform), + version=0, + hash="", + platformSchema=OtherSchemaClass(rawSchema=""), + fields=[ + SchemaFieldClass( + fieldPath=self.get_column_name(i), + type=SchemaFieldDataTypeClass(type=StringTypeClass()), + nativeDataType="string", + ) + for i in range(self.default_column_fanin) + ], + ), + DatasetPropertiesClass( + name=dataset_urn.name, + ), + ], + ) + + def cleanup(self, delete_agent: DeleteAgent) -> None: + """Delete all entities created by this scenario.""" + for urn in self._generated_urns: + delete_agent.delete_entity(urn) + + def generate_expectation(self, query: ImpactQuery) -> LineageExpectation: + return self.expectations.get_expectation_for_query(query) + + def test_expectation(self, graph: DataHubGraph) -> bool: + print("Testing expectation...") + try: + for hop_index in range(self.num_hops): + for dataset_urn in self.get_upstream_dataset_urns(hop_index): + assert graph.exists(dataset_urn) is True + for dataset_urn in self.get_downstream_dataset_urns(hop_index): + assert graph.exists(dataset_urn) is True + + if self.lineage_style == Scenario.LineageStyle.DATASET_JOB_DATASET: + assert graph.exists(self.get_transformation_job_urn(hop_index)) is True + assert graph.exists(self.get_transformation_flow_urn(hop_index)) is True + + if self.lineage_style == Scenario.LineageStyle.DATASET_QUERY_DATASET: + assert ( + graph.exists(self.get_transformation_query_urn(hop_index)) is True + ) + + wait_for_writes_to_sync()  # Wait for the graph to update + # Lineage must be correct for every dataset and schema field, for every hop count and both directions. + # Expectations are pre-materialized for all of these, so we only compare results against them. + + for ( + query, + expectation, + ) in self.expectations.generate_query_expectation_pairs(self.num_hops): + impacted_entities_expectation = set( + expectation.impacted_entities.keys() + ) + if len(impacted_entities_expectation) == 0: + continue + result = search_across_lineage( + graph, + query.main_entity, + query.hops, + query.direction.value, + query.upconvert_schema_fields_to_datasets, + ) + impacted_entities = { + x["entity"]["urn"] + for x in result["searchAcrossLineage"]["searchResults"] + } + assert ( + impacted_entities
== impacted_entities_expectation + ), f"Expected impacted entities to be {impacted_entities_expectation}, found {impacted_entities}" + search_results = result["searchAcrossLineage"]["searchResults"] + for impacted_entity in impacted_entities: + impacted_entity_paths: List[Path] = [] + entity_paths_response = [ + x["paths"] + for x in search_results + if x["entity"]["urn"] == impacted_entity + ] + for path_response in entity_paths_response: + for p in path_response: + impacted_entity_paths.append( + Path(path=[x["urn"] for x in p["path"]]) + ) + assert len(impacted_entity_paths) == len( + expectation.impacted_entities[impacted_entity] + ), f"Expected length of impacted entity paths to be {len(expectation.impacted_entities[impacted_entity])}, found {len(impacted_entity_paths)}" + assert set(impacted_entity_paths) == set( + expectation.impacted_entities[impacted_entity] + ), f"Expected impacted entity paths to be {expectation.impacted_entities[impacted_entity]}, found {impacted_entity_paths}" + print("Test passed!") + return True + except AssertionError: + print("Test failed!") + raise + + +@pytest.mark.dependency() +def test_healthchecks(wait_for_healthchecks): + # The wait_for_healthchecks fixture does the actual work. + pass + + +@pytest.mark.parametrize( + "lineage_style", + [ + Scenario.LineageStyle.DATASET_QUERY_DATASET, + Scenario.LineageStyle.DATASET_JOB_DATASET, + ], +) +@pytest.mark.parametrize( + "graph_level", + [ + 1, + 2, + 3, + # TODO - convert this to a range of 1 to 10 to make sure we can handle large graphs + ], +) +@pytest.mark.dependency(depends=["test_healthchecks"]) +def test_lineage_via_node( + lineage_style: Scenario.LineageStyle, graph_level: int +) -> None: + scenario: Scenario = Scenario( + hop_platform_map={0: "mysql", 1: "snowflake"}, + lineage_style=lineage_style, + num_hops=graph_level, + default_dataset_prefix=f"{lineage_style.value}.", + ) + + # Create an emitter to the GMS REST API. + (url, token) = get_url_and_token() + with DataHubGraph( + DatahubClientConfig(server=url, token=token, retry_max_times=0) + ) as graph: + emitter = graph + # emitter = DataHubConsoleEmitter() + + # Emit metadata!
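+        # Entities first (datasets plus flows/jobs or queries), then the lineage aspects that reference them; +        # swapping in DataHubConsoleEmitter above prints the MCPs instead of writing to GMS, handy for dry runs.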
+ for mcp in scenario.get_entity_mcps(): + emitter.emit_mcp(mcp) + + for mcps in scenario.get_lineage_mcps(): + emitter.emit_mcp(mcps) + + wait_for_writes_to_sync() + try: + scenario.test_expectation(graph) + finally: + scenario.cleanup(DataHubGraphDeleteAgent(graph)) + + +@pytest.fixture(scope="module") +def chart_urn_fixture(): + return "urn:li:chart:(tableau,2241f3d6-df8d-b515-9c0c-f5e5b347b26e)" + + +@pytest.fixture(scope="module") +def intermediates_fixture(): + return [ + "urn:li:dataset:(urn:li:dataPlatform:tableau,6bd53e72-9fe4-ea86-3d23-14b826c13fa5,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:tableau,1c5653d6-c448-0850-108b-5c78aeaf6b51,PROD)", + ] + + +@pytest.fixture(scope="module") +def destination_urn_fixture(): + return "urn:li:dataset:(urn:li:dataPlatform:external,sales target %28us%29.xlsx.sheet1,PROD)" + + +@pytest.mark.dependency(depends=["test_healthchecks"]) +@pytest.fixture(scope="module", autouse=False) +def ingest_multipath_metadata( + chart_urn_fixture, intermediates_fixture, destination_urn_fixture +): + (url, token) = get_url_and_token() + fake_auditstamp = AuditStampClass( + time=int(time.time() * 1000), + actor="urn:li:corpuser:datahub", + ) + with DataHubGraph( + DatahubClientConfig(server=url, token=token, retry_max_times=0) + ) as graph: + chart_urn = chart_urn_fixture + intermediates = intermediates_fixture + destination_urn = destination_urn_fixture + for mcp in MetadataChangeProposalWrapper.construct_many( + entityUrn=destination_urn, + aspects=[ + DatasetPropertiesClass( + name="sales target (us).xlsx.sheet1", + ), + ], + ): + graph.emit_mcp(mcp) + + for intermediate in intermediates: + for mcp in MetadataChangeProposalWrapper.construct_many( + entityUrn=intermediate, + aspects=[ + DatasetPropertiesClass( + name="intermediate", + ), + UpstreamLineageClass( + upstreams=[ + UpstreamClass( + dataset=destination_urn, + type="TRANSFORMED", + ) + ] + ), + ], + ): + graph.emit_mcp(mcp) + + for mcp in MetadataChangeProposalWrapper.construct_many( + entityUrn=chart_urn, + aspects=[ + ChartInfoClass( + title="chart", + description="chart", + lastModified=ChangeAuditStampsClass(created=fake_auditstamp), + inputEdges=[ + EdgeClass( + destinationUrn=intermediate_entity, + sourceUrn=chart_urn, + ) + for intermediate_entity in intermediates + ], + ) + ], + ): + graph.emit_mcp(mcp) + wait_for_writes_to_sync() + yield + for urn in [chart_urn] + intermediates + [destination_urn]: + graph.delete_entity(urn, hard=True) + wait_for_writes_to_sync() + + +@pytest.mark.dependency(depends=["test_healthchecks"]) +def test_simple_lineage_multiple_paths( + ingest_multipath_metadata, + chart_urn_fixture, + intermediates_fixture, + destination_urn_fixture, +): + chart_urn = chart_urn_fixture + intermediates = intermediates_fixture + destination_urn = destination_urn_fixture + results = search_across_lineage( + get_default_graph(), + chart_urn, + direction="UPSTREAM", + convert_schema_fields_to_datasets=True, + ) + assert destination_urn in [ + x["entity"]["urn"] for x in results["searchAcrossLineage"]["searchResults"] + ] + for search_result in results["searchAcrossLineage"]["searchResults"]: + if search_result["entity"]["urn"] == destination_urn: + assert ( + len(search_result["paths"]) == 2 + ) # 2 paths from the chart to the dataset + for path in search_result["paths"]: + assert len(path["path"]) == 3 + assert path["path"][-1]["urn"] == destination_urn + assert path["path"][0]["urn"] == chart_urn + assert path["path"][1]["urn"] in intermediates
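For reference, a minimal sketch of how the via nodes added by this patch surface to a client, assuming a reachable GMS and reusing the search_across_lineage() helper from the smoke test above; the function name and the urn:li:query prefix check are illustrative, not part of the patch. Query entities ride along inside each returned path, so they can be collected directly:

    from datahub.ingestion.graph.client import get_default_graph

    def via_queries_on_paths(urn: str) -> set:
        # Collect the query URNs that appear as intermediate (via) hops on lineage paths.
        graph = get_default_graph()
        result = search_across_lineage(graph, urn, direction="DOWNSTREAM")
        vias = set()
        for sr in result["searchAcrossLineage"]["searchResults"]:
            for p in sr["paths"] or []:
                vias.update(
                    hop["urn"] for hop in p["path"] if hop["urn"].startswith("urn:li:query:")
                )
        return vias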