diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java index 3ba0cc1f747e3..b99f712034fe0 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java @@ -821,6 +821,7 @@ private void configureQueryResolvers(final RuntimeWiring.Builder builder) { .dataFetcher("glossaryNode", getResolver(glossaryNodeType)) .dataFetcher("domain", getResolver((domainType))) .dataFetcher("dataPlatform", getResolver(dataPlatformType)) + .dataFetcher("dataPlatformInstance", getResolver(dataPlatformInstanceType)) .dataFetcher("mlFeatureTable", getResolver(mlFeatureTableType)) .dataFetcher("mlFeature", getResolver(mlFeatureType)) .dataFetcher("mlPrimaryKey", getResolver(mlPrimaryKeyType)) @@ -1291,7 +1292,8 @@ private void configureCorpUserResolvers(final RuntimeWiring.Builder builder) { */ private void configureCorpGroupResolvers(final RuntimeWiring.Builder builder) { builder.type("CorpGroup", typeWiring -> typeWiring - .dataFetcher("relationships", new EntityRelationshipsResultResolver(graphClient))); + .dataFetcher("relationships", new EntityRelationshipsResultResolver(graphClient)) + .dataFetcher("exists", new EntityExistsResolver(entityService))); builder.type("CorpGroupInfo", typeWiring -> typeWiring .dataFetcher("admins", new LoadableTypeBatchResolver<>(corpUserType, diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/authorization/AuthorizationUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/authorization/AuthorizationUtils.java index 3089b8c8fc2db..03e63c7fb472f 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/authorization/AuthorizationUtils.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/authorization/AuthorizationUtils.java @@ -4,7 +4,7 @@ import com.datahub.plugins.auth.authorization.Authorizer; import com.datahub.authorization.ConjunctivePrivilegeGroup; import com.datahub.authorization.DisjunctivePrivilegeGroup; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.google.common.collect.ImmutableList; import com.linkedin.common.AuditStamp; import com.linkedin.common.urn.Urn; @@ -90,7 +90,7 @@ public static boolean canManageTags(@Nonnull QueryContext context) { } public static boolean canDeleteEntity(@Nonnull Urn entityUrn, @Nonnull QueryContext context) { - return isAuthorized(context, Optional.of(new ResourceSpec(entityUrn.getEntityType(), entityUrn.toString())), PoliciesConfig.DELETE_ENTITY_PRIVILEGE); + return isAuthorized(context, Optional.of(new EntitySpec(entityUrn.getEntityType(), entityUrn.toString())), PoliciesConfig.DELETE_ENTITY_PRIVILEGE); } public static boolean canManageUserCredentials(@Nonnull QueryContext context) { @@ -173,7 +173,7 @@ public static boolean canDeleteQuery(@Nonnull Urn entityUrn, @Nonnull List public static boolean isAuthorized( @Nonnull QueryContext context, - @Nonnull Optional resourceSpec, + @Nonnull Optional resourceSpec, @Nonnull PoliciesConfig.Privilege privilege) { final Authorizer authorizer = context.getAuthorizer(); final String actor = context.getActorUrn(); @@ -196,7 +196,7 @@ public static boolean isAuthorized( @Nonnull String resource, @Nonnull DisjunctivePrivilegeGroup privilegeGroup ) { - final ResourceSpec resourceSpec = new 
ResourceSpec(resourceType, resource); + final EntitySpec resourceSpec = new EntitySpec(resourceType, resource); return AuthUtil.isAuthorized(authorizer, actor, Optional.of(resourceSpec), privilegeGroup); } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataset/DatasetStatsSummaryResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataset/DatasetStatsSummaryResolver.java index 23be49c7e7140..2873866bb34f7 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataset/DatasetStatsSummaryResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataset/DatasetStatsSummaryResolver.java @@ -1,6 +1,6 @@ package com.linkedin.datahub.graphql.resolvers.dataset; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; import com.linkedin.common.urn.Urn; @@ -104,7 +104,7 @@ private CorpUser createPartialUser(final Urn userUrn) { private boolean isAuthorized(final Urn resourceUrn, final QueryContext context) { return AuthorizationUtils.isAuthorized(context, - Optional.of(new ResourceSpec(resourceUrn.getEntityType(), resourceUrn.toString())), + Optional.of(new EntitySpec(resourceUrn.getEntityType(), resourceUrn.toString())), PoliciesConfig.VIEW_DATASET_USAGE_PRIVILEGE); } } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataset/DatasetUsageStatsResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataset/DatasetUsageStatsResolver.java index 20361830ad5a5..e4bec8e896fdf 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataset/DatasetUsageStatsResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataset/DatasetUsageStatsResolver.java @@ -1,6 +1,6 @@ package com.linkedin.datahub.graphql.resolvers.dataset; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; import com.linkedin.datahub.graphql.QueryContext; @@ -52,7 +52,7 @@ public CompletableFuture get(DataFetchingEnvironment environme private boolean isAuthorized(final Urn resourceUrn, final QueryContext context) { return AuthorizationUtils.isAuthorized(context, - Optional.of(new ResourceSpec(resourceUrn.getEntityType(), resourceUrn.toString())), + Optional.of(new EntitySpec(resourceUrn.getEntityType(), resourceUrn.toString())), PoliciesConfig.VIEW_DATASET_USAGE_PRIVILEGE); } } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/load/TimeSeriesAspectResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/load/TimeSeriesAspectResolver.java index 197ca8640559d..f13ebf8373e91 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/load/TimeSeriesAspectResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/load/TimeSeriesAspectResolver.java @@ -1,6 +1,6 @@ package com.linkedin.datahub.graphql.resolvers.load; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.authorization.AuthorizationUtils; import com.linkedin.datahub.graphql.generated.Entity; @@ -79,7 +79,7 @@ 
public TimeSeriesAspectResolver( private boolean isAuthorized(QueryContext context, String urn) { if (_entityName.equals(Constants.DATASET_ENTITY_NAME) && _aspectName.equals( Constants.DATASET_PROFILE_ASPECT_NAME)) { - return AuthorizationUtils.isAuthorized(context, Optional.of(new ResourceSpec(_entityName, urn)), + return AuthorizationUtils.isAuthorized(context, Optional.of(new EntitySpec(_entityName, urn)), PoliciesConfig.VIEW_DATASET_PROFILE_PRIVILEGE); } return true; diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/policy/GetGrantedPrivilegesResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/policy/GetGrantedPrivilegesResolver.java index 2f20fdaf1e9b1..11f7793db82c8 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/policy/GetGrantedPrivilegesResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/policy/GetGrantedPrivilegesResolver.java @@ -2,7 +2,7 @@ import com.datahub.authorization.AuthorizerChain; import com.datahub.authorization.DataHubAuthorizer; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.exception.AuthorizationException; import com.linkedin.datahub.graphql.generated.GetGrantedPrivilegesInput; @@ -33,8 +33,8 @@ public CompletableFuture get(final DataFetchingEnvironment environme if (!isAuthorized(context, actor)) { throw new AuthorizationException("Unauthorized to get privileges for the given author."); } - final Optional resourceSpec = Optional.ofNullable(input.getResourceSpec()) - .map(spec -> new ResourceSpec(EntityTypeMapper.getName(spec.getResourceType()), spec.getResourceUrn())); + final Optional resourceSpec = Optional.ofNullable(input.getResourceSpec()) + .map(spec -> new EntitySpec(EntityTypeMapper.getName(spec.getResourceType()), spec.getResourceUrn())); if (context.getAuthorizer() instanceof AuthorizerChain) { DataHubAuthorizer dataHubAuthorizer = ((AuthorizerChain) context.getAuthorizer()).getDefaultAuthorizer(); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataplatforminstance/DataPlatformInstanceType.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataplatforminstance/DataPlatformInstanceType.java index 2423fc31ea52e..87614e1332528 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataplatforminstance/DataPlatformInstanceType.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataplatforminstance/DataPlatformInstanceType.java @@ -4,16 +4,25 @@ import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; import com.linkedin.datahub.graphql.QueryContext; +import com.linkedin.datahub.graphql.generated.AutoCompleteResults; import com.linkedin.datahub.graphql.generated.DataPlatformInstance; import com.linkedin.datahub.graphql.generated.Entity; import com.linkedin.datahub.graphql.generated.EntityType; +import com.linkedin.datahub.graphql.generated.FacetFilterInput; +import com.linkedin.datahub.graphql.generated.SearchResults; import com.linkedin.datahub.graphql.types.dataplatforminstance.mappers.DataPlatformInstanceMapper; +import com.linkedin.datahub.graphql.types.mappers.AutoCompleteResultsMapper; +import com.linkedin.datahub.graphql.types.SearchableEntityType; import com.linkedin.entity.EntityResponse; import 
com.linkedin.entity.client.EntityClient; import com.linkedin.metadata.Constants; +import com.linkedin.metadata.query.AutoCompleteResult; +import com.linkedin.metadata.query.filter.Filter; import graphql.execution.DataFetcherResult; +import org.apache.commons.lang3.NotImplementedException; import javax.annotation.Nonnull; +import javax.annotation.Nullable; import java.util.ArrayList; import java.util.HashSet; import java.util.List; @@ -22,7 +31,10 @@ import java.util.function.Function; import java.util.stream.Collectors; -public class DataPlatformInstanceType implements com.linkedin.datahub.graphql.types.EntityType { +import static com.linkedin.metadata.Constants.DATA_PLATFORM_INSTANCE_ENTITY_NAME; + +public class DataPlatformInstanceType implements SearchableEntityType, + com.linkedin.datahub.graphql.types.EntityType { static final Set ASPECTS_TO_FETCH = ImmutableSet.of( Constants.DATA_PLATFORM_INSTANCE_KEY_ASPECT_NAME, @@ -84,4 +96,24 @@ public List> batchLoad(@Nonnull List filters, + int start, + int count, + @Nonnull final QueryContext context) throws Exception { + throw new NotImplementedException("Searchable type (deprecated) not implemented on DataPlatformInstance entity type"); + } + + @Override + public AutoCompleteResults autoComplete(@Nonnull String query, + @Nullable String field, + @Nullable Filter filters, + int limit, + @Nonnull final QueryContext context) throws Exception { + final AutoCompleteResult result = _entityClient.autoComplete(DATA_PLATFORM_INSTANCE_ENTITY_NAME, query, + filters, limit, context.getAuthentication()); + return AutoCompleteResultsMapper.map(result); + } + } diff --git a/datahub-graphql-core/src/main/resources/entity.graphql b/datahub-graphql-core/src/main/resources/entity.graphql index 39f86948c77c4..b37a8f34fa056 100644 --- a/datahub-graphql-core/src/main/resources/entity.graphql +++ b/datahub-graphql-core/src/main/resources/entity.graphql @@ -226,6 +226,11 @@ type Query { listOwnershipTypes( "Input required for listing custom ownership types" input: ListOwnershipTypesInput!): ListOwnershipTypesResult! + + """ + Fetch a Data Platform Instance by primary key (urn) + """ + dataPlatformInstance(urn: String!): DataPlatformInstance } """ @@ -3783,6 +3788,11 @@ type CorpGroup implements Entity { Additional read only info about the group """ info: CorpGroupInfo @deprecated + + """ + Whether or not this entity exists on DataHub + """ + exists: Boolean } """ diff --git a/datahub-graphql-core/src/main/resources/search.graphql b/datahub-graphql-core/src/main/resources/search.graphql index 4cabdb04afe77..e0cde5a2db9f9 100644 --- a/datahub-graphql-core/src/main/resources/search.graphql +++ b/datahub-graphql-core/src/main/resources/search.graphql @@ -458,6 +458,26 @@ enum FilterOperator { Represents the relation: The field exists. If the field is an array, the field is either not present or empty. """ EXISTS + + """ + Represent the relation greater than, e.g. ownerCount > 5 + """ + GREATER_THAN + + """ + Represent the relation greater than or equal to, e.g. ownerCount >= 5 + """ + GREATER_THAN_OR_EQUAL_TO + + """ + Represent the relation less than, e.g. ownerCount < 3 + """ + LESS_THAN + + """ + Represent the relation less than or equal to, e.g. 
ownerCount <= 3 + """ + LESS_THAN_OR_EQUAL_TO } """ diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/glossary/GlossaryUtilsTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/glossary/GlossaryUtilsTest.java index ccaab44f60dd4..8bfc32e1999ae 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/glossary/GlossaryUtilsTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/glossary/GlossaryUtilsTest.java @@ -5,7 +5,7 @@ import com.datahub.authorization.AuthorizationRequest; import com.datahub.authorization.AuthorizationResult; import com.datahub.plugins.auth.authorization.Authorizer; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.linkedin.common.urn.GlossaryNodeUrn; import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; @@ -89,17 +89,17 @@ private void setUpTests() throws Exception { Mockito.any(Authentication.class) )).thenReturn(new EntityResponse().setAspects(new EnvelopedAspectMap(parentNode3Aspects))); - final ResourceSpec resourceSpec3 = new ResourceSpec(parentNodeUrn.getEntityType(), parentNodeUrn3.toString()); + final EntitySpec resourceSpec3 = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn3.toString()); mockAuthRequest("MANAGE_GLOSSARY_CHILDREN", AuthorizationResult.Type.DENY, resourceSpec3); - final ResourceSpec resourceSpec2 = new ResourceSpec(parentNodeUrn.getEntityType(), parentNodeUrn2.toString()); + final EntitySpec resourceSpec2 = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn2.toString()); mockAuthRequest("MANAGE_GLOSSARY_CHILDREN", AuthorizationResult.Type.DENY, resourceSpec2); - final ResourceSpec resourceSpec1 = new ResourceSpec(parentNodeUrn.getEntityType(), parentNodeUrn1.toString()); + final EntitySpec resourceSpec1 = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn1.toString()); mockAuthRequest("MANAGE_GLOSSARY_CHILDREN", AuthorizationResult.Type.DENY, resourceSpec1); } - private void mockAuthRequest(String privilege, AuthorizationResult.Type allowOrDeny, ResourceSpec resourceSpec) { + private void mockAuthRequest(String privilege, AuthorizationResult.Type allowOrDeny, EntitySpec resourceSpec) { final AuthorizationRequest authorizationRequest = new AuthorizationRequest( userUrn, privilege, @@ -150,7 +150,7 @@ public void testCanManageChildrenEntitiesAuthorized() throws Exception { // they do NOT have the MANAGE_GLOSSARIES platform privilege mockAuthRequest("MANAGE_GLOSSARIES", AuthorizationResult.Type.DENY, null); - final ResourceSpec resourceSpec = new ResourceSpec(parentNodeUrn.getEntityType(), parentNodeUrn.toString()); + final EntitySpec resourceSpec = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn.toString()); mockAuthRequest("MANAGE_GLOSSARY_CHILDREN", AuthorizationResult.Type.ALLOW, resourceSpec); assertTrue(GlossaryUtils.canManageChildrenEntities(mockContext, parentNodeUrn, mockClient)); @@ -162,7 +162,7 @@ public void testCanManageChildrenEntitiesUnauthorized() throws Exception { // they do NOT have the MANAGE_GLOSSARIES platform privilege mockAuthRequest("MANAGE_GLOSSARIES", AuthorizationResult.Type.DENY, null); - final ResourceSpec resourceSpec = new ResourceSpec(parentNodeUrn.getEntityType(), parentNodeUrn.toString()); + final EntitySpec resourceSpec = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn.toString()); mockAuthRequest("MANAGE_GLOSSARY_CHILDREN", 
AuthorizationResult.Type.DENY, resourceSpec); mockAuthRequest("MANAGE_ALL_GLOSSARY_CHILDREN", AuthorizationResult.Type.DENY, resourceSpec); @@ -175,13 +175,13 @@ public void testCanManageChildrenRecursivelyEntitiesAuthorized() throws Exceptio // they do NOT have the MANAGE_GLOSSARIES platform privilege mockAuthRequest("MANAGE_GLOSSARIES", AuthorizationResult.Type.DENY, null); - final ResourceSpec resourceSpec3 = new ResourceSpec(parentNodeUrn.getEntityType(), parentNodeUrn3.toString()); + final EntitySpec resourceSpec3 = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn3.toString()); mockAuthRequest("MANAGE_ALL_GLOSSARY_CHILDREN", AuthorizationResult.Type.ALLOW, resourceSpec3); - final ResourceSpec resourceSpec2 = new ResourceSpec(parentNodeUrn.getEntityType(), parentNodeUrn2.toString()); + final EntitySpec resourceSpec2 = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn2.toString()); mockAuthRequest("MANAGE_ALL_GLOSSARY_CHILDREN", AuthorizationResult.Type.DENY, resourceSpec2); - final ResourceSpec resourceSpec1 = new ResourceSpec(parentNodeUrn.getEntityType(), parentNodeUrn1.toString()); + final EntitySpec resourceSpec1 = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn1.toString()); mockAuthRequest("MANAGE_ALL_GLOSSARY_CHILDREN", AuthorizationResult.Type.DENY, resourceSpec1); assertTrue(GlossaryUtils.canManageChildrenEntities(mockContext, parentNodeUrn1, mockClient)); @@ -193,13 +193,13 @@ public void testCanManageChildrenRecursivelyEntitiesUnauthorized() throws Except // they do NOT have the MANAGE_GLOSSARIES platform privilege mockAuthRequest("MANAGE_GLOSSARIES", AuthorizationResult.Type.DENY, null); - final ResourceSpec resourceSpec3 = new ResourceSpec(parentNodeUrn.getEntityType(), parentNodeUrn3.toString()); + final EntitySpec resourceSpec3 = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn3.toString()); mockAuthRequest("MANAGE_ALL_GLOSSARY_CHILDREN", AuthorizationResult.Type.DENY, resourceSpec3); - final ResourceSpec resourceSpec2 = new ResourceSpec(parentNodeUrn.getEntityType(), parentNodeUrn2.toString()); + final EntitySpec resourceSpec2 = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn2.toString()); mockAuthRequest("MANAGE_ALL_GLOSSARY_CHILDREN", AuthorizationResult.Type.DENY, resourceSpec2); - final ResourceSpec resourceSpec1 = new ResourceSpec(parentNodeUrn.getEntityType(), parentNodeUrn1.toString()); + final EntitySpec resourceSpec1 = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn1.toString()); mockAuthRequest("MANAGE_ALL_GLOSSARY_CHILDREN", AuthorizationResult.Type.DENY, resourceSpec1); assertFalse(GlossaryUtils.canManageChildrenEntities(mockContext, parentNodeUrn1, mockClient)); @@ -211,10 +211,10 @@ public void testCanManageChildrenRecursivelyEntitiesAuthorizedLevel2() throws Ex // they do NOT have the MANAGE_GLOSSARIES platform privilege mockAuthRequest("MANAGE_GLOSSARIES", AuthorizationResult.Type.DENY, null); - final ResourceSpec resourceSpec2 = new ResourceSpec(parentNodeUrn.getEntityType(), parentNodeUrn2.toString()); + final EntitySpec resourceSpec2 = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn2.toString()); mockAuthRequest("MANAGE_ALL_GLOSSARY_CHILDREN", AuthorizationResult.Type.ALLOW, resourceSpec2); - final ResourceSpec resourceSpec1 = new ResourceSpec(parentNodeUrn.getEntityType(), parentNodeUrn1.toString()); + final EntitySpec resourceSpec1 = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn1.toString()); mockAuthRequest("MANAGE_ALL_GLOSSARY_CHILDREN", 
AuthorizationResult.Type.DENY, resourceSpec1); assertTrue(GlossaryUtils.canManageChildrenEntities(mockContext, parentNodeUrn1, mockClient)); @@ -226,10 +226,10 @@ public void testCanManageChildrenRecursivelyEntitiesUnauthorizedLevel2() throws // they do NOT have the MANAGE_GLOSSARIES platform privilege mockAuthRequest("MANAGE_GLOSSARIES", AuthorizationResult.Type.DENY, null); - final ResourceSpec resourceSpec3 = new ResourceSpec(parentNodeUrn.getEntityType(), parentNodeUrn3.toString()); + final EntitySpec resourceSpec3 = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn3.toString()); mockAuthRequest("MANAGE_ALL_GLOSSARY_CHILDREN", AuthorizationResult.Type.DENY, resourceSpec3); - final ResourceSpec resourceSpec2 = new ResourceSpec(parentNodeUrn.getEntityType(), parentNodeUrn2.toString()); + final EntitySpec resourceSpec2 = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn2.toString()); mockAuthRequest("MANAGE_ALL_GLOSSARY_CHILDREN", AuthorizationResult.Type.DENY, resourceSpec2); assertFalse(GlossaryUtils.canManageChildrenEntities(mockContext, parentNodeUrn2, mockClient)); @@ -241,7 +241,7 @@ public void testCanManageChildrenRecursivelyEntitiesNoLevel2() throws Exception // they do NOT have the MANAGE_GLOSSARIES platform privilege mockAuthRequest("MANAGE_GLOSSARIES", AuthorizationResult.Type.DENY, null); - final ResourceSpec resourceSpec3 = new ResourceSpec(parentNodeUrn.getEntityType(), parentNodeUrn3.toString()); + final EntitySpec resourceSpec3 = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn3.toString()); mockAuthRequest("MANAGE_ALL_GLOSSARY_CHILDREN", AuthorizationResult.Type.DENY, resourceSpec3); assertFalse(GlossaryUtils.canManageChildrenEntities(mockContext, parentNodeUrn3, mockClient)); diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/query/CreateQueryResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/query/CreateQueryResolverTest.java index 196eb24b52bf8..9c04c67dd3a3b 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/query/CreateQueryResolverTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/query/CreateQueryResolverTest.java @@ -5,7 +5,7 @@ import com.datahub.authentication.Authentication; import com.datahub.authorization.AuthorizationRequest; import com.datahub.authorization.AuthorizationResult; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.datahub.plugins.auth.authorization.Authorizer; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; @@ -201,7 +201,7 @@ private QueryContext getMockQueryContext(boolean allowEditEntityQueries) { TEST_ACTOR_URN.toString(), PoliciesConfig.EDIT_QUERIES_PRIVILEGE.getType(), Optional.of( - new ResourceSpec( + new EntitySpec( TEST_DATASET_URN.getEntityType(), TEST_DATASET_URN.toString())) ); @@ -210,7 +210,7 @@ private QueryContext getMockQueryContext(boolean allowEditEntityQueries) { TEST_ACTOR_URN.toString(), PoliciesConfig.EDIT_ENTITY_PRIVILEGE.getType(), Optional.of( - new ResourceSpec( + new EntitySpec( TEST_DATASET_URN.getEntityType(), TEST_DATASET_URN.toString())) ); diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/query/DeleteQueryResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/query/DeleteQueryResolverTest.java index a6b4887b0e882..78c894f27cbc3 100644 --- 
a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/query/DeleteQueryResolverTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/query/DeleteQueryResolverTest.java @@ -5,7 +5,7 @@ import com.datahub.authentication.Authentication; import com.datahub.authorization.AuthorizationRequest; import com.datahub.authorization.AuthorizationResult; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.datahub.plugins.auth.authorization.Authorizer; import com.google.common.collect.ImmutableList; import com.linkedin.common.urn.Urn; @@ -134,7 +134,7 @@ private QueryContext getMockQueryContext(boolean allowEditEntityQueries) { DeleteQueryResolverTest.TEST_ACTOR_URN.toString(), PoliciesConfig.EDIT_QUERIES_PRIVILEGE.getType(), Optional.of( - new ResourceSpec( + new EntitySpec( DeleteQueryResolverTest.TEST_DATASET_URN.getEntityType(), DeleteQueryResolverTest.TEST_DATASET_URN.toString())) ); @@ -143,7 +143,7 @@ private QueryContext getMockQueryContext(boolean allowEditEntityQueries) { TEST_ACTOR_URN.toString(), PoliciesConfig.EDIT_ENTITY_PRIVILEGE.getType(), Optional.of( - new ResourceSpec( + new EntitySpec( TEST_DATASET_URN.getEntityType(), TEST_DATASET_URN.toString())) ); diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/query/UpdateQueryResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/query/UpdateQueryResolverTest.java index 7a76b6d6be5a4..9b500b5fb3936 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/query/UpdateQueryResolverTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/query/UpdateQueryResolverTest.java @@ -5,7 +5,7 @@ import com.datahub.authentication.Authentication; import com.datahub.authorization.AuthorizationRequest; import com.datahub.authorization.AuthorizationResult; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.datahub.plugins.auth.authorization.Authorizer; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; @@ -206,7 +206,7 @@ private QueryContext getMockQueryContext(boolean allowEditEntityQueries) { TEST_ACTOR_URN.toString(), PoliciesConfig.EDIT_QUERIES_PRIVILEGE.getType(), Optional.of( - new ResourceSpec( + new EntitySpec( TEST_DATASET_URN.getEntityType(), TEST_DATASET_URN.toString())) ); @@ -215,7 +215,7 @@ private QueryContext getMockQueryContext(boolean allowEditEntityQueries) { TEST_ACTOR_URN.toString(), PoliciesConfig.EDIT_ENTITY_PRIVILEGE.getType(), Optional.of( - new ResourceSpec( + new EntitySpec( TEST_DATASET_URN.getEntityType(), TEST_DATASET_URN.toString())) ); @@ -224,7 +224,7 @@ private QueryContext getMockQueryContext(boolean allowEditEntityQueries) { TEST_ACTOR_URN.toString(), PoliciesConfig.EDIT_QUERIES_PRIVILEGE.getType(), Optional.of( - new ResourceSpec( + new EntitySpec( TEST_DATASET_URN_2.getEntityType(), TEST_DATASET_URN_2.toString())) ); @@ -233,7 +233,7 @@ private QueryContext getMockQueryContext(boolean allowEditEntityQueries) { TEST_ACTOR_URN.toString(), PoliciesConfig.EDIT_ENTITY_PRIVILEGE.getType(), Optional.of( - new ResourceSpec( + new EntitySpec( TEST_DATASET_URN_2.getEntityType(), TEST_DATASET_URN_2.toString())) ); diff --git a/datahub-web-react/src/app/domain/CreateDomainModal.tsx b/datahub-web-react/src/app/domain/CreateDomainModal.tsx index ca1bc30596003..606444d34bdc9 100644 --- 
a/datahub-web-react/src/app/domain/CreateDomainModal.tsx +++ b/datahub-web-react/src/app/domain/CreateDomainModal.tsx @@ -191,7 +191,10 @@ export default function CreateDomainModal({ onClose, onCreate }: Props) { rules={[{ whitespace: true }, { min: 1, max: 500 }]} hasFeedback > - + diff --git a/datahub-web-react/src/app/domain/nestedDomains/ManageDomainsPageV2.tsx b/datahub-web-react/src/app/domain/nestedDomains/ManageDomainsPageV2.tsx index 0e5c035df00c1..b69f0c5458b5d 100644 --- a/datahub-web-react/src/app/domain/nestedDomains/ManageDomainsPageV2.tsx +++ b/datahub-web-react/src/app/domain/nestedDomains/ManageDomainsPageV2.tsx @@ -42,7 +42,12 @@ export default function ManageDomainsPageV2() {
-
diff --git a/datahub-web-react/src/app/domain/nestedDomains/domainNavigator/DomainNode.tsx b/datahub-web-react/src/app/domain/nestedDomains/domainNavigator/DomainNode.tsx index 09c8e13853bb7..bf70bd043fd4a 100644 --- a/datahub-web-react/src/app/domain/nestedDomains/domainNavigator/DomainNode.tsx +++ b/datahub-web-react/src/app/domain/nestedDomains/domainNavigator/DomainNode.tsx @@ -103,7 +103,7 @@ export default function DomainNode({ domain, numDomainChildren, domainUrnToHide, return ( <> - + {hasDomainChildren && ( diff --git a/datahub-web-react/src/app/entity/group/GroupProfile.tsx b/datahub-web-react/src/app/entity/group/GroupProfile.tsx index d5e284af931df..53d2062277dec 100644 --- a/datahub-web-react/src/app/entity/group/GroupProfile.tsx +++ b/datahub-web-react/src/app/entity/group/GroupProfile.tsx @@ -11,6 +11,7 @@ import { RoutedTabs } from '../../shared/RoutedTabs'; import GroupInfoSidebar from './GroupInfoSideBar'; import { GroupAssets } from './GroupAssets'; import { ErrorSection } from '../../shared/error/ErrorSection'; +import NonExistentEntityPage from '../shared/entity/NonExistentEntityPage'; const messageStyle = { marginTop: '10%' }; @@ -110,6 +111,9 @@ export default function GroupProfile() { urn, }; + if (data?.corpGroup?.exists === false) { + return ; + } return ( <> {error && } diff --git a/datahub-web-react/src/app/entity/shared/EntityDropdown/EntityDropdown.tsx b/datahub-web-react/src/app/entity/shared/EntityDropdown/EntityDropdown.tsx index be975249b2670..bfb7ff7e540c4 100644 --- a/datahub-web-react/src/app/entity/shared/EntityDropdown/EntityDropdown.tsx +++ b/datahub-web-react/src/app/entity/shared/EntityDropdown/EntityDropdown.tsx @@ -203,7 +203,7 @@ function EntityDropdown(props: Props) { disabled={isMoveDisabled(entityType, entityData, me.platformPrivileges)} onClick={() => setIsMoveModalVisible(true)} > - +  Move @@ -223,7 +223,7 @@ function EntityDropdown(props: Props) { : undefined } > - +  Delete diff --git a/datahub-web-react/src/app/entity/shared/EntityDropdown/MoveDomainModal.tsx b/datahub-web-react/src/app/entity/shared/EntityDropdown/MoveDomainModal.tsx index cdbf6fdabf3c9..3826f934c1c25 100644 --- a/datahub-web-react/src/app/entity/shared/EntityDropdown/MoveDomainModal.tsx +++ b/datahub-web-react/src/app/entity/shared/EntityDropdown/MoveDomainModal.tsx @@ -67,6 +67,7 @@ function MoveDomainModal(props: Props) { return ( Cancel - + } > diff --git a/datahub-web-react/src/app/settings/AccessTokenModal.tsx b/datahub-web-react/src/app/settings/AccessTokenModal.tsx index 0303db656c2a8..10427210d0692 100644 --- a/datahub-web-react/src/app/settings/AccessTokenModal.tsx +++ b/datahub-web-react/src/app/settings/AccessTokenModal.tsx @@ -60,7 +60,7 @@ export const AccessTokenModal = ({ visible, onClose, accessToken, expiresInText onCancel={onClose} footer={ <> - @@ -81,7 +81,7 @@ export const AccessTokenModal = ({ visible, onClose, accessToken, expiresInText Token {expiresInText} -
{accessToken}
+
{accessToken}
diff --git a/datahub-web-react/src/app/settings/AccessTokens.tsx b/datahub-web-react/src/app/settings/AccessTokens.tsx index 02ff3f1cd304c..c7a015de392da 100644 --- a/datahub-web-react/src/app/settings/AccessTokens.tsx +++ b/datahub-web-react/src/app/settings/AccessTokens.tsx @@ -199,7 +199,12 @@ export const AccessTokens = () => { key: 'x', render: (_, record: any) => ( - diff --git a/datahub-web-react/src/app/settings/CreateTokenModal.tsx b/datahub-web-react/src/app/settings/CreateTokenModal.tsx index 6038a86e23303..3cc446651efcb 100644 --- a/datahub-web-react/src/app/settings/CreateTokenModal.tsx +++ b/datahub-web-react/src/app/settings/CreateTokenModal.tsx @@ -117,10 +117,15 @@ export default function CreateTokenModal({ currentUserUrn, visible, onClose, onC onCancel={onModalClose} footer={ <> - - @@ -148,18 +153,21 @@ export default function CreateTokenModal({ currentUserUrn, visible, onClose, onC ]} hasFeedback > - + Description}> An optional description for your new token. - + Expires in - + {ACCESS_TOKEN_DURATIONS.map((duration) => ( diff --git a/datahub-web-react/src/graphql/group.graphql b/datahub-web-react/src/graphql/group.graphql index 9aa6e2b005f16..1007721e51a4e 100644 --- a/datahub-web-react/src/graphql/group.graphql +++ b/datahub-web-react/src/graphql/group.graphql @@ -3,6 +3,7 @@ query getGroup($urn: String!, $membersCount: Int!) { urn type name + exists origin { type externalType diff --git a/docker/datahub-ingestion-base/base-requirements.txt b/docker/datahub-ingestion-base/base-requirements.txt index 82d9a93a9a2c3..eb082d50b3020 100644 --- a/docker/datahub-ingestion-base/base-requirements.txt +++ b/docker/datahub-ingestion-base/base-requirements.txt @@ -2,62 +2,58 @@ # pyspark==3.0.3 # pydeequ==1.0.1 -acryl-datahub-classify==0.0.6 -acryl-iceberg-legacy==0.0.4 -acryl-PyHive==0.6.13 -aenum==3.1.12 -aiohttp==3.8.4 +acryl-datahub-classify==0.0.8 +acryl-PyHive==0.6.14 +acryl-sqlglot==18.5.2.dev45 +aenum==3.1.15 +aiohttp==3.8.6 aiosignal==1.3.1 -alembic==1.11.1 +alembic==1.12.0 altair==4.2.0 -anyio==3.7.0 -apache-airflow==2.6.1 -apache-airflow-providers-common-sql==1.5.1 -apache-airflow-providers-ftp==3.4.1 -apache-airflow-providers-http==4.4.1 -apache-airflow-providers-imap==3.2.1 -apache-airflow-providers-sqlite==3.4.1 -apispec==5.2.2 +anyio==3.7.1 +apache-airflow==2.7.2 +apache-airflow-providers-common-sql==1.7.2 +apache-airflow-providers-ftp==3.5.2 +apache-airflow-providers-http==4.5.2 +apache-airflow-providers-imap==3.3.2 +apache-airflow-providers-sqlite==3.4.3 +apispec==6.3.0 appdirs==1.4.4 appnope==0.1.3 -argcomplete==3.0.8 -argon2-cffi==21.3.0 +argcomplete==3.1.2 +argon2-cffi==23.1.0 argon2-cffi-bindings==21.2.0 asgiref==3.7.2 asn1crypto==1.5.1 -asttokens==2.2.1 -async-timeout==4.0.2 +asttokens==2.4.0 +async-timeout==4.0.3 asynch==0.2.2 attrs==23.1.0 avro==1.10.2 -avro-gen3==0.7.10 -azure-core==1.26.4 -azure-identity==1.10.0 -azure-storage-blob==12.16.0 -azure-storage-file-datalake==12.11.0 -Babel==2.12.1 +avro-gen3==0.7.11 +Babel==2.13.0 backcall==0.2.0 backoff==2.2.1 beautifulsoup4==4.12.2 -bleach==6.0.0 -blinker==1.6.2 -blis==0.7.9 -boto3==1.26.142 -botocore==1.29.142 +bleach==6.1.0 +blinker==1.6.3 +blis==0.7.11 +boto3==1.28.62 +botocore==1.31.62 bowler==0.9.0 -bracex==2.3.post1 +bracex==2.4 cached-property==1.5.2 cachelib==0.9.0 cachetools==5.3.1 -catalogue==2.0.8 -cattrs==22.2.0 -certifi==2023.5.7 -cffi==1.15.1 -chardet==5.1.0 -charset-normalizer==2.1.1 +catalogue==2.0.10 +cattrs==23.1.2 +certifi==2023.7.22 +cffi==1.16.0 +chardet==5.2.0 +charset-normalizer==3.3.0 
ciso8601==2.3.0 -click==8.1.3 -click-default-group==1.2.2 +click==8.1.7 +click-default-group==1.2.4 click-spinner==0.1.10 clickclick==20.10.2 clickhouse-cityhash==1.0.2.4 @@ -66,205 +62,217 @@ clickhouse-sqlalchemy==0.2.4 cloudpickle==2.2.1 colorama==0.4.6 colorlog==4.8.0 -confection==0.0.4 +comm==0.1.4 +confection==0.1.3 ConfigUpdater==3.1.1 confluent-kafka==1.8.2 connexion==2.14.2 cron-descriptor==1.4.0 -croniter==1.3.15 -cryptography==37.0.4 +croniter==2.0.1 +cryptography==41.0.4 cx-Oracle==8.3.0 -cymem==2.0.7 -dask==2023.5.1 -databricks-cli==0.17.7 +cymem==2.0.8 +dask==2023.9.3 +databricks-cli==0.18.0 databricks-dbapi==0.6.0 -databricks-sdk==0.1.8 -debugpy==1.6.7 +databricks-sdk==0.10.0 +debugpy==1.8.0 decorator==5.1.1 defusedxml==0.7.1 -deltalake==0.9.0 +deltalake==0.11.0 Deprecated==1.2.14 -dill==0.3.6 -dnspython==2.3.0 -docker==6.1.2 +dill==0.3.7 +dnspython==2.4.2 +docker==6.1.3 docutils==0.20.1 ecdsa==0.18.0 elasticsearch==7.13.4 email-validator==1.3.1 entrypoints==0.4 et-xmlfile==1.1.0 -exceptiongroup==1.1.1 -executing==1.2.0 -expandvars==0.9.0 -fastapi==0.95.2 -fastavro==1.7.4 -fastjsonschema==2.17.1 -feast==0.29.0 -filelock==3.12.0 +exceptiongroup==1.1.3 +executing==2.0.0 +expandvars==0.11.0 +fastapi==0.103.2 +fastavro==1.8.4 +fastjsonschema==2.18.1 +feast==0.31.1 +filelock==3.12.4 fissix==21.11.13 Flask==2.2.5 flatdict==4.0.1 -frozenlist==1.3.3 -fsspec==2023.5.0 +frozenlist==1.4.0 +fsspec==2023.9.2 future==0.18.3 -GeoAlchemy2==0.13.3 +GeoAlchemy2==0.14.1 gitdb==4.0.10 -GitPython==3.1.31 -google-api-core==2.11.0 -google-auth==2.19.0 -google-cloud-appengine-logging==1.3.0 +GitPython==3.1.37 +google-api-core==2.12.0 +google-auth==2.23.3 +google-cloud-appengine-logging==1.3.2 google-cloud-audit-log==0.2.5 -google-cloud-bigquery==3.10.0 -google-cloud-bigquery-storage==2.19.1 -google-cloud-core==2.3.2 +google-cloud-bigquery==3.12.0 +google-cloud-core==2.3.3 google-cloud-datacatalog-lineage==0.2.2 google-cloud-logging==3.5.0 google-crc32c==1.5.0 -google-resumable-media==2.5.0 -googleapis-common-protos==1.59.0 +google-re2==1.1 +google-resumable-media==2.6.0 +googleapis-common-protos==1.60.0 gql==3.4.1 graphql-core==3.2.3 graphviz==0.20.1 great-expectations==0.15.50 -greenlet==2.0.2 +greenlet==3.0.0 grpc-google-iam-v1==0.12.6 -grpcio==1.54.2 -grpcio-reflection==1.54.2 -grpcio-status==1.54.2 -grpcio-tools==1.54.2 -gssapi==1.8.2 -gunicorn==20.1.0 +grpcio==1.59.0 +grpcio-reflection==1.59.0 +grpcio-status==1.59.0 +grpcio-tools==1.59.0 +gssapi==1.8.3 +gunicorn==21.2.0 h11==0.14.0 -hmsclient==0.1.1 -httpcore==0.17.2 -httptools==0.5.0 -httpx==0.24.1 +httpcore==0.18.0 +httptools==0.6.0 +httpx==0.25.0 humanfriendly==10.0 idna==3.4 -ijson==3.2.0.post0 -importlib-metadata==6.6.0 -importlib-resources==5.12.0 +ijson==3.2.3 +importlib-metadata==6.8.0 +importlib-resources==6.1.0 inflection==0.5.1 ipaddress==1.0.23 ipykernel==6.17.1 -ipython==8.13.2 +ipython==8.16.1 ipython-genutils==0.2.0 -ipywidgets==8.0.6 +ipywidgets==8.1.1 iso3166==2.1.1 isodate==0.6.1 itsdangerous==2.1.2 -jedi==0.18.2 +jedi==0.19.1 Jinja2==3.1.2 jmespath==1.0.1 JPype1==1.4.1 -jsonlines==3.1.0 -jsonpatch==1.32 -jsonpointer==2.3 +jsonlines==4.0.0 +jsonpatch==1.33 +jsonpointer==2.4 jsonref==1.1.0 -jsonschema==4.17.3 +jsonschema==4.19.1 +jsonschema-specifications==2023.7.1 jupyter-server==1.24.0 jupyter_client==7.4.9 jupyter_core==4.12.0 jupyterlab-pygments==0.2.2 -jupyterlab-widgets==3.0.7 +jupyterlab-widgets==3.0.9 langcodes==3.3.0 lark==1.1.4 lazy-object-proxy==1.9.0 leb128==1.0.5 -limits==3.5.0 +limits==3.6.0 linear-tsv==1.1.0 
linkify-it-py==2.0.2 lkml==1.3.1 locket==1.0.0 lockfile==0.12.2 looker-sdk==23.0.0 -lxml==4.9.2 +lxml==4.9.3 lz4==4.3.2 makefun==1.15.1 Mako==1.2.4 -Markdown==3.4.3 -markdown-it-py==2.2.0 -MarkupSafe==2.1.2 -marshmallow==3.19.0 -marshmallow-enum==1.5.1 +Markdown==3.5 +markdown-it-py==3.0.0 +MarkupSafe==2.1.3 +marshmallow==3.20.1 marshmallow-oneofschema==3.0.1 marshmallow-sqlalchemy==0.26.1 matplotlib-inline==0.1.6 -mdit-py-plugins==0.3.5 +mdit-py-plugins==0.4.0 mdurl==0.1.2 -mistune==2.0.5 +mistune==3.0.2 mixpanel==4.10.0 -mmh3==4.0.0 -more-itertools==9.1.0 +mlflow-skinny==2.7.1 +mmh3==4.0.1 +mmhash3==3.0.1 +more-itertools==10.1.0 moreorless==0.4.0 -moto==4.1.10 -msal==1.16.0 -msal-extensions==1.0.0 +moto==4.2.5 +msal==1.22.0 multidict==6.0.4 -murmurhash==1.0.9 -mypy==1.3.0 +murmurhash==1.0.10 +mypy==1.6.0 mypy-extensions==1.0.0 nbclassic==1.0.0 nbclient==0.6.3 -nbconvert==7.4.0 -nbformat==5.8.0 -nest-asyncio==1.5.6 +nbconvert==7.9.2 +nbformat==5.9.1 +nest-asyncio==1.5.8 networkx==3.1 -notebook==6.5.4 +notebook==6.5.6 notebook_shim==0.2.3 -numpy==1.24.3 +numpy==1.26.0 oauthlib==3.2.2 okta==1.7.0 +openlineage-airflow==1.2.0 +openlineage-integration-common==1.2.0 +openlineage-python==1.2.0 +openlineage_sql==1.2.0 openpyxl==3.1.2 +opentelemetry-api==1.20.0 +opentelemetry-exporter-otlp==1.20.0 +opentelemetry-exporter-otlp-proto-common==1.20.0 +opentelemetry-exporter-otlp-proto-grpc==1.20.0 +opentelemetry-exporter-otlp-proto-http==1.20.0 +opentelemetry-proto==1.20.0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 ordered-set==4.1.0 oscrypto==1.3.0 -packaging==23.1 +packaging==23.2 pandas==1.5.3 pandavro==1.5.2 pandocfilters==1.5.0 -parse==1.19.0 +parse==1.19.1 parso==0.8.3 -partd==1.4.0 -pathspec==0.9.0 -pathy==0.10.1 +partd==1.4.1 +pathspec==0.11.2 +pathy==0.10.2 pendulum==2.1.2 pexpect==4.8.0 phonenumbers==8.13.0 pickleshare==0.7.5 -platformdirs==3.5.1 -pluggy==1.0.0 -portalocker==2.7.0 -preshed==3.0.8 +platformdirs==3.11.0 +pluggy==1.3.0 +preshed==3.0.9 prison==0.2.1 progressbar2==4.2.0 -prometheus-client==0.17.0 -prompt-toolkit==3.0.38 -proto-plus==1.22.2 -protobuf==4.23.2 +prometheus-client==0.17.1 +prompt-toolkit==3.0.39 +proto-plus==1.22.3 +protobuf==4.24.4 psutil==5.9.5 -psycopg2-binary==2.9.6 +psycopg2-binary==2.9.9 ptyprocess==0.7.0 pure-eval==0.2.2 pure-sasl==0.6.2 -py-partiql-parser==0.3.0 -pyarrow==8.0.0 +py-partiql-parser==0.3.7 +pyarrow==11.0.0 pyasn1==0.5.0 pyasn1-modules==0.3.0 pyathena==2.4.1 pycountry==22.3.5 pycparser==2.21 -pycryptodome==3.18.0 -pycryptodomex==3.18.0 -pydantic==1.10.8 -pydash==7.0.3 +pycryptodome==3.19.0 +pycryptodomex==3.19.0 +pydantic==1.10.13 +pydash==7.0.6 pydruid==0.6.5 -Pygments==2.15.1 -pymongo==4.3.3 -PyMySQL==1.0.3 -pyOpenSSL==22.0.0 +Pygments==2.16.1 +pyiceberg==0.4.0 +pymongo==4.5.0 +PyMySQL==1.1.0 +pyOpenSSL==23.2.0 pyparsing==3.0.9 -pyrsistent==0.19.3 -pyspnego==0.9.0 +pyspnego==0.10.2 python-daemon==3.0.1 python-dateutil==2.8.2 python-dotenv==1.0.0 @@ -272,111 +280,115 @@ python-jose==3.3.0 python-ldap==3.4.3 python-nvd3==0.15.0 python-slugify==8.0.1 -python-stdnum==1.18 -python-tds==1.12.0 -python-utils==3.6.0 +python-stdnum==1.19 +python-tds==1.13.0 +python-utils==3.8.1 python3-openid==3.2.0 -pytz==2023.3 +pytz==2023.3.post1 pytzdata==2020.1 -PyYAML==6.0 -pyzmq==25.1.0 +PyYAML==6.0.1 +pyzmq==24.0.1 ratelimiter==1.2.0.post0 redash-toolbelt==0.1.9 -redshift-connector==2.0.910 -regex==2023.5.5 -requests==2.28.2 +redshift-connector==2.0.914 +referencing==0.30.2 +regex==2023.10.3 +requests==2.31.0 requests-file==1.5.1 
requests-gssapi==1.2.3 requests-ntlm==1.2.0 requests-toolbelt==0.10.1 -responses==0.23.1 -retrying==1.3.4 +responses==0.23.3 rfc3339-validator==0.1.4 rfc3986==2.0.0 -rich==13.3.5 -rich_argparse==1.1.0 +rich==13.6.0 +rich-argparse==1.3.0 +rpds-py==0.10.6 rsa==4.9 ruamel.yaml==0.17.17 -s3transfer==0.6.1 -sasl3==0.2.11 -schwifty==2023.3.0 -scipy==1.10.1 +ruamel.yaml.clib==0.2.8 +s3transfer==0.7.0 +schwifty==2023.9.0 +scipy==1.11.3 scramp==1.4.4 Send2Trash==1.8.2 -setproctitle==1.3.2 -simple-salesforce==1.12.4 +sentry-sdk==1.32.0 +setproctitle==1.3.3 +simple-salesforce==1.12.5 six==1.16.0 -smart-open==6.3.0 -smmap==5.0.0 +smart-open==6.4.0 +smmap==5.0.1 sniffio==1.3.0 -snowflake-connector-python==2.9.0 -snowflake-sqlalchemy==1.4.7 -soupsieve==2.4.1 +snowflake-connector-python==3.2.1 +snowflake-sqlalchemy==1.5.0 +sortedcontainers==2.4.0 +soupsieve==2.5 spacy==3.4.3 spacy-legacy==3.0.12 -spacy-loggers==1.0.4 +spacy-loggers==1.0.5 sql-metadata==2.2.2 -SQLAlchemy==1.4.41 -sqlalchemy-bigquery==1.6.1 +SQLAlchemy==1.4.44 +sqlalchemy-bigquery==1.8.0 SQLAlchemy-JSONField==1.0.1.post0 sqlalchemy-pytds==0.3.5 sqlalchemy-redshift==0.8.14 SQLAlchemy-Utils==0.41.1 -sqlalchemy2-stubs==0.0.2a34 -sqllineage==1.3.6 -sqlparse==0.4.3 -srsly==2.4.6 -stack-data==0.6.2 +sqlalchemy2-stubs==0.0.2a35 +sqllineage==1.3.8 +sqlparse==0.4.4 +srsly==2.4.8 +stack-data==0.6.3 starlette==0.27.0 +strictyaml==1.7.3 tableauserverclient==0.25 tableschema==1.20.2 tabulate==0.9.0 tabulator==1.53.5 -tenacity==8.2.2 +tenacity==8.2.3 termcolor==2.3.0 terminado==0.17.1 text-unidecode==1.3 -thinc==8.1.10 -thrift==0.16.0 +thinc==8.1.12 +thrift==0.13.0 thrift-sasl==0.4.3 tinycss2==1.2.1 toml==0.10.2 tomli==2.0.1 +tomlkit==0.12.1 toolz==0.12.0 -tornado==6.3.2 -tqdm==4.65.0 +tornado==6.3.3 +tqdm==4.66.1 traitlets==5.2.1.post0 -trino==0.324.0 +trino==0.327.0 typeguard==2.13.3 typer==0.7.0 -types-PyYAML==6.0.12.10 +types-PyYAML==6.0.12.12 typing-inspect==0.9.0 -typing_extensions==4.5.0 -tzlocal==5.0.1 +typing_extensions==4.8.0 +tzlocal==5.1 uc-micro-py==1.0.2 -ujson==5.7.0 +ujson==5.8.0 unicodecsv==0.14.1 -urllib3==1.26.16 -uvicorn==0.22.0 +urllib3==1.26.17 +uvicorn==0.23.2 uvloop==0.17.0 -vertica-python==1.3.2 -vertica-sqlalchemy-dialect==0.0.1 +vertica-python==1.3.5 +vertica-sqlalchemy-dialect==0.0.8 vininfo==1.7.0 volatile==2.1.0 wasabi==0.10.1 -watchfiles==0.19.0 -wcmatch==8.4.1 -wcwidth==0.2.6 +watchfiles==0.20.0 +wcmatch==8.5 +wcwidth==0.2.8 webencodings==0.5.1 -websocket-client==1.5.2 +websocket-client==1.6.4 websockets==11.0.3 Werkzeug==2.2.3 -widgetsnbextension==4.0.7 +widgetsnbextension==4.0.9 wrapt==1.15.0 -WTForms==3.0.1 +WTForms==3.1.0 xlrd==2.0.1 xmltodict==0.13.0 yarl==1.9.2 zeep==4.2.1 -zipp==3.15.0 -zstd==1.5.5.1 +zstd==1.5.5.1 \ No newline at end of file diff --git a/docker/mariadb/init.sql b/docker/mariadb/init.sql index c4132575cf442..95c8cabbc5ca4 100644 --- a/docker/mariadb/init.sql +++ b/docker/mariadb/init.sql @@ -28,3 +28,5 @@ insert into metadata_aspect_v2 (urn, aspect, version, metadata, createdon, creat now(), 'urn:li:corpuser:__datahub_system' ); + +DROP TABLE IF EXISTS metadata_index; diff --git a/docker/mysql-setup/init.sql b/docker/mysql-setup/init.sql index 2370a971941d2..b789329ddfd17 100644 --- a/docker/mysql-setup/init.sql +++ b/docker/mysql-setup/init.sql @@ -39,3 +39,5 @@ INSERT INTO metadata_aspect_v2 SELECT * FROM temp_metadata_aspect_v2 WHERE NOT EXISTS (SELECT * from metadata_aspect_v2); DROP TABLE temp_metadata_aspect_v2; + +DROP TABLE IF EXISTS metadata_index; diff --git a/docker/mysql/init.sql 
b/docker/mysql/init.sql index b4b4e4617806c..aca57d7cd444c 100644 --- a/docker/mysql/init.sql +++ b/docker/mysql/init.sql @@ -27,3 +27,5 @@ INSERT INTO metadata_aspect_v2 (urn, aspect, version, metadata, createdon, creat now(), 'urn:li:corpuser:__datahub_system' ); + +DROP TABLE IF EXISTS metadata_index; diff --git a/docker/postgres-setup/init.sql b/docker/postgres-setup/init.sql index 12fff7aec7fe6..72b2f73192e00 100644 --- a/docker/postgres-setup/init.sql +++ b/docker/postgres-setup/init.sql @@ -35,3 +35,5 @@ INSERT INTO metadata_aspect_v2 SELECT * FROM temp_metadata_aspect_v2 WHERE NOT EXISTS (SELECT * from metadata_aspect_v2); DROP TABLE temp_metadata_aspect_v2; + +DROP TABLE IF EXISTS metadata_index; diff --git a/docker/postgres/init.sql b/docker/postgres/init.sql index cf477c135422e..87c8dd3337fac 100644 --- a/docker/postgres/init.sql +++ b/docker/postgres/init.sql @@ -28,3 +28,5 @@ insert into metadata_aspect_v2 (urn, aspect, version, metadata, createdon, creat now(), 'urn:li:corpuser:__datahub_system' ); + +DROP TABLE IF EXISTS metadata_index; diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index bdf3926c17e0d..4fa73c995157a 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -140,6 +140,7 @@ module.exports = { "metadata-ingestion/docs/dev_guides/classification", "metadata-ingestion/docs/dev_guides/add_stateful_ingestion_to_source", "metadata-ingestion/docs/dev_guides/sql_profiles", + "metadata-ingestion/docs/dev_guides/profiling_ingestions", ], }, ], @@ -607,6 +608,7 @@ module.exports = { }, { "Managed DataHub Release History": [ + "docs/managed-datahub/release-notes/v_0_2_12", "docs/managed-datahub/release-notes/v_0_2_11", "docs/managed-datahub/release-notes/v_0_2_10", "docs/managed-datahub/release-notes/v_0_2_9", diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index 5d0ad5eaf8f7e..9cd4ad5c6f02d 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -7,6 +7,8 @@ This file documents any backwards-incompatible changes in DataHub and assists pe ### Breaking Changes - #8810 - Removed support for SQLAlchemy 1.3.x. Only SQLAlchemy 1.4.x is supported now. +- #8942 - Removed `urn:li:corpuser:datahub` owner for the `Measure`, `Dimension` and `Temporal` tags emitted + by Looker and LookML source connectors. - #8853 - The Airflow plugin no longer supports Airflow 2.0.x or Python 3.7. See the docs for more details. - #8853 - Introduced the Airflow plugin v2. If you're using Airflow 2.3+, the v2 plugin will be enabled by default, and so you'll need to switch your requirements to include `pip install 'acryl-datahub-airflow-plugin[plugin-v2]'`. To continue using the v1 plugin, set the `DATAHUB_AIRFLOW_PLUGIN_USE_V1_PLUGIN` environment variable to `true`. - #8943 The Unity Catalog ingestion source has a new option `include_metastore`, which will cause all urns to be changed when disabled. 
diff --git a/docs/managed-datahub/release-notes/v_0_2_11.md b/docs/managed-datahub/release-notes/v_0_2_11.md index 1f42090848712..c99d10201e097 100644 --- a/docs/managed-datahub/release-notes/v_0_2_11.md +++ b/docs/managed-datahub/release-notes/v_0_2_11.md @@ -7,7 +7,7 @@ Release Availability Date Recommended CLI/SDK --- -- `v0.11.0` with release notes at https://github.com/acryldata/datahub/releases/tag/v0.10.5.5 +- `v0.11.0` with release notes at https://github.com/acryldata/datahub/releases/tag/v0.11.0 - [Deprecation] In LDAP ingestor, the manager_pagination_enabled changed to general pagination_enabled If you are using an older CLI/SDK version then please upgrade it. This applies for all CLI/SDK usages, if you are using it through your terminal, github actions, airflow, in python SDK somewhere, Java SDK etc. This is a strong recommendation to upgrade as we keep on pushing fixes in the CLI and it helps us support you better. diff --git a/docs/managed-datahub/release-notes/v_0_2_12.md b/docs/managed-datahub/release-notes/v_0_2_12.md new file mode 100644 index 0000000000000..b13f471d9bf63 --- /dev/null +++ b/docs/managed-datahub/release-notes/v_0_2_12.md @@ -0,0 +1,30 @@ +# v0.2.12 +--- + +Release Availability Date +--- +13-Oct-2023 + +Recommended CLI/SDK +--- +- `v0.11.0.4` with release notes at https://github.com/acryldata/datahub/releases/tag/v0.11.0.4 +- [breaking] Removed support for SQLAlchemy 1.3.x. Only SQLAlchemy 1.4.x is supported now. +- [breaking] Removed `urn:li:corpuser:datahub` owner for the `Measure`, `Dimension` and `Temporal` tags emitted by Looker and LookML source connectors. +- [breaking] The Airflow plugin no longer supports Airflow 2.0.x or Python 3.7. +- [breaking] Introduced the Airflow plugin v2. If you're using Airflow 2.3+, the v2 plugin will be enabled by default, and so you'll need to switch your requirements to include `pip install 'acryl-datahub-airflow-plugin[plugin-v2]'`. To continue using the v1 plugin, set the `DATAHUB_AIRFLOW_PLUGIN_USE_V1_PLUGIN` environment variable to `true`. +- [breaking] The Unity Catalog ingestion source has a new option `include_metastore`, which will cause all urns to be changed when disabled. +This is currently enabled by default to preserve compatibility, but will be disabled by default and then removed in the future. +If stateful ingestion is enabled, simply setting `include_metastore: false` will perform all required cleanup. +Otherwise, we recommend soft deleting all databricks data via the DataHub CLI: +`datahub delete --platform databricks --soft` and then reingesting with `include_metastore: false`. + + +If you are using an older CLI/SDK version then please upgrade it. This applies for all CLI/SDK usages, if you are using it through your terminal, github actions, airflow, in python SDK somewhere, Java SDK etc. This is a strong recommendation to upgrade as we keep on pushing fixes in the CLI and it helps us support you better. + + +## Release Changelog +--- +- Since `v0.2.11` these changes from OSS DataHub https://github.com/datahub-project/datahub/compare/75252a3d9f6a576904be5a0790d644b9ae2df6ac...10a190470e8c932b6d34cba49de7dbcba687a088 have been pulled in.
+ +## Some notable features in this SaaS release +- Nested Domains available in this release diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthUtil.java b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthUtil.java index dfb936c61ee0c..e159993a8a243 100644 --- a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthUtil.java +++ b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthUtil.java @@ -11,7 +11,7 @@ public class AuthUtil { public static boolean isAuthorized( @Nonnull Authorizer authorizer, @Nonnull String actor, - @Nonnull Optional maybeResourceSpec, + @Nonnull Optional maybeResourceSpec, @Nonnull DisjunctivePrivilegeGroup privilegeGroup ) { for (ConjunctivePrivilegeGroup andPrivilegeGroup : privilegeGroup.getAuthorizedPrivilegeGroups()) { @@ -27,7 +27,7 @@ public static boolean isAuthorized( public static boolean isAuthorizedForResources( @Nonnull Authorizer authorizer, @Nonnull String actor, - @Nonnull List> resourceSpecs, + @Nonnull List> resourceSpecs, @Nonnull DisjunctivePrivilegeGroup privilegeGroup ) { for (ConjunctivePrivilegeGroup andPrivilegeGroup : privilegeGroup.getAuthorizedPrivilegeGroups()) { @@ -44,7 +44,7 @@ private static boolean isAuthorized( @Nonnull Authorizer authorizer, @Nonnull String actor, @Nonnull ConjunctivePrivilegeGroup requiredPrivileges, - @Nonnull Optional resourceSpec) { + @Nonnull Optional resourceSpec) { // Each privilege in a group _must_ all be true to permit the operation. for (final String privilege : requiredPrivileges.getRequiredPrivileges()) { // Create and evaluate an Authorization request. @@ -62,11 +62,11 @@ private static boolean isAuthorizedForResources( @Nonnull Authorizer authorizer, @Nonnull String actor, @Nonnull ConjunctivePrivilegeGroup requiredPrivileges, - @Nonnull List> resourceSpecs) { + @Nonnull List> resourceSpecs) { // Each privilege in a group _must_ all be true to permit the operation. for (final String privilege : requiredPrivileges.getRequiredPrivileges()) { // Create and evaluate an Authorization request. - for (Optional resourceSpec : resourceSpecs) { + for (Optional resourceSpec : resourceSpecs) { final AuthorizationRequest request = new AuthorizationRequest(actor, privilege, resourceSpec); final AuthorizationResult result = authorizer.authorize(request); if (AuthorizationResult.Type.DENY.equals(result.getType())) { diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthorizationRequest.java b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthorizationRequest.java index 084a455495551..9e75de3cbf44d 100644 --- a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthorizationRequest.java +++ b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthorizationRequest.java @@ -21,5 +21,5 @@ public class AuthorizationRequest { * The resource that the user is requesting for, if applicable. If the privilege is a platform privilege * this optional will be empty. 
*/ - Optional resourceSpec; + Optional resourceSpec; } diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthorizerContext.java b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthorizerContext.java index f9940d171d5d4..b79a4fa20c7ea 100644 --- a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthorizerContext.java +++ b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthorizerContext.java @@ -18,9 +18,9 @@ public class AuthorizerContext { private final Map contextMap; /** - * A utility for resolving a {@link ResourceSpec} to resolved resource field values. + * A utility for resolving an {@link EntitySpec} to resolved entity field values. */ - private ResourceSpecResolver resourceSpecResolver; + private EntitySpecResolver entitySpecResolver; /** * diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/EntityFieldType.java b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/EntityFieldType.java new file mode 100644 index 0000000000000..46763f29a7040 --- /dev/null +++ b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/EntityFieldType.java @@ -0,0 +1,31 @@ +package com.datahub.authorization; + +/** + * List of entity field types to fetch for a given entity + */ +public enum EntityFieldType { + /** + * Type of the entity (e.g. dataset, chart) + */ + TYPE, + /** + * Urn of the entity + */ + URN, + /** + * Owners of the entity + */ + OWNER, + /** + * Domains of the entity + */ + DOMAIN, + /** + * Groups of which the entity (only applies to corpUser) is a member + */ + GROUP_MEMBERSHIP, + /** + * Data platform instance of resource + */ + DATA_PLATFORM_INSTANCE +} diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/EntitySpec.java b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/EntitySpec.java new file mode 100644 index 0000000000000..656bec0f44fc2 --- /dev/null +++ b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/EntitySpec.java @@ -0,0 +1,23 @@ +package com.datahub.authorization; + +import javax.annotation.Nonnull; +import lombok.Value; + + +/** + * Details about the entities involved in the authorization process. It models the actor and the resource being acted + * upon. Resource types currently supported can be found inside of {@link com.linkedin.metadata.authorization.PoliciesConfig} + */ +@Value +public class EntitySpec { + /** + * The entity type. (dataset, chart, dashboard, corpGroup, etc). + */ + @Nonnull + String type; + /** + * The entity identity. Most often, this corresponds to the raw entity urn. (urn:li:corpGroup:groupId) + */ + @Nonnull + String entity; +} \ No newline at end of file diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/EntitySpecResolver.java b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/EntitySpecResolver.java new file mode 100644 index 0000000000000..67347fbf87a87 --- /dev/null +++ b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/EntitySpecResolver.java @@ -0,0 +1,11 @@ +package com.datahub.authorization; + +/** + * An Entity Spec Resolver is responsible for resolving a {@link EntitySpec} to a {@link ResolvedEntitySpec}. + */ +public interface EntitySpecResolver { + /** + Resolve a {@link EntitySpec} to a resolved entity spec. 
+ **/ + ResolvedEntitySpec resolve(EntitySpec entitySpec); +} diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/FieldResolver.java b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/FieldResolver.java index 9318f5f8e7b96..955a06fd54cb9 100644 --- a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/FieldResolver.java +++ b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/FieldResolver.java @@ -33,9 +33,9 @@ public static FieldResolver getResolverFromValues(Set values) { /** * Helper function that returns FieldResolver given a fetchFieldValue function */ - public static FieldResolver getResolverFromFunction(ResourceSpec resourceSpec, - Function fetchFieldValue) { - return new FieldResolver(() -> CompletableFuture.supplyAsync(() -> fetchFieldValue.apply(resourceSpec))); + public static FieldResolver getResolverFromFunction(EntitySpec entitySpec, + Function fetchFieldValue) { + return new FieldResolver(() -> CompletableFuture.supplyAsync(() -> fetchFieldValue.apply(entitySpec))); } public static FieldValue emptyFieldValue() { diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResolvedEntitySpec.java b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResolvedEntitySpec.java new file mode 100644 index 0000000000000..7948766df5715 --- /dev/null +++ b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResolvedEntitySpec.java @@ -0,0 +1,66 @@ +package com.datahub.authorization; + +import java.util.Collections; +import java.util.Map; +import java.util.Set; +import javax.annotation.Nullable; +import lombok.Getter; +import lombok.RequiredArgsConstructor; +import lombok.ToString; + + +/** + * Wrapper around authorization request with field resolvers for lazily fetching the field values for each field type + */ +@RequiredArgsConstructor +@ToString +public class ResolvedEntitySpec { + @Getter + private final EntitySpec spec; + private final Map fieldResolvers; + + public Set getFieldValues(EntityFieldType entityFieldType) { + if (!fieldResolvers.containsKey(entityFieldType)) { + return Collections.emptySet(); + } + return fieldResolvers.get(entityFieldType).getFieldValuesFuture().join().getValues(); + } + + /** + * Fetch the owners for an entity. + * @return a set of owner urns, or empty set if none exist. + */ + public Set getOwners() { + if (!fieldResolvers.containsKey(EntityFieldType.OWNER)) { + return Collections.emptySet(); + } + return fieldResolvers.get(EntityFieldType.OWNER).getFieldValuesFuture().join().getValues(); + } + + /** + * Fetch the platform instance for a Resolved Resource Spec + * @return a Platform Instance or null if one does not exist. + */ + @Nullable + public String getDataPlatformInstance() { + if (!fieldResolvers.containsKey(EntityFieldType.DATA_PLATFORM_INSTANCE)) { + return null; + } + Set dataPlatformInstance = fieldResolvers.get(EntityFieldType.DATA_PLATFORM_INSTANCE).getFieldValuesFuture().join().getValues(); + if (dataPlatformInstance.size() > 0) { + return dataPlatformInstance.stream().findFirst().get(); + } + return null; + } + + /** + * Fetch the group membership for an entity. + * @return a set of groups urns, or empty set if none exist. 
+ */ + public Set getGroupMembership() { + if (!fieldResolvers.containsKey(EntityFieldType.GROUP_MEMBERSHIP)) { + return Collections.emptySet(); + } + return fieldResolvers.get(EntityFieldType.GROUP_MEMBERSHIP).getFieldValuesFuture().join().getValues(); + } +} diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResolvedResourceSpec.java b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResolvedResourceSpec.java deleted file mode 100644 index 53dd0be44f963..0000000000000 --- a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResolvedResourceSpec.java +++ /dev/null @@ -1,38 +0,0 @@ -package com.datahub.authorization; - -import java.util.Collections; -import java.util.Map; -import java.util.Set; -import lombok.Getter; -import lombok.RequiredArgsConstructor; -import lombok.ToString; - - -/** - * Wrapper around authorization request with field resolvers for lazily fetching the field values for each field type - */ -@RequiredArgsConstructor -@ToString -public class ResolvedResourceSpec { - @Getter - private final ResourceSpec spec; - private final Map fieldResolvers; - - public Set getFieldValues(ResourceFieldType resourceFieldType) { - if (!fieldResolvers.containsKey(resourceFieldType)) { - return Collections.emptySet(); - } - return fieldResolvers.get(resourceFieldType).getFieldValuesFuture().join().getValues(); - } - - /** - * Fetch the owners for a resource. - * @return a set of owner urns, or empty set if none exist. - */ - public Set getOwners() { - if (!fieldResolvers.containsKey(ResourceFieldType.OWNER)) { - return Collections.emptySet(); - } - return fieldResolvers.get(ResourceFieldType.OWNER).getFieldValuesFuture().join().getValues(); - } -} diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResourceFieldType.java b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResourceFieldType.java deleted file mode 100644 index ee54d2bfbba1d..0000000000000 --- a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResourceFieldType.java +++ /dev/null @@ -1,23 +0,0 @@ -package com.datahub.authorization; - -/** - * List of resource field types to fetch for a given resource - */ -public enum ResourceFieldType { - /** - * Type of resource (e.g. dataset, chart) - */ - RESOURCE_TYPE, - /** - * Urn of resource - */ - RESOURCE_URN, - /** - * Owners of resource - */ - OWNER, - /** - * Domains of resource - */ - DOMAIN -} diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResourceSpec.java b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResourceSpec.java deleted file mode 100644 index c1bd53e31fe29..0000000000000 --- a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResourceSpec.java +++ /dev/null @@ -1,23 +0,0 @@ -package com.datahub.authorization; - -import javax.annotation.Nonnull; -import lombok.Value; - - -/** - * Details about a specific resource being acted upon. Resource types currently supported - * can be found inside of {@link com.linkedin.metadata.authorization.PoliciesConfig} - */ -@Value -public class ResourceSpec { - /** - * The resource type. Most often, this corresponds to the entity type. (dataset, chart, dashboard, corpGroup, etc). - */ - @Nonnull - String type; - /** - * The resource identity. Most often, this corresponds to the raw entity urn. 
(urn:li:corpGroup:groupId) - */ - @Nonnull - String resource; -} \ No newline at end of file diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResourceSpecResolver.java b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResourceSpecResolver.java deleted file mode 100644 index 05c35f377b9a9..0000000000000 --- a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResourceSpecResolver.java +++ /dev/null @@ -1,11 +0,0 @@ -package com.datahub.authorization; - -/** - * A Resource Spec Resolver is responsible for resolving a {@link ResourceSpec} to a {@link ResolvedResourceSpec}. - */ -public interface ResourceSpecResolver { - /** - Resolve a {@link ResourceSpec} to a resolved resource spec. - **/ - ResolvedResourceSpec resolve(ResourceSpec resourceSpec); -} diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/plugins/auth/authorization/Authorizer.java b/metadata-auth/auth-api/src/main/java/com/datahub/plugins/auth/authorization/Authorizer.java index ce7a3f22b3147..c731a3ec987c1 100644 --- a/metadata-auth/auth-api/src/main/java/com/datahub/plugins/auth/authorization/Authorizer.java +++ b/metadata-auth/auth-api/src/main/java/com/datahub/plugins/auth/authorization/Authorizer.java @@ -4,7 +4,7 @@ import com.datahub.authorization.AuthorizationResult; import com.datahub.authorization.AuthorizedActors; import com.datahub.authorization.AuthorizerContext; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.datahub.plugins.Plugin; import java.util.Map; import java.util.Optional; @@ -32,5 +32,5 @@ public interface Authorizer extends Plugin { * Retrieves the current list of actors authorized to for a particular privilege against * an optional resource */ - AuthorizedActors authorizedActors(final String privilege, final Optional resourceSpec); + AuthorizedActors authorizedActors(final String privilege, final Optional resourceSpec); } diff --git a/metadata-ingestion/docs/dev_guides/profiling_ingestions.md b/metadata-ingestion/docs/dev_guides/profiling_ingestions.md new file mode 100644 index 0000000000000..d876d99b494f8 --- /dev/null +++ b/metadata-ingestion/docs/dev_guides/profiling_ingestions.md @@ -0,0 +1,55 @@ +import FeatureAvailability from '@site/src/components/FeatureAvailability'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Profiling ingestions + + + +**🤝 Version compatibility** +> Open Source DataHub: **0.11.1** | Acryl: **0.2.12** + +This page documents how to perform memory profiles of ingestion runs. +It is useful when trying to size the amount of resources necessary to ingest some source or when developing new features or sources. + +## How to use +Install the `debug` plugin for DataHub's CLI wherever the ingestion runs: + +```bash +pip install 'acryl-datahub[debug]' +``` + +This will install [memray](https://github.com/bloomberg/memray) in your python environment. + +Add a flag to your ingestion recipe to generate a memray memory dump of your ingestion: +```yaml +source: + ... + +sink: + ... + +flags: + generate_memory_profiles: "" +``` + +Once the ingestion run starts a binary file will be created and appended to during the execution of the ingestion. + +These files follow the pattern `file-.bin` for a unique identification. 
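For reference, the same flag can also be set when driving an ingestion from Python rather than via a recipe file. The sketch below is illustrative only: the source, sink, and output directory are placeholder values, and it assumes the `debug` extra (which pulls in memray) is installed.

```python
# Illustrative sketch: enabling the memray profiling flag through the Python
# Pipeline API. Source/sink choices and the output directory are assumptions.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {"type": "demo-data", "config": {}},
        "sink": {"type": "console"},
        "flags": {
            # Existing directory where the memray dump (<run id>.bin) will be written.
            "generate_memory_profiles": "/tmp/datahub-memray",
        },
    }
)
pipeline.run()
pipeline.raise_from_status()
```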
+Once the ingestion has finished you can use `memray` to analyze the memory dump in a flamegraph view using: + +```$ memray flamegraph file-None-file-2023_09_18-21_38_43.bin``` + +This will generate an interactive HTML file for analysis: + +

+<p align="center">(screenshot: interactive memray flamegraph view)</p>
+ + +`memray` has an extensive set of features for memory investigation. Take a look at their [documentation](https://bloomberg.github.io/memray/overview.html) to see the full feature set. + + +## Questions + +If you've got any questions on configuring profiling, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/docs/sources/teradata/teradata_pre.md b/metadata-ingestion/docs/sources/teradata/teradata_pre.md new file mode 100644 index 0000000000000..7263a59f5ea3d --- /dev/null +++ b/metadata-ingestion/docs/sources/teradata/teradata_pre.md @@ -0,0 +1,28 @@ +### Prerequisites +1. Create a user which has access to the database you want to ingest. + ```sql + CREATE USER datahub FROM AS PASSWORD = PERM = 20000000; + ``` +2. Create a user with the following privileges: + ```sql + GRANT SELECT ON dbc.columns TO datahub; + GRANT SELECT ON dbc.databases TO datahub; + GRANT SELECT ON dbc.tables TO datahub; + GRANT SELECT ON DBC.All_RI_ChildrenV TO datahub; + GRANT SELECT ON DBC.ColumnsV TO datahub; + GRANT SELECT ON DBC.IndicesV TO datahub; + GRANT SELECT ON dbc.TableTextV TO datahub; + GRANT SELECT ON dbc.TablesV TO datahub; + GRANT SELECT ON dbc.dbqlogtbl TO datahub; -- if lineage or usage extraction is enabled + ``` + + If you want to run profiling, you need to grant select permission on all the tables you want to profile. + +3. If lineage or usage extraction is enabled, please, check if query logging is enabled and it is set to size which +will fit for your queries (the default query text size Teradata captures is max 200 chars) + An example how you can set it for all users: + ```sql + REPLACE QUERY LOGGING LIMIT SQLTEXT=2000 ON ALL; + ``` + See more here about query logging: + [https://docs.teradata.com/r/Teradata-VantageCloud-Lake/Database-Reference/Database-Administration/Tracking-Query-Behavior-with-Database-Query-Logging-Operational-DBAs]() diff --git a/metadata-ingestion/docs/sources/teradata/teradata_recipe.yml b/metadata-ingestion/docs/sources/teradata/teradata_recipe.yml new file mode 100644 index 0000000000000..cc94de20110fe --- /dev/null +++ b/metadata-ingestion/docs/sources/teradata/teradata_recipe.yml @@ -0,0 +1,16 @@ +pipeline_name: my-teradata-ingestion-pipeline +source: + type: teradata + config: + host_port: "myteradatainstance.teradata.com:1025" + username: myuser + password: mypassword + #database_pattern: + # allow: + # - "my_database" + # ignoreCase: true + include_table_lineage: true + include_usage_statistics: true + stateful_ingestion: + enabled: true +sink: diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index fe8e3be4632c4..545cafca9d4df 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -373,6 +373,10 @@ # FIXME: I don't think tableau uses sqllineage anymore so we should be able # to remove that dependency. 
"tableau": {"tableauserverclient>=0.17.0"} | sqllineage_lib | sqlglot_lib, + "teradata": sql_common + | usage_common + | sqlglot_lib + | {"teradatasqlalchemy>=17.20.0.0"}, "trino": sql_common | trino, "starburst-trino-usage": sql_common | usage_common | trino, "nifi": {"requests", "packaging", "requests-gssapi"}, @@ -431,6 +435,8 @@ deepdiff_dep = "deepdiff" test_api_requirements = {pytest_dep, deepdiff_dep, "PyYAML"} +debug_requirements = {"memray"} + base_dev_requirements = { *base_requirements, *framework_common, @@ -495,6 +501,7 @@ "s3", "snowflake", "tableau", + "teradata", "trino", "hive", "starburst-trino-usage", @@ -593,6 +600,7 @@ "tableau = datahub.ingestion.source.tableau:TableauSource", "openapi = datahub.ingestion.source.openapi:OpenApiSource", "metabase = datahub.ingestion.source.metabase:MetabaseSource", + "teradata = datahub.ingestion.source.sql.teradata:TeradataSource", "trino = datahub.ingestion.source.sql.trino:TrinoSource", "starburst-trino-usage = datahub.ingestion.source.usage.starburst_trino_usage:TrinoUsageSource", "nifi = datahub.ingestion.source.nifi:NifiSource", @@ -723,5 +731,6 @@ "dev": list(dev_requirements), "testing-utils": list(test_api_requirements), # To import `datahub.testing` "integration-tests": list(full_test_dev_requirements), + "debug": list(debug_requirements), }, ) diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/assertion.py b/metadata-ingestion/src/datahub/api/entities/datacontract/assertion.py new file mode 100644 index 0000000000000..c45d4ddc92458 --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/assertion.py @@ -0,0 +1,7 @@ +from typing import Optional + +from datahub.configuration import ConfigModel + + +class BaseAssertion(ConfigModel): + description: Optional[str] = None diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/assertion_operator.py b/metadata-ingestion/src/datahub/api/entities/datacontract/assertion_operator.py new file mode 100644 index 0000000000000..a41b0f7aafd9f --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/assertion_operator.py @@ -0,0 +1,162 @@ +from typing import Optional, Union + +from typing_extensions import Literal, Protocol + +from datahub.configuration import ConfigModel +from datahub.metadata.schema_classes import ( + AssertionStdOperatorClass, + AssertionStdParameterClass, + AssertionStdParametersClass, + AssertionStdParameterTypeClass, +) + + +class Operator(Protocol): + """Specification for an assertion operator. + + This class exists only for documentation (not used in typing checking). + """ + + operator: str + + def id(self) -> str: + ... + + def generate_parameters(self) -> AssertionStdParametersClass: + ... 
+ + +def _generate_assertion_std_parameter( + value: Union[str, int, float] +) -> AssertionStdParameterClass: + if isinstance(value, str): + return AssertionStdParameterClass( + value=value, type=AssertionStdParameterTypeClass.STRING + ) + elif isinstance(value, (int, float)): + return AssertionStdParameterClass( + value=str(value), type=AssertionStdParameterTypeClass.NUMBER + ) + else: + raise ValueError( + f"Unsupported assertion parameter {value} of type {type(value)}" + ) + + +Param = Union[str, int, float] + + +def _generate_assertion_std_parameters( + value: Optional[Param] = None, + min_value: Optional[Param] = None, + max_value: Optional[Param] = None, +) -> AssertionStdParametersClass: + return AssertionStdParametersClass( + value=_generate_assertion_std_parameter(value) if value else None, + minValue=_generate_assertion_std_parameter(min_value) if min_value else None, + maxValue=_generate_assertion_std_parameter(max_value) if max_value else None, + ) + + +class EqualToOperator(ConfigModel): + type: Literal["equal_to"] + value: Union[str, int, float] + + operator: str = AssertionStdOperatorClass.EQUAL_TO + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class BetweenOperator(ConfigModel): + type: Literal["between"] + min: Union[int, float] + max: Union[int, float] + + operator: str = AssertionStdOperatorClass.BETWEEN + + def id(self) -> str: + return f"{self.type}-{self.min}-{self.max}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters( + min_value=self.min, max_value=self.max + ) + + +class LessThanOperator(ConfigModel): + type: Literal["less_than"] + value: Union[int, float] + + operator: str = AssertionStdOperatorClass.LESS_THAN + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class GreaterThanOperator(ConfigModel): + type: Literal["greater_than"] + value: Union[int, float] + + operator: str = AssertionStdOperatorClass.GREATER_THAN + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class LessThanOrEqualToOperator(ConfigModel): + type: Literal["less_than_or_equal_to"] + value: Union[int, float] + + operator: str = AssertionStdOperatorClass.LESS_THAN_OR_EQUAL_TO + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class GreaterThanOrEqualToOperator(ConfigModel): + type: Literal["greater_than_or_equal_to"] + value: Union[int, float] + + operator: str = AssertionStdOperatorClass.GREATER_THAN_OR_EQUAL_TO + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class NotNullOperator(ConfigModel): + type: Literal["not_null"] + + operator: str = AssertionStdOperatorClass.NOT_NULL + + def id(self) -> str: + return f"{self.type}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters() + + +Operators = Union[ + EqualToOperator, + BetweenOperator, + LessThanOperator, + 
LessThanOrEqualToOperator, + GreaterThanOperator, + GreaterThanOrEqualToOperator, + NotNullOperator, +] diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/data_quality_assertion.py b/metadata-ingestion/src/datahub/api/entities/datacontract/data_quality_assertion.py index a665e95e93c43..6a3944ba36baf 100644 --- a/metadata-ingestion/src/datahub/api/entities/datacontract/data_quality_assertion.py +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/data_quality_assertion.py @@ -4,6 +4,8 @@ from typing_extensions import Literal import datahub.emitter.mce_builder as builder +from datahub.api.entities.datacontract.assertion import BaseAssertion +from datahub.api.entities.datacontract.assertion_operator import Operators from datahub.configuration.common import ConfigModel from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.metadata.schema_classes import ( @@ -14,12 +16,15 @@ AssertionStdParametersClass, AssertionStdParameterTypeClass, AssertionTypeClass, + AssertionValueChangeTypeClass, DatasetAssertionInfoClass, DatasetAssertionScopeClass, + SqlAssertionInfoClass, + SqlAssertionTypeClass, ) -class IdConfigMixin(ConfigModel): +class IdConfigMixin(BaseAssertion): id_raw: Optional[str] = pydantic.Field( default=None, alias="id", @@ -30,25 +35,32 @@ def generate_default_id(self) -> str: raise NotImplementedError -class CustomSQLAssertion(IdConfigMixin, ConfigModel): +class CustomSQLAssertion(IdConfigMixin, BaseAssertion): type: Literal["custom_sql"] - sql: str + operator: Operators = pydantic.Field(discriminator="type") - def generate_dataset_assertion_info( - self, entity_urn: str - ) -> DatasetAssertionInfoClass: - return DatasetAssertionInfoClass( - dataset=entity_urn, - scope=DatasetAssertionScopeClass.UNKNOWN, - fields=[], - operator=AssertionStdOperatorClass._NATIVE_, - aggregation=AssertionStdAggregationClass._NATIVE_, - logic=self.sql, + def generate_default_id(self) -> str: + return f"{self.type}-{self.sql}-{self.operator.id()}" + + def generate_assertion_info(self, entity_urn: str) -> AssertionInfoClass: + sql_assertion_info = SqlAssertionInfoClass( + entity=entity_urn, + statement=self.sql, + operator=self.operator.operator, + parameters=self.operator.generate_parameters(), + # TODO: Support other types of assertions + type=SqlAssertionTypeClass.METRIC, + changeType=AssertionValueChangeTypeClass.ABSOLUTE, + ) + return AssertionInfoClass( + type=AssertionTypeClass.SQL, + sqlAssertion=sql_assertion_info, + description=self.description, ) -class ColumnUniqueAssertion(IdConfigMixin, ConfigModel): +class ColumnUniqueAssertion(IdConfigMixin, BaseAssertion): type: Literal["unique"] # TODO: support multiple columns? 
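To illustrate the reworked `custom_sql` assertion shown a few hunks above: a sketch only, with a made-up dataset urn, SQL text, and operator value, and assuming the field layout the surrounding code implies (both `sql` and `operator` present on the model).

```python
# Illustrative sketch: a custom_sql data-quality assertion paired with a typed
# operator now produces a SQL-type AssertionInfo aspect. All values are assumed.
from datahub.api.entities.datacontract.data_quality_assertion import CustomSQLAssertion

assertion = CustomSQLAssertion(
    type="custom_sql",
    sql="SELECT COUNT(*) FROM purchases WHERE amount < 0",
    operator={"type": "equal_to", "value": 0},
)
info = assertion.generate_assertion_info(
    "urn:li:dataset:(urn:li:dataPlatform:snowflake,purchases,PROD)"
)
print(info.type)                    # SQL
print(info.sqlAssertion.statement)  # the SQL text above
print(info.sqlAssertion.operator)   # EQUAL_TO
```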
@@ -57,10 +69,8 @@ class ColumnUniqueAssertion(IdConfigMixin, ConfigModel): def generate_default_id(self) -> str: return f"{self.type}-{self.column}" - def generate_dataset_assertion_info( - self, entity_urn: str - ) -> DatasetAssertionInfoClass: - return DatasetAssertionInfoClass( + def generate_assertion_info(self, entity_urn: str) -> AssertionInfoClass: + dataset_assertion_info = DatasetAssertionInfoClass( dataset=entity_urn, scope=DatasetAssertionScopeClass.DATASET_COLUMN, fields=[builder.make_schema_field_urn(entity_urn, self.column)], @@ -72,6 +82,11 @@ def generate_dataset_assertion_info( ) ), ) + return AssertionInfoClass( + type=AssertionTypeClass.DATASET, + datasetAssertion=dataset_assertion_info, + description=self.description, + ) class DataQualityAssertion(ConfigModel): @@ -92,16 +107,9 @@ def id(self) -> str: def generate_mcp( self, assertion_urn: str, entity_urn: str ) -> List[MetadataChangeProposalWrapper]: - dataset_assertion_info = self.__root__.generate_dataset_assertion_info( - entity_urn - ) - return [ MetadataChangeProposalWrapper( entityUrn=assertion_urn, - aspect=AssertionInfoClass( - type=AssertionTypeClass.DATASET, - datasetAssertion=dataset_assertion_info, - ), + aspect=self.__root__.generate_assertion_info(entity_urn), ) ] diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/datacontract.py b/metadata-ingestion/src/datahub/api/entities/datacontract/datacontract.py index 2df446623a9d6..f3c6be55e5fea 100644 --- a/metadata-ingestion/src/datahub/api/entities/datacontract/datacontract.py +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/datacontract.py @@ -54,7 +54,7 @@ class DataContract(ConfigModel): freshness: Optional[FreshnessAssertion] = pydantic.Field(default=None) # TODO: Add a validator to ensure that ids are unique - data_quality: Optional[List[DataQualityAssertion]] = None + data_quality: Optional[List[DataQualityAssertion]] = pydantic.Field(default=None) _original_yaml_dict: Optional[dict] = None diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/freshness_assertion.py b/metadata-ingestion/src/datahub/api/entities/datacontract/freshness_assertion.py index ee8fa1181e614..71741d76b22fc 100644 --- a/metadata-ingestion/src/datahub/api/entities/datacontract/freshness_assertion.py +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/freshness_assertion.py @@ -6,6 +6,7 @@ import pydantic from typing_extensions import Literal +from datahub.api.entities.datacontract.assertion import BaseAssertion from datahub.configuration.common import ConfigModel from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.metadata.schema_classes import ( @@ -21,7 +22,7 @@ ) -class CronFreshnessAssertion(ConfigModel): +class CronFreshnessAssertion(BaseAssertion): type: Literal["cron"] cron: str = pydantic.Field( @@ -32,12 +33,30 @@ class CronFreshnessAssertion(ConfigModel): description="The timezone to use for the cron schedule. 
Defaults to UTC.", ) + def generate_freshness_assertion_schedule(self) -> FreshnessAssertionScheduleClass: + return FreshnessAssertionScheduleClass( + type=FreshnessAssertionScheduleTypeClass.CRON, + cron=FreshnessCronScheduleClass( + cron=self.cron, + timezone=self.timezone, + ), + ) + -class FixedIntervalFreshnessAssertion(ConfigModel): +class FixedIntervalFreshnessAssertion(BaseAssertion): type: Literal["interval"] interval: timedelta + def generate_freshness_assertion_schedule(self) -> FreshnessAssertionScheduleClass: + return FreshnessAssertionScheduleClass( + type=FreshnessAssertionScheduleTypeClass.FIXED_INTERVAL, + fixedInterval=FixedIntervalScheduleClass( + unit=CalendarIntervalClass.SECOND, + multiple=int(self.interval.total_seconds()), + ), + ) + class FreshnessAssertion(ConfigModel): __root__: Union[ @@ -51,36 +70,13 @@ def id(self): def generate_mcp( self, assertion_urn: str, entity_urn: str ) -> List[MetadataChangeProposalWrapper]: - freshness = self.__root__ - - if isinstance(freshness, CronFreshnessAssertion): - schedule = FreshnessAssertionScheduleClass( - type=FreshnessAssertionScheduleTypeClass.CRON, - cron=FreshnessCronScheduleClass( - cron=freshness.cron, - timezone=freshness.timezone, - ), - ) - elif isinstance(freshness, FixedIntervalFreshnessAssertion): - schedule = FreshnessAssertionScheduleClass( - type=FreshnessAssertionScheduleTypeClass.FIXED_INTERVAL, - fixedInterval=FixedIntervalScheduleClass( - unit=CalendarIntervalClass.SECOND, - multiple=int(freshness.interval.total_seconds()), - ), - ) - else: - raise ValueError(f"Unknown freshness type {freshness}") - - assertionInfo = AssertionInfoClass( + aspect = AssertionInfoClass( type=AssertionTypeClass.FRESHNESS, freshnessAssertion=FreshnessAssertionInfoClass( entity=entity_urn, type=FreshnessAssertionTypeClass.DATASET_CHANGE, - schedule=schedule, + schedule=self.__root__.generate_freshness_assertion_schedule(), ), + description=self.__root__.description, ) - - return [ - MetadataChangeProposalWrapper(entityUrn=assertion_urn, aspect=assertionInfo) - ] + return [MetadataChangeProposalWrapper(entityUrn=assertion_urn, aspect=aspect)] diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/schema_assertion.py b/metadata-ingestion/src/datahub/api/entities/datacontract/schema_assertion.py index b5b592e01f58f..b62f94e0592fc 100644 --- a/metadata-ingestion/src/datahub/api/entities/datacontract/schema_assertion.py +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/schema_assertion.py @@ -6,6 +6,7 @@ import pydantic from typing_extensions import Literal +from datahub.api.entities.datacontract.assertion import BaseAssertion from datahub.configuration.common import ConfigModel from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.extractor.json_schema_util import get_schema_metadata @@ -19,7 +20,7 @@ ) -class JsonSchemaContract(ConfigModel): +class JsonSchemaContract(BaseAssertion): type: Literal["json-schema"] json_schema: dict = pydantic.Field(alias="json-schema") @@ -36,7 +37,7 @@ def _init_private_attributes(self) -> None: ) -class FieldListSchemaContract(ConfigModel, arbitrary_types_allowed=True): +class FieldListSchemaContract(BaseAssertion, arbitrary_types_allowed=True): type: Literal["field-list"] fields: List[SchemaFieldClass] @@ -67,15 +68,13 @@ def id(self): def generate_mcp( self, assertion_urn: str, entity_urn: str ) -> List[MetadataChangeProposalWrapper]: - schema_metadata = self.__root__._schema_metadata - - assertionInfo = AssertionInfoClass( + aspect = 
AssertionInfoClass( type=AssertionTypeClass.DATA_SCHEMA, schemaAssertion=SchemaAssertionInfoClass( - entity=entity_urn, schema=schema_metadata + entity=entity_urn, + schema=self.__root__._schema_metadata, ), + description=self.__root__.description, ) - return [ - MetadataChangeProposalWrapper(entityUrn=assertion_urn, aspect=assertionInfo) - ] + return [MetadataChangeProposalWrapper(entityUrn=assertion_urn, aspect=aspect)] diff --git a/metadata-ingestion/src/datahub/configuration/source_common.py b/metadata-ingestion/src/datahub/configuration/source_common.py index a9f891ddb7b1e..80b6ceb576c1c 100644 --- a/metadata-ingestion/src/datahub/configuration/source_common.py +++ b/metadata-ingestion/src/datahub/configuration/source_common.py @@ -54,6 +54,13 @@ class DatasetSourceConfigMixin(PlatformInstanceConfigMixin, EnvConfigMixin): """ +class LowerCaseDatasetUrnConfigMixin(ConfigModel): + convert_urns_to_lowercase: bool = Field( + default=False, + description="Whether to convert dataset urns to lowercase.", + ) + + class DatasetLineageProviderConfigBase(EnvConfigMixin): """ Any non-Dataset source that produces lineage to Datasets should inherit this class. diff --git a/metadata-ingestion/src/datahub/ingestion/api/common.py b/metadata-ingestion/src/datahub/ingestion/api/common.py index 778bd119615e2..a6761a3c77d5e 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/common.py +++ b/metadata-ingestion/src/datahub/ingestion/api/common.py @@ -2,6 +2,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Dict, Generic, Iterable, Optional, Tuple, TypeVar +from datahub.configuration.common import ConfigurationError from datahub.emitter.mce_builder import set_dataset_urn_to_lower from datahub.ingestion.api.committable import Committable from datahub.ingestion.graph.client import DataHubGraph @@ -75,3 +76,11 @@ def register_checkpointer(self, committable: Committable) -> None: def get_committables(self) -> Iterable[Tuple[str, Committable]]: yield from self.checkpointers.items() + + def require_graph(self, operation: Optional[str] = None) -> DataHubGraph: + if not self.graph: + raise ConfigurationError( + f"{operation or 'This operation'} requires a graph, but none was provided. " + "To provide one, either use the datahub-rest sink or set the top-level datahub_api config in the recipe." 
+ ) + return self.graph diff --git a/metadata-ingestion/src/datahub/ingestion/api/incremental_lineage_helper.py b/metadata-ingestion/src/datahub/ingestion/api/incremental_lineage_helper.py new file mode 100644 index 0000000000000..9478c5cf7efa2 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/api/incremental_lineage_helper.py @@ -0,0 +1,139 @@ +import copy +from typing import Dict, Iterable, Optional + +from datahub.emitter.mce_builder import datahub_guid, set_aspect +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.graph.client import DataHubGraph +from datahub.metadata.schema_classes import ( + FineGrainedLineageClass, + MetadataChangeEventClass, + SystemMetadataClass, + UpstreamClass, + UpstreamLineageClass, +) +from datahub.specific.dataset import DatasetPatchBuilder + + +def _convert_upstream_lineage_to_patch( + urn: str, + aspect: UpstreamLineageClass, + system_metadata: Optional[SystemMetadataClass], +) -> MetadataWorkUnit: + patch_builder = DatasetPatchBuilder(urn, system_metadata) + for upstream in aspect.upstreams: + patch_builder.add_upstream_lineage(upstream) + mcp = next(iter(patch_builder.build())) + return MetadataWorkUnit(id=f"{urn}-upstreamLineage", mcp_raw=mcp) + + +def get_fine_grained_lineage_key(fine_upstream: FineGrainedLineageClass) -> str: + return datahub_guid( + { + "upstreams": sorted(fine_upstream.upstreams or []), + "downstreams": sorted(fine_upstream.downstreams or []), + "transformOperation": fine_upstream.transformOperation, + } + ) + + +def _merge_upstream_lineage( + new_aspect: UpstreamLineageClass, gms_aspect: UpstreamLineageClass +) -> UpstreamLineageClass: + merged_aspect = copy.deepcopy(gms_aspect) + + upstreams_map: Dict[str, UpstreamClass] = { + upstream.dataset: upstream for upstream in merged_aspect.upstreams + } + + upstreams_updated = False + fine_upstreams_updated = False + + for table_upstream in new_aspect.upstreams: + if table_upstream.dataset not in upstreams_map or ( + table_upstream.auditStamp.time + > upstreams_map[table_upstream.dataset].auditStamp.time + ): + upstreams_map[table_upstream.dataset] = table_upstream + upstreams_updated = True + + if upstreams_updated: + merged_aspect.upstreams = list(upstreams_map.values()) + + if new_aspect.fineGrainedLineages and merged_aspect.fineGrainedLineages: + fine_upstreams_map: Dict[str, FineGrainedLineageClass] = { + get_fine_grained_lineage_key(fine_upstream): fine_upstream + for fine_upstream in merged_aspect.fineGrainedLineages + } + for column_upstream in new_aspect.fineGrainedLineages: + column_upstream_key = get_fine_grained_lineage_key(column_upstream) + + if column_upstream_key not in fine_upstreams_map or ( + column_upstream.confidenceScore + > fine_upstreams_map[column_upstream_key].confidenceScore + ): + fine_upstreams_map[column_upstream_key] = column_upstream + fine_upstreams_updated = True + + if fine_upstreams_updated: + merged_aspect.fineGrainedLineages = list(fine_upstreams_map.values()) + else: + merged_aspect.fineGrainedLineages = ( + new_aspect.fineGrainedLineages or gms_aspect.fineGrainedLineages + ) + + return merged_aspect + + +def _lineage_wu_via_read_modify_write( + graph: Optional[DataHubGraph], + urn: str, + aspect: UpstreamLineageClass, + system_metadata: Optional[SystemMetadataClass], +) -> MetadataWorkUnit: + if graph is None: + raise ValueError( + "Failed to handle incremental lineage, DataHubGraph is missing. 
" + "Use `datahub-rest` sink OR provide `datahub-api` config in recipe. " + ) + gms_aspect = graph.get_aspect(urn, UpstreamLineageClass) + if gms_aspect: + new_aspect = _merge_upstream_lineage(aspect, gms_aspect) + else: + new_aspect = aspect + + return MetadataChangeProposalWrapper( + entityUrn=urn, aspect=new_aspect, systemMetadata=system_metadata + ).as_workunit() + + +def auto_incremental_lineage( + graph: Optional[DataHubGraph], + incremental_lineage: bool, + stream: Iterable[MetadataWorkUnit], +) -> Iterable[MetadataWorkUnit]: + if not incremental_lineage: + yield from stream + return # early exit + + for wu in stream: + lineage_aspect: Optional[UpstreamLineageClass] = wu.get_aspect_of_type( + UpstreamLineageClass + ) + urn = wu.get_urn() + + if lineage_aspect: + if isinstance(wu.metadata, MetadataChangeEventClass): + set_aspect( + wu.metadata, None, UpstreamLineageClass + ) # we'll emit upstreamLineage separately below + if len(wu.metadata.proposedSnapshot.aspects) > 0: + yield wu + + yield _lineage_wu_via_read_modify_write( + graph, urn, lineage_aspect, wu.metadata.systemMetadata + ) if lineage_aspect.fineGrainedLineages else _convert_upstream_lineage_to_patch( + urn, lineage_aspect, wu.metadata.systemMetadata + ) + else: + yield wu diff --git a/metadata-ingestion/src/datahub/ingestion/api/source.py b/metadata-ingestion/src/datahub/ingestion/api/source.py index 0bcc220cad49b..b86844b1c4c83 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/source.py +++ b/metadata-ingestion/src/datahub/ingestion/api/source.py @@ -29,6 +29,7 @@ from datahub.ingestion.api.report import Report from datahub.ingestion.api.source_helpers import ( auto_browse_path_v2, + auto_lowercase_urns, auto_materialize_referenced_tags, auto_status_aspect, auto_workunit_reporter, @@ -192,7 +193,30 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: self.ctx.pipeline_config.flags.generate_browse_path_v2_dry_run ) + auto_lowercase_dataset_urns: Optional[MetadataWorkUnitProcessor] = None + if ( + self.ctx.pipeline_config + and self.ctx.pipeline_config.source + and self.ctx.pipeline_config.source.config + and ( + ( + hasattr( + self.ctx.pipeline_config.source.config, + "convert_urns_to_lowercase", + ) + and self.ctx.pipeline_config.source.config.convert_urns_to_lowercase + ) + or ( + hasattr(self.ctx.pipeline_config.source.config, "get") + and self.ctx.pipeline_config.source.config.get( + "convert_urns_to_lowercase" + ) + ) + ) + ): + auto_lowercase_dataset_urns = auto_lowercase_urns return [ + auto_lowercase_dataset_urns, auto_status_aspect, auto_materialize_referenced_tags, browse_path_processor, diff --git a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py index 7fc15cf829678..2ce9e07bc57bc 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py +++ b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py @@ -35,7 +35,7 @@ from datahub.utilities.urns.dataset_urn import DatasetUrn from datahub.utilities.urns.tag_urn import TagUrn from datahub.utilities.urns.urn import guess_entity_type -from datahub.utilities.urns.urn_iter import list_urns +from datahub.utilities.urns.urn_iter import list_urns, lowercase_dataset_urns if TYPE_CHECKING: from datahub.ingestion.api.source import SourceReport @@ -70,7 +70,6 @@ def auto_status_aspect( for wu in stream: urn = wu.get_urn() all_urns.add(urn) - if not wu.is_primary_source: # If this is a non-primary source, we pretend like we've seen the 
status # aspect so that we don't try to emit a removal for it. @@ -173,6 +172,23 @@ def auto_materialize_referenced_tags( ).as_workunit() +def auto_lowercase_urns( + stream: Iterable[MetadataWorkUnit], +) -> Iterable[MetadataWorkUnit]: + """Lowercase all dataset urns""" + + for wu in stream: + try: + old_urn = wu.get_urn() + lowercase_dataset_urns(wu.metadata) + wu.id = wu.id.replace(old_urn, wu.get_urn()) + + yield wu + except Exception as e: + logger.warning(f"Failed to lowercase urns for {wu}: {e}", exc_info=True) + yield wu + + def auto_browse_path_v2( stream: Iterable[MetadataWorkUnit], *, diff --git a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py index 79d959965e0dd..07b55e0e25a89 100644 --- a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py +++ b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py @@ -353,77 +353,89 @@ def _time_to_print(self) -> bool: return False def run(self) -> None: - self.final_status = "unknown" - self._notify_reporters_on_ingestion_start() - callback = None - try: - callback = ( - LoggingCallback() - if not self.config.failure_log.enabled - else DeadLetterQueueCallback( - self.ctx, self.config.failure_log.log_config - ) - ) - for wu in itertools.islice( - self.source.get_workunits(), - self.preview_workunits if self.preview_mode else None, - ): - try: - if self._time_to_print(): - self.pretty_print_summary(currently_running=True) - except Exception as e: - logger.warning(f"Failed to print summary {e}") - - if not self.dry_run: - self.sink.handle_work_unit_start(wu) - try: - record_envelopes = self.extractor.get_records(wu) - for record_envelope in self.transform(record_envelopes): - if not self.dry_run: - self.sink.write_record_async(record_envelope, callback) - - except RuntimeError: - raise - except SystemExit: - raise - except Exception as e: - logger.error( - "Failed to process some records. Continuing.", exc_info=e + with contextlib.ExitStack() as stack: + if self.config.flags.generate_memory_profiles: + import memray + + stack.enter_context( + memray.Tracker( + f"{self.config.flags.generate_memory_profiles}/{self.config.run_id}.bin" ) - # TODO: Transformer errors should cause the pipeline to fail. 
- - self.extractor.close() - if not self.dry_run: - self.sink.handle_work_unit_end(wu) - self.source.close() - # no more data is coming, we need to let the transformers produce any additional records if they are holding on to state - for record_envelope in self.transform( - [ - RecordEnvelope( - record=EndOfStream(), metadata={"workunit_id": "end-of-stream"} + ) + + self.final_status = "unknown" + self._notify_reporters_on_ingestion_start() + callback = None + try: + callback = ( + LoggingCallback() + if not self.config.failure_log.enabled + else DeadLetterQueueCallback( + self.ctx, self.config.failure_log.log_config ) - ] - ): - if not self.dry_run and not isinstance( - record_envelope.record, EndOfStream + ) + for wu in itertools.islice( + self.source.get_workunits(), + self.preview_workunits if self.preview_mode else None, + ): + try: + if self._time_to_print(): + self.pretty_print_summary(currently_running=True) + except Exception as e: + logger.warning(f"Failed to print summary {e}") + + if not self.dry_run: + self.sink.handle_work_unit_start(wu) + try: + record_envelopes = self.extractor.get_records(wu) + for record_envelope in self.transform(record_envelopes): + if not self.dry_run: + self.sink.write_record_async(record_envelope, callback) + + except RuntimeError: + raise + except SystemExit: + raise + except Exception as e: + logger.error( + "Failed to process some records. Continuing.", + exc_info=e, + ) + # TODO: Transformer errors should cause the pipeline to fail. + + self.extractor.close() + if not self.dry_run: + self.sink.handle_work_unit_end(wu) + self.source.close() + # no more data is coming, we need to let the transformers produce any additional records if they are holding on to state + for record_envelope in self.transform( + [ + RecordEnvelope( + record=EndOfStream(), + metadata={"workunit_id": "end-of-stream"}, + ) + ] ): - # TODO: propagate EndOfStream and other control events to sinks, to allow them to flush etc. - self.sink.write_record_async(record_envelope, callback) - - self.sink.close() - self.process_commits() - self.final_status = "completed" - except (SystemExit, RuntimeError, KeyboardInterrupt) as e: - self.final_status = "cancelled" - logger.error("Caught error", exc_info=e) - raise - finally: - clear_global_warnings() - - if callback and hasattr(callback, "close"): - callback.close() # type: ignore - - self._notify_reporters_on_ingestion_completion() + if not self.dry_run and not isinstance( + record_envelope.record, EndOfStream + ): + # TODO: propagate EndOfStream and other control events to sinks, to allow them to flush etc. 
+ self.sink.write_record_async(record_envelope, callback) + + self.sink.close() + self.process_commits() + self.final_status = "completed" + except (SystemExit, RuntimeError, KeyboardInterrupt) as e: + self.final_status = "cancelled" + logger.error("Caught error", exc_info=e) + raise + finally: + clear_global_warnings() + + if callback and hasattr(callback, "close"): + callback.close() # type: ignore + + self._notify_reporters_on_ingestion_completion() def transform(self, records: Iterable[RecordEnvelope]) -> Iterable[RecordEnvelope]: """ diff --git a/metadata-ingestion/src/datahub/ingestion/run/pipeline_config.py b/metadata-ingestion/src/datahub/ingestion/run/pipeline_config.py index ff9a7a6f3d146..da3cee8ad9c1b 100644 --- a/metadata-ingestion/src/datahub/ingestion/run/pipeline_config.py +++ b/metadata-ingestion/src/datahub/ingestion/run/pipeline_config.py @@ -57,6 +57,13 @@ class FlagsConfig(ConfigModel): ), ) + generate_memory_profiles: Optional[str] = Field( + default=None, + description=( + "Generate memray memory dumps for ingestion process by providing a path to write the dump file in." + ), + ) + class PipelineConfig(ConfigModel): # Once support for discriminated unions gets merged into Pydantic, we can diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index fee181864a2d6..552612f877b9a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -16,7 +16,6 @@ make_dataplatform_instance_urn, make_dataset_urn, make_tag_urn, - set_dataset_urn_to_lower, ) from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import BigQueryDatasetKey, ContainerKey, ProjectIdKey @@ -218,8 +217,6 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): if self.config.enable_legacy_sharded_table_support: BigqueryTableIdentifier._BQ_SHARDED_TABLE_SUFFIX = "" - set_dataset_urn_to_lower(self.config.convert_urns_to_lowercase) - self.bigquery_data_dictionary = BigQuerySchemaApi( self.report.schema_api_perf, self.config.get_bigquery_client() ) @@ -461,7 +458,8 @@ def _init_schema_resolver(self) -> SchemaResolver: ) else: logger.warning( - "Failed to load schema info from DataHub as DataHubGraph is missing.", + "Failed to load schema info from DataHub as DataHubGraph is missing. " + "Use `datahub-rest` sink OR provide `datahub-api` config in recipe. ", ) return SchemaResolver(platform=self.platform, env=self.config.env) @@ -1056,6 +1054,7 @@ def gen_schema_fields(self, columns: List[BigqueryColumn]) -> List[SchemaField]: ): field.description = col.comment schema_fields[idx] = field + break else: tags = [] if col.is_partition_column: diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py index b0ac77201b415..88060a9cdc91d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py @@ -20,7 +20,13 @@ logger: logging.Logger = logging.getLogger(__name__) -_BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX = "((.+)[_$])?(\\d{8})$" +# Regexp for sharded tables. +# A sharded table is a table that has a suffix of the form _yyyymmdd or yyyymmdd, where yyyymmdd is a date. +# The regexp checks for valid dates in the suffix (e.g. 
20200101, 20200229, 20201231) and if the date is not valid +# then it is not a sharded table. +_BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX = ( + "((.+\\D)[_$]?)?(\\d\\d\\d\\d(?:0[1-9]|1[0-2])(?:0[1-9]|[12][0-9]|3[01]))$" +) @dataclass(frozen=True, order=True) @@ -40,7 +46,7 @@ class BigqueryTableIdentifier: _BQ_SHARDED_TABLE_SUFFIX: str = "_yyyymmdd" @staticmethod - def get_table_and_shard(table_name: str) -> Tuple[str, Optional[str]]: + def get_table_and_shard(table_name: str) -> Tuple[Optional[str], Optional[str]]: """ Args: table_name: @@ -53,16 +59,25 @@ def get_table_and_shard(table_name: str) -> Tuple[str, Optional[str]]: In case of non-sharded tables, returns (, None) In case of sharded tables, returns (, shard) """ + new_table_name = table_name match = re.match( BigqueryTableIdentifier._BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX, table_name, re.IGNORECASE, ) if match: - table_name = match.group(2) - shard = match.group(3) - return table_name, shard - return table_name, None + shard: str = match[3] + if shard: + if table_name.endswith(shard): + new_table_name = table_name[: -len(shard)] + + new_table_name = ( + new_table_name.rstrip("_") if new_table_name else new_table_name + ) + if new_table_name.endswith("."): + new_table_name = table_name + return (new_table_name, shard) if new_table_name else (None, shard) + return new_table_name, None @classmethod def from_string_name(cls, table: str) -> "BigqueryTableIdentifier": diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index 483355a85ac05..944814b6936a4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -206,11 +206,6 @@ def validate_column_lineage(cls, v: bool, values: Dict[str, Any]) -> bool: description="This flag enables the data lineage extraction from Data Lineage API exposed by Google Data Catalog. NOTE: This extractor can't build views lineage. It's recommended to enable the view's DDL parsing. 
Read the docs to have more information about: https://cloud.google.com/data-catalog/docs/concepts/about-data-lineage", ) - convert_urns_to_lowercase: bool = Field( - default=False, - description="Convert urns to lowercase.", - ) - enable_legacy_sharded_table_support: bool = Field( default=True, description="Use the legacy sharded table urn suffix added.", diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py index b3e88459917b3..8ae17600e0eea 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py @@ -1,12 +1,9 @@ -import dataclasses import logging from datetime import datetime from typing import Dict, Iterable, List, Optional, Tuple, cast from dateutil.relativedelta import relativedelta -from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance -from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config @@ -15,7 +12,7 @@ RANGE_PARTITION_NAME, BigqueryTable, ) -from datahub.ingestion.source.ge_data_profiler import GEProfilerRequest +from datahub.ingestion.source.sql.sql_generic import BaseTable from datahub.ingestion.source.sql.sql_generic_profiler import ( GenericProfiler, TableProfilerRequest, @@ -25,12 +22,6 @@ logger = logging.getLogger(__name__) -@dataclasses.dataclass -class BigqueryProfilerRequest(GEProfilerRequest): - table: BigqueryTable - profile_table_level_only: bool = False - - class BigqueryProfiler(GenericProfiler): config: BigQueryV2Config report: BigQueryV2Report @@ -183,84 +174,54 @@ def get_workunits( ) # Emit the profile work unit - profile_request = self.get_bigquery_profile_request( - project=project_id, dataset=dataset, table=table - ) + profile_request = self.get_profile_request(table, dataset, project_id) if profile_request is not None: + self.report.report_entity_profiled(profile_request.pretty_name) profile_requests.append(profile_request) if len(profile_requests) == 0: return - yield from self.generate_wu_from_profile_requests(profile_requests) - - def generate_wu_from_profile_requests( - self, profile_requests: List[BigqueryProfilerRequest] - ) -> Iterable[MetadataWorkUnit]: - table_profile_requests = cast(List[TableProfilerRequest], profile_requests) - for request, profile in self.generate_profiles( - table_profile_requests, + yield from self.generate_profile_workunits( + profile_requests, self.config.profiling.max_workers, platform=self.platform, profiler_args=self.get_profile_args(), - ): - if request is None or profile is None: - continue - - request = cast(BigqueryProfilerRequest, request) - profile.sizeInBytes = request.table.size_in_bytes - # If table is partitioned we profile only one partition (if nothing set then the last one) - # but for table level we can use the rows_count from the table metadata - # This way even though column statistics only reflects one partition data but the rows count - # shows the proper count. 
- if profile.partitionSpec and profile.partitionSpec.partition: - profile.rowCount = request.table.rows_count - - dataset_name = request.pretty_name - dataset_urn = make_dataset_urn_with_platform_instance( - self.platform, - dataset_name, - self.config.platform_instance, - self.config.env, - ) - # We don't add to the profiler state if we only do table level profiling as it always happens - if self.state_handler and not request.profile_table_level_only: - self.state_handler.add_to_state( - dataset_urn, int(datetime.now().timestamp() * 1000) - ) - - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, aspect=profile - ).as_workunit() + ) - def get_bigquery_profile_request( - self, project: str, dataset: str, table: BigqueryTable - ) -> Optional[BigqueryProfilerRequest]: - skip_profiling = False - profile_table_level_only = self.config.profiling.profile_table_level_only - dataset_name = BigqueryTableIdentifier( - project_id=project, dataset=dataset, table=table.name + def get_dataset_name(self, table_name: str, schema_name: str, db_name: str) -> str: + return BigqueryTableIdentifier( + project_id=db_name, dataset=schema_name, table=table_name ).get_table_name() - if not self.is_dataset_eligible_for_profiling( - dataset_name, table.last_altered, table.size_in_bytes, table.rows_count - ): - profile_table_level_only = True - self.report.num_tables_not_eligible_profiling[f"{project}.{dataset}"] += 1 - if not table.column_count: - skip_profiling = True + def get_batch_kwargs( + self, table: BaseTable, schema_name: str, db_name: str + ) -> dict: + return dict( + schema=db_name, # + table=f"{schema_name}.{table.name}", # . + ) - if skip_profiling: - if self.config.profiling.report_dropped_profiles: - self.report.report_dropped(f"profile of {dataset_name}") + def get_profile_request( + self, table: BaseTable, schema_name: str, db_name: str + ) -> Optional[TableProfilerRequest]: + profile_request = super().get_profile_request(table, schema_name, db_name) + + if not profile_request: return None + # Below code handles profiling changes required for partitioned or sharded tables + # 1. Skip profile if partition profiling is disabled. + # 2. 
Else update `profile_request.batch_kwargs` with partition and custom_sql + + bq_table = cast(BigqueryTable, table) (partition, custom_sql) = self.generate_partition_profiler_query( - project, dataset, table, self.config.profiling.partition_datetime + db_name, schema_name, bq_table, self.config.profiling.partition_datetime ) - if partition is None and table.partition_info: + + if partition is None and bq_table.partition_info: self.report.report_warning( "profile skipped as partitioned table is empty or partition id or type was invalid", - dataset_name, + profile_request.pretty_name, ) return None if ( @@ -268,24 +229,20 @@ def get_bigquery_profile_request( and not self.config.profiling.partition_profiling_enabled ): logger.debug( - f"{dataset_name} and partition {partition} is skipped because profiling.partition_profiling_enabled property is disabled" + f"{profile_request.pretty_name} and partition {partition} is skipped because profiling.partition_profiling_enabled property is disabled" ) self.report.profiling_skipped_partition_profiling_disabled.append( - dataset_name + profile_request.pretty_name ) return None - self.report.report_entity_profiled(dataset_name) - logger.debug(f"Preparing profiling request for {dataset_name}") - profile_request = BigqueryProfilerRequest( - pretty_name=dataset_name, - batch_kwargs=dict( - schema=project, - table=f"{dataset}.{table.name}", - custom_sql=custom_sql, - partition=partition, - ), - table=table, - profile_table_level_only=profile_table_level_only, - ) + if partition: + logger.debug("Updating profiling request for partitioned/sharded tables") + profile_request.batch_kwargs.update( + dict( + custom_sql=custom_sql, + partition=partition, + ) + ) + return profile_request diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py index a87cb8c1cbfa5..67fcc33cdf218 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py @@ -51,8 +51,8 @@ class BigqueryQuery: p.max_partition_id, p.active_billable_bytes, p.long_term_billable_bytes, - REGEXP_EXTRACT(t.table_name, r".*_(\\d+)$") as table_suffix, - REGEXP_REPLACE(t.table_name, r"_(\\d+)$", "") as table_base + REGEXP_EXTRACT(t.table_name, r"(?:(?:.+\\D)[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$") as table_suffix, + REGEXP_REPLACE(t.table_name, r"(?:[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$", "") as table_base FROM `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t @@ -92,8 +92,8 @@ class BigqueryQuery: tos.OPTION_VALUE as comment, t.is_insertable_into, t.ddl, - REGEXP_EXTRACT(t.table_name, r".*_(\\d+)$") as table_suffix, - REGEXP_REPLACE(t.table_name, r"_(\\d+)$", "") as table_base + REGEXP_EXTRACT(t.table_name, r"(?:(?:.+\\D)[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$") as table_suffix, + REGEXP_REPLACE(t.table_name, r"(?:[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$", "") as table_base FROM `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t diff --git a/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py b/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py index 7cb487a86d931..611f0c5c52cc6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py +++ b/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py @@ -129,11 +129,9 @@ def __init__(self, 
config: CSVEnricherConfig, ctx: PipelineContext): # Map from entity urn to a list of SubResourceRow. self.editable_schema_metadata_map: Dict[str, List[SubResourceRow]] = {} self.should_overwrite: bool = self.config.write_semantics == "OVERRIDE" - if not self.should_overwrite and not self.ctx.graph: - raise ConfigurationError( - "With PATCH semantics, the csv-enricher source requires a datahub_api to connect to. " - "Consider using the datahub-rest sink or provide a datahub_api: configuration on your ingestion recipe." - ) + + if not self.should_overwrite: + self.ctx.require_graph(operation="The csv-enricher's PATCH semantics flag") def get_resource_glossary_terms_work_unit( self, diff --git a/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/path_spec.py b/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/path_spec.py index d1c949f48e2cd..a35fb94614f72 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/path_spec.py +++ b/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/path_spec.py @@ -18,7 +18,14 @@ logger: logging.Logger = logging.getLogger(__name__) SUPPORTED_FILE_TYPES: List[str] = ["csv", "tsv", "json", "parquet", "avro"] -SUPPORTED_COMPRESSIONS: List[str] = ["gz", "bz2"] + +# These come from the smart_open library. +SUPPORTED_COMPRESSIONS: List[str] = [ + "gz", + "bz2", + # We have a monkeypatch on smart_open that aliases .gzip to .gz. + "gzip", +] class PathSpec(ConfigModel): diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py index af9769bc9d94c..da1ea8ecb4678 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py @@ -20,9 +20,8 @@ DBTCommonConfig, DBTNode, DBTSourceBase, - DBTTest, - DBTTestResult, ) +from datahub.ingestion.source.dbt.dbt_tests import DBTTest, DBTTestResult logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py index 0f5c08eb6ac54..48d2118a9b091 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py @@ -1,11 +1,10 @@ -import json import logging import re from abc import abstractmethod from dataclasses import dataclass, field from datetime import datetime from enum import auto -from typing import Any, Callable, ClassVar, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Tuple import pydantic from pydantic import root_validator, validator @@ -34,6 +33,12 @@ from datahub.ingestion.api.source import MetadataWorkUnitProcessor from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.common.subtypes import DatasetSubTypes +from datahub.ingestion.source.dbt.dbt_tests import ( + DBTTest, + DBTTestResult, + make_assertion_from_test, + make_assertion_result_from_test, +) from datahub.ingestion.source.sql.sql_types import ( ATHENA_SQL_TYPES_MAP, BIGQUERY_TYPES_MAP, @@ -81,20 +86,7 @@ TimeTypeClass, ) from datahub.metadata.schema_classes import ( - AssertionInfoClass, - AssertionResultClass, - AssertionResultTypeClass, - AssertionRunEventClass, - AssertionRunStatusClass, - AssertionStdAggregationClass, - AssertionStdOperatorClass, - AssertionStdParameterClass, - AssertionStdParametersClass, - 
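For context on the `.gzip` alias mentioned in the path_spec.py comment above: the s3 source later in this diff registers the existing `.gz` decompressor under the `.gzip` extension. A minimal sketch of that idea, with a hypothetical file path.

```python
import smart_open
import smart_open.compression as so_compression

# Alias ".gzip" to the existing ".gz" handler so smart_open transparently
# decompresses files like "part-0000.json.gzip" (same trick as s3/source.py below).
so_compression.register_compressor(".gzip", so_compression._COMPRESSOR_REGISTRY[".gz"])

# Hypothetical path; with the alias registered, smart_open infers gzip
# decompression from the extension instead of returning raw bytes.
with smart_open.open("s3://my-bucket/data/part-0000.json.gzip", "rb") as f:
    first_line = f.readline()
```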
AssertionStdParameterTypeClass, - AssertionTypeClass, DataPlatformInstanceClass, - DatasetAssertionInfoClass, - DatasetAssertionScopeClass, DatasetPropertiesClass, GlobalTagsClass, GlossaryTermsClass, @@ -551,134 +543,6 @@ def get_column_type( return SchemaFieldDataType(type=TypeClass()) -@dataclass -class AssertionParams: - scope: Union[DatasetAssertionScopeClass, str] - operator: Union[AssertionStdOperatorClass, str] - aggregation: Union[AssertionStdAggregationClass, str] - parameters: Optional[Callable[[Dict[str, str]], AssertionStdParametersClass]] = None - logic_fn: Optional[Callable[[Dict[str, str]], Optional[str]]] = None - - -def _get_name_for_relationship_test(kw_args: Dict[str, str]) -> Optional[str]: - """ - Try to produce a useful string for the name of a relationship constraint. - Return None if we fail to - """ - destination_ref = kw_args.get("to") - source_ref = kw_args.get("model") - column_name = kw_args.get("column_name") - dest_field_name = kw_args.get("field") - if not destination_ref or not source_ref or not column_name or not dest_field_name: - # base assertions are violated, bail early - return None - m = re.match(r"^ref\(\'(.*)\'\)$", destination_ref) - if m: - destination_table = m.group(1) - else: - destination_table = destination_ref - m = re.search(r"ref\(\'(.*)\'\)", source_ref) - if m: - source_table = m.group(1) - else: - source_table = source_ref - return f"{source_table}.{column_name} referential integrity to {destination_table}.{dest_field_name}" - - -@dataclass -class DBTTest: - qualified_test_name: str - column_name: Optional[str] - kw_args: dict - - TEST_NAME_TO_ASSERTION_MAP: ClassVar[Dict[str, AssertionParams]] = { - "not_null": AssertionParams( - scope=DatasetAssertionScopeClass.DATASET_COLUMN, - operator=AssertionStdOperatorClass.NOT_NULL, - aggregation=AssertionStdAggregationClass.IDENTITY, - ), - "unique": AssertionParams( - scope=DatasetAssertionScopeClass.DATASET_COLUMN, - operator=AssertionStdOperatorClass.EQUAL_TO, - aggregation=AssertionStdAggregationClass.UNIQUE_PROPOTION, - parameters=lambda _: AssertionStdParametersClass( - value=AssertionStdParameterClass( - value="1.0", - type=AssertionStdParameterTypeClass.NUMBER, - ) - ), - ), - "accepted_values": AssertionParams( - scope=DatasetAssertionScopeClass.DATASET_COLUMN, - operator=AssertionStdOperatorClass.IN, - aggregation=AssertionStdAggregationClass.IDENTITY, - parameters=lambda kw_args: AssertionStdParametersClass( - value=AssertionStdParameterClass( - value=json.dumps(kw_args.get("values")), - type=AssertionStdParameterTypeClass.SET, - ), - ), - ), - "relationships": AssertionParams( - scope=DatasetAssertionScopeClass.DATASET_COLUMN, - operator=AssertionStdOperatorClass._NATIVE_, - aggregation=AssertionStdAggregationClass.IDENTITY, - parameters=lambda kw_args: AssertionStdParametersClass( - value=AssertionStdParameterClass( - value=json.dumps(kw_args.get("values")), - type=AssertionStdParameterTypeClass.SET, - ), - ), - logic_fn=_get_name_for_relationship_test, - ), - "dbt_expectations.expect_column_values_to_not_be_null": AssertionParams( - scope=DatasetAssertionScopeClass.DATASET_COLUMN, - operator=AssertionStdOperatorClass.NOT_NULL, - aggregation=AssertionStdAggregationClass.IDENTITY, - ), - "dbt_expectations.expect_column_values_to_be_between": AssertionParams( - scope=DatasetAssertionScopeClass.DATASET_COLUMN, - operator=AssertionStdOperatorClass.BETWEEN, - aggregation=AssertionStdAggregationClass.IDENTITY, - parameters=lambda x: AssertionStdParametersClass( - 
minValue=AssertionStdParameterClass( - value=str(x.get("min_value", "unknown")), - type=AssertionStdParameterTypeClass.NUMBER, - ), - maxValue=AssertionStdParameterClass( - value=str(x.get("max_value", "unknown")), - type=AssertionStdParameterTypeClass.NUMBER, - ), - ), - ), - "dbt_expectations.expect_column_values_to_be_in_set": AssertionParams( - scope=DatasetAssertionScopeClass.DATASET_COLUMN, - operator=AssertionStdOperatorClass.IN, - aggregation=AssertionStdAggregationClass.IDENTITY, - parameters=lambda kw_args: AssertionStdParametersClass( - value=AssertionStdParameterClass( - value=json.dumps(kw_args.get("value_set")), - type=AssertionStdParameterTypeClass.SET, - ), - ), - ), - } - - -@dataclass -class DBTTestResult: - invocation_id: str - - status: str - execution_time: datetime - - native_results: Dict[str, str] - - -def string_map(input_map: Dict[str, Any]) -> Dict[str, str]: - return {k: str(v) for k, v in input_map.items()} - - @platform_name("dbt") @config_class(DBTCommonConfig) @support_status(SupportStatus.CERTIFIED) @@ -750,7 +614,7 @@ def create_test_entity_mcps( for upstream_urn in sorted(upstream_urns): if self.config.entities_enabled.can_emit_node_type("test"): - yield self._make_assertion_from_test( + yield make_assertion_from_test( custom_props, node, assertion_urn, @@ -759,133 +623,17 @@ def create_test_entity_mcps( if node.test_result: if self.config.entities_enabled.can_emit_test_results: - yield self._make_assertion_result_from_test( - node, assertion_urn, upstream_urn + yield make_assertion_result_from_test( + node, + assertion_urn, + upstream_urn, + test_warnings_are_errors=self.config.test_warnings_are_errors, ) else: logger.debug( f"Skipping test result {node.name} emission since it is turned off." ) - def _make_assertion_from_test( - self, - extra_custom_props: Dict[str, str], - node: DBTNode, - assertion_urn: str, - upstream_urn: str, - ) -> MetadataWorkUnit: - assert node.test_info - qualified_test_name = node.test_info.qualified_test_name - column_name = node.test_info.column_name - kw_args = node.test_info.kw_args - - if qualified_test_name in DBTTest.TEST_NAME_TO_ASSERTION_MAP: - assertion_params = DBTTest.TEST_NAME_TO_ASSERTION_MAP[qualified_test_name] - assertion_info = AssertionInfoClass( - type=AssertionTypeClass.DATASET, - customProperties=extra_custom_props, - datasetAssertion=DatasetAssertionInfoClass( - dataset=upstream_urn, - scope=assertion_params.scope, - operator=assertion_params.operator, - fields=[ - mce_builder.make_schema_field_urn(upstream_urn, column_name) - ] - if ( - assertion_params.scope - == DatasetAssertionScopeClass.DATASET_COLUMN - and column_name - ) - else [], - nativeType=node.name, - aggregation=assertion_params.aggregation, - parameters=assertion_params.parameters(kw_args) - if assertion_params.parameters - else None, - logic=assertion_params.logic_fn(kw_args) - if assertion_params.logic_fn - else None, - nativeParameters=string_map(kw_args), - ), - ) - elif column_name: - # no match with known test types, column-level test - assertion_info = AssertionInfoClass( - type=AssertionTypeClass.DATASET, - customProperties=extra_custom_props, - datasetAssertion=DatasetAssertionInfoClass( - dataset=upstream_urn, - scope=DatasetAssertionScopeClass.DATASET_COLUMN, - operator=AssertionStdOperatorClass._NATIVE_, - fields=[ - mce_builder.make_schema_field_urn(upstream_urn, column_name) - ], - nativeType=node.name, - logic=node.compiled_code or node.raw_code, - aggregation=AssertionStdAggregationClass._NATIVE_, - 
nativeParameters=string_map(kw_args), - ), - ) - else: - # no match with known test types, default to row-level test - assertion_info = AssertionInfoClass( - type=AssertionTypeClass.DATASET, - customProperties=extra_custom_props, - datasetAssertion=DatasetAssertionInfoClass( - dataset=upstream_urn, - scope=DatasetAssertionScopeClass.DATASET_ROWS, - operator=AssertionStdOperatorClass._NATIVE_, - logic=node.compiled_code or node.raw_code, - nativeType=node.name, - aggregation=AssertionStdAggregationClass._NATIVE_, - nativeParameters=string_map(kw_args), - ), - ) - - wu = MetadataChangeProposalWrapper( - entityUrn=assertion_urn, - aspect=assertion_info, - ).as_workunit() - - return wu - - def _make_assertion_result_from_test( - self, - node: DBTNode, - assertion_urn: str, - upstream_urn: str, - ) -> MetadataWorkUnit: - assert node.test_result - test_result = node.test_result - - assertionResult = AssertionRunEventClass( - timestampMillis=int(test_result.execution_time.timestamp() * 1000.0), - assertionUrn=assertion_urn, - asserteeUrn=upstream_urn, - runId=test_result.invocation_id, - result=AssertionResultClass( - type=AssertionResultTypeClass.SUCCESS - if test_result.status == "pass" - or ( - not self.config.test_warnings_are_errors - and test_result.status == "warn" - ) - else AssertionResultTypeClass.FAILURE, - nativeResults=test_result.native_results, - ), - status=AssertionRunStatusClass.COMPLETE, - ) - - event = MetadataChangeProposalWrapper( - entityUrn=assertion_urn, - aspect=assertionResult, - ) - wu = MetadataWorkUnit( - id=f"{assertion_urn}-assertionRunEvent-{upstream_urn}", - mcp=event, - ) - return wu - @abstractmethod def load_nodes(self) -> Tuple[List[DBTNode], Dict[str, Optional[str]]]: # return dbt nodes + global custom properties diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py index c08295ed1dc59..dc3a84847beb2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py @@ -26,9 +26,8 @@ DBTNode, DBTSourceBase, DBTSourceReport, - DBTTest, - DBTTestResult, ) +from datahub.ingestion.source.dbt.dbt_tests import DBTTest, DBTTestResult logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_tests.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_tests.py new file mode 100644 index 0000000000000..721769d214d9e --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_tests.py @@ -0,0 +1,261 @@ +import json +import re +from dataclasses import dataclass +from datetime import datetime +from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union + +from datahub.emitter import mce_builder +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.metadata.schema_classes import ( + AssertionInfoClass, + AssertionResultClass, + AssertionResultTypeClass, + AssertionRunEventClass, + AssertionRunStatusClass, + AssertionStdAggregationClass, + AssertionStdOperatorClass, + AssertionStdParameterClass, + AssertionStdParametersClass, + AssertionStdParameterTypeClass, + AssertionTypeClass, + DatasetAssertionInfoClass, + DatasetAssertionScopeClass, +) + +if TYPE_CHECKING: + from datahub.ingestion.source.dbt.dbt_common import DBTNode + + +@dataclass +class DBTTest: + qualified_test_name: str + column_name: Optional[str] + kw_args: dict + + +@dataclass 
+class DBTTestResult: + invocation_id: str + + status: str + execution_time: datetime + + native_results: Dict[str, str] + + +def _get_name_for_relationship_test(kw_args: Dict[str, str]) -> Optional[str]: + """ + Try to produce a useful string for the name of a relationship constraint. + Return None if we fail to + """ + destination_ref = kw_args.get("to") + source_ref = kw_args.get("model") + column_name = kw_args.get("column_name") + dest_field_name = kw_args.get("field") + if not destination_ref or not source_ref or not column_name or not dest_field_name: + # base assertions are violated, bail early + return None + m = re.match(r"^ref\(\'(.*)\'\)$", destination_ref) + if m: + destination_table = m.group(1) + else: + destination_table = destination_ref + m = re.search(r"ref\(\'(.*)\'\)", source_ref) + if m: + source_table = m.group(1) + else: + source_table = source_ref + return f"{source_table}.{column_name} referential integrity to {destination_table}.{dest_field_name}" + + +@dataclass +class AssertionParams: + scope: Union[DatasetAssertionScopeClass, str] + operator: Union[AssertionStdOperatorClass, str] + aggregation: Union[AssertionStdAggregationClass, str] + parameters: Optional[Callable[[Dict[str, str]], AssertionStdParametersClass]] = None + logic_fn: Optional[Callable[[Dict[str, str]], Optional[str]]] = None + + +_DBT_TEST_NAME_TO_ASSERTION_MAP: Dict[str, AssertionParams] = { + "not_null": AssertionParams( + scope=DatasetAssertionScopeClass.DATASET_COLUMN, + operator=AssertionStdOperatorClass.NOT_NULL, + aggregation=AssertionStdAggregationClass.IDENTITY, + ), + "unique": AssertionParams( + scope=DatasetAssertionScopeClass.DATASET_COLUMN, + operator=AssertionStdOperatorClass.EQUAL_TO, + aggregation=AssertionStdAggregationClass.UNIQUE_PROPOTION, + parameters=lambda _: AssertionStdParametersClass( + value=AssertionStdParameterClass( + value="1.0", + type=AssertionStdParameterTypeClass.NUMBER, + ) + ), + ), + "accepted_values": AssertionParams( + scope=DatasetAssertionScopeClass.DATASET_COLUMN, + operator=AssertionStdOperatorClass.IN, + aggregation=AssertionStdAggregationClass.IDENTITY, + parameters=lambda kw_args: AssertionStdParametersClass( + value=AssertionStdParameterClass( + value=json.dumps(kw_args.get("values")), + type=AssertionStdParameterTypeClass.SET, + ), + ), + ), + "relationships": AssertionParams( + scope=DatasetAssertionScopeClass.DATASET_COLUMN, + operator=AssertionStdOperatorClass._NATIVE_, + aggregation=AssertionStdAggregationClass.IDENTITY, + parameters=lambda kw_args: AssertionStdParametersClass( + value=AssertionStdParameterClass( + value=json.dumps(kw_args.get("values")), + type=AssertionStdParameterTypeClass.SET, + ), + ), + logic_fn=_get_name_for_relationship_test, + ), + "dbt_expectations.expect_column_values_to_not_be_null": AssertionParams( + scope=DatasetAssertionScopeClass.DATASET_COLUMN, + operator=AssertionStdOperatorClass.NOT_NULL, + aggregation=AssertionStdAggregationClass.IDENTITY, + ), + "dbt_expectations.expect_column_values_to_be_between": AssertionParams( + scope=DatasetAssertionScopeClass.DATASET_COLUMN, + operator=AssertionStdOperatorClass.BETWEEN, + aggregation=AssertionStdAggregationClass.IDENTITY, + parameters=lambda x: AssertionStdParametersClass( + minValue=AssertionStdParameterClass( + value=str(x.get("min_value", "unknown")), + type=AssertionStdParameterTypeClass.NUMBER, + ), + maxValue=AssertionStdParameterClass( + value=str(x.get("max_value", "unknown")), + type=AssertionStdParameterTypeClass.NUMBER, + ), + ), + ), + 
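A small usage sketch for _get_name_for_relationship_test as it now lives in dbt_tests.py; the kw_args below are hypothetical but shaped the way dbt emits them for a `relationships` test.

```python
from datahub.ingestion.source.dbt.dbt_tests import _get_name_for_relationship_test

# Hypothetical relationships test: fct_orders.customer_id references dim_customers.id.
kw_args = {
    "to": "ref('dim_customers')",
    "model": "ref('fct_orders')",
    "column_name": "customer_id",
    "field": "id",
}

# Prints: fct_orders.customer_id referential integrity to dim_customers.id
# (returns None if any of the four keys is missing, per the early bail-out above).
print(_get_name_for_relationship_test(kw_args))
```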
"dbt_expectations.expect_column_values_to_be_in_set": AssertionParams( + scope=DatasetAssertionScopeClass.DATASET_COLUMN, + operator=AssertionStdOperatorClass.IN, + aggregation=AssertionStdAggregationClass.IDENTITY, + parameters=lambda kw_args: AssertionStdParametersClass( + value=AssertionStdParameterClass( + value=json.dumps(kw_args.get("value_set")), + type=AssertionStdParameterTypeClass.SET, + ), + ), + ), +} + + +def _string_map(input_map: Dict[str, Any]) -> Dict[str, str]: + return {k: str(v) for k, v in input_map.items()} + + +def make_assertion_from_test( + extra_custom_props: Dict[str, str], + node: "DBTNode", + assertion_urn: str, + upstream_urn: str, +) -> MetadataWorkUnit: + assert node.test_info + qualified_test_name = node.test_info.qualified_test_name + column_name = node.test_info.column_name + kw_args = node.test_info.kw_args + + if qualified_test_name in _DBT_TEST_NAME_TO_ASSERTION_MAP: + assertion_params = _DBT_TEST_NAME_TO_ASSERTION_MAP[qualified_test_name] + assertion_info = AssertionInfoClass( + type=AssertionTypeClass.DATASET, + customProperties=extra_custom_props, + datasetAssertion=DatasetAssertionInfoClass( + dataset=upstream_urn, + scope=assertion_params.scope, + operator=assertion_params.operator, + fields=[mce_builder.make_schema_field_urn(upstream_urn, column_name)] + if ( + assertion_params.scope == DatasetAssertionScopeClass.DATASET_COLUMN + and column_name + ) + else [], + nativeType=node.name, + aggregation=assertion_params.aggregation, + parameters=assertion_params.parameters(kw_args) + if assertion_params.parameters + else None, + logic=assertion_params.logic_fn(kw_args) + if assertion_params.logic_fn + else None, + nativeParameters=_string_map(kw_args), + ), + ) + elif column_name: + # no match with known test types, column-level test + assertion_info = AssertionInfoClass( + type=AssertionTypeClass.DATASET, + customProperties=extra_custom_props, + datasetAssertion=DatasetAssertionInfoClass( + dataset=upstream_urn, + scope=DatasetAssertionScopeClass.DATASET_COLUMN, + operator=AssertionStdOperatorClass._NATIVE_, + fields=[mce_builder.make_schema_field_urn(upstream_urn, column_name)], + nativeType=node.name, + logic=node.compiled_code or node.raw_code, + aggregation=AssertionStdAggregationClass._NATIVE_, + nativeParameters=_string_map(kw_args), + ), + ) + else: + # no match with known test types, default to row-level test + assertion_info = AssertionInfoClass( + type=AssertionTypeClass.DATASET, + customProperties=extra_custom_props, + datasetAssertion=DatasetAssertionInfoClass( + dataset=upstream_urn, + scope=DatasetAssertionScopeClass.DATASET_ROWS, + operator=AssertionStdOperatorClass._NATIVE_, + logic=node.compiled_code or node.raw_code, + nativeType=node.name, + aggregation=AssertionStdAggregationClass._NATIVE_, + nativeParameters=_string_map(kw_args), + ), + ) + + return MetadataChangeProposalWrapper( + entityUrn=assertion_urn, + aspect=assertion_info, + ).as_workunit() + + +def make_assertion_result_from_test( + node: "DBTNode", + assertion_urn: str, + upstream_urn: str, + test_warnings_are_errors: bool, +) -> MetadataWorkUnit: + assert node.test_result + test_result = node.test_result + + assertionResult = AssertionRunEventClass( + timestampMillis=int(test_result.execution_time.timestamp() * 1000.0), + assertionUrn=assertion_urn, + asserteeUrn=upstream_urn, + runId=test_result.invocation_id, + result=AssertionResultClass( + type=AssertionResultTypeClass.SUCCESS + if test_result.status == "pass" + or (not test_warnings_are_errors and 
test_result.status == "warn") + else AssertionResultTypeClass.FAILURE, + nativeResults=test_result.native_results, + ), + status=AssertionRunStatusClass.COMPLETE, + ) + + return MetadataChangeProposalWrapper( + entityUrn=assertion_urn, + aspect=assertionResult, + ).as_workunit() diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py index 01e083d566168..9f6ac9dd21164 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py @@ -273,6 +273,7 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase): partition: Optional[str] config: GEProfilingConfig report: SQLSourceReport + custom_sql: Optional[str] query_combiner: SQLAlchemyQueryCombiner @@ -596,16 +597,8 @@ def generate_dataset_profile( # noqa: C901 (complexity) "catch_exceptions", self.config.catch_exceptions ) - profile = DatasetProfileClass(timestampMillis=get_sys_time()) - if self.partition: - profile.partitionSpec = PartitionSpecClass(partition=self.partition) - elif self.config.limit and self.config.offset: - profile.partitionSpec = PartitionSpecClass( - type=PartitionTypeClass.QUERY, - partition=json.dumps( - dict(limit=self.config.limit, offset=self.config.offset) - ), - ) + profile = self.init_profile() + profile.fieldProfiles = [] self._get_dataset_rows(profile) @@ -740,6 +733,24 @@ def generate_dataset_profile( # noqa: C901 (complexity) self.query_combiner.flush() return profile + def init_profile(self): + profile = DatasetProfileClass(timestampMillis=get_sys_time()) + if self.partition: + profile.partitionSpec = PartitionSpecClass(partition=self.partition) + elif self.config.limit: + profile.partitionSpec = PartitionSpecClass( + type=PartitionTypeClass.QUERY, + partition=json.dumps( + dict(limit=self.config.limit, offset=self.config.offset) + ), + ) + elif self.custom_sql: + profile.partitionSpec = PartitionSpecClass( + type=PartitionTypeClass.QUERY, partition="SAMPLE" + ) + + return profile + def update_dataset_batch_use_sampling(self, profile: DatasetProfileClass) -> None: if ( self.dataset.engine.dialect.name.lower() == BIGQUERY @@ -1064,6 +1075,7 @@ def _generate_single_profile( partition, self.config, self.report, + custom_sql, query_combiner, ).generate_dataset_profile() diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py index 77761c529ba0b..24a3e520d8caf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py @@ -157,12 +157,12 @@ class GEProfilingConfig(ConfigModel): ) use_sampling: bool = Field( default=True, - description="Whether to profile column level stats on sample of table. Only BigQuery supports this. " + description="Whether to profile column level stats on sample of table. Only BigQuery and Snowflake support this. " "If enabled, profiling is done on rows sampled from table. Sampling is not done for smaller tables. ", ) sample_size: int = Field( - default=1000, + default=10000, description="Number of rows to be sampled from table for column level profiling." 
"Applicable only if `use_sampling` is set to True.", ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka.py b/metadata-ingestion/src/datahub/ingestion/source/kafka.py index 566304e1999b7..d5039360da567 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka.py +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka.py @@ -18,7 +18,10 @@ from datahub.configuration.common import AllowDenyPattern from datahub.configuration.kafka import KafkaConsumerConnectionConfig -from datahub.configuration.source_common import DatasetSourceConfigMixin +from datahub.configuration.source_common import ( + DatasetSourceConfigMixin, + LowerCaseDatasetUrnConfigMixin, +) from datahub.emitter import mce_builder from datahub.emitter.mce_builder import ( make_data_platform_urn, @@ -76,7 +79,11 @@ class KafkaTopicConfigKeys(str, Enum): UNCLEAN_LEADER_ELECTION_CONFIG = "unclean.leader.election.enable" -class KafkaSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin): +class KafkaSourceConfig( + StatefulIngestionConfigBase, + DatasetSourceConfigMixin, + LowerCaseDatasetUrnConfigMixin, +): connection: KafkaConsumerConnectionConfig = KafkaConsumerConnectionConfig() topic_patterns: AllowDenyPattern = AllowDenyPattern(allow=[".*"], deny=["^_.*"]) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index 89b1e45695c57..30c38720dd96c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -81,9 +81,6 @@ EnumTypeClass, FineGrainedLineageClass, GlobalTagsClass, - OwnerClass, - OwnershipClass, - OwnershipTypeClass, SchemaMetadataClass, StatusClass, SubTypesClass, @@ -453,17 +450,9 @@ def _get_schema( @staticmethod def _get_tag_mce_for_urn(tag_urn: str) -> MetadataChangeEvent: assert tag_urn in LookerUtil.tag_definitions - ownership = OwnershipClass( - owners=[ - OwnerClass( - owner="urn:li:corpuser:datahub", - type=OwnershipTypeClass.DATAOWNER, - ) - ] - ) return MetadataChangeEvent( proposedSnapshot=TagSnapshotClass( - urn=tag_urn, aspects=[ownership, LookerUtil.tag_definitions[tag_urn]] + urn=tag_urn, aspects=[LookerUtil.tag_definitions[tag_urn]] ) ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py index 804a14b0fe1cf..2789b800940db 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py @@ -132,6 +132,10 @@ class RedshiftConfig( description="Whether `schema_pattern` is matched against fully qualified schema name `.`.", ) + extract_column_level_lineage: bool = Field( + default=True, description="Whether to extract column level lineage." 
+ ) + @root_validator(pre=True) def check_email_is_set_on_usage(cls, values): if values.get("include_usage_statistics"): diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py index bbe52b5d98ba3..c9ddfbe92ab2a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py @@ -9,10 +9,12 @@ import humanfriendly import redshift_connector -from sqllineage.runner import LineageRunner +import datahub.emitter.mce_builder as builder +import datahub.utilities.sqlglot_lineage as sqlglot_l from datahub.emitter import mce_builder from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance +from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.aws.s3_util import strip_s3_prefix from datahub.ingestion.source.redshift.common import get_db_name from datahub.ingestion.source.redshift.config import LineageMode, RedshiftConfig @@ -28,13 +30,19 @@ from datahub.ingestion.source.state.redundant_run_skip_handler import ( RedundantLineageRunSkipHandler, ) -from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage +from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( + FineGrainedLineage, + FineGrainedLineageDownstreamType, + FineGrainedLineageUpstreamType, + UpstreamLineage, +) from datahub.metadata.schema_classes import ( DatasetLineageTypeClass, UpstreamClass, UpstreamLineageClass, ) from datahub.utilities import memory_footprint +from datahub.utilities.urns import dataset_urn logger: logging.Logger = logging.getLogger(__name__) @@ -56,13 +64,14 @@ class LineageCollectorType(Enum): @dataclass(frozen=True, eq=True) class LineageDataset: platform: LineageDatasetPlatform - path: str + urn: str @dataclass() class LineageItem: dataset: LineageDataset upstreams: Set[LineageDataset] + cll: Optional[List[sqlglot_l.ColumnLineageInfo]] collector_type: LineageCollectorType dataset_lineage_type: str = field(init=False) @@ -83,10 +92,12 @@ def __init__( self, config: RedshiftConfig, report: RedshiftReport, + context: PipelineContext, redundant_run_skip_handler: Optional[RedundantLineageRunSkipHandler] = None, ): self.config = config self.report = report + self.context = context self._lineage_map: Dict[str, LineageItem] = defaultdict() self.redundant_run_skip_handler = redundant_run_skip_handler @@ -121,33 +132,37 @@ def _get_s3_path(self, path: str) -> str: return path - def _get_sources_from_query(self, db_name: str, query: str) -> List[LineageDataset]: + def _get_sources_from_query( + self, db_name: str, query: str + ) -> Tuple[List[LineageDataset], Optional[List[sqlglot_l.ColumnLineageInfo]]]: sources: List[LineageDataset] = list() - parser = LineageRunner(query) + parsed_result: Optional[ + sqlglot_l.SqlParsingResult + ] = sqlglot_l.create_lineage_sql_parsed_result( + query=query, + platform=LineageDatasetPlatform.REDSHIFT.value, + platform_instance=self.config.platform_instance, + database=db_name, + schema=str(self.config.default_schema), + graph=self.context.graph, + env=self.config.env, + ) - for table in parser.source_tables: - split = str(table).split(".") - if len(split) == 3: - db_name, source_schema, source_table = split - elif len(split) == 2: - source_schema, source_table = split - else: - raise ValueError( - f"Invalid table name {table} in query {query}. " - f"Expected format: [db_name].[schema].[table] or [schema].[table] or [table]." 
- ) + if parsed_result is None: + logger.debug(f"native query parsing failed for {query}") + return sources, None - if source_schema == "": - source_schema = str(self.config.default_schema) + logger.debug(f"parsed_result = {parsed_result}") + for table_urn in parsed_result.in_tables: source = LineageDataset( platform=LineageDatasetPlatform.REDSHIFT, - path=f"{db_name}.{source_schema}.{source_table}", + urn=table_urn, ) sources.append(source) - return sources + return sources, parsed_result.column_lineage def _build_s3_path_from_row(self, filename: str) -> str: path = filename.strip() @@ -165,9 +180,11 @@ def _get_sources( source_table: Optional[str], ddl: Optional[str], filename: Optional[str], - ) -> List[LineageDataset]: + ) -> Tuple[List[LineageDataset], Optional[List[sqlglot_l.ColumnLineageInfo]]]: sources: List[LineageDataset] = list() # Source + cll: Optional[List[sqlglot_l.ColumnLineageInfo]] = None + if ( lineage_type in { @@ -177,7 +194,7 @@ def _get_sources( and ddl is not None ): try: - sources = self._get_sources_from_query(db_name=db_name, query=ddl) + sources, cll = self._get_sources_from_query(db_name=db_name, query=ddl) except Exception as e: logger.warning( f"Error parsing query {ddl} for getting lineage. Error was {e}." @@ -192,22 +209,38 @@ def _get_sources( "Only s3 source supported with copy. The source was: {path}." ) self.report.num_lineage_dropped_not_support_copy_path += 1 - return sources + return sources, cll path = strip_s3_prefix(self._get_s3_path(path)) + urn = make_dataset_urn_with_platform_instance( + platform=platform.value, + name=path, + env=self.config.env, + platform_instance=self.config.platform_instance_map.get( + platform.value + ) + if self.config.platform_instance_map is not None + else None, + ) elif source_schema is not None and source_table is not None: platform = LineageDatasetPlatform.REDSHIFT path = f"{db_name}.{source_schema}.{source_table}" + urn = make_dataset_urn_with_platform_instance( + platform=platform.value, + platform_instance=self.config.platform_instance, + name=path, + env=self.config.env, + ) else: - return [] + return [], cll sources = [ LineageDataset( platform=platform, - path=path, + urn=urn, ) ] - return sources + return sources, cll def _populate_lineage_map( self, @@ -231,6 +264,7 @@ def _populate_lineage_map( :rtype: None """ try: + cll: Optional[List[sqlglot_l.ColumnLineageInfo]] = None raw_db_name = database alias_db_name = get_db_name(self.config) @@ -243,7 +277,7 @@ def _populate_lineage_map( if not target: continue - sources = self._get_sources( + sources, cll = self._get_sources( lineage_type, alias_db_name, source_schema=lineage_row.source_schema, @@ -251,6 +285,7 @@ def _populate_lineage_map( ddl=lineage_row.ddl, filename=lineage_row.filename, ) + target.cll = cll target.upstreams.update( self._get_upstream_lineages( @@ -262,20 +297,16 @@ def _populate_lineage_map( ) # Merging downstreams if dataset already exists and has downstreams - if target.dataset.path in self._lineage_map: - self._lineage_map[ - target.dataset.path - ].upstreams = self._lineage_map[ - target.dataset.path - ].upstreams.union( - target.upstreams - ) + if target.dataset.urn in self._lineage_map: + self._lineage_map[target.dataset.urn].upstreams = self._lineage_map[ + target.dataset.urn + ].upstreams.union(target.upstreams) else: - self._lineage_map[target.dataset.path] = target + self._lineage_map[target.dataset.urn] = target logger.debug( - f"Lineage[{target}]:{self._lineage_map[target.dataset.path]}" + 
f"Lineage[{target}]:{self._lineage_map[target.dataset.urn]}" ) except Exception as e: self.warn( @@ -308,17 +339,34 @@ def _get_target_lineage( target_platform = LineageDatasetPlatform.S3 # Following call requires 'filename' key in lineage_row target_path = self._build_s3_path_from_row(lineage_row.filename) + urn = make_dataset_urn_with_platform_instance( + platform=target_platform.value, + name=target_path, + env=self.config.env, + platform_instance=self.config.platform_instance_map.get( + target_platform.value + ) + if self.config.platform_instance_map is not None + else None, + ) except ValueError as e: self.warn(logger, "non-s3-lineage", str(e)) return None else: target_platform = LineageDatasetPlatform.REDSHIFT target_path = f"{alias_db_name}.{lineage_row.target_schema}.{lineage_row.target_table}" + urn = make_dataset_urn_with_platform_instance( + platform=target_platform.value, + platform_instance=self.config.platform_instance, + name=target_path, + env=self.config.env, + ) return LineageItem( - dataset=LineageDataset(platform=target_platform, path=target_path), + dataset=LineageDataset(platform=target_platform, urn=urn), upstreams=set(), collector_type=lineage_type, + cll=None, ) def _get_upstream_lineages( @@ -331,11 +379,22 @@ def _get_upstream_lineages( targe_source = [] for source in sources: if source.platform == LineageDatasetPlatform.REDSHIFT: - db, schema, table = source.path.split(".") + qualified_table_name = dataset_urn.DatasetUrn.create_from_string( + source.urn + ).get_entity_id()[1] + db, schema, table = qualified_table_name.split(".") if db == raw_db_name: db = alias_db_name path = f"{db}.{schema}.{table}" - source = LineageDataset(platform=source.platform, path=path) + source = LineageDataset( + platform=source.platform, + urn=make_dataset_urn_with_platform_instance( + platform=LineageDatasetPlatform.REDSHIFT.value, + platform_instance=self.config.platform_instance, + name=path, + env=self.config.env, + ), + ) # Filtering out tables which does not exist in Redshift # It was deleted in the meantime or query parser did not capture well the table name @@ -345,7 +404,7 @@ def _get_upstream_lineages( or not any(table == t.name for t in all_tables[db][schema]) ): logger.debug( - f"{source.path} missing table, dropping from lineage.", + f"{source.urn} missing table, dropping from lineage.", ) self.report.num_lineage_tables_dropped += 1 continue @@ -433,36 +492,73 @@ def populate_lineage( memory_footprint.total_size(self._lineage_map) ) + def make_fine_grained_lineage_class( + self, lineage_item: LineageItem, dataset_urn: str + ) -> List[FineGrainedLineage]: + fine_grained_lineages: List[FineGrainedLineage] = [] + + if ( + self.config.extract_column_level_lineage is False + or lineage_item.cll is None + ): + logger.debug("CLL extraction is disabled") + return fine_grained_lineages + + logger.debug("Extracting column level lineage") + + cll: List[sqlglot_l.ColumnLineageInfo] = lineage_item.cll + + for cll_info in cll: + downstream = ( + [builder.make_schema_field_urn(dataset_urn, cll_info.downstream.column)] + if cll_info.downstream is not None + and cll_info.downstream.column is not None + else [] + ) + + upstreams = [ + builder.make_schema_field_urn(column_ref.table, column_ref.column) + for column_ref in cll_info.upstreams + ] + + fine_grained_lineages.append( + FineGrainedLineage( + downstreamType=FineGrainedLineageDownstreamType.FIELD, + downstreams=downstream, + upstreamType=FineGrainedLineageUpstreamType.FIELD_SET, + upstreams=upstreams, + ) + ) + + 
logger.debug(f"Created fine_grained_lineage for {dataset_urn}") + + return fine_grained_lineages + def get_lineage( self, table: Union[RedshiftTable, RedshiftView], dataset_urn: str, schema: RedshiftSchema, ) -> Optional[Tuple[UpstreamLineageClass, Dict[str, str]]]: - dataset_key = mce_builder.dataset_urn_to_key(dataset_urn) - if dataset_key is None: - return None upstream_lineage: List[UpstreamClass] = [] - if dataset_key.name in self._lineage_map: - item = self._lineage_map[dataset_key.name] + cll_lineage: List[FineGrainedLineage] = [] + + if dataset_urn in self._lineage_map: + item = self._lineage_map[dataset_urn] for upstream in item.upstreams: upstream_table = UpstreamClass( - dataset=make_dataset_urn_with_platform_instance( - upstream.platform.value, - upstream.path, - platform_instance=self.config.platform_instance_map.get( - upstream.platform.value - ) - if self.config.platform_instance_map - else None, - env=self.config.env, - ), + dataset=upstream.urn, type=item.dataset_lineage_type, ) upstream_lineage.append(upstream_table) + cll_lineage = self.make_fine_grained_lineage_class( + lineage_item=item, + dataset_urn=dataset_urn, + ) + tablename = table.name if table.type == "EXTERNAL_TABLE": # external_db_params = schema.option @@ -489,7 +585,12 @@ def get_lineage( else: return None - return UpstreamLineage(upstreams=upstream_lineage), {} + return ( + UpstreamLineage( + upstreams=upstream_lineage, fineGrainedLineages=cll_lineage or None + ), + {}, + ) def report_status(self, step: str, status: bool) -> None: if self.redundant_run_skip_handler: diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/profile.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/profile.py index e983734082b1d..771636e8498a3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/profile.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/profile.py @@ -1,33 +1,19 @@ -import dataclasses import logging -from datetime import datetime -from typing import Dict, Iterable, List, Optional, Union, cast +from typing import Dict, Iterable, List, Optional, Union -from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance -from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.ge_data_profiler import GEProfilerRequest from datahub.ingestion.source.redshift.config import RedshiftConfig from datahub.ingestion.source.redshift.redshift_schema import ( RedshiftTable, RedshiftView, ) from datahub.ingestion.source.redshift.report import RedshiftReport -from datahub.ingestion.source.sql.sql_generic_profiler import ( - GenericProfiler, - TableProfilerRequest, -) +from datahub.ingestion.source.sql.sql_generic_profiler import GenericProfiler from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler logger = logging.getLogger(__name__) -@dataclasses.dataclass -class RedshiftProfilerRequest(GEProfilerRequest): - table: Union[RedshiftTable, RedshiftView] - profile_table_level_only: bool = False - - class RedshiftProfiler(GenericProfiler): config: RedshiftConfig report: RedshiftReport @@ -63,80 +49,21 @@ def get_workunits( continue for table in tables[db].get(schema, {}): # Emit the profile work unit - profile_request = self.get_redshift_profile_request( - table, schema, db - ) + profile_request = self.get_profile_request(table, schema, db) if profile_request is not None: + self.report.report_entity_profiled(profile_request.pretty_name) 
profile_requests.append(profile_request) if len(profile_requests) == 0: continue - table_profile_requests = cast(List[TableProfilerRequest], profile_requests) - for request, profile in self.generate_profiles( - table_profile_requests, + + yield from self.generate_profile_workunits( + profile_requests, self.config.profiling.max_workers, db, platform=self.platform, profiler_args=self.get_profile_args(), - ): - if profile is None: - continue - request = cast(RedshiftProfilerRequest, request) - - profile.sizeInBytes = request.table.size_in_bytes - dataset_name = request.pretty_name - dataset_urn = make_dataset_urn_with_platform_instance( - self.platform, - dataset_name, - self.config.platform_instance, - self.config.env, - ) - - # We don't add to the profiler state if we only do table level profiling as it always happens - if self.state_handler and not request.profile_table_level_only: - self.state_handler.add_to_state( - dataset_urn, int(datetime.now().timestamp() * 1000) - ) - - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, aspect=profile - ).as_workunit() - - def get_redshift_profile_request( - self, - table: Union[RedshiftTable, RedshiftView], - schema_name: str, - db_name: str, - ) -> Optional[RedshiftProfilerRequest]: - skip_profiling = False - profile_table_level_only = self.config.profiling.profile_table_level_only - dataset_name = f"{db_name}.{schema_name}.{table.name}".lower() - if not self.is_dataset_eligible_for_profiling( - dataset_name, table.last_altered, table.size_in_bytes, table.rows_count - ): - # Profile only table level if dataset is filtered from profiling - # due to size limits alone - if self.is_dataset_eligible_for_profiling( - dataset_name, table.last_altered, 0, 0 - ): - profile_table_level_only = True - else: - skip_profiling = True - - if len(table.columns) == 0: - skip_profiling = True - - if skip_profiling: - if self.config.profiling.report_dropped_profiles: - self.report.report_dropped(f"profile of {dataset_name}") - return None + ) - self.report.report_entity_profiled(dataset_name) - logger.debug(f"Preparing profiling request for {dataset_name}") - profile_request = RedshiftProfilerRequest( - pretty_name=dataset_name, - batch_kwargs=dict(schema=schema_name, table=table.name), - table=table, - profile_table_level_only=profile_table_level_only, - ) - return profile_request + def get_dataset_name(self, table_name: str, schema_name: str, db_name: str) -> str: + return f"{db_name}.{schema_name}.{table_name}".lower() diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py index e8a8ff976afa6..a1b6333a3775d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py @@ -881,6 +881,7 @@ def extract_lineage( self.lineage_extractor = RedshiftLineageExtractor( config=self.config, report=self.report, + context=self.ctx, redundant_run_skip_handler=self.redundant_lineage_run_skip_handler, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py index ac4433b7eb1f0..eb49fcbb268c0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py @@ -10,6 +10,7 @@ from pathlib import PurePath from typing import Any, Dict, Iterable, List, Optional, Tuple +import smart_open.compression as so_compression from 
more_itertools import peekable from pyspark.conf import SparkConf from pyspark.sql import SparkSession @@ -120,6 +121,9 @@ } PAGE_SIZE = 1000 +# Hack to support the .gzip extension with smart_open. +so_compression.register_compressor(".gzip", so_compression._COMPRESSOR_REGISTRY[".gz"]) + def get_column_type( report: SourceReport, dataset_name: str, column_type: str @@ -407,7 +411,9 @@ def get_fields(self, table_data: TableData, path_spec: PathSpec) -> List: table_data.full_path, "rb", transport_params={"client": s3_client} ) else: - file = open(table_data.full_path, "rb") + # We still use smart_open here to take advantage of the compression + # capabilities of smart_open. + file = smart_open(table_data.full_path, "rb") fields = [] diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py index 5f5e8e4bcdea3..8e18d85d6f3ca 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py @@ -1,20 +1,12 @@ -import dataclasses import logging -from datetime import datetime -from typing import Callable, Dict, Iterable, List, Optional, cast +from typing import Callable, Dict, Iterable, List, Optional from snowflake.sqlalchemy import snowdialect from sqlalchemy import create_engine, inspect from sqlalchemy.sql import sqltypes -from datahub.configuration.pattern_utils import is_schema_allowed -from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance -from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.ge_data_profiler import ( - DatahubGEProfiler, - GEProfilerRequest, -) +from datahub.ingestion.source.ge_data_profiler import DatahubGEProfiler from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report @@ -23,10 +15,8 @@ SnowflakeTable, ) from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin -from datahub.ingestion.source.sql.sql_generic_profiler import ( - GenericProfiler, - TableProfilerRequest, -) +from datahub.ingestion.source.sql.sql_generic import BaseTable +from datahub.ingestion.source.sql.sql_generic_profiler import GenericProfiler from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler snowdialect.ischema_names["GEOGRAPHY"] = sqltypes.NullType @@ -35,12 +25,6 @@ logger = logging.getLogger(__name__) -@dataclasses.dataclass -class SnowflakeProfilerRequest(GEProfilerRequest): - table: SnowflakeTable - profile_table_level_only: bool = False - - class SnowflakeProfiler(GenericProfiler, SnowflakeCommonMixin): def __init__( self, @@ -65,101 +49,52 @@ def get_workunits( profile_requests = [] for schema in database.schemas: - if not is_schema_allowed( - self.config.schema_pattern, - schema.name, - database.name, - self.config.match_fully_qualified_names, - ): - continue - for table in db_tables[schema.name]: - profile_request = self.get_snowflake_profile_request( + profile_request = self.get_profile_request( table, schema.name, database.name ) if profile_request is not None: + self.report.report_entity_profiled(profile_request.pretty_name) profile_requests.append(profile_request) if len(profile_requests) == 0: 
return - table_profile_requests = cast(List[TableProfilerRequest], profile_requests) - - for request, profile in self.generate_profiles( - table_profile_requests, + yield from self.generate_profile_workunits( + profile_requests, self.config.profiling.max_workers, database.name, platform=self.platform, profiler_args=self.get_profile_args(), - ): - if profile is None: - continue - profile.sizeInBytes = cast( - SnowflakeProfilerRequest, request - ).table.size_in_bytes - dataset_name = request.pretty_name - dataset_urn = make_dataset_urn_with_platform_instance( - self.platform, - dataset_name, - self.config.platform_instance, - self.config.env, - ) - - # We don't add to the profiler state if we only do table level profiling as it always happens - if self.state_handler: - self.state_handler.add_to_state( - dataset_urn, int(datetime.now().timestamp() * 1000) - ) - - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, aspect=profile - ).as_workunit() + ) - def get_snowflake_profile_request( - self, - table: SnowflakeTable, - schema_name: str, - db_name: str, - ) -> Optional[SnowflakeProfilerRequest]: - skip_profiling = False - profile_table_level_only = self.config.profiling.profile_table_level_only - dataset_name = self.get_dataset_identifier(table.name, schema_name, db_name) - if not self.is_dataset_eligible_for_profiling( - dataset_name, table.last_altered, table.size_in_bytes, table.rows_count + def get_dataset_name(self, table_name: str, schema_name: str, db_name: str) -> str: + return self.get_dataset_identifier(table_name, schema_name, db_name) + + def get_batch_kwargs( + self, table: BaseTable, schema_name: str, db_name: str + ) -> dict: + custom_sql = None + if ( + not self.config.profiling.limit + and self.config.profiling.use_sampling + and table.rows_count + and table.rows_count > self.config.profiling.sample_size ): - # Profile only table level if dataset is filtered from profiling - # due to size limits alone - if self.is_dataset_eligible_for_profiling( - dataset_name, table.last_altered, 0, 0 - ): - profile_table_level_only = True - else: - skip_profiling = True - - if len(table.columns) == 0: - skip_profiling = True - - if skip_profiling: - if self.config.profiling.report_dropped_profiles: - self.report.report_dropped(f"profile of {dataset_name}") - return None - - self.report.report_entity_profiled(dataset_name) - logger.debug(f"Preparing profiling request for {dataset_name}") - profile_request = SnowflakeProfilerRequest( - pretty_name=dataset_name, - batch_kwargs=dict( - schema=schema_name, - table=table.name, - # Lowercase/Mixedcase table names in Snowflake do not work by default. - # We need to pass `use_quoted_name=True` for such tables as mentioned here - - # https://github.com/great-expectations/great_expectations/pull/2023 - use_quoted_name=(table.name != table.name.upper()), - ), - table=table, - profile_table_level_only=profile_table_level_only, - ) - return profile_request + # GX creates a temporary table from query if query is passed as batch kwargs. 
+ # We are using fraction-based sampling here, instead of fixed-size sampling because + # Fixed-size sampling can be slower than equivalent fraction-based sampling + # as per https://docs.snowflake.com/en/sql-reference/constructs/sample#performance-considerations + sample_pc = 100 * self.config.profiling.sample_size / table.rows_count + custom_sql = f'select * from "{db_name}"."{schema_name}"."{table.name}" TABLESAMPLE ({sample_pc:.8f})' + return { + **super().get_batch_kwargs(table, schema_name, db_name), + # Lowercase/Mixedcase table names in Snowflake do not work by default. + # We need to pass `use_quoted_name=True` for such tables as mentioned here - + # https://github.com/great-expectations/great_expectations/pull/2023 + "use_quoted_name": (table.name != table.name.upper()), + "custom_sql": custom_sql, + } def get_profiler_instance( self, db_name: Optional[str] = None diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 215116b4c33fb..a5c07d9a3870c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -4,6 +4,7 @@ import os.path import platform from dataclasses import dataclass +from functools import partial from typing import Callable, Dict, Iterable, List, Optional, Union import pandas as pd @@ -26,6 +27,7 @@ platform_name, support_status, ) +from datahub.ingestion.api.incremental_lineage_helper import auto_incremental_lineage from datahub.ingestion.api.source import ( CapabilityReport, MetadataWorkUnitProcessor, @@ -511,6 +513,11 @@ def _init_schema_resolver(self) -> SchemaResolver: def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), + partial( + auto_incremental_lineage, + self.ctx.graph, + self.config.incremental_lineage, + ), StaleEntityRemovalHandler.create( self, self.config, self.ctx ).workunit_processor, diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py b/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py index ba8655b83446d..a6a9d8e2c8597 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py @@ -217,14 +217,15 @@ def _get_view_lineage_elements( key = (lineage.dependent_view, lineage.dependent_schema) # Append the source table to the list. lineage_elements[key].append( - mce_builder.make_dataset_urn( - self.platform, - self.get_identifier( + mce_builder.make_dataset_urn_with_platform_instance( + platform=self.platform, + name=self.get_identifier( schema=lineage.source_schema, entity=lineage.source_table, inspector=inspector, ), - self.config.env, + platform_instance=self.config.platform_instance, + env=self.config.env, ) ) @@ -244,12 +245,13 @@ def _get_view_lineage_workunits( dependent_view, dependent_schema = key # Construct a lineage object. 
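A worked example of the fraction-based sampling added to the Snowflake profiler above; the row count and fully qualified table name are invented, and the arithmetic mirrors the sample_pc computation in get_batch_kwargs.

```python
# Illustrative numbers only.
sample_size = 10_000    # new default from ge_profiling_config.py
rows_count = 4_000_000  # hypothetical table size

sample_pc = 100 * sample_size / rows_count  # -> 0.25 (percent of rows to sample)
custom_sql = (
    f'select * from "ANALYTICS_DB"."PUBLIC"."ORDERS" '
    f"TABLESAMPLE ({sample_pc:.8f})"
)
# -> select * from "ANALYTICS_DB"."PUBLIC"."ORDERS" TABLESAMPLE (0.25000000)
print(custom_sql)
```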
- urn = mce_builder.make_dataset_urn( - self.platform, - self.get_identifier( + urn = mce_builder.make_dataset_urn_with_platform_instance( + platform=self.platform, + name=self.get_identifier( schema=dependent_schema, entity=dependent_view, inspector=inspector ), - self.config.env, + platform_instance=self.config.platform_instance, + env=self.config.env, ) # use the mce_builder to ensure that the change proposal inherits diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py index 677d32c8bac08..08cc74aec3977 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py @@ -7,7 +7,10 @@ from pydantic import Field from datahub.configuration.common import AllowDenyPattern, ConfigModel -from datahub.configuration.source_common import DatasetSourceConfigMixin +from datahub.configuration.source_common import ( + DatasetSourceConfigMixin, + LowerCaseDatasetUrnConfigMixin, +) from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig from datahub.ingestion.source.state.stale_entity_removal_handler import ( @@ -21,7 +24,11 @@ logger: logging.Logger = logging.getLogger(__name__) -class SQLCommonConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin): +class SQLCommonConfig( + StatefulIngestionConfigBase, + DatasetSourceConfigMixin, + LowerCaseDatasetUrnConfigMixin, +): options: dict = pydantic.Field( default_factory=dict, description="Any options specified here will be passed to [SQLAlchemy.create_engine](https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine) as kwargs.", diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py index 344c114d464a9..aaeee5717a867 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py @@ -1,12 +1,15 @@ import logging +from abc import abstractmethod from dataclasses import dataclass, field from datetime import datetime, timedelta, timezone -from typing import Dict, Iterable, List, Optional, Tuple, Union, cast +from typing import Dict, Iterable, List, Optional, Union, cast from sqlalchemy import create_engine, inspect from sqlalchemy.engine.reflection import Inspector from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.ge_data_profiler import ( DatahubGEProfiler, GEProfilerRequest, @@ -16,7 +19,7 @@ from datahub.ingestion.source.sql.sql_generic import BaseTable, BaseView from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProfile -from datahub.metadata.schema_classes import DatasetProfileClass +from datahub.metadata.com.linkedin.pegasus2avro.timeseries import PartitionType from datahub.utilities.stats_collections import TopKDict, int_top_k_dict @@ -63,14 +66,14 @@ def __init__( self.platform = platform self.state_handler = state_handler - def generate_profiles( + def generate_profile_workunits( self, requests: List[TableProfilerRequest], max_workers: int, 
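To show the effect of the switch to make_dataset_urn_with_platform_instance in the Postgres view-lineage code above, a quick sketch with hypothetical values; when platform_instance is set it is prefixed to the dataset name inside the urn.

```python
from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance

urn = make_dataset_urn_with_platform_instance(
    platform="postgres",
    name="orders_db.public.daily_orders",  # hypothetical schema-qualified view
    platform_instance="prod_pg",            # hypothetical instance from the recipe
    env="PROD",
)
# -> urn:li:dataset:(urn:li:dataPlatform:postgres,prod_pg.orders_db.public.daily_orders,PROD)
print(urn)
```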
db_name: Optional[str] = None, platform: Optional[str] = None, profiler_args: Optional[Dict] = None, - ) -> Iterable[Tuple[GEProfilerRequest, Optional[DatasetProfileClass]]]: + ) -> Iterable[MetadataWorkUnit]: ge_profile_requests: List[GEProfilerRequest] = [ cast(GEProfilerRequest, request) for request in requests @@ -80,21 +83,109 @@ def generate_profiles( request for request in requests if request.profile_table_level_only ] for request in table_level_profile_requests: - profile = DatasetProfile( + table_level_profile = DatasetProfile( timestampMillis=int(datetime.now().timestamp() * 1000), columnCount=request.table.column_count, rowCount=request.table.rows_count, sizeInBytes=request.table.size_in_bytes, ) - yield (request, profile) + dataset_urn = self.dataset_urn_builder(request.pretty_name) + yield MetadataChangeProposalWrapper( + entityUrn=dataset_urn, aspect=table_level_profile + ).as_workunit() if not ge_profile_requests: return # Otherwise, if column level profiling is enabled, use GE profiler. ge_profiler = self.get_profiler_instance(db_name) - yield from ge_profiler.generate_profiles( + + for ge_profiler_request, profile in ge_profiler.generate_profiles( ge_profile_requests, max_workers, platform, profiler_args + ): + if profile is None: + continue + + request = cast(TableProfilerRequest, ge_profiler_request) + profile.sizeInBytes = request.table.size_in_bytes + + # If table is partitioned we profile only one partition (if nothing set then the last one) + # but for table level we can use the rows_count from the table metadata + # This way even though column statistics only reflects one partition data but the rows count + # shows the proper count. + if ( + profile.partitionSpec + and profile.partitionSpec.type != PartitionType.FULL_TABLE + ): + profile.rowCount = request.table.rows_count + + dataset_urn = self.dataset_urn_builder(request.pretty_name) + + # We don't add to the profiler state if we only do table level profiling as it always happens + if self.state_handler: + self.state_handler.add_to_state( + dataset_urn, int(datetime.now().timestamp() * 1000) + ) + yield MetadataChangeProposalWrapper( + entityUrn=dataset_urn, aspect=profile + ).as_workunit() + + def dataset_urn_builder(self, dataset_name: str) -> str: + return make_dataset_urn_with_platform_instance( + self.platform, + dataset_name, + self.config.platform_instance, + self.config.env, + ) + + @abstractmethod + def get_dataset_name(self, table_name: str, schema_name: str, db_name: str) -> str: + pass + + def get_profile_request( + self, table: BaseTable, schema_name: str, db_name: str + ) -> Optional[TableProfilerRequest]: + skip_profiling = False + profile_table_level_only = self.config.profiling.profile_table_level_only + dataset_name = self.get_dataset_name(table.name, schema_name, db_name) + if not self.is_dataset_eligible_for_profiling( + dataset_name, table.last_altered, table.size_in_bytes, table.rows_count + ): + # Profile only table level if dataset is filtered from profiling + # due to size limits alone + if self.is_dataset_eligible_for_profiling( + dataset_name, table.last_altered, 0, 0 + ): + profile_table_level_only = True + else: + skip_profiling = True + self.report.num_tables_not_eligible_profiling[ + f"{db_name}.{schema_name}" + ] += 1 + + if table.column_count == 0: + skip_profiling = True + + if skip_profiling: + if self.config.profiling.report_dropped_profiles: + self.report.report_dropped(f"profile of {dataset_name}") + return None + + logger.debug(f"Preparing profiling request for 
{dataset_name}") + profile_request = TableProfilerRequest( + pretty_name=dataset_name, + batch_kwargs=self.get_batch_kwargs(table, schema_name, db_name), + table=table, + profile_table_level_only=profile_table_level_only, + ) + return profile_request + + def get_batch_kwargs( + self, table: BaseTable, schema_name: str, db_name: str + ) -> dict: + return dict( + schema=schema_name, + table=table.name, ) def get_inspectors(self) -> Iterable[Inspector]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py b/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py new file mode 100644 index 0000000000000..6080cf7b371e3 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py @@ -0,0 +1,280 @@ +import logging +from dataclasses import dataclass +from datetime import datetime +from typing import Iterable, Optional, Set, Union + +# This import verifies that the dependencies are available. +import teradatasqlalchemy # noqa: F401 +import teradatasqlalchemy.types as custom_types +from pydantic.fields import Field +from sqlalchemy import create_engine +from sqlalchemy.engine import Engine + +from datahub.configuration.common import AllowDenyPattern +from datahub.configuration.time_window_config import BaseTimeWindowConfig +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.emitter.sql_parsing_builder import SqlParsingBuilder +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.graph.client import DataHubGraph +from datahub.ingestion.source.sql.sql_common import SqlWorkUnit, register_custom_type +from datahub.ingestion.source.sql.sql_generic_profiler import ProfilingSqlReport +from datahub.ingestion.source.sql.two_tier_sql_source import ( + TwoTierSQLAlchemyConfig, + TwoTierSQLAlchemySource, +) +from datahub.ingestion.source.usage.usage_common import BaseUsageConfig +from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport +from datahub.ingestion.source_report.time_window import BaseTimeWindowReport +from datahub.metadata._schema_classes import ( + MetadataChangeEventClass, + SchemaMetadataClass, + ViewPropertiesClass, +) +from datahub.metadata.com.linkedin.pegasus2avro.schema import ( + BytesTypeClass, + TimeTypeClass, +) +from datahub.utilities.file_backed_collections import FileBackedDict +from datahub.utilities.sqlglot_lineage import SchemaResolver, sqlglot_lineage +from datahub.utilities.urns.dataset_urn import DatasetUrn + +logger: logging.Logger = logging.getLogger(__name__) + +register_custom_type(custom_types.JSON, BytesTypeClass) +register_custom_type(custom_types.INTERVAL_DAY, TimeTypeClass) +register_custom_type(custom_types.INTERVAL_DAY_TO_SECOND, TimeTypeClass) +register_custom_type(custom_types.INTERVAL_DAY_TO_MINUTE, TimeTypeClass) +register_custom_type(custom_types.INTERVAL_DAY_TO_HOUR, TimeTypeClass) +register_custom_type(custom_types.INTERVAL_SECOND, TimeTypeClass) +register_custom_type(custom_types.INTERVAL_MINUTE, TimeTypeClass) +register_custom_type(custom_types.INTERVAL_MINUTE_TO_SECOND, TimeTypeClass) +register_custom_type(custom_types.INTERVAL_HOUR, TimeTypeClass) +register_custom_type(custom_types.INTERVAL_HOUR_TO_MINUTE, TimeTypeClass) +register_custom_type(custom_types.INTERVAL_HOUR_TO_SECOND, TimeTypeClass) 
+register_custom_type(custom_types.INTERVAL_MONTH, TimeTypeClass) +register_custom_type(custom_types.INTERVAL_YEAR, TimeTypeClass) +register_custom_type(custom_types.INTERVAL_YEAR_TO_MONTH, TimeTypeClass) +register_custom_type(custom_types.MBB, BytesTypeClass) +register_custom_type(custom_types.MBR, BytesTypeClass) +register_custom_type(custom_types.GEOMETRY, BytesTypeClass) +register_custom_type(custom_types.TDUDT, BytesTypeClass) +register_custom_type(custom_types.XML, BytesTypeClass) + + +@dataclass +class TeradataReport(ProfilingSqlReport, IngestionStageReport, BaseTimeWindowReport): + num_queries_parsed: int = 0 + num_view_ddl_parsed: int = 0 + num_table_parse_failures: int = 0 + + +class BaseTeradataConfig(TwoTierSQLAlchemyConfig): + scheme = Field(default="teradatasql", description="database scheme") + + +class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig): + database_pattern = Field( + default=AllowDenyPattern(deny=["dbc"]), + description="Regex patterns for databases to filter in ingestion.", + ) + include_table_lineage = Field( + default=False, + description="Whether to include table lineage in the ingestion. " + "This requires to have the table lineage feature enabled.", + ) + + include_view_lineage = Field( + default=True, + description="Whether to include view lineage in the ingestion. " + "This requires to have the view lineage feature enabled.", + ) + usage: BaseUsageConfig = Field( + description="The usage config to use when generating usage statistics", + default=BaseUsageConfig(), + ) + + default_db: Optional[str] = Field( + default=None, + description="The default database to use for unqualified table names", + ) + + include_usage_statistics: bool = Field( + default=False, + description="Generate usage statistic.", + ) + + +@platform_name("Teradata") +@config_class(TeradataConfig) +@support_status(SupportStatus.TESTING) +@capability(SourceCapability.DOMAINS, "Enabled by default") +@capability(SourceCapability.CONTAINERS, "Enabled by default") +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") +@capability(SourceCapability.DELETION_DETECTION, "Optionally enabled via configuration") +@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") +@capability(SourceCapability.LINEAGE_COARSE, "Optionally enabled via configuration") +@capability(SourceCapability.LINEAGE_FINE, "Optionally enabled via configuration") +@capability(SourceCapability.USAGE_STATS, "Optionally enabled via configuration") +class TeradataSource(TwoTierSQLAlchemySource): + """ + This plugin extracts the following: + + - Metadata for databases, schemas, views, and tables + - Column types associated with each table + - Table, row, and column statistics via optional SQL profiling + """ + + config: TeradataConfig + + LINEAGE_QUERY: str = """SELECT ProcID, UserName as "user", StartTime AT TIME ZONE 'GMT' as "timestamp", DefaultDatabase as default_database, QueryText as query + FROM "DBC".DBQLogTbl + where ErrorCode = 0 + and QueryText like 'create table demo_user.test_lineage%' + and "timestamp" >= TIMESTAMP '{start_time}' + and "timestamp" < TIMESTAMP '{end_time}' + """ + urns: Optional[Set[str]] + + def __init__(self, config: TeradataConfig, ctx: PipelineContext): + super().__init__(config, ctx, "teradata") + + self.report: TeradataReport = TeradataReport() + self.graph: Optional[DataHubGraph] = ctx.graph + + self.builder: SqlParsingBuilder = SqlParsingBuilder( + usage_config=self.config.usage + if self.config.include_usage_statistics + else None, + 
generate_lineage=True, + generate_usage_statistics=self.config.include_usage_statistics, + generate_operations=self.config.usage.include_operational_stats, + ) + + self.schema_resolver = SchemaResolver( + platform=self.platform, + platform_instance=self.config.platform_instance, + graph=None, + env=self.config.env, + ) + + self._view_definition_cache: FileBackedDict[str] = FileBackedDict() + + @classmethod + def create(cls, config_dict, ctx): + config = TeradataConfig.parse_obj(config_dict) + return cls(config, ctx) + + def get_view_lineage(self) -> Iterable[MetadataWorkUnit]: + for key in self._view_definition_cache.keys(): + view_definition = self._view_definition_cache[key] + dataset_urn = DatasetUrn.create_from_string(key) + + db_name: Optional[str] = None + # We need to get the default db from the dataset urn otherwise the builder generates the wrong urns + if "." in dataset_urn.get_dataset_name(): + db_name = dataset_urn.get_dataset_name().split(".", 1)[0] + + self.report.num_view_ddl_parsed += 1 + if self.report.num_view_ddl_parsed % 1000 == 0: + logger.info(f"Parsed {self.report.num_queries_parsed} view ddl") + + yield from self.gen_lineage_from_query( + query=view_definition, default_database=db_name, is_view_ddl=True + ) + + def get_audit_log_mcps(self) -> Iterable[MetadataWorkUnit]: + engine = self.get_metadata_engine() + for entry in engine.execute( + self.LINEAGE_QUERY.format( + start_time=self.config.start_time, end_time=self.config.end_time + ) + ): + self.report.num_queries_parsed += 1 + if self.report.num_queries_parsed % 1000 == 0: + logger.info(f"Parsed {self.report.num_queries_parsed} queries") + + yield from self.gen_lineage_from_query( + query=entry.query, + default_database=entry.default_database, + timestamp=entry.timestamp, + user=entry.user, + is_view_ddl=False, + ) + + def gen_lineage_from_query( + self, + query: str, + default_database: Optional[str] = None, + timestamp: Optional[datetime] = None, + user: Optional[str] = None, + is_view_ddl: bool = False, + ) -> Iterable[MetadataWorkUnit]: + result = sqlglot_lineage( + sql=query, + schema_resolver=self.schema_resolver, + default_db=None, + default_schema=default_database + if default_database + else self.config.default_db, + ) + if result.debug_info.table_error: + logger.debug( + f"Error parsing table lineage, {result.debug_info.table_error}" + ) + self.report.num_table_parse_failures += 1 + else: + yield from self.builder.process_sql_parsing_result( + result, + query=query, + is_view_ddl=is_view_ddl, + query_timestamp=timestamp, + user=f"urn:li:corpuser:{user}", + include_urns=self.schema_resolver.get_urns(), + ) + + def get_metadata_engine(self) -> Engine: + url = self.config.get_sql_alchemy_url() + logger.debug(f"sql_alchemy_url={url}") + return create_engine(url, **self.config.options) + + def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]: + # Add all schemas to the schema resolver + for wu in super().get_workunits_internal(): + if isinstance(wu.metadata, MetadataChangeEventClass): + if wu.metadata.proposedSnapshot: + for aspect in wu.metadata.proposedSnapshot.aspects: + if isinstance(aspect, SchemaMetadataClass): + self.schema_resolver.add_schema_metadata( + wu.metadata.proposedSnapshot.urn, + aspect, + ) + break + if isinstance(wu.metadata, MetadataChangeProposalWrapper): + if ( + wu.metadata.entityUrn + and isinstance(wu.metadata.aspect, ViewPropertiesClass) + and wu.metadata.aspect.viewLogic + ): + self._view_definition_cache[ + wu.metadata.entityUrn + ] = 
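Note: a minimal sketch, with assumed platform/env values and a hypothetical query, of how the Teradata source's gen_lineage_from_query above hands SQL to the sqlglot-based parser; it is not part of the patch.

```python
from datahub.utilities.sqlglot_lineage import SchemaResolver, sqlglot_lineage

# Assumptions: platform, env, and the SQL text are illustrative; no graph connection.
schema_resolver = SchemaResolver(platform="teradata", env="PROD", graph=None)

result = sqlglot_lineage(
    sql="create table demo_user.test_lineage as select id, amount from demo_user.orders",
    schema_resolver=schema_resolver,
    default_db=None,
    default_schema="demo_user",
)

if result.debug_info.table_error:
    # Mirrors the error handling in gen_lineage_from_query above.
    print(f"table-level parsing failed: {result.debug_info.table_error}")
else:
    print(result.in_tables)  # upstream dataset URNs discovered by the parser
```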
wu.metadata.aspect.viewLogic + yield wu + + if self.config.include_view_lineage: + self.report.report_ingestion_stage_start("view lineage extraction") + yield from self.get_view_lineage() + + if self.config.include_table_lineage or self.config.include_usage_statistics: + self.report.report_ingestion_stage_start("audit log extraction") + yield from self.get_audit_log_mcps() + + yield from self.builder.gen_workunits() diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau.py index e347cd26d245a..bad7ae49d325e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau.py @@ -77,6 +77,7 @@ FIELD_TYPE_MAPPING, MetadataQueryException, TableauLineageOverrides, + TableauUpstreamReference, clean_query, custom_sql_graphql_query, dashboard_graphql_query, @@ -85,7 +86,6 @@ get_overridden_info, get_unique_custom_sql, make_fine_grained_lineage_class, - make_table_urn, make_upstream_class, published_datasource_graphql_query, query_metadata, @@ -271,7 +271,7 @@ class TableauConfig( "You can change this if your Tableau projects contain slashes in their names, and you'd like to filter by project.", ) - default_schema_map: dict = Field( + default_schema_map: Dict[str, str] = Field( default={}, description="Default schema to use when schema is not found." ) ingest_tags: Optional[bool] = Field( @@ -997,41 +997,16 @@ def get_upstream_tables( ) continue - schema = table.get(tableau_constant.SCHEMA) or "" - table_name = table.get(tableau_constant.NAME) or "" - full_name = table.get(tableau_constant.FULL_NAME) or "" - upstream_db = ( - table[tableau_constant.DATABASE][tableau_constant.NAME] - if table.get(tableau_constant.DATABASE) - and table[tableau_constant.DATABASE].get(tableau_constant.NAME) - else "" - ) - logger.debug( - "Processing Table with Connection Type: {0} and id {1}".format( - table.get(tableau_constant.CONNECTION_TYPE) or "", - table.get(tableau_constant.ID) or "", + try: + ref = TableauUpstreamReference.create( + table, default_schema_map=self.config.default_schema_map ) - ) - schema = self._get_schema(schema, upstream_db, full_name) - # if the schema is included within the table name we omit it - if ( - schema - and table_name - and full_name - and table_name == full_name - and schema in table_name - ): - logger.debug( - f"Omitting schema for upstream table {table[tableau_constant.ID]}, schema included in table name" - ) - schema = "" + except Exception as e: + logger.info(f"Failed to generate upstream reference for {table}: {e}") + continue - table_urn = make_table_urn( + table_urn = ref.make_dataset_urn( self.config.env, - upstream_db, - table.get(tableau_constant.CONNECTION_TYPE) or "", - schema, - table_name, self.config.platform_instance_map, self.config.lineage_overrides, ) @@ -1052,7 +1027,7 @@ def get_upstream_tables( urn=table_urn, id=table[tableau_constant.ID], num_cols=num_tbl_cols, - paths=set([table_path]) if table_path else set(), + paths={table_path} if table_path else set(), ) else: self.database_tables[table_urn].update_table( @@ -2462,35 +2437,6 @@ def emit_embedded_datasources(self) -> Iterable[MetadataWorkUnit]: is_embedded_ds=True, ) - @lru_cache(maxsize=None) - def _get_schema(self, schema_provided: str, database: str, fullName: str) -> str: - # For some databases, the schema attribute in tableau api does not return - # correct schema name for the table. 
For more information, see - # https://help.tableau.com/current/api/metadata_api/en-us/docs/meta_api_model.html#schema_attribute. - # Hence we extract schema from fullName whenever fullName is available - schema = self._extract_schema_from_fullName(fullName) if fullName else "" - if not schema: - schema = schema_provided - elif schema != schema_provided: - logger.debug( - "Correcting schema, provided {0}, corrected {1}".format( - schema_provided, schema - ) - ) - - if not schema and database in self.config.default_schema_map: - schema = self.config.default_schema_map[database] - - return schema - - @lru_cache(maxsize=None) - def _extract_schema_from_fullName(self, fullName: str) -> str: - # fullName is observed to be in format [schemaName].[tableName] - # OR simply tableName OR [tableName] - if fullName.startswith("[") and "].[" in fullName: - return fullName[1 : fullName.index("]")] - return "" - @lru_cache(maxsize=None) def get_last_modified( self, creator: Optional[str], created_at: bytes, updated_at: bytes diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py b/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py index 2c92285fdba77..7c4852042ce7c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py @@ -1,4 +1,6 @@ import html +import logging +from dataclasses import dataclass from functools import lru_cache from typing import Dict, List, Optional, Tuple @@ -6,6 +8,7 @@ import datahub.emitter.mce_builder as builder from datahub.configuration.common import ConfigModel +from datahub.ingestion.source import tableau_constant as tc from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( DatasetLineageType, FineGrainedLineage, @@ -31,6 +34,8 @@ ) from datahub.utilities.sqlglot_lineage import ColumnLineageInfo, SqlParsingResult +logger = logging.getLogger(__name__) + class TableauLineageOverrides(ConfigModel): platform_override_map: Optional[Dict[str, str]] = Field( @@ -537,12 +542,12 @@ def get_fully_qualified_table_name( platform: str, upstream_db: str, schema: str, - full_name: str, + table_name: str, ) -> str: if platform == "athena": upstream_db = "" database_name = f"{upstream_db}." if upstream_db else "" - final_name = full_name.replace("[", "").replace("]", "") + final_name = table_name.replace("[", "").replace("]", "") schema_name = f"{schema}." 
if schema else "" @@ -573,17 +578,123 @@ def get_fully_qualified_table_name( return fully_qualified_table_name -def get_platform_instance( - platform: str, platform_instance_map: Optional[Dict[str, str]] -) -> Optional[str]: - if platform_instance_map is not None and platform in platform_instance_map.keys(): - return platform_instance_map[platform] +@dataclass +class TableauUpstreamReference: + database: Optional[str] + schema: Optional[str] + table: str + + connection_type: str + + @classmethod + def create( + cls, d: dict, default_schema_map: Optional[Dict[str, str]] = None + ) -> "TableauUpstreamReference": + # Values directly from `table` object from Tableau + database = t_database = d.get(tc.DATABASE, {}).get(tc.NAME) + schema = t_schema = d.get(tc.SCHEMA) + table = t_table = d.get(tc.NAME) or "" + t_full_name = d.get(tc.FULL_NAME) + t_connection_type = d[tc.CONNECTION_TYPE] # required to generate urn + t_id = d[tc.ID] + + parsed_full_name = cls.parse_full_name(t_full_name) + if parsed_full_name and len(parsed_full_name) == 3: + database, schema, table = parsed_full_name + elif parsed_full_name and len(parsed_full_name) == 2: + schema, table = parsed_full_name + else: + logger.debug( + f"Upstream urn generation ({t_id}):" + f" Did not parse full name {t_full_name}: unexpected number of values", + ) + + if not schema and default_schema_map and database in default_schema_map: + schema = default_schema_map[database] + + if database != t_database: + logger.debug( + f"Upstream urn generation ({t_id}):" + f" replacing database {t_database} with {database} from full name {t_full_name}" + ) + if schema != t_schema: + logger.debug( + f"Upstream urn generation ({t_id}):" + f" replacing schema {t_schema} with {schema} from full name {t_full_name}" + ) + if table != t_table: + logger.debug( + f"Upstream urn generation ({t_id}):" + f" replacing table {t_table} with {table} from full name {t_full_name}" + ) + + # TODO: See if we can remove this -- made for redshift + if ( + schema + and t_table + and t_full_name + and t_table == t_full_name + and schema in t_table + ): + logger.debug( + f"Omitting schema for upstream table {t_id}, schema included in table name" + ) + schema = "" + + return cls( + database=database, + schema=schema, + table=table, + connection_type=t_connection_type, + ) + + @staticmethod + def parse_full_name(full_name: Optional[str]) -> Optional[List[str]]: + # fullName is observed to be in formats: + # [database].[schema].[table] + # [schema].[table] + # [table] + # table + # schema + + # TODO: Validate the startswith check. 
Currently required for our integration tests + if full_name is None or not full_name.startswith("["): + return None + + return full_name.replace("[", "").replace("]", "").split(".") + + def make_dataset_urn( + self, + env: str, + platform_instance_map: Optional[Dict[str, str]], + lineage_overrides: Optional[TableauLineageOverrides] = None, + ) -> str: + ( + upstream_db, + platform_instance, + platform, + original_platform, + ) = get_overridden_info( + connection_type=self.connection_type, + upstream_db=self.database, + lineage_overrides=lineage_overrides, + platform_instance_map=platform_instance_map, + ) + + table_name = get_fully_qualified_table_name( + original_platform, + upstream_db or "", + self.schema, + self.table, + ) - return None + return builder.make_dataset_urn_with_platform_instance( + platform, table_name, platform_instance, env + ) def get_overridden_info( - connection_type: str, + connection_type: Optional[str], upstream_db: Optional[str], platform_instance_map: Optional[Dict[str, str]], lineage_overrides: Optional[TableauLineageOverrides] = None, @@ -605,7 +716,9 @@ def get_overridden_info( ): upstream_db = lineage_overrides.database_override_map[upstream_db] - platform_instance = get_platform_instance(original_platform, platform_instance_map) + platform_instance = ( + platform_instance_map.get(original_platform) if platform_instance_map else None + ) if original_platform in ("athena", "hive", "mysql"): # Two tier databases upstream_db = None @@ -613,35 +726,6 @@ def get_overridden_info( return upstream_db, platform_instance, platform, original_platform -def make_table_urn( - env: str, - upstream_db: Optional[str], - connection_type: str, - schema: str, - full_name: str, - platform_instance_map: Optional[Dict[str, str]], - lineage_overrides: Optional[TableauLineageOverrides] = None, -) -> str: - - upstream_db, platform_instance, platform, original_platform = get_overridden_info( - connection_type=connection_type, - upstream_db=upstream_db, - lineage_overrides=lineage_overrides, - platform_instance_map=platform_instance_map, - ) - - table_name = get_fully_qualified_table_name( - original_platform, - upstream_db if upstream_db is not None else "", - schema, - full_name, - ) - - return builder.make_dataset_urn_with_platform_instance( - platform, table_name, platform_instance, env - ) - - def make_description_from_params(description, formula): """ Generate column description diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/config.py b/metadata-ingestion/src/datahub/ingestion/source/unity/config.py index f259fa260f653..a57ee39848855 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/config.py @@ -7,7 +7,10 @@ from pydantic import Field from datahub.configuration.common import AllowDenyPattern, ConfigModel -from datahub.configuration.source_common import DatasetSourceConfigMixin +from datahub.configuration.source_common import ( + DatasetSourceConfigMixin, + LowerCaseDatasetUrnConfigMixin, +) from datahub.configuration.validate_field_removal import pydantic_removed_field from datahub.configuration.validate_field_rename import pydantic_renamed_field from datahub.ingestion.source.state.stale_entity_removal_handler import ( @@ -91,6 +94,7 @@ class UnityCatalogSourceConfig( BaseUsageConfig, DatasetSourceConfigMixin, StatefulProfilingConfigMixin, + LowerCaseDatasetUrnConfigMixin, ): token: str = pydantic.Field(description="Databricks personal access token") workspace_url: str = 
pydantic.Field( @@ -181,6 +185,17 @@ class UnityCatalogSourceConfig( description="Option to enable/disable lineage generation. Currently we have to call a rest call per column to get column level lineage due to the Databrick api which can slow down ingestion. ", ) + column_lineage_column_limit: int = pydantic.Field( + default=300, + description="Limit the number of columns to get column level lineage. ", + ) + + lineage_max_workers: int = pydantic.Field( + default=5 * (os.cpu_count() or 4), + description="Number of worker threads to use for column lineage thread pool executor. Set to 1 to disable.", + hidden_from_docs=True, + ) + include_usage_statistics: bool = Field( default=True, description="Generate usage statistics.", diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py index 529d9e7b563a5..9bcdb200f180e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py @@ -233,9 +233,7 @@ def list_lineages_by_column(self, table_name: str, column_name: str) -> dict: body={"table_name": table_name, "column_name": column_name}, ) - def table_lineage( - self, table: Table, include_entity_lineage: bool - ) -> Optional[dict]: + def table_lineage(self, table: Table, include_entity_lineage: bool) -> None: # Lineage endpoint doesn't exists on 2.1 version try: response: dict = self.list_lineages_by_table( @@ -256,34 +254,30 @@ def table_lineage( for item in response.get("downstreams") or []: for notebook in item.get("notebookInfos") or []: table.downstream_notebooks.add(notebook["notebook_id"]) - - return response except Exception as e: - logger.error(f"Error getting lineage: {e}") - return None + logger.warning( + f"Error getting lineage on table {table.ref}: {e}", exc_info=True + ) - def get_column_lineage(self, table: Table, include_entity_lineage: bool) -> None: + def get_column_lineage(self, table: Table, column_name: str) -> None: try: - table_lineage = self.table_lineage( - table, include_entity_lineage=include_entity_lineage + response: dict = self.list_lineages_by_column( + table_name=table.ref.qualified_table_name, + column_name=column_name, ) - if table_lineage: - for column in table.columns: - response: dict = self.list_lineages_by_column( - table_name=table.ref.qualified_table_name, - column_name=column.name, - ) - for item in response.get("upstream_cols", []): - table_ref = TableReference.create_from_lineage( - item, table.schema.catalog.metastore - ) - if table_ref: - table.upstreams.setdefault(table_ref, {}).setdefault( - column.name, [] - ).append(item["name"]) - + for item in response.get("upstream_cols") or []: + table_ref = TableReference.create_from_lineage( + item, table.schema.catalog.metastore + ) + if table_ref: + table.upstreams.setdefault(table_ref, {}).setdefault( + column_name, [] + ).append(item["name"]) except Exception as e: - logger.error(f"Error getting lineage: {e}") + logger.warning( + f"Error getting column lineage on table {table.ref}, column {column_name}: {e}", + exc_info=True, + ) @staticmethod def _escape_sequence(value: str) -> str: diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/report.py b/metadata-ingestion/src/datahub/ingestion/source/unity/report.py index 808172a136bb3..fa61571fa92cb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/report.py @@ -18,6 +18,8 @@ class 
UnityCatalogReport(IngestionStageReport, StaleEntityRemovalSourceReport): table_profiles: EntityFilterReport = EntityFilterReport.field(type="table profile") notebooks: EntityFilterReport = EntityFilterReport.field(type="notebook") + num_column_lineage_skipped_column_count: int = 0 + num_queries: int = 0 num_queries_dropped_parse_failure: int = 0 num_queries_missing_table: int = 0 # Can be due to pattern filter diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py index 4f7866aee7681..27c1f341aa84d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py @@ -1,6 +1,7 @@ import logging import re import time +from concurrent.futures import ThreadPoolExecutor from datetime import timedelta from typing import Dict, Iterable, List, Optional, Set, Union from urllib.parse import urljoin @@ -367,15 +368,7 @@ def process_table(self, table: Table, schema: Schema) -> Iterable[MetadataWorkUn ownership = self._create_table_ownership_aspect(table) data_platform_instance = self._create_data_platform_instance_aspect() - if self.config.include_column_lineage: - self.unity_catalog_api_proxy.get_column_lineage( - table, include_entity_lineage=self.config.include_notebooks - ) - elif self.config.include_table_lineage: - self.unity_catalog_api_proxy.table_lineage( - table, include_entity_lineage=self.config.include_notebooks - ) - lineage = self._generate_lineage_aspect(dataset_urn, table) + lineage = self.ingest_lineage(table) if self.config.include_notebooks: for notebook_id in table.downstream_notebooks: @@ -401,6 +394,28 @@ def process_table(self, table: Table, schema: Schema) -> Iterable[MetadataWorkUn ) ] + def ingest_lineage(self, table: Table) -> Optional[UpstreamLineageClass]: + if self.config.include_table_lineage: + self.unity_catalog_api_proxy.table_lineage( + table, include_entity_lineage=self.config.include_notebooks + ) + + if self.config.include_column_lineage and table.upstreams: + if len(table.columns) > self.config.column_lineage_column_limit: + self.report.num_column_lineage_skipped_column_count += 1 + + with ThreadPoolExecutor( + max_workers=self.config.lineage_max_workers + ) as executor: + for column in table.columns[: self.config.column_lineage_column_limit]: + executor.submit( + self.unity_catalog_api_proxy.get_column_lineage, + table, + column.name, + ) + + return self._generate_lineage_aspect(self.gen_dataset_urn(table.ref), table) + def _generate_lineage_aspect( self, dataset_urn: str, table: Table ) -> Optional[UpstreamLineageClass]: diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source_config/bigquery.py index 8ca1296d819c1..0a73bb5203e72 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/bigquery.py @@ -4,7 +4,13 @@ from datahub.configuration.common import ConfigModel, ConfigurationError -_BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX: str = "((.+)[_$])?(\\d{8})$" +# Regexp for sharded tables. +# A sharded table is a table that has a suffix of the form _yyyymmdd or yyyymmdd, where yyyymmdd is a date. +# The regexp checks for valid dates in the suffix (e.g. 20200101, 20200229, 20201231) and if the date is not valid +# then it is not a sharded table. 
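Note: a quick, illustrative check (not from this patch) of the new default sharded-table regex defined immediately below, using made-up table names; the source may apply the pattern differently than plain re.search.

```python
import re

# Pattern copied from the _BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX default below.
SHARDED_TABLE_REGEX = r"((.+\D)[_$]?)?(\d\d\d\d(?:0[1-9]|1[0-2])(?:0[1-9]|[12][0-9]|3[01]))$"

for name in ["events_20230101", "events_20231301", "20200229", "daily_stats"]:
    match = re.search(SHARDED_TABLE_REGEX, name)
    print(name, "->", match.group(3) if match else "not sharded")
# events_20230101 -> 20230101
# events_20231301 -> not sharded   (month 13 is not a valid date component)
# 20200229 -> 20200229
# daily_stats -> not sharded
```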
+_BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX: str = ( + "((.+\\D)[_$]?)?(\\d\\d\\d\\d(?:0[1-9]|1[0-2])(?:0[1-9]|[12][0-9]|3[01]))$" +) class BigQueryBaseConfig(ConfigModel): diff --git a/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py b/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py index 8516a7054a9cd..b3b1331db768b 100644 --- a/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py +++ b/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py @@ -70,11 +70,14 @@ def assert_sql_result( sql: str, *, dialect: str, + platform_instance: Optional[str] = None, expected_file: pathlib.Path, schemas: Optional[Dict[str, SchemaInfo]] = None, **kwargs: Any, ) -> None: - schema_resolver = SchemaResolver(platform=dialect) + schema_resolver = SchemaResolver( + platform=dialect, platform_instance=platform_instance + ) if schemas: for urn, schema in schemas.items(): schema_resolver.add_raw_schema_info(urn, schema) diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py index 81c43884fdf7d..c830ec8c02fd4 100644 --- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py @@ -5,12 +5,13 @@ import logging import pathlib from collections import defaultdict -from typing import Dict, List, Optional, Set, Tuple, Union +from typing import Any, Dict, List, Optional, Set, Tuple, Union import pydantic.dataclasses import sqlglot import sqlglot.errors import sqlglot.lineage +import sqlglot.optimizer.annotate_types import sqlglot.optimizer.qualify import sqlglot.optimizer.qualify_columns from pydantic import BaseModel @@ -23,7 +24,17 @@ from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.graph.client import DataHubGraph from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier -from datahub.metadata.schema_classes import OperationTypeClass, SchemaMetadataClass +from datahub.metadata.schema_classes import ( + ArrayTypeClass, + BooleanTypeClass, + DateTypeClass, + NumberTypeClass, + OperationTypeClass, + SchemaFieldDataTypeClass, + SchemaMetadataClass, + StringTypeClass, + TimeTypeClass, +) from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedDict from datahub.utilities.urns.dataset_urn import DatasetUrn @@ -90,8 +101,18 @@ def get_query_type_of_sql(expression: sqlglot.exp.Expression) -> QueryType: return QueryType.UNKNOWN +class _ParserBaseModel( + BaseModel, + arbitrary_types_allowed=True, + json_encoders={ + SchemaFieldDataTypeClass: lambda v: v.to_obj(), + }, +): + pass + + @functools.total_ordering -class _FrozenModel(BaseModel, frozen=True): +class _FrozenModel(_ParserBaseModel, frozen=True): def __lt__(self, other: "_FrozenModel") -> bool: for field in self.__fields__: self_v = getattr(self, field) @@ -146,29 +167,42 @@ class _ColumnRef(_FrozenModel): column: str -class ColumnRef(BaseModel): +class ColumnRef(_ParserBaseModel): table: Urn column: str -class _DownstreamColumnRef(BaseModel): +class _DownstreamColumnRef(_ParserBaseModel): table: Optional[_TableName] column: str + column_type: Optional[sqlglot.exp.DataType] -class DownstreamColumnRef(BaseModel): +class DownstreamColumnRef(_ParserBaseModel): table: Optional[Urn] column: str + column_type: Optional[SchemaFieldDataTypeClass] + native_column_type: Optional[str] + + @pydantic.validator("column_type", pre=True) + def _load_column_type( + cls, v: Optional[Union[dict, 
SchemaFieldDataTypeClass]] + ) -> Optional[SchemaFieldDataTypeClass]: + if v is None: + return None + if isinstance(v, SchemaFieldDataTypeClass): + return v + return SchemaFieldDataTypeClass.from_obj(v) -class _ColumnLineageInfo(BaseModel): +class _ColumnLineageInfo(_ParserBaseModel): downstream: _DownstreamColumnRef upstreams: List[_ColumnRef] logic: Optional[str] -class ColumnLineageInfo(BaseModel): +class ColumnLineageInfo(_ParserBaseModel): downstream: DownstreamColumnRef upstreams: List[ColumnRef] @@ -176,7 +210,7 @@ class ColumnLineageInfo(BaseModel): logic: Optional[str] = pydantic.Field(default=None, exclude=True) -class SqlParsingDebugInfo(BaseModel, arbitrary_types_allowed=True): +class SqlParsingDebugInfo(_ParserBaseModel): confidence: float = 0.0 tables_discovered: int = 0 @@ -190,7 +224,7 @@ def error(self) -> Optional[Exception]: return self.table_error or self.column_error -class SqlParsingResult(BaseModel): +class SqlParsingResult(_ParserBaseModel): query_type: QueryType = QueryType.UNKNOWN in_tables: List[Urn] @@ -448,6 +482,11 @@ def _column_level_lineage( # noqa: C901 # Our snowflake source lowercases column identifiers, so we are forced # to do fuzzy (case-insensitive) resolution instead of exact resolution. "snowflake", + # Teradata column names are case-insensitive. + # A name, even when enclosed in double quotation marks, is not case sensitive. For example, CUSTOMER and Customer are the same. + # See more below: + # https://documentation.sas.com/doc/en/pgmsascdc/9.4_3.5/acreldb/n0ejgx4895bofnn14rlguktfx5r3.htm + "teradata", } sqlglot_db_schema = sqlglot.MappingSchema( @@ -541,6 +580,15 @@ def _schema_aware_fuzzy_column_resolve( ) from e logger.debug("Qualified sql %s", statement.sql(pretty=True, dialect=dialect)) + # Try to figure out the types of the output columns. + try: + statement = sqlglot.optimizer.annotate_types.annotate_types( + statement, schema=sqlglot_db_schema + ) + except sqlglot.errors.OptimizeError as e: + # This is not a fatal error, so we can continue. + logger.debug("sqlglot failed to annotate types: %s", e) + column_lineage = [] try: @@ -553,7 +601,6 @@ def _schema_aware_fuzzy_column_resolve( logger.debug("output columns: %s", [col[0] for col in output_columns]) output_col: str for output_col, original_col_expression in output_columns: - # print(f"output column: {output_col}") if output_col == "*": # If schema information is available, the * will be expanded to the actual columns. # Otherwise, we can't process it. @@ -613,12 +660,19 @@ def _schema_aware_fuzzy_column_resolve( output_col = _schema_aware_fuzzy_column_resolve(output_table, output_col) + # Guess the output column type. 
+ output_col_type = None + if original_col_expression.type: + output_col_type = original_col_expression.type + if not direct_col_upstreams: logger.debug(f' "{output_col}" has no upstreams') column_lineage.append( _ColumnLineageInfo( downstream=_DownstreamColumnRef( - table=output_table, column=output_col + table=output_table, + column=output_col, + column_type=output_col_type, ), upstreams=sorted(direct_col_upstreams), # logic=column_logic.sql(pretty=True, dialect=dialect), @@ -673,6 +727,42 @@ def _try_extract_select( return statement +def _translate_sqlglot_type( + sqlglot_type: sqlglot.exp.DataType.Type, +) -> Optional[SchemaFieldDataTypeClass]: + TypeClass: Any + if sqlglot_type in sqlglot.exp.DataType.TEXT_TYPES: + TypeClass = StringTypeClass + elif sqlglot_type in sqlglot.exp.DataType.NUMERIC_TYPES or sqlglot_type in { + sqlglot.exp.DataType.Type.DECIMAL, + }: + TypeClass = NumberTypeClass + elif sqlglot_type in { + sqlglot.exp.DataType.Type.BOOLEAN, + sqlglot.exp.DataType.Type.BIT, + }: + TypeClass = BooleanTypeClass + elif sqlglot_type in { + sqlglot.exp.DataType.Type.DATE, + }: + TypeClass = DateTypeClass + elif sqlglot_type in sqlglot.exp.DataType.TEMPORAL_TYPES: + TypeClass = TimeTypeClass + elif sqlglot_type in { + sqlglot.exp.DataType.Type.ARRAY, + }: + TypeClass = ArrayTypeClass + elif sqlglot_type in { + sqlglot.exp.DataType.Type.UNKNOWN, + }: + return None + else: + logger.debug("Unknown sqlglot type: %s", sqlglot_type) + return None + + return SchemaFieldDataTypeClass(type=TypeClass()) + + def _translate_internal_column_lineage( table_name_urn_mapping: Dict[_TableName, str], raw_column_lineage: _ColumnLineageInfo, @@ -684,6 +774,16 @@ def _translate_internal_column_lineage( downstream=DownstreamColumnRef( table=downstream_urn, column=raw_column_lineage.downstream.column, + column_type=_translate_sqlglot_type( + raw_column_lineage.downstream.column_type.this + ) + if raw_column_lineage.downstream.column_type + else None, + native_column_type=raw_column_lineage.downstream.column_type.sql() + if raw_column_lineage.downstream.column_type + and raw_column_lineage.downstream.column_type.this + != sqlglot.exp.DataType.Type.UNKNOWN + else None, ), upstreams=[ ColumnRef( diff --git a/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py b/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py index 261f95331af61..e13d439161064 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py +++ b/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py @@ -3,7 +3,11 @@ from avro.schema import Field, RecordSchema from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.metadata.schema_classes import DictWrapper +from datahub.metadata.schema_classes import ( + DictWrapper, + MetadataChangeEventClass, + MetadataChangeProposalClass, +) from datahub.utilities.urns.dataset_urn import DatasetUrn from datahub.utilities.urns.urn import Urn, guess_entity_type @@ -32,7 +36,7 @@ def list_urns_with_path( if isinstance(model, MetadataChangeProposalWrapper): if model.entityUrn: - urns.append((model.entityUrn, ["urn"])) + urns.append((model.entityUrn, ["entityUrn"])) if model.entityKeyAspect: urns.extend( _add_prefix_to_paths( @@ -83,7 +87,15 @@ def list_urns(model: Union[DictWrapper, MetadataChangeProposalWrapper]) -> List[ return [urn for urn, _ in list_urns_with_path(model)] -def transform_urns(model: DictWrapper, func: Callable[[str], str]) -> None: +def transform_urns( + model: Union[ + DictWrapper, + MetadataChangeEventClass, + MetadataChangeProposalClass, + 
MetadataChangeProposalWrapper, + ], + func: Callable[[str], str], +) -> None: """ Rewrites all URNs in the given object according to the given function. """ @@ -95,7 +107,9 @@ def transform_urns(model: DictWrapper, func: Callable[[str], str]) -> None: def _modify_at_path( - model: Union[DictWrapper, list], path: _Path, new_value: str + model: Union[DictWrapper, MetadataChangeProposalWrapper, list], + path: _Path, + new_value: str, ) -> None: assert len(path) > 0 @@ -103,6 +117,8 @@ def _modify_at_path( if isinstance(path[0], int): assert isinstance(model, list) model[path[0]] = new_value + elif isinstance(model, MetadataChangeProposalWrapper): + setattr(model, path[0], new_value) else: assert isinstance(model, DictWrapper) model._inner_dict[path[0]] = new_value @@ -120,7 +136,14 @@ def _lowercase_dataset_urn(dataset_urn: str) -> str: return str(cur_urn) -def lowercase_dataset_urns(model: DictWrapper) -> None: +def lowercase_dataset_urns( + model: Union[ + DictWrapper, + MetadataChangeEventClass, + MetadataChangeProposalClass, + MetadataChangeProposalWrapper, + ] +) -> None: def modify_urn(urn: str) -> str: if guess_entity_type(urn) == "dataset": return _lowercase_dataset_urn(urn) diff --git a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json index dee85b40bb7a8..1da42b94e320c 100644 --- a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json @@ -533,20 +533,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -566,20 +552,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -599,20 +571,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json index 72db36e63daf7..685a606a57c33 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json @@ -327,20 +327,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { 
"com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -360,20 +346,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -393,20 +365,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json index e5508bdb06b9e..069788cb088ac 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json @@ -327,20 +327,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -360,20 +346,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -393,20 +365,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json index b0f66e7b245c9..f1c932ebd5a70 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json @@ -335,20 +335,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -369,20 +355,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - 
"com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -403,20 +375,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json index 91e13debfa028..9521c9af4bbdc 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json @@ -550,20 +550,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -583,20 +569,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -616,20 +588,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json index e93079119e4f4..dbacd52fe83de 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json @@ -327,20 +327,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -360,20 +346,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": 
"Temporal", @@ -393,20 +365,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json index a9c8efa7cdb98..aaa874d9ff348 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json @@ -351,20 +351,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -384,20 +370,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -417,20 +389,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json index edd15624a14cd..be8db0722aea3 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json @@ -343,20 +343,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -376,20 +362,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -409,20 +381,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": 
"DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json index aebc89b609a08..05b74f163ad45 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json @@ -327,20 +327,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -360,20 +346,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -393,20 +365,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json index 34bded3cf691e..0778aa0050b00 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json @@ -279,20 +279,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -312,20 +298,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -345,20 +317,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git 
a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json index 238f4c2580cdf..5a0bd4e12fd3a 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json @@ -2121,20 +2121,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -2154,20 +2140,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -2187,20 +2159,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json index 45d5d839e9d21..1b0ee3216383c 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json @@ -2121,20 +2121,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -2154,20 +2140,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -2187,20 +2159,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json index 187cedaefb6b2..b960ba581e6b5 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json +++ 
b/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json @@ -2004,20 +2004,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -2037,20 +2023,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -2070,20 +2042,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json index c2c879e38f37b..e29292a44c949 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json @@ -2121,20 +2121,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -2154,20 +2140,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -2187,20 +2159,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_deny_pattern.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_deny_pattern.json index c1ac54b0fb588..04ecaecbd4afb 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_deny_pattern.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_deny_pattern.json @@ -584,20 +584,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - 
"owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -617,20 +603,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -650,20 +622,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json index f602ca37b3160..080931ae637bc 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json @@ -2121,20 +2121,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -2154,20 +2140,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -2187,20 +2159,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json index 104bd365669e3..5826c4316b539 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json @@ -2134,20 +2134,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": 
"Dimension", @@ -2167,20 +2153,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -2200,20 +2172,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json b/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json index 37a6c94c6952e..53d1ec0229de1 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json @@ -681,20 +681,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -714,20 +700,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -747,20 +719,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index e3cc6c8101650..b6cb578217a2c 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -17,7 +17,6 @@ ) from datahub.ingestion.source.powerbi.m_query import parser, resolver, tree_function from datahub.ingestion.source.powerbi.m_query.resolver import DataPlatformTable, Lineage -from datahub.utilities.sqlglot_lineage import ColumnLineageInfo, DownstreamColumnRef pytestmark = pytest.mark.integration_batch_2 @@ -742,75 +741,25 @@ def test_sqlglot_parser(): == "urn:li:dataset:(urn:li:dataPlatform:snowflake,sales_deployment.operations_analytics.transformed_prod.v_sme_unit_targets,PROD)" ) - assert lineage[0].column_lineage == [ - ColumnLineageInfo( - downstream=DownstreamColumnRef(table=None, column="client_director"), - upstreams=[], - logic=None, - ), - ColumnLineageInfo( - downstream=DownstreamColumnRef(table=None, column="tier"), - 
upstreams=[], - logic=None, - ), - ColumnLineageInfo( - downstream=DownstreamColumnRef(table=None, column='upper("manager")'), - upstreams=[], - logic=None, - ), - ColumnLineageInfo( - downstream=DownstreamColumnRef(table=None, column="team_type"), - upstreams=[], - logic=None, - ), - ColumnLineageInfo( - downstream=DownstreamColumnRef(table=None, column="date_target"), - upstreams=[], - logic=None, - ), - ColumnLineageInfo( - downstream=DownstreamColumnRef(table=None, column="monthid"), - upstreams=[], - logic=None, - ), - ColumnLineageInfo( - downstream=DownstreamColumnRef(table=None, column="target_team"), - upstreams=[], - logic=None, - ), - ColumnLineageInfo( - downstream=DownstreamColumnRef(table=None, column="seller_email"), - upstreams=[], - logic=None, - ), - ColumnLineageInfo( - downstream=DownstreamColumnRef(table=None, column="agent_key"), - upstreams=[], - logic=None, - ), - ColumnLineageInfo( - downstream=DownstreamColumnRef(table=None, column="sme_quota"), - upstreams=[], - logic=None, - ), - ColumnLineageInfo( - downstream=DownstreamColumnRef(table=None, column="revenue_quota"), - upstreams=[], - logic=None, - ), - ColumnLineageInfo( - downstream=DownstreamColumnRef(table=None, column="service_quota"), - upstreams=[], - logic=None, - ), - ColumnLineageInfo( - downstream=DownstreamColumnRef(table=None, column="bl_target"), - upstreams=[], - logic=None, - ), - ColumnLineageInfo( - downstream=DownstreamColumnRef(table=None, column="software_quota"), - upstreams=[], - logic=None, - ), + # TODO: None of these columns have upstreams? + # That doesn't seem right - we probably need to add fake schemas for the two tables above. + cols = [ + "client_director", + "tier", + 'upper("manager")', + "team_type", + "date_target", + "monthid", + "target_team", + "seller_email", + "agent_key", + "sme_quota", + "revenue_quota", + "service_quota", + "bl_target", + "software_quota", ] + for i, column in enumerate(cols): + assert lineage[0].column_lineage[i].downstream.table is None + assert lineage[0].column_lineage[i].downstream.column == column + assert lineage[0].column_lineage[i].upstreams == [] diff --git a/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json b/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json index 7687b99ac8d6d..5057dacd5b0c8 100644 --- a/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json +++ b/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json @@ -24,7 +24,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -39,7 +40,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -54,7 +56,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -71,7 +74,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -86,7 +90,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -115,7 +120,8 @@ 
}, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -130,7 +136,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -145,7 +152,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -162,7 +170,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -177,7 +186,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -197,7 +207,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -212,7 +223,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -375,7 +387,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -401,7 +414,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -416,7 +430,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -433,7 +448,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -457,7 +473,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -472,7 +489,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -635,7 +653,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -661,7 +680,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -676,7 +696,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -693,7 +714,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -717,7 +739,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": 
"snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -732,7 +755,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -895,7 +919,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -921,7 +946,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -936,7 +962,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -953,7 +980,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -977,7 +1005,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -992,7 +1021,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1155,7 +1185,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1181,7 +1212,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1196,7 +1228,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1213,7 +1246,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1237,7 +1271,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1252,7 +1287,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1415,7 +1451,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1441,7 +1478,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1456,7 +1494,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1473,7 +1512,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": 
"snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1497,7 +1537,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1512,7 +1553,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1675,7 +1717,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1701,7 +1744,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1716,7 +1760,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1733,7 +1778,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1757,7 +1803,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1772,7 +1819,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1935,7 +1983,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1961,7 +2010,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1976,7 +2026,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1993,7 +2044,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2017,7 +2069,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2032,7 +2085,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2195,7 +2249,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2221,7 +2276,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2236,7 +2292,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": 
"no-run-id-provided" } }, { @@ -2253,7 +2310,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2277,7 +2335,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2292,7 +2351,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2455,7 +2515,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2481,7 +2542,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2496,7 +2558,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2513,7 +2576,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2537,7 +2601,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2552,7 +2617,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2715,7 +2781,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2741,7 +2808,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2756,7 +2824,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2773,7 +2842,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2797,7 +2867,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2821,7 +2892,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2845,7 +2917,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2869,7 +2942,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2893,7 +2967,8 @@ }, 
"systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2917,7 +2992,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2941,7 +3017,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2965,7 +3042,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2989,7 +3067,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3013,7 +3092,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3037,7 +3117,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py index 2c77ace8b53e5..3dafe85ef950a 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py @@ -125,6 +125,7 @@ def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph): validate_upstreams_against_patterns=False, include_operational_stats=True, email_as_user_identifier=True, + incremental_lineage=False, start_time=datetime(2022, 6, 6, 0, 0, 0, 0).replace( tzinfo=timezone.utc ), @@ -213,6 +214,7 @@ def test_snowflake_private_link(pytestconfig, tmp_path, mock_time, mock_datahub_ include_views=False, include_view_lineage=False, include_usage_stats=False, + incremental_lineage=False, include_operational_stats=False, start_time=datetime(2022, 6, 6, 0, 0, 0, 0).replace( tzinfo=timezone.utc diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py index bba53c1e97a47..cd53b8f7db4f6 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py @@ -283,10 +283,12 @@ def test_snowflake_unexpected_snowflake_view_lineage_error_causes_pipeline_warni ) snowflake_pipeline_config1 = snowflake_pipeline_config.copy() - cast( + config = cast( SnowflakeV2Config, cast(PipelineConfig, snowflake_pipeline_config1).source.config, - ).include_view_lineage = True + ) + config.include_view_lineage = True + config.incremental_lineage = False pipeline = Pipeline(snowflake_pipeline_config1) pipeline.run() pipeline.raise_from_status() # pipeline should not fail diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_stateful.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_stateful.py index f72bd5b72d2cd..7e2ac94fa4e35 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_stateful.py +++ 
b/metadata-ingestion/tests/integration/snowflake/test_snowflake_stateful.py @@ -31,6 +31,7 @@ def stateful_pipeline_config(include_tables: bool) -> PipelineConfig: match_fully_qualified_names=True, schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), include_tables=include_tables, + incremental_lineage=False, stateful_ingestion=StatefulStaleMetadataRemovalConfig.parse_obj( { "enabled": True, @@ -49,7 +50,7 @@ def stateful_pipeline_config(include_tables: bool) -> PipelineConfig: @freeze_time(FROZEN_TIME) -def test_tableau_stateful(mock_datahub_graph): +def test_stale_metadata_removal(mock_datahub_graph): with mock.patch( "datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph", mock_datahub_graph, diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index c31867f5aa904..0510f4a40f659 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -20,7 +20,7 @@ from datahub.ingestion.source.tableau import TableauConfig, TableauSource from datahub.ingestion.source.tableau_common import ( TableauLineageOverrides, - make_table_urn, + TableauUpstreamReference, ) from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( DatasetLineageType, @@ -546,13 +546,13 @@ def test_lineage_overrides(): enable_logging() # Simple - specify platform instance to presto table assert ( - make_table_urn( - DEFAULT_ENV, + TableauUpstreamReference( "presto_catalog", - "presto", "test-schema", - "presto_catalog.test-schema.test-table", - platform_instance_map={"presto": "my_presto_instance"}, + "test-table", + "presto", + ).make_dataset_urn( + env=DEFAULT_ENV, platform_instance_map={"presto": "my_presto_instance"} ) == "urn:li:dataset:(urn:li:dataPlatform:presto,my_presto_instance.presto_catalog.test-schema.test-table,PROD)" ) @@ -560,12 +560,13 @@ def test_lineage_overrides(): # Transform presto urn to hive urn # resulting platform instance for hive = mapped platform instance + presto_catalog assert ( - make_table_urn( - DEFAULT_ENV, + TableauUpstreamReference( "presto_catalog", - "presto", "test-schema", - "presto_catalog.test-schema.test-table", + "test-table", + "presto", + ).make_dataset_urn( + env=DEFAULT_ENV, platform_instance_map={"presto": "my_instance"}, lineage_overrides=TableauLineageOverrides( platform_override_map={"presto": "hive"}, @@ -574,14 +575,15 @@ def test_lineage_overrides(): == "urn:li:dataset:(urn:li:dataPlatform:hive,my_instance.presto_catalog.test-schema.test-table,PROD)" ) - # tranform hive urn to presto urn + # transform hive urn to presto urn assert ( - make_table_urn( - DEFAULT_ENV, - "", - "hive", + TableauUpstreamReference( + None, "test-schema", - "test-schema.test-table", + "test-table", + "hive", + ).make_dataset_urn( + env=DEFAULT_ENV, platform_instance_map={"hive": "my_presto_instance.presto_catalog"}, lineage_overrides=TableauLineageOverrides( platform_override_map={"hive": "presto"}, diff --git a/metadata-ingestion/tests/unit/api/entities/datacontract/__init__.py b/metadata-ingestion/tests/unit/api/entities/datacontract/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/metadata-ingestion/tests/unit/api/entities/datacontract/test_data_quality_assertion.py b/metadata-ingestion/tests/unit/api/entities/datacontract/test_data_quality_assertion.py new file mode 100644 index 0000000000000..7be8b667a500b --- 
/dev/null +++ b/metadata-ingestion/tests/unit/api/entities/datacontract/test_data_quality_assertion.py @@ -0,0 +1,55 @@ +from datahub.api.entities.datacontract.data_quality_assertion import ( + DataQualityAssertion, +) +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.metadata.schema_classes import ( + AssertionInfoClass, + AssertionStdOperatorClass, + AssertionStdParameterClass, + AssertionStdParametersClass, + AssertionStdParameterTypeClass, + AssertionTypeClass, + AssertionValueChangeTypeClass, + SqlAssertionInfoClass, + SqlAssertionTypeClass, +) + + +def test_parse_sql_assertion(): + assertion_urn = "urn:li:assertion:a" + entity_urn = "urn:li:dataset:d" + statement = "SELECT COUNT(*) FROM my_table WHERE value IS NOT NULL" + + d = { + "type": "custom_sql", + "sql": statement, + "operator": {"type": "between", "min": 5, "max": 10}, + } + + assert DataQualityAssertion.parse_obj(d).generate_mcp( + assertion_urn, entity_urn + ) == [ + MetadataChangeProposalWrapper( + entityUrn=assertion_urn, + aspect=AssertionInfoClass( + type=AssertionTypeClass.SQL, + sqlAssertion=SqlAssertionInfoClass( + type=SqlAssertionTypeClass.METRIC, + changeType=AssertionValueChangeTypeClass.ABSOLUTE, + entity=entity_urn, + statement="SELECT COUNT(*) FROM my_table WHERE value IS NOT NULL", + operator=AssertionStdOperatorClass.BETWEEN, + parameters=AssertionStdParametersClass( + minValue=AssertionStdParameterClass( + value="5", + type=AssertionStdParameterTypeClass.NUMBER, + ), + maxValue=AssertionStdParameterClass( + value="10", + type=AssertionStdParameterTypeClass.NUMBER, + ), + ), + ), + ), + ) + ] diff --git a/metadata-ingestion/tests/unit/api/source_helpers/incremental_cll_less_upstreams_in_gms_aspect_golden.json b/metadata-ingestion/tests/unit/api/source_helpers/incremental_cll_less_upstreams_in_gms_aspect_golden.json new file mode 100644 index 0000000000000..812566143014b --- /dev/null +++ b/metadata-ingestion/tests/unit/api/source_helpers/incremental_cll_less_upstreams_in_gms_aspect_golden.json @@ -0,0 +1,106 @@ +[ +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:platform,dataset1,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:platform,upstream1,PROD)", + "type": "TRANSFORMED" + }, + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:platform,upstream2,PROD)", + "type": "TRANSFORMED" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream1,PROD),col_a)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,dataset1,PROD),col_a)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream1,PROD),col_b)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,dataset1,PROD),col_b)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream1,PROD),col_c)" + ], + "downstreamType": "FIELD", + "downstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,dataset1,PROD),col_c)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream1,PROD),col_a)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream2,PROD),col_a)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,dataset1,PROD),col_a)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream1,PROD),col_b)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream2,PROD),col_b)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,dataset1,PROD),col_b)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream1,PROD),col_c)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream2,PROD),col_c)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,dataset1,PROD),col_c)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "run-id", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/api/source_helpers/incremental_cll_more_upstreams_in_gms_aspect_golden.json b/metadata-ingestion/tests/unit/api/source_helpers/incremental_cll_more_upstreams_in_gms_aspect_golden.json new file mode 100644 index 0000000000000..17f4d10728268 --- /dev/null +++ b/metadata-ingestion/tests/unit/api/source_helpers/incremental_cll_more_upstreams_in_gms_aspect_golden.json @@ -0,0 +1,120 @@ +[ +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:platform,dataset1,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:platform,upstream1,PROD)", + "type": "TRANSFORMED" + }, + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:platform,upstream2,PROD)", + "type": "TRANSFORMED" + }, + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:platform,upstream3,PROD)", + "type": "TRANSFORMED" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream1,PROD),col_a)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream2,PROD),col_a)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream3,PROD),col_a)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,dataset1,PROD),col_a)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream1,PROD),col_b)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream2,PROD),col_b)", + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream3,PROD),col_b)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,dataset1,PROD),col_b)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream1,PROD),col_c)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream2,PROD),col_c)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream3,PROD),col_c)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,dataset1,PROD),col_c)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream1,PROD),col_a)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream2,PROD),col_a)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,dataset1,PROD),col_a)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream1,PROD),col_b)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream2,PROD),col_b)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,dataset1,PROD),col_b)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream1,PROD),col_c)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream2,PROD),col_c)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,dataset1,PROD),col_c)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "run-id", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/api/source_helpers/incremental_table_lineage_golden.json b/metadata-ingestion/tests/unit/api/source_helpers/incremental_table_lineage_golden.json new file mode 100644 index 0000000000000..c828373c73080 --- /dev/null +++ b/metadata-ingestion/tests/unit/api/source_helpers/incremental_table_lineage_golden.json @@ -0,0 +1,41 @@ +[ +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:platform,dataset1,PROD)", + "changeType": "PATCH", + "aspectName": "upstreamLineage", + "aspect": { + "json": [ + { + "op": "add", + "path": "/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aplatform%2Cupstream1%2CPROD%29", + "value": { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:platform,upstream1,PROD)", + "type": "TRANSFORMED" + } + }, + { + "op": "add", + "path": "/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aplatform%2Cupstream2%2CPROD%29", + "value": { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:platform,upstream2,PROD)", + "type": "TRANSFORMED" + } + } + ] + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "run-id", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file 
diff --git a/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_helper.py b/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_helper.py new file mode 100644 index 0000000000000..54a22d860285c --- /dev/null +++ b/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_helper.py @@ -0,0 +1,240 @@ +from typing import List, Optional +from unittest.mock import MagicMock + +import pytest + +import datahub.metadata.schema_classes as models +from datahub.emitter.mce_builder import make_dataset_urn, make_schema_field_urn +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.incremental_lineage_helper import auto_incremental_lineage +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.sink.file import write_metadata_file +from tests.test_helpers import mce_helpers + +platform = "platform" +system_metadata = models.SystemMetadataClass(lastObserved=1643871600000, runId="run-id") + + +def make_lineage_aspect( + dataset_name: str, + upstreams: List[str], + timestamp: int = 0, + columns: List[str] = [], + include_cll: bool = False, +) -> models.UpstreamLineageClass: + """ + Generates dataset properties and upstream lineage aspects + with simple column to column lineage between current dataset and all upstreams + """ + + dataset_urn = make_dataset_urn(platform, dataset_name) + return models.UpstreamLineageClass( + upstreams=[ + models.UpstreamClass( + dataset=upstream_urn, + type=models.DatasetLineageTypeClass.TRANSFORMED, + auditStamp=models.AuditStampClass( + time=timestamp, actor="urn:li:corpuser:unknown" + ), + ) + for upstream_urn in upstreams + ], + fineGrainedLineages=[ + models.FineGrainedLineageClass( + upstreamType=models.FineGrainedLineageUpstreamTypeClass.FIELD_SET, + downstreamType=models.FineGrainedLineageDownstreamTypeClass.FIELD, + upstreams=[ + make_schema_field_urn(upstream_urn, col) + for upstream_urn in upstreams + ], + downstreams=[make_schema_field_urn(dataset_urn, col)], + ) + for col in columns + ] + if include_cll + else None, + ) + + +def base_table_lineage_aspect() -> models.UpstreamLineageClass: + return make_lineage_aspect( + "dataset1", + upstreams=[ + make_dataset_urn(platform, name) for name in ["upstream1", "upstream2"] + ], + ) + + +def base_cll_aspect(timestamp: int = 0) -> models.UpstreamLineageClass: + return make_lineage_aspect( + "dataset1", + upstreams=[ + make_dataset_urn(platform, name) for name in ["upstream1", "upstream2"] + ], + timestamp=timestamp, + columns=["col_a", "col_b", "col_c"], + include_cll=True, + ) + + +def test_incremental_table_lineage(tmp_path, pytestconfig): + test_resources_dir = pytestconfig.rootpath / "tests/unit/api/source_helpers" + test_file = tmp_path / "incremental_table_lineage.json" + golden_file = test_resources_dir / "incremental_table_lineage_golden.json" + + urn = make_dataset_urn(platform, "dataset1") + aspect = base_table_lineage_aspect() + + processed_wus = auto_incremental_lineage( + graph=None, + incremental_lineage=True, + stream=[ + MetadataChangeProposalWrapper( + entityUrn=urn, aspect=aspect, systemMetadata=system_metadata + ).as_workunit() + ], + ) + + write_metadata_file( + test_file, + [wu.metadata for wu in processed_wus], + ) + mce_helpers.check_golden_file( + pytestconfig=pytestconfig, output_path=test_file, golden_path=golden_file + ) + + +@pytest.mark.parametrize( + "gms_aspect,current_aspect,output_aspect", + [ + # emitting CLL upstreamLineage over table level upstreamLineage + [ + 
base_table_lineage_aspect(), + base_cll_aspect(), + base_cll_aspect(), + ], + # emitting upstreamLineage for the first time + [ + None, + base_cll_aspect(), + base_cll_aspect(), + ], + # emitting CLL upstreamLineage over same CLL upstreamLineage + [ + base_cll_aspect(), + base_cll_aspect(), + base_cll_aspect(), + ], + # emitting CLL upstreamLineage over same CLL upstreamLineage but with earlier timestamp + [ + base_cll_aspect(), # default timestamp is 0 + base_cll_aspect(timestamp=1643871600000), + base_cll_aspect(timestamp=1643871600000), + ], + ], +) +def test_incremental_column_level_lineage( + gms_aspect: Optional[models.UpstreamLineageClass], + current_aspect: models.UpstreamLineageClass, + output_aspect: models.UpstreamLineageClass, +) -> None: + mock_graph = MagicMock() + mock_graph.get_aspect.return_value = gms_aspect + dataset_urn = make_dataset_urn(platform, "dataset1") + + processed_wus = auto_incremental_lineage( + graph=mock_graph, + incremental_lineage=True, + stream=[ + MetadataChangeProposalWrapper( + entityUrn=dataset_urn, + aspect=current_aspect, + systemMetadata=system_metadata, + ).as_workunit() + ], + ) + + wu: MetadataWorkUnit = next(iter(processed_wus)) + aspect = wu.get_aspect_of_type(models.UpstreamLineageClass) + assert aspect == output_aspect + + +def test_incremental_column_lineage_less_upstreams_in_gms_aspect( + tmp_path, pytestconfig +): + test_resources_dir = pytestconfig.rootpath / "tests/unit/api/source_helpers" + test_file = tmp_path / "incremental_cll_less_upstreams_in_gms_aspect.json" + golden_file = ( + test_resources_dir / "incremental_cll_less_upstreams_in_gms_aspect_golden.json" + ) + + urn = make_dataset_urn(platform, "dataset1") + aspect = base_cll_aspect() + + mock_graph = MagicMock() + mock_graph.get_aspect.return_value = make_lineage_aspect( + "dataset1", + upstreams=[make_dataset_urn(platform, name) for name in ["upstream1"]], + columns=["col_a", "col_b", "col_c"], + include_cll=True, + ) + + processed_wus = auto_incremental_lineage( + graph=mock_graph, + incremental_lineage=True, + stream=[ + MetadataChangeProposalWrapper( + entityUrn=urn, aspect=aspect, systemMetadata=system_metadata + ).as_workunit() + ], + ) + + write_metadata_file( + test_file, + [wu.metadata for wu in processed_wus], + ) + mce_helpers.check_golden_file( + pytestconfig=pytestconfig, output_path=test_file, golden_path=golden_file + ) + + +def test_incremental_column_lineage_more_upstreams_in_gms_aspect( + tmp_path, pytestconfig +): + test_resources_dir = pytestconfig.rootpath / "tests/unit/api/source_helpers" + test_file = tmp_path / "incremental_cll_more_upstreams_in_gms_aspect.json" + golden_file = ( + test_resources_dir / "incremental_cll_more_upstreams_in_gms_aspect_golden.json" + ) + + urn = make_dataset_urn(platform, "dataset1") + aspect = base_cll_aspect() + + mock_graph = MagicMock() + mock_graph.get_aspect.return_value = make_lineage_aspect( + "dataset1", + upstreams=[ + make_dataset_urn(platform, name) + for name in ["upstream1", "upstream2", "upstream3"] + ], + columns=["col_a", "col_b", "col_c"], + include_cll=True, + ) + + processed_wus = auto_incremental_lineage( + graph=mock_graph, + incremental_lineage=True, + stream=[ + MetadataChangeProposalWrapper( + entityUrn=urn, aspect=aspect, systemMetadata=system_metadata + ).as_workunit() + ], + ) + + write_metadata_file( + test_file, + [wu.metadata for wu in processed_wus], + ) + mce_helpers.check_golden_file( + pytestconfig=pytestconfig, output_path=test_file, golden_path=golden_file + ) diff --git 
a/metadata-ingestion/tests/unit/test_source_helpers.py b/metadata-ingestion/tests/unit/api/source_helpers/test_source_helpers.py similarity index 86% rename from metadata-ingestion/tests/unit/test_source_helpers.py rename to metadata-ingestion/tests/unit/api/source_helpers/test_source_helpers.py index b6ec6ebce240c..b667af8bb41e9 100644 --- a/metadata-ingestion/tests/unit/test_source_helpers.py +++ b/metadata-ingestion/tests/unit/api/source_helpers/test_source_helpers.py @@ -16,6 +16,7 @@ from datahub.ingestion.api.source_helpers import ( auto_browse_path_v2, auto_empty_dataset_usage_statistics, + auto_lowercase_urns, auto_status_aspect, auto_workunit, ) @@ -275,6 +276,75 @@ def test_auto_browse_path_v2_legacy_browse_path(telemetry_ping_mock): assert paths["platform,dataset-2,PROD)"] == _make_browse_path_entries(["something"]) +def test_auto_lowercase_aspects(): + mcws = auto_workunit( + [ + MetadataChangeProposalWrapper( + entityUrn=make_dataset_urn( + "bigquery", "myProject.mySchema.myTable", "PROD" + ), + aspect=models.DatasetKeyClass( + "urn:li:dataPlatform:bigquery", "myProject.mySchema.myTable", "PROD" + ), + ), + MetadataChangeProposalWrapper( + entityUrn="urn:li:container:008e111aa1d250dd52e0fd5d4b307b1a", + aspect=models.ContainerPropertiesClass( + name="test", + ), + ), + models.MetadataChangeEventClass( + proposedSnapshot=models.DatasetSnapshotClass( + urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,bigquery-Public-Data.Covid19_Aha.staffing,PROD)", + aspects=[ + models.DatasetPropertiesClass( + customProperties={ + "key": "value", + }, + ), + ], + ), + ), + ] + ) + + expected = [ + *list( + auto_workunit( + [ + MetadataChangeProposalWrapper( + entityUrn="urn:li:dataset:(urn:li:dataPlatform:bigquery,myproject.myschema.mytable,PROD)", + aspect=models.DatasetKeyClass( + "urn:li:dataPlatform:bigquery", + "myProject.mySchema.myTable", + "PROD", + ), + ), + MetadataChangeProposalWrapper( + entityUrn="urn:li:container:008e111aa1d250dd52e0fd5d4b307b1a", + aspect=models.ContainerPropertiesClass( + name="test", + ), + ), + models.MetadataChangeEventClass( + proposedSnapshot=models.DatasetSnapshotClass( + urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,bigquery-public-data.covid19_aha.staffing,PROD)", + aspects=[ + models.DatasetPropertiesClass( + customProperties={ + "key": "value", + }, + ), + ], + ), + ), + ] + ) + ), + ] + assert list(auto_lowercase_urns(mcws)) == expected + + @patch("datahub.ingestion.api.source_helpers.telemetry.telemetry_instance.ping") def test_auto_browse_path_v2_container_over_legacy_browse_path(telemetry_ping_mock): structure = {"a": {"b": ["c"]}} diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_create_view_with_cte.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_create_view_with_cte.json index e50d944ce72e3..f0175b4dc8892 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_create_view_with_cte.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_create_view_with_cte.json @@ -12,7 +12,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:bigquery,my-proj-2.dataset.my_view,PROD)", - "column": "col5" + "column": "col5", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "TEXT" }, "upstreams": [ { @@ -24,7 +30,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:bigquery,my-proj-2.dataset.my_view,PROD)", - "column": "col1" + "column": "col1", + "column_type": { + 
"type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "TEXT" }, "upstreams": [ { @@ -36,7 +48,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:bigquery,my-proj-2.dataset.my_view,PROD)", - "column": "col2" + "column": "col2", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "TEXT" }, "upstreams": [ { @@ -48,7 +66,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:bigquery,my-proj-2.dataset.my_view,PROD)", - "column": "col3" + "column": "col3", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "TEXT" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_from_sharded_table_wildcard.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_from_sharded_table_wildcard.json index 78591286feb50..b7df5444987f2 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_from_sharded_table_wildcard.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_from_sharded_table_wildcard.json @@ -8,7 +8,13 @@ { "downstream": { "table": null, - "column": "col1" + "column": "col1", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "TEXT" }, "upstreams": [ { @@ -20,7 +26,13 @@ { "downstream": { "table": null, - "column": "col2" + "column": "col2", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "TEXT" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_nested_subqueries.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_nested_subqueries.json index 0e93d31fbb6a6..67e306bebf545 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_nested_subqueries.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_nested_subqueries.json @@ -8,7 +8,13 @@ { "downstream": { "table": null, - "column": "col1" + "column": "col1", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "TEXT" }, "upstreams": [ { @@ -20,7 +26,13 @@ { "downstream": { "table": null, - "column": "col2" + "column": "col2", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "TEXT" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_sharded_table_normalization.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_sharded_table_normalization.json index 78591286feb50..b7df5444987f2 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_sharded_table_normalization.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_sharded_table_normalization.json @@ -8,7 +8,13 @@ { "downstream": { "table": null, - "column": "col1" + "column": "col1", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "TEXT" }, "upstreams": [ { @@ -20,7 +26,13 @@ { "downstream": { "table": null, - "column": "col2" + "column": "col2", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "TEXT" }, "upstreams": [ { diff --git 
a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_star_with_replace.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_star_with_replace.json index 17a801a63e3ff..b393b2445d6c4 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_star_with_replace.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_star_with_replace.json @@ -10,7 +10,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:bigquery,my-project.my-dataset.test_table,PROD)", - "column": "col1" + "column": "col1", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "TEXT" }, "upstreams": [ { @@ -22,7 +28,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:bigquery,my-project.my-dataset.test_table,PROD)", - "column": "col2" + "column": "col2", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "TEXT" }, "upstreams": [ { @@ -34,7 +46,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:bigquery,my-project.my-dataset.test_table,PROD)", - "column": "something" + "column": "something", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "TEXT" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_view_from_union.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_view_from_union.json index fd8a586ac74ac..53fb94300e804 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_view_from_union.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_view_from_union.json @@ -11,7 +11,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:bigquery,my_view,PROD)", - "column": "col1" + "column": "col1", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "TEXT" }, "upstreams": [ { @@ -27,7 +33,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:bigquery,my_view,PROD)", - "column": "col2" + "column": "col2", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "TEXT" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_view_as_select.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_view_as_select.json index 1ca56840531e4..ff452467aa5bd 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_view_as_select.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_view_as_select.json @@ -10,7 +10,9 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:oracle,vsal,PROD)", - "column": "Department" + "column": "Department", + "column_type": null, + "native_column_type": null }, "upstreams": [ { @@ -22,14 +24,22 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:oracle,vsal,PROD)", - "column": "Employees" + "column": "Employees", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "BIGINT" }, "upstreams": [] }, { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:oracle,vsal,PROD)", - "column": "Salary" + "column": "Salary", + "column_type": null, + "native_column_type": null }, "upstreams": [ { diff --git 
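All of the golden files touched above gain the same two fields on each downstream column reference: column_type, serialized as a Pegasus SchemaFieldDataType, and native_column_type, the dialect-level type string. A hedged sketch of the corresponding in-memory object, assuming DownstreamColumnRef (imported from datahub.utilities.sqlglot_lineage later in this diff) exposes both new fields:

```python
import datahub.metadata.schema_classes as models
from datahub.utilities.sqlglot_lineage import DownstreamColumnRef

# Mirrors one entry from the goldens above; the two new keyword arguments are
# an assumption based on the serialized JSON, not a confirmed constructor signature.
downstream = DownstreamColumnRef(
    table=None,  # goldens carry a dataset URN here when the output table is known
    column="col1",
    column_type=models.SchemaFieldDataTypeClass(type=models.StringTypeClass()),
    native_column_type="TEXT",
)
```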
a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_expand_select_star_basic.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_expand_select_star_basic.json index e241bdd08e243..eecb2265eaec5 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_expand_select_star_basic.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_expand_select_star_basic.json @@ -8,7 +8,13 @@ { "downstream": { "table": null, - "column": "total_agg" + "column": "total_agg", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "DOUBLE" }, "upstreams": [ { @@ -20,7 +26,13 @@ { "downstream": { "table": null, - "column": "orderkey" + "column": "orderkey", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "DECIMAL" }, "upstreams": [ { @@ -32,7 +44,13 @@ { "downstream": { "table": null, - "column": "custkey" + "column": "custkey", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "DECIMAL" }, "upstreams": [ { @@ -44,7 +62,13 @@ { "downstream": { "table": null, - "column": "orderstatus" + "column": "orderstatus", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "TEXT" }, "upstreams": [ { @@ -56,7 +80,13 @@ { "downstream": { "table": null, - "column": "totalprice" + "column": "totalprice", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "FLOAT" }, "upstreams": [ { @@ -68,7 +98,13 @@ { "downstream": { "table": null, - "column": "orderdate" + "column": "orderdate", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.DateType": {} + } + }, + "native_column_type": "DATE" }, "upstreams": [ { @@ -80,7 +116,13 @@ { "downstream": { "table": null, - "column": "orderpriority" + "column": "orderpriority", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "TEXT" }, "upstreams": [ { @@ -92,7 +134,13 @@ { "downstream": { "table": null, - "column": "clerk" + "column": "clerk", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "TEXT" }, "upstreams": [ { @@ -104,7 +152,13 @@ { "downstream": { "table": null, - "column": "shippriority" + "column": "shippriority", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "DECIMAL" }, "upstreams": [ { @@ -116,7 +170,13 @@ { "downstream": { "table": null, - "column": "comment" + "column": "comment", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "TEXT" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_insert_as_select.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_insert_as_select.json index d7264fd2db6b2..326db47e7ab33 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_insert_as_select.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_insert_as_select.json @@ -18,21 +18,27 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:hive,query72,PROD)", - "column": "i_item_desc" + "column": "i_item_desc", + "column_type": null, + "native_column_type": null }, "upstreams": [] }, { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:hive,query72,PROD)", - 
"column": "w_warehouse_name" + "column": "w_warehouse_name", + "column_type": null, + "native_column_type": null }, "upstreams": [] }, { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:hive,query72,PROD)", - "column": "d_week_seq" + "column": "d_week_seq", + "column_type": null, + "native_column_type": null }, "upstreams": [ { @@ -44,7 +50,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:hive,query72,PROD)", - "column": "no_promo" + "column": "no_promo", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "BIGINT" }, "upstreams": [ { @@ -56,7 +68,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:hive,query72,PROD)", - "column": "promo" + "column": "promo", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "BIGINT" }, "upstreams": [ { @@ -68,7 +86,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:hive,query72,PROD)", - "column": "total_cnt" + "column": "total_cnt", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "BIGINT" }, "upstreams": [] } diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_ambiguous_column_no_schema.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_ambiguous_column_no_schema.json index 10f5ee20b0c1f..b5fd5eebeb1b1 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_ambiguous_column_no_schema.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_ambiguous_column_no_schema.json @@ -9,21 +9,27 @@ { "downstream": { "table": null, - "column": "a" + "column": "a", + "column_type": null, + "native_column_type": null }, "upstreams": [] }, { "downstream": { "table": null, - "column": "b" + "column": "b", + "column_type": null, + "native_column_type": null }, "upstreams": [] }, { "downstream": { "table": null, - "column": "c" + "column": "c", + "column_type": null, + "native_column_type": null }, "upstreams": [] } diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_count.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_count.json index 9f6eeae46c294..a67c944822138 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_count.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_count.json @@ -8,7 +8,13 @@ { "downstream": { "table": null, - "column": "COUNT(`fact_complaint_snapshot`.`etl_data_dt_id`)" + "column": "COUNT(`fact_complaint_snapshot`.`etl_data_dt_id`)", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "BIGINT" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_struct_subfields.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_struct_subfields.json index 109de96180422..5ad847e252497 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_struct_subfields.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_struct_subfields.json @@ -8,7 +8,13 @@ { "downstream": { "table": null, - "column": "post_id" + "column": "post_id", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "DECIMAL" }, "upstreams": [ { @@ -20,7 +26,9 @@ { "downstream": { "table": null, - "column": "id" + "column": "id", + 
"column_type": null, + "native_column_type": null }, "upstreams": [ { @@ -32,7 +40,9 @@ { "downstream": { "table": null, - "column": "min_metric" + "column": "min_metric", + "column_type": null, + "native_column_type": null }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_union.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_union.json index 2340b2e95b0d0..902aa010c8afc 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_union.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_union.json @@ -9,14 +9,26 @@ { "downstream": { "table": null, - "column": "label" + "column": "label", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "VARCHAR" }, "upstreams": [] }, { "downstream": { "table": null, - "column": "total_agg" + "column": "total_agg", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "DOUBLE" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_max.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_max.json index 326c07d332c26..6ea88f45847ce 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_max.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_max.json @@ -8,7 +8,9 @@ { "downstream": { "table": null, - "column": "max_col" + "column": "max_col", + "column_type": null, + "native_column_type": null }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_with_ctes.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_with_ctes.json index 3e02314d6e8c3..67e9fd2d21a0e 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_with_ctes.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_with_ctes.json @@ -9,7 +9,9 @@ { "downstream": { "table": null, - "column": "COL1" + "column": "COL1", + "column_type": null, + "native_column_type": null }, "upstreams": [ { @@ -21,7 +23,9 @@ { "downstream": { "table": null, - "column": "COL3" + "column": "COL3", + "column_type": null, + "native_column_type": null }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_with_full_col_name.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_with_full_col_name.json index c12ad23b2f03b..6ee3d2e61c39b 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_with_full_col_name.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_with_full_col_name.json @@ -8,7 +8,13 @@ { "downstream": { "table": null, - "column": "post_id" + "column": "post_id", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "DECIMAL" }, "upstreams": [ { @@ -20,7 +26,9 @@ { "downstream": { "table": null, - "column": "id" + "column": "id", + "column_type": null, + "native_column_type": null }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_case_statement.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_case_statement.json index 64cd80e9a2d69..a876824127ec1 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_case_statement.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_case_statement.json @@ -8,7 +8,13 @@ { 
"downstream": { "table": null, - "column": "total_price_category" + "column": "total_price_category", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "VARCHAR" }, "upstreams": [ { @@ -20,7 +26,13 @@ { "downstream": { "table": null, - "column": "total_price_success" + "column": "total_price_success", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "FLOAT" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_column_cast.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_column_cast.json new file mode 100644 index 0000000000000..7545e2b3269dc --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_column_cast.json @@ -0,0 +1,63 @@ +{ + "query_type": "SELECT", + "in_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)" + ], + "out_tables": [], + "column_lineage": [ + { + "downstream": { + "table": null, + "column": "orderkey", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "DECIMAL(20, 0)" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)", + "column": "o_orderkey" + } + ] + }, + { + "downstream": { + "table": null, + "column": "total_cast_int", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "INT" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)", + "column": "o_totalprice" + } + ] + }, + { + "downstream": { + "table": null, + "column": "total_cast_float", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "DECIMAL(16, 4)" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)", + "column": "o_totalprice" + } + ] + } + ] +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_column_normalization.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_column_normalization.json index 7b22a46757e39..84e6b053000f1 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_column_normalization.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_column_normalization.json @@ -8,7 +8,13 @@ { "downstream": { "table": null, - "column": "total_agg" + "column": "total_agg", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "DOUBLE" }, "upstreams": [ { @@ -20,7 +26,13 @@ { "downstream": { "table": null, - "column": "total_avg" + "column": "total_avg", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "DOUBLE" }, "upstreams": [ { @@ -32,7 +44,13 @@ { "downstream": { "table": null, - "column": "total_min" + "column": "total_min", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "FLOAT" }, "upstreams": [ { @@ -44,7 +62,13 @@ { "downstream": { "table": null, - "column": "total_max" + "column": "total_max", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + 
"native_column_type": "FLOAT" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_ctas_column_normalization.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_ctas_column_normalization.json index c912d99a3a8a3..39c94cf83c561 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_ctas_column_normalization.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_ctas_column_normalization.json @@ -10,7 +10,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders_normalized,PROD)", - "column": "Total_Agg" + "column": "Total_Agg", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "DOUBLE" }, "upstreams": [ { @@ -22,7 +28,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders_normalized,PROD)", - "column": "total_avg" + "column": "total_avg", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "DOUBLE" }, "upstreams": [ { @@ -34,7 +46,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders_normalized,PROD)", - "column": "TOTAL_MIN" + "column": "TOTAL_MIN", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "FLOAT" }, "upstreams": [ { @@ -46,7 +64,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders_normalized,PROD)", - "column": "total_max" + "column": "total_max", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "FLOAT" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_default_normalization.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_default_normalization.json index 2af308ec60623..dbf5b1b9a4453 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_default_normalization.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_default_normalization.json @@ -11,7 +11,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.active_customer_ltv,PROD)", - "column": "user_fk" + "column": "user_fk", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "DECIMAL(38, 0)" }, "upstreams": [ { @@ -23,7 +29,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.active_customer_ltv,PROD)", - "column": "email" + "column": "email", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "VARCHAR(16777216)" }, "upstreams": [ { @@ -35,7 +47,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.active_customer_ltv,PROD)", - "column": "last_purchase_date" + "column": "last_purchase_date", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.DateType": {} + } + }, + "native_column_type": "DATE" }, "upstreams": [ { @@ -47,7 +65,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.active_customer_ltv,PROD)", - "column": 
"lifetime_purchase_amount" + "column": "lifetime_purchase_amount", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "DECIMAL" }, "upstreams": [ { @@ -59,7 +83,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.active_customer_ltv,PROD)", - "column": "lifetime_purchase_count" + "column": "lifetime_purchase_count", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "BIGINT" }, "upstreams": [ { @@ -71,7 +101,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.active_customer_ltv,PROD)", - "column": "average_purchase_amount" + "column": "average_purchase_amount", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "DECIMAL" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_teradata_default_normalization.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_teradata_default_normalization.json new file mode 100644 index 0000000000000..b0351a7e07ad2 --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_teradata_default_normalization.json @@ -0,0 +1,38 @@ +{ + "query_type": "CREATE", + "in_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:teradata,myteradata.demo_user.pima_patient_diagnoses,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:teradata,myteradata.demo_user.pima_patient_features,PROD)" + ], + "out_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:teradata,myteradata.demo_user.test_lineage2,PROD)" + ], + "column_lineage": [ + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:teradata,myteradata.demo_user.test_lineage2,PROD)", + "column": "PatientId", + "native_column_type": "INTEGER()" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:teradata,myteradata.demo_user.pima_patient_diagnoses,PROD)", + "column": "PatientId" + } + ] + }, + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:teradata,myteradata.demo_user.test_lineage2,PROD)", + "column": "BMI", + "native_column_type": "FLOAT()" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:teradata,myteradata.demo_user.pima_patient_features,PROD)", + "column": "BMI" + } + ] + } + ] +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py index 2a965a9bb1e61..059add8db67e4 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py @@ -608,4 +608,67 @@ def test_snowflake_default_normalization(): ) +def test_snowflake_column_cast(): + assert_sql_result( + """ +SELECT + o.o_orderkey::NUMBER(20,0) as orderkey, + CAST(o.o_totalprice AS INT) as total_cast_int, + CAST(o.o_totalprice AS NUMBER(16,4)) as total_cast_float +FROM snowflake_sample_data.tpch_sf1.orders o +LIMIT 10 +""", + dialect="snowflake", + schemas={ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)": { + "orderkey": "NUMBER(38,0)", + "totalprice": "NUMBER(12,2)", + }, + }, + expected_file=RESOURCE_DIR / "test_snowflake_column_cast.json", + ) + + # TODO: Add a test for setting platform_instance or env + + +def test_teradata_default_normalization(): + assert_sql_result( + """ 
+create table demo_user.test_lineage2 as + ( + select + ppd.PatientId, + ppf.bmi + from + demo_user.pima_patient_features ppf + join demo_user.pima_patient_diagnoses ppd on + ppd.PatientId = ppf.PatientId + ) with data; +""", + dialect="teradata", + default_schema="dbc", + platform_instance="myteradata", + schemas={ + "urn:li:dataset:(urn:li:dataPlatform:teradata,myteradata.demo_user.pima_patient_diagnoses,PROD)": { + "HasDiabetes": "INTEGER()", + "PatientId": "INTEGER()", + }, + "urn:li:dataset:(urn:li:dataPlatform:teradata,myteradata.demo_user.pima_patient_features,PROD)": { + "Age": "INTEGER()", + "BMI": "FLOAT()", + "BloodP": "INTEGER()", + "DiPedFunc": "FLOAT()", + "NumTimesPrg": "INTEGER()", + "PatientId": "INTEGER()", + "PlGlcConc": "INTEGER()", + "SkinThick": "INTEGER()", + "TwoHourSerIns": "INTEGER()", + }, + "urn:li:dataset:(urn:li:dataPlatform:teradata,myteradata.demo_user.test_lineage2,PROD)": { + "BMI": "FLOAT()", + "PatientId": "INTEGER()", + }, + }, + expected_file=RESOURCE_DIR / "test_teradata_default_normalization.json", + ) diff --git a/metadata-ingestion/tests/unit/test_bigquery_source.py b/metadata-ingestion/tests/unit/test_bigquery_source.py index e9e91361f49f4..5a11a933c8595 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_source.py +++ b/metadata-ingestion/tests/unit/test_bigquery_source.py @@ -765,11 +765,14 @@ def test_gen_view_dataset_workunits( ("project.dataset.table_20231215", "project.dataset.table", "20231215"), ("project.dataset.table_2023", "project.dataset.table_2023", None), # incorrectly handled special case where dataset itself is a sharded table if full name is specified - ("project.dataset.20231215", "project.dataset.20231215", None), + ("project.dataset.20231215", "project.dataset.20231215", "20231215"), + ("project1.dataset2.20231215", "project1.dataset2.20231215", "20231215"), # Cases with Just the table name as input ("table", "table", None), - ("table20231215", "table20231215", None), + ("table20231215", "table", "20231215"), ("table_20231215", "table", "20231215"), + ("table2_20231215", "table2", "20231215"), + ("table220231215", "table220231215", None), ("table_1624046611000_name", "table_1624046611000_name", None), ("table_1624046611000", "table_1624046611000", None), # Special case where dataset itself is a sharded table @@ -801,7 +804,6 @@ def test_get_table_and_shard_default( ("project.dataset.2023", "project.dataset.2023", None), # Cases with Just the table name as input ("table", "table", None), - ("table20231215", "table20231215", None), ("table_20231215", "table", "20231215"), ("table_2023", "table", "2023"), ("table_1624046611000_name", "table_1624046611000_name", None), @@ -842,7 +844,7 @@ def test_get_table_and_shard_custom_shard_pattern( "project.dataset.table_1624046611000_name", ), ("project.dataset.table_1624046611000", "project.dataset.table_1624046611000"), - ("project.dataset.table20231215", "project.dataset.table20231215"), + ("project.dataset.table20231215", "project.dataset.table"), ("project.dataset.table_*", "project.dataset.table"), ("project.dataset.table_2023*", "project.dataset.table"), ("project.dataset.table_202301*", "project.dataset.table"), diff --git a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py index 4cf42da4395f9..44fd840f28d59 100644 --- a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py +++ b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py @@ -144,10 +144,10 @@ def 
test_bigquery_table_sanitasitation(): assert new_table_ref.dataset == "dataset-4567" table_ref = BigQueryTableRef( - BigqueryTableIdentifier("project-1234", "dataset-4567", "foo_20222110") + BigqueryTableIdentifier("project-1234", "dataset-4567", "foo_20221210") ) new_table_identifier = table_ref.table_identifier - assert new_table_identifier.table == "foo_20222110" + assert new_table_identifier.table == "foo_20221210" assert new_table_identifier.is_sharded_table() assert new_table_identifier.get_table_display_name() == "foo" assert new_table_identifier.project_id == "project-1234" diff --git a/metadata-ingestion/tests/unit/test_redshift_lineage.py b/metadata-ingestion/tests/unit/test_redshift_lineage.py index c7d6ac18e044c..db5af3a71efb9 100644 --- a/metadata-ingestion/tests/unit/test_redshift_lineage.py +++ b/metadata-ingestion/tests/unit/test_redshift_lineage.py @@ -1,6 +1,8 @@ +from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.redshift.config import RedshiftConfig from datahub.ingestion.source.redshift.lineage import RedshiftLineageExtractor from datahub.ingestion.source.redshift.report import RedshiftReport +from datahub.utilities.sqlglot_lineage import ColumnLineageInfo, DownstreamColumnRef def test_get_sources_from_query(): @@ -10,14 +12,20 @@ def test_get_sources_from_query(): test_query = """ select * from my_schema.my_table """ - lineage_extractor = RedshiftLineageExtractor(config, report) - lineage_datasets = lineage_extractor._get_sources_from_query( + lineage_extractor = RedshiftLineageExtractor( + config, report, PipelineContext(run_id="foo") + ) + lineage_datasets, _ = lineage_extractor._get_sources_from_query( db_name="test", query=test_query ) assert len(lineage_datasets) == 1 lineage = lineage_datasets[0] - assert lineage.path == "test.my_schema.my_table" + + assert ( + lineage.urn + == "urn:li:dataset:(urn:li:dataPlatform:redshift,test.my_schema.my_table,PROD)" + ) def test_get_sources_from_query_with_only_table_name(): @@ -27,14 +35,20 @@ def test_get_sources_from_query_with_only_table_name(): test_query = """ select * from my_table """ - lineage_extractor = RedshiftLineageExtractor(config, report) - lineage_datasets = lineage_extractor._get_sources_from_query( + lineage_extractor = RedshiftLineageExtractor( + config, report, PipelineContext(run_id="foo") + ) + lineage_datasets, _ = lineage_extractor._get_sources_from_query( db_name="test", query=test_query ) assert len(lineage_datasets) == 1 lineage = lineage_datasets[0] - assert lineage.path == "test.public.my_table" + + assert ( + lineage.urn + == "urn:li:dataset:(urn:li:dataPlatform:redshift,test.public.my_table,PROD)" + ) def test_get_sources_from_query_with_database(): @@ -44,14 +58,20 @@ def test_get_sources_from_query_with_database(): test_query = """ select * from test.my_schema.my_table """ - lineage_extractor = RedshiftLineageExtractor(config, report) - lineage_datasets = lineage_extractor._get_sources_from_query( + lineage_extractor = RedshiftLineageExtractor( + config, report, PipelineContext(run_id="foo") + ) + lineage_datasets, _ = lineage_extractor._get_sources_from_query( db_name="test", query=test_query ) assert len(lineage_datasets) == 1 lineage = lineage_datasets[0] - assert lineage.path == "test.my_schema.my_table" + + assert ( + lineage.urn + == "urn:li:dataset:(urn:li:dataPlatform:redshift,test.my_schema.my_table,PROD)" + ) def test_get_sources_from_query_with_non_default_database(): @@ -61,14 +81,20 @@ def 
test_get_sources_from_query_with_non_default_database(): test_query = """ select * from test2.my_schema.my_table """ - lineage_extractor = RedshiftLineageExtractor(config, report) - lineage_datasets = lineage_extractor._get_sources_from_query( + lineage_extractor = RedshiftLineageExtractor( + config, report, PipelineContext(run_id="foo") + ) + lineage_datasets, _ = lineage_extractor._get_sources_from_query( db_name="test", query=test_query ) assert len(lineage_datasets) == 1 lineage = lineage_datasets[0] - assert lineage.path == "test2.my_schema.my_table" + + assert ( + lineage.urn + == "urn:li:dataset:(urn:li:dataPlatform:redshift,test2.my_schema.my_table,PROD)" + ) def test_get_sources_from_query_with_only_table(): @@ -78,27 +104,48 @@ def test_get_sources_from_query_with_only_table(): test_query = """ select * from my_table """ - lineage_extractor = RedshiftLineageExtractor(config, report) - lineage_datasets = lineage_extractor._get_sources_from_query( + lineage_extractor = RedshiftLineageExtractor( + config, report, PipelineContext(run_id="foo") + ) + lineage_datasets, _ = lineage_extractor._get_sources_from_query( db_name="test", query=test_query ) assert len(lineage_datasets) == 1 lineage = lineage_datasets[0] - assert lineage.path == "test.public.my_table" + + assert ( + lineage.urn + == "urn:li:dataset:(urn:li:dataPlatform:redshift,test.public.my_table,PROD)" + ) -def test_get_sources_from_query_with_four_part_table_should_throw_exception(): +def test_cll(): config = RedshiftConfig(host_port="localhost:5439", database="test") report = RedshiftReport() test_query = """ - select * from database.schema.my_table.test + select a,b,c from db.public.customer inner join db.public.order on db.public.customer.id = db.public.order.customer_id """ - lineage_extractor = RedshiftLineageExtractor(config, report) - try: - lineage_extractor._get_sources_from_query(db_name="test", query=test_query) - except ValueError: - pass - - assert f"{test_query} should have thrown a ValueError exception but it didn't" + lineage_extractor = RedshiftLineageExtractor( + config, report, PipelineContext(run_id="foo") + ) + _, cll = lineage_extractor._get_sources_from_query(db_name="db", query=test_query) + + assert cll == [ + ColumnLineageInfo( + downstream=DownstreamColumnRef(table=None, column="a"), + upstreams=[], + logic=None, + ), + ColumnLineageInfo( + downstream=DownstreamColumnRef(table=None, column="b"), + upstreams=[], + logic=None, + ), + ColumnLineageInfo( + downstream=DownstreamColumnRef(table=None, column="c"), + upstreams=[], + logic=None, + ), + ] diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionInfo.pdl index ae2a58028057b..e161270145a88 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionInfo.pdl @@ -32,6 +32,11 @@ record AssertionInfo includes CustomProperties, ExternalReference { */ VOLUME + /** + * A raw SQL-statement based assertion + */ + SQL + /** * A schema or structural assertion. * @@ -56,7 +61,12 @@ record AssertionInfo includes CustomProperties, ExternalReference { volumeAssertion: optional VolumeAssertionInfo /** - * An schema Assertion definition. This field is populated when the type is DATASET_SCHEMA + * A SQL Assertion definition. This field is populated when the type is SQL. + */ + sqlAssertion: optional SqlAssertionInfo + + /** + * An schema Assertion definition. 
This field is populated when the type is DATA_SCHEMA */ schemaAssertion: optional SchemaAssertionInfo @@ -67,4 +77,9 @@ record AssertionInfo includes CustomProperties, ExternalReference { * the platform where it was ingested from. */ source: optional AssertionSource + + /** + * An optional human-readable description of the assertion + */ + description: optional string } \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/SqlAssertionInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/SqlAssertionInfo.pdl new file mode 100644 index 0000000000000..f6ce738252f35 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/SqlAssertionInfo.pdl @@ -0,0 +1,67 @@ +namespace com.linkedin.assertion + +import com.linkedin.common.Urn +import com.linkedin.dataset.DatasetFilter + +/** +* Attributes defining a SQL Assertion +*/ +record SqlAssertionInfo { + /** + * The type of the SQL assertion being monitored. + */ + @Searchable = {} + type: enum SqlAssertionType { + /** + * A SQL Metric Assertion, e.g. one based on a numeric value returned by an arbitrary SQL query. + */ + METRIC + /** + * A SQL assertion that is evaluated against the CHANGE in a metric assertion + * over time. + */ + METRIC_CHANGE + } + + /** + * The entity targeted by this SQL check. + */ + @Searchable = { + "fieldType": "URN" + } + @Relationship = { + "name": "Asserts", + "entityTypes": [ "dataset" ] + } + entity: Urn + + /** + * The SQL statement to be executed when evaluating the assertion (or computing the metric). + * This should be a valid and complete statement, executable by itself. + * + * Usually this should be a SELECT query statement. + */ + statement: string + + /** + * The type of the value used to evaluate the assertion: a fixed absolute value or a relative percentage. + * This value is required if the type is METRIC_CHANGE. + */ + changeType: optional AssertionValueChangeType + + /** + * The operator you'd like to apply to the result of the SQL query. + * + * Note that at this time, only numeric operators are valid inputs: + * GREATER_THAN, GREATER_THAN_OR_EQUAL_TO, EQUAL_TO, LESS_THAN, LESS_THAN_OR_EQUAL_TO, + * BETWEEN. + */ + operator: AssertionStdOperator + + /** + * The parameters you'd like to provide as input to the operator. + * + * Note that only numeric parameter types are valid inputs: NUMBER. 
+ */ + parameters: AssertionStdParameters +} \ No newline at end of file diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/AuthorizerChain.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/AuthorizerChain.java index d62c37160f816..f8eca541e1efb 100644 --- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/AuthorizerChain.java +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/AuthorizerChain.java @@ -82,7 +82,7 @@ public AuthorizationResult authorize(@Nonnull final AuthorizationRequest request } @Override - public AuthorizedActors authorizedActors(String privilege, Optional resourceSpec) { + public AuthorizedActors authorizedActors(String privilege, Optional resourceSpec) { if (this.authorizers.isEmpty()) { return null; } diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java index f653ccf72cf54..4553139e3ca54 100644 --- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java @@ -8,6 +8,8 @@ import com.linkedin.entity.client.EntityClient; import com.linkedin.metadata.authorization.PoliciesConfig; import com.linkedin.policy.DataHubPolicyInfo; + +import java.net.URISyntaxException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -55,7 +57,7 @@ public enum AuthorizationMode { private final ScheduledExecutorService _refreshExecutorService = Executors.newScheduledThreadPool(1); private final PolicyRefreshRunnable _policyRefreshRunnable; private final PolicyEngine _policyEngine; - private ResourceSpecResolver _resourceSpecResolver; + private EntitySpecResolver _entitySpecResolver; private AuthorizationMode _mode; public static final String ALL = "ALL"; @@ -76,7 +78,7 @@ public DataHubAuthorizer( @Override public void init(@Nonnull Map authorizerConfig, @Nonnull AuthorizerContext ctx) { // Pass. No static config. - _resourceSpecResolver = Objects.requireNonNull(ctx.getResourceSpecResolver()); + _entitySpecResolver = Objects.requireNonNull(ctx.getEntitySpecResolver()); } public AuthorizationResult authorize(@Nonnull final AuthorizationRequest request) { @@ -86,7 +88,7 @@ public AuthorizationResult authorize(@Nonnull final AuthorizationRequest request return new AuthorizationResult(request, AuthorizationResult.Type.ALLOW, null); } - Optional resolvedResourceSpec = request.getResourceSpec().map(_resourceSpecResolver::resolve); + Optional resolvedResourceSpec = request.getResourceSpec().map(_entitySpecResolver::resolve); // 1. Fetch the policies relevant to the requested privilege. final List policiesToEvaluate = _policyCache.getOrDefault(request.getPrivilege(), new ArrayList<>()); @@ -102,14 +104,17 @@ public AuthorizationResult authorize(@Nonnull final AuthorizationRequest request return new AuthorizationResult(request, AuthorizationResult.Type.DENY, null); } - public List getGrantedPrivileges(final String actorUrn, final Optional resourceSpec) { + public List getGrantedPrivileges(final String actor, final Optional resourceSpec) { // 1. 
Fetch all policies final List policiesToEvaluate = _policyCache.getOrDefault(ALL, new ArrayList<>()); - Optional resolvedResourceSpec = resourceSpec.map(_resourceSpecResolver::resolve); + Urn actorUrn = UrnUtils.getUrn(actor); + final ResolvedEntitySpec resolvedActorSpec = _entitySpecResolver.resolve(new EntitySpec(actorUrn.getEntityType(), actor)); + + Optional resolvedResourceSpec = resourceSpec.map(_entitySpecResolver::resolve); - return _policyEngine.getGrantedPrivileges(policiesToEvaluate, UrnUtils.getUrn(actorUrn), resolvedResourceSpec); + return _policyEngine.getGrantedPrivileges(policiesToEvaluate, resolvedActorSpec, resolvedResourceSpec); } /** @@ -118,11 +123,11 @@ public List getGrantedPrivileges(final String actorUrn, final Optional resourceSpec) { + final Optional resourceSpec) { // Step 1: Find policies granting the privilege. final List policiesToEvaluate = _policyCache.getOrDefault(privilege, new ArrayList<>()); - Optional resolvedResourceSpec = resourceSpec.map(_resourceSpecResolver::resolve); + Optional resolvedResourceSpec = resourceSpec.map(_entitySpecResolver::resolve); final List authorizedUsers = new ArrayList<>(); final List authorizedGroups = new ArrayList<>(); @@ -180,19 +185,36 @@ private boolean isSystemRequest(final AuthorizationRequest request, final Authen /** * Returns true if a policy grants the requested privilege for a given actor and resource. */ - private boolean isRequestGranted(final DataHubPolicyInfo policy, final AuthorizationRequest request, final Optional resourceSpec) { + private boolean isRequestGranted(final DataHubPolicyInfo policy, final AuthorizationRequest request, final Optional resourceSpec) { if (AuthorizationMode.ALLOW_ALL.equals(mode())) { return true; } + + Optional actorUrn = getUrnFromRequestActor(request.getActorUrn()); + if (actorUrn.isEmpty()) { + return false; + } + + final ResolvedEntitySpec resolvedActorSpec = _entitySpecResolver.resolve( + new EntitySpec(actorUrn.get().getEntityType(), request.getActorUrn())); final PolicyEngine.PolicyEvaluationResult result = _policyEngine.evaluatePolicy( policy, - request.getActorUrn(), + resolvedActorSpec, request.getPrivilege(), resourceSpec ); return result.isGranted(); } + private Optional getUrnFromRequestActor(String actor) { + try { + return Optional.of(Urn.createFromString(actor)); + } catch (URISyntaxException e) { + log.error(String.format("Failed to bind actor %s to an URN. Actors must be URNs. Denying the authorization request", actor)); + return Optional.empty(); + } + } + /** * A {@link Runnable} used to periodically fetch a new instance of the policies Cache. 
* diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DefaultEntitySpecResolver.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DefaultEntitySpecResolver.java new file mode 100644 index 0000000000000..4ad14ed59c9c0 --- /dev/null +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DefaultEntitySpecResolver.java @@ -0,0 +1,40 @@ +package com.datahub.authorization; + +import com.datahub.authorization.fieldresolverprovider.DataPlatformInstanceFieldResolverProvider; +import com.datahub.authorization.fieldresolverprovider.EntityTypeFieldResolverProvider; +import com.datahub.authorization.fieldresolverprovider.OwnerFieldResolverProvider; +import com.datahub.authentication.Authentication; +import com.datahub.authorization.fieldresolverprovider.DomainFieldResolverProvider; +import com.datahub.authorization.fieldresolverprovider.EntityUrnFieldResolverProvider; +import com.datahub.authorization.fieldresolverprovider.EntityFieldResolverProvider; +import com.datahub.authorization.fieldresolverprovider.GroupMembershipFieldResolverProvider; +import com.google.common.collect.ImmutableList; +import com.linkedin.entity.client.EntityClient; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + + +public class DefaultEntitySpecResolver implements EntitySpecResolver { + private final List _entityFieldResolverProviders; + + public DefaultEntitySpecResolver(Authentication systemAuthentication, EntityClient entityClient) { + _entityFieldResolverProviders = + ImmutableList.of(new EntityTypeFieldResolverProvider(), new EntityUrnFieldResolverProvider(), + new DomainFieldResolverProvider(entityClient, systemAuthentication), + new OwnerFieldResolverProvider(entityClient, systemAuthentication), + new DataPlatformInstanceFieldResolverProvider(entityClient, systemAuthentication), + new GroupMembershipFieldResolverProvider(entityClient, systemAuthentication)); + } + + @Override + public ResolvedEntitySpec resolve(EntitySpec entitySpec) { + return new ResolvedEntitySpec(entitySpec, getFieldResolvers(entitySpec)); + } + + private Map getFieldResolvers(EntitySpec entitySpec) { + return _entityFieldResolverProviders.stream() + .collect(Collectors.toMap(EntityFieldResolverProvider::getFieldType, + hydrator -> hydrator.getFieldResolver(entitySpec))); + } +} diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DefaultResourceSpecResolver.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DefaultResourceSpecResolver.java deleted file mode 100644 index cd4e0b0967829..0000000000000 --- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DefaultResourceSpecResolver.java +++ /dev/null @@ -1,36 +0,0 @@ -package com.datahub.authorization; - -import com.datahub.authorization.fieldresolverprovider.EntityTypeFieldResolverProvider; -import com.datahub.authorization.fieldresolverprovider.OwnerFieldResolverProvider; -import com.datahub.authentication.Authentication; -import com.datahub.authorization.fieldresolverprovider.DomainFieldResolverProvider; -import com.datahub.authorization.fieldresolverprovider.EntityUrnFieldResolverProvider; -import com.datahub.authorization.fieldresolverprovider.ResourceFieldResolverProvider; -import com.google.common.collect.ImmutableList; -import com.linkedin.entity.client.EntityClient; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - - -public class DefaultResourceSpecResolver implements 
ResourceSpecResolver { - private final List _resourceFieldResolverProviders; - - public DefaultResourceSpecResolver(Authentication systemAuthentication, EntityClient entityClient) { - _resourceFieldResolverProviders = - ImmutableList.of(new EntityTypeFieldResolverProvider(), new EntityUrnFieldResolverProvider(), - new DomainFieldResolverProvider(entityClient, systemAuthentication), - new OwnerFieldResolverProvider(entityClient, systemAuthentication)); - } - - @Override - public ResolvedResourceSpec resolve(ResourceSpec resourceSpec) { - return new ResolvedResourceSpec(resourceSpec, getFieldResolvers(resourceSpec)); - } - - private Map getFieldResolvers(ResourceSpec resourceSpec) { - return _resourceFieldResolverProviders.stream() - .collect(Collectors.toMap(ResourceFieldResolverProvider::getFieldType, - hydrator -> hydrator.getFieldResolver(resourceSpec))); - } -} diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/FilterUtils.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/FilterUtils.java index 76ed18e2baf78..0dbb9cd132f8a 100644 --- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/FilterUtils.java +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/FilterUtils.java @@ -26,7 +26,7 @@ private FilterUtils() { * Creates new PolicyMatchCriterion with field and value, using EQUAL PolicyMatchCondition. */ @Nonnull - public static PolicyMatchCriterion newCriterion(@Nonnull ResourceFieldType field, @Nonnull List values) { + public static PolicyMatchCriterion newCriterion(@Nonnull EntityFieldType field, @Nonnull List values) { return newCriterion(field, values, PolicyMatchCondition.EQUALS); } @@ -34,7 +34,7 @@ public static PolicyMatchCriterion newCriterion(@Nonnull ResourceFieldType field * Creates new PolicyMatchCriterion with field, value and PolicyMatchCondition. */ @Nonnull - public static PolicyMatchCriterion newCriterion(@Nonnull ResourceFieldType field, @Nonnull List values, + public static PolicyMatchCriterion newCriterion(@Nonnull EntityFieldType field, @Nonnull List values, @Nonnull PolicyMatchCondition policyMatchCondition) { return new PolicyMatchCriterion().setField(field.name()) .setValues(new StringArray(values)) @@ -45,7 +45,7 @@ public static PolicyMatchCriterion newCriterion(@Nonnull ResourceFieldType field * Creates new PolicyMatchFilter from a map of Criteria by removing null-valued Criteria and using EQUAL PolicyMatchCondition (default). 
*/ @Nonnull - public static PolicyMatchFilter newFilter(@Nullable Map> params) { + public static PolicyMatchFilter newFilter(@Nullable Map> params) { if (params == null) { return EMPTY_FILTER; } @@ -61,7 +61,7 @@ public static PolicyMatchFilter newFilter(@Nullable Map values) { + public static PolicyMatchFilter newFilter(@Nonnull EntityFieldType field, @Nonnull List values) { return newFilter(Collections.singletonMap(field, values)); } } diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyEngine.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyEngine.java index 6a36fac7de4e0..f8c017ea74e1f 100644 --- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyEngine.java +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyEngine.java @@ -1,7 +1,6 @@ package com.datahub.authorization; import com.datahub.authentication.Authentication; -import com.google.common.collect.ImmutableSet; import com.linkedin.common.Owner; import com.linkedin.common.Ownership; import com.linkedin.common.urn.Urn; @@ -11,8 +10,6 @@ import com.linkedin.entity.EnvelopedAspect; import com.linkedin.entity.EnvelopedAspectMap; import com.linkedin.entity.client.EntityClient; -import com.linkedin.identity.GroupMembership; -import com.linkedin.identity.NativeGroupMembership; import com.linkedin.identity.RoleMembership; import com.linkedin.metadata.Constants; import com.linkedin.metadata.authorization.PoliciesConfig; @@ -23,7 +20,7 @@ import com.linkedin.policy.PolicyMatchCriterion; import com.linkedin.policy.PolicyMatchCriterionArray; import com.linkedin.policy.PolicyMatchFilter; -import java.net.URISyntaxException; + import java.util.ArrayList; import java.util.Collections; import java.util.HashSet; @@ -34,6 +31,7 @@ import java.util.stream.Collectors; import java.util.stream.Stream; import javax.annotation.Nullable; + import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @@ -49,37 +47,22 @@ public class PolicyEngine { public PolicyEvaluationResult evaluatePolicy( final DataHubPolicyInfo policy, - final String actorStr, + final ResolvedEntitySpec resolvedActorSpec, final String privilege, - final Optional resource) { - try { - // Currently Actor must be an urn. Consider whether this contract should be pushed up. - final Urn actor = Urn.createFromString(actorStr); - return evaluatePolicy(policy, actor, privilege, resource); - } catch (URISyntaxException e) { - log.error(String.format("Failed to bind actor %s to an URN. Actors must be URNs. Denying the authorization request", actorStr)); - return PolicyEvaluationResult.DENIED; - } - } - - public PolicyEvaluationResult evaluatePolicy( - final DataHubPolicyInfo policy, - final Urn actor, - final String privilege, - final Optional resource) { + final Optional resource) { final PolicyEvaluationContext context = new PolicyEvaluationContext(); log.debug("Evaluating policy {}", policy.getDisplayName()); // If the privilege is not in scope, deny the request. 
- if (!isPrivilegeMatch(privilege, policy.getPrivileges(), context)) { + if (!isPrivilegeMatch(privilege, policy.getPrivileges())) { log.debug("Policy denied based on irrelevant privileges {} for {}", policy.getPrivileges(), privilege); return PolicyEvaluationResult.DENIED; } // If policy is not applicable, deny the request - if (!isPolicyApplicable(policy, actor, resource, context)) { - log.debug("Policy does not applicable for actor {} and resource {}", actor, resource); + if (!isPolicyApplicable(policy, resolvedActorSpec, resource, context)) { + log.debug("Policy is not applicable for actor {} and resource {}", resolvedActorSpec.getSpec().getEntity(), resource); return PolicyEvaluationResult.DENIED; } @@ -89,7 +72,7 @@ public PolicyEvaluationResult evaluatePolicy( public PolicyActors getMatchingActors( final DataHubPolicyInfo policy, - final Optional resource) { + final Optional resource) { final List users = new ArrayList<>(); final List groups = new ArrayList<>(); boolean allUsers = false; @@ -126,8 +109,8 @@ public PolicyActors getMatchingActors( private boolean isPolicyApplicable( final DataHubPolicyInfo policy, - final Urn actor, - final Optional resource, + final ResolvedEntitySpec resolvedActorSpec, + final Optional resource, final PolicyEvaluationContext context ) { @@ -137,25 +120,21 @@ private boolean isPolicyApplicable( } // If the resource is not in scope, deny the request. - if (!isResourceMatch(policy.getType(), policy.getResources(), resource, context)) { + if (!isResourceMatch(policy.getType(), policy.getResources(), resource)) { return false; } // If the actor does not match, deny the request. - if (!isActorMatch(actor, policy.getActors(), resource, context)) { - return false; - } - - return true; + return isActorMatch(resolvedActorSpec, policy.getActors(), resource, context); } public List getGrantedPrivileges( final List policies, - final Urn actor, - final Optional resource) { + final ResolvedEntitySpec resolvedActorSpec, + final Optional resource) { PolicyEvaluationContext context = new PolicyEvaluationContext(); return policies.stream() - .filter(policy -> isPolicyApplicable(policy, actor, resource, context)) + .filter(policy -> isPolicyApplicable(policy, resolvedActorSpec, resource, context)) .flatMap(policy -> policy.getPrivileges().stream()) .distinct() .collect(Collectors.toList()); @@ -168,9 +147,8 @@ public List getGrantedPrivileges( * If the policy is of type "METADATA", the resourceSpec parameter will be matched against the * resource filter defined on the policy.
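Callers now hand the engine a pre-resolved actor spec instead of a raw URN string; a rough sketch of the new calling convention, assuming the renamed resolver keeps the resolve(...) method of the DefaultResourceSpecResolver it replaces (the privilege string and URNs are examples only):

import com.datahub.authorization.DefaultEntitySpecResolver;
import com.datahub.authorization.EntitySpec;
import com.datahub.authorization.PolicyEngine;
import com.datahub.authorization.ResolvedEntitySpec;
import com.linkedin.policy.DataHubPolicyInfo;
import java.util.Optional;

class PolicyEngineUsageSketch {
  // Resolves actor and resource specs up front, then asks the engine whether the policy grants the privilege.
  static boolean canEditTags(PolicyEngine engine, DefaultEntitySpecResolver resolver, DataHubPolicyInfo policy) {
    ResolvedEntitySpec actor = resolver.resolve(new EntitySpec("corpuser", "urn:li:corpuser:datahub"));
    ResolvedEntitySpec resource = resolver.resolve(new EntitySpec("dataset", "urn:li:dataset:test"));
    return engine.evaluatePolicy(policy, actor, "EDIT_ENTITY_TAGS", Optional.of(resource)).isGranted();
  }
}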
*/ - public Boolean policyMatchesResource(final DataHubPolicyInfo policy, final Optional resourceSpec) { - return isResourceMatch(policy.getType(), policy.getResources(), resourceSpec, - new PolicyEvaluationContext()); + public Boolean policyMatchesResource(final DataHubPolicyInfo policy, final Optional resourceSpec) { + return isResourceMatch(policy.getType(), policy.getResources(), resourceSpec); } /** @@ -178,8 +156,7 @@ public Boolean policyMatchesResource(final DataHubPolicyInfo policy, final Optio */ private boolean isPrivilegeMatch( final String requestPrivilege, - final List policyPrivileges, - final PolicyEvaluationContext context) { + final List policyPrivileges) { return policyPrivileges.contains(requestPrivilege); } @@ -189,8 +166,7 @@ private boolean isPrivilegeMatch( private boolean isResourceMatch( final String policyType, final @Nullable DataHubResourceFilter policyResourceFilter, - final Optional requestResource, - final PolicyEvaluationContext context) { + final Optional requestResource) { if (PoliciesConfig.PLATFORM_POLICY_TYPE.equals(policyType)) { // Currently, platform policies have no associated resource. return true; @@ -199,7 +175,7 @@ private boolean isResourceMatch( // No resource defined on the policy. return true; } - if (!requestResource.isPresent()) { + if (requestResource.isEmpty()) { // Resource filter present in policy, but no resource spec provided. log.debug("Resource filter present in policy, but no resource spec provided."); return false; @@ -218,31 +194,31 @@ private PolicyMatchFilter getFilter(DataHubResourceFilter policyResourceFilter) } PolicyMatchCriterionArray criteria = new PolicyMatchCriterionArray(); if (policyResourceFilter.hasType()) { - criteria.add(new PolicyMatchCriterion().setField(ResourceFieldType.RESOURCE_TYPE.name()) + criteria.add(new PolicyMatchCriterion().setField(EntityFieldType.TYPE.name()) .setValues(new StringArray(Collections.singletonList(policyResourceFilter.getType())))); } if (policyResourceFilter.hasType() && policyResourceFilter.hasResources() && !policyResourceFilter.isAllResources()) { criteria.add( - new PolicyMatchCriterion().setField(ResourceFieldType.RESOURCE_URN.name()).setValues(policyResourceFilter.getResources())); + new PolicyMatchCriterion().setField(EntityFieldType.URN.name()).setValues(policyResourceFilter.getResources())); } return new PolicyMatchFilter().setCriteria(criteria); } - private boolean checkFilter(final PolicyMatchFilter filter, final ResolvedResourceSpec resource) { + private boolean checkFilter(final PolicyMatchFilter filter, final ResolvedEntitySpec resource) { return filter.getCriteria().stream().allMatch(criterion -> checkCriterion(criterion, resource)); } - private boolean checkCriterion(final PolicyMatchCriterion criterion, final ResolvedResourceSpec resource) { - ResourceFieldType resourceFieldType; + private boolean checkCriterion(final PolicyMatchCriterion criterion, final ResolvedEntitySpec resource) { + EntityFieldType entityFieldType; try { - resourceFieldType = ResourceFieldType.valueOf(criterion.getField().toUpperCase()); + entityFieldType = EntityFieldType.valueOf(criterion.getField().toUpperCase()); } catch (IllegalArgumentException e) { log.error("Unsupported field type {}", criterion.getField()); return false; } - Set fieldValues = resource.getFieldValues(resourceFieldType); + Set fieldValues = resource.getFieldValues(entityFieldType); return criterion.getValues() .stream() .anyMatch(filterValue -> checkCondition(fieldValues, filterValue, criterion.getCondition())); @@ 
-257,46 +233,51 @@ private boolean checkCondition(Set fieldValues, String filterValue, Poli } /** + * Returns true if the actor portion of a DataHub policy matches a the actor being evaluated, false otherwise. * Returns true if the actor portion of a DataHub policy matches a the actor being evaluated, false otherwise. */ private boolean isActorMatch( - final Urn actor, + final ResolvedEntitySpec resolvedActorSpec, final DataHubActorFilter actorFilter, - final Optional resourceSpec, + final Optional resourceSpec, final PolicyEvaluationContext context) { // 1. If the actor is a matching "User" in the actor filter, return true immediately. - if (isUserMatch(actor, actorFilter)) { + if (isUserMatch(resolvedActorSpec, actorFilter)) { return true; } // 2. If the actor is in a matching "Group" in the actor filter, return true immediately. - if (isGroupMatch(actor, actorFilter, context)) { + if (isGroupMatch(resolvedActorSpec, actorFilter, context)) { return true; } // 3. If the actor is the owner, either directly or indirectly via a group, return true immediately. - if (isOwnerMatch(actor, actorFilter, resourceSpec, context)) { + if (isOwnerMatch(resolvedActorSpec, actorFilter, resourceSpec, context)) { return true; } // 4. If the actor is in a matching "Role" in the actor filter, return true immediately. - return isRoleMatch(actor, actorFilter, context); + return isRoleMatch(resolvedActorSpec, actorFilter, context); } - private boolean isUserMatch(final Urn actor, final DataHubActorFilter actorFilter) { + private boolean isUserMatch(final ResolvedEntitySpec resolvedActorSpec, final DataHubActorFilter actorFilter) { // If the actor is a matching "User" in the actor filter, return true immediately. return actorFilter.isAllUsers() || (actorFilter.hasUsers() && Objects.requireNonNull(actorFilter.getUsers()) - .stream() - .anyMatch(user -> user.equals(actor))); + .stream().map(Urn::toString) + .anyMatch(user -> user.equals(resolvedActorSpec.getSpec().getEntity()))); } - private boolean isGroupMatch(final Urn actor, final DataHubActorFilter actorFilter, final PolicyEvaluationContext context) { + private boolean isGroupMatch( + final ResolvedEntitySpec resolvedActorSpec, + final DataHubActorFilter actorFilter, + final PolicyEvaluationContext context) { // If the actor is in a matching "Group" in the actor filter, return true immediately. if (actorFilter.isAllGroups() || actorFilter.hasGroups()) { - final Set groups = resolveGroups(actor, context); - return actorFilter.isAllGroups() || (actorFilter.hasGroups() && Objects.requireNonNull(actorFilter.getGroups()) - .stream() + final Set groups = resolveGroups(resolvedActorSpec, context); + return (actorFilter.isAllGroups() && !groups.isEmpty()) + || (actorFilter.hasGroups() && Objects.requireNonNull(actorFilter.getGroups()) + .stream().map(Urn::toString) .anyMatch(groups::contains)); } // If there are no groups on the policy, return false for the group match. @@ -304,24 +285,24 @@ private boolean isGroupMatch(final Urn actor, final DataHubActorFilter actorFilt } private boolean isOwnerMatch( - final Urn actor, + final ResolvedEntitySpec resolvedActorSpec, final DataHubActorFilter actorFilter, - final Optional requestResource, + final Optional requestResource, final PolicyEvaluationContext context) { // If the policy does not apply to owners, or there is no resource to own, return false immediately. 
- if (!actorFilter.isResourceOwners() || !requestResource.isPresent()) { + if (!actorFilter.isResourceOwners() || requestResource.isEmpty()) { return false; } List ownershipTypes = actorFilter.getResourceOwnersTypes(); - return isActorOwner(actor, requestResource.get(), ownershipTypes, context); + return isActorOwner(resolvedActorSpec, requestResource.get(), ownershipTypes, context); } - private Set getOwnersForType(ResourceSpec resourceSpec, List ownershipTypes) { - Urn entityUrn = UrnUtils.getUrn(resourceSpec.getResource()); + private Set getOwnersForType(EntitySpec resourceSpec, List ownershipTypes) { + Urn entityUrn = UrnUtils.getUrn(resourceSpec.getEntity()); EnvelopedAspect ownershipAspect; try { EntityResponse response = _entityClient.getV2(entityUrn.getEntityType(), entityUrn, - Collections.singleton(Constants.OWNERSHIP_ASPECT_NAME), _systemAuthentication); + Collections.singleton(Constants.OWNERSHIP_ASPECT_NAME), _systemAuthentication); if (response == null || !response.getAspects().containsKey(Constants.OWNERSHIP_ASPECT_NAME)) { return Collections.emptySet(); } @@ -338,50 +319,56 @@ private Set getOwnersForType(ResourceSpec resourceSpec, List owners return ownersStream.map(owner -> owner.getOwner().toString()).collect(Collectors.toSet()); } - private boolean isActorOwner(Urn actor, ResolvedResourceSpec resourceSpec, List ownershipTypes, PolicyEvaluationContext context) { + private boolean isActorOwner( + final ResolvedEntitySpec resolvedActorSpec, + ResolvedEntitySpec resourceSpec, List ownershipTypes, + PolicyEvaluationContext context) { Set owners = this.getOwnersForType(resourceSpec.getSpec(), ownershipTypes); - if (isUserOwner(actor, owners)) { - return true; - } - final Set groups = resolveGroups(actor, context); - if (isGroupOwner(groups, owners)) { + if (isUserOwner(resolvedActorSpec, owners)) { return true; } - return false; + final Set groups = resolveGroups(resolvedActorSpec, context); + + return isGroupOwner(groups, owners); } - private boolean isUserOwner(Urn actor, Set owners) { - return owners.contains(actor.toString()); + private boolean isUserOwner(final ResolvedEntitySpec resolvedActorSpec, Set owners) { + return owners.contains(resolvedActorSpec.getSpec().getEntity()); } - private boolean isGroupOwner(Set groups, Set owners) { - return groups.stream().anyMatch(group -> owners.contains(group.toString())); + private boolean isGroupOwner(Set groups, Set owners) { + return groups.stream().anyMatch(owners::contains); } - private boolean isRoleMatch(final Urn actor, final DataHubActorFilter actorFilter, + private boolean isRoleMatch( + final ResolvedEntitySpec resolvedActorSpec, + final DataHubActorFilter actorFilter, final PolicyEvaluationContext context) { // Can immediately return false if the actor filter does not have any roles if (!actorFilter.hasRoles()) { return false; } // If the actor has a matching "Role" in the actor filter, return true immediately. 
- Set actorRoles = resolveRoles(actor, context); + Set actorRoles = resolveRoles(resolvedActorSpec, context); return Objects.requireNonNull(actorFilter.getRoles()) .stream() .anyMatch(actorRoles::contains); } - private Set resolveRoles(Urn actor, PolicyEvaluationContext context) { + private Set resolveRoles(final ResolvedEntitySpec resolvedActorSpec, PolicyEvaluationContext context) { if (context.roles != null) { return context.roles; } + String actor = resolvedActorSpec.getSpec().getEntity(); + Set roles = new HashSet<>(); final EnvelopedAspectMap aspectMap; try { - final EntityResponse corpUser = _entityClient.batchGetV2(CORP_USER_ENTITY_NAME, Collections.singleton(actor), - Collections.singleton(ROLE_MEMBERSHIP_ASPECT_NAME), _systemAuthentication).get(actor); + Urn actorUrn = Urn.createFromString(actor); + final EntityResponse corpUser = _entityClient.batchGetV2(CORP_USER_ENTITY_NAME, Collections.singleton(actorUrn), + Collections.singleton(ROLE_MEMBERSHIP_ASPECT_NAME), _systemAuthentication).get(actorUrn); if (corpUser == null || !corpUser.hasAspects()) { return roles; } @@ -403,62 +390,25 @@ private Set resolveRoles(Urn actor, PolicyEvaluationContext context) { return roles; } - private Set resolveGroups(Urn actor, PolicyEvaluationContext context) { + private Set resolveGroups(ResolvedEntitySpec resolvedActorSpec, PolicyEvaluationContext context) { if (context.groups != null) { return context.groups; } - Set groups = new HashSet<>(); - final EnvelopedAspectMap aspectMap; - - try { - final EntityResponse corpUser = _entityClient.batchGetV2(CORP_USER_ENTITY_NAME, Collections.singleton(actor), - ImmutableSet.of(GROUP_MEMBERSHIP_ASPECT_NAME, NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME), _systemAuthentication) - .get(actor); - if (corpUser == null || !corpUser.hasAspects()) { - return groups; - } - aspectMap = corpUser.getAspects(); - } catch (Exception e) { - throw new RuntimeException(String.format("Failed to fetch %s and %s for urn %s", GROUP_MEMBERSHIP_ASPECT_NAME, - NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME, actor), e); - } - - Optional maybeGroupMembership = resolveGroupMembership(aspectMap); - maybeGroupMembership.ifPresent(groupMembership -> groups.addAll(groupMembership.getGroups())); - - Optional maybeNativeGroupMembership = resolveNativeGroupMembership(aspectMap); - maybeNativeGroupMembership.ifPresent( - nativeGroupMembership -> groups.addAll(nativeGroupMembership.getNativeGroups())); + Set groups = resolvedActorSpec.getGroupMembership(); context.setGroups(groups); // Cache the groups. return groups; } - // TODO: Optimization - Cache the group membership. Refresh periodically. - private Optional resolveGroupMembership(final EnvelopedAspectMap aspectMap) { - if (aspectMap.containsKey(GROUP_MEMBERSHIP_ASPECT_NAME)) { - return Optional.of(new GroupMembership(aspectMap.get(GROUP_MEMBERSHIP_ASPECT_NAME).getValue().data())); - } - return Optional.empty(); - } - - private Optional resolveNativeGroupMembership(final EnvelopedAspectMap aspectMap) { - if (aspectMap.containsKey(NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME)) { - return Optional.of( - new NativeGroupMembership(aspectMap.get(NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME).getValue().data())); - } - return Optional.empty(); - } - /** * Class used to store state across a single Policy evaluation. 
*/ static class PolicyEvaluationContext { - private Set groups; + private Set groups; private Set roles; - public void setGroups(Set groups) { + public void setGroups(Set groups) { this.groups = groups; } diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/DataPlatformInstanceFieldResolverProvider.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/DataPlatformInstanceFieldResolverProvider.java new file mode 100644 index 0000000000000..27cb8fcee8138 --- /dev/null +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/DataPlatformInstanceFieldResolverProvider.java @@ -0,0 +1,70 @@ +package com.datahub.authorization.fieldresolverprovider; + +import static com.linkedin.metadata.Constants.DATA_PLATFORM_INSTANCE_ASPECT_NAME; +import static com.linkedin.metadata.Constants.DATA_PLATFORM_INSTANCE_ENTITY_NAME; + +import com.datahub.authentication.Authentication; +import com.datahub.authorization.EntityFieldType; +import com.datahub.authorization.EntitySpec; +import com.datahub.authorization.FieldResolver; +import com.linkedin.common.DataPlatformInstance; +import com.linkedin.common.urn.Urn; +import com.linkedin.common.urn.UrnUtils; +import com.linkedin.entity.EntityResponse; +import com.linkedin.entity.EnvelopedAspect; +import com.linkedin.entity.client.EntityClient; +import java.util.Collections; +import java.util.Objects; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; + +/** + * Provides field resolver for data platform instance given entitySpec + */ +@Slf4j +@RequiredArgsConstructor +public class DataPlatformInstanceFieldResolverProvider implements EntityFieldResolverProvider { + + private final EntityClient _entityClient; + private final Authentication _systemAuthentication; + + @Override + public EntityFieldType getFieldType() { + return EntityFieldType.DATA_PLATFORM_INSTANCE; + } + + @Override + public FieldResolver getFieldResolver(EntitySpec entitySpec) { + return FieldResolver.getResolverFromFunction(entitySpec, this::getDataPlatformInstance); + } + + private FieldResolver.FieldValue getDataPlatformInstance(EntitySpec entitySpec) { + Urn entityUrn = UrnUtils.getUrn(entitySpec.getEntity()); + // In the case that the entity is a platform instance, the associated platform instance entity is the instance itself + if (entityUrn.getEntityType().equals(DATA_PLATFORM_INSTANCE_ENTITY_NAME)) { + return FieldResolver.FieldValue.builder() + .values(Collections.singleton(entityUrn.toString())) + .build(); + } + + EnvelopedAspect dataPlatformInstanceAspect; + try { + EntityResponse response = _entityClient.getV2(entityUrn.getEntityType(), entityUrn, + Collections.singleton(DATA_PLATFORM_INSTANCE_ASPECT_NAME), _systemAuthentication); + if (response == null || !response.getAspects().containsKey(DATA_PLATFORM_INSTANCE_ASPECT_NAME)) { + return FieldResolver.emptyFieldValue(); + } + dataPlatformInstanceAspect = response.getAspects().get(DATA_PLATFORM_INSTANCE_ASPECT_NAME); + } catch (Exception e) { + log.error("Error while retrieving platform instance aspect for urn {}", entityUrn, e); + return FieldResolver.emptyFieldValue(); + } + DataPlatformInstance dataPlatformInstance = new DataPlatformInstance(dataPlatformInstanceAspect.getValue().data()); + if (dataPlatformInstance.getInstance() == null) { + return FieldResolver.emptyFieldValue(); + } + return FieldResolver.FieldValue.builder() +
.values(Collections.singleton(Objects.requireNonNull(dataPlatformInstance.getInstance()).toString())) + .build(); + } +} \ No newline at end of file diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/DomainFieldResolverProvider.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/DomainFieldResolverProvider.java index 68c1dd4f644e5..25c2165f02b94 100644 --- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/DomainFieldResolverProvider.java +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/DomainFieldResolverProvider.java @@ -2,8 +2,8 @@ import com.datahub.authentication.Authentication; import com.datahub.authorization.FieldResolver; -import com.datahub.authorization.ResourceFieldType; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntityFieldType; +import com.datahub.authorization.EntitySpec; import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; import com.linkedin.domain.DomainProperties; @@ -27,23 +27,23 @@ /** - * Provides field resolver for domain given resourceSpec + * Provides field resolver for domain given entitySpec */ @Slf4j @RequiredArgsConstructor -public class DomainFieldResolverProvider implements ResourceFieldResolverProvider { +public class DomainFieldResolverProvider implements EntityFieldResolverProvider { private final EntityClient _entityClient; private final Authentication _systemAuthentication; @Override - public ResourceFieldType getFieldType() { - return ResourceFieldType.DOMAIN; + public EntityFieldType getFieldType() { + return EntityFieldType.DOMAIN; } @Override - public FieldResolver getFieldResolver(ResourceSpec resourceSpec) { - return FieldResolver.getResolverFromFunction(resourceSpec, this::getDomains); + public FieldResolver getFieldResolver(EntitySpec entitySpec) { + return FieldResolver.getResolverFromFunction(entitySpec, this::getDomains); } private Set getBatchedParentDomains(@Nonnull final Set urns) { @@ -78,8 +78,8 @@ private Set getBatchedParentDomains(@Nonnull final Set urns) { return parentUrns; } - private FieldResolver.FieldValue getDomains(ResourceSpec resourceSpec) { - final Urn entityUrn = UrnUtils.getUrn(resourceSpec.getResource()); + private FieldResolver.FieldValue getDomains(EntitySpec entitySpec) { + final Urn entityUrn = UrnUtils.getUrn(entitySpec.getEntity()); // In the case that the entity is a domain, the associated domain is the domain itself if (entityUrn.getEntityType().equals(DOMAIN_ENTITY_NAME)) { return FieldResolver.FieldValue.builder() diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityFieldResolverProvider.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityFieldResolverProvider.java new file mode 100644 index 0000000000000..a76db0ecb5102 --- /dev/null +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityFieldResolverProvider.java @@ -0,0 +1,22 @@ +package com.datahub.authorization.fieldresolverprovider; + +import com.datahub.authorization.FieldResolver; +import com.datahub.authorization.EntityFieldType; +import com.datahub.authorization.EntitySpec; + + +/** + * Base class for defining a class that provides the field resolver for the given field type + */ +public interface EntityFieldResolverProvider { + + /** + * Field that this hydrator is 
hydrating + */ + EntityFieldType getFieldType(); + + /** + * Return resolver for fetching the field values given the entity + */ + FieldResolver getFieldResolver(EntitySpec entitySpec); +} diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityTypeFieldResolverProvider.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityTypeFieldResolverProvider.java index 58e3d78ce8c3b..187f696904947 100644 --- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityTypeFieldResolverProvider.java +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityTypeFieldResolverProvider.java @@ -1,22 +1,22 @@ package com.datahub.authorization.fieldresolverprovider; import com.datahub.authorization.FieldResolver; -import com.datahub.authorization.ResourceFieldType; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntityFieldType; +import com.datahub.authorization.EntitySpec; import java.util.Collections; /** - * Provides field resolver for entity type given resourceSpec + * Provides field resolver for entity type given entitySpec */ -public class EntityTypeFieldResolverProvider implements ResourceFieldResolverProvider { +public class EntityTypeFieldResolverProvider implements EntityFieldResolverProvider { @Override - public ResourceFieldType getFieldType() { - return ResourceFieldType.RESOURCE_TYPE; + public EntityFieldType getFieldType() { + return EntityFieldType.TYPE; } @Override - public FieldResolver getFieldResolver(ResourceSpec resourceSpec) { - return FieldResolver.getResolverFromValues(Collections.singleton(resourceSpec.getType())); + public FieldResolver getFieldResolver(EntitySpec entitySpec) { + return FieldResolver.getResolverFromValues(Collections.singleton(entitySpec.getType())); } } diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityUrnFieldResolverProvider.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityUrnFieldResolverProvider.java index b9d98f1dcbac0..2f5c4a7c6c961 100644 --- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityUrnFieldResolverProvider.java +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityUrnFieldResolverProvider.java @@ -1,22 +1,22 @@ package com.datahub.authorization.fieldresolverprovider; import com.datahub.authorization.FieldResolver; -import com.datahub.authorization.ResourceFieldType; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntityFieldType; +import com.datahub.authorization.EntitySpec; import java.util.Collections; /** - * Provides field resolver for entity urn given resourceSpec + * Provides field resolver for entity urn given entitySpec */ -public class EntityUrnFieldResolverProvider implements ResourceFieldResolverProvider { +public class EntityUrnFieldResolverProvider implements EntityFieldResolverProvider { @Override - public ResourceFieldType getFieldType() { - return ResourceFieldType.RESOURCE_URN; + public EntityFieldType getFieldType() { + return EntityFieldType.URN; } @Override - public FieldResolver getFieldResolver(ResourceSpec resourceSpec) { - return FieldResolver.getResolverFromValues(Collections.singleton(resourceSpec.getResource())); + public FieldResolver getFieldResolver(EntitySpec entitySpec) 
{ + return FieldResolver.getResolverFromValues(Collections.singleton(entitySpec.getEntity())); } } diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/GroupMembershipFieldResolverProvider.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/GroupMembershipFieldResolverProvider.java new file mode 100644 index 0000000000000..8db029632d7e2 --- /dev/null +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/GroupMembershipFieldResolverProvider.java @@ -0,0 +1,78 @@ +package com.datahub.authorization.fieldresolverprovider; + +import com.datahub.authentication.Authentication; +import com.datahub.authorization.FieldResolver; +import com.datahub.authorization.EntityFieldType; +import com.datahub.authorization.EntitySpec; +import com.google.common.collect.ImmutableSet; +import com.linkedin.common.urn.Urn; +import com.linkedin.common.urn.UrnUtils; +import com.linkedin.entity.EntityResponse; +import com.linkedin.entity.EnvelopedAspect; +import com.linkedin.entity.client.EntityClient; +import com.linkedin.identity.NativeGroupMembership; +import com.linkedin.metadata.Constants; +import com.linkedin.identity.GroupMembership; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; + +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; + +import static com.linkedin.metadata.Constants.GROUP_MEMBERSHIP_ASPECT_NAME; +import static com.linkedin.metadata.Constants.NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME; + + +/** + * Provides field resolver for group membership given entitySpec + */ +@Slf4j +@RequiredArgsConstructor +public class GroupMembershipFieldResolverProvider implements EntityFieldResolverProvider { + + private final EntityClient _entityClient; + private final Authentication _systemAuthentication; + + @Override + public EntityFieldType getFieldType() { + return EntityFieldType.GROUP_MEMBERSHIP; + } + + @Override + public FieldResolver getFieldResolver(EntitySpec entitySpec) { + return FieldResolver.getResolverFromFunction(entitySpec, this::getGroupMembership); + } + + private FieldResolver.FieldValue getGroupMembership(EntitySpec entitySpec) { + Urn entityUrn = UrnUtils.getUrn(entitySpec.getEntity()); + EnvelopedAspect groupMembershipAspect; + EnvelopedAspect nativeGroupMembershipAspect; + List groups = new ArrayList<>(); + try { + EntityResponse response = _entityClient.getV2(entityUrn.getEntityType(), entityUrn, + ImmutableSet.of(GROUP_MEMBERSHIP_ASPECT_NAME, NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME), _systemAuthentication); + if (response == null + || !(response.getAspects().containsKey(Constants.GROUP_MEMBERSHIP_ASPECT_NAME) + || response.getAspects().containsKey(Constants.NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME))) { + return FieldResolver.emptyFieldValue(); + } + if (response.getAspects().containsKey(Constants.GROUP_MEMBERSHIP_ASPECT_NAME)) { + groupMembershipAspect = response.getAspects().get(Constants.GROUP_MEMBERSHIP_ASPECT_NAME); + GroupMembership groupMembership = new GroupMembership(groupMembershipAspect.getValue().data()); + groups.addAll(groupMembership.getGroups()); + } + if (response.getAspects().containsKey(Constants.NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME)) { + nativeGroupMembershipAspect = response.getAspects().get(Constants.NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME); + NativeGroupMembership nativeGroupMembership = new NativeGroupMembership(nativeGroupMembershipAspect.getValue().data()); +
groups.addAll(nativeGroupMembership.getNativeGroups()); + } + } catch (Exception e) { + log.error("Error while retrieving group membership aspect for urn {}", entityUrn, e); + return FieldResolver.emptyFieldValue(); + } + return FieldResolver.FieldValue.builder() + .values(groups.stream().map(Urn::toString).collect(Collectors.toSet())) + .build(); + } +} diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/OwnerFieldResolverProvider.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/OwnerFieldResolverProvider.java index 20ec6a09377c8..bdd652d1d3871 100644 --- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/OwnerFieldResolverProvider.java +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/OwnerFieldResolverProvider.java @@ -2,8 +2,8 @@ import com.datahub.authentication.Authentication; import com.datahub.authorization.FieldResolver; -import com.datahub.authorization.ResourceFieldType; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntityFieldType; +import com.datahub.authorization.EntitySpec; import com.linkedin.common.Ownership; import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; @@ -18,27 +18,27 @@ /** - * Provides field resolver for owners given resourceSpec + * Provides field resolver for owners given entitySpec */ @Slf4j @RequiredArgsConstructor -public class OwnerFieldResolverProvider implements ResourceFieldResolverProvider { +public class OwnerFieldResolverProvider implements EntityFieldResolverProvider { private final EntityClient _entityClient; private final Authentication _systemAuthentication; @Override - public ResourceFieldType getFieldType() { - return ResourceFieldType.OWNER; + public EntityFieldType getFieldType() { + return EntityFieldType.OWNER; } @Override - public FieldResolver getFieldResolver(ResourceSpec resourceSpec) { - return FieldResolver.getResolverFromFunction(resourceSpec, this::getOwners); + public FieldResolver getFieldResolver(EntitySpec entitySpec) { + return FieldResolver.getResolverFromFunction(entitySpec, this::getOwners); } - private FieldResolver.FieldValue getOwners(ResourceSpec resourceSpec) { - Urn entityUrn = UrnUtils.getUrn(resourceSpec.getResource()); + private FieldResolver.FieldValue getOwners(EntitySpec entitySpec) { + Urn entityUrn = UrnUtils.getUrn(entitySpec.getEntity()); EnvelopedAspect ownershipAspect; try { EntityResponse response = _entityClient.getV2(entityUrn.getEntityType(), entityUrn, diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/ResourceFieldResolverProvider.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/ResourceFieldResolverProvider.java deleted file mode 100644 index 4ba4200f8035e..0000000000000 --- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/ResourceFieldResolverProvider.java +++ /dev/null @@ -1,22 +0,0 @@ -package com.datahub.authorization.fieldresolverprovider; - -import com.datahub.authorization.FieldResolver; -import com.datahub.authorization.ResourceFieldType; -import com.datahub.authorization.ResourceSpec; - - -/** - * Base class for defining a class that provides the field resolver for the given field type - */ -public interface ResourceFieldResolverProvider { - - /** - * Field that this hydrator is hydrating - */ - ResourceFieldType 
getFieldType(); - - /** - * Return resolver for fetching the field values given the resource - */ - FieldResolver getFieldResolver(ResourceSpec resourceSpec); -} diff --git a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java index 2e48123fb1813..24ecfa6fefc85 100644 --- a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java +++ b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java @@ -158,7 +158,7 @@ public void testSystemAuthentication() throws Exception { // Validate that the System Actor is authorized, even if there is no policy. - ResourceSpec resourceSpec = new ResourceSpec("dataset", "urn:li:dataset:test"); + EntitySpec resourceSpec = new EntitySpec("dataset", "urn:li:dataset:test"); AuthorizationRequest request = new AuthorizationRequest( new Actor(ActorType.USER, DATAHUB_SYSTEM_CLIENT_ID).toUrnStr(), @@ -172,7 +172,7 @@ public void testSystemAuthentication() throws Exception { @Test public void testAuthorizeGranted() throws Exception { - ResourceSpec resourceSpec = new ResourceSpec("dataset", "urn:li:dataset:test"); + EntitySpec resourceSpec = new EntitySpec("dataset", "urn:li:dataset:test"); AuthorizationRequest request = new AuthorizationRequest( "urn:li:corpuser:test", @@ -186,7 +186,7 @@ public void testAuthorizeGranted() throws Exception { @Test public void testAuthorizeNotGranted() throws Exception { - ResourceSpec resourceSpec = new ResourceSpec("dataset", "urn:li:dataset:test"); + EntitySpec resourceSpec = new EntitySpec("dataset", "urn:li:dataset:test"); // Policy for this privilege is inactive. AuthorizationRequest request = new AuthorizationRequest( @@ -203,7 +203,7 @@ public void testAllowAllMode() throws Exception { _dataHubAuthorizer.setMode(DataHubAuthorizer.AuthorizationMode.ALLOW_ALL); - ResourceSpec resourceSpec = new ResourceSpec("dataset", "urn:li:dataset:test"); + EntitySpec resourceSpec = new EntitySpec("dataset", "urn:li:dataset:test"); // Policy for this privilege is inactive. AuthorizationRequest request = new AuthorizationRequest( @@ -219,7 +219,7 @@ public void testAllowAllMode() throws Exception { public void testInvalidateCache() throws Exception { // First make sure that the default policies are as expected. - ResourceSpec resourceSpec = new ResourceSpec("dataset", "urn:li:dataset:test"); + EntitySpec resourceSpec = new EntitySpec("dataset", "urn:li:dataset:test"); AuthorizationRequest request = new AuthorizationRequest( "urn:li:corpuser:test", @@ -250,7 +250,7 @@ public void testInvalidateCache() throws Exception { public void testAuthorizedActorsActivePolicy() throws Exception { final AuthorizedActors actors = _dataHubAuthorizer.authorizedActors("EDIT_ENTITY_TAGS", // Should be inside the active policy. 
- Optional.of(new ResourceSpec("dataset", "urn:li:dataset:1"))); + Optional.of(new EntitySpec("dataset", "urn:li:dataset:1"))); assertTrue(actors.isAllUsers()); assertTrue(actors.isAllGroups()); @@ -272,7 +272,7 @@ public void testAuthorizedActorsActivePolicy() throws Exception { @Test public void testAuthorizationOnDomainWithPrivilegeIsAllowed() { - ResourceSpec resourceSpec = new ResourceSpec("dataset", "urn:li:dataset:test"); + EntitySpec resourceSpec = new EntitySpec("dataset", "urn:li:dataset:test"); AuthorizationRequest request = new AuthorizationRequest( "urn:li:corpuser:test", @@ -285,7 +285,7 @@ public void testAuthorizationOnDomainWithPrivilegeIsAllowed() { @Test public void testAuthorizationOnDomainWithParentPrivilegeIsAllowed() { - ResourceSpec resourceSpec = new ResourceSpec("dataset", "urn:li:dataset:test"); + EntitySpec resourceSpec = new EntitySpec("dataset", "urn:li:dataset:test"); AuthorizationRequest request = new AuthorizationRequest( "urn:li:corpuser:test", @@ -298,7 +298,7 @@ public void testAuthorizationOnDomainWithParentPrivilegeIsAllowed() { @Test public void testAuthorizationOnDomainWithoutPrivilegeIsDenied() { - ResourceSpec resourceSpec = new ResourceSpec("dataset", "urn:li:dataset:test"); + EntitySpec resourceSpec = new EntitySpec("dataset", "urn:li:dataset:test"); AuthorizationRequest request = new AuthorizationRequest( "urn:li:corpuser:test", @@ -334,7 +334,7 @@ private DataHubPolicyInfo createDataHubPolicyInfo(boolean active, List p resourceFilter.setType("dataset"); if (domain != null) { - resourceFilter.setFilter(FilterUtils.newFilter(ImmutableMap.of(ResourceFieldType.DOMAIN, Collections.singletonList(domain.toString())))); + resourceFilter.setFilter(FilterUtils.newFilter(ImmutableMap.of(EntityFieldType.DOMAIN, Collections.singletonList(domain.toString())))); } dataHubPolicyInfo.setResources(resourceFilter); @@ -398,6 +398,6 @@ private Map createDomainPropertiesBatchResponse(@Nullable f } private AuthorizerContext createAuthorizerContext(final Authentication systemAuthentication, final EntityClient entityClient) { - return new AuthorizerContext(Collections.emptyMap(), new DefaultResourceSpecResolver(systemAuthentication, entityClient)); + return new AuthorizerContext(Collections.emptyMap(), new DefaultEntitySpecResolver(systemAuthentication, entityClient)); } } diff --git a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/PolicyEngineTest.java b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/PolicyEngineTest.java index 99d8fee309d91..be8c948f8ef89 100644 --- a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/PolicyEngineTest.java +++ b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/PolicyEngineTest.java @@ -11,15 +11,12 @@ import com.linkedin.common.OwnershipType; import com.linkedin.common.UrnArray; import com.linkedin.common.urn.Urn; -import com.linkedin.common.urn.UrnUtils; import com.linkedin.data.template.StringArray; import com.linkedin.entity.Aspect; import com.linkedin.entity.EntityResponse; import com.linkedin.entity.EnvelopedAspect; import com.linkedin.entity.EnvelopedAspectMap; import com.linkedin.entity.client.EntityClient; -import com.linkedin.identity.CorpUserInfo; -import com.linkedin.identity.GroupMembership; import com.linkedin.identity.RoleMembership; import com.linkedin.metadata.Constants; import com.linkedin.policy.DataHubActorFilter; @@ -45,22 +42,19 @@ public class PolicyEngineTest { private static final String AUTHORIZED_PRINCIPAL = "urn:li:corpuser:datahub"; 
private static final String UNAUTHORIZED_PRINCIPAL = "urn:li:corpuser:unauthorized"; - private static final String AUTHORIZED_GROUP = "urn:li:corpGroup:authorizedGroup"; - private static final String RESOURCE_URN = "urn:li:dataset:test"; - private static final String DOMAIN_URN = "urn:li:domain:domain1"; - private static final String OWNERSHIP_TYPE_URN = "urn:li:ownershipType:__system__technical_owner"; - private static final String OTHER_OWNERSHIP_TYPE_URN = "urn:li:ownershipType:__system__data_steward"; private EntityClient _entityClient; private PolicyEngine _policyEngine; private Urn authorizedUserUrn; + private ResolvedEntitySpec resolvedAuthorizedUserSpec; private Urn unauthorizedUserUrn; + private ResolvedEntitySpec resolvedUnauthorizedUserSpec; private Urn resourceUrn; @BeforeMethod @@ -68,29 +62,34 @@ public void setupTest() throws Exception { _entityClient = Mockito.mock(EntityClient.class); _policyEngine = new PolicyEngine(Mockito.mock(Authentication.class), _entityClient); - // Init mocks. - EntityResponse authorizedEntityResponse = createAuthorizedEntityResponse(); authorizedUserUrn = Urn.createFromString(AUTHORIZED_PRINCIPAL); + resolvedAuthorizedUserSpec = buildEntityResolvers(CORP_USER_ENTITY_NAME, AUTHORIZED_PRINCIPAL, + Collections.emptySet(), Collections.emptySet(), Collections.singleton(AUTHORIZED_GROUP)); + unauthorizedUserUrn = Urn.createFromString(UNAUTHORIZED_PRINCIPAL); + resolvedUnauthorizedUserSpec = buildEntityResolvers(CORP_USER_ENTITY_NAME, UNAUTHORIZED_PRINCIPAL); + resourceUrn = Urn.createFromString(RESOURCE_URN); + + // Init role membership mocks. + EntityResponse authorizedEntityResponse = createAuthorizedEntityResponse(); authorizedEntityResponse.setUrn(authorizedUserUrn); Map authorizedEntityResponseMap = Collections.singletonMap(authorizedUserUrn, authorizedEntityResponse); - when(_entityClient.batchGetV2(eq(CORP_USER_ENTITY_NAME), eq(Collections.singleton(authorizedUserUrn)), any(), - any())).thenReturn(authorizedEntityResponseMap); + when(_entityClient.batchGetV2(eq(CORP_USER_ENTITY_NAME), eq(Collections.singleton(authorizedUserUrn)), + eq(Collections.singleton(ROLE_MEMBERSHIP_ASPECT_NAME)), any())).thenReturn(authorizedEntityResponseMap); EntityResponse unauthorizedEntityResponse = createUnauthorizedEntityResponse(); - unauthorizedUserUrn = Urn.createFromString(UNAUTHORIZED_PRINCIPAL); unauthorizedEntityResponse.setUrn(unauthorizedUserUrn); Map unauthorizedEntityResponseMap = Collections.singletonMap(unauthorizedUserUrn, unauthorizedEntityResponse); - when(_entityClient.batchGetV2(eq(CORP_USER_ENTITY_NAME), eq(Collections.singleton(unauthorizedUserUrn)), any(), - any())).thenReturn(unauthorizedEntityResponseMap); + when(_entityClient.batchGetV2(eq(CORP_USER_ENTITY_NAME), eq(Collections.singleton(unauthorizedUserUrn)), + eq(Collections.singleton(ROLE_MEMBERSHIP_ASPECT_NAME)), any())).thenReturn(unauthorizedEntityResponseMap); + // Init ownership type mocks. 
EntityResponse entityResponse = new EntityResponse(); EnvelopedAspectMap envelopedAspectMap = new EnvelopedAspectMap(); envelopedAspectMap.put(OWNERSHIP_ASPECT_NAME, new EnvelopedAspect().setValue(new com.linkedin.entity.Aspect(createOwnershipAspect(true, true).data()))); entityResponse.setAspects(envelopedAspectMap); - resourceUrn = Urn.createFromString(RESOURCE_URN); Map mockMap = mock(Map.class); when(_entityClient.batchGetV2(any(), eq(Collections.singleton(resourceUrn)), eq(Collections.singleton(OWNERSHIP_ASPECT_NAME)), any())).thenReturn(mockMap); @@ -120,9 +119,9 @@ public void testEvaluatePolicyInactivePolicyState() { resourceFilter.setAllResources(true); resourceFilter.setType("dataset"); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN); + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN); PolicyEngine.PolicyEvaluationResult result = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertFalse(result.isGranted()); @@ -149,9 +148,9 @@ public void testEvaluatePolicyPrivilegeFilterNoMatch() throws Exception { resourceFilter.setType("dataset"); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN); + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN); PolicyEngine.PolicyEvaluationResult result = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_OWNERS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_OWNERS", Optional.of(resourceSpec)); assertFalse(result.isGranted()); @@ -176,7 +175,8 @@ public void testEvaluatePlatformPolicyPrivilegeFilterMatch() throws Exception { dataHubPolicyInfo.setActors(actorFilter); PolicyEngine.PolicyEvaluationResult result = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "MANAGE_POLICIES", Optional.empty()); + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "MANAGE_POLICIES", + Optional.empty()); assertTrue(result.isGranted()); // Verify no network calls @@ -208,10 +208,10 @@ public void testEvaluatePolicyActorFilterUserMatch() throws Exception { resourceFilter.setType("dataset"); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN); + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN); // Assert Authorized user can edit entity tags. PolicyEngine.PolicyEvaluationResult result1 = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertTrue(result1.isGranted()); @@ -245,10 +245,10 @@ public void testEvaluatePolicyActorFilterUserNoMatch() throws Exception { resourceFilter.setType("dataset"); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN); + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN); // Assert unauthorized user cannot edit entity tags. 
PolicyEngine.PolicyEvaluationResult result2 = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, "urn:li:corpuser:test", "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, buildEntityResolvers(CORP_USER_ENTITY_NAME, "urn:li:corpuser:test"), "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertFalse(result2.isGranted()); @@ -270,7 +270,7 @@ public void testEvaluatePolicyActorFilterGroupMatch() throws Exception { final DataHubActorFilter actorFilter = new DataHubActorFilter(); final UrnArray groupsUrnArray = new UrnArray(); - groupsUrnArray.add(Urn.createFromString("urn:li:corpGroup:authorizedGroup")); + groupsUrnArray.add(Urn.createFromString(AUTHORIZED_GROUP)); actorFilter.setGroups(groupsUrnArray); actorFilter.setResourceOwners(false); actorFilter.setAllUsers(false); @@ -282,16 +282,15 @@ public void testEvaluatePolicyActorFilterGroupMatch() throws Exception { resourceFilter.setType("dataset"); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN); + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN); // Assert authorized user can edit entity tags, because of group membership. PolicyEngine.PolicyEvaluationResult result1 = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertTrue(result1.isGranted()); - // Verify we are only calling for group during these requests. - verify(_entityClient, times(1)).batchGetV2(eq(CORP_USER_ENTITY_NAME), eq(Collections.singleton(authorizedUserUrn)), - any(), any()); + // Verify no network calls + verify(_entityClient, times(0)).batchGetV2(any(), any(), any(), any()); } @Test @@ -307,7 +306,7 @@ public void testEvaluatePolicyActorFilterGroupNoMatch() throws Exception { final DataHubActorFilter actorFilter = new DataHubActorFilter(); final UrnArray groupsUrnArray = new UrnArray(); - groupsUrnArray.add(Urn.createFromString("urn:li:corpGroup:authorizedGroup")); + groupsUrnArray.add(Urn.createFromString(AUTHORIZED_GROUP)); actorFilter.setGroups(groupsUrnArray); actorFilter.setResourceOwners(false); actorFilter.setAllUsers(false); @@ -319,16 +318,15 @@ public void testEvaluatePolicyActorFilterGroupNoMatch() throws Exception { resourceFilter.setType("dataset"); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN); + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN); // Assert unauthorized user cannot edit entity tags. PolicyEngine.PolicyEvaluationResult result2 = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, UNAUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedUnauthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertFalse(result2.isGranted()); - // Verify we are only calling for group during these requests. 
- verify(_entityClient, times(1)).batchGetV2(eq(CORP_USER_ENTITY_NAME), - eq(Collections.singleton(unauthorizedUserUrn)), any(), any()); + // Verify no network calls + verify(_entityClient, times(0)).batchGetV2(any(), any(), any(), any()); } @Test @@ -357,17 +355,17 @@ public void testEvaluatePolicyActorFilterRoleMatch() throws Exception { resourceFilter.setType("dataset"); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN); + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN); // Assert authorized user can edit entity tags. PolicyEngine.PolicyEvaluationResult authorizedResult = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertTrue(authorizedResult.isGranted()); // Verify we are only calling for roles during these requests. - verify(_entityClient, times(1)).batchGetV2(eq(CORP_USER_ENTITY_NAME), eq(Collections.singleton(authorizedUserUrn)), - any(), any()); + verify(_entityClient, times(1)).batchGetV2(eq(CORP_USER_ENTITY_NAME), + eq(Collections.singleton(authorizedUserUrn)), any(), any()); } @Test @@ -396,10 +394,10 @@ public void testEvaluatePolicyActorFilterNoRoleMatch() throws Exception { resourceFilter.setType("dataset"); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN); + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN); // Assert authorized user can edit entity tags. PolicyEngine.PolicyEvaluationResult unauthorizedResult = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, UNAUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedUnauthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertFalse(unauthorizedResult.isGranted()); @@ -431,16 +429,16 @@ public void testEvaluatePolicyActorFilterAllUsersMatch() throws Exception { resourceFilter.setType("dataset"); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN); + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN); // Assert authorized user can edit entity tags, because of group membership. PolicyEngine.PolicyEvaluationResult result1 = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertTrue(result1.isGranted()); // Assert unauthorized user cannot edit entity tags. PolicyEngine.PolicyEvaluationResult result2 = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, UNAUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedUnauthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertTrue(result2.isGranted()); @@ -470,24 +468,21 @@ public void testEvaluatePolicyActorFilterAllGroupsMatch() throws Exception { resourceFilter.setType("dataset"); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN); + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN); // Assert authorized user can edit entity tags, because of group membership. 
PolicyEngine.PolicyEvaluationResult result1 = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertTrue(result1.isGranted()); // Assert unauthorized user cannot edit entity tags. PolicyEngine.PolicyEvaluationResult result2 = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, UNAUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedUnauthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); - assertTrue(result2.isGranted()); + assertFalse(result2.isGranted()); - // Verify we are only calling for group during these requests. - verify(_entityClient, times(1)).batchGetV2(eq(CORP_USER_ENTITY_NAME), eq(Collections.singleton(authorizedUserUrn)), - any(), any()); - verify(_entityClient, times(1)).batchGetV2(eq(CORP_USER_ENTITY_NAME), - eq(Collections.singleton(unauthorizedUserUrn)), any(), any()); + // Verify no network calls + verify(_entityClient, times(0)).batchGetV2(any(), any(), any(), any()); } @Test @@ -519,17 +514,17 @@ public void testEvaluatePolicyActorFilterUserResourceOwnersMatch() throws Except when(_entityClient.getV2(eq(resourceUrn.getEntityType()), eq(resourceUrn), eq(Collections.singleton(Constants.OWNERSHIP_ASPECT_NAME)), any())).thenReturn(entityResponse); - ResolvedResourceSpec resourceSpec = - buildResourceResolvers("dataset", RESOURCE_URN, ImmutableSet.of(AUTHORIZED_PRINCIPAL), Collections.emptySet()); + ResolvedEntitySpec resourceSpec = + buildEntityResolvers("dataset", RESOURCE_URN, ImmutableSet.of(AUTHORIZED_PRINCIPAL), Collections.emptySet(), + Collections.emptySet()); // Assert authorized user can edit entity tags, because he is a user owner. PolicyEngine.PolicyEvaluationResult result1 = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertTrue(result1.isGranted()); - // Ensure no calls for group membership. 
- verify(_entityClient, times(0)).batchGetV2(eq(CORP_USER_ENTITY_NAME), eq(Collections.singleton(authorizedUserUrn)), - eq(null), any()); + // Verify no network calls + verify(_entityClient, times(0)).batchGetV2(any(), any(), any(), any()); } @Test @@ -562,13 +557,17 @@ public void testEvaluatePolicyActorFilterUserResourceOwnersTypeMatch() throws Ex when(_entityClient.getV2(eq(resourceUrn.getEntityType()), eq(resourceUrn), eq(Collections.singleton(Constants.OWNERSHIP_ASPECT_NAME)), any())).thenReturn(entityResponse); - ResolvedResourceSpec resourceSpec = - buildResourceResolvers("dataset", RESOURCE_URN, ImmutableSet.of(AUTHORIZED_PRINCIPAL), Collections.emptySet()); + ResolvedEntitySpec resourceSpec = + buildEntityResolvers("dataset", RESOURCE_URN, ImmutableSet.of(AUTHORIZED_PRINCIPAL), Collections.emptySet(), + Collections.emptySet()); PolicyEngine.PolicyEvaluationResult result1 = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertTrue(result1.isGranted()); + + // Verify no network calls + verify(_entityClient, times(0)).batchGetV2(any(), any(), any(), any()); } @Test @@ -601,13 +600,16 @@ public void testEvaluatePolicyActorFilterUserResourceOwnersTypeNoMatch() throws when(_entityClient.getV2(eq(resourceUrn.getEntityType()), eq(resourceUrn), eq(Collections.singleton(Constants.OWNERSHIP_ASPECT_NAME)), any())).thenReturn(entityResponse); - ResolvedResourceSpec resourceSpec = - buildResourceResolvers("dataset", RESOURCE_URN, ImmutableSet.of(AUTHORIZED_PRINCIPAL), Collections.emptySet()); + ResolvedEntitySpec resourceSpec = + buildEntityResolvers("dataset", RESOURCE_URN, ImmutableSet.of(AUTHORIZED_PRINCIPAL), Collections.emptySet(), Collections.emptySet()); PolicyEngine.PolicyEvaluationResult result1 = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertFalse(result1.isGranted()); + + // Verify no network calls + verify(_entityClient, times(0)).batchGetV2(any(), any(), any(), any()); } @Test @@ -639,17 +641,17 @@ public void testEvaluatePolicyActorFilterGroupResourceOwnersMatch() throws Excep when(_entityClient.getV2(eq(resourceUrn.getEntityType()), eq(resourceUrn), eq(Collections.singleton(Constants.OWNERSHIP_ASPECT_NAME)), any())).thenReturn(entityResponse); - ResolvedResourceSpec resourceSpec = - buildResourceResolvers("dataset", RESOURCE_URN, ImmutableSet.of(AUTHORIZED_GROUP), Collections.emptySet()); + ResolvedEntitySpec resourceSpec = + buildEntityResolvers("dataset", RESOURCE_URN, ImmutableSet.of(AUTHORIZED_GROUP), Collections.emptySet(), + Collections.emptySet()); // Assert authorized user can edit entity tags, because he is a user owner. PolicyEngine.PolicyEvaluationResult result1 = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertTrue(result1.isGranted()); - // Ensure that caching of groups is working with 1 call to entity client for each principal. 
- verify(_entityClient, times(1)).batchGetV2(eq(CORP_USER_ENTITY_NAME), eq(Collections.singleton(authorizedUserUrn)), - any(), any()); + // Verify no network calls + verify(_entityClient, times(0)).batchGetV2(any(), any(), any(), any()); } @Test @@ -673,16 +675,15 @@ public void testEvaluatePolicyActorFilterGroupResourceOwnersNoMatch() throws Exc resourceFilter.setType("dataset"); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN); + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN); // Assert unauthorized user cannot edit entity tags. PolicyEngine.PolicyEvaluationResult result2 = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, UNAUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedUnauthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertFalse(result2.isGranted()); - // Ensure that caching of groups is working with 1 call to entity client for each principal. - verify(_entityClient, times(1)).batchGetV2(eq(CORP_USER_ENTITY_NAME), - eq(Collections.singleton(unauthorizedUserUrn)), any(), any()); + // Verify no network calls + verify(_entityClient, times(0)).batchGetV2(any(), any(), any(), any()); } @Test @@ -706,10 +707,10 @@ public void testEvaluatePolicyResourceFilterAllResourcesMatch() throws Exception resourceFilter.setType("dataset"); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = - buildResourceResolvers("dataset", "urn:li:dataset:random"); // A dataset Authorized principal _does not own_. + ResolvedEntitySpec resourceSpec = + buildEntityResolvers("dataset", "urn:li:dataset:random"); // A dataset Authorized principal _does not own_. PolicyEngine.PolicyEvaluationResult result = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertTrue(result.isGranted()); @@ -738,9 +739,9 @@ public void testEvaluatePolicyResourceFilterAllResourcesNoMatch() throws Excepti resourceFilter.setType("dataset"); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("chart", RESOURCE_URN); // Notice: Not a dataset. + ResolvedEntitySpec resourceSpec = buildEntityResolvers("chart", RESOURCE_URN); // Notice: Not a dataset. 
PolicyEngine.PolicyEvaluationResult result = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertFalse(result.isGranted()); @@ -773,9 +774,9 @@ public void testEvaluatePolicyResourceFilterSpecificResourceMatchLegacy() throws resourceFilter.setResources(resourceUrns); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN); + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN); PolicyEngine.PolicyEvaluationResult result = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertTrue(result.isGranted()); @@ -801,13 +802,13 @@ public void testEvaluatePolicyResourceFilterSpecificResourceMatch() throws Excep final DataHubResourceFilter resourceFilter = new DataHubResourceFilter(); resourceFilter.setFilter(FilterUtils.newFilter( - ImmutableMap.of(ResourceFieldType.RESOURCE_TYPE, Collections.singletonList("dataset"), - ResourceFieldType.RESOURCE_URN, Collections.singletonList(RESOURCE_URN)))); + ImmutableMap.of(EntityFieldType.TYPE, Collections.singletonList("dataset"), + EntityFieldType.URN, Collections.singletonList(RESOURCE_URN)))); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN); + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN); PolicyEngine.PolicyEvaluationResult result = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertTrue(result.isGranted()); @@ -833,14 +834,14 @@ public void testEvaluatePolicyResourceFilterSpecificResourceNoMatch() throws Exc final DataHubResourceFilter resourceFilter = new DataHubResourceFilter(); resourceFilter.setFilter(FilterUtils.newFilter( - ImmutableMap.of(ResourceFieldType.RESOURCE_TYPE, Collections.singletonList("dataset"), - ResourceFieldType.RESOURCE_URN, Collections.singletonList(RESOURCE_URN)))); + ImmutableMap.of(EntityFieldType.TYPE, Collections.singletonList("dataset"), + EntityFieldType.URN, Collections.singletonList(RESOURCE_URN)))); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = - buildResourceResolvers("dataset", "urn:li:dataset:random"); // A resource not covered by the policy. + ResolvedEntitySpec resourceSpec = + buildEntityResolvers("dataset", "urn:li:dataset:random"); // A resource not covered by the policy. 
PolicyEngine.PolicyEvaluationResult result = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertFalse(result.isGranted()); @@ -866,14 +867,14 @@ public void testEvaluatePolicyResourceFilterSpecificResourceMatchDomain() throws final DataHubResourceFilter resourceFilter = new DataHubResourceFilter(); resourceFilter.setFilter(FilterUtils.newFilter( - ImmutableMap.of(ResourceFieldType.RESOURCE_TYPE, Collections.singletonList("dataset"), ResourceFieldType.DOMAIN, + ImmutableMap.of(EntityFieldType.TYPE, Collections.singletonList("dataset"), EntityFieldType.DOMAIN, Collections.singletonList(DOMAIN_URN)))); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = - buildResourceResolvers("dataset", RESOURCE_URN, Collections.emptySet(), Collections.singleton(DOMAIN_URN)); + ResolvedEntitySpec resourceSpec = + buildEntityResolvers("dataset", RESOURCE_URN, Collections.emptySet(), Collections.singleton(DOMAIN_URN), Collections.emptySet()); PolicyEngine.PolicyEvaluationResult result = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertTrue(result.isGranted()); @@ -899,14 +900,14 @@ public void testEvaluatePolicyResourceFilterSpecificResourceNoMatchDomain() thro final DataHubResourceFilter resourceFilter = new DataHubResourceFilter(); resourceFilter.setFilter(FilterUtils.newFilter( - ImmutableMap.of(ResourceFieldType.RESOURCE_TYPE, Collections.singletonList("dataset"), ResourceFieldType.DOMAIN, + ImmutableMap.of(EntityFieldType.TYPE, Collections.singletonList("dataset"), EntityFieldType.DOMAIN, Collections.singletonList(DOMAIN_URN)))); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN, Collections.emptySet(), - Collections.singleton("urn:li:domain:domain2")); // Domain doesn't match + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN, Collections.emptySet(), + Collections.singleton("urn:li:domain:domain2"), Collections.emptySet()); // Domain doesn't match PolicyEngine.PolicyEvaluationResult result = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertFalse(result.isGranted()); @@ -933,7 +934,7 @@ public void testGetGrantedPrivileges() throws Exception { final DataHubResourceFilter resourceFilter1 = new DataHubResourceFilter(); resourceFilter1.setFilter(FilterUtils.newFilter( - ImmutableMap.of(ResourceFieldType.RESOURCE_TYPE, Collections.singletonList("dataset"), ResourceFieldType.DOMAIN, + ImmutableMap.of(EntityFieldType.TYPE, Collections.singletonList("dataset"), EntityFieldType.DOMAIN, Collections.singletonList(DOMAIN_URN)))); dataHubPolicyInfo1.setResources(resourceFilter1); @@ -954,8 +955,8 @@ public void testGetGrantedPrivileges() throws Exception { final DataHubResourceFilter resourceFilter2 = new DataHubResourceFilter(); resourceFilter2.setFilter(FilterUtils.newFilter( - ImmutableMap.of(ResourceFieldType.RESOURCE_TYPE, Collections.singletonList("dataset"), - ResourceFieldType.RESOURCE_URN, Collections.singletonList(RESOURCE_URN)))); + 
ImmutableMap.of(EntityFieldType.TYPE, Collections.singletonList("dataset"), + EntityFieldType.URN, Collections.singletonList(RESOURCE_URN)))); dataHubPolicyInfo2.setResources(resourceFilter2); // Policy 3, match dataset type and owner (legacy resource filter) @@ -981,25 +982,25 @@ public void testGetGrantedPrivileges() throws Exception { final List policies = ImmutableList.of(dataHubPolicyInfo1, dataHubPolicyInfo2, dataHubPolicyInfo3); - assertEquals(_policyEngine.getGrantedPrivileges(policies, UrnUtils.getUrn(AUTHORIZED_PRINCIPAL), Optional.empty()), + assertEquals(_policyEngine.getGrantedPrivileges(policies, resolvedAuthorizedUserSpec, Optional.empty()), Collections.emptyList()); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN, Collections.emptySet(), - Collections.singleton(DOMAIN_URN)); // Everything matches + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN, Collections.emptySet(), + Collections.singleton(DOMAIN_URN), Collections.emptySet()); // Everything matches assertEquals( - _policyEngine.getGrantedPrivileges(policies, UrnUtils.getUrn(AUTHORIZED_PRINCIPAL), Optional.of(resourceSpec)), + _policyEngine.getGrantedPrivileges(policies, resolvedAuthorizedUserSpec, Optional.of(resourceSpec)), ImmutableList.of("PRIVILEGE_1", "PRIVILEGE_2_1", "PRIVILEGE_2_2")); - resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN, Collections.emptySet(), - Collections.singleton("urn:li:domain:domain2")); // Domain doesn't match + resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN, Collections.emptySet(), + Collections.singleton("urn:li:domain:domain2"), Collections.emptySet()); // Domain doesn't match assertEquals( - _policyEngine.getGrantedPrivileges(policies, UrnUtils.getUrn(AUTHORIZED_PRINCIPAL), Optional.of(resourceSpec)), + _policyEngine.getGrantedPrivileges(policies, resolvedAuthorizedUserSpec, Optional.of(resourceSpec)), ImmutableList.of("PRIVILEGE_2_1", "PRIVILEGE_2_2")); - resourceSpec = buildResourceResolvers("dataset", "urn:li:dataset:random", Collections.emptySet(), - Collections.singleton(DOMAIN_URN)); // Resource doesn't match + resourceSpec = buildEntityResolvers("dataset", "urn:li:dataset:random", Collections.emptySet(), + Collections.singleton(DOMAIN_URN), Collections.emptySet()); // Resource doesn't match assertEquals( - _policyEngine.getGrantedPrivileges(policies, UrnUtils.getUrn(AUTHORIZED_PRINCIPAL), Optional.of(resourceSpec)), + _policyEngine.getGrantedPrivileges(policies, resolvedAuthorizedUserSpec, Optional.of(resourceSpec)), ImmutableList.of("PRIVILEGE_1")); final EntityResponse entityResponse = new EntityResponse(); @@ -1008,16 +1009,16 @@ public void testGetGrantedPrivileges() throws Exception { entityResponse.setAspects(aspectMap); when(_entityClient.getV2(eq(resourceUrn.getEntityType()), eq(resourceUrn), eq(Collections.singleton(Constants.OWNERSHIP_ASPECT_NAME)), any())).thenReturn(entityResponse); - resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN, Collections.singleton(AUTHORIZED_PRINCIPAL), - Collections.singleton(DOMAIN_URN)); // Is owner + resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN, Collections.singleton(AUTHORIZED_PRINCIPAL), + Collections.singleton(DOMAIN_URN), Collections.emptySet()); // Is owner assertEquals( - _policyEngine.getGrantedPrivileges(policies, UrnUtils.getUrn(AUTHORIZED_PRINCIPAL), Optional.of(resourceSpec)), + _policyEngine.getGrantedPrivileges(policies, resolvedAuthorizedUserSpec, Optional.of(resourceSpec)), ImmutableList.of("PRIVILEGE_1", 
"PRIVILEGE_2_1", "PRIVILEGE_2_2", "PRIVILEGE_3")); - resourceSpec = buildResourceResolvers("chart", RESOURCE_URN, Collections.singleton(AUTHORIZED_PRINCIPAL), - Collections.singleton(DOMAIN_URN)); // Resource type doesn't match + resourceSpec = buildEntityResolvers("chart", RESOURCE_URN, Collections.singleton(AUTHORIZED_PRINCIPAL), + Collections.singleton(DOMAIN_URN), Collections.emptySet()); // Resource type doesn't match assertEquals( - _policyEngine.getGrantedPrivileges(policies, UrnUtils.getUrn(AUTHORIZED_PRINCIPAL), Optional.of(resourceSpec)), + _policyEngine.getGrantedPrivileges(policies, resolvedAuthorizedUserSpec, Optional.of(resourceSpec)), Collections.emptyList()); } @@ -1050,9 +1051,9 @@ public void testGetMatchingActorsResourceMatch() throws Exception { resourceFilter.setResources(resourceUrns); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = - buildResourceResolvers("dataset", RESOURCE_URN, ImmutableSet.of(AUTHORIZED_PRINCIPAL, AUTHORIZED_GROUP), - Collections.emptySet()); + ResolvedEntitySpec resourceSpec = + buildEntityResolvers("dataset", RESOURCE_URN, ImmutableSet.of(AUTHORIZED_PRINCIPAL, AUTHORIZED_GROUP), + Collections.emptySet(), Collections.emptySet()); PolicyEngine.PolicyActors actors = _policyEngine.getMatchingActors(dataHubPolicyInfo, Optional.of(resourceSpec)); assertTrue(actors.allUsers()); @@ -1101,8 +1102,8 @@ public void testGetMatchingActorsNoResourceMatch() throws Exception { resourceFilter.setResources(resourceUrns); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = - buildResourceResolvers("dataset", "urn:li:dataset:random"); // A resource not covered by the policy. + ResolvedEntitySpec resourceSpec = + buildEntityResolvers("dataset", "urn:li:dataset:random"); // A resource not covered by the policy. 
PolicyEngine.PolicyActors actors = _policyEngine.getMatchingActors(dataHubPolicyInfo, Optional.of(resourceSpec)); assertFalse(actors.allUsers()); @@ -1155,21 +1156,6 @@ private EntityResponse createAuthorizedEntityResponse() throws URISyntaxExceptio final EntityResponse entityResponse = new EntityResponse(); final EnvelopedAspectMap aspectMap = new EnvelopedAspectMap(); - final CorpUserInfo userInfo = new CorpUserInfo(); - userInfo.setActive(true); - userInfo.setFullName("Data Hub"); - userInfo.setFirstName("Data"); - userInfo.setLastName("Hub"); - userInfo.setEmail("datahub@gmail.com"); - userInfo.setTitle("Admin"); - aspectMap.put(CORP_USER_INFO_ASPECT_NAME, new EnvelopedAspect().setValue(new Aspect(userInfo.data()))); - - final GroupMembership groupsAspect = new GroupMembership(); - final UrnArray groups = new UrnArray(); - groups.add(Urn.createFromString("urn:li:corpGroup:authorizedGroup")); - groupsAspect.setGroups(groups); - aspectMap.put(GROUP_MEMBERSHIP_ASPECT_NAME, new EnvelopedAspect().setValue(new Aspect(groupsAspect.data()))); - final RoleMembership rolesAspect = new RoleMembership(); final UrnArray roles = new UrnArray(); roles.add(Urn.createFromString("urn:li:dataHubRole:admin")); @@ -1184,21 +1170,6 @@ private EntityResponse createUnauthorizedEntityResponse() throws URISyntaxExcept final EntityResponse entityResponse = new EntityResponse(); final EnvelopedAspectMap aspectMap = new EnvelopedAspectMap(); - final CorpUserInfo userInfo = new CorpUserInfo(); - userInfo.setActive(true); - userInfo.setFullName("Unauthorized User"); - userInfo.setFirstName("Unauthorized"); - userInfo.setLastName("User"); - userInfo.setEmail("Unauth"); - userInfo.setTitle("Engineer"); - aspectMap.put(CORP_USER_INFO_ASPECT_NAME, new EnvelopedAspect().setValue(new Aspect(userInfo.data()))); - - final GroupMembership groupsAspect = new GroupMembership(); - final UrnArray groups = new UrnArray(); - groups.add(Urn.createFromString("urn:li:corpGroup:unauthorizedGroup")); - groupsAspect.setGroups(groups); - aspectMap.put(GROUP_MEMBERSHIP_ASPECT_NAME, new EnvelopedAspect().setValue(new Aspect(groupsAspect.data()))); - final RoleMembership rolesAspect = new RoleMembership(); final UrnArray roles = new UrnArray(); roles.add(Urn.createFromString("urn:li:dataHubRole:reader")); @@ -1209,17 +1180,18 @@ private EntityResponse createUnauthorizedEntityResponse() throws URISyntaxExcept return entityResponse; } - public static ResolvedResourceSpec buildResourceResolvers(String entityType, String entityUrn) { - return buildResourceResolvers(entityType, entityUrn, Collections.emptySet(), Collections.emptySet()); + public static ResolvedEntitySpec buildEntityResolvers(String entityType, String entityUrn) { + return buildEntityResolvers(entityType, entityUrn, Collections.emptySet(), Collections.emptySet(), Collections.emptySet()); } - public static ResolvedResourceSpec buildResourceResolvers(String entityType, String entityUrn, Set owners, - Set domains) { - return new ResolvedResourceSpec(new ResourceSpec(entityType, entityUrn), - ImmutableMap.of(ResourceFieldType.RESOURCE_TYPE, - FieldResolver.getResolverFromValues(Collections.singleton(entityType)), ResourceFieldType.RESOURCE_URN, - FieldResolver.getResolverFromValues(Collections.singleton(entityUrn)), ResourceFieldType.OWNER, - FieldResolver.getResolverFromValues(owners), ResourceFieldType.DOMAIN, - FieldResolver.getResolverFromValues(domains))); + public static ResolvedEntitySpec buildEntityResolvers(String entityType, String entityUrn, Set owners, + Set domains, Set 
groups) { + return new ResolvedEntitySpec(new EntitySpec(entityType, entityUrn), + ImmutableMap.of(EntityFieldType.TYPE, + FieldResolver.getResolverFromValues(Collections.singleton(entityType)), EntityFieldType.URN, + FieldResolver.getResolverFromValues(Collections.singleton(entityUrn)), EntityFieldType.OWNER, + FieldResolver.getResolverFromValues(owners), EntityFieldType.DOMAIN, + FieldResolver.getResolverFromValues(domains), EntityFieldType.GROUP_MEMBERSHIP, + FieldResolver.getResolverFromValues(groups))); } } diff --git a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/fieldresolverprovider/DataPlatformInstanceFieldResolverProviderTest.java b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/fieldresolverprovider/DataPlatformInstanceFieldResolverProviderTest.java new file mode 100644 index 0000000000000..b2343bbb01509 --- /dev/null +++ b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/fieldresolverprovider/DataPlatformInstanceFieldResolverProviderTest.java @@ -0,0 +1,193 @@ +package com.datahub.authorization.fieldresolverprovider; + +import static com.linkedin.metadata.Constants.DATASET_ENTITY_NAME; +import static com.linkedin.metadata.Constants.DATA_PLATFORM_INSTANCE_ASPECT_NAME; +import static com.linkedin.metadata.Constants.DATA_PLATFORM_INSTANCE_ENTITY_NAME; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.verifyZeroInteractions; +import static org.mockito.Mockito.when; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; + +import com.datahub.authentication.Authentication; +import com.datahub.authorization.EntityFieldType; +import com.datahub.authorization.EntitySpec; +import com.linkedin.common.DataPlatformInstance; +import com.linkedin.common.urn.Urn; +import com.linkedin.entity.Aspect; +import com.linkedin.entity.EntityResponse; +import com.linkedin.entity.EnvelopedAspect; +import com.linkedin.entity.EnvelopedAspectMap; +import com.linkedin.entity.client.EntityClient; +import com.linkedin.r2.RemoteInvocationException; +import java.net.URISyntaxException; +import java.util.Collections; +import java.util.Set; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +public class DataPlatformInstanceFieldResolverProviderTest { + + private static final String DATA_PLATFORM_INSTANCE_URN = + "urn:li:dataPlatformInstance:(urn:li:dataPlatform:s3,test-platform-instance)"; + private static final String RESOURCE_URN = + "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.testDataset,PROD)"; + private static final EntitySpec RESOURCE_SPEC = new EntitySpec(DATASET_ENTITY_NAME, RESOURCE_URN); + + @Mock + private EntityClient entityClientMock; + @Mock + private Authentication systemAuthenticationMock; + + private DataPlatformInstanceFieldResolverProvider dataPlatformInstanceFieldResolverProvider; + + @BeforeMethod + public void setup() { + MockitoAnnotations.initMocks(this); + dataPlatformInstanceFieldResolverProvider = + new DataPlatformInstanceFieldResolverProvider(entityClientMock, systemAuthenticationMock); + } + + @Test + public void shouldReturnDataPlatformInstanceType() { + assertEquals(EntityFieldType.DATA_PLATFORM_INSTANCE, dataPlatformInstanceFieldResolverProvider.getFieldType()); + } 
+ + @Test + public void shouldReturnFieldValueWithResourceSpecIfTypeIsDataPlatformInstance() { + var resourceSpec = new EntitySpec(DATA_PLATFORM_INSTANCE_ENTITY_NAME, DATA_PLATFORM_INSTANCE_URN); + + var result = dataPlatformInstanceFieldResolverProvider.getFieldResolver(resourceSpec); + + assertEquals(Set.of(DATA_PLATFORM_INSTANCE_URN), result.getFieldValuesFuture().join().getValues()); + verifyZeroInteractions(entityClientMock); + } + + @Test + public void shouldReturnEmptyFieldValueWhenResponseIsNull() throws RemoteInvocationException, URISyntaxException { + when(entityClientMock.getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(Collections.singleton(DATA_PLATFORM_INSTANCE_ASPECT_NAME)), + eq(systemAuthenticationMock) + )).thenReturn(null); + + var result = dataPlatformInstanceFieldResolverProvider.getFieldResolver(RESOURCE_SPEC); + + assertTrue(result.getFieldValuesFuture().join().getValues().isEmpty()); + verify(entityClientMock, times(1)).getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(Collections.singleton(DATA_PLATFORM_INSTANCE_ASPECT_NAME)), + eq(systemAuthenticationMock) + ); + } + + @Test + public void shouldReturnEmptyFieldValueWhenResourceHasNoDataPlatformInstance() + throws RemoteInvocationException, URISyntaxException { + var entityResponseMock = mock(EntityResponse.class); + when(entityResponseMock.getAspects()).thenReturn(new EnvelopedAspectMap()); + when(entityClientMock.getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(Collections.singleton(DATA_PLATFORM_INSTANCE_ASPECT_NAME)), + eq(systemAuthenticationMock) + )).thenReturn(entityResponseMock); + + var result = dataPlatformInstanceFieldResolverProvider.getFieldResolver(RESOURCE_SPEC); + + assertTrue(result.getFieldValuesFuture().join().getValues().isEmpty()); + verify(entityClientMock, times(1)).getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(Collections.singleton(DATA_PLATFORM_INSTANCE_ASPECT_NAME)), + eq(systemAuthenticationMock) + ); + } + + @Test + public void shouldReturnEmptyFieldValueWhenThereIsAnException() throws RemoteInvocationException, URISyntaxException { + when(entityClientMock.getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(Collections.singleton(DATA_PLATFORM_INSTANCE_ASPECT_NAME)), + eq(systemAuthenticationMock) + )).thenThrow(new RemoteInvocationException()); + + var result = dataPlatformInstanceFieldResolverProvider.getFieldResolver(RESOURCE_SPEC); + + assertTrue(result.getFieldValuesFuture().join().getValues().isEmpty()); + verify(entityClientMock, times(1)).getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(Collections.singleton(DATA_PLATFORM_INSTANCE_ASPECT_NAME)), + eq(systemAuthenticationMock) + ); + } + + @Test + public void shouldReturnEmptyFieldValueWhenDataPlatformInstanceHasNoInstance() + throws RemoteInvocationException, URISyntaxException { + + var dataPlatform = new DataPlatformInstance() + .setPlatform(Urn.createFromString("urn:li:dataPlatform:s3")); + var entityResponseMock = mock(EntityResponse.class); + var envelopedAspectMap = new EnvelopedAspectMap(); + envelopedAspectMap.put(DATA_PLATFORM_INSTANCE_ASPECT_NAME, + new EnvelopedAspect().setValue(new Aspect(dataPlatform.data()))); + when(entityResponseMock.getAspects()).thenReturn(envelopedAspectMap); + when(entityClientMock.getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(Collections.singleton(DATA_PLATFORM_INSTANCE_ASPECT_NAME)), + eq(systemAuthenticationMock) + )).thenReturn(entityResponseMock); + + var result = 
dataPlatformInstanceFieldResolverProvider.getFieldResolver(RESOURCE_SPEC); + + assertTrue(result.getFieldValuesFuture().join().getValues().isEmpty()); + verify(entityClientMock, times(1)).getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(Collections.singleton(DATA_PLATFORM_INSTANCE_ASPECT_NAME)), + eq(systemAuthenticationMock) + ); + } + + @Test + public void shouldReturnFieldValueWithDataPlatformInstanceOfTheResource() + throws RemoteInvocationException, URISyntaxException { + + var dataPlatformInstance = new DataPlatformInstance() + .setPlatform(Urn.createFromString("urn:li:dataPlatform:s3")) + .setInstance(Urn.createFromString(DATA_PLATFORM_INSTANCE_URN)); + var entityResponseMock = mock(EntityResponse.class); + var envelopedAspectMap = new EnvelopedAspectMap(); + envelopedAspectMap.put(DATA_PLATFORM_INSTANCE_ASPECT_NAME, + new EnvelopedAspect().setValue(new Aspect(dataPlatformInstance.data()))); + when(entityResponseMock.getAspects()).thenReturn(envelopedAspectMap); + when(entityClientMock.getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(Collections.singleton(DATA_PLATFORM_INSTANCE_ASPECT_NAME)), + eq(systemAuthenticationMock) + )).thenReturn(entityResponseMock); + + var result = dataPlatformInstanceFieldResolverProvider.getFieldResolver(RESOURCE_SPEC); + + assertEquals(Set.of(DATA_PLATFORM_INSTANCE_URN), result.getFieldValuesFuture().join().getValues()); + verify(entityClientMock, times(1)).getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(Collections.singleton(DATA_PLATFORM_INSTANCE_ASPECT_NAME)), + eq(systemAuthenticationMock) + ); + } +} diff --git a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/fieldresolverprovider/GroupMembershipFieldResolverProviderTest.java b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/fieldresolverprovider/GroupMembershipFieldResolverProviderTest.java new file mode 100644 index 0000000000000..54675045b4413 --- /dev/null +++ b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/fieldresolverprovider/GroupMembershipFieldResolverProviderTest.java @@ -0,0 +1,212 @@ +package com.datahub.authorization.fieldresolverprovider; + +import com.datahub.authentication.Authentication; +import com.datahub.authorization.EntityFieldType; +import com.datahub.authorization.EntitySpec; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import com.linkedin.common.UrnArray; +import com.linkedin.common.urn.Urn; +import com.linkedin.entity.Aspect; +import com.linkedin.entity.EntityResponse; +import com.linkedin.entity.EnvelopedAspect; +import com.linkedin.entity.EnvelopedAspectMap; +import com.linkedin.entity.client.EntityClient; +import com.linkedin.identity.GroupMembership; +import com.linkedin.identity.NativeGroupMembership; +import com.linkedin.r2.RemoteInvocationException; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.net.URISyntaxException; +import java.util.Set; + +import static com.linkedin.metadata.Constants.*; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.*; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; + +public class GroupMembershipFieldResolverProviderTest { + + private static final String CORPGROUP_URN = "urn:li:corpGroup:groupname"; + private static final String NATIVE_CORPGROUP_URN = 
"urn:li:corpGroup:nativegroupname"; + private static final String RESOURCE_URN = "urn:li:dataset:(urn:li:dataPlatform:testPlatform,testDataset,PROD)"; + private static final EntitySpec RESOURCE_SPEC = new EntitySpec(DATASET_ENTITY_NAME, RESOURCE_URN); + + @Mock + private EntityClient entityClientMock; + @Mock + private Authentication systemAuthenticationMock; + + private GroupMembershipFieldResolverProvider groupMembershipFieldResolverProvider; + + @BeforeMethod + public void setup() { + MockitoAnnotations.initMocks(this); + groupMembershipFieldResolverProvider = + new GroupMembershipFieldResolverProvider(entityClientMock, systemAuthenticationMock); + } + + @Test + public void shouldReturnGroupsMembershipType() { + assertEquals(EntityFieldType.GROUP_MEMBERSHIP, groupMembershipFieldResolverProvider.getFieldType()); + } + + @Test + public void shouldReturnEmptyFieldValueWhenResponseIsNull() throws RemoteInvocationException, URISyntaxException { + when(entityClientMock.getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(ImmutableSet.of(GROUP_MEMBERSHIP_ASPECT_NAME, NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME)), + eq(systemAuthenticationMock) + )).thenReturn(null); + + var result = groupMembershipFieldResolverProvider.getFieldResolver(RESOURCE_SPEC); + + assertTrue(result.getFieldValuesFuture().join().getValues().isEmpty()); + verify(entityClientMock, times(1)).getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(ImmutableSet.of(GROUP_MEMBERSHIP_ASPECT_NAME, NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME)), + eq(systemAuthenticationMock) + ); + } + + @Test + public void shouldReturnEmptyFieldValueWhenResourceDoesNotBelongToAnyGroup() + throws RemoteInvocationException, URISyntaxException { + var entityResponseMock = mock(EntityResponse.class); + when(entityResponseMock.getAspects()).thenReturn(new EnvelopedAspectMap()); + when(entityClientMock.getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(ImmutableSet.of(GROUP_MEMBERSHIP_ASPECT_NAME, NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME)), + eq(systemAuthenticationMock) + )).thenReturn(entityResponseMock); + + var result = groupMembershipFieldResolverProvider.getFieldResolver(RESOURCE_SPEC); + + assertTrue(result.getFieldValuesFuture().join().getValues().isEmpty()); + verify(entityClientMock, times(1)).getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(ImmutableSet.of(GROUP_MEMBERSHIP_ASPECT_NAME, NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME)), + eq(systemAuthenticationMock) + ); + } + + @Test + public void shouldReturnEmptyFieldValueWhenThereIsAnException() throws RemoteInvocationException, URISyntaxException { + when(entityClientMock.getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(ImmutableSet.of(GROUP_MEMBERSHIP_ASPECT_NAME, NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME)), + eq(systemAuthenticationMock) + )).thenThrow(new RemoteInvocationException()); + + var result = groupMembershipFieldResolverProvider.getFieldResolver(RESOURCE_SPEC); + + assertTrue(result.getFieldValuesFuture().join().getValues().isEmpty()); + verify(entityClientMock, times(1)).getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(ImmutableSet.of(GROUP_MEMBERSHIP_ASPECT_NAME, NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME)), + eq(systemAuthenticationMock) + ); + } + + @Test + public void shouldReturnFieldValueWithOnlyGroupsOfTheResource() + throws RemoteInvocationException, URISyntaxException { + + var groupMembership = new GroupMembership().setGroups( + new UrnArray(ImmutableList.of(Urn.createFromString(CORPGROUP_URN)))); + var entityResponseMock = mock(EntityResponse.class); + var 
envelopedAspectMap = new EnvelopedAspectMap(); + envelopedAspectMap.put(GROUP_MEMBERSHIP_ASPECT_NAME, + new EnvelopedAspect().setValue(new Aspect(groupMembership.data()))); + when(entityResponseMock.getAspects()).thenReturn(envelopedAspectMap); + when(entityClientMock.getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(ImmutableSet.of(GROUP_MEMBERSHIP_ASPECT_NAME, NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME)), + eq(systemAuthenticationMock) + )).thenReturn(entityResponseMock); + + var result = groupMembershipFieldResolverProvider.getFieldResolver(RESOURCE_SPEC); + + assertEquals(Set.of(CORPGROUP_URN), result.getFieldValuesFuture().join().getValues()); + verify(entityClientMock, times(1)).getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(ImmutableSet.of(GROUP_MEMBERSHIP_ASPECT_NAME, NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME)), + eq(systemAuthenticationMock) + ); + } + + @Test + public void shouldReturnFieldValueWithOnlyNativeGroupsOfTheResource() + throws RemoteInvocationException, URISyntaxException { + + var nativeGroupMembership = new NativeGroupMembership().setNativeGroups( + new UrnArray(ImmutableList.of(Urn.createFromString(NATIVE_CORPGROUP_URN)))); + var entityResponseMock = mock(EntityResponse.class); + var envelopedAspectMap = new EnvelopedAspectMap(); + envelopedAspectMap.put(NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME, + new EnvelopedAspect().setValue(new Aspect(nativeGroupMembership.data()))); + when(entityResponseMock.getAspects()).thenReturn(envelopedAspectMap); + when(entityClientMock.getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(ImmutableSet.of(GROUP_MEMBERSHIP_ASPECT_NAME, NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME)), + eq(systemAuthenticationMock) + )).thenReturn(entityResponseMock); + + var result = groupMembershipFieldResolverProvider.getFieldResolver(RESOURCE_SPEC); + + assertEquals(Set.of(NATIVE_CORPGROUP_URN), result.getFieldValuesFuture().join().getValues()); + verify(entityClientMock, times(1)).getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(ImmutableSet.of(GROUP_MEMBERSHIP_ASPECT_NAME, NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME)), + eq(systemAuthenticationMock) + ); + } + + @Test + public void shouldReturnFieldValueWithGroupsAndNativeGroupsOfTheResource() + throws RemoteInvocationException, URISyntaxException { + + var groupMembership = new GroupMembership().setGroups( + new UrnArray(ImmutableList.of(Urn.createFromString(CORPGROUP_URN)))); + var nativeGroupMembership = new NativeGroupMembership().setNativeGroups( + new UrnArray(ImmutableList.of(Urn.createFromString(NATIVE_CORPGROUP_URN)))); + var entityResponseMock = mock(EntityResponse.class); + var envelopedAspectMap = new EnvelopedAspectMap(); + envelopedAspectMap.put(GROUP_MEMBERSHIP_ASPECT_NAME, + new EnvelopedAspect().setValue(new Aspect(groupMembership.data()))); + envelopedAspectMap.put(NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME, + new EnvelopedAspect().setValue(new Aspect(nativeGroupMembership.data()))); + when(entityResponseMock.getAspects()).thenReturn(envelopedAspectMap); + when(entityClientMock.getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(ImmutableSet.of(GROUP_MEMBERSHIP_ASPECT_NAME, NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME)), + eq(systemAuthenticationMock) + )).thenReturn(entityResponseMock); + + var result = groupMembershipFieldResolverProvider.getFieldResolver(RESOURCE_SPEC); + + assertEquals(Set.of(CORPGROUP_URN, NATIVE_CORPGROUP_URN), result.getFieldValuesFuture().join().getValues()); + verify(entityClientMock, times(1)).getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + 
eq(ImmutableSet.of(GROUP_MEMBERSHIP_ASPECT_NAME, NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME)), + eq(systemAuthenticationMock) + ); + } +} \ No newline at end of file diff --git a/metadata-service/configuration/src/main/resources/application.yml b/metadata-service/configuration/src/main/resources/application.yml index 4dfd96ac75c6c..d22f92adca8f9 100644 --- a/metadata-service/configuration/src/main/resources/application.yml +++ b/metadata-service/configuration/src/main/resources/application.yml @@ -276,6 +276,10 @@ bootstrap: enabled: ${UPGRADE_DEFAULT_BROWSE_PATHS_ENABLED:false} # enable to run the upgrade to migrate legacy default browse paths to new ones backfillBrowsePathsV2: enabled: ${BACKFILL_BROWSE_PATHS_V2:false} # Enables running the backfill of browsePathsV2 upgrade step. There are concerns about the load of this step so hiding it behind a flag. Deprecating in favor of running through SystemUpdate + policies: + file: ${BOOTSTRAP_POLICIES_FILE:classpath:boot/policies.json} + # eg for local file + # file: "file:///datahub/datahub-gms/resources/custom-policies.json" servlets: waitTimeout: ${BOOTSTRAP_SERVLETS_WAITTIMEOUT:60} # Total waiting time in seconds for servlets to initialize diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/AuthorizerChainFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/AuthorizerChainFactory.java index bf50a0c7b6473..b90257870a8b2 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/AuthorizerChainFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/AuthorizerChainFactory.java @@ -2,12 +2,12 @@ import com.datahub.authorization.AuthorizerChain; import com.datahub.authorization.DataHubAuthorizer; -import com.datahub.authorization.DefaultResourceSpecResolver; +import com.datahub.authorization.DefaultEntitySpecResolver; import com.datahub.plugins.PluginConstant; import com.datahub.authentication.Authentication; import com.datahub.plugins.auth.authorization.Authorizer; import com.datahub.authorization.AuthorizerContext; -import com.datahub.authorization.ResourceSpecResolver; +import com.datahub.authorization.EntitySpecResolver; import com.datahub.plugins.common.PluginConfig; import com.datahub.plugins.common.PluginPermissionManager; import com.datahub.plugins.common.PluginType; @@ -64,7 +64,7 @@ public class AuthorizerChainFactory { @Scope("singleton") @Nonnull protected AuthorizerChain getInstance() { - final ResourceSpecResolver resolver = initResolver(); + final EntitySpecResolver resolver = initResolver(); // Extract + initialize customer authorizers from application configs. 
final List authorizers = new ArrayList<>(initCustomAuthorizers(resolver)); @@ -79,11 +79,11 @@ protected AuthorizerChain getInstance() { return new AuthorizerChain(authorizers, dataHubAuthorizer); } - private ResourceSpecResolver initResolver() { - return new DefaultResourceSpecResolver(systemAuthentication, entityClient); + private EntitySpecResolver initResolver() { + return new DefaultEntitySpecResolver(systemAuthentication, entityClient); } - private List initCustomAuthorizers(ResourceSpecResolver resolver) { + private List initCustomAuthorizers(EntitySpecResolver resolver) { final List customAuthorizers = new ArrayList<>(); Path pluginBaseDirectory = Paths.get(configurationProvider.getDatahub().getPlugin().getAuth().getPath()); @@ -99,7 +99,7 @@ private List initCustomAuthorizers(ResourceSpecResolver resolver) { return customAuthorizers; } - private void registerAuthorizer(List customAuthorizers, ResourceSpecResolver resolver, Config config) { + private void registerAuthorizer(List customAuthorizers, EntitySpecResolver resolver, Config config) { PluginConfigFactory authorizerPluginPluginConfigFactory = new PluginConfigFactory(config); // Load only Authorizer configuration from plugin config factory List authorizers = diff --git a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/factories/BootstrapManagerFactory.java b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/factories/BootstrapManagerFactory.java index c490f00021201..3a761bd12647e 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/factories/BootstrapManagerFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/factories/BootstrapManagerFactory.java @@ -31,6 +31,7 @@ import com.linkedin.metadata.search.EntitySearchService; import com.linkedin.metadata.search.SearchService; import com.linkedin.metadata.search.transformer.SearchDocumentTransformer; + import java.util.ArrayList; import java.util.List; import javax.annotation.Nonnull; @@ -41,6 +42,7 @@ import org.springframework.context.annotation.Configuration; import org.springframework.context.annotation.Import; import org.springframework.context.annotation.Scope; +import org.springframework.core.io.Resource; @Configuration @@ -89,13 +91,16 @@ public class BootstrapManagerFactory { @Value("${bootstrap.backfillBrowsePathsV2.enabled}") private Boolean _backfillBrowsePathsV2Enabled; + @Value("${bootstrap.policies.file}") + private Resource _policiesResource; + @Bean(name = "bootstrapManager") @Scope("singleton") @Nonnull protected BootstrapManager createInstance() { final IngestRootUserStep ingestRootUserStep = new IngestRootUserStep(_entityService); final IngestPoliciesStep ingestPoliciesStep = - new IngestPoliciesStep(_entityRegistry, _entityService, _entitySearchService, _searchDocumentTransformer); + new IngestPoliciesStep(_entityRegistry, _entityService, _entitySearchService, _searchDocumentTransformer, _policiesResource); final IngestRolesStep ingestRolesStep = new IngestRolesStep(_entityService, _entityRegistry); final IngestDataPlatformsStep ingestDataPlatformsStep = new IngestDataPlatformsStep(_entityService); final IngestDataPlatformInstancesStep ingestDataPlatformInstancesStep = diff --git a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java index 87dcfd736da40..cf29645214466 100644 --- 
a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java +++ b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java @@ -25,6 +25,7 @@ import com.linkedin.mxe.GenericAspect; import com.linkedin.mxe.MetadataChangeProposal; import com.linkedin.policy.DataHubPolicyInfo; + import java.io.IOException; import java.net.URISyntaxException; import java.util.Collections; @@ -35,7 +36,8 @@ import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.springframework.core.io.ClassPathResource; +import org.springframework.core.io.Resource; + import static com.linkedin.metadata.Constants.*; @@ -52,6 +54,8 @@ public class IngestPoliciesStep implements BootstrapStep { private final EntitySearchService _entitySearchService; private final SearchDocumentTransformer _searchDocumentTransformer; + private final Resource _policiesResource; + @Override public String name() { return "IngestPoliciesStep"; @@ -66,10 +70,10 @@ public void execute() throws IOException, URISyntaxException { .maxStringLength(maxSize).build()); // 0. Execute preflight check to see whether we need to ingest policies - log.info("Ingesting default access policies..."); + log.info("Ingesting default access policies from: {}...", _policiesResource); // 1. Read from the file into JSON. - final JsonNode policiesObj = mapper.readTree(new ClassPathResource("./boot/policies.json").getFile()); + final JsonNode policiesObj = mapper.readTree(_policiesResource.getFile()); if (!policiesObj.isArray()) { throw new RuntimeException( diff --git a/metadata-service/openapi-entity-servlet/src/main/java/io/datahubproject/openapi/delegates/EntityApiDelegateImpl.java b/metadata-service/openapi-entity-servlet/src/main/java/io/datahubproject/openapi/delegates/EntityApiDelegateImpl.java index ade49c876f168..207c2284e2673 100644 --- a/metadata-service/openapi-entity-servlet/src/main/java/io/datahubproject/openapi/delegates/EntityApiDelegateImpl.java +++ b/metadata-service/openapi-entity-servlet/src/main/java/io/datahubproject/openapi/delegates/EntityApiDelegateImpl.java @@ -45,8 +45,7 @@ import io.datahubproject.openapi.util.OpenApiEntitiesUtil; import com.datahub.authorization.ConjunctivePrivilegeGroup; import com.datahub.authorization.DisjunctivePrivilegeGroup; -import com.linkedin.metadata.models.EntitySpec; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.linkedin.metadata.authorization.PoliciesConfig; import com.google.common.collect.ImmutableList; import com.datahub.authorization.AuthUtil; @@ -377,7 +376,7 @@ public ResponseEntity scroll(@Valid Boolean systemMetadata, @Valid List sort, @Valid SortOrder sortOrder, @Valid String query) { Authentication authentication = AuthenticationContext.getAuthentication(); - EntitySpec entitySpec = OpenApiEntitiesUtil.responseClassToEntitySpec(_entityRegistry, _respClazz); + com.linkedin.metadata.models.EntitySpec entitySpec = OpenApiEntitiesUtil.responseClassToEntitySpec(_entityRegistry, _respClazz); checkScrollAuthorized(authentication, entitySpec); // TODO multi-field sort @@ -410,12 +409,12 @@ public ResponseEntity scroll(@Valid Boolean systemMetadata, @Valid List> resourceSpecs = List.of(Optional.of(new ResourceSpec(entitySpec.getName(), ""))); + List> resourceSpecs = List.of(Optional.of(new EntitySpec(entitySpec.getName(), ""))); if (_restApiAuthorizationEnabled && !AuthUtil.isAuthorizedForResources(_authorizationChain, actorUrnStr, resourceSpecs, 
orGroup)) { throw new UnauthorizedException(actorUrnStr + " is unauthorized to get entities."); } } diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/entities/EntitiesController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/entities/EntitiesController.java index 6439e2f31f7b0..898f768cf999a 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/entities/EntitiesController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/entities/EntitiesController.java @@ -8,7 +8,7 @@ import com.datahub.authorization.AuthorizerChain; import com.datahub.authorization.ConjunctivePrivilegeGroup; import com.datahub.authorization.DisjunctivePrivilegeGroup; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.collect.ImmutableList; import com.linkedin.common.urn.Urn; @@ -93,8 +93,8 @@ public ResponseEntity getEntities( ImmutableList.of(PoliciesConfig.GET_ENTITY_PRIVILEGE.getType()) ))); - List<Optional<ResourceSpec>> resourceSpecs = entityUrns.stream() - .map(urn -> Optional.of(new ResourceSpec(urn.getEntityType(), urn.toString()))) + List<Optional<EntitySpec>> resourceSpecs = entityUrns.stream() + .map(urn -> Optional.of(new EntitySpec(urn.getEntityType(), urn.toString()))) .collect(Collectors.toList()); if (restApiAuthorizationEnabled && !AuthUtil.isAuthorizedForResources(_authorizerChain, actorUrnStr, resourceSpecs, orGroup)) { throw new UnauthorizedException(actorUrnStr + " is unauthorized to get entities."); @@ -175,8 +175,8 @@ public ResponseEntity> deleteEntities( .map(URLDecoder::decode) .map(UrnUtils::getUrn).collect(Collectors.toSet()); - List<Optional<ResourceSpec>> resourceSpecs = entityUrns.stream() - .map(urn -> Optional.of(new ResourceSpec(urn.getEntityType(), urn.toString()))) + List<Optional<EntitySpec>> resourceSpecs = entityUrns.stream() + .map(urn -> Optional.of(new EntitySpec(urn.getEntityType(), urn.toString()))) .collect(Collectors.toList()); if (restApiAuthorizationEnabled && !AuthUtil.isAuthorizedForResources(_authorizerChain, actorUrnStr, resourceSpecs, orGroup)) { UnauthorizedException unauthorizedException = new UnauthorizedException(actorUrnStr + " is unauthorized to delete entities."); diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/relationships/RelationshipsController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/relationships/RelationshipsController.java index 1e37170f37b3b..4641fed3a8610 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/relationships/RelationshipsController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/relationships/RelationshipsController.java @@ -8,7 +8,7 @@ import com.datahub.authorization.AuthorizerChain; import com.datahub.authorization.ConjunctivePrivilegeGroup; import com.datahub.authorization.DisjunctivePrivilegeGroup; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.google.common.collect.ImmutableList; import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; @@ -131,8 +131,8 @@ public ResponseEntity getRelationships( // Re-using GET_ENTITY_PRIVILEGE here as it doesn't make sense to split the privileges between these APIs.
))); - List> resourceSpecs = - Collections.singletonList(Optional.of(new ResourceSpec(entityUrn.getEntityType(), entityUrn.toString()))); + List> resourceSpecs = + Collections.singletonList(Optional.of(new EntitySpec(entityUrn.getEntityType(), entityUrn.toString()))); if (restApiAuthorizationEnabled && !AuthUtil.isAuthorizedForResources(_authorizerChain, actorUrnStr, resourceSpecs, orGroup)) { throw new UnauthorizedException(actorUrnStr + " is unauthorized to get relationships."); diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/timeline/TimelineController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/timeline/TimelineController.java index 5a0ce2e314e1b..fbde9e8072002 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/timeline/TimelineController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/timeline/TimelineController.java @@ -6,7 +6,7 @@ import com.datahub.authorization.AuthorizerChain; import com.datahub.authorization.ConjunctivePrivilegeGroup; import com.datahub.authorization.DisjunctivePrivilegeGroup; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.fasterxml.jackson.core.JsonProcessingException; import com.google.common.collect.ImmutableList; import com.linkedin.common.urn.Urn; @@ -67,7 +67,7 @@ public ResponseEntity> getTimeline( Urn urn = Urn.createFromString(rawUrn); Authentication authentication = AuthenticationContext.getAuthentication(); String actorUrnStr = authentication.getActor().toUrnStr(); - ResourceSpec resourceSpec = new ResourceSpec(urn.getEntityType(), rawUrn); + EntitySpec resourceSpec = new EntitySpec(urn.getEntityType(), rawUrn); DisjunctivePrivilegeGroup orGroup = new DisjunctivePrivilegeGroup( ImmutableList.of(new ConjunctivePrivilegeGroup(ImmutableList.of(PoliciesConfig.GET_TIMELINE_PRIVILEGE.getType())))); if (restApiAuthorizationEnabled && !AuthUtil.isAuthorized(_authorizerChain, actorUrnStr, Optional.of(resourceSpec), orGroup)) { diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java index 2b3e84e2df20f..21dc5a4c8a0d6 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java @@ -5,7 +5,7 @@ import com.datahub.authorization.AuthUtil; import com.datahub.plugins.auth.authorization.Authorizer; import com.datahub.authorization.DisjunctivePrivilegeGroup; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; @@ -27,7 +27,6 @@ import com.linkedin.metadata.entity.ebean.transactions.AspectsBatchImpl; import com.linkedin.metadata.entity.transactions.AspectsBatch; import com.linkedin.metadata.entity.validation.ValidationException; -import com.linkedin.metadata.models.EntitySpec; import com.linkedin.metadata.entity.AspectUtils; import com.linkedin.metadata.utils.EntityKeyUtils; import com.linkedin.metadata.utils.metrics.MetricUtils; @@ -378,11 +377,11 @@ public static GenericAspect convertGenericAspect(@Nonnull io.datahubproject.open public static boolean authorizeProposals(List 
proposals, EntityService entityService, Authorizer authorizer, String actorUrnStr, DisjunctivePrivilegeGroup orGroup) { - List<Optional<ResourceSpec>> resourceSpecs = proposals.stream() + List<Optional<EntitySpec>> resourceSpecs = proposals.stream() .map(proposal -> { - EntitySpec entitySpec = entityService.getEntityRegistry().getEntitySpec(proposal.getEntityType()); + com.linkedin.metadata.models.EntitySpec entitySpec = entityService.getEntityRegistry().getEntitySpec(proposal.getEntityType()); Urn entityUrn = EntityKeyUtils.getUrnFromProposal(proposal, entitySpec.getKeyAspectSpec()); - return Optional.of(new ResourceSpec(proposal.getEntityType(), entityUrn.toString())); + return Optional.of(new EntitySpec(proposal.getEntityType(), entityUrn.toString())); }) .collect(Collectors.toList()); return AuthUtil.isAuthorizedForResources(authorizer, actorUrnStr, resourceSpecs, orGroup); @@ -513,7 +512,7 @@ public static RollbackRunResultDto mapRollbackRunResult(RollbackRunResult rollba } public static UpsertAspectRequest createStatusRemoval(Urn urn, EntityService entityService) { - EntitySpec entitySpec = entityService.getEntityRegistry().getEntitySpec(urn.getEntityType()); + com.linkedin.metadata.models.EntitySpec entitySpec = entityService.getEntityRegistry().getEntitySpec(urn.getEntityType()); if (entitySpec == null || !entitySpec.getAspectSpecMap().containsKey(STATUS_ASPECT_NAME)) { throw new IllegalArgumentException("Entity type is not valid for soft deletes: " + urn.getEntityType()); } diff --git a/metadata-service/plugin/src/test/sample-test-plugins/src/main/java/com/datahub/plugins/test/TestAuthorizer.java b/metadata-service/plugin/src/test/sample-test-plugins/src/main/java/com/datahub/plugins/test/TestAuthorizer.java index b6bc282f10b65..442ac1b0d287b 100644 --- a/metadata-service/plugin/src/test/sample-test-plugins/src/main/java/com/datahub/plugins/test/TestAuthorizer.java +++ b/metadata-service/plugin/src/test/sample-test-plugins/src/main/java/com/datahub/plugins/test/TestAuthorizer.java @@ -4,7 +4,7 @@ import com.datahub.authorization.AuthorizationResult; import com.datahub.authorization.AuthorizedActors; import com.datahub.authorization.AuthorizerContext; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.datahub.plugins.PluginConstant; import com.datahub.plugins.auth.authorization.Authorizer; import java.io.BufferedReader; @@ -74,7 +74,7 @@ public AuthorizationResult authorize(@Nonnull AuthorizationRequest request) { } @Override - public AuthorizedActors authorizedActors(String privilege, Optional<ResourceSpec> resourceSpec) { + public AuthorizedActors authorizedActors(String privilege, Optional<EntitySpec> resourceSpec) { return new AuthorizedActors("ALL", null, null, true, true); } } diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java index 936c8bb67e645..af76af90ce77f 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java @@ -3,7 +3,7 @@ import com.codahale.metrics.MetricRegistry; import com.datahub.authentication.Authentication; import com.datahub.authentication.AuthenticationContext; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.datahub.plugins.auth.authorization.Authorizer;
import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableList; @@ -20,7 +20,6 @@ import com.linkedin.metadata.entity.AspectUtils; import com.linkedin.metadata.entity.EntityService; import com.linkedin.metadata.entity.validation.ValidationException; -import com.linkedin.metadata.models.EntitySpec; import com.linkedin.metadata.query.filter.Filter; import com.linkedin.metadata.query.filter.SortCriterion; import com.linkedin.metadata.restli.RestliUtil; @@ -123,7 +122,7 @@ public Task get(@Nonnull String urnStr, @QueryParam("aspect") @Option Authentication authentication = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(authentication, _authorizer, ImmutableList.of(PoliciesConfig.GET_ENTITY_PRIVILEGE), - new ResourceSpec(urn.getEntityType(), urn.toString()))) { + new EntitySpec(urn.getEntityType(), urn.toString()))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to get aspect for " + urn); } final VersionedAspect aspect = _entityService.getVersionedAspect(urn, aspectName, version); @@ -154,7 +153,7 @@ public Task getTimeseriesAspectValues( Authentication authentication = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(authentication, _authorizer, ImmutableList.of(PoliciesConfig.GET_TIMESERIES_ASPECT_PRIVILEGE), - new ResourceSpec(urn.getEntityType(), urn.toString()))) { + new EntitySpec(urn.getEntityType(), urn.toString()))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to get timeseries aspect for " + urn); } GetTimeseriesAspectValuesResponse response = new GetTimeseriesAspectValuesResponse(); @@ -193,11 +192,11 @@ public Task ingestProposal( } Authentication authentication = AuthenticationContext.getAuthentication(); - EntitySpec entitySpec = _entityService.getEntityRegistry().getEntitySpec(metadataChangeProposal.getEntityType()); + com.linkedin.metadata.models.EntitySpec entitySpec = _entityService.getEntityRegistry().getEntitySpec(metadataChangeProposal.getEntityType()); Urn urn = EntityKeyUtils.getUrnFromProposal(metadataChangeProposal, entitySpec.getKeyAspectSpec()); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(authentication, _authorizer, ImmutableList.of(PoliciesConfig.EDIT_ENTITY_PRIVILEGE), - new ResourceSpec(urn.getEntityType(), urn.toString()))) { + new EntitySpec(urn.getEntityType(), urn.toString()))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to modify entity " + urn); } String actorUrnStr = authentication.getActor().toUrnStr(); @@ -249,7 +248,7 @@ public Task getCount(@ActionParam(PARAM_ASPECT) @Nonnull String aspectN Authentication authentication = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(authentication, _authorizer, ImmutableList.of(PoliciesConfig.GET_COUNTS_PRIVILEGE), - (ResourceSpec) null)) { + (EntitySpec) null)) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to get aspect counts."); } return _entityService.getCountAspect(aspectName, urnLike); diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/BatchIngestionRunResource.java 
b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/BatchIngestionRunResource.java index 3ff22fb767676..9bab846d1bdcc 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/BatchIngestionRunResource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/BatchIngestionRunResource.java @@ -4,7 +4,7 @@ import com.datahub.authentication.Authentication; import com.datahub.authentication.AuthenticationContext; import com.datahub.plugins.auth.authorization.Authorizer; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.google.common.collect.ImmutableList; import com.linkedin.common.AuditStamp; import com.linkedin.common.urn.Urn; @@ -123,9 +123,9 @@ public Task rollback(@ActionParam("runId") @Nonnull String run List aspectRowsToDelete; aspectRowsToDelete = _systemMetadataService.findByRunId(runId, doHardDelete, 0, ESUtils.MAX_RESULT_SIZE); Set urns = aspectRowsToDelete.stream().collect(Collectors.groupingBy(AspectRowSummary::getUrn)).keySet(); - List> resourceSpecs = urns.stream() + List> resourceSpecs = urns.stream() .map(UrnUtils::getUrn) - .map(urn -> java.util.Optional.of(new ResourceSpec(urn.getEntityType(), urn.toString()))) + .map(urn -> java.util.Optional.of(new EntitySpec(urn.getEntityType(), urn.toString()))) .collect(Collectors.toList()); Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java index f6dedfb9a07c6..3ee98b3244718 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java @@ -3,7 +3,7 @@ import com.codahale.metrics.MetricRegistry; import com.datahub.authentication.Authentication; import com.datahub.authentication.AuthenticationContext; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.datahub.plugins.auth.authorization.Authorizer; import com.google.common.collect.ImmutableList; import com.linkedin.common.AuditStamp; @@ -173,7 +173,7 @@ public Task get(@Nonnull String urnStr, final Urn urn = Urn.createFromString(urnStr); Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) - && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.GET_ENTITY_PRIVILEGE), new ResourceSpec(urn.getEntityType(), urnStr))) { + && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.GET_ENTITY_PRIVILEGE), new EntitySpec(urn.getEntityType(), urnStr))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to get entity " + urn); } @@ -198,8 +198,8 @@ public Task> batchGet(@Nonnull Set urnStrs, for (final String urnStr : urnStrs) { urns.add(Urn.createFromString(urnStr)); } - List> resourceSpecs = urns.stream() - .map(urn -> java.util.Optional.of(new ResourceSpec(urn.getEntityType(), urn.toString()))) + List> resourceSpecs = urns.stream() + .map(urn -> java.util.Optional.of(new EntitySpec(urn.getEntityType(), urn.toString()))) 
.collect(Collectors.toList()); Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) @@ -242,7 +242,7 @@ public Task ingest(@ActionParam(PARAM_ENTITY) @Nonnull Entity entity, final Urn urn = com.datahub.util.ModelUtils.getUrnFromSnapshotUnion(entity.getValue()); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(authentication, _authorizer, ImmutableList.of(PoliciesConfig.EDIT_ENTITY_PRIVILEGE), - new ResourceSpec(urn.getEntityType(), urn.toString()))) { + new EntitySpec(urn.getEntityType(), urn.toString()))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to edit entity " + urn); } @@ -273,10 +273,10 @@ public Task batchIngest(@ActionParam(PARAM_ENTITIES) @Nonnull Entity[] ent Authentication authentication = AuthenticationContext.getAuthentication(); String actorUrnStr = authentication.getActor().toUrnStr(); - List> resourceSpecs = Arrays.stream(entities) + List> resourceSpecs = Arrays.stream(entities) .map(Entity::getValue) .map(com.datahub.util.ModelUtils::getUrnFromSnapshotUnion) - .map(urn -> java.util.Optional.of(new ResourceSpec(urn.getEntityType(), urn.toString()))) + .map(urn -> java.util.Optional.of(new EntitySpec(urn.getEntityType(), urn.toString()))) .collect(Collectors.toList()); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(authentication, _authorizer, ImmutableList.of(PoliciesConfig.EDIT_ENTITY_PRIVILEGE), resourceSpecs)) { @@ -322,7 +322,7 @@ public Task search(@ActionParam(PARAM_ENTITY) @Nonnull String enti @Optional @Nullable @ActionParam(PARAM_SEARCH_FLAGS) SearchFlags searchFlags) { Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) - && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.SEARCH_PRIVILEGE), (ResourceSpec) null)) { + && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.SEARCH_PRIVILEGE), (EntitySpec) null)) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to search."); } @@ -347,7 +347,7 @@ public Task searchAcrossEntities(@ActionParam(PARAM_ENTITIES) @Opt @ActionParam(PARAM_COUNT) int count, @ActionParam(PARAM_SEARCH_FLAGS) @Optional SearchFlags searchFlags) { Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) - && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.SEARCH_PRIVILEGE), (ResourceSpec) null)) { + && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.SEARCH_PRIVILEGE), (EntitySpec) null)) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to search."); } @@ -391,7 +391,7 @@ public Task searchAcrossLineage(@ActionParam(PARAM_URN) @No @Optional @Nullable @ActionParam(PARAM_SEARCH_FLAGS) SearchFlags searchFlags) throws URISyntaxException { Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) - && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.GET_ENTITY_PRIVILEGE), (ResourceSpec) null)) { + && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.GET_ENTITY_PRIVILEGE), (EntitySpec) null)) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to search."); } @@ -443,7 +443,7 @@ 
public Task list(@ActionParam(PARAM_ENTITY) @Nonnull String entityNa Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) - && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.SEARCH_PRIVILEGE), (ResourceSpec) null)) { + && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.SEARCH_PRIVILEGE), (EntitySpec) null)) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to search."); } @@ -462,7 +462,7 @@ public Task autocomplete(@ActionParam(PARAM_ENTITY) @Nonnull Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) - && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.SEARCH_PRIVILEGE), (ResourceSpec) null)) { + && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.SEARCH_PRIVILEGE), (EntitySpec) null)) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to search."); } @@ -479,7 +479,7 @@ public Task browse(@ActionParam(PARAM_ENTITY) @Nonnull String enti Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) - && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.SEARCH_PRIVILEGE), (ResourceSpec) null)) { + && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.SEARCH_PRIVILEGE), (EntitySpec) null)) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to search."); } @@ -497,7 +497,7 @@ public Task getBrowsePaths( Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.GET_ENTITY_PRIVILEGE), - new ResourceSpec(urn.getEntityType(), urn.toString()))) { + new EntitySpec(urn.getEntityType(), urn.toString()))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to get entity: " + urn); } @@ -546,9 +546,9 @@ public Task deleteEntities(@ActionParam("registryId") @Optiona log.info("found {} rows to delete...", stringifyRowCount(aspectRowsToDelete.size())); response.setAspectsAffected(aspectRowsToDelete.size()); Set urns = aspectRowsToDelete.stream().collect(Collectors.groupingBy(AspectRowSummary::getUrn)).keySet(); - List> resourceSpecs = urns.stream() + List> resourceSpecs = urns.stream() .map(UrnUtils::getUrn) - .map(urn -> java.util.Optional.of(new ResourceSpec(urn.getEntityType(), urn.toString()))) + .map(urn -> java.util.Optional.of(new EntitySpec(urn.getEntityType(), urn.toString()))) .collect(Collectors.toList()); Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) @@ -590,7 +590,7 @@ public Task deleteEntity(@ActionParam(PARAM_URN) @Nonnull Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.DELETE_ENTITY_PRIVILEGE), - Collections.singletonList(java.util.Optional.of(new ResourceSpec(urn.getEntityType(), urn.toString()))))) { + Collections.singletonList(java.util.Optional.of(new EntitySpec(urn.getEntityType(), urn.toString()))))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to 
delete entity: " + urnStr); } @@ -638,7 +638,7 @@ private Long deleteTimeseriesAspects(@Nonnull Urn urn, @Nullable Long startTimeM Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.DELETE_ENTITY_PRIVILEGE), - new ResourceSpec(urn.getEntityType(), urn.toString()))) { + new EntitySpec(urn.getEntityType(), urn.toString()))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to delete entity " + urn); } @@ -678,7 +678,7 @@ public Task deleteReferencesTo(@ActionParam(PARAM_URN) Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.DELETE_ENTITY_PRIVILEGE), - new ResourceSpec(urn.getEntityType(), urnStr))) { + new EntitySpec(urn.getEntityType(), urnStr))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to delete entity " + urnStr); } @@ -695,7 +695,7 @@ public Task deleteReferencesTo(@ActionParam(PARAM_URN) public Task setWriteable(@ActionParam(PARAM_VALUE) @Optional("true") @Nonnull Boolean value) { Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) - && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.SET_WRITEABLE_PRIVILEGE), (ResourceSpec) null)) { + && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.SET_WRITEABLE_PRIVILEGE), (EntitySpec) null)) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to enable and disable write mode."); } @@ -712,7 +712,7 @@ public Task setWriteable(@ActionParam(PARAM_VALUE) @Optional("true") @Nonn public Task getTotalEntityCount(@ActionParam(PARAM_ENTITY) @Nonnull String entityName) { Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) - && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.GET_COUNTS_PRIVILEGE), (ResourceSpec) null)) { + && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.GET_COUNTS_PRIVILEGE), (EntitySpec) null)) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to get entity counts."); } @@ -725,7 +725,7 @@ public Task getTotalEntityCount(@ActionParam(PARAM_ENTITY) @Nonnull String public Task batchGetTotalEntityCount(@ActionParam(PARAM_ENTITIES) @Nonnull String[] entityNames) { Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) - && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.GET_COUNTS_PRIVILEGE), (ResourceSpec) null)) { + && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.GET_COUNTS_PRIVILEGE), (EntitySpec) null)) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to get entity counts."); } @@ -739,7 +739,7 @@ public Task listUrns(@ActionParam(PARAM_ENTITY) @Nonnull String @ActionParam(PARAM_START) int start, @ActionParam(PARAM_COUNT) int count) throws URISyntaxException { Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) - && !isAuthorized(auth, _authorizer, 
ImmutableList.of(PoliciesConfig.SEARCH_PRIVILEGE), (ResourceSpec) null)) { + && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.SEARCH_PRIVILEGE), (EntitySpec) null)) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to search."); } @@ -757,10 +757,10 @@ public Task applyRetention(@ActionParam(PARAM_START) @Optional @Nullable @ActionParam(PARAM_URN) @Optional @Nullable String urn ) { Authentication auth = AuthenticationContext.getAuthentication(); - ResourceSpec resourceSpec = null; + EntitySpec resourceSpec = null; if (StringUtils.isNotBlank(urn)) { Urn resource = UrnUtils.getUrn(urn); - resourceSpec = new ResourceSpec(resource.getEntityType(), resource.toString()); + resourceSpec = new EntitySpec(resource.getEntityType(), resource.toString()); } if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.APPLY_RETENTION_PRIVILEGE), resourceSpec)) { @@ -781,7 +781,7 @@ public Task filter(@ActionParam(PARAM_ENTITY) @Nonnull String enti Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) - && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.SEARCH_PRIVILEGE), (ResourceSpec) null)) { + && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.SEARCH_PRIVILEGE), (EntitySpec) null)) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to search."); } @@ -799,7 +799,7 @@ public Task exists(@ActionParam(PARAM_URN) @Nonnull String urnStr) thro Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.GET_ENTITY_PRIVILEGE), - new ResourceSpec(urn.getEntityType(), urnStr))) { + new EntitySpec(urn.getEntityType(), urnStr))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized get entity: " + urnStr); } diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java index 7efb93c0f50e6..0c3e93273b863 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java @@ -4,7 +4,7 @@ import com.datahub.authentication.Authentication; import com.datahub.authentication.AuthenticationContext; import com.datahub.plugins.auth.authorization.Authorizer; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.google.common.collect.ImmutableList; import com.linkedin.common.urn.Urn; import com.linkedin.entity.EntityResponse; @@ -68,7 +68,7 @@ public Task get(@Nonnull String urnStr, final Urn urn = Urn.createFromString(urnStr); Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) - && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.GET_ENTITY_PRIVILEGE), new ResourceSpec(urn.getEntityType(), urnStr))) { + && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.GET_ENTITY_PRIVILEGE), new EntitySpec(urn.getEntityType(), urnStr))) { throw new 
RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to get entity " + urn); } @@ -96,8 +96,8 @@ public Task> batchGet(@Nonnull Set urnStrs, urns.add(Urn.createFromString(urnStr)); } Authentication auth = AuthenticationContext.getAuthentication(); - List> resourceSpecs = urns.stream() - .map(urn -> java.util.Optional.of(new ResourceSpec(urn.getEntityType(), urn.toString()))) + List> resourceSpecs = urns.stream() + .map(urn -> java.util.Optional.of(new EntitySpec(urn.getEntityType(), urn.toString()))) .collect(Collectors.toList()); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.GET_ENTITY_PRIVILEGE), resourceSpecs)) { diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityVersionedV2Resource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityVersionedV2Resource.java index fd5c3507b5408..05b7e6b3ff24b 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityVersionedV2Resource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityVersionedV2Resource.java @@ -4,7 +4,7 @@ import com.datahub.authentication.Authentication; import com.datahub.authentication.AuthenticationContext; import com.datahub.plugins.auth.authorization.Authorizer; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.google.common.collect.ImmutableList; import com.linkedin.common.VersionedUrn; import com.linkedin.common.urn.Urn; @@ -65,9 +65,9 @@ public Task> batchGetVersioned( @QueryParam(PARAM_ENTITY_TYPE) @Nonnull String entityType, @QueryParam(PARAM_ASPECTS) @Optional @Nullable String[] aspectNames) { Authentication auth = AuthenticationContext.getAuthentication(); - List> resourceSpecs = versionedUrnStrs.stream() + List> resourceSpecs = versionedUrnStrs.stream() .map(versionedUrn -> UrnUtils.getUrn(versionedUrn.getUrn())) - .map(urn -> java.util.Optional.of(new ResourceSpec(urn.getEntityType(), urn.toString()))) + .map(urn -> java.util.Optional.of(new EntitySpec(urn.getEntityType(), urn.toString()))) .collect(Collectors.toList()); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.GET_ENTITY_PRIVILEGE), resourceSpecs)) { diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/lineage/Relationships.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/lineage/Relationships.java index 313d16333f9e9..4a8e74c89039a 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/lineage/Relationships.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/lineage/Relationships.java @@ -4,7 +4,7 @@ import com.datahub.authentication.Authentication; import com.datahub.authentication.AuthenticationContext; import com.datahub.plugins.auth.authorization.Authorizer; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.google.common.collect.ImmutableList; import com.linkedin.common.EntityRelationship; import com.linkedin.common.EntityRelationshipArray; @@ -107,7 +107,7 @@ public Task get(@QueryParam("urn") @Nonnull String rawUrn, Authentication auth = 
AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.GET_ENTITY_PRIVILEGE), - Collections.singletonList(java.util.Optional.of(new ResourceSpec(urn.getEntityType(), urn.toString()))))) { + Collections.singletonList(java.util.Optional.of(new EntitySpec(urn.getEntityType(), urn.toString()))))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to get entity lineage: " + rawUrn); } @@ -142,7 +142,7 @@ public UpdateResponse delete(@QueryParam("urn") @Nonnull String rawUrn) throws E Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.DELETE_ENTITY_PRIVILEGE), - Collections.singletonList(java.util.Optional.of(new ResourceSpec(urn.getEntityType(), urn.toString()))))) { + Collections.singletonList(java.util.Optional.of(new EntitySpec(urn.getEntityType(), urn.toString()))))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to delete entity: " + rawUrn); } @@ -162,7 +162,7 @@ public Task getLineage(@ActionParam(PARAM_URN) @Nonnull Str Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.GET_ENTITY_PRIVILEGE), - Collections.singletonList(java.util.Optional.of(new ResourceSpec(urn.getEntityType(), urn.toString()))))) { + Collections.singletonList(java.util.Optional.of(new EntitySpec(urn.getEntityType(), urn.toString()))))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to get entity lineage: " + urnStr); } diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/operations/Utils.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/operations/Utils.java index 188e5ae18ee8f..12586b66495a9 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/operations/Utils.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/operations/Utils.java @@ -2,7 +2,7 @@ import com.datahub.authentication.Authentication; import com.datahub.authentication.AuthenticationContext; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.datahub.plugins.auth.authorization.Authorizer; import com.google.common.collect.ImmutableList; import com.linkedin.common.urn.Urn; @@ -37,10 +37,10 @@ public static String restoreIndices( @Nonnull EntityService entityService ) { Authentication authentication = AuthenticationContext.getAuthentication(); - ResourceSpec resourceSpec = null; + EntitySpec resourceSpec = null; if (StringUtils.isNotBlank(urn)) { Urn resource = UrnUtils.getUrn(urn); - resourceSpec = new ResourceSpec(resource.getEntityType(), resource.toString()); + resourceSpec = new EntitySpec(resource.getEntityType(), resource.toString()); } if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(authentication, authorizer, ImmutableList.of(PoliciesConfig.RESTORE_INDICES_PRIVILEGE), diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/platform/PlatformResource.java 
b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/platform/PlatformResource.java index f36841bb4abae..a8018074497c4 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/platform/PlatformResource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/platform/PlatformResource.java @@ -3,7 +3,7 @@ import com.datahub.authentication.Authentication; import com.datahub.authentication.AuthenticationContext; import com.datahub.plugins.auth.authorization.Authorizer; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.google.common.collect.ImmutableList; import com.linkedin.entity.Entity; import com.linkedin.metadata.authorization.PoliciesConfig; @@ -54,7 +54,7 @@ public Task producePlatformEvent( @ActionParam("event") @Nonnull PlatformEvent event) { Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) - && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.PRODUCE_PLATFORM_EVENT_PRIVILEGE), (ResourceSpec) null)) { + && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.PRODUCE_PLATFORM_EVENT_PRIVILEGE), (EntitySpec) null)) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to produce platform events."); } diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java index 5c3b90a84aec1..9949556c99b81 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java @@ -4,7 +4,7 @@ import com.datahub.authorization.AuthUtil; import com.datahub.authorization.ConjunctivePrivilegeGroup; import com.datahub.authorization.DisjunctivePrivilegeGroup; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.datahub.plugins.auth.authorization.Authorizer; import com.google.common.collect.ImmutableList; import com.linkedin.metadata.authorization.PoliciesConfig; @@ -82,13 +82,13 @@ public static RestLiServiceException invalidArgumentsException(@Nullable String } public static boolean isAuthorized(@Nonnull Authentication authentication, @Nonnull Authorizer authorizer, - @Nonnull final List privileges, @Nonnull final List> resources) { + @Nonnull final List privileges, @Nonnull final List> resources) { DisjunctivePrivilegeGroup orGroup = convertPrivilegeGroup(privileges); return AuthUtil.isAuthorizedForResources(authorizer, authentication.getActor().toUrnStr(), resources, orGroup); } public static boolean isAuthorized(@Nonnull Authentication authentication, @Nonnull Authorizer authorizer, - @Nonnull final List privileges, @Nullable final ResourceSpec resource) { + @Nonnull final List privileges, @Nullable final EntitySpec resource) { DisjunctivePrivilegeGroup orGroup = convertPrivilegeGroup(privileges); return AuthUtil.isAuthorized(authorizer, authentication.getActor().toUrnStr(), java.util.Optional.ofNullable(resource), orGroup); } diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/usage/UsageStats.java 
b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/usage/UsageStats.java index be70cf9c494ef..02d413301f3b4 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/usage/UsageStats.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/usage/UsageStats.java @@ -4,7 +4,7 @@ import com.datahub.authentication.Authentication; import com.datahub.authentication.AuthenticationContext; import com.datahub.plugins.auth.authorization.Authorizer; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.StreamReadConstraints; import com.fasterxml.jackson.databind.JsonNode; @@ -125,7 +125,7 @@ public Task batchIngest(@ActionParam(PARAM_BUCKETS) @Nonnull UsageAggregat return RestliUtil.toTask(() -> { Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) - && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.EDIT_ENTITY_PRIVILEGE), (ResourceSpec) null)) { + && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.EDIT_ENTITY_PRIVILEGE), (EntitySpec) null)) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to edit entities."); } @@ -323,7 +323,7 @@ public Task query(@ActionParam(PARAM_RESOURCE) @Nonnull String Urn resourceUrn = UrnUtils.getUrn(resource); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.VIEW_DATASET_USAGE_PRIVILEGE), - new ResourceSpec(resourceUrn.getEntityType(), resourceUrn.toString()))) { + new EntitySpec(resourceUrn.getEntityType(), resourceUrn.toString()))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to query usage."); } @@ -383,7 +383,7 @@ public Task queryRange(@ActionParam(PARAM_RESOURCE) @Nonnull S Urn resourceUrn = UrnUtils.getUrn(resource); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.VIEW_DATASET_USAGE_PRIVILEGE), - new ResourceSpec(resourceUrn.getEntityType(), resourceUrn.toString()))) { + new EntitySpec(resourceUrn.getEntityType(), resourceUrn.toString()))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to query usage."); } diff --git a/smoke-test/tests/assertions/assertions_test.py b/smoke-test/tests/assertions/assertions_test.py index 4aa64c512f684..48f3564e6cd97 100644 --- a/smoke-test/tests/assertions/assertions_test.py +++ b/smoke-test/tests/assertions/assertions_test.py @@ -2,28 +2,29 @@ import urllib import pytest -import requests_wrapper as requests import tenacity from datahub.emitter.mce_builder import make_dataset_urn, make_schema_field_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext, RecordEnvelope from datahub.ingestion.api.sink import NoopWriteCallback from datahub.ingestion.sink.file import FileSink, FileSinkConfig -from datahub.metadata.com.linkedin.pegasus2avro.assertion import AssertionStdAggregation -from datahub.metadata.schema_classes import ( - AssertionInfoClass, - AssertionResultClass, - AssertionResultTypeClass, - AssertionRunEventClass, - AssertionRunStatusClass, - AssertionStdOperatorClass, - AssertionTypeClass, - 
DatasetAssertionInfoClass, - DatasetAssertionScopeClass, - PartitionSpecClass, - PartitionTypeClass, -) -from tests.utils import delete_urns_from_file, get_gms_url, ingest_file_via_rest, wait_for_healthcheck_util, get_sleep_info +from datahub.metadata.com.linkedin.pegasus2avro.assertion import \ + AssertionStdAggregation +from datahub.metadata.schema_classes import (AssertionInfoClass, + AssertionResultClass, + AssertionResultTypeClass, + AssertionRunEventClass, + AssertionRunStatusClass, + AssertionStdOperatorClass, + AssertionTypeClass, + DatasetAssertionInfoClass, + DatasetAssertionScopeClass, + PartitionSpecClass, + PartitionTypeClass) + +import requests_wrapper as requests +from tests.utils import (delete_urns_from_file, get_gms_url, get_sleep_info, + ingest_file_via_rest, wait_for_healthcheck_util) restli_default_headers = { "X-RestLi-Protocol-Version": "2.0.0", diff --git a/smoke-test/tests/browse/browse_test.py b/smoke-test/tests/browse/browse_test.py index b9d2143d13ec7..550f0062d5a39 100644 --- a/smoke-test/tests/browse/browse_test.py +++ b/smoke-test/tests/browse/browse_test.py @@ -1,9 +1,10 @@ import time import pytest -import requests_wrapper as requests -from tests.utils import delete_urns_from_file, get_frontend_url, ingest_file_via_rest +import requests_wrapper as requests +from tests.utils import (delete_urns_from_file, get_frontend_url, + ingest_file_via_rest) TEST_DATASET_1_URN = "urn:li:dataset:(urn:li:dataPlatform:kafka,test-browse-1,PROD)" TEST_DATASET_2_URN = "urn:li:dataset:(urn:li:dataPlatform:kafka,test-browse-2,PROD)" @@ -51,7 +52,9 @@ def test_get_browse_paths(frontend_session, ingest_cleanup_data): # /prod -- There should be one entity get_browse_paths_json = { "query": get_browse_paths_query, - "variables": {"input": { "type": "DATASET", "path": ["prod"], "start": 0, "count": 100 } }, + "variables": { + "input": {"type": "DATASET", "path": ["prod"], "start": 0, "count": 100} + }, } response = frontend_session.post( @@ -67,12 +70,19 @@ def test_get_browse_paths(frontend_session, ingest_cleanup_data): browse = res_data["data"]["browse"] print(browse) - assert browse["entities"] == [{ "urn": TEST_DATASET_3_URN }] + assert browse["entities"] == [{"urn": TEST_DATASET_3_URN}] # /prod/kafka1 get_browse_paths_json = { "query": get_browse_paths_query, - "variables": {"input": { "type": "DATASET", "path": ["prod", "kafka1"], "start": 0, "count": 10 } }, + "variables": { + "input": { + "type": "DATASET", + "path": ["prod", "kafka1"], + "start": 0, + "count": 10, + } + }, } response = frontend_session.post( @@ -88,16 +98,27 @@ def test_get_browse_paths(frontend_session, ingest_cleanup_data): browse = res_data["data"]["browse"] assert browse == { - "total": 3, - "entities": [{ "urn": TEST_DATASET_1_URN }, { "urn": TEST_DATASET_2_URN }, { "urn": TEST_DATASET_3_URN }], - "groups": [], - "metadata": { "path": ["prod", "kafka1"], "totalNumEntities": 0 } + "total": 3, + "entities": [ + {"urn": TEST_DATASET_1_URN}, + {"urn": TEST_DATASET_2_URN}, + {"urn": TEST_DATASET_3_URN}, + ], + "groups": [], + "metadata": {"path": ["prod", "kafka1"], "totalNumEntities": 0}, } # /prod/kafka2 get_browse_paths_json = { "query": get_browse_paths_query, - "variables": {"input": { "type": "DATASET", "path": ["prod", "kafka2"], "start": 0, "count": 10 } }, + "variables": { + "input": { + "type": "DATASET", + "path": ["prod", "kafka2"], + "start": 0, + "count": 10, + } + }, } response = frontend_session.post( @@ -113,10 +134,8 @@ def test_get_browse_paths(frontend_session, ingest_cleanup_data): 
browse = res_data["data"]["browse"] assert browse == { - "total": 2, - "entities": [{ "urn": TEST_DATASET_1_URN }, { "urn": TEST_DATASET_2_URN }], - "groups": [], - "metadata": { "path": ["prod", "kafka2"], "totalNumEntities": 0 } + "total": 2, + "entities": [{"urn": TEST_DATASET_1_URN}, {"urn": TEST_DATASET_2_URN}], + "groups": [], + "metadata": {"path": ["prod", "kafka2"], "totalNumEntities": 0}, } - - diff --git a/smoke-test/tests/cli/datahub-cli.py b/smoke-test/tests/cli/datahub-cli.py index 1d0080bdd9d48..c3db6028efceb 100644 --- a/smoke-test/tests/cli/datahub-cli.py +++ b/smoke-test/tests/cli/datahub-cli.py @@ -1,8 +1,11 @@ import json -import pytest from time import sleep -from datahub.cli.cli_utils import guess_entity_type, post_entity, get_aspects_for_entity + +import pytest +from datahub.cli.cli_utils import (get_aspects_for_entity, guess_entity_type, + post_entity) from datahub.cli.ingest_cli import get_session_and_host, rollback + from tests.utils import ingest_file_via_rest, wait_for_writes_to_sync ingested_dataset_run_id = "" @@ -24,24 +27,46 @@ def test_setup(): session, gms_host = get_session_and_host() - assert "browsePaths" not in get_aspects_for_entity(entity_urn=dataset_urn, aspects=["browsePaths"], typed=False) - assert "editableDatasetProperties" not in get_aspects_for_entity(entity_urn=dataset_urn, aspects=["editableDatasetProperties"], typed=False) + assert "browsePaths" not in get_aspects_for_entity( + entity_urn=dataset_urn, aspects=["browsePaths"], typed=False + ) + assert "editableDatasetProperties" not in get_aspects_for_entity( + entity_urn=dataset_urn, aspects=["editableDatasetProperties"], typed=False + ) - ingested_dataset_run_id = ingest_file_via_rest("tests/cli/cli_test_data.json").config.run_id + ingested_dataset_run_id = ingest_file_via_rest( + "tests/cli/cli_test_data.json" + ).config.run_id print("Setup ingestion id: " + ingested_dataset_run_id) - assert "browsePaths" in get_aspects_for_entity(entity_urn=dataset_urn, aspects=["browsePaths"], typed=False) + assert "browsePaths" in get_aspects_for_entity( + entity_urn=dataset_urn, aspects=["browsePaths"], typed=False + ) yield # Clean up rollback_url = f"{gms_host}/runs?action=rollback" - session.post(rollback_url, data=json.dumps({"runId": ingested_editable_run_id, "dryRun": False, "hardDelete": True})) - session.post(rollback_url, data=json.dumps({"runId": ingested_dataset_run_id, "dryRun": False, "hardDelete": True})) + session.post( + rollback_url, + data=json.dumps( + {"runId": ingested_editable_run_id, "dryRun": False, "hardDelete": True} + ), + ) + session.post( + rollback_url, + data=json.dumps( + {"runId": ingested_dataset_run_id, "dryRun": False, "hardDelete": True} + ), + ) - assert "browsePaths" not in get_aspects_for_entity(entity_urn=dataset_urn, aspects=["browsePaths"], typed=False) - assert "editableDatasetProperties" not in get_aspects_for_entity(entity_urn=dataset_urn, aspects=["editableDatasetProperties"], typed=False) + assert "browsePaths" not in get_aspects_for_entity( + entity_urn=dataset_urn, aspects=["browsePaths"], typed=False + ) + assert "editableDatasetProperties" not in get_aspects_for_entity( + entity_urn=dataset_urn, aspects=["editableDatasetProperties"], typed=False + ) @pytest.mark.dependency() @@ -49,9 +74,7 @@ def test_rollback_editable(): global ingested_dataset_run_id global ingested_editable_run_id platform = "urn:li:dataPlatform:kafka" - dataset_name = ( - "test-rollback" - ) + dataset_name = "test-rollback" env = "PROD" dataset_urn = 
f"urn:li:dataset:({platform},{dataset_name},{env})" @@ -59,23 +82,38 @@ def test_rollback_editable(): print("Ingested dataset id:", ingested_dataset_run_id) # Assert that second data ingestion worked - assert "browsePaths" in get_aspects_for_entity(entity_urn=dataset_urn, aspects=["browsePaths"], typed=False) + assert "browsePaths" in get_aspects_for_entity( + entity_urn=dataset_urn, aspects=["browsePaths"], typed=False + ) # Make editable change - ingested_editable_run_id = ingest_file_via_rest("tests/cli/cli_editable_test_data.json").config.run_id + ingested_editable_run_id = ingest_file_via_rest( + "tests/cli/cli_editable_test_data.json" + ).config.run_id print("ingested editable id:", ingested_editable_run_id) # Assert that second data ingestion worked - assert "editableDatasetProperties" in get_aspects_for_entity(entity_urn=dataset_urn, aspects=["editableDatasetProperties"], typed=False) + assert "editableDatasetProperties" in get_aspects_for_entity( + entity_urn=dataset_urn, aspects=["editableDatasetProperties"], typed=False + ) # rollback ingestion 1 rollback_url = f"{gms_host}/runs?action=rollback" - session.post(rollback_url, data=json.dumps({"runId": ingested_dataset_run_id, "dryRun": False, "hardDelete": False})) + session.post( + rollback_url, + data=json.dumps( + {"runId": ingested_dataset_run_id, "dryRun": False, "hardDelete": False} + ), + ) # Allow async MCP processor to handle ingestions & rollbacks wait_for_writes_to_sync() # EditableDatasetProperties should still be part of the entity that was soft deleted. - assert "editableDatasetProperties" in get_aspects_for_entity(entity_urn=dataset_urn, aspects=["editableDatasetProperties"], typed=False) + assert "editableDatasetProperties" in get_aspects_for_entity( + entity_urn=dataset_urn, aspects=["editableDatasetProperties"], typed=False + ) # But first ingestion aspects should not be present - assert "browsePaths" not in get_aspects_for_entity(entity_urn=dataset_urn, typed=False) + assert "browsePaths" not in get_aspects_for_entity( + entity_urn=dataset_urn, typed=False + ) diff --git a/smoke-test/tests/cli/datahub_graph_test.py b/smoke-test/tests/cli/datahub_graph_test.py index 16925d26f6983..17c8924fb0998 100644 --- a/smoke-test/tests/cli/datahub_graph_test.py +++ b/smoke-test/tests/cli/datahub_graph_test.py @@ -1,13 +1,11 @@ import pytest import tenacity from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph -from datahub.metadata.schema_classes import KafkaSchemaClass, SchemaMetadataClass -from tests.utils import ( - delete_urns_from_file, - get_gms_url, - get_sleep_info, - ingest_file_via_rest, -) +from datahub.metadata.schema_classes import (KafkaSchemaClass, + SchemaMetadataClass) + +from tests.utils import (delete_urns_from_file, get_gms_url, get_sleep_info, + ingest_file_via_rest) sleep_sec, sleep_times = get_sleep_info() diff --git a/smoke-test/tests/cli/delete_cmd/test_timeseries_delete.py b/smoke-test/tests/cli/delete_cmd/test_timeseries_delete.py index 4288a61b7a0c1..106da7cd8d71e 100644 --- a/smoke-test/tests/cli/delete_cmd/test_timeseries_delete.py +++ b/smoke-test/tests/cli/delete_cmd/test_timeseries_delete.py @@ -1,21 +1,22 @@ import json import logging +import sys import tempfile import time -import sys from json import JSONDecodeError from typing import Any, Dict, List, Optional -from click.testing import CliRunner, Result - import datahub.emitter.mce_builder as builder +from click.testing import CliRunner, Result from datahub.emitter.serialization_helper import pre_json_transform from 
datahub.entrypoints import datahub from datahub.metadata.schema_classes import DatasetProfileClass + +import requests_wrapper as requests from tests.aspect_generators.timeseries.dataset_profile_gen import \ gen_dataset_profiles -from tests.utils import get_strftime_from_timestamp_millis, wait_for_writes_to_sync -import requests_wrapper as requests +from tests.utils import (get_strftime_from_timestamp_millis, + wait_for_writes_to_sync) logger = logging.getLogger(__name__) @@ -33,6 +34,7 @@ def sync_elastic() -> None: wait_for_writes_to_sync() + def datahub_put_profile(dataset_profile: DatasetProfileClass) -> None: with tempfile.NamedTemporaryFile("w+t", suffix=".json") as aspect_file: aspect_text: str = json.dumps(pre_json_transform(dataset_profile.to_obj())) diff --git a/smoke-test/tests/cli/ingest_cmd/test_timeseries_rollback.py b/smoke-test/tests/cli/ingest_cmd/test_timeseries_rollback.py index 61e7a5a65b494..e962b1a5cafd6 100644 --- a/smoke-test/tests/cli/ingest_cmd/test_timeseries_rollback.py +++ b/smoke-test/tests/cli/ingest_cmd/test_timeseries_rollback.py @@ -2,14 +2,14 @@ import time from typing import Any, Dict, List, Optional -from click.testing import CliRunner, Result - import datahub.emitter.mce_builder as builder +from click.testing import CliRunner, Result from datahub.emitter.serialization_helper import post_json_transform from datahub.entrypoints import datahub from datahub.metadata.schema_classes import DatasetProfileClass -from tests.utils import ingest_file_via_rest, wait_for_writes_to_sync + import requests_wrapper as requests +from tests.utils import ingest_file_via_rest, wait_for_writes_to_sync runner = CliRunner(mix_stderr=False) diff --git a/smoke-test/tests/cli/user_groups_cmd/test_group_cmd.py b/smoke-test/tests/cli/user_groups_cmd/test_group_cmd.py index 405e061c016f9..7b986d3be0444 100644 --- a/smoke-test/tests/cli/user_groups_cmd/test_group_cmd.py +++ b/smoke-test/tests/cli/user_groups_cmd/test_group_cmd.py @@ -1,6 +1,7 @@ import json import sys import tempfile +import time from typing import Any, Dict, Iterable, List import yaml @@ -8,7 +9,7 @@ from datahub.api.entities.corpgroup.corpgroup import CorpGroup from datahub.entrypoints import datahub from datahub.ingestion.graph.client import DataHubGraph, get_default_graph -import time + import requests_wrapper as requests from tests.utils import wait_for_writes_to_sync diff --git a/smoke-test/tests/conftest.py b/smoke-test/tests/conftest.py index eed7a983197ef..57b92a2db1c19 100644 --- a/smoke-test/tests/conftest.py +++ b/smoke-test/tests/conftest.py @@ -2,8 +2,8 @@ import pytest -from tests.utils import wait_for_healthcheck_util, get_frontend_session from tests.test_result_msg import send_message +from tests.utils import get_frontend_session, wait_for_healthcheck_util # Disable telemetry os.environ["DATAHUB_TELEMETRY_ENABLED"] = "false" @@ -28,5 +28,5 @@ def test_healthchecks(wait_for_healthchecks): def pytest_sessionfinish(session, exitstatus): - """ whole test run finishes. 
""" + """whole test run finishes.""" send_message(exitstatus) diff --git a/smoke-test/tests/consistency_utils.py b/smoke-test/tests/consistency_utils.py index 15993733c592b..607835bf3649c 100644 --- a/smoke-test/tests/consistency_utils.py +++ b/smoke-test/tests/consistency_utils.py @@ -1,10 +1,16 @@ -import time +import logging import os import subprocess +import time _ELASTIC_BUFFER_WRITES_TIME_IN_SEC: int = 1 USE_STATIC_SLEEP: bool = bool(os.getenv("USE_STATIC_SLEEP", False)) -ELASTICSEARCH_REFRESH_INTERVAL_SECONDS: int = int(os.getenv("ELASTICSEARCH_REFRESH_INTERVAL_SECONDS", 5)) +ELASTICSEARCH_REFRESH_INTERVAL_SECONDS: int = int( + os.getenv("ELASTICSEARCH_REFRESH_INTERVAL_SECONDS", 5) +) + +logger = logging.getLogger(__name__) + def wait_for_writes_to_sync(max_timeout_in_sec: int = 120) -> None: if USE_STATIC_SLEEP: @@ -30,7 +36,9 @@ def wait_for_writes_to_sync(max_timeout_in_sec: int = 120) -> None: lag_zero = True if not lag_zero: - logger.warning(f"Exiting early from waiting for elastic to catch up due to a timeout. Current lag is {lag_values}") + logger.warning( + f"Exiting early from waiting for elastic to catch up due to a timeout. Current lag is {lag_values}" + ) else: # we want to sleep for an additional period of time for Elastic writes buffer to clear - time.sleep(_ELASTIC_BUFFER_WRITES_TIME_IN_SEC) \ No newline at end of file + time.sleep(_ELASTIC_BUFFER_WRITES_TIME_IN_SEC) diff --git a/smoke-test/tests/containers/containers_test.py b/smoke-test/tests/containers/containers_test.py index 575e3def6cf23..05a45239dabf8 100644 --- a/smoke-test/tests/containers/containers_test.py +++ b/smoke-test/tests/containers/containers_test.py @@ -1,5 +1,7 @@ import pytest -from tests.utils import delete_urns_from_file, get_frontend_url, ingest_file_via_rest + +from tests.utils import (delete_urns_from_file, get_frontend_url, + ingest_file_via_rest) @pytest.fixture(scope="module", autouse=False) diff --git a/smoke-test/tests/cypress/cypress/e2e/domains/nested_domains.js b/smoke-test/tests/cypress/cypress/e2e/domains/nested_domains.js new file mode 100644 index 0000000000000..a2d4de0f51659 --- /dev/null +++ b/smoke-test/tests/cypress/cypress/e2e/domains/nested_domains.js @@ -0,0 +1,53 @@ +const domainName = "CypressNestedDomain"; +const domainDescription = "CypressNestedDomainDescription"; + +describe("nested domains test", () => { + + it("create a domain, move under parent, remove domain", () => { + // Create a new domain without a parent + cy.loginWithCredentials(); + cy.goToDomainList(); + cy.clickOptionWithTestId("domains-new-domain-button"); + cy.get('[data-testid="create-domain-name"]').click().type(domainName); + cy.get('[data-testid="create-domain-description"]').click().type(domainDescription); + cy.clickOptionWithTestId("create-domain-button"); + cy.waitTextVisible(domainName); + + // Ensure the new domain has no parent in the navigation sidebar + cy.waitTextVisible(domainDescription); + + // Move a domain from the root level to be under a parent domain + cy.clickOptionWithText(domainName); + cy.openThreeDotDropdown(); + cy.clickOptionWithTestId("entity-menu-move-button"); + cy.get('[data-testid="move-domain-modal"]').contains("Marketing").click({force: true}); + cy.get('[data-testid="move-domain-modal"]').contains("Marketing").should("be.visible"); + cy.clickOptionWithTestId("move-domain-modal-move-button").wait(5000); + + // Wnsure domain is no longer on the sidebar navigator at the top level but shows up under the parent + cy.goToDomainList(); + 
cy.ensureTextNotPresent(domainName); + cy.ensureTextNotPresent(domainDescription); + cy.waitTextVisible("1 sub-domain"); + + // Move a domain from under a parent domain to the root level + cy.get('[data-testid="domain-list-item"]').contains("Marketing").prev().click(); + cy.clickOptionWithText(domainName); + cy.openThreeDotDropdown(); + cy.clickOptionWithTestId("entity-menu-move-button"); + cy.clickOptionWithTestId("move-domain-modal-move-button").wait(5000); + cy.goToDomainList(); + cy.waitTextVisible(domainName); + cy.waitTextVisible(domainDescription); + + // Delete a domain + cy.clickOptionWithText(domainName).wait(3000); + cy.openThreeDotDropdown(); + cy.clickOptionWithTestId("entity-menu-delete-button"); + cy.waitTextVisible("Are you sure you want to remove this Domain?"); + cy.clickOptionWithText("Yes"); + cy.waitTextVisible("Deleted Domain!"); + cy.ensureTextNotPresent(domainName); + cy.ensureTextNotPresent(domainDescription); + }); +}); \ No newline at end of file diff --git a/smoke-test/tests/cypress/cypress/e2e/search/query_and_filter_search.js b/smoke-test/tests/cypress/cypress/e2e/search/query_and_filter_search.js new file mode 100644 index 0000000000000..4637310b86496 --- /dev/null +++ b/smoke-test/tests/cypress/cypress/e2e/search/query_and_filter_search.js @@ -0,0 +1,57 @@ +describe("auto-complete dropdown, filter plus query search test", () => { + + const platformQuerySearch = (query,test_id,active_filter) => { + cy.visit("/"); + cy.get("input[data-testid=search-input]").type(query); + cy.get(`[data-testid="quick-filter-urn:li:dataPlatform:${test_id}"]`).click(); + cy.focused().type("{enter}").wait(3000); + cy.url().should( + "include", + `?filter_platform___false___EQUAL___0=urn%3Ali%3AdataPlatform%3A${test_id}` + ); + cy.get('[data-testid="search-input"]').should("have.value", query); + cy.get(`[data-testid="active-filter-${active_filter}"]`).should("be.visible"); + cy.contains("of 0 results").should("not.exist"); + cy.contains(/of [0-9]+ results/); + } + + const entityQuerySearch = (query,test_id,active_filter) => { + cy.visit("/"); + cy.get("input[data-testid=search-input]").type(query); + cy.get(`[data-testid="quick-filter-${test_id}"]`).click(); + cy.focused().type("{enter}").wait(3000); + cy.url().should( + "include", + `?filter__entityType___false___EQUAL___0=${test_id}` + ); + cy.get('[data-testid="search-input"]').should("have.value", query); + cy.get(`[data-testid="active-filter-${active_filter}"]`).should("be.visible"); + cy.contains("of 0 results").should("not.exist"); + cy.contains(/of [0-9]+ results/); + } + + it("verify the 'filter by' section + query (result in search page with query applied + filter applied)", () => { + // Platform query plus filter test + cy.loginWithCredentials(); + // Airflow + platformQuerySearch ("cypress","airflow","Airflow"); + // BigQuery + platformQuerySearch ("cypress","bigquery","BigQuery"); + // dbt + platformQuerySearch ("cypress","dbt","dbt"); + // Hive + platformQuerySearch ("cypress","hive","Hive"); + + // Entity type query plus filter test + // Datasets + entityQuerySearch ("cypress","DATASET","Datasets"); + // Dashboards + entityQuerySearch ("cypress","DASHBOARD","Dashboards"); + // Pipelines + entityQuerySearch ("cypress","DATA_FLOW","Pipelines"); + // Domains + entityQuerySearch ("Marketing","DOMAIN","Domains"); + // Glossary Terms + entityQuerySearch ("cypress","GLOSSARY_TERM","Glossary Terms"); + }); +}); \ No newline at end of file diff --git a/smoke-test/tests/cypress/cypress/e2e/settings/manage_access_tokens.js 
b/smoke-test/tests/cypress/cypress/e2e/settings/manage_access_tokens.js new file mode 100644 index 0000000000000..7a77c2b77df5b --- /dev/null +++ b/smoke-test/tests/cypress/cypress/e2e/settings/manage_access_tokens.js @@ -0,0 +1,43 @@ +import { aliasQuery, hasOperationName } from "../utils"; +const test_id = Math.floor(Math.random() * 100000); + +describe("manage access tokens", () => { + before(() => { + cy.intercept("POST", "/api/v2/graphql", (req) => { + aliasQuery(req, "appConfig"); + }); + }); + + const setTokenAuthEnabledFlag = (isOn) => { + cy.intercept("POST", "/api/v2/graphql", (req) => { + if (hasOperationName(req, "appConfig")) { + req.reply((res) => { + res.body.data.appConfig.authConfig.tokenAuthEnabled = isOn; + }); + } + }); + }; + + it("create and revoke access token", () => { + //create access token, verify token on ui + setTokenAuthEnabledFlag(true); + cy.loginWithCredentials(); + cy.goToAccessTokenSettings(); + cy.clickOptionWithTestId("add-token-button"); + cy.enterTextInTestId("create-access-token-name", "Token Name" + test_id); + cy.enterTextInTestId("create-access-token-description", "Token Description" + test_id); + cy.clickOptionWithTestId("create-access-token-button"); + cy.waitTextVisible("New Personal Access Token"); + cy.get('[data-testid="access-token-value"]').should("be.visible"); + cy.get('[data-testid="access-token-value"]').invoke('text').should('match', /^[a-zA-Z0-9-_]+\.[a-zA-Z0-9-_]+\.[a-zA-Z0-9-_]+$/); + cy.clickOptionWithTestId("access-token-modal-close-button"); + //revoke access token, verify token removed from ui + cy.waitTextVisible("Token Name" + test_id); + cy.waitTextVisible("Token Description" + test_id); + cy.clickOptionWithTestId("revoke-token-button"); + cy.waitTextVisible("Are you sure you want to revoke this token?"); + cy.clickOptionWithText("Yes"); + cy.ensureTextNotPresent("Token Name" + test_id); + cy.ensureTextNotPresent("Token Description" + test_id); + }); +}); \ No newline at end of file diff --git a/smoke-test/tests/cypress/cypress/support/commands.js b/smoke-test/tests/cypress/cypress/support/commands.js index 8bfe7305c001f..64bc1253fc383 100644 --- a/smoke-test/tests/cypress/cypress/support/commands.js +++ b/smoke-test/tests/cypress/cypress/support/commands.js @@ -84,6 +84,12 @@ Cypress.Commands.add("goToOwnershipTypesSettings", () => { cy.waitTextVisible("Manage Ownership"); }); +Cypress.Commands.add("goToAccessTokenSettings", () => { + cy.visit("/settings/tokens"); + cy.waitTextVisible("Manage Access Tokens"); + cy.wait(3000); +}); + Cypress.Commands.add("goToIngestionPage", () => { cy.visit("/ingestion"); cy.waitTextVisible("Manage Ingestion"); diff --git a/smoke-test/tests/cypress/integration_test.py b/smoke-test/tests/cypress/integration_test.py index b3bacf39ac7ae..4ad2bc53fa87d 100644 --- a/smoke-test/tests/cypress/integration_test.py +++ b/smoke-test/tests/cypress/integration_test.py @@ -1,18 +1,16 @@ -from typing import Set, List - import datetime -import pytest -import subprocess import os +import subprocess +from typing import List, Set + +import pytest + +from tests.setup.lineage.ingest_time_lineage import (get_time_lineage_urns, + ingest_time_lineage) +from tests.utils import (create_datahub_step_state_aspects, delete_urns, + delete_urns_from_file, get_admin_username, + ingest_file_via_rest) -from tests.utils import ( - create_datahub_step_state_aspects, - get_admin_username, - ingest_file_via_rest, - delete_urns_from_file, - delete_urns, -) -from tests.setup.lineage.ingest_time_lineage import ingest_time_lineage, 
get_time_lineage_urns CYPRESS_TEST_DATA_DIR = "tests/cypress" TEST_DATA_FILENAME = "data.json" @@ -145,7 +143,6 @@ def ingest_cleanup_data(): delete_urns_from_file(f"{CYPRESS_TEST_DATA_DIR}/{TEST_ONBOARDING_DATA_FILENAME}") delete_urns(get_time_lineage_urns()) - print_now() print("deleting onboarding data file") if os.path.exists(f"{CYPRESS_TEST_DATA_DIR}/{TEST_ONBOARDING_DATA_FILENAME}"): diff --git a/smoke-test/tests/dataproduct/test_dataproduct.py b/smoke-test/tests/dataproduct/test_dataproduct.py index db198098f21fa..baef1cb1cb3ba 100644 --- a/smoke-test/tests/dataproduct/test_dataproduct.py +++ b/smoke-test/tests/dataproduct/test_dataproduct.py @@ -1,4 +1,6 @@ +import logging import os +import subprocess import tempfile import time from random import randint @@ -17,8 +19,6 @@ DomainPropertiesClass, DomainsClass) from datahub.utilities.urns.urn import Urn -import subprocess -import logging logger = logging.getLogger(__name__) diff --git a/smoke-test/tests/delete/delete_test.py b/smoke-test/tests/delete/delete_test.py index 68e001f983fbf..d920faaf3a89a 100644 --- a/smoke-test/tests/delete/delete_test.py +++ b/smoke-test/tests/delete/delete_test.py @@ -1,16 +1,14 @@ -import os import json -import pytest +import os from time import sleep + +import pytest from datahub.cli.cli_utils import get_aspects_for_entity from datahub.cli.ingest_cli import get_session_and_host -from tests.utils import ( - ingest_file_via_rest, - wait_for_healthcheck_util, - delete_urns_from_file, - wait_for_writes_to_sync, - get_datahub_graph, -) + +from tests.utils import (delete_urns_from_file, get_datahub_graph, + ingest_file_via_rest, wait_for_healthcheck_util, + wait_for_writes_to_sync) # Disable telemetry os.environ["DATAHUB_TELEMETRY_ENABLED"] = "false" @@ -102,7 +100,7 @@ def test_delete_reference(test_setup, depends=["test_healthchecks"]): graph.delete_references_to_urn(tag_urn, dry_run=False) wait_for_writes_to_sync() - + # Validate that references no longer exist references_count, related_aspects = graph.delete_references_to_urn( tag_urn, dry_run=True diff --git a/smoke-test/tests/deprecation/deprecation_test.py b/smoke-test/tests/deprecation/deprecation_test.py index 1149a970aa8e5..a8969804d03d7 100644 --- a/smoke-test/tests/deprecation/deprecation_test.py +++ b/smoke-test/tests/deprecation/deprecation_test.py @@ -1,10 +1,7 @@ import pytest -from tests.utils import ( - delete_urns_from_file, - get_frontend_url, - ingest_file_via_rest, - get_root_urn, -) + +from tests.utils import (delete_urns_from_file, get_frontend_url, get_root_urn, + ingest_file_via_rest) @pytest.fixture(scope="module", autouse=True) diff --git a/smoke-test/tests/domains/domains_test.py b/smoke-test/tests/domains/domains_test.py index 7ffe1682cafd8..fa8c918e3cbe1 100644 --- a/smoke-test/tests/domains/domains_test.py +++ b/smoke-test/tests/domains/domains_test.py @@ -1,12 +1,8 @@ import pytest import tenacity -from tests.utils import ( - delete_urns_from_file, - get_frontend_url, - get_gms_url, - ingest_file_via_rest, - get_sleep_info, -) + +from tests.utils import (delete_urns_from_file, get_frontend_url, get_gms_url, + get_sleep_info, ingest_file_via_rest) sleep_sec, sleep_times = get_sleep_info() @@ -240,4 +236,7 @@ def test_set_unset_domain(frontend_session, ingest_cleanup_data): assert res_data assert res_data["data"]["dataset"]["domain"]["domain"]["urn"] == domain_urn - assert res_data["data"]["dataset"]["domain"]["domain"]["properties"]["name"] == "Engineering" + assert ( + 
res_data["data"]["dataset"]["domain"]["domain"]["properties"]["name"] + == "Engineering" + ) diff --git a/smoke-test/tests/managed-ingestion/managed_ingestion_test.py b/smoke-test/tests/managed-ingestion/managed_ingestion_test.py index 1238a1dd5730a..b5e408731334e 100644 --- a/smoke-test/tests/managed-ingestion/managed_ingestion_test.py +++ b/smoke-test/tests/managed-ingestion/managed_ingestion_test.py @@ -3,7 +3,8 @@ import pytest import tenacity -from tests.utils import get_frontend_url, get_sleep_info, wait_for_healthcheck_util +from tests.utils import (get_frontend_url, get_sleep_info, + wait_for_healthcheck_util) sleep_sec, sleep_times = get_sleep_info() diff --git a/smoke-test/tests/patch/common_patch_tests.py b/smoke-test/tests/patch/common_patch_tests.py index 574e4fd4e4c88..f1d6abf5da794 100644 --- a/smoke-test/tests/patch/common_patch_tests.py +++ b/smoke-test/tests/patch/common_patch_tests.py @@ -2,25 +2,17 @@ import uuid from typing import Dict, Optional, Type -from datahub.emitter.mce_builder import ( - make_tag_urn, - make_term_urn, - make_user_urn, -) +from datahub.emitter.mce_builder import (make_tag_urn, make_term_urn, + make_user_urn) from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_patch_builder import MetadataPatchProposal from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig -from datahub.metadata.schema_classes import ( - AuditStampClass, - GlobalTagsClass, - GlossaryTermAssociationClass, - GlossaryTermsClass, - OwnerClass, - OwnershipClass, - OwnershipTypeClass, - TagAssociationClass, - _Aspect, -) +from datahub.metadata.schema_classes import (AuditStampClass, GlobalTagsClass, + GlossaryTermAssociationClass, + GlossaryTermsClass, OwnerClass, + OwnershipClass, + OwnershipTypeClass, + TagAssociationClass, _Aspect) def helper_test_entity_terms_patch( @@ -34,18 +26,14 @@ def get_terms(graph, entity_urn): term_urn = make_term_urn(term=f"testTerm-{uuid.uuid4()}") - term_association = GlossaryTermAssociationClass( - urn=term_urn, context="test" - ) + term_association = GlossaryTermAssociationClass(urn=term_urn, context="test") global_terms = GlossaryTermsClass( terms=[term_association], auditStamp=AuditStampClass( time=int(time.time() * 1000.0), actor=make_user_urn("tester") ), ) - mcpw = MetadataChangeProposalWrapper( - entityUrn=test_entity_urn, aspect=global_terms - ) + mcpw = MetadataChangeProposalWrapper(entityUrn=test_entity_urn, aspect=global_terms) with DataHubGraph(DataHubGraphConfig()) as graph: graph.emit_mcp(mcpw) @@ -88,9 +76,7 @@ def helper_test_dataset_tags_patch( tag_association = TagAssociationClass(tag=tag_urn, context="test") global_tags = GlobalTagsClass(tags=[tag_association]) - mcpw = MetadataChangeProposalWrapper( - entityUrn=test_entity_urn, aspect=global_tags - ) + mcpw = MetadataChangeProposalWrapper(entityUrn=test_entity_urn, aspect=global_tags) with DataHubGraph(DataHubGraphConfig()) as graph: graph.emit_mcp(mcpw) @@ -153,15 +139,11 @@ def helper_test_ownership_patch( assert owner.owners[0].owner == make_user_urn("jdoe") for patch_mcp in ( - patch_builder_class(test_entity_urn) - .add_owner(owner_to_add) - .build() + patch_builder_class(test_entity_urn).add_owner(owner_to_add).build() ): graph.emit_mcp(patch_mcp) - owner = graph.get_aspect( - entity_urn=test_entity_urn, aspect_type=OwnershipClass - ) + owner = graph.get_aspect(entity_urn=test_entity_urn, aspect_type=OwnershipClass) assert len(owner.owners) == 2 for patch_mcp in ( @@ -171,9 +153,7 @@ def helper_test_ownership_patch( ): 
graph.emit_mcp(patch_mcp) - owner = graph.get_aspect( - entity_urn=test_entity_urn, aspect_type=OwnershipClass - ) + owner = graph.get_aspect(entity_urn=test_entity_urn, aspect_type=OwnershipClass) assert len(owner.owners) == 1 assert owner.owners[0].owner == make_user_urn("jdoe") @@ -199,9 +179,7 @@ def get_custom_properties( orig_aspect = base_aspect assert hasattr(orig_aspect, "customProperties") orig_aspect.customProperties = base_property_map - mcpw = MetadataChangeProposalWrapper( - entityUrn=test_entity_urn, aspect=orig_aspect - ) + mcpw = MetadataChangeProposalWrapper(entityUrn=test_entity_urn, aspect=orig_aspect) with DataHubGraph(DataHubGraphConfig()) as graph: graph.emit(mcpw) diff --git a/smoke-test/tests/patch/test_datajob_patches.py b/smoke-test/tests/patch/test_datajob_patches.py index 407410ee89914..342d5d683228a 100644 --- a/smoke-test/tests/patch/test_datajob_patches.py +++ b/smoke-test/tests/patch/test_datajob_patches.py @@ -3,19 +3,14 @@ from datahub.emitter.mce_builder import make_data_job_urn, make_dataset_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig -from datahub.metadata.schema_classes import ( - DataJobInfoClass, - DataJobInputOutputClass, - EdgeClass, -) +from datahub.metadata.schema_classes import (DataJobInfoClass, + DataJobInputOutputClass, + EdgeClass) from datahub.specific.datajob import DataJobPatchBuilder from tests.patch.common_patch_tests import ( - helper_test_custom_properties_patch, - helper_test_dataset_tags_patch, - helper_test_entity_terms_patch, - helper_test_ownership_patch, -) + helper_test_custom_properties_patch, helper_test_dataset_tags_patch, + helper_test_entity_terms_patch, helper_test_ownership_patch) def _make_test_datajob_urn( @@ -37,16 +32,12 @@ def test_datajob_ownership_patch(wait_for_healthchecks): # Tags def test_datajob_tags_patch(wait_for_healthchecks): - helper_test_dataset_tags_patch( - _make_test_datajob_urn(), DataJobPatchBuilder - ) + helper_test_dataset_tags_patch(_make_test_datajob_urn(), DataJobPatchBuilder) # Terms def test_dataset_terms_patch(wait_for_healthchecks): - helper_test_entity_terms_patch( - _make_test_datajob_urn(), DataJobPatchBuilder - ) + helper_test_entity_terms_patch(_make_test_datajob_urn(), DataJobPatchBuilder) # Custom Properties diff --git a/smoke-test/tests/patch/test_dataset_patches.py b/smoke-test/tests/patch/test_dataset_patches.py index 239aab64675d8..6704d19760fb9 100644 --- a/smoke-test/tests/patch/test_dataset_patches.py +++ b/smoke-test/tests/patch/test_dataset_patches.py @@ -20,7 +20,10 @@ UpstreamClass, UpstreamLineageClass) from datahub.specific.dataset import DatasetPatchBuilder -from tests.patch.common_patch_tests import helper_test_entity_terms_patch, helper_test_dataset_tags_patch, helper_test_ownership_patch, helper_test_custom_properties_patch + +from tests.patch.common_patch_tests import ( + helper_test_custom_properties_patch, helper_test_dataset_tags_patch, + helper_test_entity_terms_patch, helper_test_ownership_patch) # Common Aspect Patch Tests @@ -31,6 +34,7 @@ def test_dataset_ownership_patch(wait_for_healthchecks): ) helper_test_ownership_patch(dataset_urn, DatasetPatchBuilder) + # Tags def test_dataset_tags_patch(wait_for_healthchecks): dataset_urn = make_dataset_urn( @@ -38,6 +42,7 @@ def test_dataset_tags_patch(wait_for_healthchecks): ) helper_test_dataset_tags_patch(dataset_urn, DatasetPatchBuilder) + # Terms def test_dataset_terms_patch(wait_for_healthchecks): dataset_urn = 
make_dataset_urn( @@ -284,8 +289,15 @@ def test_custom_properties_patch(wait_for_healthchecks): dataset_urn = make_dataset_urn( platform="hive", name=f"SampleHiveDataset-{uuid.uuid4()}", env="PROD" ) - orig_dataset_properties = DatasetPropertiesClass(name="test_name", description="test_description") - helper_test_custom_properties_patch(test_entity_urn=dataset_urn, patch_builder_class=DatasetPatchBuilder, custom_properties_aspect_class=DatasetPropertiesClass, base_aspect=orig_dataset_properties) + orig_dataset_properties = DatasetPropertiesClass( + name="test_name", description="test_description" + ) + helper_test_custom_properties_patch( + test_entity_urn=dataset_urn, + patch_builder_class=DatasetPatchBuilder, + custom_properties_aspect_class=DatasetPropertiesClass, + base_aspect=orig_dataset_properties, + ) with DataHubGraph(DataHubGraphConfig()) as graph: # Patch custom properties along with name diff --git a/smoke-test/tests/policies/test_policies.py b/smoke-test/tests/policies/test_policies.py index b7091541894dd..67142181d2b96 100644 --- a/smoke-test/tests/policies/test_policies.py +++ b/smoke-test/tests/policies/test_policies.py @@ -1,12 +1,8 @@ import pytest import tenacity -from tests.utils import ( - get_frontend_url, - wait_for_healthcheck_util, - get_frontend_session, - get_sleep_info, - get_root_urn, -) + +from tests.utils import (get_frontend_session, get_frontend_url, get_root_urn, + get_sleep_info, wait_for_healthcheck_util) TEST_POLICY_NAME = "Updated Platform Policy" diff --git a/smoke-test/tests/setup/lineage/helper_classes.py b/smoke-test/tests/setup/lineage/helper_classes.py index 53f77b08d15ed..d550f3093be85 100644 --- a/smoke-test/tests/setup/lineage/helper_classes.py +++ b/smoke-test/tests/setup/lineage/helper_classes.py @@ -1,10 +1,7 @@ from dataclasses import dataclass from typing import Any, Dict, List, Optional -from datahub.metadata.schema_classes import ( - EdgeClass, - SchemaFieldDataTypeClass, -) +from datahub.metadata.schema_classes import EdgeClass, SchemaFieldDataTypeClass @dataclass diff --git a/smoke-test/tests/setup/lineage/ingest_data_job_change.py b/smoke-test/tests/setup/lineage/ingest_data_job_change.py index 8e3e9c5352922..588a1625419bc 100644 --- a/smoke-test/tests/setup/lineage/ingest_data_job_change.py +++ b/smoke-test/tests/setup/lineage/ingest_data_job_change.py @@ -1,36 +1,20 @@ from typing import List -from datahub.emitter.mce_builder import ( - make_dataset_urn, - make_data_flow_urn, - make_data_job_urn_with_flow, -) +from datahub.emitter.mce_builder import (make_data_flow_urn, + make_data_job_urn_with_flow, + make_dataset_urn) from datahub.emitter.rest_emitter import DatahubRestEmitter -from datahub.metadata.schema_classes import ( - DateTypeClass, - NumberTypeClass, - SchemaFieldDataTypeClass, - StringTypeClass, -) +from datahub.metadata.schema_classes import (DateTypeClass, NumberTypeClass, + SchemaFieldDataTypeClass, + StringTypeClass) -from tests.setup.lineage.constants import ( - AIRFLOW_DATA_PLATFORM, - SNOWFLAKE_DATA_PLATFORM, - TIMESTAMP_MILLIS_EIGHT_DAYS_AGO, - TIMESTAMP_MILLIS_ONE_DAY_AGO, -) -from tests.setup.lineage.helper_classes import ( - Field, - Dataset, - Task, - Pipeline, -) -from tests.setup.lineage.utils import ( - create_edge, - create_node, - create_nodes_and_edges, - emit_mcps, -) +from tests.setup.lineage.constants import (AIRFLOW_DATA_PLATFORM, + SNOWFLAKE_DATA_PLATFORM, + TIMESTAMP_MILLIS_EIGHT_DAYS_AGO, + TIMESTAMP_MILLIS_ONE_DAY_AGO) +from tests.setup.lineage.helper_classes import Dataset, Field, Pipeline, 
Task +from tests.setup.lineage.utils import (create_edge, create_node, + create_nodes_and_edges, emit_mcps) # Constants for Case 2 DAILY_TEMPERATURE_DATASET_ID = "climate.daily_temperature" diff --git a/smoke-test/tests/setup/lineage/ingest_dataset_join_change.py b/smoke-test/tests/setup/lineage/ingest_dataset_join_change.py index 35a8e6d5cf02e..bb9f51b6b5e9b 100644 --- a/smoke-test/tests/setup/lineage/ingest_dataset_join_change.py +++ b/smoke-test/tests/setup/lineage/ingest_dataset_join_change.py @@ -1,32 +1,18 @@ from typing import List -from datahub.emitter.mce_builder import ( - make_dataset_urn, -) +from datahub.emitter.mce_builder import make_dataset_urn from datahub.emitter.rest_emitter import DatahubRestEmitter -from datahub.metadata.schema_classes import ( - NumberTypeClass, - SchemaFieldDataTypeClass, - StringTypeClass, - UpstreamClass, -) +from datahub.metadata.schema_classes import (NumberTypeClass, + SchemaFieldDataTypeClass, + StringTypeClass, UpstreamClass) -from tests.setup.lineage.constants import ( - DATASET_ENTITY_TYPE, - SNOWFLAKE_DATA_PLATFORM, - TIMESTAMP_MILLIS_EIGHT_DAYS_AGO, - TIMESTAMP_MILLIS_ONE_DAY_AGO, -) -from tests.setup.lineage.helper_classes import ( - Field, - Dataset, -) -from tests.setup.lineage.utils import ( - create_node, - create_upstream_edge, - create_upstream_mcp, - emit_mcps, -) +from tests.setup.lineage.constants import (DATASET_ENTITY_TYPE, + SNOWFLAKE_DATA_PLATFORM, + TIMESTAMP_MILLIS_EIGHT_DAYS_AGO, + TIMESTAMP_MILLIS_ONE_DAY_AGO) +from tests.setup.lineage.helper_classes import Dataset, Field +from tests.setup.lineage.utils import (create_node, create_upstream_edge, + create_upstream_mcp, emit_mcps) # Constants for Case 3 GDP_DATASET_ID = "economic_data.gdp" diff --git a/smoke-test/tests/setup/lineage/ingest_input_datasets_change.py b/smoke-test/tests/setup/lineage/ingest_input_datasets_change.py index f4fb795147478..6079d7a3d2b63 100644 --- a/smoke-test/tests/setup/lineage/ingest_input_datasets_change.py +++ b/smoke-test/tests/setup/lineage/ingest_input_datasets_change.py @@ -1,36 +1,20 @@ from typing import List -from datahub.emitter.mce_builder import ( - make_dataset_urn, - make_data_flow_urn, - make_data_job_urn_with_flow, -) +from datahub.emitter.mce_builder import (make_data_flow_urn, + make_data_job_urn_with_flow, + make_dataset_urn) from datahub.emitter.rest_emitter import DatahubRestEmitter -from datahub.metadata.schema_classes import ( - NumberTypeClass, - SchemaFieldDataTypeClass, - StringTypeClass, -) - -from tests.setup.lineage.constants import ( - AIRFLOW_DATA_PLATFORM, - BQ_DATA_PLATFORM, - TIMESTAMP_MILLIS_EIGHT_DAYS_AGO, - TIMESTAMP_MILLIS_ONE_DAY_AGO, -) -from tests.setup.lineage.helper_classes import ( - Field, - Dataset, - Task, - Pipeline, -) -from tests.setup.lineage.utils import ( - create_edge, - create_node, - create_nodes_and_edges, - emit_mcps, -) +from datahub.metadata.schema_classes import (NumberTypeClass, + SchemaFieldDataTypeClass, + StringTypeClass) +from tests.setup.lineage.constants import (AIRFLOW_DATA_PLATFORM, + BQ_DATA_PLATFORM, + TIMESTAMP_MILLIS_EIGHT_DAYS_AGO, + TIMESTAMP_MILLIS_ONE_DAY_AGO) +from tests.setup.lineage.helper_classes import Dataset, Field, Pipeline, Task +from tests.setup.lineage.utils import (create_edge, create_node, + create_nodes_and_edges, emit_mcps) # Constants for Case 1 TRANSACTIONS_DATASET_ID = "transactions.transactions" diff --git a/smoke-test/tests/setup/lineage/ingest_time_lineage.py b/smoke-test/tests/setup/lineage/ingest_time_lineage.py index cae8e0124d501..3aec979707290 
100644 --- a/smoke-test/tests/setup/lineage/ingest_time_lineage.py +++ b/smoke-test/tests/setup/lineage/ingest_time_lineage.py @@ -1,12 +1,14 @@ +import os from typing import List from datahub.emitter.rest_emitter import DatahubRestEmitter -from tests.setup.lineage.ingest_input_datasets_change import ingest_input_datasets_change, get_input_datasets_change_urns -from tests.setup.lineage.ingest_data_job_change import ingest_data_job_change, get_data_job_change_urns -from tests.setup.lineage.ingest_dataset_join_change import ingest_dataset_join_change, get_dataset_join_change_urns - -import os +from tests.setup.lineage.ingest_data_job_change import ( + get_data_job_change_urns, ingest_data_job_change) +from tests.setup.lineage.ingest_dataset_join_change import ( + get_dataset_join_change_urns, ingest_dataset_join_change) +from tests.setup.lineage.ingest_input_datasets_change import ( + get_input_datasets_change_urns, ingest_input_datasets_change) SERVER = os.getenv("DATAHUB_SERVER") or "http://localhost:8080" TOKEN = os.getenv("DATAHUB_TOKEN") or "" @@ -20,4 +22,8 @@ def ingest_time_lineage() -> None: def get_time_lineage_urns() -> List[str]: - return get_input_datasets_change_urns() + get_data_job_change_urns() + get_dataset_join_change_urns() + return ( + get_input_datasets_change_urns() + + get_data_job_change_urns() + + get_dataset_join_change_urns() + ) diff --git a/smoke-test/tests/setup/lineage/utils.py b/smoke-test/tests/setup/lineage/utils.py index 672f7a945a6af..c72f6ccb89b7a 100644 --- a/smoke-test/tests/setup/lineage/utils.py +++ b/smoke-test/tests/setup/lineage/utils.py @@ -1,41 +1,30 @@ import datetime -from datahub.emitter.mce_builder import ( - make_data_platform_urn, - make_dataset_urn, - make_data_job_urn_with_flow, - make_data_flow_urn, -) +from typing import List + +from datahub.emitter.mce_builder import (make_data_flow_urn, + make_data_job_urn_with_flow, + make_data_platform_urn, + make_dataset_urn) from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.rest_emitter import DatahubRestEmitter from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage -from datahub.metadata.schema_classes import ( - AuditStampClass, - ChangeTypeClass, - DatasetLineageTypeClass, - DatasetPropertiesClass, - DataFlowInfoClass, - DataJobInputOutputClass, - DataJobInfoClass, - EdgeClass, - MySqlDDLClass, - SchemaFieldClass, - SchemaMetadataClass, - UpstreamClass, -) -from typing import List - -from tests.setup.lineage.constants import ( - DATASET_ENTITY_TYPE, - DATA_JOB_ENTITY_TYPE, - DATA_FLOW_ENTITY_TYPE, - DATA_FLOW_INFO_ASPECT_NAME, - DATA_JOB_INFO_ASPECT_NAME, - DATA_JOB_INPUT_OUTPUT_ASPECT_NAME, -) -from tests.setup.lineage.helper_classes import ( - Dataset, - Pipeline, -) +from datahub.metadata.schema_classes import (AuditStampClass, ChangeTypeClass, + DataFlowInfoClass, + DataJobInfoClass, + DataJobInputOutputClass, + DatasetLineageTypeClass, + DatasetPropertiesClass, EdgeClass, + MySqlDDLClass, SchemaFieldClass, + SchemaMetadataClass, + UpstreamClass) + +from tests.setup.lineage.constants import (DATA_FLOW_ENTITY_TYPE, + DATA_FLOW_INFO_ASPECT_NAME, + DATA_JOB_ENTITY_TYPE, + DATA_JOB_INFO_ASPECT_NAME, + DATA_JOB_INPUT_OUTPUT_ASPECT_NAME, + DATASET_ENTITY_TYPE) +from tests.setup.lineage.helper_classes import Dataset, Pipeline def create_node(dataset: Dataset) -> List[MetadataChangeProposalWrapper]: @@ -85,10 +74,10 @@ def create_node(dataset: Dataset) -> List[MetadataChangeProposalWrapper]: def create_edge( - source_urn: str, - 
destination_urn: str, - created_timestamp_millis: int, - updated_timestamp_millis: int, + source_urn: str, + destination_urn: str, + created_timestamp_millis: int, + updated_timestamp_millis: int, ) -> EdgeClass: created_audit_stamp: AuditStampClass = AuditStampClass( time=created_timestamp_millis, actor="urn:li:corpuser:unknown" @@ -105,7 +94,7 @@ def create_edge( def create_nodes_and_edges( - airflow_dag: Pipeline, + airflow_dag: Pipeline, ) -> List[MetadataChangeProposalWrapper]: mcps = [] data_flow_urn = make_data_flow_urn( @@ -160,9 +149,9 @@ def create_nodes_and_edges( def create_upstream_edge( - upstream_entity_urn: str, - created_timestamp_millis: int, - updated_timestamp_millis: int, + upstream_entity_urn: str, + created_timestamp_millis: int, + updated_timestamp_millis: int, ): created_audit_stamp: AuditStampClass = AuditStampClass( time=created_timestamp_millis, actor="urn:li:corpuser:unknown" @@ -180,11 +169,11 @@ def create_upstream_edge( def create_upstream_mcp( - entity_type: str, - entity_urn: str, - upstreams: List[UpstreamClass], - timestamp_millis: int, - run_id: str = "", + entity_type: str, + entity_urn: str, + upstreams: List[UpstreamClass], + timestamp_millis: int, + run_id: str = "", ) -> MetadataChangeProposalWrapper: print(f"Creating upstreamLineage aspect for {entity_urn}") timestamp_millis: int = int(datetime.datetime.now().timestamp() * 1000) @@ -203,7 +192,7 @@ def create_upstream_mcp( def emit_mcps( - emitter: DatahubRestEmitter, mcps: List[MetadataChangeProposalWrapper] + emitter: DatahubRestEmitter, mcps: List[MetadataChangeProposalWrapper] ) -> None: for mcp in mcps: emitter.emit_mcp(mcp) diff --git a/smoke-test/tests/tags-and-terms/tags_and_terms_test.py b/smoke-test/tests/tags-and-terms/tags_and_terms_test.py index b0ca29b544cfe..6ac75765286f0 100644 --- a/smoke-test/tests/tags-and-terms/tags_and_terms_test.py +++ b/smoke-test/tests/tags-and-terms/tags_and_terms_test.py @@ -1,5 +1,7 @@ import pytest -from tests.utils import delete_urns_from_file, get_frontend_url, ingest_file_via_rest, wait_for_healthcheck_util + +from tests.utils import (delete_urns_from_file, get_frontend_url, + ingest_file_via_rest, wait_for_healthcheck_util) @pytest.fixture(scope="module", autouse=True) diff --git a/smoke-test/tests/telemetry/telemetry_test.py b/smoke-test/tests/telemetry/telemetry_test.py index 3672abcda948d..3127061c9f506 100644 --- a/smoke-test/tests/telemetry/telemetry_test.py +++ b/smoke-test/tests/telemetry/telemetry_test.py @@ -7,5 +7,7 @@ def test_no_clientID(): client_id_urn = "urn:li:telemetry:clientId" aspect = ["telemetryClientId"] - res_data = json.dumps(get_aspects_for_entity(entity_urn=client_id_urn, aspects=aspect, typed=False)) + res_data = json.dumps( + get_aspects_for_entity(entity_urn=client_id_urn, aspects=aspect, typed=False) + ) assert res_data == "{}" diff --git a/smoke-test/tests/test_result_msg.py b/smoke-test/tests/test_result_msg.py index e3b336db9d66c..b9775e8ee4acd 100644 --- a/smoke-test/tests/test_result_msg.py +++ b/smoke-test/tests/test_result_msg.py @@ -1,6 +1,6 @@ -from slack_sdk import WebClient import os +from slack_sdk import WebClient datahub_stats = {} @@ -10,10 +10,10 @@ def add_datahub_stats(stat_name, stat_val): def send_to_slack(passed: str): - slack_api_token = os.getenv('SLACK_API_TOKEN') - slack_channel = os.getenv('SLACK_CHANNEL') - slack_thread_ts = os.getenv('SLACK_THREAD_TS') - test_identifier = os.getenv('TEST_IDENTIFIER', 'LOCAL_TEST') + slack_api_token = os.getenv("SLACK_API_TOKEN") + slack_channel = 
os.getenv("SLACK_CHANNEL") + slack_thread_ts = os.getenv("SLACK_THREAD_TS") + test_identifier = os.getenv("TEST_IDENTIFIER", "LOCAL_TEST") if slack_api_token is None or slack_channel is None: return client = WebClient(token=slack_api_token) @@ -26,14 +26,21 @@ def send_to_slack(passed: str): message += f"Num {entity_type} is {val}\n" if slack_thread_ts is None: - client.chat_postMessage(channel=slack_channel, text=f'{test_identifier} Status - {passed}\n{message}') + client.chat_postMessage( + channel=slack_channel, + text=f"{test_identifier} Status - {passed}\n{message}", + ) else: - client.chat_postMessage(channel=slack_channel, text=f'{test_identifier} Status - {passed}\n{message}', thread_ts=slack_thread_ts) + client.chat_postMessage( + channel=slack_channel, + text=f"{test_identifier} Status - {passed}\n{message}", + thread_ts=slack_thread_ts, + ) def send_message(exitstatus): try: - send_to_slack('PASSED' if exitstatus == 0 else 'FAILED') + send_to_slack("PASSED" if exitstatus == 0 else "FAILED") except Exception as e: # We don't want to fail pytest at all print(f"Exception happened for sending msg to slack {e}") diff --git a/smoke-test/tests/test_stateful_ingestion.py b/smoke-test/tests/test_stateful_ingestion.py index a10cf13a08029..c6adb402e5d51 100644 --- a/smoke-test/tests/test_stateful_ingestion.py +++ b/smoke-test/tests/test_stateful_ingestion.py @@ -4,17 +4,15 @@ from datahub.ingestion.run.pipeline import Pipeline from datahub.ingestion.source.sql.mysql import MySQLConfig, MySQLSource from datahub.ingestion.source.state.checkpoint import Checkpoint -from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState -from datahub.ingestion.source.state.stale_entity_removal_handler import StaleEntityRemovalHandler +from datahub.ingestion.source.state.entity_removal_state import \ + GenericCheckpointState +from datahub.ingestion.source.state.stale_entity_removal_handler import \ + StaleEntityRemovalHandler from sqlalchemy import create_engine from sqlalchemy.sql import text -from tests.utils import ( - get_gms_url, - get_mysql_password, - get_mysql_url, - get_mysql_username, -) +from tests.utils import (get_gms_url, get_mysql_password, get_mysql_url, + get_mysql_username) def test_stateful_ingestion(wait_for_healthchecks): diff --git a/smoke-test/tests/tests/tests_test.py b/smoke-test/tests/tests/tests_test.py index 0b87f90a92c58..213a2ea087b7a 100644 --- a/smoke-test/tests/tests/tests_test.py +++ b/smoke-test/tests/tests/tests_test.py @@ -1,9 +1,13 @@ import pytest import tenacity -from tests.utils import delete_urns_from_file, get_frontend_url, ingest_file_via_rest, wait_for_healthcheck_util, get_sleep_info + +from tests.utils import (delete_urns_from_file, get_frontend_url, + get_sleep_info, ingest_file_via_rest, + wait_for_healthcheck_util) sleep_sec, sleep_times = get_sleep_info() + @pytest.fixture(scope="module", autouse=True) def ingest_cleanup_data(request): print("ingesting test data") @@ -18,6 +22,7 @@ def wait_for_healthchecks(): wait_for_healthcheck_util() yield + @pytest.mark.dependency() def test_healthchecks(wait_for_healthchecks): # Call to wait_for_healthchecks fixture will do the actual functionality. 
diff --git a/smoke-test/tests/timeline/timeline_test.py b/smoke-test/tests/timeline/timeline_test.py index a73d585c6c72d..4705343c1a2ba 100644 --- a/smoke-test/tests/timeline/timeline_test.py +++ b/smoke-test/tests/timeline/timeline_test.py @@ -3,14 +3,14 @@ from datahub.cli import timeline_cli from datahub.cli.cli_utils import guess_entity_type, post_entity -from tests.utils import ingest_file_via_rest, wait_for_writes_to_sync, get_datahub_graph + +from tests.utils import (get_datahub_graph, ingest_file_via_rest, + wait_for_writes_to_sync) def test_all(): platform = "urn:li:dataPlatform:kafka" - dataset_name = ( - "test-timeline-sample-kafka" - ) + dataset_name = "test-timeline-sample-kafka" env = "PROD" dataset_urn = f"urn:li:dataset:({platform},{dataset_name},{env})" @@ -18,8 +18,13 @@ def test_all(): ingest_file_via_rest("tests/timeline/timeline_test_datav2.json") ingest_file_via_rest("tests/timeline/timeline_test_datav3.json") - res_data = timeline_cli.get_timeline(dataset_urn, ["TAG", "DOCUMENTATION", "TECHNICAL_SCHEMA", "GLOSSARY_TERM", - "OWNER"], None, None, False) + res_data = timeline_cli.get_timeline( + dataset_urn, + ["TAG", "DOCUMENTATION", "TECHNICAL_SCHEMA", "GLOSSARY_TERM", "OWNER"], + None, + None, + False, + ) get_datahub_graph().hard_delete_entity(urn=dataset_urn) assert res_data @@ -35,9 +40,7 @@ def test_all(): def test_schema(): platform = "urn:li:dataPlatform:kafka" - dataset_name = ( - "test-timeline-sample-kafka" - ) + dataset_name = "test-timeline-sample-kafka" env = "PROD" dataset_urn = f"urn:li:dataset:({platform},{dataset_name},{env})" @@ -45,7 +48,9 @@ def test_schema(): put(dataset_urn, "schemaMetadata", "test_resources/timeline/newschemav2.json") put(dataset_urn, "schemaMetadata", "test_resources/timeline/newschemav3.json") - res_data = timeline_cli.get_timeline(dataset_urn, ["TECHNICAL_SCHEMA"], None, None, False) + res_data = timeline_cli.get_timeline( + dataset_urn, ["TECHNICAL_SCHEMA"], None, None, False + ) get_datahub_graph().hard_delete_entity(urn=dataset_urn) assert res_data @@ -61,9 +66,7 @@ def test_schema(): def test_glossary(): platform = "urn:li:dataPlatform:kafka" - dataset_name = ( - "test-timeline-sample-kafka" - ) + dataset_name = "test-timeline-sample-kafka" env = "PROD" dataset_urn = f"urn:li:dataset:({platform},{dataset_name},{env})" @@ -71,7 +74,9 @@ def test_glossary(): put(dataset_urn, "glossaryTerms", "test_resources/timeline/newglossaryv2.json") put(dataset_urn, "glossaryTerms", "test_resources/timeline/newglossaryv3.json") - res_data = timeline_cli.get_timeline(dataset_urn, ["GLOSSARY_TERM"], None, None, False) + res_data = timeline_cli.get_timeline( + dataset_urn, ["GLOSSARY_TERM"], None, None, False + ) get_datahub_graph().hard_delete_entity(urn=dataset_urn) assert res_data @@ -87,17 +92,29 @@ def test_glossary(): def test_documentation(): platform = "urn:li:dataPlatform:kafka" - dataset_name = ( - "test-timeline-sample-kafka" - ) + dataset_name = "test-timeline-sample-kafka" env = "PROD" dataset_urn = f"urn:li:dataset:({platform},{dataset_name},{env})" - put(dataset_urn, "institutionalMemory", "test_resources/timeline/newdocumentation.json") - put(dataset_urn, "institutionalMemory", "test_resources/timeline/newdocumentationv2.json") - put(dataset_urn, "institutionalMemory", "test_resources/timeline/newdocumentationv3.json") + put( + dataset_urn, + "institutionalMemory", + "test_resources/timeline/newdocumentation.json", + ) + put( + dataset_urn, + "institutionalMemory", + "test_resources/timeline/newdocumentationv2.json", + ) + 
put( + dataset_urn, + "institutionalMemory", + "test_resources/timeline/newdocumentationv3.json", + ) - res_data = timeline_cli.get_timeline(dataset_urn, ["DOCUMENTATION"], None, None, False) + res_data = timeline_cli.get_timeline( + dataset_urn, ["DOCUMENTATION"], None, None, False + ) get_datahub_graph().hard_delete_entity(urn=dataset_urn) assert res_data @@ -113,9 +130,7 @@ def test_documentation(): def test_tags(): platform = "urn:li:dataPlatform:kafka" - dataset_name = ( - "test-timeline-sample-kafka" - ) + dataset_name = "test-timeline-sample-kafka" env = "PROD" dataset_urn = f"urn:li:dataset:({platform},{dataset_name},{env})" @@ -139,9 +154,7 @@ def test_tags(): def test_ownership(): platform = "urn:li:dataPlatform:kafka" - dataset_name = ( - "test-timeline-sample-kafka" - ) + dataset_name = "test-timeline-sample-kafka" env = "PROD" dataset_urn = f"urn:li:dataset:({platform},{dataset_name},{env})" diff --git a/smoke-test/tests/tokens/revokable_access_token_test.py b/smoke-test/tests/tokens/revokable_access_token_test.py index b10ad3aa3fc2a..55f3de594af4e 100644 --- a/smoke-test/tests/tokens/revokable_access_token_test.py +++ b/smoke-test/tests/tokens/revokable_access_token_test.py @@ -1,15 +1,11 @@ import os -import pytest -import requests from time import sleep -from tests.utils import ( - get_frontend_url, - wait_for_healthcheck_util, - get_admin_credentials, - wait_for_writes_to_sync, -) +import pytest +import requests +from tests.utils import (get_admin_credentials, get_frontend_url, + wait_for_healthcheck_util, wait_for_writes_to_sync) # Disable telemetry os.environ["DATAHUB_TELEMETRY_ENABLED"] = "false" diff --git a/smoke-test/tests/utils.py b/smoke-test/tests/utils.py index af03efd4f71f8..bd75b13d1910f 100644 --- a/smoke-test/tests/utils.py +++ b/smoke-test/tests/utils.py @@ -1,19 +1,20 @@ import functools import json +import logging import os -from datetime import datetime, timedelta, timezone import subprocess import time -from typing import Any, Dict, List, Tuple +from datetime import datetime, timedelta, timezone from time import sleep -from joblib import Parallel, delayed +from typing import Any, Dict, List, Tuple -import requests_wrapper as requests -import logging from datahub.cli import cli_utils from datahub.cli.cli_utils import get_system_auth -from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig +from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph from datahub.ingestion.run.pipeline import Pipeline +from joblib import Parallel, delayed + +import requests_wrapper as requests from tests.consistency_utils import wait_for_writes_to_sync TIME: int = 1581407189000 @@ -174,6 +175,7 @@ def delete(entry): wait_for_writes_to_sync() + # Fixed now value NOW: datetime = datetime.now() @@ -232,6 +234,3 @@ def create_datahub_step_state_aspects( ] with open(onboarding_filename, "w") as f: json.dump(aspects_dict, f, indent=2) - - - diff --git a/smoke-test/tests/views/views_test.py b/smoke-test/tests/views/views_test.py index 4da69750a167b..685c3bd80b04d 100644 --- a/smoke-test/tests/views/views_test.py +++ b/smoke-test/tests/views/views_test.py @@ -1,16 +1,14 @@ -import pytest import time + +import pytest import tenacity -from tests.utils import ( - delete_urns_from_file, - get_frontend_url, - get_gms_url, - ingest_file_via_rest, - get_sleep_info, -) + +from tests.utils import (delete_urns_from_file, get_frontend_url, get_gms_url, + get_sleep_info, ingest_file_via_rest) sleep_sec, sleep_times = get_sleep_info() + 
@pytest.mark.dependency() def test_healthchecks(wait_for_healthchecks): # Call to wait_for_healthchecks fixture will do the actual functionality. @@ -40,6 +38,7 @@ def _ensure_more_views(frontend_session, list_views_json, query_name, before_cou assert after_count == before_count + 1 return after_count + @tenacity.retry( stop=tenacity.stop_after_attempt(sleep_times), wait=tenacity.wait_fixed(sleep_sec) ) @@ -111,18 +110,18 @@ def test_create_list_delete_global_view(frontend_session): new_view_name = "Test View" new_view_description = "Test Description" new_view_definition = { - "entityTypes": ["DATASET", "DASHBOARD"], - "filter": { - "operator": "AND", - "filters": [ - { - "field": "tags", - "values": ["urn:li:tag:test"], - "negated": False, - "condition": "EQUAL" - } - ] - } + "entityTypes": ["DATASET", "DASHBOARD"], + "filter": { + "operator": "AND", + "filters": [ + { + "field": "tags", + "values": ["urn:li:tag:test"], + "negated": False, + "condition": "EQUAL", + } + ], + }, } # Create new View @@ -137,7 +136,7 @@ def test_create_list_delete_global_view(frontend_session): "viewType": "GLOBAL", "name": new_view_name, "description": new_view_description, - "definition": new_view_definition + "definition": new_view_definition, } }, } @@ -169,9 +168,7 @@ def test_create_list_delete_global_view(frontend_session): "query": """mutation deleteView($urn: String!) {\n deleteView(urn: $urn) }""", - "variables": { - "urn": view_urn - }, + "variables": {"urn": view_urn}, } response = frontend_session.post( @@ -189,7 +186,9 @@ def test_create_list_delete_global_view(frontend_session): ) -@pytest.mark.dependency(depends=["test_healthchecks", "test_create_list_delete_global_view"]) +@pytest.mark.dependency( + depends=["test_healthchecks", "test_create_list_delete_global_view"] +) def test_create_list_delete_personal_view(frontend_session): # Get count of existing views @@ -237,18 +236,18 @@ def test_create_list_delete_personal_view(frontend_session): new_view_name = "Test View" new_view_description = "Test Description" new_view_definition = { - "entityTypes": ["DATASET", "DASHBOARD"], - "filter": { - "operator": "AND", - "filters": [ - { - "field": "tags", - "values": ["urn:li:tag:test"], - "negated": False, - "condition": "EQUAL" - } - ] - } + "entityTypes": ["DATASET", "DASHBOARD"], + "filter": { + "operator": "AND", + "filters": [ + { + "field": "tags", + "values": ["urn:li:tag:test"], + "negated": False, + "condition": "EQUAL", + } + ], + }, } # Create new View @@ -263,7 +262,7 @@ def test_create_list_delete_personal_view(frontend_session): "viewType": "PERSONAL", "name": new_view_name, "description": new_view_description, - "definition": new_view_definition + "definition": new_view_definition, } }, } @@ -293,9 +292,7 @@ def test_create_list_delete_personal_view(frontend_session): "query": """mutation deleteView($urn: String!) 
{\n deleteView(urn: $urn) }""", - "variables": { - "urn": view_urn - }, + "variables": {"urn": view_urn}, } response = frontend_session.post( @@ -312,25 +309,28 @@ def test_create_list_delete_personal_view(frontend_session): before_count=new_count, ) -@pytest.mark.dependency(depends=["test_healthchecks", "test_create_list_delete_personal_view"]) + +@pytest.mark.dependency( + depends=["test_healthchecks", "test_create_list_delete_personal_view"] +) def test_update_global_view(frontend_session): # First create a view new_view_name = "Test View" new_view_description = "Test Description" new_view_definition = { - "entityTypes": ["DATASET", "DASHBOARD"], - "filter": { - "operator": "AND", - "filters": [ - { - "field": "tags", - "values": ["urn:li:tag:test"], - "negated": False, - "condition": "EQUAL" - } - ] - } + "entityTypes": ["DATASET", "DASHBOARD"], + "filter": { + "operator": "AND", + "filters": [ + { + "field": "tags", + "values": ["urn:li:tag:test"], + "negated": False, + "condition": "EQUAL", + } + ], + }, } # Create new View @@ -345,7 +345,7 @@ def test_update_global_view(frontend_session): "viewType": "PERSONAL", "name": new_view_name, "description": new_view_description, - "definition": new_view_definition + "definition": new_view_definition, } }, } @@ -366,18 +366,18 @@ def test_update_global_view(frontend_session): new_view_name = "New Test View" new_view_description = "New Test Description" new_view_definition = { - "entityTypes": ["DATASET", "DASHBOARD", "CHART", "DATA_FLOW"], - "filter": { - "operator": "OR", - "filters": [ - { - "field": "glossaryTerms", - "values": ["urn:li:glossaryTerm:test"], - "negated": True, - "condition": "CONTAIN" - } - ] - } + "entityTypes": ["DATASET", "DASHBOARD", "CHART", "DATA_FLOW"], + "filter": { + "operator": "OR", + "filters": [ + { + "field": "glossaryTerms", + "values": ["urn:li:glossaryTerm:test"], + "negated": True, + "condition": "CONTAIN", + } + ], + }, } update_view_json = { @@ -391,8 +391,8 @@ def test_update_global_view(frontend_session): "input": { "name": new_view_name, "description": new_view_description, - "definition": new_view_definition - } + "definition": new_view_definition, + }, }, } @@ -411,9 +411,7 @@ def test_update_global_view(frontend_session): "query": """mutation deleteView($urn: String!) {\n deleteView(urn: $urn) }""", - "variables": { - "urn": view_urn - }, + "variables": {"urn": view_urn}, } response = frontend_session.post(