diff --git a/build.gradle b/build.gradle index 025c588da2b52..cf55a59cfe694 100644 --- a/build.gradle +++ b/build.gradle @@ -27,7 +27,7 @@ buildscript { dependencies { classpath 'com.linkedin.pegasus:gradle-plugins:' + pegasusVersion classpath 'com.github.node-gradle:gradle-node-plugin:2.2.4' - classpath 'io.acryl.gradle.plugin:gradle-avro-plugin:0.8.1' + classpath 'io.acryl.gradle.plugin:gradle-avro-plugin:0.2.0' classpath 'org.springframework.boot:spring-boot-gradle-plugin:' + springBootVersion classpath "io.codearte.gradle.nexus:gradle-nexus-staging-plugin:0.30.0" classpath "com.palantir.gradle.gitversion:gradle-git-version:3.0.0" @@ -67,8 +67,8 @@ project.ext.externalDependency = [ 'antlr4Runtime': 'org.antlr:antlr4-runtime:4.7.2', 'antlr4': 'org.antlr:antlr4:4.7.2', 'assertJ': 'org.assertj:assertj-core:3.11.1', - 'avro_1_7': 'org.apache.avro:avro:1.7.7', - 'avroCompiler_1_7': 'org.apache.avro:avro-compiler:1.7.7', + 'avro': 'org.apache.avro:avro:1.11.3', + 'avroCompiler': 'org.apache.avro:avro-compiler:1.11.3', 'awsGlueSchemaRegistrySerde': 'software.amazon.glue:schema-registry-serde:1.1.10', 'awsMskIamAuth': 'software.amazon.msk:aws-msk-iam-auth:1.1.1', 'awsSecretsManagerJdbc': 'com.amazonaws.secretsmanager:aws-secretsmanager-jdbc:1.0.8', @@ -127,7 +127,6 @@ project.ext.externalDependency = [ 'jgrapht': 'org.jgrapht:jgrapht-core:1.5.1', 'jna': 'net.java.dev.jna:jna:5.12.1', 'jsonPatch': 'com.github.java-json-tools:json-patch:1.13', - 'jsonSchemaAvro': 'com.github.fge:json-schema-avro:0.1.4', 'jsonSimple': 'com.googlecode.json-simple:json-simple:1.1.1', 'jsonSmart': 'net.minidev:json-smart:2.4.9', 'json': 'org.json:json:20230227', diff --git a/buildSrc/build.gradle b/buildSrc/build.gradle index 65b3780431db9..1f9d30d520171 100644 --- a/buildSrc/build.gradle +++ b/buildSrc/build.gradle @@ -5,7 +5,14 @@ buildscript { } dependencies { - implementation('io.acryl:json-schema-avro:0.1.5') { + /** + * Forked version of abandoned repository: 
https://github.com/fge/json-schema-avro + * Maintainer last active 2014, we maintain an active fork of this repository to utilize mapping Avro schemas to Json Schemas, + * repository is as close to official library for this as you can get. Original maintainer is one of the authors of Json Schema spec. + * Other companies are also separately maintaining forks (like: https://github.com/java-json-tools/json-schema-avro). + * We have built several customizations on top of it for various bug fixes, especially around union schemas + */ + implementation('io.acryl:json-schema-avro:0.2.2') { exclude group: 'com.fasterxml.jackson.core', module: 'jackson-databind' exclude group: 'com.google.guava', module: 'guava' } diff --git a/datahub-frontend/app/auth/AuthModule.java b/datahub-frontend/app/auth/AuthModule.java index 98f3b82285eda..fe04c3629fe58 100644 --- a/datahub-frontend/app/auth/AuthModule.java +++ b/datahub-frontend/app/auth/AuthModule.java @@ -56,7 +56,7 @@ public class AuthModule extends AbstractModule { * Pac4j Stores Session State in a browser-side cookie in encrypted fashion. This configuration * value provides a stable encryption base from which to derive the encryption key. * - * We hash this value (SHA1), then take the first 16 bytes as the AES key. + * We hash this value (SHA256), then take the first 16 bytes as the AES key. */ private static final String PAC4J_AES_KEY_BASE_CONF = "play.http.secret.key"; private static final String PAC4J_SESSIONSTORE_PROVIDER_CONF = "pac4j.sessionStore.provider"; @@ -93,7 +93,7 @@ protected void configure() { // it to hex and slice the first 16 bytes, because AES key length must strictly // have a specific length. 
final String aesKeyBase = _configs.getString(PAC4J_AES_KEY_BASE_CONF); - final String aesKeyHash = DigestUtils.sha1Hex(aesKeyBase.getBytes(StandardCharsets.UTF_8)); + final String aesKeyHash = DigestUtils.sha256Hex(aesKeyBase.getBytes(StandardCharsets.UTF_8)); final String aesEncryptionKey = aesKeyHash.substring(0, 16); playCacheCookieStore = new PlayCookieSessionStore( new ShiroAesDataEncrypter(aesEncryptionKey.getBytes())); diff --git a/datahub-frontend/app/auth/AuthUtils.java b/datahub-frontend/app/auth/AuthUtils.java index 80bd631d0db70..386eee725c83d 100644 --- a/datahub-frontend/app/auth/AuthUtils.java +++ b/datahub-frontend/app/auth/AuthUtils.java @@ -41,6 +41,11 @@ public class AuthUtils { */ public static final String SYSTEM_CLIENT_SECRET_CONFIG_PATH = "systemClientSecret"; + /** + * Cookie name for redirect url that is manually separated from the session to reduce size + */ + public static final String REDIRECT_URL_COOKIE_NAME = "REDIRECT_URL"; + public static final CorpuserUrn DEFAULT_ACTOR_URN = new CorpuserUrn("datahub"); public static final String LOGIN_ROUTE = "/login"; @@ -77,7 +82,9 @@ public static boolean isEligibleForForwarding(Http.Request req) { * as well as their agreement to determine authentication status. 
*/ public static boolean hasValidSessionCookie(final Http.Request req) { - return req.session().data().containsKey(ACTOR) + Map sessionCookie = req.session().data(); + return sessionCookie.containsKey(ACCESS_TOKEN) + && sessionCookie.containsKey(ACTOR) && req.getCookie(ACTOR).isPresent() && req.session().data().get(ACTOR).equals(req.getCookie(ACTOR).get().value()); } diff --git a/datahub-frontend/app/auth/cookie/CustomCookiesModule.java b/datahub-frontend/app/auth/cookie/CustomCookiesModule.java new file mode 100644 index 0000000000000..a6dbd69a93889 --- /dev/null +++ b/datahub-frontend/app/auth/cookie/CustomCookiesModule.java @@ -0,0 +1,22 @@ +package auth.cookie; + +import com.google.inject.AbstractModule; +import play.api.libs.crypto.CookieSigner; +import play.api.libs.crypto.CookieSignerProvider; +import play.api.mvc.DefaultFlashCookieBaker; +import play.api.mvc.FlashCookieBaker; +import play.api.mvc.SessionCookieBaker; + + +public class CustomCookiesModule extends AbstractModule { + + @Override + public void configure() { + bind(CookieSigner.class).toProvider(CookieSignerProvider.class); + // We override the session cookie baker to not use a fallback, this prevents using an old URL Encoded cookie + bind(SessionCookieBaker.class).to(CustomSessionCookieBaker.class); + // We don't care about flash cookies, we don't use them + bind(FlashCookieBaker.class).to(DefaultFlashCookieBaker.class); + } + +} diff --git a/datahub-frontend/app/auth/cookie/CustomSessionCookieBaker.scala b/datahub-frontend/app/auth/cookie/CustomSessionCookieBaker.scala new file mode 100644 index 0000000000000..6f0a6604fa64b --- /dev/null +++ b/datahub-frontend/app/auth/cookie/CustomSessionCookieBaker.scala @@ -0,0 +1,25 @@ +package auth.cookie + +import com.google.inject.Inject +import play.api.http.{SecretConfiguration, SessionConfiguration} +import play.api.libs.crypto.CookieSigner +import play.api.mvc.DefaultSessionCookieBaker + +import scala.collection.immutable.Map + +/** + * Overrides 
default fallback to URL Encoding behavior, prevents usage of old URL encoded session cookies + * @param config + * @param secretConfiguration + * @param cookieSigner + */ +class CustomSessionCookieBaker @Inject() ( + override val config: SessionConfiguration, + override val secretConfiguration: SecretConfiguration, + cookieSigner: CookieSigner +) extends DefaultSessionCookieBaker(config, secretConfiguration, cookieSigner) { + // Has to be a Scala class because it extends a trait with concrete implementations, Scala does compilation tricks + + // Forces use of jwt encoding and disallows fallback to legacy url encoding + override def decode(encodedData: String): Map[String, String] = jwtCodec.decode(encodedData) +} diff --git a/datahub-frontend/app/auth/sso/oidc/OidcAuthorizationGenerator.java b/datahub-frontend/app/auth/sso/oidc/OidcAuthorizationGenerator.java index 3f864ed5abddf..baca144610ec4 100644 --- a/datahub-frontend/app/auth/sso/oidc/OidcAuthorizationGenerator.java +++ b/datahub-frontend/app/auth/sso/oidc/OidcAuthorizationGenerator.java @@ -1,19 +1,9 @@ package auth.sso.oidc; -import java.text.ParseException; import java.util.Map.Entry; import java.util.Optional; -import com.nimbusds.jose.Algorithm; -import com.nimbusds.jose.Header; -import com.nimbusds.jose.JWEAlgorithm; -import com.nimbusds.jose.JWSAlgorithm; -import com.nimbusds.jose.util.Base64URL; -import com.nimbusds.jose.util.JSONObjectUtils; -import com.nimbusds.jwt.EncryptedJWT; import com.nimbusds.jwt.JWTParser; -import com.nimbusds.jwt.SignedJWT; -import net.minidev.json.JSONObject; import org.pac4j.core.authorization.generator.AuthorizationGenerator; import org.pac4j.core.context.WebContext; import org.pac4j.core.profile.AttributeLocation; @@ -63,32 +53,5 @@ public Optional generate(WebContext context, UserProfile profile) { return Optional.ofNullable(profile); } - - private static JWT parse(final String s) throws ParseException { - final int firstDotPos = s.indexOf("."); - - if (firstDotPos == 
-1) { - throw new ParseException("Invalid JWT serialization: Missing dot delimiter(s)", 0); - } - - Base64URL header = new Base64URL(s.substring(0, firstDotPos)); - JSONObject jsonObject; - - try { - jsonObject = JSONObjectUtils.parse(header.decodeToString()); - } catch (ParseException e) { - throw new ParseException("Invalid unsecured/JWS/JWE header: " + e.getMessage(), 0); - } - - Algorithm alg = Header.parseAlgorithm(jsonObject); - - if (alg instanceof JWSAlgorithm) { - return SignedJWT.parse(s); - } else if (alg instanceof JWEAlgorithm) { - return EncryptedJWT.parse(s); - } else { - throw new AssertionError("Unexpected algorithm type: " + alg); - } - } } diff --git a/datahub-frontend/app/auth/sso/oidc/OidcCallbackLogic.java b/datahub-frontend/app/auth/sso/oidc/OidcCallbackLogic.java index 4bde0872fc082..7164710f4e0de 100644 --- a/datahub-frontend/app/auth/sso/oidc/OidcCallbackLogic.java +++ b/datahub-frontend/app/auth/sso/oidc/OidcCallbackLogic.java @@ -38,6 +38,7 @@ import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; +import java.util.Base64; import java.util.Collection; import java.util.Collections; import java.util.List; @@ -49,19 +50,21 @@ import java.util.stream.Collectors; import lombok.extern.slf4j.Slf4j; import org.pac4j.core.config.Config; +import org.pac4j.core.context.Cookie; import org.pac4j.core.engine.DefaultCallbackLogic; import org.pac4j.core.http.adapter.HttpActionAdapter; import org.pac4j.core.profile.CommonProfile; import org.pac4j.core.profile.ProfileManager; import org.pac4j.core.profile.UserProfile; +import org.pac4j.core.util.Pac4jConstants; import org.pac4j.play.PlayWebContext; import play.mvc.Result; import auth.sso.SsoManager; -import static auth.AuthUtils.createActorCookie; -import static auth.AuthUtils.createSessionMap; +import static auth.AuthUtils.*; import static com.linkedin.metadata.Constants.CORP_USER_ENTITY_NAME; import static 
com.linkedin.metadata.Constants.GROUP_MEMBERSHIP_ASPECT_NAME; +import static org.pac4j.play.store.PlayCookieSessionStore.*; import static play.mvc.Results.internalServerError; @@ -97,6 +100,9 @@ public OidcCallbackLogic(final SsoManager ssoManager, final Authentication syste public Result perform(PlayWebContext context, Config config, HttpActionAdapter httpActionAdapter, String defaultUrl, Boolean saveInSession, Boolean multiProfile, Boolean renewSession, String defaultClient) { + + setContextRedirectUrl(context); + final Result result = super.perform(context, config, httpActionAdapter, defaultUrl, saveInSession, multiProfile, renewSession, defaultClient); @@ -111,6 +117,15 @@ public Result perform(PlayWebContext context, Config config, return handleOidcCallback(oidcConfigs, result, context, getProfileManager(context)); } + @SuppressWarnings("unchecked") + private void setContextRedirectUrl(PlayWebContext context) { + Optional redirectUrl = context.getRequestCookies().stream() + .filter(cookie -> REDIRECT_URL_COOKIE_NAME.equals(cookie.getName())).findFirst(); + redirectUrl.ifPresent( + cookie -> context.getSessionStore().set(context, Pac4jConstants.REQUESTED_URL, + JAVA_SER_HELPER.deserializeFromBytes(uncompressBytes(Base64.getDecoder().decode(cookie.getValue()))))); + } + private Result handleOidcCallback(final OidcConfigs oidcConfigs, final Result result, final PlayWebContext context, final ProfileManager profileManager) { diff --git a/datahub-frontend/app/controllers/AuthenticationController.java b/datahub-frontend/app/controllers/AuthenticationController.java index e9ddfb2611ceb..4f89f4f67e149 100644 --- a/datahub-frontend/app/controllers/AuthenticationController.java +++ b/datahub-frontend/app/controllers/AuthenticationController.java @@ -13,14 +13,15 @@ import com.typesafe.config.Config; import java.net.URLEncoder; import java.nio.charset.StandardCharsets; +import java.util.Base64; import java.util.Optional; import javax.annotation.Nonnull; import 
javax.inject.Inject; import org.apache.commons.lang3.StringUtils; import org.pac4j.core.client.Client; +import org.pac4j.core.context.Cookie; import org.pac4j.core.exception.http.FoundAction; import org.pac4j.core.exception.http.RedirectionAction; -import org.pac4j.core.util.Pac4jConstants; import org.pac4j.play.PlayWebContext; import org.pac4j.play.http.PlayHttpActionAdapter; import org.pac4j.play.store.PlaySessionStore; @@ -33,18 +34,9 @@ import play.mvc.Results; import security.AuthenticationManager; -import static auth.AuthUtils.DEFAULT_ACTOR_URN; -import static auth.AuthUtils.EMAIL; -import static auth.AuthUtils.FULL_NAME; -import static auth.AuthUtils.INVITE_TOKEN; -import static auth.AuthUtils.LOGIN_ROUTE; -import static auth.AuthUtils.PASSWORD; -import static auth.AuthUtils.RESET_TOKEN; -import static auth.AuthUtils.TITLE; -import static auth.AuthUtils.USER_NAME; -import static auth.AuthUtils.createActorCookie; -import static auth.AuthUtils.createSessionMap; +import static auth.AuthUtils.*; import static org.pac4j.core.client.IndirectClient.ATTEMPTED_AUTHENTICATION_SUFFIX; +import static org.pac4j.play.store.PlayCookieSessionStore.*; // TODO add logging. @@ -297,8 +289,12 @@ private Optional redirectToIdentityProvider(Http.RequestHeader request, } private void configurePac4jSessionStore(PlayWebContext context, Client client, String redirectPath) { - // Set the originally requested path for post-auth redirection. - _playSessionStore.set(context, Pac4jConstants.REQUESTED_URL, new FoundAction(redirectPath)); + // Set the originally requested path for post-auth redirection. 
We split off into a separate cookie from the session + // to reduce size of the session cookie + FoundAction foundAction = new FoundAction(redirectPath); + byte[] javaSerBytes = JAVA_SER_HELPER.serializeToBytes(foundAction); + String serialized = Base64.getEncoder().encodeToString(compressBytes(javaSerBytes)); + context.addResponseCookie(new Cookie(REDIRECT_URL_COOKIE_NAME, serialized)); // This is to prevent previous login attempts from being cached. // We replicate the logic here, which is buried in the Pac4j client. if (_playSessionStore.get(context, client.getName() + ATTEMPTED_AUTHENTICATION_SUFFIX) != null) { diff --git a/datahub-frontend/conf/application.conf b/datahub-frontend/conf/application.conf index 18d901d5ee7dd..1a62c8547e721 100644 --- a/datahub-frontend/conf/application.conf +++ b/datahub-frontend/conf/application.conf @@ -22,11 +22,16 @@ play.application.loader = play.inject.guice.GuiceApplicationLoader play.http.parser.maxMemoryBuffer = 10MB play.http.parser.maxMemoryBuffer = ${?DATAHUB_PLAY_MEM_BUFFER_SIZE} -# TODO: Disable legacy URL encoding eventually +play.modules.disabled += "play.api.mvc.LegacyCookiesModule" play.modules.disabled += "play.api.mvc.CookiesModule" -play.modules.enabled += "play.api.mvc.LegacyCookiesModule" +play.modules.enabled += "auth.cookie.CustomCookiesModule" play.modules.enabled += "auth.AuthModule" +jwt { + # 'alg' https://tools.ietf.org/html/rfc7515#section-4.1.1 + signatureAlgorithm = "HS256" +} + # We override the Akka server provider to allow setting the max header count to a higher value # This is useful while using proxies like Envoy that result in the frontend server rejecting GMS # responses as there's more than the max of 64 allowed headers @@ -199,10 +204,14 @@ auth.native.enabled = ${?AUTH_NATIVE_ENABLED} # auth.native.enabled = false # auth.oidc.enabled = false # (or simply omit oidc configurations) -# Login session expiration time +# Login session expiration time, controls when the actor cookie is expired 
on the browser side auth.session.ttlInHours = 24 auth.session.ttlInHours = ${?AUTH_SESSION_TTL_HOURS} +# Control the length of time a session token is valid +play.http.session.maxAge = 24h +play.http.session.maxAge = ${?MAX_SESSION_TOKEN_AGE} + analytics.enabled = true analytics.enabled = ${?DATAHUB_ANALYTICS_ENABLED} diff --git a/datahub-frontend/test/app/ApplicationTest.java b/datahub-frontend/test/app/ApplicationTest.java index 417fd79e76bbd..f27fefdb79669 100644 --- a/datahub-frontend/test/app/ApplicationTest.java +++ b/datahub-frontend/test/app/ApplicationTest.java @@ -1,6 +1,11 @@ package app; +import com.nimbusds.jwt.JWT; +import com.nimbusds.jwt.JWTClaimsSet; +import com.nimbusds.jwt.JWTParser; import controllers.routes; +import java.text.ParseException; +import java.util.Date; import no.nav.security.mock.oauth2.MockOAuth2Server; import no.nav.security.mock.oauth2.token.DefaultOAuth2TokenCallback; import okhttp3.mockwebserver.MockResponse; @@ -27,8 +32,6 @@ import java.io.IOException; import java.net.InetAddress; -import java.net.URLEncoder; -import java.nio.charset.StandardCharsets; import java.util.List; import java.util.Map; @@ -149,7 +152,7 @@ public void testOpenIdConfig() { } @Test - public void testHappyPathOidc() throws InterruptedException { + public void testHappyPathOidc() throws ParseException { browser.goTo("/authenticate"); assertEquals("", browser.url()); @@ -157,8 +160,23 @@ public void testHappyPathOidc() throws InterruptedException { assertEquals(TEST_USER, actorCookie.getValue()); Cookie sessionCookie = browser.getCookie("PLAY_SESSION"); - assertTrue(sessionCookie.getValue().contains("token=" + TEST_TOKEN)); - assertTrue(sessionCookie.getValue().contains("actor=" + URLEncoder.encode(TEST_USER, StandardCharsets.UTF_8))); + String jwtStr = sessionCookie.getValue(); + JWT jwt = JWTParser.parse(jwtStr); + JWTClaimsSet claims = jwt.getJWTClaimsSet(); + Map data = (Map) claims.getClaim("data"); + assertEquals(TEST_TOKEN, data.get("token")); + 
assertEquals(TEST_USER, data.get("actor")); + // Default expiration is 24h, so should always be less than current time + 1 day since it stamps the time before this executes + assertTrue(claims.getExpirationTime().compareTo(new Date(System.currentTimeMillis() + (24 * 60 * 60 * 1000))) < 0); + } + + @Test + public void testAPI() throws ParseException { + testHappyPathOidc(); + int requestCount = _gmsServer.getRequestCount(); + + browser.goTo("/api/v2/graphql/"); + assertEquals(++requestCount, _gmsServer.getRequestCount()); } @Test diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ingest/IngestionResolverUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ingest/IngestionResolverUtils.java index 7db0b6f826a04..1140c031f1d35 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ingest/IngestionResolverUtils.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ingest/IngestionResolverUtils.java @@ -5,6 +5,7 @@ import com.linkedin.datahub.graphql.generated.IngestionConfig; import com.linkedin.datahub.graphql.generated.IngestionSchedule; import com.linkedin.datahub.graphql.generated.IngestionSource; +import com.linkedin.datahub.graphql.generated.StringMapEntry; import com.linkedin.datahub.graphql.generated.StructuredReport; import com.linkedin.datahub.graphql.types.common.mappers.StringMapMapper; import com.linkedin.entity.EntityResponse; @@ -21,6 +22,7 @@ import java.util.ArrayList; import java.util.Collection; import java.util.List; +import java.util.stream.Collectors; import lombok.extern.slf4j.Slf4j; @@ -143,6 +145,14 @@ public static IngestionConfig mapIngestionSourceConfig(final DataHubIngestionSou result.setVersion(config.getVersion()); result.setExecutorId(config.getExecutorId()); result.setDebugMode(config.isDebugMode()); + if (config.getExtraArgs() != null) { + List extraArgs = config.getExtraArgs() + .keySet() + .stream() + 
.map(key -> new StringMapEntry(key, config.getExtraArgs().get(key))) + .collect(Collectors.toList()); + result.setExtraArgs(extraArgs); + } return result; } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ingest/execution/CreateIngestionExecutionRequestResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ingest/execution/CreateIngestionExecutionRequestResolver.java index e5064e6620526..ea20b837e0a1f 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ingest/execution/CreateIngestionExecutionRequestResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ingest/execution/CreateIngestionExecutionRequestResolver.java @@ -117,6 +117,9 @@ public CompletableFuture get(final DataFetchingEnvironment environment) if (ingestionSourceInfo.getConfig().hasDebugMode()) { debugMode = ingestionSourceInfo.getConfig().isDebugMode() ? "true" : "false"; } + if (ingestionSourceInfo.getConfig().hasExtraArgs()) { + arguments.putAll(ingestionSourceInfo.getConfig().getExtraArgs()); + } arguments.put(DEBUG_MODE_ARG_NAME, debugMode); execInput.setArgs(new StringMap(arguments)); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ingest/source/UpsertIngestionSourceResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ingest/source/UpsertIngestionSourceResolver.java index 2ce394ad5ba84..68e334bd976f8 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ingest/source/UpsertIngestionSourceResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ingest/source/UpsertIngestionSourceResolver.java @@ -1,10 +1,12 @@ package com.linkedin.datahub.graphql.resolvers.ingest.source; import com.linkedin.common.urn.Urn; +import com.linkedin.data.template.StringMap; import com.linkedin.datahub.graphql.QueryContext; import 
com.linkedin.datahub.graphql.exception.AuthorizationException; import com.linkedin.datahub.graphql.exception.DataHubGraphQLErrorCode; import com.linkedin.datahub.graphql.exception.DataHubGraphQLException; +import com.linkedin.datahub.graphql.generated.StringMapEntryInput; import com.linkedin.datahub.graphql.generated.UpdateIngestionSourceConfigInput; import com.linkedin.datahub.graphql.generated.UpdateIngestionSourceInput; import com.linkedin.datahub.graphql.generated.UpdateIngestionSourceScheduleInput; @@ -17,6 +19,8 @@ import com.linkedin.mxe.MetadataChangeProposal; import graphql.schema.DataFetcher; import graphql.schema.DataFetchingEnvironment; +import java.util.Map; +import java.util.stream.Collectors; import lombok.extern.slf4j.Slf4j; import java.net.URISyntaxException; @@ -108,6 +112,12 @@ private DataHubIngestionSourceConfig mapConfig(final UpdateIngestionSourceConfig if (input.getDebugMode() != null) { result.setDebugMode(input.getDebugMode()); } + if (input.getExtraArgs() != null) { + Map extraArgs = input.getExtraArgs() + .stream() + .collect(Collectors.toMap(StringMapEntryInput::getKey, StringMapEntryInput::getValue)); + result.setExtraArgs(new StringMap(extraArgs)); + } return result; } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/AddOwnerResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/AddOwnerResolver.java index 5ca7007d98e43..3f2dab0a5ba71 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/AddOwnerResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/AddOwnerResolver.java @@ -2,14 +2,11 @@ import com.google.common.collect.ImmutableList; import com.linkedin.common.urn.CorpuserUrn; - import com.linkedin.common.urn.Urn; import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.exception.AuthorizationException; import 
com.linkedin.datahub.graphql.generated.AddOwnerInput; -import com.linkedin.datahub.graphql.generated.OwnerEntityType; import com.linkedin.datahub.graphql.generated.OwnerInput; -import com.linkedin.datahub.graphql.generated.OwnershipType; import com.linkedin.datahub.graphql.generated.ResourceRefInput; import com.linkedin.datahub.graphql.resolvers.mutate.util.OwnerUtils; import com.linkedin.metadata.entity.EntityService; @@ -20,7 +17,6 @@ import lombok.extern.slf4j.Slf4j; import static com.linkedin.datahub.graphql.resolvers.ResolverUtils.*; -import static com.linkedin.datahub.graphql.resolvers.mutate.util.OwnerUtils.*; @Slf4j @@ -32,30 +28,33 @@ public class AddOwnerResolver implements DataFetcher> @Override public CompletableFuture get(DataFetchingEnvironment environment) throws Exception { final AddOwnerInput input = bindArgument(environment.getArgument("input"), AddOwnerInput.class); - Urn ownerUrn = Urn.createFromString(input.getOwnerUrn()); - OwnerEntityType ownerEntityType = input.getOwnerEntityType(); - OwnershipType type = input.getType() == null ? OwnershipType.NONE : input.getType(); - String ownershipUrn = input.getOwnershipTypeUrn() == null ? mapOwnershipTypeToEntity(type.name()) : input.getOwnershipTypeUrn(); Urn targetUrn = Urn.createFromString(input.getResourceUrn()); + OwnerInput.Builder ownerInputBuilder = OwnerInput.builder(); + ownerInputBuilder.setOwnerUrn(input.getOwnerUrn()); + ownerInputBuilder.setOwnerEntityType(input.getOwnerEntityType()); + if (input.getType() != null) { + ownerInputBuilder.setType(input.getType()); + } + if (input.getOwnershipTypeUrn() != null) { + ownerInputBuilder.setOwnershipTypeUrn(input.getOwnershipTypeUrn()); + } + OwnerInput ownerInput = ownerInputBuilder.build(); if (!OwnerUtils.isAuthorizedToUpdateOwners(environment.getContext(), targetUrn)) { throw new AuthorizationException("Unauthorized to perform this action. 
Please contact your DataHub administrator."); } return CompletableFuture.supplyAsync(() -> { - OwnerUtils.validateAddInput( - ownerUrn, input.getOwnershipTypeUrn(), ownerEntityType, - targetUrn, - _entityService - ); + OwnerUtils.validateAddOwnerInput(ownerInput, targetUrn, _entityService); + try { log.debug("Adding Owner. input: {}", input); Urn actor = CorpuserUrn.createFromString(((QueryContext) environment.getContext()).getActorUrn()); OwnerUtils.addOwnersToResources( - ImmutableList.of(new OwnerInput(input.getOwnerUrn(), ownerEntityType, type, ownershipUrn)), + ImmutableList.of(ownerInput), ImmutableList.of(new ResourceRefInput(input.getResourceUrn(), null, null)), actor, _entityService diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/AddOwnersResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/AddOwnersResolver.java index 06424efa83819..4e5b5bdb2a651 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/AddOwnersResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/AddOwnersResolver.java @@ -39,7 +39,7 @@ public CompletableFuture get(DataFetchingEnvironment environment) throw throw new AuthorizationException("Unauthorized to perform this action. 
Please contact your DataHub administrator."); } - OwnerUtils.validateAddInput( + OwnerUtils.validateAddOwnerInput( owners, targetUrn, _entityService diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/BatchAddOwnersResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/BatchAddOwnersResolver.java index 019c044d81ab3..5beaeecae673f 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/BatchAddOwnersResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/BatchAddOwnersResolver.java @@ -53,8 +53,7 @@ public CompletableFuture get(DataFetchingEnvironment environment) throw private void validateOwners(List owners) { for (OwnerInput ownerInput : owners) { - OwnerUtils.validateOwner(UrnUtils.getUrn(ownerInput.getOwnerUrn()), ownerInput.getOwnerEntityType(), - UrnUtils.getUrn(ownerInput.getOwnershipTypeUrn()), _entityService); + OwnerUtils.validateOwner(ownerInput, _entityService); } } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/util/OwnerUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/util/OwnerUtils.java index d2f7f896e5953..7233995804423 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/util/OwnerUtils.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/util/OwnerUtils.java @@ -50,7 +50,7 @@ public static void addOwnersToResources( ) { final List changes = new ArrayList<>(); for (ResourceRefInput resource : resources) { - changes.add(buildAddOwnersProposal(owners, UrnUtils.getUrn(resource.getResourceUrn()), actor, entityService)); + changes.add(buildAddOwnersProposal(owners, UrnUtils.getUrn(resource.getResourceUrn()), entityService)); } EntityUtils.ingestChangeProposals(changes, entityService, actor, false); } @@ -69,7 +69,7 @@ 
public static void removeOwnersFromResources( } - private static MetadataChangeProposal buildAddOwnersProposal(List owners, Urn resourceUrn, Urn actor, EntityService entityService) { + static MetadataChangeProposal buildAddOwnersProposal(List owners, Urn resourceUrn, EntityService entityService) { Ownership ownershipAspect = (Ownership) EntityUtils.getAspectFromEntity( resourceUrn.toString(), Constants.OWNERSHIP_ASPECT_NAME, entityService, @@ -181,18 +181,13 @@ public static boolean isAuthorizedToUpdateOwners(@Nonnull QueryContext context, orPrivilegeGroups); } - public static Boolean validateAddInput( + public static Boolean validateAddOwnerInput( List owners, Urn resourceUrn, EntityService entityService ) { for (OwnerInput owner : owners) { - boolean result = validateAddInput( - UrnUtils.getUrn(owner.getOwnerUrn()), - owner.getOwnershipTypeUrn(), - owner.getOwnerEntityType(), - resourceUrn, - entityService); + boolean result = validateAddOwnerInput(owner, resourceUrn, entityService); if (!result) { return false; } @@ -200,44 +195,29 @@ public static Boolean validateAddInput( return true; } - public static Boolean validateAddInput( - Urn ownerUrn, - String ownershipEntityUrn, - OwnerEntityType ownerEntityType, + public static Boolean validateAddOwnerInput( + OwnerInput owner, Urn resourceUrn, EntityService entityService ) { - if (OwnerEntityType.CORP_GROUP.equals(ownerEntityType) && !Constants.CORP_GROUP_ENTITY_NAME.equals(ownerUrn.getEntityType())) { - throw new IllegalArgumentException(String.format("Failed to change ownership for resource %s. Expected a corp group urn.", resourceUrn)); - } - - if (OwnerEntityType.CORP_USER.equals(ownerEntityType) && !Constants.CORP_USER_ENTITY_NAME.equals(ownerUrn.getEntityType())) { - throw new IllegalArgumentException(String.format("Failed to change ownership for resource %s. 
Expected a corp user urn.", resourceUrn)); - } - if (!entityService.exists(resourceUrn)) { throw new IllegalArgumentException(String.format("Failed to change ownership for resource %s. Resource does not exist.", resourceUrn)); } - if (!entityService.exists(ownerUrn)) { - throw new IllegalArgumentException(String.format("Failed to change ownership for resource %s. Owner %s does not exist.", resourceUrn, ownerUrn)); - } - - if (ownershipEntityUrn != null && !entityService.exists(UrnUtils.getUrn(ownershipEntityUrn))) { - throw new IllegalArgumentException(String.format("Failed to change ownership type for resource %s. Ownership Type " - + "%s does not exist.", resourceUrn, ownershipEntityUrn)); - } + validateOwner(owner, entityService); return true; } public static void validateOwner( - Urn ownerUrn, - OwnerEntityType ownerEntityType, - Urn ownershipEntityUrn, + OwnerInput owner, EntityService entityService ) { + + OwnerEntityType ownerEntityType = owner.getOwnerEntityType(); + Urn ownerUrn = UrnUtils.getUrn(owner.getOwnerUrn()); + if (OwnerEntityType.CORP_GROUP.equals(ownerEntityType) && !Constants.CORP_GROUP_ENTITY_NAME.equals(ownerUrn.getEntityType())) { throw new IllegalArgumentException( String.format("Failed to change ownership for resource(s). Expected a corp group urn, found %s", ownerUrn)); @@ -252,9 +232,14 @@ public static void validateOwner( throw new IllegalArgumentException(String.format("Failed to change ownership for resource(s). Owner with urn %s does not exist.", ownerUrn)); } - if (!entityService.exists(ownershipEntityUrn)) { - throw new IllegalArgumentException(String.format("Failed to change ownership for resource(s). Ownership type with " - + "urn %s does not exist.", ownershipEntityUrn)); + if (owner.getOwnershipTypeUrn() != null && !entityService.exists(UrnUtils.getUrn(owner.getOwnershipTypeUrn()))) { + throw new IllegalArgumentException(String.format("Failed to change ownership for resource(s). 
Custom Ownership type with " + + "urn %s does not exist.", owner.getOwnershipTypeUrn())); + } + + if (owner.getType() == null && owner.getOwnershipTypeUrn() == null) { + throw new IllegalArgumentException("Failed to change ownership for resource(s). Expected either " + + "type or ownershipTypeUrn to be specified."); } } @@ -269,11 +254,11 @@ public static Boolean validateRemoveInput( } public static void addCreatorAsOwner( - QueryContext context, - String urn, - OwnerEntityType ownerEntityType, - OwnershipType ownershipType, - EntityService entityService) { + QueryContext context, + String urn, + OwnerEntityType ownerEntityType, + OwnershipType ownershipType, + EntityService entityService) { try { Urn actorUrn = CorpuserUrn.createFromString(context.getActorUrn()); String ownershipTypeUrn = mapOwnershipTypeToEntity(ownershipType.name()); diff --git a/datahub-graphql-core/src/main/resources/ingestion.graphql b/datahub-graphql-core/src/main/resources/ingestion.graphql index 69c8aff124583..21f9fb2633119 100644 --- a/datahub-graphql-core/src/main/resources/ingestion.graphql +++ b/datahub-graphql-core/src/main/resources/ingestion.graphql @@ -332,6 +332,11 @@ type IngestionConfig { Advanced: Whether or not to run ingestion in debug mode """ debugMode: Boolean + + """ + Advanced: Extra arguments for the ingestion run. + """ + extraArgs: [StringMapEntry!] } """ @@ -483,6 +488,11 @@ input UpdateIngestionSourceConfigInput { Whether or not to run ingestion in debug mode """ debugMode: Boolean + + """ + Extra arguments for the ingestion run. + """ + extraArgs: [StringMapEntryInput!] } """ diff --git a/datahub-graphql-core/src/main/resources/search.graphql b/datahub-graphql-core/src/main/resources/search.graphql index 4cabdb04afe77..e0cde5a2db9f9 100644 --- a/datahub-graphql-core/src/main/resources/search.graphql +++ b/datahub-graphql-core/src/main/resources/search.graphql @@ -458,6 +458,26 @@ enum FilterOperator { Represents the relation: The field exists. 
If the field is an array, the field is either not present or empty. """ EXISTS + + """ + Represent the relation greater than, e.g. ownerCount > 5 + """ + GREATER_THAN + + """ + Represent the relation greater than or equal to, e.g. ownerCount >= 5 + """ + GREATER_THAN_OR_EQUAL_TO + + """ + Represent the relation less than, e.g. ownerCount < 3 + """ + LESS_THAN + + """ + Represent the relation less than or equal to, e.g. ownerCount <= 3 + """ + LESS_THAN_OR_EQUAL_TO } """ diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/ingest/source/UpsertIngestionSourceResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/ingest/source/UpsertIngestionSourceResolverTest.java index 2538accc694fb..16d8da9169a8f 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/ingest/source/UpsertIngestionSourceResolverTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/ingest/source/UpsertIngestionSourceResolverTest.java @@ -26,7 +26,7 @@ public class UpsertIngestionSourceResolverTest { "Test source", "mysql", "Test source description", new UpdateIngestionSourceScheduleInput("* * * * *", "UTC"), - new UpdateIngestionSourceConfigInput("my test recipe", "0.8.18", "executor id", false) + new UpdateIngestionSourceConfigInput("my test recipe", "0.8.18", "executor id", false, null) ); @Test diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/owner/AddOwnersResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/owner/AddOwnersResolverTest.java index efc0c5dfcf36d..329d71ec125db 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/owner/AddOwnersResolverTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/owner/AddOwnersResolverTest.java @@ -2,6 +2,11 @@ import com.google.common.collect.ImmutableList; import 
com.linkedin.common.AuditStamp; +import com.linkedin.common.Owner; +import com.linkedin.common.OwnerArray; +import com.linkedin.common.Ownership; +import com.linkedin.common.OwnershipSource; +import com.linkedin.common.OwnershipSourceType; import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; import com.linkedin.datahub.graphql.QueryContext; @@ -28,6 +33,7 @@ public class AddOwnersResolverTest { private static final String TEST_ENTITY_URN = "urn:li:dataset:(urn:li:dataPlatform:mysql,my-test,PROD)"; private static final String TEST_OWNER_1_URN = "urn:li:corpuser:test-id-1"; private static final String TEST_OWNER_2_URN = "urn:li:corpuser:test-id-2"; + private static final String TEST_OWNER_3_URN = "urn:li:corpGroup:test-id-3"; @Test public void testGetSuccessNoExistingOwners() throws Exception { @@ -75,33 +81,41 @@ public void testGetSuccessNoExistingOwners() throws Exception { } @Test - public void testGetSuccessExistingOwners() throws Exception { + public void testGetSuccessExistingOwnerNewType() throws Exception { EntityService mockService = getMockEntityService(); + com.linkedin.common.Ownership oldOwnership = new Ownership().setOwners(new OwnerArray( + ImmutableList.of(new Owner() + .setOwner(UrnUtils.getUrn(TEST_OWNER_1_URN)) + .setType(com.linkedin.common.OwnershipType.NONE) + .setSource(new OwnershipSource().setType(OwnershipSourceType.MANUAL)) + ))); + Mockito.when(mockService.getAspect( - Mockito.eq(UrnUtils.getUrn(TEST_ENTITY_URN)), - Mockito.eq(Constants.OWNERSHIP_ASPECT_NAME), - Mockito.eq(0L))) - .thenReturn(null); + Mockito.eq(UrnUtils.getUrn(TEST_ENTITY_URN)), + Mockito.eq(Constants.OWNERSHIP_ASPECT_NAME), + Mockito.eq(0L))) + .thenReturn(oldOwnership); Mockito.when(mockService.exists(Urn.createFromString(TEST_ENTITY_URN))).thenReturn(true); Mockito.when(mockService.exists(Urn.createFromString(TEST_OWNER_1_URN))).thenReturn(true); - Mockito.when(mockService.exists(Urn.createFromString(TEST_OWNER_2_URN))).thenReturn(true); 
Mockito.when(mockService.exists(Urn.createFromString( - OwnerUtils.mapOwnershipTypeToEntity(com.linkedin.datahub.graphql.generated.OwnershipType.TECHNICAL_OWNER.name())))) - .thenReturn(true); + OwnerUtils.mapOwnershipTypeToEntity(com.linkedin.datahub.graphql.generated.OwnershipType.TECHNICAL_OWNER.name())))) + .thenReturn(true); AddOwnersResolver resolver = new AddOwnersResolver(mockService); // Execute resolver QueryContext mockContext = getMockAllowContext(); DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class); + AddOwnersInput input = new AddOwnersInput(ImmutableList.of( - new OwnerInput(TEST_OWNER_1_URN, OwnerEntityType.CORP_USER, OwnershipType.TECHNICAL_OWNER, - OwnerUtils.mapOwnershipTypeToEntity(OwnershipType.TECHNICAL_OWNER.name())), - new OwnerInput(TEST_OWNER_2_URN, OwnerEntityType.CORP_USER, OwnershipType.TECHNICAL_OWNER, - OwnerUtils.mapOwnershipTypeToEntity(OwnershipType.TECHNICAL_OWNER.name())) + OwnerInput.builder() + .setOwnerUrn(TEST_OWNER_1_URN) + .setOwnershipTypeUrn(OwnerUtils.mapOwnershipTypeToEntity(OwnershipType.TECHNICAL_OWNER.name())) + .setOwnerEntityType(OwnerEntityType.CORP_USER) + .build() ), TEST_ENTITY_URN); Mockito.when(mockEnv.getArgument(Mockito.eq("input"))).thenReturn(input); Mockito.when(mockEnv.getContext()).thenReturn(mockContext); @@ -111,11 +125,126 @@ public void testGetSuccessExistingOwners() throws Exception { verifyIngestProposal(mockService, 1); Mockito.verify(mockService, Mockito.times(1)).exists( - Mockito.eq(Urn.createFromString(TEST_OWNER_1_URN)) + Mockito.eq(Urn.createFromString(TEST_OWNER_1_URN)) ); + } + + @Test + public void testGetSuccessDeprecatedTypeToOwnershipType() throws Exception { + EntityService mockService = getMockEntityService(); + + com.linkedin.common.Ownership oldOwnership = new Ownership().setOwners(new OwnerArray( + ImmutableList.of(new Owner() + .setOwner(UrnUtils.getUrn(TEST_OWNER_1_URN)) + .setType(com.linkedin.common.OwnershipType.TECHNICAL_OWNER) + .setSource(new 
OwnershipSource().setType(OwnershipSourceType.MANUAL)) + ))); + + Mockito.when(mockService.getAspect( + Mockito.eq(UrnUtils.getUrn(TEST_ENTITY_URN)), + Mockito.eq(Constants.OWNERSHIP_ASPECT_NAME), + Mockito.eq(0L))) + .thenReturn(oldOwnership); + + Mockito.when(mockService.exists(Urn.createFromString(TEST_ENTITY_URN))).thenReturn(true); + Mockito.when(mockService.exists(Urn.createFromString(TEST_OWNER_1_URN))).thenReturn(true); + + Mockito.when(mockService.exists(Urn.createFromString( + OwnerUtils.mapOwnershipTypeToEntity(com.linkedin.datahub.graphql.generated.OwnershipType.TECHNICAL_OWNER.name())))) + .thenReturn(true); + + AddOwnersResolver resolver = new AddOwnersResolver(mockService); + + // Execute resolver + QueryContext mockContext = getMockAllowContext(); + DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class); + + AddOwnersInput input = new AddOwnersInput(ImmutableList.of(OwnerInput.builder() + .setOwnerUrn(TEST_OWNER_1_URN) + .setOwnershipTypeUrn(OwnerUtils.mapOwnershipTypeToEntity(OwnershipType.TECHNICAL_OWNER.name())) + .setOwnerEntityType(OwnerEntityType.CORP_USER) + .build() + ), TEST_ENTITY_URN); + Mockito.when(mockEnv.getArgument(Mockito.eq("input"))).thenReturn(input); + Mockito.when(mockEnv.getContext()).thenReturn(mockContext); + assertTrue(resolver.get(mockEnv).get()); + + // Unable to easily validate exact payload due to the injected timestamp + verifyIngestProposal(mockService, 1); Mockito.verify(mockService, Mockito.times(1)).exists( - Mockito.eq(Urn.createFromString(TEST_OWNER_2_URN)) + Mockito.eq(Urn.createFromString(TEST_OWNER_1_URN)) + ); + } + + @Test + public void testGetSuccessMultipleOwnerTypes() throws Exception { + EntityService mockService = getMockEntityService(); + + com.linkedin.common.Ownership oldOwnership = new Ownership().setOwners(new OwnerArray( + ImmutableList.of(new Owner() + .setOwner(UrnUtils.getUrn(TEST_OWNER_1_URN)) + .setType(com.linkedin.common.OwnershipType.NONE) + .setSource(new 
OwnershipSource().setType(OwnershipSourceType.MANUAL)) + ))); + + Mockito.when(mockService.getAspect( + Mockito.eq(UrnUtils.getUrn(TEST_ENTITY_URN)), + Mockito.eq(Constants.OWNERSHIP_ASPECT_NAME), + Mockito.eq(0L))) + .thenReturn(oldOwnership); + + Mockito.when(mockService.exists(Urn.createFromString(TEST_ENTITY_URN))).thenReturn(true); + Mockito.when(mockService.exists(Urn.createFromString(TEST_OWNER_1_URN))).thenReturn(true); + Mockito.when(mockService.exists(Urn.createFromString(TEST_OWNER_2_URN))).thenReturn(true); + Mockito.when(mockService.exists(Urn.createFromString(TEST_OWNER_3_URN))).thenReturn(true); + + Mockito.when(mockService.exists(Urn.createFromString( + OwnerUtils.mapOwnershipTypeToEntity(com.linkedin.datahub.graphql.generated.OwnershipType.TECHNICAL_OWNER.name())))) + .thenReturn(true); + Mockito.when(mockService.exists(Urn.createFromString( + OwnerUtils.mapOwnershipTypeToEntity(com.linkedin.datahub.graphql.generated.OwnershipType.BUSINESS_OWNER.name())))) + .thenReturn(true); + + AddOwnersResolver resolver = new AddOwnersResolver(mockService); + + // Execute resolver + QueryContext mockContext = getMockAllowContext(); + DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class); + + AddOwnersInput input = new AddOwnersInput(ImmutableList.of(OwnerInput.builder() + .setOwnerUrn(TEST_OWNER_1_URN) + .setOwnershipTypeUrn(OwnerUtils.mapOwnershipTypeToEntity(OwnershipType.TECHNICAL_OWNER.name())) + .setOwnerEntityType(OwnerEntityType.CORP_USER) + .build(), + OwnerInput.builder() + .setOwnerUrn(TEST_OWNER_2_URN) + .setOwnershipTypeUrn(OwnerUtils.mapOwnershipTypeToEntity(OwnershipType.BUSINESS_OWNER.name())) + .setOwnerEntityType(OwnerEntityType.CORP_USER) + .build(), + OwnerInput.builder() + .setOwnerUrn(TEST_OWNER_3_URN) + .setOwnershipTypeUrn(OwnerUtils.mapOwnershipTypeToEntity(OwnershipType.TECHNICAL_OWNER.name())) + .setOwnerEntityType(OwnerEntityType.CORP_GROUP) + .build() + ), TEST_ENTITY_URN); + 
Mockito.when(mockEnv.getArgument(Mockito.eq("input"))).thenReturn(input); + Mockito.when(mockEnv.getContext()).thenReturn(mockContext); + assertTrue(resolver.get(mockEnv).get()); + + // Unable to easily validate exact payload due to the injected timestamp + verifyIngestProposal(mockService, 1); + + Mockito.verify(mockService, Mockito.times(1)).exists( + Mockito.eq(Urn.createFromString(TEST_OWNER_1_URN)) + ); + + Mockito.verify(mockService, Mockito.times(1)).exists( + Mockito.eq(Urn.createFromString(TEST_OWNER_2_URN)) + ); + + Mockito.verify(mockService, Mockito.times(1)).exists( + Mockito.eq(Urn.createFromString(TEST_OWNER_3_URN)) ); } diff --git a/datahub-web-react/src/app/entity/shared/EntityDropdown/CreateGlossaryEntityModal.tsx b/datahub-web-react/src/app/entity/shared/EntityDropdown/CreateGlossaryEntityModal.tsx index d48ead2f5863e..9788d36af2c65 100644 --- a/datahub-web-react/src/app/entity/shared/EntityDropdown/CreateGlossaryEntityModal.tsx +++ b/datahub-web-react/src/app/entity/shared/EntityDropdown/CreateGlossaryEntityModal.tsx @@ -112,7 +112,11 @@ function CreateGlossaryEntityModal(props: Props) { - @@ -130,6 +134,7 @@ function CreateGlossaryEntityModal(props: Props) { > Name}> setIsMoveModalVisible(true)} > - +  Move diff --git a/datahub-web-react/src/app/entity/shared/EntityDropdown/MoveGlossaryEntityModal.tsx b/datahub-web-react/src/app/entity/shared/EntityDropdown/MoveGlossaryEntityModal.tsx index 5352825708776..37a625f58100b 100644 --- a/datahub-web-react/src/app/entity/shared/EntityDropdown/MoveGlossaryEntityModal.tsx +++ b/datahub-web-react/src/app/entity/shared/EntityDropdown/MoveGlossaryEntityModal.tsx @@ -64,6 +64,7 @@ function MoveGlossaryEntityModal(props: Props) { return ( Cancel - + } > diff --git a/datahub-web-react/src/app/entity/shared/components/legacy/DescriptionModal.tsx b/datahub-web-react/src/app/entity/shared/components/legacy/DescriptionModal.tsx index 579b8c9905da0..cb37c44a36caa 100644 --- 
a/datahub-web-react/src/app/entity/shared/components/legacy/DescriptionModal.tsx +++ b/datahub-web-react/src/app/entity/shared/components/legacy/DescriptionModal.tsx @@ -41,7 +41,11 @@ export default function UpdateDescriptionModal({ title, description, original, o footer={ <> - diff --git a/datahub-web-react/src/app/entity/shared/components/styled/AddLinkModal.tsx b/datahub-web-react/src/app/entity/shared/components/styled/AddLinkModal.tsx index 34d4f0cb3fe91..68a8cf4094362 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/AddLinkModal.tsx +++ b/datahub-web-react/src/app/entity/shared/components/styled/AddLinkModal.tsx @@ -57,7 +57,7 @@ export const AddLinkModal = ({ buttonProps, refetch }: AddLinkProps) => { return ( <> - { , - , ]} >
{ )} diff --git a/datahub-web-react/src/app/entity/shared/tabs/Documentation/DocumentationTab.tsx b/datahub-web-react/src/app/entity/shared/tabs/Documentation/DocumentationTab.tsx index de065d23e56e7..344c2aef87175 100644 --- a/datahub-web-react/src/app/entity/shared/tabs/Documentation/DocumentationTab.tsx +++ b/datahub-web-react/src/app/entity/shared/tabs/Documentation/DocumentationTab.tsx @@ -60,6 +60,7 @@ export const DocumentationTab = ({ properties }: { properties?: Props }) => {
- diff --git a/datahub-web-react/src/app/glossary/BusinessGlossaryPage.tsx b/datahub-web-react/src/app/glossary/BusinessGlossaryPage.tsx index 11f54cb5078e6..a5262265fd23d 100644 --- a/datahub-web-react/src/app/glossary/BusinessGlossaryPage.tsx +++ b/datahub-web-react/src/app/glossary/BusinessGlossaryPage.tsx @@ -92,11 +92,12 @@ function BusinessGlossaryPage() { {(termsError || nodesError) && ( )} - + Business Glossary
diff --git a/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx b/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx index 6c91a0f6f3f8f..13af19b0b6ac2 100644 --- a/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx +++ b/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx @@ -15,7 +15,7 @@ import { Message } from '../../shared/Message'; import TabToolbar from '../../entity/shared/components/styled/TabToolbar'; import { IngestionSourceBuilderModal } from './builder/IngestionSourceBuilderModal'; import { addToListIngestionSourcesCache, CLI_EXECUTOR_ID, removeFromListIngestionSourcesCache } from './utils'; -import { DEFAULT_EXECUTOR_ID, SourceBuilderState } from './builder/types'; +import { DEFAULT_EXECUTOR_ID, SourceBuilderState, StringMapEntryInput } from './builder/types'; import { IngestionSource, UpdateIngestionSourceInput } from '../../../types.generated'; import { SearchBar } from '../../search/SearchBar'; import { useEntityRegistry } from '../../useEntityRegistry'; @@ -173,6 +173,11 @@ export const IngestionSourceList = () => { setFocusSourceUrn(undefined); }; + const formatExtraArgs = (extraArgs): StringMapEntryInput[] => { + if (extraArgs === null || extraArgs === undefined) return []; + return extraArgs.map((entry) => ({ key: entry.key, value: entry.value })); + }; + const createOrUpdateIngestionSource = ( input: UpdateIngestionSourceInput, resetState: () => void, @@ -294,6 +299,7 @@ export const IngestionSourceList = () => { (recipeBuilderState.config?.executorId as string)) || DEFAULT_EXECUTOR_ID, debugMode: recipeBuilderState.config?.debugMode || false, + extraArgs: formatExtraArgs(recipeBuilderState.config?.extraArgs || []), }, schedule: recipeBuilderState.schedule && { interval: recipeBuilderState.schedule?.interval as string, @@ -358,7 +364,12 @@ export const IngestionSourceList = () => {
- )} diff --git a/datahub-web-react/src/app/ingest/source/builder/CreateScheduleStep.tsx b/datahub-web-react/src/app/ingest/source/builder/CreateScheduleStep.tsx index dba9b25e14e99..7a14b6a794189 100644 --- a/datahub-web-react/src/app/ingest/source/builder/CreateScheduleStep.tsx +++ b/datahub-web-react/src/app/ingest/source/builder/CreateScheduleStep.tsx @@ -167,7 +167,11 @@ export const CreateScheduleStep = ({ state, updateState, goTo, prev }: StepProps
-
diff --git a/datahub-web-react/src/app/ingest/source/builder/NameSourceStep.tsx b/datahub-web-react/src/app/ingest/source/builder/NameSourceStep.tsx index 913f8253ece5a..f4c048bcaf0d2 100644 --- a/datahub-web-react/src/app/ingest/source/builder/NameSourceStep.tsx +++ b/datahub-web-react/src/app/ingest/source/builder/NameSourceStep.tsx @@ -1,7 +1,7 @@ import { Button, Checkbox, Collapse, Form, Input, Typography } from 'antd'; import React from 'react'; import styled from 'styled-components'; -import { SourceBuilderState, StepProps } from './types'; +import { SourceBuilderState, StepProps, StringMapEntryInput } from './types'; const ControlsContainer = styled.div` display: flex; @@ -13,6 +13,10 @@ const SaveButton = styled(Button)` margin-right: 15px; `; +const ExtraEnvKey = 'extra_env_vars'; +const ExtraReqKey = 'extra_pip_requirements'; +const ExtraPluginKey = 'extra_pip_plugins'; + export const NameSourceStep = ({ state, updateState, prev, submit }: StepProps) => { const setName = (stagedName: string) => { const newState: SourceBuilderState = { @@ -55,6 +59,90 @@ export const NameSourceStep = ({ state, updateState, prev, submit }: StepProps) updateState(newState); }; + const retrieveExtraEnvs = () => { + const extraArgs: StringMapEntryInput[] = state.config?.extraArgs ? state.config?.extraArgs : []; + const index: number = extraArgs.findIndex((entry) => entry.key === ExtraEnvKey) as number; + if (index > -1) { + return extraArgs[index].value; + } + return ''; + }; + + const setExtraEnvs = (envs: string) => { + let extraArgs: StringMapEntryInput[] = state.config?.extraArgs ? 
state.config?.extraArgs : []; + const indxOfEnvVars: number = extraArgs.findIndex((entry) => entry.key === ExtraEnvKey) as number; + const value = { key: ExtraEnvKey, value: envs }; + if (indxOfEnvVars > -1) { + extraArgs[indxOfEnvVars] = value; + } else { + extraArgs = [...extraArgs, value]; + } + const newState: SourceBuilderState = { + ...state, + config: { + ...state.config, + extraArgs, + }, + }; + updateState(newState); + }; + + const retrieveExtraDataHubPlugins = () => { + const extraArgs: StringMapEntryInput[] = state.config?.extraArgs ? state.config?.extraArgs : []; + const index: number = extraArgs.findIndex((entry) => entry.key === ExtraPluginKey) as number; + if (index > -1) { + return extraArgs[index].value; + } + return ''; + }; + + const setExtraDataHubPlugins = (plugins: string) => { + let extraArgs: StringMapEntryInput[] = state.config?.extraArgs ? state.config?.extraArgs : []; + const indxOfPlugins: number = extraArgs.findIndex((entry) => entry.key === ExtraPluginKey) as number; + const value = { key: ExtraPluginKey, value: plugins }; + if (indxOfPlugins > -1) { + extraArgs[indxOfPlugins] = value; + } else { + extraArgs = [...extraArgs, value]; + } + const newState: SourceBuilderState = { + ...state, + config: { + ...state.config, + extraArgs, + }, + }; + updateState(newState); + }; + + const retrieveExtraReqs = () => { + const extraArgs: StringMapEntryInput[] = state.config?.extraArgs ? state.config?.extraArgs : []; + const index: number = extraArgs.findIndex((entry) => entry.key === ExtraReqKey) as number; + if (index > -1) { + return extraArgs[index].value; + } + return ''; + }; + + const setExtraReqs = (reqs: string) => { + let extraArgs: StringMapEntryInput[] = state.config?.extraArgs ? 
state.config?.extraArgs : []; + const indxOfReqs: number = extraArgs.findIndex((entry) => entry.key === ExtraReqKey) as number; + const value = { key: ExtraReqKey, value: reqs }; + if (indxOfReqs > -1) { + extraArgs[indxOfReqs] = value; + } else { + extraArgs = [...extraArgs, value]; + } + const newState: SourceBuilderState = { + ...state, + config: { + ...state.config, + extraArgs, + }, + }; + updateState(newState); + }; + const onClickCreate = (shouldRun?: boolean) => { if (state.name !== undefined && state.name.length > 0) { submit(shouldRun); @@ -116,6 +204,39 @@ export const NameSourceStep = ({ state, updateState, prev, submit }: StepProps) onChange={(event) => setDebugMode(event.target.checked)} /> + Extra Enviroment Variables}> + + Advanced: Set extra environment variables to an ingestion execution + + setExtraEnvs(event.target.value)} + /> + + Extra DataHub plugins}> + + Advanced: Set extra DataHub plugins for an ingestion execution + + setExtraDataHubPlugins(event.target.value)} + /> + + Extra Pip Libraries}> + + Advanced: Add extra pip libraries for an ingestion execution + + setExtraReqs(event.target.value)} + /> + @@ -123,6 +244,7 @@ export const NameSourceStep = ({ state, updateState, prev, submit }: StepProps)
0)} onClick={() => onClickCreate(false)} > diff --git a/datahub-web-react/src/app/ingest/source/builder/RecipeBuilder.tsx b/datahub-web-react/src/app/ingest/source/builder/RecipeBuilder.tsx index 4ddeb7b492595..bee9b04cee100 100644 --- a/datahub-web-react/src/app/ingest/source/builder/RecipeBuilder.tsx +++ b/datahub-web-react/src/app/ingest/source/builder/RecipeBuilder.tsx @@ -86,10 +86,20 @@ function RecipeBuilder(props: Props) { {sourceConfigs?.displayName} Recipe - switchViews(true)}> + switchViews(true)} + data-testid="recipe-builder-form-button" + > Form - switchViews(false)}> + switchViews(false)} + data-testid="recipe-builder-yaml-button" + > YAML @@ -114,7 +124,9 @@ function RecipeBuilder(props: Props) { - + )} diff --git a/datahub-web-react/src/app/ingest/source/builder/types.ts b/datahub-web-react/src/app/ingest/source/builder/types.ts index cfe0f27ae7dbe..2df467b7beba1 100644 --- a/datahub-web-react/src/app/ingest/source/builder/types.ts +++ b/datahub-web-react/src/app/ingest/source/builder/types.ts @@ -34,6 +34,18 @@ export type StepProps = { ingestionSources: SourceConfig[]; }; +export type StringMapEntryInput = { + /** + * The key of the map entry + */ + key: string; + + /** + * The value of the map entry + */ + value: string; +}; + /** * The object represents the state of the Ingestion Source Builder form. */ @@ -91,5 +103,10 @@ export interface SourceBuilderState { * Advanced: Whether or not to run this ingestion source in debug mode */ debugMode?: boolean | null; + + /** + * Advanced: Extra arguments for the ingestion run.
+ */ + extraArgs?: StringMapEntryInput[] | null; }; } diff --git a/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx b/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx index 849efabdcde97..00fdc89964f88 100644 --- a/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx +++ b/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx @@ -2,6 +2,7 @@ import { DownloadOutlined } from '@ant-design/icons'; import { Button, message, Modal, Typography } from 'antd'; import React, { useEffect, useState } from 'react'; import styled from 'styled-components'; +import YAML from 'yamljs'; import { useGetIngestionExecutionRequestQuery } from '../../../../graphql/ingestion.generated'; import { ANTD_GRAY } from '../../../entity/shared/constants'; import { downloadFile } from '../../../search/utils/csvUtils'; @@ -65,6 +66,13 @@ const IngestedAssetsSection = styled.div` padding-right: 30px; `; +const RecipeSection = styled.div` + border-top: 1px solid ${ANTD_GRAY[4]}; + padding-top: 16px; + padding-left: 30px; + padding-right: 30px; +`; + const LogsSection = styled.div` padding-top: 16px; padding-left: 30px; @@ -91,6 +99,8 @@ type Props = { export const ExecutionDetailsModal = ({ urn, visible, onClose }: Props) => { const [showExpandedLogs, setShowExpandedLogs] = useState(false); + const [showExpandedRecipe, setShowExpandedRecipe] = useState(false); + const { data, loading, error, refetch } = useGetIngestionExecutionRequestQuery({ variables: { urn } }); const output = data?.executionRequest?.result?.report || 'No output found.'; @@ -120,7 +130,18 @@ export const ExecutionDetailsModal = ({ urn, visible, onClose }: Props) => { const resultSummaryText = (result && {getExecutionRequestSummaryText(result)}) || undefined; - const isOutputExpandable = output.length > 100; + + const recipeJson = data?.executionRequest?.input.arguments?.find((arg) => arg.key === 
'recipe')?.value; + let recipeYaml: string; + try { + recipeYaml = recipeJson && YAML.stringify(JSON.parse(recipeJson), 8, 2).trim(); + } catch (e) { + recipeYaml = ''; + } + const recipe = showExpandedRecipe ? recipeYaml : recipeYaml?.split('\n').slice(0, 1).join('\n'); + + const areLogsExpandable = output.length > 100; + const isRecipeExpandable = recipeYaml?.includes('\n'); return ( { -
{`${logs}${!showExpandedLogs && isOutputExpandable ? '...' : ''}`}
- {isOutputExpandable && ( +
{`${logs}${!showExpandedLogs && areLogsExpandable ? '...' : ''}`}
+ {areLogsExpandable && ( setShowExpandedLogs(!showExpandedLogs)}> {showExpandedLogs ? 'Hide' : 'Show More'} )}
+ {recipe && ( + + Recipe + + + The recipe used for this ingestion run. + + + +
{`${recipe}${!showExpandedRecipe && isRecipeExpandable ? '\n...' : ''}`}
+
+ {isRecipeExpandable && ( + setShowExpandedRecipe((v) => !v)}> + {showExpandedRecipe ? 'Hide' : 'Show More'} + + )} +
+ )}
); diff --git a/datahub-web-react/src/app/ingest/source/utils.ts b/datahub-web-react/src/app/ingest/source/utils.ts index c372388e958b7..f789ed8434721 100644 --- a/datahub-web-react/src/app/ingest/source/utils.ts +++ b/datahub-web-react/src/app/ingest/source/utils.ts @@ -1,17 +1,19 @@ -import YAML from 'yamljs'; import { CheckCircleOutlined, ClockCircleOutlined, CloseCircleOutlined, + ExclamationCircleOutlined, LoadingOutlined, + StopOutlined, WarningOutlined, } from '@ant-design/icons'; -import { ANTD_GRAY, REDESIGN_COLORS } from '../../entity/shared/constants'; +import YAML from 'yamljs'; +import { ListIngestionSourcesDocument, ListIngestionSourcesQuery } from '../../../graphql/ingestion.generated'; import { EntityType, FacetMetadata } from '../../../types.generated'; -import { capitalizeFirstLetterOnly, pluralize } from '../../shared/textUtil'; import EntityRegistry from '../../entity/EntityRegistry'; +import { ANTD_GRAY, REDESIGN_COLORS } from '../../entity/shared/constants'; +import { capitalizeFirstLetterOnly, pluralize } from '../../shared/textUtil'; import { SourceConfig } from './builder/types'; -import { ListIngestionSourcesDocument, ListIngestionSourcesQuery } from '../../../graphql/ingestion.generated'; export const getSourceConfigs = (ingestionSources: SourceConfig[], sourceType: string) => { const sourceConfigs = ingestionSources.find((source) => source.name === sourceType); @@ -40,7 +42,9 @@ export function getPlaceholderRecipe(ingestionSources: SourceConfig[], type?: st export const RUNNING = 'RUNNING'; export const SUCCESS = 'SUCCESS'; +export const WARNING = 'WARNING'; export const FAILURE = 'FAILURE'; +export const CONNECTION_FAILURE = 'CONNECTION_FAILURE'; export const CANCELLED = 'CANCELLED'; export const UP_FOR_RETRY = 'UP_FOR_RETRY'; export const ROLLING_BACK = 'ROLLING_BACK'; @@ -56,8 +60,10 @@ export const getExecutionRequestStatusIcon = (status: string) => { return ( (status === RUNNING && LoadingOutlined) || (status === SUCCESS && 
CheckCircleOutlined) || + (status === WARNING && ExclamationCircleOutlined) || (status === FAILURE && CloseCircleOutlined) || - (status === CANCELLED && CloseCircleOutlined) || + (status === CONNECTION_FAILURE && CloseCircleOutlined) || + (status === CANCELLED && StopOutlined) || (status === UP_FOR_RETRY && ClockCircleOutlined) || (status === ROLLED_BACK && WarningOutlined) || (status === ROLLING_BACK && LoadingOutlined) || @@ -70,7 +76,9 @@ export const getExecutionRequestStatusDisplayText = (status: string) => { return ( (status === RUNNING && 'Running') || (status === SUCCESS && 'Succeeded') || + (status === WARNING && 'Completed') || (status === FAILURE && 'Failed') || + (status === CONNECTION_FAILURE && 'Connection Failed') || (status === CANCELLED && 'Cancelled') || (status === UP_FOR_RETRY && 'Up for Retry') || (status === ROLLED_BACK && 'Rolled Back') || @@ -83,21 +91,25 @@ export const getExecutionRequestStatusDisplayText = (status: string) => { export const getExecutionRequestSummaryText = (status: string) => { switch (status) { case RUNNING: - return 'Ingestion is running'; + return 'Ingestion is running...'; case SUCCESS: - return 'Ingestion successfully completed'; + return 'Ingestion succeeded with no errors or suspected missing data.'; + case WARNING: + return 'Ingestion completed with minor or intermittent errors.'; case FAILURE: - return 'Ingestion completed with errors'; + return 'Ingestion failed to complete, or completed with serious errors.'; + case CONNECTION_FAILURE: + return 'Ingestion failed due to network, authentication, or permission issues.'; case CANCELLED: - return 'Ingestion was cancelled'; + return 'Ingestion was cancelled.'; case ROLLED_BACK: - return 'Ingestion was rolled back'; + return 'Ingestion was rolled back.'; case ROLLING_BACK: - return 'Ingestion is in the process of rolling back'; + return 'Ingestion is in the process of rolling back.'; case ROLLBACK_FAILED: - return 'Ingestion rollback failed'; + return 'Ingestion 
rollback failed.'; default: - return 'Ingestion status not recognized'; + return 'Ingestion status not recognized.'; } }; @@ -105,7 +117,9 @@ export const getExecutionRequestStatusDisplayColor = (status: string) => { return ( (status === RUNNING && REDESIGN_COLORS.BLUE) || (status === SUCCESS && 'green') || + (status === WARNING && 'orangered') || (status === FAILURE && 'red') || + (status === CONNECTION_FAILURE && 'crimson') || (status === UP_FOR_RETRY && 'orange') || (status === CANCELLED && ANTD_GRAY[9]) || (status === ROLLED_BACK && 'orange') || diff --git a/datahub-web-react/src/app/permissions/policy/PolicyDetailsModal.tsx b/datahub-web-react/src/app/permissions/policy/PolicyDetailsModal.tsx index 68e91983babdb..d3e01df3a66e8 100644 --- a/datahub-web-react/src/app/permissions/policy/PolicyDetailsModal.tsx +++ b/datahub-web-react/src/app/permissions/policy/PolicyDetailsModal.tsx @@ -67,8 +67,8 @@ export default function PolicyDetailsModal({ policy, visible, onClose, privilege const isMetadataPolicy = policy?.type === PolicyType.Metadata; const resources = convertLegacyResourceFilter(policy?.resources); - const resourceTypes = getFieldValues(resources?.filter, 'RESOURCE_TYPE') || []; - const resourceEntities = getFieldValues(resources?.filter, 'RESOURCE_URN') || []; + const resourceTypes = getFieldValues(resources?.filter, 'TYPE') || []; + const resourceEntities = getFieldValues(resources?.filter, 'URN') || []; const domains = getFieldValues(resources?.filter, 'DOMAIN') || []; const { diff --git a/datahub-web-react/src/app/permissions/policy/PolicyPrivilegeForm.tsx b/datahub-web-react/src/app/permissions/policy/PolicyPrivilegeForm.tsx index 1520388a5033a..b8e1505fceaec 100644 --- a/datahub-web-react/src/app/permissions/policy/PolicyPrivilegeForm.tsx +++ b/datahub-web-react/src/app/permissions/policy/PolicyPrivilegeForm.tsx @@ -67,8 +67,8 @@ export default function PolicyPrivilegeForm({ } = useAppConfig(); const resources: ResourceFilter = 
convertLegacyResourceFilter(maybeResources) || EMPTY_POLICY.resources; - const resourceTypes = getFieldValues(resources.filter, 'RESOURCE_TYPE') || []; - const resourceEntities = getFieldValues(resources.filter, 'RESOURCE_URN') || []; + const resourceTypes = getFieldValues(resources.filter, 'TYPE') || []; + const resourceEntities = getFieldValues(resources.filter, 'URN') || []; const getDisplayName = (entity) => { if (!entity) { @@ -145,10 +145,7 @@ export default function PolicyPrivilegeForm({ }; setResources({ ...resources, - filter: setFieldValues(filter, 'RESOURCE_TYPE', [ - ...resourceTypes, - createCriterionValue(selectedResourceType), - ]), + filter: setFieldValues(filter, 'TYPE', [...resourceTypes, createCriterionValue(selectedResourceType)]), }); }; @@ -160,7 +157,7 @@ export default function PolicyPrivilegeForm({ ...resources, filter: setFieldValues( filter, - 'RESOURCE_TYPE', + 'TYPE', resourceTypes?.filter((criterionValue) => criterionValue.value !== deselectedResourceType), ), }); @@ -173,7 +170,7 @@ export default function PolicyPrivilegeForm({ }; setResources({ ...resources, - filter: setFieldValues(filter, 'RESOURCE_URN', [ + filter: setFieldValues(filter, 'URN', [ ...resourceEntities, createCriterionValueWithEntity( resource, @@ -192,7 +189,7 @@ export default function PolicyPrivilegeForm({ ...resources, filter: setFieldValues( filter, - 'RESOURCE_URN', + 'URN', resourceEntities?.filter((criterionValue) => criterionValue.value !== resource), ), }); diff --git a/datahub-web-react/src/app/permissions/policy/policyUtils.ts b/datahub-web-react/src/app/permissions/policy/policyUtils.ts index c7af7342f6efa..2f178fcdeb5c3 100644 --- a/datahub-web-react/src/app/permissions/policy/policyUtils.ts +++ b/datahub-web-react/src/app/permissions/policy/policyUtils.ts @@ -99,10 +99,10 @@ export const convertLegacyResourceFilter = (resourceFilter: Maybe(); if (resourceFilter.type) { - criteria.push(createCriterion('RESOURCE_TYPE', 
[createCriterionValue(resourceFilter.type)])); + criteria.push(createCriterion('TYPE', [createCriterionValue(resourceFilter.type)])); } if (resourceFilter.resources && resourceFilter.resources.length > 0) { - criteria.push(createCriterion('RESOURCE_URN', resourceFilter.resources.map(createCriterionValue))); + criteria.push(createCriterion('URN', resourceFilter.resources.map(createCriterionValue))); } return { filter: { diff --git a/datahub-web-react/src/app/preview/EntityPaths/EntityPathsModal.tsx b/datahub-web-react/src/app/preview/EntityPaths/EntityPathsModal.tsx index d5722429aaf6b..2bb76714d6119 100644 --- a/datahub-web-react/src/app/preview/EntityPaths/EntityPathsModal.tsx +++ b/datahub-web-react/src/app/preview/EntityPaths/EntityPathsModal.tsx @@ -39,6 +39,7 @@ export default function EntityPathsModal({ paths, resultEntityUrn, hideModal }: return ( Column path{paths.length > 1 && 's'} from{' '} diff --git a/datahub-web-react/src/graphql/ingestion.graphql b/datahub-web-react/src/graphql/ingestion.graphql index 80f66642fe11f..1767fe34bfef0 100644 --- a/datahub-web-react/src/graphql/ingestion.graphql +++ b/datahub-web-react/src/graphql/ingestion.graphql @@ -12,6 +12,10 @@ query listIngestionSources($input: ListIngestionSourcesInput!) { version executorId debugMode + extraArgs { + key + value + } } schedule { interval @@ -51,6 +55,10 @@ query getIngestionSource($urn: String!, $runStart: Int, $runCount: Int) { version executorId debugMode + extraArgs { + key + value + } } schedule { interval @@ -90,6 +98,10 @@ query getIngestionExecutionRequest($urn: String!) 
{ source { type } + arguments { + key + value + } } result { status diff --git a/datahub-web-react/src/graphql/scroll.graphql b/datahub-web-react/src/graphql/scroll.graphql index 18274c50c2166..1031fed7b9e13 100644 --- a/datahub-web-react/src/graphql/scroll.graphql +++ b/datahub-web-react/src/graphql/scroll.graphql @@ -408,6 +408,7 @@ fragment downloadScrollAcrossLineageResult on ScrollAcrossLineageResults { count total searchResults { + degree entity { ...downloadSearchResults } diff --git a/datahub-web-react/src/images/verticalogo.png b/datahub-web-react/src/images/verticalogo.png index a81047fd43edb..5da38f4e67c7d 100644 Binary files a/datahub-web-react/src/images/verticalogo.png and b/datahub-web-react/src/images/verticalogo.png differ diff --git a/docker/build.gradle b/docker/build.gradle index 0faea626e982d..56634a5fe0c67 100644 --- a/docker/build.gradle +++ b/docker/build.gradle @@ -35,7 +35,7 @@ task quickstart(type: Exec, dependsOn: ':metadata-ingestion:install') { environment "DATAHUB_TELEMETRY_ENABLED", "false" environment "DOCKER_COMPOSE_BASE", "file://${rootProject.projectDir}" - // environment "ACTIONS_VERSION", 'alpine3.17-slim' + // environment "ACTIONS_VERSION", 'alpine3.18-slim' // environment "DATAHUB_ACTIONS_IMAGE", 'nginx' // Elastic @@ -97,10 +97,20 @@ task quickstartDebug(type: Exec, dependsOn: ':metadata-ingestion:install') { dependsOn(debug_modules.collect { it + ':dockerTagDebug' }) shouldRunAfter ':metadata-ingestion:clean', 'quickstartNuke' - environment "DATAHUB_PRECREATE_TOPICS", "true" environment "DATAHUB_TELEMETRY_ENABLED", "false" environment "DOCKER_COMPOSE_BASE", "file://${rootProject.projectDir}" + // Elastic + // environment "DATAHUB_SEARCH_IMAGE", 'elasticsearch' + // environment "DATAHUB_SEARCH_TAG", '7.10.1' + + // OpenSearch + environment "DATAHUB_SEARCH_IMAGE", 'opensearchproject/opensearch' + environment "DATAHUB_SEARCH_TAG", '2.9.0' + environment "XPACK_SECURITY_ENABLED", 'plugins.security.disabled=true' + environment 
"USE_AWS_ELASTICSEARCH", 'true' + + def cmd = [ 'source ../metadata-ingestion/venv/bin/activate && ', 'datahub docker quickstart', diff --git a/docker/datahub-gms/Dockerfile b/docker/datahub-gms/Dockerfile index 2d74a288b8c99..f5428f7480403 100644 --- a/docker/datahub-gms/Dockerfile +++ b/docker/datahub-gms/Dockerfile @@ -1,7 +1,7 @@ # Defining environment ARG APP_ENV=prod -FROM golang:1-alpine3.17 AS binary +FROM golang:1-alpine3.18 AS binary ENV DOCKERIZE_VERSION v0.6.1 WORKDIR /go/src/github.com/jwilder diff --git a/docker/datahub-ingestion-base/Dockerfile b/docker/datahub-ingestion-base/Dockerfile index 564cc19cc9a5f..25afe9b8b3dce 100644 --- a/docker/datahub-ingestion-base/Dockerfile +++ b/docker/datahub-ingestion-base/Dockerfile @@ -1,7 +1,7 @@ ARG APP_ENV=full ARG BASE_IMAGE=base -FROM golang:1-alpine3.17 AS dockerize-binary +FROM golang:1-alpine3.18 AS dockerize-binary ENV DOCKERIZE_VERSION v0.6.1 WORKDIR /go/src/github.com/jwilder diff --git a/docker/datahub-ingestion-base/base-requirements.txt b/docker/datahub-ingestion-base/base-requirements.txt index 82d9a93a9a2c3..eb082d50b3020 100644 --- a/docker/datahub-ingestion-base/base-requirements.txt +++ b/docker/datahub-ingestion-base/base-requirements.txt @@ -2,62 +2,58 @@ # pyspark==3.0.3 # pydeequ==1.0.1 -acryl-datahub-classify==0.0.6 -acryl-iceberg-legacy==0.0.4 -acryl-PyHive==0.6.13 -aenum==3.1.12 -aiohttp==3.8.4 +acryl-datahub-classify==0.0.8 +acryl-PyHive==0.6.14 +acryl-sqlglot==18.5.2.dev45 +aenum==3.1.15 +aiohttp==3.8.6 aiosignal==1.3.1 -alembic==1.11.1 +alembic==1.12.0 altair==4.2.0 -anyio==3.7.0 -apache-airflow==2.6.1 -apache-airflow-providers-common-sql==1.5.1 -apache-airflow-providers-ftp==3.4.1 -apache-airflow-providers-http==4.4.1 -apache-airflow-providers-imap==3.2.1 -apache-airflow-providers-sqlite==3.4.1 -apispec==5.2.2 +anyio==3.7.1 +apache-airflow==2.7.2 +apache-airflow-providers-common-sql==1.7.2 +apache-airflow-providers-ftp==3.5.2 +apache-airflow-providers-http==4.5.2 
+apache-airflow-providers-imap==3.3.2 +apache-airflow-providers-sqlite==3.4.3 +apispec==6.3.0 appdirs==1.4.4 appnope==0.1.3 -argcomplete==3.0.8 -argon2-cffi==21.3.0 +argcomplete==3.1.2 +argon2-cffi==23.1.0 argon2-cffi-bindings==21.2.0 asgiref==3.7.2 asn1crypto==1.5.1 -asttokens==2.2.1 -async-timeout==4.0.2 +asttokens==2.4.0 +async-timeout==4.0.3 asynch==0.2.2 attrs==23.1.0 avro==1.10.2 -avro-gen3==0.7.10 -azure-core==1.26.4 -azure-identity==1.10.0 -azure-storage-blob==12.16.0 -azure-storage-file-datalake==12.11.0 -Babel==2.12.1 +avro-gen3==0.7.11 +Babel==2.13.0 backcall==0.2.0 backoff==2.2.1 beautifulsoup4==4.12.2 -bleach==6.0.0 -blinker==1.6.2 -blis==0.7.9 -boto3==1.26.142 -botocore==1.29.142 +bleach==6.1.0 +blinker==1.6.3 +blis==0.7.11 +boto3==1.28.62 +botocore==1.31.62 bowler==0.9.0 -bracex==2.3.post1 +bracex==2.4 cached-property==1.5.2 cachelib==0.9.0 cachetools==5.3.1 -catalogue==2.0.8 -cattrs==22.2.0 -certifi==2023.5.7 -cffi==1.15.1 -chardet==5.1.0 -charset-normalizer==2.1.1 +catalogue==2.0.10 +cattrs==23.1.2 +certifi==2023.7.22 +cffi==1.16.0 +chardet==5.2.0 +charset-normalizer==3.3.0 ciso8601==2.3.0 -click==8.1.3 -click-default-group==1.2.2 +click==8.1.7 +click-default-group==1.2.4 click-spinner==0.1.10 clickclick==20.10.2 clickhouse-cityhash==1.0.2.4 @@ -66,205 +62,217 @@ clickhouse-sqlalchemy==0.2.4 cloudpickle==2.2.1 colorama==0.4.6 colorlog==4.8.0 -confection==0.0.4 +comm==0.1.4 +confection==0.1.3 ConfigUpdater==3.1.1 confluent-kafka==1.8.2 connexion==2.14.2 cron-descriptor==1.4.0 -croniter==1.3.15 -cryptography==37.0.4 +croniter==2.0.1 +cryptography==41.0.4 cx-Oracle==8.3.0 -cymem==2.0.7 -dask==2023.5.1 -databricks-cli==0.17.7 +cymem==2.0.8 +dask==2023.9.3 +databricks-cli==0.18.0 databricks-dbapi==0.6.0 -databricks-sdk==0.1.8 -debugpy==1.6.7 +databricks-sdk==0.10.0 +debugpy==1.8.0 decorator==5.1.1 defusedxml==0.7.1 -deltalake==0.9.0 +deltalake==0.11.0 Deprecated==1.2.14 -dill==0.3.6 -dnspython==2.3.0 -docker==6.1.2 +dill==0.3.7 +dnspython==2.4.2 
+docker==6.1.3 docutils==0.20.1 ecdsa==0.18.0 elasticsearch==7.13.4 email-validator==1.3.1 entrypoints==0.4 et-xmlfile==1.1.0 -exceptiongroup==1.1.1 -executing==1.2.0 -expandvars==0.9.0 -fastapi==0.95.2 -fastavro==1.7.4 -fastjsonschema==2.17.1 -feast==0.29.0 -filelock==3.12.0 +exceptiongroup==1.1.3 +executing==2.0.0 +expandvars==0.11.0 +fastapi==0.103.2 +fastavro==1.8.4 +fastjsonschema==2.18.1 +feast==0.31.1 +filelock==3.12.4 fissix==21.11.13 Flask==2.2.5 flatdict==4.0.1 -frozenlist==1.3.3 -fsspec==2023.5.0 +frozenlist==1.4.0 +fsspec==2023.9.2 future==0.18.3 -GeoAlchemy2==0.13.3 +GeoAlchemy2==0.14.1 gitdb==4.0.10 -GitPython==3.1.31 -google-api-core==2.11.0 -google-auth==2.19.0 -google-cloud-appengine-logging==1.3.0 +GitPython==3.1.37 +google-api-core==2.12.0 +google-auth==2.23.3 +google-cloud-appengine-logging==1.3.2 google-cloud-audit-log==0.2.5 -google-cloud-bigquery==3.10.0 -google-cloud-bigquery-storage==2.19.1 -google-cloud-core==2.3.2 +google-cloud-bigquery==3.12.0 +google-cloud-core==2.3.3 google-cloud-datacatalog-lineage==0.2.2 google-cloud-logging==3.5.0 google-crc32c==1.5.0 -google-resumable-media==2.5.0 -googleapis-common-protos==1.59.0 +google-re2==1.1 +google-resumable-media==2.6.0 +googleapis-common-protos==1.60.0 gql==3.4.1 graphql-core==3.2.3 graphviz==0.20.1 great-expectations==0.15.50 -greenlet==2.0.2 +greenlet==3.0.0 grpc-google-iam-v1==0.12.6 -grpcio==1.54.2 -grpcio-reflection==1.54.2 -grpcio-status==1.54.2 -grpcio-tools==1.54.2 -gssapi==1.8.2 -gunicorn==20.1.0 +grpcio==1.59.0 +grpcio-reflection==1.59.0 +grpcio-status==1.59.0 +grpcio-tools==1.59.0 +gssapi==1.8.3 +gunicorn==21.2.0 h11==0.14.0 -hmsclient==0.1.1 -httpcore==0.17.2 -httptools==0.5.0 -httpx==0.24.1 +httpcore==0.18.0 +httptools==0.6.0 +httpx==0.25.0 humanfriendly==10.0 idna==3.4 -ijson==3.2.0.post0 -importlib-metadata==6.6.0 -importlib-resources==5.12.0 +ijson==3.2.3 +importlib-metadata==6.8.0 +importlib-resources==6.1.0 inflection==0.5.1 ipaddress==1.0.23 ipykernel==6.17.1 
-ipython==8.13.2 +ipython==8.16.1 ipython-genutils==0.2.0 -ipywidgets==8.0.6 +ipywidgets==8.1.1 iso3166==2.1.1 isodate==0.6.1 itsdangerous==2.1.2 -jedi==0.18.2 +jedi==0.19.1 Jinja2==3.1.2 jmespath==1.0.1 JPype1==1.4.1 -jsonlines==3.1.0 -jsonpatch==1.32 -jsonpointer==2.3 +jsonlines==4.0.0 +jsonpatch==1.33 +jsonpointer==2.4 jsonref==1.1.0 -jsonschema==4.17.3 +jsonschema==4.19.1 +jsonschema-specifications==2023.7.1 jupyter-server==1.24.0 jupyter_client==7.4.9 jupyter_core==4.12.0 jupyterlab-pygments==0.2.2 -jupyterlab-widgets==3.0.7 +jupyterlab-widgets==3.0.9 langcodes==3.3.0 lark==1.1.4 lazy-object-proxy==1.9.0 leb128==1.0.5 -limits==3.5.0 +limits==3.6.0 linear-tsv==1.1.0 linkify-it-py==2.0.2 lkml==1.3.1 locket==1.0.0 lockfile==0.12.2 looker-sdk==23.0.0 -lxml==4.9.2 +lxml==4.9.3 lz4==4.3.2 makefun==1.15.1 Mako==1.2.4 -Markdown==3.4.3 -markdown-it-py==2.2.0 -MarkupSafe==2.1.2 -marshmallow==3.19.0 -marshmallow-enum==1.5.1 +Markdown==3.5 +markdown-it-py==3.0.0 +MarkupSafe==2.1.3 +marshmallow==3.20.1 marshmallow-oneofschema==3.0.1 marshmallow-sqlalchemy==0.26.1 matplotlib-inline==0.1.6 -mdit-py-plugins==0.3.5 +mdit-py-plugins==0.4.0 mdurl==0.1.2 -mistune==2.0.5 +mistune==3.0.2 mixpanel==4.10.0 -mmh3==4.0.0 -more-itertools==9.1.0 +mlflow-skinny==2.7.1 +mmh3==4.0.1 +mmhash3==3.0.1 +more-itertools==10.1.0 moreorless==0.4.0 -moto==4.1.10 -msal==1.16.0 -msal-extensions==1.0.0 +moto==4.2.5 +msal==1.22.0 multidict==6.0.4 -murmurhash==1.0.9 -mypy==1.3.0 +murmurhash==1.0.10 +mypy==1.6.0 mypy-extensions==1.0.0 nbclassic==1.0.0 nbclient==0.6.3 -nbconvert==7.4.0 -nbformat==5.8.0 -nest-asyncio==1.5.6 +nbconvert==7.9.2 +nbformat==5.9.1 +nest-asyncio==1.5.8 networkx==3.1 -notebook==6.5.4 +notebook==6.5.6 notebook_shim==0.2.3 -numpy==1.24.3 +numpy==1.26.0 oauthlib==3.2.2 okta==1.7.0 +openlineage-airflow==1.2.0 +openlineage-integration-common==1.2.0 +openlineage-python==1.2.0 +openlineage_sql==1.2.0 openpyxl==3.1.2 +opentelemetry-api==1.20.0 +opentelemetry-exporter-otlp==1.20.0 
+opentelemetry-exporter-otlp-proto-common==1.20.0 +opentelemetry-exporter-otlp-proto-grpc==1.20.0 +opentelemetry-exporter-otlp-proto-http==1.20.0 +opentelemetry-proto==1.20.0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 ordered-set==4.1.0 oscrypto==1.3.0 -packaging==23.1 +packaging==23.2 pandas==1.5.3 pandavro==1.5.2 pandocfilters==1.5.0 -parse==1.19.0 +parse==1.19.1 parso==0.8.3 -partd==1.4.0 -pathspec==0.9.0 -pathy==0.10.1 +partd==1.4.1 +pathspec==0.11.2 +pathy==0.10.2 pendulum==2.1.2 pexpect==4.8.0 phonenumbers==8.13.0 pickleshare==0.7.5 -platformdirs==3.5.1 -pluggy==1.0.0 -portalocker==2.7.0 -preshed==3.0.8 +platformdirs==3.11.0 +pluggy==1.3.0 +preshed==3.0.9 prison==0.2.1 progressbar2==4.2.0 -prometheus-client==0.17.0 -prompt-toolkit==3.0.38 -proto-plus==1.22.2 -protobuf==4.23.2 +prometheus-client==0.17.1 +prompt-toolkit==3.0.39 +proto-plus==1.22.3 +protobuf==4.24.4 psutil==5.9.5 -psycopg2-binary==2.9.6 +psycopg2-binary==2.9.9 ptyprocess==0.7.0 pure-eval==0.2.2 pure-sasl==0.6.2 -py-partiql-parser==0.3.0 -pyarrow==8.0.0 +py-partiql-parser==0.3.7 +pyarrow==11.0.0 pyasn1==0.5.0 pyasn1-modules==0.3.0 pyathena==2.4.1 pycountry==22.3.5 pycparser==2.21 -pycryptodome==3.18.0 -pycryptodomex==3.18.0 -pydantic==1.10.8 -pydash==7.0.3 +pycryptodome==3.19.0 +pycryptodomex==3.19.0 +pydantic==1.10.13 +pydash==7.0.6 pydruid==0.6.5 -Pygments==2.15.1 -pymongo==4.3.3 -PyMySQL==1.0.3 -pyOpenSSL==22.0.0 +Pygments==2.16.1 +pyiceberg==0.4.0 +pymongo==4.5.0 +PyMySQL==1.1.0 +pyOpenSSL==23.2.0 pyparsing==3.0.9 -pyrsistent==0.19.3 -pyspnego==0.9.0 +pyspnego==0.10.2 python-daemon==3.0.1 python-dateutil==2.8.2 python-dotenv==1.0.0 @@ -272,111 +280,115 @@ python-jose==3.3.0 python-ldap==3.4.3 python-nvd3==0.15.0 python-slugify==8.0.1 -python-stdnum==1.18 -python-tds==1.12.0 -python-utils==3.6.0 +python-stdnum==1.19 +python-tds==1.13.0 +python-utils==3.8.1 python3-openid==3.2.0 -pytz==2023.3 +pytz==2023.3.post1 pytzdata==2020.1 -PyYAML==6.0 -pyzmq==25.1.0 
+PyYAML==6.0.1 +pyzmq==24.0.1 ratelimiter==1.2.0.post0 redash-toolbelt==0.1.9 -redshift-connector==2.0.910 -regex==2023.5.5 -requests==2.28.2 +redshift-connector==2.0.914 +referencing==0.30.2 +regex==2023.10.3 +requests==2.31.0 requests-file==1.5.1 requests-gssapi==1.2.3 requests-ntlm==1.2.0 requests-toolbelt==0.10.1 -responses==0.23.1 -retrying==1.3.4 +responses==0.23.3 rfc3339-validator==0.1.4 rfc3986==2.0.0 -rich==13.3.5 -rich_argparse==1.1.0 +rich==13.6.0 +rich-argparse==1.3.0 +rpds-py==0.10.6 rsa==4.9 ruamel.yaml==0.17.17 -s3transfer==0.6.1 -sasl3==0.2.11 -schwifty==2023.3.0 -scipy==1.10.1 +ruamel.yaml.clib==0.2.8 +s3transfer==0.7.0 +schwifty==2023.9.0 +scipy==1.11.3 scramp==1.4.4 Send2Trash==1.8.2 -setproctitle==1.3.2 -simple-salesforce==1.12.4 +sentry-sdk==1.32.0 +setproctitle==1.3.3 +simple-salesforce==1.12.5 six==1.16.0 -smart-open==6.3.0 -smmap==5.0.0 +smart-open==6.4.0 +smmap==5.0.1 sniffio==1.3.0 -snowflake-connector-python==2.9.0 -snowflake-sqlalchemy==1.4.7 -soupsieve==2.4.1 +snowflake-connector-python==3.2.1 +snowflake-sqlalchemy==1.5.0 +sortedcontainers==2.4.0 +soupsieve==2.5 spacy==3.4.3 spacy-legacy==3.0.12 -spacy-loggers==1.0.4 +spacy-loggers==1.0.5 sql-metadata==2.2.2 -SQLAlchemy==1.4.41 -sqlalchemy-bigquery==1.6.1 +SQLAlchemy==1.4.44 +sqlalchemy-bigquery==1.8.0 SQLAlchemy-JSONField==1.0.1.post0 sqlalchemy-pytds==0.3.5 sqlalchemy-redshift==0.8.14 SQLAlchemy-Utils==0.41.1 -sqlalchemy2-stubs==0.0.2a34 -sqllineage==1.3.6 -sqlparse==0.4.3 -srsly==2.4.6 -stack-data==0.6.2 +sqlalchemy2-stubs==0.0.2a35 +sqllineage==1.3.8 +sqlparse==0.4.4 +srsly==2.4.8 +stack-data==0.6.3 starlette==0.27.0 +strictyaml==1.7.3 tableauserverclient==0.25 tableschema==1.20.2 tabulate==0.9.0 tabulator==1.53.5 -tenacity==8.2.2 +tenacity==8.2.3 termcolor==2.3.0 terminado==0.17.1 text-unidecode==1.3 -thinc==8.1.10 -thrift==0.16.0 +thinc==8.1.12 +thrift==0.13.0 thrift-sasl==0.4.3 tinycss2==1.2.1 toml==0.10.2 tomli==2.0.1 +tomlkit==0.12.1 toolz==0.12.0 -tornado==6.3.2 -tqdm==4.65.0 
+tornado==6.3.3 +tqdm==4.66.1 traitlets==5.2.1.post0 -trino==0.324.0 +trino==0.327.0 typeguard==2.13.3 typer==0.7.0 -types-PyYAML==6.0.12.10 +types-PyYAML==6.0.12.12 typing-inspect==0.9.0 -typing_extensions==4.5.0 -tzlocal==5.0.1 +typing_extensions==4.8.0 +tzlocal==5.1 uc-micro-py==1.0.2 -ujson==5.7.0 +ujson==5.8.0 unicodecsv==0.14.1 -urllib3==1.26.16 -uvicorn==0.22.0 +urllib3==1.26.17 +uvicorn==0.23.2 uvloop==0.17.0 -vertica-python==1.3.2 -vertica-sqlalchemy-dialect==0.0.1 +vertica-python==1.3.5 +vertica-sqlalchemy-dialect==0.0.8 vininfo==1.7.0 volatile==2.1.0 wasabi==0.10.1 -watchfiles==0.19.0 -wcmatch==8.4.1 -wcwidth==0.2.6 +watchfiles==0.20.0 +wcmatch==8.5 +wcwidth==0.2.8 webencodings==0.5.1 -websocket-client==1.5.2 +websocket-client==1.6.4 websockets==11.0.3 Werkzeug==2.2.3 -widgetsnbextension==4.0.7 +widgetsnbextension==4.0.9 wrapt==1.15.0 -WTForms==3.0.1 +WTForms==3.1.0 xlrd==2.0.1 xmltodict==0.13.0 yarl==1.9.2 zeep==4.2.1 -zipp==3.15.0 -zstd==1.5.5.1 +zstd==1.5.5.1 \ No newline at end of file diff --git a/docker/datahub-mae-consumer/Dockerfile b/docker/datahub-mae-consumer/Dockerfile index 734f8ba452f3e..4b321b1639c1b 100644 --- a/docker/datahub-mae-consumer/Dockerfile +++ b/docker/datahub-mae-consumer/Dockerfile @@ -1,7 +1,7 @@ # Defining environment ARG APP_ENV=prod -FROM golang:1-alpine3.17 AS binary +FROM golang:1-alpine3.18 AS binary ENV DOCKERIZE_VERSION v0.6.1 WORKDIR /go/src/github.com/jwilder diff --git a/docker/datahub-mce-consumer/Dockerfile b/docker/datahub-mce-consumer/Dockerfile index ee5d927fb1ddb..4d38ee6daa235 100644 --- a/docker/datahub-mce-consumer/Dockerfile +++ b/docker/datahub-mce-consumer/Dockerfile @@ -1,7 +1,7 @@ # Defining environment ARG APP_ENV=prod -FROM golang:1-alpine3.17 AS binary +FROM golang:1-alpine3.18 AS binary ENV DOCKERIZE_VERSION v0.6.1 WORKDIR /go/src/github.com/jwilder diff --git a/docker/datahub-upgrade/Dockerfile b/docker/datahub-upgrade/Dockerfile index 4e1521cc0561e..945be54678a24 100644 --- 
a/docker/datahub-upgrade/Dockerfile +++ b/docker/datahub-upgrade/Dockerfile @@ -1,7 +1,7 @@ # Defining environment ARG APP_ENV=prod -FROM golang:1-alpine3.17 AS binary +FROM golang:1-alpine3.18 AS binary ENV DOCKERIZE_VERSION v0.6.1 WORKDIR /go/src/github.com/jwilder diff --git a/docker/elasticsearch-setup/Dockerfile b/docker/elasticsearch-setup/Dockerfile index af3c8c9df762a..c8fb2eba911b8 100644 --- a/docker/elasticsearch-setup/Dockerfile +++ b/docker/elasticsearch-setup/Dockerfile @@ -3,7 +3,7 @@ # Defining environment ARG APP_ENV=prod -FROM golang:1-alpine3.17 AS binary +FROM golang:1-alpine3.18 AS binary ENV DOCKERIZE_VERSION v0.6.1 WORKDIR /go/src/github.com/jwilder diff --git a/docker/mysql-setup/Dockerfile b/docker/mysql-setup/Dockerfile index 732b860a58f07..56bab61180489 100644 --- a/docker/mysql-setup/Dockerfile +++ b/docker/mysql-setup/Dockerfile @@ -1,4 +1,4 @@ -FROM golang:1-alpine3.17 AS binary +FROM golang:1-alpine3.18 AS binary ENV DOCKERIZE_VERSION v0.6.1 WORKDIR /go/src/github.com/jwilder diff --git a/docker/postgres-setup/Dockerfile b/docker/postgres-setup/Dockerfile index 313615ac3465b..7f4d53ae044d4 100644 --- a/docker/postgres-setup/Dockerfile +++ b/docker/postgres-setup/Dockerfile @@ -1,4 +1,4 @@ -FROM golang:1-alpine3.17 AS binary +FROM golang:1-alpine3.18 AS binary ENV DOCKERIZE_VERSION v0.6.1 WORKDIR /go/src/github.com/jwilder diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index 21b3a1d3fe4d3..b2b3df4dfb33c 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -158,6 +158,7 @@ module.exports = { // The purpose of this section is to provide the minimum steps required to deploy DataHub to the vendor of your choosing "docs/deploy/aws", "docs/deploy/gcp", + "docs/deploy/azure", "docker/README", "docs/deploy/kubernetes", "docs/deploy/environment-vars", @@ -608,6 +609,7 @@ module.exports = { }, { "Managed DataHub Release History": [ + "docs/managed-datahub/release-notes/v_0_2_12", 
"docs/managed-datahub/release-notes/v_0_2_11", "docs/managed-datahub/release-notes/v_0_2_10", "docs/managed-datahub/release-notes/v_0_2_9", diff --git a/docs/authentication/README.md b/docs/authentication/README.md index f6eda88784486..ff4a3d83cfde3 100644 --- a/docs/authentication/README.md +++ b/docs/authentication/README.md @@ -31,8 +31,9 @@ When a user makes a request for Data within DataHub, the request is authenticate and programmatic calls to DataHub APIs. There are two types of tokens that are important: 1. **Session Tokens**: Generated for users of the DataHub web application. By default, having a duration of 24 hours. -These tokens are encoded and stored inside browser-side session cookies. The duration a session token is valid for is configurable via the `AUTH_SESSION_TTL_HOURS` environment variable -on the datahub-frontend deployment. +These tokens are encoded and stored inside browser-side session cookies. The duration a session token is valid for is configurable via the `MAX_SESSION_TOKEN_AGE` environment variable +on the datahub-frontend deployment. Additionally, the `AUTH_SESSION_TTL_HOURS` environment variable configures the expiration time of the actor cookie in the user's browser; when that cookie expires, the user is prompted to log in again. The difference between the two is that the actor cookie expiration only affects the browser session, so the session token can still be used programmatically, +whereas once the session token itself expires it can no longer be used programmatically either, because it is created as a JWT with an expiration claim. 2. **Personal Access Tokens**: These are tokens generated via the DataHub settings panel useful for interacting with DataHub APIs. They can be used to automate processes like enriching documentation, ownership, tags, and more on DataHub. Learn more about Personal Access Tokens [here](personal-access-tokens.md).
diff --git a/docs/authentication/guides/sso/configure-oidc-react.md b/docs/authentication/guides/sso/configure-oidc-react.md index 512d6adbf916f..1671673c09318 100644 --- a/docs/authentication/guides/sso/configure-oidc-react.md +++ b/docs/authentication/guides/sso/configure-oidc-react.md @@ -72,7 +72,8 @@ AUTH_OIDC_BASE_URL=your-datahub-url - `AUTH_OIDC_CLIENT_SECRET`: Unique client secret received from identity provider - `AUTH_OIDC_DISCOVERY_URI`: Location of the identity provider OIDC discovery API. Suffixed with `.well-known/openid-configuration` - `AUTH_OIDC_BASE_URL`: The base URL of your DataHub deployment, e.g. https://yourorgdatahub.com (prod) or http://localhost:9002 (testing) -- `AUTH_SESSION_TTL_HOURS`: The length of time in hours before a user will be prompted to login again. Session tokens are stateless so this determines at what time a session token may no longer be used and a valid session token can be used until this time has passed. +- `AUTH_SESSION_TTL_HOURS`: The length of time in hours before a user will be prompted to login again. Controls the actor cookie expiration time in the browser. Numeric value converted to hours, default 24. +- `MAX_SESSION_TOKEN_AGE`: Determines the expiration time of a session token. Session tokens are stateless so this determines at what time a session token may no longer be used and a valid session token can be used until this time has passed. Accepts a valid relative Java date style String, default 24h. 
Providing these configs will cause DataHub to delegate authentication to your identity provider, requesting the "oidc email profile" scopes and parsing the "preferred_username" claim from diff --git a/docs/authorization/policies.md b/docs/authorization/policies.md index e3606f2a3e48d..63aa6688d3eec 100644 --- a/docs/authorization/policies.md +++ b/docs/authorization/policies.md @@ -137,7 +137,7 @@ We currently support the following: #### Resources Resource filter defines the set of resources that the policy applies to is defined using a list of criteria. Each -criterion defines a field type (like resource_type, resource_urn, domain), a list of field values to compare, and a +criterion defines a field type (like type, urn, domain), a list of field values to compare, and a condition (like EQUALS). It essentially checks whether the field of a certain resource matches any of the input values. Note, that if there are no criteria or resource is not set, policy is applied to ALL resources. @@ -149,7 +149,7 @@ For example, the following resource filter will apply the policy to datasets, ch "filter": { "criteria": [ { - "field": "RESOURCE_TYPE", + "field": "TYPE", "condition": "EQUALS", "values": [ "dataset", @@ -175,8 +175,8 @@ Supported fields are as follows | Field Type | Description | Example | |---------------|------------------------|-------------------------| -| resource_type | Type of the resource | dataset, chart, dataJob | -| resource_urn | Urn of the resource | urn:li:dataset:... | +| type | Type of the resource | dataset, chart, dataJob | +| urn | Urn of the resource | urn:li:dataset:... 
| | domain | Domain of the resource | urn:li:domain:domainX | ## Managing Policies diff --git a/docs/datahub_lite.md b/docs/datahub_lite.md index de0a20eed1d01..55491e3b998cf 100644 --- a/docs/datahub_lite.md +++ b/docs/datahub_lite.md @@ -85,9 +85,10 @@ source: sink: type: datahub-lite - forward_to: - type: datahub-rest - config: + config: + forward_to: + type: datahub-rest + config: server: "http://datahub-gms:8080" ``` diff --git a/docs/deploy/azure.md b/docs/deploy/azure.md new file mode 100644 index 0000000000000..b940b82827e94 --- /dev/null +++ b/docs/deploy/azure.md @@ -0,0 +1,234 @@ +--- +title: "Deploying to Azure" +--- + +# Azure setup guide + +The following is a set of instructions to quickstart DataHub on Azure Kubernetes Service (AKS). Note, the guide +assumes that you do not have a Kubernetes cluster set up. + +## Prerequisites + +This guide requires the following tools: + +- [kubectl](https://kubernetes.io/docs/tasks/tools/) to manage Kubernetes resources +- [helm](https://helm.sh/docs/intro/install/) to deploy the resources based on helm charts. Note, we only support Helm + 3. +- [AZ CLI](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli) to manage Azure resources + +To use the above tools, you need to set up Azure credentials by following +this [guide](https://learn.microsoft.com/en-us/cli/azure/authenticate-azure-cli). + +## Start up a Kubernetes cluster on AKS + +You can follow this [guide](https://learn.microsoft.com/en-us/azure/aks/learn/quick-kubernetes-deploy-cli) to create a new +cluster using az cli. + +Note: you can skip the application deployment step since we are deploying DataHub instead. If you are deploying DataHub to an existing cluster, please +skip the corresponding sections. + +- Verify you have the Microsoft.OperationsManagement and Microsoft.OperationalInsights providers registered on your subscription. These Azure resource providers are required to support Container insights. 
Check the registration status using the following commands: + +``` +az provider show -n Microsoft.OperationsManagement -o table +az provider show -n Microsoft.OperationalInsights -o table +``` + +If they're not registered, register them using the following commands: + +``` +az provider register --namespace Microsoft.OperationsManagement +az provider register --namespace Microsoft.OperationalInsights +``` + +- Create a resource group. Change name, location to your choosing. + +``` +az group create --name myResourceGroup --location eastus +``` + +The following output indicates that the command execution was successful: + +``` +{ + "id": "/subscriptions//resourceGroups/myResourceGroup", + "location": "eastus", + "managedBy": null, + "name": "myResourceGroup", + "properties": { + "provisioningState": "Succeeded" + }, + "tags": null +} +``` +- Create an AKS Cluster. For this project, it is best to increase node count to at least 3. Change cluster name, node count, and addons to your choosing. + +``` +az aks create -g myResourceGroup -n myAKSCluster --enable-managed-identity --node-count 3 --enable-addons monitoring --generate-ssh-keys +``` + +After a few minutes, the command completes and returns JSON-formatted information about the cluster. + +- Connect to the cluster + +Configure kubectl to connect to your Kubernetes cluster using the az aks get-credentials command. + +``` +az aks get-credentials --resource-group myResourceGroup --name myAKSCluster +``` + +Verify the connection to your cluster using the `kubectl get` command. This command returns a list of the cluster nodes. + +``` +kubectl get nodes +``` + +You should get results like below. Make sure node status is Ready. 
+ +``` +NAME STATUS ROLES AGE VERSION +aks-nodepool1-37660971-vmss000000 Ready agent 24h v1.25.6 +aks-nodepool1-37660971-vmss000001 Ready agent 24h v1.25.6 +aks-nodepool1-37660971-vmss000002 Ready agent 24h v1.25.6 +``` + +## Setup DataHub using Helm + +Once the Kubernetes cluster has been set up, you can deploy DataHub and its prerequisites using helm. Please follow the +steps in this [guide](kubernetes.md). + + +Notes: +Since we are using PostgreSQL as the storage layer, change postgresql enabled to true and mysql to false in the values.yaml file of prerequisites. +Additionally, create a postgresql secret. Make sure to include 3 passwords for the postgresql secret: postgres-password, replication-password, and password. + +## Expose endpoints using a load balancer + +Now that all the pods are up and running, you need to expose the datahub-frontend end point by setting +up [ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/). To do this, you need to first set up an +ingress controller. + + +There are many [ingress controllers](https://kubernetes.io/docs/concepts/services-networking/ingress-controllers/) to choose +from, but here, we will follow this [guide](https://learn.microsoft.com/en-us/azure/application-gateway/tutorial-ingress-controller-add-on-existing) to set up the Azure +Application Gateway Ingress Controller. + +- Deploy a New Application Gateway. + +First, you need to create a WAF policy + +``` +az network application-gateway waf-policy create -g myResourceGroup -n myWAFPolicy +``` + +- Before the application gateway can be deployed, you'll also need to create a public IP resource, a new virtual network with address space 10.0.0.0/16, and a subnet with address space 10.0.0.0/24. +Then, you can deploy your application gateway in the subnet using the publicIP. + +Caution: When you use an AKS cluster and application gateway in separate virtual networks, the address spaces of the two virtual networks must not overlap. 
The default address space that an AKS cluster deploys in is 10.224.0.0/12. + + +``` +az network public-ip create -n myPublicIp -g myResourceGroup --allocation-method Static --sku Standard +az network vnet create -n myVnet -g myResourceGroup --address-prefix 10.0.0.0/16 --subnet-name mySubnet --subnet-prefix 10.0.0.0/24 +az network application-gateway create -n myApplicationGateway -l eastus -g myResourceGroup --sku WAF_v2 --public-ip-address myPublicIp --vnet-name myVnet --subnet mySubnet --priority 100 --waf-policy /subscriptions/{subscription_id}/resourceGroups/myResourceGroup/providers/Microsoft.Network/ApplicationGatewayWebApplicationFirewallPolicies/myWAFPolicy +``` +Change myPublicIp, myResourceGroup, myVnet, mySubnet, and myApplicationGateway to names of your choosing. + + +- Enable the AGIC Add-On in Existing AKS Cluster Through Azure CLI + +``` +appgwId=$(az network application-gateway show -n myApplicationGateway -g myResourceGroup -o tsv --query "id") +az aks enable-addons -n myAKSCluster -g myResourceGroup -a ingress-appgw --appgw-id $appgwId +``` + +- Peer the Two Virtual Networks Together + +Since you deployed the AKS cluster in its own virtual network and the Application gateway in another virtual network, you'll need to peer the two virtual networks together in order for traffic to flow from the Application gateway to the pods in the cluster. 
+ +``` +nodeResourceGroup=$(az aks show -n myAKSCluster -g myResourceGroup -o tsv --query "nodeResourceGroup") +aksVnetName=$(az network vnet list -g $nodeResourceGroup -o tsv --query "[0].name") + +aksVnetId=$(az network vnet show -n $aksVnetName -g $nodeResourceGroup -o tsv --query "id") +az network vnet peering create -n AppGWtoAKSVnetPeering -g myResourceGroup --vnet-name myVnet --remote-vnet $aksVnetId --allow-vnet-access + +appGWVnetId=$(az network vnet show -n myVnet -g myResourceGroup -o tsv --query "id") +az network vnet peering create -n AKStoAppGWVnetPeering -g $nodeResourceGroup --vnet-name $aksVnetName --remote-vnet $appGWVnetId --allow-vnet-access +``` + +- Deploy the Ingress on the Frontend Pod + +In order to use the ingress controller to expose the frontend pod, we need to update the datahub-frontend section of the values.yaml file that was used to deploy DataHub. Here is a sample configuration: + +``` +datahub-frontend: + enabled: true + image: + repository: linkedin/datahub-frontend-react + # tag: "v0.10.0" # defaults to .global.datahub.version + + # Set up ingress to expose react front-end + ingress: + enabled: true + annotations: + kubernetes.io/ingress.class: azure/application-gateway + appgw.ingress.kubernetes.io/backend-protocol: "http" + + hosts: + - paths: + - /* + defaultUserCredentials: {} +``` + +You can then apply the updates: + +``` +helm upgrade --install datahub datahub/datahub --values values.yaml +``` + +You can now verify that the ingress was created correctly + +``` +kubectl get ingress +``` + +You should see a result like this: + +![frontend-image](https://github.com/Saketh-Mahesh/azure-docs-images/blob/main/frontend-status.png?raw=true) + +## Use PostgreSQL for the storage layer +Configure a PostgreSQL database in the same virtual network as the Kubernetes cluster or implement virtual network peering to connect both networks. 
Once the database is provisioned, you should be able to see the following page under the Connect tab on the left side. + + +Note: PostgreSQL Database MUST be deployed in the same location as AKS/resource group (eastus, centralus, etc.) +Take a note of the connection details: + +![postgres-info](https://github.com/Saketh-Mahesh/azure-docs-images/blob/main/postgres-info.png?raw=true) + + + + + +- Update the postgresql settings under global in the values.yaml as follows. + +``` +global: + sql: + datasource: + host: "${POSTGRES_HOST}.postgres.database.azure.com:5432" + hostForpostgresqlClient: "${POSTGRES_HOST}.postgres.database.azure.com" + port: "5432" + url: "jdbc:postgresql://${POSTGRES_HOST}.postgres.database.azure.com:5432/datahub?user=${POSTGRES_ADMIN_LOGIN}&password=${POSTGRES_ADMIN_PASSWORD}&sslmode=require" + driver: "org.postgresql.Driver" + username: "${POSTGRES_ADMIN_LOGIN}" + password: + value: "${POSTGRES_ADMIN_PASSWORD}" +``` +Run this helm command to update the DataHub configuration: + +``` +helm upgrade --install datahub datahub/datahub --values values.yaml +``` + +And there you go! You have now installed DataHub on an Azure Kubernetes Cluster with an ingress controller set up to expose the frontend. Additionally, you have utilized PostgreSQL as the storage layer of DataHub. \ No newline at end of file diff --git a/docs/deploy/environment-vars.md b/docs/deploy/environment-vars.md index 0689db9b17331..779c3d3d7c432 100644 --- a/docs/deploy/environment-vars.md +++ b/docs/deploy/environment-vars.md @@ -79,9 +79,10 @@ Simply replace the dot, `.`, with an underscore, `_`, and convert to uppercase. 
## Frontend -| Variable | Default | Unit/Type | Components | Description | -|------------------------------------|----------|-----------|--------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `AUTH_VERBOSE_LOGGING` | `false` | boolean | [`Frontend`] | Enable verbose authentication logging. Enabling this will leak sensisitve information in the logs. Disable when finished debugging. | -| `AUTH_OIDC_GROUPS_CLAIM` | `groups` | string | [`Frontend`] | Claim to use as the user's group. | -| `AUTH_OIDC_EXTRACT_GROUPS_ENABLED` | `false` | boolean | [`Frontend`] | Auto-provision the group from the user's group claim. | -| `AUTH_SESSION_TTL_HOURS` | `24` | string | [`Frontend`] | The number of hours a user session is valid. [User session tokens are stateless and will become invalid after this time](https://www.playframework.com/documentation/2.8.x/SettingsSession#Session-Timeout-/-Expiration) requiring a user to login again. | \ No newline at end of file +| Variable | Default | Unit/Type | Components | Description | +|------------------------------------|----------|-----------|---------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `AUTH_VERBOSE_LOGGING` | `false` | boolean | [`Frontend`] | Enable verbose authentication logging. Enabling this will leak sensisitve information in the logs. Disable when finished debugging. | +| `AUTH_OIDC_GROUPS_CLAIM` | `groups` | string | [`Frontend`] | Claim to use as the user's group. | +| `AUTH_OIDC_EXTRACT_GROUPS_ENABLED` | `false` | boolean | [`Frontend`] | Auto-provision the group from the user's group claim. 
| +| `AUTH_SESSION_TTL_HOURS` | `24` | string | [`Frontend`] | The number of hours a user session is valid. After this many hours the actor cookie will be expired by the browser and the user will be prompted to login again. | +| `MAX_SESSION_TOKEN_AGE` | `24h` | string | [`Frontend`] | The maximum age of the session token. [User session tokens are stateless and will become invalid after this time](https://www.playframework.com/documentation/2.8.x/SettingsSession#Session-Timeout-/-Expiration) requiring a user to login again. | \ No newline at end of file diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index 9cd4ad5c6f02d..3af3b2bdda215 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -22,6 +22,8 @@ Otherwise, we recommend soft deleting all databricks data via the DataHub CLI: ### Deprecations ### Other Notable Changes +- Session token configuration has changed, all previously created session tokens will be invalid and users will be prompted to log in. Expiration time has also been shortened which may result in more login prompts with the default settings. + There should be no other interruption due to this change. ## 0.11.0 diff --git a/docs/managed-datahub/release-notes/v_0_2_11.md b/docs/managed-datahub/release-notes/v_0_2_11.md index 1f42090848712..c99d10201e097 100644 --- a/docs/managed-datahub/release-notes/v_0_2_11.md +++ b/docs/managed-datahub/release-notes/v_0_2_11.md @@ -7,7 +7,7 @@ Release Availability Date Recommended CLI/SDK --- -- `v0.11.0` with release notes at https://github.com/acryldata/datahub/releases/tag/v0.10.5.5 +- `v0.11.0` with release notes at https://github.com/acryldata/datahub/releases/tag/v0.11.0 - [Deprecation] In LDAP ingestor, the manager_pagination_enabled changed to general pagination_enabled If you are using an older CLI/SDK version then please upgrade it. 
This applies for all CLI/SDK usages, if you are using it through your terminal, github actions, airflow, in python SDK somewhere, Java SKD etc. This is a strong recommendation to upgrade as we keep on pushing fixes in the CLI and it helps us support you better. diff --git a/docs/managed-datahub/release-notes/v_0_2_12.md b/docs/managed-datahub/release-notes/v_0_2_12.md new file mode 100644 index 0000000000000..b13f471d9bf63 --- /dev/null +++ b/docs/managed-datahub/release-notes/v_0_2_12.md @@ -0,0 +1,30 @@ +# v0.2.12 +--- + +Release Availability Date +--- +13-Oct-2023 + +Recommended CLI/SDK +--- +- `v0.11.0.4` with release notes at https://github.com/acryldata/datahub/releases/tag/v0.11.0.4 +- [breaking] Removed support for SQLAlchemy 1.3.x. Only SQLAlchemy 1.4.x is supported now. +- [breaking] Removed `urn:li:corpuser:datahub` owner for the `Measure`, `Dimension` and `Temporal` tags emitted by Looker and LookML source connectors. +- [breaking] The Airflow plugin no longer supports Airflow 2.0.x or Python 3.7. +- [breaking] Introduced the Airflow plugin v2. If you're using Airflow 2.3+, the v2 plugin will be enabled by default, and so you'll need to switch your requirements to include `pip install 'acryl-datahub-airflow-plugin[plugin-v2]'`. To continue using the v1 plugin, set the `DATAHUB_AIRFLOW_PLUGIN_USE_V1_PLUGIN` environment variable to `true`. +- [breaking] The Unity Catalog ingestion source has a new option `include_metastore`, which will cause all urns to be changed when disabled. +This is currently enabled by default to preserve compatibility, but will be disabled by default and then removed in the future. +If stateful ingestion is enabled, simply setting `include_metastore: false` will perform all required cleanup. +Otherwise, we recommend soft deleting all databricks data via the DataHub CLI: +`datahub delete --platform databricks --soft` and then reingesting with `include_metastore: false`. 
+ + +If you are using an older CLI/SDK version then please upgrade it. This applies for all CLI/SDK usages, if you are using it through your terminal, github actions, airflow, in python SDK somewhere, Java SDK, etc. This is a strong recommendation to upgrade as we keep on pushing fixes in the CLI and it helps us support you better. + + +## Release Changelog +--- +- Since `v0.2.11` these changes from OSS DataHub https://github.com/datahub-project/datahub/compare/75252a3d9f6a576904be5a0790d644b9ae2df6ac...10a190470e8c932b6d34cba49de7dbcba687a088 have been pulled in. + +## Some notable features in this SaaS release +- Nested Domains available in this release diff --git a/docs/ui-ingestion.md b/docs/ui-ingestion.md index db2007e1e19a9..438ddd8823b7e 100644 --- a/docs/ui-ingestion.md +++ b/docs/ui-ingestion.md @@ -1,5 +1,12 @@ +import FeatureAvailability from '@site/src/components/FeatureAvailability'; + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + # Ingestion + + ## Introduction Starting in version `0.8.25`, DataHub supports creating, configuring, scheduling, & executing batch metadata ingestion using the DataHub user interface. This makes @@ -173,28 +180,29 @@ Finally, give your Ingestion Source a name. Once you're happy with your configurations, click 'Done' to save your changes. -##### Advanced: Running with a specific CLI version +##### Advanced ingestion configs: -DataHub comes pre-configured to use the latest version of the DataHub CLI ([acryl-datahub](https://pypi.org/project/acryl-datahub/)) that is compatible +DataHub's Managed Ingestion UI comes pre-configured to use the latest version of the DataHub CLI ([acryl-datahub](https://pypi.org/project/acryl-datahub/)) that is compatible with the server. However, you can override the default package version using the 'Advanced' source configurations. To do so, simply click 'Advanced', then change the 'CLI Version' text box to contain the exact version of the DataHub CLI you'd like to use. 
-

_Pinning the CLI version to version `0.8.23.2`_ +Other advanced options include specifying **environment variables**, **DataHub plugins** or **python packages at runtime**. + Once you're happy with your changes, simply click 'Done' to save. You can upload and even update recipes using the cli as mentioned in the [cli documentation for uploading ingestion recipes](./cli.md#ingest-deploy). -An example execution would look something like: +An example execution for a given `recipe.yaml` file, would look something like: ```bash datahub ingest deploy --name "My Test Ingestion Source" --schedule "5 * * * *" --time-zone "UTC" -c recipe.yaml @@ -330,8 +338,8 @@ for the `datahub-actions` container and running `docker logs `. There are valid cases for ingesting metadata without the UI-based ingestion scheduler. For example, - You have written a custom ingestion Source -- Your data sources are not reachable on the network where DataHub is deployed -- Your ingestion source requires context from a local filesystem (e.g. input files, environment variables, etc) +- Your data sources are not reachable on the network where DataHub is deployed. Managed DataHub users can use a [remote executor](managed-datahub/operator-guide/setting-up-remote-ingestion-executor-on-aws.md) for remote UI-based ingestion. +- Your ingestion source requires context from a local filesystem (e.g. input files) - You want to distribute metadata ingestion among multiple producers / environments ### How do I attach policies to the actions pod to give it permissions to pull metadata from various sources? 
diff --git a/metadata-dao-impl/kafka-producer/build.gradle b/metadata-dao-impl/kafka-producer/build.gradle index 393b10b0e9d24..bc3415b2ccc8c 100644 --- a/metadata-dao-impl/kafka-producer/build.gradle +++ b/metadata-dao-impl/kafka-producer/build.gradle @@ -1,9 +1,9 @@ apply plugin: 'java' dependencies { - implementation project(':metadata-events:mxe-avro-1.7') + implementation project(':metadata-events:mxe-avro') implementation project(':metadata-events:mxe-registration') - implementation project(':metadata-events:mxe-utils-avro-1.7') + implementation project(':metadata-events:mxe-utils-avro') implementation project(':entity-registry') implementation project(':metadata-io') diff --git a/metadata-events/mxe-avro-1.7/.gitignore b/metadata-events/mxe-avro/.gitignore similarity index 100% rename from metadata-events/mxe-avro-1.7/.gitignore rename to metadata-events/mxe-avro/.gitignore diff --git a/metadata-events/mxe-avro-1.7/build.gradle b/metadata-events/mxe-avro/build.gradle similarity index 81% rename from metadata-events/mxe-avro-1.7/build.gradle rename to metadata-events/mxe-avro/build.gradle index 8c0a26d22dc7d..9d11eeb160ff0 100644 --- a/metadata-events/mxe-avro-1.7/build.gradle +++ b/metadata-events/mxe-avro/build.gradle @@ -6,8 +6,8 @@ apply plugin: 'io.acryl.gradle.plugin.avro' apply plugin: 'java-library' dependencies { - api externalDependency.avro_1_7 - implementation(externalDependency.avroCompiler_1_7) { + api externalDependency.avro + implementation(externalDependency.avroCompiler) { exclude group: 'org.apache.velocity', module: 'velocity' } constraints { @@ -21,7 +21,7 @@ dependencies { def genDir = file("src/generated/java") -task avroCodeGen(type: com.commercehub.gradle.plugin.avro.GenerateAvroJavaTask, dependsOn: configurations.avsc) { +task avroCodeGen(type: com.github.davidmc24.gradle.plugin.avro.GenerateAvroJavaTask, dependsOn: configurations.avsc) { source("$rootDir/metadata-events/mxe-schemas/src/renamed/avro") outputDir = genDir 
dependsOn(':metadata-events:mxe-schemas:renameNamespace') diff --git a/metadata-events/mxe-registration/build.gradle b/metadata-events/mxe-registration/build.gradle index 60e0da59616d9..032870d93329f 100644 --- a/metadata-events/mxe-registration/build.gradle +++ b/metadata-events/mxe-registration/build.gradle @@ -5,7 +5,7 @@ configurations { } dependencies { - implementation project(':metadata-events:mxe-avro-1.7') + implementation project(':metadata-events:mxe-avro') implementation project(':metadata-models') implementation spec.product.pegasus.dataAvro1_6 diff --git a/metadata-events/mxe-schemas/build.gradle b/metadata-events/mxe-schemas/build.gradle index fe46601fb68b7..8dc8b71bd1cd8 100644 --- a/metadata-events/mxe-schemas/build.gradle +++ b/metadata-events/mxe-schemas/build.gradle @@ -1,4 +1,4 @@ -apply plugin: 'java' +apply plugin: 'java-library' apply plugin: 'pegasus' dependencies { diff --git a/metadata-events/mxe-utils-avro-1.7/.gitignore b/metadata-events/mxe-utils-avro/.gitignore similarity index 100% rename from metadata-events/mxe-utils-avro-1.7/.gitignore rename to metadata-events/mxe-utils-avro/.gitignore diff --git a/metadata-events/mxe-utils-avro-1.7/build.gradle b/metadata-events/mxe-utils-avro/build.gradle similarity index 95% rename from metadata-events/mxe-utils-avro-1.7/build.gradle rename to metadata-events/mxe-utils-avro/build.gradle index 3b137965d6c19..a7bf287ab224d 100644 --- a/metadata-events/mxe-utils-avro-1.7/build.gradle +++ b/metadata-events/mxe-utils-avro/build.gradle @@ -1,7 +1,7 @@ apply plugin: 'java-library' dependencies { - api project(':metadata-events:mxe-avro-1.7') + api project(':metadata-events:mxe-avro') api project(':metadata-models') api spec.product.pegasus.dataAvro1_6 diff --git a/metadata-events/mxe-utils-avro-1.7/src/main/java/com/linkedin/metadata/EventUtils.java b/metadata-events/mxe-utils-avro/src/main/java/com/linkedin/metadata/EventUtils.java similarity index 100% rename from 
metadata-events/mxe-utils-avro-1.7/src/main/java/com/linkedin/metadata/EventUtils.java rename to metadata-events/mxe-utils-avro/src/main/java/com/linkedin/metadata/EventUtils.java diff --git a/metadata-events/mxe-utils-avro-1.7/src/test/java/com/linkedin/metadata/EventUtilsTests.java b/metadata-events/mxe-utils-avro/src/test/java/com/linkedin/metadata/EventUtilsTests.java similarity index 100% rename from metadata-events/mxe-utils-avro-1.7/src/test/java/com/linkedin/metadata/EventUtilsTests.java rename to metadata-events/mxe-utils-avro/src/test/java/com/linkedin/metadata/EventUtilsTests.java diff --git a/metadata-events/mxe-utils-avro-1.7/src/test/resources/test-avro2pegasus-mae.json b/metadata-events/mxe-utils-avro/src/test/resources/test-avro2pegasus-mae.json similarity index 100% rename from metadata-events/mxe-utils-avro-1.7/src/test/resources/test-avro2pegasus-mae.json rename to metadata-events/mxe-utils-avro/src/test/resources/test-avro2pegasus-mae.json diff --git a/metadata-events/mxe-utils-avro-1.7/src/test/resources/test-avro2pegasus-mce.json b/metadata-events/mxe-utils-avro/src/test/resources/test-avro2pegasus-mce.json similarity index 100% rename from metadata-events/mxe-utils-avro-1.7/src/test/resources/test-avro2pegasus-mce.json rename to metadata-events/mxe-utils-avro/src/test/resources/test-avro2pegasus-mce.json diff --git a/metadata-events/mxe-utils-avro-1.7/src/test/resources/test-pegasus2avro-fmce.json b/metadata-events/mxe-utils-avro/src/test/resources/test-pegasus2avro-fmce.json similarity index 100% rename from metadata-events/mxe-utils-avro-1.7/src/test/resources/test-pegasus2avro-fmce.json rename to metadata-events/mxe-utils-avro/src/test/resources/test-pegasus2avro-fmce.json diff --git a/metadata-events/mxe-utils-avro-1.7/src/test/resources/test-pegasus2avro-mae.json b/metadata-events/mxe-utils-avro/src/test/resources/test-pegasus2avro-mae.json similarity index 100% rename from 
metadata-events/mxe-utils-avro-1.7/src/test/resources/test-pegasus2avro-mae.json rename to metadata-events/mxe-utils-avro/src/test/resources/test-pegasus2avro-mae.json diff --git a/metadata-events/mxe-utils-avro-1.7/src/test/resources/test-pegasus2avro-mce.json b/metadata-events/mxe-utils-avro/src/test/resources/test-pegasus2avro-mce.json similarity index 100% rename from metadata-events/mxe-utils-avro-1.7/src/test/resources/test-pegasus2avro-mce.json rename to metadata-events/mxe-utils-avro/src/test/resources/test-pegasus2avro-mce.json diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator.json index 1a32b38ce055d..81d0a71b651d9 100644 --- a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator.json +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator.json @@ -74,9 +74,7 @@ "downstream_task_ids": "['populate_cost_table']", "inlets": "[]", "outlets": "[]", - "datahub_sql_parser_error": "Can only generate column-level lineage for select-like inner statements, not (outer statement type: )", - "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n \"}", - "openlineage_run_facet_extractionError": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/ExtractionErrorRunFacet\", \"errors\": [{\"_producer\": 
\"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"errorMessage\": \"Can only generate column-level lineage for select-like inner statements, not (outer statement type: )\", \"task\": \"datahub_sql_parser\"}], \"failedTasks\": 1, \"totalTasks\": 1}" + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n \"}" }, "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=create_cost_table", "name": "create_cost_table", @@ -98,7 +96,44 @@ "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" ], "inputDatajobs": [], - "fineGrainedLineages": [] + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),month)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)" + ], + "confidenceScore": 1.0 + } + ] } } }, @@ -157,7 +192,7 @@ "customProperties": { "run_id": "manual_run_test", "duration": "None", - "start_date": "2023-09-30 06:56:24.632190+00:00", + "start_date": "2023-10-15 20:29:10.262813+00:00", "end_date": "None", "execution_date": "2023-09-27 21:34:38+00:00", "try_number": "0", @@ -172,7 +207,7 @@ "name": "sqlite_operator_create_cost_table_manual_run_test", "type": "BATCH_AD_HOC", "created": { - "time": 1696056984632, + "time": 1697401750262, "actor": "urn:li:corpuser:datahub" } } @@ -221,7 +256,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696056984632, + "timestampMillis": 1697401750262, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -251,9 +286,7 @@ "downstream_task_ids": "['populate_cost_table']", "inlets": "[]", "outlets": "[]", - "datahub_sql_parser_error": "Can only generate column-level lineage for select-like inner statements, not (outer statement type: )", - "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n \"}", - "openlineage_run_facet_extractionError": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/ExtractionErrorRunFacet\", \"errors\": [{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": 
\"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"errorMessage\": \"Can only generate column-level lineage for select-like inner statements, not (outer statement type: )\", \"task\": \"datahub_sql_parser\"}], \"failedTasks\": 1, \"totalTasks\": 1}" + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n \"}" }, "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=create_cost_table", "name": "create_cost_table", @@ -275,7 +308,80 @@ "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" ], "inputDatajobs": [], - "fineGrainedLineages": [] + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),month)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)" + ], + "confidenceScore": 1.0 + }, + { + 
"upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),month)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)" + ], + "confidenceScore": 1.0 + } + ] } } }, @@ -331,7 +437,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696056984947, + "timestampMillis": 1697401750651, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -447,7 +553,7 @@ "customProperties": { "run_id": "manual_run_test", "duration": "None", - "start_date": "2023-09-30 06:56:28.605901+00:00", + "start_date": "2023-10-15 20:29:15.013834+00:00", "end_date": "None", "execution_date": "2023-09-27 21:34:38+00:00", "try_number": "0", @@ -462,7 +568,7 @@ "name": "sqlite_operator_populate_cost_table_manual_run_test", "type": "BATCH_AD_HOC", "created": { - "time": 1696056988605, + "time": 1697401755013, "actor": "urn:li:corpuser:datahub" } } @@ -511,7 +617,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696056988605, + "timestampMillis": 1697401755013, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -621,7 +727,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 
1696056989098, + "timestampMillis": 1697401755600, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -807,7 +913,7 @@ "customProperties": { "run_id": "manual_run_test", "duration": "None", - "start_date": "2023-09-30 06:56:32.888165+00:00", + "start_date": "2023-10-15 20:29:20.216818+00:00", "end_date": "None", "execution_date": "2023-09-27 21:34:38+00:00", "try_number": "0", @@ -822,7 +928,7 @@ "name": "sqlite_operator_transform_cost_table_manual_run_test", "type": "BATCH_AD_HOC", "created": { - "time": 1696056992888, + "time": 1697401760216, "actor": "urn:li:corpuser:datahub" } } @@ -895,7 +1001,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696056992888, + "timestampMillis": 1697401760216, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -1131,7 +1237,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696056993744, + "timestampMillis": 1697401761237, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -1249,7 +1355,7 @@ "customProperties": { "run_id": "manual_run_test", "duration": "None", - "start_date": "2023-09-30 06:56:37.745717+00:00", + "start_date": "2023-10-15 20:29:26.243934+00:00", "end_date": "None", "execution_date": "2023-09-27 21:34:38+00:00", "try_number": "0", @@ -1264,7 +1370,7 @@ "name": "sqlite_operator_cleanup_costs_manual_run_test", "type": "BATCH_AD_HOC", "created": { - "time": 1696056997745, + "time": 1697401766243, "actor": "urn:li:corpuser:datahub" } } @@ -1313,7 +1419,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696056997745, + "timestampMillis": 1697401766243, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -1425,7 +1531,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696056998672, + "timestampMillis": 1697401767373, "partitionSpec": { 
"type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -1543,7 +1649,7 @@ "customProperties": { "run_id": "manual_run_test", "duration": "None", - "start_date": "2023-09-30 06:56:42.645806+00:00", + "start_date": "2023-10-15 20:29:32.075613+00:00", "end_date": "None", "execution_date": "2023-09-27 21:34:38+00:00", "try_number": "0", @@ -1558,7 +1664,7 @@ "name": "sqlite_operator_cleanup_processed_costs_manual_run_test", "type": "BATCH_AD_HOC", "created": { - "time": 1696057002645, + "time": 1697401772075, "actor": "urn:li:corpuser:datahub" } } @@ -1607,7 +1713,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696057002645, + "timestampMillis": 1697401772075, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -1719,7 +1825,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696057003759, + "timestampMillis": 1697401773454, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator_no_dag_listener.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator_no_dag_listener.json index c082be693e30c..96a0f02ccec17 100644 --- a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator_no_dag_listener.json +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator_no_dag_listener.json @@ -74,9 +74,7 @@ "downstream_task_ids": "['populate_cost_table']", "inlets": "[]", "outlets": "[]", - "datahub_sql_parser_error": "Can only generate column-level lineage for select-like inner statements, not (outer statement type: )", - "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": 
\"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n \"}", - "openlineage_run_facet_extractionError": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/ExtractionErrorRunFacet\", \"errors\": [{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"errorMessage\": \"Can only generate column-level lineage for select-like inner statements, not (outer statement type: )\", \"task\": \"datahub_sql_parser\"}], \"failedTasks\": 1, \"totalTasks\": 1}" + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n \"}" }, "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=create_cost_table", "name": "create_cost_table", @@ -98,7 +96,44 @@ "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" ], "inputDatajobs": [], - "fineGrainedLineages": [] + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + 
"upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),month)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)" + ], + "confidenceScore": 1.0 + } + ] } } }, @@ -157,7 +192,7 @@ "customProperties": { "run_id": "manual_run_test", "duration": "None", - "start_date": "2023-09-30 07:00:45.832554+00:00", + "start_date": "2023-10-15 20:27:26.883178+00:00", "end_date": "None", "execution_date": "2023-09-27 21:34:38+00:00", "try_number": "0", @@ -172,7 +207,7 @@ "name": "sqlite_operator_create_cost_table_manual_run_test", "type": "BATCH_AD_HOC", "created": { - "time": 1696057245832, + "time": 1697401646883, "actor": "urn:li:corpuser:datahub" } } @@ -221,7 +256,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696057245832, + "timestampMillis": 1697401646883, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -251,9 +286,7 @@ "downstream_task_ids": "['populate_cost_table']", "inlets": "[]", "outlets": "[]", - "datahub_sql_parser_error": "Can only generate column-level lineage for select-like inner statements, not (outer statement type: )", - "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n 
month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n \"}", - "openlineage_run_facet_extractionError": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/ExtractionErrorRunFacet\", \"errors\": [{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"errorMessage\": \"Can only generate column-level lineage for select-like inner statements, not (outer statement type: )\", \"task\": \"datahub_sql_parser\"}], \"failedTasks\": 1, \"totalTasks\": 1}" + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n \"}" }, "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=create_cost_table", "name": "create_cost_table", @@ -275,7 +308,80 @@ "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" ], "inputDatajobs": [], - "fineGrainedLineages": [] + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),month)" + ], + 
"confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),month)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)" + ], + "confidenceScore": 1.0 + } + ] } } }, @@ -331,7 +437,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696057246734, + "timestampMillis": 1697401647826, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -502,7 +608,7 @@ "customProperties": { "run_id": "manual_run_test", "duration": "None", - "start_date": "2023-09-30 07:00:49.653938+00:00", + "start_date": "2023-10-15 20:27:31.398799+00:00", "end_date": "None", "execution_date": "2023-09-27 21:34:38+00:00", "try_number": "0", @@ -517,7 +623,7 @@ "name": 
"sqlite_operator_populate_cost_table_manual_run_test", "type": "BATCH_AD_HOC", "created": { - "time": 1696057249653, + "time": 1697401651398, "actor": "urn:li:corpuser:datahub" } } @@ -566,7 +672,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696057249653, + "timestampMillis": 1697401651398, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -676,7 +782,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696057250831, + "timestampMillis": 1697401652651, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -917,7 +1023,7 @@ "customProperties": { "run_id": "manual_run_test", "duration": "None", - "start_date": "2023-09-30 07:00:53.989264+00:00", + "start_date": "2023-10-15 20:27:37.697995+00:00", "end_date": "None", "execution_date": "2023-09-27 21:34:38+00:00", "try_number": "0", @@ -932,7 +1038,7 @@ "name": "sqlite_operator_transform_cost_table_manual_run_test", "type": "BATCH_AD_HOC", "created": { - "time": 1696057253989, + "time": 1697401657697, "actor": "urn:li:corpuser:datahub" } } @@ -1005,7 +1111,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696057253989, + "timestampMillis": 1697401657697, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -1241,7 +1347,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696057255628, + "timestampMillis": 1697401659496, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -1414,7 +1520,7 @@ "customProperties": { "run_id": "manual_run_test", "duration": "None", - "start_date": "2023-09-30 07:01:00.421177+00:00", + "start_date": "2023-10-15 20:27:45.670215+00:00", "end_date": "None", "execution_date": "2023-09-27 21:34:38+00:00", "try_number": "0", @@ -1429,7 +1535,7 @@ "name": "sqlite_operator_cleanup_costs_manual_run_test", "type": 
"BATCH_AD_HOC", "created": { - "time": 1696057260421, + "time": 1697401665670, "actor": "urn:li:corpuser:datahub" } } @@ -1478,7 +1584,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696057260421, + "timestampMillis": 1697401665670, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -1590,7 +1696,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696057262258, + "timestampMillis": 1697401667670, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -1763,7 +1869,7 @@ "customProperties": { "run_id": "manual_run_test", "duration": "None", - "start_date": "2023-09-30 07:01:05.540192+00:00", + "start_date": "2023-10-15 20:27:51.559194+00:00", "end_date": "None", "execution_date": "2023-09-27 21:34:38+00:00", "try_number": "0", @@ -1778,7 +1884,7 @@ "name": "sqlite_operator_cleanup_processed_costs_manual_run_test", "type": "BATCH_AD_HOC", "created": { - "time": 1696057265540, + "time": 1697401671559, "actor": "urn:li:corpuser:datahub" } } @@ -1827,7 +1933,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696057265540, + "timestampMillis": 1697401671559, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -1939,7 +2045,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696057267631, + "timestampMillis": 1697401673788, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" diff --git a/metadata-ingestion/adding-source.md b/metadata-ingestion/adding-source.md index e4fc950a7cdbd..a0930102c6827 100644 --- a/metadata-ingestion/adding-source.md +++ b/metadata-ingestion/adding-source.md @@ -62,7 +62,7 @@ Some sources use the default `SourceReport` class, but others inherit and extend ### 3. 
Implement the source itself -The core for the source is the `get_workunits` method, which produces a stream of metadata events (typically MCP objects) wrapped up in a MetadataWorkUnit. +The core for the source is the `get_workunits_internal` method, which produces a stream of metadata events (typically MCP objects) wrapped up in a MetadataWorkUnit. The [file source](./src/datahub/ingestion/source/file.py) is a good and simple example. The MetadataChangeEventClass is defined in the metadata models which are generated diff --git a/metadata-ingestion/docs/dev_guides/profiling_ingestions.md b/metadata-ingestion/docs/dev_guides/profiling_ingestions.md index d876d99b494f8..77cc2f456aa2d 100644 --- a/metadata-ingestion/docs/dev_guides/profiling_ingestions.md +++ b/metadata-ingestion/docs/dev_guides/profiling_ingestions.md @@ -13,6 +13,35 @@ This page documents how to perform memory profiles of ingestion runs. It is useful when trying to size the amount of resources necessary to ingest some source or when developing new features or sources. ## How to use + + + + +Create an ingestion as specified in the [Ingestion guide](../../../docs/ui-ingestion.md). + +Add a flag to your ingestion recipe to generate a memray memory dump of your ingestion: +```yaml +source: + ... + +sink: + ... + +flags: + generate_memory_profiles: "" +``` + +In the final panel, under the advanced section, add the `debug` datahub package under the **Extra DataHub Plugins** section. +As seen below: + +

+ +

+ +Finally, save and run the ingestion process. + +
+ Install the `debug` plugin for DataHub's CLI wherever the ingestion runs: ```bash @@ -33,6 +62,16 @@ flags: generate_memory_profiles: "" ``` +Finally run the ingestion recipe + +```bash +$ datahub ingest -c recipe.yaml +``` + + +
+ + Once the ingestion run starts a binary file will be created and appended to during the execution of the ingestion. These files follow the pattern `file-.bin` for a unique identification. diff --git a/metadata-ingestion/docs/sources/teradata/teradata_pre.md b/metadata-ingestion/docs/sources/teradata/teradata_pre.md index eb59caa29eb52..7263a59f5ea3d 100644 --- a/metadata-ingestion/docs/sources/teradata/teradata_pre.md +++ b/metadata-ingestion/docs/sources/teradata/teradata_pre.md @@ -18,7 +18,7 @@ If you want to run profiling, you need to grant select permission on all the tables you want to profile. -3. If linege or usage extraction is enabled, please, check if query logging is enabled and it is set to size which +3. If lineage or usage extraction is enabled, please, check if query logging is enabled and it is set to size which will fit for your queries (the default query text size Teradata captures is max 200 chars) An example how you can set it for all users: ```sql diff --git a/metadata-ingestion/docs/sources/teradata/teradata_recipe.yml b/metadata-ingestion/docs/sources/teradata/teradata_recipe.yml index 8cf07ba4c3a01..cc94de20110fe 100644 --- a/metadata-ingestion/docs/sources/teradata/teradata_recipe.yml +++ b/metadata-ingestion/docs/sources/teradata/teradata_recipe.yml @@ -3,12 +3,11 @@ source: type: teradata config: host_port: "myteradatainstance.teradata.com:1025" - #platform_instance: "myteradatainstance" username: myuser password: mypassword #database_pattern: # allow: - # - "demo_user" + # - "my_database" # ignoreCase: true include_table_lineage: true include_usage_statistics: true diff --git a/metadata-ingestion/examples/library/create_dataproduct.py b/metadata-ingestion/examples/library/create_dataproduct.py new file mode 100644 index 0000000000000..245395b602480 --- /dev/null +++ b/metadata-ingestion/examples/library/create_dataproduct.py @@ -0,0 +1,25 @@ +from datahub.api.entities.dataproduct.dataproduct import DataProduct +from 
datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph + +gms_endpoint = "http://localhost:8080" +graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint)) + +data_product = DataProduct( + id="pet_of_the_week", + display_name="Pet of the Week Campagin", + domain="urn:li:domain:ef39e99a-9d61-406d-b4a8-c70b16380206", + description="This campaign includes Pet of the Week data.", + assets=[ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.pet_details,PROD)", + "urn:li:dashboard:(looker,baz)", + "urn:li:dataFlow:(airflow,dag_abc,PROD)", + ], + owners=[{"id": "urn:li:corpuser:jdoe", "type": "BUSINESS_OWNER"}], + terms=["urn:li:glossaryTerm:ClientsAndAccounts.AccountBalance"], + tags=["urn:li:tag:adoption"], + properties={"lifecycle": "production", "sla": "7am every day"}, + external_url="https://en.wikipedia.org/wiki/Sloth", +) + +for mcp in data_product.generate_mcp(upsert=False): + graph.emit(mcp) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 3ea9a2ea61d74..417588a433655 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -38,7 +38,6 @@ "progressbar2", "termcolor>=1.0.0", "psutil>=5.8.0", - "ratelimiter", "Deprecated", "humanfriendly", "packaging", @@ -281,8 +280,10 @@ # Misc plugins. 
"sql-parser": sqlglot_lib, # Source plugins - # PyAthena is pinned with exact version because we use private method in PyAthena - "athena": sql_common | {"PyAthena[SQLAlchemy]==2.4.1"}, + # sqlalchemy-bigquery is included here since it provides an implementation of + # a SQLalchemy-conform STRUCT type definition + "athena": sql_common + | {"PyAthena[SQLAlchemy]>=2.6.0,<3.0.0", "sqlalchemy-bigquery>=1.4.1"}, "azure-ad": set(), "bigquery": sql_common | bigquery_common @@ -354,7 +355,11 @@ | {"psycopg2-binary", "pymysql>=1.0.2"}, "pulsar": {"requests"}, "redash": {"redash-toolbelt", "sql-metadata"} | sqllineage_lib, - "redshift": sql_common | redshift_common | usage_common | {"redshift-connector"}, + "redshift": sql_common + | redshift_common + | usage_common + | sqlglot_lib + | {"redshift-connector"}, "redshift-legacy": sql_common | redshift_common, "redshift-usage-legacy": sql_common | usage_common | redshift_common, "s3": {*s3_base, *data_lake_profiling}, @@ -373,13 +378,16 @@ # FIXME: I don't think tableau uses sqllineage anymore so we should be able # to remove that dependency. 
"tableau": {"tableauserverclient>=0.17.0"} | sqllineage_lib | sqlglot_lib, - "teradata": sql_common | {"teradatasqlalchemy>=17.20.0.0"}, + "teradata": sql_common + | usage_common + | sqlglot_lib + | {"teradatasqlalchemy>=17.20.0.0"}, "trino": sql_common | trino, "starburst-trino-usage": sql_common | usage_common | trino, "nifi": {"requests", "packaging", "requests-gssapi"}, "powerbi": microsoft_common | {"lark[regex]==1.1.4", "sqlparse"} | sqlglot_lib, "powerbi-report-server": powerbi_report_server, - "vertica": sql_common | {"vertica-sqlalchemy-dialect[vertica-python]==0.0.8"}, + "vertica": sql_common | {"vertica-sqlalchemy-dialect[vertica-python]==0.0.8.1"}, "unity-catalog": databricks | sqllineage_lib, } @@ -433,7 +441,7 @@ test_api_requirements = {pytest_dep, deepdiff_dep, "PyYAML"} debug_requirements = { - "memray" + "memray", } base_dev_requirements = { @@ -667,6 +675,7 @@ "Documentation": "https://datahubproject.io/docs/", "Source": "https://github.com/datahub-project/datahub", "Changelog": "https://github.com/datahub-project/datahub/releases", + "Releases": "https://github.com/acryldata/datahub/releases", }, license="Apache License 2.0", description="A CLI to work with DataHub metadata", diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/assertion.py b/metadata-ingestion/src/datahub/api/entities/datacontract/assertion.py new file mode 100644 index 0000000000000..c45d4ddc92458 --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/assertion.py @@ -0,0 +1,7 @@ +from typing import Optional + +from datahub.configuration import ConfigModel + + +class BaseAssertion(ConfigModel): + description: Optional[str] = None diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/assertion_operator.py b/metadata-ingestion/src/datahub/api/entities/datacontract/assertion_operator.py new file mode 100644 index 0000000000000..a41b0f7aafd9f --- /dev/null +++ 
b/metadata-ingestion/src/datahub/api/entities/datacontract/assertion_operator.py @@ -0,0 +1,171 @@ +from typing import Optional, Union + +from typing_extensions import Literal, Protocol + +from datahub.configuration import ConfigModel +from datahub.metadata.schema_classes import ( + AssertionStdOperatorClass, + AssertionStdParameterClass, + AssertionStdParametersClass, + AssertionStdParameterTypeClass, +) + + +class Operator(Protocol): + """Specification for an assertion operator. + + This class exists only for documentation (not used in typing checking). + """ + + operator: str + + def id(self) -> str: + ... + + def generate_parameters(self) -> AssertionStdParametersClass: + ... + + +def _generate_assertion_std_parameter( + value: Union[str, int, float] +) -> AssertionStdParameterClass: + if isinstance(value, str): + return AssertionStdParameterClass( + value=value, type=AssertionStdParameterTypeClass.STRING + ) + elif isinstance(value, (int, float)): + return AssertionStdParameterClass( + value=str(value), type=AssertionStdParameterTypeClass.NUMBER + ) + else: + raise ValueError( + f"Unsupported assertion parameter {value} of type {type(value)}" + ) + + +Param = Union[str, int, float] + + +def _generate_assertion_std_parameters( + value: Optional[Param] = None, + min_value: Optional[Param] = None, + max_value: Optional[Param] = None, +) -> AssertionStdParametersClass: + """Build the std parameters from whichever of value / min / max were given. + + Arguments left as None are omitted. The check is `is not None`, not + truthiness, so legitimate falsy values (0, 0.0, "") are still emitted. + """ + + def _convert(param: Optional[Param]) -> Optional[AssertionStdParameterClass]: + return _generate_assertion_std_parameter(param) if param is not None else None + + return AssertionStdParametersClass( + value=_convert(value), + minValue=_convert(min_value), + maxValue=_convert(max_value), + ) + + +class EqualToOperator(ConfigModel): + type: Literal["equal_to"] + value: Union[str, int, float] + + operator: str = AssertionStdOperatorClass.EQUAL_TO + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return
_generate_assertion_std_parameters(value=self.value) + + +class BetweenOperator(ConfigModel): + type: Literal["between"] + min: Union[int, float] + max: Union[int, float] + + operator: str = AssertionStdOperatorClass.BETWEEN + + def id(self) -> str: + return f"{self.type}-{self.min}-{self.max}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters( + min_value=self.min, max_value=self.max + ) + + +class LessThanOperator(ConfigModel): + type: Literal["less_than"] + value: Union[int, float] + + operator: str = AssertionStdOperatorClass.LESS_THAN + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class GreaterThanOperator(ConfigModel): + type: Literal["greater_than"] + value: Union[int, float] + + operator: str = AssertionStdOperatorClass.GREATER_THAN + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class LessThanOrEqualToOperator(ConfigModel): + type: Literal["less_than_or_equal_to"] + value: Union[int, float] + + operator: str = AssertionStdOperatorClass.LESS_THAN_OR_EQUAL_TO + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class GreaterThanOrEqualToOperator(ConfigModel): + type: Literal["greater_than_or_equal_to"] + value: Union[int, float] + + operator: str = AssertionStdOperatorClass.GREATER_THAN_OR_EQUAL_TO + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class NotNullOperator(ConfigModel): + type: Literal["not_null"] + + operator: str 
= AssertionStdOperatorClass.NOT_NULL + + def id(self) -> str: + return f"{self.type}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters() + + +Operators = Union[ + EqualToOperator, + BetweenOperator, + LessThanOperator, + LessThanOrEqualToOperator, + GreaterThanOperator, + GreaterThanOrEqualToOperator, + NotNullOperator, +] diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/data_quality_assertion.py b/metadata-ingestion/src/datahub/api/entities/datacontract/data_quality_assertion.py index a665e95e93c43..6a3944ba36baf 100644 --- a/metadata-ingestion/src/datahub/api/entities/datacontract/data_quality_assertion.py +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/data_quality_assertion.py @@ -4,6 +4,8 @@ from typing_extensions import Literal import datahub.emitter.mce_builder as builder +from datahub.api.entities.datacontract.assertion import BaseAssertion +from datahub.api.entities.datacontract.assertion_operator import Operators from datahub.configuration.common import ConfigModel from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.metadata.schema_classes import ( @@ -14,12 +16,15 @@ AssertionStdParametersClass, AssertionStdParameterTypeClass, AssertionTypeClass, + AssertionValueChangeTypeClass, DatasetAssertionInfoClass, DatasetAssertionScopeClass, + SqlAssertionInfoClass, + SqlAssertionTypeClass, ) -class IdConfigMixin(ConfigModel): +class IdConfigMixin(BaseAssertion): id_raw: Optional[str] = pydantic.Field( default=None, alias="id", @@ -30,25 +35,32 @@ def generate_default_id(self) -> str: raise NotImplementedError -class CustomSQLAssertion(IdConfigMixin, ConfigModel): +class CustomSQLAssertion(IdConfigMixin, BaseAssertion): type: Literal["custom_sql"] - sql: str + operator: Operators = pydantic.Field(discriminator="type") - def generate_dataset_assertion_info( - self, entity_urn: str - ) -> DatasetAssertionInfoClass: - return 
DatasetAssertionInfoClass( - dataset=entity_urn, - scope=DatasetAssertionScopeClass.UNKNOWN, - fields=[], - operator=AssertionStdOperatorClass._NATIVE_, - aggregation=AssertionStdAggregationClass._NATIVE_, - logic=self.sql, + def generate_default_id(self) -> str: + return f"{self.type}-{self.sql}-{self.operator.id()}" + + def generate_assertion_info(self, entity_urn: str) -> AssertionInfoClass: + sql_assertion_info = SqlAssertionInfoClass( + entity=entity_urn, + statement=self.sql, + operator=self.operator.operator, + parameters=self.operator.generate_parameters(), + # TODO: Support other types of assertions + type=SqlAssertionTypeClass.METRIC, + changeType=AssertionValueChangeTypeClass.ABSOLUTE, + ) + return AssertionInfoClass( + type=AssertionTypeClass.SQL, + sqlAssertion=sql_assertion_info, + description=self.description, ) -class ColumnUniqueAssertion(IdConfigMixin, ConfigModel): +class ColumnUniqueAssertion(IdConfigMixin, BaseAssertion): type: Literal["unique"] # TODO: support multiple columns? 
@@ -57,10 +69,8 @@ class ColumnUniqueAssertion(IdConfigMixin, ConfigModel): def generate_default_id(self) -> str: return f"{self.type}-{self.column}" - def generate_dataset_assertion_info( - self, entity_urn: str - ) -> DatasetAssertionInfoClass: - return DatasetAssertionInfoClass( + def generate_assertion_info(self, entity_urn: str) -> AssertionInfoClass: + dataset_assertion_info = DatasetAssertionInfoClass( dataset=entity_urn, scope=DatasetAssertionScopeClass.DATASET_COLUMN, fields=[builder.make_schema_field_urn(entity_urn, self.column)], @@ -72,6 +82,11 @@ def generate_dataset_assertion_info( ) ), ) + return AssertionInfoClass( + type=AssertionTypeClass.DATASET, + datasetAssertion=dataset_assertion_info, + description=self.description, + ) class DataQualityAssertion(ConfigModel): @@ -92,16 +107,9 @@ def id(self) -> str: def generate_mcp( self, assertion_urn: str, entity_urn: str ) -> List[MetadataChangeProposalWrapper]: - dataset_assertion_info = self.__root__.generate_dataset_assertion_info( - entity_urn - ) - return [ MetadataChangeProposalWrapper( entityUrn=assertion_urn, - aspect=AssertionInfoClass( - type=AssertionTypeClass.DATASET, - datasetAssertion=dataset_assertion_info, - ), + aspect=self.__root__.generate_assertion_info(entity_urn), ) ] diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/datacontract.py b/metadata-ingestion/src/datahub/api/entities/datacontract/datacontract.py index 2df446623a9d6..f3c6be55e5fea 100644 --- a/metadata-ingestion/src/datahub/api/entities/datacontract/datacontract.py +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/datacontract.py @@ -54,7 +54,7 @@ class DataContract(ConfigModel): freshness: Optional[FreshnessAssertion] = pydantic.Field(default=None) # TODO: Add a validator to ensure that ids are unique - data_quality: Optional[List[DataQualityAssertion]] = None + data_quality: Optional[List[DataQualityAssertion]] = pydantic.Field(default=None) _original_yaml_dict: Optional[dict] = None diff 
--git a/metadata-ingestion/src/datahub/api/entities/datacontract/freshness_assertion.py b/metadata-ingestion/src/datahub/api/entities/datacontract/freshness_assertion.py index ee8fa1181e614..71741d76b22fc 100644 --- a/metadata-ingestion/src/datahub/api/entities/datacontract/freshness_assertion.py +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/freshness_assertion.py @@ -6,6 +6,7 @@ import pydantic from typing_extensions import Literal +from datahub.api.entities.datacontract.assertion import BaseAssertion from datahub.configuration.common import ConfigModel from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.metadata.schema_classes import ( @@ -21,7 +22,7 @@ ) -class CronFreshnessAssertion(ConfigModel): +class CronFreshnessAssertion(BaseAssertion): type: Literal["cron"] cron: str = pydantic.Field( @@ -32,12 +33,30 @@ class CronFreshnessAssertion(ConfigModel): description="The timezone to use for the cron schedule. Defaults to UTC.", ) + def generate_freshness_assertion_schedule(self) -> FreshnessAssertionScheduleClass: + return FreshnessAssertionScheduleClass( + type=FreshnessAssertionScheduleTypeClass.CRON, + cron=FreshnessCronScheduleClass( + cron=self.cron, + timezone=self.timezone, + ), + ) + -class FixedIntervalFreshnessAssertion(ConfigModel): +class FixedIntervalFreshnessAssertion(BaseAssertion): type: Literal["interval"] interval: timedelta + def generate_freshness_assertion_schedule(self) -> FreshnessAssertionScheduleClass: + return FreshnessAssertionScheduleClass( + type=FreshnessAssertionScheduleTypeClass.FIXED_INTERVAL, + fixedInterval=FixedIntervalScheduleClass( + unit=CalendarIntervalClass.SECOND, + multiple=int(self.interval.total_seconds()), + ), + ) + class FreshnessAssertion(ConfigModel): __root__: Union[ @@ -51,36 +70,13 @@ def id(self): def generate_mcp( self, assertion_urn: str, entity_urn: str ) -> List[MetadataChangeProposalWrapper]: - freshness = self.__root__ - - if isinstance(freshness, 
CronFreshnessAssertion): - schedule = FreshnessAssertionScheduleClass( - type=FreshnessAssertionScheduleTypeClass.CRON, - cron=FreshnessCronScheduleClass( - cron=freshness.cron, - timezone=freshness.timezone, - ), - ) - elif isinstance(freshness, FixedIntervalFreshnessAssertion): - schedule = FreshnessAssertionScheduleClass( - type=FreshnessAssertionScheduleTypeClass.FIXED_INTERVAL, - fixedInterval=FixedIntervalScheduleClass( - unit=CalendarIntervalClass.SECOND, - multiple=int(freshness.interval.total_seconds()), - ), - ) - else: - raise ValueError(f"Unknown freshness type {freshness}") - - assertionInfo = AssertionInfoClass( + aspect = AssertionInfoClass( type=AssertionTypeClass.FRESHNESS, freshnessAssertion=FreshnessAssertionInfoClass( entity=entity_urn, type=FreshnessAssertionTypeClass.DATASET_CHANGE, - schedule=schedule, + schedule=self.__root__.generate_freshness_assertion_schedule(), ), + description=self.__root__.description, ) - - return [ - MetadataChangeProposalWrapper(entityUrn=assertion_urn, aspect=assertionInfo) - ] + return [MetadataChangeProposalWrapper(entityUrn=assertion_urn, aspect=aspect)] diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/schema_assertion.py b/metadata-ingestion/src/datahub/api/entities/datacontract/schema_assertion.py index b5b592e01f58f..b62f94e0592fc 100644 --- a/metadata-ingestion/src/datahub/api/entities/datacontract/schema_assertion.py +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/schema_assertion.py @@ -6,6 +6,7 @@ import pydantic from typing_extensions import Literal +from datahub.api.entities.datacontract.assertion import BaseAssertion from datahub.configuration.common import ConfigModel from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.extractor.json_schema_util import get_schema_metadata @@ -19,7 +20,7 @@ ) -class JsonSchemaContract(ConfigModel): +class JsonSchemaContract(BaseAssertion): type: Literal["json-schema"] json_schema: dict = 
pydantic.Field(alias="json-schema") @@ -36,7 +37,7 @@ def _init_private_attributes(self) -> None: ) -class FieldListSchemaContract(ConfigModel, arbitrary_types_allowed=True): +class FieldListSchemaContract(BaseAssertion, arbitrary_types_allowed=True): type: Literal["field-list"] fields: List[SchemaFieldClass] @@ -67,15 +68,13 @@ def id(self): def generate_mcp( self, assertion_urn: str, entity_urn: str ) -> List[MetadataChangeProposalWrapper]: - schema_metadata = self.__root__._schema_metadata - - assertionInfo = AssertionInfoClass( + aspect = AssertionInfoClass( type=AssertionTypeClass.DATA_SCHEMA, schemaAssertion=SchemaAssertionInfoClass( - entity=entity_urn, schema=schema_metadata + entity=entity_urn, + schema=self.__root__._schema_metadata, ), + description=self.__root__.description, ) - return [ - MetadataChangeProposalWrapper(entityUrn=assertion_urn, aspect=assertionInfo) - ] + return [MetadataChangeProposalWrapper(entityUrn=assertion_urn, aspect=aspect)] diff --git a/metadata-ingestion/src/datahub/cli/specific/group_cli.py b/metadata-ingestion/src/datahub/cli/specific/group_cli.py index 9baa8ee68d975..e313fce33d4d5 100644 --- a/metadata-ingestion/src/datahub/cli/specific/group_cli.py +++ b/metadata-ingestion/src/datahub/cli/specific/group_cli.py @@ -43,7 +43,7 @@ def upsert(file: Path, override_editable: bool) -> None: with get_default_graph() as emitter: for group_config in group_configs: try: - datahub_group = CorpGroup.parse_obj(config_dict) + datahub_group = CorpGroup.parse_obj(group_config) for mcp in datahub_group.generate_mcp( generation_config=CorpGroupGenerationConfig( override_editable=override_editable, datahub_graph=emitter diff --git a/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py b/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py index 071d590f270f8..dedcfa0385f75 100644 --- a/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py +++ b/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py @@ -179,15 
+179,16 @@ def add_lineage( def gen_workunits(self) -> Iterable[MetadataWorkUnit]: if self.generate_lineage: - yield from self._gen_lineage_workunits() + for mcp in self._gen_lineage_mcps(): + yield mcp.as_workunit() if self.generate_usage_statistics: yield from self._gen_usage_statistics_workunits() - def _gen_lineage_workunits(self) -> Iterable[MetadataWorkUnit]: + def _gen_lineage_mcps(self) -> Iterable[MetadataChangeProposalWrapper]: for downstream_urn in self._lineage_map: upstreams: List[UpstreamClass] = [] fine_upstreams: List[FineGrainedLineageClass] = [] - for upstream_urn, edge in self._lineage_map[downstream_urn].items(): + for edge in self._lineage_map[downstream_urn].values(): upstreams.append(edge.gen_upstream_aspect()) fine_upstreams.extend(edge.gen_fine_grained_lineage_aspects()) @@ -201,7 +202,7 @@ def _gen_lineage_workunits(self) -> Iterable[MetadataWorkUnit]: ) yield MetadataChangeProposalWrapper( entityUrn=downstream_urn, aspect=upstream_lineage - ).as_workunit() + ) def _gen_usage_statistics_workunits(self) -> Iterable[MetadataWorkUnit]: yield from self._usage_aggregator.generate_workunits( diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index e5dff786b71d1..aa7e5aa352a3e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -221,6 +221,7 @@ def report_table_dropped(self, table: str) -> None: SourceCapability.DELETION_DETECTION, "Enabled by default when stateful ingestion is turned on.", ) +@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default") class GlueSource(StatefulIngestionSourceBase): """ Note: if you also have files in S3 that you'd like to ingest, we recommend you use Glue's built-in data catalog. See [here](../../../../docs/generated/ingestion/sources/s3.md) for a quick guide on how to set up a crawler on Glue and ingest the outputs with DataHub. 
diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/s3_util.py b/metadata-ingestion/src/datahub/ingestion/source/aws/s3_util.py index 501162455cc45..878b8dd1bb9a5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/s3_util.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/s3_util.py @@ -34,21 +34,26 @@ def get_bucket_relative_path(s3_uri: str) -> str: return "/".join(strip_s3_prefix(s3_uri).split("/")[1:]) -def make_s3_urn(s3_uri: str, env: str) -> str: +def make_s3_urn(s3_uri: str, env: str, remove_extension: bool = True) -> str: s3_name = strip_s3_prefix(s3_uri) if s3_name.endswith("/"): s3_name = s3_name[:-1] name, extension = os.path.splitext(s3_name) - - if extension != "": + if remove_extension and extension != "": extension = extension[1:] # remove the dot return f"urn:li:dataset:(urn:li:dataPlatform:s3,{name}_{extension},{env})" return f"urn:li:dataset:(urn:li:dataPlatform:s3,{s3_name},{env})" +def make_s3_urn_for_lineage(s3_uri: str, env: str) -> str: + # Ideally this is the implementation for all S3 URNs + # Don't feel comfortable changing `make_s3_urn` for glue, sagemaker, and athena + return make_s3_urn(s3_uri, env, remove_extension=False) + + def get_bucket_name(s3_uri: str) -> str: if not is_s3_uri(s3_uri): raise ValueError( diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 552612f877b9a..692d8c4f81bb6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -153,6 +153,7 @@ def cleanup(config: BigQueryV2Config) -> None: ) @capability(SourceCapability.DESCRIPTIONS, "Enabled by default") @capability(SourceCapability.LINEAGE_COARSE, "Optionally enabled via configuration") +@capability(SourceCapability.LINEAGE_FINE, "Optionally enabled via configuration") @capability( SourceCapability.USAGE_STATS, 
"Enabled by default, can be disabled via configuration `include_usage_statistics`", diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py index 88060a9cdc91d..55366d6c57cf8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py @@ -3,7 +3,7 @@ import re from dataclasses import dataclass, field from datetime import datetime -from typing import Any, ClassVar, Dict, List, Optional, Pattern, Set, Tuple, Union +from typing import Any, ClassVar, Dict, List, Optional, Pattern, Tuple, Union from dateutil import parser @@ -35,8 +35,6 @@ class BigqueryTableIdentifier: dataset: str table: str - invalid_chars: ClassVar[Set[str]] = {"$", "@"} - # Note: this regex may get overwritten by the sharded_table_pattern config. # The class-level constant, however, will not be overwritten. 
_BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX: ClassVar[ @@ -105,18 +103,7 @@ def get_table_display_name(self) -> str: ) table_name, _ = self.get_table_and_shard(shortened_table_name) - if not table_name: - table_name = self.dataset - - # Handle exceptions - invalid_chars_in_table_name: List[str] = [ - c for c in self.invalid_chars if c in table_name - ] - if invalid_chars_in_table_name: - raise ValueError( - f"Cannot handle {self.raw_table_name()} - poorly formatted table name, contains {invalid_chars_in_table_name}" - ) - return table_name + return table_name or self.dataset def get_table_name(self) -> str: """ diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py index 03b12c61ee5c6..db552c09cd0a7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py @@ -4,7 +4,6 @@ from google.cloud import bigquery from google.cloud.logging_v2.client import Client as GCPLoggingClient -from ratelimiter import RateLimiter from datahub.ingestion.source.bigquery_v2.bigquery_audit import ( AuditLogEntry, @@ -17,6 +16,7 @@ BQ_DATE_SHARD_FORMAT, BQ_DATETIME_FORMAT, ) +from datahub.utilities.ratelimiter import RateLimiter logger: logging.Logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index 944814b6936a4..a6a740385cf5c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -119,8 +119,8 @@ class BigQueryV2Config( ) match_fully_qualified_names: bool = Field( - default=False, - description="Whether `dataset_pattern` is matched against fully 
qualified dataset name `.`.", + default=True, + description="[deprecated] Whether `dataset_pattern` is matched against fully qualified dataset name `.`.", ) include_external_url: bool = Field( @@ -327,8 +327,7 @@ def backward_compatibility_configs_set(cls, values: Dict) -> Dict: ): logger.warning( "Please update `dataset_pattern` to match against fully qualified schema name `.` and set config `match_fully_qualified_names : True`." - "Current default `match_fully_qualified_names: False` is only to maintain backward compatibility. " - "The config option `match_fully_qualified_names` will be deprecated in future and the default behavior will assume `match_fully_qualified_names: True`." + "The config option `match_fully_qualified_names` is deprecated and will be removed in a future release." ) return values diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index 98c8cbaf85eec..aa462435b8105 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -548,7 +548,7 @@ def _get_parsed_audit_log_events(self, project_id: str) -> Iterable[QueryEvent]: # handle the case where the read happens within our time range but the query # completion event is delayed and happens after the configured end time. 
corrected_start_time = self.start_time - self.config.max_query_duration - corrected_end_time = self.end_time + -self.config.max_query_duration + corrected_end_time = self.end_time + self.config.max_query_duration self.report.log_entry_start_time = corrected_start_time self.report.log_entry_end_time = corrected_end_time diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py index 201567e104a51..7fc38991e5928 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py @@ -335,8 +335,12 @@ def get_time_window(self) -> Tuple[datetime, datetime]: def _is_table_allowed(self, table_ref: Optional[BigQueryTableRef]) -> bool: return ( table_ref is not None - and self.config.dataset_pattern.allowed(table_ref.table_identifier.dataset) - and self.config.table_pattern.allowed(table_ref.table_identifier.table) + and self.config.dataset_pattern.allowed( + f"{table_ref.table_identifier.project_id}.{table_ref.table_identifier.dataset}" + if self.config.match_fully_qualified_names + else table_ref.table_identifier.dataset + ) + and self.config.table_pattern.allowed(str(table_ref.table_identifier)) ) def _should_ingest_usage(self) -> bool: @@ -844,7 +848,7 @@ def _get_parsed_bigquery_log_events( # handle the case where the read happens within our time range but the query # completion event is delayed and happens after the configured end time. 
corrected_start_time = self.start_time - self.config.max_query_duration - corrected_end_time = self.end_time + -self.config.max_query_duration + corrected_end_time = self.end_time + self.config.max_query_duration self.report.audit_start_time = corrected_start_time self.report.audit_end_time = corrected_end_time diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py index 5fae0ee5215a3..1a1e012e80633 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py @@ -1096,6 +1096,7 @@ def transform_connector_config( @config_class(KafkaConnectSourceConfig) @support_status(SupportStatus.CERTIFIED) @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") +@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default") class KafkaConnectSource(StatefulIngestionSourceBase): config: KafkaConnectSourceConfig report: KafkaConnectSourceReport diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py index 8297a0aa8efa7..a3df977582ca4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py @@ -103,6 +103,11 @@ @capability( SourceCapability.OWNERSHIP, "Enabled by default, configured using `extract_owners`" ) +@capability(SourceCapability.LINEAGE_COARSE, "Supported by default") +@capability( + SourceCapability.LINEAGE_FINE, + "Enabled by default, configured using `extract_column_level_lineage`", +) @capability( SourceCapability.USAGE_STATS, "Enabled by default, configured using `extract_usage_history`", @@ -1128,7 +1133,6 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: def emit_independent_looks_mcp( self, dashboard_element: LookerDashboardElement ) -> 
Iterable[MetadataWorkUnit]: - yield from auto_workunit( stream=self._make_chart_metadata_events( dashboard_element=dashboard_element, diff --git a/metadata-ingestion/src/datahub/ingestion/source/metabase.py b/metadata-ingestion/src/datahub/ingestion/source/metabase.py index fb4512893feb1..24145d60210ff 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/metabase.py +++ b/metadata-ingestion/src/datahub/ingestion/source/metabase.py @@ -80,6 +80,7 @@ def remove_trailing_slash(cls, v): @config_class(MetabaseConfig) @support_status(SupportStatus.CERTIFIED) @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") +@capability(SourceCapability.LINEAGE_COARSE, "Supported by default") class MetabaseSource(Source): """ This plugin extracts Charts, dashboards, and associated metadata. This plugin is in beta and has only been tested diff --git a/metadata-ingestion/src/datahub/ingestion/source/metadata/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/metadata/lineage.py index 1c0c809c16a60..f33c6e0edae3d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/metadata/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/metadata/lineage.py @@ -23,11 +23,17 @@ from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.decorators import ( SupportStatus, + capability, config_class, platform_name, support_status, ) -from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source, SourceReport +from datahub.ingestion.api.source import ( + MetadataWorkUnitProcessor, + Source, + SourceCapability, + SourceReport, +) from datahub.ingestion.api.source_helpers import ( auto_status_aspect, auto_workunit_reporter, @@ -121,6 +127,8 @@ def version_must_be_1(cls, v): @platform_name("File Based Lineage") @config_class(LineageFileSourceConfig) @support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.LINEAGE_COARSE, "Specified in the lineage file.") +@capability(SourceCapability.LINEAGE_FINE, 
"Specified in the lineage file.") @dataclass class LineageFileSource(Source): """ diff --git a/metadata-ingestion/src/datahub/ingestion/source/mode.py b/metadata-ingestion/src/datahub/ingestion/source/mode.py index a000c66a406c2..c46b56da422d9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mode.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mode.py @@ -98,6 +98,7 @@ class HTTPError429(HTTPError): @config_class(ModeConfig) @support_status(SupportStatus.CERTIFIED) @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") +@capability(SourceCapability.LINEAGE_COARSE, "Supported by default") class ModeSource(Source): """ diff --git a/metadata-ingestion/src/datahub/ingestion/source/nifi.py b/metadata-ingestion/src/datahub/ingestion/source/nifi.py index ac1e03812db3b..bc05edbb3c623 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/nifi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/nifi.py @@ -26,11 +26,12 @@ from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.decorators import ( SupportStatus, + capability, config_class, platform_name, support_status, ) -from datahub.ingestion.api.source import Source, SourceReport +from datahub.ingestion.api.source import Source, SourceCapability, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.metadata.schema_classes import ( DataFlowInfoClass, @@ -360,6 +361,7 @@ def report_dropped(self, ent_name: str) -> None: @platform_name("NiFi", id="nifi") @config_class(NifiSourceConfig) @support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.LINEAGE_COARSE, "Supported. 
See docs for limitations") class NifiSource(Source): """ This plugin extracts the following: diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index 52bcef66658c8..4611a8eed4782 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -264,7 +264,6 @@ def extract_lineage( ) if len(upstream) > 0: - upstream_lineage_class: UpstreamLineageClass = UpstreamLineageClass( upstreams=upstream, fineGrainedLineages=cll_lineage or None, @@ -1139,6 +1138,10 @@ def report_to_datahub_work_units( SourceCapability.OWNERSHIP, "Disabled by default, configured using `extract_ownership`", ) +@capability( + SourceCapability.LINEAGE_COARSE, + "Enabled by default, configured using `extract_lineage`.", +) @capability( SourceCapability.LINEAGE_FINE, "Disabled by default, configured using `extract_column_level_lineage`. 
", diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/config.py b/metadata-ingestion/src/datahub/ingestion/source/s3/config.py index 9b5296f0b9dd5..3ef6476078f6f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/config.py @@ -75,7 +75,10 @@ class DataLakeSourceConfig( default=100, description="Maximum number of rows to use when inferring schemas for TSV and CSV files.", ) - + add_partition_columns_to_schema: bool = Field( + default=False, + description="Whether to add partition fields to the schema.", + ) verify_ssl: Union[bool, str] = Field( default=True, description="Either a boolean, in which case it controls whether we verify the server's TLS certificate, or a string, in which case it must be a path to a CA bundle to use.", diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py index eb49fcbb268c0..94c571eabad11 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py @@ -78,6 +78,7 @@ NullTypeClass, NumberTypeClass, RecordTypeClass, + SchemaField, SchemaFieldDataType, SchemaMetadata, StringTypeClass, @@ -90,6 +91,7 @@ OperationClass, OperationTypeClass, OtherSchemaClass, + SchemaFieldDataTypeClass, _Aspect, ) from datahub.telemetry import stats, telemetry @@ -458,8 +460,39 @@ def get_fields(self, table_data: TableData, path_spec: PathSpec) -> List: logger.debug(f"Extracted fields in schema: {fields}") fields = sorted(fields, key=lambda f: f.fieldPath) + if self.source_config.add_partition_columns_to_schema: + self.add_partition_columns_to_schema( + fields=fields, path_spec=path_spec, full_path=table_data.full_path + ) + return fields + def add_partition_columns_to_schema( + self, path_spec: PathSpec, full_path: str, fields: List[SchemaField] + ) -> None: + is_fieldpath_v2 = False + for field in fields: + 
if field.fieldPath.startswith("[version=2.0]"): + is_fieldpath_v2 = True + break + vars = path_spec.get_named_vars(full_path) + if vars is not None and "partition_key" in vars: + for partition_key in vars["partition_key"].values(): + fields.append( + SchemaField( + fieldPath=f"{partition_key}" + if not is_fieldpath_v2 + else f"[version=2.0].[type=string].{partition_key}", + nativeDataType="string", + type=SchemaFieldDataType(StringTypeClass()) + if not is_fieldpath_v2 + else SchemaFieldDataTypeClass(type=StringTypeClass()), + isPartitioningKey=True, + nullable=True, + recursive=False, + ) + ) + def get_table_profile( self, table_data: TableData, dataset_urn: str ) -> Iterable[MetadataWorkUnit]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index 9a993f5774032..0a15c352fc842 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -21,7 +21,7 @@ import datahub.emitter.mce_builder as builder from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.aws.s3_util import make_s3_urn +from datahub.ingestion.source.aws.s3_util import make_s3_urn_for_lineage from datahub.ingestion.source.snowflake.constants import ( LINEAGE_PERMISSION_ERROR, SnowflakeEdition, @@ -652,7 +652,9 @@ def get_external_upstreams(self, external_lineage: Set[str]) -> List[UpstreamCla # For now, populate only for S3 if external_lineage_entry.startswith("s3://"): external_upstream_table = UpstreamClass( - dataset=make_s3_urn(external_lineage_entry, self.config.env), + dataset=make_s3_urn_for_lineage( + external_lineage_entry, self.config.env + ), type=DatasetLineageTypeClass.COPY, ) external_upstreams.append(external_upstream_table) diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py index 9cb613bde1e9f..06b9ad92677a2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py @@ -1,12 +1,17 @@ import json import logging +import re import typing -from typing import Any, Dict, Iterable, List, Optional, Tuple, cast +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union, cast import pydantic from pyathena.common import BaseCursor from pyathena.model import AthenaTableMetadata +from pyathena.sqlalchemy_athena import AthenaRestDialect +from sqlalchemy import create_engine, inspect, types from sqlalchemy.engine.reflection import Inspector +from sqlalchemy.types import TypeEngine +from sqlalchemy_bigquery import STRUCT from datahub.configuration.validate_field_rename import pydantic_renamed_field from datahub.emitter.mcp_builder import ContainerKey, DatabaseKey @@ -21,13 +26,164 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.aws.s3_util import make_s3_urn from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes -from datahub.ingestion.source.sql.sql_common import SQLAlchemySource +from datahub.ingestion.source.sql.sql_common import ( + SQLAlchemySource, + register_custom_type, +) from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri from datahub.ingestion.source.sql.sql_utils import ( add_table_to_schema_container, gen_database_container, gen_database_key, ) +from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField +from datahub.metadata.schema_classes import RecordTypeClass +from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column +from datahub.utilities.sqlalchemy_type_converter import ( + MapType, + get_schema_fields_for_sqlalchemy_column, +) + +logger = logging.getLogger(__name__) + 
+register_custom_type(STRUCT, RecordTypeClass) + + +class CustomAthenaRestDialect(AthenaRestDialect): + """Custom definition of the Athena dialect. + + Custom implementation that allows to extend/modify the behavior of the SQLalchemy + dialect that is used by PyAthena (which is the library that is used by DataHub + to extract metadata from Athena). + This dialect can then be used by the inspector (see get_inspectors()). + + """ + + # regex to identify complex types in DDL strings which are embedded in `<>`. + _complex_type_pattern = re.compile(r"(<.+>)") + + @typing.no_type_check + def _get_column_type( + self, type_: Union[str, Dict[str, Any]] + ) -> TypeEngine: # noqa: C901 + """Derives the data type of the Athena column. + + This method is overridden to extend the behavior of PyAthena. + PyAthena is not capable of detecting complex data types, e.g., + arrays, maps, or structs (as of version 2.25.2). + The custom implementation adds support for the above-mentioned data types. 
+ """ + + # Originally, this method only handles `type_` as a string + # With the workaround used below to parse DDL strings for structs, + # `type_` might also be a dictionary + if isinstance(type_, str): + match = self._pattern_column_type.match(type_) + if match: + type_name = match.group(1).lower() + type_meta_information = match.group(2) + else: + type_name = type_.lower() + type_meta_information = None + elif isinstance(type_, dict): + # this occurs only when a type parsed as part of a STRUCT is passed + # in such case type_ is a dictionary whose type can be retrieved from the attribute + type_name = type_.get("type", None) + type_meta_information = None + else: + raise RuntimeError(f"Unsupported type definition: {type_}") + + args = [] + + if type_name in ["array"]: + detected_col_type = types.ARRAY + + # here we need to account again for two options how `type_` is passed to this method + # first, the simple array definition as a DDL string (something like array<string>) + # this is always the case when the array is not part of a complex data type (mainly STRUCT) + # second, the array definition can also be passed in form of dictionary + # this is the case when the array is part of a complex data type + if isinstance(type_, str): + # retrieve the raw name of the data type as a string + array_type_raw = self._complex_type_pattern.findall(type_)[0][ + 1:-1 + ] # array type without enclosing <> + # convert the string name of the data type into a SQLalchemy type (expected return) + array_type = self._get_column_type(array_type_raw) + elif isinstance(type_, dict): + # retrieve the data type of the array items and + # transform it into a SQLalchemy type + array_type = self._get_column_type(type_["items"]) + else: + raise RuntimeError(f"Unsupported array definition: {type_}") + + args = [array_type] + + elif type_name in ["struct", "record"]: + # STRUCT is not part of the SQLalchemy types selection + # but is provided by another official SQLalchemy library and + # 
compatible with the other SQLalchemy types + detected_col_type = STRUCT + + if isinstance(type_, dict): + # in case a struct as part of another struct is passed + # it is provided in form of a dictionary and + # can simply be used for the further processing + struct_type = type_ + else: + # this is the case when the type definition of the struct is passed as a DDL string + # therefore, it is required to parse the DDL string + # here a method provided in another Datahub source is used so that the parsing + # doesn't need to be implemented twice + # `get_avro_schema_for_hive_column` accepts a DDL description as column type and + # returns the parsed data types in form of a dictionary + schema = get_avro_schema_for_hive_column( + hive_column_name=type_name, hive_column_type=type_ + ) + + # the actual type description needs to be extracted + struct_type = schema["fields"][0]["type"] + + # A STRUCT consist of multiple attributes which are expected to be passed as + # a list of tuples consisting of name data type pairs. 
e.g., `('age', Integer())` + # See the reference: + # https://github.com/googleapis/python-bigquery-sqlalchemy/blob/main/sqlalchemy_bigquery/_struct.py#L53 + # + # To extract all of them, we simply iterate over all detected fields and + # convert them to SQLalchemy types + struct_args = [] + for field in struct_type["fields"]: + struct_args.append( + ( + field["name"], + self._get_column_type(field["type"]["type"]) + if field["type"]["type"] not in ["record", "array"] + else self._get_column_type(field["type"]), + ) + ) + + args = struct_args + + elif type_name in ["map"]: + # Instead of SQLalchemy's TupleType the custom MapType is used here + # which is just a simple wrapper around TupleType + detected_col_type = MapType + + # the type definition for maps looks like the following: key_type:val_type (e.g., string:string) + key_type_raw, value_type_raw = type_meta_information.split(",") + + # convert both type names to actual SQLalchemy types + args = [ + self._get_column_type(key_type_raw), + self._get_column_type(value_type_raw), + ] + # by using get_avro_schema_for_hive_column() for parsing STRUCTs the data type `long` + # can also be returned, so we need to extend the handling here as well + elif type_name in ["bigint", "long"]: + detected_col_type = types.BIGINT + else: + return super()._get_column_type(type_name) + return detected_col_type(*args) class AthenaConfig(SQLCommonConfig): @@ -129,6 +285,18 @@ def create(cls, config_dict, ctx): config = AthenaConfig.parse_obj(config_dict) return cls(config, ctx) + # overwrite this method to allow to specify the usage of a custom dialect + def get_inspectors(self) -> Iterable[Inspector]: + url = self.config.get_sql_alchemy_url() + logger.debug(f"sql_alchemy_url={url}") + engine = create_engine(url, **self.config.options) + + # set custom dialect to be used by the inspector + engine.dialect = CustomAthenaRestDialect() + with engine.connect() as conn: + inspector = inspect(conn) + yield inspector + def 
get_table_properties( self, inspector: Inspector, schema: str, table: str ) -> Tuple[Optional[str], Dict[str, str], Optional[str]]: @@ -136,9 +304,7 @@ def get_table_properties( self.cursor = cast(BaseCursor, inspector.engine.raw_connection().cursor()) assert self.cursor - # Unfortunately properties can be only get through private methods as those are not exposed - # https://github.com/laughingman7743/PyAthena/blob/9e42752b0cc7145a87c3a743bb2634fe125adfa7/pyathena/model.py#L201 - metadata: AthenaTableMetadata = self.cursor._get_table_metadata( + metadata: AthenaTableMetadata = self.cursor.get_table_metadata( table_name=table, schema_name=schema ) description = metadata.comment @@ -241,6 +407,30 @@ def get_schema_names(self, inspector: Inspector) -> List[str]: return [schema for schema in schemas if schema == athena_config.database] return schemas + # Overwrite to modify the creation of schema fields + def get_schema_fields_for_column( + self, + dataset_name: str, + column: Dict, + pk_constraints: Optional[dict] = None, + tags: Optional[List[str]] = None, + ) -> List[SchemaField]: + fields = get_schema_fields_for_sqlalchemy_column( + column_name=column["name"], + column_type=column["type"], + description=column.get("comment", None), + nullable=column.get("nullable", True), + is_part_of_key=True + if ( + pk_constraints is not None + and isinstance(pk_constraints, dict) + and column["name"] in pk_constraints.get("constrained_columns", []) + ) + else False, + ) + + return fields + def close(self): if self.cursor: self.cursor.close() diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py index 056be6c2e50ac..be03858ec3ef9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py @@ -80,6 +80,7 @@ DatasetLineageTypeClass, DatasetPropertiesClass, GlobalTagsClass, + MapTypeClass, 
SubTypesClass, TagAssociationClass, UpstreamClass, @@ -89,6 +90,7 @@ from datahub.utilities.lossy_collections import LossyList from datahub.utilities.registries.domain_registry import DomainRegistry from datahub.utilities.sqlalchemy_query_combiner import SQLAlchemyQueryCombinerReport +from datahub.utilities.sqlalchemy_type_converter import MapType if TYPE_CHECKING: from datahub.ingestion.source.ge_data_profiler import ( @@ -154,6 +156,8 @@ class SqlWorkUnit(MetadataWorkUnit): types.DATETIME: TimeTypeClass, types.TIMESTAMP: TimeTypeClass, types.JSON: RecordTypeClass, + # additional type definitions that are used by the Athena source + MapType: MapTypeClass, # type: ignore # Because the postgresql dialect is used internally by many other dialects, # we add some postgres types here. This is ok to do because the postgresql # dialect is built-in to sqlalchemy. diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py index 08cc74aec3977..57aae32b361cf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py @@ -1,10 +1,10 @@ import logging from abc import abstractmethod from typing import Any, Dict, Optional -from urllib.parse import quote_plus import pydantic from pydantic import Field +from sqlalchemy.engine import URL from datahub.configuration.common import AllowDenyPattern, ConfigModel from datahub.configuration.source_common import ( @@ -125,7 +125,11 @@ class SQLAlchemyConnectionConfig(ConfigModel): # Duplicate of SQLCommonConfig.options options: dict = pydantic.Field( default_factory=dict, - description="Any options specified here will be passed to [SQLAlchemy.create_engine](https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine) as kwargs.", + description=( + "Any options specified here will be passed to " + 
"[SQLAlchemy.create_engine](https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine) as kwargs." + " To set connection arguments in the URL, specify them under `connect_args`." + ), ) _database_alias_deprecation = pydantic_field_deprecated( @@ -161,21 +165,26 @@ def make_sqlalchemy_uri( db: Optional[str], uri_opts: Optional[Dict[str, Any]] = None, ) -> str: - url = f"{scheme}://" - if username is not None: - url += f"{quote_plus(username)}" - if password is not None: - url += f":{quote_plus(password)}" - url += "@" - if at is not None: - url += f"{at}" - if db is not None: - url += f"/{db}" - if uri_opts is not None: - if db is None: - url += "/" - params = "&".join( - f"{key}={quote_plus(value)}" for (key, value) in uri_opts.items() if value + host: Optional[str] = None + port: Optional[int] = None + if at: + try: + host, port_str = at.rsplit(":", 1) + port = int(port_str) + except ValueError: + host = at + port = None + if uri_opts: + uri_opts = {k: v for k, v in uri_opts.items() if v is not None} + + return str( + URL.create( + drivername=scheme, + username=username, + password=password, + host=host, + port=port, + database=db, + query=uri_opts or {}, ) - url = f"{url}?{params}" - return url + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_types.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_types.py index 3b4a7e1dc0287..ae47623188f42 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_types.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_types.py @@ -7,7 +7,7 @@ BytesType, DateType, EnumType, - MapType, + MapType as MapTypeAvro, NullType, NumberType, RecordType, @@ -15,6 +15,7 @@ TimeType, UnionType, ) +from datahub.utilities.sqlalchemy_type_converter import MapType # these can be obtained by running `select format_type(oid, null),* from pg_type;` # we've omitted the types without a meaningful DataHub type (e.g. postgres-specific types, index vectors, etc.) 
@@ -363,7 +364,7 @@ def resolve_vertica_modified_type(type_string: str) -> Any: "time": TimeType, "timestamp": TimeType, "row": RecordType, - "map": MapType, + "map": MapTypeAvro, "array": ArrayType, } diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py b/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py index dd11cd840bed9..e628e4dbd3446 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py @@ -1,6 +1,7 @@ import logging from dataclasses import dataclass -from typing import Iterable, Optional, Set, Union +from datetime import datetime +from typing import Iterable, MutableMapping, Optional, Union # This import verifies that the dependencies are available. import teradatasqlalchemy # noqa: F401 @@ -32,11 +33,14 @@ from datahub.ingestion.source.usage.usage_common import BaseUsageConfig from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport from datahub.ingestion.source_report.time_window import BaseTimeWindowReport +from datahub.metadata._schema_classes import SchemaMetadataClass, ViewPropertiesClass from datahub.metadata.com.linkedin.pegasus2avro.schema import ( BytesTypeClass, TimeTypeClass, ) +from datahub.utilities.file_backed_collections import FileBackedDict from datahub.utilities.sqlglot_lineage import SchemaResolver, sqlglot_lineage +from datahub.utilities.urns.dataset_urn import DatasetUrn logger: logging.Logger = logging.getLogger(__name__) @@ -64,6 +68,7 @@ @dataclass class TeradataReport(ProfilingSqlReport, IngestionStageReport, BaseTimeWindowReport): num_queries_parsed: int = 0 + num_view_ddl_parsed: int = 0 num_table_parse_failures: int = 0 @@ -82,17 +87,16 @@ class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig): "This requires to have the table lineage feature enabled.", ) + include_view_lineage = Field( + default=True, + description="Whether to include view lineage in the ingestion. 
" + "This requires to have the view lineage feature enabled.", + ) usage: BaseUsageConfig = Field( description="The usage config to use when generating usage statistics", default=BaseUsageConfig(), ) - use_schema_resolver: bool = Field( - default=True, - description="Read SchemaMetadata aspects from DataHub to aid in SQL parsing. Turn off only for testing.", - hidden_from_docs=True, - ) - default_db: Optional[str] = Field( default=None, description="The default database to use for unqualified table names", @@ -103,6 +107,11 @@ class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig): description="Generate usage statistic.", ) + use_file_backed_cache: bool = Field( + default=True, + description="Whether to use a file backed cache for the view definitions.", + ) + @platform_name("Teradata") @config_class(TeradataConfig) @@ -133,7 +142,8 @@ class TeradataSource(TwoTierSQLAlchemySource): and "timestamp" >= TIMESTAMP '{start_time}' and "timestamp" < TIMESTAMP '{end_time}' """ - urns: Optional[Set[str]] + + _view_definition_cache: MutableMapping[str, str] def __init__(self, config: TeradataConfig, ctx: PipelineContext): super().__init__(config, ctx, "teradata") @@ -141,46 +151,50 @@ def __init__(self, config: TeradataConfig, ctx: PipelineContext): self.report: TeradataReport = TeradataReport() self.graph: Optional[DataHubGraph] = ctx.graph - if self.graph: - if self.config.use_schema_resolver: - self.schema_resolver = ( - self.graph.initialize_schema_resolver_from_datahub( - platform=self.platform, - platform_instance=self.config.platform_instance, - env=self.config.env, - ) - ) - self.urns = self.schema_resolver.get_urns() - else: - self.schema_resolver = self.graph._make_schema_resolver( - platform=self.platform, - platform_instance=self.config.platform_instance, - env=self.config.env, - ) - self.urns = None - else: - self.schema_resolver = SchemaResolver( - platform=self.platform, - platform_instance=self.config.platform_instance, - graph=None, - 
env=self.config.env, - ) - self.urns = None - self.builder: SqlParsingBuilder = SqlParsingBuilder( usage_config=self.config.usage if self.config.include_usage_statistics else None, - generate_lineage=self.config.include_table_lineage, + generate_lineage=True, generate_usage_statistics=self.config.include_usage_statistics, generate_operations=self.config.usage.include_operational_stats, ) + self.schema_resolver = SchemaResolver( + platform=self.platform, + platform_instance=self.config.platform_instance, + graph=None, + env=self.config.env, + ) + + if self.config.use_file_backed_cache: + self._view_definition_cache = FileBackedDict[str]() + else: + self._view_definition_cache = {} + @classmethod def create(cls, config_dict, ctx): config = TeradataConfig.parse_obj(config_dict) return cls(config, ctx) + def get_view_lineage(self) -> Iterable[MetadataWorkUnit]: + for key in self._view_definition_cache.keys(): + view_definition = self._view_definition_cache[key] + dataset_urn = DatasetUrn.create_from_string(key) + + db_name: Optional[str] = None + # We need to get the default db from the dataset urn otherwise the builder generates the wrong urns + if "." 
in dataset_urn.get_dataset_name(): + db_name = dataset_urn.get_dataset_name().split(".", 1)[0] + + self.report.num_view_ddl_parsed += 1 + if self.report.num_view_ddl_parsed % 1000 == 0: + logger.info(f"Parsed {self.report.num_queries_parsed} view ddl") + + yield from self.gen_lineage_from_query( + query=view_definition, default_database=db_name, is_view_ddl=True + ) + def get_audit_log_mcps(self) -> Iterable[MetadataWorkUnit]: engine = self.get_metadata_engine() for entry in engine.execute( @@ -192,27 +206,43 @@ def get_audit_log_mcps(self) -> Iterable[MetadataWorkUnit]: if self.report.num_queries_parsed % 1000 == 0: logger.info(f"Parsed {self.report.num_queries_parsed} queries") - result = sqlglot_lineage( - sql=entry.query, - schema_resolver=self.schema_resolver, - default_db=None, - default_schema=entry.default_database - if entry.default_database - else self.config.default_db, + yield from self.gen_lineage_from_query( + query=entry.query, + default_database=entry.default_database, + timestamp=entry.timestamp, + user=entry.user, + is_view_ddl=False, ) - if result.debug_info.table_error: - logger.debug( - f"Error parsing table lineage, {result.debug_info.table_error}" - ) - self.report.num_table_parse_failures += 1 - continue + def gen_lineage_from_query( + self, + query: str, + default_database: Optional[str] = None, + timestamp: Optional[datetime] = None, + user: Optional[str] = None, + is_view_ddl: bool = False, + ) -> Iterable[MetadataWorkUnit]: + result = sqlglot_lineage( + sql=query, + schema_resolver=self.schema_resolver, + default_db=None, + default_schema=default_database + if default_database + else self.config.default_db, + ) + if result.debug_info.table_error: + logger.debug( + f"Error parsing table lineage, {result.debug_info.table_error}" + ) + self.report.num_table_parse_failures += 1 + else: yield from self.builder.process_sql_parsing_result( result, - query=entry.query, - query_timestamp=entry.timestamp, - user=f"urn:li:corpuser:{entry.user}", - 
include_urns=self.urns, + query=query, + is_view_ddl=is_view_ddl, + query_timestamp=timestamp, + user=f"urn:li:corpuser:{user}", + include_urns=self.schema_resolver.get_urns(), ) def get_metadata_engine(self) -> Engine: @@ -221,8 +251,23 @@ def get_metadata_engine(self) -> Engine: return create_engine(url, **self.config.options) def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]: - yield from super().get_workunits_internal() + # Add all schemas to the schema resolver + for wu in super().get_workunits_internal(): + urn = wu.get_urn() + schema_metadata = wu.get_aspect_of_type(SchemaMetadataClass) + if schema_metadata: + self.schema_resolver.add_schema_metadata(urn, schema_metadata) + view_properties = wu.get_aspect_of_type(ViewPropertiesClass) + if view_properties and self.config.include_view_lineage: + self._view_definition_cache[urn] = view_properties.viewLogic + yield wu + + if self.config.include_view_lineage: + self.report.report_ingestion_stage_start("view lineage extraction") + yield from self.get_view_lineage() + if self.config.include_table_lineage or self.config.include_usage_statistics: self.report.report_ingestion_stage_start("audit log extraction") yield from self.get_audit_log_mcps() - yield from self.builder.gen_workunits() + + yield from self.builder.gen_workunits() diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/two_tier_sql_source.py b/metadata-ingestion/src/datahub/ingestion/source/sql/two_tier_sql_source.py index d9062cef06eae..7a49551dc1235 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/two_tier_sql_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/two_tier_sql_source.py @@ -1,8 +1,10 @@ import typing +import urllib.parse from typing import Any, Dict, Iterable, Optional from pydantic.fields import Field from sqlalchemy import create_engine, inspect +from sqlalchemy.engine import URL from sqlalchemy.engine.reflection import Inspector from 
datahub.configuration.common import AllowDenyPattern @@ -41,14 +43,27 @@ def get_sql_alchemy_url( uri_opts: typing.Optional[typing.Dict[str, typing.Any]] = None, current_db: typing.Optional[str] = None, ) -> str: - return self.sqlalchemy_uri or make_sqlalchemy_uri( - self.scheme, - self.username, - self.password.get_secret_value() if self.password else None, - self.host_port, - current_db if current_db else self.database, - uri_opts=uri_opts, - ) + if self.sqlalchemy_uri: + parsed_url = urllib.parse.urlsplit(self.sqlalchemy_uri) + url = URL.create( + drivername=parsed_url.scheme, + username=parsed_url.username, + password=parsed_url.password, + host=parsed_url.hostname, + port=parsed_url.port, + database=current_db or parsed_url.path.lstrip("/"), + query=urllib.parse.parse_qs(parsed_url.query), + ).update_query_dict(uri_opts or {}) + return str(url) + else: + return make_sqlalchemy_uri( + self.scheme, + self.username, + self.password.get_secret_value() if self.password else None, + self.host_port, + current_db or self.database, + uri_opts=uri_opts, + ) class TwoTierSQLAlchemySource(SQLAlchemySource): diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py b/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py index bce4d1ec76e6e..fcf97e461967c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py @@ -20,11 +20,17 @@ from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.decorators import ( SupportStatus, + capability, config_class, platform_name, support_status, ) -from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source, SourceReport +from datahub.ingestion.api.source import ( + MetadataWorkUnitProcessor, + Source, + SourceCapability, + SourceReport, +) from datahub.ingestion.api.source_helpers import auto_workunit_reporter from datahub.ingestion.api.workunit import MetadataWorkUnit from 
datahub.ingestion.graph.client import DataHubGraph @@ -83,6 +89,8 @@ def compute_stats(self) -> None: @platform_name("SQL Queries") @config_class(SqlQueriesSourceConfig) @support_status(SupportStatus.TESTING) +@capability(SourceCapability.LINEAGE_COARSE, "Parsed from SQL queries") +@capability(SourceCapability.LINEAGE_FINE, "Parsed from SQL queries") class SqlQueriesSource(Source): # TODO: Documentation urns: Optional[Set[str]] diff --git a/metadata-ingestion/src/datahub/ingestion/source/superset.py b/metadata-ingestion/src/datahub/ingestion/source/superset.py index 14bc4242d2a91..e491a1e8b82fa 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/superset.py +++ b/metadata-ingestion/src/datahub/ingestion/source/superset.py @@ -142,6 +142,7 @@ def get_filter_name(filter_obj): @capability( SourceCapability.DELETION_DETECTION, "Optionally enabled via stateful_ingestion" ) +@capability(SourceCapability.LINEAGE_COARSE, "Supported by default") class SupersetSource(StatefulIngestionSourceBase): """ This plugin extracts the following: diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau.py index bad7ae49d325e..4bc40b0aac964 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau.py @@ -452,6 +452,10 @@ class TableauSourceReport(StaleEntityRemovalSourceReport): @capability(SourceCapability.OWNERSHIP, "Requires recipe configuration") @capability(SourceCapability.TAGS, "Requires recipe configuration") @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default") +@capability( + SourceCapability.LINEAGE_FINE, + "Enabled by default, configure using `extract_column_level_lineage`", +) class TableauSource(StatefulIngestionSourceBase): platform = "tableau" @@ -533,7 +537,7 @@ def fetch_projects(): path=[], ) # Set parent project name - for project_id, project in all_project_map.items(): + for _project_id, project in 
all_project_map.items(): if ( project.parent_id is not None and project.parent_id in all_project_map diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/config.py b/metadata-ingestion/src/datahub/ingestion/source/unity/config.py index a57ee39848855..16820c37d546e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/config.py @@ -166,6 +166,14 @@ class UnityCatalogSourceConfig( description="Option to enable/disable lineage generation.", ) + include_external_lineage: bool = pydantic.Field( + default=True, + description=( + "Option to enable/disable lineage generation for external tables." + " Only external S3 tables are supported at the moment." + ), + ) + include_notebooks: bool = pydantic.Field( default=False, description="Ingest notebooks, represented as DataHub datasets.", diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py index 9bcdb200f180e..3fb77ce512ed2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py @@ -33,6 +33,7 @@ ALLOWED_STATEMENT_TYPES, Catalog, Column, + ExternalTableReference, Metastore, Notebook, Query, @@ -248,6 +249,13 @@ def table_lineage(self, table: Table, include_entity_lineage: bool) -> None: ) if table_ref: table.upstreams[table_ref] = {} + elif "fileInfo" in item: + external_ref = ExternalTableReference.create_from_lineage( + item["fileInfo"] + ) + if external_ref: + table.external_upstreams.add(external_ref) + for notebook in item.get("notebookInfos") or []: table.upstream_notebooks.add(notebook["notebook_id"]) diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py index 18ac2475b51e0..315c1c0d20186 100644 --- 
a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py @@ -10,6 +10,7 @@ CatalogType, ColumnTypeName, DataSourceFormat, + SecurableType, TableType, ) from databricks.sdk.service.sql import QueryStatementType @@ -176,6 +177,35 @@ def external_path(self) -> str: return f"{self.catalog}/{self.schema}/{self.table}" +@dataclass(frozen=True, order=True) +class ExternalTableReference: + path: str + has_permission: bool + name: Optional[str] + type: Optional[SecurableType] + storage_location: Optional[str] + + @classmethod + def create_from_lineage(cls, d: dict) -> Optional["ExternalTableReference"]: + try: + securable_type: Optional[SecurableType] + try: + securable_type = SecurableType(d.get("securable_type", "").lower()) + except ValueError: + securable_type = None + + return cls( + path=d["path"], + has_permission=d.get("has_permission") or True, + name=d.get("securable_name"), + type=securable_type, + storage_location=d.get("storage_location"), + ) + except Exception as e: + logger.warning(f"Failed to create ExternalTableReference from {d}: {e}") + return None + + @dataclass class Table(CommonProperty): schema: Schema @@ -193,6 +223,7 @@ class Table(CommonProperty): view_definition: Optional[str] properties: Dict[str, str] upstreams: Dict[TableReference, Dict[str, List[str]]] = field(default_factory=dict) + external_upstreams: Set[ExternalTableReference] = field(default_factory=set) upstream_notebooks: Set[NotebookId] = field(default_factory=set) downstream_notebooks: Set[NotebookId] = field(default_factory=set) diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/report.py b/metadata-ingestion/src/datahub/ingestion/source/unity/report.py index fa61571fa92cb..4153d9dd88eb8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/report.py @@ -19,6 +19,8 @@ class 
UnityCatalogReport(IngestionStageReport, StaleEntityRemovalSourceReport): notebooks: EntityFilterReport = EntityFilterReport.field(type="notebook") num_column_lineage_skipped_column_count: int = 0 + num_external_upstreams_lacking_permissions: int = 0 + num_external_upstreams_unsupported: int = 0 num_queries: int = 0 num_queries_dropped_parse_failure: int = 0 diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py index 27c1f341aa84d..b63cf65d55dc8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py @@ -41,6 +41,7 @@ TestConnectionReport, ) from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.aws.s3_util import make_s3_urn_for_lineage from datahub.ingestion.source.common.subtypes import ( DatasetContainerSubTypes, DatasetSubTypes, @@ -455,6 +456,28 @@ def _generate_lineage_aspect( ) ) + if self.config.include_external_lineage: + for external_ref in table.external_upstreams: + if not external_ref.has_permission or not external_ref.path: + self.report.num_external_upstreams_lacking_permissions += 1 + logger.warning( + f"Lacking permissions for external file upstream on {table.ref}" + ) + elif external_ref.path.startswith("s3://"): + upstreams.append( + UpstreamClass( + dataset=make_s3_urn_for_lineage( + external_ref.path, self.config.env + ), + type=DatasetLineageTypeClass.COPY, + ) + ) + else: + self.report.num_external_upstreams_unsupported += 1 + logger.warning( + f"Unsupported external file upstream on {table.ref}: {external_ref.path}" + ) + if upstreams: return UpstreamLineageClass( upstreams=upstreams, diff --git a/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py b/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py index b3b1331db768b..2b610947e9043 100644 --- 
a/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py +++ b/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py @@ -24,6 +24,7 @@ def assert_sql_result_with_resolver( *, expected_file: pathlib.Path, schema_resolver: SchemaResolver, + allow_table_error: bool = False, **kwargs: Any, ) -> None: # HACK: Our BigQuery source overwrites this value and doesn't undo it. @@ -36,6 +37,14 @@ def assert_sql_result_with_resolver( **kwargs, ) + if res.debug_info.table_error: + if allow_table_error: + logger.info( + f"SQL parser table error: {res.debug_info.table_error}", + exc_info=res.debug_info.table_error, + ) + else: + raise res.debug_info.table_error if res.debug_info.column_error: logger.warning( f"SQL parser column error: {res.debug_info.column_error}", diff --git a/metadata-ingestion/src/datahub/utilities/file_backed_collections.py b/metadata-ingestion/src/datahub/utilities/file_backed_collections.py index c04d2138bc116..18493edded4b7 100644 --- a/metadata-ingestion/src/datahub/utilities/file_backed_collections.py +++ b/metadata-ingestion/src/datahub/utilities/file_backed_collections.py @@ -3,6 +3,7 @@ import logging import pathlib import pickle +import shutil import sqlite3 import tempfile from dataclasses import dataclass, field @@ -56,15 +57,15 @@ class ConnectionWrapper: conn: sqlite3.Connection filename: pathlib.Path - _temp_directory: Optional[tempfile.TemporaryDirectory] + _temp_directory: Optional[str] def __init__(self, filename: Optional[pathlib.Path] = None): self._temp_directory = None # Warning: If filename is provided, the file will not be automatically cleaned up. 
if not filename: - self._temp_directory = tempfile.TemporaryDirectory() - filename = pathlib.Path(self._temp_directory.name) / _DEFAULT_FILE_NAME + self._temp_directory = tempfile.mkdtemp() + filename = pathlib.Path(self._temp_directory) / _DEFAULT_FILE_NAME self.conn = sqlite3.connect(filename, isolation_level=None) self.conn.row_factory = sqlite3.Row @@ -101,7 +102,8 @@ def executemany( def close(self) -> None: self.conn.close() if self._temp_directory: - self._temp_directory.cleanup() + shutil.rmtree(self._temp_directory) + self._temp_directory = None def __enter__(self) -> "ConnectionWrapper": return self diff --git a/metadata-ingestion/src/datahub/utilities/ratelimiter.py b/metadata-ingestion/src/datahub/utilities/ratelimiter.py new file mode 100644 index 0000000000000..3d47d25e14c49 --- /dev/null +++ b/metadata-ingestion/src/datahub/utilities/ratelimiter.py @@ -0,0 +1,56 @@ +import collections +import threading +import time +from contextlib import AbstractContextManager +from typing import Any, Deque + + +# Modified version of https://github.com/RazerM/ratelimiter/blob/master/ratelimiter/_sync.py +class RateLimiter(AbstractContextManager): + + """Provides rate limiting for an operation with a configurable number of + requests for a time period. + """ + + def __init__(self, max_calls: int, period: float = 1.0) -> None: + """Initialize a RateLimiter object which enforces as much as max_calls + operations on period (eventually floating) number of seconds. + """ + if period <= 0: + raise ValueError("Rate limiting period should be > 0") + if max_calls <= 0: + raise ValueError("Rate limiting number of calls should be > 0") + + # We're using a deque to store the last execution timestamps, not for + # its maxlen attribute, but to allow constant time front removal. 
+ self.calls: Deque = collections.deque() + + self.period = period + self.max_calls = max_calls + self._lock = threading.Lock() + + def __enter__(self) -> "RateLimiter": + with self._lock: + # We want to ensure that no more than max_calls were run in the allowed + # period. For this, we store the last timestamps of each call and run + # the rate verification upon each __enter__ call. + if len(self.calls) >= self.max_calls: + until = time.time() + self.period - self._timespan + sleeptime = until - time.time() + if sleeptime > 0: + time.sleep(sleeptime) + return self + + def __exit__(self, exc_type: Any, exc: Any, traceback: Any) -> None: + with self._lock: + # Store the last operation timestamp. + self.calls.append(time.time()) + + # Pop the timestamp list front (ie: the older calls) until the sum goes + # back below the period. This is our 'sliding period' window. + while self._timespan >= self.period: + self.calls.popleft() + + @property + def _timespan(self) -> float: + return self.calls[-1] - self.calls[0] diff --git a/metadata-ingestion/src/datahub/utilities/sqlalchemy_type_converter.py b/metadata-ingestion/src/datahub/utilities/sqlalchemy_type_converter.py new file mode 100644 index 0000000000000..1d5ec5dae3519 --- /dev/null +++ b/metadata-ingestion/src/datahub/utilities/sqlalchemy_type_converter.py @@ -0,0 +1,204 @@ +import json +import logging +import uuid +from typing import Any, Dict, List, Optional, Type, Union + +from sqlalchemy import types +from sqlalchemy_bigquery import STRUCT + +from datahub.ingestion.extractor.schema_util import avro_schema_to_mce_fields +from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField +from datahub.metadata.schema_classes import NullTypeClass, SchemaFieldDataTypeClass + +logger = logging.getLogger(__name__) + + +class MapType(types.TupleType): + # Wrapper class around SQLalchemy's TupleType to increase compatibility with DataHub + pass + + +class SqlAlchemyColumnToAvroConverter: + """Helper class that 
collects some methods to convert SQLalchemy columns to Avro schema.""" + + # tuple of complex data types that require a special handling + _COMPLEX_TYPES = (STRUCT, types.ARRAY, MapType) + + # mapping of primitive SQLalchemy data types to AVRO schema data types + PRIMITIVE_SQL_ALCHEMY_TYPE_TO_AVRO_TYPE: Dict[Type[types.TypeEngine], str] = { + types.String: "string", + types.BINARY: "string", + types.BOOLEAN: "boolean", + types.FLOAT: "float", + types.INTEGER: "int", + types.BIGINT: "long", + types.VARCHAR: "string", + types.CHAR: "string", + } + + @classmethod + def get_avro_type( + cls, column_type: Union[types.TypeEngine, STRUCT, MapType], nullable: bool + ) -> Dict[str, Any]: + """Determines the concrete AVRO schema type for a SQLalchemy-typed column""" + + if type(column_type) in cls.PRIMITIVE_SQL_ALCHEMY_TYPE_TO_AVRO_TYPE.keys(): + return { + "type": cls.PRIMITIVE_SQL_ALCHEMY_TYPE_TO_AVRO_TYPE[type(column_type)], + "native_data_type": str(column_type), + "_nullable": nullable, + } + if isinstance(column_type, types.DECIMAL): + return { + "type": "bytes", + "logicalType": "decimal", + "precision": int(column_type.precision), + "scale": int(column_type.scale), + "native_data_type": str(column_type), + "_nullable": nullable, + } + if isinstance(column_type, types.DATE): + return { + "type": "int", + "logicalType": "date", + "native_data_type": str(column_type), + "_nullable": nullable, + } + if isinstance(column_type, types.TIMESTAMP): + return { + "type": "long", + "logicalType": "timestamp-millis", + "native_data_type": str(column_type), + "_nullable": nullable, + } + if isinstance(column_type, types.ARRAY): + array_type = column_type.item_type + return { + "type": "array", + "items": cls.get_avro_type(column_type=array_type, nullable=nullable), + "native_data_type": f"array<{str(column_type.item_type)}>", + } + if isinstance(column_type, MapType): + key_type = column_type.types[0] + value_type = column_type.types[1] + return { + "type": "map", + "values": 
cls.get_avro_type(column_type=value_type, nullable=nullable), + "native_data_type": str(column_type), + "key_type": cls.get_avro_type(column_type=key_type, nullable=nullable), + "key_native_data_type": str(key_type), + } + if isinstance(column_type, STRUCT): + fields = [] + for field_def in column_type._STRUCT_fields: + field_name, field_type = field_def + fields.append( + { + "name": field_name, + "type": cls.get_avro_type( + column_type=field_type, nullable=nullable + ), + } + ) + struct_name = f"__struct_{str(uuid.uuid4()).replace('-', '')}" + + return { + "type": "record", + "name": struct_name, + "fields": fields, + "native_data_type": str(column_type), + "_nullable": nullable, + } + + return { + "type": "null", + "native_data_type": str(column_type), + "_nullable": nullable, + } + + @classmethod + def get_avro_for_sqlalchemy_column( + cls, + column_name: str, + column_type: types.TypeEngine, + nullable: bool, + ) -> Union[object, Dict[str, object]]: + """Returns the AVRO schema representation of a SQLalchemy column.""" + if isinstance(column_type, cls._COMPLEX_TYPES): + return { + "type": "record", + "name": "__struct_", + "fields": [ + { + "name": column_name, + "type": cls.get_avro_type( + column_type=column_type, nullable=nullable + ), + } + ], + } + return cls.get_avro_type(column_type=column_type, nullable=nullable) + + +def get_schema_fields_for_sqlalchemy_column( + column_name: str, + column_type: types.TypeEngine, + description: Optional[str] = None, + nullable: Optional[bool] = True, + is_part_of_key: Optional[bool] = False, +) -> List[SchemaField]: + """Creates SchemaFields from a given SQLalchemy column. + + This function is analogous to `get_schema_fields_for_hive_column` from datahub.utilities.hive_schema_to_avro. + The main purpose of implementing it this way, is to make it ready/compatible for second field path generation, + which allows to explore nested structures within the UI. 
+ """ + + if nullable is None: + nullable = True + + try: + # as a first step, the column is converted to AVRO JSON which can then be used by an existing function + avro_schema_json = ( + SqlAlchemyColumnToAvroConverter.get_avro_for_sqlalchemy_column( + column_name=column_name, + column_type=column_type, + nullable=nullable, + ) + ) + # retrieve schema field definitions from the above generated AVRO JSON structure + schema_fields = avro_schema_to_mce_fields( + avro_schema=json.dumps(avro_schema_json), + default_nullable=nullable, + swallow_exceptions=False, + ) + except Exception as e: + logger.warning( + f"Unable to parse column {column_name} and type {column_type} the error was: {e}" + ) + + # fallback description in case any exception occurred + schema_fields = [ + SchemaField( + fieldPath=column_name, + type=SchemaFieldDataTypeClass(type=NullTypeClass()), + nativeDataType=str(column_type), + ) + ] + + # for all non-nested data types an additional modification of the `fieldPath` property is required + if type(column_type) in ( + *SqlAlchemyColumnToAvroConverter.PRIMITIVE_SQL_ALCHEMY_TYPE_TO_AVRO_TYPE.keys(), + types.TIMESTAMP, + types.DATE, + types.DECIMAL, + ): + schema_fields[0].fieldPath += f".{column_name}" + + if description: + schema_fields[0].description = description + schema_fields[0].isPartOfKey = ( + is_part_of_key if is_part_of_key is not None else False + ) + + return schema_fields diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py index c830ec8c02fd4..526d90b2a1bfa 100644 --- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py @@ -241,9 +241,9 @@ class SqlParsingResult(_ParserBaseModel): ) -def _parse_statement(sql: str, dialect: str) -> sqlglot.Expression: - statement = sqlglot.parse_one( - sql, read=dialect, error_level=sqlglot.ErrorLevel.RAISE +def _parse_statement(sql: 
sqlglot.exp.ExpOrStr, dialect: str) -> sqlglot.Expression: + statement: sqlglot.Expression = sqlglot.maybe_parse( + sql, dialect=dialect, error_level=sqlglot.ErrorLevel.RAISE ) return statement @@ -467,14 +467,20 @@ def _column_level_lineage( # noqa: C901 default_db: Optional[str], default_schema: Optional[str], ) -> List[_ColumnLineageInfo]: - if not isinstance( - statement, - _SupportedColumnLineageTypesTuple, + is_create_ddl = _is_create_table_ddl(statement) + if ( + not isinstance( + statement, + _SupportedColumnLineageTypesTuple, + ) + and not is_create_ddl ): raise UnsupportedStatementTypeError( f"Can only generate column-level lineage for select-like inner statements, not {type(statement)}" ) + column_lineage: List[_ColumnLineageInfo] = [] + use_case_insensitive_cols = dialect in { # Column identifiers are case-insensitive in BigQuery, so we need to # do a normalization step beforehand to make sure it's resolved correctly. @@ -580,6 +586,38 @@ def _schema_aware_fuzzy_column_resolve( ) from e logger.debug("Qualified sql %s", statement.sql(pretty=True, dialect=dialect)) + # Handle the create DDL case. + if is_create_ddl: + assert ( + output_table is not None + ), "output_table must be set for create DDL statements" + + create_schema: sqlglot.exp.Schema = statement.this + sqlglot_columns = create_schema.expressions + + for column_def in sqlglot_columns: + if not isinstance(column_def, sqlglot.exp.ColumnDef): + # Ignore things like constraints. + continue + + output_col = _schema_aware_fuzzy_column_resolve( + output_table, column_def.name + ) + output_col_type = column_def.args.get("kind") + + column_lineage.append( + _ColumnLineageInfo( + downstream=_DownstreamColumnRef( + table=output_table, + column=output_col, + column_type=output_col_type, + ), + upstreams=[], + ) + ) + + return column_lineage + # Try to figure out the types of the output columns. 
try: statement = sqlglot.optimizer.annotate_types.annotate_types( @@ -589,8 +627,6 @@ def _schema_aware_fuzzy_column_resolve( # This is not a fatal error, so we can continue. logger.debug("sqlglot failed to annotate types: %s", e) - column_lineage = [] - try: assert isinstance(statement, _SupportedColumnLineageTypesTuple) @@ -599,7 +635,6 @@ def _schema_aware_fuzzy_column_resolve( (select_col.alias_or_name, select_col) for select_col in statement.selects ] logger.debug("output columns: %s", [col[0] for col in output_columns]) - output_col: str for output_col, original_col_expression in output_columns: if output_col == "*": # If schema information is available, the * will be expanded to the actual columns. @@ -628,7 +663,7 @@ def _schema_aware_fuzzy_column_resolve( # Generate SELECT lineage. # Using a set here to deduplicate upstreams. - direct_col_upstreams: Set[_ColumnRef] = set() + direct_raw_col_upstreams: Set[_ColumnRef] = set() for node in lineage_node.walk(): if node.downstream: # We only want the leaf nodes. @@ -643,8 +678,9 @@ def _schema_aware_fuzzy_column_resolve( if node.subfield: normalized_col = f"{normalized_col}.{node.subfield}" - col = _schema_aware_fuzzy_column_resolve(table_ref, normalized_col) - direct_col_upstreams.add(_ColumnRef(table=table_ref, column=col)) + direct_raw_col_upstreams.add( + _ColumnRef(table=table_ref, column=normalized_col) + ) else: # This branch doesn't matter. For example, a count(*) column would go here, and # we don't get any column-level lineage for that. @@ -665,7 +701,16 @@ def _schema_aware_fuzzy_column_resolve( if original_col_expression.type: output_col_type = original_col_expression.type - if not direct_col_upstreams: + # Fuzzy resolve upstream columns. 
+ direct_resolved_col_upstreams = { + _ColumnRef( + table=edge.table, + column=_schema_aware_fuzzy_column_resolve(edge.table, edge.column), + ) + for edge in direct_raw_col_upstreams + } + + if not direct_resolved_col_upstreams: logger.debug(f' "{output_col}" has no upstreams') column_lineage.append( _ColumnLineageInfo( @@ -674,12 +719,12 @@ def _schema_aware_fuzzy_column_resolve( column=output_col, column_type=output_col_type, ), - upstreams=sorted(direct_col_upstreams), + upstreams=sorted(direct_resolved_col_upstreams), # logic=column_logic.sql(pretty=True, dialect=dialect), ) ) - # TODO: Also extract referenced columns (e.g. non-SELECT lineage) + # TODO: Also extract referenced columns (aka auxiliary / non-SELECT lineage) except (sqlglot.errors.OptimizeError, ValueError) as e: raise SqlUnderstandingError( f"sqlglot failed to compute some lineage: {e}" @@ -700,6 +745,53 @@ def _extract_select_from_create( return statement +_UPDATE_ARGS_NOT_SUPPORTED_BY_SELECT: Set[str] = set( + sqlglot.exp.Update.arg_types.keys() +) - set(sqlglot.exp.Select.arg_types.keys()) + + +def _extract_select_from_update( + statement: sqlglot.exp.Update, +) -> sqlglot.exp.Select: + statement = statement.copy() + + # The "SET" expressions need to be converted. + # For the update command, it'll be a list of EQ expressions, but the select + # should contain aliased columns. + new_expressions = [] + for expr in statement.expressions: + if isinstance(expr, sqlglot.exp.EQ) and isinstance( + expr.left, sqlglot.exp.Column + ): + new_expressions.append( + sqlglot.exp.Alias( + this=expr.right, + alias=expr.left.this, + ) + ) + else: + # If we don't know how to convert it, just leave it as-is. If this causes issues, + # they'll get caught later.
+ new_expressions.append(expr) + + return sqlglot.exp.Select( + **{ + **{ + k: v + for k, v in statement.args.items() + if k not in _UPDATE_ARGS_NOT_SUPPORTED_BY_SELECT + }, + "expressions": new_expressions, + } + ) + + +def _is_create_table_ddl(statement: sqlglot.exp.Expression) -> bool: + return isinstance(statement, sqlglot.exp.Create) and isinstance( + statement.this, sqlglot.exp.Schema + ) + + def _try_extract_select( statement: sqlglot.exp.Expression, ) -> sqlglot.exp.Expression: @@ -716,6 +808,9 @@ def _try_extract_select( elif isinstance(statement, sqlglot.exp.Insert): # TODO Need to map column renames in the expressions part of the statement. statement = statement.expression + elif isinstance(statement, sqlglot.exp.Update): + # Assumption: the output table is already captured in the modified tables list. + statement = _extract_select_from_update(statement) elif isinstance(statement, sqlglot.exp.Create): # TODO May need to map column renames. # Assumption: the output table is already captured in the modified tables list. 
@@ -766,6 +861,7 @@ def _translate_sqlglot_type( def _translate_internal_column_lineage( table_name_urn_mapping: Dict[_TableName, str], raw_column_lineage: _ColumnLineageInfo, + dialect: str, ) -> ColumnLineageInfo: downstream_urn = None if raw_column_lineage.downstream.table: @@ -779,7 +875,9 @@ def _translate_internal_column_lineage( ) if raw_column_lineage.downstream.column_type else None, - native_column_type=raw_column_lineage.downstream.column_type.sql() + native_column_type=raw_column_lineage.downstream.column_type.sql( + dialect=dialect + ) if raw_column_lineage.downstream.column_type and raw_column_lineage.downstream.column_type.this != sqlglot.exp.DataType.Type.UNKNOWN @@ -800,12 +898,14 @@ def _get_dialect(platform: str) -> str: # TODO: convert datahub platform names to sqlglot dialect if platform == "presto-on-hive": return "hive" + if platform == "mssql": + return "tsql" else: return platform def _sqlglot_lineage_inner( - sql: str, + sql: sqlglot.exp.ExpOrStr, schema_resolver: SchemaResolver, default_db: Optional[str] = None, default_schema: Optional[str] = None, @@ -886,19 +986,25 @@ def _sqlglot_lineage_inner( ) # Simplify the input statement for column-level lineage generation. - select_statement = _try_extract_select(statement) + try: + select_statement = _try_extract_select(statement) + except Exception as e: + logger.debug(f"Failed to extract select from statement: {e}", exc_info=True) + debug_info.column_error = e + select_statement = None # Generate column-level lineage. 
column_lineage: Optional[List[_ColumnLineageInfo]] = None try: - column_lineage = _column_level_lineage( - select_statement, - dialect=dialect, - input_tables=table_name_schema_mapping, - output_table=downstream_table, - default_db=default_db, - default_schema=default_schema, - ) + if select_statement is not None: + column_lineage = _column_level_lineage( + select_statement, + dialect=dialect, + input_tables=table_name_schema_mapping, + output_table=downstream_table, + default_db=default_db, + default_schema=default_schema, + ) except UnsupportedStatementTypeError as e: # Inject details about the outer statement type too. e.args = (f"{e.args[0]} (outer statement type: {type(statement)})",) @@ -918,7 +1024,7 @@ def _sqlglot_lineage_inner( if column_lineage: column_lineage_urns = [ _translate_internal_column_lineage( - table_name_urn_mapping, internal_col_lineage + table_name_urn_mapping, internal_col_lineage, dialect=dialect ) for internal_col_lineage in column_lineage ] diff --git a/metadata-ingestion/tests/integration/delta_lake/delta_lake_minio_mces_golden.json b/metadata-ingestion/tests/integration/delta_lake/delta_lake_minio_mces_golden.json index 52e92d27549f0..ed65d74037796 100644 --- a/metadata-ingestion/tests/integration/delta_lake/delta_lake_minio_mces_golden.json +++ b/metadata-ingestion/tests/integration/delta_lake/delta_lake_minio_mces_golden.json @@ -136,7 +136,8 @@ }, "systemMetadata": { "lastObserved": 1672531200000, - "runId": "delta-lake-test" + "runId": "delta-lake-test", + "lastRunId": "no-run-id-provided" } }, { @@ -156,7 +157,8 @@ }, "systemMetadata": { "lastObserved": 1672531200000, - "runId": "delta-lake-test" + "runId": "delta-lake-test", + "lastRunId": "no-run-id-provided" } }, { @@ -171,7 +173,8 @@ }, "systemMetadata": { "lastObserved": 1672531200000, - "runId": "delta-lake-test" + "runId": "delta-lake-test", + "lastRunId": "no-run-id-provided" } }, { @@ -186,7 +189,8 @@ }, "systemMetadata": { "lastObserved": 1672531200000, - "runId": 
"delta-lake-test" + "runId": "delta-lake-test", + "lastRunId": "no-run-id-provided" } }, { @@ -203,7 +207,8 @@ }, "systemMetadata": { "lastObserved": 1672531200000, - "runId": "delta-lake-test" + "runId": "delta-lake-test", + "lastRunId": "no-run-id-provided" } }, { @@ -218,7 +223,8 @@ }, "systemMetadata": { "lastObserved": 1672531200000, - "runId": "delta-lake-test" + "runId": "delta-lake-test", + "lastRunId": "no-run-id-provided" } }, { @@ -238,7 +244,8 @@ }, "systemMetadata": { "lastObserved": 1672531200000, - "runId": "delta-lake-test" + "runId": "delta-lake-test", + "lastRunId": "no-run-id-provided" } }, { @@ -253,7 +260,8 @@ }, "systemMetadata": { "lastObserved": 1672531200000, - "runId": "delta-lake-test" + "runId": "delta-lake-test", + "lastRunId": "no-run-id-provided" } }, { @@ -268,7 +276,8 @@ }, "systemMetadata": { "lastObserved": 1672531200000, - "runId": "delta-lake-test" + "runId": "delta-lake-test", + "lastRunId": "no-run-id-provided" } }, { @@ -285,7 +294,8 @@ }, "systemMetadata": { "lastObserved": 1672531200000, - "runId": "delta-lake-test" + "runId": "delta-lake-test", + "lastRunId": "no-run-id-provided" } }, { @@ -300,7 +310,8 @@ }, "systemMetadata": { "lastObserved": 1672531200000, - "runId": "delta-lake-test" + "runId": "delta-lake-test", + "lastRunId": "no-run-id-provided" } }, { @@ -320,7 +331,8 @@ }, "systemMetadata": { "lastObserved": 1672531200000, - "runId": "delta-lake-test" + "runId": "delta-lake-test", + "lastRunId": "no-run-id-provided" } }, { @@ -335,7 +347,8 @@ }, "systemMetadata": { "lastObserved": 1672531200000, - "runId": "delta-lake-test" + "runId": "delta-lake-test", + "lastRunId": "no-run-id-provided" } }, { @@ -355,14 +368,16 @@ "customProperties": { "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", - "isolationLevel": "Serializable" + "isolationLevel": "Serializable", + "version": "0" }, "lastUpdatedTimestamp": 1655664815399 } }, "systemMetadata": { "lastObserved": 1672531200000, - "runId": 
"delta-lake-test" + "runId": "delta-lake-test", + "lastRunId": "no-run-id-provided" } }, { @@ -386,7 +401,8 @@ }, "systemMetadata": { "lastObserved": 1672531200000, - "runId": "delta-lake-test" + "runId": "delta-lake-test", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_allow_table.json b/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_allow_table.json index 4dcdf71ce0095..6ec6eb2809a10 100644 --- a/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_allow_table.json +++ b/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_allow_table.json @@ -94,7 +94,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -115,7 +116,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -130,7 +132,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -146,7 +149,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -163,7 +167,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -183,7 +188,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -204,7 +210,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ 
-219,7 +226,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -235,7 +243,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -252,7 +261,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -267,7 +277,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -291,7 +302,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -312,7 +324,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -327,7 +340,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -343,7 +357,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -360,7 +375,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -375,7 +391,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -403,7 +420,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -424,7 +442,8 @@ }, "systemMetadata": { 
"lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -439,7 +458,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -455,7 +475,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -472,7 +493,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -487,7 +509,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -519,7 +542,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -540,7 +564,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -555,7 +580,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -571,7 +597,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -588,7 +615,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -603,7 +631,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -639,7 +668,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": 
"allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -654,7 +684,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -674,14 +705,17 @@ "customProperties": { "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", - "isolationLevel": "Serializable" + "isolationLevel": "Serializable", + "readVersion": "3", + "version": "4" }, - "lastUpdatedTimestamp": 1655831476907 + "lastUpdatedTimestamp": 1655831477768 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -702,14 +736,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "0" + "readVersion": "2", + "version": "3" }, - "lastUpdatedTimestamp": 1655831477701 + "lastUpdatedTimestamp": 1655831477745 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -730,14 +766,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "1" + "readVersion": "1", + "version": "2" }, "lastUpdatedTimestamp": 1655831477726 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -758,14 +796,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "2" + "readVersion": "0", + "version": "1" }, - "lastUpdatedTimestamp": 1655831477745 + "lastUpdatedTimestamp": 1655831477701 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -786,14 
+826,15 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "3" + "version": "0" }, - "lastUpdatedTimestamp": 1655831477768 + "lastUpdatedTimestamp": 1655831476907 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -833,7 +874,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -973,7 +1015,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -988,7 +1031,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1008,14 +1052,16 @@ "customProperties": { "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", - "isolationLevel": "Serializable" + "isolationLevel": "Serializable", + "version": "0" }, "lastUpdatedTimestamp": 1655664815399 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1055,7 +1101,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1152,7 +1199,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1167,7 +1215,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1187,14 +1236,17 @@ "customProperties": { "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", - 
"isolationLevel": "Serializable" + "isolationLevel": "Serializable", + "readVersion": "3", + "version": "4" }, - "lastUpdatedTimestamp": 1655831649166 + "lastUpdatedTimestamp": 1655831649788 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1215,14 +1267,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "0" + "readVersion": "2", + "version": "3" }, - "lastUpdatedTimestamp": 1655831649715 + "lastUpdatedTimestamp": 1655831649754 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1243,14 +1297,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "1" + "readVersion": "1", + "version": "2" }, "lastUpdatedTimestamp": 1655831649731 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1271,14 +1327,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "2" + "readVersion": "0", + "version": "1" }, - "lastUpdatedTimestamp": 1655831649754 + "lastUpdatedTimestamp": 1655831649715 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1299,14 +1357,15 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "3" + "version": "0" }, - "lastUpdatedTimestamp": 1655831649788 + "lastUpdatedTimestamp": 1655831649166 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": 
"no-run-id-provided" } }, { @@ -1346,7 +1405,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1444,7 +1504,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1465,7 +1526,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1480,7 +1542,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1496,7 +1559,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1513,7 +1577,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1528,7 +1593,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1568,7 +1634,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1583,7 +1650,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1603,14 +1671,17 @@ "customProperties": { "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", - "isolationLevel": "Serializable" + "isolationLevel": "Serializable", + "readVersion": "3", + "version": "4" }, - "lastUpdatedTimestamp": 1655831865396 + "lastUpdatedTimestamp": 1655831866541 } }, "systemMetadata": { "lastObserved": 
1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1631,14 +1702,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "0" + "readVersion": "2", + "version": "3" }, - "lastUpdatedTimestamp": 1655831866337 + "lastUpdatedTimestamp": 1655831866447 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1659,14 +1732,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "1" + "readVersion": "1", + "version": "2" }, "lastUpdatedTimestamp": 1655831866398 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1687,14 +1762,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "2" + "readVersion": "0", + "version": "1" }, - "lastUpdatedTimestamp": 1655831866447 + "lastUpdatedTimestamp": 1655831866337 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1715,14 +1792,15 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "3" + "version": "0" }, - "lastUpdatedTimestamp": 1655831866541 + "lastUpdatedTimestamp": 1655831865396 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1766,7 +1844,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff 
--git a/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_inner_table.json b/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_inner_table.json index 901e4c1262d3f..715beebfe22fb 100644 --- a/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_inner_table.json +++ b/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_inner_table.json @@ -94,7 +94,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -114,7 +115,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -129,7 +131,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -144,7 +147,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -161,7 +165,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -176,7 +181,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -196,7 +202,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -211,7 +218,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -226,7 +234,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": 
"inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -243,7 +252,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -258,7 +268,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -278,7 +289,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -298,7 +310,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -313,7 +326,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -328,7 +342,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -345,7 +360,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -360,7 +376,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -384,7 +401,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -404,7 +422,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -419,7 +438,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": 
"no-run-id-provided" } }, { @@ -434,7 +454,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -451,7 +472,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -466,7 +488,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -494,7 +517,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -514,7 +538,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -529,7 +554,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -544,7 +570,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -561,7 +588,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -576,7 +604,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -608,7 +637,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -623,7 +653,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -643,14 
+674,17 @@ "customProperties": { "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", - "isolationLevel": "Serializable" + "isolationLevel": "Serializable", + "readVersion": "3", + "version": "4" }, - "lastUpdatedTimestamp": 1655831476907 + "lastUpdatedTimestamp": 1655831477768 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -671,14 +705,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "0" + "readVersion": "2", + "version": "3" }, - "lastUpdatedTimestamp": 1655831477701 + "lastUpdatedTimestamp": 1655831477745 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -699,14 +735,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "1" + "readVersion": "1", + "version": "2" }, "lastUpdatedTimestamp": 1655831477726 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -727,14 +765,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "2" + "readVersion": "0", + "version": "1" }, - "lastUpdatedTimestamp": 1655831477745 + "lastUpdatedTimestamp": 1655831477701 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -755,14 +795,15 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "3" + "version": "0" }, - "lastUpdatedTimestamp": 1655831477768 + "lastUpdatedTimestamp": 1655831476907 } }, "systemMetadata": { "lastObserved": 
1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -798,7 +839,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -938,7 +980,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -953,7 +996,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -973,14 +1017,16 @@ "customProperties": { "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", - "isolationLevel": "Serializable" + "isolationLevel": "Serializable", + "version": "0" }, "lastUpdatedTimestamp": 1655664815399 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1016,7 +1062,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1113,7 +1160,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1128,7 +1176,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1148,14 +1197,17 @@ "customProperties": { "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", - "isolationLevel": "Serializable" + "isolationLevel": "Serializable", + "readVersion": "3", + "version": "4" }, - "lastUpdatedTimestamp": 1655831649166 + "lastUpdatedTimestamp": 1655831649788 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + 
"runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1176,14 +1228,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "0" + "readVersion": "2", + "version": "3" }, - "lastUpdatedTimestamp": 1655831649715 + "lastUpdatedTimestamp": 1655831649754 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1204,14 +1258,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "1" + "readVersion": "1", + "version": "2" }, "lastUpdatedTimestamp": 1655831649731 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1232,14 +1288,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "2" + "readVersion": "0", + "version": "1" }, - "lastUpdatedTimestamp": 1655831649754 + "lastUpdatedTimestamp": 1655831649715 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1260,14 +1318,15 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "3" + "version": "0" }, - "lastUpdatedTimestamp": 1655831649788 + "lastUpdatedTimestamp": 1655831649166 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1303,7 +1362,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1401,7 +1461,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - 
"runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1421,7 +1482,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1436,7 +1498,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1451,7 +1514,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1468,7 +1532,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1483,7 +1548,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1519,7 +1585,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1534,7 +1601,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1554,14 +1622,17 @@ "customProperties": { "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", - "isolationLevel": "Serializable" + "isolationLevel": "Serializable", + "readVersion": "3", + "version": "4" }, - "lastUpdatedTimestamp": 1655831865396 + "lastUpdatedTimestamp": 1655831866541 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1582,14 +1653,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "0" + "readVersion": 
"2", + "version": "3" }, - "lastUpdatedTimestamp": 1655831866337 + "lastUpdatedTimestamp": 1655831866447 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1610,14 +1683,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "1" + "readVersion": "1", + "version": "2" }, "lastUpdatedTimestamp": 1655831866398 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1638,14 +1713,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "2" + "readVersion": "0", + "version": "1" }, - "lastUpdatedTimestamp": 1655831866447 + "lastUpdatedTimestamp": 1655831866337 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1666,14 +1743,15 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "3" + "version": "0" }, - "lastUpdatedTimestamp": 1655831866541 + "lastUpdatedTimestamp": 1655831865396 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1713,7 +1791,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_relative_path.json b/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_relative_path.json index 18474e819334e..2076ec4096f68 100644 --- 
a/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_relative_path.json +++ b/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_relative_path.json @@ -94,7 +94,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "relative_path.json" + "runId": "relative_path.json", + "lastRunId": "no-run-id-provided" } }, { @@ -114,7 +115,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "relative_path.json" + "runId": "relative_path.json", + "lastRunId": "no-run-id-provided" } }, { @@ -129,7 +131,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "relative_path.json" + "runId": "relative_path.json", + "lastRunId": "no-run-id-provided" } }, { @@ -144,7 +147,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "relative_path.json" + "runId": "relative_path.json", + "lastRunId": "no-run-id-provided" } }, { @@ -161,7 +165,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "relative_path.json" + "runId": "relative_path.json", + "lastRunId": "no-run-id-provided" } }, { @@ -176,7 +181,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "relative_path.json" + "runId": "relative_path.json", + "lastRunId": "no-run-id-provided" } }, { @@ -191,7 +197,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "relative_path.json" + "runId": "relative_path.json", + "lastRunId": "no-run-id-provided" } }, { @@ -211,14 +218,17 @@ "customProperties": { "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", - "isolationLevel": "Serializable" + "isolationLevel": "Serializable", + "readVersion": "3", + "version": "4" }, - "lastUpdatedTimestamp": 1655831476907 + "lastUpdatedTimestamp": 1655831477768 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "relative_path.json" + "runId": "relative_path.json", + "lastRunId": "no-run-id-provided" } }, { @@ -239,14 +249,16 @@ "engineInfo": "local 
Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "0" + "readVersion": "2", + "version": "3" }, - "lastUpdatedTimestamp": 1655831477701 + "lastUpdatedTimestamp": 1655831477745 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "relative_path.json" + "runId": "relative_path.json", + "lastRunId": "no-run-id-provided" } }, { @@ -267,14 +279,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "1" + "readVersion": "1", + "version": "2" }, "lastUpdatedTimestamp": 1655831477726 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "relative_path.json" + "runId": "relative_path.json", + "lastRunId": "no-run-id-provided" } }, { @@ -295,14 +309,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "2" + "readVersion": "0", + "version": "1" }, - "lastUpdatedTimestamp": 1655831477745 + "lastUpdatedTimestamp": 1655831477701 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "relative_path.json" + "runId": "relative_path.json", + "lastRunId": "no-run-id-provided" } }, { @@ -323,14 +339,15 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "3" + "version": "0" }, - "lastUpdatedTimestamp": 1655831477768 + "lastUpdatedTimestamp": 1655831476907 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "relative_path.json" + "runId": "relative_path.json", + "lastRunId": "no-run-id-provided" } }, { @@ -350,7 +367,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "relative_path.json" + "runId": "relative_path.json", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_single_table.json 
b/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_single_table.json index bb47a077e878b..42e3b19612c2b 100644 --- a/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_single_table.json +++ b/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_single_table.json @@ -93,7 +93,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -113,7 +114,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -128,7 +130,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -143,7 +146,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -160,7 +164,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -175,7 +180,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -195,7 +201,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -210,7 +217,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -225,7 +233,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -242,7 +251,8 @@ }, 
"systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -257,7 +267,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -277,7 +288,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -297,7 +309,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -312,7 +325,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -327,7 +341,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -344,7 +359,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -359,7 +375,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -383,7 +400,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -403,7 +421,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -418,7 +437,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -433,7 +453,8 @@ }, "systemMetadata": { 
"lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -450,7 +471,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -465,7 +487,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -493,7 +516,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -513,7 +537,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -528,7 +553,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -543,7 +569,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -560,7 +587,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -575,7 +603,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -607,7 +636,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -622,7 +652,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -643,14 +674,16 @@ "engineInfo": "local 
Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "3" + "readVersion": "3", + "version": "4" }, "lastUpdatedTimestamp": 1655831477768 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -686,7 +719,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/vertica/ddl.sql b/metadata-ingestion/tests/integration/vertica/ddl.sql index 59a71a1a1f7b5..ceebcd8e9ce2a 100644 --- a/metadata-ingestion/tests/integration/vertica/ddl.sql +++ b/metadata-ingestion/tests/integration/vertica/ddl.sql @@ -1,5 +1,4 @@ -\set AUTOCOMMIT on -ALTER USER dbadmin IDENTIFIED BY 'abc123'; + -- Create a Top-k projection CREATE TABLE readings (meter_id INT, reading_date TIMESTAMP, reading_value FLOAT); @@ -35,12 +34,16 @@ SELECT tokenize(phrase) OVER () FROM phrases; -- Create a temp table -CREATE TEMPORARY TABLE sampletemp (a int, b int) ON COMMIT PRESERVE ROWS; -INSERT INTO sampletemp VALUES(1,2); +-- CREATE TEMPORARY TABLE sampletemp (a int, b int) ON COMMIT PRESERVE ROWS; +-- INSERT INTO sampletemp VALUES(1,2); -- Create partition key -ALTER TABLE store.store_orders_fact PARTITION BY date_ordered::DATE GROUP BY DATE_TRUNC('month', (date_ordered)::DATE); -SELECT PARTITION_TABLE('store.store_orders_fact'); -CREATE PROJECTION ytd_orders AS SELECT * FROM store.store_orders_fact ORDER BY date_ordered - ON PARTITION RANGE BETWEEN date_trunc('year',now())::date AND NULL; +-- ALTER TABLE store.store_orders_fact PARTITION BY date_ordered::DATE GROUP BY DATE_TRUNC('month', (date_ordered)::DATE); +-- SELECT PARTITION_TABLE('store.store_orders_fact'); +-- CREATE PROJECTION ytd_orders AS SELECT * FROM store.store_orders_fact ORDER BY date_ordered +-- ON PARTITION 
RANGE BETWEEN date_trunc('year',now())::date AND NULL; + + + + SELECT start_refresh(); \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/vertica/docker-compose.yml b/metadata-ingestion/tests/integration/vertica/docker-compose.yml index 84af5c32a60e3..1ba7990c826b2 100644 --- a/metadata-ingestion/tests/integration/vertica/docker-compose.yml +++ b/metadata-ingestion/tests/integration/vertica/docker-compose.yml @@ -6,7 +6,7 @@ services: APP_DB_USER: "dbadmin" APP_DB_PASSWORD: "abc123" container_name: vertica-ce - image: vertica/vertica-ce:12.0.2-0 + image: vertica/vertica-ce:23.4.0-0 ports: - "5433:5433" - "5444:5444" diff --git a/metadata-ingestion/tests/integration/vertica/test_vertica.py b/metadata-ingestion/tests/integration/vertica/test_vertica.py index fe306d1d0b2b8..94ad33ba21ce4 100644 --- a/metadata-ingestion/tests/integration/vertica/test_vertica.py +++ b/metadata-ingestion/tests/integration/vertica/test_vertica.py @@ -1,6 +1,5 @@ import subprocess -import time -from typing import List, Optional +from typing import List import pytest from freezegun import freeze_time @@ -17,13 +16,12 @@ def test_resources_dir(pytestconfig): return pytestconfig.rootpath / "tests/integration/vertica" -def is_vertica_responsive( - container_name: str, port: int, hostname: Optional[str] -) -> bool: - if hostname: - cmd = f"docker logs {container_name} 2>&1 | grep 'Vertica is now running' " - ret = subprocess.run(cmd, shell=True, stdout=subprocess.DEVNULL) - +def is_vertica_responsive(container_name: str) -> bool: + cmd = f"docker logs {container_name} 2>&1 | grep 'Vertica is now running' " + ret = subprocess.run( + cmd, + shell=True, + ) return ret.returncode == 0 @@ -37,28 +35,22 @@ def vertica_runner(docker_compose_runner, test_resources_dir): "vertica-ce", 5433, timeout=120, - checker=lambda: is_vertica_responsive( - "vertica-ce", 5433, hostname="vertica-ce" - ), + checker=lambda: is_vertica_responsive("vertica-ce"), ) commands = """ docker cp 
tests/integration/vertica/ddl.sql vertica-ce:/home/dbadmin/ && - docker exec vertica-ce sh -c "/opt/vertica/bin/vsql -w abc123 -f /home/dbadmin/ddl.sql + docker exec vertica-ce sh -c "/opt/vertica/bin/vsql -w abc123 -f /home/dbadmin/ddl.sql" """ ret = subprocess.run(commands, shell=True, stdout=subprocess.DEVNULL) - # waiting for vertica to create default table and system table and ml models - time.sleep(60) - assert ret.returncode >= 1 + assert ret.returncode == 0 yield docker_services -# Test needs more work to be done , currently it is working fine. @freeze_time(FROZEN_TIME) -@pytest.mark.skip("Failing in CI, cmd failing with exit code 1") @pytest.mark.integration def test_vertica_ingest_with_db(vertica_runner, pytestconfig, tmp_path): test_resources_dir = pytestconfig.rootpath / "tests/integration/vertica" @@ -72,7 +64,7 @@ def test_vertica_ingest_with_db(vertica_runner, pytestconfig, tmp_path): ignore_paths: List[str] = [ r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['create_time'\]", r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['table_size'\]", - r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['projection_size'\]", + r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['Projection_size'\]", r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['ROS_Count'\]", r"root\[\d+\]\['aspect'\].+\['customProperties'\]\['cluster_size'\]", r"root\[\d+\]\['aspect'\].+\['customProperties'\]\['udx_language'\]", diff --git a/metadata-ingestion/tests/integration/vertica/vertica_mces_with_db_golden.json b/metadata-ingestion/tests/integration/vertica/vertica_mces_with_db_golden.json index 44a5e07d7b996..ef535158165da 100644 --- a/metadata-ingestion/tests/integration/vertica/vertica_mces_with_db_golden.json +++ b/metadata-ingestion/tests/integration/vertica/vertica_mces_with_db_golden.json @@ -11,7 +11,7 @@ "env": "PROD", "database": "vmart", 
"cluster_type": "Enterprise", - "cluster_size": "122 GB", + "cluster_size": "101 GB", "subcluster": " ", "communal_storage_path": "" }, @@ -20,7 +20,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -35,7 +36,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -50,7 +52,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -67,7 +70,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -82,7 +86,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -97,16 +102,17 @@ "env": "PROD", "database": "vmart", "schema": "public", - "projection_count": "9", - "udx_list": "APPROXIMATE_COUNT_DISTINCT_SYNOPSIS_INFO, APPROXIMATE_MEDIAN, APPROXIMATE_PERCENTILE, AcdDataToCount, AcdDataToLongSyn, AcdDataToSyn, AcdSynToCount, AcdSynToSyn, DelimitedExport, DelimitedExportMulti, EmptyMap, Explode, FAvroParser, FCefParser, FCsvParser, FDelimitedPairParser, FDelimitedParser, FIDXParser, FJSONParser, FRegexParser, FlexTokenizer, JsonExport, JsonExportMulti, KafkaAvroParser, KafkaCheckBrokers, KafkaExport, KafkaInsertDelimiters, KafkaInsertLengths, KafkaJsonParser, KafkaListManyTopics, KafkaListTopics, KafkaOffsets, KafkaParser, KafkaSource, KafkaTopicDetails, MSE, MapAggregate, MapAggregate, MapContainsKey, MapContainsKey, MapContainsValue, MapContainsValue, MapDelimitedExtractor, MapItems, MapItems, MapJSONExtractor, 
MapKeys, MapKeys, MapKeysInfo, MapKeysInfo, MapLookup, MapLookup, MapLookup, MapPut, MapRegexExtractor, MapSize, MapSize, MapToString, MapToString, MapValues, MapValues, MapValuesOrField, MapVersion, MapVersion, OrcExport, OrcExportMulti, PRC, ParquetExport, ParquetExportMulti, PickBestType, PickBestType, PickBestType, ROC, STV_AsGeoJSON, STV_AsGeoJSON, STV_AsGeoJSON, STV_Create_Index, STV_Create_Index, STV_Create_Index, STV_DWithin, STV_DWithin, STV_DWithin, STV_Describe_Index, STV_Drop_Index, STV_Export2Shapefile, STV_Extent, STV_Extent, STV_ForceLHR, STV_Geography, STV_Geography, STV_GeographyPoint, STV_Geometry, STV_Geometry, STV_GeometryPoint, STV_GeometryPoint, STV_GetExportShapefileDirectory, STV_Intersect, STV_Intersect, STV_Intersect, STV_Intersect, STV_Intersect, STV_Intersect, STV_Intersect, STV_Intersect, STV_IsValidReason, STV_IsValidReason, STV_IsValidReason, STV_LineStringPoint, STV_LineStringPoint, STV_LineStringPoint, STV_MemSize, STV_MemSize, STV_MemSize, STV_NN, STV_NN, STV_NN, STV_PolygonPoint, STV_PolygonPoint, STV_PolygonPoint, STV_Refresh_Index, STV_Refresh_Index, STV_Refresh_Index, STV_Rename_Index, STV_Reverse, STV_SetExportShapefileDirectory, STV_ShpCreateTable, STV_ShpParser, STV_ShpSource, ST_Area, ST_Area, ST_Area, ST_AsBinary, ST_AsBinary, ST_AsBinary, ST_AsText, ST_AsText, ST_AsText, ST_Boundary, ST_Buffer, ST_Centroid, ST_Contains, ST_Contains, ST_Contains, ST_ConvexHull, ST_Crosses, ST_Difference, ST_Disjoint, ST_Disjoint, ST_Disjoint, ST_Distance, ST_Distance, ST_Distance, ST_Envelope, ST_Equals, ST_Equals, ST_Equals, ST_GeoHash, ST_GeoHash, ST_GeoHash, ST_GeographyFromText, ST_GeographyFromWKB, ST_GeomFromGeoHash, ST_GeomFromGeoJSON, ST_GeomFromGeoJSON, ST_GeomFromText, ST_GeomFromText, ST_GeomFromWKB, ST_GeomFromWKB, ST_GeometryN, ST_GeometryN, ST_GeometryN, ST_GeometryType, ST_GeometryType, ST_GeometryType, ST_Intersection, ST_Intersects, ST_Intersects, ST_IsEmpty, ST_IsEmpty, ST_IsEmpty, ST_IsSimple, ST_IsSimple, ST_IsSimple, 
ST_IsValid, ST_IsValid, ST_IsValid, ST_Length, ST_Length, ST_Length, ST_NumGeometries, ST_NumGeometries, ST_NumGeometries, ST_NumPoints, ST_NumPoints, ST_NumPoints, ST_Overlaps, ST_PointFromGeoHash, ST_PointN, ST_PointN, ST_PointN, ST_Relate, ST_SRID, ST_SRID, ST_SRID, ST_Simplify, ST_SimplifyPreserveTopology, ST_SymDifference, ST_Touches, ST_Touches, ST_Touches, ST_Transform, ST_Union, ST_Union, ST_Within, ST_Within, ST_Within, ST_X, ST_X, ST_X, ST_XMax, ST_XMax, ST_XMax, ST_XMin, ST_XMin, ST_XMin, ST_Y, ST_Y, ST_Y, ST_YMax, ST_YMax, ST_YMax, ST_YMin, ST_YMin, ST_YMin, ST_intersects, SetMapKeys, Summarize_CatCol, Summarize_CatCol, Summarize_CatCol, Summarize_CatCol, Summarize_CatCol, Summarize_NumCol, VoltageSecureAccess, VoltageSecureAccess, VoltageSecureConfigure, VoltageSecureConfigureGlobal, VoltageSecureProtect, VoltageSecureProtect, VoltageSecureProtectAllKeys, VoltageSecureRefreshPolicy, VoltageSecureVersion, append_centers, apply_bisecting_kmeans, apply_iforest, apply_inverse_pca, apply_inverse_svd, apply_kmeans, apply_normalize, apply_one_hot_encoder, apply_pca, apply_svd, approximate_quantiles, ar_create_blobs, ar_final_newton, ar_save_model, ar_transition_newton, avg_all_columns_local, bisecting_kmeans_init_model, bk_apply_best_kmeans_results, bk_compute_totss_local, bk_finalize_model, bk_get_rows_in_active_cluster, bk_kmeans_compute_local_centers, bk_kmeans_compute_withinss, bk_kmeans_fast_random_init, bk_kmeans_slow_random_init, bk_kmeanspp_init_cur_cluster, bk_kmeanspp_reset_blob, bk_kmeanspp_select_new_centers, bk_kmeanspp_within_chunk_sum, bk_save_final_model, bk_write_new_cluster_level, blob_to_table, bufUdx, bufUdx, calc_pseudo_centers, calculate_alpha_linear, calculate_hessian_linear1, calculate_hessian_linear2, cleanup_kmeans_files, compute_and_save_global_center, compute_and_save_new_centers, compute_local_totss, compute_local_withinss, compute_new_local_centers, confusion_matrix, coordinate_descent_covariance, corr_matrix, count_rows_in_blob, 
create_aggregator_blob, error_rate, evaluate_naive_bayes_model, evaluate_reg_model, evaluate_svm_model, export_model_files, finalize_blob_resource_group, get_attr_minmax, get_attr_robust_zscore, get_attr_zscore, get_model_attribute, get_model_summary, get_robust_zscore_median, iforest_create_blobs, iforest_phase0_udf1, iforest_phase0_udf2, iforest_phase1_udf1, iforest_phase1_udf2, iforest_phase1_udf3, iforest_phase1_udf4, iforest_phase2_udf1, iforest_phase2_udf2, iforest_phase2_udf3, iforest_phase2_udf4, iforest_save_model, import_model_files, isOrContains, kmeansAddMetricsToModel, kmeans_init_blobs, kmeans_to_write_final_centers, lift_table, line_search_logistic1, line_search_logistic2, load_rows_into_blocks, map_factor, math_op, matrix_global_xtx, matrix_local_xtx, mode_finder, model_converter, naive_bayes_phase1, naive_bayes_phase1_blob, naive_bayes_phase2, pca_prep1_global, pca_prep1_local, pca_prep2, pmml_parser, predict_autoregressor, predict_linear_reg, predict_logistic_reg, predict_moving_average, predict_naive_bayes, predict_naive_bayes_classes, predict_pmml, predict_rf_classifier, predict_rf_classifier_classes, predict_rf_regressor, predict_svm_classifier, predict_svm_regressor, predict_xgb_classifier, predict_xgb_classifier_classes, predict_xgb_regressor, random_init, random_init_write, read_from_dfblob, read_map_factor, read_ptree, read_tree, reg_final_bfgs, reg_final_newton, reg_transition_bfgs, reg_transition_newton, reg_write_model, remove_blob, reverse_normalize, rf_blob, rf_clean, rf_phase0_udf1, rf_phase0_udf2, rf_phase1_udf1, rf_phase1_udf2, rf_phase1_udf3, rf_phase1_udf4, rf_phase2_udf1, rf_phase2_udf2, rf_phase2_udf3, rf_phase2_udf4, rf_predictor_importance, rf_save_model, rsquared, save_cv_result, save_pca_model, save_svd_model, save_svm_model, select_new_centers, store_minmax_model, store_one_hot_encoder_model, store_robust_zscore_model, store_zscore_model, table_to_blob, table_to_dfblob, update_and_return_sum_of_squared_distances, 
upgrade_model_format, writeInitialKmeansModelToDfs, xgb_create_blobs, xgb_phase0_udf1, xgb_phase0_udf2, xgb_phase1_udf1, xgb_phase1_udf2, xgb_phase1_udf3, xgb_phase2_udf1, xgb_phase2_udf2, xgb_phase2_udf3, xgb_prune, xgb_save_model, yule_walker, ", - "udx_language": "ComplexTypesLib -- Functions for Complex Types | DelimitedExportLib -- Delimited data export package | JsonExportLib -- Json data export package | MachineLearningLib -- Machine learning package | OrcExportLib -- Orc export package | ParquetExportLib -- Parquet export package | ApproximateLib -- Approximate package | FlexTableLib -- Flexible Tables Data Load and Query | KafkaLib -- Kafka streaming load and export | PlaceLib -- Geospatial package | VoltageSecureLib -- Voltage SecureData Connector | " + "projection_count": "12", + "udx_list": "APPROXIMATE_COUNT_DISTINCT_SYNOPSIS_INFO, APPROXIMATE_MEDIAN, APPROXIMATE_PERCENTILE, AcdDataToCount, AcdDataToLongSyn, AcdDataToSyn, AcdSynToCount, AcdSynToSyn, DelimitedExport, DelimitedExportMulti, EmptyMap, Explode, FAvroParser, FCefParser, FCsvParser, FDelimitedPairParser, FDelimitedParser, FIDXParser, FJSONParser, FRegexParser, FlexTokenizer, JsonExport, JsonExportMulti, KafkaAvroParser, KafkaCheckBrokers, KafkaExport, KafkaInsertDelimiters, KafkaInsertLengths, KafkaJsonParser, KafkaListManyTopics, KafkaListTopics, KafkaOffsets, KafkaParser, KafkaSource, KafkaTopicDetails, MSE, MapAggregate, MapAggregate, MapContainsKey, MapContainsKey, MapContainsValue, MapContainsValue, MapDelimitedExtractor, MapItems, MapItems, MapJSONExtractor, MapKeys, MapKeys, MapKeysInfo, MapKeysInfo, MapLookup, MapLookup, MapLookup, MapPut, MapRegexExtractor, MapSize, MapSize, MapToString, MapToString, MapValues, MapValues, MapValuesOrField, MapVersion, MapVersion, OrcExport, OrcExportMulti, PRC, ParquetExport, ParquetExportMulti, PickBestType, PickBestType, PickBestType, ROC, STV_AsGeoJSON, STV_AsGeoJSON, STV_AsGeoJSON, STV_Create_Index, STV_Create_Index, STV_Create_Index, 
STV_DWithin, STV_DWithin, STV_DWithin, STV_Describe_Index, STV_Drop_Index, STV_Export2Shapefile, STV_Extent, STV_Extent, STV_ForceLHR, STV_Geography, STV_Geography, STV_GeographyPoint, STV_Geometry, STV_Geometry, STV_GeometryPoint, STV_GeometryPoint, STV_GetExportShapefileDirectory, STV_Intersect, STV_Intersect, STV_Intersect, STV_Intersect, STV_Intersect, STV_Intersect, STV_Intersect, STV_Intersect, STV_IsValidReason, STV_IsValidReason, STV_IsValidReason, STV_LineStringPoint, STV_LineStringPoint, STV_LineStringPoint, STV_MemSize, STV_MemSize, STV_MemSize, STV_NN, STV_NN, STV_NN, STV_PolygonPoint, STV_PolygonPoint, STV_PolygonPoint, STV_Refresh_Index, STV_Refresh_Index, STV_Refresh_Index, STV_Rename_Index, STV_Reverse, STV_SetExportShapefileDirectory, STV_ShpCreateTable, STV_ShpParser, STV_ShpSource, ST_Area, ST_Area, ST_Area, ST_AsBinary, ST_AsBinary, ST_AsBinary, ST_AsText, ST_AsText, ST_AsText, ST_Boundary, ST_Buffer, ST_Centroid, ST_Contains, ST_Contains, ST_Contains, ST_ConvexHull, ST_Crosses, ST_Difference, ST_Disjoint, ST_Disjoint, ST_Disjoint, ST_Distance, ST_Distance, ST_Distance, ST_Envelope, ST_Equals, ST_Equals, ST_Equals, ST_GeoHash, ST_GeoHash, ST_GeoHash, ST_GeographyFromText, ST_GeographyFromWKB, ST_GeomFromGeoHash, ST_GeomFromGeoJSON, ST_GeomFromGeoJSON, ST_GeomFromText, ST_GeomFromText, ST_GeomFromWKB, ST_GeomFromWKB, ST_GeometryN, ST_GeometryN, ST_GeometryN, ST_GeometryType, ST_GeometryType, ST_GeometryType, ST_Intersection, ST_Intersects, ST_Intersects, ST_IsEmpty, ST_IsEmpty, ST_IsEmpty, ST_IsSimple, ST_IsSimple, ST_IsSimple, ST_IsValid, ST_IsValid, ST_IsValid, ST_Length, ST_Length, ST_Length, ST_NumGeometries, ST_NumGeometries, ST_NumGeometries, ST_NumPoints, ST_NumPoints, ST_NumPoints, ST_Overlaps, ST_PointFromGeoHash, ST_PointN, ST_PointN, ST_PointN, ST_Relate, ST_SRID, ST_SRID, ST_SRID, ST_Simplify, ST_SimplifyPreserveTopology, ST_SymDifference, ST_Touches, ST_Touches, ST_Touches, ST_Transform, ST_Union, ST_Union, ST_Within, ST_Within, 
ST_Within, ST_X, ST_X, ST_X, ST_XMax, ST_XMax, ST_XMax, ST_XMin, ST_XMin, ST_XMin, ST_Y, ST_Y, ST_Y, ST_YMax, ST_YMax, ST_YMax, ST_YMin, ST_YMin, ST_YMin, ST_intersects, SetMapKeys, Summarize_CatCol, Summarize_CatCol, Summarize_CatCol, Summarize_CatCol, Summarize_CatCol, Summarize_NumCol, Unnest, VoltageSecureAccess, VoltageSecureAccess, VoltageSecureConfigure, VoltageSecureConfigureGlobal, VoltageSecureProtect, VoltageSecureProtect, VoltageSecureProtectAllKeys, VoltageSecureRefreshPolicy, VoltageSecureVersion, append_centers, apply_bisecting_kmeans, apply_iforest, apply_inverse_pca, apply_inverse_svd, apply_kmeans, apply_kprototypes, apply_normalize, apply_one_hot_encoder, apply_pca, apply_svd, approximate_quantiles, ar_create_blobs, ar_final_newton, ar_save_model, ar_transition_newton, arima_bfgs, arima_line_search, arima_save_model, avg_all_columns_local, bisecting_kmeans_init_model, bk_apply_best_kmeans_results, bk_compute_totss_local, bk_finalize_model, bk_get_rows_in_active_cluster, bk_kmeans_compute_local_centers, bk_kmeans_compute_withinss, bk_kmeans_fast_random_init, bk_kmeans_slow_random_init, bk_kmeanspp_init_cur_cluster, bk_kmeanspp_reset_blob, bk_kmeanspp_select_new_centers, bk_kmeanspp_within_chunk_sum, bk_save_final_model, bk_write_new_cluster_level, blob_to_table, bufUdx, bufUdx, calc_pseudo_centers, calculate_alpha_linear, calculate_hessian_linear1, calculate_hessian_linear2, chi_squared, cleanup_kmeans_files, compute_and_save_global_center, compute_and_save_new_centers, compute_local_totss, compute_local_withinss, compute_new_local_centers, confusion_matrix, coordinate_descent_covariance, corr_matrix, count_rows_in_blob, create_aggregator_blob, error_rate, evaluate_naive_bayes_model, evaluate_reg_model, evaluate_svm_model, export_model_files, finalize_blob_resource_group, get_attr_minmax, get_attr_robust_zscore, get_attr_zscore, get_model_attribute, get_model_summary, get_robust_zscore_median, iforest_create_blobs, iforest_phase0_udf1, 
iforest_phase0_udf2, iforest_phase1_udf1, iforest_phase1_udf2, iforest_phase1_udf3, iforest_phase1_udf4, iforest_phase2_udf1, iforest_phase2_udf2, iforest_phase2_udf3, iforest_phase2_udf4, iforest_save_model, import_model_files, isOrContains, kmeansAddMetricsToModel, kmeans_init_blobs, kmeans_to_write_final_centers, lift_table, line_search_logistic1, line_search_logistic2, load_rows_into_blocks, map_factor, math_op, matrix_global_xtx, matrix_local_xtx, mode_finder, model_converter, naive_bayes_phase1, naive_bayes_phase1_blob, naive_bayes_phase2, pca_prep1_global, pca_prep1_local, pca_prep2, pmml_parser, predict_arima, predict_autoregressor, predict_linear_reg, predict_logistic_reg, predict_moving_average, predict_naive_bayes, predict_naive_bayes_classes, predict_pmml, predict_poisson_reg, predict_rf_classifier, predict_rf_classifier_classes, predict_rf_regressor, predict_svm_classifier, predict_svm_regressor, predict_xgb_classifier, predict_xgb_classifier_classes, predict_xgb_regressor, random_init, random_init_write, read_from_dfblob, read_map_factor, read_ptree, read_tree, reg_final_bfgs, reg_final_newton, reg_transition_bfgs, reg_transition_newton, reg_write_model, remove_blob, reverse_normalize, rf_blob, rf_clean, rf_phase0_udf1, rf_phase0_udf2, rf_phase1_udf1, rf_phase1_udf2, rf_phase1_udf3, rf_phase1_udf4, rf_phase2_udf1, rf_phase2_udf2, rf_phase2_udf3, rf_phase2_udf4, rf_predictor_importance, rf_save_model, rsquared, save_cv_result, save_pca_model, save_svd_model, save_svm_model, select_new_centers, store_minmax_model, store_one_hot_encoder_model, store_robust_zscore_model, store_zscore_model, table_to_blob, table_to_dfblob, tokenize, topk, update_and_return_sum_of_squared_distances, upgrade_model_format, writeInitialKmeansModelToDfs, xgb_create_blobs, xgb_phase0_udf1, xgb_phase0_udf2, xgb_phase1_udf1, xgb_phase1_udf2, xgb_phase1_udf3, xgb_phase2_udf1, xgb_phase2_udf2, xgb_phase2_udf3, xgb_predictor_importance, xgb_prune, xgb_save_model, yule_walker, ", + 
"udx_language": "ComplexTypesLib -- Functions for Complex Types | DelimitedExportLib -- Delimited data export package | JsonExportLib -- Json data export package | MachineLearningLib -- Machine learning package | OrcExportLib -- Orc export package | ParquetExportLib -- Parquet export package | ApproximateLib -- Approximate package | FlexTableLib -- Flexible Tables Data Load and Query | KafkaLib -- Kafka streaming load and export | PlaceLib -- Geospatial package | VoltageSecureLib -- Voltage SecureData Connector | TransformFunctions -- User-defined Python library | " }, "name": "public" } }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -121,7 +127,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -136,7 +143,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -153,7 +161,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -168,7 +177,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -188,7 +198,184 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.clicks,PROD)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": 
"urn:li:corpuser:dbadmin", + "type": "DATAOWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.clicks,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.clicks,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "create_time": "2023-10-13 11:23:05.308022+00:00", + "table_size": "0 KB" + }, + "name": "clicks", + "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. 
", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "public.clicks", + "platform": "urn:li:dataPlatform:vertica", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "user_id", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "page_id", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "click_time", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.TimeType": {} + } + }, + "nativeDataType": "TIMESTAMP_WITH_PRECISION()", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.clicks,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.clicks,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:343f520ad0fb3259b298736800bb1385", + 
"urn": "urn:li:container:343f520ad0fb3259b298736800bb1385" + }, + { + "id": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4", + "urn": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -212,7 +399,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -227,7 +415,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -243,7 +432,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 06:37:53.358215+00:00" + "create_time": "2023-10-13 11:22:37.846965+00:00", + "table_size": "2119 KB" }, "name": "customer_dimension", "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. 
", @@ -551,7 +741,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -568,7 +759,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -592,7 +784,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -616,7 +809,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -631,7 +825,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -647,7 +842,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 06:37:53.368954+00:00" + "create_time": "2023-10-13 11:22:37.857152+00:00", + "table_size": "138 KB" }, "name": "date_dimension", "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. 
", @@ -955,7 +1151,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -972,7 +1169,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -996,7 +1194,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1020,7 +1219,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1035,7 +1235,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1051,7 +1252,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 06:37:53.375896+00:00" + "create_time": "2023-10-13 11:22:37.863745+00:00", + "table_size": "327 KB" }, "name": "employee_dimension", "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. 
", @@ -1320,7 +1522,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1337,7 +1540,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1361,7 +1565,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1385,7 +1590,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1400,7 +1606,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1416,7 +1623,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 06:37:53.385843+00:00" + "create_time": "2023-10-13 11:22:37.873181+00:00", + "table_size": "2564 KB" }, "name": "inventory_fact", "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. 
", @@ -1529,7 +1737,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1546,7 +1755,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1570,12 +1780,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.phrases,PROD)", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { @@ -1594,12 +1805,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.phrases,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -1609,13 +1821,14 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.phrases,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -1625,16 +1838,17 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 
06:37:53.362016+00:00" + "create_time": "2023-10-13 11:23:05.408507+00:00", + "table_size": "0 KB" }, - "name": "product_dimension", + "name": "phrases", "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. ", "tags": [] } }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "public.product_dimension", + "schemaName": "public.phrases", "platform": "urn:li:dataPlatform:vertica", "version": 0, "created": { @@ -1653,33 +1867,7 @@ }, "fields": [ { - "fieldPath": "product_key", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": true - }, - { - "fieldPath": "product_version", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "product_description", + "fieldPath": "phrase", "nullable": true, "description": "", "type": { @@ -1690,76 +1878,252 @@ "nativeDataType": "VARCHAR(length=128)", "recursive": false, "isPartOfKey": false - }, - { - "fieldPath": "sku_number", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "CHAR(length=32)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "category_description", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "CHAR(length=32)", - "recursive": false, - "isPartOfKey": false - }, - { - 
"fieldPath": "department_description", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "CHAR(length=32)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "package_type_description", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "CHAR(length=32)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "package_size", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "CHAR(length=32)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "fat_content", - "nullable": true, - "description": "", + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.phrases,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.phrases,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:343f520ad0fb3259b298736800bb1385", + "urn": "urn:li:container:343f520ad0fb3259b298736800bb1385" + }, + { + "id": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4", + "urn": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": 
"no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension,PROD)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:dbadmin", + "type": "DATAOWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "create_time": "2023-10-13 11:22:37.850505+00:00", + "table_size": "19 KB" + }, + "name": "product_dimension", + "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. 
", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "public.product_dimension", + "platform": "urn:li:dataPlatform:vertica", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "product_key", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": true + }, + { + "fieldPath": "product_version", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "product_description", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=128)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "sku_number", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "CHAR(length=32)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "category_description", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "CHAR(length=32)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "department_description", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "CHAR(length=32)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": 
"package_type_description", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "CHAR(length=32)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "package_size", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "CHAR(length=32)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "fat_content", + "nullable": true, + "description": "", "type": { "type": { "com.linkedin.pegasus2avro.schema.NumberType": {} @@ -1933,7 +2297,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1950,7 +2315,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1974,7 +2340,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1998,7 +2365,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2013,7 +2381,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2029,7 +2398,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 06:37:53.365453+00:00" + "create_time": "2023-10-13 11:22:37.853878+00:00", + "table_size": "3 KB" }, "name": "promotion_dimension", "description": "References the properties of a native table in Vertica. 
Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. ", @@ -2220,7 +2590,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2237,7 +2608,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2261,12 +2633,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.readings,PROD)", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { @@ -2285,12 +2658,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.readings,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -2300,13 +2674,14 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": 
"urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.readings,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -2316,16 +2691,17 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 06:37:53.379273+00:00" + "create_time": "2023-10-13 11:23:05.296044+00:00", + "table_size": "0 KB" }, - "name": "shipping_dimension", + "name": "readings", "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. ", "tags": [] } }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "public.shipping_dimension", + "schemaName": "public.readings", "platform": "urn:li:dataPlatform:vertica", "version": 0, "created": { @@ -2344,7 +2720,7 @@ }, "fields": [ { - "fieldPath": "shipping_key", + "fieldPath": "meter_id", "nullable": true, "description": "", "type": { @@ -2354,39 +2730,215 @@ }, "nativeDataType": "INTEGER()", "recursive": false, - "isPartOfKey": true + "isPartOfKey": false }, { - "fieldPath": "ship_type", + "fieldPath": "reading_date", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + "com.linkedin.pegasus2avro.schema.TimeType": {} } }, - "nativeDataType": "CHAR(length=30)", + "nativeDataType": "TIMESTAMP_WITH_PRECISION()", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "ship_mode", + "fieldPath": "reading_value", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "CHAR(length=10)", + 
"nativeDataType": "FLOAT()", "recursive": false, "isPartOfKey": false - }, - { - "fieldPath": "ship_carrier", - "nullable": true, - "description": "", - "type": { + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.readings,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.readings,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:343f520ad0fb3259b298736800bb1385", + "urn": "urn:li:container:343f520ad0fb3259b298736800bb1385" + }, + { + "id": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4", + "urn": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension,PROD)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:dbadmin", + "type": "DATAOWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension,PROD)", + "changeType": "UPSERT", 
+ "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "create_time": "2023-10-13 11:22:37.867119+00:00", + "table_size": "1 KB" + }, + "name": "shipping_dimension", + "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. 
", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "public.shipping_dimension", + "platform": "urn:li:dataPlatform:vertica", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "shipping_key", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": true + }, + { + "fieldPath": "ship_type", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "CHAR(length=30)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "ship_mode", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "CHAR(length=10)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "ship_carrier", + "nullable": true, + "description": "", + "type": { "type": { "com.linkedin.pegasus2avro.schema.StringType": {} } @@ -2403,7 +2955,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2420,7 +2973,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2444,7 +2998,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2468,7 +3023,8 @@ }, "systemMetadata": 
{ "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2483,7 +3039,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2499,7 +3056,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 06:37:53.372409+00:00" + "create_time": "2023-10-13 11:22:37.860541+00:00", + "table_size": "1 KB" }, "name": "vendor_dimension", "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. ", @@ -2638,7 +3196,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2655,7 +3214,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2679,7 +3239,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2703,7 +3264,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2718,7 +3280,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ 
-2734,7 +3297,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 06:38:20.045598+00:00" + "create_time": "2023-10-13 11:23:04.970568+00:00", + "table_size": "0 KB" }, "name": "vmart_load_success", "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. ", @@ -2782,7 +3346,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2799,7 +3364,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2823,7 +3389,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2847,7 +3414,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2862,7 +3430,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2878,7 +3447,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 06:37:53.382549+00:00" + "create_time": "2023-10-13 11:22:37.870169+00:00", + "table_size": "2 KB" }, "name": "warehouse_dimension", "description": "References the properties of a native table in Vertica. 
Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. ", @@ -2991,7 +3561,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3008,7 +3579,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3032,12 +3604,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.date_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.sampleview,PROD)", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { @@ -3056,12 +3629,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.date_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.sampleview,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -3071,13 +3645,14 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": 
"urn:li:dataset:(urn:li:dataPlatform:vertica,public.date_dimension_super,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.sampleview,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -3087,23 +3662,19 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "ROS_Count": "1", - "Projection_Type": "is_super_projection", - "is_segmented": "True", - "Segmentation_key": "hash(date_dimension.date_key)", - "projection_size": "138 KB", - "Partition_Key": "Not Available", - "Partition_Size": "0", - "Projection_Cached": "False" + "create_time": "2023-10-13 11:23:05.319029+00:00", + "table_size": "0 KB", + "view_definition": "SELECT sum(customer_dimension.annual_income) AS SUM, customer_dimension.customer_state FROM public.customer_dimension WHERE (customer_dimension.customer_key IN (SELECT store_sales_fact.customer_key FROM store.store_sales_fact)) GROUP BY customer_dimension.customer_state ORDER BY customer_dimension.customer_state", + "is_view": "True" }, - "name": "date_dimension_super", - "description": "Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution For more info on projections and corresponding properties check out the Vertica Docs: https://www.vertica.com/docs", + "name": "sampleview", + "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. 
", "tags": [] } }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "public.date_dimension_super", + "schemaName": "public.sampleview", "platform": "urn:li:dataPlatform:vertica", "version": 0, "created": { @@ -3122,7 +3693,7 @@ }, "fields": [ { - "fieldPath": "date_key", + "fieldPath": "SUM", "nullable": true, "description": "", "type": { @@ -3135,33 +3706,7 @@ "isPartOfKey": false }, { - "fieldPath": "date", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.DateType": {} - } - }, - "nativeDataType": "DATE()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "full_date_description", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=18)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "day_of_week", + "fieldPath": "customer_state", "nullable": true, "description": "", "type": { @@ -3169,228 +3714,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "VARCHAR(length=9)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "day_number_in_calendar_month", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "day_number_in_calendar_year", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "day_number_in_fiscal_month", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": 
"day_number_in_fiscal_year", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "last_day_in_week_indicator", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "last_day_in_month_indicator", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "calendar_week_number_in_year", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "calendar_month_name", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=9)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "calendar_month_number_in_year", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "calendar_year_month", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "CHAR(length=7)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "calendar_quarter", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - 
"recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "calendar_year_quarter", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "CHAR(length=7)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "calendar_half_year", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "calendar_year", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "holiday_indicator", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=10)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "weekday_indicator", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "CHAR(length=7)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "selling_season", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=32)", + "nativeDataType": "CHAR(length=2)", "recursive": false, "isPartOfKey": false } @@ -3402,29 +3726,49 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.date_dimension_super,PROD)", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:vertica,public.sampleview,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { "json": { "typeNames": [ - "Projections" + "View" ] } }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.date_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.sampleview,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "SELECT sum(customer_dimension.annual_income) AS SUM, customer_dimension.customer_state FROM public.customer_dimension WHERE (customer_dimension.customer_key IN (SELECT store_sales_fact.customer_key FROM store.store_sales_fact)) GROUP BY customer_dimension.customer_state ORDER BY customer_dimension.customer_state", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.sampleview,PROD)", "changeType": "UPSERT", "aspectName": "upstreamLineage", "aspect": { @@ -3435,7 +3779,15 @@ "time": 0, "actor": "urn:li:corpuser:unknown" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.date_dimension,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.customer_dimension,PROD)", + "type": "TRANSFORMED" + }, + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,store.store_sales_fact,PROD)", "type": "TRANSFORMED" } ] @@ -3443,12 +3795,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": 
"vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.date_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.sampleview,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -3467,12 +3820,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.date_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { @@ -3491,12 +3845,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.date_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -3506,13 +3861,14 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension_super,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.date_dimension_super,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -3524,21 +3880,21 @@ "customProperties": { "ROS_Count": "1", "Projection_Type": "is_super_projection", - "is_segmented": "True", - 
"Segmentation_key": "hash(product_dimension.product_key, product_dimension.product_version)", - "projection_size": "19 KB", + "Is_Segmented": "True", + "Segmentation_key": "hash(date_dimension.date_key)", + "Projection_size": "138 KB", "Partition_Key": "Not Available", - "Partition_Size": "0", + "Number_Of_Partitions": "0", "Projection_Cached": "False" }, - "name": "product_dimension_super", + "name": "date_dimension_super", "description": "Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution For more info on projections and corresponding properties check out the Vertica Docs: https://www.vertica.com/docs", "tags": [] } }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "public.product_dimension_super", + "schemaName": "public.date_dimension_super", "platform": "urn:li:dataPlatform:vertica", "version": 0, "created": { @@ -3557,7 +3913,7 @@ }, "fields": [ { - "fieldPath": "product_key", + "fieldPath": "date_key", "nullable": true, "description": "", "type": { @@ -3570,20 +3926,20 @@ "isPartOfKey": false }, { - "fieldPath": "product_version", + "fieldPath": "date", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} + "com.linkedin.pegasus2avro.schema.DateType": {} } }, - "nativeDataType": "INTEGER()", + "nativeDataType": "DATE()", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "product_description", + "fieldPath": "full_date_description", "nullable": true, "description": "", "type": { @@ -3591,12 +3947,12 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "VARCHAR(length=128)", + "nativeDataType": "VARCHAR(length=18)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "sku_number", + "fieldPath": "day_of_week", "nullable": true, "description": "", "type": { @@ -3604,64 +3960,64 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } 
}, - "nativeDataType": "CHAR(length=32)", + "nativeDataType": "VARCHAR(length=9)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "category_description", + "fieldPath": "day_number_in_calendar_month", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "CHAR(length=32)", + "nativeDataType": "INTEGER()", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "department_description", + "fieldPath": "day_number_in_calendar_year", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "CHAR(length=32)", + "nativeDataType": "INTEGER()", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "package_type_description", + "fieldPath": "day_number_in_fiscal_month", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "CHAR(length=32)", + "nativeDataType": "INTEGER()", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "package_size", + "fieldPath": "day_number_in_fiscal_year", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "CHAR(length=32)", + "nativeDataType": "INTEGER()", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "fat_content", + "fieldPath": "last_day_in_week_indicator", "nullable": true, "description": "", "type": { @@ -3674,20 +4030,20 @@ "isPartOfKey": false }, { - "fieldPath": "diet_type", + "fieldPath": "last_day_in_month_indicator", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + 
"com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "CHAR(length=32)", + "nativeDataType": "INTEGER()", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "weight", + "fieldPath": "calendar_week_number_in_year", "nullable": true, "description": "", "type": { @@ -3700,7 +4056,7 @@ "isPartOfKey": false }, { - "fieldPath": "weight_units_of_measure", + "fieldPath": "calendar_month_name", "nullable": true, "description": "", "type": { @@ -3708,12 +4064,12 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "CHAR(length=32)", + "nativeDataType": "VARCHAR(length=9)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "shelf_width", + "fieldPath": "calendar_month_number_in_year", "nullable": true, "description": "", "type": { @@ -3726,20 +4082,20 @@ "isPartOfKey": false }, { - "fieldPath": "shelf_height", + "fieldPath": "calendar_year_month", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} + "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "INTEGER()", + "nativeDataType": "CHAR(length=7)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "shelf_depth", + "fieldPath": "calendar_quarter", "nullable": true, "description": "", "type": { @@ -3752,20 +4108,20 @@ "isPartOfKey": false }, { - "fieldPath": "product_price", + "fieldPath": "calendar_year_quarter", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} + "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "INTEGER()", + "nativeDataType": "CHAR(length=7)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "product_cost", + "fieldPath": "calendar_half_year", "nullable": true, "description": "", "type": { @@ -3778,7 +4134,7 @@ "isPartOfKey": false }, { - "fieldPath": "lowest_competitor_price", + "fieldPath": "calendar_year", "nullable": true, "description": "", "type": { 
@@ -3791,41 +4147,41 @@ "isPartOfKey": false }, { - "fieldPath": "highest_competitor_price", + "fieldPath": "holiday_indicator", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} + "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "INTEGER()", + "nativeDataType": "VARCHAR(length=10)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "average_competitor_price", + "fieldPath": "weekday_indicator", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} + "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "INTEGER()", + "nativeDataType": "CHAR(length=7)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "discontinued_flag", + "fieldPath": "selling_season", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} + "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "INTEGER()", + "nativeDataType": "VARCHAR(length=32)", "recursive": false, "isPartOfKey": false } @@ -3837,12 +4193,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.date_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -3854,12 +4211,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension_super,PROD)", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:vertica,public.date_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "upstreamLineage", "aspect": { @@ -3870,7 +4228,7 @@ "time": 0, "actor": "urn:li:corpuser:unknown" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.date_dimension,PROD)", "type": "TRANSFORMED" } ] @@ -3878,12 +4236,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.date_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -3902,12 +4261,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.promotion_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { @@ -3926,12 +4286,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.promotion_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -3941,13 +4302,14 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": 
"vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.promotion_dimension_super,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension_super,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -3959,21 +4321,21 @@ "customProperties": { "ROS_Count": "1", "Projection_Type": "is_super_projection", - "is_segmented": "True", - "Segmentation_key": "hash(promotion_dimension.promotion_key)", - "projection_size": "3 KB", + "Is_Segmented": "True", + "Segmentation_key": "hash(product_dimension.product_key, product_dimension.product_version)", + "Projection_size": "19 KB", "Partition_Key": "Not Available", - "Partition_Size": "0", + "Number_Of_Partitions": "0", "Projection_Cached": "False" }, - "name": "promotion_dimension_super", + "name": "product_dimension_super", "description": "Vertica physically stores table data in projections, which are collections of table columns. 
Projections store data in a format that optimizes query execution For more info on projections and corresponding properties check out the Vertica Docs: https://www.vertica.com/docs", "tags": [] } }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "public.promotion_dimension_super", + "schemaName": "public.product_dimension_super", "platform": "urn:li:dataPlatform:vertica", "version": 0, "created": { @@ -3992,7 +4354,7 @@ }, "fields": [ { - "fieldPath": "promotion_key", + "fieldPath": "product_key", "nullable": true, "description": "", "type": { @@ -4005,7 +4367,20 @@ "isPartOfKey": false }, { - "fieldPath": "promotion_name", + "fieldPath": "product_version", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "product_description", "nullable": true, "description": "", "type": { @@ -4018,7 +4393,7 @@ "isPartOfKey": false }, { - "fieldPath": "price_reduction_type", + "fieldPath": "sku_number", "nullable": true, "description": "", "type": { @@ -4026,12 +4401,12 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "VARCHAR(length=32)", + "nativeDataType": "CHAR(length=32)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "promotion_media_type", + "fieldPath": "category_description", "nullable": true, "description": "", "type": { @@ -4039,12 +4414,12 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "VARCHAR(length=32)", + "nativeDataType": "CHAR(length=32)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "ad_type", + "fieldPath": "department_description", "nullable": true, "description": "", "type": { @@ -4052,12 +4427,12 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "VARCHAR(length=32)", + "nativeDataType": "CHAR(length=32)", "recursive": false, "isPartOfKey": 
false }, { - "fieldPath": "display_type", + "fieldPath": "package_type_description", "nullable": true, "description": "", "type": { @@ -4065,12 +4440,12 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "VARCHAR(length=32)", + "nativeDataType": "CHAR(length=32)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "coupon_type", + "fieldPath": "package_size", "nullable": true, "description": "", "type": { @@ -4078,12 +4453,25 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "VARCHAR(length=32)", + "nativeDataType": "CHAR(length=32)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "ad_media_name", + "fieldPath": "fat_content", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "diet_type", "nullable": true, "description": "", "type": { @@ -4091,12 +4479,25 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "VARCHAR(length=32)", + "nativeDataType": "CHAR(length=32)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "display_provider", + "fieldPath": "weight", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "weight_units_of_measure", "nullable": true, "description": "", "type": { @@ -4104,12 +4505,12 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "VARCHAR(length=128)", + "nativeDataType": "CHAR(length=32)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "promotion_cost", + "fieldPath": "shelf_width", "nullable": true, "description": "", "type": { @@ -4122,28 +4523,106 @@ "isPartOfKey": false }, { - "fieldPath": "promotion_begin_date", + "fieldPath": 
"shelf_height", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.DateType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "DATE()", + "nativeDataType": "INTEGER()", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "promotion_end_date", + "fieldPath": "shelf_depth", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.DateType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "DATE()", + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "product_price", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "product_cost", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "lowest_competitor_price", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "highest_competitor_price", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "average_competitor_price", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "discontinued_flag", + "nullable": true, + "description": "", + "type": { + "type": { + 
"com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", "recursive": false, "isPartOfKey": false } @@ -4155,12 +4634,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.promotion_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -4172,12 +4652,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.promotion_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "upstreamLineage", "aspect": { @@ -4188,7 +4669,7 @@ "time": 0, "actor": "urn:li:corpuser:unknown" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.promotion_dimension,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension,PROD)", "type": "TRANSFORMED" } ] @@ -4196,12 +4677,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.promotion_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -4220,12 +4702,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": 
"vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.vendor_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.promotion_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { @@ -4244,12 +4727,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.vendor_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.promotion_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -4259,13 +4743,14 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.vendor_dimension_super,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.promotion_dimension_super,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -4277,21 +4762,21 @@ "customProperties": { "ROS_Count": "1", "Projection_Type": "is_super_projection", - "is_segmented": "True", - "Segmentation_key": "hash(vendor_dimension.vendor_key)", - "projection_size": "1 KB", + "Is_Segmented": "True", + "Segmentation_key": "hash(promotion_dimension.promotion_key)", + "Projection_size": "3 KB", "Partition_Key": "Not Available", - "Partition_Size": "0", + "Number_Of_Partitions": "0", "Projection_Cached": "False" }, - "name": "vendor_dimension_super", + "name": "promotion_dimension_super", "description": "Vertica 
physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution For more info on projections and corresponding properties check out the Vertica Docs: https://www.vertica.com/docs", "tags": [] } }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "public.vendor_dimension_super", + "schemaName": "public.promotion_dimension_super", "platform": "urn:li:dataPlatform:vertica", "version": 0, "created": { @@ -4310,7 +4795,7 @@ }, "fields": [ { - "fieldPath": "vendor_key", + "fieldPath": "promotion_key", "nullable": true, "description": "", "type": { @@ -4323,7 +4808,7 @@ "isPartOfKey": false }, { - "fieldPath": "vendor_name", + "fieldPath": "promotion_name", "nullable": true, "description": "", "type": { @@ -4331,12 +4816,12 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "VARCHAR(length=64)", + "nativeDataType": "VARCHAR(length=128)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "vendor_address", + "fieldPath": "price_reduction_type", "nullable": true, "description": "", "type": { @@ -4344,12 +4829,12 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "VARCHAR(length=64)", + "nativeDataType": "VARCHAR(length=32)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "vendor_city", + "fieldPath": "promotion_media_type", "nullable": true, "description": "", "type": { @@ -4357,12 +4842,12 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "VARCHAR(length=64)", + "nativeDataType": "VARCHAR(length=32)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "vendor_state", + "fieldPath": "ad_type", "nullable": true, "description": "", "type": { @@ -4370,12 +4855,12 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "CHAR(length=2)", + "nativeDataType": "VARCHAR(length=32)", "recursive": false, "isPartOfKey": false }, { - 
"fieldPath": "vendor_region", + "fieldPath": "display_type", "nullable": true, "description": "", "type": { @@ -4388,7 +4873,46 @@ "isPartOfKey": false }, { - "fieldPath": "deal_size", + "fieldPath": "coupon_type", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=32)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "ad_media_name", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=32)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "display_provider", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=128)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "promotion_cost", "nullable": true, "description": "", "type": { @@ -4401,7 +4925,20 @@ "isPartOfKey": false }, { - "fieldPath": "last_deal_update", + "fieldPath": "promotion_begin_date", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.DateType": {} + } + }, + "nativeDataType": "DATE()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "promotion_end_date", "nullable": true, "description": "", "type": { @@ -4421,12 +4958,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.vendor_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.promotion_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -4438,12 +4976,13 @@ }, "systemMetadata": { "lastObserved": 
1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.vendor_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.promotion_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "upstreamLineage", "aspect": { @@ -4454,7 +4993,7 @@ "time": 0, "actor": "urn:li:corpuser:unknown" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.vendor_dimension,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.promotion_dimension,PROD)", "type": "TRANSFORMED" } ] @@ -4462,12 +5001,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.vendor_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.promotion_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -4486,12 +5026,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.customer_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.vendor_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { @@ -4510,12 +5051,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:vertica,public.customer_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.vendor_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -4525,13 +5067,14 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.customer_dimension_super,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.vendor_dimension_super,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -4543,21 +5086,21 @@ "customProperties": { "ROS_Count": "1", "Projection_Type": "is_super_projection", - "is_segmented": "True", - "Segmentation_key": "hash(customer_dimension.customer_key)", - "projection_size": "2119 KB", + "Is_Segmented": "True", + "Segmentation_key": "hash(vendor_dimension.vendor_key)", + "Projection_size": "1 KB", "Partition_Key": "Not Available", - "Partition_Size": "0", + "Number_Of_Partitions": "0", "Projection_Cached": "False" }, - "name": "customer_dimension_super", + "name": "vendor_dimension_super", "description": "Vertica physically stores table data in projections, which are collections of table columns. 
Projections store data in a format that optimizes query execution For more info on projections and corresponding properties check out the Vertica Docs: https://www.vertica.com/docs", "tags": [] } }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "public.customer_dimension_super", + "schemaName": "public.vendor_dimension_super", "platform": "urn:li:dataPlatform:vertica", "version": 0, "created": { @@ -4576,7 +5119,7 @@ }, "fields": [ { - "fieldPath": "customer_key", + "fieldPath": "vendor_key", "nullable": true, "description": "", "type": { @@ -4589,46 +5132,7 @@ "isPartOfKey": false }, { - "fieldPath": "customer_type", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=16)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "customer_name", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=256)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "customer_gender", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=8)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "title", + "fieldPath": "vendor_name", "nullable": true, "description": "", "type": { @@ -4636,25 +5140,12 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "VARCHAR(length=8)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "household_id", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", + "nativeDataType": "VARCHAR(length=64)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "customer_address", + "fieldPath": 
"vendor_address", "nullable": true, "description": "", "type": { @@ -4662,12 +5153,12 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "VARCHAR(length=256)", + "nativeDataType": "VARCHAR(length=64)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "customer_city", + "fieldPath": "vendor_city", "nullable": true, "description": "", "type": { @@ -4680,7 +5171,7 @@ "isPartOfKey": false }, { - "fieldPath": "customer_state", + "fieldPath": "vendor_state", "nullable": true, "description": "", "type": { @@ -4693,20 +5184,7 @@ "isPartOfKey": false }, { - "fieldPath": "customer_region", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=64)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "marital_status", + "fieldPath": "vendor_region", "nullable": true, "description": "", "type": { @@ -4719,72 +5197,7 @@ "isPartOfKey": false }, { - "fieldPath": "customer_age", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "number_of_children", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "annual_income", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "occupation", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=64)", - "recursive": false, - "isPartOfKey": false - }, - { 
- "fieldPath": "largest_bill_amount", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "store_membership_card", + "fieldPath": "deal_size", "nullable": true, "description": "", "type": { @@ -4797,46 +5210,7 @@ "isPartOfKey": false }, { - "fieldPath": "customer_since", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.DateType": {} - } - }, - "nativeDataType": "DATE()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "deal_stage", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=32)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "deal_size", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "last_deal_update", + "fieldPath": "last_deal_update", "nullable": true, "description": "", "type": { @@ -4856,12 +5230,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.customer_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.vendor_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -4873,12 +5248,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:vertica,public.customer_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.vendor_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "upstreamLineage", "aspect": { @@ -4889,7 +5265,7 @@ "time": 0, "actor": "urn:li:corpuser:unknown" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.customer_dimension,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.vendor_dimension,PROD)", "type": "TRANSFORMED" } ] @@ -4897,12 +5273,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.customer_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.vendor_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -4921,12 +5298,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.employee_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.customer_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { @@ -4945,12 +5323,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.employee_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.customer_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "container", 
"aspect": { @@ -4960,13 +5339,14 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.employee_dimension_super,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.customer_dimension_super,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -4978,21 +5358,21 @@ "customProperties": { "ROS_Count": "1", "Projection_Type": "is_super_projection", - "is_segmented": "True", - "Segmentation_key": "hash(employee_dimension.employee_key)", - "projection_size": "327 KB", + "Is_Segmented": "True", + "Segmentation_key": "hash(customer_dimension.customer_key)", + "Projection_size": "2119 KB", "Partition_Key": "Not Available", - "Partition_Size": "0", + "Number_Of_Partitions": "0", "Projection_Cached": "False" }, - "name": "employee_dimension_super", + "name": "customer_dimension_super", "description": "Vertica physically stores table data in projections, which are collections of table columns. 
Projections store data in a format that optimizes query execution For more info on projections and corresponding properties check out the Vertica Docs: https://www.vertica.com/docs", "tags": [] } }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "public.employee_dimension_super", + "schemaName": "public.customer_dimension_super", "platform": "urn:li:dataPlatform:vertica", "version": 0, "created": { @@ -5011,7 +5391,7 @@ }, "fields": [ { - "fieldPath": "employee_key", + "fieldPath": "customer_key", "nullable": true, "description": "", "type": { @@ -5024,7 +5404,33 @@ "isPartOfKey": false }, { - "fieldPath": "employee_gender", + "fieldPath": "customer_type", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=16)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "customer_name", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=256)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "customer_gender", "nullable": true, "description": "", "type": { @@ -5037,7 +5443,7 @@ "isPartOfKey": false }, { - "fieldPath": "courtesy_title", + "fieldPath": "title", "nullable": true, "description": "", "type": { @@ -5050,7 +5456,33 @@ "isPartOfKey": false }, { - "fieldPath": "employee_first_name", + "fieldPath": "household_id", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "customer_address", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=256)", + "recursive": false, + "isPartOfKey": false + }, + { + 
"fieldPath": "customer_city", "nullable": true, "description": "", "type": { @@ -5063,7 +5495,7 @@ "isPartOfKey": false }, { - "fieldPath": "employee_middle_initial", + "fieldPath": "customer_state", "nullable": true, "description": "", "type": { @@ -5071,12 +5503,12 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "VARCHAR(length=8)", + "nativeDataType": "CHAR(length=2)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "employee_last_name", + "fieldPath": "customer_region", "nullable": true, "description": "", "type": { @@ -5089,7 +5521,20 @@ "isPartOfKey": false }, { - "fieldPath": "employee_age", + "fieldPath": "marital_status", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=32)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "customer_age", "nullable": true, "description": "", "type": { @@ -5102,33 +5547,33 @@ "isPartOfKey": false }, { - "fieldPath": "hire_date", + "fieldPath": "number_of_children", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.DateType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "DATE()", + "nativeDataType": "INTEGER()", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "employee_street_address", + "fieldPath": "annual_income", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "VARCHAR(length=256)", + "nativeDataType": "INTEGER()", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "employee_city", + "fieldPath": "occupation", "nullable": true, "description": "", "type": { @@ -5141,20 +5586,46 @@ "isPartOfKey": false }, { - "fieldPath": "employee_state", + "fieldPath": "largest_bill_amount", "nullable": true, "description": "", 
"type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "CHAR(length=2)", + "nativeDataType": "INTEGER()", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "employee_region", + "fieldPath": "store_membership_card", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "customer_since", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.DateType": {} + } + }, + "nativeDataType": "DATE()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "deal_stage", "nullable": true, "description": "", "type": { @@ -5162,25 +5633,1087 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "CHAR(length=32)", + "nativeDataType": "VARCHAR(length=32)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "job_title", + "fieldPath": "deal_size", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "last_deal_update", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.DateType": {} + } + }, + "nativeDataType": "DATE()", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.customer_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Projections" + ] + } + }, + 
"systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.customer_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.customer_dimension,PROD)", + "type": "TRANSFORMED" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.customer_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:343f520ad0fb3259b298736800bb1385", + "urn": "urn:li:container:343f520ad0fb3259b298736800bb1385" + }, + { + "id": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4", + "urn": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.employee_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:dbadmin", + "type": "DATAOWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:vertica,public.employee_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.employee_dimension_super,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "ROS_Count": "1", + "Projection_Type": "is_super_projection", + "Is_Segmented": "True", + "Segmentation_key": "hash(employee_dimension.employee_key)", + "Projection_size": "327 KB", + "Partition_Key": "Not Available", + "Number_Of_Partitions": "0", + "Projection_Cached": "False" + }, + "name": "employee_dimension_super", + "description": "Vertica physically stores table data in projections, which are collections of table columns. 
Projections store data in a format that optimizes query execution For more info on projections and corresponding properties check out the Vertica Docs: https://www.vertica.com/docs", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "public.employee_dimension_super", + "platform": "urn:li:dataPlatform:vertica", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "employee_key", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "employee_gender", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=8)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "courtesy_title", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=8)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "employee_first_name", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=64)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "employee_middle_initial", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=8)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "employee_last_name", + "nullable": true, + 
"description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=64)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "employee_age", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "hire_date", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.DateType": {} + } + }, + "nativeDataType": "DATE()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "employee_street_address", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=256)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "employee_city", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=64)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "employee_state", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "CHAR(length=2)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "employee_region", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "CHAR(length=32)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "job_title", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=64)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "reports_to", + "nullable": 
true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "salaried_flag", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "annual_salary", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "hourly_rate", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "FLOAT()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "vacation_days", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.employee_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Projections" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.employee_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 
0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.employee_dimension,PROD)", + "type": "TRANSFORMED" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.employee_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:343f520ad0fb3259b298736800bb1385", + "urn": "urn:li:container:343f520ad0fb3259b298736800bb1385" + }, + { + "id": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4", + "urn": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.warehouse_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:dbadmin", + "type": "DATAOWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.warehouse_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + 
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.warehouse_dimension_super,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "ROS_Count": "1", + "Projection_Type": "is_super_projection", + "Is_Segmented": "True", + "Segmentation_key": "hash(warehouse_dimension.warehouse_key)", + "Projection_size": "2 KB", + "Partition_Key": "Not Available", + "Number_Of_Partitions": "0", + "Projection_Cached": "False" + }, + "name": "warehouse_dimension_super", + "description": "Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution For more info on projections and corresponding properties check out the Vertica Docs: https://www.vertica.com/docs", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "public.warehouse_dimension_super", + "platform": "urn:li:dataPlatform:vertica", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "warehouse_key", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "warehouse_name", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=20)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "warehouse_address", + "nullable": true, + 
"description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=256)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "warehouse_city", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=60)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "warehouse_state", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "CHAR(length=2)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "warehouse_region", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=32)", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.warehouse_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Projections" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.warehouse_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.warehouse_dimension,PROD)", + "type": "TRANSFORMED" + } + ] + } + }, + "systemMetadata": { + 
"lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.warehouse_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:343f520ad0fb3259b298736800bb1385", + "urn": "urn:li:container:343f520ad0fb3259b298736800bb1385" + }, + { + "id": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4", + "urn": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:dbadmin", + "type": "DATAOWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension_super,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + 
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "ROS_Count": "1", + "Projection_Type": "is_super_projection", + "Is_Segmented": "True", + "Segmentation_key": "hash(shipping_dimension.shipping_key)", + "Projection_size": "1 KB", + "Partition_Key": "Not Available", + "Number_Of_Partitions": "0", + "Projection_Cached": "False" + }, + "name": "shipping_dimension_super", + "description": "Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution For more info on projections and corresponding properties check out the Vertica Docs: https://www.vertica.com/docs", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "public.shipping_dimension_super", + "platform": "urn:li:dataPlatform:vertica", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "shipping_key", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "ship_type", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "CHAR(length=30)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "ship_mode", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "CHAR(length=10)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "ship_carrier", + "nullable": true, + "description": "", + "type": { + "type": 
{ + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "CHAR(length=20)", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Projections" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension,PROD)", + "type": "TRANSFORMED" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:343f520ad0fb3259b298736800bb1385", + "urn": "urn:li:container:343f520ad0fb3259b298736800bb1385" + }, + { + "id": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4", + "urn": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:vertica,public.inventory_fact_super,PROD)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:dbadmin", + "type": "DATAOWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.inventory_fact_super,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.inventory_fact_super,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "ROS_Count": "1", + "Projection_Type": "is_super_projection", + "Is_Segmented": "True", + "Segmentation_key": "hash(inventory_fact.date_key, inventory_fact.product_key, inventory_fact.product_version, inventory_fact.warehouse_key, inventory_fact.qty_in_stock)", + "Projection_size": "2564 KB", + "Partition_Key": "Not Available", + "Number_Of_Partitions": "0", + "Projection_Cached": "False" + }, + "name": "inventory_fact_super", + "description": "Vertica physically stores table data in projections, which are collections of table columns. 
Projections store data in a format that optimizes query execution For more info on projections and corresponding properties check out the Vertica Docs: https://www.vertica.com/docs", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "public.inventory_fact_super", + "platform": "urn:li:dataPlatform:vertica", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "date_key", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "VARCHAR(length=64)", + "nativeDataType": "INTEGER()", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "reports_to", + "fieldPath": "product_key", "nullable": true, "description": "", "type": { @@ -5193,7 +6726,7 @@ "isPartOfKey": false }, { - "fieldPath": "salaried_flag", + "fieldPath": "product_version", "nullable": true, "description": "", "type": { @@ -5206,7 +6739,7 @@ "isPartOfKey": false }, { - "fieldPath": "annual_salary", + "fieldPath": "warehouse_key", "nullable": true, "description": "", "type": { @@ -5219,7 +6752,7 @@ "isPartOfKey": false }, { - "fieldPath": "hourly_rate", + "fieldPath": "qty_in_stock", "nullable": true, "description": "", "type": { @@ -5227,20 +6760,20 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "FLOAT()", + "nativeDataType": "INTEGER()", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "vacation_days", + "fieldPath": "inventory_date", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} + "com.linkedin.pegasus2avro.schema.DateType": {} } }, - "nativeDataType": "INTEGER()", + 
"nativeDataType": "DATE()", "recursive": false, "isPartOfKey": false } @@ -5252,12 +6785,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.employee_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.inventory_fact_super,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -5269,12 +6803,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.employee_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.inventory_fact_super,PROD)", "changeType": "UPSERT", "aspectName": "upstreamLineage", "aspect": { @@ -5285,7 +6820,7 @@ "time": 0, "actor": "urn:li:corpuser:unknown" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.employee_dimension,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.inventory_fact,PROD)", "type": "TRANSFORMED" } ] @@ -5293,12 +6828,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.employee_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.inventory_fact_super,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -5317,12 +6853,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": 
"no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.warehouse_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.readings_topk,PROD)", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { @@ -5341,12 +6878,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.warehouse_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.readings_topk,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -5356,13 +6894,14 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.warehouse_dimension_super,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.readings_topk,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -5373,22 +6912,22 @@ "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { "ROS_Count": "1", - "Projection_Type": "is_super_projection", - "is_segmented": "True", - "Segmentation_key": "hash(warehouse_dimension.warehouse_key)", - "projection_size": "2 KB", + "Projection_Type": "is_aggregate_projection, has_expressions", + "Is_Segmented": "True", + "Segmentation_key": "hash(readings.meter_id)", + "Projection_size": "0 KB", "Partition_Key": "Not Available", - "Partition_Size": "0", + "Number_Of_Partitions": "0", "Projection_Cached": "False" }, - "name": "warehouse_dimension_super", + "name": "readings_topk", "description": "Vertica physically 
stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution For more info on projections and corresponding properties check out the Vertica Docs: https://www.vertica.com/docs", "tags": [] } }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "public.warehouse_dimension_super", + "schemaName": "public.readings_topk", "platform": "urn:li:dataPlatform:vertica", "version": 0, "created": { @@ -5407,7 +6946,7 @@ }, "fields": [ { - "fieldPath": "warehouse_key", + "fieldPath": "meter_id", "nullable": true, "description": "", "type": { @@ -5420,67 +6959,28 @@ "isPartOfKey": false }, { - "fieldPath": "warehouse_name", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=20)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "warehouse_address", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=256)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "warehouse_city", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=60)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "warehouse_state", + "fieldPath": "recent_date", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + "com.linkedin.pegasus2avro.schema.TimeType": {} } }, - "nativeDataType": "CHAR(length=2)", + "nativeDataType": "TIMESTAMP_WITH_PRECISION()", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "warehouse_region", + "fieldPath": "recent_value", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + 
"com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "VARCHAR(length=32)", + "nativeDataType": "FLOAT()", "recursive": false, "isPartOfKey": false } @@ -5492,12 +6992,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.warehouse_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.readings_topk,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -5509,12 +7010,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.warehouse_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.readings_topk,PROD)", "changeType": "UPSERT", "aspectName": "upstreamLineage", "aspect": { @@ -5525,7 +7027,7 @@ "time": 0, "actor": "urn:li:corpuser:unknown" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.warehouse_dimension,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.readings_topk,PROD)", "type": "TRANSFORMED" } ] @@ -5533,12 +7035,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.warehouse_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.readings_topk,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -5557,12 +7060,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": 
"vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.clicks_agg,PROD)", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { @@ -5581,12 +7085,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.clicks_agg,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -5596,13 +7101,14 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension_super,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.clicks_agg,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -5613,22 +7119,22 @@ "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { "ROS_Count": "1", - "Projection_Type": "is_super_projection", - "is_segmented": "True", - "Segmentation_key": "hash(shipping_dimension.shipping_key)", - "projection_size": "1 KB", + "Projection_Type": "is_aggregate_projection, has_expressions", + "Is_Segmented": "True", + "Segmentation_key": "hash(clicks.page_id, (clicks.click_time)::date)", + "Projection_size": "0 KB", "Partition_Key": "Not Available", - "Partition_Size": "0", + "Number_Of_Partitions": "0", "Projection_Cached": "False" }, - "name": 
"shipping_dimension_super", + "name": "clicks_agg", "description": "Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution For more info on projections and corresponding properties check out the Vertica Docs: https://www.vertica.com/docs", "tags": [] } }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "public.shipping_dimension_super", + "schemaName": "public.clicks_agg", "platform": "urn:li:dataPlatform:vertica", "version": 0, "created": { @@ -5647,7 +7153,7 @@ }, "fields": [ { - "fieldPath": "shipping_key", + "fieldPath": "page_id", "nullable": true, "description": "", "type": { @@ -5658,45 +7164,6 @@ "nativeDataType": "INTEGER()", "recursive": false, "isPartOfKey": false - }, - { - "fieldPath": "ship_type", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "CHAR(length=30)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "ship_mode", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "CHAR(length=10)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "ship_carrier", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "CHAR(length=20)", - "recursive": false, - "isPartOfKey": false } ] } @@ -5706,12 +7173,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.clicks_agg,PROD)", "changeType": "UPSERT", 
"aspectName": "subTypes", "aspect": { @@ -5723,12 +7191,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.clicks_agg,PROD)", "changeType": "UPSERT", "aspectName": "upstreamLineage", "aspect": { @@ -5739,7 +7208,7 @@ "time": 0, "actor": "urn:li:corpuser:unknown" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.clicks_agg,PROD)", "type": "TRANSFORMED" } ] @@ -5747,12 +7216,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.clicks_agg,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -5771,12 +7241,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.inventory_fact_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.phrases_super,PROD)", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { @@ -5795,12 +7266,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:vertica,public.inventory_fact_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.phrases_super,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -5810,13 +7282,14 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.inventory_fact_super,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.phrases_super,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -5828,21 +7301,21 @@ "customProperties": { "ROS_Count": "1", "Projection_Type": "is_super_projection", - "is_segmented": "True", - "Segmentation_key": "hash(inventory_fact.date_key, inventory_fact.product_key, inventory_fact.product_version, inventory_fact.warehouse_key, inventory_fact.qty_in_stock)", - "projection_size": "2566 KB", + "Is_Segmented": "True", + "Segmentation_key": "hash(phrases.phrase)", + "Projection_size": "0 KB", "Partition_Key": "Not Available", - "Partition_Size": "0", + "Number_Of_Partitions": "0", "Projection_Cached": "False" }, - "name": "inventory_fact_super", + "name": "phrases_super", "description": "Vertica physically stores table data in projections, which are collections of table columns. 
Projections store data in a format that optimizes query execution For more info on projections and corresponding properties check out the Vertica Docs: https://www.vertica.com/docs", "tags": [] } }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "public.inventory_fact_super", + "schemaName": "public.phrases_super", "platform": "urn:li:dataPlatform:vertica", "version": 0, "created": { @@ -5861,80 +7334,15 @@ }, "fields": [ { - "fieldPath": "date_key", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "product_key", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "product_version", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "warehouse_key", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "qty_in_stock", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "inventory_date", + "fieldPath": "phrase", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.DateType": {} + "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "DATE()", + "nativeDataType": "VARCHAR(length=128)", "recursive": false, 
"isPartOfKey": false } @@ -5946,12 +7354,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.inventory_fact_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.phrases_super,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -5963,12 +7372,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.inventory_fact_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.phrases_super,PROD)", "changeType": "UPSERT", "aspectName": "upstreamLineage", "aspect": { @@ -5979,7 +7389,7 @@ "time": 0, "actor": "urn:li:corpuser:unknown" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.inventory_fact,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.phrases,PROD)", "type": "TRANSFORMED" } ] @@ -5987,12 +7397,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.inventory_fact_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.phrases_super,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -6011,7 +7422,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6035,7 +7447,8 @@ }, "systemMetadata": { "lastObserved": 
1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6050,7 +7463,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6065,7 +7479,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6082,7 +7497,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6097,7 +7513,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6117,7 +7534,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6141,7 +7559,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6156,7 +7575,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6172,7 +7592,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 06:37:53.393181+00:00" + "create_time": "2023-10-13 11:22:37.879951+00:00", + "table_size": "2 KB" }, "name": "store_dimension", "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. 
Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. ", @@ -6441,7 +7862,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6458,7 +7880,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6482,7 +7905,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6506,7 +7930,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6521,7 +7946,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6537,7 +7963,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 06:37:53.404717+00:00" + "create_time": "2023-10-13 11:22:37.890717+00:00", + "table_size": "8646 KB" }, "name": "store_orders_fact", "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. 
", @@ -6819,7 +8246,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6836,7 +8264,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6860,7 +8289,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6884,7 +8314,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6899,7 +8330,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6915,7 +8347,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 06:37:53.396731+00:00" + "create_time": "2023-10-13 11:22:37.883186+00:00", + "table_size": "225060 KB" }, "name": "store_sales_fact", "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. 
", @@ -7171,7 +8604,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -7188,7 +8622,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -7212,7 +8647,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -7236,7 +8672,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -7251,7 +8688,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -7269,11 +8707,11 @@ "customProperties": { "ROS_Count": "1", "Projection_Type": "is_super_projection", - "is_segmented": "True", + "Is_Segmented": "True", "Segmentation_key": "hash(store_dimension.store_key)", - "projection_size": "2 KB", + "Projection_size": "2 KB", "Partition_Key": "Not Available", - "Partition_Size": "0", + "Number_Of_Partitions": "0", "Projection_Cached": "False" }, "name": "store_dimension_super", @@ -7543,7 +8981,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -7560,7 +8999,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -7584,7 +9024,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": 
"vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -7608,7 +9049,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -7632,7 +9074,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -7647,7 +9090,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -7665,11 +9109,11 @@ "customProperties": { "ROS_Count": "2", "Projection_Type": "is_super_projection", - "is_segmented": "True", + "Is_Segmented": "True", "Segmentation_key": "hash(store_sales_fact.date_key, store_sales_fact.product_key, store_sales_fact.product_version, store_sales_fact.store_key, store_sales_fact.promotion_key, store_sales_fact.customer_key, store_sales_fact.employee_key, store_sales_fact.pos_transaction_number)", - "projection_size": "225089 KB", + "Projection_size": "225060 KB", "Partition_Key": "Not Available", - "Partition_Size": "0", + "Number_Of_Partitions": "0", "Projection_Cached": "False" }, "name": "store_sales_fact_super", @@ -7926,7 +9370,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -7943,7 +9388,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -7967,7 +9413,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -7991,7 +9438,8 @@ }, "systemMetadata": { 
"lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8015,7 +9463,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8030,7 +9479,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8048,11 +9498,11 @@ "customProperties": { "ROS_Count": "1", "Projection_Type": "is_super_projection", - "is_segmented": "True", + "Is_Segmented": "True", "Segmentation_key": "hash(store_orders_fact.product_key, store_orders_fact.product_version, store_orders_fact.store_key, store_orders_fact.vendor_key, store_orders_fact.employee_key, store_orders_fact.order_number, store_orders_fact.date_ordered, store_orders_fact.date_shipped)", - "projection_size": "8648 KB", + "Projection_size": "8646 KB", "Partition_Key": "Not Available", - "Partition_Size": "0", + "Number_Of_Partitions": "0", "Projection_Cached": "False" }, "name": "store_orders_fact_super", @@ -8335,7 +9785,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8352,7 +9803,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8376,7 +9828,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8400,7 +9853,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + 
"lastRunId": "no-run-id-provided" } }, { @@ -8424,7 +9878,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8439,7 +9894,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8454,7 +9910,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8471,7 +9928,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8486,7 +9944,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8506,7 +9965,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8530,7 +9990,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8545,7 +10006,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8561,7 +10023,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 06:37:53.415595+00:00" + "create_time": "2023-10-13 11:22:37.900841+00:00", + "table_size": "6 KB" }, "name": "call_center_dimension", "description": "References the properties of a native table in Vertica. 
Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. ", @@ -8752,7 +10215,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8769,7 +10233,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8793,7 +10258,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8817,7 +10283,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8832,7 +10299,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8848,7 +10316,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 06:37:53.412266+00:00" + "create_time": "2023-10-13 11:22:37.897788+00:00", + "table_size": "9 KB" }, "name": "online_page_dimension", "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. 
", @@ -8961,7 +10430,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8978,7 +10448,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9002,7 +10473,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9026,7 +10498,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9041,7 +10514,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9057,7 +10531,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 06:37:53.419260+00:00" + "create_time": "2023-10-13 11:22:37.903963+00:00", + "table_size": "182356 KB" }, "name": "online_sales_fact", "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. 
", @@ -9352,7 +10827,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9369,7 +10845,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9393,7 +10870,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9417,7 +10895,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9432,7 +10911,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9450,11 +10930,11 @@ "customProperties": { "ROS_Count": "1", "Projection_Type": "is_super_projection", - "is_segmented": "True", + "Is_Segmented": "True", "Segmentation_key": "hash(online_page_dimension.online_page_key)", - "projection_size": "9 KB", + "Projection_size": "9 KB", "Partition_Key": "Not Available", - "Partition_Size": "0", + "Number_Of_Partitions": "0", "Projection_Cached": "False" }, "name": "online_page_dimension_super", @@ -9568,7 +11048,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9585,7 +11066,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9609,7 +11091,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": 
"vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9633,7 +11116,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9657,7 +11141,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9672,7 +11157,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9690,11 +11176,11 @@ "customProperties": { "ROS_Count": "1", "Projection_Type": "is_super_projection", - "is_segmented": "True", + "Is_Segmented": "True", "Segmentation_key": "hash(call_center_dimension.call_center_key)", - "projection_size": "6 KB", + "Projection_size": "6 KB", "Partition_Key": "Not Available", - "Partition_Size": "0", + "Number_Of_Partitions": "0", "Projection_Cached": "False" }, "name": "call_center_dimension_super", @@ -9886,7 +11372,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9903,7 +11390,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9927,7 +11415,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9951,7 +11440,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9975,7 +11465,8 @@ }, "systemMetadata": { 
"lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9990,7 +11481,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -10008,11 +11500,11 @@ "customProperties": { "ROS_Count": "1", "Projection_Type": "is_super_projection", - "is_segmented": "True", + "Is_Segmented": "True", "Segmentation_key": "hash(online_sales_fact.sale_date_key, online_sales_fact.ship_date_key, online_sales_fact.product_key, online_sales_fact.product_version, online_sales_fact.customer_key, online_sales_fact.call_center_key, online_sales_fact.online_page_key, online_sales_fact.shipping_key)", - "projection_size": "182385 KB", + "Projection_size": "182356 KB", "Partition_Key": "Not Available", - "Partition_Size": "0", + "Number_Of_Partitions": "0", "Projection_Cached": "False" }, "name": "online_sales_fact_super", @@ -10308,7 +11800,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -10325,7 +11818,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -10349,7 +11843,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -10373,7 +11868,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/vertica/vertica_to_file.yml 
b/metadata-ingestion/tests/integration/vertica/vertica_to_file.yml index ebd800ee09ff5..a182e54bd53c7 100644 --- a/metadata-ingestion/tests/integration/vertica/vertica_to_file.yml +++ b/metadata-ingestion/tests/integration/vertica/vertica_to_file.yml @@ -5,6 +5,13 @@ source: database: Vmart username: dbadmin password: abc123 + include_tables: true + include_views: true + include_projections: true + include_models: true + include_view_lineage: true + include_projection_lineage: true + sink: type: file diff --git a/metadata-ingestion/tests/unit/api/entities/datacontract/__init__.py b/metadata-ingestion/tests/unit/api/entities/datacontract/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/metadata-ingestion/tests/unit/api/entities/datacontract/test_data_quality_assertion.py b/metadata-ingestion/tests/unit/api/entities/datacontract/test_data_quality_assertion.py new file mode 100644 index 0000000000000..7be8b667a500b --- /dev/null +++ b/metadata-ingestion/tests/unit/api/entities/datacontract/test_data_quality_assertion.py @@ -0,0 +1,55 @@ +from datahub.api.entities.datacontract.data_quality_assertion import ( + DataQualityAssertion, +) +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.metadata.schema_classes import ( + AssertionInfoClass, + AssertionStdOperatorClass, + AssertionStdParameterClass, + AssertionStdParametersClass, + AssertionStdParameterTypeClass, + AssertionTypeClass, + AssertionValueChangeTypeClass, + SqlAssertionInfoClass, + SqlAssertionTypeClass, +) + + +def test_parse_sql_assertion(): + assertion_urn = "urn:li:assertion:a" + entity_urn = "urn:li:dataset:d" + statement = "SELECT COUNT(*) FROM my_table WHERE value IS NOT NULL" + + d = { + "type": "custom_sql", + "sql": statement, + "operator": {"type": "between", "min": 5, "max": 10}, + } + + assert DataQualityAssertion.parse_obj(d).generate_mcp( + assertion_urn, entity_urn + ) == [ + MetadataChangeProposalWrapper( + entityUrn=assertion_urn, + 
aspect=AssertionInfoClass( + type=AssertionTypeClass.SQL, + sqlAssertion=SqlAssertionInfoClass( + type=SqlAssertionTypeClass.METRIC, + changeType=AssertionValueChangeTypeClass.ABSOLUTE, + entity=entity_urn, + statement="SELECT COUNT(*) FROM my_table WHERE value IS NOT NULL", + operator=AssertionStdOperatorClass.BETWEEN, + parameters=AssertionStdParametersClass( + minValue=AssertionStdParameterClass( + value="5", + type=AssertionStdParameterTypeClass.NUMBER, + ), + maxValue=AssertionStdParameterClass( + value="10", + type=AssertionStdParameterTypeClass.NUMBER, + ), + ), + ), + ), + ) + ] diff --git a/metadata-ingestion/tests/unit/serde/test_serde.py b/metadata-ingestion/tests/unit/serde/test_serde.py index d116f1f5473fa..d2d6a0bdda5b9 100644 --- a/metadata-ingestion/tests/unit/serde/test_serde.py +++ b/metadata-ingestion/tests/unit/serde/test_serde.py @@ -238,7 +238,7 @@ def test_missing_optional_simple() -> None: "criteria": [ { "condition": "EQUALS", - "field": "RESOURCE_TYPE", + "field": "TYPE", "values": ["notebook", "dataset", "dashboard"], } ] @@ -252,7 +252,7 @@ def test_missing_optional_simple() -> None: "criteria": [ { "condition": "EQUALS", - "field": "RESOURCE_TYPE", + "field": "TYPE", "values": ["notebook", "dataset", "dashboard"], } ] @@ -267,13 +267,13 @@ def test_missing_optional_simple() -> None: def test_missing_optional_in_union() -> None: # This one doesn't contain any optional fields and should work fine. 
revised_json = json.loads( - '{"lastUpdatedTimestamp":1662356745807,"actors":{"groups":[],"resourceOwners":false,"allUsers":true,"allGroups":false,"users":[]},"privileges":["EDIT_ENTITY_ASSERTIONS","EDIT_DATASET_COL_GLOSSARY_TERMS","EDIT_DATASET_COL_TAGS","EDIT_DATASET_COL_DESCRIPTION"],"displayName":"customtest","resources":{"filter":{"criteria":[{"field":"RESOURCE_TYPE","condition":"EQUALS","values":["notebook","dataset","dashboard"]}]},"allResources":false},"description":"","state":"ACTIVE","type":"METADATA"}' + '{"lastUpdatedTimestamp":1662356745807,"actors":{"groups":[],"resourceOwners":false,"allUsers":true,"allGroups":false,"users":[]},"privileges":["EDIT_ENTITY_ASSERTIONS","EDIT_DATASET_COL_GLOSSARY_TERMS","EDIT_DATASET_COL_TAGS","EDIT_DATASET_COL_DESCRIPTION"],"displayName":"customtest","resources":{"filter":{"criteria":[{"field":"TYPE","condition":"EQUALS","values":["notebook","dataset","dashboard"]}]},"allResources":false},"description":"","state":"ACTIVE","type":"METADATA"}' ) revised = models.DataHubPolicyInfoClass.from_obj(revised_json) # This one is missing the optional filters.allResources field. 
original_json = json.loads( - '{"privileges":["EDIT_ENTITY_ASSERTIONS","EDIT_DATASET_COL_GLOSSARY_TERMS","EDIT_DATASET_COL_TAGS","EDIT_DATASET_COL_DESCRIPTION"],"actors":{"resourceOwners":false,"groups":[],"allGroups":false,"allUsers":true,"users":[]},"lastUpdatedTimestamp":1662356745807,"displayName":"customtest","description":"","resources":{"filter":{"criteria":[{"field":"RESOURCE_TYPE","condition":"EQUALS","values":["notebook","dataset","dashboard"]}]}},"state":"ACTIVE","type":"METADATA"}' + '{"privileges":["EDIT_ENTITY_ASSERTIONS","EDIT_DATASET_COL_GLOSSARY_TERMS","EDIT_DATASET_COL_TAGS","EDIT_DATASET_COL_DESCRIPTION"],"actors":{"resourceOwners":false,"groups":[],"allGroups":false,"allUsers":true,"users":[]},"lastUpdatedTimestamp":1662356745807,"displayName":"customtest","description":"","resources":{"filter":{"criteria":[{"field":"TYPE","condition":"EQUALS","values":["notebook","dataset","dashboard"]}]}},"state":"ACTIVE","type":"METADATA"}' ) original = models.DataHubPolicyInfoClass.from_obj(original_json) diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_create_view_with_cte.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_create_view_with_cte.json index f0175b4dc8892..d610b0a83f229 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_create_view_with_cte.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_create_view_with_cte.json @@ -18,7 +18,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "native_column_type": "TEXT" + "native_column_type": "STRING" }, "upstreams": [ { @@ -36,7 +36,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "native_column_type": "TEXT" + "native_column_type": "STRING" }, "upstreams": [ { @@ -54,7 +54,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "native_column_type": "TEXT" + "native_column_type": "STRING" }, "upstreams": [ { @@ -72,7 +72,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } 
}, - "native_column_type": "TEXT" + "native_column_type": "STRING" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_from_sharded_table_wildcard.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_from_sharded_table_wildcard.json index b7df5444987f2..2d3d188d28316 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_from_sharded_table_wildcard.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_from_sharded_table_wildcard.json @@ -14,7 +14,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "native_column_type": "TEXT" + "native_column_type": "STRING" }, "upstreams": [ { @@ -32,7 +32,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "native_column_type": "TEXT" + "native_column_type": "STRING" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_nested_subqueries.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_nested_subqueries.json index 67e306bebf545..41ae0885941b0 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_nested_subqueries.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_nested_subqueries.json @@ -14,7 +14,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "native_column_type": "TEXT" + "native_column_type": "STRING" }, "upstreams": [ { @@ -32,7 +32,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "native_column_type": "TEXT" + "native_column_type": "STRING" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_sharded_table_normalization.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_sharded_table_normalization.json index b7df5444987f2..2d3d188d28316 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_sharded_table_normalization.json +++ 
b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_sharded_table_normalization.json @@ -14,7 +14,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "native_column_type": "TEXT" + "native_column_type": "STRING" }, "upstreams": [ { @@ -32,7 +32,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "native_column_type": "TEXT" + "native_column_type": "STRING" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_star_with_replace.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_star_with_replace.json index b393b2445d6c4..26f8f8f59a3ff 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_star_with_replace.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_star_with_replace.json @@ -16,7 +16,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "native_column_type": "TEXT" + "native_column_type": "STRING" }, "upstreams": [ { @@ -34,7 +34,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "native_column_type": "TEXT" + "native_column_type": "STRING" }, "upstreams": [ { @@ -52,7 +52,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "native_column_type": "TEXT" + "native_column_type": "STRING" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_view_from_union.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_view_from_union.json index 53fb94300e804..83365c09f69c2 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_view_from_union.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_view_from_union.json @@ -17,7 +17,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "native_column_type": "TEXT" + "native_column_type": "STRING" }, "upstreams": [ { @@ -39,7 +39,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "native_column_type": "TEXT" + "native_column_type": 
"STRING" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_table_ddl.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_table_ddl.json index 4773974545bfa..cf31b71cb50f6 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_table_ddl.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_table_ddl.json @@ -4,5 +4,58 @@ "out_tables": [ "urn:li:dataset:(urn:li:dataPlatform:sqlite,costs,PROD)" ], - "column_lineage": null + "column_lineage": [ + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:sqlite,costs,PROD)", + "column": "id", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "INTEGER" + }, + "upstreams": [] + }, + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:sqlite,costs,PROD)", + "column": "month", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "TEXT" + }, + "upstreams": [] + }, + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:sqlite,costs,PROD)", + "column": "total_cost", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "REAL" + }, + "upstreams": [] + }, + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:sqlite,costs,PROD)", + "column": "area", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "REAL" + }, + "upstreams": [] + } + ] } \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_view_as_select.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_view_as_select.json index ff452467aa5bd..8a6b60d0f1bde 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_view_as_select.json +++ 
b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_view_as_select.json @@ -30,7 +30,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "native_column_type": "BIGINT" + "native_column_type": "NUMBER" }, "upstreams": [] }, diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_struct_subfields.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_struct_subfields.json index 5ad847e252497..2424fcda34752 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_struct_subfields.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_struct_subfields.json @@ -14,7 +14,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "native_column_type": "DECIMAL" + "native_column_type": "NUMERIC" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_with_full_col_name.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_with_full_col_name.json index 6ee3d2e61c39b..8dd2633eff612 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_with_full_col_name.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_with_full_col_name.json @@ -14,7 +14,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "native_column_type": "DECIMAL" + "native_column_type": "NUMERIC" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_update_from_table.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_update_from_table.json new file mode 100644 index 0000000000000..e2baa34e7fe28 --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_update_from_table.json @@ -0,0 +1,56 @@ +{ + "query_type": "UPDATE", + "in_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table1,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table2,PROD)" + ], + 
"out_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)" + ], + "column_lineage": [ + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)", + "column": "col1", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "VARCHAR" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table1,PROD)", + "column": "col1" + }, + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table1,PROD)", + "column": "col2" + } + ] + }, + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)", + "column": "col2", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "VARCHAR" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table1,PROD)", + "column": "col1" + }, + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table2,PROD)", + "column": "col2" + } + ] + } + ] +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_update_hardcoded.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_update_hardcoded.json new file mode 100644 index 0000000000000..b41ed61b37cdb --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_update_hardcoded.json @@ -0,0 +1,35 @@ +{ + "query_type": "UPDATE", + "in_tables": [], + "out_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)" + ], + "column_lineage": [ + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)", + "column": "orderkey", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + 
}, + "native_column_type": "INT" + }, + "upstreams": [] + }, + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)", + "column": "totalprice", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "INT" + }, + "upstreams": [] + } + ] +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_teradata_default_normalization.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_teradata_default_normalization.json index b0351a7e07ad2..ee80285d87f60 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_teradata_default_normalization.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_teradata_default_normalization.json @@ -12,6 +12,7 @@ "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:teradata,myteradata.demo_user.test_lineage2,PROD)", "column": "PatientId", + "column_type": null, "native_column_type": "INTEGER()" }, "upstreams": [ @@ -25,6 +26,7 @@ "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:teradata,myteradata.demo_user.test_lineage2,PROD)", "column": "BMI", + "column_type": null, "native_column_type": "FLOAT()" }, "upstreams": [ diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py index 059add8db67e4..dfc5b486abd35 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py @@ -3,6 +3,7 @@ import pytest from datahub.testing.check_sql_parser_result import assert_sql_result +from datahub.utilities.sqlglot_lineage import _UPDATE_ARGS_NOT_SUPPORTED_BY_SELECT RESOURCE_DIR = pathlib.Path(__file__).parent / "goldens" @@ -672,3 +673,98 @@ def test_teradata_default_normalization(): }, expected_file=RESOURCE_DIR / "test_teradata_default_normalization.json", ) + 
+ +def test_snowflake_update_hardcoded(): + assert_sql_result( + """ +UPDATE snowflake_sample_data.tpch_sf1.orders +SET orderkey = 1, totalprice = 2 +WHERE orderkey = 3 +""", + dialect="snowflake", + schemas={ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)": { + "orderkey": "NUMBER(38,0)", + "totalprice": "NUMBER(12,2)", + }, + }, + expected_file=RESOURCE_DIR / "test_snowflake_update_hardcoded.json", + ) + + +def test_update_from_select(): + assert _UPDATE_ARGS_NOT_SUPPORTED_BY_SELECT == {"returning", "this"} + + +def test_snowflake_update_from_table(): + # Can create these tables with the following SQL: + """ + -- Create or replace my_table + CREATE OR REPLACE TABLE my_table ( + id INT IDENTITY PRIMARY KEY, + col1 VARCHAR(50), + col2 VARCHAR(50) + ); + + -- Create or replace table1 + CREATE OR REPLACE TABLE table1 ( + id INT IDENTITY PRIMARY KEY, + col1 VARCHAR(50), + col2 VARCHAR(50) + ); + + -- Create or replace table2 + CREATE OR REPLACE TABLE table2 ( + id INT IDENTITY PRIMARY KEY, + col2 VARCHAR(50) + ); + + -- Insert data into my_table + INSERT INTO my_table (col1, col2) + VALUES ('foo', 'bar'), + ('baz', 'qux'); + + -- Insert data into table1 + INSERT INTO table1 (col1, col2) + VALUES ('foo', 'bar'), + ('baz', 'qux'); + + -- Insert data into table2 + INSERT INTO table2 (col2) + VALUES ('bar'), + ('qux'); + """ + + assert_sql_result( + """ +UPDATE my_table +SET + col1 = t1.col1 || t1.col2, + col2 = t1.col1 || t2.col2 +FROM table1 t1 +JOIN table2 t2 ON t1.id = t2.id +WHERE my_table.id = t1.id; +""", + dialect="snowflake", + default_db="my_db", + default_schema="my_schema", + schemas={ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)": { + "id": "NUMBER(38,0)", + "col1": "VARCHAR(16777216)", + "col2": "VARCHAR(16777216)", + }, + "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table1,PROD)": { + "id": "NUMBER(38,0)", + "col1": "VARCHAR(16777216)", + "col2": 
"VARCHAR(16777216)", + }, + "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table2,PROD)": { + "id": "NUMBER(38,0)", + "col1": "VARCHAR(16777216)", + "col2": "VARCHAR(16777216)", + }, + }, + expected_file=RESOURCE_DIR / "test_snowflake_update_from_table.json", + ) diff --git a/metadata-ingestion/tests/unit/test_athena_source.py b/metadata-ingestion/tests/unit/test_athena_source.py index 2558f6a46715e..23dd7dd5a6e45 100644 --- a/metadata-ingestion/tests/unit/test_athena_source.py +++ b/metadata-ingestion/tests/unit/test_athena_source.py @@ -3,14 +3,17 @@ import pytest from freezegun import freeze_time +from sqlalchemy import types +from sqlalchemy_bigquery import STRUCT from datahub.ingestion.api.common import PipelineContext -from src.datahub.ingestion.source.aws.s3_util import make_s3_urn +from datahub.ingestion.source.aws.s3_util import make_s3_urn +from datahub.ingestion.source.sql.athena import CustomAthenaRestDialect +from datahub.utilities.sqlalchemy_type_converter import MapType FROZEN_TIME = "2020-04-14 07:00:00" -@pytest.mark.integration def test_athena_config_query_location_old_plus_new_value_not_allowed(): from datahub.ingestion.source.sql.athena import AthenaConfig @@ -25,7 +28,6 @@ def test_athena_config_query_location_old_plus_new_value_not_allowed(): ) -@pytest.mark.integration def test_athena_config_staging_dir_is_set_as_query_result(): from datahub.ingestion.source.sql.athena import AthenaConfig @@ -48,7 +50,6 @@ def test_athena_config_staging_dir_is_set_as_query_result(): assert config.json() == expected_config.json() -@pytest.mark.integration def test_athena_uri(): from datahub.ingestion.source.sql.athena import AthenaConfig @@ -59,9 +60,12 @@ def test_athena_uri(): "work_group": "test-workgroup", } ) - assert ( - config.get_sql_alchemy_url() - == "awsathena+rest://@athena.us-west-1.amazonaws.com:443/?s3_staging_dir=s3%3A%2F%2Fquery-result-location%2F&work_group=test-workgroup&catalog_name=awsdatacatalog&duration_seconds=3600" + 
assert config.get_sql_alchemy_url() == ( + "awsathena+rest://@athena.us-west-1.amazonaws.com:443" + "?catalog_name=awsdatacatalog" + "&duration_seconds=3600" + "&s3_staging_dir=s3%3A%2F%2Fquery-result-location%2F" + "&work_group=test-workgroup" ) @@ -104,7 +108,7 @@ def test_athena_get_table_properties(): mock_cursor = mock.MagicMock() mock_inspector = mock.MagicMock() mock_inspector.engine.raw_connection().cursor.return_value = mock_cursor - mock_cursor._get_table_metadata.return_value = AthenaTableMetadata( + mock_cursor.get_table_metadata.return_value = AthenaTableMetadata( response=table_metadata ) @@ -126,3 +130,81 @@ def test_athena_get_table_properties(): } assert location == make_s3_urn("s3://testLocation", "PROD") + + +def test_get_column_type_simple_types(): + assert isinstance( + CustomAthenaRestDialect()._get_column_type(type_="int"), types.Integer + ) + assert isinstance( + CustomAthenaRestDialect()._get_column_type(type_="string"), types.String + ) + assert isinstance( + CustomAthenaRestDialect()._get_column_type(type_="boolean"), types.BOOLEAN + ) + assert isinstance( + CustomAthenaRestDialect()._get_column_type(type_="long"), types.BIGINT + ) + assert isinstance( + CustomAthenaRestDialect()._get_column_type(type_="double"), types.FLOAT + ) + + +def test_get_column_type_array(): + result = CustomAthenaRestDialect()._get_column_type(type_="array") + + assert isinstance(result, types.ARRAY) + assert isinstance(result.item_type, types.String) + + +def test_get_column_type_map(): + result = CustomAthenaRestDialect()._get_column_type(type_="map") + + assert isinstance(result, MapType) + assert isinstance(result.types[0], types.String) + assert isinstance(result.types[1], types.Integer) + + +def test_column_type_struct(): + + result = CustomAthenaRestDialect()._get_column_type(type_="struct") + + assert isinstance(result, STRUCT) + assert isinstance(result._STRUCT_fields[0], tuple) + assert result._STRUCT_fields[0][0] == "test" + assert 
isinstance(result._STRUCT_fields[0][1], types.String) + + +def test_column_type_complex_combination(): + + result = CustomAthenaRestDialect()._get_column_type( + type_="struct>>" + ) + + assert isinstance(result, STRUCT) + + assert isinstance(result._STRUCT_fields[0], tuple) + assert result._STRUCT_fields[0][0] == "id" + assert isinstance(result._STRUCT_fields[0][1], types.String) + + assert isinstance(result._STRUCT_fields[1], tuple) + assert result._STRUCT_fields[1][0] == "name" + assert isinstance(result._STRUCT_fields[1][1], types.String) + + assert isinstance(result._STRUCT_fields[2], tuple) + assert result._STRUCT_fields[2][0] == "choices" + assert isinstance(result._STRUCT_fields[2][1], types.ARRAY) + + assert isinstance(result._STRUCT_fields[2][1].item_type, STRUCT) + + assert isinstance(result._STRUCT_fields[2][1].item_type._STRUCT_fields[0], tuple) + assert result._STRUCT_fields[2][1].item_type._STRUCT_fields[0][0] == "id" + assert isinstance( + result._STRUCT_fields[2][1].item_type._STRUCT_fields[0][1], types.String + ) + + assert isinstance(result._STRUCT_fields[2][1].item_type._STRUCT_fields[1], tuple) + assert result._STRUCT_fields[2][1].item_type._STRUCT_fields[1][0] == "label" + assert isinstance( + result._STRUCT_fields[2][1].item_type._STRUCT_fields[1][1], types.String + ) diff --git a/metadata-ingestion/tests/unit/test_clickhouse_source.py b/metadata-ingestion/tests/unit/test_clickhouse_source.py index de7e7d66f2129..1b2ffb70c8d19 100644 --- a/metadata-ingestion/tests/unit/test_clickhouse_source.py +++ b/metadata-ingestion/tests/unit/test_clickhouse_source.py @@ -26,9 +26,7 @@ def test_clickhouse_uri_native(): "scheme": "clickhouse+native", } ) - assert ( - config.get_sql_alchemy_url() == "clickhouse+native://user:password@host:1111/" - ) + assert config.get_sql_alchemy_url() == "clickhouse+native://user:password@host:1111" def test_clickhouse_uri_native_secure(): diff --git a/metadata-ingestion/tests/unit/test_snowflake_source.py 
b/metadata-ingestion/tests/unit/test_snowflake_source.py index 1c26ca2487e5c..888a7c0441554 100644 --- a/metadata-ingestion/tests/unit/test_snowflake_source.py +++ b/metadata-ingestion/tests/unit/test_snowflake_source.py @@ -179,10 +179,12 @@ def test_snowflake_uri_default_authentication(): } ) - assert ( - config.get_sql_alchemy_url() - == "snowflake://user:password@acctname/?authenticator=SNOWFLAKE&warehouse=COMPUTE_WH&role" - "=sysadmin&application=acryl_datahub" + assert config.get_sql_alchemy_url() == ( + "snowflake://user:password@acctname" + "?application=acryl_datahub" + "&authenticator=SNOWFLAKE" + "&role=sysadmin" + "&warehouse=COMPUTE_WH" ) @@ -198,10 +200,12 @@ def test_snowflake_uri_external_browser_authentication(): } ) - assert ( - config.get_sql_alchemy_url() - == "snowflake://user@acctname/?authenticator=EXTERNALBROWSER&warehouse=COMPUTE_WH&role" - "=sysadmin&application=acryl_datahub" + assert config.get_sql_alchemy_url() == ( + "snowflake://user@acctname" + "?application=acryl_datahub" + "&authenticator=EXTERNALBROWSER" + "&role=sysadmin" + "&warehouse=COMPUTE_WH" ) @@ -219,10 +223,12 @@ def test_snowflake_uri_key_pair_authentication(): } ) - assert ( - config.get_sql_alchemy_url() - == "snowflake://user@acctname/?authenticator=SNOWFLAKE_JWT&warehouse=COMPUTE_WH&role" - "=sysadmin&application=acryl_datahub" + assert config.get_sql_alchemy_url() == ( + "snowflake://user@acctname" + "?application=acryl_datahub" + "&authenticator=SNOWFLAKE_JWT" + "&role=sysadmin" + "&warehouse=COMPUTE_WH" ) diff --git a/metadata-ingestion/tests/unit/utilities/test_ratelimiter.py b/metadata-ingestion/tests/unit/utilities/test_ratelimiter.py new file mode 100644 index 0000000000000..0384e1f918881 --- /dev/null +++ b/metadata-ingestion/tests/unit/utilities/test_ratelimiter.py @@ -0,0 +1,20 @@ +from collections import defaultdict +from datetime import datetime +from typing import Dict + +from datahub.utilities.ratelimiter import RateLimiter + + +def 
test_rate_is_limited(): + MAX_CALLS_PER_SEC = 5 + TOTAL_CALLS = 18 + actual_calls: Dict[float, int] = defaultdict(lambda: 0) + + ratelimiter = RateLimiter(max_calls=MAX_CALLS_PER_SEC, period=1) + for _ in range(TOTAL_CALLS): + with ratelimiter: + actual_calls[datetime.now().replace(microsecond=0).timestamp()] += 1 + + assert len(actual_calls) == round(TOTAL_CALLS / MAX_CALLS_PER_SEC) + assert all(calls <= MAX_CALLS_PER_SEC for calls in actual_calls.values()) + assert sum(actual_calls.values()) == TOTAL_CALLS diff --git a/metadata-ingestion/tests/unit/utilities/test_sqlalchemy_type_converter.py b/metadata-ingestion/tests/unit/utilities/test_sqlalchemy_type_converter.py new file mode 100644 index 0000000000000..6c719d351c4c2 --- /dev/null +++ b/metadata-ingestion/tests/unit/utilities/test_sqlalchemy_type_converter.py @@ -0,0 +1,93 @@ +from typing import no_type_check + +from sqlalchemy import types +from sqlalchemy_bigquery import STRUCT + +from datahub.metadata.schema_classes import ( + ArrayTypeClass, + MapTypeClass, + NullTypeClass, + NumberTypeClass, + RecordTypeClass, +) +from datahub.utilities.sqlalchemy_type_converter import ( + MapType, + get_schema_fields_for_sqlalchemy_column, +) + + +def test_get_avro_schema_for_sqlalchemy_column(): + schema_fields = get_schema_fields_for_sqlalchemy_column( + column_name="test", column_type=types.INTEGER() + ) + assert len(schema_fields) == 1 + assert schema_fields[0].fieldPath == "[version=2.0].[type=int].test" + assert schema_fields[0].type.type == NumberTypeClass() + assert schema_fields[0].nativeDataType == "INTEGER" + assert schema_fields[0].nullable is True + + schema_fields = get_schema_fields_for_sqlalchemy_column( + column_name="test", column_type=types.String(), nullable=False + ) + assert len(schema_fields) == 1 + assert schema_fields[0].fieldPath == "[version=2.0].[type=string].test" + assert schema_fields[0].type.type == NumberTypeClass() + assert schema_fields[0].nativeDataType == "VARCHAR" + assert 
schema_fields[0].nullable is False + + +def test_get_avro_schema_for_sqlalchemy_array_column(): + schema_fields = get_schema_fields_for_sqlalchemy_column( + column_name="test", column_type=types.ARRAY(types.FLOAT()) + ) + assert len(schema_fields) == 1 + assert ( + schema_fields[0].fieldPath + == "[version=2.0].[type=struct].[type=array].[type=float].test" + ) + assert schema_fields[0].type.type == ArrayTypeClass(nestedType=["float"]) + assert schema_fields[0].nativeDataType == "array" + + +def test_get_avro_schema_for_sqlalchemy_map_column(): + schema_fields = get_schema_fields_for_sqlalchemy_column( + column_name="test", column_type=MapType(types.String(), types.BOOLEAN()) + ) + assert len(schema_fields) == 1 + assert ( + schema_fields[0].fieldPath + == "[version=2.0].[type=struct].[type=map].[type=boolean].test" + ) + assert schema_fields[0].type.type == MapTypeClass( + keyType="string", valueType="boolean" + ) + assert schema_fields[0].nativeDataType == "MapType(String(), BOOLEAN())" + + +def test_get_avro_schema_for_sqlalchemy_struct_column() -> None: + + schema_fields = get_schema_fields_for_sqlalchemy_column( + column_name="test", column_type=STRUCT(("test", types.INTEGER())) + ) + assert len(schema_fields) == 2 + assert ( + schema_fields[0].fieldPath == "[version=2.0].[type=struct].[type=struct].test" + ) + assert schema_fields[0].type.type == RecordTypeClass() + assert schema_fields[0].nativeDataType == "STRUCT" + + assert ( + schema_fields[1].fieldPath + == "[version=2.0].[type=struct].[type=struct].test.[type=int].test" + ) + assert schema_fields[1].type.type == NumberTypeClass() + assert schema_fields[1].nativeDataType == "INTEGER" + + +@no_type_check +def test_get_avro_schema_for_sqlalchemy_unknown_column(): + schema_fields = get_schema_fields_for_sqlalchemy_column("invalid", "test") + assert len(schema_fields) == 1 + assert schema_fields[0].type.type == NullTypeClass() + assert schema_fields[0].fieldPath == "[version=2.0].[type=null]" + assert 
schema_fields[0].nativeDataType == "test" diff --git a/metadata-integration/java/datahub-client/build.gradle b/metadata-integration/java/datahub-client/build.gradle index 95de3cdb3c526..e6210f1f073f6 100644 --- a/metadata-integration/java/datahub-client/build.gradle +++ b/metadata-integration/java/datahub-client/build.gradle @@ -30,7 +30,7 @@ dependencies { implementation(externalDependency.kafkaAvroSerializer) { exclude group: "org.apache.avro" } - implementation externalDependency.avro_1_7 + implementation externalDependency.avro constraints { implementation('commons-collections:commons-collections:3.2.2') { because 'Vulnerability Issue' diff --git a/metadata-integration/java/datahub-client/src/main/java/datahub/client/kafka/AvroSerializer.java b/metadata-integration/java/datahub-client/src/main/java/datahub/client/kafka/AvroSerializer.java index ee0d459aaa7d3..6212e57470be4 100644 --- a/metadata-integration/java/datahub-client/src/main/java/datahub/client/kafka/AvroSerializer.java +++ b/metadata-integration/java/datahub-client/src/main/java/datahub/client/kafka/AvroSerializer.java @@ -16,12 +16,14 @@ class AvroSerializer { private final Schema _recordSchema; private final Schema _genericAspectSchema; + private final Schema _changeTypeEnumSchema; private final EventFormatter _eventFormatter; public AvroSerializer() throws IOException { _recordSchema = new Schema.Parser() .parse(this.getClass().getClassLoader().getResourceAsStream("MetadataChangeProposal.avsc")); _genericAspectSchema = this._recordSchema.getField("aspect").schema().getTypes().get(1); + _changeTypeEnumSchema = this._recordSchema.getField("changeType").schema(); _eventFormatter = new EventFormatter(EventFormatter.Format.PEGASUS_JSON); } @@ -43,7 +45,7 @@ public GenericRecord serialize(MetadataChangeProposal mcp) throws IOException { genericRecord.put("aspect", genericAspect); genericRecord.put("aspectName", mcp.getAspectName()); genericRecord.put("entityType", mcp.getEntityType()); - 
genericRecord.put("changeType", mcp.getChangeType()); + genericRecord.put("changeType", new GenericData.EnumSymbol(_changeTypeEnumSchema, mcp.getChangeType())); return genericRecord; } } \ No newline at end of file diff --git a/metadata-integration/java/datahub-protobuf/README.md b/metadata-integration/java/datahub-protobuf/README.md index daea8d438679c..29b82aa3e68f5 100644 --- a/metadata-integration/java/datahub-protobuf/README.md +++ b/metadata-integration/java/datahub-protobuf/README.md @@ -1,6 +1,6 @@ # Protobuf Schemas -The `datahub-protobuf` module is designed to be used with the Java Emitter, the input is a compiled protobuf binary `*.protoc` files and optionally the corresponding `*.proto` source code. In addition, you can supply the root message in cases where a single protobuf source file includes multiple non-nested messages. +The `datahub-protobuf` module is designed to be used with the Java Emitter, the input is a compiled protobuf binary `*.protoc` files and optionally the corresponding `*.proto` source code. You can supply a file with multiple nested messages to be processed. If you have a file with multiple non-nested messages, you will need to separate them out into different files or supply the root message, as otherwise we will only process the first one. 
## Supported Features diff --git a/metadata-io/build.gradle b/metadata-io/build.gradle index ad54cf6524398..740fed61f13d5 100644 --- a/metadata-io/build.gradle +++ b/metadata-io/build.gradle @@ -8,9 +8,9 @@ configurations { dependencies { implementation project(':entity-registry') api project(':metadata-utils') - api project(':metadata-events:mxe-avro-1.7') + api project(':metadata-events:mxe-avro') api project(':metadata-events:mxe-registration') - api project(':metadata-events:mxe-utils-avro-1.7') + api project(':metadata-events:mxe-utils-avro') api project(':metadata-models') api project(':metadata-service:restli-client') api project(':metadata-service:configuration') diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/client/CachingEntitySearchService.java b/metadata-io/src/main/java/com/linkedin/metadata/search/client/CachingEntitySearchService.java index 13a7d16b723a7..ceaf37a1289d9 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/client/CachingEntitySearchService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/client/CachingEntitySearchService.java @@ -256,13 +256,13 @@ public ScrollResult getCachedScrollResults( cacheAccess.stop(); if (result == null) { Timer.Context cacheMiss = MetricUtils.timer(this.getClass(), "scroll_cache_miss").time(); - result = getRawScrollResults(entities, query, filters, sortCriterion, scrollId, keepAlive, size, isFullText); + result = getRawScrollResults(entities, query, filters, sortCriterion, scrollId, keepAlive, size, isFullText, flags); cache.put(cacheKey, toJsonString(result)); cacheMiss.stop(); MetricUtils.counter(this.getClass(), "scroll_cache_miss_count").inc(); } } else { - result = getRawScrollResults(entities, query, filters, sortCriterion, scrollId, keepAlive, size, isFullText); + result = getRawScrollResults(entities, query, filters, sortCriterion, scrollId, keepAlive, size, isFullText, flags); } return result; } @@ -328,7 +328,8 @@ private ScrollResult 
getRawScrollResults( @Nullable final String scrollId, @Nullable final String keepAlive, final int count, - final boolean fulltext) { + final boolean fulltext, + @Nullable final SearchFlags searchFlags) { if (fulltext) { return entitySearchService.fullTextScroll( entities, @@ -337,7 +338,8 @@ private ScrollResult getRawScrollResults( sortCriterion, scrollId, keepAlive, - count); + count, + searchFlags); } else { return entitySearchService.structuredScroll(entities, input, @@ -345,7 +347,8 @@ private ScrollResult getRawScrollResults( sortCriterion, scrollId, keepAlive, - count); + count, + searchFlags); } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java index ef5a555e95ba8..024cf2b0abec2 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java @@ -175,23 +175,26 @@ public List getBrowsePaths(@Nonnull String entityName, @Nonnull Urn urn) @Nonnull @Override public ScrollResult fullTextScroll(@Nonnull List entities, @Nonnull String input, @Nullable Filter postFilters, - @Nullable SortCriterion sortCriterion, @Nullable String scrollId, @Nullable String keepAlive, int size) { + @Nullable SortCriterion sortCriterion, @Nullable String scrollId, @Nullable String keepAlive, int size, @Nullable SearchFlags searchFlags) { log.debug(String.format( "Scrolling Structured Search documents entities: %s, input: %s, postFilters: %s, sortCriterion: %s, scrollId: %s, size: %s", entities, input, postFilters, sortCriterion, scrollId, size)); + SearchFlags flags = Optional.ofNullable(searchFlags).orElse(new SearchFlags()); + flags.setFulltext(true); return esSearchDAO.scroll(entities, input, postFilters, sortCriterion, scrollId, keepAlive, size, - new SearchFlags().setFulltext(true)); 
+ flags); } @Nonnull @Override public ScrollResult structuredScroll(@Nonnull List entities, @Nonnull String input, @Nullable Filter postFilters, - @Nullable SortCriterion sortCriterion, @Nullable String scrollId, @Nullable String keepAlive, int size) { + @Nullable SortCriterion sortCriterion, @Nullable String scrollId, @Nullable String keepAlive, int size, @Nullable SearchFlags searchFlags) { log.debug(String.format( "Scrolling FullText Search documents entities: %s, input: %s, postFilters: %s, sortCriterion: %s, scrollId: %s, size: %s", entities, input, postFilters, sortCriterion, scrollId, size)); - return esSearchDAO.scroll(entities, input, postFilters, sortCriterion, scrollId, keepAlive, size, - new SearchFlags().setFulltext(false)); + SearchFlags flags = Optional.ofNullable(searchFlags).orElse(new SearchFlags()); + flags.setFulltext(false); + return esSearchDAO.scroll(entities, input, postFilters, sortCriterion, scrollId, keepAlive, size, flags); } public Optional raw(@Nonnull String indexName, @Nullable String jsonQuery) { diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java index 004b2e0a2adc4..35cef71edd953 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java @@ -5,6 +5,7 @@ import com.linkedin.metadata.models.SearchScoreFieldSpec; import com.linkedin.metadata.models.SearchableFieldSpec; import com.linkedin.metadata.models.annotation.SearchableAnnotation.FieldType; +import com.linkedin.metadata.search.utils.ESUtils; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -31,15 +32,6 @@ public static Map getPartialNgramConfigWithOverrides(Map KEYWORD_TYPE_MAP = ImmutableMap.of(TYPE, KEYWORD); - // Field 
Types - public static final String BOOLEAN = "boolean"; - public static final String DATE = "date"; - public static final String DOUBLE = "double"; - public static final String LONG = "long"; - public static final String OBJECT = "object"; - public static final String TEXT = "text"; - public static final String TOKEN_COUNT = "token_count"; - // Subfields public static final String DELIMITED = "delimited"; public static final String LENGTH = "length"; @@ -74,7 +66,7 @@ public static Map getMappings(@Nonnull final EntitySpec entitySp private static Map getMappingsForUrn() { Map subFields = new HashMap<>(); subFields.put(DELIMITED, ImmutableMap.of( - TYPE, TEXT, + TYPE, ESUtils.TEXT_FIELD_TYPE, ANALYZER, URN_ANALYZER, SEARCH_ANALYZER, URN_SEARCH_ANALYZER, SEARCH_QUOTE_ANALYZER, CUSTOM_QUOTE_ANALYZER) @@ -85,13 +77,13 @@ private static Map getMappingsForUrn() { ) )); return ImmutableMap.builder() - .put(TYPE, KEYWORD) + .put(TYPE, ESUtils.KEYWORD_FIELD_TYPE) .put(FIELDS, subFields) .build(); } private static Map getMappingsForRunId() { - return ImmutableMap.builder().put(TYPE, KEYWORD).build(); + return ImmutableMap.builder().put(TYPE, ESUtils.KEYWORD_FIELD_TYPE).build(); } private static Map getMappingsForField(@Nonnull final SearchableFieldSpec searchableFieldSpec) { @@ -104,23 +96,23 @@ private static Map getMappingsForField(@Nonnull final Searchable } else if (fieldType == FieldType.TEXT || fieldType == FieldType.TEXT_PARTIAL || fieldType == FieldType.WORD_GRAM) { mappingForField.putAll(getMappingsForSearchText(fieldType)); } else if (fieldType == FieldType.BROWSE_PATH) { - mappingForField.put(TYPE, TEXT); + mappingForField.put(TYPE, ESUtils.TEXT_FIELD_TYPE); mappingForField.put(FIELDS, ImmutableMap.of(LENGTH, ImmutableMap.of( - TYPE, TOKEN_COUNT, + TYPE, ESUtils.TOKEN_COUNT_FIELD_TYPE, ANALYZER, SLASH_PATTERN_ANALYZER))); mappingForField.put(ANALYZER, BROWSE_PATH_HIERARCHY_ANALYZER); mappingForField.put(FIELDDATA, true); } else if (fieldType == 
FieldType.BROWSE_PATH_V2) { - mappingForField.put(TYPE, TEXT); + mappingForField.put(TYPE, ESUtils.TEXT_FIELD_TYPE); mappingForField.put(FIELDS, ImmutableMap.of(LENGTH, ImmutableMap.of( - TYPE, TOKEN_COUNT, + TYPE, ESUtils.TOKEN_COUNT_FIELD_TYPE, ANALYZER, UNIT_SEPARATOR_PATTERN_ANALYZER))); mappingForField.put(ANALYZER, BROWSE_PATH_V2_HIERARCHY_ANALYZER); mappingForField.put(FIELDDATA, true); } else if (fieldType == FieldType.URN || fieldType == FieldType.URN_PARTIAL) { - mappingForField.put(TYPE, TEXT); + mappingForField.put(TYPE, ESUtils.TEXT_FIELD_TYPE); mappingForField.put(ANALYZER, URN_ANALYZER); mappingForField.put(SEARCH_ANALYZER, URN_SEARCH_ANALYZER); mappingForField.put(SEARCH_QUOTE_ANALYZER, CUSTOM_QUOTE_ANALYZER); @@ -135,13 +127,13 @@ private static Map getMappingsForField(@Nonnull final Searchable subFields.put(KEYWORD, KEYWORD_TYPE_MAP); mappingForField.put(FIELDS, subFields); } else if (fieldType == FieldType.BOOLEAN) { - mappingForField.put(TYPE, BOOLEAN); + mappingForField.put(TYPE, ESUtils.BOOLEAN_FIELD_TYPE); } else if (fieldType == FieldType.COUNT) { - mappingForField.put(TYPE, LONG); + mappingForField.put(TYPE, ESUtils.LONG_FIELD_TYPE); } else if (fieldType == FieldType.DATETIME) { - mappingForField.put(TYPE, DATE); + mappingForField.put(TYPE, ESUtils.DATE_FIELD_TYPE); } else if (fieldType == FieldType.OBJECT) { - mappingForField.put(TYPE, OBJECT); + mappingForField.put(TYPE, ESUtils.OBJECT_FIELD_TYPE); } else { log.info("FieldType {} has no mappings implemented", fieldType); } @@ -149,10 +141,10 @@ private static Map getMappingsForField(@Nonnull final Searchable searchableFieldSpec.getSearchableAnnotation() .getHasValuesFieldName() - .ifPresent(fieldName -> mappings.put(fieldName, ImmutableMap.of(TYPE, BOOLEAN))); + .ifPresent(fieldName -> mappings.put(fieldName, ImmutableMap.of(TYPE, ESUtils.BOOLEAN_FIELD_TYPE))); searchableFieldSpec.getSearchableAnnotation() .getNumValuesFieldName() - .ifPresent(fieldName -> mappings.put(fieldName, 
ImmutableMap.of(TYPE, LONG))); + .ifPresent(fieldName -> mappings.put(fieldName, ImmutableMap.of(TYPE, ESUtils.LONG_FIELD_TYPE))); mappings.putAll(getMappingsForFieldNameAliases(searchableFieldSpec)); return mappings; @@ -160,7 +152,7 @@ private static Map getMappingsForField(@Nonnull final Searchable private static Map getMappingsForKeyword() { Map mappingForField = new HashMap<>(); - mappingForField.put(TYPE, KEYWORD); + mappingForField.put(TYPE, ESUtils.KEYWORD_FIELD_TYPE); mappingForField.put(NORMALIZER, KEYWORD_NORMALIZER); // Add keyword subfield without lowercase filter mappingForField.put(FIELDS, ImmutableMap.of(KEYWORD, KEYWORD_TYPE_MAP)); @@ -169,7 +161,7 @@ private static Map getMappingsForKeyword() { private static Map getMappingsForSearchText(FieldType fieldType) { Map mappingForField = new HashMap<>(); - mappingForField.put(TYPE, KEYWORD); + mappingForField.put(TYPE, ESUtils.KEYWORD_FIELD_TYPE); mappingForField.put(NORMALIZER, KEYWORD_NORMALIZER); Map subFields = new HashMap<>(); if (fieldType == FieldType.TEXT_PARTIAL || fieldType == FieldType.WORD_GRAM) { @@ -186,14 +178,14 @@ private static Map getMappingsForSearchText(FieldType fieldType) String fieldName = entry.getKey(); String analyzerName = entry.getValue(); subFields.put(fieldName, ImmutableMap.of( - TYPE, TEXT, + TYPE, ESUtils.TEXT_FIELD_TYPE, ANALYZER, analyzerName )); } } } subFields.put(DELIMITED, ImmutableMap.of( - TYPE, TEXT, + TYPE, ESUtils.TEXT_FIELD_TYPE, ANALYZER, TEXT_ANALYZER, SEARCH_ANALYZER, TEXT_SEARCH_ANALYZER, SEARCH_QUOTE_ANALYZER, CUSTOM_QUOTE_ANALYZER)); @@ -206,7 +198,7 @@ private static Map getMappingsForSearchText(FieldType fieldType) private static Map getMappingsForSearchScoreField( @Nonnull final SearchScoreFieldSpec searchScoreFieldSpec) { return ImmutableMap.of(searchScoreFieldSpec.getSearchScoreAnnotation().getFieldName(), - ImmutableMap.of(TYPE, DOUBLE)); + ImmutableMap.of(TYPE, ESUtils.DOUBLE_FIELD_TYPE)); } private static Map 
getMappingsForFieldNameAliases(@Nonnull final SearchableFieldSpec searchableFieldSpec) { diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java index 5fcc10b7af5cf..49571a60d5f21 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java @@ -202,7 +202,7 @@ public SearchRequest getSearchRequest(@Nonnull String input, @Nullable Filter fi if (!finalSearchFlags.isSkipHighlighting()) { searchSourceBuilder.highlighter(_highlights); } - ESUtils.buildSortOrder(searchSourceBuilder, sortCriterion); + ESUtils.buildSortOrder(searchSourceBuilder, sortCriterion, _entitySpecs); if (finalSearchFlags.isGetSuggestions()) { ESUtils.buildNameSuggestions(searchSourceBuilder, input); @@ -242,8 +242,10 @@ public SearchRequest getSearchRequest(@Nonnull String input, @Nullable Filter fi BoolQueryBuilder filterQuery = getFilterQuery(filter); searchSourceBuilder.query(QueryBuilders.boolQuery().must(getQuery(input, finalSearchFlags.isFulltext())).filter(filterQuery)); _aggregationQueryBuilder.getAggregations().forEach(searchSourceBuilder::aggregation); - searchSourceBuilder.highlighter(getHighlights()); - ESUtils.buildSortOrder(searchSourceBuilder, sortCriterion); + if (!finalSearchFlags.isSkipHighlighting()) { + searchSourceBuilder.highlighter(_highlights); + } + ESUtils.buildSortOrder(searchSourceBuilder, sortCriterion, _entitySpecs); searchRequest.source(searchSourceBuilder); log.debug("Search request is: " + searchRequest); searchRequest.indicesOptions(null); @@ -270,7 +272,7 @@ public SearchRequest getFilterRequest(@Nullable Filter filters, @Nullable SortCr final SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder(); 
searchSourceBuilder.query(filterQuery); searchSourceBuilder.from(from).size(size); - ESUtils.buildSortOrder(searchSourceBuilder, sortCriterion); + ESUtils.buildSortOrder(searchSourceBuilder, sortCriterion, _entitySpecs); searchRequest.source(searchSourceBuilder); return searchRequest; @@ -301,7 +303,7 @@ public SearchRequest getFilterRequest(@Nullable Filter filters, @Nullable SortCr searchSourceBuilder.size(size); ESUtils.setSearchAfter(searchSourceBuilder, sort, pitId, keepAlive); - ESUtils.buildSortOrder(searchSourceBuilder, sortCriterion); + ESUtils.buildSortOrder(searchSourceBuilder, sortCriterion, _entitySpecs); searchRequest.source(searchSourceBuilder); return searchRequest; diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java index 9a7d9a1b4c420..53765acb8e29e 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java @@ -2,6 +2,9 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; +import com.linkedin.metadata.models.EntitySpec; +import com.linkedin.metadata.models.SearchableFieldSpec; +import com.linkedin.metadata.models.annotation.SearchableAnnotation; import com.linkedin.metadata.query.filter.Condition; import com.linkedin.metadata.query.filter.ConjunctiveCriterion; import com.linkedin.metadata.query.filter.Criterion; @@ -49,7 +52,28 @@ public class ESUtils { public static final int MAX_RESULT_SIZE = 10000; public static final String OPAQUE_ID_HEADER = "X-Opaque-Id"; public static final String HEADER_VALUE_DELIMITER = "|"; - public static final String KEYWORD_TYPE = "keyword"; + + // Field types + public static final String KEYWORD_FIELD_TYPE = "keyword"; + public static final String BOOLEAN_FIELD_TYPE = "boolean"; + public static final String DATE_FIELD_TYPE = "date"; + public static final 
String DOUBLE_FIELD_TYPE = "double"; + public static final String LONG_FIELD_TYPE = "long"; + public static final String OBJECT_FIELD_TYPE = "object"; + public static final String TEXT_FIELD_TYPE = "text"; + public static final String TOKEN_COUNT_FIELD_TYPE = "token_count"; + // End of field types + + public static final Set FIELD_TYPES_STORED_AS_KEYWORD = Set.of( + SearchableAnnotation.FieldType.KEYWORD, + SearchableAnnotation.FieldType.TEXT, + SearchableAnnotation.FieldType.TEXT_PARTIAL, + SearchableAnnotation.FieldType.WORD_GRAM); + public static final Set FIELD_TYPES_STORED_AS_TEXT = Set.of( + SearchableAnnotation.FieldType.BROWSE_PATH, + SearchableAnnotation.FieldType.BROWSE_PATH_V2, + SearchableAnnotation.FieldType.URN, + SearchableAnnotation.FieldType.URN_PARTIAL); public static final String ENTITY_NAME_FIELD = "_entityName"; public static final String NAME_SUGGESTION = "nameSuggestion"; @@ -174,6 +198,25 @@ public static QueryBuilder getQueryBuilderFromCriterion(@Nonnull final Criterion return getQueryBuilderFromCriterionForSingleField(criterion, isTimeseries); } + public static String getElasticTypeForFieldType(SearchableAnnotation.FieldType fieldType) { + if (FIELD_TYPES_STORED_AS_KEYWORD.contains(fieldType)) { + return KEYWORD_FIELD_TYPE; + } else if (FIELD_TYPES_STORED_AS_TEXT.contains(fieldType)) { + return TEXT_FIELD_TYPE; + } else if (fieldType == SearchableAnnotation.FieldType.BOOLEAN) { + return BOOLEAN_FIELD_TYPE; + } else if (fieldType == SearchableAnnotation.FieldType.COUNT) { + return LONG_FIELD_TYPE; + } else if (fieldType == SearchableAnnotation.FieldType.DATETIME) { + return DATE_FIELD_TYPE; + } else if (fieldType == SearchableAnnotation.FieldType.OBJECT) { + return OBJECT_FIELD_TYPE; + } else { + log.warn("FieldType {} has no mappings implemented", fieldType); + return null; + } + } + /** * Populates source field of search query with the sort order as per the criterion provided. 
* @@ -189,14 +232,39 @@ public static QueryBuilder getQueryBuilderFromCriterion(@Nonnull final Criterion * @param sortCriterion {@link SortCriterion} to be applied to the search results */ public static void buildSortOrder(@Nonnull SearchSourceBuilder searchSourceBuilder, - @Nullable SortCriterion sortCriterion) { + @Nullable SortCriterion sortCriterion, List entitySpecs) { if (sortCriterion == null) { searchSourceBuilder.sort(new ScoreSortBuilder().order(SortOrder.DESC)); } else { + Optional fieldTypeForDefault = Optional.empty(); + for (EntitySpec entitySpec : entitySpecs) { + List fieldSpecs = entitySpec.getSearchableFieldSpecs(); + for (SearchableFieldSpec fieldSpec : fieldSpecs) { + SearchableAnnotation annotation = fieldSpec.getSearchableAnnotation(); + if (annotation.getFieldName().equals(sortCriterion.getField()) + || annotation.getFieldNameAliases().contains(sortCriterion.getField())) { + fieldTypeForDefault = Optional.of(fieldSpec.getSearchableAnnotation().getFieldType()); + break; + } + } + if (fieldTypeForDefault.isPresent()) { + break; + } + } + if (fieldTypeForDefault.isEmpty()) { + log.warn("Sort criterion field " + sortCriterion.getField() + " was not found in any entity spec to be searched"); + } final SortOrder esSortOrder = (sortCriterion.getOrder() == com.linkedin.metadata.query.filter.SortOrder.ASCENDING) ? 
SortOrder.ASC : SortOrder.DESC; - searchSourceBuilder.sort(new FieldSortBuilder(sortCriterion.getField()).order(esSortOrder).unmappedType(KEYWORD_TYPE)); + FieldSortBuilder sortBuilder = new FieldSortBuilder(sortCriterion.getField()).order(esSortOrder); + if (fieldTypeForDefault.isPresent()) { + String esFieldtype = getElasticTypeForFieldType(fieldTypeForDefault.get()); + if (esFieldtype != null) { + sortBuilder.unmappedType(esFieldtype); + } + } + searchSourceBuilder.sort(sortBuilder); } if (sortCriterion == null || !sortCriterion.getField().equals(DEFAULT_SEARCH_RESULTS_SORT_BY_FIELD)) { searchSourceBuilder.sort(new FieldSortBuilder(DEFAULT_SEARCH_RESULTS_SORT_BY_FIELD).order(SortOrder.ASC)); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java index a496fc427138e..3e8f83a531b59 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java @@ -169,7 +169,7 @@ public List getIndexSizes() { List res = new ArrayList<>(); try { String indicesPattern = _indexConvention.getAllTimeseriesAspectIndicesPattern(); - Response r = _searchClient.getLowLevelClient().performRequest(new Request("GET", indicesPattern + "/_stats")); + Response r = _searchClient.getLowLevelClient().performRequest(new Request("GET", "/" + indicesPattern + "/_stats")); JsonNode body = new ObjectMapper().readTree(r.getEntity().getContent()); body.get("indices").fields().forEachRemaining(entry -> { TimeseriesIndexSizeResult elemResult = new TimeseriesIndexSizeResult(); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/LineageServiceTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/search/LineageServiceTestBase.java index 
461a146022446..696e3b62834bd 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/LineageServiceTestBase.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/LineageServiceTestBase.java @@ -47,8 +47,10 @@ import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.elasticsearch.IndexConventionImpl; import org.junit.Assert; +import org.mockito.ArgumentCaptor; import org.mockito.Mockito; import org.opensearch.client.RestHighLevelClient; +import org.opensearch.action.search.SearchRequest; import org.springframework.cache.CacheManager; import org.springframework.cache.concurrent.ConcurrentMapCacheManager; import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; @@ -108,6 +110,7 @@ abstract public class LineageServiceTestBase extends AbstractTestNGSpringContext private GraphService _graphService; private CacheManager _cacheManager; private LineageSearchService _lineageSearchService; + private RestHighLevelClient _searchClientSpy; private static final String ENTITY_NAME = "testEntity"; private static final Urn TEST_URN = TestEntityUtil.getTestEntityUrn(); @@ -162,10 +165,11 @@ private ElasticSearchService buildEntitySearchService() { EntityIndexBuilders indexBuilders = new EntityIndexBuilders(getIndexBuilder(), _entityRegistry, _indexConvention, _settingsBuilder); - ESSearchDAO searchDAO = new ESSearchDAO(_entityRegistry, getSearchClient(), _indexConvention, false, + _searchClientSpy = spy(getSearchClient()); + ESSearchDAO searchDAO = new ESSearchDAO(_entityRegistry, _searchClientSpy, _indexConvention, false, ELASTICSEARCH_IMPLEMENTATION_ELASTICSEARCH, getSearchConfiguration(), null); - ESBrowseDAO browseDAO = new ESBrowseDAO(_entityRegistry, getSearchClient(), _indexConvention, getSearchConfiguration(), getCustomSearchConfiguration()); - ESWriteDAO writeDAO = new ESWriteDAO(_entityRegistry, getSearchClient(), _indexConvention, getBulkProcessor(), 1); + ESBrowseDAO browseDAO = 
new ESBrowseDAO(_entityRegistry, _searchClientSpy, _indexConvention, getSearchConfiguration(), getCustomSearchConfiguration()); + ESWriteDAO writeDAO = new ESWriteDAO(_entityRegistry, _searchClientSpy, _indexConvention, getBulkProcessor(), 1); return new ElasticSearchService(indexBuilders, searchDAO, browseDAO, writeDAO); } @@ -246,9 +250,15 @@ public void testSearchService() throws Exception { _elasticSearchService.upsertDocument(ENTITY_NAME, document2.toString(), urn2.toString()); syncAfterWrite(getBulkProcessor()); + Mockito.reset(_searchClientSpy); searchResult = searchAcrossLineage(null, TEST1); assertEquals(searchResult.getNumEntities().intValue(), 1); assertEquals(searchResult.getEntities().get(0).getEntity(), urn); + // Verify that highlighting was turned off in the query + ArgumentCaptor searchRequestCaptor = ArgumentCaptor.forClass(SearchRequest.class); + Mockito.verify(_searchClientSpy, times(1)).search(searchRequestCaptor.capture(), any()); + SearchRequest capturedRequest = searchRequestCaptor.getValue(); + assertNull(capturedRequest.source().highlighter()); clearCache(false); when(_graphService.getLineage(eq(TEST_URN), eq(LineageDirection.DOWNSTREAM), anyInt(), anyInt(), diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/SampleDataFixtureTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/SampleDataFixtureTestBase.java index 1660504810296..69dd5c80bef1d 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/SampleDataFixtureTestBase.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/SampleDataFixtureTestBase.java @@ -22,12 +22,15 @@ import com.linkedin.metadata.query.filter.Criterion; import com.linkedin.metadata.query.filter.CriterionArray; import com.linkedin.metadata.query.filter.Filter; +import com.linkedin.metadata.query.filter.SortCriterion; +import com.linkedin.metadata.query.filter.SortOrder; import 
com.linkedin.metadata.search.AggregationMetadata; import com.linkedin.metadata.search.ScrollResult; import com.linkedin.metadata.search.SearchEntity; import com.linkedin.metadata.search.SearchResult; import com.linkedin.metadata.search.SearchService; import com.linkedin.metadata.search.elasticsearch.query.request.SearchFieldConfig; +import com.linkedin.metadata.search.utils.ESUtils; import com.linkedin.r2.RemoteInvocationException; import org.junit.Assert; import org.opensearch.client.RequestOptions; @@ -36,6 +39,9 @@ import org.opensearch.client.indices.AnalyzeResponse; import org.opensearch.client.indices.GetMappingsRequest; import org.opensearch.client.indices.GetMappingsResponse; +import org.opensearch.search.builder.SearchSourceBuilder; +import org.opensearch.search.sort.FieldSortBuilder; +import org.opensearch.search.sort.SortBuilder; import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; import org.testng.annotations.Test; @@ -54,11 +60,7 @@ import static com.linkedin.metadata.Constants.DATA_JOB_ENTITY_NAME; import static com.linkedin.metadata.search.elasticsearch.query.request.SearchQueryBuilder.STRUCTURED_QUERY_PREFIX; import static com.linkedin.metadata.utils.SearchUtil.AGGREGATION_SEPARATOR_CHAR; -import static io.datahubproject.test.search.SearchTestUtils.autocomplete; -import static io.datahubproject.test.search.SearchTestUtils.scroll; -import static io.datahubproject.test.search.SearchTestUtils.search; -import static io.datahubproject.test.search.SearchTestUtils.searchAcrossEntities; -import static io.datahubproject.test.search.SearchTestUtils.searchStructured; +import static io.datahubproject.test.search.SearchTestUtils.*; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertNotNull; @@ -174,6 +176,48 @@ public void testSearchFieldConfig() throws IOException { } } + @Test + public void testGetSortOrder() { + String dateFieldName = 
"lastOperationTime"; + List entityNamesToTestSearch = List.of("dataset", "chart", "corpgroup"); + List entitySpecs = entityNamesToTestSearch.stream().map( + name -> getEntityRegistry().getEntitySpec(name)) + .collect(Collectors.toList()); + SearchSourceBuilder builder = new SearchSourceBuilder(); + SortCriterion sortCriterion = new SortCriterion().setOrder(SortOrder.DESCENDING).setField(dateFieldName); + ESUtils.buildSortOrder(builder, sortCriterion, entitySpecs); + List> sorts = builder.sorts(); + assertEquals(sorts.size(), 2); // sort by last modified and then by urn + for (SortBuilder sort : sorts) { + assertTrue(sort instanceof FieldSortBuilder); + FieldSortBuilder fieldSortBuilder = (FieldSortBuilder) sort; + if (fieldSortBuilder.getFieldName().equals(dateFieldName)) { + assertEquals(fieldSortBuilder.order(), org.opensearch.search.sort.SortOrder.DESC); + assertEquals(fieldSortBuilder.unmappedType(), "date"); + } else { + assertEquals(fieldSortBuilder.getFieldName(), "urn"); + } + } + + // Test alias field + String entityNameField = "_entityName"; + SearchSourceBuilder nameBuilder = new SearchSourceBuilder(); + SortCriterion nameCriterion = new SortCriterion().setOrder(SortOrder.ASCENDING).setField(entityNameField); + ESUtils.buildSortOrder(nameBuilder, nameCriterion, entitySpecs); + sorts = nameBuilder.sorts(); + assertEquals(sorts.size(), 2); + for (SortBuilder sort : sorts) { + assertTrue(sort instanceof FieldSortBuilder); + FieldSortBuilder fieldSortBuilder = (FieldSortBuilder) sort; + if (fieldSortBuilder.getFieldName().equals(entityNameField)) { + assertEquals(fieldSortBuilder.order(), org.opensearch.search.sort.SortOrder.ASC); + assertEquals(fieldSortBuilder.unmappedType(), "keyword"); + } else { + assertEquals(fieldSortBuilder.getFieldName(), "urn"); + } + } + } + @Test public void testDatasetHasTags() throws IOException { GetMappingsRequest req = new GetMappingsRequest() @@ -1454,6 +1498,16 @@ public void testColumnExactMatch() { "Expected table with 
column name exact match first"); } + @Test + public void testSortOrdering() { + String query = "unit_data"; + SortCriterion criterion = new SortCriterion().setOrder(SortOrder.ASCENDING).setField("lastOperationTime"); + SearchResult result = getSearchService().searchAcrossEntities(SEARCHABLE_ENTITIES, query, null, criterion, 0, + 100, new SearchFlags().setFulltext(true).setSkipCache(true), null); + assertTrue(result.getEntities().size() > 2, + String.format("%s - Expected search results to have at least two results", query)); + } + private Stream getTokens(AnalyzeRequest request) throws IOException { return getSearchClient().indices().analyze(request, RequestOptions.DEFAULT).getTokens().stream(); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java index 90c6c523c588f..0ea035a10f91d 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java @@ -97,6 +97,30 @@ public void testDatasetFieldsAndHighlights() { ), "unexpected lineage fields in highlights: " + highlightFields); } + @Test + public void testSearchRequestHandlerHighlightingTurnedOff() { + SearchRequestHandler requestHandler = SearchRequestHandler.getBuilder(TestEntitySpecBuilder.getSpec(), testQueryConfig, null); + SearchRequest searchRequest = requestHandler.getSearchRequest("testQuery", null, null, 0, + 10, new SearchFlags().setFulltext(false).setSkipHighlighting(true), null); + SearchSourceBuilder sourceBuilder = searchRequest.source(); + assertEquals(sourceBuilder.from(), 0); + assertEquals(sourceBuilder.size(), 10); + // Filters + Collection aggBuilders = sourceBuilder.aggregations().getAggregatorFactories(); + // Expect 2 aggregations: textFieldOverride and _entityType +
assertEquals(aggBuilders.size(), 2); + for (AggregationBuilder aggBuilder : aggBuilders) { + if (aggBuilder.getName().equals("textFieldOverride")) { + TermsAggregationBuilder filterPanelBuilder = (TermsAggregationBuilder) aggBuilder; + assertEquals(filterPanelBuilder.field(), "textFieldOverride.keyword"); + } else if (!aggBuilder.getName().equals("_entityType")) { + fail("Found unexepected aggregation: " + aggBuilder.getName()); + } + } + // Highlights should not be present + assertNull(sourceBuilder.highlighter()); + } + @Test public void testSearchRequestHandler() { SearchRequestHandler requestHandler = SearchRequestHandler.getBuilder(TestEntitySpecBuilder.getSpec(), testQueryConfig, null); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/timeseries/search/TimeseriesAspectServiceTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/timeseries/search/TimeseriesAspectServiceTestBase.java index cc60ba8679e1f..1362a0f69eff2 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/timeseries/search/TimeseriesAspectServiceTestBase.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/timeseries/search/TimeseriesAspectServiceTestBase.java @@ -45,6 +45,7 @@ import com.linkedin.timeseries.GroupingBucket; import com.linkedin.timeseries.GroupingBucketType; import com.linkedin.timeseries.TimeWindowSize; +import com.linkedin.timeseries.TimeseriesIndexSizeResult; import org.opensearch.client.RestHighLevelClient; import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; import org.testng.annotations.BeforeClass; @@ -884,4 +885,23 @@ public void testCountByFilterAfterDelete() throws InterruptedException { _elasticSearchTimeseriesAspectService.countByFilter(ENTITY_NAME, ASPECT_NAME, urnAndTimeFilter); assertEquals(count, 0L); } + + @Test(groups = {"getAggregatedStats"}, dependsOnGroups = {"upsert"}) + public void testGetIndexSizes() { + List result = _elasticSearchTimeseriesAspectService.getIndexSizes(); + //CHECKSTYLE:OFF + 
/* + Example result: + {aspectName=testentityprofile, sizeMb=52.234, + indexName=es_timeseries_aspect_service_test_testentity_testentityprofileaspect_v1, entityName=testentity} + {aspectName=testentityprofile, sizeMb=0.208, + indexName=es_timeseries_aspect_service_test_testentitywithouttests_testentityprofileaspect_v1, entityName=testentitywithouttests} + */ + // There may be other indices in there from other tests, so just make sure that index for entity + aspect is in there + //CHECKSTYLE:ON + assertTrue(result.size() > 0); + assertTrue( + result.stream().anyMatch(idxSizeResult -> idxSizeResult.getIndexName().equals( + "es_timeseries_aspect_service_test_testentity_testentityprofileaspect_v1"))); + } } diff --git a/metadata-io/src/test/java/io/datahubproject/test/search/SearchTestContainer.java b/metadata-io/src/test/java/io/datahubproject/test/search/SearchTestContainer.java index 67e1ee368f513..34aa6978f742f 100644 --- a/metadata-io/src/test/java/io/datahubproject/test/search/SearchTestContainer.java +++ b/metadata-io/src/test/java/io/datahubproject/test/search/SearchTestContainer.java @@ -5,7 +5,9 @@ import java.time.Duration; public interface SearchTestContainer { - String SEARCH_JAVA_OPTS = "-Xms64m -Xmx384m -XX:MaxDirectMemorySize=368435456"; + + String SEARCH_JAVA_OPTS = "-Xms446m -Xmx446m -XX:MaxDirectMemorySize=368435456"; + Duration STARTUP_TIMEOUT = Duration.ofMinutes(5); // usually < 1min GenericContainer startContainer(); diff --git a/metadata-jobs/mae-consumer/build.gradle b/metadata-jobs/mae-consumer/build.gradle index d36fd0de40d03..fcb8b62e4ac9d 100644 --- a/metadata-jobs/mae-consumer/build.gradle +++ b/metadata-jobs/mae-consumer/build.gradle @@ -21,9 +21,9 @@ dependencies { implementation project(':ingestion-scheduler') implementation project(':metadata-utils') implementation project(":entity-registry") - implementation project(':metadata-events:mxe-avro-1.7') + implementation project(':metadata-events:mxe-avro') implementation 
project(':metadata-events:mxe-registration') - implementation project(':metadata-events:mxe-utils-avro-1.7') + implementation project(':metadata-events:mxe-utils-avro') implementation project(':datahub-graphql-core') implementation externalDependency.elasticSearchRest diff --git a/metadata-jobs/mce-consumer/build.gradle b/metadata-jobs/mce-consumer/build.gradle index 0bca55e0e5f92..97eec9fcff051 100644 --- a/metadata-jobs/mce-consumer/build.gradle +++ b/metadata-jobs/mce-consumer/build.gradle @@ -17,9 +17,9 @@ dependencies { } implementation project(':metadata-utils') implementation project(':metadata-events:mxe-schemas') - implementation project(':metadata-events:mxe-avro-1.7') + implementation project(':metadata-events:mxe-avro') implementation project(':metadata-events:mxe-registration') - implementation project(':metadata-events:mxe-utils-avro-1.7') + implementation project(':metadata-events:mxe-utils-avro') implementation project(':metadata-io') implementation project(':metadata-service:restli-client') implementation spec.product.pegasus.restliClient diff --git a/metadata-jobs/pe-consumer/build.gradle b/metadata-jobs/pe-consumer/build.gradle index 1899a4de15635..81e8b8c9971f0 100644 --- a/metadata-jobs/pe-consumer/build.gradle +++ b/metadata-jobs/pe-consumer/build.gradle @@ -10,9 +10,9 @@ configurations { dependencies { avro project(path: ':metadata-models', configuration: 'avroSchema') implementation project(':li-utils') - implementation project(':metadata-events:mxe-avro-1.7') + implementation project(':metadata-events:mxe-avro') implementation project(':metadata-events:mxe-registration') - implementation project(':metadata-events:mxe-utils-avro-1.7') + implementation project(':metadata-events:mxe-utils-avro') implementation(project(':metadata-service:factories')) { exclude group: 'org.neo4j.test' } diff --git a/metadata-models/src/main/pegasus/com/linkedin/ingestion/DataHubIngestionSourceInfo.pdl 
b/metadata-models/src/main/pegasus/com/linkedin/ingestion/DataHubIngestionSourceInfo.pdl index b3e237202fc2f..f777b5d6e12e7 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/ingestion/DataHubIngestionSourceInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/ingestion/DataHubIngestionSourceInfo.pdl @@ -37,10 +37,10 @@ record DataHubIngestionSourceInfo { * Parameters associated with the Ingestion Source */ config: record DataHubIngestionSourceConfig { - /** - * The JSON recipe to use for ingestion - */ - recipe: string + /** + * The JSON recipe to use for ingestion + */ + recipe: string /** * The PyPI version of the datahub CLI to use when executing a recipe @@ -56,5 +56,10 @@ record DataHubIngestionSourceInfo { * Whether or not to run this ingestion source in debug mode */ debugMode: optional boolean + + /** + * Extra arguments for the ingestion run. + */ + extraArgs: optional map[string, string] } } \ No newline at end of file diff --git a/metadata-service/auth-config/src/main/java/com/datahub/authentication/AuthenticationConfiguration.java b/metadata-service/auth-config/src/main/java/com/datahub/authentication/AuthenticationConfiguration.java index f9cf1b01e1762..d3c5ba822ac04 100644 --- a/metadata-service/auth-config/src/main/java/com/datahub/authentication/AuthenticationConfiguration.java +++ b/metadata-service/auth-config/src/main/java/com/datahub/authentication/AuthenticationConfiguration.java @@ -29,4 +29,6 @@ public class AuthenticationConfiguration { * The lifespan of a UI session token. 
*/ private long sessionTokenDurationMs; + + private TokenServiceConfiguration tokenService; } diff --git a/metadata-service/auth-config/src/main/java/com/datahub/authentication/TokenServiceConfiguration.java b/metadata-service/auth-config/src/main/java/com/datahub/authentication/TokenServiceConfiguration.java new file mode 100644 index 0000000000000..0a606f0f06d92 --- /dev/null +++ b/metadata-service/auth-config/src/main/java/com/datahub/authentication/TokenServiceConfiguration.java @@ -0,0 +1,15 @@ +package com.datahub.authentication; + +import lombok.Data; + + +@Data +/** + * Configurations for DataHub token service + */ +public class TokenServiceConfiguration { + private String signingKey; + private String salt; + private String issuer; + private String signingAlgorithm; +} diff --git a/metadata-service/auth-filter/build.gradle b/metadata-service/auth-filter/build.gradle index 2dd07ef10274c..61e9015adc942 100644 --- a/metadata-service/auth-filter/build.gradle +++ b/metadata-service/auth-filter/build.gradle @@ -14,4 +14,6 @@ dependencies { annotationProcessor externalDependency.lombok testImplementation externalDependency.mockito + testImplementation externalDependency.testng + testImplementation externalDependency.springBootTest } \ No newline at end of file diff --git a/metadata-service/auth-filter/src/test/java/com/datahub/auth/authentication/AuthTestConfiguration.java b/metadata-service/auth-filter/src/test/java/com/datahub/auth/authentication/AuthTestConfiguration.java new file mode 100644 index 0000000000000..05ca428283a6c --- /dev/null +++ b/metadata-service/auth-filter/src/test/java/com/datahub/auth/authentication/AuthTestConfiguration.java @@ -0,0 +1,79 @@ +package com.datahub.auth.authentication; + +import com.datahub.auth.authentication.filter.AuthenticationFilter; +import com.datahub.authentication.AuthenticationConfiguration; +import com.datahub.authentication.AuthenticatorConfiguration; +import com.datahub.authentication.TokenServiceConfiguration; 
+import com.datahub.authentication.token.StatefulTokenService; +import com.linkedin.gms.factory.config.ConfigurationProvider; +import com.linkedin.metadata.config.AuthPluginConfiguration; +import com.linkedin.metadata.config.DataHubConfiguration; +import com.linkedin.metadata.config.PluginConfiguration; +import com.linkedin.metadata.entity.EntityService; +import java.util.List; +import java.util.Map; +import javax.servlet.ServletException; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; +import org.springframework.context.annotation.DependsOn; + +import static org.mockito.Mockito.*; + +@Configuration +public class AuthTestConfiguration { + + + @Bean + public EntityService entityService() { + return mock(EntityService.class); + } + + @Bean("dataHubTokenService") + public StatefulTokenService statefulTokenService(ConfigurationProvider configurationProvider, EntityService entityService) { + TokenServiceConfiguration tokenServiceConfiguration = configurationProvider.getAuthentication().getTokenService(); + return new StatefulTokenService( + tokenServiceConfiguration.getSigningKey(), + tokenServiceConfiguration.getSigningAlgorithm(), + tokenServiceConfiguration.getIssuer(), + entityService, + tokenServiceConfiguration.getSalt() + ); + } + + @Bean + public ConfigurationProvider configurationProvider() { + ConfigurationProvider configurationProvider = new ConfigurationProvider(); + AuthenticationConfiguration authenticationConfiguration = new AuthenticationConfiguration(); + authenticationConfiguration.setEnabled(true); + configurationProvider.setAuthentication(authenticationConfiguration); + DataHubConfiguration dataHubConfiguration = new DataHubConfiguration(); + PluginConfiguration pluginConfiguration = new PluginConfiguration(); + AuthPluginConfiguration authPluginConfiguration = new AuthPluginConfiguration(); + authenticationConfiguration.setSystemClientId("__datahub_system"); + 
authenticationConfiguration.setSystemClientSecret("JohnSnowKnowsNothing"); + TokenServiceConfiguration tokenServiceConfiguration = new TokenServiceConfiguration(); + tokenServiceConfiguration.setIssuer("datahub-metadata-service"); + tokenServiceConfiguration.setSigningKey("WnEdIeTG/VVCLQqGwC/BAkqyY0k+H8NEAtWGejrBI94="); + tokenServiceConfiguration.setSalt("ohDVbJBvHHVJh9S/UA4BYF9COuNnqqVhr9MLKEGXk1O="); + tokenServiceConfiguration.setSigningAlgorithm("HS256"); + authenticationConfiguration.setTokenService(tokenServiceConfiguration); + AuthenticatorConfiguration authenticator = new AuthenticatorConfiguration(); + authenticator.setType("com.datahub.authentication.authenticator.DataHubTokenAuthenticator"); + authenticator.setConfigs(Map.of("signingKey", "WnEdIeTG/VVCLQqGwC/BAkqyY0k+H8NEAtWGejrBI94=", + "salt", "ohDVbJBvHHVJh9S/UA4BYF9COuNnqqVhr9MLKEGXk1O=")); + List authenticators = List.of(authenticator); + authenticationConfiguration.setAuthenticators(authenticators); + authPluginConfiguration.setPath(""); + pluginConfiguration.setAuth(authPluginConfiguration); + dataHubConfiguration.setPlugin(pluginConfiguration); + configurationProvider.setDatahub(dataHubConfiguration); + return configurationProvider; + } + + @Bean + // TODO: Constructor injection + @DependsOn({"configurationProvider", "dataHubTokenService", "entityService"}) + public AuthenticationFilter authenticationFilter() throws ServletException { + return new AuthenticationFilter(); + } +} diff --git a/metadata-service/auth-filter/src/test/java/com/datahub/auth/authentication/AuthenticationFilterTest.java b/metadata-service/auth-filter/src/test/java/com/datahub/auth/authentication/AuthenticationFilterTest.java new file mode 100644 index 0000000000000..2ac65bf09c912 --- /dev/null +++ b/metadata-service/auth-filter/src/test/java/com/datahub/auth/authentication/AuthenticationFilterTest.java @@ -0,0 +1,53 @@ +package com.datahub.auth.authentication; + +import 
com.datahub.auth.authentication.filter.AuthenticationFilter; +import com.datahub.authentication.Actor; +import com.datahub.authentication.ActorType; +import com.datahub.authentication.token.StatefulTokenService; +import com.datahub.authentication.token.TokenException; +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import javax.servlet.FilterChain; +import javax.servlet.ServletException; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; +import org.testng.annotations.Test; + +import static com.datahub.authentication.AuthenticationConstants.*; +import static org.mockito.Mockito.*; + + +@ContextConfiguration(classes = { AuthTestConfiguration.class }) +public class AuthenticationFilterTest extends AbstractTestNGSpringContextTests { + + @Autowired + AuthenticationFilter _authenticationFilter; + + @Autowired + StatefulTokenService _statefulTokenService; + + @Test + public void testExpiredToken() throws ServletException, IOException, TokenException { + _authenticationFilter.init(null); + HttpServletRequest servletRequest = mock(HttpServletRequest.class); + HttpServletResponse servletResponse = mock(HttpServletResponse.class); + FilterChain filterChain = mock(FilterChain.class); + Actor actor = new Actor(ActorType.USER, "datahub"); +// String token = _statefulTokenService.generateAccessToken(TokenType.SESSION, actor, 0L, System.currentTimeMillis(), "token", +// "token", actor.toUrnStr()); + // Token generated 9/11/23, invalid for all future dates + String token = "eyJhbGciOiJIUzI1NiJ9.eyJhY3RvclR5cGUiOiJVU0VSIZCI6ImRhdGFodWIiLCJ0eXBlIjoiU0VTU0lPTiIsInZlcnNpb24iOiIxIiwian" + + 
"RpIjoiMmI0MzZkZDAtYjEwOS00N2UwLWJmYTEtMzM2ZmU4MTU4MDE1Iiwic3ViIjoiZGF0YWh1YiIsImV4cCI6MTY5NDU0NzA2OCwiaXNzIjoiZGF" + + "0YWh1Yi1tZXRhZGF0YS1zZXJ2aWNlIn0.giqx7J5a9mxuubG6rXdAMoaGlcII-fqY-W82Wm7OlLI"; + when(servletRequest.getHeaderNames()).thenReturn(Collections.enumeration(List.of(AUTHORIZATION_HEADER_NAME))); + when(servletRequest.getHeader(AUTHORIZATION_HEADER_NAME)) + .thenReturn("Bearer " + token); + + _authenticationFilter.doFilter(servletRequest, servletResponse, filterChain); + verify(servletResponse, times(1)).sendError(eq(HttpServletResponse.SC_UNAUTHORIZED), anyString()); + } +} diff --git a/metadata-service/configuration/src/main/resources/application.yml b/metadata-service/configuration/src/main/resources/application.yml index d22f92adca8f9..5d72e24748072 100644 --- a/metadata-service/configuration/src/main/resources/application.yml +++ b/metadata-service/configuration/src/main/resources/application.yml @@ -25,6 +25,8 @@ authentication: # Key used to sign new tokens. signingKey: ${DATAHUB_TOKEN_SERVICE_SIGNING_KEY:WnEdIeTG/VVCLQqGwC/BAkqyY0k+H8NEAtWGejrBI94=} salt: ${DATAHUB_TOKEN_SERVICE_SALT:ohDVbJBvHHVJh9S/UA4BYF9COuNnqqVhr9MLKEGXk1O=} + issuer: ${DATAHUB_TOKEN_SERVICE_ISSUER:datahub-metadata-service} + signingAlgorithm: ${DATAHUB_TOKEN_SERVICE_SIGNING_ALGORITHM:HS256} # The max duration of a UI session in milliseconds. Defaults to 1 day. 
sessionTokenDurationMs: ${SESSION_TOKEN_DURATION_MS:86400000} diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/DataHubTokenServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/DataHubTokenServiceFactory.java index 6b2a61882be90..d47e1a0a73401 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/DataHubTokenServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/DataHubTokenServiceFactory.java @@ -23,10 +23,10 @@ public class DataHubTokenServiceFactory { @Value("${authentication.tokenService.salt:}") private String saltingKey; - @Value("${elasticsearch.tokenService.signingAlgorithm:HS256}") + @Value("${authentication.tokenService.signingAlgorithm:HS256}") private String signingAlgorithm; - @Value("${elasticsearch.tokenService.issuer:datahub-metadata-service}") + @Value("${authentication.tokenService.issuer:datahub-metadata-service}") private String issuer; /** diff --git a/metadata-service/restli-servlet-impl/build.gradle b/metadata-service/restli-servlet-impl/build.gradle index cb307863748c3..de6fb6690e693 100644 --- a/metadata-service/restli-servlet-impl/build.gradle +++ b/metadata-service/restli-servlet-impl/build.gradle @@ -48,7 +48,7 @@ dependencies { implementation externalDependency.dropwizardMetricsCore implementation externalDependency.dropwizardMetricsJmx - compileOnly externalDependency.lombok + implementation externalDependency.lombok implementation externalDependency.neo4jJavaDriver implementation externalDependency.opentelemetryAnnotations diff --git a/metadata-service/services/build.gradle b/metadata-service/services/build.gradle index 22c62af324c12..b6af3d330d185 100644 --- a/metadata-service/services/build.gradle +++ b/metadata-service/services/build.gradle @@ -9,9 +9,9 @@ dependencies { implementation externalDependency.jsonPatch implementation project(':entity-registry') implementation 
project(':metadata-utils') - implementation project(':metadata-events:mxe-avro-1.7') + implementation project(':metadata-events:mxe-avro') implementation project(':metadata-events:mxe-registration') - implementation project(':metadata-events:mxe-utils-avro-1.7') + implementation project(':metadata-events:mxe-utils-avro') implementation project(':metadata-models') implementation project(':metadata-service:restli-client') implementation project(':metadata-service:configuration') diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java b/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java index a46b58aabfb0b..64f59780b887f 100644 --- a/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java @@ -188,11 +188,12 @@ BrowseResult browse(@Nonnull String entityName, @Nonnull String path, @Nullable * @param sortCriterion {@link SortCriterion} to be applied to search results * @param scrollId opaque scroll identifier to pass to search service * @param size the number of search hits to return + * @param searchFlags flags controlling search options * @return a {@link ScrollResult} that contains a list of matched documents and related search result metadata */ @Nonnull ScrollResult fullTextScroll(@Nonnull List entities, @Nonnull String input, @Nullable Filter postFilters, - @Nullable SortCriterion sortCriterion, @Nullable String scrollId, @Nonnull String keepAlive, int size); + @Nullable SortCriterion sortCriterion, @Nullable String scrollId, @Nonnull String keepAlive, int size, @Nullable SearchFlags searchFlags); /** * Gets a list of documents that match given search request. 
The results are aggregated and filters are applied to the @@ -204,11 +205,12 @@ ScrollResult fullTextScroll(@Nonnull List entities, @Nonnull String inpu * @param sortCriterion {@link SortCriterion} to be applied to search results * @param scrollId opaque scroll identifier to pass to search service * @param size the number of search hits to return + * @param searchFlags flags controlling search options * @return a {@link ScrollResult} that contains a list of matched documents and related search result metadata */ @Nonnull ScrollResult structuredScroll(@Nonnull List entities, @Nonnull String input, @Nullable Filter postFilters, - @Nullable SortCriterion sortCriterion, @Nullable String scrollId, @Nonnull String keepAlive, int size); + @Nullable SortCriterion sortCriterion, @Nullable String scrollId, @Nonnull String keepAlive, int size, @Nullable SearchFlags searchFlags); /** * Max result size returned by the underlying search backend diff --git a/metadata-utils/build.gradle b/metadata-utils/build.gradle index 1c1c368611488..7bc6aa2d43442 100644 --- a/metadata-utils/build.gradle +++ b/metadata-utils/build.gradle @@ -1,7 +1,7 @@ apply plugin: 'java-library' dependencies { - api externalDependency.avro_1_7 + api externalDependency.avro implementation externalDependency.commonsLang api externalDependency.dropwizardMetricsCore implementation externalDependency.dropwizardMetricsJmx @@ -16,8 +16,8 @@ dependencies { api project(':li-utils') api project(':entity-registry') - api project(':metadata-events:mxe-avro-1.7') - api project(':metadata-events:mxe-utils-avro-1.7') + api project(':metadata-events:mxe-avro') + api project(':metadata-events:mxe-utils-avro') implementation externalDependency.slf4jApi compileOnly externalDependency.lombok diff --git a/settings.gradle b/settings.gradle index d6777b07b3fb3..52de461383b5e 100644 --- a/settings.gradle +++ b/settings.gradle @@ -20,10 +20,10 @@ include 'metadata-service:openapi-analytics-servlet' include 'metadata-service:plugin' 
include 'metadata-service:plugin:src:test:sample-test-plugins' include 'metadata-dao-impl:kafka-producer' -include 'metadata-events:mxe-avro-1.7' +include 'metadata-events:mxe-avro' include 'metadata-events:mxe-registration' include 'metadata-events:mxe-schemas' -include 'metadata-events:mxe-utils-avro-1.7' +include 'metadata-events:mxe-utils-avro' include 'metadata-ingestion' include 'metadata-jobs:mae-consumer' include 'metadata-jobs:mce-consumer' diff --git a/smoke-test/tests/containers/containers_test.py b/smoke-test/tests/containers/containers_test.py index 05a45239dabf8..227645a87d30a 100644 --- a/smoke-test/tests/containers/containers_test.py +++ b/smoke-test/tests/containers/containers_test.py @@ -227,6 +227,7 @@ def test_update_container(frontend_session, ingest_cleanup_data): "ownerUrn": new_owner, "resourceUrn": container_urn, "ownerEntityType": "CORP_USER", + "ownershipTypeUrn": "urn:li:ownershipType:__system__technical_owner" } }, } diff --git a/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js b/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js index e0d2bf240d74d..aeceaf99be889 100644 --- a/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js +++ b/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js @@ -4,42 +4,47 @@ const glossaryParentGroup = "Cypress"; describe("glossary sidebar navigation test", () => { it("create term and term parent group, move and delete term group", () => { - //create a new term group and term, move term to the group + + // Create a new term group and term, move term to the group cy.loginWithCredentials(); cy.goToGlossaryList(); - cy.clickOptionWithText("Add Term Group"); + cy.clickOptionWithTestId("add-term-group-button"); cy.waitTextVisible("Create Term Group"); - cy.get(".ant-input-affix-wrapper > input[type='text']").first().type(glossaryTermGroup); - cy.get(".ant-modal-footer > button:last-child").click(); - 
cy.get('*[class^="GlossaryBrowser"]').contains(glossaryTermGroup).should("be.visible"); - cy.clickOptionWithText("Add Term"); + cy.enterTextInTestId("create-glossary-entity-modal-name", glossaryTermGroup); + cy.clickOptionWithTestId("glossary-entity-modal-create-button"); + cy.get('[data-testid="glossary-browser-sidebar"]').contains(glossaryTermGroup).should("be.visible"); + cy.clickOptionWithTestId("add-term-button"); + cy.waitTextVisible("Created Term Group!"); cy.waitTextVisible("Create Glossary Term"); - cy.get(".ant-input-affix-wrapper > input[type='text']").first().type(glossaryTerm); - cy.get(".ant-modal-footer > button:last-child").click(); - cy.get('*[class^="GlossaryBrowser"]').contains(glossaryTerm).click(); - cy.waitTextVisible("No documentation yet"); + cy.enterTextInTestId("create-glossary-entity-modal-name", glossaryTerm); + cy.clickOptionWithTestId("glossary-entity-modal-create-button").wait(3000); + cy.get('[data-testid="glossary-browser-sidebar"]').contains(glossaryTerm).click().wait(3000); cy.openThreeDotDropdown(); - cy.clickOptionWithText("Move"); - cy.get('[role="dialog"]').contains(glossaryTermGroup).click({force: true}); - cy.get('[role="dialog"]').contains(glossaryTermGroup).should("be.visible"); - cy.get("button").contains("Move").click(); + cy.clickOptionWithTestId("entity-menu-move-button") + cy.get('[data-testid="move-glossary-entity-modal"]').contains(glossaryTermGroup).click({force: true}); + cy.get('[data-testid="move-glossary-entity-modal"]').contains(glossaryTermGroup).should("be.visible"); + cy.clickOptionWithTestId("glossary-entity-modal-move-button"); cy.waitTextVisible("Moved Glossary Term!"); - //ensure the new term is under the parent term group in the navigation sidebar - cy.get('*[class^="GlossaryBrowser"]').contains(glossaryTermGroup).click(); + + // Ensure the new term is under the parent term group in the navigation sidebar + cy.get('[data-testid="glossary-browser-sidebar"]').contains(glossaryTermGroup).click(); 
cy.get('*[class^="GlossaryEntitiesList"]').contains(glossaryTerm).should("be.visible"); - //move a term group from the root level to be under a parent term group + + // Move a term group from the root level to be under a parent term group cy.goToGlossaryList(); cy.clickOptionWithText(glossaryTermGroup); cy.openThreeDotDropdown(); cy.clickOptionWithText("Move"); - cy.get('[role="dialog"]').contains(glossaryParentGroup).click({force: true}); - cy.get('[role="dialog"]').contains(glossaryParentGroup).should("be.visible"); - cy.get("button").contains("Move").click(); + cy.get('[data-testid="move-glossary-entity-modal"]').contains(glossaryParentGroup).click({force: true}); + cy.get('[data-testid="move-glossary-entity-modal"]').contains(glossaryParentGroup).should("be.visible"); + cy.clickOptionWithTestId("glossary-entity-modal-move-button"); cy.waitTextVisible("Moved Term Group!"); - //ensure it is no longer on the sidebar navigator at the top level but shows up under the new parent - cy.get('*[class^="GlossaryBrowser"]').contains(glossaryParentGroup).click(); + + // Ensure it is no longer on the sidebar navigator at the top level but shows up under the new parent + cy.get('[data-testid="glossary-browser-sidebar"]').contains(glossaryParentGroup).click(); cy.get('*[class^="GlossaryEntitiesList"]').contains(glossaryTermGroup).should("be.visible"); - //delete a term group + + // Delete a term group cy.goToGlossaryList(); cy.clickOptionWithText(glossaryParentGroup); cy.clickOptionWithText(glossaryTermGroup); @@ -50,7 +55,8 @@ describe("glossary sidebar navigation test", () => { cy.clickOptionWithText(glossaryTermGroup).wait(3000); cy.deleteFromDropdown(); cy.waitTextVisible("Deleted Term Group!"); - //ensure it is no longer in the sidebar navigator + + // Ensure it is no longer in the sidebar navigator cy.ensureTextNotPresent(glossaryTerm); cy.ensureTextNotPresent(glossaryTermGroup); }); diff --git a/smoke-test/tests/cypress/cypress/e2e/lineage/download_lineage_results.js 
b/smoke-test/tests/cypress/cypress/e2e/lineage/download_lineage_results.js new file mode 100644 index 0000000000000..315aa7b22b9da --- /dev/null +++ b/smoke-test/tests/cypress/cypress/e2e/lineage/download_lineage_results.js @@ -0,0 +1,80 @@ +const test_dataset = "urn:li:dataset:(urn:li:dataPlatform:kafka,SampleCypressKafkaDataset,PROD)"; +const first_degree = [ + "urn:li:chart:(looker,cypress_baz1)", + "urn:li:dataset:(urn:li:dataPlatform:hdfs,SampleCypressHdfsDataset,PROD)", + "urn:li:mlFeature:(cypress-test-2,some-cypress-feature-1)" +]; +const second_degree = [ + "urn:li:chart:(looker,cypress_baz2)", + "urn:li:dashboard:(looker,cypress_baz)", + "urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)", + "urn:li:mlPrimaryKey:(cypress-test-2,some-cypress-feature-2)" +]; +const third_degree_plus = [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,cypress_dag_abc,PROD),cypress_task_123)", + "urn:li:dataJob:(urn:li:dataFlow:(airflow,cypress_dag_abc,PROD),cypress_task_456)", + "urn:li:dataset:(urn:li:dataPlatform:hive,cypress_logging_events,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:hive,fct_cypress_users_created,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:hive,fct_cypress_users_created_no_tag,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:hive,fct_cypress_users_deleted,PROD)" +]; +const downloadCsvFile = (filename) => { + cy.get('[data-testid="three-dot-menu"]').click(); + cy.get('[data-testid="download-as-csv-menu-item"]').click(); + cy.get('[data-testid="download-as-csv-input"]').clear().type(filename); + cy.get('[data-testid="csv-modal-download-button"]').click().wait(5000); + cy.ensureTextNotPresent("Creating CSV to download"); +}; + +describe("download lineage results to .csv file", () => { + + it("download and verify lineage results for 1st, 2nd and 3+ degree of dependencies", () => { + cy.loginWithCredentials(); + cy.goToDataset(test_dataset,"SampleCypressKafkaDataset"); + cy.openEntityTab("Lineage"); + + // Verify 1st degree of 
dependencies + cy.contains(/1 - 3 of 3/); + downloadCsvFile("first_degree_results.csv"); + let first_degree_csv = cy.readFile('cypress/downloads/first_degree_results.csv'); + first_degree.forEach(function (urn) { + first_degree_csv.should('contain', urn) + }); + second_degree.forEach(function (urn) { + first_degree_csv.should('not.contain', urn) + }); + third_degree_plus.forEach(function (urn) { + first_degree_csv.should('not.contain', urn); + }); + + // Verify 1st and 2nd degree of dependencies + cy.get('[data-testid="facet-degree-2"]').click().wait(5000); + cy.contains(/1 - 7 of 7/); + downloadCsvFile("second_degree_results.csv"); + let second_degree_csv = cy.readFile('cypress/downloads/second_degree_results.csv'); + first_degree.forEach(function (urn) { + second_degree_csv.should('contain', urn) + }); + second_degree.forEach(function (urn) { + second_degree_csv.should('contain', urn) + }); + third_degree_plus.forEach(function (urn) { + second_degree_csv.should('not.contain', urn); + }); + + // Verify 1st 2nd and 3+ degree of dependencies(Verify multi page download) + cy.get('[data-testid="facet-degree-3+"]').click().wait(5000); + cy.contains(/1 - 10 of 13/); + downloadCsvFile("third_plus_degree_results.csv"); + let third_degree_csv = cy.readFile('cypress/downloads/third_plus_degree_results.csv'); + first_degree.forEach(function (urn) { + third_degree_csv.should('contain', urn) + }); + second_degree.forEach(function (urn) { + third_degree_csv.should('contain', urn) + }); + third_degree_plus.forEach(function (urn) { + third_degree_csv.should('contain', urn); + }); + }); +}); \ No newline at end of file diff --git a/smoke-test/tests/cypress/cypress/e2e/lineage/lineage_column_path.js b/smoke-test/tests/cypress/cypress/e2e/lineage/lineage_column_path.js new file mode 100644 index 0000000000000..37ca62c8d1229 --- /dev/null +++ b/smoke-test/tests/cypress/cypress/e2e/lineage/lineage_column_path.js @@ -0,0 +1,68 @@ +import { aliasQuery } from "../utils"; +const 
DATASET_ENTITY_TYPE = 'dataset'; +const DATASET_URN = 'urn:li:dataset:(urn:li:dataPlatform:hdfs,SampleCypressHdfsDataset,PROD)'; +const DOWNSTREAM_DATASET_URN = "urn:li:dataset:(urn:li:dataPlatform:kafka,SampleCypressKafkaDataset,PROD)"; +const upstreamColumn = '[data-testid="node-urn:li:dataset:(urn:li:dataPlatform:kafka,SampleCypressKafkaDataset,PROD)-Upstream"] text'; +const downstreamColumn = '[data-testid="node-urn:li:dataset:(urn:li:dataPlatform:hdfs,SampleCypressHdfsDataset,PROD)-Downstream"] text'; + +const verifyColumnPathModal = (from, to) => { + cy.get('[data-testid="entity-paths-modal"]').contains(from).should("be.visible"); + cy.get('[data-testid="entity-paths-modal"]').contains(to).should("be.visible"); +}; + +describe("column-Level lineage and impact analysis path test", () => { + beforeEach(() => { + cy.on('uncaught:exception', (err, runnable) => { return false; }); + cy.intercept("POST", "/api/v2/graphql", (req) => { + aliasQuery(req, "appConfig"); + }); + }); + + it("verify column-level lineage path at lineage praph and impact analysis ", () => { + // Open dataset with column-level lineage configured an navigate to lineage tab -> visualize lineage + cy.loginWithCredentials(); + cy.goToEntityLineageGraph(DATASET_ENTITY_TYPE, DATASET_URN); + + // Enable “show columns” toggle + cy.waitTextVisible("SampleCypressHdfs"); + cy.clickOptionWithTestId("column-toggle"); + cy.waitTextVisible("shipment_info"); + + // Verify functionality of column lineage + cy.get(upstreamColumn).eq(3).click(); + cy.get(upstreamColumn).eq(3).prev().should('not.have.attr', 'fill', 'white'); + cy.get(downstreamColumn).eq(2).prev().should('not.have.attr', 'stroke', 'transparent'); + cy.get(downstreamColumn).eq(2).click(); + cy.get(downstreamColumn).eq(2).prev().should('not.have.attr', 'fill', 'white'); + cy.get(upstreamColumn).eq(3).prev().should('not.have.attr', 'stroke', 'transparent'); + + // Open dataset impact analysis view, enable column lineage + 
cy.goToDataset(DATASET_URN, "SampleCypressHdfsDataset"); + cy.openEntityTab("Lineage"); + cy.clickOptionWithText("Column Lineage"); + cy.clickOptionWithText("Downstream"); + + // Verify upstream column lineage, test column path modal + cy.clickOptionWithText("Upstream"); + cy.waitTextVisible("SampleCypressKafkaDataset"); + cy.ensureTextNotPresent("field_bar"); + cy.contains("Select column").click({ force: true}).wait(1000); + cy.get(".rc-virtual-list").contains("shipment_info").click(); + cy.waitTextVisible("field_bar"); + cy.clickOptionWithText("field_bar"); + verifyColumnPathModal("shipment_info", "field_bar"); + cy.get('[data-testid="entity-paths-modal"] [data-icon="close"]').click(); + + // Verify downstream column lineage, test column path modal + cy.goToDataset(DOWNSTREAM_DATASET_URN, "SampleCypressKafkaDataset"); + cy.openEntityTab("Lineage"); + cy.clickOptionWithText("Column Lineage"); + cy.ensureTextNotPresent("shipment_info"); + cy.contains("Select column").click({ force: true}).wait(1000); + cy.get(".rc-virtual-list").contains("field_bar").click(); + cy.waitTextVisible("shipment_info"); + cy.clickOptionWithText("shipment_info"); + verifyColumnPathModal("shipment_info", "field_bar"); + cy.get('[data-testid="entity-paths-modal"] [data-icon="close"]').click(); + }); +}); \ No newline at end of file diff --git a/smoke-test/tests/cypress/cypress/e2e/lineage/lineage_graph.js b/smoke-test/tests/cypress/cypress/e2e/lineage/lineage_graph.js index 9e035f7f89772..85db210649c27 100644 --- a/smoke-test/tests/cypress/cypress/e2e/lineage/lineage_graph.js +++ b/smoke-test/tests/cypress/cypress/e2e/lineage/lineage_graph.js @@ -5,8 +5,6 @@ const TASKS_ENTITY_TYPE = 'tasks'; const DATASET_URN = 'urn:li:dataset:(urn:li:dataPlatform:kafka,SampleCypressKafkaDataset,PROD)'; const JAN_1_2021_TIMESTAMP = 1609553357755; const JAN_1_2022_TIMESTAMP = 1641089357755; -const TIMESTAMP_MILLIS_EIGHT_DAYS_AGO = getTimestampMillisNumDaysAgo(8); -const TIMESTAMP_MILLIS_ONE_DAY_AGO = 
getTimestampMillisNumDaysAgo(1); const TIMESTAMP_MILLIS_14_DAYS_AGO = getTimestampMillisNumDaysAgo(14); const TIMESTAMP_MILLIS_7_DAYS_AGO = getTimestampMillisNumDaysAgo(7); const TIMESTAMP_MILLIS_NOW = getTimestampMillisNumDaysAgo(0); diff --git a/smoke-test/tests/cypress/cypress/e2e/mutations/dataset_ownership.js b/smoke-test/tests/cypress/cypress/e2e/mutations/dataset_ownership.js index fcc0566f3f6ce..99ad9a68d35e1 100644 --- a/smoke-test/tests/cypress/cypress/e2e/mutations/dataset_ownership.js +++ b/smoke-test/tests/cypress/cypress/e2e/mutations/dataset_ownership.js @@ -5,7 +5,7 @@ const password = "Example password"; const group_name = `Test group ${test_id}`; const addOwner = (owner, type, elementId) => { - cy.clickOptionWithText("Add Owners"); + cy.clickOptionWithTestId("add-owners-button"); cy.contains("Search for users or groups...").click({ force: true }); cy.focused().type(owner); cy.clickOptionWithText(owner); diff --git a/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js b/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js index 83b66e2cb2549..5f9758a35ca0e 100644 --- a/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js +++ b/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js @@ -10,20 +10,20 @@ describe("edit documentation and link to dataset", () => { cy.visit( "/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)/Schema" ); - cy.get("[role='tab']").contains("Documentation").click(); + cy.openEntityTab("Documentation"); cy.waitTextVisible("my hive dataset"); cy.waitTextVisible("Sample doc"); - cy.clickOptionWithText("Edit"); + cy.clickOptionWithTestId("edit-documentation-button"); cy.focused().clear(); cy.focused().type(documentation_edited); - cy.get("button").contains("Save").click(); + cy.clickOptionWithTestId("description-editor-save-button"); cy.waitTextVisible("Description Updated"); cy.waitTextVisible(documentation_edited); //return 
documentation to original state - cy.clickOptionWithText("Edit"); + cy.clickOptionWithTestId("edit-documentation-button"); cy.focused().clear().wait(1000); cy.focused().type("my hive dataset"); - cy.get("button").contains("Save").click(); + cy.clickOptionWithTestId("description-editor-save-button"); cy.waitTextVisible("Description Updated"); cy.waitTextVisible("my hive dataset"); }); @@ -33,21 +33,21 @@ describe("edit documentation and link to dataset", () => { cy.visit( "/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)/Schema" ); - cy.get("[role='tab']").contains("Documentation").click(); + cy.openEntityTab("Documentation"); cy.contains("Sample doc").trigger("mouseover", { force: true }); cy.get('[data-icon="delete"]').click(); cy.waitTextVisible("Link Removed"); - cy.get("button").contains("Add Link").click().wait(1000); - cy.get('[role="dialog"] #addLinkForm_url').type(wrong_url); + cy.clickOptionWithTestId("add-link-button").wait(1000); + cy.enterTextInTestId("add-link-modal-url", wrong_url); cy.waitTextVisible("This field must be a valid url."); cy.focused().clear(); cy.waitTextVisible("A URL is required."); - cy.focused().type(correct_url); + cy.enterTextInTestId("add-link-modal-url", correct_url); cy.ensureTextNotPresent("This field must be a valid url."); - cy.get("#addLinkForm_label").type("Sample doc"); - cy.get('[role="dialog"] button').contains("Add").click(); + cy.enterTextInTestId("add-link-modal-label", "Sample doc"); + cy.clickOptionWithTestId("add-link-modal-add-button"); cy.waitTextVisible("Link Added"); - cy.get("[role='tab']").contains("Documentation").click(); + cy.openEntityTab("Documentation"); cy.get(`[href='${correct_url}']`).should("be.visible"); }); @@ -55,18 +55,18 @@ describe("edit documentation and link to dataset", () => { cy.loginWithCredentials(); cy.visit("/domain/urn:li:domain:marketing/Entities"); cy.waitTextVisible("SampleCypressKafkaDataset"); - cy.get("button").contains("Add 
Link").click().wait(1000); - cy.get('[role="dialog"] #addLinkForm_url').type(wrong_url); + cy.clickOptionWithTestId("add-link-button").wait(1000); + cy.enterTextInTestId("add-link-modal-url", wrong_url); cy.waitTextVisible("This field must be a valid url."); cy.focused().clear(); cy.waitTextVisible("A URL is required."); - cy.focused().type(correct_url); + cy.enterTextInTestId("add-link-modal-url", correct_url); cy.ensureTextNotPresent("This field must be a valid url."); - cy.get("#addLinkForm_label").type("Sample doc"); - cy.get('[role="dialog"] button').contains("Add").click(); + cy.enterTextInTestId("add-link-modal-label", "Sample doc"); + cy.clickOptionWithTestId("add-link-modal-add-button"); cy.waitTextVisible("Link Added"); - cy.get("[role='tab']").contains("Documentation").click(); - cy.waitTextVisible("Edit"); + cy.openEntityTab("Documentation"); + cy.get("[data-testid='edit-documentation-button']").should("be.visible"); cy.get(`[href='${correct_url}']`).should("be.visible"); cy.contains("Sample doc").trigger("mouseover", { force: true }); cy.get('[data-icon="delete"]').click(); @@ -83,14 +83,14 @@ describe("edit documentation and link to dataset", () => { cy.waitTextVisible("Foo field description has changed"); cy.focused().clear().wait(1000); cy.focused().type(documentation_edited); - cy.get("button").contains("Update").click(); + cy.clickOptionWithTestId("description-modal-update-button"); cy.waitTextVisible("Updated!"); cy.waitTextVisible(documentation_edited); cy.waitTextVisible("(edited)"); cy.get("tbody [data-icon='edit']").first().click({ force: true }); cy.focused().clear().wait(1000); cy.focused().type("Foo field description has changed"); - cy.get("button").contains("Update").click(); + cy.clickOptionWithTestId("description-modal-update-button"); cy.waitTextVisible("Updated!"); cy.waitTextVisible("Foo field description has changed"); cy.waitTextVisible("(edited)"); diff --git a/smoke-test/tests/cypress/cypress/e2e/mutations/ingestion_source.js 
b/smoke-test/tests/cypress/cypress/e2e/mutations/ingestion_source.js new file mode 100644 index 0000000000000..6c5dd77810644 --- /dev/null +++ b/smoke-test/tests/cypress/cypress/e2e/mutations/ingestion_source.js @@ -0,0 +1,68 @@ + +const number = Math.floor(Math.random() * 100000); +const accound_id = `account${number}`; +const warehouse_id = `warehouse${number}`; +const username = `user${number}`; +const password = `password${number}`; +const role = `role${number}`; +const ingestion_source_name = `ingestion source ${number}`; + +describe("ingestion source creation flow", () => { + it("create a ingestion source using ui, verify ingestion source details saved correctly, remove ingestion source", () => { + // Go to ingestion page, create a snowflake source + cy.loginWithCredentials(); + cy.goToIngestionPage(); + cy.clickOptionWithTestId("create-ingestion-source-button"); + cy.clickOptionWithText("Snowflake"); + cy.waitTextVisible("Snowflake Recipe"); + cy.get("#account_id").type(accound_id); + cy.get("#warehouse").type(warehouse_id); + cy.get("#username").type(username); + cy.get("#password").type(password); + cy.focused().blur(); + cy.get("#role").type(role); + + // Verify yaml recipe is generated correctly + cy.clickOptionWithTestId("recipe-builder-yaml-button"); + cy.waitTextVisible("account_id"); + cy.waitTextVisible(accound_id); + cy.waitTextVisible(warehouse_id); + cy.waitTextVisible(username); + cy.waitTextVisible(password); + cy.waitTextVisible(role); + + // Finish creating source + cy.clickOptionWithTestId("recipe-builder-next-button"); + cy.waitTextVisible("Configure an Ingestion Schedule"); + cy.clickOptionWithTestId("ingestion-schedule-next-button"); + cy.waitTextVisible("Give this ingestion source a name."); + cy.get('[data-testid="source-name-input"]').type(ingestion_source_name); + cy.clickOptionWithTestId("ingestion-source-save-button"); + cy.waitTextVisible("Successfully created ingestion source!").wait(5000) + 
cy.waitTextVisible(ingestion_source_name); + cy.get('[data-testid="ingestion-source-table-status"]').contains("Pending...").should("be.visible"); + + // Verify ingestion source details are saved correctly + cy.get('[data-testid="ingestion-source-table-edit-button"]').first().click(); + cy.waitTextVisible("Edit Ingestion Source"); + cy.get("#account_id").should("have.value", accound_id); + cy.get("#warehouse").should("have.value", warehouse_id); + cy.get("#username").should("have.value", username); + cy.get("#password").should("have.value", password); + cy.get("#role").should("have.value", role); + cy.get("button").contains("Next").click(); + cy.waitTextVisible("Configure an Ingestion Schedule"); + cy.clickOptionWithTestId("ingestion-schedule-next-button"); + cy.get('[data-testid="source-name-input"]').clear().type(ingestion_source_name + " EDITED"); + cy.clickOptionWithTestId("ingestion-source-save-button"); + cy.waitTextVisible("Successfully updated ingestion source!"); + cy.waitTextVisible(ingestion_source_name + " EDITED"); + + // Remove ingestion source + cy.get('[data-testid="delete-button"]').first().click(); + cy.waitTextVisible("Confirm Ingestion Source Removal"); + cy.get("button").contains("Yes").click(); + cy.waitTextVisible("Removed ingestion source."); + cy.ensureTextNotPresent(ingestion_source_name + " EDITED") + }) +}); \ No newline at end of file diff --git a/smoke-test/tests/cypress/cypress/e2e/mutations/managing_secrets.js b/smoke-test/tests/cypress/cypress/e2e/mutations/managing_secrets.js index 466bb2ef0757e..77fd63b9cae02 100644 --- a/smoke-test/tests/cypress/cypress/e2e/mutations/managing_secrets.js +++ b/smoke-test/tests/cypress/cypress/e2e/mutations/managing_secrets.js @@ -8,23 +8,24 @@ const ingestion_source_name = `ingestion source ${number}`; describe("managing secrets for ingestion creation", () => { it("create a secret, create ingestion source using a secret, remove a secret", () => { + // Navigate to the manage ingestion page → secrets 
cy.loginWithCredentials(); - //navigate to the manage ingestion page → secrets cy.goToIngestionPage(); - cy.clickOptionWithText("Secrets"); - //create a new secret - cy.clickOptionWithText("Create new secret"); - cy.get('[role="dialog"]').contains("Create a new Secret").should("be.visible"); - cy.get('[role="dialog"] #name').type(`secretname${number}`); - cy.get('[role="dialog"] #value').type(`secretvalue${number}`); - cy.get('[role="dialog"] #description').type(`secretdescription${number}`); - cy.get('#createSecretButton').click(); + cy.openEntityTab("Secrets"); + + // Create a new secret + cy.clickOptionWithTestId("create-secret-button"); + cy.enterTextInTestId('secret-modal-name-input', `secretname${number}`); + cy.enterTextInTestId('secret-modal-value-input', `secretvalue${number}`); + cy.enterTextInTestId('secret-modal-description-input', `secretdescription${number}`); + cy.clickOptionWithTestId("secret-modal-create-button"); cy.waitTextVisible("Successfully created Secret!"); cy.waitTextVisible(`secretname${number}`); - cy.waitTextVisible(`secretdescription${number}`).wait(5000)//prevent issue with missing secret - //create an ingestion source using a secret + cy.waitTextVisible(`secretdescription${number}`).wait(5000) + + // Create an ingestion source using a secret cy.goToIngestionPage(); - cy.clickOptionWithText("Create new source"); + cy.get("#ingestion-create-source").click(); cy.clickOptionWithText("Snowflake"); cy.waitTextVisible("Snowflake Recipe"); cy.get("#account_id").type(accound_id); @@ -40,11 +41,12 @@ describe("managing secrets for ingestion creation", () => { cy.waitTextVisible("Give this ingestion source a name."); cy.get('[data-testid="source-name-input"]').type(ingestion_source_name); cy.get("button").contains("Save").click(); - cy.waitTextVisible("Successfully created ingestion source!").wait(5000)//prevent issue with missing form data + cy.waitTextVisible("Successfully created ingestion source!").wait(5000) 
cy.waitTextVisible(ingestion_source_name); cy.get("button").contains("Pending...").should("be.visible"); - //remove a secret - cy.clickOptionWithText("Secrets"); + + // Remove a secret + cy.openEntityTab("Secrets"); cy.waitTextVisible(`secretname${number}`); cy.get('[data-icon="delete"]').first().click(); cy.waitTextVisible("Confirm Secret Removal"); @@ -52,14 +54,16 @@ describe("managing secrets for ingestion creation", () => { cy.waitTextVisible("Removed secret."); cy.ensureTextNotPresent(`secretname${number}`); cy.ensureTextNotPresent(`secretdescription${number}`); - //remove ingestion source + + // Remove ingestion source cy.goToIngestionPage(); cy.get('[data-testid="delete-button"]').first().click(); cy.waitTextVisible("Confirm Ingestion Source Removal"); cy.get("button").contains("Yes").click(); cy.waitTextVisible("Removed ingestion source."); cy.ensureTextNotPresent(ingestion_source_name) - //verify secret is not present during ingestion source creation for password dropdown + + // Verify secret is not present during ingestion source creation for password dropdown cy.clickOptionWithText("Create new source"); cy.clickOptionWithText("Snowflake"); cy.waitTextVisible("Snowflake Recipe"); @@ -68,13 +72,13 @@ describe("managing secrets for ingestion creation", () => { cy.get("#username").type(username); cy.get("#password").click().wait(1000); cy.ensureTextNotPresent(`secretname${number}`); - //verify secret can be added during ingestion source creation and used successfully + + // Verify secret can be added during ingestion source creation and used successfully cy.clickOptionWithText("Create Secret"); - cy.get('[role="dialog"]').contains("Create a new Secret").should("be.visible"); - cy.get('[role="dialog"] #name').type(`secretname${number}`); - cy.get('[role="dialog"] #value').type(`secretvalue${number}`); - cy.get('[role="dialog"] #description').type(`secretdescription${number}`); - cy.get('#createSecretButton').click(); + 
cy.enterTextInTestId('secret-modal-name-input', `secretname${number}`) + cy.enterTextInTestId('secret-modal-value-input', `secretvalue${number}`) + cy.enterTextInTestId('secret-modal-description-input', `secretdescription${number}`) + cy.clickOptionWithTestId("secret-modal-create-button"); cy.waitTextVisible("Created secret!"); cy.get("#role").type(role); cy.get("button").contains("Next").click(); @@ -86,6 +90,7 @@ describe("managing secrets for ingestion creation", () => { cy.waitTextVisible("Successfully created ingestion source!").wait(5000)//prevent issue with missing form data cy.waitTextVisible(ingestion_source_name); cy.get("button").contains("Pending...").should("be.visible"); + //Remove ingestion source and secret cy.goToIngestionPage(); cy.get('[data-testid="delete-button"]').first().click(); diff --git a/smoke-test/tests/cypress/cypress/support/commands.js b/smoke-test/tests/cypress/cypress/support/commands.js index 64bc1253fc383..5e3664f944edf 100644 --- a/smoke-test/tests/cypress/cypress/support/commands.js +++ b/smoke-test/tests/cypress/cypress/support/commands.js @@ -66,6 +66,7 @@ Cypress.Commands.add("logout", () => { Cypress.Commands.add("goToGlossaryList", () => { cy.visit("/glossary"); cy.waitTextVisible("Glossary"); + cy.wait(3000); }); Cypress.Commands.add("goToDomainList", () => {