diff --git a/.github/workflows/airflow-plugin.yml b/.github/workflows/airflow-plugin.yml index 2c931162fb006..ab5b3eb48da7f 100644 --- a/.github/workflows/airflow-plugin.yml +++ b/.github/workflows/airflow-plugin.yml @@ -40,19 +40,19 @@ jobs: extra_pip_requirements: "apache-airflow~=2.2.4" extra_pip_extras: plugin-v1 - python-version: "3.10" - extra_pip_requirements: "apache-airflow==2.4.3" + extra_pip_requirements: "apache-airflow~=2.4.3" extra_pip_extras: plugin-v2,test-airflow24 - python-version: "3.10" - extra_pip_requirements: 'apache-airflow==2.6.3 -c https://raw.githubusercontent.com/apache/airflow/constraints-2.6.3/constraints-3.10.txt' + extra_pip_requirements: 'apache-airflow~=2.6.3 -c https://raw.githubusercontent.com/apache/airflow/constraints-2.6.3/constraints-3.10.txt' extra_pip_extras: plugin-v2 - python-version: "3.10" - extra_pip_requirements: 'apache-airflow==2.7.3 -c https://raw.githubusercontent.com/apache/airflow/constraints-2.7.3/constraints-3.10.txt' + extra_pip_requirements: 'apache-airflow~=2.7.3 -c https://raw.githubusercontent.com/apache/airflow/constraints-2.7.3/constraints-3.10.txt' extra_pip_extras: plugin-v2 - python-version: "3.10" - extra_pip_requirements: 'apache-airflow==2.8.1 -c https://raw.githubusercontent.com/apache/airflow/constraints-2.8.1/constraints-3.10.txt' + extra_pip_requirements: 'apache-airflow~=2.8.1 -c https://raw.githubusercontent.com/apache/airflow/constraints-2.8.1/constraints-3.10.txt' extra_pip_extras: plugin-v2 - - python-version: "3.10" - extra_pip_requirements: 'apache-airflow==2.9.0 -c https://raw.githubusercontent.com/apache/airflow/constraints-2.9.0/constraints-3.10.txt' + - python-version: "3.11" + extra_pip_requirements: 'apache-airflow~=2.9.3 -c https://raw.githubusercontent.com/apache/airflow/constraints-2.9.3/constraints-3.10.txt' extra_pip_extras: plugin-v2 fail-fast: false steps: diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ResolverUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ResolverUtils.java index 542745e014862..3617eb4725979 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ResolverUtils.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ResolverUtils.java @@ -255,4 +255,19 @@ public static Filter viewFilter( Filter result = SearchUtils.combineFilters(null, viewInfo.getDefinition().getFilter()); return result; } + + /** + * Simply resolves the end time filter for the search across lineage query. If the start time is + * provided, but end time is not provided, we will default to the current time. 
+ */ + public static Long getLineageEndTimeMillis( + @Nullable Long startTimeMillis, @Nullable Long endTimeMillis) { + if (endTimeMillis != null) { + return endTimeMillis; + } + if (startTimeMillis != null) { + return System.currentTimeMillis(); + } + return null; + } } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/load/EntityLineageResultResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/load/EntityLineageResultResolver.java index 51b00bbe7b799..d872ffad2783d 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/load/EntityLineageResultResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/load/EntityLineageResultResolver.java @@ -18,6 +18,7 @@ import com.linkedin.datahub.graphql.generated.LineageInput; import com.linkedin.datahub.graphql.generated.LineageRelationship; import com.linkedin.datahub.graphql.generated.Restricted; +import com.linkedin.datahub.graphql.resolvers.ResolverUtils; import com.linkedin.datahub.graphql.types.common.mappers.UrnToEntityMapper; import com.linkedin.metadata.graph.SiblingGraphService; import graphql.schema.DataFetcher; @@ -63,7 +64,10 @@ public CompletableFuture get(DataFetchingEnvironment enviro @Nullable final Integer count = input.getCount(); // Optional! @Nullable final Boolean separateSiblings = input.getSeparateSiblings(); // Optional! @Nullable final Long startTimeMillis = input.getStartTimeMillis(); // Optional! - @Nullable final Long endTimeMillis = input.getEndTimeMillis(); // Optional! + @Nullable + final Long endTimeMillis = + ResolverUtils.getLineageEndTimeMillis( + input.getStartTimeMillis(), input.getEndTimeMillis()); // Optional! com.linkedin.metadata.graph.LineageDirection resolvedDirection = com.linkedin.metadata.graph.LineageDirection.valueOf(lineageDirection.toString()); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/ScrollAcrossLineageResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/ScrollAcrossLineageResolver.java index 14b2d3b8f8420..1b719b6f78620 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/ScrollAcrossLineageResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/ScrollAcrossLineageResolver.java @@ -78,7 +78,8 @@ public CompletableFuture get(DataFetchingEnvironment @Nullable Long startTimeMillis = input.getStartTimeMillis() == null ? null : input.getStartTimeMillis(); @Nullable - Long endTimeMillis = input.getEndTimeMillis() == null ? 
null : input.getEndTimeMillis(); + Long endTimeMillis = + ResolverUtils.getLineageEndTimeMillis(input.getStartTimeMillis(), input.getEndTimeMillis()); final LineageFlags lineageFlags = LineageFlagsInputMapper.map(context, input.getLineageFlags()); if (lineageFlags.getStartTimeMillis() == null && startTimeMillis != null) { diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchAcrossLineageResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchAcrossLineageResolver.java index f342d251acd72..dc3a1fc17e4ec 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchAcrossLineageResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchAcrossLineageResolver.java @@ -111,7 +111,8 @@ public CompletableFuture get(DataFetchingEnvironment @Nullable Long startTimeMillis = input.getStartTimeMillis() == null ? null : input.getStartTimeMillis(); @Nullable - Long endTimeMillis = input.getEndTimeMillis() == null ? null : input.getEndTimeMillis(); + Long endTimeMillis = + ResolverUtils.getLineageEndTimeMillis(input.getStartTimeMillis(), input.getEndTimeMillis()); final LineageFlags lineageFlags = LineageFlagsInputMapper.map(context, input.getLineageFlags()); if (lineageFlags.getStartTimeMillis() == null && startTimeMillis != null) { diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/LineageFlagsInputMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/LineageFlagsInputMapper.java index 43c24c9630d64..87664ef2af4c7 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/LineageFlagsInputMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/LineageFlagsInputMapper.java @@ -6,6 +6,7 @@ import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.generated.EntityTypeToPlatforms; import com.linkedin.datahub.graphql.generated.LineageFlags; +import com.linkedin.datahub.graphql.resolvers.ResolverUtils; import com.linkedin.datahub.graphql.types.entitytype.EntityTypeMapper; import com.linkedin.datahub.graphql.types.mappers.ModelMapper; import java.util.Collections; @@ -42,12 +43,16 @@ public com.linkedin.metadata.query.LineageFlags apply( if (lineageFlags.getIgnoreAsHops() != null) { result.setIgnoreAsHops(mapIgnoreAsHops(lineageFlags.getIgnoreAsHops())); } - if (lineageFlags.getEndTimeMillis() != null) { - result.setEndTimeMillis(lineageFlags.getEndTimeMillis()); - } if (lineageFlags.getStartTimeMillis() != null) { result.setStartTimeMillis(lineageFlags.getStartTimeMillis()); } + // Default to "now" if no end time is provided, but start time is provided. 
+ Long endTimeMillis = + ResolverUtils.getLineageEndTimeMillis( + lineageFlags.getStartTimeMillis(), lineageFlags.getEndTimeMillis()); + if (endTimeMillis != null) { + result.setEndTimeMillis(endTimeMillis); + } if (lineageFlags.getEntitiesExploredPerHopLimit() != null) { result.setEntitiesExploredPerHopLimit(lineageFlags.getEntitiesExploredPerHopLimit()); } diff --git a/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx b/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx index e6db6bfcc9a61..21a9be9dfb386 100644 --- a/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx +++ b/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx @@ -32,6 +32,7 @@ import { INGESTION_REFRESH_SOURCES_ID, } from '../../onboarding/config/IngestionOnboardingConfig'; import { ONE_SECOND_IN_MS } from '../../entity/shared/tabs/Dataset/Queries/utils/constants'; +import { useCommandS } from './hooks'; const PLACEHOLDER_URN = 'placeholder-urn'; @@ -51,6 +52,8 @@ const FilterWrapper = styled.div` display: flex; `; +const SYSTEM_INTERNAL_SOURCE_TYPE = 'SYSTEM'; + export enum IngestionSourceType { ALL, UI, @@ -102,6 +105,17 @@ export const IngestionSourceList = () => { // Set of removed urns used to account for eventual consistency const [removedUrns, setRemovedUrns] = useState([]); const [sourceFilter, setSourceFilter] = useState(IngestionSourceType.ALL); + const [hideSystemSources, setHideSystemSources] = useState(true); + + /** + * Show or hide system ingestion sources using a hidden command S command. + */ + useCommandS(() => setHideSystemSources(!hideSystemSources)); + + // Ingestion Source Default Filters + const filters = hideSystemSources + ? [{ field: 'sourceType', values: [SYSTEM_INTERNAL_SOURCE_TYPE], negated: true }] + : undefined; // Ingestion Source Queries const { loading, error, data, client, refetch } = useListIngestionSourcesQuery({ @@ -110,6 +124,7 @@ export const IngestionSourceList = () => { start, count: pageSize, query: (query?.length && query) || undefined, + filters, }, }, fetchPolicy: (query?.length || 0) > 0 ? 
'no-cache' : 'cache-first', diff --git a/datahub-web-react/src/app/ingest/source/hooks.ts b/datahub-web-react/src/app/ingest/source/hooks.ts new file mode 100644 index 0000000000000..7197c9daffa9c --- /dev/null +++ b/datahub-web-react/src/app/ingest/source/hooks.ts @@ -0,0 +1,16 @@ +import { useEffect } from 'react'; + +export const useCommandS = (onPress: () => void) => { + useEffect(() => { + const handleKeyDown = (event: KeyboardEvent) => { + if (event.metaKey && event.key === 's') { + event.preventDefault(); + onPress(); + } + }; + window.addEventListener('keydown', handleKeyDown); + return () => { + window.removeEventListener('keydown', handleKeyDown); + }; + }, [onPress]); +}; diff --git a/docs-website/docusaurus.config.js b/docs-website/docusaurus.config.js index c1cb51bfb1a80..5f823a93b4b43 100644 --- a/docs-website/docusaurus.config.js +++ b/docs-website/docusaurus.config.js @@ -76,6 +76,12 @@ module.exports = { label: "Docs", position: "right", }, + { + to: "/learn", + activeBasePath: "learn", + label: "Learn", + position: "right", + }, { to: "/integrations", activeBasePath: "integrations", @@ -299,7 +305,15 @@ module.exports = { showLastUpdateAuthor: false, showLastUpdateTime: false, }, - blog: false, + blog: { + blogTitle: "DataHub Learn", + blogSidebarTitle: "DataHub Learn", + blogDescription: "Learn about the hot topics in the data ecosystem and how DataHub can help you with your data journey.", + path: "src/learn", + routeBasePath: "learn", + postsPerPage: "ALL", + blogListComponent: "../src/learn/_components/LearnListPage", + }, theme: { customCss: [ isSaas ? require.resolve("./src/styles/acryl.scss") : require.resolve("./src/styles/datahub.scss"), diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index e58dbd4d99b0b..8e48062af6d4d 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -209,11 +209,6 @@ module.exports = { }, items: [ "docs/managed-datahub/welcome-acryl", - { - type: "doc", - id: "docs/managed-datahub/saas-slack-setup", - className: "saasOnly", - }, { type: "doc", id: "docs/managed-datahub/approval-workflows", @@ -247,6 +242,20 @@ module.exports = { }, ], }, + { + Slack: [ + { + type: "doc", + id: "docs/managed-datahub/slack/saas-slack-setup", + className: "saasOnly", + }, + { + type: "doc", + id: "docs/managed-datahub/slack/saas-slack-app", + className: "saasOnly", + }, + ], + }, { "Operator Guide": [ { diff --git a/docs-website/src/learn/_components/LearnItemCard/index.jsx b/docs-website/src/learn/_components/LearnItemCard/index.jsx new file mode 100644 index 0000000000000..9c6b6cfdc98d8 --- /dev/null +++ b/docs-website/src/learn/_components/LearnItemCard/index.jsx @@ -0,0 +1,30 @@ +import React from "react"; +import clsx from "clsx"; +import Link from "@docusaurus/Link"; +import { useBlogPost } from "@docusaurus/theme-common/internal"; +import styles from "./styles.module.scss"; + +export default function LearnItemCard() { + const { metadata } = useBlogPost(); + const { permalink, title, description, formattedDate, frontMatter } = metadata; + return ( +
+    <div className={clsx("col col--4", styles.featureCol)}>
+      <Link to={permalink} className={clsx("card", styles.card)}>
+        {frontMatter?.image ? (
+          <div className={styles.card_image}>
+            <img src={frontMatter?.image} alt={title} />
+          </div>
+        ) : (
+          <div className={styles.featureHeader}>
+            <h2>{title}</h2>
+          </div>
+        )}
+        <div className={styles.featureBody}>
+          <p>{description}</p>
+        </div>
+        <div className={styles.card_date}>Published on {formattedDate}</div>
+      </Link>
+    </div>
+ ); +} \ No newline at end of file diff --git a/docs-website/src/learn/_components/LearnItemCard/styles.module.scss b/docs-website/src/learn/_components/LearnItemCard/styles.module.scss new file mode 100644 index 0000000000000..2bfaabdc06d49 --- /dev/null +++ b/docs-website/src/learn/_components/LearnItemCard/styles.module.scss @@ -0,0 +1,53 @@ +.featureCol { + display: flex; +} + +.card_date { + padding: 1rem 2rem; + font-size: 0.8rem; + font-style: italic; + color: gray; + margin-top: auto; +} + +.card_feature { + font-size: 2rem; + font-weight: 700; +} + +.card { + color: var(--ifm-text-color); + text-decoration: none !important; + padding: 0rem; + margin-bottom: 2rem; + align-self: stretch; + flex-grow: 1; + &:hover { + border-color: var(--ifm-color-primary); + } + hr { + margin: 0; + } +} + +.featureHeader { + h2 { + margin-bottom: 1rem !important; + font-size: 1.25rem; + } + padding: 1rem 2rem; +} + +.featureBody { + padding: 0 2rem; +} + +.card_image { + margin: 0; + margin-bottom: 0.5rem; + + img { + width: 100%; + height: auto; + } +} \ No newline at end of file diff --git a/docs-website/src/learn/_components/LearnListPage/index.jsx b/docs-website/src/learn/_components/LearnListPage/index.jsx new file mode 100644 index 0000000000000..4df87a340f21e --- /dev/null +++ b/docs-website/src/learn/_components/LearnListPage/index.jsx @@ -0,0 +1,91 @@ +import React, { useState } from "react"; +import clsx from "clsx"; + +import useDocusaurusContext from "@docusaurus/useDocusaurusContext"; +import { PageMetadata, HtmlClassNameProvider, ThemeClassNames } from "@docusaurus/theme-common"; +import BlogListPaginator from "@theme/BlogListPaginator"; +import SearchMetadata from "@theme/SearchMetadata"; +import { BlogPostProvider } from "@docusaurus/theme-common/internal"; +import LearnItemCard from "../LearnItemCard"; +import Layout from "@theme/Layout"; +import styles from "./styles.module.scss"; + +function BlogListPageMetadata(props) { + const { metadata } = props; + const { + siteConfig: { title: siteTitle }, + } = useDocusaurusContext(); + const { blogDescription, blogTitle, permalink } = metadata; + const isBlogOnlyMode = permalink === "/"; + const title = isBlogOnlyMode ? siteTitle : blogTitle; + return ( + <> + + + + ); +} + +function BlogListPageContent(props) { + const { metadata, items } = props; + const [activeFilters, setActiveFilters] = useState([]); + // These are currently hardcoded, check the frontmatter of the blog posts to see what audiences are available + const audiences = ["Data Governance Leads", "Data Engineers", "Data Architects", "Data Platform Leads", "Data Analysts"]; + + const filteredItems = activeFilters?.length + ? (items || []).filter((post) => activeFilters.some((activeFilter) => post?.content?.frontMatter?.audience?.some((a) => a === activeFilter))) + : items; + + const handleFilterToggle = (audience) => { + if (activeFilters.includes(audience)) { + setActiveFilters(activeFilters.filter((filter) => filter !== audience)); + } else { + setActiveFilters([...new Set([...activeFilters, audience])]); + } + }; + + return ( + +
+    <Layout>
+      <header className="hero">
+        <div className="container">
+          <div className="hero__content">
+            <h1 className="hero__title">DataHub Learn</h1>
+            <p className="hero__subtitle">
+              Learn about the hot topics in the data ecosystem and how DataHub can help you with your data journey.
+            </p>
+          </div>
+          <div className={styles.filterBar}>
+            <strong>For: </strong>
+            {audiences.map((audience) => (
+              <button
+                key={audience}
+                className={clsx("button button--secondary", !activeFilters.includes(audience) && "button--outline")}
+                onClick={() => handleFilterToggle(audience)}
+              >
+                {audience}
+              </button>
+            ))}
+          </div>
+        </div>
+      </header>
+      <div className="container">
+        <div className="row">
+          {(filteredItems || []).map(({ content: BlogPostContent }) => (
+            <BlogPostProvider key={BlogPostContent.metadata.permalink} content={BlogPostContent}>
+              <LearnItemCard />
+            </BlogPostProvider>
+          ))}
+        </div>
+        <BlogListPaginator metadata={metadata} />
+      </div>
+    </Layout>
+ ); +} + +export default function BlogListPage(props) { + return ( + + + + + ); +} \ No newline at end of file diff --git a/docs-website/src/learn/_components/LearnListPage/styles.module.scss b/docs-website/src/learn/_components/LearnListPage/styles.module.scss new file mode 100644 index 0000000000000..d08b48a011de0 --- /dev/null +++ b/docs-website/src/learn/_components/LearnListPage/styles.module.scss @@ -0,0 +1,7 @@ +.filterBar { + display: flex; + justify-content: center; + align-items: center; + gap: 10px; + flex-wrap: wrap; +} \ No newline at end of file diff --git a/docs-website/src/learn/business-glossary.md b/docs-website/src/learn/business-glossary.md new file mode 100644 index 0000000000000..d6b249617fc5a --- /dev/null +++ b/docs-website/src/learn/business-glossary.md @@ -0,0 +1,120 @@ +--- +title: "What is a Business Glossary and How to Standardize It" +description: Understand how a standardized business glossary aids in achieving consistency, compliance, and efficient data use. +tags: ["Business Glossary", "Use Case", "For Data Governance Leads"] +image: /img/learn/use-case-business-glossary.png +hide_table_of_contents: false +audience: ["Data Governance Leads"] +date: 2024-06-03T05:00 +--- + +# What is a Business Glossary and How to Standardize It + +Understand how a standardized business glossary aids in achieving consistency, compliance, and efficient data use. + + + +## Introduction + +Have you ever faced confusion due to inconsistent business terminology within your organization? This lack of standardization can lead to misunderstandings, compliance issues, and inefficient data use. In this post, we’ll explore the importance of having a standardized business glossary, its benefits, and how you can implement one effectively in your organization. + +## What is a Business Glossary? + +A Business Glossary is like a dictionary for your company. It contains definitions of key business terms that everyone in the organization uses, ensuring everyone speaks the same language, especially when it comes to important concepts related to the data your company collects, processes, and uses. + +For example, below are some sales-related glossary terms that can be used in an IT company. + +| Term | Definition | Usage | +| --- | --- | --- | +| CRM (Customer Relationship Management) | Software that manages a company's interactions with current and potential customers. | CRMs help streamline processes and improve customer relationships. | +| Lead | A potential customer who has shown interest in a company's product or service. | Leads are nurtured by the sales team to convert into customers. | +| Pipeline | The stages through which a sales prospect moves from initial contact to final sale. | Sales pipelines track progress and forecast future sales. | +| Quota | A sales target set for a salesperson or team for a specific period. | Quotas motivate sales teams and measure performance. | +| Conversion Rate | The percentage of leads that turn into actual sales. | High conversion rates indicate effective sales strategies. | +| Upselling | Encouraging customers to purchase a more expensive or upgraded version of a product. | Upselling increases revenue by enhancing the customer purchase. | +| Churn Rate | The percentage of customers who stop using a product or service over a given period. | Reducing churn rate is crucial for maintaining steady growth. | +| MQL (Marketing Qualified Lead) | A lead that has been deemed more likely to become a customer based on marketing efforts. 
| MQLs are passed from the marketing team to the sales team for further nurturing. | +| ARR (Annual Recurring Revenue) | The amount of revenue that a company expects to receive from its customers on an annual basis for subscriptions. | ARR helps in financial forecasting and performance measurement. | + +## What is Business Glossary Standardization? + +Business glossary standardization means creating and maintaining a consistent set of business terms and definitions used across the organization. This practice is essential for maintaining clarity and consistency in how data is interpreted and used across different departments. + +## Why Should You Care? + +### The Challenge + +Without a consistent understanding and use of business terminology, your company lacks a unified understanding of its data. This can lead to inconsistencies, increased compliance risk, and less effective use of data. Different teams may describe the same concepts in various ways, causing confusion about customers, key metrics, products, marketing, and more. + +### The Benefits + +For a governance lead, standardizing the business glossary is crucial for several reasons: + +- **Reduces Confusion, Facilitates Discovery:** Ensures data quality, consistency, and reliability, which are critical for effective decision-making. +- **Regulatory Compliance:** Aligns data use with regulatory definitions and requirements, essential for compliance with financial regulations. +- **Supports Risk Management:** Provides consistent terminology for analyzing market trends, credit risk, and operational risks. +- **Training and Onboarding:** Helps new employees quickly understand the company’s specific language and metrics, speeding up the training process. + +### Real-World Impact + +Imagine a financial services company where different teams use varied terminologies for the same concepts, such as "customer lifetime value." (CLV) This inconsistency can lead to misinterpretations, faulty risk assessments, and regulatory non-compliance, ultimately affecting the company's reputation and financial stability. + +Here's how different teams might interpret CLV and the potential implications: + +| Team | Interpretation of CLV | Focus | Implications | +| --- | --- | --- | --- | +| Marketing | Total revenue generated from a customer over their entire relationship with the company | Campaign effectiveness, customer acquisition costs, return on marketing investment | Revenue maximization through frequent promotions, potentially ignoring the cost of service and risk associated with certain customer segments | +| Sales | Projected future sales from a customer based on past purchasing behavior | Sales targets, customer retention, cross-selling/up-selling opportunities | Aggressive sales tactics to boost short-term sales, potentially leading to customer churn if the value delivered does not meet | +| Finance | Net present value (NPV), factoring in the time value of money and associated costs over the customer relationship period | Profitability, cost management, financial forecasting | Conservative growth strategies, focusing on high-value, low-risk customers, potentially overlooking opportunities for broader market expansion | + + Different interpretations can lead to conflicting strategies and objectives across teams. For instance, Marketing’s aggressive acquisition strategy may lead to a significant increase in new customers and short-term revenue. 
However, if Finance’s NPV analysis reveals that these customers are not profitable long-term, the company may face financial strain due to high acquisition costs and low profitability. + + The Sales team’s push for upselling may generate short-term sales increases, aligning with their CLV projections. However, if customers feel pressured and perceive the upsells as unnecessary, this could lead to dissatisfaction and higher churn rates, ultimately reducing the actual lifetime value of these customers. + + The conflicting strategies can result in misaligned priorities, where Marketing focuses on volume, Sales on immediate revenue, and Finance on long-term profitability. This misalignment can lead to inefficient resource allocation, where Marketing spends heavily on acquisition, Sales focuses on short-term gains, and Finance restricts budgets due to profitability concerns. + +### Example Discovery Questions + +- Have you ever experienced confusion or errors due to inconsistent terminology in your organization's data reports? How do you currently manage and standardize business terms across departments? +- If your organization lacks a standardized business glossary, what challenges do you face in ensuring regulatory compliance and reliable data analysis? +- When onboarding new employees, do you find that inconsistent terminology slows down their training and understanding of company data? How could a standardized glossary improve this process? + +## How to Standardize a Business Glossary + +### General Approach + +To standardize a business glossary, start by identifying key business terms and their definitions. Engage stakeholders from various departments to ensure comprehensive coverage and agreement. Regularly update the glossary to reflect changes in business processes and regulatory requirements. + +### Alternatives and Best Practices + +Some companies use manual methods to track data terminology and manage access requests. While these methods can work, they are often inefficient and error-prone. Best practices include using automated tools that provide consistent updates and easy access to the glossary for all employees. + +### Our Solution + +Acryl DataHub offers comprehensive features designed to support the authoring of a unified business glossary for your organization: + +

+<p align="center"><em>Business Glossary Center</em></p>

+ +- **[Centralized Business Glossary](https://datahubproject.io/docs/glossary/business-glossary):** A repository for all business terms and definitions, ensuring consistency across the organization. + + +

+<p align="center"><em>Approval Flows</em></p>

+ + +- **[Approval Flows](https://datahubproject.io/docs/managed-datahub/approval-workflows):** Structured workflows for approving changes to the glossary, maintaining quality and consistency through time + +- **Automated Data Classification:** Tools to tag critical data assets - tables, columns, dashboards, and pipelines - with terms from the business glossary using automations and custom rules. + +By implementing these solutions, you can ensure that your business terminology is consistently defined and accurately used across all teams, supporting reliable decision-making and regulatory compliance. + +## Conclusion + +Standardizing your business glossary is essential for maintaining consistency, ensuring compliance, and optimizing data use. By implementing best practices and leveraging advanced tools, you can achieve a more efficient and reliable data management process. This investment will lead to better decision-making, reduced compliance risks, and a more cohesive organizational understanding of data. \ No newline at end of file diff --git a/docs-website/src/learn/business-metric.md b/docs-website/src/learn/business-metric.md new file mode 100644 index 0000000000000..39221a67d40ab --- /dev/null +++ b/docs-website/src/learn/business-metric.md @@ -0,0 +1,87 @@ +--- +title: "What is a Business Metric and How to Define and Standardize Them" +description: Learn the importance of consistent metric definitions and calculation methods to ensure organizational alignment. +tags: ["Business Metric", "Use Case", "For Data Analysts"] +image: /img/learn/use-case-business-metric.png +hide_table_of_contents: false +audience: ["Data Analysts"] +date: 2024-06-03T04:00 +--- + +# What is a Business Metric and How to Define and Standardize Them + +Learn the importance of consistent metric definitions and calculation methods to ensure organizational alignment. + + + +## Introduction + +Have you ever been part of a project where different teams had conflicting definitions for key business metrics like revenue, churn, or weekly active users? This misalignment can cause significant issues, leading to incorrect analysis and poor decision-making. In this post, we will explore the importance of defining and standardizing business metrics, why it matters, and how you can do it effectively within your organization. + +## What is Business Metrics Definition and Standardization? + +Standardizing business metrics definition involves creating consistent and universally understood definitions for key performance indicators (KPIs) across your organization. Think of it as creating a common language that everyone in your company can use when discussing critical metrics like revenue, churn, or engagement. This ensures that all teams are on the same page, which is essential for accurate analysis and strategic decision-making. + +## Why Should You Care About Business Metrics Definition and Standardization? + +### The Challenge + +In many organizations, KPIs are used to drive critical day-to-day operating decisions. They often emerge organically in response to the data needs of management. Over time, organizations can naturally develop inconsistent sources, representations, and vocabulary around such metrics. When there is a lack of consistent understanding of these metrics, it can lead to meaningful discrepancies in data interpretation and decision-making. 
+ +### Importance + +Standardizing business metrics is crucial because these metrics are direct indicators of the performance and health of various functions within an organization. More often than not, these metrics are used for not only making day-to-day operating decisions, but also for reporting out business performance. Standardized metrics provide immediate insight into whether the business is on track to meet its objectives and serve as solid foundations upon which other second-order metrics may be derived. + +### Real-World Impact + +Consider a scenario where the finance team defines revenue differently from the product team. If these discrepancies are not reconciled, it could lead to conflicting reports and misguided strategies. For instance, a marketing campaign analyzed with inconsistent metrics might appear successful in one report and unsuccessful in another, causing confusion and potentially leading to incorrect strategic decisions. Disagreements about the source-of-truth or accuracy of a given metric are commonplace; perhaps you can recall some examples from your own experience. + +### Example Discovery Questions and Explanations + +- **Current Management and Challenges:** "How do you currently manage and standardize definitions for core business metrics across different teams, and what challenges have you encountered in this process?" This question helps to uncover the existing processes and pain points in managing metrics, providing insights into potential areas where our product can offer significant improvements. +- **Educating your Workforce:** “How do you educate new employees about the most important metrics at the organization?” This question helps to recognize and eliminate inefficient sharing of tribal knowledge within an organization when an employee joins or leaves. +- **Impact of Misalignment:** "Can you describe a recent instance where misalignment on metric definitions impacted a business decision or analysis, and how was the issue resolved?" This question aims to highlight the real-world consequences of not having standardized metrics, emphasizing the importance of our solution in preventing such issues. + +## How to Define and Standardize Business Metrics + +### General Approach + +Start by identifying key business metrics that are actively used to power decision making at the organization. Involve stakeholders from different departments to agree on a standard set of definitions, and propose a lightweight process for introducing new ones. Document these definitions and ensure they are easily accessible to everyone in the organization. Regular reviews and updates are necessary to keep the metrics relevant and aligned with business goals. + +### Alternatives and Best Practices + +Some companies try to align metric definitions through emails and meetings. While this is a good place to start, it is often impractical at scale. Instead, best practices involve using a centralized system for defining and discovering key business metrics. Implementing approval flows and lineage tracking can ensure that all changes are reviewed and that the physical origins of a metric - e.g. the actual tables and rows that power it - are immediately clear. By making metrics centrally visible, you can begin to establish accountability and audibility around your key metrics, increasing their reliability through time and improving the quality of your decisions. 
+ +### Our Solution + +Acryl DataHub offers comprehensive features designed to tackle the challenges of defining and standardizing business metrics: + +

+<p align="center"><em>Business Glossary Center</em></p>

+ + +- **[Business Glossary](https://datahubproject.io/docs/glossary/business-glossary):** A centralized repository for all metrics definitions, ensuring consistency across the organization. + +

+<p align="center"><em>Approval Flows</em></p>

+ +- **[Approval Flows](https://datahubproject.io/docs/managed-datahub/approval-workflows):** Structured workflows for approving changes to metric definitions, maintaining accuracy and reliability. + - + +![Untitled](https://prod-files-secure.s3.us-west-2.amazonaws.com/f818df0d-1067-44ab-99e1-8cf45d930c01/33ebd070-32a1-4875-b220-c31373f5eedf/Untitled.png) + +- **[Lineage Tracking](https://datahubproject.io/docs/generated/lineage/lineage-feature-guide):** Tools to track the origin and transformations of metrics, ensuring they align with standardized definitions. + - +![Screenshot 2024-07-10 at 12.07.28 PM.png](https://prod-files-secure.s3.us-west-2.amazonaws.com/f818df0d-1067-44ab-99e1-8cf45d930c01/39503957-ad64-4d2d-a5b2-b140abfc1f6c/Screenshot_2024-07-10_at_12.07.28_PM.png) + +By implementing these solutions, you can ensure that your business metrics are consistently defined and accurately used across all teams, supporting reliable analysis and decision-making. + +### Conclusion + +Defining and standardizing business metrics is essential for ensuring consistent, accurate, and reliable data analysis and decision-making within an organization. By implementing best practices and leveraging advanced tools like our product’s business glossary, approval flows, and lineage tracking, you can achieve a more cohesive and efficient approach to managing business metrics. This investment will lead to better insights, more informed decisions, and ultimately, a more successful data-driven organization. \ No newline at end of file diff --git a/docs-website/src/learn/data-freshness.md b/docs-website/src/learn/data-freshness.md new file mode 100644 index 0000000000000..e97e9b054b256 --- /dev/null +++ b/docs-website/src/learn/data-freshness.md @@ -0,0 +1,121 @@ +--- +title: "Ensuring Data Freshness: Why It Matters and How to Achieve It" +description: Explore the significance of maintaining up-to-date data, the challenges involved, and how our solutions can ensure your data remains fresh to meet SLAs. +tags: ["Data Freshness", "Use Case", "For Data Engineers"] +image: /img/learn/use-case-data-freshness.png +hide_table_of_contents: false +audience: ["Data Engineers"] +date: 2024-06-03T01:00 +--- + +# Ensuring Data Freshness: Why It Matters and How to Achieve It + +Explore the significance of maintaining up-to-date data, the challenges involved, and how our solutions can ensure your data remains fresh to meet SLAs. + + + +## Introduction + +Have you ever experienced delays in delivering tables that or machine learning (ML) models that directly power customer experiences due to stale data? Ensuring timely data is crucial for maintaining the effectiveness and reliability of these mission-critical products. In this post, we'll explore the importance of data freshness, the challenges associated with it, and how DataHub can help you meet your data freshness SLAs consistently. + +## What is Data Freshness? + +Data freshness refers to the timeliness and completeness of data used to build tables and ML models. Specifically, freshness can be measured by the difference in time between when some event *actually occurs* vs when that record of that event is reflected in a dataset or used to train an AI model. + +To make things concrete, let’s imagine you run an e-commerce business selling t-shirts. When a user clicks the final “purchase” button to finalize a purchase, this interaction is recorded, eventually winding up in a consolidated “click_events” table on your data warehouse. 
Data freshness in this case could be measured by comparing when the actual click was performed against when the record of the click landed in the data warehouse. In reality, freshness can be measured against any reference point - e.g. event time, ingestion time, or something else - in relation to when a target table, model, or other data product is updated with new data. + +

+<p align="center"><em>Data Freshness</em></p>
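+To make this concrete, here is a small illustrative sketch of how freshness lag could be computed for the hypothetical `click_events` table above (the record shape and the 60-minute SLA are assumptions for the example, not a prescribed implementation):
+
+```typescript
+// Illustrative sketch: freshness lag is the gap between when an event occurred
+// and when its record landed in the warehouse.
+type ClickEvent = { eventTime: Date; loadedAt: Date };
+
+function maxFreshnessLagMinutes(events: ClickEvent[]): number {
+  // Worst-case lag across the batch, in minutes.
+  return Math.max(...events.map((e) => (e.loadedAt.getTime() - e.eventTime.getTime()) / 60000));
+}
+
+// A freshness SLA check then compares the worst-case lag to a target, e.g. 60 minutes.
+const meetsSla = (events: ClickEvent[], slaMinutes = 60): boolean =>
+  maxFreshnessLagMinutes(events) <= slaMinutes;
+```
+
+In practice, the reference point (event time, ingestion time, and so on) and the SLA threshold depend on the use case.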

+ +Oftentimes, data pipelines are designed in order meet some well-defined availability latency, or data freshness SLA, with the specifics of this type of agreement dictating how and when the data pipeline is triggered to run. + +In the modern data landscape, ensuring that data is up-to-date is vital for building high-quality data products, from reporting dashboards used to drive day-to-day company decisions to personalized and dynamic data- or AI-powered product experiences. + +## Why Data Freshness Matters + +For many organizations, fresh data is more than a ‘nice to have’. + +Mission-critical ML models, like those used for price prediction or fraud detection, depend heavily on fresh data to make accurate predictions. Delays in updating these models can lead to lost revenue and damage to your company's reputation. + +Customer-facing data products, for example recommendation features, also need timely updates to ensure that customers receive the most recent and relevant information personalized to them. Delays in data freshness can result in customer frustration, user churn, and loss of trust. + +### Key Considerations for Your Organization + +**Critical Data and ML Models:** + +Can you recall examples when your organization faced challenges in maintaining the timeliness of mission-critical datasets and ML models? If your organization relies on data to deliver concrete product experiences, compliance auditing, or for making high-quality day-to-day decision, then stale data can significantly impact revenue and customer satisfaction. Consider identifying which datasets and models are most critical to your operations and quantifying the business impact of delays. + +**Impact Identification and Response:** + +Because data is highly interconnected, delays in data freshness can lead to cascading problems, particularly of your organization lacks a robust system for identifying and resolving such problems. How does your organization prioritize and manage such incidents? Processes for quickly identifying and resolving root causes are essential for minimizing negative impacts on revenue and reputation. + +**Automated Freshness Monitoring:** + +If data freshness problems often go undetected for long periods of time, there may be opportunities to automate the detection of such problems for core tables and AI models so that your team is first to know when something goes wrong. + +## How to Ensure Data Freshness + +Ensuring data freshness involves several best practices and strategies. Here’s how you can achieve it: + +### Best Practices and Strategies + +**Data Lineage Tracking:** + +Utilize data lineage tracking to establish a bird’s eye view of data flowing through your systems - a picture of the supply chain of data within your organization. This helps in pinpointing hotspots where delays occur and understanding the full impact of such delays to coordinate an effective response. + +**Automation and Monitoring:** + +Implement automated freshness monitoring to detect and address issues promptly. This reduces the need for manual debugging and allows for quicker response times. It can also help you to establish peace-of-mind by targeting your most impactful assets. + +**Incident Management:** + +Establish clear protocols for incident management to prioritize and resolve data freshness issues effectively. This includes setting up notifications and alerts for timely intervention, and a broader communication strategy to involve all stakeholders (even those downstream) in the case of an issue. 
+ +### Alternatives + +While manual investigation and communication using tools like Slack can help triage issues, they often result in time-consuming, inefficient, and informal processes for addressing data quality issues related to freshness, ultimately leading to lower quality outcomes. Automated freshness incident detection and structured incident management via dedicated data monitoring tools can help improve the situation by providing a single place for detecting, communicating, and coordinating to resolve data freshness issues. + +### How DataHub Can Help + +DataHub offers comprehensive features designed to tackle data freshness challenges: + + +**[End-To-End Data Lineage](https://datahubproject.io/docs/generated/lineage/lineage-feature-guide) and [Impact Analysis](https://datahubproject.io/docs/act-on-metadata/impact-analysis):** Easily track the flow of data through your organization to identify, debug, and resolve delays quickly. +

+<p align="center"><em>Data Lineage</em></p>
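+Conceptually, impact analysis is a downstream walk of the lineage graph. The sketch below is purely illustrative (the adjacency-map shape and asset names are assumptions, not DataHub's API) and shows how the set of assets affected by a delayed upstream table can be collected:
+
+```typescript
+// Illustrative sketch: lineage as an adjacency map from an asset to its direct downstreams.
+type LineageGraph = Record<string, string[]>;
+
+// Collect everything downstream of a delayed asset via breadth-first traversal.
+function downstreamImpact(graph: LineageGraph, delayedAsset: string): Set<string> {
+  const impacted = new Set<string>();
+  const queue = [delayedAsset];
+  while (queue.length > 0) {
+    const current = queue.shift()!;
+    for (const child of graph[current] ?? []) {
+      if (!impacted.has(child)) {
+        impacted.add(child);
+        queue.push(child);
+      }
+    }
+  }
+  return impacted;
+}
+
+// Example: a late "click_events" table impacts the marts and dashboards built on it.
+const impacted = downstreamImpact(
+  { click_events: ["daily_clicks"], daily_clicks: ["engagement_dashboard"] },
+  "click_events",
+);
+```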

+ + +**Freshness Monitoring & Alerting:** Automatically detect and alert when data freshness issues occur, to ensure timely updates by proactively monitoring key datasets for updates. Check out [Assertions](https://datahubproject.io/docs/managed-datahub/observe/assertions) and [Freshness Assertions](https://datahubproject.io/docs/managed-datahub/observe/freshness-assertions), Available in **Acryl Managed DataHub Only.** + +

+<p align="center"><em>Freshness Assertions Results</em></p>

+ + +

+<p align="center"><em>Smart assertions check for changes on a cadence based on the Table history, by default using the Audit Log.</em></p>

+ + +**[Incident Management](https://datahubproject.io/docs/incidents/incidents)** : Centralize data incident management and begin to effectively triage, prioritize, communicate and resolve data freshness issues to all relevant stakeholders. Check out [subscription & notification](https://datahubproject.io/docs/managed-datahub/subscription-and-notification) features as well. + +


+ + +By implementing these solutions, you can ensure that your key datasets and models are always up-to-date, maintaining their relevancy, accuracy, and reliability for critical use cases within your organization. + +## Conclusion + +Ensuring data freshness is essential for the performance and reliability of critical datasets and AI/ML models. By understanding the importance of data freshness and implementing best practices and automated solutions, you can effectively manage and mitigate delays, thereby protecting your revenue and reputation. DataHub is designed to help you achieve this, providing the tools and features necessary to keep your data fresh and your operations running smoothly. \ No newline at end of file diff --git a/docs-website/src/learn/data-mesh.md b/docs-website/src/learn/data-mesh.md new file mode 100644 index 0000000000000..f9a625d103ae7 --- /dev/null +++ b/docs-website/src/learn/data-mesh.md @@ -0,0 +1,131 @@ +--- +title: "What is a Data Mesh and How to Implement It in Your Organization" +description: Learn how a data mesh aligns data management with domain expertise, enhancing overall organizational agility. +tags: ["Data Mesh", "Use Case", "For Data Architects", "For Data Platform Leads"] +image: /img/learn/use-case-data-mesh.png +hide_table_of_contents: false +audience: ["Data Architects", "Data Platform Leads"] +date: 2024-06-03T02:00 +--- + +# What is Data Mesh and How to Implement It in Your Organization + +Learn how a data mesh aligns data management with domain expertise, enhancing overall organizational agility. + + + +## Introduction + +Have you faced challenges in managing decentralized data across various business units or domains? Implementing a [Data Mesh](https://martinfowler.com/articles/data-mesh-principles.html) can address these issues, aligning data management with domain expertise and enhancing your organization’s overall agility. In this post, we'll explore what a Data Mesh is, why it's beneficial, and how to implement it effectively within your organization. + +## What is Data Mesh? + +Data Mesh is a decentralized data architecture that shifts the responsibility of data management from a central team to individual business units, or "domains." Each domain in turn produces “data products”, or consumable data artifacts, ensuring that data management is closely aligned with domain-specific expertise. This approach promotes agility, scalability, and the ability to generate insights more effectively. + +If you’re familiar with [Service-Oriented Architectures](https://en.wikipedia.org/wiki/Service-oriented_architecture), i.e. micro-services, this might sound familiar. Data Mesh is a somewhat analogous concept, but applied to data! + +

+<p align="center"><em>4 Principles of Data Mesh</em></p>

+ + +| Principle | Explanation | +| --- | --- | +| Domain Data Ownership | Organizing data into explicit domains based on the structure of your organization, and then assigning clear accountability to each. This enables you to more easily increase the number of sources of data, variety of use cases, and diversity of access models to the data increases. | +| Data as a product | Domain data should be highly accessible and highly reliable by default. It should be easy to discover, easy to understand, easy to access securely, and high quality. | +| Self-Service | Domain teams should be able to independently create, consume, and manage data products on top of a general-purpose platform that can hide the complexity of building, executing and maintaining secure and interoperable data products. | +| Federated Governance | Consistent standards that are enforced by process and technology around interoperability, compliance, and quality. This makes it easy for data consumers to interact with data products across domains in familiar way and ensures quality is maintained uniformly. | + +

+<p align="center"><em>Logical architecture of the data mesh approach. Image credit: Zhamak Dehghani</em></p>

+ + + +## Why Implement Data Mesh? + +For data architects and data platform leads, implementing a Data Mesh can resolve various challenges associated with managing decentralized data, particularly as you try to scale up. + +Traditional data lakes or warehouses can become central bottlenecks, impairing access, understanding, accountability, and quality of data - ultimately, its usability. These architectures can struggle to meet the diverse needs of different business units, leading to inefficiencies. + +Data Mesh addresses these issues by formally dividing data into decentralized domains, which are owned by the individual teams who are experts in those domains. This approach allows each business unit or domain to manage its own data, enabling independent creation and consumption of data and increasing the agility, reliability, scalability of an organization’s data practice. + +### Key Considerations for Your Organization + +**Decentralized Data Management:** Have you experienced difficulties or bottlenecks in managing data across various business units? Implementing a Data Mesh can alleviate these issues by allowing each domain to build and share its own data products, enhancing agility and scalability. + +**Overcoming Centralized Bottlenecks:** If your organization relies on a centralized data lake or warehouse, or data platform team, have you encountered limitations in scalability or delays in data access and analysis? Data Mesh can help overcome these bottlenecks by “pushing down” data ownership and management to domain experts. + +**Enhancing Agility and Reliability:** How important is it for your organization to respond quickly to market changes and generate insights reliably? By formally defining the responsibilities around data “products”, a data mesh architecture can provide the flexibility and speed needed to stay competitive. + +## How to Implement Data Mesh + +Implementing Data Mesh doesn’t need to be a headache. Here’s how your organization can move towards a better future: + +### Best Practices and Strategies + +**Define Domains and Data Products** + +Formally define the different business units or domains within your organization and define the data products each domain will own and manage, and then begin to organize the data on your existing warehouse or lake around these domains. This ensures clarity and responsibility for data management. + +**Establish Clear Contracts** + +Create a clear set of expectations around what it means to be a domain or data product owner within your organization. Then, build processes and systems to both reinforce and monitor these expectations. This helps maintain consistency and reliability across the organization. + +**Monitor Data Quality** + +Use metadata validation and data quality assertions to ensure that your expectations are being met. This includes setting standards for both data quality - freshness, volume, column validity - as well compliance with your less technical requirements - ownership, data documentation, and data classification. + +**Move Towards Federated Governance** + +Adopt a federated governance model to balance autonomy and control. While domains manage their data products, a central team can oversee governance standards and ensure compliance with organizational policies via a well-defined review process. + +### Alternatives + +While a centralized data lake or warehouse can simplify data governance by virtue of keeping everything in one place, it can become a bottleneck as your data organization grows. 
Decentralized Data Mesh can provide a more scalable and agile approach, by distributing day-to-day responsibility for accessing, producing, and validating data while enforcing a centralized set of standards and processes. + +### Our Solution + +Acryl DataHub offers a comprehensive set of features designed to support the implementation of a Data Mesh at your organization: + +- **[Data Domains](https://datahubproject.io/docs/domains)**: Clearly define and manage data products within each business unit. +- **[Data Products](https://datahubproject.io/docs/dataproducts):** Ensure each domain owns and manages its data products, promoting autonomy and agility. +- **[Data Contracts](https://datahubproject.io/docs/managed-datahub/observe/data-contract)**: Establish clear agreements between domains to ensure consistency and reliability. + + +

+<p align="center"><em>Data Contracts in Acryl DataHub UI</em></p>
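+As a purely illustrative sketch (the field names below are assumptions, not DataHub's contract schema), a data contract for a domain's data product might capture an accountable owner, a freshness SLA, and the schema guarantees consumers can rely on:
+
+```typescript
+// Illustrative only: the kind of expectations a data contract makes explicit.
+interface DataContract {
+  dataProduct: string; // e.g. "orders.daily_orders"
+  owner: string; // accountable domain team
+  freshnessSlaHours: number; // how stale the data is allowed to get
+  requiredColumns: string[]; // schema guarantees consumers depend on
+}
+
+const ordersContract: DataContract = {
+  dataProduct: "orders.daily_orders",
+  owner: "orders-domain-team",
+  freshnessSlaHours: 24,
+  requiredColumns: ["order_id", "order_ts", "amount"],
+};
+```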

+ + + +- **[Assertions](https://datahubproject.io/docs/managed-datahub/observe/assertions)** Monitor data quality using freshness, volume, column validity, schema, and custom SQL checks to get notified first when things go wrong + + +

+<p align="center"><em>Assertion Results</em></p>

+ + + +- **[Metadata Tests](https://datahubproject.io/docs/tests/metadata-tests)**: Monitor and enforce a central set of standards or policies across all of your data assets, e.g. to ensure data documentation, data ownership, and data classification. + +

+<p align="center"><em>Metadata Test Results</em></p>

+ +By implementing these solutions, you can effectively manage decentralized data, enhance agility, and generate insights more efficiently. + +## Conclusion + +Implementing a Data Mesh can significantly improve your organization's ability to manage and leverage decentralized data. By understanding the benefits of data mesh and following best practices for implementation, you can overcome the limitations of centralized data systems and enhance your agility, scalability, and ability to generate insights. Acryl DataHub was built from the ground up to help you achieve this, providing the tools and features necessary to implement a large-scale Data Mesh successfully. \ No newline at end of file diff --git a/docs-website/src/learn/data-pipeline.md b/docs-website/src/learn/data-pipeline.md new file mode 100644 index 0000000000000..f5e5bb6615f48 --- /dev/null +++ b/docs-website/src/learn/data-pipeline.md @@ -0,0 +1,90 @@ +--- +title: "What is a Data Pipeline and Why Should We Optimize It" +description: Discover the importance of optimizing data pipelines to maintain data freshness and control costs. +tags: ["Data Pipeline", "Use Case", "For Data Engineers"] +image: /img/learn/use-case-data-pipeline.png +hide_table_of_contents: false +audience: ["Data Engineers"] +date: 2024-06-03T03:00 +--- + +# What is a Data Pipeline and Why Should We Optimize It? + +Discover the importance of optimizing data pipelines to maintain data freshness and control costs. + + + +## Introduction + +Have you ever been frustrated by slow and unreliable data pipelines or unexpectedly high cloud bills? In the modern data world, maintaining efficient, reliable, and cost-effective data pipelines is crucial for delivering timely, high-quality data. This post will explore the importance of optimizing data pipelines, why it matters, and how to achieve it effectively. + +## What is a Data Pipeline? + +A data pipeline is a series of processes that move data from one system to another - a key component in the supply chain for data. Think of it like a conveyor belt in a factory, transporting raw materials to different stations where they are processed into the final product. In the context of data, pipelines extract, transform, and load data (ETL) from various sources to destinations like data warehouses, ensuring the data is ready for analysis and use in applications such as machine learning models and business intelligence dashboards. + + +

+<p align="center"><em>Data Pipeline Example</em></p>
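+As a toy illustration of the extract-transform-load pattern (the source endpoint, record shapes, and destination below are made up for the example):
+
+```typescript
+// Illustrative ETL step: pull raw click records, normalize them, and load them downstream.
+type RawClick = { ts: string; userId: string };
+type CleanClick = { clickedAt: Date; userId: string };
+
+const extract = async (): Promise<RawClick[]> =>
+  fetch("https://example.com/raw-clicks").then((r) => r.json()); // hypothetical source
+
+const transform = (rows: RawClick[]): CleanClick[] =>
+  rows.map((r) => ({ clickedAt: new Date(r.ts), userId: r.userId }));
+
+const load = async (rows: CleanClick[]): Promise<void> => {
+  // A real pipeline would write to a warehouse table; here we just report the row count.
+  console.log(`loaded ${rows.length} rows`);
+};
+
+export const runPipeline = async () => load(transform(await extract()));
+```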

+ +## Why Should You Care About Data Pipeline Optimization? + +### The Problem + +Over time, data pipelines can slow down or become unreliable due to new dependencies, application code bugs, and poorly optimized queries, leading to missed data freshness SLAs and increased cloud costs. For data engineers, this means more time spent on manual debugging and justifying costs to your executives. + +### Importance + +Efficient data pipelines are essential for maintaining the performance of mission-critical tables, dashboards, and ML models powering key use cases for your organization. For example, a price prediction model relies on timely data to provide accurate results, directly impacting revenue. Similarly, outdated customer data can harm a company’s reputation and customer satisfaction. + +### Real-World Impact + +Imagine you’re managing a recommendation engine for an e-commerce site. If your data pipeline is delayed, the recommendations could become outdated, leading to missed sales opportunities - financial costs - and a poor user experience - reputational costs. Alternatively, consider a fraud detection system that relies on real-time data; any delay or downtime could mean the difference between catching fraudulent activity and suffering significant financial loss. + +### Questions To Ask + +- Have you ever noticed a decline in the freshness of crucial data or an uptick in cloud costs for specific pipelines? How do you currently approach diagnosing and optimizing these pipelines? +- If your organization is facing increasing cloud bills due to data pipeline inefficiencies, what strategies or tools do you employ to monitor and optimize costs? How do you balance the trade-off between performance, cost, and meeting business stakeholders' expectations for data delivery? +- Are you taking proactive measures to prevent data pipelines from becoming slower, more fragile, or more expensive over time? Do you have a system in place for regularly reviewing and optimizing key data pipelines to prevent performance or cost degradation? + +## How to Optimize Data Pipelines + +### General Approach + +To optimize your data pipelines, start by identifying bottlenecks and inefficiencies in the pipelines that generate your most mission-critical tables, dashboards, and models. Regularly review and update queries, and monitor pipeline performance by measuring aggregate pipeline run times as well as more granular tracking at the step or query level to catch issues early. Implement automation wherever possible to reduce manual intervention and ensure consistency. + +### Alternatives and Best Practices + +Some companies resort to manual debugging or use communication tools like Slack to triage issues. While these methods can work, they are often time-consuming and prone to errors. Instead, consider leveraging tools that provide lineage tracking, last updated time, and automated monitoring to streamline the optimization process. + +### Our Solution + +Acryl DataHub offers comprehensive features designed to optimize data pipelines: + +

+*Pipeline Catalog*

+ +- **Pipeline Cataloging:** Quickly browse all of the data pipelines running inside your organization, and track critical human context like pipeline ownership / accountability, purpose / documentation, and compliance labels in one place. + +

+*Lineage Tracking*

+ +- **[Lineage Tracking](https://datahubproject.io/docs/generated/lineage/lineage-feature-guide) and [Impact Analysis](https://datahubproject.io/docs/act-on-metadata/impact-analysis):** Understand the flow of data through your pipelines to identify and resolve inefficiencies quickly. Easily see which assets are consumed and produced by which pipelines. +- **Freshness Monitoring:** Track the freshness using Freshness Assertions of your data to ensure SLAs are met consistently. +- **Cost Management Tooling:** Monitor and optimize cloud costs associated with your data pipelines to improve cost-efficiency. + +By implementing these solutions, you can ensure that your data pipelines are running efficiently, meeting delivery SLAs, and staying within budget. + + + +## Conclusion + +Optimizing data pipelines is essential for maintaining data reliability, controlling costs, and ultimately ensuring your business continues to run smoothly. By implementing best practices and leveraging advanced tools like our product’s lineage tracking and automated monitoring features, you can achieve efficient and cost-effective data pipelines. Investing time and resources into optimization will ultimately lead to better performance, lower costs, and more satisfied stakeholders. \ No newline at end of file diff --git a/docs-website/src/pages/customer-stories-survey/index.js b/docs-website/src/pages/customer-stories-survey/index.js index 63a3ecd77e968..e271ad6a309c1 100644 --- a/docs-website/src/pages/customer-stories-survey/index.js +++ b/docs-website/src/pages/customer-stories-survey/index.js @@ -17,7 +17,7 @@ function CustomerStoriesSurvey() { window.hbspt.forms.create({ region: "na1", portalId: "14552909", - formId: "087ef03d-e47e-4814-b458-b30e3e02b623", + formId: "5fbd22ff-4edd-4c43-84bb-7fdaf4e38528", target: '#hubspotForm' // Targeting the div with the specific ID }); } diff --git a/docs-website/static/img/learn/use-case-business-glossary.png b/docs-website/static/img/learn/use-case-business-glossary.png new file mode 100644 index 0000000000000..cb1ed55756469 Binary files /dev/null and b/docs-website/static/img/learn/use-case-business-glossary.png differ diff --git a/docs-website/static/img/learn/use-case-business-metric.png b/docs-website/static/img/learn/use-case-business-metric.png new file mode 100644 index 0000000000000..09893a13b7268 Binary files /dev/null and b/docs-website/static/img/learn/use-case-business-metric.png differ diff --git a/docs-website/static/img/learn/use-case-data-freshness.png b/docs-website/static/img/learn/use-case-data-freshness.png new file mode 100644 index 0000000000000..0f6828521a7d3 Binary files /dev/null and b/docs-website/static/img/learn/use-case-data-freshness.png differ diff --git a/docs-website/static/img/learn/use-case-data-mesh.png b/docs-website/static/img/learn/use-case-data-mesh.png new file mode 100644 index 0000000000000..9214383688752 Binary files /dev/null and b/docs-website/static/img/learn/use-case-data-mesh.png differ diff --git a/docs-website/static/img/learn/use-case-data-pipeline.png b/docs-website/static/img/learn/use-case-data-pipeline.png new file mode 100644 index 0000000000000..c82b42f80a832 Binary files /dev/null and b/docs-website/static/img/learn/use-case-data-pipeline.png differ diff --git a/docs/actions/actions/slack.md b/docs/actions/actions/slack.md index bdea1c479e8aa..a89439825d2da 100644 --- a/docs/actions/actions/slack.md +++ b/docs/actions/actions/slack.md @@ -138,7 +138,7 @@ In the next steps, we'll show you how to configure the Slack 
Action based on the #### Managed DataHub -Head over to the [Configuring Notifications](../../managed-datahub/saas-slack-setup.md#configuring-notifications) section in the Managed DataHub guide to configure Slack notifications for your Managed DataHub instance. +Head over to the [Configuring Notifications](../../managed-datahub/slack/saas-slack-setup.md#configuring-notifications) section in the Managed DataHub guide to configure Slack notifications for your Managed DataHub instance. #### Quickstart diff --git a/docs/incidents/incidents.md b/docs/incidents/incidents.md index 578571289cd2e..41b4df10b7828 100644 --- a/docs/incidents/incidents.md +++ b/docs/incidents/incidents.md @@ -427,5 +427,5 @@ These notifications are also able to tag the immediate asset's owners, along wit

-To do so, simply follow the [Slack Integration Guide](docs/managed-datahub/saas-slack-setup.md) and contact your Acryl customer success team to enable the feature! +To do so, simply follow the [Slack Integration Guide](docs/managed-datahub/slack/saas-slack-setup.md) and contact your Acryl customer success team to enable the feature! diff --git a/docs/managed-datahub/managed-datahub-overview.md b/docs/managed-datahub/managed-datahub-overview.md index 087238097dd9f..4efc96eaf17a7 100644 --- a/docs/managed-datahub/managed-datahub-overview.md +++ b/docs/managed-datahub/managed-datahub-overview.md @@ -56,7 +56,8 @@ know. | Monitor Freshness SLAs | ❌ | ✅ | | Monitor Table Schemas | ❌ | ✅ | | Monitor Table Volume | ❌ | ✅ | -| Validate Table Columns | ❌ | ✅ | +| Monitor Table Column Integrity | ❌ | ✅ | +| Monitor Table with Custom SQL | ❌ | ✅ | | Receive Notifications via Email & Slack | ❌ | ✅ | | Manage Data Incidents via Slack | ❌ | ✅ | | View Data Health Dashboard | ❌ | ✅ | @@ -115,7 +116,7 @@ Fill out ## Additional Integrations -- [Slack Integration](docs/managed-datahub/saas-slack-setup.md) +- [Slack Integration](docs/managed-datahub/slack/saas-slack-setup.md) - [Remote Ingestion Executor](docs/managed-datahub/operator-guide/setting-up-remote-ingestion-executor.md) - [AWS Privatelink](docs/managed-datahub/integrations/aws-privatelink.md) - [AWS Eventbridge](docs/managed-datahub/operator-guide/setting-up-events-api-on-aws-eventbridge.md) diff --git a/docs/managed-datahub/observe/assertions.md b/docs/managed-datahub/observe/assertions.md index b74d524dff1bd..e63d051a0096b 100644 --- a/docs/managed-datahub/observe/assertions.md +++ b/docs/managed-datahub/observe/assertions.md @@ -38,7 +38,7 @@ If you opt for a 3rd party tool, it will be your responsibility to ensure the as ## Alerts -Beyond the ability to see the results of the assertion checks (and history of the results) both on the physical asset’s page in the DataHub UI and as the result of DataHub API calls, you can also get notified via [slack messages](/docs/managed-datahub/saas-slack-setup.md) (DMs or to a team channel) based on your [subscription](https://youtu.be/VNNZpkjHG_I?t=79) to an assertion change event. In the future, we’ll also provide the ability to subscribe directly to contracts. +Beyond the ability to see the results of the assertion checks (and history of the results) both on the physical asset’s page in the DataHub UI and as the result of DataHub API calls, you can also get notified via [Slack messages](/docs/managed-datahub/slack/saas-slack-setup.md) (DMs or to a team channel) based on your [subscription](https://youtu.be/VNNZpkjHG_I?t=79) to an assertion change event. In the future, we’ll also provide the ability to subscribe directly to contracts. With Acryl Observe, you can get the Assertion Change event by getting API events via [AWS EventBridge](/docs/managed-datahub/operator-guide/setting-up-events-api-on-aws-eventbridge.md) (the availability and simplicity of setup of each solution dependent on your current Acryl setup – chat with your Acryl representative to learn more). 
diff --git a/docs/managed-datahub/saas-slack-setup.md b/docs/managed-datahub/saas-slack-setup.md deleted file mode 100644 index 1b98f3a30773a..0000000000000 --- a/docs/managed-datahub/saas-slack-setup.md +++ /dev/null @@ -1,113 +0,0 @@ -import FeatureAvailability from '@site/src/components/FeatureAvailability'; - -# Configure Slack For Notifications - - - -## Install the DataHub Slack App into your Slack workspace - -The following steps should be performed by a Slack Workspace Admin. -- Navigate to https://api.slack.com/apps/ -- Click Create New App -- Use “From an app manifest” option -- Select your workspace -- Paste this Manifest in YAML. Suggest changing name and `display_name` to be `DataHub App YOUR_TEAM_NAME` but not required. This name will show up in your slack workspace -```yml -display_information: - name: DataHub App - description: An app to integrate DataHub with Slack - background_color: "#000000" -features: - bot_user: - display_name: DataHub App - always_online: false -oauth_config: - scopes: - bot: - - channels:read - - chat:write - - commands - - groups:read - - im:read - - mpim:read - - team:read - - users:read - - users:read.email -settings: - org_deploy_enabled: false - socket_mode_enabled: false - token_rotation_enabled: false -``` - -Confirm you see the Basic Information Tab - -![](https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/integrations/slack/slack_basic_info.png) - -- Click **Install to Workspace** -- It will show you permissions the Slack App is asking for, what they mean and a default channel in which you want to add the slack app - - Note that the Slack App will only be able to post in channels that the app has been added to. This is made clear by slack’s Authentication screen also. -- Select the channel you'd like notifications to go to and click **Allow** -- Go to DataHub App page - - You can find your workspace's list of apps at https://api.slack.com/apps/ - -## Generating a Bot Token - -- Go to **OAuth & Permissions** Tab - -![](https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/integrations/slack/slack_oauth_and_permissions.png) - -Here you'll find a “Bot User OAuth Token” which DataHub will need to communicate with your slack through the bot. -In the next steps, we'll show you how to configure the Slack Integration inside of Acryl DataHub. - -## Configuring Notifications - -> In order to set up the Slack integration, the user must have the `Manage Platform Settings` privilege. - -To enable the integration with slack -- Navigate to **Settings > Integrations** -- Click **Slack** -- Enable the Integration -- Enter the **Bot Token** obtained in the previous steps -- Enter a **Default Slack Channel** - this is where all notifications will be routed unless -- Click **Update** to save your settings - - - -To enable and disable specific types of notifications, or configure custom routing for notifications, start by navigating to **Settings > Notifications**. -To enable or disable a specific notification type in Slack, simply click the check mark. By default, all notification types are enabled. -To customize the channel where notifications are send, click the button to the right of the check box. - - - -If provided, a custom channel will be used to route notifications of the given type. If not provided, the default channel will be used. -That's it! You should begin to receive notifications on Slack. Note that it may take up to 1 minute for notification settings to take effect after saving. 
- -## Sending Notifications - -For now we support sending notifications to -- Slack Channel ID (e.g. `C029A3M079U`) -- Slack Channel Name (e.g. `#troubleshoot`) -- Specific Users (aka Direct Messages or DMs) via user ID - -By default, the Slack app will be able to send notifications to public channels. If you want to send notifications to private channels or DMs, you will need to invite the Slack app to those channels. - -## How to find Team ID and Channel ID in Slack - -- Go to the Slack channel for which you want to get channel ID -- Check the URL e.g. for the troubleshoot channel in OSS DataHub slack - -![](https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/integrations/slack/slack_channel_url.png) - -- Notice `TUMKD5EGJ/C029A3M079U` in the URL - - Team ID = `TUMKD5EGJ` from above - - Channel ID = `C029A3M079U` from above - -## How to find User ID in Slack - -- Go to user DM -- Click on their profile picture -- Click on View Full Profile -- Click on “More” -- Click on “Copy member ID” - -![](https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/integrations/slack/slack_user_id.png) \ No newline at end of file diff --git a/docs/managed-datahub/slack/saas-slack-app.md b/docs/managed-datahub/slack/saas-slack-app.md new file mode 100644 index 0000000000000..5e16fed901e72 --- /dev/null +++ b/docs/managed-datahub/slack/saas-slack-app.md @@ -0,0 +1,59 @@ +import FeatureAvailability from '@site/src/components/FeatureAvailability'; + +# Slack App Features + + + +## Overview +The DataHub Slack App brings several of DataHub's key capabilities directly into your Slack experience. These include: +1. Searching for Data Assets +2. Subscribing to notifications for Data Assets +3. Managing Data Incidents + +*Our goal with the Slack app is to make data discovery easier and more accessible for you.* + +## Slack App Commands +The command-based capabilities on the Slack App revolve around search. + +### Querying for Assets +You can trigger a search by simplying typing `/acryl my favorite table`. +

+*Example of an in-Slack Acryl search command being performed.*

+ +Right within Slack, you'll be presented with results matching your query, and a handful of quick-actions for your convenience. +

+*Example of search results being displayed within Slack.*

+ +By selecting **'More Details'** you can preview in-depth information about an asset without leaving Slack. +

+*Example of search results being displayed within Slack.*

+ +### Subscribing to be notified about an Asset +You can hit the **'Subscribe'** button on a specific search result to subscribe to it directly from within Slack. +

+*Example of search results being displayed within Slack.*

+ + +## Manage Data Incidents +Some of the most commonly used features within our Slack app are the incident management capabilities. +The DataHub UI offers a rich set of [Incident tracking and management](https://datahubproject.io/docs/incidents/incidents/) features. +When a Slack member or channel receives notifications about an Incident, many of these features are made accessible right within the Slack app. + +When an incident is raised, you will receive rich context about the incident in the Slack message itself. You will also be able to `Mark as Resolved`, update the `Priority`, set a triage `Stage`, and `View Details` - directly from the Slack message. +

+*Example of an incident notification being displayed within Slack.*

+ +If you choose to `Mark as Resolved`, the message will update in-place, and you will be presented with the option to `Reopen Incident` if needed. +

+*Example of a resolved incident within Slack.*

+ + +## Coming Soon +We're constantly working on rolling out new features for the Slack app, stay tuned! + diff --git a/docs/managed-datahub/slack/saas-slack-setup.md b/docs/managed-datahub/slack/saas-slack-setup.md new file mode 100644 index 0000000000000..6db6a77c3a1f3 --- /dev/null +++ b/docs/managed-datahub/slack/saas-slack-setup.md @@ -0,0 +1,176 @@ +import FeatureAvailability from '@site/src/components/FeatureAvailability'; + +# Configure Slack For Notifications + + + +## Install the DataHub Slack App into your Slack workspace + + +### Video Walkthrough +
+ +### Step-by-step guide +The following steps should be performed by a Slack Workspace Admin. +1. Navigate to [https://api.slack.com/reference/manifests#config-tokens](https://api.slack.com/reference/manifests#config-tokens) +2. Under **Managing configuration tokens**, select **'Generate Token'** +


+3. Select your workspace, then hit **'Generate'** +


+4. You will now see two tokens available for you to copy: an *Access Token* and a *Refresh Token*. +


+5. Navigate back to your DataHub [Slack Integration setup page](https://longtailcompanions.acryl.io/settings/integrations/slack), paste the tokens into their respective boxes, and click **'Connect'**. +


+6. You will be automatically redirected to Slack to confirm the DataHub Slack App's permissions and complete the installation process: +


+7. Congrats 🎉 Slack is set up! Now try it out by going to the **Platform Notifications** page +


+8. Enter your channel and click **'Send a test notification'**. +


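The in-app test button above is the supported check. If you also want to sanity-check from outside the DataHub UI that the app can post into your chosen channel, a small sketch using the official `slack_sdk` package is shown below; it assumes you have a bot token for the app with the `chat:write` scope and that the app has been invited to the channel (the token and channel name are placeholders).

```python
# pip install slack_sdk
from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError

client = WebClient(token="xoxb-your-bot-token")  # placeholder bot token

try:
    # Post a throwaway message to confirm the app can write to the channel.
    client.chat_postMessage(channel="#troubleshoot", text="DataHub Slack integration test")
    print("Test message sent - the app can post to this channel.")
except SlackApiError as e:
    # Common failures: not_in_channel (invite the app) or channel_not_found.
    print(f"Slack API error: {e.response['error']}")
```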
+ +Now proceed to the [Subscriptions and Notifications page](https://datahubproject.io/docs/managed-datahub/subscription-and-notification) to see how you can subscribe to be notified about events on the platform, or visit the [Slack App page](saas-slack-app.md) to see how you can use DataHub's powerful capabilities directly within Slack. + + + +## Sending Notifications + +For now, we support sending notifications to +- Slack Channel Name (e.g. `#troubleshoot`) +- Slack Channel ID (e.g. `C029A3M079U`) +- Specific Users (aka Direct Messages or DMs) via user ID + +By default, the Slack app will be able to send notifications to public channels. If you want to send notifications to private channels or DMs, you will need to invite the Slack app to those channels. + +## How to find Team ID and Channel ID in Slack +:::note +We recommend just using the Slack channel name for simplicity (e.g. `#troubleshoot`). +::: + +**Via Slack App:** +1. Go to the Slack channel for which you want to get a channel ID +2. Click the channel name at the top +


+3. At the bottom of the modal that pops up, you will see the Channel ID as well as a button to copy it +


+ +**Via Web:** +1. Go to the Slack channel for which you want to get a channel ID +2. Check the URL e.g. for the troubleshoot channel in OSS DataHub Slack +![](https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/integrations/slack/slack_channel_url.png) + +3. Notice `TUMKD5EGJ/C029A3M079U` in the URL + - Team ID = `TUMKD5EGJ` from above + - Channel ID = `C029A3M079U` from above + +## How to find User ID in Slack + +**Your User ID** +1. Click your profile picture, then select **'Profile'** +


+2. Now hit the **'...'** and select **'Copy member ID'** +


+ +**Someone else's User ID** +1. Click their profile picture in the Slack message +


+2. Now hit the **'...'** and select **'Copy member ID'** +


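If you ever need to resolve these IDs programmatically rather than by clicking through the Slack UI, the sketch below (again using `slack_sdk`, with a placeholder bot token assumed to have the `channels:read` and `users:read.email` scopes) looks up a channel ID by name and a member ID by email.

```python
# pip install slack_sdk
from typing import Optional

from slack_sdk import WebClient

client = WebClient(token="xoxb-your-bot-token")  # placeholder bot token


def find_channel_id(name: str) -> Optional[str]:
    # Page through public channels and return the ID of the matching name.
    cursor = None
    while True:
        resp = client.conversations_list(cursor=cursor, limit=200)
        for channel in resp["channels"]:
            if channel["name"] == name.lstrip("#"):
                return channel["id"]
        cursor = resp["response_metadata"]["next_cursor"]
        if not cursor:
            return None


def find_member_id(email: str) -> str:
    # Resolve a member ID from an email address.
    return client.users_lookupByEmail(email=email)["user"]["id"]


print(find_channel_id("#troubleshoot"))      # e.g. C029A3M079U
print(find_member_id("someone@example.com"))
```

In practice the channel name (e.g. `#troubleshoot`) is usually all you need, as the note above recommends.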
diff --git a/docs/managed-datahub/subscription-and-notification.md b/docs/managed-datahub/subscription-and-notification.md index 81648d4298ec1..0e456fe415b2c 100644 --- a/docs/managed-datahub/subscription-and-notification.md +++ b/docs/managed-datahub/subscription-and-notification.md @@ -5,7 +5,10 @@ import FeatureAvailability from '@site/src/components/FeatureAvailability'; DataHub's Subscriptions and Notifications feature gives you real-time change alerts on data assets of your choice. -With this feature, you can set up subscriptions to specific changes for an Entity – and DataHub will notify you when those changes happen. Currently, DataHub supports notifications on Slack, with support for Microsoft Teams and email subscriptions forthcoming. +With this feature, you can set up subscriptions to specific changes for an Entity – and DataHub will notify you when those changes happen. Currently, DataHub supports notifications on Slack and Email, with support for Microsoft Teams forthcoming. + +Email will work out of the box. For installing the DataHub Slack App, see: +👉 [Configure Slack for Notifications](slack/saas-slack-setup.md)

@@ -16,7 +19,7 @@ As a user, you can subscribe to and receive notifications about changes such as ## Prerequisites -Once you have [configured Slack within your DataHub instance](saas-slack-setup.md), you will be able to subscribe to any Entity in DataHub and begin recieving notifications via DM. +Once you have [configured Slack within your DataHub instance](slack/saas-slack-setup.md), you will be able to subscribe to any Entity in DataHub and begin recieving notifications via DM. To begin receiving personal notifications, go to Settings > "My Notifications". From here, toggle on Slack Notifications and input your Slack Member ID. If you want to create and manage group-level Subscriptions for your team, you will need [the following privileges](../../docs/authorization/roles.md#role-privileges): diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/batch/AspectsBatch.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/batch/AspectsBatch.java index 77820948b00cb..fc4ac90dfabad 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/batch/AspectsBatch.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/batch/AspectsBatch.java @@ -9,6 +9,7 @@ import com.linkedin.util.Pair; import java.util.ArrayList; import java.util.Collection; +import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; @@ -198,16 +199,12 @@ default Map> getNewUrnAspectsMap( static Map> merge( @Nonnull Map> a, @Nonnull Map> b) { - return Stream.concat(a.entrySet().stream(), b.entrySet().stream()) - .flatMap( - entry -> - entry.getValue().entrySet().stream() - .map(innerEntry -> Pair.of(entry.getKey(), innerEntry))) - .collect( - Collectors.groupingBy( - Pair::getKey, - Collectors.mapping( - Pair::getValue, Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)))); + Map> mergedMap = new HashMap<>(); + for (Map.Entry> entry : + Stream.concat(a.entrySet().stream(), b.entrySet().stream()).collect(Collectors.toList())) { + mergedMap.computeIfAbsent(entry.getKey(), k -> new HashMap<>()).putAll(entry.getValue()); + } + return mergedMap; } default String toAbbreviatedString(int maxWidth) { diff --git a/metadata-ingestion-modules/airflow-plugin/build.gradle b/metadata-ingestion-modules/airflow-plugin/build.gradle index e2e624a74e412..e874f70db02a3 100644 --- a/metadata-ingestion-modules/airflow-plugin/build.gradle +++ b/metadata-ingestion-modules/airflow-plugin/build.gradle @@ -20,11 +20,7 @@ if (extra_pip_extras != "") { def pip_install_command = "VIRTUAL_ENV=${venv_name} ${venv_name}/bin/uv pip install -e ../../metadata-ingestion" -task checkPythonVersion(type: Exec) { - commandLine python_executable, '-c', 'import sys; assert sys.version_info >= (3, 7)' -} - -task environmentSetup(type: Exec, dependsOn: checkPythonVersion) { +task environmentSetup(type: Exec) { def sentinel_file = "${venv_name}/.venv_environment_sentinel" inputs.file file('setup.py') outputs.file(sentinel_file) diff --git a/metadata-ingestion-modules/airflow-plugin/setup.py b/metadata-ingestion-modules/airflow-plugin/setup.py index 065e9454c5d9e..2d2f6fbd2b089 100644 --- a/metadata-ingestion-modules/airflow-plugin/setup.py +++ b/metadata-ingestion-modules/airflow-plugin/setup.py @@ -42,7 +42,7 @@ def get_long_description(): # We remain restrictive on the versions allowed here to prevent # us from being broken by backwards-incompatible changes in the # underlying package. 
- "openlineage-airflow>=1.2.0,<=1.12.0", + "openlineage-airflow>=1.2.0,<=1.18.0", }, } @@ -142,6 +142,7 @@ def get_long_description(): "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", "Intended Audience :: Developers", "Intended Audience :: Information Technology", "Intended Audience :: System Administrators", diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/client/airflow_generator.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/client/airflow_generator.py index 8aa154dc267b6..e9f93c0c1eab0 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/client/airflow_generator.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/client/airflow_generator.py @@ -127,6 +127,10 @@ def _get_dependencies( ) return upstream_tasks + @staticmethod + def _extract_owners(dag: "DAG") -> List[str]: + return [owner.strip() for owner in dag.owner.split(",")] + @staticmethod def generate_dataflow( config: DatahubLineageConfig, @@ -175,7 +179,7 @@ def generate_dataflow( data_flow.url = f"{base_url}/tree?dag_id={dag.dag_id}" if config.capture_ownership_info and dag.owner: - owners = [owner.strip() for owner in dag.owner.split(",")] + owners = AirflowGenerator._extract_owners(dag) if config.capture_ownership_as_group: data_flow.group_owners.update(owners) else: @@ -282,10 +286,12 @@ def generate_datajob( datajob.url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={datajob.flow_urn.flow_id}&_flt_3_task_id={task.task_id}" if capture_owner and dag.owner: - if config and config.capture_ownership_as_group: - datajob.group_owners.add(dag.owner) - else: - datajob.owners.add(dag.owner) + if config and config.capture_ownership_info: + owners = AirflowGenerator._extract_owners(dag) + if config.capture_ownership_as_group: + datajob.group_owners.update(owners) + else: + datajob.owners.update(owners) if capture_tags and dag.tags: datajob.tags.update(dag.tags) diff --git a/metadata-ingestion-modules/airflow-plugin/tox.ini b/metadata-ingestion-modules/airflow-plugin/tox.ini index b154f92fe553f..4d66dbc860aa9 100644 --- a/metadata-ingestion-modules/airflow-plugin/tox.ini +++ b/metadata-ingestion-modules/airflow-plugin/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. [tox] -envlist = py38-airflow21, py38-airflow22, py310-airflow24, py310-airflow26, py310-airflow27, py310-airflow28, py310-airflow29 +envlist = py38-airflow21, py38-airflow22, py310-airflow24, py310-airflow26, py310-airflow27, py310-airflow28, py311-airflow29 [testenv] use_develop = true @@ -27,7 +27,7 @@ deps = py310-airflow26: -c https://raw.githubusercontent.com/apache/airflow/constraints-2.6.3/constraints-3.10.txt py310-airflow27: -c https://raw.githubusercontent.com/apache/airflow/constraints-2.7.3/constraints-3.10.txt py310-airflow28: -c https://raw.githubusercontent.com/apache/airflow/constraints-2.8.1/constraints-3.10.txt - py310-airflow29: -c https://raw.githubusercontent.com/apache/airflow/constraints-2.9.1/constraints-3.10.txt + py311-airflow29: -c https://raw.githubusercontent.com/apache/airflow/constraints-2.9.3/constraints-3.11.txt # Before pinning to the constraint files, we previously left the dependencies # more open. There were a number of packages for which this caused issues. 
@@ -55,6 +55,6 @@ commands = [testenv:py310-airflow24] extras = dev,integration-tests,plugin-v2,test-airflow24 -[testenv:py310-airflow{26,27,28,29}] +[testenv:py310-airflow{26,27,28},py311-airflow{29}] extras = dev,integration-tests,plugin-v2 diff --git a/metadata-ingestion/build.gradle b/metadata-ingestion/build.gradle index 52fefc3c78945..4e3f1ca91766c 100644 --- a/metadata-ingestion/build.gradle +++ b/metadata-ingestion/build.gradle @@ -17,7 +17,7 @@ def get_coverage_arg(test_name) { task checkPythonVersion(type: Exec) { commandLine python_executable, '-c', - 'import sys; assert (3, 11) > sys.version_info >= (3, 8), f"Python version {sys.version_info[:2]} not allowed"' + 'import sys; sys.version_info >= (3, 8), f"Python version {sys.version_info[:2]} not allowed"' } task environmentSetup(type: Exec, dependsOn: checkPythonVersion) { diff --git a/metadata-ingestion/scripts/modeldocgen.py b/metadata-ingestion/scripts/modeldocgen.py index ea7813f0ca85b..ee5f06cb801ba 100644 --- a/metadata-ingestion/scripts/modeldocgen.py +++ b/metadata-ingestion/scripts/modeldocgen.py @@ -8,12 +8,12 @@ from dataclasses import Field, dataclass, field from enum import auto from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Iterable +from typing import Any, Dict, Iterable, List, Optional, Tuple import avro.schema import click -from datahub.configuration.common import ConfigEnum, ConfigModel +from datahub.configuration.common import ConfigEnum, PermissiveConfigModel from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.rest_emitter import DatahubRestEmitter @@ -22,7 +22,9 @@ from datahub.ingestion.extractor.schema_util import avro_schema_to_mce_fields from datahub.ingestion.sink.file import FileSink, FileSinkConfig from datahub.metadata.schema_classes import ( + BrowsePathEntryClass, BrowsePathsClass, + BrowsePathsV2Class, DatasetPropertiesClass, DatasetSnapshotClass, ForeignKeyConstraintClass, @@ -34,8 +36,6 @@ StringTypeClass, SubTypesClass, TagAssociationClass, - BrowsePathsV2Class, - BrowsePathEntryClass, ) logger = logging.getLogger(__name__) @@ -493,30 +493,29 @@ def strip_types(field_path: str) -> str: ], ) + @dataclass class EntityAspectName: entityName: str aspectName: str -@dataclass -class AspectPluginConfig: +class AspectPluginConfig(PermissiveConfigModel): className: str enabled: bool - supportedEntityAspectNames: List[EntityAspectName] + supportedEntityAspectNames: List[EntityAspectName] = [] packageScan: Optional[List[str]] = None supportedOperations: Optional[List[str]] = None -@dataclass -class PluginConfiguration: +class PluginConfiguration(PermissiveConfigModel): aspectPayloadValidators: Optional[List[AspectPluginConfig]] = None mutationHooks: Optional[List[AspectPluginConfig]] = None mclSideEffects: Optional[List[AspectPluginConfig]] = None mcpSideEffects: Optional[List[AspectPluginConfig]] = None -class EntityRegistry(ConfigModel): +class EntityRegistry(PermissiveConfigModel): entities: List[EntityDefinition] events: Optional[List[EventDefinition]] plugins: Optional[PluginConfiguration] = None diff --git a/metadata-ingestion/src/datahub/ingestion/api/source.py b/metadata-ingestion/src/datahub/ingestion/api/source.py index ad1b312ef445c..788bec97a6488 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/source.py +++ b/metadata-ingestion/src/datahub/ingestion/api/source.py @@ -1,3 +1,4 @@ +import contextlib import datetime import logging from abc 
import ABCMeta, abstractmethod @@ -10,6 +11,7 @@ Dict, Generic, Iterable, + Iterator, List, Optional, Sequence, @@ -97,6 +99,7 @@ def report_log( context: Optional[str] = None, exc: Optional[BaseException] = None, log: bool = False, + stacklevel: int = 1, ) -> None: """ Report a user-facing warning for the ingestion run. @@ -109,7 +112,8 @@ def report_log( exc: The exception associated with the event. We'll show the stack trace when in debug mode. """ - stacklevel = 2 + # One for this method, and one for the containing report_* call. + stacklevel = stacklevel + 2 log_key = f"{title}-{message}" entries = self._entries[level] @@ -118,6 +122,8 @@ def report_log( context = f"{context[:_MAX_CONTEXT_STRING_LENGTH]} ..." log_content = f"{message} => {context}" if context else message + if title: + log_content = f"{title}: {log_content}" if exc: log_content += f"{log_content}: {exc}" @@ -255,9 +261,10 @@ def report_failure( context: Optional[str] = None, title: Optional[LiteralString] = None, exc: Optional[BaseException] = None, + log: bool = True, ) -> None: self._structured_logs.report_log( - StructuredLogLevel.ERROR, message, title, context, exc, log=False + StructuredLogLevel.ERROR, message, title, context, exc, log=log ) def failure( @@ -266,9 +273,10 @@ def failure( context: Optional[str] = None, title: Optional[LiteralString] = None, exc: Optional[BaseException] = None, + log: bool = True, ) -> None: self._structured_logs.report_log( - StructuredLogLevel.ERROR, message, title, context, exc, log=True + StructuredLogLevel.ERROR, message, title, context, exc, log=log ) def info( @@ -277,11 +285,30 @@ def info( context: Optional[str] = None, title: Optional[LiteralString] = None, exc: Optional[BaseException] = None, + log: bool = True, ) -> None: self._structured_logs.report_log( - StructuredLogLevel.INFO, message, title, context, exc, log=True + StructuredLogLevel.INFO, message, title, context, exc, log=log ) + @contextlib.contextmanager + def report_exc( + self, + message: LiteralString, + title: Optional[LiteralString] = None, + context: Optional[str] = None, + level: StructuredLogLevel = StructuredLogLevel.ERROR, + ) -> Iterator[None]: + # Convenience method that helps avoid boilerplate try/except blocks. + # TODO: I'm not super happy with the naming here - it's not obvious that this + # suppresses the exception in addition to reporting it. 
+ try: + yield + except Exception as exc: + self._structured_logs.report_log( + level, message=message, title=title, context=context, exc=exc + ) + def __post_init__(self) -> None: self.start_time = datetime.datetime.now() self.running_time: datetime.timedelta = datetime.timedelta(seconds=0) diff --git a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py index e61ffa46b3c10..60930f03763ed 100644 --- a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py +++ b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py @@ -379,13 +379,19 @@ def _notify_reporters_on_ingestion_completion(self) -> None: for reporter in self.reporters: try: reporter.on_completion( - status="CANCELLED" - if self.final_status == PipelineStatus.CANCELLED - else "FAILURE" - if self.has_failures() - else "SUCCESS" - if self.final_status == PipelineStatus.COMPLETED - else "UNKNOWN", + status=( + "CANCELLED" + if self.final_status == PipelineStatus.CANCELLED + else ( + "FAILURE" + if self.has_failures() + else ( + "SUCCESS" + if self.final_status == PipelineStatus.COMPLETED + else "UNKNOWN" + ) + ) + ), report=self._get_structured_report(), ctx=self.ctx, ) @@ -425,7 +431,7 @@ def _time_to_print(self) -> bool: return True return False - def run(self) -> None: + def run(self) -> None: # noqa: C901 with contextlib.ExitStack() as stack: if self.config.flags.generate_memory_profiles: import memray @@ -436,6 +442,8 @@ def run(self) -> None: ) ) + stack.enter_context(self.sink) + self.final_status = PipelineStatus.UNKNOWN self._notify_reporters_on_ingestion_start() callback = None @@ -460,7 +468,17 @@ def run(self) -> None: if not self.dry_run: self.sink.handle_work_unit_start(wu) try: - record_envelopes = self.extractor.get_records(wu) + # Most of this code is meant to be fully stream-based instead of generating all records into memory. + # However, the extractor in particular will never generate a particularly large list. We want the + # exception reporting to be associated with the source, and not the transformer. As such, we + # need to materialize the generator returned by get_records(). + record_envelopes = list(self.extractor.get_records(wu)) + except Exception as e: + self.source.get_report().failure( + "Source produced bad metadata", context=wu.id, exc=e + ) + continue + try: for record_envelope in self.transform(record_envelopes): if not self.dry_run: try: @@ -482,9 +500,9 @@ def run(self) -> None: ) # TODO: Transformer errors should cause the pipeline to fail. 
- self.extractor.close() if not self.dry_run: self.sink.handle_work_unit_end(wu) + self.extractor.close() self.source.close() # no more data is coming, we need to let the transformers produce any additional records if they are holding on to state for record_envelope in self.transform( @@ -518,8 +536,6 @@ def run(self) -> None: self._notify_reporters_on_ingestion_completion() - self.sink.close() - def transform(self, records: Iterable[RecordEnvelope]) -> Iterable[RecordEnvelope]: """ Transforms the given sequence of records by passing the records through the transformers diff --git a/metadata-ingestion/src/datahub/ingestion/source/abs/source.py b/metadata-ingestion/src/datahub/ingestion/source/abs/source.py index 07cc694e1b162..c9833f6982599 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/abs/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/abs/source.py @@ -52,13 +52,15 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.abs.config import DataLakeSourceConfig, PathSpec from datahub.ingestion.source.abs.report import DataLakeSourceReport -from datahub.ingestion.source.azure.abs_util import ( +from datahub.ingestion.source.azure.abs_folder_utils import ( get_abs_properties, get_abs_tags, + list_folders, +) +from datahub.ingestion.source.azure.abs_utils import ( get_container_name, get_container_relative_path, get_key_prefix, - list_folders, strip_abs_prefix, ) from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index 3b77d58a8711b..f7e1e2610e7e2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -24,6 +24,7 @@ from pydantic import validator from pydantic.fields import Field +from datahub.api.entities.dataset.dataset import Dataset from datahub.configuration.common import AllowDenyPattern from datahub.configuration.source_common import DatasetSourceConfigMixin from datahub.emitter import mce_builder @@ -55,7 +56,11 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.aws import s3_util from datahub.ingestion.source.aws.aws_common import AwsSourceConfig -from datahub.ingestion.source.aws.s3_util import is_s3_uri, make_s3_urn +from datahub.ingestion.source.aws.s3_util import ( + is_s3_uri, + make_s3_urn, + make_s3_urn_for_lineage, +) from datahub.ingestion.source.common.subtypes import ( DatasetContainerSubTypes, DatasetSubTypes, @@ -90,6 +95,9 @@ DatasetLineageTypeClass, DatasetProfileClass, DatasetPropertiesClass, + FineGrainedLineageClass, + FineGrainedLineageDownstreamTypeClass, + FineGrainedLineageUpstreamTypeClass, GlobalTagsClass, MetadataChangeEventClass, OwnerClass, @@ -97,6 +105,7 @@ OwnershipTypeClass, PartitionSpecClass, PartitionTypeClass, + SchemaMetadataClass, TagAssociationClass, UpstreamClass, UpstreamLineageClass, @@ -171,6 +180,11 @@ class GlueSourceConfig( description="If enabled, delta schemas can be alternatively fetched from table parameters.", ) + include_column_lineage: bool = Field( + default=True, + description="When enabled, column-level lineage will be extracted from the s3.", + ) + def is_profiling_enabled(self) -> bool: return self.profiling is not None and is_profiling_enabled( self.profiling.operation_config @@ -283,6 +297,7 @@ class GlueSource(StatefulIngestionSourceBase): def __init__(self, 
config: GlueSourceConfig, ctx: PipelineContext): super().__init__(config, ctx) + self.ctx = ctx self.extract_owners = config.extract_owners self.source_config = config self.report = GlueSourceReport() @@ -714,18 +729,43 @@ def get_lineage_if_enabled( dataset_properties: Optional[ DatasetPropertiesClass ] = mce_builder.get_aspect_if_available(mce, DatasetPropertiesClass) + # extract dataset schema aspect + schema_metadata: Optional[ + SchemaMetadataClass + ] = mce_builder.get_aspect_if_available(mce, SchemaMetadataClass) + if dataset_properties and "Location" in dataset_properties.customProperties: location = dataset_properties.customProperties["Location"] if is_s3_uri(location): - s3_dataset_urn = make_s3_urn(location, self.source_config.env) + s3_dataset_urn = make_s3_urn_for_lineage( + location, self.source_config.env + ) + assert self.ctx.graph + schema_metadata_for_s3: Optional[ + SchemaMetadataClass + ] = self.ctx.graph.get_schema_metadata(s3_dataset_urn) + if self.source_config.glue_s3_lineage_direction == "upstream": + fine_grained_lineages = None + if ( + self.source_config.include_column_lineage + and schema_metadata + and schema_metadata_for_s3 + ): + fine_grained_lineages = self.get_fine_grained_lineages( + mce.proposedSnapshot.urn, + s3_dataset_urn, + schema_metadata, + schema_metadata_for_s3, + ) upstream_lineage = UpstreamLineageClass( upstreams=[ UpstreamClass( dataset=s3_dataset_urn, type=DatasetLineageTypeClass.COPY, ) - ] + ], + fineGrainedLineages=fine_grained_lineages or None, ) return MetadataChangeProposalWrapper( entityUrn=mce.proposedSnapshot.urn, @@ -747,6 +787,49 @@ def get_lineage_if_enabled( ).as_workunit() return None + def get_fine_grained_lineages( + self, + dataset_urn: str, + s3_dataset_urn: str, + schema_metadata: SchemaMetadata, + schema_metadata_for_s3: SchemaMetadata, + ) -> Optional[List[FineGrainedLineageClass]]: + def simplify_field_path(field_path): + return Dataset._simplify_field_path(field_path) + + if schema_metadata and schema_metadata_for_s3: + fine_grained_lineages: List[FineGrainedLineageClass] = [] + for field in schema_metadata.fields: + field_path_v1 = simplify_field_path(field.fieldPath) + matching_s3_field = next( + ( + f + for f in schema_metadata_for_s3.fields + if simplify_field_path(f.fieldPath) == field_path_v1 + ), + None, + ) + if matching_s3_field: + fine_grained_lineages.append( + FineGrainedLineageClass( + downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD, + downstreams=[ + mce_builder.make_schema_field_urn( + dataset_urn, field_path_v1 + ) + ], + upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET, + upstreams=[ + mce_builder.make_schema_field_urn( + s3_dataset_urn, + simplify_field_path(matching_s3_field.fieldPath), + ) + ], + ) + ) + return fine_grained_lineages + return None + def _create_profile_mcp( self, mce: MetadataChangeEventClass, diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure/abs_util.py b/metadata-ingestion/src/datahub/ingestion/source/azure/abs_folder_utils.py similarity index 79% rename from metadata-ingestion/src/datahub/ingestion/source/azure/abs_util.py rename to metadata-ingestion/src/datahub/ingestion/source/azure/abs_folder_utils.py index 34faa0f0979ef..ce166f2942dac 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/azure/abs_util.py +++ b/metadata-ingestion/src/datahub/ingestion/source/azure/abs_folder_utils.py @@ -1,6 +1,4 @@ import logging -import os -import re from typing import Dict, Iterable, List, Optional from azure.storage.blob import BlobProperties 
@@ -10,67 +8,10 @@ from datahub.ingestion.source.azure.azure_common import AzureConnectionConfig from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass -ABS_PREFIXES_REGEX = re.compile( - r"(http[s]?://[a-z0-9]{3,24}\.blob\.core\.windows\.net/)" -) - logging.getLogger("py4j").setLevel(logging.ERROR) logger: logging.Logger = logging.getLogger(__name__) -def is_abs_uri(uri: str) -> bool: - return bool(ABS_PREFIXES_REGEX.match(uri)) - - -def get_abs_prefix(abs_uri: str) -> Optional[str]: - result = re.search(ABS_PREFIXES_REGEX, abs_uri) - if result and result.groups(): - return result.group(1) - return None - - -def strip_abs_prefix(abs_uri: str) -> str: - # remove abs prefix https://.blob.core.windows.net - abs_prefix = get_abs_prefix(abs_uri) - if not abs_prefix: - raise ValueError( - f"Not an Azure Blob Storage URI. Must match the following regular expression: {str(ABS_PREFIXES_REGEX)}" - ) - length_abs_prefix = len(abs_prefix) - return abs_uri[length_abs_prefix:] - - -def make_abs_urn(abs_uri: str, env: str) -> str: - abs_name = strip_abs_prefix(abs_uri) - - if abs_name.endswith("/"): - abs_name = abs_name[:-1] - - name, extension = os.path.splitext(abs_name) - - if extension != "": - extension = extension[1:] # remove the dot - return f"urn:li:dataset:(urn:li:dataPlatform:abs,{name}_{extension},{env})" - - return f"urn:li:dataset:(urn:li:dataPlatform:abs,{abs_name},{env})" - - -def get_container_name(abs_uri: str) -> str: - if not is_abs_uri(abs_uri): - raise ValueError( - f"Not an Azure Blob Storage URI. Must match the following regular expression: {str(ABS_PREFIXES_REGEX)}" - ) - return strip_abs_prefix(abs_uri).split("/")[0] - - -def get_key_prefix(abs_uri: str) -> str: - if not is_abs_uri(abs_uri): - raise ValueError( - f"Not an Azure Blob Storage URI. Must match the following regular expression: {str(ABS_PREFIXES_REGEX)}" - ) - return strip_abs_prefix(abs_uri).split("/", maxsplit=1)[1] - - def get_abs_properties( container_name: str, blob_name: Optional[str], @@ -280,7 +221,3 @@ def list_folders( this_dict[folder_name] = folder_name yield f"{folder_name}" - - -def get_container_relative_path(abs_uri: str) -> str: - return "/".join(strip_abs_prefix(abs_uri).split("/")[1:]) diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure/abs_utils.py b/metadata-ingestion/src/datahub/ingestion/source/azure/abs_utils.py new file mode 100644 index 0000000000000..042e1b4ef921f --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/azure/abs_utils.py @@ -0,0 +1,66 @@ +import os +import re +from typing import Optional + +# This file should not import any abs spectific modules as we import it in path_spec.py in datat_lake_common.py + +ABS_PREFIXES_REGEX = re.compile( + r"(http[s]?://[a-z0-9]{3,24}\.blob\.core\.windows\.net/)" +) + + +def is_abs_uri(uri: str) -> bool: + return bool(ABS_PREFIXES_REGEX.match(uri)) + + +def get_abs_prefix(abs_uri: str) -> Optional[str]: + result = re.search(ABS_PREFIXES_REGEX, abs_uri) + if result and result.groups(): + return result.group(1) + return None + + +def strip_abs_prefix(abs_uri: str) -> str: + # remove abs prefix https://.blob.core.windows.net + abs_prefix = get_abs_prefix(abs_uri) + if not abs_prefix: + raise ValueError( + f"Not an Azure Blob Storage URI. 
Must match the following regular expression: {str(ABS_PREFIXES_REGEX)}" + ) + length_abs_prefix = len(abs_prefix) + return abs_uri[length_abs_prefix:] + + +def make_abs_urn(abs_uri: str, env: str) -> str: + abs_name = strip_abs_prefix(abs_uri) + + if abs_name.endswith("/"): + abs_name = abs_name[:-1] + + name, extension = os.path.splitext(abs_name) + + if extension != "": + extension = extension[1:] # remove the dot + return f"urn:li:dataset:(urn:li:dataPlatform:abs,{name}_{extension},{env})" + + return f"urn:li:dataset:(urn:li:dataPlatform:abs,{abs_name},{env})" + + +def get_container_name(abs_uri: str) -> str: + if not is_abs_uri(abs_uri): + raise ValueError( + f"Not an Azure Blob Storage URI. Must match the following regular expression: {str(ABS_PREFIXES_REGEX)}" + ) + return strip_abs_prefix(abs_uri).split("/")[0] + + +def get_key_prefix(abs_uri: str) -> str: + if not is_abs_uri(abs_uri): + raise ValueError( + f"Not an Azure Blob Storage URI. Must match the following regular expression: {str(ABS_PREFIXES_REGEX)}" + ) + return strip_abs_prefix(abs_uri).split("/", maxsplit=1)[1] + + +def get_container_relative_path(abs_uri: str) -> str: + return "/".join(strip_abs_prefix(abs_uri).split("/")[1:]) diff --git a/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/data_lake_utils.py b/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/data_lake_utils.py index 2ebdd2b4126bb..f594c61f4e5ed 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/data_lake_utils.py +++ b/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/data_lake_utils.py @@ -16,7 +16,7 @@ get_s3_prefix, is_s3_uri, ) -from datahub.ingestion.source.azure.abs_util import ( +from datahub.ingestion.source.azure.abs_utils import ( get_abs_prefix, get_container_name, get_container_relative_path, diff --git a/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/path_spec.py b/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/path_spec.py index e21cdac1edf75..71765f9be5e32 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/path_spec.py +++ b/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/path_spec.py @@ -11,7 +11,7 @@ from datahub.configuration.common import ConfigModel from datahub.ingestion.source.aws.s3_util import is_s3_uri -from datahub.ingestion.source.azure.abs_util import is_abs_uri +from datahub.ingestion.source.azure.abs_utils import is_abs_uri from datahub.ingestion.source.gcs.gcs_utils import is_gcs_uri # hide annoying debug errors from py4j diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py index 1b6619b4c4d28..bc069bd1e59ac 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py @@ -31,15 +31,12 @@ def __init__( reporter: LookMLSourceReport, liquid_variable: Dict[Any, Any], ) -> None: - self.viewfile_cache: Dict[str, LookerViewFile] = {} + self.viewfile_cache: Dict[str, Optional[LookerViewFile]] = {} self._root_project_name = root_project_name self._base_projects_folder = base_projects_folder self.reporter = reporter self.liquid_variable = liquid_variable - def is_view_seen(self, path: str) -> bool: - return path in self.viewfile_cache - def _load_viewfile( self, project_name: str, path: str, reporter: LookMLSourceReport ) -> Optional[LookerViewFile]: @@ 
-56,17 +53,15 @@ def _load_viewfile( ) return None - if self.is_view_seen(str(path)): + if path in self.viewfile_cache: return self.viewfile_cache[path] try: with open(path) as file: raw_file_content = file.read() except Exception as e: - logger.debug(f"An error occurred while reading path {path}", exc_info=True) - self.reporter.report_failure( - path, f"failed to load view file {path} from disk: {e}" - ) + self.reporter.failure("Failed to read lkml file", path, exc=e) + self.viewfile_cache[path] = None return None try: logger.debug(f"Loading viewfile {path}") @@ -91,8 +86,8 @@ def _load_viewfile( self.viewfile_cache[path] = looker_viewfile return looker_viewfile except Exception as e: - logger.debug(f"An error occurred while parsing path {path}", exc_info=True) - self.reporter.report_failure(path, f"failed to load view file {path}: {e}") + self.reporter.failure("Failed to parse lkml file", path, exc=e) + self.viewfile_cache[path] = None return None def load_viewfile( diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_assertion.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_assertion.py index 2a1d18c83e6fa..a7c008d932a71 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_assertion.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_assertion.py @@ -11,14 +11,13 @@ ) from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.snowflake.snowflake_config import ( - SnowflakeIdentifierConfig, - SnowflakeV2Config, -) +from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config from datahub.ingestion.source.snowflake.snowflake_connection import SnowflakeConnection from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report -from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeIdentifierMixin +from datahub.ingestion.source.snowflake.snowflake_utils import ( + SnowflakeIdentifierBuilder, +) from datahub.metadata.com.linkedin.pegasus2avro.assertion import ( AssertionResult, AssertionResultType, @@ -40,23 +39,20 @@ class DataQualityMonitoringResult(BaseModel): VALUE: int -class SnowflakeAssertionsHandler(SnowflakeIdentifierMixin): +class SnowflakeAssertionsHandler: def __init__( self, config: SnowflakeV2Config, report: SnowflakeV2Report, connection: SnowflakeConnection, + identifiers: SnowflakeIdentifierBuilder, ) -> None: self.config = config self.report = report - self.logger = logger self.connection = connection + self.identifiers = identifiers self._urns_processed: List[str] = [] - @property - def identifier_config(self) -> SnowflakeIdentifierConfig: - return self.config - def get_assertion_workunits( self, discovered_datasets: List[str] ) -> Iterable[MetadataWorkUnit]: @@ -80,10 +76,10 @@ def _gen_platform_instance_wu(self, urn: str) -> MetadataWorkUnit: return MetadataChangeProposalWrapper( entityUrn=urn, aspect=DataPlatformInstance( - platform=make_data_platform_urn(self.platform), + platform=make_data_platform_urn(self.identifiers.platform), instance=( make_dataplatform_instance_urn( - self.platform, self.config.platform_instance + self.identifiers.platform, self.config.platform_instance ) if self.config.platform_instance else None @@ -98,7 +94,7 @@ def _process_result_row( result = DataQualityMonitoringResult.parse_obj(result_row) assertion_guid = 
result.METRIC_NAME.split("__")[-1].lower() status = bool(result.VALUE) # 1 if PASS, 0 if FAIL - assertee = self.get_dataset_identifier( + assertee = self.identifiers.get_dataset_identifier( result.TABLE_NAME, result.TABLE_SCHEMA, result.TABLE_DATABASE ) if assertee in discovered_datasets: @@ -107,7 +103,7 @@ def _process_result_row( aspect=AssertionRunEvent( timestampMillis=datetime_to_ts_millis(result.MEASUREMENT_TIME), runId=result.MEASUREMENT_TIME.strftime("%Y-%m-%dT%H:%M:%SZ"), - asserteeUrn=self.gen_dataset_urn(assertee), + asserteeUrn=self.identifiers.gen_dataset_urn(assertee), status=AssertionRunStatus.COMPLETE, assertionUrn=make_assertion_urn(assertion_guid), result=AssertionResult( diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 365e32dac3e69..ac9164cd0a000 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -131,6 +131,7 @@ class SnowflakeIdentifierConfig( # Changing default value here. convert_urns_to_lowercase: bool = Field( default=True, + description="Whether to convert dataset urns to lowercase.", ) @@ -210,8 +211,13 @@ class SnowflakeV2Config( description="Populates view->view and table->view column lineage using DataHub's sql parser.", ) - lazy_schema_resolver: bool = Field( + use_queries_v2: bool = Field( default=False, + description="If enabled, uses the new queries extractor to extract queries from snowflake.", + ) + + lazy_schema_resolver: bool = Field( + default=True, description="If enabled, uses lazy schema resolver to resolve schemas for tables and views. " "This is useful if you have a large number of schemas and want to avoid bulk fetching the schema for each table/view.", ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index 3e65f06200418..151e9fb631620 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -2,7 +2,7 @@ import logging from dataclasses import dataclass from datetime import datetime -from typing import Any, Callable, Collection, Iterable, List, Optional, Set, Tuple, Type +from typing import Any, Collection, Iterable, List, Optional, Set, Tuple, Type from pydantic import BaseModel, validator @@ -21,7 +21,11 @@ ) from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report -from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin +from datahub.ingestion.source.snowflake.snowflake_utils import ( + SnowflakeCommonMixin, + SnowflakeFilter, + SnowflakeIdentifierBuilder, +) from datahub.ingestion.source.state.redundant_run_skip_handler import ( RedundantLineageRunSkipHandler, ) @@ -119,18 +123,19 @@ def __init__( config: SnowflakeV2Config, report: SnowflakeV2Report, connection: SnowflakeConnection, - dataset_urn_builder: Callable[[str], str], + filters: SnowflakeFilter, + identifiers: SnowflakeIdentifierBuilder, redundant_run_skip_handler: Optional[RedundantLineageRunSkipHandler], sql_aggregator: SqlParsingAggregator, ) -> None: self.config = config self.report = report - self.logger = logger - self.dataset_urn_builder = 
dataset_urn_builder self.connection = connection + self.filters = filters + self.identifiers = identifiers + self.redundant_run_skip_handler = redundant_run_skip_handler self.sql_aggregator = sql_aggregator - self.redundant_run_skip_handler = redundant_run_skip_handler self.start_time, self.end_time = ( self.report.lineage_start_time, self.report.lineage_end_time, @@ -210,7 +215,7 @@ def populate_known_query_lineage( results: Iterable[UpstreamLineageEdge], ) -> None: for db_row in results: - dataset_name = self.get_dataset_identifier_from_qualified_name( + dataset_name = self.identifiers.get_dataset_identifier_from_qualified_name( db_row.DOWNSTREAM_TABLE_NAME ) if dataset_name not in discovered_assets or not db_row.QUERIES: @@ -233,7 +238,7 @@ def get_known_query_lineage( if not db_row.UPSTREAM_TABLES: return None - downstream_table_urn = self.dataset_urn_builder(dataset_name) + downstream_table_urn = self.identifiers.gen_dataset_urn(dataset_name) known_lineage = KnownQueryLineageInfo( query_text=query.query_text, @@ -288,7 +293,7 @@ def _populate_external_lineage_from_show_query( external_tables_query: str = SnowflakeQuery.show_external_tables() try: for db_row in self.connection.query(external_tables_query): - key = self.get_dataset_identifier( + key = self.identifiers.get_dataset_identifier( db_row["name"], db_row["schema_name"], db_row["database_name"] ) @@ -299,16 +304,16 @@ def _populate_external_lineage_from_show_query( upstream_urn=make_s3_urn_for_lineage( db_row["location"], self.config.env ), - downstream_urn=self.dataset_urn_builder(key), + downstream_urn=self.identifiers.gen_dataset_urn(key), ) self.report.num_external_table_edges_scanned += 1 self.report.num_external_table_edges_scanned += 1 except Exception as e: logger.debug(e, exc_info=e) - self.report_warning( - "external_lineage", - f"Populating external table lineage from Snowflake failed due to error {e}.", + self.structured_reporter.warning( + "Error populating external table lineage from Snowflake", + exc=e, ) self.report_status(EXTERNAL_LINEAGE, False) @@ -328,41 +333,47 @@ def _populate_external_lineage_from_copy_history( try: for db_row in self.connection.query(query): known_lineage_mapping = self._process_external_lineage_result_row( - db_row, discovered_tables + db_row, discovered_tables, identifiers=self.identifiers ) if known_lineage_mapping: + self.report.num_external_table_edges_scanned += 1 yield known_lineage_mapping except Exception as e: if isinstance(e, SnowflakePermissionError): error_msg = "Failed to get external lineage. Please grant imported privileges on SNOWFLAKE database. 
" self.warn_if_stateful_else_error(LINEAGE_PERMISSION_ERROR, error_msg) else: - logger.debug(e, exc_info=e) - self.report_warning( - "external_lineage", - f"Populating table external lineage from Snowflake failed due to error {e}.", + self.structured_reporter.warning( + "Error fetching external lineage from Snowflake", + exc=e, ) self.report_status(EXTERNAL_LINEAGE, False) + @classmethod def _process_external_lineage_result_row( - self, db_row: dict, discovered_tables: List[str] + cls, + db_row: dict, + discovered_tables: Optional[List[str]], + identifiers: SnowflakeIdentifierBuilder, ) -> Optional[KnownLineageMapping]: # key is the down-stream table name - key: str = self.get_dataset_identifier_from_qualified_name( + key: str = identifiers.get_dataset_identifier_from_qualified_name( db_row["DOWNSTREAM_TABLE_NAME"] ) - if key not in discovered_tables: + if discovered_tables is not None and key not in discovered_tables: return None if db_row["UPSTREAM_LOCATIONS"] is not None: external_locations = json.loads(db_row["UPSTREAM_LOCATIONS"]) + loc: str for loc in external_locations: if loc.startswith("s3://"): - self.report.num_external_table_edges_scanned += 1 return KnownLineageMapping( - upstream_urn=make_s3_urn_for_lineage(loc, self.config.env), - downstream_urn=self.dataset_urn_builder(key), + upstream_urn=make_s3_urn_for_lineage( + loc, identifiers.identifier_config.env + ), + downstream_urn=identifiers.gen_dataset_urn(key), ) return None @@ -388,10 +399,9 @@ def _fetch_upstream_lineages_for_tables(self) -> Iterable[UpstreamLineageEdge]: error_msg = "Failed to get table/view to table lineage. Please grant imported privileges on SNOWFLAKE database. " self.warn_if_stateful_else_error(LINEAGE_PERMISSION_ERROR, error_msg) else: - logger.debug(e, exc_info=e) - self.report_warning( - "table-upstream-lineage", - f"Extracting lineage from Snowflake failed due to error {e}.", + self.structured_reporter.warning( + "Failed to extract table/view -> table lineage from Snowflake", + exc=e, ) self.report_status(TABLE_LINEAGE, False) @@ -402,9 +412,10 @@ def _process_upstream_lineage_row( return UpstreamLineageEdge.parse_obj(db_row) except Exception as e: self.report.num_upstream_lineage_edge_parsing_failed += 1 - self.report_warning( - f"Parsing lineage edge failed due to error {e}", - db_row.get("DOWNSTREAM_TABLE_NAME") or "", + self.structured_reporter.warning( + "Failed to parse lineage edge", + context=db_row.get("DOWNSTREAM_TABLE_NAME") or None, + exc=e, ) return None @@ -417,17 +428,21 @@ def map_query_result_upstreams( for upstream_table in upstream_tables: if upstream_table and upstream_table.query_id == query_id: try: - upstream_name = self.get_dataset_identifier_from_qualified_name( - upstream_table.upstream_object_name + upstream_name = ( + self.identifiers.get_dataset_identifier_from_qualified_name( + upstream_table.upstream_object_name + ) ) if upstream_name and ( not self.config.validate_upstreams_against_patterns - or self.is_dataset_pattern_allowed( + or self.filters.is_dataset_pattern_allowed( upstream_name, upstream_table.upstream_object_domain, ) ): - upstreams.append(self.dataset_urn_builder(upstream_name)) + upstreams.append( + self.identifiers.gen_dataset_urn(upstream_name) + ) except Exception as e: logger.debug(e, exc_info=e) return upstreams @@ -491,7 +506,7 @@ def build_finegrained_lineage( return None column_lineage = ColumnLineageInfo( downstream=DownstreamColumnRef( - table=dataset_urn, column=self.snowflake_identifier(col) + table=dataset_urn, 
column=self.identifiers.snowflake_identifier(col) ), upstreams=sorted(column_upstreams), ) @@ -508,19 +523,23 @@ def build_finegrained_lineage_upstreams( and upstream_col.column_name and ( not self.config.validate_upstreams_against_patterns - or self.is_dataset_pattern_allowed( + or self.filters.is_dataset_pattern_allowed( upstream_col.object_name, upstream_col.object_domain, ) ) ): - upstream_dataset_name = self.get_dataset_identifier_from_qualified_name( - upstream_col.object_name + upstream_dataset_name = ( + self.identifiers.get_dataset_identifier_from_qualified_name( + upstream_col.object_name + ) ) column_upstreams.append( ColumnRef( - table=self.dataset_urn_builder(upstream_dataset_name), - column=self.snowflake_identifier(upstream_col.column_name), + table=self.identifiers.gen_dataset_urn(upstream_dataset_name), + column=self.identifiers.snowflake_identifier( + upstream_col.column_name + ), ) ) return column_upstreams diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py index 4deeb9f96f48e..422bda5284dbc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py @@ -37,7 +37,6 @@ def __init__( super().__init__(config, report, self.platform, state_handler) self.config: SnowflakeV2Config = config self.report: SnowflakeV2Report = report - self.logger = logger self.database_default_schema: Dict[str, str] = dict() def get_workunits( @@ -86,7 +85,7 @@ def get_workunits( ) def get_dataset_name(self, table_name: str, schema_name: str, db_name: str) -> str: - return self.get_dataset_identifier(table_name, schema_name, db_name) + return self.identifiers.get_dataset_identifier(table_name, schema_name, db_name) def get_batch_kwargs( self, table: BaseTable, schema_name: str, db_name: str diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index c647a624a5467..d5b8f98e40075 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -1,3 +1,4 @@ +import dataclasses import functools import json import logging @@ -11,6 +12,7 @@ import pydantic from typing_extensions import Self +from datahub.configuration.common import ConfigModel from datahub.configuration.time_window_config import ( BaseTimeWindowConfig, BucketDuration, @@ -20,6 +22,7 @@ from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.source_helpers import auto_workunit from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.graph.client import DataHubGraph from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain from datahub.ingestion.source.snowflake.snowflake_config import ( DEFAULT_TEMP_TABLES_PATTERNS, @@ -30,13 +33,18 @@ SnowflakeConnection, SnowflakeConnectionConfig, ) +from datahub.ingestion.source.snowflake.snowflake_lineage_v2 import ( + SnowflakeLineageExtractor, +) from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery from datahub.ingestion.source.snowflake.snowflake_utils import ( - SnowflakeFilterMixin, - SnowflakeIdentifierMixin, + SnowflakeFilter, + SnowflakeIdentifierBuilder, + SnowflakeStructuredReportMixin, ) from 
datahub.ingestion.source.usage.usage_common import BaseUsageConfig from datahub.metadata.urns import CorpUserUrn +from datahub.sql_parsing.schema_resolver import SchemaResolver from datahub.sql_parsing.sql_parsing_aggregator import ( KnownLineageMapping, PreparsedQuery, @@ -50,11 +58,12 @@ DownstreamColumnRef, ) from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedList +from datahub.utilities.perf_timer import PerfTimer logger = logging.getLogger(__name__) -class SnowflakeQueriesExtractorConfig(SnowflakeIdentifierConfig, SnowflakeFilterConfig): +class SnowflakeQueriesExtractorConfig(ConfigModel): # TODO: Support stateful ingestion for the time windows. window: BaseTimeWindowConfig = BaseTimeWindowConfig() @@ -76,12 +85,6 @@ class SnowflakeQueriesExtractorConfig(SnowflakeIdentifierConfig, SnowflakeFilter hidden_from_docs=True, ) - convert_urns_to_lowercase: bool = pydantic.Field( - # Override the default. - default=True, - description="Whether to convert dataset urns to lowercase.", - ) - include_lineage: bool = True include_queries: bool = True include_usage_statistics: bool = True @@ -89,40 +92,56 @@ class SnowflakeQueriesExtractorConfig(SnowflakeIdentifierConfig, SnowflakeFilter include_operations: bool = True -class SnowflakeQueriesSourceConfig(SnowflakeQueriesExtractorConfig): +class SnowflakeQueriesSourceConfig( + SnowflakeQueriesExtractorConfig, SnowflakeIdentifierConfig, SnowflakeFilterConfig +): connection: SnowflakeConnectionConfig @dataclass class SnowflakeQueriesExtractorReport(Report): - window: Optional[BaseTimeWindowConfig] = None + copy_history_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer) + query_log_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer) + audit_log_load_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer) sql_aggregator: Optional[SqlAggregatorReport] = None @dataclass class SnowflakeQueriesSourceReport(SourceReport): + window: Optional[BaseTimeWindowConfig] = None queries_extractor: Optional[SnowflakeQueriesExtractorReport] = None -class SnowflakeQueriesExtractor(SnowflakeFilterMixin, SnowflakeIdentifierMixin): +class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin): def __init__( self, connection: SnowflakeConnection, config: SnowflakeQueriesExtractorConfig, structured_report: SourceReport, + filters: SnowflakeFilter, + identifiers: SnowflakeIdentifierBuilder, + graph: Optional[DataHubGraph] = None, + schema_resolver: Optional[SchemaResolver] = None, + discovered_tables: Optional[List[str]] = None, ): self.connection = connection self.config = config self.report = SnowflakeQueriesExtractorReport() + self.filters = filters + self.identifiers = identifiers + self.discovered_tables = discovered_tables + self._structured_report = structured_report self.aggregator = SqlParsingAggregator( - platform=self.platform, - platform_instance=self.config.platform_instance, - env=self.config.env, - # graph=self.ctx.graph, + platform=self.identifiers.platform, + platform_instance=self.identifiers.identifier_config.platform_instance, + env=self.identifiers.identifier_config.env, + schema_resolver=schema_resolver, + graph=graph, + eager_graph_load=False, generate_lineage=self.config.include_lineage, generate_queries=self.config.include_queries, generate_usage_statistics=self.config.include_usage_statistics, @@ -144,14 +163,6 @@ def __init__( def structured_reporter(self) -> SourceReport: return self._structured_report - @property - def filter_config(self) -> SnowflakeFilterConfig: - 
return self.config - - @property - def identifier_config(self) -> SnowflakeIdentifierConfig: - return self.config - @functools.cached_property def local_temp_path(self) -> pathlib.Path: if self.config.local_temp_path: @@ -170,13 +181,16 @@ def is_temp_table(self, name: str) -> bool: ) def is_allowed_table(self, name: str) -> bool: - return self.is_dataset_pattern_allowed(name, SnowflakeObjectDomain.TABLE) + if self.discovered_tables and name not in self.discovered_tables: + return False + + return self.filters.is_dataset_pattern_allowed( + name, SnowflakeObjectDomain.TABLE + ) def get_workunits_internal( self, ) -> Iterable[MetadataWorkUnit]: - self.report.window = self.config.window - # TODO: Add some logic to check if the cached audit log is stale or not. audit_log_file = self.local_temp_path / "audit_log.sqlite" use_cached_audit_log = audit_log_file.exists() @@ -191,74 +205,90 @@ def get_workunits_internal( shared_connection = ConnectionWrapper(audit_log_file) queries = FileBackedList(shared_connection) + entry: Union[KnownLineageMapping, PreparsedQuery] + + with self.report.copy_history_fetch_timer: + for entry in self.fetch_copy_history(): + queries.append(entry) - logger.info("Fetching audit log") - for entry in self.fetch_audit_log(): - queries.append(entry) + # TODO: Add "show external tables" lineage to the main schema extractor. + # Because it's not a time-based thing, it doesn't really make sense in the snowflake-queries extractor. - for query in queries: - self.aggregator.add(query) + with self.report.query_log_fetch_timer: + for entry in self.fetch_query_log(): + queries.append(entry) + + with self.report.audit_log_load_timer: + for query in queries: + self.aggregator.add(query) yield from auto_workunit(self.aggregator.gen_metadata()) - def fetch_audit_log( - self, - ) -> Iterable[Union[KnownLineageMapping, PreparsedQuery]]: - """ - # TODO: we need to fetch this info from somewhere - discovered_tables = [] - - snowflake_lineage_v2 = SnowflakeLineageExtractor( - config=self.config, # type: ignore - report=self.report, # type: ignore - dataset_urn_builder=self.gen_dataset_urn, - redundant_run_skip_handler=None, - sql_aggregator=self.aggregator, # TODO this should be unused - ) + def fetch_copy_history(self) -> Iterable[KnownLineageMapping]: + # Derived from _populate_external_lineage_from_copy_history. 
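# ---- editor's sketch (not part of this patch) ----
# A self-contained illustration of the row-level transformation that the new fetch_copy_history
# delegates to SnowflakeLineageExtractor._process_external_lineage_result_row: a COPY_HISTORY row
# with s3 upstream locations becomes an s3 -> table lineage mapping. The mapping class and urn
# helpers below are simplified local stand-ins, not the real datahub ones.
import json
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class _KnownLineageMapping:  # stand-in for datahub's KnownLineageMapping
    upstream_urn: str
    downstream_urn: str


def _copy_history_row_to_mapping(
    db_row: dict,
    discovered_tables: Optional[List[str]],
    gen_dataset_urn,  # callable mirroring SnowflakeIdentifierBuilder.gen_dataset_urn
) -> Optional[_KnownLineageMapping]:
    key = db_row["DOWNSTREAM_TABLE_NAME"].lower()
    # The schema-aware source passes its discovered-table list and skips unknown downstreams;
    # the standalone queries extractor passes None and keeps everything.
    if discovered_tables is not None and key not in discovered_tables:
        return None
    if db_row.get("UPSTREAM_LOCATIONS"):
        for loc in json.loads(db_row["UPSTREAM_LOCATIONS"]):
            if loc.startswith("s3://"):
                path = loc[len("s3://"):]
                return _KnownLineageMapping(
                    upstream_urn=f"urn:li:dataset:(urn:li:dataPlatform:s3,{path},PROD)",
                    downstream_urn=gen_dataset_urn(key),
                )
    return None


row = {
    "DOWNSTREAM_TABLE_NAME": "MY_DB.MY_SCHEMA.TARGET_TABLE",
    "UPSTREAM_LOCATIONS": json.dumps(["s3://my-bucket/exports/file.csv"]),
}
print(
    _copy_history_row_to_mapping(
        row, None, lambda name: f"urn:li:dataset:(urn:li:dataPlatform:snowflake,{name},PROD)"
    )
)
# ---- end of sketch ----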
- for ( - known_lineage_mapping - ) in snowflake_lineage_v2._populate_external_lineage_from_copy_history( - discovered_tables=discovered_tables - ): - interim_results.append(known_lineage_mapping) + query: str = SnowflakeQuery.copy_lineage_history( + start_time_millis=int(self.config.window.start_time.timestamp() * 1000), + end_time_millis=int(self.config.window.end_time.timestamp() * 1000), + downstreams_deny_pattern=self.config.temporary_tables_pattern, + ) - for ( - known_lineage_mapping - ) in snowflake_lineage_v2._populate_external_lineage_from_show_query( - discovered_tables=discovered_tables + with self.structured_reporter.report_exc( + "Error fetching copy history from Snowflake" ): - interim_results.append(known_lineage_mapping) - """ + logger.info("Fetching copy history from Snowflake") + resp = self.connection.query(query) + + for row in resp: + try: + result = ( + SnowflakeLineageExtractor._process_external_lineage_result_row( + row, + discovered_tables=self.discovered_tables, + identifiers=self.identifiers, + ) + ) + except Exception as e: + self.structured_reporter.warning( + "Error parsing copy history row", + context=f"{row}", + exc=e, + ) + else: + if result: + yield result - audit_log_query = _build_enriched_audit_log_query( + def fetch_query_log( + self, + ) -> Iterable[PreparsedQuery]: + query_log_query = _build_enriched_query_log_query( start_time=self.config.window.start_time, end_time=self.config.window.end_time, bucket_duration=self.config.window.bucket_duration, deny_usernames=self.config.deny_usernames, ) - resp = self.connection.query(audit_log_query) - - for i, row in enumerate(resp): - if i % 1000 == 0: - logger.info(f"Processed {i} audit log rows") - - assert isinstance(row, dict) - try: - entry = self._parse_audit_log_row(row) - except Exception as e: - self.structured_reporter.warning( - "Error parsing audit log row", - context=f"{row}", - exc=e, - ) - else: - yield entry - - def get_dataset_identifier_from_qualified_name(self, qualified_name: str) -> str: - # Copied from SnowflakeCommonMixin. 
- return self.snowflake_identifier(self.cleanup_qualified_name(qualified_name)) + with self.structured_reporter.report_exc( + "Error fetching query log from Snowflake" + ): + logger.info("Fetching query log from Snowflake") + resp = self.connection.query(query_log_query) + + for i, row in enumerate(resp): + if i % 1000 == 0: + logger.info(f"Processed {i} query log rows") + + assert isinstance(row, dict) + try: + entry = self._parse_audit_log_row(row) + except Exception as e: + self.structured_reporter.warning( + "Error parsing query log row", + context=f"{row}", + exc=e, + ) + else: + yield entry def _parse_audit_log_row(self, row: Dict[str, Any]) -> PreparsedQuery: json_fields = { @@ -280,13 +310,17 @@ def _parse_audit_log_row(self, row: Dict[str, Any]) -> PreparsedQuery: column_usage = {} for obj in direct_objects_accessed: - dataset = self.gen_dataset_urn( - self.get_dataset_identifier_from_qualified_name(obj["objectName"]) + dataset = self.identifiers.gen_dataset_urn( + self.identifiers.get_dataset_identifier_from_qualified_name( + obj["objectName"] + ) ) columns = set() for modified_column in obj["columns"]: - columns.add(self.snowflake_identifier(modified_column["columnName"])) + columns.add( + self.identifiers.snowflake_identifier(modified_column["columnName"]) + ) upstreams.append(dataset) column_usage[dataset] = columns @@ -301,8 +335,10 @@ def _parse_audit_log_row(self, row: Dict[str, Any]) -> PreparsedQuery: context=f"{row}", ) - downstream = self.gen_dataset_urn( - self.get_dataset_identifier_from_qualified_name(obj["objectName"]) + downstream = self.identifiers.gen_dataset_urn( + self.identifiers.get_dataset_identifier_from_qualified_name( + obj["objectName"] + ) ) column_lineage = [] for modified_column in obj["columns"]: @@ -310,18 +346,18 @@ def _parse_audit_log_row(self, row: Dict[str, Any]) -> PreparsedQuery: ColumnLineageInfo( downstream=DownstreamColumnRef( dataset=downstream, - column=self.snowflake_identifier( + column=self.identifiers.snowflake_identifier( modified_column["columnName"] ), ), upstreams=[ ColumnRef( - table=self.gen_dataset_urn( - self.get_dataset_identifier_from_qualified_name( + table=self.identifiers.gen_dataset_urn( + self.identifiers.get_dataset_identifier_from_qualified_name( upstream["objectName"] ) ), - column=self.snowflake_identifier( + column=self.identifiers.snowflake_identifier( upstream["columnName"] ), ) @@ -332,12 +368,9 @@ def _parse_audit_log_row(self, row: Dict[str, Any]) -> PreparsedQuery: ) ) - # TODO: Support filtering the table names. - # if objects_modified: - # breakpoint() - - # TODO implement email address mapping - user = CorpUserUrn(res["user_name"]) + # TODO: Fetch email addresses from Snowflake to map user -> email + # TODO: Support email_domain fallback for generating user urns. + user = CorpUserUrn(self.identifiers.snowflake_identifier(res["user_name"])) timestamp: datetime = res["query_start_time"] timestamp = timestamp.astimezone(timezone.utc) @@ -348,14 +381,18 @@ def _parse_audit_log_row(self, row: Dict[str, Any]) -> PreparsedQuery: ) entry = PreparsedQuery( - query_id=res["query_fingerprint"], + # Despite having Snowflake's fingerprints available, our own fingerprinting logic does a better + # job at eliminating redundant / repetitive queries. As such, we don't include the fingerprint + # here so that the aggregator auto-generates one. 
+ # query_id=res["query_fingerprint"], + query_id=None, query_text=res["query_text"], upstreams=upstreams, downstream=downstream, column_lineage=column_lineage, column_usage=column_usage, inferred_schema=None, - confidence_score=1, + confidence_score=1.0, query_count=res["query_count"], user=user, timestamp=timestamp, @@ -371,7 +408,14 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeQueriesSourceConfig): self.config = config self.report = SnowflakeQueriesSourceReport() - self.platform = "snowflake" + self.filters = SnowflakeFilter( + filter_config=self.config, + structured_reporter=self.report, + ) + self.identifiers = SnowflakeIdentifierBuilder( + identifier_config=self.config, + structured_reporter=self.report, + ) self.connection = self.config.connection.get_connection() @@ -379,6 +423,9 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeQueriesSourceConfig): connection=self.connection, config=self.config, structured_report=self.report, + filters=self.filters, + identifiers=self.identifiers, + graph=self.ctx.graph, ) self.report.queries_extractor = self.queries_extractor.report @@ -388,6 +435,8 @@ def create(cls, config_dict: dict, ctx: PipelineContext) -> Self: return cls(ctx, config) def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + self.report.window = self.config.window + # TODO: Disable auto status processor? return self.queries_extractor.get_workunits_internal() @@ -399,7 +448,7 @@ def get_report(self) -> SnowflakeQueriesSourceReport: _MAX_TABLES_PER_QUERY = 20 -def _build_enriched_audit_log_query( +def _build_enriched_query_log_query( start_time: datetime, end_time: datetime, bucket_duration: BucketDuration, diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_report.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_report.py index 4924546383aa4..80b6be36e5ffa 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_report.py @@ -15,6 +15,9 @@ from datahub.utilities.perf_timer import PerfTimer if TYPE_CHECKING: + from datahub.ingestion.source.snowflake.snowflake_queries import ( + SnowflakeQueriesExtractorReport, + ) from datahub.ingestion.source.snowflake.snowflake_schema import ( SnowflakeDataDictionary, ) @@ -113,6 +116,8 @@ class SnowflakeV2Report( data_dictionary_cache: Optional["SnowflakeDataDictionary"] = None + queries_extractor: Optional["SnowflakeQueriesExtractorReport"] = None + # These will be non-zero if snowflake information_schema queries fail with error - # "Information schema query returned too much data. Please repeat query with more selective predicates."" # This will result in overall increase in time complexity diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py index ce8f20d23aa6b..600292c2c9942 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py @@ -185,8 +185,6 @@ def get_column_tags_for_table( class SnowflakeDataDictionary(SupportsAsObj): def __init__(self, connection: SnowflakeConnection) -> None: - self.logger = logger - self.connection = connection def as_obj(self) -> Dict[str, Dict[str, int]]: @@ -514,7 +512,7 @@ def get_tags_for_database_without_propagation( ) else: # This should never happen. 
- self.logger.error(f"Encountered an unexpected domain: {domain}") + logger.error(f"Encountered an unexpected domain: {domain}") continue return tags diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py index 72f8f8ad793fd..1d4a5b377da14 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py @@ -1,6 +1,6 @@ import itertools import logging -from typing import Callable, Dict, Iterable, List, Optional, Union +from typing import Dict, Iterable, List, Optional, Union from datahub.configuration.pattern_utils import is_schema_allowed from datahub.emitter.mce_builder import ( @@ -26,8 +26,6 @@ SnowflakeObjectDomain, ) from datahub.ingestion.source.snowflake.snowflake_config import ( - SnowflakeFilterConfig, - SnowflakeIdentifierConfig, SnowflakeV2Config, TagOption, ) @@ -52,8 +50,9 @@ ) from datahub.ingestion.source.snowflake.snowflake_tag import SnowflakeTagExtractor from datahub.ingestion.source.snowflake.snowflake_utils import ( - SnowflakeFilterMixin, - SnowflakeIdentifierMixin, + SnowflakeFilter, + SnowflakeIdentifierBuilder, + SnowflakeStructuredReportMixin, SnowsightUrlBuilder, ) from datahub.ingestion.source.sql.sql_utils import ( @@ -142,13 +141,16 @@ } -class SnowflakeSchemaGenerator(SnowflakeFilterMixin, SnowflakeIdentifierMixin): +class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin): + platform = "snowflake" + def __init__( self, config: SnowflakeV2Config, report: SnowflakeV2Report, connection: SnowflakeConnection, - dataset_urn_builder: Callable[[str], str], + filters: SnowflakeFilter, + identifiers: SnowflakeIdentifierBuilder, domain_registry: Optional[DomainRegistry], profiler: Optional[SnowflakeProfiler], aggregator: Optional[SqlParsingAggregator], @@ -157,7 +159,8 @@ def __init__( self.config: SnowflakeV2Config = config self.report: SnowflakeV2Report = report self.connection: SnowflakeConnection = connection - self.dataset_urn_builder = dataset_urn_builder + self.filters: SnowflakeFilter = filters + self.identifiers: SnowflakeIdentifierBuilder = identifiers self.data_dictionary: SnowflakeDataDictionary = SnowflakeDataDictionary( connection=self.connection @@ -185,19 +188,17 @@ def get_connection(self) -> SnowflakeConnection: def structured_reporter(self) -> SourceReport: return self.report - @property - def filter_config(self) -> SnowflakeFilterConfig: - return self.config + def gen_dataset_urn(self, dataset_identifier: str) -> str: + return self.identifiers.gen_dataset_urn(dataset_identifier) - @property - def identifier_config(self) -> SnowflakeIdentifierConfig: - return self.config + def snowflake_identifier(self, identifier: str) -> str: + return self.identifiers.snowflake_identifier(identifier) def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.databases = [] for database in self.get_databases() or []: self.report.report_entity_scanned(database.name, "database") - if not self.filter_config.database_pattern.allowed(database.name): + if not self.filters.filter_config.database_pattern.allowed(database.name): self.report.report_dropped(f"{database.name}.*") else: self.databases.append(database) @@ -211,7 +212,10 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: yield from self._process_database(snowflake_db) except SnowflakePermissionError as e: - 
self.report_error(GENERIC_PERMISSION_ERROR_KEY, str(e)) + self.structured_reporter.failure( + GENERIC_PERMISSION_ERROR_KEY, + exc=e, + ) return def get_databases(self) -> Optional[List[SnowflakeDatabase]]: @@ -220,10 +224,9 @@ def get_databases(self) -> Optional[List[SnowflakeDatabase]]: # whose information_schema can be queried to start with. databases = self.data_dictionary.show_databases() except Exception as e: - logger.debug(f"Failed to list databases due to error {e}", exc_info=e) - self.report_error( - "list-databases", - f"Failed to list databases due to error {e}", + self.structured_reporter.failure( + "Failed to list databases", + exc=e, ) return None else: @@ -232,7 +235,7 @@ def get_databases(self) -> Optional[List[SnowflakeDatabase]]: ] = self.get_databases_from_ischema(databases) if len(ischema_databases) == 0: - self.report_error( + self.structured_reporter.failure( GENERIC_PERMISSION_ERROR_KEY, "No databases found. Please check permissions.", ) @@ -275,7 +278,7 @@ def _process_database( # This may happen if REFERENCE_USAGE permissions are set # We can not run show queries on database in such case. # This need not be a failure case. - self.report_warning( + self.structured_reporter.warning( "Insufficient privileges to operate on database, skipping. Please grant USAGE permissions on database to extract its metadata.", db_name, ) @@ -284,9 +287,8 @@ def _process_database( f"Failed to use database {db_name} due to error {e}", exc_info=e, ) - self.report_warning( - "Failed to get schemas for database", - db_name, + self.structured_reporter.warning( + "Failed to get schemas for database", db_name, exc=e ) return @@ -342,10 +344,10 @@ def fetch_schemas_for_database( for schema in self.data_dictionary.get_schemas_for_database(db_name): self.report.report_entity_scanned(schema.name, "schema") if not is_schema_allowed( - self.filter_config.schema_pattern, + self.filters.filter_config.schema_pattern, schema.name, db_name, - self.filter_config.match_fully_qualified_names, + self.filters.filter_config.match_fully_qualified_names, ): self.report.report_dropped(f"{db_name}.{schema.name}.*") else: @@ -356,17 +358,14 @@ def fetch_schemas_for_database( # Ideal implementation would use PEP 678 – Enriching Exceptions with Notes raise SnowflakePermissionError(error_msg) from e.__cause__ else: - logger.debug( - f"Failed to get schemas for database {db_name} due to error {e}", - exc_info=e, - ) - self.report_warning( + self.structured_reporter.warning( "Failed to get schemas for database", db_name, + exc=e, ) if not schemas: - self.report_warning( + self.structured_reporter.warning( "No schemas found in database. If schemas exist, please grant USAGE permissions on them.", db_name, ) @@ -421,12 +420,12 @@ def _process_schema( and self.config.parse_view_ddl ): for view in views: - view_identifier = self.get_dataset_identifier( + view_identifier = self.identifiers.get_dataset_identifier( view.name, schema_name, db_name ) if view.view_definition: self.aggregator.add_view_definition( - view_urn=self.dataset_urn_builder(view_identifier), + view_urn=self.identifiers.gen_dataset_urn(view_identifier), view_definition=view.view_definition, default_db=db_name, default_schema=schema_name, @@ -441,9 +440,10 @@ def _process_schema( yield from self._process_tag(tag) if not snowflake_schema.views and not snowflake_schema.tables: - self.report_warning( - "No tables/views found in schema. 
If tables exist, please grant REFERENCES or SELECT permissions on them.", - f"{db_name}.{schema_name}", + self.structured_reporter.warning( + title="No tables/views found in schema", + message="If tables exist, please grant REFERENCES or SELECT permissions on them.", + context=f"{db_name}.{schema_name}", ) def fetch_views_for_schema( @@ -452,11 +452,13 @@ def fetch_views_for_schema( try: views: List[SnowflakeView] = [] for view in self.get_views_for_schema(schema_name, db_name): - view_name = self.get_dataset_identifier(view.name, schema_name, db_name) + view_name = self.identifiers.get_dataset_identifier( + view.name, schema_name, db_name + ) self.report.report_entity_scanned(view_name, "view") - if not self.filter_config.view_pattern.allowed(view_name): + if not self.filters.filter_config.view_pattern.allowed(view_name): self.report.report_dropped(view_name) else: views.append(view) @@ -469,13 +471,10 @@ def fetch_views_for_schema( raise SnowflakePermissionError(error_msg) from e.__cause__ else: - logger.debug( - f"Failed to get views for schema {db_name}.{schema_name} due to error {e}", - exc_info=e, - ) - self.report_warning( + self.structured_reporter.warning( "Failed to get views for schema", f"{db_name}.{schema_name}", + exc=e, ) return [] @@ -485,11 +484,13 @@ def fetch_tables_for_schema( try: tables: List[SnowflakeTable] = [] for table in self.get_tables_for_schema(schema_name, db_name): - table_identifier = self.get_dataset_identifier( + table_identifier = self.identifiers.get_dataset_identifier( table.name, schema_name, db_name ) self.report.report_entity_scanned(table_identifier) - if not self.filter_config.table_pattern.allowed(table_identifier): + if not self.filters.filter_config.table_pattern.allowed( + table_identifier + ): self.report.report_dropped(table_identifier) else: tables.append(table) @@ -501,13 +502,10 @@ def fetch_tables_for_schema( error_msg = f"Failed to get tables for schema {db_name}.{schema_name}. Please check permissions." 
raise SnowflakePermissionError(error_msg) from e.__cause__ else: - logger.debug( - f"Failed to get tables for schema {db_name}.{schema_name} due to error {e}", - exc_info=e, - ) - self.report_warning( + self.structured_reporter.warning( "Failed to get tables for schema", f"{db_name}.{schema_name}", + exc=e, ) return [] @@ -526,7 +524,9 @@ def _process_table( db_name: str, ) -> Iterable[MetadataWorkUnit]: schema_name = snowflake_schema.name - table_identifier = self.get_dataset_identifier(table.name, schema_name, db_name) + table_identifier = self.identifiers.get_dataset_identifier( + table.name, schema_name, db_name + ) try: table.columns = self.get_columns_for_table( @@ -538,11 +538,9 @@ def _process_table( table.name, schema_name, db_name ) except Exception as e: - logger.debug( - f"Failed to get columns for table {table_identifier} due to error {e}", - exc_info=e, + self.structured_reporter.warning( + "Failed to get columns for table", table_identifier, exc=e ) - self.report_warning("Failed to get columns for table", table_identifier) if self.config.extract_tags != TagOption.skip: table.tags = self.tag_extractor.get_tags_on_object( @@ -575,11 +573,9 @@ def fetch_foreign_keys_for_table( table.name, schema_name, db_name ) except Exception as e: - logger.debug( - f"Failed to get foreign key for table {table_identifier} due to error {e}", - exc_info=e, + self.structured_reporter.warning( + "Failed to get foreign keys for table", table_identifier, exc=e ) - self.report_warning("Failed to get foreign key for table", table_identifier) def fetch_pk_for_table( self, @@ -593,11 +589,9 @@ def fetch_pk_for_table( table.name, schema_name, db_name ) except Exception as e: - logger.debug( - f"Failed to get primary key for table {table_identifier} due to error {e}", - exc_info=e, + self.structured_reporter.warning( + "Failed to get primary key for table", table_identifier, exc=e ) - self.report_warning("Failed to get primary key for table", table_identifier) def _process_view( self, @@ -606,7 +600,9 @@ def _process_view( db_name: str, ) -> Iterable[MetadataWorkUnit]: schema_name = snowflake_schema.name - view_name = self.get_dataset_identifier(view.name, schema_name, db_name) + view_name = self.identifiers.get_dataset_identifier( + view.name, schema_name, db_name + ) try: view.columns = self.get_columns_for_table( @@ -617,11 +613,9 @@ def _process_view( view.name, schema_name, db_name ) except Exception as e: - logger.debug( - f"Failed to get columns for view {view_name} due to error {e}", - exc_info=e, + self.structured_reporter.warning( + "Failed to get columns for view", view_name, exc=e ) - self.report_warning("Failed to get columns for view", view_name) if self.config.extract_tags != TagOption.skip: view.tags = self.tag_extractor.get_tags_on_object( @@ -657,8 +651,10 @@ def gen_dataset_workunits( for tag in table.column_tags[column_name]: yield from self._process_tag(tag) - dataset_name = self.get_dataset_identifier(table.name, schema_name, db_name) - dataset_urn = self.dataset_urn_builder(dataset_name) + dataset_name = self.identifiers.get_dataset_identifier( + table.name, schema_name, db_name + ) + dataset_urn = self.identifiers.gen_dataset_urn(dataset_name) status = Status(removed=False) yield MetadataChangeProposalWrapper( @@ -799,8 +795,10 @@ def gen_schema_metadata( schema_name: str, db_name: str, ) -> SchemaMetadata: - dataset_name = self.get_dataset_identifier(table.name, schema_name, db_name) - dataset_urn = self.dataset_urn_builder(dataset_name) + dataset_name = 
self.identifiers.get_dataset_identifier( + table.name, schema_name, db_name + ) + dataset_urn = self.identifiers.gen_dataset_urn(dataset_name) foreign_keys: Optional[List[ForeignKeyConstraint]] = None if isinstance(table, SnowflakeTable) and len(table.foreign_keys) > 0: @@ -859,7 +857,7 @@ def build_foreign_keys( for fk in table.foreign_keys: foreign_dataset = make_dataset_urn_with_platform_instance( platform=self.platform, - name=self.get_dataset_identifier( + name=self.identifiers.get_dataset_identifier( fk.referred_table, fk.referred_schema, fk.referred_database ), env=self.config.env, diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py index dad0ce7b59ee1..794a6f4a59f46 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py @@ -1,5 +1,5 @@ import logging -from typing import Callable, Iterable, List +from typing import Iterable, List from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance from datahub.emitter.mcp import MetadataChangeProposalWrapper @@ -26,12 +26,9 @@ def __init__( self, config: SnowflakeV2Config, report: SnowflakeV2Report, - dataset_urn_builder: Callable[[str], str], ) -> None: self.config = config self.report = report - self.logger = logger - self.dataset_urn_builder = dataset_urn_builder def get_shares_workunits( self, databases: List[SnowflakeDatabase] @@ -94,9 +91,10 @@ def report_missing_databases( missing_dbs = [db for db in inbounds + outbounds if db not in db_names] if missing_dbs and self.config.platform_instance: - self.report_warning( - "snowflake-shares", - f"Databases {missing_dbs} were not ingested. Siblings/Lineage will not be set for these.", + self.report.warning( + title="Extra Snowflake share configurations", + message="Some databases referenced by the share configs were not ingested. 
Siblings/lineage will not be set for these.", + context=f"{missing_dbs}", ) elif missing_dbs: logger.debug( @@ -113,15 +111,15 @@ def gen_siblings( ) -> Iterable[MetadataWorkUnit]: if not sibling_databases: return - dataset_identifier = self.get_dataset_identifier( + dataset_identifier = self.identifiers.get_dataset_identifier( table_name, schema_name, database_name ) - urn = self.dataset_urn_builder(dataset_identifier) + urn = self.identifiers.gen_dataset_urn(dataset_identifier) sibling_urns = [ make_dataset_urn_with_platform_instance( - self.platform, - self.get_dataset_identifier( + self.identifiers.platform, + self.identifiers.get_dataset_identifier( table_name, schema_name, sibling_db.database ), sibling_db.platform_instance, @@ -141,14 +139,14 @@ def get_upstream_lineage_with_primary_sibling( table_name: str, primary_sibling_db: DatabaseId, ) -> MetadataWorkUnit: - dataset_identifier = self.get_dataset_identifier( + dataset_identifier = self.identifiers.get_dataset_identifier( table_name, schema_name, database_name ) - urn = self.dataset_urn_builder(dataset_identifier) + urn = self.identifiers.gen_dataset_urn(dataset_identifier) upstream_urn = make_dataset_urn_with_platform_instance( - self.platform, - self.get_dataset_identifier( + self.identifiers.platform, + self.identifiers.get_dataset_identifier( table_name, schema_name, primary_sibling_db.database ), primary_sibling_db.platform_instance, diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_summary.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_summary.py index f78ae70291f8a..72952f6b76e8b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_summary.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_summary.py @@ -1,5 +1,4 @@ import dataclasses -import logging from collections import defaultdict from typing import Dict, Iterable, List, Optional @@ -9,7 +8,10 @@ from datahub.ingestion.api.decorators import SupportStatus, config_class, support_status from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeFilterConfig +from datahub.ingestion.source.snowflake.snowflake_config import ( + SnowflakeFilterConfig, + SnowflakeIdentifierConfig, +) from datahub.ingestion.source.snowflake.snowflake_connection import ( SnowflakeConnectionConfig, ) @@ -17,6 +19,9 @@ from datahub.ingestion.source.snowflake.snowflake_schema_gen import ( SnowflakeSchemaGenerator, ) +from datahub.ingestion.source.snowflake.snowflake_utils import ( + SnowflakeIdentifierBuilder, +) from datahub.ingestion.source_report.time_window import BaseTimeWindowReport from datahub.utilities.lossy_collections import LossyList @@ -59,7 +64,6 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeSummaryConfig): super().__init__(ctx) self.config: SnowflakeSummaryConfig = config self.report: SnowflakeSummaryReport = SnowflakeSummaryReport() - self.logger = logging.getLogger(__name__) self.connection = self.config.get_connection() @@ -69,7 +73,10 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: config=self.config, # type: ignore report=self.report, # type: ignore connection=self.connection, - dataset_urn_builder=lambda x: "", + identifiers=SnowflakeIdentifierBuilder( + identifier_config=SnowflakeIdentifierConfig(), + structured_reporter=self.report, + ), domain_registry=None, profiler=None, aggregator=None, diff 
--git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_tag.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_tag.py index e6b4ef1fd9607..9307eb607be26 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_tag.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_tag.py @@ -27,7 +27,6 @@ def __init__( self.config = config self.data_dictionary = data_dictionary self.report = report - self.logger = logger self.tag_cache: Dict[str, _SnowflakeTagCache] = {} @@ -69,16 +68,18 @@ def _get_tags_on_object_with_propagation( ) -> List[SnowflakeTag]: identifier = "" if domain == SnowflakeObjectDomain.DATABASE: - identifier = self.get_quoted_identifier_for_database(db_name) + identifier = self.identifiers.get_quoted_identifier_for_database(db_name) elif domain == SnowflakeObjectDomain.SCHEMA: assert schema_name is not None - identifier = self.get_quoted_identifier_for_schema(db_name, schema_name) + identifier = self.identifiers.get_quoted_identifier_for_schema( + db_name, schema_name + ) elif ( domain == SnowflakeObjectDomain.TABLE ): # Views belong to this domain as well. assert schema_name is not None assert table_name is not None - identifier = self.get_quoted_identifier_for_table( + identifier = self.identifiers.get_quoted_identifier_for_table( db_name, schema_name, table_name ) else: @@ -140,7 +141,7 @@ def get_column_tags_for_table( elif self.config.extract_tags == TagOption.with_lineage: self.report.num_get_tags_on_columns_for_table_queries += 1 temp_column_tags = self.data_dictionary.get_tags_on_columns_for_table( - quoted_table_name=self.get_quoted_identifier_for_table( + quoted_table_name=self.identifiers.get_quoted_identifier_for_table( db_name, schema_name, table_name ), db_name=db_name, diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py index c5e0994059f2e..aff15386c5083 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py @@ -2,7 +2,7 @@ import logging import time from datetime import datetime, timezone -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple +from typing import Any, Dict, Iterable, List, Optional, Tuple import pydantic @@ -20,7 +20,11 @@ ) from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report -from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin +from datahub.ingestion.source.snowflake.snowflake_utils import ( + SnowflakeCommonMixin, + SnowflakeFilter, + SnowflakeIdentifierBuilder, +) from datahub.ingestion.source.state.redundant_run_skip_handler import ( RedundantUsageRunSkipHandler, ) @@ -112,13 +116,14 @@ def __init__( config: SnowflakeV2Config, report: SnowflakeV2Report, connection: SnowflakeConnection, - dataset_urn_builder: Callable[[str], str], + filter: SnowflakeFilter, + identifiers: SnowflakeIdentifierBuilder, redundant_run_skip_handler: Optional[RedundantUsageRunSkipHandler], ) -> None: self.config: SnowflakeV2Config = config self.report: SnowflakeV2Report = report - self.dataset_urn_builder = dataset_urn_builder - self.logger = logger + self.filter = filter + self.identifiers = identifiers self.connection = connection self.redundant_run_skip_handler = 
redundant_run_skip_handler @@ -171,7 +176,7 @@ def get_usage_workunits( bucket_duration=self.config.bucket_duration, ), dataset_urns={ - self.dataset_urn_builder(dataset_identifier) + self.identifiers.gen_dataset_urn(dataset_identifier) for dataset_identifier in discovered_datasets }, ) @@ -232,7 +237,7 @@ def _get_workunits_internal( logger.debug(f"Processing usage row number {results.rownumber}") logger.debug(self.report.usage_aggregation.as_string()) - if not self.is_dataset_pattern_allowed( + if not self.filter.is_dataset_pattern_allowed( row["OBJECT_NAME"], row["OBJECT_DOMAIN"], ): @@ -242,7 +247,7 @@ def _get_workunits_internal( continue dataset_identifier = ( - self.get_dataset_identifier_from_qualified_name( + self.identifiers.get_dataset_identifier_from_qualified_name( row["OBJECT_NAME"] ) ) @@ -279,7 +284,8 @@ def build_usage_statistics_for_dataset( fieldCounts=self._map_field_counts(row["FIELD_COUNTS"]), ) return MetadataChangeProposalWrapper( - entityUrn=self.dataset_urn_builder(dataset_identifier), aspect=stats + entityUrn=self.identifiers.gen_dataset_urn(dataset_identifier), + aspect=stats, ).as_workunit() except Exception as e: logger.debug( @@ -356,7 +362,9 @@ def _map_field_counts(self, field_counts_str: str) -> List[DatasetFieldUsageCoun return sorted( [ DatasetFieldUsageCounts( - fieldPath=self.snowflake_identifier(field_count["col"]), + fieldPath=self.identifiers.snowflake_identifier( + field_count["col"] + ), count=field_count["total"], ) for field_count in field_counts @@ -454,8 +462,10 @@ def _get_operation_aspect_work_unit( for obj in event.objects_modified: resource = obj.objectName - dataset_identifier = self.get_dataset_identifier_from_qualified_name( - resource + dataset_identifier = ( + self.identifiers.get_dataset_identifier_from_qualified_name( + resource + ) ) if dataset_identifier not in discovered_datasets: @@ -476,7 +486,7 @@ def _get_operation_aspect_work_unit( ), ) mcp = MetadataChangeProposalWrapper( - entityUrn=self.dataset_urn_builder(dataset_identifier), + entityUrn=self.identifiers.gen_dataset_urn(dataset_identifier), aspect=operation_aspect, ) wu = MetadataWorkUnit( @@ -561,7 +571,7 @@ def _is_unsupported_object_accessed(self, obj: Dict[str, Any]) -> bool: def _is_object_valid(self, obj: Dict[str, Any]) -> bool: if self._is_unsupported_object_accessed( obj - ) or not self.is_dataset_pattern_allowed( + ) or not self.filter.is_dataset_pattern_allowed( obj.get("objectName"), obj.get("objectDomain") ): return False diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py index c33fbb3d0bfc8..a1878963d3798 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py @@ -1,8 +1,7 @@ import abc +from functools import cached_property from typing import ClassVar, Literal, Optional, Tuple -from typing_extensions import Protocol - from datahub.configuration.pattern_utils import is_schema_allowed from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance from datahub.ingestion.api.source import SourceReport @@ -25,42 +24,6 @@ class SnowflakeStructuredReportMixin(abc.ABC): def structured_reporter(self) -> SourceReport: ... - # TODO: Eventually I want to deprecate these methods and use the structured_reporter directly. 
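# ---- editor's sketch (not part of this patch) ----
# The report_warning/report_error helpers being removed here are replaced throughout this diff by
# structured_reporter.warning/failure calls that take a stable title plus optional message,
# context, and exc. The reporter class below is a minimal stand-in for datahub's SourceReport,
# kept only so the example runs on its own.
import logging
from typing import Optional

logger = logging.getLogger("structured_report_sketch")


class _StandInReporter:
    def warning(
        self,
        title: str,
        message: Optional[str] = None,
        context: Optional[str] = None,
        exc: Optional[BaseException] = None,
    ) -> None:
        # The real SourceReport aggregates these into the ingestion report; here we just log.
        logger.warning("%s | message=%s | context=%s", title, message, context, exc_info=exc)

    def failure(self, title: str, message: Optional[str] = None, **kwargs: object) -> None:
        logger.error("%s | message=%s", title, message)


reporter = _StandInReporter()
try:
    raise PermissionError("missing IMPORTED PRIVILEGES on the SNOWFLAKE database")
except Exception as e:
    # Old style: self.report_warning("external_lineage", f"... failed due to error {e}.")
    # New style: a stable title, optional context identifying the entity, and the exception itself.
    reporter.warning(
        "Error populating external table lineage from Snowflake",
        context="my_db.my_schema.my_table",
        exc=e,
    )
# ---- end of sketch ----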
- def report_warning(self, key: str, reason: str) -> None: - self.structured_reporter.warning(key, reason) - - def report_error(self, key: str, reason: str) -> None: - self.structured_reporter.failure(key, reason) - - -# Required only for mypy, since we are using mixin classes, and not inheritance. -# Reference - https://mypy.readthedocs.io/en/latest/more_types.html#mixin-classes -class SnowflakeCommonProtocol(Protocol): - platform: str = "snowflake" - - config: SnowflakeV2Config - report: SnowflakeV2Report - - def get_dataset_identifier( - self, table_name: str, schema_name: str, db_name: str - ) -> str: - ... - - def cleanup_qualified_name(self, qualified_name: str) -> str: - ... - - def get_dataset_identifier_from_qualified_name(self, qualified_name: str) -> str: - ... - - def snowflake_identifier(self, identifier: str) -> str: - ... - - def report_warning(self, key: str, reason: str) -> None: - ... - - def report_error(self, key: str, reason: str) -> None: - ... - class SnowsightUrlBuilder: CLOUD_REGION_IDS_WITHOUT_CLOUD_SUFFIX: ClassVar = [ @@ -140,17 +103,14 @@ def get_external_url_for_database(self, db_name: str) -> Optional[str]: return f"{self.snowsight_base_url}#/data/databases/{db_name}/" -class SnowflakeFilterMixin(SnowflakeStructuredReportMixin): - @property - @abc.abstractmethod - def filter_config(self) -> SnowflakeFilterConfig: - ... +class SnowflakeFilter: + def __init__( + self, filter_config: SnowflakeFilterConfig, structured_reporter: SourceReport + ) -> None: + self.filter_config = filter_config + self.structured_reporter = structured_reporter - @staticmethod - def _combine_identifier_parts( - table_name: str, schema_name: str, db_name: str - ) -> str: - return f"{db_name}.{schema_name}.{table_name}" + # TODO: Refactor remaining filtering logic into this class. def is_dataset_pattern_allowed( self, @@ -167,28 +127,35 @@ def is_dataset_pattern_allowed( SnowflakeObjectDomain.MATERIALIZED_VIEW, ): return False + if len(dataset_params) != 3: - self.report_warning( - "invalid-dataset-pattern", - f"Found {dataset_params} of type {dataset_type}", + self.structured_reporter.info( + title="Unexpected dataset pattern", + message=f"Found a {dataset_type} with an unexpected number of parts. Database and schema filtering will not work as expected, but table filtering will still work.", + context=dataset_name, ) - # NOTE: this case returned `True` earlier when extracting lineage - return False + # We fall-through here so table/view filtering still works. 
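# ---- editor's sketch (not part of this patch) ----
# The rewritten SnowflakeFilter.is_dataset_pattern_allowed checks that follow no longer reject
# names that do not split into exactly three parts; they report them and fall through, and the
# database/schema checks are guarded so they only run for the parts that exist. This standalone
# version uses fnmatch lists as stand-ins for datahub's AllowDenyPattern objects.
import fnmatch
from typing import List


def _allowed(patterns: List[str], name: str) -> bool:
    return any(fnmatch.fnmatch(name, pattern) for pattern in patterns)


def is_dataset_pattern_allowed_sketch(
    dataset_name: str,
    database_allow: List[str],
    schema_allow: List[str],
    table_allow: List[str],
) -> bool:
    parts = dataset_name.split(".")
    # The real code emits an "Unexpected dataset pattern" info when len(parts) != 3, then falls
    # through so that table/view-level filtering still applies.
    if len(parts) >= 1 and not _allowed(database_allow, parts[0].strip('"')):
        return False
    if len(parts) >= 2 and not _allowed(schema_allow, parts[1].strip('"')):
        return False
    return _allowed(table_allow, dataset_name.replace('"', ""))


print(is_dataset_pattern_allowed_sketch('"MY_DB"."MY_SCHEMA".ORDERS', ["MY_DB"], ["*"], ["*"]))  # True
print(is_dataset_pattern_allowed_sketch("MY_DB.MY_SCHEMA.SECRET", ["MY_DB"], ["*"], ["*.ORDERS"]))  # False
# ---- end of sketch ----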
- if not self.filter_config.database_pattern.allowed( - dataset_params[0].strip('"') - ) or not is_schema_allowed( - self.filter_config.schema_pattern, - dataset_params[1].strip('"'), - dataset_params[0].strip('"'), - self.filter_config.match_fully_qualified_names, + if ( + len(dataset_params) >= 1 + and not self.filter_config.database_pattern.allowed( + dataset_params[0].strip('"') + ) + ) or ( + len(dataset_params) >= 2 + and not is_schema_allowed( + self.filter_config.schema_pattern, + dataset_params[1].strip('"'), + dataset_params[0].strip('"'), + self.filter_config.match_fully_qualified_names, + ) ): return False if dataset_type.lower() in { SnowflakeObjectDomain.TABLE } and not self.filter_config.table_pattern.allowed( - self.cleanup_qualified_name(dataset_name) + _cleanup_qualified_name(dataset_name, self.structured_reporter) ): return False @@ -196,41 +163,53 @@ def is_dataset_pattern_allowed( SnowflakeObjectDomain.VIEW, SnowflakeObjectDomain.MATERIALIZED_VIEW, } and not self.filter_config.view_pattern.allowed( - self.cleanup_qualified_name(dataset_name) + _cleanup_qualified_name(dataset_name, self.structured_reporter) ): return False return True - # Qualified Object names from snowflake audit logs have quotes for for snowflake quoted identifiers, - # For example "test-database"."test-schema".test_table - # whereas we generate urns without quotes even for quoted identifiers for backward compatibility - # and also unavailability of utility function to identify whether current table/schema/database - # name should be quoted in above method get_dataset_identifier - def cleanup_qualified_name(self, qualified_name: str) -> str: - name_parts = qualified_name.split(".") - if len(name_parts) != 3: - self.structured_reporter.report_warning( - title="Unexpected dataset pattern", - message="We failed to parse a Snowflake qualified name into its constituent parts. " - "DB/schema/table filtering may not work as expected on these entities.", - context=f"{qualified_name} has {len(name_parts)} parts", - ) - return qualified_name.replace('"', "") - return SnowflakeFilterMixin._combine_identifier_parts( - table_name=name_parts[2].strip('"'), - schema_name=name_parts[1].strip('"'), - db_name=name_parts[0].strip('"'), + +def _combine_identifier_parts( + *, table_name: str, schema_name: str, db_name: str +) -> str: + return f"{db_name}.{schema_name}.{table_name}" + + +# Qualified Object names from snowflake audit logs have quotes for for snowflake quoted identifiers, +# For example "test-database"."test-schema".test_table +# whereas we generate urns without quotes even for quoted identifiers for backward compatibility +# and also unavailability of utility function to identify whether current table/schema/database +# name should be quoted in above method get_dataset_identifier +def _cleanup_qualified_name( + qualified_name: str, structured_reporter: SourceReport +) -> str: + name_parts = qualified_name.split(".") + if len(name_parts) != 3: + structured_reporter.info( + title="Unexpected dataset pattern", + message="We failed to parse a Snowflake qualified name into its constituent parts. 
" + "DB/schema/table filtering may not work as expected on these entities.", + context=f"{qualified_name} has {len(name_parts)} parts", ) + return qualified_name.replace('"', "") + return _combine_identifier_parts( + db_name=name_parts[0].strip('"'), + schema_name=name_parts[1].strip('"'), + table_name=name_parts[2].strip('"'), + ) -class SnowflakeIdentifierMixin(abc.ABC): +class SnowflakeIdentifierBuilder: platform = "snowflake" - @property - @abc.abstractmethod - def identifier_config(self) -> SnowflakeIdentifierConfig: - ... + def __init__( + self, + identifier_config: SnowflakeIdentifierConfig, + structured_reporter: SourceReport, + ) -> None: + self.identifier_config = identifier_config + self.structured_reporter = structured_reporter def snowflake_identifier(self, identifier: str) -> str: # to be in in sync with older connector, convert name to lowercase @@ -242,7 +221,7 @@ def get_dataset_identifier( self, table_name: str, schema_name: str, db_name: str ) -> str: return self.snowflake_identifier( - SnowflakeCommonMixin._combine_identifier_parts( + _combine_identifier_parts( table_name=table_name, schema_name=schema_name, db_name=db_name ) ) @@ -255,20 +234,10 @@ def gen_dataset_urn(self, dataset_identifier: str) -> str: env=self.identifier_config.env, ) - -# TODO: We're most of the way there on fully removing SnowflakeCommonProtocol. -class SnowflakeCommonMixin(SnowflakeFilterMixin, SnowflakeIdentifierMixin): - @property - def structured_reporter(self: SnowflakeCommonProtocol) -> SourceReport: - return self.report - - @property - def filter_config(self: SnowflakeCommonProtocol) -> SnowflakeFilterConfig: - return self.config - - @property - def identifier_config(self: SnowflakeCommonProtocol) -> SnowflakeIdentifierConfig: - return self.config + def get_dataset_identifier_from_qualified_name(self, qualified_name: str) -> str: + return self.snowflake_identifier( + _cleanup_qualified_name(qualified_name, self.structured_reporter) + ) @staticmethod def get_quoted_identifier_for_database(db_name): @@ -278,40 +247,51 @@ def get_quoted_identifier_for_database(db_name): def get_quoted_identifier_for_schema(db_name, schema_name): return f'"{db_name}"."{schema_name}"' - def get_dataset_identifier_from_qualified_name(self, qualified_name: str) -> str: - return self.snowflake_identifier(self.cleanup_qualified_name(qualified_name)) - @staticmethod def get_quoted_identifier_for_table(db_name, schema_name, table_name): return f'"{db_name}"."{schema_name}"."{table_name}"' + +class SnowflakeCommonMixin(SnowflakeStructuredReportMixin): + platform = "snowflake" + + config: SnowflakeV2Config + report: SnowflakeV2Report + + @property + def structured_reporter(self) -> SourceReport: + return self.report + + @cached_property + def identifiers(self) -> SnowflakeIdentifierBuilder: + return SnowflakeIdentifierBuilder(self.config, self.report) + # Note - decide how to construct user urns. # Historically urns were created using part before @ from user's email. # Users without email were skipped from both user entries as well as aggregates. # However email is not mandatory field in snowflake user, user_name is always present. 
def get_user_identifier( - self: SnowflakeCommonProtocol, + self, user_name: str, user_email: Optional[str], email_as_user_identifier: bool, ) -> str: if user_email: - return self.snowflake_identifier( + return self.identifiers.snowflake_identifier( user_email if email_as_user_identifier is True else user_email.split("@")[0] ) - return self.snowflake_identifier(user_name) + return self.identifiers.snowflake_identifier(user_name) # TODO: Revisit this after stateful ingestion can commit checkpoint # for failures that do not affect the checkpoint - def warn_if_stateful_else_error( - self: SnowflakeCommonProtocol, key: str, reason: str - ) -> None: + # TODO: Add additional parameters to match the signature of the .warning and .failure methods + def warn_if_stateful_else_error(self, key: str, reason: str) -> None: if ( self.config.stateful_ingestion is not None and self.config.stateful_ingestion.enabled ): - self.report_warning(key, reason) + self.structured_reporter.warning(key, reason) else: - self.report_error(key, reason) + self.structured_reporter.failure(key, reason) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index d8eda98da422b..1881e1da5be68 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -25,6 +25,7 @@ TestableSource, TestConnectionReport, ) +from datahub.ingestion.api.source_helpers import auto_workunit from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.snowflake.constants import ( GENERIC_PERMISSION_ERROR_KEY, @@ -42,6 +43,10 @@ SnowflakeLineageExtractor, ) from datahub.ingestion.source.snowflake.snowflake_profiler import SnowflakeProfiler +from datahub.ingestion.source.snowflake.snowflake_queries import ( + SnowflakeQueriesExtractor, + SnowflakeQueriesExtractorConfig, +) from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report from datahub.ingestion.source.snowflake.snowflake_schema import ( SnowflakeDataDictionary, @@ -56,6 +61,8 @@ ) from datahub.ingestion.source.snowflake.snowflake_utils import ( SnowflakeCommonMixin, + SnowflakeFilter, + SnowflakeIdentifierBuilder, SnowsightUrlBuilder, ) from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler @@ -72,6 +79,7 @@ from datahub.ingestion.source_report.ingestion_stage import ( LINEAGE_EXTRACTION, METADATA_EXTRACTION, + QUERIES_EXTRACTION, ) from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator from datahub.utilities.registries.domain_registry import DomainRegistry @@ -127,9 +135,13 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): super().__init__(config, ctx) self.config: SnowflakeV2Config = config self.report: SnowflakeV2Report = SnowflakeV2Report() - self.logger = logger - self.connection = self.config.get_connection() + self.filters = SnowflakeFilter( + filter_config=self.config, structured_reporter=self.report + ) + self.identifiers = SnowflakeIdentifierBuilder( + identifier_config=self.config, structured_reporter=self.report + ) self.domain_registry: Optional[DomainRegistry] = None if self.config.domain: @@ -137,28 +149,29 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): cached_domains=[k for k in self.config.domain], graph=self.ctx.graph ) + self.connection = self.config.get_connection() + # For database, schema, tables, views, etc 
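# ---- editor's sketch (not part of this patch) ----
# The SnowflakeV2Source.__init__ modified in this hunk now builds one SnowflakeFilter and one
# SnowflakeIdentifierBuilder and passes them to every sub-extractor, replacing the mixin
# inheritance and the bare dataset_urn_builder callable used previously. The classes below are
# simplified local stand-ins that only show the shape of that composition; they are not the real
# datahub implementations.
from dataclasses import dataclass


@dataclass
class _IdentifierConfig:  # stand-in for SnowflakeIdentifierConfig
    env: str = "PROD"
    convert_urns_to_lowercase: bool = True


class _IdentifierBuilder:  # stand-in for SnowflakeIdentifierBuilder
    platform = "snowflake"

    def __init__(self, identifier_config: _IdentifierConfig) -> None:
        self.identifier_config = identifier_config

    def snowflake_identifier(self, identifier: str) -> str:
        if self.identifier_config.convert_urns_to_lowercase:
            return identifier.lower()
        return identifier

    def get_dataset_identifier(self, table: str, schema: str, db: str) -> str:
        return self.snowflake_identifier(f"{db}.{schema}.{table}")

    def gen_dataset_urn(self, dataset_identifier: str) -> str:
        return (
            f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},"
            f"{dataset_identifier},{self.identifier_config.env})"
        )


class _LineageExtractor:  # stand-in: extractors receive the shared builder, not a callable
    def __init__(self, identifiers: _IdentifierBuilder) -> None:
        self.identifiers = identifiers

    def downstream_urn(self, table: str, schema: str, db: str) -> str:
        return self.identifiers.gen_dataset_urn(
            self.identifiers.get_dataset_identifier(table, schema, db)
        )


identifiers = _IdentifierBuilder(_IdentifierConfig())
print(_LineageExtractor(identifiers).downstream_urn("ORDERS", "MY_SCHEMA", "MY_DB"))
# ---- end of sketch ----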
self.data_dictionary = SnowflakeDataDictionary(connection=self.connection) self.lineage_extractor: Optional[SnowflakeLineageExtractor] = None self.aggregator: Optional[SqlParsingAggregator] = None - if self.config.include_table_lineage: + if self.config.use_queries_v2 or self.config.include_table_lineage: self.aggregator = SqlParsingAggregator( - platform=self.platform, + platform=self.identifiers.platform, platform_instance=self.config.platform_instance, env=self.config.env, - graph=( + graph=self.ctx.graph, + eager_graph_load=( # If we're ingestion schema metadata for tables/views, then we will populate # schemas into the resolver as we go. We only need to do a bulk fetch # if we're not ingesting schema metadata as part of ingestion. - self.ctx.graph - if not ( + ( self.config.include_technical_schema and self.config.include_tables and self.config.include_views ) and not self.config.lazy_schema_resolver - else None ), generate_usage_statistics=False, generate_operations=False, @@ -166,6 +179,8 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): ) self.report.sql_aggregator = self.aggregator.report + if self.config.include_table_lineage: + assert self.aggregator is not None redundant_lineage_run_skip_handler: Optional[ RedundantLineageRunSkipHandler ] = None @@ -180,7 +195,8 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): config, self.report, connection=self.connection, - dataset_urn_builder=self.gen_dataset_urn, + filters=self.filters, + identifiers=self.identifiers, redundant_run_skip_handler=redundant_lineage_run_skip_handler, sql_aggregator=self.aggregator, ) @@ -201,7 +217,8 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): config, self.report, connection=self.connection, - dataset_urn_builder=self.gen_dataset_urn, + filter=self.filters, + identifiers=self.identifiers, redundant_run_skip_handler=redundant_usage_run_skip_handler, ) @@ -277,10 +294,12 @@ class SnowflakePrivilege: capabilities: List[SourceCapability] = [c.capability for c in SnowflakeV2Source.get_capabilities() if c.capability not in (SourceCapability.PLATFORM_INSTANCE, SourceCapability.DOMAINS, SourceCapability.DELETION_DETECTION)] # type: ignore cur = conn.query("select current_role()") - current_role = [row[0] for row in cur][0] + current_role = [row["CURRENT_ROLE()"] for row in cur][0] cur = conn.query("select current_secondary_roles()") - secondary_roles_str = json.loads([row[0] for row in cur][0])["roles"] + secondary_roles_str = json.loads( + [row["CURRENT_SECONDARY_ROLES()"] for row in cur][0] + )["roles"] secondary_roles = ( [] if secondary_roles_str == "" else secondary_roles_str.split(",") ) @@ -299,7 +318,9 @@ class SnowflakePrivilege: cur = conn.query(f'show grants to role "{role}"') for row in cur: privilege = SnowflakePrivilege( - privilege=row[1], object_type=row[2], object_name=row[3] + privilege=row["privilege"], + object_type=row["granted_on"], + object_name=row["name"], ) privileges.append(privilege) @@ -362,7 +383,7 @@ class SnowflakePrivilege: roles.append(privilege.object_name) cur = conn.query("select current_warehouse()") - current_warehouse = [row[0] for row in cur][0] + current_warehouse = [row["CURRENT_WAREHOUSE()"] for row in cur][0] default_failure_messages = { SourceCapability.SCHEMA_METADATA: "Either no tables exist or current role does not have permissions to access them", @@ -445,7 +466,8 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: profiler=self.profiler, aggregator=self.aggregator, 
snowsight_url_builder=snowsight_url_builder, - dataset_urn_builder=self.gen_dataset_urn, + filters=self.filters, + identifiers=self.identifiers, ) self.report.set_ingestion_stage("*", METADATA_EXTRACTION) @@ -453,30 +475,28 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: databases = schema_extractor.databases - self.connection.close() - # TODO: The checkpoint state for stale entity detection can be committed here. if self.config.shares: yield from SnowflakeSharesHandler( - self.config, self.report, self.gen_dataset_urn + self.config, self.report ).get_shares_workunits(databases) discovered_tables: List[str] = [ - self.get_dataset_identifier(table_name, schema.name, db.name) + self.identifiers.get_dataset_identifier(table_name, schema.name, db.name) for db in databases for schema in db.schemas for table_name in schema.tables ] discovered_views: List[str] = [ - self.get_dataset_identifier(table_name, schema.name, db.name) + self.identifiers.get_dataset_identifier(table_name, schema.name, db.name) for db in databases for schema in db.schemas for table_name in schema.views ] if len(discovered_tables) == 0 and len(discovered_views) == 0: - self.report_error( + self.structured_reporter.failure( GENERIC_PERMISSION_ERROR_KEY, "No tables/views found. Please check permissions.", ) @@ -484,33 +504,66 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: discovered_datasets = discovered_tables + discovered_views - if self.config.include_table_lineage and self.lineage_extractor: - self.report.set_ingestion_stage("*", LINEAGE_EXTRACTION) - yield from self.lineage_extractor.get_workunits( - discovered_tables=discovered_tables, - discovered_views=discovered_views, + if self.config.use_queries_v2: + self.report.set_ingestion_stage("*", "View Parsing") + assert self.aggregator is not None + yield from auto_workunit(self.aggregator.gen_metadata()) + + self.report.set_ingestion_stage("*", QUERIES_EXTRACTION) + + schema_resolver = self.aggregator._schema_resolver + + queries_extractor = SnowflakeQueriesExtractor( + connection=self.connection, + config=SnowflakeQueriesExtractorConfig( + window=self.config, + temporary_tables_pattern=self.config.temporary_tables_pattern, + include_lineage=self.config.include_table_lineage, + include_usage_statistics=self.config.include_usage_stats, + include_operations=self.config.include_operational_stats, + ), + structured_report=self.report, + filters=self.filters, + identifiers=self.identifiers, + schema_resolver=schema_resolver, ) - if ( - self.config.include_usage_stats or self.config.include_operational_stats - ) and self.usage_extractor: - yield from self.usage_extractor.get_usage_workunits(discovered_datasets) + # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs + # but a shared schema resolver. That's fine for now though - once we remove the old lineage/usage extractors, + # it should be pretty straightforward to refactor this and only initialize the aggregator once. 
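The TODO above boils down to the new schema_resolver parameter on SqlParsingAggregator (added further down in this diff, in sql_parsing_aggregator.py): a second aggregator can reuse the schemas the first one has already resolved instead of re-fetching them. A rough, trimmed-down sketch of that reuse outside the SnowflakeQueriesExtractor wiring used here; the argument values are placeholders.

from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator

# The first aggregator owns the schema resolver (lazy in this sketch, since
# no graph client is supplied).
primary = SqlParsingAggregator(
    platform="snowflake",
    env="PROD",
    generate_usage_statistics=False,
    generate_operations=False,
)

# The second aggregator reuses the already-populated resolver via the new
# schema_resolver parameter -- the same pattern as
# `schema_resolver = self.aggregator._schema_resolver` above.
secondary = SqlParsingAggregator(
    platform="snowflake",
    env="PROD",
    schema_resolver=primary._schema_resolver,
    generate_usage_statistics=False,
    generate_operations=False,
)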
+ self.report.queries_extractor = queries_extractor.report + yield from queries_extractor.get_workunits_internal() + + else: + if self.config.include_table_lineage and self.lineage_extractor: + self.report.set_ingestion_stage("*", LINEAGE_EXTRACTION) + yield from self.lineage_extractor.get_workunits( + discovered_tables=discovered_tables, + discovered_views=discovered_views, + ) + + if ( + self.config.include_usage_stats or self.config.include_operational_stats + ) and self.usage_extractor: + yield from self.usage_extractor.get_usage_workunits(discovered_datasets) if self.config.include_assertion_results: yield from SnowflakeAssertionsHandler( - self.config, self.report, self.connection + self.config, self.report, self.connection, self.identifiers ).get_assertion_workunits(discovered_datasets) + self.connection.close() + def report_warehouse_failure(self) -> None: if self.config.warehouse is not None: - self.report_error( + self.structured_reporter.failure( GENERIC_PERMISSION_ERROR_KEY, f"Current role does not have permissions to use warehouse {self.config.warehouse}. Please update permissions.", ) else: - self.report_error( - "no-active-warehouse", - "No default warehouse set for user. Either set default warehouse for user or configure warehouse in recipe.", + self.structured_reporter.failure( + "Could not use a Snowflake warehouse", + "No default warehouse set for user. Either set a default warehouse for the user or configure a warehouse in the recipe.", ) def get_report(self) -> SourceReport: @@ -541,19 +594,28 @@ def inspect_session_metadata(self, connection: SnowflakeConnection) -> None: for db_row in connection.query(SnowflakeQuery.current_version()): self.report.saas_version = db_row["CURRENT_VERSION()"] except Exception as e: - self.report_error("version", f"Error: {e}") + self.structured_reporter.failure( + "Could not determine the current Snowflake version", + exc=e, + ) try: logger.info("Checking current role") for db_row in connection.query(SnowflakeQuery.current_role()): self.report.role = db_row["CURRENT_ROLE()"] except Exception as e: - self.report_error("version", f"Error: {e}") + self.structured_reporter.failure( + "Could not determine the current Snowflake role", + exc=e, + ) try: logger.info("Checking current warehouse") for db_row in connection.query(SnowflakeQuery.current_warehouse()): self.report.default_warehouse = db_row["CURRENT_WAREHOUSE()"] except Exception as e: - self.report_error("current_warehouse", f"Error: {e}") + self.structured_reporter.failure( + "Could not determine the current Snowflake warehouse", + exc=e, + ) try: logger.info("Checking current edition") diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py index ae17cff60fedd..9ddc671e21133 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py @@ -251,11 +251,6 @@ class AthenaConfig(SQLCommonConfig): "queries executed by DataHub." ) - # overwrite default behavior of SQLAlchemyConfing - include_views: Optional[bool] = pydantic.Field( - default=True, description="Whether views should be ingested." 
- ) - _s3_staging_dir_population = pydantic_renamed_field( old_name="s3_staging_dir", new_name="query_result_location", diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py index 93c7025aeee4e..3ead59eed2d39 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py @@ -83,10 +83,10 @@ class SQLCommonConfig( description='Attach domains to databases, schemas or tables during ingestion using regex patterns. Domain key can be a guid like *urn:li:domain:ec428203-ce86-4db3-985d-5a8ee6df32ba* or a string like "Marketing".) If you provide strings, then datahub will attempt to resolve this name to a guid, and will error out if this fails. There can be multiple domain keys specified.', ) - include_views: Optional[bool] = Field( + include_views: bool = Field( default=True, description="Whether views should be ingested." ) - include_tables: Optional[bool] = Field( + include_tables: bool = Field( default=True, description="Whether tables should be ingested." ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau.py index 50fd8ed3dff59..1655724f2d402 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau.py @@ -2279,7 +2279,7 @@ def emit_upstream_tables(self) -> Iterable[MetadataWorkUnit]: yield from self.emit_table(database_table, tableau_columns) - # Emmitting tables that were purely parsed from SQL queries + # Emitting tables that were purely parsed from SQL queries for database_table in self.database_tables.values(): # Only tables purely parsed from SQL queries don't have ID if database_table.id: @@ -2302,10 +2302,11 @@ def emit_table( tableau_columns: Optional[List[Dict[str, Any]]], ) -> Iterable[MetadataWorkUnit]: logger.debug( - f"Emiting external table {database_table} tableau_columns {tableau_columns}" + f"Emitting external table {database_table} tableau_columns {tableau_columns}" ) + dataset_urn = DatasetUrn.from_string(database_table.urn) dataset_snapshot = DatasetSnapshot( - urn=database_table.urn, + urn=str(dataset_urn), aspects=[], ) if database_table.paths: @@ -2326,6 +2327,13 @@ def emit_table( if schema_metadata is not None: dataset_snapshot.aspects.append(schema_metadata) + if not dataset_snapshot.aspects: + # This should only happen with ingest_tables_external enabled. 
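The surrounding Tableau hunk adds a key-aspect fallback so an externally parsed table still materializes when nothing else could be resolved for it. A compressed sketch of that fallback, using the same two helpers the diff itself calls (DatasetUrn.from_string and to_key_aspect); the import path, function name, and urn value are assumptions for illustration only.

# Import path assumed; tableau.py already has a DatasetUrn import in scope.
from datahub.metadata.urns import DatasetUrn

def ensure_materializable(dataset_snapshot_aspects: list, dataset_urn: DatasetUrn) -> None:
    # Appending at least the key aspect guarantees the dataset entity exists
    # even when no schema or properties were recovered from SQL parsing.
    if not dataset_snapshot_aspects:
        dataset_snapshot_aspects.append(dataset_urn.to_key_aspect())

dataset_urn = DatasetUrn.from_string(
    "urn:li:dataset:(urn:li:dataPlatform:bigquery,demo-custom-323403.bigquery_demo.sellers,PROD)"
)
aspects: list = []  # stands in for DatasetSnapshot.aspects
ensure_materializable(aspects, dataset_urn)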
+ logger.warning( + f"Urn {database_table.urn} has no real aspects, adding a key aspect to ensure materialization" + ) + dataset_snapshot.aspects.append(dataset_urn.to_key_aspect()) + yield self.get_metadata_change_event(dataset_snapshot) def get_schema_metadata_for_table( diff --git a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py index 14dc428b65389..4308b405e46e3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py +++ b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py @@ -14,6 +14,7 @@ USAGE_EXTRACTION_INGESTION = "Usage Extraction Ingestion" USAGE_EXTRACTION_OPERATIONAL_STATS = "Usage Extraction Operational Stats" USAGE_EXTRACTION_USAGE_AGGREGATION = "Usage Extraction Usage Aggregation" +QUERIES_EXTRACTION = "Queries Extraction" PROFILING = "Profiling" diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py index 894f01820718b..fbf6f954f82bb 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py @@ -251,7 +251,9 @@ def __init__( platform: str, platform_instance: Optional[str] = None, env: str = builder.DEFAULT_ENV, + schema_resolver: Optional[SchemaResolver] = None, graph: Optional[DataHubGraph] = None, + eager_graph_load: bool = True, generate_lineage: bool = True, generate_queries: bool = True, generate_query_subject_fields: bool = True, @@ -274,8 +276,12 @@ def __init__( self.generate_usage_statistics = generate_usage_statistics self.generate_query_usage_statistics = generate_query_usage_statistics self.generate_operations = generate_operations - if self.generate_queries and not self.generate_lineage: - raise ValueError("Queries will only be generated if lineage is enabled") + if self.generate_queries and not ( + self.generate_lineage or self.generate_query_usage_statistics + ): + logger.warning( + "Queries will not be generated, as neither lineage nor query usage statistics are enabled" + ) self.usage_config = usage_config if ( @@ -297,17 +303,29 @@ def __init__( # Set up the schema resolver. self._schema_resolver: SchemaResolver - if graph is None: + if schema_resolver is not None: + # If explicitly provided, use it. + assert self.platform.platform_name == schema_resolver.platform + assert self.platform_instance == schema_resolver.platform_instance + assert self.env == schema_resolver.env + self._schema_resolver = schema_resolver + elif graph is not None and eager_graph_load and self._need_schemas: + # Bulk load schemas using the graph client. + self._schema_resolver = graph.initialize_schema_resolver_from_datahub( + platform=self.platform.urn(), + platform_instance=self.platform_instance, + env=self.env, + ) + else: + # Otherwise, use a lazy-loading schema resolver. self._schema_resolver = self._exit_stack.enter_context( SchemaResolver( platform=self.platform.platform_name, platform_instance=self.platform_instance, env=self.env, + graph=graph, ) ) - else: - self._schema_resolver = None # type: ignore - self._initialize_schema_resolver_from_graph(graph) # Initialize internal data structures. # This leans pretty heavily on the our query fingerprinting capabilities. @@ -373,6 +391,8 @@ def __init__( # Usage aggregator. This will only be initialized if usage statistics are enabled. # TODO: Replace with FileBackedDict. 
+ # TODO: The BaseUsageConfig class is much too broad for our purposes, and has a number of + # configs that won't be respected here. Using it is misleading. self._usage_aggregator: Optional[UsageAggregator[UrnStr]] = None if self.generate_usage_statistics: assert self.usage_config is not None @@ -392,7 +412,13 @@ def close(self) -> None: @property def _need_schemas(self) -> bool: - return self.generate_lineage or self.generate_usage_statistics + # Unless the aggregator is totally disabled, we will need schema information. + return ( + self.generate_lineage + or self.generate_usage_statistics + or self.generate_queries + or self.generate_operations + ) def register_schema( self, urn: Union[str, DatasetUrn], schema: models.SchemaMetadataClass @@ -414,35 +440,6 @@ def register_schemas_from_stream( yield wu - def _initialize_schema_resolver_from_graph(self, graph: DataHubGraph) -> None: - # requires a graph instance - # if no schemas are currently registered in the schema resolver - # and we need the schema resolver (e.g. lineage or usage is enabled) - # then use the graph instance to fetch all schemas for the - # platform/instance/env combo - if not self._need_schemas: - return - - if ( - self._schema_resolver is not None - and self._schema_resolver.schema_count() > 0 - ): - # TODO: Have a mechanism to override this, e.g. when table ingestion is enabled but view ingestion is not. - logger.info( - "Not fetching any schemas from the graph, since " - f"there are {self._schema_resolver.schema_count()} schemas already registered." - ) - return - - # TODO: The initialize_schema_resolver_from_datahub method should take in a SchemaResolver - # that it can populate or add to, rather than creating a new one and dropping any schemas - # that were already loaded into the existing one. 
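The _initialize_schema_resolver_from_graph method being removed here used to bulk-fetch schemas after construction; that decision now lives in __init__ (earlier in this file's diff). A compressed sketch of the new precedence, with the aggregator's fields replaced by plain parameters for illustration.

def pick_schema_resolver_strategy(
    schema_resolver_provided: bool,
    graph_available: bool,
    eager_graph_load: bool,
    need_schemas: bool,
) -> str:
    # Mirrors the new __init__ branches: explicit resolver > eager bulk fetch
    # from the graph > lazy, on-demand resolver.
    if schema_resolver_provided:
        return "use the supplied SchemaResolver as-is"
    if graph_available and eager_graph_load and need_schemas:
        return "bulk-load via graph.initialize_schema_resolver_from_datahub(...)"
    return "lazy SchemaResolver that fetches schemas on demand (graph optional)"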
- self._schema_resolver = graph.initialize_schema_resolver_from_datahub( - platform=self.platform.urn(), - platform_instance=self.platform_instance, - env=self.env, - ) - def _maybe_format_query(self, query: str) -> str: if self.format_queries: with self.report.sql_formatting_timer: diff --git a/metadata-ingestion/tests/integration/tableau/tableau_cll_mces_golden.json b/metadata-ingestion/tests/integration/tableau/tableau_cll_mces_golden.json index 5de4fe5f647d9..855f872838052 100644 --- a/metadata-ingestion/tests/integration/tableau/tableau_cll_mces_golden.json +++ b/metadata-ingestion/tests/integration/tableau/tableau_cll_mces_golden.json @@ -42958,6 +42958,50 @@ "pipelineName": "test_tableau_cll_ingest" } }, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,demo-custom-323403.bigquery_demo.order_items,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.metadata.key.DatasetKey": { + "platform": "urn:li:dataPlatform:bigquery", + "name": "demo-custom-323403.bigquery_demo.order_items", + "origin": "PROD" + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "test_tableau_cll_ingest" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,demo-custom-323403.bigquery_demo.sellers,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.metadata.key.DatasetKey": { + "platform": "urn:li:dataPlatform:bigquery", + "name": "demo-custom-323403.bigquery_demo.sellers", + "origin": "PROD" + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "test_tableau_cll_ingest" + } +}, { "entityType": "chart", "entityUrn": "urn:li:chart:(tableau,130496dc-29ca-8a89-e32b-d73c4d8b65ff)", diff --git a/metadata-ingestion/tests/unit/glue/glue_mces_golden_table_column_lineage.json b/metadata-ingestion/tests/unit/glue/glue_mces_golden_table_column_lineage.json new file mode 100644 index 0000000000000..fd4109b0f93c9 --- /dev/null +++ b/metadata-ingestion/tests/unit/glue/glue_mces_golden_table_column_lineage.json @@ -0,0 +1,373 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:89f32a7a37e2f61693aa4b720ace2a3c", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "glue", + "env": "PROD", + "database": "flights-database-lineage", + "param1": "value1", + "param2": "value2", + "LocationUri": "s3://test-bucket/test-prefix", + "CreateTime": "June 09, 2021 at 14:14:19" + }, + "name": "flights-database-lineage", + "qualifiedName": "arn:aws:glue:us-west-2:123412341234:database/flights-database-lineage" + } + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:89f32a7a37e2f61693aa4b720ace2a3c", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:89f32a7a37e2f61693aa4b720ace2a3c", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:glue" + } + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:89f32a7a37e2f61693aa4b720ace2a3c", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + 
"json": { + "typeNames": [ + "Database" + ] + } + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database-lineage.avro,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "CrawlerSchemaDeserializerVersion": "1.0", + "CrawlerSchemaSerializerVersion": "1.0", + "UPDATED_BY_CRAWLER": "flights-crawler", + "averageRecordSize": "55", + "avro.schema.literal": "{\"type\":\"record\",\"name\":\"flights_avro_subset\",\"namespace\":\"default\",\"fields\":[{\"name\":\"yr\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"flightdate\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"uniquecarrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"airlineid\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"flightnum\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"origin\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"dest\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"depdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrierdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"weatherdelay\",\"type\":[\"null\",\"int\"],\"default\":null}]}", + "classification": "avro", + "compressionType": "none", + "objectCount": "30", + "recordCount": "169222196", + "sizeKey": "9503351413", + "typeOfData": "file", + "Location": "s3://crawler-public-us-west-2/flight/avro/", + "InputFormat": "org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat", + "OutputFormat": "org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat", + "Compressed": "False", + "NumberOfBuckets": "-1", + "SerdeInfo": "{'SerializationLibrary': 'org.apache.hadoop.hive.serde2.avro.AvroSerDe', 'Parameters': {'avro.schema.literal': '{\"type\":\"record\",\"name\":\"flights_avro_subset\",\"namespace\":\"default\",\"fields\":[{\"name\":\"yr\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"flightdate\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"uniquecarrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"airlineid\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"flightnum\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"origin\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"dest\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"depdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrierdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"weatherdelay\",\"type\":[\"null\",\"int\"],\"default\":null}]}', 'serialization.format': '1'}}", + "BucketColumns": "[]", + "SortColumns": "[]", + "StoredAsSubDirectories": "False" + }, + "name": "avro", + "qualifiedName": "arn:aws:glue:us-west-2:123412341234:table/flights-database-lineage/avro", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "flights-database-lineage.avro", + "platform": "urn:li:dataPlatform:glue", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + 
"com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "[version=2.0].[type=int].yr", + "nullable": true, + "description": "test comment", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].flightdate", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].uniquecarrier", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=int].airlineid", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].carrier", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].flightnum", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].origin", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].year", + "nullable": true, + "description": "partition test comment", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + } + ] + } + }, + { + "com.linkedin.pegasus2avro.common.DataPlatformInstance": { + "platform": "urn:li:dataPlatform:glue" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:owner", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + } + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database-lineage.avro,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database-lineage.avro,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": 
"urn:li:container:89f32a7a37e2f61693aa4b720ace2a3c" + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database-lineage.avro,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:s3,crawler-public-us-west-2/flight/avro,PROD)", + "type": "COPY" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:s3,crawler-public-us-west-2/flight/avro,PROD),yr)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:glue,flights-database-lineage.avro,PROD),yr)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:s3,crawler-public-us-west-2/flight/avro,PROD),flightdate)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:glue,flights-database-lineage.avro,PROD),flightdate)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:s3,crawler-public-us-west-2/flight/avro,PROD),uniquecarrier)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:glue,flights-database-lineage.avro,PROD),uniquecarrier)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:s3,crawler-public-us-west-2/flight/avro,PROD),airlineid)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:glue,flights-database-lineage.avro,PROD),airlineid)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:s3,crawler-public-us-west-2/flight/avro,PROD),carrier)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:glue,flights-database-lineage.avro,PROD),carrier)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:s3,crawler-public-us-west-2/flight/avro,PROD),flightnum)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:glue,flights-database-lineage.avro,PROD),flightnum)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:s3,crawler-public-us-west-2/flight/avro,PROD),origin)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:glue,flights-database-lineage.avro,PROD),origin)" + ], + "confidenceScore": 1.0 + } + ] + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/glue/glue_mces_golden_table_lineage.json b/metadata-ingestion/tests/unit/glue/glue_mces_golden_table_lineage.json new file mode 100644 index 0000000000000..873776c5777bc --- /dev/null +++ b/metadata-ingestion/tests/unit/glue/glue_mces_golden_table_lineage.json @@ -0,0 +1,1402 @@ +[ +{ + "entityType": "container", + "entityUrn": 
"urn:li:container:0b9f1f731ecf6743be6207fec3dc9cba", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "glue", + "env": "PROD", + "database": "flights-database", + "param1": "value1", + "param2": "value2", + "LocationUri": "s3://test-bucket/test-prefix", + "CreateTime": "June 09, 2021 at 14:14:19" + }, + "name": "flights-database", + "qualifiedName": "arn:aws:glue:us-west-2:123412341234:database/flights-database" + } + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:0b9f1f731ecf6743be6207fec3dc9cba", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:0b9f1f731ecf6743be6207fec3dc9cba", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:glue" + } + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:0b9f1f731ecf6743be6207fec3dc9cba", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Database" + ] + } + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "glue", + "env": "PROD", + "database": "test-database", + "CreateTime": "June 01, 2021 at 14:55:02" + }, + "name": "test-database", + "qualifiedName": "arn:aws:glue:us-west-2:123412341234:database/test-database" + } + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:glue" + } + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Database" + ] + } + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:110bc08849d1c1bde5fc345dab5c3ae7", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "glue", + "env": "PROD", + "database": "empty-database", + "CreateTime": "June 01, 2021 at 14:55:13" + }, + "name": "empty-database", + "qualifiedName": "arn:aws:glue:us-west-2:123412341234:database/empty-database" + } + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:110bc08849d1c1bde5fc345dab5c3ae7", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:110bc08849d1c1bde5fc345dab5c3ae7", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:glue" + } + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:110bc08849d1c1bde5fc345dab5c3ae7", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Database" + ] + } + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + 
"urn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "CrawlerSchemaDeserializerVersion": "1.0", + "CrawlerSchemaSerializerVersion": "1.0", + "UPDATED_BY_CRAWLER": "flights-crawler", + "averageRecordSize": "55", + "avro.schema.literal": "{\"type\":\"record\",\"name\":\"flights_avro_subset\",\"namespace\":\"default\",\"fields\":[{\"name\":\"yr\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"flightdate\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"uniquecarrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"airlineid\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"flightnum\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"origin\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"dest\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"depdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrierdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"weatherdelay\",\"type\":[\"null\",\"int\"],\"default\":null}]}", + "classification": "avro", + "compressionType": "none", + "objectCount": "30", + "recordCount": "169222196", + "sizeKey": "9503351413", + "typeOfData": "file", + "Location": "s3://crawler-public-us-west-2/flight/avro/", + "InputFormat": "org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat", + "OutputFormat": "org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat", + "Compressed": "False", + "NumberOfBuckets": "-1", + "SerdeInfo": "{'SerializationLibrary': 'org.apache.hadoop.hive.serde2.avro.AvroSerDe', 'Parameters': {'avro.schema.literal': '{\"type\":\"record\",\"name\":\"flights_avro_subset\",\"namespace\":\"default\",\"fields\":[{\"name\":\"yr\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"flightdate\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"uniquecarrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"airlineid\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"flightnum\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"origin\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"dest\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"depdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrierdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"weatherdelay\",\"type\":[\"null\",\"int\"],\"default\":null}]}', 'serialization.format': '1'}}", + "BucketColumns": "[]", + "SortColumns": "[]", + "StoredAsSubDirectories": "False" + }, + "name": "avro", + "qualifiedName": "arn:aws:glue:us-west-2:123412341234:table/flights-database/avro", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "flights-database.avro", + "platform": "urn:li:dataPlatform:glue", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "[version=2.0].[type=int].yr", + "nullable": true, + "description": "test comment", + "type": { 
+ "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].flightdate", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].uniquecarrier", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=int].airlineid", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].carrier", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].flightnum", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].origin", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].year", + "nullable": true, + "description": "partition test comment", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + } + ] + } + }, + { + "com.linkedin.pegasus2avro.common.DataPlatformInstance": { + "platform": "urn:li:dataPlatform:glue" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:owner", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + { + "com.linkedin.pegasus2avro.common.GlobalTags": { + "tags": [ + { + "tag": "urn:li:tag:baz:bob" + }, + { + "tag": "urn:li:tag:foo:bar" + } + ] + } + } + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0b9f1f731ecf6743be6207fec3dc9cba" + } + } +}, +{ + 
"entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:s3,crawler-public-us-west-2/flight/avro,PROD)", + "type": "COPY" + } + ] + } + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_jsons_markers,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "CrawlerSchemaDeserializerVersion": "1.0", + "CrawlerSchemaSerializerVersion": "1.0", + "UPDATED_BY_CRAWLER": "test-jsons", + "averageRecordSize": "273", + "classification": "json", + "compressionType": "none", + "objectCount": "1", + "recordCount": "1", + "sizeKey": "273", + "typeOfData": "file", + "Location": "s3://test-glue-jsons/markers/", + "InputFormat": "org.apache.hadoop.mapred.TextInputFormat", + "OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", + "Compressed": "False", + "NumberOfBuckets": "-1", + "SerdeInfo": "{'SerializationLibrary': 'org.openx.data.jsonserde.JsonSerDe', 'Parameters': {'paths': 'markers'}}", + "BucketColumns": "[]", + "SortColumns": "[]", + "StoredAsSubDirectories": "False" + }, + "name": "test_jsons_markers", + "qualifiedName": "arn:aws:glue:us-west-2:795586375822:table/test-database/test_jsons_markers", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "test-database.test_jsons_markers", + "platform": "urn:li:dataPlatform:glue", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "record" + ] + } + } + }, + "nativeDataType": "array,location:array>>", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array,location:array>>\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers.[type=string].name", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers.[type=array].[type=double].position", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "double" + ] + } + } + }, + "nativeDataType": "array", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers.[type=array].[type=double].location", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "double" + ] + } + } + }, + 
"nativeDataType": "array", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array\"}" + } + ] + } + }, + { + "com.linkedin.pegasus2avro.common.DataPlatformInstance": { + "platform": "urn:li:dataPlatform:glue" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:owner", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + { + "com.linkedin.pegasus2avro.common.GlobalTags": { + "tags": [ + { + "tag": "urn:li:tag:baz:bob" + }, + { + "tag": "urn:li:tag:foo:bar" + } + ] + } + } + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_jsons_markers,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_jsons_markers,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7" + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_jsons_markers,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:s3,test-glue-jsons/markers,PROD)", + "type": "COPY" + } + ] + } + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "CrawlerSchemaDeserializerVersion": "1.0", + "CrawlerSchemaSerializerVersion": "1.0", + "UPDATED_BY_CRAWLER": "test", + "averageRecordSize": "19", + "classification": "parquet", + "compressionType": "none", + "objectCount": "60", + "recordCount": "167497743", + "sizeKey": "4463574900", + "typeOfData": "file", + "Location": "s3://crawler-public-us-west-2/flight/parquet/", + "InputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat", + "OutputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat", + "Compressed": "False", + "NumberOfBuckets": "-1", + "SerdeInfo": "{'SerializationLibrary': 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe', 'Parameters': {'serialization.format': '1'}}", + "BucketColumns": "[]", + "SortColumns": "[]", + "StoredAsSubDirectories": "False" + }, + "name": "test_parquet", + "qualifiedName": "arn:aws:glue:us-west-2:795586375822:table/test-database/test_parquet", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "test-database.test_parquet", + "platform": "urn:li:dataPlatform:glue", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "[version=2.0].[type=int].yr", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": 
{} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=int].quarter", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=int].month", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=int].dayofmonth", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].year", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + } + ] + } + }, + { + "com.linkedin.pegasus2avro.common.DataPlatformInstance": { + "platform": "urn:li:dataPlatform:glue" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:owner", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + { + "com.linkedin.pegasus2avro.common.GlobalTags": { + "tags": [ + { + "tag": "urn:li:tag:baz:bob" + }, + { + "tag": "urn:li:tag:foo:bar" + } + ] + } + } + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7" + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:s3,crawler-public-us-west-2/flight/parquet,PROD)", + "type": "COPY" + } + ] + } + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataFlowSnapshot": { + "urn": "urn:li:dataFlow:(glue,test-job-1,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataFlowInfo": { + "customProperties": { + "role": "arn:aws:iam::123412341234:role/service-role/AWSGlueServiceRole-glue-crawler", + "created": "2021-06-10 16:51:25.690000", + "modified": "2021-06-10 16:55:35.307000", + "command": "s3://aws-glue-assets-123412341234-us-west-2/scripts/job-1.py" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", + "name": "test-job-1", 
+ "description": "The first test job" + } + } + ] + } + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataFlowSnapshot": { + "urn": "urn:li:dataFlow:(glue,test-job-2,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataFlowInfo": { + "customProperties": { + "role": "arn:aws:iam::123412341234:role/service-role/AWSGlueServiceRole-glue-crawler", + "created": "2021-06-10 16:58:32.469000", + "modified": "2021-06-10 16:58:32.469000", + "command": "s3://aws-glue-assets-123412341234-us-west-2/scripts/job-2.py" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", + "name": "test-job-2", + "description": "The second test job" + } + } + ] + } + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),Filter-Transform0_job1)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "f": "lambda row : ()", + "transformation_ctx": "\"Transform0\"", + "transformType": "Filter", + "nodeId": "Transform0_job1" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", + "name": "test-job-1:Filter-Transform0_job1", + "type": { + "string": "GLUE" + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform2_job1)" + ] + } + } + ] + } + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform1_job1)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "mappings": "[(\"yr\", \"int\", \"yr\", \"int\"), (\"flightdate\", \"string\", \"flightdate\", \"string\"), (\"uniquecarrier\", \"string\", \"uniquecarrier\", \"string\"), (\"airlineid\", \"int\", \"airlineid\", \"int\"), (\"carrier\", \"string\", \"carrier\", \"string\"), (\"flightnum\", \"string\", \"flightnum\", \"string\"), (\"origin\", \"string\", \"origin\", \"string\"), (\"dest\", \"string\", \"dest\", \"string\"), (\"depdelay\", \"int\", \"depdelay\", \"int\"), (\"carrierdelay\", \"int\", \"carrierdelay\", \"int\"), (\"weatherdelay\", \"int\", \"weatherdelay\", \"int\"), (\"year\", \"string\", \"year\", \"string\")]", + "transformation_ctx": "\"Transform1\"", + "transformType": "ApplyMapping", + "nodeId": "Transform1_job1" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", + "name": "test-job-1:ApplyMapping-Transform1_job1", + "type": { + "string": "GLUE" + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),Filter-Transform0_job1)" + ] + } + } + ] + } + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform2_job1)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "mappings": "[(\"yr\", \"int\", \"yr\", \"int\"), (\"flightdate\", \"string\", \"flightdate\", \"string\"), 
(\"uniquecarrier\", \"string\", \"uniquecarrier\", \"string\"), (\"airlineid\", \"int\", \"airlineid\", \"int\"), (\"carrier\", \"string\", \"carrier\", \"string\"), (\"flightnum\", \"string\", \"flightnum\", \"string\"), (\"origin\", \"string\", \"origin\", \"string\"), (\"dest\", \"string\", \"dest\", \"string\"), (\"depdelay\", \"int\", \"depdelay\", \"int\"), (\"carrierdelay\", \"int\", \"carrierdelay\", \"int\"), (\"weatherdelay\", \"int\", \"weatherdelay\", \"int\"), (\"year\", \"string\", \"year\", \"string\")]", + "transformation_ctx": "\"Transform2\"", + "transformType": "ApplyMapping", + "nodeId": "Transform2_job1" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", + "name": "test-job-1:ApplyMapping-Transform2_job1", + "type": { + "string": "GLUE" + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)" + ], + "outputDatasets": [], + "inputDatajobs": [] + } + } + ] + } + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),Join-Transform3_job1)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "keys2": "[\"(right) flightdate\"]", + "transformation_ctx": "\"Transform3\"", + "keys1": "[\"yr\"]", + "transformType": "Join", + "nodeId": "Transform3_job1" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", + "name": "test-job-1:Join-Transform3_job1", + "type": { + "string": "GLUE" + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform4_job1)" + ] + } + } + ] + } + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform4_job1)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "mappings": "[(\"yr\", \"int\", \"yr\", \"int\"), (\"flightdate\", \"string\", \"flightdate\", \"string\"), (\"uniquecarrier\", \"string\", \"uniquecarrier\", \"string\"), (\"airlineid\", \"int\", \"airlineid\", \"int\"), (\"carrier\", \"string\", \"carrier\", \"string\"), (\"flightnum\", \"string\", \"flightnum\", \"string\"), (\"origin\", \"string\", \"origin\", \"string\"), (\"dest\", \"string\", \"dest\", \"string\"), (\"depdelay\", \"int\", \"depdelay\", \"int\"), (\"carrierdelay\", \"int\", \"carrierdelay\", \"int\"), (\"weatherdelay\", \"int\", \"weatherdelay\", \"int\"), (\"year\", \"string\", \"year\", \"string\")]", + "transformation_ctx": "\"Transform4\"", + "transformType": "ApplyMapping", + "nodeId": "Transform4_job1" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", + "name": "test-job-1:ApplyMapping-Transform4_job1", + "type": { + "string": "GLUE" + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + } + ] + } + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": 
"urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform5_job1)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "mappings": "[(\"yr\", \"int\", \"(right) yr\", \"int\"), (\"flightdate\", \"string\", \"(right) flightdate\", \"string\"), (\"uniquecarrier\", \"string\", \"(right) uniquecarrier\", \"string\"), (\"airlineid\", \"int\", \"(right) airlineid\", \"int\"), (\"carrier\", \"string\", \"(right) carrier\", \"string\"), (\"flightnum\", \"string\", \"(right) flightnum\", \"string\"), (\"origin\", \"string\", \"(right) origin\", \"string\"), (\"dest\", \"string\", \"(right) dest\", \"string\"), (\"depdelay\", \"int\", \"(right) depdelay\", \"int\"), (\"carrierdelay\", \"int\", \"(right) carrierdelay\", \"int\"), (\"weatherdelay\", \"int\", \"(right) weatherdelay\", \"int\"), (\"year\", \"string\", \"(right) year\", \"string\")]", + "transformation_ctx": "\"Transform5\"", + "transformType": "ApplyMapping", + "nodeId": "Transform5_job1" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", + "name": "test-job-1:ApplyMapping-Transform5_job1", + "type": { + "string": "GLUE" + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + } + ] + } + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-glue-jsons,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "connection_type": "s3", + "format": "json", + "connection_options": "{'path': 's3://test-glue-jsons/', 'partitionKeys': []}", + "transformation_ctx": "DataSink1" + }, + "tags": [] + } + } + ] + } + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),SplitFields-Transform0_job2)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "paths": "[\"yr\", \"quarter\", \"month\", \"dayofmonth\", \"dayofweek\", \"flightdate\", \"uniquecarrier\"]", + "name2": "\"Transform0Output1\"", + "name1": "\"Transform0Output0\"", + "transformation_ctx": "\"Transform0\"", + "transformType": "SplitFields", + "nodeId": "Transform0_job2" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", + "name": "test-job-2:SplitFields-Transform0_job2", + "type": { + "string": "GLUE" + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),ApplyMapping-Transform1_job2)" + ] + } + } + ] + } + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),ApplyMapping-Transform1_job2)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "mappings": "[(\"yr\", \"int\", \"yr\", \"int\"), (\"quarter\", \"int\", \"quarter\", \"int\"), (\"month\", \"int\", \"month\", \"int\"), (\"dayofmonth\", \"int\", \"dayofmonth\", \"int\"), (\"dayofweek\", \"int\", \"dayofweek\", \"int\"), 
(\"flightdate\", \"string\", \"flightdate\", \"string\"), (\"uniquecarrier\", \"string\", \"uniquecarrier\", \"string\"), (\"airlineid\", \"int\", \"airlineid\", \"int\"), (\"carrier\", \"string\", \"carrier\", \"string\")]", + "transformation_ctx": "\"Transform1\"", + "transformType": "ApplyMapping", + "nodeId": "Transform1_job2" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", + "name": "test-job-2:ApplyMapping-Transform1_job2", + "type": { + "string": "GLUE" + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)" + ], + "outputDatasets": [], + "inputDatajobs": [] + } + } + ] + } + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),FillMissingValues-Transform2_job2)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "missing_values_column": "\"dayofmonth\"", + "transformation_ctx": "\"Transform2\"", + "transformType": "FillMissingValues", + "nodeId": "Transform2_job2" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", + "name": "test-job-2:FillMissingValues-Transform2_job2", + "type": { + "string": "GLUE" + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),ApplyMapping-Transform1_job2)" + ] + } + } + ] + } + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),SelectFields-Transform3_job2)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "paths": "[]", + "transformation_ctx": "\"Transform3\"", + "transformType": "SelectFields", + "nodeId": "Transform3_job2" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", + "name": "test-job-2:SelectFields-Transform3_job2", + "type": { + "string": "GLUE" + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:s3,test-glue-jsons,PROD)" + ], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),FillMissingValues-Transform2_job2)" + ] + } + } + ] + } + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-glue-jsons,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "connection_type": "s3", + "format": "json", + "connection_options": "{'path': 's3://test-glue-jsons/', 'partitionKeys': []}", + "transformation_ctx": "DataSink0" + }, + "tags": [] + } + } + ] + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(glue,test-job-1,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(glue,test-job-2,PROD)", + "changeType": "UPSERT", + 
"aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform1_job1)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform2_job1)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform4_job1)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform5_job1)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),Filter-Transform0_job1)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),Join-Transform3_job1)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),ApplyMapping-Transform1_job2)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),FillMissingValues-Transform2_job2)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),SelectFields-Transform3_job2)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),SplitFields-Transform0_job2)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:baz:bob", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "baz:bob" + } + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:foo:bar", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "foo:bar" + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/test_glue_source.py b/metadata-ingestion/tests/unit/test_glue_source.py index c8b7e021cf5a0..b43db47ae0071 100644 --- a/metadata-ingestion/tests/unit/test_glue_source.py +++ b/metadata-ingestion/tests/unit/test_glue_source.py @@ -1,6 +1,6 @@ import json from pathlib import Path -from typing import Any, Dict, Optional, Tuple, Type, cast +from typing import Any, Callable, Dict, Optional, Tuple, Type, cast from unittest.mock import patch import pydantic @@ -8,8 +8,10 @@ from botocore.stub import Stubber from freezegun import freeze_time +import datahub.metadata.schema_classes as models from datahub.ingestion.api.common import 
PipelineContext from datahub.ingestion.extractor.schema_util import avro_schema_to_mce_fields +from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph from datahub.ingestion.sink.file import write_metadata_file from datahub.ingestion.source.aws.glue import GlueSource, GlueSourceConfig from datahub.ingestion.source.state.sql_common_state import ( @@ -35,6 +37,7 @@ get_bucket_tagging, get_databases_delta_response, get_databases_response, + get_databases_response_for_lineage, get_databases_response_with_resource_link, get_dataflow_graph_response_1, get_dataflow_graph_response_2, @@ -47,6 +50,7 @@ get_object_response_1, get_object_response_2, get_object_tagging, + get_tables_lineage_response_1, get_tables_response_1, get_tables_response_2, get_tables_response_for_target_database, @@ -63,19 +67,28 @@ def glue_source( platform_instance: Optional[str] = None, + mock_datahub_graph: Optional[Callable[[DatahubClientConfig], DataHubGraph]] = None, use_s3_bucket_tags: bool = True, use_s3_object_tags: bool = True, extract_delta_schema_from_parameters: bool = False, + emit_s3_lineage: bool = False, + include_column_lineage: bool = False, + extract_transforms: bool = True, ) -> GlueSource: + pipeline_context = PipelineContext(run_id="glue-source-tes") + if mock_datahub_graph: + pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) return GlueSource( - ctx=PipelineContext(run_id="glue-source-test"), + ctx=pipeline_context, config=GlueSourceConfig( aws_region="us-west-2", - extract_transforms=True, + extract_transforms=extract_transforms, platform_instance=platform_instance, use_s3_bucket_tags=use_s3_bucket_tags, use_s3_object_tags=use_s3_object_tags, extract_delta_schema_from_parameters=extract_delta_schema_from_parameters, + emit_s3_lineage=emit_s3_lineage, + include_column_lineage=include_column_lineage, ), ) @@ -425,3 +438,206 @@ def test_glue_with_malformed_delta_schema_ingest( output_path=tmp_path / "glue_malformed_delta_mces.json", golden_path=test_resources_dir / "glue_malformed_delta_mces_golden.json", ) + + +@pytest.mark.parametrize( + "platform_instance, mce_file, mce_golden_file", + [ + (None, "glue_mces.json", "glue_mces_golden_table_lineage.json"), + ], +) +@freeze_time(FROZEN_TIME) +def test_glue_ingest_include_table_lineage( + tmp_path: Path, + pytestconfig: PytestConfig, + mock_datahub_graph: Callable[[DatahubClientConfig], DataHubGraph], + platform_instance: str, + mce_file: str, + mce_golden_file: str, +) -> None: + glue_source_instance = glue_source( + platform_instance=platform_instance, + mock_datahub_graph=mock_datahub_graph, + emit_s3_lineage=True, + ) + + with Stubber(glue_source_instance.glue_client) as glue_stubber: + glue_stubber.add_response("get_databases", get_databases_response, {}) + glue_stubber.add_response( + "get_tables", + get_tables_response_1, + {"DatabaseName": "flights-database"}, + ) + glue_stubber.add_response( + "get_tables", + get_tables_response_2, + {"DatabaseName": "test-database"}, + ) + glue_stubber.add_response( + "get_tables", + {"TableList": []}, + {"DatabaseName": "empty-database"}, + ) + glue_stubber.add_response("get_jobs", get_jobs_response, {}) + glue_stubber.add_response( + "get_dataflow_graph", + get_dataflow_graph_response_1, + {"PythonScript": get_object_body_1}, + ) + glue_stubber.add_response( + "get_dataflow_graph", + get_dataflow_graph_response_2, + {"PythonScript": get_object_body_2}, + ) + + with Stubber(glue_source_instance.s3_client) as s3_stubber: + for _ in range( + 
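+ # One get_bucket_tagging and one get_object_tagging response is queued per table across
+ # both stubbed databases (the loop bound below is the combined table count).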
len(get_tables_response_1["TableList"]) + + len(get_tables_response_2["TableList"]) + ): + s3_stubber.add_response( + "get_bucket_tagging", + get_bucket_tagging(), + ) + s3_stubber.add_response( + "get_object_tagging", + get_object_tagging(), + ) + + s3_stubber.add_response( + "get_object", + get_object_response_1(), + { + "Bucket": "aws-glue-assets-123412341234-us-west-2", + "Key": "scripts/job-1.py", + }, + ) + s3_stubber.add_response( + "get_object", + get_object_response_2(), + { + "Bucket": "aws-glue-assets-123412341234-us-west-2", + "Key": "scripts/job-2.py", + }, + ) + + mce_objects = [wu.metadata for wu in glue_source_instance.get_workunits()] + glue_stubber.assert_no_pending_responses() + s3_stubber.assert_no_pending_responses() + + write_metadata_file(tmp_path / mce_file, mce_objects) + + # Verify the output. + test_resources_dir = pytestconfig.rootpath / "tests/unit/glue" + mce_helpers.check_golden_file( + pytestconfig, + output_path=tmp_path / mce_file, + golden_path=test_resources_dir / mce_golden_file, + ) + + +@pytest.mark.parametrize( + "platform_instance, mce_file, mce_golden_file", + [ + (None, "glue_mces.json", "glue_mces_golden_table_column_lineage.json"), + ], +) +@freeze_time(FROZEN_TIME) +def test_glue_ingest_include_column_lineage( + tmp_path: Path, + pytestconfig: PytestConfig, + mock_datahub_graph: Callable[[DatahubClientConfig], DataHubGraph], + platform_instance: str, + mce_file: str, + mce_golden_file: str, +) -> None: + glue_source_instance = glue_source( + platform_instance=platform_instance, + mock_datahub_graph=mock_datahub_graph, + emit_s3_lineage=True, + include_column_lineage=True, + use_s3_bucket_tags=False, + use_s3_object_tags=False, + extract_transforms=False, + ) + + # fake the server response + def fake_schema_metadata(entity_urn: str) -> models.SchemaMetadataClass: + return models.SchemaMetadataClass( + schemaName="crawler-public-us-west-2/flight/avro", + platform="urn:li:dataPlatform:s3", # important <- platform must be an urn + version=0, + hash="", + platformSchema=models.OtherSchemaClass( + rawSchema="__insert raw schema here__" + ), + fields=[ + models.SchemaFieldClass( + fieldPath="yr", + type=models.SchemaFieldDataTypeClass(type=models.NumberTypeClass()), + nativeDataType="int", + # use this to provide the type of the field in the source system's vernacular + ), + models.SchemaFieldClass( + fieldPath="flightdate", + type=models.SchemaFieldDataTypeClass(type=models.StringTypeClass()), + nativeDataType="VARCHAR(100)", + # use this to provide the type of the field in the source system's vernacular + ), + models.SchemaFieldClass( + fieldPath="uniquecarrier", + type=models.SchemaFieldDataTypeClass(type=models.StringTypeClass()), + nativeDataType="VARCHAR(100)", + # use this to provide the type of the field in the source system's vernacular + ), + models.SchemaFieldClass( + fieldPath="airlineid", + type=models.SchemaFieldDataTypeClass(type=models.NumberTypeClass()), + nativeDataType="int", + # use this to provide the type of the field in the source system's vernacular + ), + models.SchemaFieldClass( + fieldPath="carrier", + type=models.SchemaFieldDataTypeClass(type=models.StringTypeClass()), + nativeDataType="VARCHAR(100)", + # use this to provide the type of the field in the source system's vernacular + ), + models.SchemaFieldClass( + fieldPath="flightnum", + type=models.SchemaFieldDataTypeClass(type=models.StringTypeClass()), + nativeDataType="VARCHAR(100)", + # use this to provide the type of the field in the source system's vernacular + ), + 
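+ # The mocked field paths (yr through origin) mirror the columns returned by
+ # get_tables_lineage_response_1, so this test can presumably line up Glue table
+ # columns with the upstream S3 schema when include_column_lineage is enabled.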
models.SchemaFieldClass( + fieldPath="origin", + type=models.SchemaFieldDataTypeClass(type=models.StringTypeClass()), + nativeDataType="VARCHAR(100)", + # use this to provide the type of the field in the source system's vernacular + ), + ], + ) + + glue_source_instance.ctx.graph.get_schema_metadata = fake_schema_metadata # type: ignore + + with Stubber(glue_source_instance.glue_client) as glue_stubber: + glue_stubber.add_response( + "get_databases", get_databases_response_for_lineage, {} + ) + glue_stubber.add_response( + "get_tables", + get_tables_lineage_response_1, + {"DatabaseName": "flights-database-lineage"}, + ) + + mce_objects = [wu.metadata for wu in glue_source_instance.get_workunits()] + glue_stubber.assert_no_pending_responses() + + write_metadata_file(tmp_path / mce_file, mce_objects) + + # Verify the output. + test_resources_dir = pytestconfig.rootpath / "tests/unit/glue" + mce_helpers.check_golden_file( + pytestconfig, + output_path=tmp_path / mce_file, + golden_path=test_resources_dir / mce_golden_file, + ) diff --git a/metadata-ingestion/tests/unit/test_glue_source_stubs.py b/metadata-ingestion/tests/unit/test_glue_source_stubs.py index fc4c9e91410e0..f44a384a02c4a 100644 --- a/metadata-ingestion/tests/unit/test_glue_source_stubs.py +++ b/metadata-ingestion/tests/unit/test_glue_source_stubs.py @@ -880,6 +880,98 @@ ] get_delta_tables_response_2 = {"TableList": delta_tables_2} +get_databases_response_for_lineage = { + "DatabaseList": [ + { + "Name": "flights-database-lineage", + "CreateTime": datetime.datetime(2021, 6, 9, 14, 14, 19), + "CreateTableDefaultPermissions": [ + { + "Principal": { + "DataLakePrincipalIdentifier": "IAM_ALLOWED_PRINCIPALS" + }, + "Permissions": ["ALL"], + } + ], + "CatalogId": "123412341234", + "LocationUri": "s3://test-bucket/test-prefix", + "Parameters": {"param1": "value1", "param2": "value2"}, + }, + ] +} + +tables_lineage_1 = [ + { + "Name": "avro", + "DatabaseName": "flights-database-lineage", + "Owner": "owner", + "CreateTime": datetime.datetime(2021, 6, 9, 14, 17, 35), + "UpdateTime": datetime.datetime(2021, 6, 9, 14, 17, 35), + "LastAccessTime": datetime.datetime(2021, 6, 9, 14, 17, 35), + "Retention": 0, + "StorageDescriptor": { + "Columns": [ + {"Name": "yr", "Type": "int", "Comment": "test comment"}, + {"Name": "flightdate", "Type": "string"}, + {"Name": "uniquecarrier", "Type": "string"}, + {"Name": "airlineid", "Type": "int"}, + {"Name": "carrier", "Type": "string"}, + {"Name": "flightnum", "Type": "string"}, + {"Name": "origin", "Type": "string"}, + ], + "Location": "s3://crawler-public-us-west-2/flight/avro/", + "InputFormat": "org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat", + "OutputFormat": "org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat", + "Compressed": False, + "NumberOfBuckets": -1, + "SerdeInfo": { + "SerializationLibrary": "org.apache.hadoop.hive.serde2.avro.AvroSerDe", + "Parameters": { + "avro.schema.literal": 
'{"type":"record","name":"flights_avro_subset","namespace":"default","fields":[{"name":"yr","type":["null","int"],"default":null},{"name":"flightdate","type":["null","string"],"default":null},{"name":"uniquecarrier","type":["null","string"],"default":null},{"name":"airlineid","type":["null","int"],"default":null},{"name":"carrier","type":["null","string"],"default":null},{"name":"flightnum","type":["null","string"],"default":null},{"name":"origin","type":["null","string"],"default":null},{"name":"dest","type":["null","string"],"default":null},{"name":"depdelay","type":["null","int"],"default":null},{"name":"carrierdelay","type":["null","int"],"default":null},{"name":"weatherdelay","type":["null","int"],"default":null}]}', + "serialization.format": "1", + }, + }, + "BucketColumns": [], + "SortColumns": [], + "Parameters": { + "CrawlerSchemaDeserializerVersion": "1.0", + "CrawlerSchemaSerializerVersion": "1.0", + "UPDATED_BY_CRAWLER": "flights-crawler", + "averageRecordSize": "55", + "avro.schema.literal": '{"type":"record","name":"flights_avro_subset","namespace":"default","fields":[{"name":"yr","type":["null","int"],"default":null},{"name":"flightdate","type":["null","string"],"default":null},{"name":"uniquecarrier","type":["null","string"],"default":null},{"name":"airlineid","type":["null","int"],"default":null},{"name":"carrier","type":["null","string"],"default":null},{"name":"flightnum","type":["null","string"],"default":null},{"name":"origin","type":["null","string"],"default":null},{"name":"dest","type":["null","string"],"default":null},{"name":"depdelay","type":["null","int"],"default":null},{"name":"carrierdelay","type":["null","int"],"default":null},{"name":"weatherdelay","type":["null","int"],"default":null}]}', + "classification": "avro", + "compressionType": "none", + "objectCount": "30", + "recordCount": "169222196", + "sizeKey": "9503351413", + "typeOfData": "file", + }, + "StoredAsSubDirectories": False, + }, + "PartitionKeys": [ + {"Name": "year", "Type": "string", "Comment": "partition test comment"} + ], + "TableType": "EXTERNAL_TABLE", + "Parameters": { + "CrawlerSchemaDeserializerVersion": "1.0", + "CrawlerSchemaSerializerVersion": "1.0", + "UPDATED_BY_CRAWLER": "flights-crawler", + "averageRecordSize": "55", + "avro.schema.literal": '{"type":"record","name":"flights_avro_subset","namespace":"default","fields":[{"name":"yr","type":["null","int"],"default":null},{"name":"flightdate","type":["null","string"],"default":null},{"name":"uniquecarrier","type":["null","string"],"default":null},{"name":"airlineid","type":["null","int"],"default":null},{"name":"carrier","type":["null","string"],"default":null},{"name":"flightnum","type":["null","string"],"default":null},{"name":"origin","type":["null","string"],"default":null},{"name":"dest","type":["null","string"],"default":null},{"name":"depdelay","type":["null","int"],"default":null},{"name":"carrierdelay","type":["null","int"],"default":null},{"name":"weatherdelay","type":["null","int"],"default":null}]}', + "classification": "avro", + "compressionType": "none", + "objectCount": "30", + "recordCount": "169222196", + "sizeKey": "9503351413", + "typeOfData": "file", + }, + "CreatedBy": "arn:aws:sts::123412341234:assumed-role/AWSGlueServiceRole-flights-crawler/AWS-Crawler", + "IsRegisteredWithLakeFormation": False, + "CatalogId": "123412341234", + } +] +get_tables_lineage_response_1 = {"TableList": tables_lineage_1} + def mock_get_object_response(raw_body: str) -> Dict[str, Any]: """ diff --git 
a/metadata-ingestion/tests/unit/test_snowflake_shares.py b/metadata-ingestion/tests/unit/test_snowflake_shares.py index fc753f99b7e8f..2e78f0bb3ae65 100644 --- a/metadata-ingestion/tests/unit/test_snowflake_shares.py +++ b/metadata-ingestion/tests/unit/test_snowflake_shares.py @@ -102,9 +102,7 @@ def test_snowflake_shares_workunit_no_shares( config = SnowflakeV2Config(account_id="abc12345", platform_instance="instance1") report = SnowflakeV2Report() - shares_handler = SnowflakeSharesHandler( - config, report, lambda x: make_snowflake_urn(x) - ) + shares_handler = SnowflakeSharesHandler(config, report) wus = list(shares_handler.get_shares_workunits(snowflake_databases)) @@ -204,9 +202,7 @@ def test_snowflake_shares_workunit_inbound_share( ) report = SnowflakeV2Report() - shares_handler = SnowflakeSharesHandler( - config, report, lambda x: make_snowflake_urn(x, "instance1") - ) + shares_handler = SnowflakeSharesHandler(config, report) wus = list(shares_handler.get_shares_workunits(snowflake_databases)) @@ -262,9 +258,7 @@ def test_snowflake_shares_workunit_outbound_share( ) report = SnowflakeV2Report() - shares_handler = SnowflakeSharesHandler( - config, report, lambda x: make_snowflake_urn(x, "instance1") - ) + shares_handler = SnowflakeSharesHandler(config, report) wus = list(shares_handler.get_shares_workunits(snowflake_databases)) @@ -313,9 +307,7 @@ def test_snowflake_shares_workunit_inbound_and_outbound_share( ) report = SnowflakeV2Report() - shares_handler = SnowflakeSharesHandler( - config, report, lambda x: make_snowflake_urn(x, "instance1") - ) + shares_handler = SnowflakeSharesHandler(config, report) wus = list(shares_handler.get_shares_workunits(snowflake_databases)) @@ -376,9 +368,7 @@ def test_snowflake_shares_workunit_inbound_and_outbound_share_no_platform_instan ) report = SnowflakeV2Report() - shares_handler = SnowflakeSharesHandler( - config, report, lambda x: make_snowflake_urn(x) - ) + shares_handler = SnowflakeSharesHandler(config, report) assert sorted(config.outbounds().keys()) == ["db1", "db2_main"] assert sorted(config.inbounds().keys()) == [ diff --git a/metadata-ingestion/tests/unit/test_snowflake_source.py b/metadata-ingestion/tests/unit/test_snowflake_source.py index 3353e74449c95..72b59a3a4e493 100644 --- a/metadata-ingestion/tests/unit/test_snowflake_source.py +++ b/metadata-ingestion/tests/unit/test_snowflake_source.py @@ -274,21 +274,31 @@ def test_test_connection_basic_success(mock_connect): test_connection_helpers.assert_basic_connectivity_success(report) -def setup_mock_connect(mock_connect, query_results=None): - def default_query_results(query): +class MissingQueryMock(Exception): + pass + + +def setup_mock_connect(mock_connect, extra_query_results=None): + def query_results(query): + if extra_query_results is not None: + try: + return extra_query_results(query) + except MissingQueryMock: + pass + if query == "select current_role()": - return [("TEST_ROLE",)] + return [{"CURRENT_ROLE()": "TEST_ROLE"}] elif query == "select current_secondary_roles()": - return [('{"roles":"","value":""}',)] + return [{"CURRENT_SECONDARY_ROLES()": '{"roles":"","value":""}'}] elif query == "select current_warehouse()": - return [("TEST_WAREHOUSE")] - raise ValueError(f"Unexpected query: {query}") + return [{"CURRENT_WAREHOUSE()": "TEST_WAREHOUSE"}] + elif query == 'show grants to role "PUBLIC"': + return [] + raise MissingQueryMock(f"Unexpected query: {query}") connection_mock = MagicMock() cursor_mock = MagicMock() - cursor_mock.execute.side_effect = ( - query_results if 
query_results is not None else default_query_results - ) + cursor_mock.execute.side_effect = query_results connection_mock.cursor.return_value = cursor_mock mock_connect.return_value = connection_mock @@ -296,21 +306,11 @@ def default_query_results(query): @patch("snowflake.connector.connect") def test_test_connection_no_warehouse(mock_connect): def query_results(query): - if query == "select current_role()": - return [("TEST_ROLE",)] - elif query == "select current_secondary_roles()": - return [('{"roles":"","value":""}',)] - elif query == "select current_warehouse()": - return [(None,)] + if query == "select current_warehouse()": + return [{"CURRENT_WAREHOUSE()": None}] elif query == 'show grants to role "TEST_ROLE"': - return [ - ("", "USAGE", "DATABASE", "DB1"), - ("", "USAGE", "SCHEMA", "DB1.SCHEMA1"), - ("", "REFERENCES", "TABLE", "DB1.SCHEMA1.TABLE1"), - ] - elif query == 'show grants to role "PUBLIC"': - return [] - raise ValueError(f"Unexpected query: {query}") + return [{"privilege": "USAGE", "granted_on": "DATABASE", "name": "DB1"}] + raise MissingQueryMock(f"Unexpected query: {query}") setup_mock_connect(mock_connect, query_results) report = test_connection_helpers.run_test_connection( @@ -330,17 +330,9 @@ def query_results(query): @patch("snowflake.connector.connect") def test_test_connection_capability_schema_failure(mock_connect): def query_results(query): - if query == "select current_role()": - return [("TEST_ROLE",)] - elif query == "select current_secondary_roles()": - return [('{"roles":"","value":""}',)] - elif query == "select current_warehouse()": - return [("TEST_WAREHOUSE",)] - elif query == 'show grants to role "TEST_ROLE"': - return [("", "USAGE", "DATABASE", "DB1")] - elif query == 'show grants to role "PUBLIC"': - return [] - raise ValueError(f"Unexpected query: {query}") + if query == 'show grants to role "TEST_ROLE"': + return [{"privilege": "USAGE", "granted_on": "DATABASE", "name": "DB1"}] + raise MissingQueryMock(f"Unexpected query: {query}") setup_mock_connect(mock_connect, query_results) @@ -361,21 +353,17 @@ def query_results(query): @patch("snowflake.connector.connect") def test_test_connection_capability_schema_success(mock_connect): def query_results(query): - if query == "select current_role()": - return [("TEST_ROLE",)] - elif query == "select current_secondary_roles()": - return [('{"roles":"","value":""}',)] - elif query == "select current_warehouse()": - return [("TEST_WAREHOUSE")] - elif query == 'show grants to role "TEST_ROLE"': + if query == 'show grants to role "TEST_ROLE"': return [ - ["", "USAGE", "DATABASE", "DB1"], - ["", "USAGE", "SCHEMA", "DB1.SCHEMA1"], - ["", "REFERENCES", "TABLE", "DB1.SCHEMA1.TABLE1"], + {"privilege": "USAGE", "granted_on": "DATABASE", "name": "DB1"}, + {"privilege": "USAGE", "granted_on": "SCHEMA", "name": "DB1.SCHEMA1"}, + { + "privilege": "REFERENCES", + "granted_on": "TABLE", + "name": "DB1.SCHEMA1.TABLE1", + }, ] - elif query == 'show grants to role "PUBLIC"': - return [] - raise ValueError(f"Unexpected query: {query}") + raise MissingQueryMock(f"Unexpected query: {query}") setup_mock_connect(mock_connect, query_results) @@ -397,30 +385,38 @@ def query_results(query): @patch("snowflake.connector.connect") def test_test_connection_capability_all_success(mock_connect): def query_results(query): - if query == "select current_role()": - return [("TEST_ROLE",)] - elif query == "select current_secondary_roles()": - return [('{"roles":"","value":""}',)] - elif query == "select current_warehouse()": - return 
[("TEST_WAREHOUSE")] - elif query == 'show grants to role "TEST_ROLE"': + if query == 'show grants to role "TEST_ROLE"': return [ - ("", "USAGE", "DATABASE", "DB1"), - ("", "USAGE", "SCHEMA", "DB1.SCHEMA1"), - ("", "SELECT", "TABLE", "DB1.SCHEMA1.TABLE1"), - ("", "USAGE", "ROLE", "TEST_USAGE_ROLE"), + {"privilege": "USAGE", "granted_on": "DATABASE", "name": "DB1"}, + {"privilege": "USAGE", "granted_on": "SCHEMA", "name": "DB1.SCHEMA1"}, + { + "privilege": "SELECT", + "granted_on": "TABLE", + "name": "DB1.SCHEMA1.TABLE1", + }, + {"privilege": "USAGE", "granted_on": "ROLE", "name": "TEST_USAGE_ROLE"}, ] - elif query == 'show grants to role "PUBLIC"': - return [] elif query == 'show grants to role "TEST_USAGE_ROLE"': return [ - ["", "USAGE", "DATABASE", "SNOWFLAKE"], - ["", "USAGE", "SCHEMA", "ACCOUNT_USAGE"], - ["", "USAGE", "VIEW", "SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY"], - ["", "USAGE", "VIEW", "SNOWFLAKE.ACCOUNT_USAGE.ACCESS_HISTORY"], - ["", "USAGE", "VIEW", "SNOWFLAKE.ACCOUNT_USAGE.OBJECT_DEPENDENCIES"], + {"privilege": "USAGE", "granted_on": "DATABASE", "name": "SNOWFLAKE"}, + {"privilege": "USAGE", "granted_on": "SCHEMA", "name": "ACCOUNT_USAGE"}, + { + "privilege": "USAGE", + "granted_on": "VIEW", + "name": "SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY", + }, + { + "privilege": "USAGE", + "granted_on": "VIEW", + "name": "SNOWFLAKE.ACCOUNT_USAGE.ACCESS_HISTORY", + }, + { + "privilege": "USAGE", + "granted_on": "VIEW", + "name": "SNOWFLAKE.ACCOUNT_USAGE.OBJECT_DEPENDENCIES", + }, ] - raise ValueError(f"Unexpected query: {query}") + raise MissingQueryMock(f"Unexpected query: {query}") setup_mock_connect(mock_connect, query_results) diff --git a/metadata-models/src/main/pegasus/com/linkedin/ingestion/DataHubIngestionSourceInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/ingestion/DataHubIngestionSourceInfo.pdl index f777b5d6e12e7..37e85b6e542bd 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/ingestion/DataHubIngestionSourceInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/ingestion/DataHubIngestionSourceInfo.pdl @@ -62,4 +62,24 @@ record DataHubIngestionSourceInfo { */ extraArgs: optional map[string, string] } + + /** + * The source or origin of the Ingestion Source + * + * Currently CLI and UI do not provide an explicit source. + */ + source: optional record DataHubIngestionSourceSource { + /** + * The source type of the ingestion source + */ + @Searchable = { + "fieldName": "sourceType" + } + type: enum DataHubIngestionSourceSourceType { + /** + * A system internal source, e.g. for running search indexing operations, feature computation, etc. 
+ */ + SYSTEM + } + } } \ No newline at end of file diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/SpringWebConfig.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/SpringWebConfig.java index 88963e60d415d..09a6cc7c1e4b7 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/SpringWebConfig.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/SpringWebConfig.java @@ -13,10 +13,15 @@ import io.swagger.v3.oas.annotations.OpenAPIDefinition; import io.swagger.v3.oas.annotations.info.Info; import io.swagger.v3.oas.annotations.servers.Server; +import io.swagger.v3.oas.models.Components; import io.swagger.v3.oas.models.OpenAPI; import java.util.Collections; import java.util.List; +import java.util.Map; import java.util.Set; +import java.util.function.Supplier; +import java.util.stream.Collectors; +import java.util.stream.Stream; import org.springdoc.core.models.GroupedOpenApi; import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Configuration; @@ -38,8 +43,6 @@ public class SpringWebConfig implements WebMvcConfigurer { private static final Set V1_PACKAGES = Set.of("io.datahubproject.openapi.v1"); private static final Set V2_PACKAGES = Set.of("io.datahubproject.openapi.v2"); private static final Set V3_PACKAGES = Set.of("io.datahubproject.openapi.v3"); - private static final Set SCHEMA_REGISTRY_PACKAGES = - Set.of("io.datahubproject.openapi.schema.registry"); private static final Set OPENLINEAGE_PACKAGES = Set.of("io.datahubproject.openapi.openlineage"); @@ -74,14 +77,31 @@ public void addFormatters(FormatterRegistry registry) { public GroupedOpenApi v3OpenApiGroup(final EntityRegistry entityRegistry) { return GroupedOpenApi.builder() .group("10-openapi-v3") - .displayName("DataHub Entities v3 (OpenAPI)") + .displayName("DataHub v3 (OpenAPI)") .addOpenApiCustomizer( openApi -> { OpenAPI v3OpenApi = OpenAPIV3Generator.generateOpenApiSpec(entityRegistry); openApi.setInfo(v3OpenApi.getInfo()); openApi.setTags(Collections.emptyList()); - openApi.setPaths(v3OpenApi.getPaths()); - openApi.setComponents(v3OpenApi.getComponents()); + openApi.getPaths().putAll(v3OpenApi.getPaths()); + // Merge components. Swagger does not provide append method to add components. 
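+ // Every component map (schemas, responses, parameters, request bodies, examples,
+ // headers, links, callbacks, security schemes and extensions) from the scanned group
+ // and the generated v3 spec is combined below via the concat() helper, since
+ // Components only exposes setters and has no merge operation.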
+ final Components components = new Components(); + final Components oComponents = openApi.getComponents(); + final Components v3Components = v3OpenApi.getComponents(); + components + .callbacks(concat(oComponents::getCallbacks, v3Components::getCallbacks)) + .examples(concat(oComponents::getExamples, v3Components::getExamples)) + .extensions(concat(oComponents::getExtensions, v3Components::getExtensions)) + .headers(concat(oComponents::getHeaders, v3Components::getHeaders)) + .links(concat(oComponents::getLinks, v3Components::getLinks)) + .parameters(concat(oComponents::getParameters, v3Components::getParameters)) + .requestBodies( + concat(oComponents::getRequestBodies, v3Components::getRequestBodies)) + .responses(concat(oComponents::getResponses, v3Components::getResponses)) + .schemas(concat(oComponents::getSchemas, v3Components::getSchemas)) + .securitySchemes( + concat(oComponents::getSecuritySchemes, v3Components::getSecuritySchemes)); + openApi.setComponents(components); }) .packagesToScan(V3_PACKAGES.toArray(String[]::new)) .build(); @@ -122,4 +142,14 @@ public GroupedOpenApi openlineageOpenApiGroup() { .packagesToScan(OPENLINEAGE_PACKAGES.toArray(String[]::new)) .build(); } + + /** Concatenates two maps. */ + private Map concat(Supplier> a, Supplier> b) { + return a.get() == null + ? b.get() + : b.get() == null + ? a.get() + : Stream.concat(a.get().entrySet().stream(), b.get().entrySet().stream()) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + } } diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericRelationshipController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericRelationshipController.java new file mode 100644 index 0000000000000..efc3d9375e09e --- /dev/null +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericRelationshipController.java @@ -0,0 +1,221 @@ +package io.datahubproject.openapi.controller; + +import static com.linkedin.metadata.authorization.ApiGroup.RELATIONSHIP; +import static com.linkedin.metadata.authorization.ApiOperation.READ; + +import com.datahub.authentication.Authentication; +import com.datahub.authentication.AuthenticationContext; +import com.datahub.authorization.AuthUtil; +import com.datahub.authorization.AuthorizerChain; +import com.linkedin.common.urn.Urn; +import com.linkedin.common.urn.UrnUtils; +import com.linkedin.metadata.aspect.models.graph.Edge; +import com.linkedin.metadata.aspect.models.graph.RelatedEntities; +import com.linkedin.metadata.aspect.models.graph.RelatedEntitiesScrollResult; +import com.linkedin.metadata.graph.elastic.ElasticSearchGraphService; +import com.linkedin.metadata.models.registry.EntityRegistry; +import com.linkedin.metadata.query.filter.RelationshipDirection; +import com.linkedin.metadata.query.filter.RelationshipFilter; +import com.linkedin.metadata.search.utils.QueryUtils; +import io.datahubproject.openapi.exception.UnauthorizedException; +import io.datahubproject.openapi.models.GenericScrollResult; +import io.datahubproject.openapi.v2.models.GenericRelationship; +import io.swagger.v3.oas.annotations.Operation; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.http.MediaType; +import org.springframework.http.ResponseEntity; +import org.springframework.web.bind.annotation.*; + +public 
abstract class GenericRelationshipController { + + @Autowired private EntityRegistry entityRegistry; + @Autowired private ElasticSearchGraphService graphService; + @Autowired private AuthorizerChain authorizationChain; + + /** + * Returns relationship edges by type + * + * @param relationshipType the relationship type + * @param count number of results + * @param scrollId scrolling id + * @return list of relation edges + */ + @GetMapping(value = "/{relationshipType}", produces = MediaType.APPLICATION_JSON_VALUE) + @Operation(summary = "Scroll relationships of the given type.") + public ResponseEntity> getRelationshipsByType( + @PathVariable("relationshipType") String relationshipType, + @RequestParam(value = "count", defaultValue = "10") Integer count, + @RequestParam(value = "scrollId", required = false) String scrollId) { + + Authentication authentication = AuthenticationContext.getAuthentication(); + if (!AuthUtil.isAPIAuthorized(authentication, authorizationChain, RELATIONSHIP, READ)) { + throw new UnauthorizedException( + authentication.getActor().toUrnStr() + + " is unauthorized to " + + READ + + " " + + RELATIONSHIP); + } + + RelatedEntitiesScrollResult result = + graphService.scrollRelatedEntities( + null, + null, + null, + null, + List.of(relationshipType), + new RelationshipFilter().setDirection(RelationshipDirection.UNDIRECTED), + Edge.EDGE_SORT_CRITERION, + scrollId, + count, + null, + null); + + if (!AuthUtil.isAPIAuthorizedUrns( + authentication, + authorizationChain, + RELATIONSHIP, + READ, + result.getEntities().stream() + .flatMap( + edge -> + Stream.of( + UrnUtils.getUrn(edge.getSourceUrn()), + UrnUtils.getUrn(edge.getDestinationUrn()))) + .collect(Collectors.toSet()))) { + throw new UnauthorizedException( + authentication.getActor().toUrnStr() + + " is unauthorized to " + + READ + + " " + + RELATIONSHIP); + } + + return ResponseEntity.ok( + GenericScrollResult.builder() + .results(toGenericRelationships(result.getEntities())) + .scrollId(result.getScrollId()) + .build()); + } + + /** + * Returns edges for a given urn + * + * @param relationshipTypes types of edges + * @param direction direction of the edges + * @param count number of results + * @param scrollId scroll id + * @return urn edges + */ + @GetMapping(value = "/{entityName}/{entityUrn}", produces = MediaType.APPLICATION_JSON_VALUE) + @Operation(summary = "Scroll relationships from a given entity.") + public ResponseEntity> getRelationshipsByEntity( + @PathVariable("entityName") String entityName, + @PathVariable("entityUrn") String entityUrn, + @RequestParam(value = "relationshipType[]", required = false, defaultValue = "*") + String[] relationshipTypes, + @RequestParam(value = "direction", defaultValue = "OUTGOING") String direction, + @RequestParam(value = "count", defaultValue = "10") Integer count, + @RequestParam(value = "scrollId", required = false) String scrollId) { + + final RelatedEntitiesScrollResult result; + + Authentication authentication = AuthenticationContext.getAuthentication(); + if (!AuthUtil.isAPIAuthorizedUrns( + authentication, + authorizationChain, + RELATIONSHIP, + READ, + List.of(UrnUtils.getUrn(entityUrn)))) { + throw new UnauthorizedException( + authentication.getActor().toUrnStr() + + " is unauthorized to " + + READ + + " " + + RELATIONSHIP); + } + + switch (RelationshipDirection.valueOf(direction.toUpperCase())) { + case INCOMING -> result = + graphService.scrollRelatedEntities( + null, + null, + null, + null, + relationshipTypes.length > 0 && !relationshipTypes[0].equals("*") + ? 
Arrays.stream(relationshipTypes).toList() + : List.of(), + new RelationshipFilter() + .setDirection(RelationshipDirection.UNDIRECTED) + .setOr(QueryUtils.newFilter("destination.urn", entityUrn).getOr()), + Edge.EDGE_SORT_CRITERION, + scrollId, + count, + null, + null); + case OUTGOING -> result = + graphService.scrollRelatedEntities( + null, + null, + null, + null, + relationshipTypes.length > 0 && !relationshipTypes[0].equals("*") + ? Arrays.stream(relationshipTypes).toList() + : List.of(), + new RelationshipFilter() + .setDirection(RelationshipDirection.UNDIRECTED) + .setOr(QueryUtils.newFilter("source.urn", entityUrn).getOr()), + Edge.EDGE_SORT_CRITERION, + scrollId, + count, + null, + null); + default -> throw new IllegalArgumentException("Direction must be INCOMING or OUTGOING"); + } + + if (!AuthUtil.isAPIAuthorizedUrns( + authentication, + authorizationChain, + RELATIONSHIP, + READ, + result.getEntities().stream() + .flatMap( + edge -> + Stream.of( + UrnUtils.getUrn(edge.getSourceUrn()), + UrnUtils.getUrn(edge.getDestinationUrn()))) + .collect(Collectors.toSet()))) { + throw new UnauthorizedException( + authentication.getActor().toUrnStr() + + " is unauthorized to " + + READ + + " " + + RELATIONSHIP); + } + + return ResponseEntity.ok( + GenericScrollResult.builder() + .results(toGenericRelationships(result.getEntities())) + .scrollId(result.getScrollId()) + .build()); + } + + private List toGenericRelationships(List relatedEntities) { + return relatedEntities.stream() + .map( + result -> { + Urn source = UrnUtils.getUrn(result.getSourceUrn()); + Urn dest = UrnUtils.getUrn(result.getDestinationUrn()); + return GenericRelationship.builder() + .relationshipType(result.getRelationshipType()) + .source(GenericRelationship.GenericNode.fromUrn(source)) + .destination(GenericRelationship.GenericNode.fromUrn(dest)) + .build(); + }) + .collect(Collectors.toList()); + } +} diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/RelationshipController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/RelationshipController.java index 3e46e10857fbd..a0412676b5cbc 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/RelationshipController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/RelationshipController.java @@ -1,40 +1,10 @@ package io.datahubproject.openapi.v2.controller; -import static com.linkedin.metadata.authorization.ApiGroup.RELATIONSHIP; -import static com.linkedin.metadata.authorization.ApiOperation.READ; - -import com.datahub.authentication.Authentication; -import com.datahub.authentication.AuthenticationContext; -import com.datahub.authorization.AuthUtil; -import com.datahub.authorization.AuthorizerChain; -import com.linkedin.common.urn.Urn; -import com.linkedin.common.urn.UrnUtils; -import com.linkedin.metadata.aspect.models.graph.Edge; -import com.linkedin.metadata.aspect.models.graph.RelatedEntities; -import com.linkedin.metadata.aspect.models.graph.RelatedEntitiesScrollResult; -import com.linkedin.metadata.graph.elastic.ElasticSearchGraphService; -import com.linkedin.metadata.models.registry.EntityRegistry; -import com.linkedin.metadata.query.filter.RelationshipDirection; -import com.linkedin.metadata.query.filter.RelationshipFilter; -import com.linkedin.metadata.search.utils.QueryUtils; -import io.datahubproject.openapi.exception.UnauthorizedException; -import 
io.datahubproject.openapi.models.GenericScrollResult; -import io.datahubproject.openapi.v2.models.GenericRelationship; -import io.swagger.v3.oas.annotations.Operation; +import io.datahubproject.openapi.controller.GenericRelationshipController; import io.swagger.v3.oas.annotations.tags.Tag; -import java.util.Arrays; -import java.util.List; -import java.util.stream.Collectors; -import java.util.stream.Stream; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.http.MediaType; -import org.springframework.http.ResponseEntity; -import org.springframework.web.bind.annotation.GetMapping; -import org.springframework.web.bind.annotation.PathVariable; import org.springframework.web.bind.annotation.RequestMapping; -import org.springframework.web.bind.annotation.RequestParam; import org.springframework.web.bind.annotation.RestController; @RestController @@ -44,192 +14,6 @@ @Tag( name = "Generic Relationships", description = "APIs for ingesting and accessing entity relationships.") -public class RelationshipController { - - @Autowired private EntityRegistry entityRegistry; - @Autowired private ElasticSearchGraphService graphService; - @Autowired private AuthorizerChain authorizationChain; - - /** - * Returns relationship edges by type - * - * @param relationshipType the relationship type - * @param count number of results - * @param scrollId scrolling id - * @return list of relation edges - */ - @GetMapping(value = "/{relationshipType}", produces = MediaType.APPLICATION_JSON_VALUE) - @Operation(summary = "Scroll relationships of the given type.") - public ResponseEntity> getRelationshipsByType( - @PathVariable("relationshipType") String relationshipType, - @RequestParam(value = "count", defaultValue = "10") Integer count, - @RequestParam(value = "scrollId", required = false) String scrollId) { - - Authentication authentication = AuthenticationContext.getAuthentication(); - if (!AuthUtil.isAPIAuthorized(authentication, authorizationChain, RELATIONSHIP, READ)) { - throw new UnauthorizedException( - authentication.getActor().toUrnStr() - + " is unauthorized to " - + READ - + " " - + RELATIONSHIP); - } - - RelatedEntitiesScrollResult result = - graphService.scrollRelatedEntities( - null, - null, - null, - null, - List.of(relationshipType), - new RelationshipFilter().setDirection(RelationshipDirection.UNDIRECTED), - Edge.EDGE_SORT_CRITERION, - scrollId, - count, - null, - null); - - if (!AuthUtil.isAPIAuthorizedUrns( - authentication, - authorizationChain, - RELATIONSHIP, - READ, - result.getEntities().stream() - .flatMap( - edge -> - Stream.of( - UrnUtils.getUrn(edge.getSourceUrn()), - UrnUtils.getUrn(edge.getDestinationUrn()))) - .collect(Collectors.toSet()))) { - throw new UnauthorizedException( - authentication.getActor().toUrnStr() - + " is unauthorized to " - + READ - + " " - + RELATIONSHIP); - } - - return ResponseEntity.ok( - GenericScrollResult.builder() - .results(toGenericRelationships(result.getEntities())) - .scrollId(result.getScrollId()) - .build()); - } - - /** - * Returns edges for a given urn - * - * @param relationshipTypes types of edges - * @param direction direction of the edges - * @param count number of results - * @param scrollId scroll id - * @return urn edges - */ - @GetMapping(value = "/{entityName}/{entityUrn}", produces = MediaType.APPLICATION_JSON_VALUE) - @Operation(summary = "Scroll relationships from a given entity.") - public ResponseEntity> 
getRelationshipsByEntity( - @PathVariable("entityName") String entityName, - @PathVariable("entityUrn") String entityUrn, - @RequestParam(value = "relationshipType[]", required = false, defaultValue = "*") - String[] relationshipTypes, - @RequestParam(value = "direction", defaultValue = "OUTGOING") String direction, - @RequestParam(value = "count", defaultValue = "10") Integer count, - @RequestParam(value = "scrollId", required = false) String scrollId) { - - final RelatedEntitiesScrollResult result; - - Authentication authentication = AuthenticationContext.getAuthentication(); - if (!AuthUtil.isAPIAuthorizedUrns( - authentication, - authorizationChain, - RELATIONSHIP, - READ, - List.of(UrnUtils.getUrn(entityUrn)))) { - throw new UnauthorizedException( - authentication.getActor().toUrnStr() - + " is unauthorized to " - + READ - + " " - + RELATIONSHIP); - } - - switch (RelationshipDirection.valueOf(direction.toUpperCase())) { - case INCOMING -> result = - graphService.scrollRelatedEntities( - null, - null, - null, - null, - relationshipTypes.length > 0 && !relationshipTypes[0].equals("*") - ? Arrays.stream(relationshipTypes).toList() - : List.of(), - new RelationshipFilter() - .setDirection(RelationshipDirection.UNDIRECTED) - .setOr(QueryUtils.newFilter("destination.urn", entityUrn).getOr()), - Edge.EDGE_SORT_CRITERION, - scrollId, - count, - null, - null); - case OUTGOING -> result = - graphService.scrollRelatedEntities( - null, - null, - null, - null, - relationshipTypes.length > 0 && !relationshipTypes[0].equals("*") - ? Arrays.stream(relationshipTypes).toList() - : List.of(), - new RelationshipFilter() - .setDirection(RelationshipDirection.UNDIRECTED) - .setOr(QueryUtils.newFilter("source.urn", entityUrn).getOr()), - Edge.EDGE_SORT_CRITERION, - scrollId, - count, - null, - null); - default -> throw new IllegalArgumentException("Direction must be INCOMING or OUTGOING"); - } - - if (!AuthUtil.isAPIAuthorizedUrns( - authentication, - authorizationChain, - RELATIONSHIP, - READ, - result.getEntities().stream() - .flatMap( - edge -> - Stream.of( - UrnUtils.getUrn(edge.getSourceUrn()), - UrnUtils.getUrn(edge.getDestinationUrn()))) - .collect(Collectors.toSet()))) { - throw new UnauthorizedException( - authentication.getActor().toUrnStr() - + " is unauthorized to " - + READ - + " " - + RELATIONSHIP); - } - - return ResponseEntity.ok( - GenericScrollResult.builder() - .results(toGenericRelationships(result.getEntities())) - .scrollId(result.getScrollId()) - .build()); - } - - private List toGenericRelationships(List relatedEntities) { - return relatedEntities.stream() - .map( - result -> { - Urn source = UrnUtils.getUrn(result.getSourceUrn()); - Urn dest = UrnUtils.getUrn(result.getDestinationUrn()); - return GenericRelationship.builder() - .relationshipType(result.getRelationshipType()) - .source(GenericRelationship.GenericNode.fromUrn(source)) - .destination(GenericRelationship.GenericNode.fromUrn(dest)) - .build(); - }) - .collect(Collectors.toList()); - } +public class RelationshipController extends GenericRelationshipController { + // Supports same methods as GenericRelationshipController. 
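+ // Both this v2 controller and the new v3 controller below are now thin subclasses: the
+ // scroll-by-relationship-type and scroll-by-entity endpoints live in
+ // GenericRelationshipController. A hypothetical call against the v3 mapping added in this
+ // change (the servlet's base path is assumed here to be /openapi):
+ //   GET /openapi/v3/relationship/DownstreamOf?count=10
+ //   GET /openapi/v3/relationship/dataset/{urn}?relationshipType[]=DownstreamOf&direction=INCOMING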
} diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/RelationshipController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/RelationshipController.java new file mode 100644 index 0000000000000..8f317e8622723 --- /dev/null +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/RelationshipController.java @@ -0,0 +1,19 @@ +package io.datahubproject.openapi.v3.controller; + +import io.datahubproject.openapi.controller.GenericRelationshipController; +import io.swagger.v3.oas.annotations.tags.Tag; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.springframework.web.bind.annotation.RequestMapping; +import org.springframework.web.bind.annotation.RestController; + +@RestController("RelationshipControllerV3") +@RequiredArgsConstructor +@RequestMapping("/v3/relationship") +@Slf4j +@Tag( + name = "Generic Relationships", + description = "APIs for ingesting and accessing entity relationships.") +public class RelationshipController extends GenericRelationshipController { + // Supports same methods as GenericRelationshipController. +} diff --git a/metadata-service/openapi-servlet/src/test/java/io/datahubproject/openapi/v3/controller/EntityControllerTest.java b/metadata-service/openapi-servlet/src/test/java/io/datahubproject/openapi/v3/controller/EntityControllerTest.java index 0855ad6c2e4ff..3c7e93621f5cc 100644 --- a/metadata-service/openapi-servlet/src/test/java/io/datahubproject/openapi/v3/controller/EntityControllerTest.java +++ b/metadata-service/openapi-servlet/src/test/java/io/datahubproject/openapi/v3/controller/EntityControllerTest.java @@ -25,6 +25,7 @@ import com.linkedin.entity.EnvelopedAspect; import com.linkedin.metadata.entity.EntityService; import com.linkedin.metadata.entity.EntityServiceImpl; +import com.linkedin.metadata.graph.elastic.ElasticSearchGraphService; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.query.filter.Filter; import com.linkedin.metadata.query.filter.SortOrder; @@ -189,6 +190,12 @@ public EntityRegistry entityRegistry( return testOperationContext.getEntityRegistry(); } + @Bean("graphService") + @Primary + public ElasticSearchGraphService graphService() { + return mock(ElasticSearchGraphService.class); + } + @Bean public AuthorizerChain authorizerChain() { AuthorizerChain authorizerChain = mock(AuthorizerChain.class);