diff --git a/.github/workflows/airflow-plugin.yml b/.github/workflows/airflow-plugin.yml index 63bab821cc398..54042d104d906 100644 --- a/.github/workflows/airflow-plugin.yml +++ b/.github/workflows/airflow-plugin.yml @@ -10,7 +10,7 @@ on: - "metadata-models/**" pull_request: branches: - - master + - "**" paths: - ".github/**" - "metadata-ingestion-modules/airflow-plugin/**" @@ -32,16 +32,21 @@ jobs: strategy: matrix: include: - - python-version: "3.7" - extraPythonRequirement: "apache-airflow~=2.1.0" - - python-version: "3.7" - extraPythonRequirement: "apache-airflow~=2.2.0" + - python-version: "3.8" + extra_pip_requirements: "apache-airflow~=2.1.4" + extra_pip_extras: plugin-v1 + - python-version: "3.8" + extra_pip_requirements: "apache-airflow~=2.2.4" + extra_pip_extras: plugin-v1 - python-version: "3.10" - extraPythonRequirement: "apache-airflow~=2.4.0" + extra_pip_requirements: "apache-airflow~=2.4.0" + extra_pip_extras: plugin-v2 - python-version: "3.10" - extraPythonRequirement: "apache-airflow~=2.6.0" + extra_pip_requirements: "apache-airflow~=2.6.0" + extra_pip_extras: plugin-v2 - python-version: "3.10" - extraPythonRequirement: "apache-airflow>2.6.0" + extra_pip_requirements: "apache-airflow>=2.7.0" + extra_pip_extras: plugin-v2 fail-fast: false steps: - uses: actions/checkout@v3 @@ -51,13 +56,13 @@ jobs: cache: "pip" - name: Install dependencies run: ./metadata-ingestion/scripts/install_deps.sh - - name: Install airflow package and test (extras ${{ matrix.extraPythonRequirement }}) - run: ./gradlew -Pextra_pip_requirements='${{ matrix.extraPythonRequirement }}' :metadata-ingestion-modules:airflow-plugin:lint :metadata-ingestion-modules:airflow-plugin:testQuick + - name: Install airflow package and test (extras ${{ matrix.extra_pip_requirements }}) + run: ./gradlew -Pextra_pip_requirements='${{ matrix.extra_pip_requirements }}' -Pextra_pip_extras='${{ matrix.extra_pip_extras }}' :metadata-ingestion-modules:airflow-plugin:lint 
:metadata-ingestion-modules:airflow-plugin:testQuick - name: pip freeze show list installed if: always() run: source metadata-ingestion-modules/airflow-plugin/venv/bin/activate && pip freeze - uses: actions/upload-artifact@v3 - if: ${{ always() && matrix.python-version == '3.10' && matrix.extraPythonRequirement == 'apache-airflow>2.6.0' }} + if: ${{ always() && matrix.python-version == '3.10' && matrix.extra_pip_requirements == 'apache-airflow>=2.7.0' }} with: name: Test Results (Airflow Plugin ${{ matrix.python-version}}) path: | diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index f6320e1bd5c9f..25f3957e8f086 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -8,7 +8,7 @@ on: - "**.md" pull_request: branches: - - master + - "**" paths-ignore: - "docs/**" - "**.md" @@ -24,17 +24,12 @@ jobs: strategy: fail-fast: false matrix: - command: - [ - "./gradlew build -x :metadata-ingestion:build -x :metadata-ingestion:check -x docs-website:build -x :metadata-integration:java:spark-lineage:test -x :metadata-io:test -x :metadata-ingestion-modules:airflow-plugin:build -x :datahub-frontend:build -x :datahub-web-react:build --parallel", - "./gradlew :datahub-frontend:build :datahub-web-react:build --parallel", - "./gradlew :metadata-ingestion-modules:airflow-plugin:build --parallel" - ] - timezone: - [ - "UTC", - "America/New_York", + command: [ + # metadata-ingestion and airflow-plugin each have dedicated build jobs + "except_metadata_ingestion", + "frontend" ] + timezone: ["UTC", "America/New_York"] runs-on: ubuntu-latest timeout-minutes: 60 steps: @@ -51,10 +46,17 @@ jobs: java-version: 11 - uses: actions/setup-python@v4 with: - python-version: "3.7" - - name: Gradle build (and test) + python-version: "3.10" + cache: pip + - name: Gradle build (and test) for metadata ingestion + # we only need the timezone runs for frontend tests + if: ${{ matrix.command == 'except_metadata_ingestion' && 
matrix.timezone == 'America/New_York' }} + run: | + ./gradlew build -x :metadata-ingestion:build -x :metadata-ingestion:check -x docs-website:build -x :metadata-integration:java:spark-lineage:test -x :metadata-io:test -x :metadata-ingestion-modules:airflow-plugin:build -x :metadata-ingestion-modules:airflow-plugin:check -x :datahub-frontend:build -x :datahub-web-react:build --parallel + - name: Gradle build (and test) for frontend + if: ${{ matrix.command == 'frontend' }} run: | - ${{ matrix.command }} + ./gradlew :datahub-frontend:build :datahub-web-react:build --parallel env: NODE_OPTIONS: "--max-old-space-size=3072" - uses: actions/upload-artifact@v3 @@ -81,7 +83,7 @@ jobs: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: - python-version: "3.7" + python-version: "3.10" - name: Download YQ uses: chrisdickinson/setup-yq@v1.0.1 with: diff --git a/.github/workflows/check-datahub-jars.yml b/.github/workflows/check-datahub-jars.yml index 841a9ed5f9bc7..9a17a70e7f8d4 100644 --- a/.github/workflows/check-datahub-jars.yml +++ b/.github/workflows/check-datahub-jars.yml @@ -10,7 +10,7 @@ on: - "**.md" pull_request: branches: - - master + - "**" paths-ignore: - "docker/**" - "docs/**" @@ -28,12 +28,7 @@ jobs: max-parallel: 1 fail-fast: false matrix: - command: - [ - "datahub-client", - "datahub-protobuf", - "spark-lineage" - ] + command: ["datahub-client", "datahub-protobuf", "spark-lineage"] runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/close-stale-issues.yml b/.github/workflows/close-stale-issues.yml index a7809087702ac..98e3041f28804 100644 --- a/.github/workflows/close-stale-issues.yml +++ b/.github/workflows/close-stale-issues.yml @@ -18,7 +18,9 @@ jobs: days-before-issue-stale: 30 days-before-issue-close: 30 stale-issue-label: "stale" - stale-issue-message: "This issue is stale because it has been open for 30 days with no activity. 
If you believe this is still an issue on the latest DataHub release please leave a comment with the version that you tested it with. If this is a question/discussion please head to https://slack.datahubproject.io. For feature requests please use https://feature-requests.datahubproject.io" + stale-issue-message: + "This issue is stale because it has been open for 30 days with no activity. If you believe this is still an issue on the latest DataHub release please leave a comment with the version that you tested it with. If this is a question/discussion please head to https://slack.datahubproject.io.\ + \ For feature requests please use https://feature-requests.datahubproject.io" close-issue-message: "This issue was closed because it has been inactive for 30 days since being marked as stale." days-before-pr-stale: -1 days-before-pr-close: -1 diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 6ce19a5b4616e..e12971b8a6208 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -10,7 +10,7 @@ on: - ".github/workflows/code-checks.yml" pull_request: branches: - - master + - "**" paths: - "metadata-io/**" - "datahub-web-react/**" @@ -21,17 +21,12 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true - jobs: code_check: strategy: fail-fast: false matrix: - command: - [ - "check_event_type.py", - "check_policies.py" - ] + command: ["check_event_type.py", "check_policies.py"] name: run code checks runs-on: ubuntu-latest steps: @@ -43,5 +38,5 @@ jobs: with: python-version: "3.10" - name: run check ${{ matrix.command }} - run: | - python .github/scripts/${{ matrix.command }} \ No newline at end of file + run: |- + python .github/scripts/${{ matrix.command }} diff --git a/.github/workflows/docker-postgres-setup.yml b/.github/workflows/docker-postgres-setup.yml index a5d421d4b7ff5..fda4349f90bf7 100644 --- 
a/.github/workflows/docker-postgres-setup.yml +++ b/.github/workflows/docker-postgres-setup.yml @@ -8,7 +8,7 @@ on: - ".github/workflows/docker-postgres-setup.yml" pull_request: branches: - - master + - "**" paths: - "docker/postgres-setup/**" - ".github/workflows/docker-postgres-setup.yml" @@ -61,4 +61,3 @@ jobs: context: . file: ./docker/postgres-setup/Dockerfile platforms: linux/amd64,linux/arm64 - diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index 2aae6bf51529d..8666a5e2e2171 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -8,7 +8,7 @@ on: - "**.md" pull_request: branches: - - master + - "**" paths-ignore: - "docs/**" - "**.md" @@ -545,7 +545,6 @@ jobs: id: tag run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }}" >> $GITHUB_OUTPUT - datahub_ingestion_slim_build: name: Build and Push DataHub Ingestion Docker Images runs-on: ubuntu-latest @@ -809,8 +808,8 @@ jobs: DATAHUB_VERSION: ${{ needs.setup.outputs.unique_tag }} DATAHUB_ACTIONS_IMAGE: ${{ env.DATAHUB_INGESTION_IMAGE }} ACTIONS_VERSION: ${{ needs.datahub_ingestion_slim_build.outputs.tag }} - ACTIONS_EXTRA_PACKAGES: 'acryl-datahub-actions[executor]==0.0.13 acryl-datahub-actions==0.0.13 acryl-datahub==0.10.5' - ACTIONS_CONFIG: 'https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml' + ACTIONS_EXTRA_PACKAGES: "acryl-datahub-actions[executor]==0.0.13 acryl-datahub-actions==0.0.13 acryl-datahub==0.10.5" + ACTIONS_CONFIG: "https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml" run: | ./smoke-test/run-quickstart.sh - name: sleep 60s diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml index 68432a4feb13d..ebe2990f3a3cd 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/documentation.yml @@ -3,7 +3,7 @@ name: documentation 
on: pull_request: branches: - - master + - "**" push: branches: - master diff --git a/.github/workflows/lint-actions.yml b/.github/workflows/lint-actions.yml index b285e46da4857..6f34bf292bf51 100644 --- a/.github/workflows/lint-actions.yml +++ b/.github/workflows/lint-actions.yml @@ -2,8 +2,10 @@ name: Lint actions on: pull_request: paths: - - '.github/workflows/**' + - ".github/workflows/**" + branches: + - "**" jobs: actionlint: runs-on: ubuntu-latest diff --git a/.github/workflows/metadata-ingestion.yml b/.github/workflows/metadata-ingestion.yml index fff41e481c3cb..699ca330ce0ac 100644 --- a/.github/workflows/metadata-ingestion.yml +++ b/.github/workflows/metadata-ingestion.yml @@ -9,7 +9,7 @@ on: - "metadata-models/**" pull_request: branches: - - master + - "**" paths: - ".github/**" - "metadata-ingestion/**" @@ -34,11 +34,10 @@ jobs: python-version: ["3.7", "3.10"] command: [ - "lint", "testQuick", - "testIntegration", + "testIntegrationBatch0", "testIntegrationBatch1", - "testSlowIntegration", + "testIntegrationBatch2", ] include: - python-version: "3.7" @@ -54,13 +53,20 @@ jobs: run: ./metadata-ingestion/scripts/install_deps.sh - name: Install package run: ./gradlew :metadata-ingestion:installPackageOnly + - name: Run lint alongwith testQuick + if: ${{ matrix.command == 'testQuick' }} + run: ./gradlew :metadata-ingestion:lint - name: Run metadata-ingestion tests run: ./gradlew :metadata-ingestion:${{ matrix.command }} - - name: pip freeze show list installed + - name: Debug info if: always() - run: source metadata-ingestion/venv/bin/activate && pip freeze + run: | + source metadata-ingestion/venv/bin/activate && pip freeze + set -x + df -hl + docker image ls + docker system df - uses: actions/upload-artifact@v3 - if: ${{ always() && matrix.command != 'lint' }} with: name: Test Results (metadata ingestion ${{ matrix.python-version }}) path: | @@ -68,7 +74,7 @@ jobs: **/build/test-results/test/** **/junit.*.xml - name: Upload coverage to Codecov - if: ${{ 
always() && matrix.python-version == '3.10' && matrix.command != 'lint' }} + if: ${{ always() && matrix.python-version == '3.10' }} uses: codecov/codecov-action@v3 with: token: ${{ secrets.CODECOV_TOKEN }} diff --git a/.github/workflows/metadata-io.yml b/.github/workflows/metadata-io.yml index e37ddd0ce4e86..48f230ce14c8d 100644 --- a/.github/workflows/metadata-io.yml +++ b/.github/workflows/metadata-io.yml @@ -10,7 +10,7 @@ on: - "metadata-io/**" pull_request: branches: - - master + - "**" paths: - "**/*.gradle" - "li-utils/**" diff --git a/.github/workflows/spark-smoke-test.yml b/.github/workflows/spark-smoke-test.yml index b2482602e7548..541b2019b93ef 100644 --- a/.github/workflows/spark-smoke-test.yml +++ b/.github/workflows/spark-smoke-test.yml @@ -12,7 +12,7 @@ on: - ".github/workflows/spark-smoke-test.yml" pull_request: branches: - - master + - "**" paths: - "metadata_models/**" - "metadata-integration/java/datahub-client/**" diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/TestUtils.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/TestUtils.java index 272a93fa1989c..606123cac926d 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/TestUtils.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/TestUtils.java @@ -8,6 +8,7 @@ import com.datahub.plugins.auth.authorization.Authorizer; import com.linkedin.common.AuditStamp; import com.linkedin.common.urn.UrnUtils; +import com.linkedin.data.schema.annotation.PathSpecBasedSchemaAnnotationVisitor; import com.linkedin.metadata.entity.EntityService; import com.linkedin.metadata.entity.ebean.transactions.AspectsBatchImpl; import com.linkedin.metadata.models.registry.ConfigEntityRegistry; @@ -21,6 +22,8 @@ public class TestUtils { public static EntityService getMockEntityService() { + PathSpecBasedSchemaAnnotationVisitor.class.getClassLoader() + .setClassAssertionStatus(PathSpecBasedSchemaAnnotationVisitor.class.getName(), 
false); EntityRegistry registry = new ConfigEntityRegistry(TestUtils.class.getResourceAsStream("/test-entity-registry.yaml")); EntityService mockEntityService = Mockito.mock(EntityService.class); Mockito.when(mockEntityService.getEntityRegistry()).thenReturn(registry); diff --git a/datahub-graphql-core/src/test/resources/test-entity-registry.yaml b/datahub-graphql-core/src/test/resources/test-entity-registry.yaml index d694ae53ac42f..efd75a7fb07f5 100644 --- a/datahub-graphql-core/src/test/resources/test-entity-registry.yaml +++ b/datahub-graphql-core/src/test/resources/test-entity-registry.yaml @@ -181,6 +181,7 @@ entities: - assertionInfo - dataPlatformInstance - assertionRunEvent + - assertionActions - status - name: dataHubRetention category: internal @@ -292,4 +293,11 @@ entities: aspects: - ownershipTypeInfo - status +- name: dataContract + category: core + keyAspect: dataContractKey + aspects: + - dataContractProperties + - dataContractStatus + - status events: diff --git a/datahub-web-react/src/app/entity/shared/embed/EmbeddedProfile.tsx b/datahub-web-react/src/app/entity/shared/embed/EmbeddedProfile.tsx index 31a736e30bdc0..df928fc408de6 100644 --- a/datahub-web-react/src/app/entity/shared/embed/EmbeddedProfile.tsx +++ b/datahub-web-react/src/app/entity/shared/embed/EmbeddedProfile.tsx @@ -55,6 +55,8 @@ export default function EmbeddedProfile({ urn, entityType, getOverridePropert return ; } + const readOnly = false; + return ( ({ urn, entityType, getOverridePropert - + - + - + - + - + )} diff --git a/datahub-web-react/src/app/ingest/source/builder/sources.json b/datahub-web-react/src/app/ingest/source/builder/sources.json index 1bd5b6f1f768b..b18384909c33f 100644 --- a/datahub-web-react/src/app/ingest/source/builder/sources.json +++ b/datahub-web-react/src/app/ingest/source/builder/sources.json @@ -130,7 +130,7 @@ "name": "dynamodb", "displayName": "DynamoDB", "docsUrl": "https://datahubproject.io/docs/metadata-ingestion/", - "recipe": "source:\n type: 
dynamodb\n config:\n platform_instance: \"AWS_ACCOUNT_ID\"\n aws_access_key_id : '${AWS_ACCESS_KEY_ID}'\n aws_secret_access_key : '${AWS_SECRET_ACCESS_KEY}'\n # User could use the below option to provide a list of primary keys of a table in dynamodb format,\n # those items from given primary keys will be included when we scan the table.\n # For each table we can retrieve up to 16 MB of data, which can contain as many as 100 items.\n # We'll enforce the the primary keys list size not to exceed 100\n # The total items we'll try to retrieve in these two scenarios:\n # 1. If user don't specify include_table_item: we'll retrieve up to 100 items\n # 2. If user specifies include_table_item: we'll retrieve up to 100 items plus user specified items in\n # the table, with a total not more than 200 items\n # include_table_item:\n # table_name:\n # [\n # {\n # 'partition_key_name': { 'attribute_type': 'attribute_value' },\n # 'sort_key_name': { 'attribute_type': 'attribute_value' },\n # },\n # ]" + "recipe": "source:\n type: dynamodb\n config:\n platform_instance: \"AWS_ACCOUNT_ID\"\n aws_access_key_id : '${AWS_ACCESS_KEY_ID}'\n aws_secret_access_key : '${AWS_SECRET_ACCESS_KEY}'\n # If there are items that have most representative fields of the table, users could use the\n # `include_table_item` option to provide a list of primary keys of the table in dynamodb format.\n # For each `region.table`, the list of primary keys can be at most 100.\n # We include these items in addition to the first 100 items in the table when we scan it.\n # include_table_item:\n # region.table_name:\n # [\n # {\n # 'partition_key_name': { 'attribute_type': 'attribute_value' },\n # 'sort_key_name': { 'attribute_type': 'attribute_value' },\n # },\n # ]" }, { "urn": "urn:li:dataPlatform:glue", @@ -223,4 +223,4 @@ "docsUrl": "https://datahubproject.io/docs/metadata-ingestion/", "recipe": "source:\n type: \n config:\n # Source-type specifics config\n " } -] \ No newline at end of file +] diff --git 
a/datahub-web-react/src/app/lineage/utils/__tests__/columnLineageUtils.test.tsx b/datahub-web-react/src/app/lineage/utils/__tests__/columnLineageUtils.test.tsx index cd0a5f1385858..c11d8fe90cfa9 100644 --- a/datahub-web-react/src/app/lineage/utils/__tests__/columnLineageUtils.test.tsx +++ b/datahub-web-react/src/app/lineage/utils/__tests__/columnLineageUtils.test.tsx @@ -88,7 +88,7 @@ describe('encodeSchemaField', () => { }); describe('getPopulatedColumnsByUrn', () => { - it('should update columns by urn with data job fine grained data so that the data job appears to have the upstream columns', () => { + it('should update columns by urn with data job fine grained data so that the data job appears to have the upstream and downstream columns', () => { const dataJobWithCLL = { ...dataJob1, name: '', @@ -116,12 +116,24 @@ describe('getPopulatedColumnsByUrn', () => { recursive: false, type: SchemaFieldDataType.String, }, + { + fieldPath: 'test2', + nullable: false, + recursive: false, + type: SchemaFieldDataType.String, + }, { fieldPath: 'test3', nullable: false, recursive: false, type: SchemaFieldDataType.String, }, + { + fieldPath: 'test4', + nullable: false, + recursive: false, + type: SchemaFieldDataType.String, + }, ], }); }); diff --git a/datahub-web-react/src/app/lineage/utils/columnLineageUtils.ts b/datahub-web-react/src/app/lineage/utils/columnLineageUtils.ts index 4dd54ea25416d..60b1698444168 100644 --- a/datahub-web-react/src/app/lineage/utils/columnLineageUtils.ts +++ b/datahub-web-react/src/app/lineage/utils/columnLineageUtils.ts @@ -88,9 +88,9 @@ export function getPopulatedColumnsByUrn( ), }; } else if (fetchedEntity.type === EntityType.DataJob && fetchedEntity.fineGrainedLineages) { - // Add upstream fields from fineGrainedLineage onto DataJob to mimic upstream dataset fields. - // DataJobs will virtually "have" these fields so we can draw full column paths - // from upstream dataset fields to downstream dataset fields. 
+ // Add upstream and downstream fields from fineGrainedLineage onto DataJob to mimic upstream + // and downstream dataset fields. DataJobs will virtually "have" these fields so we can draw + // full column paths from upstream dataset fields to downstream dataset fields. const fields: SchemaField[] = []; fetchedEntity.fineGrainedLineages.forEach((fineGrainedLineage) => { fineGrainedLineage.upstreams?.forEach((upstream) => { @@ -103,6 +103,16 @@ export function getPopulatedColumnsByUrn( }); } }); + fineGrainedLineage.downstreams?.forEach((downstream) => { + if (!fields.some((field) => field.fieldPath === downstream.path)) { + fields.push({ + fieldPath: downgradeV2FieldPath(downstream.path) || '', + nullable: false, + recursive: false, + type: SchemaFieldDataType.String, + }); + } + }); }); populatedColumnsByUrn = { ...populatedColumnsByUrn, [urn]: fields }; } diff --git a/datahub-web-react/src/app/lineage/utils/extendAsyncEntities.ts b/datahub-web-react/src/app/lineage/utils/extendAsyncEntities.ts index 860b5715f34c9..30e81a37dc380 100644 --- a/datahub-web-react/src/app/lineage/utils/extendAsyncEntities.ts +++ b/datahub-web-react/src/app/lineage/utils/extendAsyncEntities.ts @@ -130,6 +130,18 @@ export function extendColumnLineage( }); }); }); + if (lineageVizConfig.type === EntityType.DataJob && !fineGrainedLineage.upstreams?.length) { + fineGrainedLineage.downstreams?.forEach((downstream) => { + const [downstreamEntityUrn, downstreamField] = breakFieldUrn(downstream); + updateFineGrainedMap( + fineGrainedMap, + lineageVizConfig.urn, + downstreamField, + downstreamEntityUrn, + downstreamField, + ); + }); + } }); } diff --git a/datahub-web-react/src/app/search/SearchBar.tsx b/datahub-web-react/src/app/search/SearchBar.tsx index fb10e1ca0026e..b4699994bc460 100644 --- a/datahub-web-react/src/app/search/SearchBar.tsx +++ b/datahub-web-react/src/app/search/SearchBar.tsx @@ -6,7 +6,7 @@ import { useHistory } from 'react-router'; import { AutoCompleteResultForEntity, 
EntityType, FacetFilterInput, ScenarioType } from '../../types.generated'; import EntityRegistry from '../entity/EntityRegistry'; import filterSearchQuery from './utils/filterSearchQuery'; -import { ANTD_GRAY, ANTD_GRAY_V2 } from '../entity/shared/constants'; +import { ANTD_GRAY, ANTD_GRAY_V2, REDESIGN_COLORS } from '../entity/shared/constants'; import { getEntityPath } from '../entity/shared/containers/profile/utils'; import { EXACT_SEARCH_PREFIX } from './utils/constants'; import { useListRecommendationsQuery } from '../../graphql/recommendations.generated'; @@ -20,7 +20,6 @@ import RecommendedOption from './autoComplete/RecommendedOption'; import SectionHeader, { EntityTypeLabel } from './autoComplete/SectionHeader'; import { useUserContext } from '../context/useUserContext'; import { navigateToSearchUrl } from './utils/navigateToSearchUrl'; -import { getQuickFilterDetails } from './autoComplete/quickFilters/utils'; import ViewAllSearchItem from './ViewAllSearchItem'; import { ViewSelect } from '../entity/view/select/ViewSelect'; import { combineSiblingsInAutoComplete } from './utils/combineSiblingsInAutoComplete'; @@ -39,13 +38,14 @@ const StyledSearchBar = styled(Input)` &&& { border-radius: 70px; height: 40px; - font-size: 20px; - color: ${ANTD_GRAY[7]}; - background-color: ${ANTD_GRAY_V2[2]}; - } - > .ant-input { font-size: 14px; + color: ${ANTD_GRAY[7]}; background-color: ${ANTD_GRAY_V2[2]}; + border: 2px solid transparent; + + &:focus-within { + border: 1.5px solid ${REDESIGN_COLORS.BLUE}; + } } > .ant-input::placeholder { color: ${ANTD_GRAY_V2[10]}; @@ -203,23 +203,16 @@ export const SearchBar = ({ const { quickFilters, selectedQuickFilter, setSelectedQuickFilter } = useQuickFiltersContext(); const autoCompleteQueryOptions = useMemo(() => { - const query = suggestions.length ? effectiveQuery : ''; - const selectedQuickFilterLabel = - showQuickFilters && selectedQuickFilter - ? 
getQuickFilterDetails(selectedQuickFilter, entityRegistry).label - : ''; - const text = query || selectedQuickFilterLabel; - - if (!text) return []; + if (effectiveQuery === '') return []; return [ { - value: `${EXACT_SEARCH_PREFIX}${text}`, - label: , + value: `${EXACT_SEARCH_PREFIX}${effectiveQuery}`, + label: , type: EXACT_AUTOCOMPLETE_OPTION_TYPE, }, ]; - }, [showQuickFilters, suggestions.length, effectiveQuery, selectedQuickFilter, entityRegistry]); + }, [effectiveQuery]); const autoCompleteEntityOptions = useMemo(() => { return suggestions.map((suggestion: AutoCompleteResultForEntity) => { @@ -296,6 +289,22 @@ export const SearchBar = ({ } } + const searchInputRef = useRef(null); + + useEffect(() => { + const handleKeyDown = (event) => { + // Support command-k to select the search bar. + // 75 is the keyCode for 'k' + if ((event.metaKey || event.ctrlKey) && event.keyCode === 75) { + (searchInputRef?.current as any)?.focus(); + } + }; + document.addEventListener('keydown', handleKeyDown); + return () => { + document.removeEventListener('keydown', handleKeyDown); + }; + }, []); + return ( } + ref={searchInputRef} /> diff --git a/datahub-web-react/src/app/shared/admin/HeaderLinks.tsx b/datahub-web-react/src/app/shared/admin/HeaderLinks.tsx index ced7d8642576b..ce1ad93565ba4 100644 --- a/datahub-web-react/src/app/shared/admin/HeaderLinks.tsx +++ b/datahub-web-react/src/app/shared/admin/HeaderLinks.tsx @@ -93,20 +93,6 @@ export function HeaderLinks(props: Props) { )} - {showIngestion && ( - - - - - - )} + {showIngestion && ( + + + + + + )} {showSettings && ( diff --git a/datahub-web-react/src/conf/theme/theme_dark.config.json b/datahub-web-react/src/conf/theme/theme_dark.config.json index 9746c3ddde5f3..54ebebd3b692b 100644 --- a/datahub-web-react/src/conf/theme/theme_dark.config.json +++ b/datahub-web-react/src/conf/theme/theme_dark.config.json @@ -30,7 +30,7 @@ "homepageMessage": "Find data you can count(*) on" }, "search": { - "searchbarMessage": "Search 
Datasets, People, & more..." + "searchbarMessage": "Search Tables, Dashboards, People, & more..." }, "menu": { "items": [ @@ -52,4 +52,4 @@ ] } } -} +} \ No newline at end of file diff --git a/datahub-web-react/src/conf/theme/theme_light.config.json b/datahub-web-react/src/conf/theme/theme_light.config.json index 906c04e38a1ba..6b9ef3eac52b0 100644 --- a/datahub-web-react/src/conf/theme/theme_light.config.json +++ b/datahub-web-react/src/conf/theme/theme_light.config.json @@ -33,7 +33,7 @@ "homepageMessage": "Find data you can count on" }, "search": { - "searchbarMessage": "Search Datasets, People, & more..." + "searchbarMessage": "Search Tables, Dashboards, People, & more..." }, "menu": { "items": [ @@ -60,4 +60,4 @@ ] } } -} +} \ No newline at end of file diff --git a/docker/airflow/local_airflow.md b/docker/airflow/local_airflow.md index 55a64f5c122c5..fbfc1d17327c5 100644 --- a/docker/airflow/local_airflow.md +++ b/docker/airflow/local_airflow.md @@ -1,6 +1,6 @@ :::caution -This feature is currently unmaintained. As of 0.10.0 the container described is not published alongside the DataHub CLI. If you'd like to use it, please reach out to us on the [community slack.](docs/slack.md) +This guide is currently unmaintained. As of 0.10.0 the container described is not published alongside the DataHub CLI. 
If you'd like to use it, please reach out to us on the [community slack.](docs/slack.md) ::: diff --git a/docs-website/download_historical_versions.py b/docs-website/download_historical_versions.py index 83157edc1972c..53ee9cf1e63ef 100644 --- a/docs-website/download_historical_versions.py +++ b/docs-website/download_historical_versions.py @@ -1,6 +1,7 @@ import json import os import tarfile +import time import urllib.request repo_url = "https://api.github.com/repos/datahub-project/static-assets" @@ -16,17 +17,30 @@ def download_file(url, destination): f.write(chunk) -def fetch_urls(repo_url: str, folder_path: str, file_format: str): +def fetch_urls( + repo_url: str, folder_path: str, file_format: str, max_retries=3, retry_delay=5 +): api_url = f"{repo_url}/contents/{folder_path}" - response = urllib.request.urlopen(api_url) - data = response.read().decode("utf-8") - urls = [ - file["download_url"] - for file in json.loads(data) - if file["name"].endswith(file_format) - ] - print(urls) - return urls + for attempt in range(max_retries + 1): + try: + response = urllib.request.urlopen(api_url) + if response.status == 403 or (500 <= response.status < 600): + raise Exception(f"HTTP Error {response.status}: {response.reason}") + data = response.read().decode("utf-8") + urls = [ + file["download_url"] + for file in json.loads(data) + if file["name"].endswith(file_format) + ] + print(urls) + return urls + except Exception as e: + if attempt < max_retries: + print(f"Attempt {attempt + 1}/{max_retries}: {e}") + time.sleep(retry_delay) + else: + print(f"Max retries reached. Unable to fetch data.") + raise def extract_tar_file(destination_path): diff --git a/docs-website/generateDocsDir.ts b/docs-website/generateDocsDir.ts index 892d02c43fe97..a321146e10efa 100644 --- a/docs-website/generateDocsDir.ts +++ b/docs-website/generateDocsDir.ts @@ -66,7 +66,7 @@ function list_markdown_files(): string[] { .trim() .split("\n"); let all_generated_markdown_files = execSync( - "cd .. 
&& ls docs/generated/**/**/*.md" + "cd .. && ls docs/generated/**/**/*.md && ls docs/generated/**/*.md" ) .toString() .trim() diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index d8b85da79b31b..bdf3926c17e0d 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -432,7 +432,7 @@ module.exports = { "docs/features/dataset-usage-and-query-history", "docs/posts", "docs/sync-status", - "docs/lineage/lineage-feature-guide", + "docs/generated/lineage/lineage-feature-guide", { type: "doc", id: "docs/tests/metadata-tests", @@ -446,6 +446,9 @@ module.exports = { "docs/managed-datahub/observe/custom-sql-assertions", ], }, + { + Guides: ["docs/features/feature-guides/ui-lineage"], + }, ], }, { diff --git a/docs/act-on-metadata/impact-analysis.md b/docs/act-on-metadata/impact-analysis.md index 9728a480efe32..e1143dd436d9c 100644 --- a/docs/act-on-metadata/impact-analysis.md +++ b/docs/act-on-metadata/impact-analysis.md @@ -92,4 +92,4 @@ We currently limit the list of dependencies to 10,000 records; we suggest applyi ### Related Features -* [DataHub Lineage](../lineage/lineage-feature-guide.md) +* [DataHub Lineage](../generated/lineage/lineage-feature-guide.md) diff --git a/docs/api/tutorials/lineage.md b/docs/api/tutorials/lineage.md index dc43cb178f949..4baad09099d07 100644 --- a/docs/api/tutorials/lineage.md +++ b/docs/api/tutorials/lineage.md @@ -6,7 +6,8 @@ import TabItem from '@theme/TabItem'; ## Why Would You Use Lineage? Lineage is used to capture data dependencies within an organization. It allows you to track the inputs from which a data asset is derived, along with the data assets that depend on it downstream. -For more information about lineage, refer to [About DataHub Lineage](/docs/lineage/lineage-feature-guide.md). + +For more information about lineage, refer to [About DataHub Lineage](/docs/generated/lineage/lineage-feature-guide.md). 
### Goal Of This Guide diff --git a/docs/deploy/aws.md b/docs/deploy/aws.md index 228fcb51d1a28..e0f57b4a0b0cb 100644 --- a/docs/deploy/aws.md +++ b/docs/deploy/aws.md @@ -100,7 +100,7 @@ eksctl create iamserviceaccount \ Install the TargetGroupBinding custom resource definition by running the following. ``` -kubectl apply -k "github.com/aws/eks-charts/stable/aws-load-balancer-controller//crds?ref=master" +kubectl apply -k "github.com/aws/eks-charts/stable/aws-load-balancer-controller/crds?ref=master" ``` Add the helm chart repository containing the latest version of the ALB controller. diff --git a/docs/deploy/confluent-cloud.md b/docs/deploy/confluent-cloud.md index 794b55d4686bf..096fd9984f474 100644 --- a/docs/deploy/confluent-cloud.md +++ b/docs/deploy/confluent-cloud.md @@ -16,6 +16,11 @@ First, you'll need to create following new topics in the [Confluent Control Cent 6. (Deprecated) **MetadataChangeEvent_v4**: Metadata change proposal messages 7. (Deprecated) **MetadataAuditEvent_v4**: Metadata change log messages 8. (Deprecated) **FailedMetadataChangeEvent_v4**: Failed to process #1 event +9. **MetadataGraphEvent_v4**: +10. **MetadataGraphEvent_v4**: +11. **PlatformEvent_v1** +12. **DataHubUpgradeHistory_v1**: Notifies the end of DataHub Upgrade job so dependants can act accordingly (_eg_, startup). + Note this topic requires special configuration: **Infinite retention**. Also, 1 partition is enough for the occasional traffic. The first five are the most important, and are explained in more depth in [MCP/MCL](../advanced/mcp-mcl.md). The final topics are those which are deprecated but still used under certain circumstances. 
It is likely that in the future they will be completely diff --git a/docs/features/feature-guides/ui-lineage.md b/docs/features/feature-guides/ui-lineage.md new file mode 100644 index 0000000000000..18e4f77e793b2 --- /dev/null +++ b/docs/features/feature-guides/ui-lineage.md @@ -0,0 +1,58 @@ +# Managing Lineage via UI + +## Viewing lineage +The UI shows the latest version of the lineage. The time picker can be used to filter out edges within the latest version to exclude those that were last updated outside of the time window. Selecting time windows in the patch will not show you historical lineages. It will only filter the view of the latest version of the lineage. + +## Editing from Lineage Graph View + +The first place that you can edit lineage for entities is from the Lineage Visualization screen. Click on the "Lineage" button on the top right of an entity's profile to get to this view. + +

+ +

+ +Once you find the entity that you want to edit the lineage of, click on the three-dot menu dropdown to select whether you want to edit lineage in the upstream direction or the downstream direction. + +

+ +

+ +If you want to edit upstream lineage for entities downstream of the center node or downstream lineage for entities upstream of the center node, you can simply re-center to focus on the node you want to edit. Once focused on the desired node, you can edit lineage in either direction. + +

+ +

+ +### Adding Lineage Edges + +Once you click "Edit Upstream" or "Edit Downstream," a modal will open that allows you to manage lineage for the selected entity in the chosen direction. In order to add a lineage edge to a new entity, search for it by name in the provided search bar and select it. Once you're satisfied with everything you've added, click "Save Changes." If you change your mind, you can always cancel or exit without saving the changes you've made. + +

+ +

+ +### Removing Lineage Edges + +You can remove lineage edges from the same modal used to add lineage edges. Find the edge(s) that you want to remove, and click the "X" on the right side of it. And just like adding, you need to click "Save Changes" to save and if you exit without saving, your changes won't be applied. + +

+ +

+ +### Reviewing Changes + +Any time lineage is edited manually, we keep track of who made the change and when they made it. You can see this information in the modal where you add and remove edges. If an edge was added manually, a user avatar will be in line with the edge that was added. You can hover over this avatar in order to see who added it and when. + +

+ +

+ +## Editing from Lineage Tab + +The other place that you can edit lineage for entities is from the Lineage Tab on an entity's profile. Click on the "Lineage" tab in an entity's profile and then find the "Edit" dropdown that allows you to edit upstream or downstream lineage for the given entity. + +

+ +

+ +Using the modal from this view will work the same as described above for editing from the Lineage Visualization screen. \ No newline at end of file diff --git a/docs/how/add-custom-data-platform.md b/docs/how/add-custom-data-platform.md index a4ea32af455c1..5dcd423e77569 100644 --- a/docs/how/add-custom-data-platform.md +++ b/docs/how/add-custom-data-platform.md @@ -77,7 +77,7 @@ datahub put platform --name MyCustomDataPlatform --display_name "My Custom Data source: type: "file" config: - filename: "./my-custom-data-platform.json" + path: "./my-custom-data-platform.json" # see https://datahubproject.io/docs/metadata-ingestion/sink_docs/datahub for complete documentation sink: diff --git a/docs/how/add-user-data.md b/docs/how/add-user-data.md index ea76c97163ddd..035821ab75879 100644 --- a/docs/how/add-user-data.md +++ b/docs/how/add-user-data.md @@ -57,7 +57,7 @@ Define an [ingestion recipe](https://datahubproject.io/docs/metadata-ingestion/# source: type: "file" config: - filename: "./my-user.json" + path: "./my-user.json" # see https://datahubproject.io/docs/metadata-ingestion/sink_docs/datahub for complete documentation sink: diff --git a/docs/how/kafka-config.md b/docs/how/kafka-config.md index f3f81c3d07c01..2f20e8b548f83 100644 --- a/docs/how/kafka-config.md +++ b/docs/how/kafka-config.md @@ -52,16 +52,21 @@ Also see [Kafka Connect Security](https://docs.confluent.io/current/connect/secu By default, DataHub relies on the a set of Kafka topics to operate. By default, they have the following names: -- **MetadataChangeProposal_v1** -- **FailedMetadataChangeProposal_v1** -- **MetadataChangeLog_Versioned_v1** -- **MetadataChangeLog_Timeseries_v1** -- **DataHubUsageEvent_v1**: User behavior tracking event for UI +1. **MetadataChangeProposal_v1** +2. **FailedMetadataChangeProposal_v1** +3. **MetadataChangeLog_Versioned_v1** +4. **MetadataChangeLog_Timeseries_v1** +5. **DataHubUsageEvent_v1**: User behavior tracking event for UI 6. 
(Deprecated) **MetadataChangeEvent_v4**: Metadata change proposal messages 7. (Deprecated) **MetadataAuditEvent_v4**: Metadata change log messages 8. (Deprecated) **FailedMetadataChangeEvent_v4**: Failed to process #1 event +9. **MetadataGraphEvent_v4**: +10. **MetadataGraphEvent_v4**: +11. **PlatformEvent_v1**: +12. **DataHubUpgradeHistory_v1**: Notifies the end of DataHub Upgrade job so dependants can act accordingly (_e.g._, startup). + Note this topic requires special configuration: **Infinite retention**. Also, 1 partition is enough for the occasional traffic. -These topics are discussed at more length in [Metadata Events](../what/mxe.md). +How Metadata Events relate to these topics is discussed at greater length in [Metadata Events](../what/mxe.md). We've included environment variables to customize the name each of these topics, for cases where an organization has naming rules for your topics. diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index 9b19291ee246a..5d0ad5eaf8f7e 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -5,7 +5,15 @@ This file documents any backwards-incompatible changes in DataHub and assists pe ## Next ### Breaking Changes + - #8810 - Removed support for SQLAlchemy 1.3.x. Only SQLAlchemy 1.4.x is supported now. +- #8853 - The Airflow plugin no longer supports Airflow 2.0.x or Python 3.7. See the docs for more details. +- #8853 - Introduced the Airflow plugin v2. If you're using Airflow 2.3+, the v2 plugin will be enabled by default, and so you'll need to switch your requirements to include `pip install 'acryl-datahub-airflow-plugin[plugin-v2]'`. To continue using the v1 plugin, set the `DATAHUB_AIRFLOW_PLUGIN_USE_V1_PLUGIN` environment variable to `true`. +- #8943 - The Unity Catalog ingestion source has a new option `include_metastore`, which will cause all urns to be changed when disabled.
+This is currently enabled by default to preserve compatibility, but will be disabled by default and then removed in the future. +If stateful ingestion is enabled, simply setting `include_metastore: false` will perform all required cleanup. +Otherwise, we recommend soft deleting all Databricks data via the DataHub CLI: +`datahub delete --platform databricks --soft` and then reingesting with `include_metastore: false`. ### Potential Downtime diff --git a/docs/lineage/airflow.md b/docs/lineage/airflow.md index 49de5352f6d58..19ed1598d4c5a 100644 --- a/docs/lineage/airflow.md +++ b/docs/lineage/airflow.md @@ -1,74 +1,137 @@ # Airflow Integration -DataHub supports integration of +:::note -- Airflow Pipeline (DAG) metadata -- DAG and Task run information as well as -- Lineage information when present +If you're looking to schedule DataHub ingestion using Airflow, see the guide on [scheduling ingestion with Airflow](../../metadata-ingestion/schedule_docs/airflow.md). -You can use either the DataHub Airflow lineage plugin (recommended) or the Airflow lineage backend (deprecated). +::: -## Using Datahub's Airflow lineage plugin +The DataHub Airflow plugin supports: -:::note +- Automatic column-level lineage extraction from various operators e.g. `SqlOperator`s (including `MySqlOperator`, `PostgresOperator`, `SnowflakeOperator`, and more), `S3FileTransformOperator`, and a few others. +- Airflow DAGs and tasks, including properties, ownership, and tags. +- Task run information, including task successes and failures. +- Manual lineage annotations using `inlets` and `outlets` on Airflow operators. -The Airflow lineage plugin is only supported with Airflow version >= 2.0.2 or on MWAA with an Airflow version >= 2.0.2. +There are two actively supported implementations of the plugin, with different Airflow version support. -If you're using Airflow 1.x, use the Airflow lineage plugin with acryl-datahub-airflow-plugin <= 0.9.1.0.
+| Approach | Airflow Version | Notes | +| --------- | --------------- | --------------------------------------------------------------------------- | +| Plugin v2 | 2.3+ | Recommended. Requires Python 3.8+ | +| Plugin v1 | 2.1+ | No automatic lineage extraction; may not extract lineage if the task fails. | -::: +If you're using Airflow older than 2.1, it's possible to use the v1 plugin with older versions of `acryl-datahub-airflow-plugin`. See the [compatibility section](#compatibility) for more details. -This plugin registers a task success/failure callback on every task with a cluster policy and emits DataHub events from that. This allows this plugin to be able to register both task success as well as failures compared to the older Airflow Lineage Backend which could only support emitting task success. + + -### Setup +## DataHub Plugin v2 -1. You need to install the required dependency in your airflow. +### Installation + +The v2 plugin requires Airflow 2.3+ and Python 3.8+. If you don't meet these requirements, use the v1 plugin instead. ```shell -pip install acryl-datahub-airflow-plugin +pip install 'acryl-datahub-airflow-plugin[plugin-v2]' ``` -:::note +### Configuration -The [DataHub Rest](../../metadata-ingestion/sink_docs/datahub.md#datahub-rest) emitter is included in the plugin package by default. To use [DataHub Kafka](../../metadata-ingestion/sink_docs/datahub.md#datahub-kafka) install `pip install acryl-datahub-airflow-plugin[datahub-kafka]`. +Set up a DataHub connection in Airflow. -::: +```shell +airflow connections add --conn-type 'datahub-rest' 'datahub_rest_default' --conn-host 'http://datahub-gms:8080' --conn-password '' +``` + +No additional configuration is required to use the plugin. However, there are some optional configuration parameters that can be set in the `airflow.cfg` file. + +```ini title="airflow.cfg" +[datahub] +# Optional - additional config here. 
+enabled = True # default +``` + +| Name | Default value | Description | +| -------------------------- | -------------------- | ---------------------------------------------------------------------------------------- | +| enabled | true | If the plugin should be enabled. | +| conn_id | datahub_rest_default | The name of the datahub rest connection. | +| cluster | prod | name of the airflow cluster | +| capture_ownership_info | true | Extract DAG ownership. | +| capture_tags_info | true | Extract DAG tags. | +| capture_executions | true | Extract task runs and success/failure statuses. This will show up in DataHub "Runs" tab. | +| enable_extractors | true | Enable automatic lineage extraction. | +| disable_openlineage_plugin | true | Disable the OpenLineage plugin to avoid duplicative processing. | +| log_level | _no change_ | [debug] Set the log level for the plugin. | +| debug_emitter | false | [debug] If true, the plugin will log the emitted events. | + +### Automatic lineage extraction + +To automatically extract lineage information, the v2 plugin builds on top of Airflow's built-in [OpenLineage extractors](https://openlineage.io/docs/integrations/airflow/default-extractors). -2. Disable lazy plugin loading in your airflow.cfg. - On MWAA you should add this config to your [Apache Airflow configuration options](https://docs.aws.amazon.com/mwaa/latest/userguide/configuring-env-variables.html#configuring-2.0-airflow-override). +The SQL-related extractors have been updated to use DataHub's SQL parser, which is more robust than the built-in one and uses DataHub's metadata information to generate column-level lineage. We discussed the DataHub SQL parser, including why schema-aware parsing works better and how it performs on benchmarks, during the [June 2023 community town hall](https://youtu.be/1QVcUmRQK5E?si=U27zygR7Gi_KdkzE&t=2309). + +## DataHub Plugin v1 + +### Installation + +The v1 plugin requires Airflow 2.1+ and Python 3.8+. 
If you're on older versions, it's still possible to use an older version of the plugin. See the [compatibility section](#compatibility) for more details. + +If you're using Airflow 2.3+, we recommend using the v2 plugin instead. If you need to use the v1 plugin with Airflow 2.3+, you must also set the environment variable `DATAHUB_AIRFLOW_PLUGIN_USE_V1_PLUGIN=true`. + +```shell +pip install 'acryl-datahub-airflow-plugin[plugin-v1]' + +# The DataHub REST connection type is included by default. +# To use the DataHub Kafka connection type, install the plugin with the kafka extras. +pip install 'acryl-datahub-airflow-plugin[plugin-v1,datahub-kafka]' +``` + + + +### Configuration + +#### Disable lazy plugin loading ```ini title="airflow.cfg" [core] lazy_load_plugins = False ``` -3. You must configure an Airflow hook for Datahub. We support both a Datahub REST hook and a Kafka-based hook, but you only need one. +On MWAA you should add this config to your [Apache Airflow configuration options](https://docs.aws.amazon.com/mwaa/latest/userguide/configuring-env-variables.html#configuring-2.0-airflow-override). + +#### Setup a DataHub connection - ```shell - # For REST-based: - airflow connections add --conn-type 'datahub_rest' 'datahub_rest_default' --conn-host 'http://datahub-gms:8080' --conn-password '' - # For Kafka-based (standard Kafka sink config can be passed via extras): - airflow connections add --conn-type 'datahub_kafka' 'datahub_kafka_default' --conn-host 'broker:9092' --conn-extra '{}' - ``` +You must configure an Airflow connection for DataHub. We support both a DataHub REST connection and a Kafka-based connection, but you only need one. -4. Add your `datahub_conn_id` and/or `cluster` to your `airflow.cfg` file if it is not align with the default values.
See configuration parameters below +```shell +# For REST-based: +airflow connections add --conn-type 'datahub_rest' 'datahub_rest_default' --conn-host 'http://datahub-gms:8080' --conn-password '' +# For Kafka-based (standard Kafka sink config can be passed via extras): +airflow connections add --conn-type 'datahub_kafka' 'datahub_kafka_default' --conn-host 'broker:9092' --conn-extra '{}' +``` - **Configuration options:** +#### Configure the plugin - | Name | Default value | Description | - | ------------------------------ | -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | - | datahub.enabled | true | If the plugin should be enabled. | - | datahub.conn_id | datahub_rest_default | The name of the datahub connection you set in step 1. | - | datahub.cluster | prod | name of the airflow cluster | - | datahub.capture_ownership_info | true | If true, the owners field of the DAG will be capture as a DataHub corpuser. | - | datahub.capture_tags_info | true | If true, the tags field of the DAG will be captured as DataHub tags. | - | datahub.capture_executions | true | If true, we'll capture task runs in DataHub in addition to DAG definitions. | - | datahub.graceful_exceptions | true | If set to true, most runtime errors in the lineage backend will be suppressed and will not cause the overall task to fail. Note that configuration issues will still throw exceptions. | +If your config doesn't align with the default values, you can configure the plugin in your `airflow.cfg` file. + +```ini title="airflow.cfg" +[datahub] +enabled = true +conn_id = datahub_rest_default # or datahub_kafka_default +# etc. +``` -5. Configure `inlets` and `outlets` for your Airflow operators. 
For reference, look at the sample DAG in [`lineage_backend_demo.py`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_demo.py), or reference [`lineage_backend_taskflow_demo.py`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_taskflow_demo.py) if you're using the [TaskFlow API](https://airflow.apache.org/docs/apache-airflow/stable/concepts/taskflow.html). -6. [optional] Learn more about [Airflow lineage](https://airflow.apache.org/docs/apache-airflow/stable/lineage.html), including shorthand notation and some automation. +| Name | Default value | Description | +| ---------------------- | -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| enabled | true | If the plugin should be enabled. | +| conn_id | datahub_rest_default | The name of the datahub connection you set in step 1. | +| cluster | prod | name of the airflow cluster | +| capture_ownership_info | true | If true, the owners field of the DAG will be capture as a DataHub corpuser. | +| capture_tags_info | true | If true, the tags field of the DAG will be captured as DataHub tags. | +| capture_executions | true | If true, we'll capture task runs in DataHub in addition to DAG definitions. | +| graceful_exceptions | true | If set to true, most runtime errors in the lineage backend will be suppressed and will not cause the overall task to fail. Note that configuration issues will still throw exceptions. | -### How to validate installation +#### Validate that the plugin is working 1. Go and check in Airflow at Admin -> Plugins menu if you can see the DataHub plugin 2. Run an Airflow DAG. In the task logs, you should see Datahub related log messages like: @@ -77,9 +140,22 @@ lazy_load_plugins = False Emitting DataHub ... 
``` -### Emitting lineage via a custom operator to the Airflow Plugin +## Manual Lineage Annotation + +### Using `inlets` and `outlets` + +You can manually annotate lineage by setting `inlets` and `outlets` on your Airflow operators. This is useful if you're using an operator that doesn't support automatic lineage extraction, or if you want to override the automatic lineage extraction. + +We have a few code samples that demonstrate how to use `inlets` and `outlets`: -If you have created a custom Airflow operator [docs](https://airflow.apache.org/docs/apache-airflow/stable/howto/custom-operator.html) that inherits from the BaseOperator class, +- [`lineage_backend_demo.py`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_demo.py) +- [`lineage_backend_taskflow_demo.py`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_taskflow_demo.py) - uses the [TaskFlow API](https://airflow.apache.org/docs/apache-airflow/stable/concepts/taskflow.html) + +For more information, take a look at the [Airflow lineage docs](https://airflow.apache.org/docs/apache-airflow/stable/lineage.html). + +### Custom Operators + +If you have created a [custom Airflow operator](https://airflow.apache.org/docs/apache-airflow/stable/howto/custom-operator.html) that inherits from the BaseOperator class, when overriding the `execute` function, set inlets and outlets via `context['ti'].task.inlets` and `context['ti'].task.outlets`. The DataHub Airflow plugin will then pick up those inlets and outlets after the task runs. 
@@ -90,7 +166,7 @@ class DbtOperator(BaseOperator): def execute(self, context): # do something inlets, outlets = self._get_lineage() - # inlets/outlets are lists of either datahub_provider.entities.Dataset or datahub_provider.entities.Urn + # inlets/outlets are lists of either datahub_airflow_plugin.entities.Dataset or datahub_airflow_plugin.entities.Urn context['ti'].task.inlets = self.inlets context['ti'].task.outlets = self.outlets @@ -100,78 +176,25 @@ class DbtOperator(BaseOperator): return inlets, outlets ``` -If you override the `pre_execute` and `post_execute` function, ensure they include the `@prepare_lineage` and `@apply_lineage` decorators respectively. [source](https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/lineage.html#lineage) - -## Using DataHub's Airflow lineage backend (deprecated) - -:::caution - -The DataHub Airflow plugin (above) is the recommended way to integrate Airflow with DataHub. For managed services like MWAA, the lineage backend is not supported and so you must use the Airflow plugin. - -If you're using Airflow 1.x, we recommend using the Airflow lineage backend with acryl-datahub <= 0.9.1.0. - -::: - -:::note - -If you are looking to run Airflow and DataHub using docker locally, follow the guide [here](../../docker/airflow/local_airflow.md). Otherwise proceed to follow the instructions below. -::: - -### Setting up Airflow to use DataHub as Lineage Backend - -1. You need to install the required dependency in your airflow. See - -```shell -pip install acryl-datahub[airflow] -# If you need the Kafka-based emitter/hook: -pip install acryl-datahub[airflow,datahub-kafka] -``` - -2. You must configure an Airflow hook for Datahub. We support both a Datahub REST hook and a Kafka-based hook, but you only need one. 
- - ```shell - # For REST-based: - airflow connections add --conn-type 'datahub_rest' 'datahub_rest_default' --conn-host 'http://datahub-gms:8080' --conn-password '' - # For Kafka-based (standard Kafka sink config can be passed via extras): - airflow connections add --conn-type 'datahub_kafka' 'datahub_kafka_default' --conn-host 'broker:9092' --conn-extra '{}' - ``` +If you override the `pre_execute` and `post_execute` function, ensure they include the `@prepare_lineage` and `@apply_lineage` decorators respectively. Reference the [Airflow docs](https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/lineage.html#lineage) for more details. -3. Add the following lines to your `airflow.cfg` file. +## Emit Lineage Directly - ```ini title="airflow.cfg" - [lineage] - backend = datahub_provider.lineage.datahub.DatahubLineageBackend - datahub_kwargs = { - "enabled": true, - "datahub_conn_id": "datahub_rest_default", - "cluster": "prod", - "capture_ownership_info": true, - "capture_tags_info": true, - "graceful_exceptions": true } - # The above indentation is important! - ``` +If you can't use the plugin or annotate inlets/outlets, you can also emit lineage using the `DatahubEmitterOperator`. - **Configuration options:** +Reference [`lineage_emission_dag.py`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_emission_dag.py) for a full example. - - `datahub_conn_id` (required): Usually `datahub_rest_default` or `datahub_kafka_default`, depending on what you named the connection in step 1. - - `cluster` (defaults to "prod"): The "cluster" to associate Airflow DAGs and tasks with. - - `capture_ownership_info` (defaults to true): If true, the owners field of the DAG will be capture as a DataHub corpuser. - - `capture_tags_info` (defaults to true): If true, the tags field of the DAG will be captured as DataHub tags. 
- - `capture_executions` (defaults to false): If true, it captures task runs as DataHub DataProcessInstances. - - `graceful_exceptions` (defaults to true): If set to true, most runtime errors in the lineage backend will be suppressed and will not cause the overall task to fail. Note that configuration issues will still throw exceptions. +In order to use this example, you must first configure the Datahub hook. Like in ingestion, we support a Datahub REST hook and a Kafka-based hook. See the plugin configuration for examples. -4. Configure `inlets` and `outlets` for your Airflow operators. For reference, look at the sample DAG in [`lineage_backend_demo.py`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_demo.py), or reference [`lineage_backend_taskflow_demo.py`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_taskflow_demo.py) if you're using the [TaskFlow API](https://airflow.apache.org/docs/apache-airflow/stable/concepts/taskflow.html). -5. [optional] Learn more about [Airflow lineage](https://airflow.apache.org/docs/apache-airflow/stable/lineage.html), including shorthand notation and some automation. - -## Emitting lineage via a separate operator - -Take a look at this sample DAG: +## Debugging -- [`lineage_emission_dag.py`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_emission_dag.py) - emits lineage using the DatahubEmitterOperator. +### Missing lineage -In order to use this example, you must first configure the Datahub hook. Like in ingestion, we support a Datahub REST hook and a Kafka-based hook. See step 1 above for details. +If you're not seeing lineage in DataHub, check the following: -## Debugging +- Validate that the plugin is loaded in Airflow. Go to Admin -> Plugins and check that the DataHub plugin is listed. 
+- If using the v2 plugin's automatic lineage, ensure that the `enable_extractors` config is set to true and that automatic lineage is supported for your operator. +- If using manual lineage annotation, ensure that you're using the `datahub_airflow_plugin.entities.Dataset` or `datahub_airflow_plugin.entities.Urn` classes for your inlets and outlets. ### Incorrect URLs @@ -179,9 +202,21 @@ If your URLs aren't being generated correctly (usually they'll start with `http: ```ini title="airflow.cfg" [webserver] -base_url = http://airflow.example.com +base_url = http://airflow.mycorp.example.com ``` +## Compatibility + +We no longer officially support Airflow <2.1. However, you can use older versions of `acryl-datahub-airflow-plugin` with older versions of Airflow. +Both of these options support Python 3.7+. + +- Airflow 1.10.x, use DataHub plugin v1 with acryl-datahub-airflow-plugin <= 0.9.1.0. +- Airflow 2.0.x, use DataHub plugin v1 with acryl-datahub-airflow-plugin <= 0.11.0.1. + +DataHub also previously supported an Airflow [lineage backend](https://airflow.apache.org/docs/apache-airflow/2.2.0/lineage.html#lineage-backend) implementation. While the implementation is still in our codebase, it is deprecated and will be removed in a future release. +Note that the lineage backend did not support automatic lineage extraction, did not capture task failures, and did not work in AWS MWAA. +The [documentation for the lineage backend](https://docs-website-1wmaehubl-acryldata.vercel.app/docs/lineage/airflow/#using-datahubs-airflow-lineage-backend-deprecated) has already been archived. 
+ ## Additional references Related Datahub videos: diff --git a/docs/lineage/lineage-feature-guide.md b/docs/lineage/lineage-feature-guide.md deleted file mode 100644 index 678afce4c46a0..0000000000000 --- a/docs/lineage/lineage-feature-guide.md +++ /dev/null @@ -1,222 +0,0 @@ -import FeatureAvailability from '@site/src/components/FeatureAvailability'; - -# About DataHub Lineage - - - -Lineage is used to capture data dependencies within an organization. It allows you to track the inputs from which a data asset is derived, along with the data assets that depend on it downstream. - -If you're using an ingestion source that supports extraction of Lineage (e.g. the "Table Lineage Capability"), then lineage information can be extracted automatically. For detailed instructions, refer to the source documentation for the source you are using. If you are not using a Lineage-support ingestion source, you can programmatically emit lineage edges between entities via API. - -Alternatively, as of `v0.9.5`, DataHub supports the manual editing of lineage between entities. Data experts are free to add or remove upstream and downstream lineage edges in both the Lineage Visualization screen as well as the Lineage tab on entity pages. Use this feature to supplement automatic lineage extraction or establish important entity relationships in sources that do not support automatic extraction. Editing lineage by hand is supported for Datasets, Charts, Dashboards, and Data Jobs. - -:::note - -Lineage added by hand and programmatically may conflict with one another to cause unwanted overwrites. It is strongly recommend that lineage is edited manually in cases where lineage information is not also extracted in automated fashion, e.g. by running an ingestion source. 
- -::: - -Types of lineage connections supported in DataHub are: - -* Dataset-to-dataset -* Pipeline lineage (dataset-to-job-to-dataset) -* Dashboard-to-chart lineage -* Chart-to-dataset lineage -* Job-to-dataflow (dbt lineage) - -## Lineage Setup, Prerequisites, and Permissions - -To edit lineage for an entity, you'll need the following [Metadata Privilege](../authorization/policies.md): - -* **Edit Lineage** metadata privilege to edit lineage at the entity level - -It is important to know that the **Edit Lineage** privilege is required for all entities whose lineage is affected by the changes. For example, in order to add "Dataset B" as an upstream dependency of "Dataset A", you'll need the **Edit Lineage** privilege for both Dataset A and Dataset B. - -## Managing Lineage via the DataHub UI - -### Viewing lineage on the Datahub UI -The UI shows the latest version of the lineage. The time picker can be used to filter out edges within the latest version to exclude those that were last updated outside of the time window. Selecting time windows in the patch will not show you historical lineages. It will only filter the view of the latest version of the lineage. - -### Editing from Lineage Graph View - -The first place that you can edit lineage for entities is from the Lineage Visualization screen. Click on the "Lineage" button on the top right of an entity's profile to get to this view. - -

- -

- -Once you find the entity that you want to edit the lineage of, click on the three-dot menu dropdown to select whether you want to edit lineage in the upstream direction or the downstream direction. - -

- -

- -If you want to edit upstream lineage for entities downstream of the center node or downstream lineage for entities upstream of the center node, you can simply re-center to focus on the node you want to edit. Once focused on the desired node, you can edit lineage in either direction. - -

- -

- -#### Adding Lineage Edges - -Once you click "Edit Upstream" or "Edit Downstream," a modal will open that allows you to manage lineage for the selected entity in the chosen direction. In order to add a lineage edge to a new entity, search for it by name in the provided search bar and select it. Once you're satisfied with everything you've added, click "Save Changes." If you change your mind, you can always cancel or exit without saving the changes you've made. - -

- -

- -#### Removing Lineage Edges - -You can remove lineage edges from the same modal used to add lineage edges. Find the edge(s) that you want to remove, and click the "X" on the right side of it. And just like adding, you need to click "Save Changes" to save and if you exit without saving, your changes won't be applied. - -

- -

- -#### Reviewing Changes - -Any time lineage is edited manually, we keep track of who made the change and when they made it. You can see this information in the modal where you add and remove edges. If an edge was added manually, a user avatar will be in line with the edge that was added. You can hover over this avatar in order to see who added it and when. - -

- -

- -### Editing from Lineage Tab - -The other place that you can edit lineage for entities is from the Lineage Tab on an entity's profile. Click on the "Lineage" tab in an entity's profile and then find the "Edit" dropdown that allows you to edit upstream or downstream lineage for the given entity. - -

- -

- -Using the modal from this view will work the same as described above for editing from the Lineage Visualization screen. - -## Managing Lineage via API - -:::note - - When you emit any lineage aspect, the existing aspect gets completely overwritten, unless specifically using patch semantics. -This means that the latest version visible in the UI will be your version. - -::: - -### Using Dataset-to-Dataset Lineage - -This relationship model uses dataset -> dataset connection through the UpstreamLineage aspect in the Dataset entity. - -Here are a few samples for the usage of this type of lineage: - -* [lineage_emitter_mcpw_rest.py](../../metadata-ingestion/examples/library/lineage_emitter_mcpw_rest.py) - emits simple bigquery table-to-table (dataset-to-dataset) lineage via REST as MetadataChangeProposalWrapper. -* [lineage_emitter_rest.py](../../metadata-ingestion/examples/library/lineage_emitter_rest.py) - emits simple dataset-to-dataset lineage via REST as MetadataChangeEvent. -* [lineage_emitter_kafka.py](../../metadata-ingestion/examples/library/lineage_emitter_kafka.py) - emits simple dataset-to-dataset lineage via Kafka as MetadataChangeEvent. -* [lineage_emitter_dataset_finegrained.py](../../metadata-ingestion/examples/library/lineage_emitter_dataset_finegrained.py) - emits fine-grained dataset-dataset lineage via REST as MetadataChangeProposalWrapper. -* [Datahub Snowflake Lineage](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py) - emits Datahub's Snowflake lineage as MetadataChangeProposalWrapper. -* [Datahub BigQuery Lineage](https://github.com/datahub-project/datahub/blob/3022c2d12e68d221435c6134362c1a2cba2df6b3/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py#L1028) - emits Datahub's Bigquery lineage as MetadataChangeProposalWrapper. 
**Use the patch feature to add to rather than overwrite the current lineage.** - -### Using dbt Lineage - -This model captures dbt specific nodes (tables, views, etc.) and - -* uses datasets as the base entity type and -* extends subclass datasets for each dbt-specific concept, and -* links them together for dataset-to-dataset lineage - -Here is a sample usage of this lineage: - -* [Datahub dbt Lineage](https://github.com/datahub-project/datahub/blob/a9754ebe83b6b73bc2bfbf49d9ebf5dbd2ca5a8f/metadata-ingestion/src/datahub/ingestion/source/dbt.py#L625,L630) - emits Datahub's dbt lineage as MetadataChangeEvent. - -### Using Pipeline Lineage - -The relationship model for this is datajob-to-dataset through the dataJobInputOutput aspect in the DataJob entity. - -For Airflow, this lineage is supported using Airflow’s lineage backend which allows you to specify the inputs to and output from that task. - -If you annotate that on your task we can pick up that information and push that as lineage edges into datahub automatically. You can install this package from Airflow’s Astronomer marketplace [here](https://registry.astronomer.io/providers/datahub). - -Here are a few samples for the usage of this type of lineage: - -* [lineage_dataset_job_dataset.py](../../metadata-ingestion/examples/library/lineage_dataset_job_dataset.py) - emits mysql-to-airflow-to-kafka (dataset-to-job-to-dataset) lineage via REST as MetadataChangeProposalWrapper. -* [lineage_job_dataflow.py](../../metadata-ingestion/examples/library/lineage_job_dataflow.py) - emits the job-to-dataflow lineage via REST as MetadataChangeProposalWrapper. - -### Using Dashboard-to-Chart Lineage - -This relationship model uses the dashboardInfo aspect of the Dashboard entity and models an explicit edge between a dashboard and a chart (such that charts can be attached to multiple dashboards). 
- -Here is a sample usage of this lineage: - -* [lineage_chart_dashboard.py](../../metadata-ingestion/examples/library/lineage_chart_dashboard.py) - emits the chart-to-dashboard lineage via REST as MetadataChangeProposalWrapper. - -### Using Chart-to-Dataset Lineage - -This relationship model uses the chartInfo aspect of the Chart entity. - -Here is a sample usage of this lineage: - -* [lineage_dataset_chart.py](../../metadata-ingestion/examples/library/lineage_dataset_chart.py) - emits the dataset-to-chart lineage via REST as MetadataChangeProposalWrapper. - -## Additional Resources - -### Videos - -**DataHub Basics: Lineage 101** - -

- -

- -**DataHub November 2022 Town Hall - Including Manual Lineage Demo** - -

- -

- -### GraphQL - -* [updateLineage](../../graphql/mutations.md#updatelineage) -* [searchAcrossLineage](../../graphql/queries.md#searchacrosslineage) -* [searchAcrossLineageInput](../../graphql/inputObjects.md#searchacrosslineageinput) - -#### Examples - -**Updating Lineage** - -```graphql -mutation updateLineage { - updateLineage(input: { - edgesToAdd: [ - { - downstreamUrn: "urn:li:dataset:(urn:li:dataPlatform:kafka,SampleKafkaDataset,PROD)", - upstreamUrn: "urn:li:dataset:(urn:li:dataPlatform:datahub,Dataset,PROD)" - } - ], - edgesToRemove: [ - { - downstreamUrn: "urn:li:dataset:(urn:li:dataPlatform:hdfs,SampleHdfsDataset,PROD)", - upstreamUrn: "urn:li:dataset:(urn:li:dataPlatform:kafka,SampleKafkaDataset,PROD)" - } - ] - }) -} -``` - -### DataHub Blog - -* [Acryl Data introduces lineage support and automated propagation of governance information for Snowflake in DataHub](https://blog.datahubproject.io/acryl-data-introduces-lineage-support-and-automated-propagation-of-governance-information-for-339c99536561) -* [Data in Context: Lineage Explorer in DataHub](https://blog.datahubproject.io/data-in-context-lineage-explorer-in-datahub-a53a9a476dc4) -* [Harnessing the Power of Data Lineage with DataHub](https://blog.datahubproject.io/harnessing-the-power-of-data-lineage-with-datahub-ad086358dec4) - -## FAQ and Troubleshooting - -**The Lineage Tab is greyed out - why can’t I click on it?** - -This means you have not yet ingested lineage metadata for that entity. Please ingest lineage to proceed. - -**Are there any recommended practices for emitting lineage?** - -We recommend emitting aspects as MetadataChangeProposalWrapper over emitting them via the MetadataChangeEvent. - -*Need more help? 
Join the conversation in [Slack](http://slack.datahubproject.io)!* - -### Related Features - -* [DataHub Lineage Impact Analysis](../act-on-metadata/impact-analysis.md) diff --git a/docs/ownership/ownership-types.md b/docs/ownership/ownership-types.md index 243f638a324ad..dbb08dd71ce6b 100644 --- a/docs/ownership/ownership-types.md +++ b/docs/ownership/ownership-types.md @@ -85,7 +85,7 @@ source: type: "file" config: # path to json file - filename: "metadata-ingestion/examples/ownership/ownership_type.json" + path: "metadata-ingestion/examples/ownership/ownership_type.json" # see https://datahubproject.io/docs/metadata-ingestion/sink_docs/datahub for complete documentation sink: diff --git a/metadata-ingestion-modules/airflow-plugin/build.gradle b/metadata-ingestion-modules/airflow-plugin/build.gradle index 58a2bc9e670e3..dacf12dc020df 100644 --- a/metadata-ingestion-modules/airflow-plugin/build.gradle +++ b/metadata-ingestion-modules/airflow-plugin/build.gradle @@ -10,6 +10,13 @@ ext { if (!project.hasProperty("extra_pip_requirements")) { ext.extra_pip_requirements = "" } +if (!project.hasProperty("extra_pip_extras")) { + ext.extra_pip_extras = "plugin-v2" +} +// If extra_pip_extras is non-empty, we need to add a comma to the beginning of the string. +if (extra_pip_extras != "") { + ext.extra_pip_extras = "," + extra_pip_extras +} def pip_install_command = "${venv_name}/bin/pip install -e ../../metadata-ingestion" @@ -36,7 +43,7 @@ task installPackage(type: Exec, dependsOn: [environmentSetup, ':metadata-ingesti // and https://github.com/datahub-project/datahub/pull/8435. commandLine 'bash', '-x', '-c', "${pip_install_command} install 'Cython<3.0' 'PyYAML<6' --no-build-isolation && " + - "${pip_install_command} -e . 
${extra_pip_requirements} &&" + + "${pip_install_command} -e .[ignore${extra_pip_extras}] ${extra_pip_requirements} &&" + "touch ${sentinel_file}" } @@ -47,7 +54,7 @@ task installDev(type: Exec, dependsOn: [install]) { inputs.file file('setup.py') outputs.file("${sentinel_file}") commandLine 'bash', '-x', '-c', - "${pip_install_command} -e .[dev] ${extra_pip_requirements} && " + + "${pip_install_command} -e .[dev${extra_pip_extras}] ${extra_pip_requirements} && " + "touch ${sentinel_file}" } @@ -79,7 +86,8 @@ task installDevTest(type: Exec, dependsOn: [installDev]) { outputs.dir("${venv_name}") outputs.file("${sentinel_file}") commandLine 'bash', '-x', '-c', - "${pip_install_command} -e .[dev,integration-tests] && touch ${sentinel_file}" + "${pip_install_command} -e .[dev,integration-tests${extra_pip_extras}] ${extra_pip_requirements} && " + + "touch ${sentinel_file}" } def testFile = hasProperty('testFile') ? testFile : 'unknown' @@ -97,20 +105,13 @@ task testSingle(dependsOn: [installDevTest]) { } task testQuick(type: Exec, dependsOn: installDevTest) { - // We can't enforce the coverage requirements if we run a subset of the tests. 
inputs.files(project.fileTree(dir: "src/", include: "**/*.py")) inputs.files(project.fileTree(dir: "tests/")) - outputs.dir("${venv_name}") commandLine 'bash', '-x', '-c', - "source ${venv_name}/bin/activate && pytest -vv --continue-on-collection-errors --junit-xml=junit.quick.xml" + "source ${venv_name}/bin/activate && pytest -vv --continue-on-collection-errors --junit-xml=junit.quick.xml" } -task testFull(type: Exec, dependsOn: [testQuick, installDevTest]) { - commandLine 'bash', '-x', '-c', - "source ${venv_name}/bin/activate && pytest -m 'not slow_integration' -vv --continue-on-collection-errors --junit-xml=junit.full.xml" -} - task cleanPythonCache(type: Exec) { commandLine 'bash', '-c', "find src -type f -name '*.py[co]' -delete -o -type d -name __pycache__ -delete -o -type d -empty -delete" diff --git a/metadata-ingestion-modules/airflow-plugin/pyproject.toml b/metadata-ingestion-modules/airflow-plugin/pyproject.toml index fba81486b9f67..648040c1951db 100644 --- a/metadata-ingestion-modules/airflow-plugin/pyproject.toml +++ b/metadata-ingestion-modules/airflow-plugin/pyproject.toml @@ -12,6 +12,7 @@ include = '\.pyi?$' [tool.isort] indent = ' ' +known_future_library = ['__future__', 'datahub.utilities._markupsafe_compat', 'datahub_provider._airflow_compat'] profile = 'black' sections = 'FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER' diff --git a/metadata-ingestion-modules/airflow-plugin/setup.cfg b/metadata-ingestion-modules/airflow-plugin/setup.cfg index 157bcce1c298d..c25256c5751b8 100644 --- a/metadata-ingestion-modules/airflow-plugin/setup.cfg +++ b/metadata-ingestion-modules/airflow-plugin/setup.cfg @@ -41,29 +41,29 @@ ignore_missing_imports = no [tool:pytest] asyncio_mode = auto -addopts = --cov=src --cov-report term-missing --cov-config setup.cfg --strict-markers +addopts = --cov=src --cov-report='' --cov-config setup.cfg --strict-markers -s -v +markers = + integration: marks tests to only run in integration (deselect with '-m "not integration"') 
testpaths = tests/unit tests/integration -[coverage:run] -# Because of some quirks in the way setup.cfg, coverage.py, pytest-cov, -# and tox interact, we should not uncomment the following line. -# See https://pytest-cov.readthedocs.io/en/latest/config.html and -# https://coverage.readthedocs.io/en/coverage-5.0/config.html. -# We also have some additional pytest/cov config options in tox.ini. -# source = src +# [coverage:run] +# # Because of some quirks in the way setup.cfg, coverage.py, pytest-cov, +# # and tox interact, we should not uncomment the following line. +# # See https://pytest-cov.readthedocs.io/en/latest/config.html and +# # https://coverage.readthedocs.io/en/coverage-5.0/config.html. +# # We also have some additional pytest/cov config options in tox.ini. +# # source = src -[coverage:paths] -# This is necessary for tox-based coverage to be counted properly. -source = - src - */site-packages +# [coverage:paths] +# # This is necessary for tox-based coverage to be counted properly. +# source = +# src +# */site-packages [coverage:report] -# The fail_under value ensures that at least some coverage data is collected. -# We override its value in the tox config. show_missing = true exclude_lines = pragma: no cover diff --git a/metadata-ingestion-modules/airflow-plugin/setup.py b/metadata-ingestion-modules/airflow-plugin/setup.py index 47069f59c314d..a5af881022d8c 100644 --- a/metadata-ingestion-modules/airflow-plugin/setup.py +++ b/metadata-ingestion-modules/airflow-plugin/setup.py @@ -1,5 +1,6 @@ import os import pathlib +from typing import Dict, Set import setuptools @@ -13,23 +14,43 @@ def get_long_description(): return pathlib.Path(os.path.join(root, "README.md")).read_text() +_version = package_metadata["__version__"] +_self_pin = f"=={_version}" if not _version.endswith("dev0") else "" + + rest_common = {"requests", "requests_file"} base_requirements = { # Compatibility. 
"dataclasses>=0.6; python_version < '3.7'", - # Typing extension should be >=3.10.0.2 ideally but we can't restrict due to Airflow 2.0.2 dependency conflict - "typing_extensions>=3.7.4.3 ; python_version < '3.8'", - "typing_extensions>=3.10.0.2,<4.6.0 ; python_version >= '3.8'", "mypy_extensions>=0.4.3", # Actual dependencies. - "typing-inspect", "pydantic>=1.5.1", "apache-airflow >= 2.0.2", *rest_common, - f"acryl-datahub == {package_metadata['__version__']}", } +plugins: Dict[str, Set[str]] = { + "datahub-rest": { + f"acryl-datahub[datahub-rest]{_self_pin}", + }, + "datahub-kafka": { + f"acryl-datahub[datahub-kafka]{_self_pin}", + }, + "datahub-file": { + f"acryl-datahub[sync-file-emitter]{_self_pin}", + }, + "plugin-v1": set(), + "plugin-v2": { + # The v2 plugin requires Python 3.8+. + f"acryl-datahub[sql-parser]{_self_pin}", + "openlineage-airflow==1.2.0; python_version >= '3.8'", + }, +} + +# Include datahub-rest in the base requirements. +base_requirements.update(plugins["datahub-rest"]) + mypy_stubs = { "types-dataclasses", @@ -45,11 +66,9 @@ def get_long_description(): # versions 0.1.13 and 0.1.14 seem to have issues "types-click==0.1.12", "types-tabulate", - # avrogen package requires this - "types-pytz", } -base_dev_requirements = { +dev_requirements = { *base_requirements, *mypy_stubs, "black==22.12.0", @@ -66,6 +85,7 @@ def get_long_description(): "pytest-cov>=2.8.1", "tox", "deepdiff", + "tenacity", "requests-mock", "freezegun", "jsonpickle", @@ -74,8 +94,24 @@ def get_long_description(): "packaging", } -dev_requirements = { - *base_dev_requirements, +integration_test_requirements = { + *dev_requirements, + *plugins["datahub-file"], + *plugins["datahub-kafka"], + f"acryl-datahub[testing-utils]{_self_pin}", + # Extra requirements for loading our test dags. + "apache-airflow[snowflake]>=2.0.2", + # https://github.com/snowflakedb/snowflake-sqlalchemy/issues/350 + # Eventually we want to set this to "snowflake-sqlalchemy>=1.4.3". 
+ # However, that doesn't work with older versions of Airflow. Instead + # of splitting this into integration-test-old and integration-test-new, + # adding a bound to SQLAlchemy was the simplest solution. + "sqlalchemy<1.4.42", + # To avoid https://github.com/snowflakedb/snowflake-connector-python/issues/1188, + # we need https://github.com/snowflakedb/snowflake-connector-python/pull/1193 + "snowflake-connector-python>=2.7.10", + "virtualenv", # needed by PythonVirtualenvOperator + "apache-airflow-providers-sqlite", } @@ -88,7 +124,7 @@ def get_long_description(): setuptools.setup( # Package metadata. name=package_metadata["__package_name__"], - version=package_metadata["__version__"], + version=_version, url="https://datahubproject.io/", project_urls={ "Documentation": "https://datahubproject.io/docs/", @@ -131,17 +167,8 @@ def get_long_description(): # Dependencies. install_requires=list(base_requirements), extras_require={ + **{plugin: list(dependencies) for plugin, dependencies in plugins.items()}, "dev": list(dev_requirements), - "datahub-kafka": [ - f"acryl-datahub[datahub-kafka] == {package_metadata['__version__']}" - ], - "integration-tests": [ - f"acryl-datahub[datahub-kafka] == {package_metadata['__version__']}", - # Extra requirements for Airflow. - "apache-airflow[snowflake]>=2.0.2", # snowflake is used in example dags - # Because of https://github.com/snowflakedb/snowflake-sqlalchemy/issues/350 we need to restrict SQLAlchemy's max version. 
- "SQLAlchemy<1.4.42", - "virtualenv", # needed by PythonVirtualenvOperator - ], + "integration-tests": list(integration_test_requirements), }, ) diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_airflow_shims.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_airflow_shims.py index 5ad20e1f72551..10f014fbd586f 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_airflow_shims.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_airflow_shims.py @@ -1,3 +1,7 @@ +from typing import List + +import airflow.version +import packaging.version from airflow.models.baseoperator import BaseOperator from datahub_airflow_plugin._airflow_compat import AIRFLOW_PATCHED @@ -21,7 +25,35 @@ assert AIRFLOW_PATCHED +# Approach suggested by https://stackoverflow.com/a/11887885/5004662. +AIRFLOW_VERSION = packaging.version.parse(airflow.version.version) +HAS_AIRFLOW_STANDALONE_CMD = AIRFLOW_VERSION >= packaging.version.parse("2.2.0.dev0") +HAS_AIRFLOW_LISTENER_API = AIRFLOW_VERSION >= packaging.version.parse("2.3.0.dev0") +HAS_AIRFLOW_DAG_LISTENER_API = AIRFLOW_VERSION >= packaging.version.parse("2.5.0.dev0") + + +def get_task_inlets(operator: "Operator") -> List: + # From Airflow 2.4 _inlets is dropped and inlets used consistently. Earlier it was not the case, so we have to stick there to _inlets + if hasattr(operator, "_inlets"): + return operator._inlets # type: ignore[attr-defined, union-attr] + if hasattr(operator, "get_inlet_defs"): + return operator.get_inlet_defs() # type: ignore[attr-defined] + return operator.inlets + + +def get_task_outlets(operator: "Operator") -> List: + # From Airflow 2.4 _outlets is dropped and inlets used consistently. 
Earlier it was not the case, so we have to stick there to _outlets + # We have to use _outlets because outlets is empty in Airflow < 2.4.0 + if hasattr(operator, "_outlets"): + return operator._outlets # type: ignore[attr-defined, union-attr] + if hasattr(operator, "get_outlet_defs"): + return operator.get_outlet_defs() + return operator.outlets + + __all__ = [ + "AIRFLOW_VERSION", + "HAS_AIRFLOW_LISTENER_API", "Operator", "MappedOperator", "EmptyOperator", diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_config.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_config.py new file mode 100644 index 0000000000000..67843da2ba995 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_config.py @@ -0,0 +1,80 @@ +from typing import TYPE_CHECKING, Optional + +import datahub.emitter.mce_builder as builder +from airflow.configuration import conf +from datahub.configuration.common import ConfigModel + +if TYPE_CHECKING: + from datahub_airflow_plugin.hooks.datahub import DatahubGenericHook + + +class DatahubLineageConfig(ConfigModel): + # This class is shared between the lineage backend and the Airflow plugin. + # The defaults listed here are only relevant for the lineage backend. + # The Airflow plugin's default values come from the fallback values in + # the get_lineage_config() function below. + + enabled: bool = True + + # DataHub hook connection ID. + datahub_conn_id: str + + # Cluster to associate with the pipelines and tasks. Defaults to "prod". + cluster: str = builder.DEFAULT_FLOW_CLUSTER + + # If true, the owners field of the DAG will be capture as a DataHub corpuser. + capture_ownership_info: bool = True + + # If true, the tags field of the DAG will be captured as DataHub tags. 
+ capture_tags_info: bool = True + + capture_executions: bool = False + + enable_extractors: bool = True + + log_level: Optional[str] = None + debug_emitter: bool = False + + disable_openlineage_plugin: bool = True + + # Note that this field is only respected by the lineage backend. + # The Airflow plugin behaves as if it were set to True. + graceful_exceptions: bool = True + + def make_emitter_hook(self) -> "DatahubGenericHook": + # This is necessary to avoid issues with circular imports. + from datahub_airflow_plugin.hooks.datahub import DatahubGenericHook + + return DatahubGenericHook(self.datahub_conn_id) + + +def get_lineage_config() -> DatahubLineageConfig: + """Load the DataHub plugin config from airflow.cfg.""" + + enabled = conf.get("datahub", "enabled", fallback=True) + datahub_conn_id = conf.get("datahub", "conn_id", fallback="datahub_rest_default") + cluster = conf.get("datahub", "cluster", fallback=builder.DEFAULT_FLOW_CLUSTER) + capture_tags_info = conf.get("datahub", "capture_tags_info", fallback=True) + capture_ownership_info = conf.get( + "datahub", "capture_ownership_info", fallback=True + ) + capture_executions = conf.get("datahub", "capture_executions", fallback=True) + enable_extractors = conf.get("datahub", "enable_extractors", fallback=True) + log_level = conf.get("datahub", "log_level", fallback=None) + debug_emitter = conf.get("datahub", "debug_emitter", fallback=False) + disable_openlineage_plugin = conf.get( + "datahub", "disable_openlineage_plugin", fallback=True + ) + + return DatahubLineageConfig( + enabled=enabled, + datahub_conn_id=datahub_conn_id, + cluster=cluster, + capture_ownership_info=capture_ownership_info, + capture_tags_info=capture_tags_info, + capture_executions=capture_executions, + enable_extractors=enable_extractors, + log_level=log_level, + debug_emitter=debug_emitter, + disable_openlineage_plugin=disable_openlineage_plugin, + ) diff --git 
a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_datahub_listener_module.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_datahub_listener_module.py new file mode 100644 index 0000000000000..f39d37b122228 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_datahub_listener_module.py @@ -0,0 +1,7 @@ +from datahub_airflow_plugin.datahub_listener import get_airflow_plugin_listener + +_listener = get_airflow_plugin_listener() +if _listener: + on_task_instance_running = _listener.on_task_instance_running + on_task_instance_success = _listener.on_task_instance_success + on_task_instance_failed = _listener.on_task_instance_failed diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_datahub_ol_adapter.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_datahub_ol_adapter.py new file mode 100644 index 0000000000000..7d35791bf1db4 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_datahub_ol_adapter.py @@ -0,0 +1,23 @@ +import logging + +import datahub.emitter.mce_builder as builder +from openlineage.client.run import Dataset as OpenLineageDataset + +logger = logging.getLogger(__name__) + + +OL_SCHEME_TWEAKS = { + "sqlserver": "mssql", + "trino": "presto", + "awsathena": "athena", +} + + +def translate_ol_to_datahub_urn(ol_uri: OpenLineageDataset) -> str: + namespace = ol_uri.namespace + name = ol_uri.name + + scheme, *rest = namespace.split("://", maxsplit=1) + + platform = OL_SCHEME_TWEAKS.get(scheme, scheme) + return builder.make_dataset_urn(platform=platform, name=name) diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_extractors.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_extractors.py new file mode 100644 index 0000000000000..f84b7b56f6119 --- /dev/null +++ 
b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_extractors.py @@ -0,0 +1,244 @@ +import contextlib +import logging +import unittest.mock +from typing import TYPE_CHECKING, Optional + +import datahub.emitter.mce_builder as builder +from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import ( + get_platform_from_sqlalchemy_uri, +) +from datahub.utilities.sqlglot_lineage import ( + SqlParsingResult, + create_lineage_sql_parsed_result, +) +from openlineage.airflow.extractors import BaseExtractor +from openlineage.airflow.extractors import ExtractorManager as OLExtractorManager +from openlineage.airflow.extractors import TaskMetadata +from openlineage.airflow.extractors.snowflake_extractor import SnowflakeExtractor +from openlineage.airflow.extractors.sql_extractor import SqlExtractor +from openlineage.airflow.utils import get_operator_class, try_import_from_string +from openlineage.client.facet import ( + ExtractionError, + ExtractionErrorRunFacet, + SqlJobFacet, +) + +from datahub_airflow_plugin._airflow_shims import Operator +from datahub_airflow_plugin._datahub_ol_adapter import OL_SCHEME_TWEAKS + +if TYPE_CHECKING: + from airflow.models import DagRun, TaskInstance + from datahub.ingestion.graph.client import DataHubGraph + +logger = logging.getLogger(__name__) +_DATAHUB_GRAPH_CONTEXT_KEY = "datahub_graph" +SQL_PARSING_RESULT_KEY = "datahub_sql" + + +class ExtractorManager(OLExtractorManager): + # TODO: On Airflow 2.7, the OLExtractorManager is part of the built-in Airflow API. + # When available, we should use that instead. The same goe for most of the OL + # extractors. + + def __init__(self): + super().__init__() + + _sql_operator_overrides = [ + # The OL BigQuery extractor has some complex logic to fetch detect + # the BigQuery job_id and fetch lineage from there. However, it can't + # generate CLL, so we disable it and use our own extractor instead. 
+ "BigQueryOperator", + "BigQueryExecuteQueryOperator", + # Athena also does something similar. + "AthenaOperator", + "AWSAthenaOperator", + # Additional types that OL doesn't support. This is only necessary because + # on older versions of Airflow, these operators don't inherit from SQLExecuteQueryOperator. + "SqliteOperator", + ] + for operator in _sql_operator_overrides: + self.task_to_extractor.extractors[operator] = GenericSqlExtractor + + self._graph: Optional["DataHubGraph"] = None + + @contextlib.contextmanager + def _patch_extractors(self): + with contextlib.ExitStack() as stack: + # Patch the SqlExtractor.extract() method. + stack.enter_context( + unittest.mock.patch.object( + SqlExtractor, + "extract", + _sql_extractor_extract, + ) + ) + + # Patch the SnowflakeExtractor.default_schema property. + stack.enter_context( + unittest.mock.patch.object( + SnowflakeExtractor, + "default_schema", + property(snowflake_default_schema), + ) + ) + + # TODO: Override the BigQuery extractor to use the DataHub SQL parser. + # self.extractor_manager.add_extractor() + + # TODO: Override the Athena extractor to use the DataHub SQL parser. + + yield + + def extract_metadata( + self, + dagrun: "DagRun", + task: "Operator", + complete: bool = False, + task_instance: Optional["TaskInstance"] = None, + task_uuid: Optional[str] = None, + graph: Optional["DataHubGraph"] = None, + ) -> TaskMetadata: + self._graph = graph + with self._patch_extractors(): + return super().extract_metadata( + dagrun, task, complete, task_instance, task_uuid + ) + + def _get_extractor(self, task: "Operator") -> Optional[BaseExtractor]: + # By adding this, we can use the generic extractor as a fallback for + # any operator that inherits from SQLExecuteQueryOperator. 
+ clazz = get_operator_class(task) + SQLExecuteQueryOperator = try_import_from_string( + "airflow.providers.common.sql.operators.sql.SQLExecuteQueryOperator" + ) + if SQLExecuteQueryOperator and issubclass(clazz, SQLExecuteQueryOperator): + self.task_to_extractor.extractors.setdefault( + clazz.__name__, GenericSqlExtractor + ) + + extractor = super()._get_extractor(task) + if extractor: + extractor.set_context(_DATAHUB_GRAPH_CONTEXT_KEY, self._graph) + return extractor + + +class GenericSqlExtractor(SqlExtractor): + # Note that the extract() method is patched elsewhere. + + @property + def default_schema(self): + return super().default_schema + + def _get_scheme(self) -> Optional[str]: + # Best effort conversion to DataHub platform names. + + with contextlib.suppress(Exception): + if self.hook: + if hasattr(self.hook, "get_uri"): + uri = self.hook.get_uri() + return get_platform_from_sqlalchemy_uri(uri) + + return self.conn.conn_type or super().dialect + + def _get_database(self) -> Optional[str]: + if self.conn: + # For BigQuery, the "database" is the project name. + if hasattr(self.conn, "project_id"): + return self.conn.project_id + + return self.conn.schema + return None + + +def _sql_extractor_extract(self: "SqlExtractor") -> TaskMetadata: + # Why not override the OL sql_parse method directly, instead of overriding + # extract()? A few reasons: + # + # 1. We would want to pass the default_db and graph instance into our sql parser + # method. The OL code doesn't pass the default_db (despite having it available), + # and it's not clear how to get the graph instance into that method. + # 2. OL has some janky logic to fetch table schemas as part of the sql extractor. + # We don't want that behavior and this lets us disable it. + # 3. Our SqlParsingResult already has DataHub urns, whereas using SqlMeta would + # require us to convert those urns to OL uris, just for them to get converted + # back to urns later on in our processing. 
+ + task_name = f"{self.operator.dag_id}.{self.operator.task_id}" + sql = self.operator.sql + + run_facets = {} + job_facets = {"sql": SqlJobFacet(query=self._normalize_sql(sql))} + + # Prepare to run the SQL parser. + graph = self.context.get(_DATAHUB_GRAPH_CONTEXT_KEY, None) + + default_database = getattr(self.operator, "database", None) + if not default_database: + default_database = self.database + default_schema = self.default_schema + + # TODO: Add better handling for sql being a list of statements. + if isinstance(sql, list): + logger.info(f"Got list of SQL statements for {task_name}. Using first one.") + sql = sql[0] + + # Run the SQL parser. + scheme = self.scheme + platform = OL_SCHEME_TWEAKS.get(scheme, scheme) + self.log.debug( + "Running the SQL parser %s (platform=%s, default db=%s, schema=%s): %s", + "with graph client" if graph else "in offline mode", + platform, + default_database, + default_schema, + sql, + ) + sql_parsing_result: SqlParsingResult = create_lineage_sql_parsed_result( + query=sql, + graph=graph, + platform=platform, + platform_instance=None, + env=builder.DEFAULT_ENV, + database=default_database, + schema=default_schema, + ) + self.log.debug(f"Got sql lineage {sql_parsing_result}") + + if sql_parsing_result.debug_info.error: + error = sql_parsing_result.debug_info.error + run_facets["extractionError"] = ExtractionErrorRunFacet( + totalTasks=1, + failedTasks=1, + errors=[ + ExtractionError( + errorMessage=str(error), + stackTrace=None, + task="datahub_sql_parser", + taskNumber=None, + ) + ], + ) + + # Save sql_parsing_result to the facets dict. It is removed from the + # facet dict in the extractor's processing logic. 
+ run_facets[SQL_PARSING_RESULT_KEY] = sql_parsing_result # type: ignore + + return TaskMetadata( + name=task_name, + inputs=[], + outputs=[], + run_facets=run_facets, + job_facets=job_facets, + ) + + +def snowflake_default_schema(self: "SnowflakeExtractor") -> Optional[str]: + if hasattr(self.operator, "schema") and self.operator.schema is not None: + return self.operator.schema + return ( + self.conn.extra_dejson.get("extra__snowflake__schema", "") + or self.conn.extra_dejson.get("schema", "") + or self.conn.schema + ) + # TODO: Should we try a fallback of: + # execute_query_on_hook(self.hook, "SELECT current_schema();")[0][0] diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/client/airflow_generator.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/client/airflow_generator.py index b5e86e14d85d0..16585f70e820b 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/client/airflow_generator.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/client/airflow_generator.py @@ -1,4 +1,5 @@ -from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union, cast +from datetime import datetime +from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union, cast from airflow.configuration import conf from datahub.api.entities.datajob import DataFlow, DataJob @@ -6,6 +7,7 @@ DataProcessInstance, InstanceRunResult, ) +from datahub.emitter.generic_emitter import Emitter from datahub.metadata.schema_classes import DataProcessTypeClass from datahub.utilities.urns.data_flow_urn import DataFlowUrn from datahub.utilities.urns.data_job_urn import DataJobUrn @@ -17,8 +19,6 @@ if TYPE_CHECKING: from airflow import DAG from airflow.models import DagRun, TaskInstance - from datahub.emitter.kafka_emitter import DatahubKafkaEmitter - from datahub.emitter.rest_emitter import DatahubRestEmitter from datahub_airflow_plugin._airflow_shims import Operator @@ -91,7 +91,7 
@@ def _get_dependencies( ) # if the task triggers the subdag, link it to this node in the subdag - if subdag_task_id in _task_downstream_task_ids(upstream_task): + if subdag_task_id in sorted(_task_downstream_task_ids(upstream_task)): upstream_subdag_triggers.append(upstream_task_urn) # If the operator is an ExternalTaskSensor then we set the remote task as upstream. @@ -143,7 +143,7 @@ def generate_dataflow( """ id = dag.dag_id orchestrator = "airflow" - description = f"{dag.description}\n\n{dag.doc_md or ''}" + description = "\n\n".join(filter(None, [dag.description, dag.doc_md])) or None data_flow = DataFlow( env=cluster, id=id, orchestrator=orchestrator, description=description ) @@ -153,7 +153,7 @@ def generate_dataflow( allowed_flow_keys = [ "_access_control", "_concurrency", - "_default_view", + # "_default_view", "catchup", "fileloc", "is_paused_upon_creation", @@ -171,7 +171,7 @@ def generate_dataflow( data_flow.url = f"{base_url}/tree?dag_id={dag.dag_id}" if capture_owner and dag.owner: - data_flow.owners.add(dag.owner) + data_flow.owners.update(owner.strip() for owner in dag.owner.split(",")) if capture_tags and dag.tags: data_flow.tags.update(dag.tags) @@ -227,10 +227,7 @@ def generate_datajob( job_property_bag: Dict[str, str] = {} - allowed_task_keys = [ - "_downstream_task_ids", - "_inlets", - "_outlets", + allowed_task_keys: List[Union[str, Tuple[str, ...]]] = [ "_task_type", "_task_module", "depends_on_past", @@ -243,15 +240,28 @@ def generate_datajob( "trigger_rule", "wait_for_downstream", # In Airflow 2.3, _downstream_task_ids was renamed to downstream_task_ids - "downstream_task_ids", + ("downstream_task_ids", "_downstream_task_ids"), # In Airflow 2.4, _inlets and _outlets were removed in favor of non-private versions. 
- "inlets", - "outlets", + ("inlets", "_inlets"), + ("outlets", "_outlets"), ] for key in allowed_task_keys: - if hasattr(task, key): - job_property_bag[key] = repr(getattr(task, key)) + if isinstance(key, tuple): + out_key: str = key[0] + try_keys = key + else: + out_key = key + try_keys = (key,) + + for k in try_keys: + if hasattr(task, k): + v = getattr(task, k) + if out_key == "downstream_task_ids": + # Generate these in a consistent order. + v = list(sorted(v)) + job_property_bag[out_key] = repr(v) + break datajob.properties = job_property_bag base_url = conf.get("webserver", "base_url") @@ -288,7 +298,7 @@ def create_datajob_instance( @staticmethod def run_dataflow( - emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], + emitter: Emitter, cluster: str, dag_run: "DagRun", start_timestamp_millis: Optional[int] = None, @@ -340,7 +350,7 @@ def run_dataflow( @staticmethod def complete_dataflow( - emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], + emitter: Emitter, cluster: str, dag_run: "DagRun", end_timestamp_millis: Optional[int] = None, @@ -348,7 +358,7 @@ def complete_dataflow( ) -> None: """ - :param emitter: DatahubRestEmitter - the datahub rest emitter to emit the generated mcps + :param emitter: Emitter - the datahub emitter to emit the generated mcps :param cluster: str - name of the cluster :param dag_run: DagRun :param end_timestamp_millis: Optional[int] - the completion time in milliseconds if not set the current time will be used. 
@@ -386,7 +396,7 @@ def complete_dataflow( @staticmethod def run_datajob( - emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], + emitter: Emitter, cluster: str, ti: "TaskInstance", dag: "DAG", @@ -413,16 +423,13 @@ def run_datajob( job_property_bag["end_date"] = str(ti.end_date) job_property_bag["execution_date"] = str(ti.execution_date) job_property_bag["try_number"] = str(ti.try_number - 1) - job_property_bag["hostname"] = str(ti.hostname) job_property_bag["max_tries"] = str(ti.max_tries) # Not compatible with Airflow 1 if hasattr(ti, "external_executor_id"): job_property_bag["external_executor_id"] = str(ti.external_executor_id) - job_property_bag["pid"] = str(ti.pid) job_property_bag["state"] = str(ti.state) job_property_bag["operator"] = str(ti.operator) job_property_bag["priority_weight"] = str(ti.priority_weight) - job_property_bag["unixname"] = str(ti.unixname) job_property_bag["log_url"] = ti.log_url dpi.properties.update(job_property_bag) dpi.url = ti.log_url @@ -442,8 +449,10 @@ def run_datajob( dpi.type = DataProcessTypeClass.BATCH_AD_HOC if start_timestamp_millis is None: - assert ti.start_date - start_timestamp_millis = int(ti.start_date.timestamp() * 1000) + if ti.start_date: + start_timestamp_millis = int(ti.start_date.timestamp() * 1000) + else: + start_timestamp_millis = int(datetime.now().timestamp() * 1000) if attempt is None: attempt = ti.try_number @@ -458,7 +467,7 @@ def run_datajob( @staticmethod def complete_datajob( - emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], + emitter: Emitter, cluster: str, ti: "TaskInstance", dag: "DAG", @@ -469,7 +478,7 @@ def complete_datajob( ) -> DataProcessInstance: """ - :param emitter: DatahubRestEmitter + :param emitter: Emitter - the datahub emitter to emit the generated mcps :param cluster: str :param ti: TaskInstance :param dag: DAG @@ -483,8 +492,10 @@ def complete_datajob( datajob = AirflowGenerator.generate_datajob(cluster, ti.task, dag) if end_timestamp_millis is None: - 
assert ti.end_date - end_timestamp_millis = int(ti.end_date.timestamp() * 1000) + if ti.end_date: + end_timestamp_millis = int(ti.end_date.timestamp() * 1000) + else: + end_timestamp_millis = int(datetime.now().timestamp() * 1000) if result is None: # We should use TaskInstanceState but it is not available in Airflow 1 diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py new file mode 100644 index 0000000000000..a3f5cb489e29f --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py @@ -0,0 +1,494 @@ +import copy +import functools +import logging +import threading +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, TypeVar, cast + +import airflow +import datahub.emitter.mce_builder as builder +from datahub.api.entities.datajob import DataJob +from datahub.api.entities.dataprocess.dataprocess_instance import InstanceRunResult +from datahub.emitter.rest_emitter import DatahubRestEmitter +from datahub.ingestion.graph.client import DataHubGraph +from datahub.metadata.schema_classes import ( + FineGrainedLineageClass, + FineGrainedLineageDownstreamTypeClass, + FineGrainedLineageUpstreamTypeClass, +) +from datahub.telemetry import telemetry +from datahub.utilities.sqlglot_lineage import SqlParsingResult +from datahub.utilities.urns.dataset_urn import DatasetUrn +from openlineage.airflow.listener import TaskHolder +from openlineage.airflow.utils import redact_with_exclusions +from openlineage.client.serde import Serde + +from datahub_airflow_plugin._airflow_shims import ( + HAS_AIRFLOW_DAG_LISTENER_API, + Operator, + get_task_inlets, + get_task_outlets, +) +from datahub_airflow_plugin._config import DatahubLineageConfig, get_lineage_config +from datahub_airflow_plugin._datahub_ol_adapter import translate_ol_to_datahub_urn +from datahub_airflow_plugin._extractors import 
SQL_PARSING_RESULT_KEY, ExtractorManager +from datahub_airflow_plugin.client.airflow_generator import AirflowGenerator +from datahub_airflow_plugin.entities import _Entity + +_F = TypeVar("_F", bound=Callable[..., None]) +if TYPE_CHECKING: + from airflow.models import DAG, DagRun, TaskInstance + from sqlalchemy.orm import Session + + # To placate mypy on Airflow versions that don't have the listener API, + # we define a dummy hookimpl that's an identity function. + + def hookimpl(f: _F) -> _F: # type: ignore[misc] # noqa: F811 + return f + +else: + from airflow.listeners import hookimpl + +logger = logging.getLogger(__name__) + +_airflow_listener_initialized = False +_airflow_listener: Optional["DataHubListener"] = None +_RUN_IN_THREAD = True +_RUN_IN_THREAD_TIMEOUT = 30 + + +def get_airflow_plugin_listener() -> Optional["DataHubListener"]: + # Using globals instead of functools.lru_cache to make testing easier. + global _airflow_listener_initialized + global _airflow_listener + + if not _airflow_listener_initialized: + _airflow_listener_initialized = True + + plugin_config = get_lineage_config() + + if plugin_config.enabled: + _airflow_listener = DataHubListener(config=plugin_config) + + if plugin_config.disable_openlineage_plugin: + # Deactivate the OpenLineagePlugin listener to avoid conflicts. 
+ from openlineage.airflow.plugin import OpenLineagePlugin + + OpenLineagePlugin.listeners = [] + + telemetry.telemetry_instance.ping( + "airflow-plugin-init", + { + "airflow-version": airflow.__version__, + "datahub-airflow-plugin": "v2", + "datahub-airflow-plugin-dag-events": HAS_AIRFLOW_DAG_LISTENER_API, + "capture_executions": plugin_config.capture_executions, + "capture_tags": plugin_config.capture_tags_info, + "capture_ownership": plugin_config.capture_ownership_info, + "enable_extractors": plugin_config.enable_extractors, + "disable_openlineage_plugin": plugin_config.disable_openlineage_plugin, + }, + ) + return _airflow_listener + + +def run_in_thread(f: _F) -> _F: + # This is also responsible for catching exceptions and logging them. + + @functools.wraps(f) + def wrapper(*args, **kwargs): + try: + if _RUN_IN_THREAD: + # A poor-man's timeout mechanism. + # This ensures that we don't hang the task if the extractors + # are slow or the DataHub API is slow to respond. + + thread = threading.Thread( + target=f, args=args, kwargs=kwargs, daemon=True + ) + thread.start() + + thread.join(timeout=_RUN_IN_THREAD_TIMEOUT) + if thread.is_alive(): + logger.warning( + f"Thread for {f.__name__} is still running after {_RUN_IN_THREAD_TIMEOUT} seconds. " + "Continuing without waiting for it to finish." + ) + else: + f(*args, **kwargs) + except Exception as e: + logger.exception(e) + + return cast(_F, wrapper) + + +class DataHubListener: + __name__ = "DataHubListener" + + def __init__(self, config: DatahubLineageConfig): + self.config = config + self._set_log_level() + + self._emitter = config.make_emitter_hook().make_emitter() + self._graph: Optional[DataHubGraph] = None + logger.info(f"DataHub plugin using {repr(self._emitter)}") + + # See discussion here https://github.com/OpenLineage/OpenLineage/pull/508 for + # why we need to keep track of tasks ourselves. 
+ self._task_holder = TaskHolder() + + # In our case, we also want to cache the initial datajob object + # so that we can add to it when the task completes. + self._datajob_holder: Dict[str, DataJob] = {} + + self.extractor_manager = ExtractorManager() + + # This "inherits" from types.ModuleType to avoid issues with Airflow's listener plugin loader. + # It previously (v2.4.x and likely other versions too) would throw errors if it was not a module. + # https://github.com/apache/airflow/blob/e99a518970b2d349a75b1647f6b738c8510fa40e/airflow/listeners/listener.py#L56 + # self.__class__ = types.ModuleType + + @property + def emitter(self): + return self._emitter + + @property + def graph(self) -> Optional[DataHubGraph]: + if self._graph: + return self._graph + + if isinstance(self._emitter, DatahubRestEmitter) and not isinstance( + self._emitter, DataHubGraph + ): + # This is lazy initialized to avoid throwing errors on plugin load. + self._graph = self._emitter.to_graph() + self._emitter = self._graph + + return self._graph + + def _set_log_level(self) -> None: + """Set the log level for the plugin and its dependencies. + + This may need to be called multiple times, since Airflow sometimes + messes with the logging configuration after the plugin is loaded. + In particular, the loggers may get changed when the worker starts + executing a task. 
+ """ + + if self.config.log_level: + logging.getLogger(__name__.split(".")[0]).setLevel(self.config.log_level) + if self.config.debug_emitter: + logging.getLogger("datahub.emitter").setLevel(logging.DEBUG) + + def _make_emit_callback(self) -> Callable[[Optional[Exception], str], None]: + def emit_callback(err: Optional[Exception], msg: str) -> None: + if err: + logger.error(f"Error sending metadata to datahub: {msg}", exc_info=err) + + return emit_callback + + def _extract_lineage( + self, + datajob: DataJob, + dagrun: "DagRun", + task: "Operator", + task_instance: "TaskInstance", + complete: bool = False, + ) -> None: + """ + Combine lineage (including column lineage) from task inlets/outlets and + extractor-generated task_metadata and write it to the datajob. This + routine is also responsible for converting the lineage to DataHub URNs. + """ + + input_urns: List[str] = [] + output_urns: List[str] = [] + fine_grained_lineages: List[FineGrainedLineageClass] = [] + + task_metadata = None + if self.config.enable_extractors: + task_metadata = self.extractor_manager.extract_metadata( + dagrun, + task, + complete=complete, + task_instance=task_instance, + task_uuid=str(datajob.urn), + graph=self.graph, + ) + logger.debug(f"Got task metadata: {task_metadata}") + + # Translate task_metadata.inputs/outputs to DataHub URNs. + input_urns.extend( + translate_ol_to_datahub_urn(dataset) for dataset in task_metadata.inputs + ) + output_urns.extend( + translate_ol_to_datahub_urn(dataset) + for dataset in task_metadata.outputs + ) + + # Add DataHub-native SQL parser results. 
+ sql_parsing_result: Optional[SqlParsingResult] = None + if task_metadata: + sql_parsing_result = task_metadata.run_facets.pop( + SQL_PARSING_RESULT_KEY, None + ) + if sql_parsing_result: + if sql_parsing_result.debug_info.error: + datajob.properties["datahub_sql_parser_error"] = str( + sql_parsing_result.debug_info.error + ) + if not sql_parsing_result.debug_info.table_error: + input_urns.extend(sql_parsing_result.in_tables) + output_urns.extend(sql_parsing_result.out_tables) + + if sql_parsing_result.column_lineage: + fine_grained_lineages.extend( + FineGrainedLineageClass( + upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET, + downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD, + upstreams=[ + builder.make_schema_field_urn( + upstream.table, upstream.column + ) + for upstream in column_lineage.upstreams + ], + downstreams=[ + builder.make_schema_field_urn( + downstream.table, downstream.column + ) + for downstream in [column_lineage.downstream] + if downstream.table + ], + ) + for column_lineage in sql_parsing_result.column_lineage + ) + + # Add DataHub-native inlets/outlets. + # These are filtered out by the extractor, so we need to add them manually. + input_urns.extend( + iolet.urn for iolet in get_task_inlets(task) if isinstance(iolet, _Entity) + ) + output_urns.extend( + iolet.urn for iolet in get_task_outlets(task) if isinstance(iolet, _Entity) + ) + + # Write the lineage to the datajob object. + datajob.inlets.extend(DatasetUrn.create_from_string(urn) for urn in input_urns) + datajob.outlets.extend( + DatasetUrn.create_from_string(urn) for urn in output_urns + ) + datajob.fine_grained_lineages.extend(fine_grained_lineages) + + # Merge in extra stuff that was present in the DataJob we constructed + # at the start of the task. 
+ if complete: + original_datajob = self._datajob_holder.get(str(datajob.urn), None) + else: + self._datajob_holder[str(datajob.urn)] = datajob + original_datajob = None + + if original_datajob: + logger.debug("Merging start datajob into finish datajob") + datajob.inlets.extend(original_datajob.inlets) + datajob.outlets.extend(original_datajob.outlets) + datajob.fine_grained_lineages.extend(original_datajob.fine_grained_lineages) + + for k, v in original_datajob.properties.items(): + datajob.properties.setdefault(k, v) + + # Deduplicate inlets/outlets. + datajob.inlets = list(sorted(set(datajob.inlets), key=lambda x: str(x))) + datajob.outlets = list(sorted(set(datajob.outlets), key=lambda x: str(x))) + + # Write all other OL facets as DataHub properties. + if task_metadata: + for k, v in task_metadata.job_facets.items(): + datajob.properties[f"openlineage_job_facet_{k}"] = Serde.to_json( + redact_with_exclusions(v) + ) + + for k, v in task_metadata.run_facets.items(): + datajob.properties[f"openlineage_run_facet_{k}"] = Serde.to_json( + redact_with_exclusions(v) + ) + + @hookimpl + @run_in_thread + def on_task_instance_running( + self, + previous_state: None, + task_instance: "TaskInstance", + session: "Session", # This will always be QUEUED + ) -> None: + self._set_log_level() + + # This if statement mirrors the logic in https://github.com/OpenLineage/OpenLineage/pull/508. + if not hasattr(task_instance, "task"): + # The type ignore is to placate mypy on Airflow 2.1.x. + logger.warning( + f"No task set for task_id: {task_instance.task_id} - " # type: ignore[attr-defined] + f"dag_id: {task_instance.dag_id} - run_id {task_instance.run_id}" # type: ignore[attr-defined] + ) + return + + logger.debug( + f"DataHub listener got notification about task instance start for {task_instance.task_id}" + ) + + # Render templates in a copy of the task instance. + # This is necessary to get the correct operator args in the extractors. 
+ task_instance = copy.deepcopy(task_instance) + task_instance.render_templates() + + # The type ignore is to placate mypy on Airflow 2.1.x. + dagrun: "DagRun" = task_instance.dag_run # type: ignore[attr-defined] + task = task_instance.task + dag: "DAG" = task.dag # type: ignore[assignment] + + self._task_holder.set_task(task_instance) + + # Handle async operators in Airflow 2.3 by skipping deferred state. + # Inspired by https://github.com/OpenLineage/OpenLineage/pull/1601 + if task_instance.next_method is not None: # type: ignore[attr-defined] + return + + # If we don't have the DAG listener API, we just pretend that + # the start of the task is the start of the DAG. + # This generates duplicate events, but it's better than not + # generating anything. + if not HAS_AIRFLOW_DAG_LISTENER_API: + self.on_dag_start(dagrun) + + datajob = AirflowGenerator.generate_datajob( + cluster=self.config.cluster, + task=task, + dag=dag, + capture_tags=self.config.capture_tags_info, + capture_owner=self.config.capture_ownership_info, + ) + + # TODO: Make use of get_task_location to extract github urls. + + # Add lineage info. 
+ self._extract_lineage(datajob, dagrun, task, task_instance) + + # TODO: Add handling for Airflow mapped tasks using task_instance.map_index + + datajob.emit(self.emitter, callback=self._make_emit_callback()) + logger.debug(f"Emitted DataHub Datajob start: {datajob}") + + if self.config.capture_executions: + dpi = AirflowGenerator.run_datajob( + emitter=self.emitter, + cluster=self.config.cluster, + ti=task_instance, + dag=dag, + dag_run=dagrun, + datajob=datajob, + emit_templates=False, + ) + logger.debug(f"Emitted DataHub DataProcess Instance start: {dpi}") + + self.emitter.flush() + + logger.debug( + f"DataHub listener finished processing notification about task instance start for {task_instance.task_id}" + ) + + def on_task_instance_finish( + self, task_instance: "TaskInstance", status: InstanceRunResult + ) -> None: + dagrun: "DagRun" = task_instance.dag_run # type: ignore[attr-defined] + task = self._task_holder.get_task(task_instance) or task_instance.task + dag: "DAG" = task.dag # type: ignore[assignment] + + datajob = AirflowGenerator.generate_datajob( + cluster=self.config.cluster, + task=task, + dag=dag, + capture_tags=self.config.capture_tags_info, + capture_owner=self.config.capture_ownership_info, + ) + + # Add lineage info. 
+ self._extract_lineage(datajob, dagrun, task, task_instance, complete=True) + + datajob.emit(self.emitter, callback=self._make_emit_callback()) + logger.debug(f"Emitted DataHub Datajob finish w/ status {status}: {datajob}") + + if self.config.capture_executions: + dpi = AirflowGenerator.complete_datajob( + emitter=self.emitter, + cluster=self.config.cluster, + ti=task_instance, + dag=dag, + dag_run=dagrun, + datajob=datajob, + result=status, + ) + logger.debug( + f"Emitted DataHub DataProcess Instance with status {status}: {dpi}" + ) + + self.emitter.flush() + + @hookimpl + @run_in_thread + def on_task_instance_success( + self, previous_state: None, task_instance: "TaskInstance", session: "Session" + ) -> None: + self._set_log_level() + + logger.debug( + f"DataHub listener got notification about task instance success for {task_instance.task_id}" + ) + self.on_task_instance_finish(task_instance, status=InstanceRunResult.SUCCESS) + logger.debug( + f"DataHub listener finished processing task instance success for {task_instance.task_id}" + ) + + @hookimpl + @run_in_thread + def on_task_instance_failed( + self, previous_state: None, task_instance: "TaskInstance", session: "Session" + ) -> None: + self._set_log_level() + + logger.debug( + f"DataHub listener got notification about task instance failure for {task_instance.task_id}" + ) + + # TODO: Handle UP_FOR_RETRY state. 
+ self.on_task_instance_finish(task_instance, status=InstanceRunResult.FAILURE) + logger.debug( + f"DataHub listener finished processing task instance failure for {task_instance.task_id}" + ) + + def on_dag_start(self, dag_run: "DagRun") -> None: + dag = dag_run.dag + if not dag: + return + + dataflow = AirflowGenerator.generate_dataflow( + cluster=self.config.cluster, + dag=dag, + capture_tags=self.config.capture_tags_info, + capture_owner=self.config.capture_ownership_info, + ) + dataflow.emit(self.emitter, callback=self._make_emit_callback()) + + if HAS_AIRFLOW_DAG_LISTENER_API: + + @hookimpl + @run_in_thread + def on_dag_run_running(self, dag_run: "DagRun", msg: str) -> None: + self._set_log_level() + + logger.debug( + f"DataHub listener got notification about dag run start for {dag_run.dag_id}" + ) + + self.on_dag_start(dag_run) + + self.emitter.flush() + + # TODO: Add hooks for on_dag_run_success, on_dag_run_failed -> call AirflowGenerator.complete_dataflow diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin.py index d1cec9e5c1b54..c96fab31647f5 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin.py @@ -1,367 +1,74 @@ import contextlib import logging -import traceback -from typing import Any, Callable, Iterable, List, Optional, Union +import os -from airflow.configuration import conf -from airflow.lineage import PIPELINE_OUTLETS -from airflow.models.baseoperator import BaseOperator from airflow.plugins_manager import AirflowPlugin -from airflow.utils.module_loading import import_string -from cattr import structure -from datahub.api.entities.dataprocess.dataprocess_instance import InstanceRunResult from datahub_airflow_plugin._airflow_compat import AIRFLOW_PATCHED -from 
datahub_airflow_plugin._airflow_shims import MappedOperator, Operator -from datahub_airflow_plugin.client.airflow_generator import AirflowGenerator -from datahub_airflow_plugin.hooks.datahub import DatahubGenericHook -from datahub_airflow_plugin.lineage.datahub import DatahubLineageConfig +from datahub_airflow_plugin._airflow_shims import ( + HAS_AIRFLOW_DAG_LISTENER_API, + HAS_AIRFLOW_LISTENER_API, +) assert AIRFLOW_PATCHED logger = logging.getLogger(__name__) -TASK_ON_FAILURE_CALLBACK = "on_failure_callback" -TASK_ON_SUCCESS_CALLBACK = "on_success_callback" +_USE_AIRFLOW_LISTENER_INTERFACE = HAS_AIRFLOW_LISTENER_API and not os.getenv( + "DATAHUB_AIRFLOW_PLUGIN_USE_V1_PLUGIN", "false" +).lower() in ("true", "1") -def get_lineage_config() -> DatahubLineageConfig: - """Load the lineage config from airflow.cfg.""" +if _USE_AIRFLOW_LISTENER_INTERFACE: + try: + from openlineage.airflow.utils import try_import_from_string # noqa: F401 + except ImportError: + # If v2 plugin dependencies are not installed, we fall back to v1. 
+ logger.debug("Falling back to v1 plugin due to missing dependencies.") + _USE_AIRFLOW_LISTENER_INTERFACE = False - enabled = conf.get("datahub", "enabled", fallback=True) - datahub_conn_id = conf.get("datahub", "conn_id", fallback="datahub_rest_default") - cluster = conf.get("datahub", "cluster", fallback="prod") - graceful_exceptions = conf.get("datahub", "graceful_exceptions", fallback=True) - capture_tags_info = conf.get("datahub", "capture_tags_info", fallback=True) - capture_ownership_info = conf.get( - "datahub", "capture_ownership_info", fallback=True - ) - capture_executions = conf.get("datahub", "capture_executions", fallback=True) - return DatahubLineageConfig( - enabled=enabled, - datahub_conn_id=datahub_conn_id, - cluster=cluster, - graceful_exceptions=graceful_exceptions, - capture_ownership_info=capture_ownership_info, - capture_tags_info=capture_tags_info, - capture_executions=capture_executions, - ) +with contextlib.suppress(Exception): + if not os.getenv("DATAHUB_AIRFLOW_PLUGIN_SKIP_FORK_PATCH", "false").lower() in ( + "true", + "1", + ): + # From https://github.com/apache/airflow/discussions/24463#discussioncomment-4404542 + # I'm not exactly sure why this fixes it, but I suspect it's that this + # forces the proxy settings to get cached before the fork happens. + # + # For more details, see https://github.com/python/cpython/issues/58037 + # and https://wefearchange.org/2018/11/forkmacos.rst.html + # and https://bugs.python.org/issue30385#msg293958 + # An alternative fix is to set NO_PROXY='*' -def _task_inlets(operator: "Operator") -> List: - # From Airflow 2.4 _inlets is dropped and inlets used consistently. 
Earlier it was not the case, so we have to stick there to _inlets - if hasattr(operator, "_inlets"): - return operator._inlets # type: ignore[attr-defined, union-attr] - return operator.inlets + from _scproxy import _get_proxy_settings + _get_proxy_settings() -def _task_outlets(operator: "Operator") -> List: - # From Airflow 2.4 _outlets is dropped and inlets used consistently. Earlier it was not the case, so we have to stick there to _outlets - # We have to use _outlets because outlets is empty in Airflow < 2.4.0 - if hasattr(operator, "_outlets"): - return operator._outlets # type: ignore[attr-defined, union-attr] - return operator.outlets +class DatahubPlugin(AirflowPlugin): + name = "datahub_plugin" -def get_inlets_from_task(task: BaseOperator, context: Any) -> Iterable[Any]: - # TODO: Fix for https://github.com/apache/airflow/commit/1b1f3fabc5909a447a6277cafef3a0d4ef1f01ae - # in Airflow 2.4. - # TODO: ignore/handle airflow's dataset type in our lineage - - inlets: List[Any] = [] - task_inlets = _task_inlets(task) - # From Airflow 2.3 this should be AbstractOperator but due to compatibility reason lets use BaseOperator - if isinstance(task_inlets, (str, BaseOperator)): - inlets = [ - task_inlets, - ] - - if task_inlets and isinstance(task_inlets, list): - inlets = [] - task_ids = ( - {o for o in task_inlets if isinstance(o, str)} - .union(op.task_id for op in task_inlets if isinstance(op, BaseOperator)) - .intersection(task.get_flat_relative_ids(upstream=True)) - ) - - from airflow.lineage import AUTO - - # pick up unique direct upstream task_ids if AUTO is specified - if AUTO.upper() in task_inlets or AUTO.lower() in task_inlets: - print("Picking up unique direct upstream task_ids as AUTO is specified") - task_ids = task_ids.union( - task_ids.symmetric_difference(task.upstream_task_ids) - ) - - inlets = task.xcom_pull( - context, task_ids=list(task_ids), dag_id=task.dag_id, key=PIPELINE_OUTLETS - ) - - # re-instantiate the obtained inlets - inlets = [ - 
structure(item["data"], import_string(item["type_name"])) - # _get_instance(structure(item, Metadata)) - for sublist in inlets - if sublist - for item in sublist - ] - - for inlet in task_inlets: - if not isinstance(inlet, str): - inlets.append(inlet) - - return inlets - - -def _make_emit_callback( - logger: logging.Logger, -) -> Callable[[Optional[Exception], str], None]: - def emit_callback(err: Optional[Exception], msg: str) -> None: - if err: - logger.error(f"Error sending metadata to datahub: {msg}", exc_info=err) - - return emit_callback - - -def datahub_task_status_callback(context, status): - ti = context["ti"] - task: "BaseOperator" = ti.task - dag = context["dag"] - - # This code is from the original airflow lineage code -> - # https://github.com/apache/airflow/blob/main/airflow/lineage/__init__.py - inlets = get_inlets_from_task(task, context) - - emitter = ( - DatahubGenericHook(context["_datahub_config"].datahub_conn_id) - .get_underlying_hook() - .make_emitter() - ) - - dataflow = AirflowGenerator.generate_dataflow( - cluster=context["_datahub_config"].cluster, - dag=dag, - capture_tags=context["_datahub_config"].capture_tags_info, - capture_owner=context["_datahub_config"].capture_ownership_info, - ) - task.log.info(f"Emitting Datahub Dataflow: {dataflow}") - dataflow.emit(emitter, callback=_make_emit_callback(task.log)) - - datajob = AirflowGenerator.generate_datajob( - cluster=context["_datahub_config"].cluster, - task=task, - dag=dag, - capture_tags=context["_datahub_config"].capture_tags_info, - capture_owner=context["_datahub_config"].capture_ownership_info, - ) - - for inlet in inlets: - datajob.inlets.append(inlet.urn) - - task_outlets = _task_outlets(task) - for outlet in task_outlets: - datajob.outlets.append(outlet.urn) - - task.log.info(f"Emitting Datahub Datajob: {datajob}") - datajob.emit(emitter, callback=_make_emit_callback(task.log)) - - if context["_datahub_config"].capture_executions: - dpi = AirflowGenerator.run_datajob( - 
emitter=emitter, - cluster=context["_datahub_config"].cluster, - ti=context["ti"], - dag=dag, - dag_run=context["dag_run"], - datajob=datajob, - start_timestamp_millis=int(ti.start_date.timestamp() * 1000), - ) - - task.log.info(f"Emitted Start Datahub Dataprocess Instance: {dpi}") - - dpi = AirflowGenerator.complete_datajob( - emitter=emitter, - cluster=context["_datahub_config"].cluster, - ti=context["ti"], - dag_run=context["dag_run"], - result=status, - dag=dag, - datajob=datajob, - end_timestamp_millis=int(ti.end_date.timestamp() * 1000), - ) - task.log.info(f"Emitted Completed Data Process Instance: {dpi}") - - emitter.flush() - - -def datahub_pre_execution(context): - ti = context["ti"] - task: "BaseOperator" = ti.task - dag = context["dag"] - - task.log.info("Running Datahub pre_execute method") - - emitter = ( - DatahubGenericHook(context["_datahub_config"].datahub_conn_id) - .get_underlying_hook() - .make_emitter() - ) - - # This code is from the original airflow lineage code -> - # https://github.com/apache/airflow/blob/main/airflow/lineage/__init__.py - inlets = get_inlets_from_task(task, context) - - datajob = AirflowGenerator.generate_datajob( - cluster=context["_datahub_config"].cluster, - task=context["ti"].task, - dag=dag, - capture_tags=context["_datahub_config"].capture_tags_info, - capture_owner=context["_datahub_config"].capture_ownership_info, - ) - - for inlet in inlets: - datajob.inlets.append(inlet.urn) - - task_outlets = _task_outlets(task) - - for outlet in task_outlets: - datajob.outlets.append(outlet.urn) - - task.log.info(f"Emitting Datahub dataJob {datajob}") - datajob.emit(emitter, callback=_make_emit_callback(task.log)) - - if context["_datahub_config"].capture_executions: - dpi = AirflowGenerator.run_datajob( - emitter=emitter, - cluster=context["_datahub_config"].cluster, - ti=context["ti"], - dag=dag, - dag_run=context["dag_run"], - datajob=datajob, - start_timestamp_millis=int(ti.start_date.timestamp() * 1000), - ) - - 
task.log.info(f"Emitting Datahub Dataprocess Instance: {dpi}") - - emitter.flush() - - -def _wrap_pre_execution(pre_execution): - def custom_pre_execution(context): - config = get_lineage_config() - if config.enabled: - context["_datahub_config"] = config - datahub_pre_execution(context) - - # Call original policy - if pre_execution: - pre_execution(context) - - return custom_pre_execution - - -def _wrap_on_failure_callback(on_failure_callback): - def custom_on_failure_callback(context): - config = get_lineage_config() - if config.enabled: - context["_datahub_config"] = config - try: - datahub_task_status_callback(context, status=InstanceRunResult.FAILURE) - except Exception as e: - if not config.graceful_exceptions: - raise e - else: - print(f"Exception: {traceback.format_exc()}") - - # Call original policy - if on_failure_callback: - on_failure_callback(context) - - return custom_on_failure_callback - - -def _wrap_on_success_callback(on_success_callback): - def custom_on_success_callback(context): - config = get_lineage_config() - if config.enabled: - context["_datahub_config"] = config - try: - datahub_task_status_callback(context, status=InstanceRunResult.SUCCESS) - except Exception as e: - if not config.graceful_exceptions: - raise e - else: - print(f"Exception: {traceback.format_exc()}") - - # Call original policy - if on_success_callback: - on_success_callback(context) - - return custom_on_success_callback - - -def task_policy(task: Union[BaseOperator, MappedOperator]) -> None: - task.log.debug(f"Setting task policy for Dag: {task.dag_id} Task: {task.task_id}") - # task.add_inlets(["auto"]) - # task.pre_execute = _wrap_pre_execution(task.pre_execute) - - # MappedOperator's callbacks don't have setters until Airflow 2.X.X - # https://github.com/apache/airflow/issues/24547 - # We can bypass this by going through partial_kwargs for now - if MappedOperator and isinstance(task, MappedOperator): # type: ignore - on_failure_callback_prop: property = getattr( - 
MappedOperator, TASK_ON_FAILURE_CALLBACK - ) - on_success_callback_prop: property = getattr( - MappedOperator, TASK_ON_SUCCESS_CALLBACK - ) - if not on_failure_callback_prop.fset or not on_success_callback_prop.fset: - task.log.debug( - "Using MappedOperator's partial_kwargs instead of callback properties" - ) - task.partial_kwargs[TASK_ON_FAILURE_CALLBACK] = _wrap_on_failure_callback( - task.on_failure_callback + if _USE_AIRFLOW_LISTENER_INTERFACE: + if HAS_AIRFLOW_DAG_LISTENER_API: + from datahub_airflow_plugin.datahub_listener import ( # type: ignore[misc] + get_airflow_plugin_listener, ) - task.partial_kwargs[TASK_ON_SUCCESS_CALLBACK] = _wrap_on_success_callback( - task.on_success_callback - ) - return - - task.on_failure_callback = _wrap_on_failure_callback(task.on_failure_callback) # type: ignore - task.on_success_callback = _wrap_on_success_callback(task.on_success_callback) # type: ignore - # task.pre_execute = _wrap_pre_execution(task.pre_execute) - - -def _wrap_task_policy(policy): - if policy and hasattr(policy, "_task_policy_patched_by"): - return policy - - def custom_task_policy(task): - policy(task) - task_policy(task) - - # Add a flag to the policy to indicate that we've patched it. - custom_task_policy._task_policy_patched_by = "datahub_plugin" # type: ignore[attr-defined] - return custom_task_policy + listeners: list = list(filter(None, [get_airflow_plugin_listener()])) -def _patch_policy(settings): - if hasattr(settings, "task_policy"): - datahub_task_policy = _wrap_task_policy(settings.task_policy) - settings.task_policy = datahub_task_policy + else: + # On Airflow < 2.5, we need the listener to be a module. + # This is just a quick shim layer to make that work. + # The DAG listener API was added at the same time as this method + # was fixed, so we're reusing the same check variable. + # + # Related Airflow change: https://github.com/apache/airflow/pull/27113. 
+ import datahub_airflow_plugin._datahub_listener_module as _listener_module # type: ignore[misc] + listeners = [_listener_module] -def _patch_datahub_policy(): - with contextlib.suppress(ImportError): - import airflow_local_settings - _patch_policy(airflow_local_settings) - - from airflow.models.dagbag import settings - - _patch_policy(settings) - - -_patch_datahub_policy() - - -class DatahubPlugin(AirflowPlugin): - name = "datahub_plugin" +if not _USE_AIRFLOW_LISTENER_INTERFACE: + # Use the policy patcher mechanism on Airflow 2.2 and below. + import datahub_airflow_plugin.datahub_plugin_v22 # noqa: F401 diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin_v22.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin_v22.py new file mode 100644 index 0000000000000..046fbb5efaa03 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin_v22.py @@ -0,0 +1,336 @@ +import contextlib +import logging +import traceback +from typing import Any, Callable, Iterable, List, Optional, Union + +import airflow +from airflow.lineage import PIPELINE_OUTLETS +from airflow.models.baseoperator import BaseOperator +from airflow.utils.module_loading import import_string +from cattr import structure +from datahub.api.entities.dataprocess.dataprocess_instance import InstanceRunResult +from datahub.telemetry import telemetry + +from datahub_airflow_plugin._airflow_shims import ( + MappedOperator, + get_task_inlets, + get_task_outlets, +) +from datahub_airflow_plugin._config import get_lineage_config +from datahub_airflow_plugin.client.airflow_generator import AirflowGenerator +from datahub_airflow_plugin.hooks.datahub import DatahubGenericHook +from datahub_airflow_plugin.lineage.datahub import DatahubLineageConfig + +TASK_ON_FAILURE_CALLBACK = "on_failure_callback" +TASK_ON_SUCCESS_CALLBACK = "on_success_callback" + + +def get_task_inlets_advanced(task: BaseOperator, 
context: Any) -> Iterable[Any]: + # TODO: Fix for https://github.com/apache/airflow/commit/1b1f3fabc5909a447a6277cafef3a0d4ef1f01ae + # in Airflow 2.4. + # TODO: ignore/handle airflow's dataset type in our lineage + + inlets: List[Any] = [] + task_inlets = get_task_inlets(task) + # From Airflow 2.3 this should be AbstractOperator but due to compatibility reason lets use BaseOperator + if isinstance(task_inlets, (str, BaseOperator)): + inlets = [ + task_inlets, + ] + + if task_inlets and isinstance(task_inlets, list): + inlets = [] + task_ids = ( + {o for o in task_inlets if isinstance(o, str)} + .union(op.task_id for op in task_inlets if isinstance(op, BaseOperator)) + .intersection(task.get_flat_relative_ids(upstream=True)) + ) + + from airflow.lineage import AUTO + + # pick up unique direct upstream task_ids if AUTO is specified + if AUTO.upper() in task_inlets or AUTO.lower() in task_inlets: + print("Picking up unique direct upstream task_ids as AUTO is specified") + task_ids = task_ids.union( + task_ids.symmetric_difference(task.upstream_task_ids) + ) + + inlets = task.xcom_pull( + context, task_ids=list(task_ids), dag_id=task.dag_id, key=PIPELINE_OUTLETS + ) + + # re-instantiate the obtained inlets + inlets = [ + structure(item["data"], import_string(item["type_name"])) + # _get_instance(structure(item, Metadata)) + for sublist in inlets + if sublist + for item in sublist + ] + + for inlet in task_inlets: + if not isinstance(inlet, str): + inlets.append(inlet) + + return inlets + + +def _make_emit_callback( + logger: logging.Logger, +) -> Callable[[Optional[Exception], str], None]: + def emit_callback(err: Optional[Exception], msg: str) -> None: + if err: + logger.error(f"Error sending metadata to datahub: {msg}", exc_info=err) + + return emit_callback + + +def datahub_task_status_callback(context, status): + ti = context["ti"] + task: "BaseOperator" = ti.task + dag = context["dag"] + config: DatahubLineageConfig = context["_datahub_config"] + + # This code is 
from the original airflow lineage code -> + # https://github.com/apache/airflow/blob/main/airflow/lineage/__init__.py + inlets = get_task_inlets_advanced(task, context) + + emitter = ( + DatahubGenericHook(config.datahub_conn_id).get_underlying_hook().make_emitter() + ) + + dataflow = AirflowGenerator.generate_dataflow( + cluster=config.cluster, + dag=dag, + capture_tags=config.capture_tags_info, + capture_owner=config.capture_ownership_info, + ) + task.log.info(f"Emitting Datahub Dataflow: {dataflow}") + dataflow.emit(emitter, callback=_make_emit_callback(task.log)) + + datajob = AirflowGenerator.generate_datajob( + cluster=config.cluster, + task=task, + dag=dag, + capture_tags=config.capture_tags_info, + capture_owner=config.capture_ownership_info, + ) + + for inlet in inlets: + datajob.inlets.append(inlet.urn) + + task_outlets = get_task_outlets(task) + for outlet in task_outlets: + datajob.outlets.append(outlet.urn) + + task.log.info(f"Emitting Datahub Datajob: {datajob}") + datajob.emit(emitter, callback=_make_emit_callback(task.log)) + + if config.capture_executions: + dpi = AirflowGenerator.run_datajob( + emitter=emitter, + cluster=config.cluster, + ti=ti, + dag=dag, + dag_run=context["dag_run"], + datajob=datajob, + start_timestamp_millis=int(ti.start_date.timestamp() * 1000), + ) + + task.log.info(f"Emitted Start Datahub Dataprocess Instance: {dpi}") + + dpi = AirflowGenerator.complete_datajob( + emitter=emitter, + cluster=config.cluster, + ti=ti, + dag_run=context["dag_run"], + result=status, + dag=dag, + datajob=datajob, + end_timestamp_millis=int(ti.end_date.timestamp() * 1000), + ) + task.log.info(f"Emitted Completed Data Process Instance: {dpi}") + + emitter.flush() + + +def datahub_pre_execution(context): + ti = context["ti"] + task: "BaseOperator" = ti.task + dag = context["dag"] + config: DatahubLineageConfig = context["_datahub_config"] + + task.log.info("Running Datahub pre_execute method") + + emitter = ( + 
DatahubGenericHook(config.datahub_conn_id).get_underlying_hook().make_emitter() + ) + + # This code is from the original airflow lineage code -> + # https://github.com/apache/airflow/blob/main/airflow/lineage/__init__.py + inlets = get_task_inlets_advanced(task, context) + + datajob = AirflowGenerator.generate_datajob( + cluster=config.cluster, + task=ti.task, + dag=dag, + capture_tags=config.capture_tags_info, + capture_owner=config.capture_ownership_info, + ) + + for inlet in inlets: + datajob.inlets.append(inlet.urn) + + task_outlets = get_task_outlets(task) + + for outlet in task_outlets: + datajob.outlets.append(outlet.urn) + + task.log.info(f"Emitting Datahub dataJob {datajob}") + datajob.emit(emitter, callback=_make_emit_callback(task.log)) + + if config.capture_executions: + dpi = AirflowGenerator.run_datajob( + emitter=emitter, + cluster=config.cluster, + ti=ti, + dag=dag, + dag_run=context["dag_run"], + datajob=datajob, + start_timestamp_millis=int(ti.start_date.timestamp() * 1000), + ) + + task.log.info(f"Emitting Datahub Dataprocess Instance: {dpi}") + + emitter.flush() + + +def _wrap_pre_execution(pre_execution): + def custom_pre_execution(context): + config = get_lineage_config() + if config.enabled: + context["_datahub_config"] = config + datahub_pre_execution(context) + + # Call original policy + if pre_execution: + pre_execution(context) + + return custom_pre_execution + + +def _wrap_on_failure_callback(on_failure_callback): + def custom_on_failure_callback(context): + config = get_lineage_config() + if config.enabled: + context["_datahub_config"] = config + try: + datahub_task_status_callback(context, status=InstanceRunResult.FAILURE) + except Exception as e: + if not config.graceful_exceptions: + raise e + else: + print(f"Exception: {traceback.format_exc()}") + + # Call original policy + if on_failure_callback: + on_failure_callback(context) + + return custom_on_failure_callback + + +def _wrap_on_success_callback(on_success_callback): + def 
custom_on_success_callback(context): + config = get_lineage_config() + if config.enabled: + context["_datahub_config"] = config + try: + datahub_task_status_callback(context, status=InstanceRunResult.SUCCESS) + except Exception as e: + if not config.graceful_exceptions: + raise e + else: + print(f"Exception: {traceback.format_exc()}") + + # Call original policy + if on_success_callback: + on_success_callback(context) + + return custom_on_success_callback + + +def task_policy(task: Union[BaseOperator, MappedOperator]) -> None: + task.log.debug(f"Setting task policy for Dag: {task.dag_id} Task: {task.task_id}") + # task.add_inlets(["auto"]) + # task.pre_execute = _wrap_pre_execution(task.pre_execute) + + # MappedOperator's callbacks don't have setters until Airflow 2.X.X + # https://github.com/apache/airflow/issues/24547 + # We can bypass this by going through partial_kwargs for now + if MappedOperator and isinstance(task, MappedOperator): # type: ignore + on_failure_callback_prop: property = getattr( + MappedOperator, TASK_ON_FAILURE_CALLBACK + ) + on_success_callback_prop: property = getattr( + MappedOperator, TASK_ON_SUCCESS_CALLBACK + ) + if not on_failure_callback_prop.fset or not on_success_callback_prop.fset: + task.log.debug( + "Using MappedOperator's partial_kwargs instead of callback properties" + ) + task.partial_kwargs[TASK_ON_FAILURE_CALLBACK] = _wrap_on_failure_callback( + task.on_failure_callback + ) + task.partial_kwargs[TASK_ON_SUCCESS_CALLBACK] = _wrap_on_success_callback( + task.on_success_callback + ) + return + + task.on_failure_callback = _wrap_on_failure_callback(task.on_failure_callback) # type: ignore + task.on_success_callback = _wrap_on_success_callback(task.on_success_callback) # type: ignore + # task.pre_execute = _wrap_pre_execution(task.pre_execute) + + +def _wrap_task_policy(policy): + if policy and hasattr(policy, "_task_policy_patched_by"): + return policy + + def custom_task_policy(task): + policy(task) + task_policy(task) + + # Add 
a flag to the policy to indicate that we've patched it. + custom_task_policy._task_policy_patched_by = "datahub_plugin" # type: ignore[attr-defined] + return custom_task_policy + + +def _patch_policy(settings): + if hasattr(settings, "task_policy"): + datahub_task_policy = _wrap_task_policy(settings.task_policy) + settings.task_policy = datahub_task_policy + + +def _patch_datahub_policy(): + with contextlib.suppress(ImportError): + import airflow_local_settings + + _patch_policy(airflow_local_settings) + + from airflow.models.dagbag import settings + + _patch_policy(settings) + + plugin_config = get_lineage_config() + telemetry.telemetry_instance.ping( + "airflow-plugin-init", + { + "airflow-version": airflow.__version__, + "datahub-airflow-plugin": "v1", + "capture_executions": plugin_config.capture_executions, + "capture_tags": plugin_config.capture_tags_info, + "capture_ownership": plugin_config.capture_ownership_info, + }, + ) + + +_patch_datahub_policy() diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_emission_dag.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_emission_dag.py index f40295c6bb883..0d7cdb6b6e90a 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_emission_dag.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_emission_dag.py @@ -2,12 +2,11 @@ This example demonstrates how to emit lineage to DataHub within an Airflow DAG. 
""" - from datetime import timedelta import datahub.emitter.mce_builder as builder from airflow import DAG -from airflow.providers.snowflake.operators.snowflake import SnowflakeOperator +from airflow.operators.bash import BashOperator from airflow.utils.dates import days_ago from datahub_airflow_plugin.operators.datahub import DatahubEmitterOperator @@ -33,23 +32,10 @@ catchup=False, default_view="tree", ) as dag: - # This example shows a SnowflakeOperator followed by a lineage emission. However, the - # same DatahubEmitterOperator can be used to emit lineage in any context. - - sql = """CREATE OR REPLACE TABLE `mydb.schema.tableC` AS - WITH some_table AS ( - SELECT * FROM `mydb.schema.tableA` - ), - some_other_table AS ( - SELECT id, some_column FROM `mydb.schema.tableB` - ) - SELECT * FROM some_table - LEFT JOIN some_other_table ON some_table.unique_id=some_other_table.id""" - transformation_task = SnowflakeOperator( - task_id="snowflake_transformation", + transformation_task = BashOperator( + task_id="transformation_task", dag=dag, - snowflake_conn_id="snowflake_default", - sql=sql, + bash_command="echo 'This is where you might run your data tooling.'", ) emit_lineage_task = DatahubEmitterOperator( diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/hooks/datahub.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/hooks/datahub.py index 8fb7363f8cad1..9604931795ccb 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/hooks/datahub.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/hooks/datahub.py @@ -1,7 +1,9 @@ -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, Optional, Sequence, Tuple, Union from airflow.exceptions import AirflowException from airflow.hooks.base import BaseHook +from datahub.emitter.generic_emitter import Emitter +from datahub.emitter.mcp import 
MetadataChangeProposalWrapper from datahub.metadata.com.linkedin.pegasus2avro.mxe import ( MetadataChangeEvent, MetadataChangeProposal, @@ -11,6 +13,7 @@ from airflow.models.connection import Connection from datahub.emitter.kafka_emitter import DatahubKafkaEmitter from datahub.emitter.rest_emitter import DatahubRestEmitter + from datahub.emitter.synchronized_file_emitter import SynchronizedFileEmitter from datahub.ingestion.sink.datahub_kafka import KafkaSinkConfig @@ -80,17 +83,24 @@ def make_emitter(self) -> "DatahubRestEmitter": return datahub.emitter.rest_emitter.DatahubRestEmitter(*self._get_config()) - def emit_mces(self, mces: List[MetadataChangeEvent]) -> None: + def emit( + self, + items: Sequence[ + Union[ + MetadataChangeEvent, + MetadataChangeProposal, + MetadataChangeProposalWrapper, + ] + ], + ) -> None: emitter = self.make_emitter() - for mce in mces: - emitter.emit_mce(mce) + for item in items: + emitter.emit(item) - def emit_mcps(self, mcps: List[MetadataChangeProposal]) -> None: - emitter = self.make_emitter() - - for mce in mcps: - emitter.emit_mcp(mce) + # Retained for backwards compatibility. 
+ emit_mces = emit + emit_mcps = emit class DatahubKafkaHook(BaseHook): @@ -152,7 +162,16 @@ def make_emitter(self) -> "DatahubKafkaEmitter": sink_config = self._get_config() return datahub.emitter.kafka_emitter.DatahubKafkaEmitter(sink_config) - def emit_mces(self, mces: List[MetadataChangeEvent]) -> None: + def emit( + self, + items: Sequence[ + Union[ + MetadataChangeEvent, + MetadataChangeProposal, + MetadataChangeProposalWrapper, + ] + ], + ) -> None: emitter = self.make_emitter() errors = [] @@ -160,29 +179,50 @@ def callback(exc, msg): if exc: errors.append(exc) - for mce in mces: - emitter.emit_mce_async(mce, callback) + for mce in items: + emitter.emit(mce, callback) emitter.flush() if errors: - raise AirflowException(f"failed to push some MCEs: {errors}") + raise AirflowException(f"failed to push some metadata: {errors}") - def emit_mcps(self, mcps: List[MetadataChangeProposal]) -> None: - emitter = self.make_emitter() - errors = [] + # Retained for backwards compatibility. + emit_mces = emit + emit_mcps = emit - def callback(exc, msg): - if exc: - errors.append(exc) - for mcp in mcps: - emitter.emit_mcp_async(mcp, callback) +class SynchronizedFileHook(BaseHook): + conn_type = "datahub-file" - emitter.flush() + def __init__(self, datahub_conn_id: str) -> None: + super().__init__() + self.datahub_conn_id = datahub_conn_id - if errors: - raise AirflowException(f"failed to push some MCPs: {errors}") + def make_emitter(self) -> "SynchronizedFileEmitter": + from datahub.emitter.synchronized_file_emitter import SynchronizedFileEmitter + + conn = self.get_connection(self.datahub_conn_id) + filename = conn.host + if not filename: + raise AirflowException("filename parameter is required") + + return SynchronizedFileEmitter(filename=filename) + + def emit( + self, + items: Sequence[ + Union[ + MetadataChangeEvent, + MetadataChangeProposal, + MetadataChangeProposalWrapper, + ] + ], + ) -> None: + emitter = self.make_emitter() + + for item in items: + 
emitter.emit(item) class DatahubGenericHook(BaseHook): @@ -198,7 +238,9 @@ def __init__(self, datahub_conn_id: str) -> None: super().__init__() self.datahub_conn_id = datahub_conn_id - def get_underlying_hook(self) -> Union[DatahubRestHook, DatahubKafkaHook]: + def get_underlying_hook( + self, + ) -> Union[DatahubRestHook, DatahubKafkaHook, SynchronizedFileHook]: conn = self.get_connection(self.datahub_conn_id) # We need to figure out the underlying hook type. First check the @@ -213,6 +255,11 @@ def get_underlying_hook(self) -> Union[DatahubRestHook, DatahubKafkaHook]: or conn.conn_type == DatahubKafkaHook.conn_type.replace("-", "_") ): return DatahubKafkaHook(self.datahub_conn_id) + elif ( + conn.conn_type == SynchronizedFileHook.conn_type + or conn.conn_type == SynchronizedFileHook.conn_type.replace("-", "_") + ): + return SynchronizedFileHook(self.datahub_conn_id) elif "rest" in self.datahub_conn_id: return DatahubRestHook(self.datahub_conn_id) elif "kafka" in self.datahub_conn_id: @@ -222,8 +269,20 @@ def get_underlying_hook(self) -> Union[DatahubRestHook, DatahubKafkaHook]: f"DataHub cannot handle conn_type {conn.conn_type} in {conn}" ) - def make_emitter(self) -> Union["DatahubRestEmitter", "DatahubKafkaEmitter"]: + def make_emitter(self) -> Emitter: return self.get_underlying_hook().make_emitter() - def emit_mces(self, mces: List[MetadataChangeEvent]) -> None: - return self.get_underlying_hook().emit_mces(mces) + def emit( + self, + items: Sequence[ + Union[ + MetadataChangeEvent, + MetadataChangeProposal, + MetadataChangeProposalWrapper, + ] + ], + ) -> None: + return self.get_underlying_hook().emit(items) + + # Retained for backwards compatibility. 
+ emit_mces = emit diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_lineage_core.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/lineage/_lineage_core.py similarity index 72% rename from metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_lineage_core.py rename to metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/lineage/_lineage_core.py index d91c039ffa718..f5f519fa23b11 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_lineage_core.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/lineage/_lineage_core.py @@ -1,11 +1,10 @@ from datetime import datetime from typing import TYPE_CHECKING, Dict, List -import datahub.emitter.mce_builder as builder from datahub.api.entities.dataprocess.dataprocess_instance import InstanceRunResult -from datahub.configuration.common import ConfigModel from datahub.utilities.urns.dataset_urn import DatasetUrn +from datahub_airflow_plugin._config import DatahubLineageConfig from datahub_airflow_plugin.client.airflow_generator import AirflowGenerator from datahub_airflow_plugin.entities import _Entity @@ -15,39 +14,14 @@ from airflow.models.taskinstance import TaskInstance from datahub_airflow_plugin._airflow_shims import Operator - from datahub_airflow_plugin.hooks.datahub import DatahubGenericHook def _entities_to_urn_list(iolets: List[_Entity]) -> List[DatasetUrn]: return [DatasetUrn.create_from_string(let.urn) for let in iolets] -class DatahubBasicLineageConfig(ConfigModel): - enabled: bool = True - - # DataHub hook connection ID. - datahub_conn_id: str - - # Cluster to associate with the pipelines and tasks. Defaults to "prod". - cluster: str = builder.DEFAULT_FLOW_CLUSTER - - # If true, the owners field of the DAG will be capture as a DataHub corpuser. - capture_ownership_info: bool = True - - # If true, the tags field of the DAG will be captured as DataHub tags. 
- capture_tags_info: bool = True - - capture_executions: bool = False - - def make_emitter_hook(self) -> "DatahubGenericHook": - # This is necessary to avoid issues with circular imports. - from datahub_airflow_plugin.hooks.datahub import DatahubGenericHook - - return DatahubGenericHook(self.datahub_conn_id) - - def send_lineage_to_datahub( - config: DatahubBasicLineageConfig, + config: DatahubLineageConfig, operator: "Operator", inlets: List[_Entity], outlets: List[_Entity], diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/lineage/datahub.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/lineage/datahub.py index c41bb2b2a1e37..3ebe7831d08f9 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/lineage/datahub.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/lineage/datahub.py @@ -4,8 +4,8 @@ from airflow.configuration import conf from airflow.lineage.backend import LineageBackend -from datahub_airflow_plugin._lineage_core import ( - DatahubBasicLineageConfig, +from datahub_airflow_plugin.lineage._lineage_core import ( + DatahubLineageConfig, send_lineage_to_datahub, ) @@ -13,14 +13,7 @@ from airflow.models.baseoperator import BaseOperator -class DatahubLineageConfig(DatahubBasicLineageConfig): - # If set to true, most runtime errors in the lineage backend will be - # suppressed and will not cause the overall task to fail. Note that - # configuration issues will still throw exceptions. - graceful_exceptions: bool = True - - -def get_lineage_config() -> DatahubLineageConfig: +def get_lineage_backend_config() -> DatahubLineageConfig: """Load the lineage config from airflow.cfg.""" # The kwargs pattern is also used for secret backends. 
@@ -51,8 +44,7 @@ class DatahubLineageBackend(LineageBackend): datahub_kwargs = { "datahub_conn_id": "datahub_rest_default", "capture_ownership_info": true, - "capture_tags_info": true, - "graceful_exceptions": true } + "capture_tags_info": true } # The above indentation is important! """ @@ -61,7 +53,7 @@ def __init__(self) -> None: # By attempting to get and parse the config, we can detect configuration errors # ahead of time. The init method is only called in Airflow 2.x. - _ = get_lineage_config() + _ = get_lineage_backend_config() # With Airflow 2.0, this can be an instance method. However, with Airflow 1.10.x, this # method is used statically, even though LineageBackend declares it as an instance variable. @@ -72,7 +64,7 @@ def send_lineage( outlets: Optional[List] = None, # unused context: Optional[Dict] = None, ) -> None: - config = get_lineage_config() + config = get_lineage_backend_config() if not config.enabled: return @@ -82,10 +74,4 @@ def send_lineage( config, operator, operator.inlets, operator.outlets, context ) except Exception as e: - if config.graceful_exceptions: - operator.log.error(e) - operator.log.info( - "Suppressing error because graceful_exceptions is set" - ) - else: - raise + operator.log.error(e) diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub.py index 109e7ddfe4dfa..15b50c51a561d 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub.py @@ -57,7 +57,7 @@ def __init__( # type: ignore[no-untyped-def] datahub_conn_id=datahub_conn_id, **kwargs, ) - self.mces = mces + self.metadata = mces def execute(self, context): - self.generic_hook.get_underlying_hook().emit_mces(self.mces) + self.generic_hook.get_underlying_hook().emit(self.metadata) diff --git 
a/metadata-ingestion-modules/airflow-plugin/tests/conftest.py b/metadata-ingestion-modules/airflow-plugin/tests/conftest.py new file mode 100644 index 0000000000000..d2c45e723f1b0 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/conftest.py @@ -0,0 +1,6 @@ +def pytest_addoption(parser): + parser.addoption( + "--update-golden-files", + action="store_true", + default=False, + ) diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/basic_iolets.py b/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/basic_iolets.py new file mode 100644 index 0000000000000..8b0803ab98422 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/basic_iolets.py @@ -0,0 +1,34 @@ +from datetime import datetime + +from airflow import DAG +from airflow.operators.bash import BashOperator + +from datahub_airflow_plugin.entities import Dataset, Urn + +with DAG( + "basic_iolets", + start_date=datetime(2023, 1, 1), + schedule_interval=None, + catchup=False, +) as dag: + task = BashOperator( + task_id="run_data_task", + dag=dag, + bash_command="echo 'This is where you might run your data tooling.'", + inlets=[ + Dataset(platform="snowflake", name="mydb.schema.tableA"), + Dataset(platform="snowflake", name="mydb.schema.tableB", env="DEV"), + Dataset( + platform="snowflake", + name="mydb.schema.tableC", + platform_instance="cloud", + ), + Urn( + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ), + ], + outlets=[ + Dataset("snowflake", "mydb.schema.tableD"), + Dataset("snowflake", "mydb.schema.tableE"), + ], + ) diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/simple_dag.py b/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/simple_dag.py new file mode 100644 index 0000000000000..1dd047f0a6dcc --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/simple_dag.py @@ -0,0 +1,34 @@ +from datetime import datetime + +from 
airflow import DAG +from airflow.operators.bash import BashOperator + +from datahub_airflow_plugin.entities import Dataset, Urn + +with DAG( + "simple_dag", + start_date=datetime(2023, 1, 1), + schedule_interval=None, + catchup=False, + description="A simple DAG that runs a few fake data tasks.", +) as dag: + task1 = BashOperator( + task_id="task_1", + dag=dag, + bash_command="echo 'task 1'", + inlets=[ + Dataset(platform="snowflake", name="mydb.schema.tableA"), + Urn( + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ), + ], + outlets=[Dataset("snowflake", "mydb.schema.tableD")], + ) + + task2 = BashOperator( + task_id="run_another_data_task", + dag=dag, + bash_command="echo 'task 2'", + ) + + task1 >> task2 diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/snowflake_operator.py b/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/snowflake_operator.py new file mode 100644 index 0000000000000..347d0f88b0cd0 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/snowflake_operator.py @@ -0,0 +1,32 @@ +from datetime import datetime + +from airflow import DAG +from airflow.providers.snowflake.operators.snowflake import SnowflakeOperator + +SNOWFLAKE_COST_TABLE = "costs" +SNOWFLAKE_PROCESSED_TABLE = "processed_costs" + +with DAG( + "snowflake_operator", + start_date=datetime(2023, 1, 1), + schedule_interval=None, + catchup=False, +) as dag: + transform_cost_table = SnowflakeOperator( + snowflake_conn_id="my_snowflake", + task_id="transform_cost_table", + sql=""" + CREATE OR REPLACE TABLE {{ params.out_table_name }} AS + SELECT + id, + month, + total_cost, + area, + total_cost / area as cost_per_area + FROM {{ params.in_table_name }} + """, + params={ + "in_table_name": SNOWFLAKE_COST_TABLE, + "out_table_name": SNOWFLAKE_PROCESSED_TABLE, + }, + ) diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/sqlite_operator.py 
b/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/sqlite_operator.py new file mode 100644 index 0000000000000..77faec3c8935a --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/sqlite_operator.py @@ -0,0 +1,75 @@ +from datetime import datetime + +from airflow import DAG +from airflow.providers.sqlite.operators.sqlite import SqliteOperator + +CONN_ID = "my_sqlite" + +COST_TABLE = "costs" +PROCESSED_TABLE = "processed_costs" + +with DAG( + "sqlite_operator", + start_date=datetime(2023, 1, 1), + schedule_interval=None, + catchup=False, +) as dag: + create_cost_table = SqliteOperator( + sqlite_conn_id=CONN_ID, + task_id="create_cost_table", + sql=""" + CREATE TABLE IF NOT EXISTS {{ params.table_name }} ( + id INTEGER PRIMARY KEY, + month TEXT NOT NULL, + total_cost REAL NOT NULL, + area REAL NOT NULL + ) + """, + params={"table_name": COST_TABLE}, + ) + + populate_cost_table = SqliteOperator( + sqlite_conn_id=CONN_ID, + task_id="populate_cost_table", + sql=""" + INSERT INTO {{ params.table_name }} (id, month, total_cost, area) + VALUES + (1, '2021-01', 100, 10), + (2, '2021-02', 200, 20), + (3, '2021-03', 300, 30) + """, + params={"table_name": COST_TABLE}, + ) + + transform_cost_table = SqliteOperator( + sqlite_conn_id=CONN_ID, + task_id="transform_cost_table", + sql=""" + CREATE TABLE IF NOT EXISTS {{ params.out_table_name }} AS + SELECT + id, + month, + total_cost, + area, + total_cost / area as cost_per_area + FROM {{ params.in_table_name }} + """, + params={ + "in_table_name": COST_TABLE, + "out_table_name": PROCESSED_TABLE, + }, + ) + + cleanup_tables = [] + for table_name in [COST_TABLE, PROCESSED_TABLE]: + cleanup_table = SqliteOperator( + sqlite_conn_id=CONN_ID, + task_id=f"cleanup_{table_name}", + sql=""" + DROP TABLE {{ params.table_name }} + """, + params={"table_name": table_name}, + ) + cleanup_tables.append(cleanup_table) + + create_cost_table >> populate_cost_table >> transform_cost_table >> 
cleanup_tables diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v1_basic_iolets.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v1_basic_iolets.json new file mode 100644 index 0000000000000..26aa2afaa831a --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v1_basic_iolets.json @@ -0,0 +1,533 @@ +[ +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,basic_iolets,prod)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "_access_control": "None", + "catchup": "False", + "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/basic_iolets.py'", + "is_paused_upon_creation": "None", + "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", + "tags": "None", + "timezone": "Timezone('UTC')" + }, + "externalUrl": "http://airflow.example.com/tree?dag_id=basic_iolets", + "name": "basic_iolets" + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,basic_iolets,prod)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,basic_iolets,prod)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'run_data_task'", + "execution_timeout": "None", + "sla": "None", + 
"task_id": "'run_data_task'", + "trigger_rule": "'all_success'", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=basic_iolets&_flt_3_task_id=run_data_task", + "name": "run_data_task", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + 
"aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'run_data_task'", + "execution_timeout": "None", + "sla": "None", + "task_id": "'run_data_task'", + "trigger_rule": "'all_success'", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=basic_iolets&_flt_3_task_id=run_data_task", + "name": "run_data_task", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "0.176536", + "start_date": "2023-09-30 00:49:56.670239+00:00", + "end_date": "2023-09-30 00:49:56.846775+00:00", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "1", + "max_tries": "0", + "external_executor_id": "None", + "state": "success", + "operator": "BashOperator", + "priority_weight": "1", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=run_data_task&dag_id=basic_iolets" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=run_data_task&dag_id=basic_iolets", + "name": "basic_iolets_run_data_task_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1696034996670, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": 
"urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696034996670, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 2 + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696034996846, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v1_simple_dag.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v1_simple_dag.json new file mode 100644 index 0000000000000..b2e3a1fe47da7 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v1_simple_dag.json @@ -0,0 +1,718 @@ +[ +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)", + "changeType": "UPSERT", 
+ "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "_access_control": "None", + "catchup": "False", + "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/simple_dag.py'", + "is_paused_upon_creation": "None", + "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", + "tags": "None", + "timezone": "Timezone('UTC')" + }, + "externalUrl": "http://airflow.example.com/tree?dag_id=simple_dag", + "name": "simple_dag", + "description": "A simple DAG that runs a few fake data tasks." + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'task_1'", + "execution_timeout": "None", + "sla": "None", + "task_id": "'task_1'", + "trigger_rule": "'all_success'", + "wait_for_downstream": "False", + "downstream_task_ids": "['run_another_data_task']", + "inlets": "[]", + "outlets": "[]" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=simple_dag&_flt_3_task_id=task_1", + "name": "task_1", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + 
"json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'task_1'", + "execution_timeout": "None", + "sla": "None", + "task_id": "'task_1'", + "trigger_rule": "'all_success'", + "wait_for_downstream": "False", + "downstream_task_ids": "['run_another_data_task']", + "inlets": "[]", + "outlets": "[]" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=simple_dag&_flt_3_task_id=task_1", + "name": "task_1", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": 
"urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "0.175983", + "start_date": "2023-09-30 00:48:58.943850+00:00", + "end_date": "2023-09-30 00:48:59.119833+00:00", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "1", + "max_tries": "0", + "external_executor_id": "None", + "state": "success", + "operator": "BashOperator", + "priority_weight": "2", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=task_1&dag_id=simple_dag" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=task_1&dag_id=simple_dag", + "name": "simple_dag_task_1_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1696034938943, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + 
"json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696034938943, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 2 + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696034939119, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + 
"nativeResultType": "airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "_access_control": "None", + "catchup": "False", + "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/simple_dag.py'", + "is_paused_upon_creation": "None", + "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", + "tags": "None", + "timezone": "Timezone('UTC')" + }, + "externalUrl": "http://airflow.example.com/tree?dag_id=simple_dag", + "name": "simple_dag", + "description": "A simple DAG that runs a few fake data tasks." + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'run_another_data_task'", + "execution_timeout": "None", + "sla": "None", + "task_id": "'run_another_data_task'", + "trigger_rule": "'all_success'", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]" + }, + "externalUrl": 
"http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=simple_dag&_flt_3_task_id=run_another_data_task", + "name": "run_another_data_task", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'run_another_data_task'", + "execution_timeout": "None", + "sla": "None", + "task_id": "'run_another_data_task'", + "trigger_rule": "'all_success'", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=simple_dag&_flt_3_task_id=run_another_data_task", + "name": "run_another_data_task", + "type": { + 
"string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:888f71b79d9a0b162fe44acad7b2c2ae", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "0.129888", + "start_date": "2023-09-30 00:49:02.158752+00:00", + "end_date": "2023-09-30 00:49:02.288640+00:00", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "1", + "max_tries": "0", + "external_executor_id": "None", + "state": "success", + "operator": "BashOperator", + "priority_weight": "1", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=run_another_data_task&dag_id=simple_dag" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=run_another_data_task&dag_id=simple_dag", + 
"name": "simple_dag_run_another_data_task_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1696034942158, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:888f71b79d9a0b162fe44acad7b2c2ae", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:888f71b79d9a0b162fe44acad7b2c2ae", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696034942158, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 2 + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:888f71b79d9a0b162fe44acad7b2c2ae", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696034942288, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_basic_iolets.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_basic_iolets.json new file mode 100644 index 0000000000000..2e733c2ad40a9 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_basic_iolets.json @@ -0,0 +1,535 @@ +[ +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,basic_iolets,prod)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + 
"_access_control": "None", + "catchup": "False", + "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/basic_iolets.py'", + "is_paused_upon_creation": "None", + "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", + "tags": "[]", + "timezone": "Timezone('UTC')" + }, + "externalUrl": "http://airflow.example.com/tree?dag_id=basic_iolets", + "name": "basic_iolets" + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,basic_iolets,prod)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,basic_iolets,prod)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'run_data_task'", + "execution_timeout": "None", + "sla": "None", + "task_id": "'run_data_task'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableB', env='DEV', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableC', env='PROD', platform_instance='cloud'), Urn(_urn='urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)')]", + "outlets": "[Dataset(platform='snowflake', name='mydb.schema.tableD', env='PROD', 
platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableE', env='PROD', platform_instance=None)]", + "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"dag\": \"<>\", \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", \"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", \"platform\": \"snowflake\"}], \"task_id\": \"run_data_task\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_lock_for_execution\": true, \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", \"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": 
\"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", \"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_on_exit_code\": [99], \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"run_data_task\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"wait_for_past_depends_before_skipping\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=basic_iolets&_flt_3_task_id=run_data_task", + "name": "run_data_task", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", 
+ "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + 
"aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-09-30 01:13:14.266272+00:00", + "end_date": "None", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "BashOperator", + "priority_weight": "1", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=run_data_task&dag_id=basic_iolets&map_index=-1" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=run_data_task&dag_id=basic_iolets&map_index=-1", + "name": "basic_iolets_run_data_task_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1696036394266, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": 
"dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696036394266, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } 
+}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'run_data_task'", + "execution_timeout": "None", + "sla": "None", + "task_id": "'run_data_task'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableB', env='DEV', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableC', env='PROD', platform_instance='cloud'), Urn(_urn='urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)')]", + "outlets": "[Dataset(platform='snowflake', name='mydb.schema.tableD', env='PROD', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableE', env='PROD', platform_instance=None)]", + "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"dag\": \"<>\", \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", \"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", 
\"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", \"platform\": \"snowflake\"}], \"task_id\": \"run_data_task\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_lock_for_execution\": true, \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", \"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", \"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_on_exit_code\": [99], \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"run_data_task\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"wait_for_past_depends_before_skipping\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=basic_iolets&_flt_3_task_id=run_data_task", + "name": "run_data_task", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696036394833, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_basic_iolets_no_dag_listener.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_basic_iolets_no_dag_listener.json new file mode 100644 index 0000000000000..44b288efda954 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_basic_iolets_no_dag_listener.json @@ -0,0 +1,535 @@ +[ +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,basic_iolets,prod)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "_access_control": "None", + "catchup": "False", + "fileloc": 
"'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/basic_iolets.py'", + "is_paused_upon_creation": "None", + "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", + "tags": "[]", + "timezone": "Timezone('UTC')" + }, + "externalUrl": "http://airflow.example.com/tree?dag_id=basic_iolets", + "name": "basic_iolets" + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,basic_iolets,prod)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,basic_iolets,prod)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'run_data_task'", + "execution_timeout": "None", + "sla": "None", + "task_id": "'run_data_task'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableB', env='DEV', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableC', env='PROD', platform_instance='cloud'), Urn(_urn='urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)')]", + "outlets": "[Dataset(platform='snowflake', name='mydb.schema.tableD', env='PROD', platform_instance=None), Dataset(platform='snowflake', 
name='mydb.schema.tableE', env='PROD', platform_instance=None)]", + "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"dag\": \"<>\", \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", \"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", \"platform\": \"snowflake\"}], \"task_id\": \"run_data_task\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", \"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": 
\"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", \"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_exit_code\": 99, \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"run_data_task\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=basic_iolets&_flt_3_task_id=run_data_task", + "name": "run_data_task", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + 
"changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-09-30 
06:59:52.401211+00:00", + "end_date": "None", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "BashOperator", + "priority_weight": "1", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=run_data_task&dag_id=basic_iolets&map_index=-1" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=run_data_task&dag_id=basic_iolets&map_index=-1", + "name": "basic_iolets_run_data_task_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1696057192401, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696057192401, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": 
"UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'run_data_task'", + "execution_timeout": "None", + "sla": "None", + "task_id": "'run_data_task'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableB', env='DEV', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableC', env='PROD', platform_instance='cloud'), Urn(_urn='urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)')]", + "outlets": "[Dataset(platform='snowflake', name='mydb.schema.tableD', env='PROD', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableE', env='PROD', platform_instance=None)]", + "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"dag\": \"<>\", \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", \"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", \"platform\": 
\"snowflake\"}], \"task_id\": \"run_data_task\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", \"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", \"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_exit_code\": 99, \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"run_data_task\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=basic_iolets&_flt_3_task_id=run_data_task", + "name": "run_data_task", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696057192982, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_simple_dag.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_simple_dag.json new file mode 100644 index 0000000000000..454c509279e11 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_simple_dag.json @@ -0,0 +1,666 @@ +[ +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "_access_control": "None", + "catchup": "False", + "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/simple_dag.py'", + "is_paused_upon_creation": "None", + "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", + "tags": "[]", + "timezone": "Timezone('UTC')" + }, + 
"externalUrl": "http://airflow.example.com/tree?dag_id=simple_dag", + "name": "simple_dag", + "description": "A simple DAG that runs a few fake data tasks." + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'task_1'", + "execution_timeout": "None", + "sla": "None", + "task_id": "'task_1'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "['run_another_data_task']", + "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Urn(_urn='urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)')]", + "outlets": "[Dataset(platform='snowflake', name='mydb.schema.tableD', env='PROD', platform_instance=None)]", + "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'task 1'\", \"dag\": \"<>\", 
\"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"task_id\": \"task_1\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_lock_for_execution\": true, \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'task 1'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [\"run_another_data_task\"], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_on_exit_code\": [99], \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"task_1\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"wait_for_past_depends_before_skipping\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=simple_dag&_flt_3_task_id=task_1", + "name": "task_1", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-09-30 06:53:58.219003+00:00", + "end_date": 
"None", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "BashOperator", + "priority_weight": "2", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=task_1&dag_id=simple_dag&map_index=-1" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=task_1&dag_id=simple_dag&map_index=-1", + "name": "simple_dag_task_1_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1696056838219, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + 
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696056838219, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'task_1'", + "execution_timeout": "None", + "sla": "None", + "task_id": "'task_1'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "['run_another_data_task']", + "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Urn(_urn='urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)')]", + "outlets": "[Dataset(platform='snowflake', name='mydb.schema.tableD', env='PROD', platform_instance=None)]", + "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, 
\"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'task 1'\", \"dag\": \"<>\", \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"task_id\": \"task_1\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_lock_for_execution\": true, \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'task 1'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [\"run_another_data_task\"], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_on_exit_code\": [99], \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"task_1\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"wait_for_past_depends_before_skipping\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=simple_dag&_flt_3_task_id=task_1", + "name": "task_1", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": 
"dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696056838648, + "partitionSpec": { + "type": 
"FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'run_another_data_task'", + "execution_timeout": "None", + "sla": "None", + "task_id": "'run_another_data_task'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]", + "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'task 2'\", \"dag\": \"<>\", \"task_id\": \"run_another_data_task\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_lock_for_execution\": true, \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'task 2'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [], \"outlets\": [], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_on_exit_code\": [99], \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"run_another_data_task\", \"trigger_rule\": 
\"all_success\", \"upstream_task_ids\": [\"task_1\"], \"wait_for_downstream\": false, \"wait_for_past_depends_before_skipping\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=simple_dag&_flt_3_task_id=run_another_data_task", + "name": "run_another_data_task", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:888f71b79d9a0b162fe44acad7b2c2ae", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-09-30 06:54:02.407515+00:00", + "end_date": "None", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + 
"operator": "BashOperator", + "priority_weight": "1", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=run_another_data_task&dag_id=simple_dag&map_index=-1" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=run_another_data_task&dag_id=simple_dag&map_index=-1", + "name": "simple_dag_run_another_data_task_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1696056842407, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:888f71b79d9a0b162fe44acad7b2c2ae", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:888f71b79d9a0b162fe44acad7b2c2ae", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696056842407, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'run_another_data_task'", + "execution_timeout": "None", + "sla": "None", + "task_id": "'run_another_data_task'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]", + "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", 
\"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'task 2'\", \"dag\": \"<>\", \"task_id\": \"run_another_data_task\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_lock_for_execution\": true, \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'task 2'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [], \"outlets\": [], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_on_exit_code\": [99], \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"run_another_data_task\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [\"task_1\"], \"wait_for_downstream\": false, \"wait_for_past_depends_before_skipping\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=simple_dag&_flt_3_task_id=run_another_data_task", + "name": "run_another_data_task", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataJob", + 
"entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:888f71b79d9a0b162fe44acad7b2c2ae", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696056842831, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_simple_dag_no_dag_listener.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_simple_dag_no_dag_listener.json new file mode 100644 index 0000000000000..73b5765e96b7d --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_simple_dag_no_dag_listener.json @@ -0,0 +1,722 @@ +[ +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "_access_control": "None", + "catchup": "False", + "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/simple_dag.py'", + "is_paused_upon_creation": "None", + "start_date": "DateTime(2023, 1, 1, 0, 0, 0, 
tzinfo=Timezone('UTC'))", + "tags": "[]", + "timezone": "Timezone('UTC')" + }, + "externalUrl": "http://airflow.example.com/tree?dag_id=simple_dag", + "name": "simple_dag", + "description": "A simple DAG that runs a few fake data tasks." + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'task_1'", + "execution_timeout": "None", + "sla": "None", + "task_id": "'task_1'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "['run_another_data_task']", + "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Urn(_urn='urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)')]", + "outlets": "[Dataset(platform='snowflake', name='mydb.schema.tableD', env='PROD', platform_instance=None)]", + "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, 
\"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'task 1'\", \"dag\": \"<>\", \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"task_id\": \"task_1\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'task 1'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [\"run_another_data_task\"], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_exit_code\": 99, \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"task_1\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=simple_dag&_flt_3_task_id=task_1", + "name": "task_1", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-09-30 06:58:56.105026+00:00", + "end_date": 
"None", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "BashOperator", + "priority_weight": "2", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=task_1&dag_id=simple_dag&map_index=-1" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=task_1&dag_id=simple_dag&map_index=-1", + "name": "simple_dag_task_1_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1696057136105, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + 
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696057136105, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'task_1'", + "execution_timeout": "None", + "sla": "None", + "task_id": "'task_1'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "['run_another_data_task']", + "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Urn(_urn='urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)')]", + "outlets": "[Dataset(platform='snowflake', name='mydb.schema.tableD', env='PROD', platform_instance=None)]", + "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, 
\"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'task 1'\", \"dag\": \"<>\", \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"task_id\": \"task_1\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'task 1'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [\"run_another_data_task\"], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_exit_code\": 99, \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"task_1\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=simple_dag&_flt_3_task_id=task_1", + "name": "task_1", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696057136612, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + 
"result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "_access_control": "None", + "catchup": "False", + "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/simple_dag.py'", + "is_paused_upon_creation": "None", + "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", + "tags": "[]", + "timezone": "Timezone('UTC')" + }, + "externalUrl": "http://airflow.example.com/tree?dag_id=simple_dag", + "name": "simple_dag", + "description": "A simple DAG that runs a few fake data tasks." + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'run_another_data_task'", + "execution_timeout": "None", + "sla": "None", + "task_id": "'run_another_data_task'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]", + "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": 
\"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'task 2'\", \"dag\": \"<>\", \"task_id\": \"run_another_data_task\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'task 2'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [], \"outlets\": [], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_exit_code\": 99, \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"run_another_data_task\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [\"task_1\"], \"wait_for_downstream\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=simple_dag&_flt_3_task_id=run_another_data_task", + "name": "run_another_data_task", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataJob", + 
"entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:888f71b79d9a0b162fe44acad7b2c2ae", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-09-30 06:58:59.567004+00:00", + "end_date": "None", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "BashOperator", + "priority_weight": "1", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=run_another_data_task&dag_id=simple_dag&map_index=-1" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=run_another_data_task&dag_id=simple_dag&map_index=-1", + "name": "simple_dag_run_another_data_task_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1696057139567, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:888f71b79d9a0b162fe44acad7b2c2ae", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": 
"urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:888f71b79d9a0b162fe44acad7b2c2ae", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696057139567, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'run_another_data_task'", + "execution_timeout": "None", + "sla": "None", + "task_id": "'run_another_data_task'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]", + "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'task 2'\", \"dag\": \"<>\", \"task_id\": \"run_another_data_task\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'task 2'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [], \"outlets\": [], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", 
\"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_exit_code\": 99, \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"run_another_data_task\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [\"task_1\"], \"wait_for_downstream\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=simple_dag&_flt_3_task_id=run_another_data_task", + "name": "run_another_data_task", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:888f71b79d9a0b162fe44acad7b2c2ae", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696057140164, + 
"partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_snowflake_operator.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_snowflake_operator.json new file mode 100644 index 0000000000000..affc395d421da --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_snowflake_operator.json @@ -0,0 +1,507 @@ +[ +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,snowflake_operator,prod)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "_access_control": "None", + "catchup": "False", + "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/snowflake_operator.py'", + "is_paused_upon_creation": "None", + "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", + "tags": "[]", + "timezone": "Timezone('UTC')" + }, + "externalUrl": "http://airflow.example.com/tree?dag_id=snowflake_operator", + "name": "snowflake_operator" + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,snowflake_operator,prod)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,snowflake_operator,prod)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(airflow,snowflake_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'transform_cost_table'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n CREATE OR REPLACE TABLE processed_costs AS\\n SELECT\\n id,\\n month,\\n total_cost,\\n area,\\n total_cost / area as cost_per_area\\n FROM costs\\n '", + "task_id": "'transform_cost_table'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE OR REPLACE TABLE processed_costs AS\\n SELECT\\n id,\\n month,\\n total_cost,\\n area,\\n total_cost / area as cost_per_area\\n FROM costs\\n \"}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=snowflake_operator&_flt_3_task_id=transform_cost_table", + "name": "transform_cost_table", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,snowflake_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD),month)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD),month)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD),total_cost)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD),area)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD),area)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD),area)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + "downstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD),cost_per_area)" + ], + "confidenceScore": 1.0 + } + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,snowflake_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,snowflake_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:3161034cc84e16a7c5e1906225734747", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-09-30 06:55:36.844976+00:00", + "end_date": "None", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "SnowflakeOperator", + "priority_weight": "1", + "log_url": 
"http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=transform_cost_table&dag_id=snowflake_operator&map_index=-1" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=transform_cost_table&dag_id=snowflake_operator&map_index=-1", + "name": "snowflake_operator_transform_cost_table_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1696056936844, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:3161034cc84e16a7c5e1906225734747", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,snowflake_operator,prod),transform_cost_table)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:3161034cc84e16a7c5e1906225734747", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD)" + ] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:3161034cc84e16a7c5e1906225734747", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:3161034cc84e16a7c5e1906225734747", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696056936844, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,snowflake_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'transform_cost_table'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n CREATE OR REPLACE TABLE processed_costs AS\\n SELECT\\n id,\\n month,\\n total_cost,\\n area,\\n total_cost / area as cost_per_area\\n FROM costs\\n '", + "task_id": "'transform_cost_table'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE OR REPLACE TABLE processed_costs AS\\n SELECT\\n id,\\n month,\\n total_cost,\\n area,\\n total_cost / area as cost_per_area\\n FROM costs\\n \"}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=snowflake_operator&_flt_3_task_id=transform_cost_table", + "name": "transform_cost_table", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": 
"dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,snowflake_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD),month)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD),month)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD),total_cost)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD),area)" + ], + "downstreamType": "FIELD", + 
"downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD),area)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD),area)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD),cost_per_area)" + ], + "confidenceScore": 1.0 + } + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,snowflake_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,snowflake_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": 
"dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:3161034cc84e16a7c5e1906225734747", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696056938096, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "FAILURE", + "nativeResultType": "airflow" + } + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator.json new file mode 100644 index 0000000000000..1a32b38ce055d --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator.json @@ -0,0 +1,1735 @@ +[ +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "_access_control": "None", + "catchup": "False", + "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/sqlite_operator.py'", + "is_paused_upon_creation": "None", + "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", + "tags": "[]", + "timezone": "Timezone('UTC')" + }, + "externalUrl": "http://airflow.example.com/tree?dag_id=sqlite_operator", + "name": "sqlite_operator" + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", 
+ "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'create_cost_table'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n '", + "task_id": "'create_cost_table'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "['populate_cost_table']", + "inlets": "[]", + "outlets": "[]", + "datahub_sql_parser_error": "Can only generate column-level lineage for select-like inner statements, not (outer statement type: )", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n \"}", + "openlineage_run_facet_extractionError": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/ExtractionErrorRunFacet\", \"errors\": [{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"errorMessage\": \"Can only generate column-level lineage for select-like inner statements, not 
(outer statement type: )\", \"task\": \"datahub_sql_parser\"}], \"failedTasks\": 1, \"totalTasks\": 1}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=create_cost_table", + "name": "create_cost_table", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fbeed1180fa0434e02ac6f75ace87869", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-09-30 06:56:24.632190+00:00", + "end_date": "None", + "execution_date": 
"2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "SqliteOperator", + "priority_weight": "5", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=create_cost_table&dag_id=sqlite_operator&map_index=-1" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=create_cost_table&dag_id=sqlite_operator&map_index=-1", + "name": "sqlite_operator_create_cost_table_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1696056984632, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fbeed1180fa0434e02ac6f75ace87869", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fbeed1180fa0434e02ac6f75ace87869", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fbeed1180fa0434e02ac6f75ace87869", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696056984632, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + 
"entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'create_cost_table'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n '", + "task_id": "'create_cost_table'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "['populate_cost_table']", + "inlets": "[]", + "outlets": "[]", + "datahub_sql_parser_error": "Can only generate column-level lineage for select-like inner statements, not (outer statement type: )", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n \"}", + "openlineage_run_facet_extractionError": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/ExtractionErrorRunFacet\", \"errors\": [{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"errorMessage\": \"Can only generate column-level lineage for select-like inner statements, not (outer statement type: )\", \"task\": \"datahub_sql_parser\"}], \"failedTasks\": 1, \"totalTasks\": 1}" + }, + 
"externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=create_cost_table", + "name": "create_cost_table", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fbeed1180fa0434e02ac6f75ace87869", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696056984947, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'populate_cost_table'", + "execution_timeout": "None", + "sla": "None", + "sql": "\"\\n INSERT INTO costs (id, month, total_cost, area)\\n VALUES\\n (1, '2021-01', 100, 10),\\n (2, '2021-02', 200, 20),\\n (3, '2021-03', 300, 30)\\n \"", + "task_id": "'populate_cost_table'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "['transform_cost_table']", + "inlets": "[]", + "outlets": "[]", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n INSERT INTO costs (id, month, total_cost, area)\\n VALUES\\n (1, '2021-01', 100, 10),\\n (2, '2021-02', 200, 20),\\n (3, '2021-03', 300, 30)\\n \"}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=populate_cost_table", + "name": "populate_cost_table", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": 
{ + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:04e1badac1eacd1c41123d07f579fa92", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-09-30 06:56:28.605901+00:00", + "end_date": "None", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "SqliteOperator", + "priority_weight": "4", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=populate_cost_table&dag_id=sqlite_operator&map_index=-1" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=populate_cost_table&dag_id=sqlite_operator&map_index=-1", + "name": "sqlite_operator_populate_cost_table_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1696056988605, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:04e1badac1eacd1c41123d07f579fa92", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + 
"json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:04e1badac1eacd1c41123d07f579fa92", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:04e1badac1eacd1c41123d07f579fa92", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696056988605, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'populate_cost_table'", + "execution_timeout": "None", + "sla": "None", + "sql": "\"\\n INSERT INTO costs (id, month, total_cost, area)\\n VALUES\\n (1, '2021-01', 100, 10),\\n (2, '2021-02', 200, 20),\\n (3, '2021-03', 300, 30)\\n \"", + "task_id": "'populate_cost_table'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "['transform_cost_table']", + "inlets": "[]", + "outlets": "[]", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": 
\"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n INSERT INTO costs (id, month, total_cost, area)\\n VALUES\\n (1, '2021-01', 100, 10),\\n (2, '2021-02', 200, 20),\\n (3, '2021-03', 300, 30)\\n \"}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=populate_cost_table", + "name": "populate_cost_table", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:04e1badac1eacd1c41123d07f579fa92", + 
"changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696056989098, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'transform_cost_table'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n CREATE TABLE IF NOT EXISTS processed_costs AS\\n SELECT\\n id,\\n month,\\n total_cost,\\n area,\\n total_cost / area as cost_per_area\\n FROM costs\\n '", + "task_id": "'transform_cost_table'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "['cleanup_costs', 'cleanup_processed_costs']", + "inlets": "[]", + "outlets": "[]", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS processed_costs AS\\n SELECT\\n id,\\n month,\\n total_cost,\\n area,\\n total_cost / area as cost_per_area\\n FROM costs\\n \"}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=transform_cost_table", + "name": "transform_cost_table", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + 
"urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)" + ], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)" + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),month)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),month)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),total_cost)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),area)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + "downstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),cost_per_area)" + ], + "confidenceScore": 1.0 + } + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:64e5ff8f552e857b607832731e09808b", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-09-30 06:56:32.888165+00:00", + "end_date": "None", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "SqliteOperator", + "priority_weight": "3", + "log_url": 
"http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=transform_cost_table&dag_id=sqlite_operator&map_index=-1" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=transform_cost_table&dag_id=sqlite_operator&map_index=-1", + "name": "sqlite_operator_transform_cost_table_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1696056992888, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:64e5ff8f552e857b607832731e09808b", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:64e5ff8f552e857b607832731e09808b", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:64e5ff8f552e857b607832731e09808b", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + 
"entityUrn": "urn:li:dataProcessInstance:64e5ff8f552e857b607832731e09808b", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696056992888, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'transform_cost_table'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n CREATE TABLE IF NOT EXISTS processed_costs AS\\n SELECT\\n id,\\n month,\\n total_cost,\\n area,\\n total_cost / area as cost_per_area\\n FROM costs\\n '", + "task_id": "'transform_cost_table'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "['cleanup_costs', 'cleanup_processed_costs']", + "inlets": "[]", + "outlets": "[]", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS processed_costs AS\\n SELECT\\n id,\\n month,\\n total_cost,\\n area,\\n total_cost / area as cost_per_area\\n FROM costs\\n \"}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=transform_cost_table", + "name": "transform_cost_table", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ 
+ "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)" + ], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)" + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),month)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),month)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),total_cost)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),area)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + "downstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),cost_per_area)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),month)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),month)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),total_cost)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),area)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),cost_per_area)" + ], + "confidenceScore": 1.0 + } + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:64e5ff8f552e857b607832731e09808b", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696056993744, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'cleanup_costs'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n DROP TABLE costs\\n '", + "task_id": "'cleanup_costs'", + "trigger_rule": "", + "wait_for_downstream": 
"False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]", + "datahub_sql_parser_error": "Can only generate column-level lineage for select-like inner statements, not (outer statement type: )", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n DROP TABLE costs\\n \"}", + "openlineage_run_facet_extractionError": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/ExtractionErrorRunFacet\", \"errors\": [{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"errorMessage\": \"Can only generate column-level lineage for select-like inner statements, not (outer statement type: )\", \"task\": \"datahub_sql_parser\"}], \"failedTasks\": 1, \"totalTasks\": 1}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=cleanup_costs", + "name": "cleanup_costs", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:07285de22276959612189d51336cc21a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-09-30 06:56:37.745717+00:00", + "end_date": "None", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "SqliteOperator", + "priority_weight": "1", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=cleanup_costs&dag_id=sqlite_operator&map_index=-1" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=cleanup_costs&dag_id=sqlite_operator&map_index=-1", + "name": "sqlite_operator_cleanup_costs_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1696056997745, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": 
"urn:li:dataProcessInstance:07285de22276959612189d51336cc21a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:07285de22276959612189d51336cc21a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:07285de22276959612189d51336cc21a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696056997745, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'cleanup_costs'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n DROP TABLE costs\\n '", + "task_id": "'cleanup_costs'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]", + "datahub_sql_parser_error": "Can only generate column-level lineage for select-like inner statements, not (outer statement type: )", + "openlineage_job_facet_sql": "{\"_producer\": 
\"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n DROP TABLE costs\\n \"}", + "openlineage_run_facet_extractionError": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/ExtractionErrorRunFacet\", \"errors\": [{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"errorMessage\": \"Can only generate column-level lineage for select-like inner statements, not (outer statement type: )\", \"task\": \"datahub_sql_parser\"}], \"failedTasks\": 1, \"totalTasks\": 1}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=cleanup_costs", + "name": "cleanup_costs", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:07285de22276959612189d51336cc21a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696056998672, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'cleanup_processed_costs'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n DROP TABLE processed_costs\\n '", + "task_id": "'cleanup_processed_costs'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]", + "datahub_sql_parser_error": "Can only generate column-level lineage for select-like inner statements, not (outer statement type: )", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": 
\"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n DROP TABLE processed_costs\\n \"}", + "openlineage_run_facet_extractionError": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/ExtractionErrorRunFacet\", \"errors\": [{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"errorMessage\": \"Can only generate column-level lineage for select-like inner statements, not (outer statement type: )\", \"task\": \"datahub_sql_parser\"}], \"failedTasks\": 1, \"totalTasks\": 1}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=cleanup_processed_costs", + "name": "cleanup_processed_costs", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)" + ], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + 
"changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:bab908abccf3cd6607b50fdaf3003372", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-09-30 06:56:42.645806+00:00", + "end_date": "None", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "SqliteOperator", + "priority_weight": "1", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=cleanup_processed_costs&dag_id=sqlite_operator&map_index=-1" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=cleanup_processed_costs&dag_id=sqlite_operator&map_index=-1", + "name": "sqlite_operator_cleanup_processed_costs_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1696057002645, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:bab908abccf3cd6607b50fdaf3003372", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "upstreamInstances": [] + } 
+ } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:bab908abccf3cd6607b50fdaf3003372", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:bab908abccf3cd6607b50fdaf3003372", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696057002645, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'cleanup_processed_costs'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n DROP TABLE processed_costs\\n '", + "task_id": "'cleanup_processed_costs'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]", + "datahub_sql_parser_error": "Can only generate column-level lineage for select-like inner statements, not (outer statement type: )", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n DROP TABLE processed_costs\\n \"}", + 
"openlineage_run_facet_extractionError": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/ExtractionErrorRunFacet\", \"errors\": [{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"errorMessage\": \"Can only generate column-level lineage for select-like inner statements, not (outer statement type: )\", \"task\": \"datahub_sql_parser\"}], \"failedTasks\": 1, \"totalTasks\": 1}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=cleanup_processed_costs", + "name": "cleanup_processed_costs", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)" + ], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + 
"type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:bab908abccf3cd6607b50fdaf3003372", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696057003759, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator_no_dag_listener.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator_no_dag_listener.json new file mode 100644 index 0000000000000..c082be693e30c --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator_no_dag_listener.json @@ -0,0 +1,1955 @@ +[ +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "_access_control": "None", + "catchup": "False", + "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/sqlite_operator.py'", + "is_paused_upon_creation": "None", + "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", + "tags": "[]", + "timezone": "Timezone('UTC')" + }, + "externalUrl": "http://airflow.example.com/tree?dag_id=sqlite_operator", + "name": "sqlite_operator" + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": 
"urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'create_cost_table'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n '", + "task_id": "'create_cost_table'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "['populate_cost_table']", + "inlets": "[]", + "outlets": "[]", + "datahub_sql_parser_error": "Can only generate column-level lineage for select-like inner statements, not (outer statement type: )", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n \"}", + "openlineage_run_facet_extractionError": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": 
\"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/ExtractionErrorRunFacet\", \"errors\": [{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"errorMessage\": \"Can only generate column-level lineage for select-like inner statements, not (outer statement type: )\", \"task\": \"datahub_sql_parser\"}], \"failedTasks\": 1, \"totalTasks\": 1}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=create_cost_table", + "name": "create_cost_table", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": 
"globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fbeed1180fa0434e02ac6f75ace87869", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-09-30 07:00:45.832554+00:00", + "end_date": "None", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "SqliteOperator", + "priority_weight": "5", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=create_cost_table&dag_id=sqlite_operator&map_index=-1" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=create_cost_table&dag_id=sqlite_operator&map_index=-1", + "name": "sqlite_operator_create_cost_table_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1696057245832, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fbeed1180fa0434e02ac6f75ace87869", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fbeed1180fa0434e02ac6f75ace87869", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + 
"aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fbeed1180fa0434e02ac6f75ace87869", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696057245832, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'create_cost_table'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n '", + "task_id": "'create_cost_table'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "['populate_cost_table']", + "inlets": "[]", + "outlets": "[]", + "datahub_sql_parser_error": "Can only generate column-level lineage for select-like inner statements, not (outer statement type: )", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n \"}", + "openlineage_run_facet_extractionError": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": 
\"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/ExtractionErrorRunFacet\", \"errors\": [{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"errorMessage\": \"Can only generate column-level lineage for select-like inner statements, not (outer statement type: )\", \"task\": \"datahub_sql_parser\"}], \"failedTasks\": 1, \"totalTasks\": 1}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=create_cost_table", + "name": "create_cost_table", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": 
"globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fbeed1180fa0434e02ac6f75ace87869", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696057246734, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "_access_control": "None", + "catchup": "False", + "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/sqlite_operator.py'", + "is_paused_upon_creation": "None", + "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", + "tags": "[]", + "timezone": "Timezone('UTC')" + }, + "externalUrl": "http://airflow.example.com/tree?dag_id=sqlite_operator", + "name": "sqlite_operator" + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + 
"customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'populate_cost_table'", + "execution_timeout": "None", + "sla": "None", + "sql": "\"\\n INSERT INTO costs (id, month, total_cost, area)\\n VALUES\\n (1, '2021-01', 100, 10),\\n (2, '2021-02', 200, 20),\\n (3, '2021-03', 300, 30)\\n \"", + "task_id": "'populate_cost_table'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "['transform_cost_table']", + "inlets": "[]", + "outlets": "[]", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n INSERT INTO costs (id, month, total_cost, area)\\n VALUES\\n (1, '2021-01', 100, 10),\\n (2, '2021-02', 200, 20),\\n (3, '2021-03', 300, 30)\\n \"}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=populate_cost_table", + "name": "populate_cost_table", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + 
"changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:04e1badac1eacd1c41123d07f579fa92", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-09-30 07:00:49.653938+00:00", + "end_date": "None", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "SqliteOperator", + "priority_weight": "4", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=populate_cost_table&dag_id=sqlite_operator&map_index=-1" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=populate_cost_table&dag_id=sqlite_operator&map_index=-1", + "name": "sqlite_operator_populate_cost_table_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1696057249653, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:04e1badac1eacd1c41123d07f579fa92", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "upstreamInstances": [] + } + } +}, +{ + 
"entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:04e1badac1eacd1c41123d07f579fa92", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:04e1badac1eacd1c41123d07f579fa92", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696057249653, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'populate_cost_table'", + "execution_timeout": "None", + "sla": "None", + "sql": "\"\\n INSERT INTO costs (id, month, total_cost, area)\\n VALUES\\n (1, '2021-01', 100, 10),\\n (2, '2021-02', 200, 20),\\n (3, '2021-03', 300, 30)\\n \"", + "task_id": "'populate_cost_table'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "['transform_cost_table']", + "inlets": "[]", + "outlets": "[]", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n INSERT INTO costs (id, month, total_cost, area)\\n VALUES\\n (1, '2021-01', 100, 
10),\\n (2, '2021-02', 200, 20),\\n (3, '2021-03', 300, 30)\\n \"}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=populate_cost_table", + "name": "populate_cost_table", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:04e1badac1eacd1c41123d07f579fa92", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696057250831, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": 
"FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "_access_control": "None", + "catchup": "False", + "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/sqlite_operator.py'", + "is_paused_upon_creation": "None", + "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", + "tags": "[]", + "timezone": "Timezone('UTC')" + }, + "externalUrl": "http://airflow.example.com/tree?dag_id=sqlite_operator", + "name": "sqlite_operator" + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'transform_cost_table'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n CREATE TABLE IF NOT EXISTS processed_costs AS\\n SELECT\\n id,\\n month,\\n total_cost,\\n area,\\n total_cost / area as cost_per_area\\n FROM costs\\n '", + "task_id": "'transform_cost_table'", + "trigger_rule": "", + 
"wait_for_downstream": "False", + "downstream_task_ids": "['cleanup_costs', 'cleanup_processed_costs']", + "inlets": "[]", + "outlets": "[]", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS processed_costs AS\\n SELECT\\n id,\\n month,\\n total_cost,\\n area,\\n total_cost / area as cost_per_area\\n FROM costs\\n \"}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=transform_cost_table", + "name": "transform_cost_table", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)" + ], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)" + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),month)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),month)" + ], + 
"confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),total_cost)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),area)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),cost_per_area)" + ], + "confidenceScore": 1.0 + } + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": 
"urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:64e5ff8f552e857b607832731e09808b", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-09-30 07:00:53.989264+00:00", + "end_date": "None", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "SqliteOperator", + "priority_weight": "3", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=transform_cost_table&dag_id=sqlite_operator&map_index=-1" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=transform_cost_table&dag_id=sqlite_operator&map_index=-1", + "name": "sqlite_operator_transform_cost_table_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1696057253989, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:64e5ff8f552e857b607832731e09808b", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:64e5ff8f552e857b607832731e09808b", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + 
"urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:64e5ff8f552e857b607832731e09808b", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:64e5ff8f552e857b607832731e09808b", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696057253989, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'transform_cost_table'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n CREATE TABLE IF NOT EXISTS processed_costs AS\\n SELECT\\n id,\\n month,\\n total_cost,\\n area,\\n total_cost / area as cost_per_area\\n FROM costs\\n '", + "task_id": "'transform_cost_table'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "['cleanup_costs', 'cleanup_processed_costs']", + "inlets": "[]", + "outlets": "[]", + 
"openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS processed_costs AS\\n SELECT\\n id,\\n month,\\n total_cost,\\n area,\\n total_cost / area as cost_per_area\\n FROM costs\\n \"}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=transform_cost_table", + "name": "transform_cost_table", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)" + ], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)" + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),month)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),month)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),total_cost)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),area)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),cost_per_area)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),month)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),month)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + "downstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),total_cost)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),area)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),cost_per_area)" + ], + "confidenceScore": 1.0 + } + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "globalTags", + 
"aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:64e5ff8f552e857b607832731e09808b", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696057255628, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "_access_control": "None", + "catchup": "False", + "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/sqlite_operator.py'", + "is_paused_upon_creation": "None", + "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", + "tags": "[]", + "timezone": "Timezone('UTC')" + }, + "externalUrl": "http://airflow.example.com/tree?dag_id=sqlite_operator", + "name": "sqlite_operator" + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + 
"depends_on_past": "False", + "email": "None", + "label": "'cleanup_costs'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n DROP TABLE costs\\n '", + "task_id": "'cleanup_costs'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]", + "datahub_sql_parser_error": "Can only generate column-level lineage for select-like inner statements, not (outer statement type: )", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n DROP TABLE costs\\n \"}", + "openlineage_run_facet_extractionError": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/ExtractionErrorRunFacet\", \"errors\": [{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"errorMessage\": \"Can only generate column-level lineage for select-like inner statements, not (outer statement type: )\", \"task\": \"datahub_sql_parser\"}], \"failedTasks\": 1, \"totalTasks\": 1}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=cleanup_costs", + "name": "cleanup_costs", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + 
"outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:07285de22276959612189d51336cc21a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-09-30 07:01:00.421177+00:00", + "end_date": "None", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "SqliteOperator", + "priority_weight": "1", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=cleanup_costs&dag_id=sqlite_operator&map_index=-1" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=cleanup_costs&dag_id=sqlite_operator&map_index=-1", + "name": "sqlite_operator_cleanup_costs_manual_run_test", 
+ "type": "BATCH_AD_HOC", + "created": { + "time": 1696057260421, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:07285de22276959612189d51336cc21a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:07285de22276959612189d51336cc21a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:07285de22276959612189d51336cc21a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696057260421, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'cleanup_costs'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n DROP TABLE costs\\n '", + "task_id": "'cleanup_costs'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]", + "datahub_sql_parser_error": "Can only 
generate column-level lineage for select-like inner statements, not (outer statement type: )", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n DROP TABLE costs\\n \"}", + "openlineage_run_facet_extractionError": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/ExtractionErrorRunFacet\", \"errors\": [{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"errorMessage\": \"Can only generate column-level lineage for select-like inner statements, not (outer statement type: )\", \"task\": \"datahub_sql_parser\"}], \"failedTasks\": 1, \"totalTasks\": 1}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=cleanup_costs", + "name": "cleanup_costs", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + 
"json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:07285de22276959612189d51336cc21a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696057262258, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "_access_control": "None", + "catchup": "False", + "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/sqlite_operator.py'", + "is_paused_upon_creation": "None", + "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", + "tags": "[]", + "timezone": "Timezone('UTC')" + }, + "externalUrl": "http://airflow.example.com/tree?dag_id=sqlite_operator", + "name": "sqlite_operator" + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ 
+ { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'cleanup_processed_costs'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n DROP TABLE processed_costs\\n '", + "task_id": "'cleanup_processed_costs'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]", + "datahub_sql_parser_error": "Can only generate column-level lineage for select-like inner statements, not (outer statement type: )", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n DROP TABLE processed_costs\\n \"}", + "openlineage_run_facet_extractionError": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/ExtractionErrorRunFacet\", \"errors\": [{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"errorMessage\": \"Can only generate 
column-level lineage for select-like inner statements, not (outer statement type: )\", \"task\": \"datahub_sql_parser\"}], \"failedTasks\": 1, \"totalTasks\": 1}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=cleanup_processed_costs", + "name": "cleanup_processed_costs", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)" + ], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:bab908abccf3cd6607b50fdaf3003372", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + 
"aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-09-30 07:01:05.540192+00:00", + "end_date": "None", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "SqliteOperator", + "priority_weight": "1", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=cleanup_processed_costs&dag_id=sqlite_operator&map_index=-1" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=cleanup_processed_costs&dag_id=sqlite_operator&map_index=-1", + "name": "sqlite_operator_cleanup_processed_costs_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1696057265540, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:bab908abccf3cd6607b50fdaf3003372", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:bab908abccf3cd6607b50fdaf3003372", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:bab908abccf3cd6607b50fdaf3003372", + "changeType": "UPSERT", + "aspectName": 
"dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696057265540, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'cleanup_processed_costs'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n DROP TABLE processed_costs\\n '", + "task_id": "'cleanup_processed_costs'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]", + "datahub_sql_parser_error": "Can only generate column-level lineage for select-like inner statements, not (outer statement type: )", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n DROP TABLE processed_costs\\n \"}", + "openlineage_run_facet_extractionError": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/ExtractionErrorRunFacet\", \"errors\": [{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"errorMessage\": \"Can only generate column-level lineage for select-like inner statements, not (outer statement type: )\", \"task\": \"datahub_sql_parser\"}], \"failedTasks\": 1, \"totalTasks\": 1}" + }, + 
"externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=cleanup_processed_costs", + "name": "cleanup_processed_costs", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)" + ], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:bab908abccf3cd6607b50fdaf3003372", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696057267631, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", 
+ "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/integration_test_dummy.py b/metadata-ingestion-modules/airflow-plugin/tests/integration/integration_test_dummy.py deleted file mode 100644 index 10cf3ad0a608a..0000000000000 --- a/metadata-ingestion-modules/airflow-plugin/tests/integration/integration_test_dummy.py +++ /dev/null @@ -1,2 +0,0 @@ -def test_dummy(): - pass diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/test_plugin.py b/metadata-ingestion-modules/airflow-plugin/tests/integration/test_plugin.py new file mode 100644 index 0000000000000..a2b7fd151a1e4 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/test_plugin.py @@ -0,0 +1,392 @@ +import contextlib +import dataclasses +import functools +import logging +import os +import pathlib +import random +import signal +import subprocess +import time +from typing import Iterator, Sequence + +import pytest +import requests +import tenacity +from airflow.models.connection import Connection +from datahub.testing.compare_metadata_json import assert_metadata_files_equal + +from datahub_airflow_plugin._airflow_shims import ( + HAS_AIRFLOW_DAG_LISTENER_API, + HAS_AIRFLOW_LISTENER_API, + HAS_AIRFLOW_STANDALONE_CMD, +) + +pytestmark = pytest.mark.integration + +logger = logging.getLogger(__name__) +IS_LOCAL = os.environ.get("CI", "false") == "false" + +DAGS_FOLDER = pathlib.Path(__file__).parent / "dags" +GOLDENS_FOLDER = pathlib.Path(__file__).parent / "goldens" + + +@dataclasses.dataclass +class AirflowInstance: + airflow_home: pathlib.Path + airflow_port: int + pid: int + env_vars: dict + + username: str + password: str + + metadata_file: pathlib.Path + + @property + def airflow_url(self) -> str: + return f"http://localhost:{self.airflow_port}" + + @functools.cached_property + def session(self) -> requests.Session: + session = 
requests.Session() + session.auth = (self.username, self.password) + return session + + +@tenacity.retry( + reraise=True, + wait=tenacity.wait_fixed(1), + stop=tenacity.stop_after_delay(60), + retry=tenacity.retry_if_exception_type( + (AssertionError, requests.exceptions.RequestException) + ), +) +def _wait_for_airflow_healthy(airflow_port: int) -> None: + print("Checking if Airflow is ready...") + res = requests.get(f"http://localhost:{airflow_port}/health", timeout=5) + res.raise_for_status() + + airflow_health = res.json() + assert airflow_health["metadatabase"]["status"] == "healthy" + assert airflow_health["scheduler"]["status"] == "healthy" + + +class NotReadyError(Exception): + pass + + +@tenacity.retry( + reraise=True, + wait=tenacity.wait_fixed(1), + stop=tenacity.stop_after_delay(90), + retry=tenacity.retry_if_exception_type(NotReadyError), +) +def _wait_for_dag_finish( + airflow_instance: AirflowInstance, dag_id: str, require_success: bool +) -> None: + print("Checking if DAG is finished") + res = airflow_instance.session.get( + f"{airflow_instance.airflow_url}/api/v1/dags/{dag_id}/dagRuns", timeout=5 + ) + res.raise_for_status() + + dag_runs = res.json()["dag_runs"] + if not dag_runs: + raise NotReadyError("No DAG runs found") + + dag_run = dag_runs[0] + if dag_run["state"] == "failed": + if require_success: + raise ValueError("DAG failed") + # else - success is not required, so we're done. 
+ + elif dag_run["state"] != "success": + raise NotReadyError(f"DAG has not finished yet: {dag_run['state']}") + + +@contextlib.contextmanager +def _run_airflow( + tmp_path: pathlib.Path, dags_folder: pathlib.Path, is_v1: bool +) -> Iterator[AirflowInstance]: + airflow_home = tmp_path / "airflow_home" + print(f"Using airflow home: {airflow_home}") + + if IS_LOCAL: + airflow_port = 11792 + else: + airflow_port = random.randint(10000, 12000) + print(f"Using airflow port: {airflow_port}") + + datahub_connection_name = "datahub_file_default" + meta_file = tmp_path / "datahub_metadata.json" + + environment = { + **os.environ, + "AIRFLOW_HOME": str(airflow_home), + "AIRFLOW__WEBSERVER__WEB_SERVER_PORT": str(airflow_port), + "AIRFLOW__WEBSERVER__BASE_URL": "http://airflow.example.com", + # Point airflow to the DAGs folder. + "AIRFLOW__CORE__LOAD_EXAMPLES": "False", + "AIRFLOW__CORE__DAGS_FOLDER": str(dags_folder), + "AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION": "False", + # Have the Airflow API use username/password authentication. + "AIRFLOW__API__AUTH_BACKEND": "airflow.api.auth.backend.basic_auth", + # Configure the datahub plugin and have it write the MCPs to a file. + "AIRFLOW__CORE__LAZY_LOAD_PLUGINS": "False" if is_v1 else "True", + "AIRFLOW__DATAHUB__CONN_ID": datahub_connection_name, + f"AIRFLOW_CONN_{datahub_connection_name.upper()}": Connection( + conn_id="datahub_file_default", + conn_type="datahub-file", + host=str(meta_file), + ).get_uri(), + # Configure fake credentials for the Snowflake connection. 
+ "AIRFLOW_CONN_MY_SNOWFLAKE": Connection( + conn_id="my_snowflake", + conn_type="snowflake", + login="fake_username", + password="fake_password", + schema="DATAHUB_TEST_SCHEMA", + extra={ + "account": "fake_account", + "database": "DATAHUB_TEST_DATABASE", + "warehouse": "fake_warehouse", + "role": "fake_role", + "insecure_mode": "true", + }, + ).get_uri(), + "AIRFLOW_CONN_MY_SQLITE": Connection( + conn_id="my_sqlite", + conn_type="sqlite", + host=str(tmp_path / "my_sqlite.db"), + ).get_uri(), + # Convenience settings. + "AIRFLOW__DATAHUB__LOG_LEVEL": "DEBUG", + "AIRFLOW__DATAHUB__DEBUG_EMITTER": "True", + "SQLALCHEMY_SILENCE_UBER_WARNING": "1", + } + + if not HAS_AIRFLOW_STANDALONE_CMD: + raise pytest.skip("Airflow standalone command is not available") + + # Start airflow in a background subprocess. + airflow_process = subprocess.Popen( + ["airflow", "standalone"], + env=environment, + ) + + try: + _wait_for_airflow_healthy(airflow_port) + print("Airflow is ready!") + + # Sleep for a few seconds to make sure the other Airflow processes are ready. + time.sleep(3) + + # Create an extra "airflow" user for easy testing. + if IS_LOCAL: + print("Creating an extra test user...") + subprocess.check_call( + [ + # fmt: off + "airflow", "users", "create", + "--username", "airflow", + "--password", "airflow", + "--firstname", "admin", + "--lastname", "admin", + "--role", "Admin", + "--email", "airflow@example.com", + # fmt: on + ], + env=environment, + ) + + # Sanity check that the plugin got loaded. + if not is_v1: + print("[debug] Listing loaded plugins") + subprocess.check_call( + ["airflow", "plugins", "-v"], + env=environment, + ) + + # Load the admin user's password. This is generated by the + # `airflow standalone` command, and is different from the + # airflow user that we create when running locally. 
+ airflow_username = "admin" + airflow_password = (airflow_home / "standalone_admin_password.txt").read_text() + + airflow_instance = AirflowInstance( + airflow_home=airflow_home, + airflow_port=airflow_port, + pid=airflow_process.pid, + env_vars=environment, + username=airflow_username, + password=airflow_password, + metadata_file=meta_file, + ) + + yield airflow_instance + finally: + try: + # Attempt a graceful shutdown. + print("Shutting down airflow...") + airflow_process.send_signal(signal.SIGINT) + airflow_process.wait(timeout=30) + except subprocess.TimeoutExpired: + # If the graceful shutdown failed, kill the process. + print("Hard shutting down airflow...") + airflow_process.kill() + airflow_process.wait(timeout=3) + + +def check_golden_file( + pytestconfig: pytest.Config, + output_path: pathlib.Path, + golden_path: pathlib.Path, + ignore_paths: Sequence[str] = (), +) -> None: + update_golden = pytestconfig.getoption("--update-golden-files") + + assert_metadata_files_equal( + output_path=output_path, + golden_path=golden_path, + update_golden=update_golden, + copy_output=False, + ignore_paths=ignore_paths, + ignore_order=False, + ) + + +@dataclasses.dataclass +class DagTestCase: + dag_id: str + success: bool = True + + v2_only: bool = False + + +test_cases = [ + DagTestCase("simple_dag"), + DagTestCase("basic_iolets"), + DagTestCase("snowflake_operator", success=False, v2_only=True), + DagTestCase("sqlite_operator", v2_only=True), +] + + +@pytest.mark.parametrize( + ["golden_filename", "test_case", "is_v1"], + [ + # On Airflow <= 2.2, test plugin v1. + *[ + pytest.param( + f"v1_{test_case.dag_id}", + test_case, + True, + id=f"v1_{test_case.dag_id}", + marks=pytest.mark.skipif( + HAS_AIRFLOW_LISTENER_API, + reason="Not testing plugin v1 on newer Airflow versions", + ), + ) + for test_case in test_cases + if not test_case.v2_only + ], + *[ + pytest.param( + # On Airflow 2.3-2.4, test plugin v2 without dataFlows. 
+ f"v2_{test_case.dag_id}" + if HAS_AIRFLOW_DAG_LISTENER_API + else f"v2_{test_case.dag_id}_no_dag_listener", + test_case, + False, + id=f"v2_{test_case.dag_id}" + if HAS_AIRFLOW_DAG_LISTENER_API + else f"v2_{test_case.dag_id}_no_dag_listener", + marks=pytest.mark.skipif( + not HAS_AIRFLOW_LISTENER_API, + reason="Cannot test plugin v2 without the Airflow plugin listener API", + ), + ) + for test_case in test_cases + ], + ], +) +def test_airflow_plugin( + pytestconfig: pytest.Config, + tmp_path: pathlib.Path, + golden_filename: str, + test_case: DagTestCase, + is_v1: bool, +) -> None: + # This test: + # - Configures the plugin. + # - Starts a local airflow instance in a subprocess. + # - Runs a DAG that uses an operator supported by the extractor. + # - Waits for the DAG to complete. + # - Validates the metadata generated against a golden file. + + if not is_v1 and not test_case.success and not HAS_AIRFLOW_DAG_LISTENER_API: + # Saw a number of issues in CI where this would fail to emit the last events + # due to an error in the SQLAlchemy listener. This never happened locally for me. + pytest.skip("Cannot test failure cases without the Airflow DAG listener API") + + golden_path = GOLDENS_FOLDER / f"{golden_filename}.json" + dag_id = test_case.dag_id + + with _run_airflow( + tmp_path, dags_folder=DAGS_FOLDER, is_v1=is_v1 + ) as airflow_instance: + print(f"Running DAG {dag_id}...") + subprocess.check_call( + [ + "airflow", + "dags", + "trigger", + "--exec-date", + "2023-09-27T21:34:38+00:00", + "-r", + "manual_run_test", + dag_id, + ], + env=airflow_instance.env_vars, + ) + + print("Waiting for DAG to finish...") + _wait_for_dag_finish( + airflow_instance, dag_id, require_success=test_case.success + ) + + print("Sleeping for a few seconds to let the plugin finish...") + time.sleep(10) + + check_golden_file( + pytestconfig=pytestconfig, + output_path=airflow_instance.metadata_file, + golden_path=golden_path, + ignore_paths=[ + # Timing-related items. 
+ r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['start_date'\]", + r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['end_date'\]", + r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['duration'\]", + # Host-specific items. + r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['pid'\]", + r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['hostname'\]", + r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['unixname'\]", + # TODO: If we switched to Git urls, maybe we could get this to work consistently. + r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['fileloc'\]", + r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['openlineage_.*'\]", + ], + ) + + +if __name__ == "__main__": + # When run directly, just set up a local airflow instance. + import tempfile + + with _run_airflow( + tmp_path=pathlib.Path(tempfile.mkdtemp("airflow-plugin-test")), + dags_folder=DAGS_FOLDER, + is_v1=not HAS_AIRFLOW_LISTENER_API, + ) as airflow_instance: + # input("Press enter to exit...") + breakpoint() + print("quitting airflow") diff --git a/metadata-ingestion-modules/airflow-plugin/tests/unit/test_airflow.py b/metadata-ingestion-modules/airflow-plugin/tests/unit/test_airflow.py index 9aa901171cfa6..d8620e74d7e30 100644 --- a/metadata-ingestion-modules/airflow-plugin/tests/unit/test_airflow.py +++ b/metadata-ingestion-modules/airflow-plugin/tests/unit/test_airflow.py @@ -14,18 +14,21 @@ import pytest from airflow.lineage import apply_lineage, prepare_lineage from airflow.models import DAG, Connection, DagBag, DagRun, TaskInstance -from datahub_provider import get_provider_info -from datahub_provider._airflow_shims import AIRFLOW_PATCHED, EmptyOperator -from datahub_provider.entities import Dataset, Urn -from datahub_provider.hooks.datahub import DatahubKafkaHook, DatahubRestHook -from datahub_provider.operators.datahub import DatahubEmitterOperator + +from datahub_airflow_plugin import get_provider_info +from 
datahub_airflow_plugin._airflow_shims import ( + AIRFLOW_PATCHED, + AIRFLOW_VERSION, + EmptyOperator, +) +from datahub_airflow_plugin.entities import Dataset, Urn +from datahub_airflow_plugin.hooks.datahub import DatahubKafkaHook, DatahubRestHook +from datahub_airflow_plugin.operators.datahub import DatahubEmitterOperator assert AIRFLOW_PATCHED # TODO: Remove default_view="tree" arg. Figure out why is default_view being picked as "grid" and how to fix it ? -# Approach suggested by https://stackoverflow.com/a/11887885/5004662. -AIRFLOW_VERSION = packaging.version.parse(airflow.version.version) lineage_mce = builder.make_lineage_mce( [ @@ -105,7 +108,7 @@ def test_datahub_rest_hook(mock_emitter): mock_emitter.assert_called_once_with(config.host, None, None) instance = mock_emitter.return_value - instance.emit_mce.assert_called_with(lineage_mce) + instance.emit.assert_called_with(lineage_mce) @mock.patch("datahub.emitter.rest_emitter.DatahubRestEmitter", autospec=True) @@ -119,7 +122,7 @@ def test_datahub_rest_hook_with_timeout(mock_emitter): mock_emitter.assert_called_once_with(config.host, None, 5) instance = mock_emitter.return_value - instance.emit_mce.assert_called_with(lineage_mce) + instance.emit.assert_called_with(lineage_mce) @mock.patch("datahub.emitter.kafka_emitter.DatahubKafkaEmitter", autospec=True) @@ -131,11 +134,11 @@ def test_datahub_kafka_hook(mock_emitter): mock_emitter.assert_called_once() instance = mock_emitter.return_value - instance.emit_mce_async.assert_called() + instance.emit.assert_called() instance.flush.assert_called_once() -@mock.patch("datahub_provider.hooks.datahub.DatahubRestHook.emit_mces") +@mock.patch("datahub_provider.hooks.datahub.DatahubRestHook.emit") def test_datahub_lineage_operator(mock_emit): with patch_airflow_connection(datahub_rest_connection_config) as config: assert config.conn_id diff --git a/metadata-ingestion-modules/airflow-plugin/tests/unit/test_dummy.py 
b/metadata-ingestion-modules/airflow-plugin/tests/unit/test_dummy.py deleted file mode 100644 index 10cf3ad0a608a..0000000000000 --- a/metadata-ingestion-modules/airflow-plugin/tests/unit/test_dummy.py +++ /dev/null @@ -1,2 +0,0 @@ -def test_dummy(): - pass diff --git a/metadata-ingestion-modules/airflow-plugin/tests/unit/test_packaging.py b/metadata-ingestion-modules/airflow-plugin/tests/unit/test_packaging.py new file mode 100644 index 0000000000000..1d0ce5835f958 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/unit/test_packaging.py @@ -0,0 +1,8 @@ +import setuptools + + +def test_package_list_match_inits(): + where = "./src" + package_list = set(setuptools.find_packages(where)) + namespace_packages = set(setuptools.find_namespace_packages(where)) + assert package_list == namespace_packages, "are you missing a package init file?" diff --git a/metadata-ingestion-modules/airflow-plugin/tox.ini b/metadata-ingestion-modules/airflow-plugin/tox.ini index 6a1c06aed8cdd..2f05854940d10 100644 --- a/metadata-ingestion-modules/airflow-plugin/tox.ini +++ b/metadata-ingestion-modules/airflow-plugin/tox.ini @@ -4,32 +4,23 @@ # and then run "tox" from this directory. [tox] -envlist = py3-quick,py3-full - -[gh-actions] -python = - 3.6: py3-full - 3.9: py3-full - -# Providing optional features that add dependencies from setup.py as deps here -# allows tox to recreate testenv when new dependencies are added to setup.py. -# Previous approach of using the tox global setting extras is not recommended -# as extras is only called when the testenv is created for the first time! 
-# see more here -> https://github.com/tox-dev/tox/issues/1105#issuecomment-448596282 +envlist = py38-airflow21, py38-airflow22, py310-airflow24, py310-airflow26, py310-airflow27 [testenv] -deps = - -e ../../metadata-ingestion/[.dev] +use_develop = true +extras = dev,integration-tests,plugin-v1 +deps = + -e ../../metadata-ingestion/ + # Airflow version + airflow21: apache-airflow~=2.1.0 + airflow22: apache-airflow~=2.2.0 + airflow24: apache-airflow~=2.4.0 + airflow26: apache-airflow~=2.6.0 + airflow27: apache-airflow~=2.7.0 commands = - pytest --cov={envsitepackagesdir}/datahub --cov={envsitepackagesdir}/datahub_provider \ - py3-quick: -m 'not integration and not slow_integration' --junit-xml=junit.quick.xml \ - py3-full: --cov-fail-under 65 --junit-xml=junit.full.xml \ - --continue-on-collection-errors \ - -vv + pytest --cov-append {posargs} -setenv = - AIRFLOW_HOME = /tmp/airflow/thisshouldnotexist-{envname} +# For Airflow 2.4+, add the plugin-v2 extra. +[testenv:py310-airflow{24,26,27}] +extras = dev,integration-tests,plugin-v2 -[testenv:py3-full] -deps = - ../../metadata-ingestion/.[dev] diff --git a/metadata-ingestion/build.gradle b/metadata-ingestion/build.gradle index ea7990ab9c660..0d8de625ec709 100644 --- a/metadata-ingestion/build.gradle +++ b/metadata-ingestion/build.gradle @@ -12,7 +12,7 @@ if (!project.hasProperty("extra_pip_requirements")) { } def get_coverage_arg(test_name) { - return "--cov-report term --cov-report xml:coverage_${test_name}.xml " + return "--cov-report xml:coverage_${test_name}.xml " } task checkPythonVersion(type: Exec) { @@ -138,7 +138,7 @@ task testQuick(type: Exec, dependsOn: [installDev, ':metadata-models:generateJso outputs.dir("${venv_name}") def cvg_arg = get_coverage_arg("quick") commandLine 'bash', '-c', - "source ${venv_name}/bin/activate && pytest ${cvg_arg} --durations=20 -m 'not integration and not integration_batch_1 and not slow_integration' -vv --continue-on-collection-errors --junit-xml=junit.quick.xml" + "source 
${venv_name}/bin/activate && pytest ${cvg_arg} tests/unit --durations=20 -m 'not integration' -vv --continue-on-collection-errors --junit-xml=junit.quick.xml" } task installDevTest(type: Exec, dependsOn: [install]) { @@ -164,27 +164,25 @@ task testSingle(dependsOn: [installDevTest]) { } } -task testIntegration(type: Exec, dependsOn: [installDevTest]) { - def cvg_arg = get_coverage_arg("int") +task testIntegrationBatch0(type: Exec, dependsOn: [installDevTest]) { + def cvg_arg = get_coverage_arg("intBatch0") commandLine 'bash', '-c', - "source ${venv_name}/bin/activate && pytest ${cvg_arg} --durations=50 -m 'integration' -vv --continue-on-collection-errors --junit-xml=junit.integration.xml" + "source ${venv_name}/bin/activate && pytest ${cvg_arg} --durations=50 -m 'integration_batch_0' -vv --continue-on-collection-errors --junit-xml=junit.integrationbatch0.xml" } - task testIntegrationBatch1(type: Exec, dependsOn: [installDevTest]) { def cvg_arg = get_coverage_arg("intBatch1") commandLine 'bash', '-c', "source ${venv_name}/bin/activate && pytest ${cvg_arg} --durations=50 -m 'integration_batch_1' -vv --continue-on-collection-errors --junit-xml=junit.integrationbatch1.xml" } - -task testFull(type: Exec, dependsOn: [installDevTest]) { +task testIntegrationBatch2(type: Exec, dependsOn: [installDevTest]) { + def cvg_arg = get_coverage_arg("intBatch2") commandLine 'bash', '-c', - "source ${venv_name}/bin/activate && pytest --durations=50 -vv --continue-on-collection-errors --junit-xml=junit.full.xml" + "source ${venv_name}/bin/activate && pytest ${cvg_arg} --durations=20 -m 'integration_batch_2' -vv --continue-on-collection-errors --junit-xml=junit.integrationbatch2.xml" } -task testSlowIntegration(type: Exec, dependsOn: [installDevTest]) { - def cvg_arg = get_coverage_arg("intSlow") +task testFull(type: Exec, dependsOn: [installDevTest]) { commandLine 'bash', '-c', - "source ${venv_name}/bin/activate && pytest ${cvg_arg} --durations=20 -m 'slow_integration' -vv 
--continue-on-collection-errors --junit-xml=junit.slow.integration.xml" + "source ${venv_name}/bin/activate && pytest --durations=50 -vv --continue-on-collection-errors --junit-xml=junit.full.xml" } task specGen(type: Exec, dependsOn: [codegen, installDevTest]) { diff --git a/metadata-ingestion/developing.md b/metadata-ingestion/developing.md index f529590e2ab39..d5f834936cdcf 100644 --- a/metadata-ingestion/developing.md +++ b/metadata-ingestion/developing.md @@ -36,6 +36,7 @@ cd metadata-ingestion-modules/airflow-plugin source venv/bin/activate datahub version # should print "DataHub CLI version: unavailable (installed in develop mode)" ``` + ### Common setup issues Common issues (click to expand): @@ -111,6 +112,7 @@ mypy src/ tests/ ``` or you can run from root of the repository + ```shell ./gradlew :metadata-ingestion:lintFix ``` @@ -178,14 +180,11 @@ pip install -e '.[integration-tests]' pytest -vv # Run unit tests. -pytest -m 'not integration and not slow_integration' +pytest -m 'not integration' # Run Docker-based integration tests. pytest -m 'integration' -# Run Docker-based slow integration tests. -pytest -m 'slow_integration' - # You can also run these steps via the gradle build: ../gradlew :metadata-ingestion:lint ../gradlew :metadata-ingestion:lintFix diff --git a/metadata-ingestion/docs/sources/athena/athena_pre.md b/metadata-ingestion/docs/sources/athena/athena_pre.md new file mode 100644 index 0000000000000..a56457d3f84fc --- /dev/null +++ b/metadata-ingestion/docs/sources/athena/athena_pre.md @@ -0,0 +1,72 @@ +### Prerequisites + +In order to execute this source, you will need to create a policy with the permissions below and attach it to the AWS role or credentials used in the ingestion recipe.
+ +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "VisualEditor0", + "Effect": "Allow", + "Action": [ + "athena:GetTableMetadata", + "athena:StartQueryExecution", + "athena:GetQueryResults", + "athena:GetDatabase", + "athena:ListDataCatalogs", + "athena:GetDataCatalog", + "athena:ListQueryExecutions", + "athena:GetWorkGroup", + "athena:StopQueryExecution", + "athena:GetQueryResultsStream", + "athena:ListDatabases", + "athena:GetQueryExecution", + "athena:ListTableMetadata", + "athena:BatchGetQueryExecution", + "glue:GetTables", + "glue:GetDatabases", + "glue:GetTable", + "glue:GetDatabase", + "glue:SearchTables", + "glue:GetTableVersions", + "glue:GetTableVersion", + "glue:GetPartition", + "glue:GetPartitions", + "s3:GetObject", + "s3:ListBucket", + "s3:GetBucketLocation" + ], + "Resource": [ + "arn:aws:athena:${region-id}:${account-id}:datacatalog/*", + "arn:aws:athena:${region-id}:${account-id}:workgroup/*", + "arn:aws:glue:${region-id}:${account-id}:tableVersion/*/*/*", + "arn:aws:glue:${region-id}:${account-id}:table/*/*", + "arn:aws:glue:${region-id}:${account-id}:catalog", + "arn:aws:glue:${region-id}:${account-id}:database/*", + "arn:aws:s3:::${datasets-bucket}", + "arn:aws:s3:::${datasets-bucket}/*" + ] + }, + { + "Sid": "VisualEditor1", + "Effect": "Allow", + "Action": [ + "s3:PutObject", + "s3:GetObject", + "s3:ListBucketMultipartUploads", + "s3:AbortMultipartUpload", + "s3:ListBucket", + "s3:GetBucketLocation", + "s3:ListMultipartUploadParts" + ], + "Resource": [ + "arn:aws:s3:::${athena-query-result-bucket}/*", + "arn:aws:s3:::${athena-query-result-bucket}" + ] + } + ] +} +``` + +Replace `${var}` with appropriate values as per your Athena setup.
\ No newline at end of file diff --git a/metadata-ingestion/docs/sources/databricks/unity-catalog_pre.md b/metadata-ingestion/docs/sources/databricks/unity-catalog_pre.md index 2be8846b87bea..ae2883343d7e8 100644 --- a/metadata-ingestion/docs/sources/databricks/unity-catalog_pre.md +++ b/metadata-ingestion/docs/sources/databricks/unity-catalog_pre.md @@ -13,6 +13,7 @@ * Ownership of or `SELECT` privilege on any tables and views you want to ingest * [Ownership documentation](https://docs.databricks.com/data-governance/unity-catalog/manage-privileges/ownership.html) * [Privileges documentation](https://docs.databricks.com/data-governance/unity-catalog/manage-privileges/privileges.html) + + To ingest your workspace's notebooks and respective lineage, your service principal must have `CAN_READ` privileges on the folders containing the notebooks you want to ingest: [guide](https://docs.databricks.com/en/security/auth-authz/access-control/workspace-acl.html#folder-permissions). + To `include_usage_statistics` (enabled by default), your service principal must have `CAN_MANAGE` permissions on any SQL Warehouses you want to ingest: [guide](https://docs.databricks.com/security/auth-authz/access-control/sql-endpoint-acl.html). + To ingest `profiling` information with `call_analyze` (enabled by default), your service principal must have ownership or `MODIFY` privilege on any tables you want to profile. * Alternatively, you can run [ANALYZE TABLE](https://docs.databricks.com/sql/language-manual/sql-ref-syntax-aux-analyze-table.html) yourself on any tables you want to profile, then set `call_analyze` to `false`. 
diff --git a/metadata-ingestion/docs/sources/dbt/dbt-cloud_recipe.yml b/metadata-ingestion/docs/sources/dbt/dbt-cloud_recipe.yml index 113303cfc1ad4..ef0776b189ca9 100644 --- a/metadata-ingestion/docs/sources/dbt/dbt-cloud_recipe.yml +++ b/metadata-ingestion/docs/sources/dbt/dbt-cloud_recipe.yml @@ -6,14 +6,14 @@ source: # In the URL https://cloud.getdbt.com/next/deploy/107298/projects/175705/jobs/148094, # 107298 is the account_id, 175705 is the project_id, and 148094 is the job_id - account_id: # set to your dbt cloud account id - project_id: # set to your dbt cloud project id - job_id: # set to your dbt cloud job id + account_id: "${DBT_ACCOUNT_ID}" # set to your dbt cloud account id + project_id: "${DBT_PROJECT_ID}" # set to your dbt cloud project id + job_id: "${DBT_JOB_ID}" # set to your dbt cloud job id run_id: # set to your dbt cloud run id. This is optional, and defaults to the latest run target_platform: postgres # Options - target_platform: "my_target_platform_id" # e.g. bigquery/postgres/etc. + target_platform: "${TARGET_PLATFORM_ID}" # e.g. bigquery/postgres/etc. # sink configs diff --git a/metadata-ingestion/docs/sources/dbt/dbt.md b/metadata-ingestion/docs/sources/dbt/dbt.md index bfc3ebd5bb350..43ced13c3b1f8 100644 --- a/metadata-ingestion/docs/sources/dbt/dbt.md +++ b/metadata-ingestion/docs/sources/dbt/dbt.md @@ -38,6 +38,12 @@ meta_mapping: operation: "add_terms" config: separator: "," + documentation_link: + match: '(?:https?)?\:\/\/\w*[^#]*' + operation: "add_doc_link" + config: + link: "{{ $match }}" + description: "Documentation Link" column_meta_mapping: terms_list: match: ".*" @@ -57,6 +63,7 @@ We support the following operations: 2. add_term - Requires `term` property in config. 3. add_terms - Accepts an optional `separator` property in config. 4. add_owner - Requires `owner_type` property in config which can be either user or group.
Optionally accepts the `owner_category` config property which you can set to one of `['TECHNICAL_OWNER', 'BUSINESS_OWNER', 'DATA_STEWARD', 'DATAOWNER'` (defaults to `DATAOWNER`). +5. add_doc_link - Requires `link` and `description` properties in config. Upon ingestion run, this will overwrite current links in the institutional knowledge section with this new link. The anchor text is defined here in the meta_mappings as `description`. Note: diff --git a/metadata-ingestion/docs/sources/dynamodb/dynamodb_post.md b/metadata-ingestion/docs/sources/dynamodb/dynamodb_post.md index 7f9a0324c7bc6..a1c0a6e2d4d21 100644 --- a/metadata-ingestion/docs/sources/dynamodb/dynamodb_post.md +++ b/metadata-ingestion/docs/sources/dynamodb/dynamodb_post.md @@ -1,21 +1,18 @@ -## Limitations - -For each region, the list table operation returns maximum number 100 tables, we need to further improve it by implementing pagination for listing tables - ## Advanced Configurations ### Using `include_table_item` config -If there are items that have most representative fields of the table, user could use the `include_table_item` option to provide a list of primary keys of a table in dynamodb format, those items from given primary keys will be included when we scan the table. +If there are items that have most representative fields of the table, users could use the `include_table_item` option to provide a list of primary keys of the table in dynamodb format. We include these items in addition to the first 100 items in the table when we scan it. 
-Take [AWS DynamoDB Developer Guide Example tables and data](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/AppendixSampleTables.html) as an example, if user has a table `Reply` with composite primary key `Id` and `ReplyDateTime`, user can use `include_table_item` to include 2 items as following: +Take [AWS DynamoDB Developer Guide Example tables and data](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/AppendixSampleTables.html) as an example, if an account has a table `Reply` in the `us-west-2` region with composite primary key `Id` and `ReplyDateTime`, users can use `include_table_item` to include 2 items as follows: Example: ```yml -# put the table name and composite key in DynamoDB format +# The table name should be in the format of region.table_name +# The primary keys should be in the DynamoDB format include_table_item: - Reply: + us-west-2.Reply: [ { "ReplyDateTime": { "S": "2015-09-22T19:58:22.947Z" }, diff --git a/metadata-ingestion/docs/sources/dynamodb/dynamodb_pre.md b/metadata-ingestion/docs/sources/dynamodb/dynamodb_pre.md index a48e8d5be04aa..598d0ecdb3786 100644 --- a/metadata-ingestion/docs/sources/dynamodb/dynamodb_pre.md +++ b/metadata-ingestion/docs/sources/dynamodb/dynamodb_pre.md @@ -1,8 +1,8 @@ ### Prerequisities -In order to execute this source, you will need to create access key and secret keys that have DynamoDB read access. You can create these policies and attach to your account or can ask your account admin to attach these policies to your account. +In order to execute this source, you need to attach the `AmazonDynamoDBReadOnlyAccess` policy to a user in your AWS account. Then create an API access key and secret for the user.
-For access key permissions, you can create a policy with permissions below and attach to your account, you can find more details in [Managing access keys for IAM users](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html) +For a user to be able to create API access key, it needs the following access key permissions. Your AWS account admin can create a policy with these permissions and attach to the user, you can find more details in [Managing access keys for IAM users](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html) ```json { @@ -22,5 +22,3 @@ For access key permissions, you can create a policy with permissions below and a ] } ``` - -For DynamoDB read access, you can simply attach AWS managed policy `AmazonDynamoDBReadOnlyAccess` to your account, you can find more details in [Attaching a policy to an IAM user group](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_groups_manage_attach-policy.html) diff --git a/metadata-ingestion/docs/sources/dynamodb/dynamodb_recipe.yml b/metadata-ingestion/docs/sources/dynamodb/dynamodb_recipe.yml index bd41637907b5c..4f4edc9a7d496 100644 --- a/metadata-ingestion/docs/sources/dynamodb/dynamodb_recipe.yml +++ b/metadata-ingestion/docs/sources/dynamodb/dynamodb_recipe.yml @@ -4,16 +4,14 @@ source: platform_instance: "AWS_ACCOUNT_ID" aws_access_key_id: "${AWS_ACCESS_KEY_ID}" aws_secret_access_key: "${AWS_SECRET_ACCESS_KEY}" - # User could use the below option to provide a list of primary keys of a table in dynamodb format, - # those items from given primary keys will be included when we scan the table. - # For each table we can retrieve up to 16 MB of data, which can contain as many as 100 items. - # We'll enforce the the primary keys list size not to exceed 100 - # The total items we'll try to retrieve in these two scenarios: - # 1. If user don't specify include_table_item: we'll retrieve up to 100 items - # 2. 
If user specifies include_table_item: we'll retrieve up to 100 items plus user specified items in - # the table, with a total not more than 200 items + # + # If there are items that have most representative fields of the table, users could use the + # `include_table_item` option to provide a list of primary keys of the table in dynamodb format. + # For each `region.table`, the list of primary keys can be at most 100. + # We include these items in addition to the first 100 items in the table when we scan it. + # # include_table_item: - # table_name: + # region.table_name: # [ # { # "partition_key_name": { "attribute_type": "attribute_value" }, diff --git a/metadata-ingestion/docs/sources/powerbi/powerbi_pre.md b/metadata-ingestion/docs/sources/powerbi/powerbi_pre.md index 0323e214045ae..fcfae6cd1e6d7 100644 --- a/metadata-ingestion/docs/sources/powerbi/powerbi_pre.md +++ b/metadata-ingestion/docs/sources/powerbi/powerbi_pre.md @@ -40,7 +40,7 @@ PowerBI Source supports M-Query expression for below listed PowerBI Data Sources 4. Microsoft SQL Server 5. Google BigQuery -Native SQL query parsing is supported for `Snowflake` and `Amazon Redshift` data-sources and only first table from `FROM` clause will be ingested as upstream table. Advance SQL construct like JOIN and SUB-QUERIES in `FROM` clause are not supported. +Native SQL query parsing is supported for `Snowflake` and `Amazon Redshift` data-sources. For example refer below native SQL query. The table `OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_UNIT_TARGET` will be ingested as upstream table. 
diff --git a/metadata-ingestion/docs/transformer/dataset_transformer.md b/metadata-ingestion/docs/transformer/dataset_transformer.md index f0fa44687a109..d1a1555a3ca02 100644 --- a/metadata-ingestion/docs/transformer/dataset_transformer.md +++ b/metadata-ingestion/docs/transformer/dataset_transformer.md @@ -7,7 +7,7 @@ The below table shows transformer which can transform aspects of entity [Dataset | Dataset Aspect | Transformer | |---------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | `status` | - [Mark Dataset status](#mark-dataset-status) | -| `ownership` | - [Simple Add Dataset ownership](#simple-add-dataset-ownership)
- [Pattern Add Dataset ownership](#pattern-add-dataset-ownership)
- [Simple Remove Dataset Ownership](#simple-remove-dataset-ownership) | +| `ownership` | - [Simple Add Dataset ownership](#simple-add-dataset-ownership)
- [Pattern Add Dataset ownership](#pattern-add-dataset-ownership)
- [Simple Remove Dataset Ownership](#simple-remove-dataset-ownership)
- [Extract Ownership from Tags](#extract-ownership-from-tags) | | `globalTags` | - [Simple Add Dataset globalTags ](#simple-add-dataset-globaltags)
- [Pattern Add Dataset globalTags](#pattern-add-dataset-globaltags)
- [Add Dataset globalTags](#add-dataset-globaltags) | | `browsePaths` | - [Set Dataset browsePath](#set-dataset-browsepath) | | `glossaryTerms` | - [Simple Add Dataset glossaryTerms ](#simple-add-dataset-glossaryterms)
- [Pattern Add Dataset glossaryTerms](#pattern-add-dataset-glossaryterms) | @@ -15,6 +15,28 @@ The below table shows transformer which can transform aspects of entity [Dataset | `datasetProperties` | - [Simple Add Dataset datasetProperties](#simple-add-dataset-datasetproperties)
- [Add Dataset datasetProperties](#add-dataset-datasetproperties) | | `domains` | - [Simple Add Dataset domains](#simple-add-dataset-domains)
- [Pattern Add Dataset domains](#pattern-add-dataset-domains) | +## Extract Ownership from Tags +### Config Details +| Field | Required | Type | Default | Description | +|-----------------------------|----------|---------|---------------|---------------------------------------------| +| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | +| `tag_prefix` | | str | | Regex used to match a prefix of each tag. The matched prefix is removed, and the rest of the tag is used as the owner ID when creating the owner URN. | +| `is_user` | | bool | `true` | Whether the owner should be considered a user. If `false`, the owner is considered a group. | +| `email_domain` | | str | | If set, this is appended to the owner ID when creating the owner URN. | +| `owner_type` | | str | `TECHNICAL_OWNER` | Ownership type. | +| `owner_type_urn` | | str | `None` | Set to a custom ownership type's URN if using custom ownership. | + +Matches tags against a prefix and treats the part of each tag after that prefix as the owner when creating ownership.
+ +```yaml +transformers: + - type: "extract_owners_from_tags" + config: + tag_prefix: "dbt:techno-genie:" + is_user: true + email_domain: "coolcompany.com" +``` + ## Mark Dataset Status ### Config Details | Field | Required | Type | Default | Description | diff --git a/metadata-ingestion/examples/data_contract/pet_of_the_week.dhub.dc.yaml b/metadata-ingestion/examples/data_contract/pet_of_the_week.dhub.dc.yaml new file mode 100644 index 0000000000000..c73904403f678 --- /dev/null +++ b/metadata-ingestion/examples/data_contract/pet_of_the_week.dhub.dc.yaml @@ -0,0 +1,21 @@ +# id: pet_details_dc # Optional: This is the unique identifier for the data contract +display_name: Data Contract for SampleHiveDataset +entity: urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD) +freshness: + time: "0700" + granularity: DAILY +schema: + properties: + field_foo: + type: string + native_type: VARCHAR(100) + field_bar: + type: boolean + required: + - field_bar +data_quality: + - type: column_range + config: + column: field_foo + min: 0 + max: 100 diff --git a/metadata-ingestion/scripts/docgen.py b/metadata-ingestion/scripts/docgen.py index b9f558011fc90..1a4db09e961ce 100644 --- a/metadata-ingestion/scripts/docgen.py +++ b/metadata-ingestion/scripts/docgen.py @@ -883,6 +883,150 @@ def generate( if metrics["plugins"].get("failed", 0) > 0: # type: ignore sys.exit(1) + ### Create Lineage doc + + source_dir = "../docs/generated/lineage" + os.makedirs(source_dir, exist_ok=True) + doc_file = f"{source_dir}/lineage-feature-guide.md" + with open(doc_file, "w+") as f: + f.write("import FeatureAvailability from '@site/src/components/FeatureAvailability';\n\n") + f.write(f"# About DataHub Lineage\n\n") + f.write("\n") + + f.write(""" +Lineage is used to capture data dependencies within an organization. It allows you to track the inputs from which a data asset is derived, along with the data assets that depend on it downstream.
+ +## Viewing Lineage + +You can view lineage under the **Lineage** tab or on the **Lineage Visualization** screen. + +

+ +

+ +The UI shows the latest version of the lineage. The time picker can be used to filter out edges within the latest version to exclude those that were last updated outside of the time window. Selecting time windows in the past will not show you historical lineages. It will only filter the view of the latest version of the lineage. + +

+ +

+ + +:::tip The Lineage Tab is greyed out - why can’t I click on it? +This means you have not yet ingested lineage metadata for that entity. Please ingest lineage to proceed. + +::: + +## Adding Lineage + +### Ingestion Source + +If you're using an ingestion source that supports extraction of Lineage (e.g. **Table Lineage Capability**), then lineage information can be extracted automatically. +For detailed instructions, refer to the [source documentation](https://datahubproject.io/integrations) for the source you are using. + +### UI + +As of `v0.9.5`, DataHub supports the manual editing of lineage between entities. Data experts are free to add or remove upstream and downstream lineage edges in both the Lineage Visualization screen as well as the Lineage tab on entity pages. Use this feature to supplement automatic lineage extraction or establish important entity relationships in sources that do not support automatic extraction. Editing lineage by hand is supported for Datasets, Charts, Dashboards, and Data Jobs. +Please refer to our [UI Guides on Lineage](../../features/feature-guides/ui-lineage.md) for more information. + +:::caution Recommendation on UI-based lineage + +Lineage added by hand and programmatically may conflict with one another to cause unwanted overwrites. +It is strongly recommended that lineage be edited manually only in cases where lineage information is not also extracted in an automated fashion, e.g. by running an ingestion source. + +::: + +### API + +If you are not using an ingestion source that supports lineage, you can programmatically emit lineage edges between entities via API. +Please refer to [API Guides on Lineage](../../api/tutorials/lineage.md) for more information. + + +## Lineage Support + +### Automatic Lineage Extraction Support + +This is a summary of automatic lineage extraction support across our data sources. Please refer to the **Important Capabilities** table in the source documentation.
Note that even if the source does not support automatic extraction, you can still add lineage manually using our API & SDKs.\n""") + + f.write("\n| Source | Table-Level Lineage | Column-Level Lineage | Related Configs |\n") + f.write("| ---------- | ------ | ----- |----- |\n") + + for platform_id, platform_docs in sorted( + source_documentation.items(), + key=lambda x: (x[1]["name"].casefold(), x[1]["name"]) + if "name" in x[1] + else (x[0].casefold(), x[0]), + ): + for plugin, plugin_docs in sorted( + platform_docs["plugins"].items(), + key=lambda x: str(x[1].get("doc_order")) + if x[1].get("doc_order") + else x[0], + ): + platform_name = platform_docs['name'] + if len(platform_docs["plugins"].keys()) > 1: + # We only need to show this if there are multiple modules. + platform_name = f"{platform_name} `{plugin}`" + + # Initialize variables + table_level_supported = "❌" + column_level_supported = "❌" + config_names = '' + + if "capabilities" in plugin_docs: + plugin_capabilities = plugin_docs["capabilities"] + + for cap_setting in plugin_capabilities: + capability_text = get_capability_text(cap_setting.capability) + capability_supported = get_capability_supported_badge(cap_setting.supported) + + if capability_text == "Table-Level Lineage" and capability_supported == "✅": + table_level_supported = "✅" + + if capability_text == "Column-level Lineage" and capability_supported == "✅": + column_level_supported = "✅" + + if not (table_level_supported == "❌" and column_level_supported == "❌"): + if "config_schema" in plugin_docs: + config_properties = json.loads(plugin_docs['config_schema']).get('properties', {}) + config_names = '
'.join( + [f'- {property_name}' for property_name in config_properties if 'lineage' in property_name]) + lineage_not_applicable_sources = ['azure-ad', 'csv', 'demo-data', 'dynamodb', 'iceberg', 'json-schema', 'ldap', 'openapi', 'pulsar', 'sqlalchemy' ] + if platform_id not in lineage_not_applicable_sources : + f.write( + f"| [{platform_name}](../../generated/ingestion/sources/{platform_id}.md) | {table_level_supported} | {column_level_supported} | {config_names}|\n" + ) + + f.write(""" + +### Types of Lineage Connections + +Types of lineage connections supported in DataHub and the example codes are as follows. + +| Connection | Examples | A.K.A | +|---------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------| +| Dataset to Dataset | - [lineage_emitter_mcpw_rest.py](../../../metadata-ingestion/examples/library/lineage_emitter_mcpw_rest.py)
- [lineage_emitter_rest.py](../../../metadata-ingestion/examples/library/lineage_emitter_rest.py)
- [lineage_emitter_kafka.py](../../../metadata-ingestion/examples/library/lineage_emitter_kafka.py)
- [lineage_emitter_dataset_finegrained.py](../../../metadata-ingestion/examples/library/lineage_emitter_dataset_finegrained.py)
- [Datahub BigQuery Lineage](https://github.com/datahub-project/datahub/blob/a1bf95307b040074c8d65ebb86b5eb177fdcd591/metadata-ingestion/src/datahub/ingestion/source/sql/bigquery.py#L229)
- [Datahub Snowflake Lineage](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/source/sql/snowflake.py#L249) | +| DataJob to DataFlow | - [lineage_job_dataflow.py](../../../metadata-ingestion/examples/library/lineage_job_dataflow.py) | | +| DataJob to Dataset | - [lineage_dataset_job_dataset.py](../../../metadata-ingestion/examples/library/lineage_dataset_job_dataset.py)
| Pipeline Lineage | +| Chart to Dashboard | - [lineage_chart_dashboard.py](../../../metadata-ingestion/examples/library/lineage_chart_dashboard.py) | | +| Chart to Dataset | - [lineage_dataset_chart.py](../../../metadata-ingestion/examples/library/lineage_dataset_chart.py) | | + + +:::tip Our Roadmap +We're actively working on expanding lineage support for new data sources. +Visit our [Official Roadmap](https://feature-requests.datahubproject.io/roadmap) for upcoming updates! +::: + +## References + +- [DataHub Basics: Lineage 101](https://www.youtube.com/watch?v=rONGpsndzRw&t=1s) +- [DataHub November 2022 Town Hall](https://www.youtube.com/watch?v=BlCLhG8lGoY&t=1s) - Including Manual Lineage Demo +- [Acryl Data introduces lineage support and automated propagation of governance information for Snowflake in DataHub](https://blog.datahubproject.io/acryl-data-introduces-lineage-support-and-automated-propagation-of-governance-information-for-339c99536561) +- [Data in Context: Lineage Explorer in DataHub](https://blog.datahubproject.io/data-in-context-lineage-explorer-in-datahub-a53a9a476dc4) +- [Harnessing the Power of Data Lineage with DataHub](https://blog.datahubproject.io/harnessing-the-power-of-data-lineage-with-datahub-ad086358dec4) +- [DataHub Lineage Impact Analysis](https://datahubproject.io/docs/next/act-on-metadata/impact-analysis) + """) + + print("Lineage Documentation Generation Complete") if __name__ == "__main__": logger.setLevel("INFO") diff --git a/metadata-ingestion/setup.cfg b/metadata-ingestion/setup.cfg index fad55b99ec938..8b78e4d3c9c6f 100644 --- a/metadata-ingestion/setup.cfg +++ b/metadata-ingestion/setup.cfg @@ -75,10 +75,11 @@ disallow_untyped_defs = yes asyncio_mode = auto addopts = --cov=src --cov-report= --cov-config setup.cfg --strict-markers markers = - slow_unit: marks tests to only run slow unit tests (deselect with '-m not slow_unit') - integration: marks tests to only run in integration (deselect with '-m "not integration"') - 
integration_batch_1: mark tests to only run in batch 1 of integration tests. This is done mainly for parallelisation (deselect with '-m not integration_batch_1') - slow_integration: marks tests that are too slow to even run in integration (deselect with '-m "not slow_integration"') + slow: marks tests that are slow to run, including all docker-based tests (deselect with '-m not slow') + integration: marks all integration tests, across all batches (deselect with '-m "not integration"') + integration_batch_0: mark tests to run in batch 0 of integration tests. This is done mainly for parallelisation in CI. Batch 0 is the default batch. + integration_batch_1: mark tests to run in batch 1 of integration tests + integration_batch_2: mark tests to run in batch 2 of integration tests testpaths = tests/unit tests/integration diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 024950e3a6fd5..fe8e3be4632c4 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -1,4 +1,3 @@ -import os import sys from typing import Dict, Set @@ -9,16 +8,9 @@ exec(fp.read(), package_metadata) -def get_long_description(): - root = os.path.dirname(__file__) - with open(os.path.join(root, "README.md")) as f: - description = f.read() - - return description - - base_requirements = { - "typing_extensions>=3.10.0.2", + # Typing extension should be >=3.10.0.2 ideally but we can't restrict due to a Airflow 2.1 dependency conflict. + "typing_extensions>=3.7.4.3", "mypy_extensions>=0.4.3", # Actual dependencies. "typing-inspect", @@ -258,7 +250,7 @@ def get_long_description(): databricks = { # 0.1.11 appears to have authentication issues with azure databricks - "databricks-sdk>=0.1.1, != 0.1.11", + "databricks-sdk>=0.9.0", "pyspark", "requests", } @@ -270,6 +262,7 @@ def get_long_description(): # Sink plugins. 
"datahub-kafka": kafka_common, "datahub-rest": rest_common, + "sync-file-emitter": {"filelock"}, "datahub-lite": { "duckdb", "fastapi", @@ -470,6 +463,7 @@ def get_long_description(): *list( dependency for plugin in [ + "athena", "bigquery", "clickhouse", "clickhouse-usage", @@ -492,6 +486,7 @@ def get_long_description(): "kafka", "datahub-rest", "datahub-lite", + "great-expectations", "presto", "redash", "redshift", @@ -530,6 +525,7 @@ def get_long_description(): "clickhouse", "delta-lake", "druid", + "feast" if sys.version_info >= (3, 8) else None, "hana", "hive", "iceberg" if sys.version_info >= (3, 8) else None, @@ -634,6 +630,7 @@ def get_long_description(): "simple_add_dataset_properties = datahub.ingestion.transformer.add_dataset_properties:SimpleAddDatasetProperties", "pattern_add_dataset_schema_terms = datahub.ingestion.transformer.add_dataset_schema_terms:PatternAddDatasetSchemaTerms", "pattern_add_dataset_schema_tags = datahub.ingestion.transformer.add_dataset_schema_tags:PatternAddDatasetSchemaTags", + "extract_owners_from_tags = datahub.ingestion.transformer.extract_ownership_from_tags:ExtractOwnersFromTagsTransformer", ], "datahub.ingestion.sink.plugins": [ "file = datahub.ingestion.sink.file:FileSink", @@ -666,7 +663,12 @@ def get_long_description(): }, license="Apache License 2.0", description="A CLI to work with DataHub metadata", - long_description=get_long_description(), + long_description="""\ +The `acryl-datahub` package contains a CLI and SDK for interacting with DataHub, +as well as an integration framework for pulling/pushing metadata from external systems. + +See the [DataHub docs](https://datahubproject.io/docs/metadata-ingestion). 
+""", long_description_content_type="text/markdown", classifiers=[ "Development Status :: 5 - Production/Stable", diff --git a/metadata-ingestion/src/datahub/api/entities/corpgroup/corpgroup.py b/metadata-ingestion/src/datahub/api/entities/corpgroup/corpgroup.py index 796786beba21b..a898e35bb810e 100644 --- a/metadata-ingestion/src/datahub/api/entities/corpgroup/corpgroup.py +++ b/metadata-ingestion/src/datahub/api/entities/corpgroup/corpgroup.py @@ -2,7 +2,7 @@ import logging from dataclasses import dataclass -from typing import TYPE_CHECKING, Callable, Iterable, List, Optional, Union +from typing import Callable, Iterable, List, Optional, Union import pydantic from pydantic import BaseModel @@ -11,9 +11,10 @@ from datahub.api.entities.corpuser.corpuser import CorpUser, CorpUserGenerationConfig from datahub.configuration.common import ConfigurationError from datahub.configuration.validate_field_rename import pydantic_renamed_field +from datahub.emitter.generic_emitter import Emitter from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.rest_emitter import DatahubRestEmitter -from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph +from datahub.ingestion.graph.client import DataHubGraph from datahub.metadata.schema_classes import ( CorpGroupEditableInfoClass, CorpGroupInfoClass, @@ -25,9 +26,6 @@ _Aspect, ) -if TYPE_CHECKING: - from datahub.emitter.kafka_emitter import DatahubKafkaEmitter - logger = logging.getLogger(__name__) @@ -194,30 +192,9 @@ def generate_mcp( entityUrn=urn, aspect=StatusClass(removed=False) ) - @staticmethod - def _datahub_graph_from_datahub_rest_emitter( - rest_emitter: DatahubRestEmitter, - ) -> DataHubGraph: - """ - Create a datahub graph instance from a REST Emitter. 
- A stop-gap implementation which is expected to be removed after PATCH support is implemented - for membership updates for users <-> groups - """ - graph = DataHubGraph( - config=DatahubClientConfig( - server=rest_emitter._gms_server, - token=rest_emitter._token, - timeout_sec=rest_emitter._connect_timeout_sec, - retry_status_codes=rest_emitter._retry_status_codes, - extra_headers=rest_emitter._session.headers, - disable_ssl_verification=rest_emitter._session.verify is False, - ) - ) - return graph - def emit( self, - emitter: Union[DatahubRestEmitter, "DatahubKafkaEmitter"], + emitter: Emitter, callback: Optional[Callable[[Exception, str], None]] = None, ) -> None: """ @@ -235,7 +212,7 @@ def emit( # who are passing in a DataHubRestEmitter today # we won't need this in the future once PATCH support is implemented as all emitters # will work - datahub_graph = self._datahub_graph_from_datahub_rest_emitter(emitter) + datahub_graph = emitter.to_graph() for mcp in self.generate_mcp( generation_config=CorpGroupGenerationConfig( override_editable=self.overrideEditable, datahub_graph=datahub_graph diff --git a/metadata-ingestion/src/datahub/api/entities/corpuser/corpuser.py b/metadata-ingestion/src/datahub/api/entities/corpuser/corpuser.py index c67eb02a870a5..9fe1ebedafca7 100644 --- a/metadata-ingestion/src/datahub/api/entities/corpuser/corpuser.py +++ b/metadata-ingestion/src/datahub/api/entities/corpuser/corpuser.py @@ -1,14 +1,14 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING, Callable, Iterable, List, Optional, Union +from typing import Callable, Iterable, List, Optional import pydantic import datahub.emitter.mce_builder as builder from datahub.configuration.common import ConfigModel +from datahub.emitter.generic_emitter import Emitter from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.emitter.rest_emitter import DatahubRestEmitter from datahub.metadata.schema_classes import ( 
CorpUserEditableInfoClass, CorpUserInfoClass, @@ -16,9 +16,6 @@ StatusClass, ) -if TYPE_CHECKING: - from datahub.emitter.kafka_emitter import DatahubKafkaEmitter - @dataclass class CorpUserGenerationConfig: @@ -144,7 +141,7 @@ def generate_mcp( def emit( self, - emitter: Union[DatahubRestEmitter, "DatahubKafkaEmitter"], + emitter: Emitter, callback: Optional[Callable[[Exception, str], None]] = None, ) -> None: """ diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/__init__.py b/metadata-ingestion/src/datahub/api/entities/datacontract/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/data_quality_assertion.py b/metadata-ingestion/src/datahub/api/entities/datacontract/data_quality_assertion.py new file mode 100644 index 0000000000000..a665e95e93c43 --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/data_quality_assertion.py @@ -0,0 +1,107 @@ +from typing import List, Optional, Union + +import pydantic +from typing_extensions import Literal + +import datahub.emitter.mce_builder as builder +from datahub.configuration.common import ConfigModel +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.metadata.schema_classes import ( + AssertionInfoClass, + AssertionStdAggregationClass, + AssertionStdOperatorClass, + AssertionStdParameterClass, + AssertionStdParametersClass, + AssertionStdParameterTypeClass, + AssertionTypeClass, + DatasetAssertionInfoClass, + DatasetAssertionScopeClass, +) + + +class IdConfigMixin(ConfigModel): + id_raw: Optional[str] = pydantic.Field( + default=None, + alias="id", + description="The id of the assertion. 
If not provided, one will be generated using the type.", + ) + + def generate_default_id(self) -> str: + raise NotImplementedError + + +class CustomSQLAssertion(IdConfigMixin, ConfigModel): + type: Literal["custom_sql"] + + sql: str + + def generate_dataset_assertion_info( + self, entity_urn: str + ) -> DatasetAssertionInfoClass: + return DatasetAssertionInfoClass( + dataset=entity_urn, + scope=DatasetAssertionScopeClass.UNKNOWN, + fields=[], + operator=AssertionStdOperatorClass._NATIVE_, + aggregation=AssertionStdAggregationClass._NATIVE_, + logic=self.sql, + ) + + +class ColumnUniqueAssertion(IdConfigMixin, ConfigModel): + type: Literal["unique"] + + # TODO: support multiple columns? + column: str + + def generate_default_id(self) -> str: + return f"{self.type}-{self.column}" + + def generate_dataset_assertion_info( + self, entity_urn: str + ) -> DatasetAssertionInfoClass: + return DatasetAssertionInfoClass( + dataset=entity_urn, + scope=DatasetAssertionScopeClass.DATASET_COLUMN, + fields=[builder.make_schema_field_urn(entity_urn, self.column)], + operator=AssertionStdOperatorClass.EQUAL_TO, + aggregation=AssertionStdAggregationClass.UNIQUE_PROPOTION, # purposely using the misspelled version to work with gql + parameters=AssertionStdParametersClass( + value=AssertionStdParameterClass( + value="1", type=AssertionStdParameterTypeClass.NUMBER + ) + ), + ) + + +class DataQualityAssertion(ConfigModel): + __root__: Union[ + CustomSQLAssertion, + ColumnUniqueAssertion, + ] = pydantic.Field(discriminator="type") + + @property + def id(self) -> str: + if self.__root__.id_raw: + return self.__root__.id_raw + try: + return self.__root__.generate_default_id() + except NotImplementedError: + return self.__root__.type + + def generate_mcp( + self, assertion_urn: str, entity_urn: str + ) -> List[MetadataChangeProposalWrapper]: + dataset_assertion_info = self.__root__.generate_dataset_assertion_info( + entity_urn + ) + + return [ + MetadataChangeProposalWrapper( + 
entityUrn=assertion_urn, + aspect=AssertionInfoClass( + type=AssertionTypeClass.DATASET, + datasetAssertion=dataset_assertion_info, + ), + ) + ] diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/datacontract.py b/metadata-ingestion/src/datahub/api/entities/datacontract/datacontract.py new file mode 100644 index 0000000000000..2df446623a9d6 --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/datacontract.py @@ -0,0 +1,213 @@ +import collections +from typing import Iterable, List, Optional, Tuple + +import pydantic +from ruamel.yaml import YAML +from typing_extensions import Literal + +import datahub.emitter.mce_builder as builder +from datahub.api.entities.datacontract.data_quality_assertion import ( + DataQualityAssertion, +) +from datahub.api.entities.datacontract.freshness_assertion import FreshnessAssertion +from datahub.api.entities.datacontract.schema_assertion import SchemaAssertion +from datahub.configuration.common import ConfigModel +from datahub.emitter.mce_builder import datahub_guid, make_assertion_urn +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.metadata.schema_classes import ( + DataContractPropertiesClass, + DataContractStateClass, + DataContractStatusClass, + DataQualityContractClass, + FreshnessContractClass, + SchemaContractClass, + StatusClass, +) +from datahub.utilities.urns.urn import guess_entity_type + + +class DataContract(ConfigModel): + """A yml representation of a Data Contract. + + This model is used as a simpler, Python-native representation of a DataHub data contract. + It can be easily parsed from a YAML file, and can be easily converted into series of MCPs + that can be emitted to DataHub. + """ + + version: Literal[1] + + id: Optional[str] = pydantic.Field( + default=None, + alias="urn", + description="The data contract urn. 
If not provided, one will be generated.", + ) + entity: str = pydantic.Field( + description="The entity urn that the Data Contract is associated with" + ) + # TODO: add support for properties + # properties: Optional[Dict[str, str]] = None + + schema_field: Optional[SchemaAssertion] = pydantic.Field( + default=None, alias="schema" + ) + + freshness: Optional[FreshnessAssertion] = pydantic.Field(default=None) + + # TODO: Add a validator to ensure that ids are unique + data_quality: Optional[List[DataQualityAssertion]] = None + + _original_yaml_dict: Optional[dict] = None + + @pydantic.validator("data_quality") + def validate_data_quality( + cls, data_quality: Optional[List[DataQualityAssertion]] + ) -> Optional[List[DataQualityAssertion]]: + if data_quality: + # Raise an error if there are duplicate ids. + id_counts = collections.Counter(dq_check.id for dq_check in data_quality) + duplicates = [id for id, count in id_counts.items() if count > 1] + + if duplicates: + raise ValueError( + f"Got multiple data quality tests with the same type or ID: {duplicates}. Set a unique ID for each data quality test." 
+ ) + + return data_quality + + @property + def urn(self) -> str: + if self.id: + assert guess_entity_type(self.id) == "dataContract" + return self.id + + # Data contract urns are stable + guid_obj = {"entity": self.entity} + urn = f"urn:li:dataContract:{datahub_guid(guid_obj)}" + return urn + + def _generate_freshness_assertion( + self, freshness: FreshnessAssertion + ) -> Tuple[str, List[MetadataChangeProposalWrapper]]: + guid_dict = { + "contract": self.urn, + "entity": self.entity, + "freshness": freshness.id, + } + assertion_urn = builder.make_assertion_urn(builder.datahub_guid(guid_dict)) + + return ( + assertion_urn, + freshness.generate_mcp(assertion_urn, self.entity), + ) + + def _generate_schema_assertion( + self, schema_metadata: SchemaAssertion + ) -> Tuple[str, List[MetadataChangeProposalWrapper]]: + # ingredients for guid -> the contract id, the fact that this is a schema assertion and the entity on which the assertion is made + guid_dict = { + "contract": self.urn, + "entity": self.entity, + "schema": schema_metadata.id, + } + assertion_urn = make_assertion_urn(datahub_guid(guid_dict)) + + return ( + assertion_urn, + schema_metadata.generate_mcp(assertion_urn, self.entity), + ) + + def _generate_data_quality_assertion( + self, data_quality: DataQualityAssertion + ) -> Tuple[str, List[MetadataChangeProposalWrapper]]: + guid_dict = { + "contract": self.urn, + "entity": self.entity, + "data_quality": data_quality.id, + } + assertion_urn = make_assertion_urn(datahub_guid(guid_dict)) + + return ( + assertion_urn, + data_quality.generate_mcp(assertion_urn, self.entity), + ) + + def _generate_dq_assertions( + self, data_quality_spec: List[DataQualityAssertion] + ) -> Tuple[List[str], List[MetadataChangeProposalWrapper]]: + assertion_urns = [] + assertion_mcps = [] + + for dq_check in data_quality_spec: + assertion_urn, assertion_mcp = self._generate_data_quality_assertion( + dq_check + ) + + assertion_urns.append(assertion_urn) + 
assertion_mcps.extend(assertion_mcp) + + return (assertion_urns, assertion_mcps) + + def generate_mcp( + self, + ) -> Iterable[MetadataChangeProposalWrapper]: + schema_assertion_urn = None + if self.schema_field is not None: + ( + schema_assertion_urn, + schema_assertion_mcps, + ) = self._generate_schema_assertion(self.schema_field) + yield from schema_assertion_mcps + + freshness_assertion_urn = None + if self.freshness: + ( + freshness_assertion_urn, + sla_assertion_mcps, + ) = self._generate_freshness_assertion(self.freshness) + yield from sla_assertion_mcps + + dq_assertions, dq_assertion_mcps = self._generate_dq_assertions( + self.data_quality or [] + ) + yield from dq_assertion_mcps + + # Now that we've generated the assertions, we can generate + # the actual data contract. + yield from MetadataChangeProposalWrapper.construct_many( + entityUrn=self.urn, + aspects=[ + DataContractPropertiesClass( + entity=self.entity, + schema=[SchemaContractClass(assertion=schema_assertion_urn)] + if schema_assertion_urn + else None, + freshness=[ + FreshnessContractClass(assertion=freshness_assertion_urn) + ] + if freshness_assertion_urn + else None, + dataQuality=[ + DataQualityContractClass(assertion=dq_assertion_urn) + for dq_assertion_urn in dq_assertions + ], + ), + # Also emit status. + StatusClass(removed=False), + # Emit the contract state as PENDING. 
+ DataContractStatusClass(state=DataContractStateClass.PENDING) + if True + else None, + ], + ) + + @classmethod + def from_yaml( + cls, + file: str, + ) -> "DataContract": + with open(file) as fp: + yaml = YAML(typ="rt") # default, if not specfied, is 'rt' (round-trip) + orig_dictionary = yaml.load(fp) + parsed_data_contract = DataContract.parse_obj(orig_dictionary) + parsed_data_contract._original_yaml_dict = orig_dictionary + return parsed_data_contract diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/freshness_assertion.py b/metadata-ingestion/src/datahub/api/entities/datacontract/freshness_assertion.py new file mode 100644 index 0000000000000..ee8fa1181e614 --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/freshness_assertion.py @@ -0,0 +1,86 @@ +from __future__ import annotations + +from datetime import timedelta +from typing import List, Union + +import pydantic +from typing_extensions import Literal + +from datahub.configuration.common import ConfigModel +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.metadata.schema_classes import ( + AssertionInfoClass, + AssertionTypeClass, + CalendarIntervalClass, + FixedIntervalScheduleClass, + FreshnessAssertionInfoClass, + FreshnessAssertionScheduleClass, + FreshnessAssertionScheduleTypeClass, + FreshnessAssertionTypeClass, + FreshnessCronScheduleClass, +) + + +class CronFreshnessAssertion(ConfigModel): + type: Literal["cron"] + + cron: str = pydantic.Field( + description="The cron expression to use. See https://crontab.guru/ for help." + ) + timezone: str = pydantic.Field( + "UTC", + description="The timezone to use for the cron schedule. 
Defaults to UTC.", + ) + + +class FixedIntervalFreshnessAssertion(ConfigModel): + type: Literal["interval"] + + interval: timedelta + + +class FreshnessAssertion(ConfigModel): + __root__: Union[ + CronFreshnessAssertion, FixedIntervalFreshnessAssertion + ] = pydantic.Field(discriminator="type") + + @property + def id(self): + return self.__root__.type + + def generate_mcp( + self, assertion_urn: str, entity_urn: str + ) -> List[MetadataChangeProposalWrapper]: + freshness = self.__root__ + + if isinstance(freshness, CronFreshnessAssertion): + schedule = FreshnessAssertionScheduleClass( + type=FreshnessAssertionScheduleTypeClass.CRON, + cron=FreshnessCronScheduleClass( + cron=freshness.cron, + timezone=freshness.timezone, + ), + ) + elif isinstance(freshness, FixedIntervalFreshnessAssertion): + schedule = FreshnessAssertionScheduleClass( + type=FreshnessAssertionScheduleTypeClass.FIXED_INTERVAL, + fixedInterval=FixedIntervalScheduleClass( + unit=CalendarIntervalClass.SECOND, + multiple=int(freshness.interval.total_seconds()), + ), + ) + else: + raise ValueError(f"Unknown freshness type {freshness}") + + assertionInfo = AssertionInfoClass( + type=AssertionTypeClass.FRESHNESS, + freshnessAssertion=FreshnessAssertionInfoClass( + entity=entity_urn, + type=FreshnessAssertionTypeClass.DATASET_CHANGE, + schedule=schedule, + ), + ) + + return [ + MetadataChangeProposalWrapper(entityUrn=assertion_urn, aspect=assertionInfo) + ] diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/schema_assertion.py b/metadata-ingestion/src/datahub/api/entities/datacontract/schema_assertion.py new file mode 100644 index 0000000000000..b5b592e01f58f --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/schema_assertion.py @@ -0,0 +1,81 @@ +from __future__ import annotations + +import json +from typing import List, Union + +import pydantic +from typing_extensions import Literal + +from datahub.configuration.common import ConfigModel +from 
datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.extractor.json_schema_util import get_schema_metadata +from datahub.metadata.schema_classes import ( + AssertionInfoClass, + AssertionTypeClass, + SchemaAssertionInfoClass, + SchemaFieldClass, + SchemalessClass, + SchemaMetadataClass, +) + + +class JsonSchemaContract(ConfigModel): + type: Literal["json-schema"] + + json_schema: dict = pydantic.Field(alias="json-schema") + + _schema_metadata: SchemaMetadataClass + + def _init_private_attributes(self) -> None: + super()._init_private_attributes() + self._schema_metadata = get_schema_metadata( + platform="urn:li:dataPlatform:datahub", + name="", + json_schema=self.json_schema, + raw_schema_string=json.dumps(self.json_schema), + ) + + +class FieldListSchemaContract(ConfigModel, arbitrary_types_allowed=True): + type: Literal["field-list"] + + fields: List[SchemaFieldClass] + + _schema_metadata: SchemaMetadataClass + + def _init_private_attributes(self) -> None: + super()._init_private_attributes() + self._schema_metadata = SchemaMetadataClass( + schemaName="", + platform="urn:li:dataPlatform:datahub", + version=0, + hash="", + platformSchema=SchemalessClass(), + fields=self.fields, + ) + + +class SchemaAssertion(ConfigModel): + __root__: Union[JsonSchemaContract, FieldListSchemaContract] = pydantic.Field( + discriminator="type" + ) + + @property + def id(self): + return self.__root__.type + + def generate_mcp( + self, assertion_urn: str, entity_urn: str + ) -> List[MetadataChangeProposalWrapper]: + schema_metadata = self.__root__._schema_metadata + + assertionInfo = AssertionInfoClass( + type=AssertionTypeClass.DATA_SCHEMA, + schemaAssertion=SchemaAssertionInfoClass( + entity=entity_urn, schema=schema_metadata + ), + ) + + return [ + MetadataChangeProposalWrapper(entityUrn=assertion_urn, aspect=assertionInfo) + ] diff --git a/metadata-ingestion/src/datahub/api/entities/datajob/dataflow.py 
b/metadata-ingestion/src/datahub/api/entities/datajob/dataflow.py index 8a04768bc0a72..acd708ee81a5c 100644 --- a/metadata-ingestion/src/datahub/api/entities/datajob/dataflow.py +++ b/metadata-ingestion/src/datahub/api/entities/datajob/dataflow.py @@ -1,18 +1,9 @@ import logging from dataclasses import dataclass, field -from typing import ( - TYPE_CHECKING, - Callable, - Dict, - Iterable, - List, - Optional, - Set, - Union, - cast, -) +from typing import Callable, Dict, Iterable, List, Optional, Set, cast import datahub.emitter.mce_builder as builder +from datahub.emitter.generic_emitter import Emitter from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.metadata.schema_classes import ( AuditStampClass, @@ -29,10 +20,6 @@ ) from datahub.utilities.urns.data_flow_urn import DataFlowUrn -if TYPE_CHECKING: - from datahub.emitter.kafka_emitter import DatahubKafkaEmitter - from datahub.emitter.rest_emitter import DatahubRestEmitter - logger = logging.getLogger(__name__) @@ -170,7 +157,7 @@ def generate_mcp(self) -> Iterable[MetadataChangeProposalWrapper]: def emit( self, - emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], + emitter: Emitter, callback: Optional[Callable[[Exception, str], None]] = None, ) -> None: """ diff --git a/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py b/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py index 7eb6fc8c8d1a9..0face6415bacc 100644 --- a/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py +++ b/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py @@ -1,16 +1,16 @@ from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Callable, Dict, Iterable, List, Optional, Set, Union +from typing import Callable, Dict, Iterable, List, Optional, Set import datahub.emitter.mce_builder as builder +from datahub.emitter.generic_emitter import Emitter from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.metadata.schema_classes 
import ( AuditStampClass, AzkabanJobTypeClass, DataJobInfoClass, DataJobInputOutputClass, - DataJobSnapshotClass, + FineGrainedLineageClass, GlobalTagsClass, - MetadataChangeEventClass, OwnerClass, OwnershipClass, OwnershipSourceClass, @@ -23,10 +23,6 @@ from datahub.utilities.urns.data_job_urn import DataJobUrn from datahub.utilities.urns.dataset_urn import DatasetUrn -if TYPE_CHECKING: - from datahub.emitter.kafka_emitter import DatahubKafkaEmitter - from datahub.emitter.rest_emitter import DatahubRestEmitter - @dataclass class DataJob: @@ -59,6 +55,7 @@ class DataJob: group_owners: Set[str] = field(default_factory=set) inlets: List[DatasetUrn] = field(default_factory=list) outlets: List[DatasetUrn] = field(default_factory=list) + fine_grained_lineages: List[FineGrainedLineageClass] = field(default_factory=list) upstream_urns: List[DataJobUrn] = field(default_factory=list) def __post_init__(self): @@ -103,31 +100,6 @@ def generate_tags_aspect(self) -> Iterable[GlobalTagsClass]: ) return [tags] - def generate_mce(self) -> MetadataChangeEventClass: - job_mce = MetadataChangeEventClass( - proposedSnapshot=DataJobSnapshotClass( - urn=str(self.urn), - aspects=[ - DataJobInfoClass( - name=self.name if self.name is not None else self.id, - type=AzkabanJobTypeClass.COMMAND, - description=self.description, - customProperties=self.properties, - externalUrl=self.url, - ), - DataJobInputOutputClass( - inputDatasets=[str(urn) for urn in self.inlets], - outputDatasets=[str(urn) for urn in self.outlets], - inputDatajobs=[str(urn) for urn in self.upstream_urns], - ), - *self.generate_ownership_aspect(), - *self.generate_tags_aspect(), - ], - ) - ) - - return job_mce - def generate_mcp(self) -> Iterable[MetadataChangeProposalWrapper]: mcp = MetadataChangeProposalWrapper( entityUrn=str(self.urn), @@ -159,7 +131,7 @@ def generate_mcp(self) -> Iterable[MetadataChangeProposalWrapper]: def emit( self, - emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], + emitter: Emitter, 
callback: Optional[Callable[[Exception, str], None]] = None, ) -> None: """ @@ -179,6 +151,7 @@ def generate_data_input_output_mcp(self) -> Iterable[MetadataChangeProposalWrapp inputDatasets=[str(urn) for urn in self.inlets], outputDatasets=[str(urn) for urn in self.outlets], inputDatajobs=[str(urn) for urn in self.upstream_urns], + fineGrainedLineages=self.fine_grained_lineages, ), ) yield mcp diff --git a/metadata-ingestion/src/datahub/api/entities/dataprocess/dataprocess_instance.py b/metadata-ingestion/src/datahub/api/entities/dataprocess/dataprocess_instance.py index 9ec389c3a0989..cf6080c7072e6 100644 --- a/metadata-ingestion/src/datahub/api/entities/dataprocess/dataprocess_instance.py +++ b/metadata-ingestion/src/datahub/api/entities/dataprocess/dataprocess_instance.py @@ -1,9 +1,10 @@ import time from dataclasses import dataclass, field from enum import Enum -from typing import TYPE_CHECKING, Callable, Dict, Iterable, List, Optional, Union, cast +from typing import Callable, Dict, Iterable, List, Optional, Union, cast from datahub.api.entities.datajob import DataFlow, DataJob +from datahub.emitter.generic_emitter import Emitter from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import DatahubKey from datahub.metadata.com.linkedin.pegasus2avro.dataprocess import ( @@ -26,10 +27,6 @@ from datahub.utilities.urns.data_process_instance_urn import DataProcessInstanceUrn from datahub.utilities.urns.dataset_urn import DatasetUrn -if TYPE_CHECKING: - from datahub.emitter.kafka_emitter import DatahubKafkaEmitter - from datahub.emitter.rest_emitter import DatahubRestEmitter - class DataProcessInstanceKey(DatahubKey): cluster: str @@ -106,7 +103,7 @@ def start_event_mcp( def emit_process_start( self, - emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], + emitter: Emitter, start_timestamp_millis: int, attempt: Optional[int] = None, emit_template: bool = True, @@ -197,7 +194,7 @@ def end_event_mcp( def 
emit_process_end( self, - emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], + emitter: Emitter, end_timestamp_millis: int, result: InstanceRunResult, result_type: Optional[str] = None, @@ -207,7 +204,7 @@ def emit_process_end( """ Generate an DataProcessInstance finish event and emits is - :param emitter: (Union[DatahubRestEmitter, DatahubKafkaEmitter]) the datahub emitter to emit generated mcps + :param emitter: (Emitter) the datahub emitter to emit generated mcps :param end_timestamp_millis: (int) the end time of the execution in milliseconds :param result: (InstanceRunResult) The result of the run :param result_type: (string) It identifies the system where the native result comes from like Airflow, Azkaban @@ -261,24 +258,24 @@ def generate_mcp( @staticmethod def _emit_mcp( mcp: MetadataChangeProposalWrapper, - emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], + emitter: Emitter, callback: Optional[Callable[[Exception, str], None]] = None, ) -> None: """ - :param emitter: (Union[DatahubRestEmitter, DatahubKafkaEmitter]) the datahub emitter to emit generated mcps + :param emitter: (Emitter) the datahub emitter to emit generated mcps :param callback: (Optional[Callable[[Exception, str], None]]) the callback method for KafkaEmitter if it is used """ emitter.emit(mcp, callback) def emit( self, - emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], + emitter: Emitter, callback: Optional[Callable[[Exception, str], None]] = None, ) -> None: """ - :param emitter: (Union[DatahubRestEmitter, DatahubKafkaEmitter]) the datahub emitter to emit generated mcps + :param emitter: (Emitter) the datahub emitter to emit generated mcps :param callback: (Optional[Callable[[Exception, str], None]]) the callback method for KafkaEmitter if it is used """ for mcp in self.generate_mcp(): diff --git a/metadata-ingestion/src/datahub/api/entities/dataproduct/dataproduct.py b/metadata-ingestion/src/datahub/api/entities/dataproduct/dataproduct.py index 
04f12b4f61d1e..2d9b14ceb2d06 100644 --- a/metadata-ingestion/src/datahub/api/entities/dataproduct/dataproduct.py +++ b/metadata-ingestion/src/datahub/api/entities/dataproduct/dataproduct.py @@ -2,25 +2,15 @@ import time from pathlib import Path -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Dict, - Iterable, - List, - Optional, - Tuple, - Union, -) +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union import pydantic from ruamel.yaml import YAML import datahub.emitter.mce_builder as builder from datahub.configuration.common import ConfigModel +from datahub.emitter.generic_emitter import Emitter from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.emitter.rest_emitter import DatahubRestEmitter from datahub.ingestion.graph.client import DataHubGraph from datahub.metadata.schema_classes import ( AuditStampClass, @@ -43,9 +33,6 @@ from datahub.utilities.registries.domain_registry import DomainRegistry from datahub.utilities.urns.urn import Urn -if TYPE_CHECKING: - from datahub.emitter.kafka_emitter import DatahubKafkaEmitter - def patch_list( orig_list: Optional[list], @@ -225,7 +212,6 @@ def _generate_properties_mcp( def generate_mcp( self, upsert: bool ) -> Iterable[Union[MetadataChangeProposalWrapper, MetadataChangeProposalClass]]: - if self._resolved_domain_urn is None: raise Exception( f"Unable to generate MCP-s because we were unable to resolve the domain {self.domain} to an urn." 
@@ -282,7 +268,7 @@ def generate_mcp( def emit( self, - emitter: Union[DatahubRestEmitter, "DatahubKafkaEmitter"], + emitter: Emitter, upsert: bool, callback: Optional[Callable[[Exception, str], None]] = None, ) -> None: @@ -440,7 +426,6 @@ def patch_yaml( original_dataproduct: DataProduct, output_file: Path, ) -> bool: - update_needed = False if not original_dataproduct._original_yaml_dict: raise Exception("Original Data Product was not loaded from yaml") @@ -523,7 +508,6 @@ def to_yaml( self, file: Path, ) -> None: - with open(file, "w") as fp: yaml = YAML(typ="rt") # default, if not specfied, is 'rt' (round-trip) yaml.indent(mapping=2, sequence=4, offset=2) diff --git a/metadata-ingestion/src/datahub/cli/docker_cli.py b/metadata-ingestion/src/datahub/cli/docker_cli.py index 9fde47c82873c..4afccfe711e34 100644 --- a/metadata-ingestion/src/datahub/cli/docker_cli.py +++ b/metadata-ingestion/src/datahub/cli/docker_cli.py @@ -426,7 +426,7 @@ def detect_quickstart_arch(arch: Optional[str]) -> Architectures: return quickstart_arch -@docker.command() +@docker.command() # noqa: C901 @click.option( "--version", type=str, @@ -588,7 +588,7 @@ def detect_quickstart_arch(arch: Optional[str]) -> Architectures: "arch", ] ) -def quickstart( +def quickstart( # noqa: C901 version: Optional[str], build_locally: bool, pull_images: bool, @@ -755,14 +755,21 @@ def quickstart( up_attempts += 1 logger.debug(f"Executing docker compose up command, attempt #{up_attempts}") + up_process = subprocess.Popen( + base_command + ["up", "-d", "--remove-orphans"], + env=_docker_subprocess_env(), + ) try: - subprocess.run( - base_command + ["up", "-d", "--remove-orphans"], - env=_docker_subprocess_env(), - timeout=_QUICKSTART_UP_TIMEOUT.total_seconds(), - ) + up_process.wait(timeout=_QUICKSTART_UP_TIMEOUT.total_seconds()) except subprocess.TimeoutExpired: - logger.debug("docker compose up timed out, will retry") + logger.debug("docker compose up timed out, sending SIGTERM") + up_process.terminate() 
+ try: + up_process.wait(timeout=3) + except subprocess.TimeoutExpired: + logger.debug("docker compose up still running, sending SIGKILL") + up_process.kill() + up_process.wait() # Check docker health every few seconds. status = check_docker_quickstart() diff --git a/metadata-ingestion/src/datahub/cli/specific/datacontract_cli.py b/metadata-ingestion/src/datahub/cli/specific/datacontract_cli.py new file mode 100644 index 0000000000000..3745943c8c96a --- /dev/null +++ b/metadata-ingestion/src/datahub/cli/specific/datacontract_cli.py @@ -0,0 +1,80 @@ +import logging +from typing import Optional + +import click +from click_default_group import DefaultGroup + +from datahub.api.entities.datacontract.datacontract import DataContract +from datahub.ingestion.graph.client import get_default_graph +from datahub.telemetry import telemetry +from datahub.upgrade import upgrade + +logger = logging.getLogger(__name__) + + +@click.group(cls=DefaultGroup, default="upsert") +def datacontract() -> None: + """A group of commands to interact with the DataContract entity in DataHub.""" + pass + + +@datacontract.command() +@click.option("-f", "--file", required=True, type=click.Path(exists=True)) +@upgrade.check_upgrade +@telemetry.with_telemetry() +def upsert(file: str) -> None: + """Upsert (create or update) a Data Contract in DataHub.""" + + data_contract: DataContract = DataContract.from_yaml(file) + urn = data_contract.urn + + with get_default_graph() as graph: + if not graph.exists(data_contract.entity): + raise ValueError( + f"Cannot define a data contract for non-existent entity {data_contract.entity}" + ) + + try: + for mcp in data_contract.generate_mcp(): + graph.emit(mcp) + click.secho(f"Update succeeded for urn {urn}.", fg="green") + except Exception as e: + logger.exception(e) + click.secho( + f"Update failed for {urn}: {e}", + fg="red", + ) + + +@datacontract.command() +@click.option( + "--urn", required=False, type=str, help="The urn for the data contract to delete" +) 
+@click.option( + "-f", + "--file", + required=False, + type=click.Path(exists=True), + help="The file containing the data contract definition", +) +@click.option("--hard/--soft", required=False, is_flag=True, default=False) +@upgrade.check_upgrade +@telemetry.with_telemetry() +def delete(urn: Optional[str], file: Optional[str], hard: bool) -> None: + """Delete a Data Contract in DataHub. Defaults to a soft-delete. Use --hard to completely erase metadata.""" + + if not urn: + if not file: + raise click.UsageError( + "Must provide either an urn or a file to delete a data contract" + ) + + data_contract = DataContract.from_yaml(file) + urn = data_contract.urn + + with get_default_graph() as graph: + if not graph.exists(urn): + raise ValueError(f"Data Contract {urn} does not exist") + + graph.delete_entity(urn, hard=hard) + click.secho(f"Data Contract {urn} deleted") diff --git a/metadata-ingestion/src/datahub/cli/specific/file_loader.py b/metadata-ingestion/src/datahub/cli/specific/file_loader.py index 54f12e024d294..a9787343fdb91 100644 --- a/metadata-ingestion/src/datahub/cli/specific/file_loader.py +++ b/metadata-ingestion/src/datahub/cli/specific/file_loader.py @@ -1,9 +1,7 @@ -import io from pathlib import Path from typing import Union -from datahub.configuration.common import ConfigurationError -from datahub.configuration.yaml import YamlConfigurationMechanism +from datahub.configuration.config_loader import load_config_file def load_file(config_file: Path) -> Union[dict, list]: @@ -17,19 +15,11 @@ def load_file(config_file: Path) -> Union[dict, list]: evolve to becoming a standard function that all the specific. 
cli variants will use to load up the models from external files """ - if not isinstance(config_file, Path): - config_file = Path(config_file) - if not config_file.is_file(): - raise ConfigurationError(f"Cannot open config file {config_file}") - if config_file.suffix in {".yaml", ".yml"}: - config_mech: YamlConfigurationMechanism = YamlConfigurationMechanism() - else: - raise ConfigurationError( - f"Only .yaml and .yml are supported. Cannot process file type {config_file.suffix}" - ) - - raw_config_file = config_file.read_text() - config_fp = io.StringIO(raw_config_file) - raw_config = config_mech.load_config(config_fp) - return raw_config + res = load_config_file( + config_file, + squirrel_original_config=False, + resolve_env_vars=False, + allow_stdin=False, + ) + return res diff --git a/metadata-ingestion/src/datahub/configuration/source_common.py b/metadata-ingestion/src/datahub/configuration/source_common.py index 37b93f3e598e1..a9f891ddb7b1e 100644 --- a/metadata-ingestion/src/datahub/configuration/source_common.py +++ b/metadata-ingestion/src/datahub/configuration/source_common.py @@ -4,7 +4,7 @@ from pydantic.fields import Field from datahub.configuration.common import ConfigModel, ConfigurationError -from datahub.configuration.pydantic_field_deprecation import pydantic_field_deprecated +from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated from datahub.metadata.schema_classes import FabricTypeClass DEFAULT_ENV = FabricTypeClass.PROD diff --git a/metadata-ingestion/src/datahub/configuration/pydantic_field_deprecation.py b/metadata-ingestion/src/datahub/configuration/validate_field_deprecation.py similarity index 74% rename from metadata-ingestion/src/datahub/configuration/pydantic_field_deprecation.py rename to metadata-ingestion/src/datahub/configuration/validate_field_deprecation.py index ed82acb594ed7..6134c4dab4817 100644 --- a/metadata-ingestion/src/datahub/configuration/pydantic_field_deprecation.py +++ 
b/metadata-ingestion/src/datahub/configuration/validate_field_deprecation.py @@ -1,20 +1,28 @@ import warnings -from typing import Optional, Type +from typing import Any, Optional, Type import pydantic from datahub.configuration.common import ConfigurationWarning from datahub.utilities.global_warning_util import add_global_warning +_unset = object() -def pydantic_field_deprecated(field: str, message: Optional[str] = None) -> classmethod: + +def pydantic_field_deprecated( + field: str, + warn_if_value_is_not: Any = _unset, + message: Optional[str] = None, +) -> classmethod: if message: output = message else: output = f"{field} is deprecated and will be removed in a future release. Please remove it from your config." def _validate_deprecated(cls: Type, values: dict) -> dict: - if field in values: + if field in values and ( + warn_if_value_is_not is _unset or values[field] != warn_if_value_is_not + ): add_global_warning(output) warnings.warn(output, ConfigurationWarning, stacklevel=2) return values diff --git a/metadata-ingestion/src/datahub/emitter/generic_emitter.py b/metadata-ingestion/src/datahub/emitter/generic_emitter.py new file mode 100644 index 0000000000000..28138c6182758 --- /dev/null +++ b/metadata-ingestion/src/datahub/emitter/generic_emitter.py @@ -0,0 +1,31 @@ +from typing import Any, Callable, Optional, Union + +from typing_extensions import Protocol + +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.metadata.com.linkedin.pegasus2avro.mxe import ( + MetadataChangeEvent, + MetadataChangeProposal, +) + + +class Emitter(Protocol): + def emit( + self, + item: Union[ + MetadataChangeEvent, + MetadataChangeProposal, + MetadataChangeProposalWrapper, + ], + # NOTE: This signature should have the exception be optional rather than + # required. However, this would be a breaking change that may need + # more careful consideration. 
+ callback: Optional[Callable[[Exception, str], None]] = None, + # TODO: The rest emitter returns timestamps as the return type. For now + # we smooth over that detail using Any, but eventually we should + # standardize on a return type. + ) -> Any: + raise NotImplementedError + + def flush(self) -> None: + pass diff --git a/metadata-ingestion/src/datahub/emitter/kafka_emitter.py b/metadata-ingestion/src/datahub/emitter/kafka_emitter.py index ec0c8f3418a4a..781930011b78f 100644 --- a/metadata-ingestion/src/datahub/emitter/kafka_emitter.py +++ b/metadata-ingestion/src/datahub/emitter/kafka_emitter.py @@ -10,6 +10,7 @@ from datahub.configuration.common import ConfigModel from datahub.configuration.kafka import KafkaProducerConnectionConfig from datahub.configuration.validate_field_rename import pydantic_renamed_field +from datahub.emitter.generic_emitter import Emitter from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.closeable import Closeable from datahub.metadata.schema_classes import ( @@ -55,7 +56,7 @@ def validate_topic_routes(cls, v: Dict[str, str]) -> Dict[str, str]: return v -class DatahubKafkaEmitter(Closeable): +class DatahubKafkaEmitter(Closeable, Emitter): def __init__(self, config: KafkaEmitterConfig): self.config = config schema_registry_conf = { diff --git a/metadata-ingestion/src/datahub/emitter/mce_builder.py b/metadata-ingestion/src/datahub/emitter/mce_builder.py index 0928818c7005c..64c9ec1bb5704 100644 --- a/metadata-ingestion/src/datahub/emitter/mce_builder.py +++ b/metadata-ingestion/src/datahub/emitter/mce_builder.py @@ -1,11 +1,11 @@ """Convenience functions for creating MCEs""" +import hashlib import json import logging import os import re import time from enum import Enum -from hashlib import md5 from typing import ( TYPE_CHECKING, Any, @@ -21,7 +21,6 @@ import typing_inspect from datahub.configuration.source_common import DEFAULT_ENV as DEFAULT_ENV_CONFIGURATION -from 
datahub.emitter.serialization_helper import pre_json_transform from datahub.metadata.schema_classes import ( AssertionKeyClass, AuditStampClass, @@ -159,11 +158,24 @@ def container_urn_to_key(guid: str) -> Optional[ContainerKeyClass]: return None +class _DatahubKeyJSONEncoder(json.JSONEncoder): + # overload method default + def default(self, obj: Any) -> Any: + if hasattr(obj, "guid"): + return obj.guid() + # Call the default method for other types + return json.JSONEncoder.default(self, obj) + + def datahub_guid(obj: dict) -> str: - obj_str = json.dumps( - pre_json_transform(obj), separators=(",", ":"), sort_keys=True - ).encode("utf-8") - return md5(obj_str).hexdigest() + json_key = json.dumps( + obj, + separators=(",", ":"), + sort_keys=True, + cls=_DatahubKeyJSONEncoder, + ) + md5_hash = hashlib.md5(json_key.encode("utf-8")) + return str(md5_hash.hexdigest()) def make_assertion_urn(assertion_id: str) -> str: diff --git a/metadata-ingestion/src/datahub/emitter/mcp_builder.py b/metadata-ingestion/src/datahub/emitter/mcp_builder.py index 844a29f1c78a3..65e0c0d6ba60d 100644 --- a/metadata-ingestion/src/datahub/emitter/mcp_builder.py +++ b/metadata-ingestion/src/datahub/emitter/mcp_builder.py @@ -1,14 +1,14 @@ -import hashlib -import json -from typing import Any, Dict, Iterable, List, Optional, TypeVar +from typing import Dict, Iterable, List, Optional, TypeVar from pydantic.fields import Field from pydantic.main import BaseModel from datahub.emitter.mce_builder import ( + datahub_guid, make_container_urn, make_data_platform_urn, make_dataplatform_instance_urn, + make_dataset_urn_with_platform_instance, ) from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.workunit import MetadataWorkUnit @@ -32,24 +32,13 @@ ) -def _stable_guid_from_dict(d: dict) -> str: - json_key = json.dumps( - d, - separators=(",", ":"), - sort_keys=True, - cls=DatahubKeyJSONEncoder, - ) - md5_hash = hashlib.md5(json_key.encode("utf-8")) - return 
str(md5_hash.hexdigest()) - - class DatahubKey(BaseModel): def guid_dict(self) -> Dict[str, str]: return self.dict(by_alias=True, exclude_none=True) def guid(self) -> str: bag = self.guid_dict() - return _stable_guid_from_dict(bag) + return datahub_guid(bag) class ContainerKey(DatahubKey): @@ -105,7 +94,15 @@ class MetastoreKey(ContainerKey): metastore: str -class CatalogKey(MetastoreKey): +class CatalogKeyWithMetastore(MetastoreKey): + catalog: str + + +class UnitySchemaKeyWithMetastore(CatalogKeyWithMetastore): + unity_schema: str + + +class CatalogKey(ContainerKey): catalog: str @@ -125,13 +122,15 @@ class BucketKey(ContainerKey): bucket_name: str -class DatahubKeyJSONEncoder(json.JSONEncoder): - # overload method default - def default(self, obj: Any) -> Any: - if hasattr(obj, "guid"): - return obj.guid() - # Call the default method for other types - return json.JSONEncoder.default(self, obj) +class NotebookKey(DatahubKey): + notebook_id: int + platform: str + instance: Optional[str] + + def as_urn(self) -> str: + return make_dataset_urn_with_platform_instance( + platform=self.platform, platform_instance=self.instance, name=self.guid() + ) KeyType = TypeVar("KeyType", bound=ContainerKey) diff --git a/metadata-ingestion/src/datahub/emitter/rest_emitter.py b/metadata-ingestion/src/datahub/emitter/rest_emitter.py index 937e0902d6d8c..afb19df9791af 100644 --- a/metadata-ingestion/src/datahub/emitter/rest_emitter.py +++ b/metadata-ingestion/src/datahub/emitter/rest_emitter.py @@ -4,7 +4,7 @@ import logging import os from json.decoder import JSONDecodeError -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union import requests from deprecated import deprecated @@ -13,6 +13,7 @@ from datahub.cli.cli_utils import get_system_auth from datahub.configuration.common import ConfigurationError, OperationalError +from datahub.emitter.generic_emitter import Emitter from 
datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.request_helper import make_curl_command from datahub.emitter.serialization_helper import pre_json_transform @@ -23,6 +24,9 @@ ) from datahub.metadata.com.linkedin.pegasus2avro.usage import UsageAggregation +if TYPE_CHECKING: + from datahub.ingestion.graph.client import DataHubGraph + logger = logging.getLogger(__name__) _DEFAULT_CONNECT_TIMEOUT_SEC = 30 # 30 seconds should be plenty to connect @@ -42,7 +46,7 @@ ) -class DataHubRestEmitter(Closeable): +class DataHubRestEmitter(Closeable, Emitter): _gms_server: str _token: Optional[str] _session: requests.Session @@ -190,6 +194,11 @@ def test_connection(self) -> dict: message += "\nPlease check your configuration and make sure you are talking to the DataHub GMS (usually :8080) or Frontend GMS API (usually :9002/api/gms)." raise ConfigurationError(message) + def to_graph(self) -> "DataHubGraph": + from datahub.ingestion.graph.client import DataHubGraph + + return DataHubGraph.from_emitter(self) + def emit( self, item: Union[ @@ -198,9 +207,6 @@ def emit( MetadataChangeProposalWrapper, UsageAggregation, ], - # NOTE: This signature should have the exception be optional rather than - # required. However, this would be a breaking change that may need - # more careful consideration. 
callback: Optional[Callable[[Exception, str], None]] = None, ) -> Tuple[datetime.datetime, datetime.datetime]: start_time = datetime.datetime.now() diff --git a/metadata-ingestion/src/datahub/emitter/synchronized_file_emitter.py b/metadata-ingestion/src/datahub/emitter/synchronized_file_emitter.py new file mode 100644 index 0000000000000..f82882f1a87cc --- /dev/null +++ b/metadata-ingestion/src/datahub/emitter/synchronized_file_emitter.py @@ -0,0 +1,60 @@ +import logging +import pathlib +from typing import Callable, Optional, Union + +import filelock + +from datahub.emitter.generic_emitter import Emitter +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.closeable import Closeable +from datahub.ingestion.sink.file import write_metadata_file +from datahub.ingestion.source.file import read_metadata_file +from datahub.metadata.com.linkedin.pegasus2avro.mxe import ( + MetadataChangeEvent, + MetadataChangeProposal, +) + +logger = logging.getLogger(__name__) + + +class SynchronizedFileEmitter(Closeable, Emitter): + """ + A multiprocessing-safe emitter that writes to a file. + + This emitter is intended for testing purposes only. It is not performant + because it reads and writes the full file on every emit call to ensure + that the file is always valid JSON. 
+ """ + + def __init__(self, filename: str) -> None: + self._filename = pathlib.Path(filename) + self._lock = filelock.FileLock(self._filename.with_suffix(".lock")) + + def emit( + self, + item: Union[ + MetadataChangeEvent, MetadataChangeProposal, MetadataChangeProposalWrapper + ], + callback: Optional[Callable[[Exception, str], None]] = None, + ) -> None: + with self._lock: + if self._filename.exists(): + metadata = list(read_metadata_file(self._filename)) + else: + metadata = [] + + logger.debug("Emitting metadata: %s", item) + metadata.append(item) + + write_metadata_file(self._filename, metadata) + + def __repr__(self) -> str: + return f"SynchronizedFileEmitter('{self._filename}')" + + def flush(self) -> None: + # No-op. + pass + + def close(self) -> None: + # No-op. + pass diff --git a/metadata-ingestion/src/datahub/entrypoints.py b/metadata-ingestion/src/datahub/entrypoints.py index 84615fd9a6148..5bfab3b841fa3 100644 --- a/metadata-ingestion/src/datahub/entrypoints.py +++ b/metadata-ingestion/src/datahub/entrypoints.py @@ -21,6 +21,7 @@ from datahub.cli.ingest_cli import ingest from datahub.cli.migrate import migrate from datahub.cli.put_cli import put +from datahub.cli.specific.datacontract_cli import datacontract from datahub.cli.specific.dataproduct_cli import dataproduct from datahub.cli.specific.group_cli import group from datahub.cli.specific.user_cli import user @@ -158,6 +159,7 @@ def init() -> None: datahub.add_command(user) datahub.add_command(group) datahub.add_command(dataproduct) +datahub.add_command(datacontract) try: from datahub.cli.lite_cli import lite diff --git a/metadata-ingestion/src/datahub/ingestion/api/closeable.py b/metadata-ingestion/src/datahub/ingestion/api/closeable.py index 523174b9978b3..80a5008ed6368 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/closeable.py +++ b/metadata-ingestion/src/datahub/ingestion/api/closeable.py @@ -1,7 +1,9 @@ from abc import abstractmethod from contextlib import AbstractContextManager 
from types import TracebackType -from typing import Optional, Type +from typing import Optional, Type, TypeVar + +_Self = TypeVar("_Self", bound="Closeable") class Closeable(AbstractContextManager): @@ -9,6 +11,10 @@ class Closeable(AbstractContextManager): def close(self) -> None: pass + def __enter__(self: _Self) -> _Self: + # This method is mainly required for type checking. + return self + def __exit__( self, exc_type: Optional[Type[BaseException]], diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index e22d48d0af80a..ccff677c3a471 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -7,7 +7,7 @@ from dataclasses import dataclass from datetime import datetime from json.decoder import JSONDecodeError -from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Tuple, Type +from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Type from avro.schema import RecordSchema from deprecated import deprecated @@ -138,6 +138,23 @@ def __init__(self, config: DatahubClientConfig) -> None: self.server_id = "missing" logger.debug(f"Failed to get server id due to {e}") + @classmethod + def from_emitter(cls, emitter: DatahubRestEmitter) -> "DataHubGraph": + return cls( + DatahubClientConfig( + server=emitter._gms_server, + token=emitter._token, + timeout_sec=emitter._read_timeout_sec, + retry_status_codes=emitter._retry_status_codes, + retry_max_times=emitter._retry_max_times, + extra_headers=emitter._session.headers, + disable_ssl_verification=emitter._session.verify is False, + # TODO: Support these certificate options. 
+ # ca_certificate_path=emitter._ca_certificate_path, + # client_certificate_path=emitter._client_certificate_path, + ) + ) + def _send_restli_request(self, method: str, url: str, **kwargs: Any) -> Dict: try: response = self._session.request(method, url, **kwargs) @@ -805,7 +822,7 @@ def get_related_entities( url=relationship_endpoint, params={ "urn": entity_urn, - "direction": direction, + "direction": direction.value, "relationshipTypes": relationship_types, "start": start, }, @@ -993,14 +1010,13 @@ def _make_schema_resolver( def initialize_schema_resolver_from_datahub( self, platform: str, platform_instance: Optional[str], env: str - ) -> Tuple["SchemaResolver", Set[str]]: + ) -> "SchemaResolver": logger.info("Initializing schema resolver") schema_resolver = self._make_schema_resolver( platform, platform_instance, env, include_graph=False ) logger.info(f"Fetching schemas for platform {platform}, env {env}") - urns = [] count = 0 with PerfTimer() as timer: for urn, schema_info in self._bulk_fetch_schema_info_by_filter( @@ -1009,7 +1025,6 @@ def initialize_schema_resolver_from_datahub( env=env, ): try: - urns.append(urn) schema_resolver.add_graphql_schema_metadata(urn, schema_info) count += 1 except Exception: @@ -1024,7 +1039,7 @@ def initialize_schema_resolver_from_datahub( ) logger.info("Finished initializing schema resolver") - return schema_resolver, set(urns) + return schema_resolver def parse_sql_lineage( self, diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index d544a9776f329..b4a04d96b532b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -458,7 +458,7 @@ def _init_schema_resolver(self) -> SchemaResolver: platform=self.platform, platform_instance=self.config.platform_instance, env=self.config.env, - )[0] + ) else: logger.warning( 
"Failed to load schema info from DataHub as DataHubGraph is missing. " @@ -601,9 +601,6 @@ def _process_project( db_views: Dict[str, List[BigqueryView]] = {} project_id = bigquery_project.id - - yield from self.gen_project_id_containers(project_id) - try: bigquery_project.datasets = ( self.bigquery_data_dictionary.get_datasets_for_project_id(project_id) @@ -620,11 +617,23 @@ def _process_project( return None if len(bigquery_project.datasets) == 0: - logger.warning( - f"No dataset found in {project_id}. Either there are no datasets in this project or missing bigquery.datasets.get permission. You can assign predefined roles/bigquery.metadataViewer role to your service account." + more_info = ( + "Either there are no datasets in this project or missing bigquery.datasets.get permission. " + "You can assign predefined roles/bigquery.metadataViewer role to your service account." + ) + if self.config.exclude_empty_projects: + self.report.report_dropped(project_id) + warning_message = f"Excluded project '{project_id}' since no datasets were found. {more_info}" + else: + yield from self.gen_project_id_containers(project_id) + warning_message = ( + f"No datasets found in project '{project_id}'. 
{more_info}" + ) + logger.warning(warning_message) return + yield from self.gen_project_id_containers(project_id) + self.report.num_project_datasets_to_scan[project_id] = len( bigquery_project.datasets ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index 3b06a4699c566..483355a85ac05 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -265,6 +265,11 @@ def validate_column_lineage(cls, v: bool, values: Dict[str, Any]) -> bool: description="Maximum number of entries for the in-memory caches of FileBacked data structures.", ) + exclude_empty_projects: bool = Field( + default=False, + description="Option to exclude empty projects from being ingested.", + ) + @root_validator(pre=False) def profile_default_settings(cls, values: Dict) -> Dict: # Extra default SQLAlchemy option for better connection pooling and threading. 
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py index 661589a0c58e5..9d92b011ee285 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py @@ -122,6 +122,8 @@ class BigQueryV2Report(ProfilingSqlReport, IngestionStageReport, BaseTimeWindowR usage_state_size: Optional[str] = None + exclude_empty_projects: Optional[bool] = None + schema_api_perf: BigQuerySchemaApiPerfReport = field( default_factory=BigQuerySchemaApiPerfReport ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py index b3e88459917b3..8ae17600e0eea 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py @@ -1,12 +1,9 @@ -import dataclasses import logging from datetime import datetime from typing import Dict, Iterable, List, Optional, Tuple, cast from dateutil.relativedelta import relativedelta -from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance -from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config @@ -15,7 +12,7 @@ RANGE_PARTITION_NAME, BigqueryTable, ) -from datahub.ingestion.source.ge_data_profiler import GEProfilerRequest +from datahub.ingestion.source.sql.sql_generic import BaseTable from datahub.ingestion.source.sql.sql_generic_profiler import ( GenericProfiler, TableProfilerRequest, @@ -25,12 +22,6 @@ logger = logging.getLogger(__name__) -@dataclasses.dataclass -class 
BigqueryProfilerRequest(GEProfilerRequest): - table: BigqueryTable - profile_table_level_only: bool = False - - class BigqueryProfiler(GenericProfiler): config: BigQueryV2Config report: BigQueryV2Report @@ -183,84 +174,54 @@ def get_workunits( ) # Emit the profile work unit - profile_request = self.get_bigquery_profile_request( - project=project_id, dataset=dataset, table=table - ) + profile_request = self.get_profile_request(table, dataset, project_id) if profile_request is not None: + self.report.report_entity_profiled(profile_request.pretty_name) profile_requests.append(profile_request) if len(profile_requests) == 0: return - yield from self.generate_wu_from_profile_requests(profile_requests) - - def generate_wu_from_profile_requests( - self, profile_requests: List[BigqueryProfilerRequest] - ) -> Iterable[MetadataWorkUnit]: - table_profile_requests = cast(List[TableProfilerRequest], profile_requests) - for request, profile in self.generate_profiles( - table_profile_requests, + yield from self.generate_profile_workunits( + profile_requests, self.config.profiling.max_workers, platform=self.platform, profiler_args=self.get_profile_args(), - ): - if request is None or profile is None: - continue - - request = cast(BigqueryProfilerRequest, request) - profile.sizeInBytes = request.table.size_in_bytes - # If table is partitioned we profile only one partition (if nothing set then the last one) - # but for table level we can use the rows_count from the table metadata - # This way even though column statistics only reflects one partition data but the rows count - # shows the proper count. 
- if profile.partitionSpec and profile.partitionSpec.partition: - profile.rowCount = request.table.rows_count - - dataset_name = request.pretty_name - dataset_urn = make_dataset_urn_with_platform_instance( - self.platform, - dataset_name, - self.config.platform_instance, - self.config.env, - ) - # We don't add to the profiler state if we only do table level profiling as it always happens - if self.state_handler and not request.profile_table_level_only: - self.state_handler.add_to_state( - dataset_urn, int(datetime.now().timestamp() * 1000) - ) - - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, aspect=profile - ).as_workunit() + ) - def get_bigquery_profile_request( - self, project: str, dataset: str, table: BigqueryTable - ) -> Optional[BigqueryProfilerRequest]: - skip_profiling = False - profile_table_level_only = self.config.profiling.profile_table_level_only - dataset_name = BigqueryTableIdentifier( - project_id=project, dataset=dataset, table=table.name + def get_dataset_name(self, table_name: str, schema_name: str, db_name: str) -> str: + return BigqueryTableIdentifier( + project_id=db_name, dataset=schema_name, table=table_name ).get_table_name() - if not self.is_dataset_eligible_for_profiling( - dataset_name, table.last_altered, table.size_in_bytes, table.rows_count - ): - profile_table_level_only = True - self.report.num_tables_not_eligible_profiling[f"{project}.{dataset}"] += 1 - if not table.column_count: - skip_profiling = True + def get_batch_kwargs( + self, table: BaseTable, schema_name: str, db_name: str + ) -> dict: + return dict( + schema=db_name, # + table=f"{schema_name}.{table.name}", # . 
+ ) - if skip_profiling: - if self.config.profiling.report_dropped_profiles: - self.report.report_dropped(f"profile of {dataset_name}") + def get_profile_request( + self, table: BaseTable, schema_name: str, db_name: str + ) -> Optional[TableProfilerRequest]: + profile_request = super().get_profile_request(table, schema_name, db_name) + + if not profile_request: return None + # Below code handles profiling changes required for partitioned or sharded tables + # 1. Skip profile if partition profiling is disabled. + # 2. Else update `profile_request.batch_kwargs` with partition and custom_sql + + bq_table = cast(BigqueryTable, table) (partition, custom_sql) = self.generate_partition_profiler_query( - project, dataset, table, self.config.profiling.partition_datetime + db_name, schema_name, bq_table, self.config.profiling.partition_datetime ) - if partition is None and table.partition_info: + + if partition is None and bq_table.partition_info: self.report.report_warning( "profile skipped as partitioned table is empty or partition id or type was invalid", - dataset_name, + profile_request.pretty_name, ) return None if ( @@ -268,24 +229,20 @@ def get_bigquery_profile_request( and not self.config.profiling.partition_profiling_enabled ): logger.debug( - f"{dataset_name} and partition {partition} is skipped because profiling.partition_profiling_enabled property is disabled" + f"{profile_request.pretty_name} and partition {partition} is skipped because profiling.partition_profiling_enabled property is disabled" ) self.report.profiling_skipped_partition_profiling_disabled.append( - dataset_name + profile_request.pretty_name ) return None - self.report.report_entity_profiled(dataset_name) - logger.debug(f"Preparing profiling request for {dataset_name}") - profile_request = BigqueryProfilerRequest( - pretty_name=dataset_name, - batch_kwargs=dict( - schema=project, - table=f"{dataset}.{table.name}", - custom_sql=custom_sql, - partition=partition, - ), - table=table, - 
profile_table_level_only=profile_table_level_only, - ) + if partition: + logger.debug("Updating profiling request for partitioned/sharded tables") + profile_request.batch_kwargs.update( + dict( + custom_sql=custom_sql, + partition=partition, + ) + ) + return profile_request diff --git a/metadata-ingestion/src/datahub/ingestion/source/common/subtypes.py b/metadata-ingestion/src/datahub/ingestion/source/common/subtypes.py index a2d89d26112f4..741b4789bef21 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/common/subtypes.py +++ b/metadata-ingestion/src/datahub/ingestion/source/common/subtypes.py @@ -16,6 +16,9 @@ class DatasetSubTypes(str, Enum): SALESFORCE_STANDARD_OBJECT = "Object" POWERBI_DATASET_TABLE = "PowerBI Dataset Table" + # TODO: Create separate entity... + NOTEBOOK = "Notebook" + class DatasetContainerSubTypes(str, Enum): # Generic SubTypes diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py index 782d94f39e8a5..0f5c08eb6ac54 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py @@ -18,8 +18,8 @@ ConfigurationError, LineageConfig, ) -from datahub.configuration.pydantic_field_deprecation import pydantic_field_deprecated from datahub.configuration.source_common import DatasetSourceConfigMixin +from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated from datahub.emitter import mce_builder from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext @@ -214,7 +214,9 @@ class DBTCommonConfig( default=False, description="Use model identifier instead of model name if defined (if not, default to model name).", ) - _deprecate_use_identifiers = pydantic_field_deprecated("use_identifiers") + _deprecate_use_identifiers = pydantic_field_deprecated( + "use_identifiers", 
warn_if_value_is_not=False + ) entities_enabled: DBTEntitiesEnabled = Field( DBTEntitiesEnabled(), @@ -278,6 +280,14 @@ class DBTCommonConfig( description="When enabled, converts column URNs to lowercase to ensure cross-platform compatibility. " "If `target_platform` is Snowflake, the default is True.", ) + use_compiled_code: bool = Field( + default=False, + description="When enabled, uses the compiled dbt code instead of the raw dbt node definition.", + ) + test_warnings_are_errors: bool = Field( + default=False, + description="When enabled, dbt test warnings will be treated as failures.", + ) @validator("target_platform") def validate_target_platform_value(cls, target_platform: str) -> str: @@ -701,18 +711,22 @@ def create_test_entity_mcps( assertion_urn = mce_builder.make_assertion_urn( mce_builder.datahub_guid( { - "platform": DBT_PLATFORM, - "name": node.dbt_name, - "instance": self.config.platform_instance, - **( - # Ideally we'd include the env unconditionally. However, we started out - # not including env in the guid, so we need to maintain backwards compatibility - # with existing PROD assertions. - {"env": self.config.env} - if self.config.env != mce_builder.DEFAULT_ENV - and self.config.include_env_in_assertion_guid - else {} - ), + k: v + for k, v in { + "platform": DBT_PLATFORM, + "name": node.dbt_name, + "instance": self.config.platform_instance, + **( + # Ideally we'd include the env unconditionally. However, we started out + # not including env in the guid, so we need to maintain backwards compatibility + # with existing PROD assertions. 
+ {"env": self.config.env} + if self.config.env != mce_builder.DEFAULT_ENV + and self.config.include_env_in_assertion_guid + else {} + ), + }.items() + if v is not None } ) ) @@ -807,7 +821,7 @@ def _make_assertion_from_test( mce_builder.make_schema_field_urn(upstream_urn, column_name) ], nativeType=node.name, - logic=node.compiled_code if node.compiled_code else node.raw_code, + logic=node.compiled_code or node.raw_code, aggregation=AssertionStdAggregationClass._NATIVE_, nativeParameters=string_map(kw_args), ), @@ -821,7 +835,7 @@ def _make_assertion_from_test( dataset=upstream_urn, scope=DatasetAssertionScopeClass.DATASET_ROWS, operator=AssertionStdOperatorClass._NATIVE_, - logic=node.compiled_code if node.compiled_code else node.raw_code, + logic=node.compiled_code or node.raw_code, nativeType=node.name, aggregation=AssertionStdAggregationClass._NATIVE_, nativeParameters=string_map(kw_args), @@ -852,6 +866,10 @@ def _make_assertion_result_from_test( result=AssertionResultClass( type=AssertionResultTypeClass.SUCCESS if test_result.status == "pass" + or ( + not self.config.test_warnings_are_errors + and test_result.status == "warn" + ) else AssertionResultTypeClass.FAILURE, nativeResults=test_result.native_results, ), @@ -1003,8 +1021,8 @@ def create_platform_mces( aspects.append(upstream_lineage_class) # add view properties aspect - if node.raw_code and node.language == "sql": - view_prop_aspect = self._create_view_properties_aspect(node) + view_prop_aspect = self._create_view_properties_aspect(node) + if view_prop_aspect: aspects.append(view_prop_aspect) # emit subtype mcp @@ -1129,14 +1147,21 @@ def _create_dataset_properties_aspect( def get_external_url(self, node: DBTNode) -> Optional[str]: pass - def _create_view_properties_aspect(self, node: DBTNode) -> ViewPropertiesClass: + def _create_view_properties_aspect( + self, node: DBTNode + ) -> Optional[ViewPropertiesClass]: + view_logic = ( + node.compiled_code if self.config.use_compiled_code else 
node.raw_code + ) + + if node.language != "sql" or not view_logic: + return None + materialized = node.materialization in {"table", "incremental", "snapshot"} - # this function is only called when raw sql is present. assert is added to satisfy lint checks - assert node.raw_code is not None view_properties = ViewPropertiesClass( materialized=materialized, viewLanguage="SQL", - viewLogic=node.raw_code, + viewLogic=view_logic, ) return view_properties @@ -1188,9 +1213,15 @@ def _generate_base_aspects( ): aspects.append(meta_aspects.get(Constants.ADD_TERM_OPERATION)) + # add meta links aspect + meta_links_aspect = meta_aspects.get(Constants.ADD_DOC_LINK_OPERATION) + if meta_links_aspect and self.config.enable_meta_mapping: + aspects.append(meta_links_aspect) + # add schema metadata aspect schema_metadata = self.get_schema_metadata(self.report, node, mce_platform) aspects.append(schema_metadata) + return aspects def get_schema_metadata( diff --git a/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py b/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py index 6b7c118373673..d7f3dfb9279fb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py @@ -1,5 +1,5 @@ import logging -from dataclasses import field +from dataclasses import dataclass, field from typing import Any, Counter, Dict, Iterable, List, Optional, Type, Union import boto3 @@ -79,12 +79,13 @@ class DynamoDBConfig(DatasetSourceConfigMixin, StatefulIngestionConfigBase): table_pattern: AllowDenyPattern = Field( default=AllowDenyPattern.allow_all(), - description="regex patterns for tables to filter in ingestion.", + description="Regex patterns for tables to filter in ingestion. 
The table name format is 'region.table'", ) # Custom Stateful Ingestion settings stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None +@dataclass class DynamoDBSourceReport(StaleEntityRemovalSourceReport): filtered: List[str] = field(default_factory=list) @@ -175,39 +176,30 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: # traverse databases in sorted order so output is consistent for region in dynamodb_regions: - try: - # create a new dynamodb client for each region, - # it seems for one client we could only list the table of one specific region, - # the list_tables() method don't take any config that related to region - # TODO: list table returns maximum number 100, need to implement pagination here - dynamodb_client = boto3.client( - "dynamodb", - region_name=region, - aws_access_key_id=self.config.aws_access_key_id - if self.config.aws_access_key_id - else None, - aws_secret_access_key=self.config.aws_secret_access_key.get_secret_value() - if self.config.aws_secret_access_key - else None, - ) - table_names: List[str] = dynamodb_client.list_tables()["TableNames"] - except Exception as ex: - # TODO: If regions is config input then this would be self.report.report_warning, - # we can create dynamodb client to take aws region or regions as user input - logger.info(f"exception happen in region {region}, skipping: {ex}") - continue - for table_name in sorted(table_names): - if not self.config.table_pattern.allowed(table_name): + logger.info(f"Processing region {region}") + # create a new dynamodb client for each region, + # it seems for one client we could only list the table of one specific region, + # the list_tables() method don't take any config that related to region + dynamodb_client = boto3.client( + "dynamodb", + region_name=region, + aws_access_key_id=self.config.aws_access_key_id, + aws_secret_access_key=self.config.aws_secret_access_key.get_secret_value(), + ) + + for table_name in self._list_tables(dynamodb_client): + 
dataset_name = f"{region}.{table_name}" + if not self.config.table_pattern.allowed(dataset_name): + logger.debug(f"skipping table: {dataset_name}") + self.report.report_dropped(dataset_name) continue + + logger.debug(f"Processing table: {dataset_name}") table_info = dynamodb_client.describe_table(TableName=table_name)[ "Table" ] account_id = table_info["TableArn"].split(":")[4] - if not self.config.table_pattern.allowed(table_name): - self.report.report_dropped(table_name) - continue platform_instance = self.config.platform_instance or account_id - dataset_name = f"{region}.{table_name}" dataset_urn = make_dataset_urn_with_platform_instance( platform=self.platform, platform_instance=platform_instance, @@ -222,7 +214,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: ) primary_key_dict = self.extract_primary_key_from_key_schema(table_info) table_schema = self.construct_schema_from_dynamodb( - dynamodb_client, table_name + dynamodb_client, region, table_name ) schema_metadata = self.construct_schema_metadata( table_name, @@ -254,9 +246,25 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: aspect=platform_instance_aspect, ).as_workunit() + def _list_tables( + self, + dynamodb_client: BaseClient, + ) -> Iterable[str]: + # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/dynamodb/paginator/ListTables.html + try: + for page in dynamodb_client.get_paginator("list_tables").paginate(): + table_names = page.get("TableNames") + if table_names: + yield from table_names + except Exception as ex: + # TODO: If regions is config input then this would be self.report.report_warning, + # we can create dynamodb client to take aws region or regions as user input + logger.info(f"Exception happened while listing tables, skipping: {ex}") + def construct_schema_from_dynamodb( self, dynamodb_client: BaseClient, + region: str, table_name: str, ) -> Dict[str, SchemaDescription]: """ @@ -275,7 +283,7 @@ def 
construct_schema_from_dynamodb( The MaxItems is the total number of items to return, and PageSize is the size of each page, we are assigning same value to these two config. If MaxItems is more than PageSize then we expect MaxItems / PageSize pages in response_iterator will return """ - self.include_table_item_to_schema(dynamodb_client, table_name, schema) + self.include_table_item_to_schema(dynamodb_client, region, table_name, schema) response_iterator = paginator.paginate( TableName=table_name, PaginationConfig={ @@ -294,33 +302,38 @@ def construct_schema_from_dynamodb( def include_table_item_to_schema( self, dynamodb_client: Any, + region: str, table_name: str, schema: Dict[str, SchemaDescription], ) -> None: """ - It will look up in the config include_table_item dict to see if the current table name exists as key, + It will look up in the config include_table_item dict to see if "region.table_name" exists as key, if it exists then get the items by primary key from the table and put it to schema """ if self.config.include_table_item is None: return - if table_name not in self.config.include_table_item.keys(): + dataset_name = f"{region}.{table_name}" + if dataset_name not in self.config.include_table_item.keys(): return - primary_key_list = self.config.include_table_item.get(table_name) + primary_key_list = self.config.include_table_item.get(dataset_name) assert isinstance(primary_key_list, List) if len(primary_key_list) > MAX_PRIMARY_KEYS_SIZE: logger.info( - f"the provided primary keys list size exceeded the max size for table {table_name}, we'll only process the first {MAX_PRIMARY_KEYS_SIZE} items" + f"the provided primary keys list size exceeded the max size for table {dataset_name}, we'll only process the first {MAX_PRIMARY_KEYS_SIZE} items" ) primary_key_list = primary_key_list[0:MAX_PRIMARY_KEYS_SIZE] items = [] response = dynamodb_client.batch_get_item( RequestItems={table_name: {"Keys": primary_key_list}} - ).get("Responses", None) + ).get("Responses") 
if response is None: logger.error( f"failed to retrieve item from table {table_name} by the given key {primary_key_list}" ) return + logger.debug( + f"successfully retrieved {len(primary_key_list)} items based on supplied primary key list" + ) items = response.get(table_name) self.construct_schema_from_items(items, schema) diff --git a/metadata-ingestion/src/datahub/ingestion/source/file.py b/metadata-ingestion/src/datahub/ingestion/source/file.py index de61fa8481c58..590aa59f7b5b6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/file.py +++ b/metadata-ingestion/src/datahub/ingestion/source/file.py @@ -16,7 +16,7 @@ from pydantic.fields import Field from datahub.configuration.common import ConfigEnum, ConfigModel, ConfigurationError -from datahub.configuration.pydantic_field_deprecation import pydantic_field_deprecated +from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated from datahub.configuration.validate_field_rename import pydantic_renamed_field from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py index 01e083d566168..9f6ac9dd21164 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py @@ -273,6 +273,7 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase): partition: Optional[str] config: GEProfilingConfig report: SQLSourceReport + custom_sql: Optional[str] query_combiner: SQLAlchemyQueryCombiner @@ -596,16 +597,8 @@ def generate_dataset_profile( # noqa: C901 (complexity) "catch_exceptions", self.config.catch_exceptions ) - profile = DatasetProfileClass(timestampMillis=get_sys_time()) - if self.partition: - profile.partitionSpec = PartitionSpecClass(partition=self.partition) - elif self.config.limit 
and self.config.offset: - profile.partitionSpec = PartitionSpecClass( - type=PartitionTypeClass.QUERY, - partition=json.dumps( - dict(limit=self.config.limit, offset=self.config.offset) - ), - ) + profile = self.init_profile() + profile.fieldProfiles = [] self._get_dataset_rows(profile) @@ -740,6 +733,24 @@ def generate_dataset_profile( # noqa: C901 (complexity) self.query_combiner.flush() return profile + def init_profile(self): + profile = DatasetProfileClass(timestampMillis=get_sys_time()) + if self.partition: + profile.partitionSpec = PartitionSpecClass(partition=self.partition) + elif self.config.limit: + profile.partitionSpec = PartitionSpecClass( + type=PartitionTypeClass.QUERY, + partition=json.dumps( + dict(limit=self.config.limit, offset=self.config.offset) + ), + ) + elif self.custom_sql: + profile.partitionSpec = PartitionSpecClass( + type=PartitionTypeClass.QUERY, partition="SAMPLE" + ) + + return profile + def update_dataset_batch_use_sampling(self, profile: DatasetProfileClass) -> None: if ( self.dataset.engine.dialect.name.lower() == BIGQUERY @@ -1064,6 +1075,7 @@ def _generate_single_profile( partition, self.config, self.report, + custom_sql, query_combiner, ).generate_dataset_profile() diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py index 77761c529ba0b..24a3e520d8caf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py @@ -157,12 +157,12 @@ class GEProfilingConfig(ConfigModel): ) use_sampling: bool = Field( default=True, - description="Whether to profile column level stats on sample of table. Only BigQuery supports this. " + description="Whether to profile column level stats on sample of table. Only BigQuery and Snowflake support this. " "If enabled, profiling is done on rows sampled from table. 
Sampling is not done for smaller tables. ", ) sample_size: int = Field( - default=1000, + default=10000, description="Number of rows to be sampled from table for column level profiling." "Applicable only if `use_sampling` is set to True.", ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py index f3344782917ab..5fae0ee5215a3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py @@ -28,7 +28,9 @@ ) from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.sql.sql_common import get_platform_from_sqlalchemy_uri +from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import ( + get_platform_from_sqlalchemy_uri, +) from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, StaleEntityRemovalSourceReport, diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index ffa685fb25826..96729f4c60c6c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -9,8 +9,8 @@ import datahub.emitter.mce_builder as builder from datahub.configuration.common import AllowDenyPattern, ConfigModel -from datahub.configuration.pydantic_field_deprecation import pydantic_field_deprecated from datahub.configuration.source_common import DEFAULT_ENV, DatasetSourceConfigMixin +from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated from datahub.ingestion.source.common.subtypes import BIAssetSubTypes from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalSourceReport, @@ -397,6 +397,42 @@ class 
PowerBiDashboardSourceConfig( "as this option generates the upstream datasets URN in lowercase.", ) + # Enable CLL extraction + extract_column_level_lineage: bool = pydantic.Field( + default=False, + description="Whether to extract column level lineage. " + "Works only if configs `native_query_parsing`, `enable_advance_lineage_sql_construct` & `extract_lineage` are enabled. " + "Works for M-Query where native SQL is used for transformation.", + ) + + @root_validator + @classmethod + def validate_extract_column_level_lineage(cls, values: Dict) -> Dict: + flags = [ + "native_query_parsing", + "enable_advance_lineage_sql_construct", + "extract_lineage", + ] + + if ( + "extract_column_level_lineage" in values + and values["extract_column_level_lineage"] is False + ): + # Flag is not set. skip validation + return values + + logger.debug(f"Validating additional flags: {flags}") + + is_flag_enabled: bool = True + for flag in flags: + if flag not in values or values[flag] is False: + is_flag_enabled = False + + if not is_flag_enabled: + raise ValueError(f"Enable all these flags in recipe: {flags} ") + + return values + @validator("dataset_type_mapping") @classmethod def map_data_platform(cls, value): diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py index 021c429c3c633..0afa8e7ff4564 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py @@ -9,7 +9,7 @@ SPECIAL_CHARACTERS = ["#(lf)", "(lf)"] -logger = logging.getLogger() +logger = logging.getLogger(__name__) def remove_special_characters(native_query: str) -> str: @@ -21,7 +21,7 @@ def remove_special_characters(native_query: str) -> str: def get_tables(native_query: str) -> List[str]: native_query = remove_special_characters(native_query) - 
logger.debug(f"Processing query = {native_query}") + logger.debug(f"Processing native query = {native_query}") tables: List[str] = [] parsed = sqlparse.parse(native_query)[0] tokens: List[sqlparse.sql.Token] = list(parsed.tokens) @@ -65,7 +65,7 @@ def parse_custom_sql( sql_query = remove_special_characters(query) - logger.debug(f"Parsing sql={sql_query}") + logger.debug(f"Processing native query = {sql_query}") return sqlglot_l.create_lineage_sql_parsed_result( query=sql_query, diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py index 8cc38c366c42a..9134932c39fe0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py @@ -56,7 +56,7 @@ def get_upstream_tables( ctx: PipelineContext, config: PowerBiDashboardSourceConfig, parameters: Dict[str, str] = {}, -) -> List[resolver.DataPlatformTable]: +) -> List[resolver.Lineage]: if table.expression is None: logger.debug(f"Expression is none for table {table.full_name}") return [] diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py index 479f1decff903..e200ff41f71c2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py @@ -27,7 +27,7 @@ IdentifierAccessor, ) from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table -from datahub.utilities.sqlglot_lineage import SqlParsingResult +from datahub.utilities.sqlglot_lineage import ColumnLineageInfo, SqlParsingResult logger = logging.getLogger(__name__) @@ -38,6 +38,16 @@ class DataPlatformTable: urn: str +@dataclass +class Lineage: + upstreams: List[DataPlatformTable] + column_lineage: List[ColumnLineageInfo] + + 
@staticmethod + def empty() -> "Lineage": + return Lineage(upstreams=[], column_lineage=[]) + + def urn_to_lowercase(value: str, flag: bool) -> str: if flag is True: return value.lower() @@ -120,9 +130,9 @@ def __init__( self.platform_instance_resolver = platform_instance_resolver @abstractmethod - def create_dataplatform_tables( + def create_lineage( self, data_access_func_detail: DataAccessFunctionDetail - ) -> List[DataPlatformTable]: + ) -> Lineage: pass @abstractmethod @@ -147,7 +157,7 @@ def get_db_detail_from_argument( def parse_custom_sql( self, query: str, server: str, database: Optional[str], schema: Optional[str] - ) -> List[DataPlatformTable]: + ) -> Lineage: dataplatform_tables: List[DataPlatformTable] = [] @@ -174,7 +184,7 @@ def parse_custom_sql( if parsed_result is None: logger.debug("Failed to parse query") - return dataplatform_tables + return Lineage.empty() for urn in parsed_result.in_tables: dataplatform_tables.append( @@ -184,9 +194,15 @@ def parse_custom_sql( ) ) + logger.debug(f"Native Query parsed result={parsed_result}") logger.debug(f"Generated dataplatform_tables={dataplatform_tables}") - return dataplatform_tables + return Lineage( + upstreams=dataplatform_tables, + column_lineage=parsed_result.column_lineage + if parsed_result.column_lineage is not None + else [], + ) class AbstractDataAccessMQueryResolver(ABC): @@ -215,7 +231,7 @@ def resolve_to_data_platform_table_list( ctx: PipelineContext, config: PowerBiDashboardSourceConfig, platform_instance_resolver: AbstractDataPlatformInstanceResolver, - ) -> List[DataPlatformTable]: + ) -> List[Lineage]: pass @@ -471,8 +487,8 @@ def resolve_to_data_platform_table_list( ctx: PipelineContext, config: PowerBiDashboardSourceConfig, platform_instance_resolver: AbstractDataPlatformInstanceResolver, - ) -> List[DataPlatformTable]: - data_platform_tables: List[DataPlatformTable] = [] + ) -> List[Lineage]: + lineage: List[Lineage] = [] # Find out output variable as we are doing backtracking in 
M-Query output_variable: Optional[str] = tree_function.get_output_variable( @@ -484,7 +500,7 @@ def resolve_to_data_platform_table_list( f"{self.table.full_name}-output-variable", "output-variable not found in table expression", ) - return data_platform_tables + return lineage # Parse M-Query and use output_variable as root of tree and create instance of DataAccessFunctionDetail table_links: List[ @@ -509,7 +525,7 @@ def resolve_to_data_platform_table_list( # From supported_resolver enum get respective resolver like AmazonRedshift or Snowflake or Oracle or NativeQuery and create instance of it # & also pass additional information that will be need to generate urn - table_full_name_creator: AbstractDataPlatformTableCreator = ( + table_qualified_name_creator: AbstractDataPlatformTableCreator = ( supported_resolver.get_table_full_name_creator()( ctx=ctx, config=config, @@ -517,11 +533,9 @@ def resolve_to_data_platform_table_list( ) ) - data_platform_tables.extend( - table_full_name_creator.create_dataplatform_tables(f_detail) - ) + lineage.append(table_qualified_name_creator.create_lineage(f_detail)) - return data_platform_tables + return lineage class DefaultTwoStepDataAccessSources(AbstractDataPlatformTableCreator, ABC): @@ -536,7 +550,7 @@ class DefaultTwoStepDataAccessSources(AbstractDataPlatformTableCreator, ABC): def two_level_access_pattern( self, data_access_func_detail: DataAccessFunctionDetail - ) -> List[DataPlatformTable]: + ) -> Lineage: logger.debug( f"Processing {self.get_platform_pair().powerbi_data_platform_name} data-access function detail {data_access_func_detail}" ) @@ -545,7 +559,7 @@ def two_level_access_pattern( data_access_func_detail.arg_list ) if server is None or db_name is None: - return [] # Return empty list + return Lineage.empty() # Return empty list schema_name: str = cast( IdentifierAccessor, data_access_func_detail.identifier_accessor @@ -568,19 +582,21 @@ def two_level_access_pattern( server=server, 
qualified_table_name=qualified_table_name, ) - - return [ - DataPlatformTable( - data_platform_pair=self.get_platform_pair(), - urn=urn, - ) - ] + return Lineage( + upstreams=[ + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ], + column_lineage=[], + ) class PostgresDataPlatformTableCreator(DefaultTwoStepDataAccessSources): - def create_dataplatform_tables( + def create_lineage( self, data_access_func_detail: DataAccessFunctionDetail - ) -> List[DataPlatformTable]: + ) -> Lineage: return self.two_level_access_pattern(data_access_func_detail) def get_platform_pair(self) -> DataPlatformPair: @@ -630,10 +646,10 @@ def create_urn_using_old_parser( return dataplatform_tables - def create_dataplatform_tables( + def create_lineage( self, data_access_func_detail: DataAccessFunctionDetail - ) -> List[DataPlatformTable]: - dataplatform_tables: List[DataPlatformTable] = [] + ) -> Lineage: + arguments: List[str] = tree_function.strip_char_from_list( values=tree_function.remove_whitespaces_from_list( tree_function.token_values(data_access_func_detail.arg_list) @@ -647,14 +663,17 @@ def create_dataplatform_tables( if len(arguments) >= 4 and arguments[2] != "Query": logger.debug("Unsupported case is found. 
Second index is not the Query") - return dataplatform_tables + return Lineage.empty() if self.config.enable_advance_lineage_sql_construct is False: # Use previous parser to generate URN to keep backward compatibility - return self.create_urn_using_old_parser( - query=arguments[3], - db_name=arguments[1], - server=arguments[0], + return Lineage( + upstreams=self.create_urn_using_old_parser( + query=arguments[3], + db_name=arguments[1], + server=arguments[0], + ), + column_lineage=[], ) return self.parse_custom_sql( @@ -684,9 +703,9 @@ def _get_server_and_db_name(value: str) -> Tuple[Optional[str], Optional[str]]: return tree_function.strip_char_from_list([splitter_result[0]])[0], db_name - def create_dataplatform_tables( + def create_lineage( self, data_access_func_detail: DataAccessFunctionDetail - ) -> List[DataPlatformTable]: + ) -> Lineage: logger.debug( f"Processing Oracle data-access function detail {data_access_func_detail}" ) @@ -698,7 +717,7 @@ def create_dataplatform_tables( server, db_name = self._get_server_and_db_name(arguments[0]) if db_name is None or server is None: - return [] + return Lineage.empty() schema_name: str = cast( IdentifierAccessor, data_access_func_detail.identifier_accessor @@ -719,18 +738,21 @@ def create_dataplatform_tables( qualified_table_name=qualified_table_name, ) - return [ - DataPlatformTable( - data_platform_pair=self.get_platform_pair(), - urn=urn, - ) - ] + return Lineage( + upstreams=[ + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ], + column_lineage=[], + ) class DatabrickDataPlatformTableCreator(AbstractDataPlatformTableCreator): - def create_dataplatform_tables( + def create_lineage( self, data_access_func_detail: DataAccessFunctionDetail - ) -> List[DataPlatformTable]: + ) -> Lineage: logger.debug( f"Processing Databrick data-access function detail {data_access_func_detail}" ) @@ -749,7 +771,7 @@ def create_dataplatform_tables( logger.debug( "expecting instance to be 
IdentifierAccessor, please check if parsing is done properly" ) - return [] + return Lineage.empty() db_name: str = value_dict["Database"] schema_name: str = value_dict["Schema"] @@ -762,7 +784,7 @@ def create_dataplatform_tables( logger.info( f"server information is not available for {qualified_table_name}. Skipping upstream table" ) - return [] + return Lineage.empty() urn = urn_creator( config=self.config, @@ -772,12 +794,15 @@ def create_dataplatform_tables( qualified_table_name=qualified_table_name, ) - return [ - DataPlatformTable( - data_platform_pair=self.get_platform_pair(), - urn=urn, - ) - ] + return Lineage( + upstreams=[ + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ], + column_lineage=[], + ) def get_platform_pair(self) -> DataPlatformPair: return SupportedDataPlatform.DATABRICK_SQL.value @@ -789,9 +814,9 @@ def get_datasource_server( ) -> str: return tree_function.strip_char_from_list([arguments[0]])[0] - def create_dataplatform_tables( + def create_lineage( self, data_access_func_detail: DataAccessFunctionDetail - ) -> List[DataPlatformTable]: + ) -> Lineage: logger.debug( f"Processing {self.get_platform_pair().datahub_data_platform_name} function detail {data_access_func_detail}" ) @@ -826,12 +851,15 @@ def create_dataplatform_tables( qualified_table_name=qualified_table_name, ) - return [ - DataPlatformTable( - data_platform_pair=self.get_platform_pair(), - urn=urn, - ) - ] + return Lineage( + upstreams=[ + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ], + column_lineage=[], + ) class SnowflakeDataPlatformTableCreator(DefaultThreeStepDataAccessSources): @@ -859,9 +887,9 @@ class AmazonRedshiftDataPlatformTableCreator(AbstractDataPlatformTableCreator): def get_platform_pair(self) -> DataPlatformPair: return SupportedDataPlatform.AMAZON_REDSHIFT.value - def create_dataplatform_tables( + def create_lineage( self, data_access_func_detail: DataAccessFunctionDetail - ) -> 
List[DataPlatformTable]: + ) -> Lineage: logger.debug( f"Processing AmazonRedshift data-access function detail {data_access_func_detail}" ) @@ -870,7 +898,7 @@ def create_dataplatform_tables( data_access_func_detail.arg_list ) if db_name is None or server is None: - return [] # Return empty list + return Lineage.empty() # Return empty list schema_name: str = cast( IdentifierAccessor, data_access_func_detail.identifier_accessor @@ -891,12 +919,15 @@ def create_dataplatform_tables( qualified_table_name=qualified_table_name, ) - return [ - DataPlatformTable( - data_platform_pair=self.get_platform_pair(), - urn=urn, - ) - ] + return Lineage( + upstreams=[ + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ], + column_lineage=[], + ) class NativeQueryDataPlatformTableCreator(AbstractDataPlatformTableCreator): @@ -916,9 +947,7 @@ def is_native_parsing_supported(data_access_function_name: str) -> bool: in NativeQueryDataPlatformTableCreator.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM ) - def create_urn_using_old_parser( - self, query: str, server: str - ) -> List[DataPlatformTable]: + def create_urn_using_old_parser(self, query: str, server: str) -> Lineage: dataplatform_tables: List[DataPlatformTable] = [] tables: List[str] = native_sql_parser.get_tables(query) @@ -947,12 +976,14 @@ def create_urn_using_old_parser( logger.debug(f"Generated dataplatform_tables {dataplatform_tables}") - return dataplatform_tables + return Lineage( + upstreams=dataplatform_tables, + column_lineage=[], + ) - def create_dataplatform_tables( + def create_lineage( self, data_access_func_detail: DataAccessFunctionDetail - ) -> List[DataPlatformTable]: - dataplatform_tables: List[DataPlatformTable] = [] + ) -> Lineage: t1: Tree = cast( Tree, tree_function.first_arg_list_func(data_access_func_detail.arg_list) ) @@ -963,7 +994,7 @@ def create_dataplatform_tables( f"Expecting 2 argument, actual argument count is {len(flat_argument_list)}" ) logger.debug(f"Flat argument 
list = {flat_argument_list}") - return dataplatform_tables + return Lineage.empty() data_access_tokens: List[str] = tree_function.remove_whitespaces_from_list( tree_function.token_values(flat_argument_list[0]) ) @@ -981,7 +1012,7 @@ def create_dataplatform_tables( f"Server is not available in argument list for data-platform {data_access_tokens[0]}. Returning empty " "list" ) - return dataplatform_tables + return Lineage.empty() self.current_data_platform = self.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM[ data_access_tokens[0] diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index 5d477ee090e7e..52bcef66658c8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -44,6 +44,11 @@ StatefulIngestionSourceBase, ) from datahub.metadata.com.linkedin.pegasus2avro.common import ChangeAuditStamps +from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( + FineGrainedLineage, + FineGrainedLineageDownstreamType, + FineGrainedLineageUpstreamType, +) from datahub.metadata.schema_classes import ( BrowsePathsClass, ChangeTypeClass, @@ -71,6 +76,7 @@ ViewPropertiesClass, ) from datahub.utilities.dedup_list import deduplicate_list +from datahub.utilities.sqlglot_lineage import ColumnLineageInfo # Logger instance logger = logging.getLogger(__name__) @@ -165,6 +171,48 @@ def extract_dataset_schema( ) return [schema_mcp] + def make_fine_grained_lineage_class( + self, lineage: resolver.Lineage, dataset_urn: str + ) -> List[FineGrainedLineage]: + fine_grained_lineages: List[FineGrainedLineage] = [] + + if ( + self.__config.extract_column_level_lineage is False + or self.__config.extract_lineage is False + ): + return fine_grained_lineages + + if lineage is None: + return fine_grained_lineages + + logger.info("Extracting column level lineage") + + cll: List[ColumnLineageInfo] = 
lineage.column_lineage + + for cll_info in cll: + downstream = ( + [builder.make_schema_field_urn(dataset_urn, cll_info.downstream.column)] + if cll_info.downstream is not None + and cll_info.downstream.column is not None + else [] + ) + + upstreams = [ + builder.make_schema_field_urn(column_ref.table, column_ref.column) + for column_ref in cll_info.upstreams + ] + + fine_grained_lineages.append( + FineGrainedLineage( + downstreamType=FineGrainedLineageDownstreamType.FIELD, + downstreams=downstream, + upstreamType=FineGrainedLineageUpstreamType.FIELD_SET, + upstreams=upstreams, + ) + ) + + return fine_grained_lineages + def extract_lineage( self, table: powerbi_data_classes.Table, ds_urn: str ) -> List[MetadataChangeProposalWrapper]: @@ -174,8 +222,9 @@ def extract_lineage( parameters = table.dataset.parameters if table.dataset else {} upstream: List[UpstreamClass] = [] + cll_lineage: List[FineGrainedLineage] = [] - upstream_dpts: List[resolver.DataPlatformTable] = parser.get_upstream_tables( + upstream_lineage: List[resolver.Lineage] = parser.get_upstream_tables( table=table, reporter=self.__reporter, platform_instance_resolver=self.__dataplatform_instance_resolver, @@ -185,34 +234,49 @@ def extract_lineage( ) logger.debug( - f"PowerBI virtual table {table.full_name} and it's upstream dataplatform tables = {upstream_dpts}" + f"PowerBI virtual table {table.full_name} and it's upstream dataplatform tables = {upstream_lineage}" ) - for upstream_dpt in upstream_dpts: - if ( - upstream_dpt.data_platform_pair.powerbi_data_platform_name - not in self.__config.dataset_type_mapping.keys() - ): - logger.debug( - f"Skipping upstream table for {ds_urn}. 
The platform {upstream_dpt.data_platform_pair.powerbi_data_platform_name} is not part of dataset_type_mapping", + for lineage in upstream_lineage: + for upstream_dpt in lineage.upstreams: + if ( + upstream_dpt.data_platform_pair.powerbi_data_platform_name + not in self.__config.dataset_type_mapping.keys() + ): + logger.debug( + f"Skipping upstream table for {ds_urn}. The platform {upstream_dpt.data_platform_pair.powerbi_data_platform_name} is not part of dataset_type_mapping", + ) + continue + + upstream_table_class = UpstreamClass( + upstream_dpt.urn, + DatasetLineageTypeClass.TRANSFORMED, ) - continue - upstream_table_class = UpstreamClass( - upstream_dpt.urn, - DatasetLineageTypeClass.TRANSFORMED, - ) + upstream.append(upstream_table_class) - upstream.append(upstream_table_class) + # Add column level lineage if any + cll_lineage.extend( + self.make_fine_grained_lineage_class( + lineage=lineage, + dataset_urn=ds_urn, + ) + ) if len(upstream) > 0: - upstream_lineage = UpstreamLineageClass(upstreams=upstream) + + upstream_lineage_class: UpstreamLineageClass = UpstreamLineageClass( + upstreams=upstream, + fineGrainedLineages=cll_lineage or None, + ) + logger.debug(f"Dataset urn = {ds_urn} and its lineage = {upstream_lineage}") + mcp = MetadataChangeProposalWrapper( entityType=Constant.DATASET, changeType=ChangeTypeClass.UPSERT, entityUrn=ds_urn, - aspect=upstream_lineage, + aspect=upstream_lineage_class, ) mcps.append(mcp) @@ -1075,6 +1139,10 @@ def report_to_datahub_work_units( SourceCapability.OWNERSHIP, "Disabled by default, configured using `extract_ownership`", ) +@capability( + SourceCapability.LINEAGE_FINE, + "Disabled by default, configured using `extract_column_level_lineage`. 
", +) class PowerBiDashboardSource(StatefulIngestionSourceBase): """ This plugin extracts the following: diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py index 93850607e551e..804a14b0fe1cf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py @@ -7,8 +7,8 @@ from datahub.configuration import ConfigModel from datahub.configuration.common import AllowDenyPattern -from datahub.configuration.pydantic_field_deprecation import pydantic_field_deprecated from datahub.configuration.source_common import DatasetLineageProviderConfigBase +from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated from datahub.ingestion.source.data_lake_common.path_spec import PathSpec from datahub.ingestion.source.sql.postgres import BasePostgresConfig from datahub.ingestion.source.state.stateful_ingestion_base import ( diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/profile.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/profile.py index e983734082b1d..771636e8498a3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/profile.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/profile.py @@ -1,33 +1,19 @@ -import dataclasses import logging -from datetime import datetime -from typing import Dict, Iterable, List, Optional, Union, cast +from typing import Dict, Iterable, List, Optional, Union -from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance -from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.ge_data_profiler import GEProfilerRequest from datahub.ingestion.source.redshift.config import RedshiftConfig from datahub.ingestion.source.redshift.redshift_schema import ( 
RedshiftTable, RedshiftView, ) from datahub.ingestion.source.redshift.report import RedshiftReport -from datahub.ingestion.source.sql.sql_generic_profiler import ( - GenericProfiler, - TableProfilerRequest, -) +from datahub.ingestion.source.sql.sql_generic_profiler import GenericProfiler from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler logger = logging.getLogger(__name__) -@dataclasses.dataclass -class RedshiftProfilerRequest(GEProfilerRequest): - table: Union[RedshiftTable, RedshiftView] - profile_table_level_only: bool = False - - class RedshiftProfiler(GenericProfiler): config: RedshiftConfig report: RedshiftReport @@ -63,80 +49,21 @@ def get_workunits( continue for table in tables[db].get(schema, {}): # Emit the profile work unit - profile_request = self.get_redshift_profile_request( - table, schema, db - ) + profile_request = self.get_profile_request(table, schema, db) if profile_request is not None: + self.report.report_entity_profiled(profile_request.pretty_name) profile_requests.append(profile_request) if len(profile_requests) == 0: continue - table_profile_requests = cast(List[TableProfilerRequest], profile_requests) - for request, profile in self.generate_profiles( - table_profile_requests, + + yield from self.generate_profile_workunits( + profile_requests, self.config.profiling.max_workers, db, platform=self.platform, profiler_args=self.get_profile_args(), - ): - if profile is None: - continue - request = cast(RedshiftProfilerRequest, request) - - profile.sizeInBytes = request.table.size_in_bytes - dataset_name = request.pretty_name - dataset_urn = make_dataset_urn_with_platform_instance( - self.platform, - dataset_name, - self.config.platform_instance, - self.config.env, - ) - - # We don't add to the profiler state if we only do table level profiling as it always happens - if self.state_handler and not request.profile_table_level_only: - self.state_handler.add_to_state( - dataset_urn, int(datetime.now().timestamp() * 
1000) - ) - - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, aspect=profile - ).as_workunit() - - def get_redshift_profile_request( - self, - table: Union[RedshiftTable, RedshiftView], - schema_name: str, - db_name: str, - ) -> Optional[RedshiftProfilerRequest]: - skip_profiling = False - profile_table_level_only = self.config.profiling.profile_table_level_only - dataset_name = f"{db_name}.{schema_name}.{table.name}".lower() - if not self.is_dataset_eligible_for_profiling( - dataset_name, table.last_altered, table.size_in_bytes, table.rows_count - ): - # Profile only table level if dataset is filtered from profiling - # due to size limits alone - if self.is_dataset_eligible_for_profiling( - dataset_name, table.last_altered, 0, 0 - ): - profile_table_level_only = True - else: - skip_profiling = True - - if len(table.columns) == 0: - skip_profiling = True - - if skip_profiling: - if self.config.profiling.report_dropped_profiles: - self.report.report_dropped(f"profile of {dataset_name}") - return None + ) - self.report.report_entity_profiled(dataset_name) - logger.debug(f"Preparing profiling request for {dataset_name}") - profile_request = RedshiftProfilerRequest( - pretty_name=dataset_name, - batch_kwargs=dict(schema=schema_name, table=table.name), - table=table, - profile_table_level_only=profile_table_level_only, - ) - return profile_request + def get_dataset_name(self, table_name: str, schema_name: str, db_name: str) -> str: + return f"{db_name}.{schema_name}.{table_name}".lower() diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/config.py b/metadata-ingestion/src/datahub/ingestion/source/s3/config.py index f1dd622efb746..9b5296f0b9dd5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/config.py @@ -5,8 +5,8 @@ from pydantic.fields import Field from datahub.configuration.common import AllowDenyPattern -from datahub.configuration.pydantic_field_deprecation 
import pydantic_field_deprecated from datahub.configuration.source_common import DatasetSourceConfigMixin +from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated from datahub.configuration.validate_field_rename import pydantic_renamed_field from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig from datahub.ingestion.source.data_lake_common.config import PathSpecsConfigMixin diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 95f6444384408..032bdef178fdf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -101,8 +101,8 @@ class SnowflakeV2Config( ) include_view_column_lineage: bool = Field( - default=False, - description="Populates view->view and table->view column lineage.", + default=True, + description="Populates view->view and table->view column lineage using DataHub's sql parser.", ) _check_role_grants_removed = pydantic_removed_field("check_role_grants") diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py index 5f5e8e4bcdea3..24275dcdff34d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py @@ -1,20 +1,12 @@ -import dataclasses import logging -from datetime import datetime -from typing import Callable, Dict, Iterable, List, Optional, cast +from typing import Callable, Dict, Iterable, List, Optional from snowflake.sqlalchemy import snowdialect from sqlalchemy import create_engine, inspect from sqlalchemy.sql import sqltypes -from datahub.configuration.pattern_utils import is_schema_allowed -from 
datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance -from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.ge_data_profiler import ( - DatahubGEProfiler, - GEProfilerRequest, -) +from datahub.ingestion.source.ge_data_profiler import DatahubGEProfiler from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report @@ -23,10 +15,8 @@ SnowflakeTable, ) from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin -from datahub.ingestion.source.sql.sql_generic_profiler import ( - GenericProfiler, - TableProfilerRequest, -) +from datahub.ingestion.source.sql.sql_generic import BaseTable +from datahub.ingestion.source.sql.sql_generic_profiler import GenericProfiler from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler snowdialect.ischema_names["GEOGRAPHY"] = sqltypes.NullType @@ -35,12 +25,6 @@ logger = logging.getLogger(__name__) -@dataclasses.dataclass -class SnowflakeProfilerRequest(GEProfilerRequest): - table: SnowflakeTable - profile_table_level_only: bool = False - - class SnowflakeProfiler(GenericProfiler, SnowflakeCommonMixin): def __init__( self, @@ -65,101 +49,52 @@ def get_workunits( profile_requests = [] for schema in database.schemas: - if not is_schema_allowed( - self.config.schema_pattern, - schema.name, - database.name, - self.config.match_fully_qualified_names, - ): - continue - for table in db_tables[schema.name]: - profile_request = self.get_snowflake_profile_request( + profile_request = self.get_profile_request( table, schema.name, database.name ) if profile_request is not None: + self.report.report_entity_profiled(profile_request.pretty_name) profile_requests.append(profile_request) if 
len(profile_requests) == 0: return - table_profile_requests = cast(List[TableProfilerRequest], profile_requests) - - for request, profile in self.generate_profiles( - table_profile_requests, + yield from self.generate_profile_workunits( + profile_requests, self.config.profiling.max_workers, database.name, platform=self.platform, profiler_args=self.get_profile_args(), - ): - if profile is None: - continue - profile.sizeInBytes = cast( - SnowflakeProfilerRequest, request - ).table.size_in_bytes - dataset_name = request.pretty_name - dataset_urn = make_dataset_urn_with_platform_instance( - self.platform, - dataset_name, - self.config.platform_instance, - self.config.env, - ) - - # We don't add to the profiler state if we only do table level profiling as it always happens - if self.state_handler: - self.state_handler.add_to_state( - dataset_urn, int(datetime.now().timestamp() * 1000) - ) - - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, aspect=profile - ).as_workunit() + ) - def get_snowflake_profile_request( - self, - table: SnowflakeTable, - schema_name: str, - db_name: str, - ) -> Optional[SnowflakeProfilerRequest]: - skip_profiling = False - profile_table_level_only = self.config.profiling.profile_table_level_only - dataset_name = self.get_dataset_identifier(table.name, schema_name, db_name) - if not self.is_dataset_eligible_for_profiling( - dataset_name, table.last_altered, table.size_in_bytes, table.rows_count + def get_dataset_name(self, table_name: str, schema_name: str, db_name: str) -> str: + return self.get_dataset_identifier(table_name, schema_name, db_name) + + def get_batch_kwargs( + self, table: BaseTable, schema_name: str, db_name: str + ) -> dict: + custom_sql = None + if ( + not self.config.profiling.limit + and self.config.profiling.use_sampling + and table.rows_count + and table.rows_count > self.config.profiling.sample_size ): - # Profile only table level if dataset is filtered from profiling - # due to size limits alone - if 
self.is_dataset_eligible_for_profiling( - dataset_name, table.last_altered, 0, 0 - ): - profile_table_level_only = True - else: - skip_profiling = True - - if len(table.columns) == 0: - skip_profiling = True - - if skip_profiling: - if self.config.profiling.report_dropped_profiles: - self.report.report_dropped(f"profile of {dataset_name}") - return None - - self.report.report_entity_profiled(dataset_name) - logger.debug(f"Preparing profiling request for {dataset_name}") - profile_request = SnowflakeProfilerRequest( - pretty_name=dataset_name, - batch_kwargs=dict( - schema=schema_name, - table=table.name, - # Lowercase/Mixedcase table names in Snowflake do not work by default. - # We need to pass `use_quoted_name=True` for such tables as mentioned here - - # https://github.com/great-expectations/great_expectations/pull/2023 - use_quoted_name=(table.name != table.name.upper()), - ), - table=table, - profile_table_level_only=profile_table_level_only, - ) - return profile_request + # GX creates a temporary table from query if query is passed as batch kwargs. + # We are using fraction-based sampling here, instead of fixed-size sampling because + # Fixed-size sampling can be slower than equivalent fraction-based sampling + # as per https://docs.snowflake.com/en/sql-reference/constructs/sample#performance-considerations + sample_pc = 100 * self.config.profiling.sample_size / table.rows_count + custom_sql = f'select * from "{db_name}"."{schema_name}"."{table.name}" TABLESAMPLE ({sample_pc:.3f})' + return { + **super().get_batch_kwargs(table, schema_name, db_name), + # Lowercase/Mixedcase table names in Snowflake do not work by default. 
+ # We need to pass `use_quoted_name=True` for such tables as mentioned here - + # https://github.com/great-expectations/great_expectations/pull/2023 + "use_quoted_name": (table.name != table.name.upper()), + "custom_sql": custom_sql, + } def get_profiler_instance( self, db_name: Optional[str] = None diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 974348320928a..e0848b5f9ab34 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -303,14 +303,11 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): # Caches tables for a single database. Consider moving to disk or S3 when possible. self.db_tables: Dict[str, List[SnowflakeTable]] = {} - self.sql_parser_schema_resolver = SchemaResolver( - platform=self.platform, - platform_instance=self.config.platform_instance, - env=self.config.env, - ) self.view_definitions: FileBackedDict[str] = FileBackedDict() self.add_config_to_report() + self.sql_parser_schema_resolver = self._init_schema_resolver() + @classmethod def create(cls, config_dict: dict, ctx: PipelineContext) -> "Source": config = SnowflakeV2Config.parse_obj(config_dict) @@ -495,6 +492,24 @@ def query(query): return _report + def _init_schema_resolver(self) -> SchemaResolver: + if not self.config.include_technical_schema and self.config.parse_view_ddl: + if self.ctx.graph: + return self.ctx.graph.initialize_schema_resolver_from_datahub( + platform=self.platform, + platform_instance=self.config.platform_instance, + env=self.config.env, + ) + else: + logger.warning( + "Failed to load schema info from DataHub as DataHubGraph is missing.", + ) + return SchemaResolver( + platform=self.platform, + platform_instance=self.config.platform_instance, + env=self.config.env, + ) + def get_workunit_processors(self) -> 
List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), @@ -773,7 +788,7 @@ def _process_schema( ) self.db_tables[schema_name] = tables - if self.config.include_technical_schema or self.config.parse_view_ddl: + if self.config.include_technical_schema: for table in tables: yield from self._process_table(table, schema_name, db_name) @@ -785,7 +800,7 @@ def _process_schema( if view.view_definition: self.view_definitions[key] = view.view_definition - if self.config.include_technical_schema or self.config.parse_view_ddl: + if self.config.include_technical_schema: for view in views: yield from self._process_view(view, schema_name, db_name) @@ -901,8 +916,6 @@ def _process_table( yield from self._process_tag(tag) yield from self.gen_dataset_workunits(table, schema_name, db_name) - elif self.config.parse_view_ddl: - self.gen_schema_metadata(table, schema_name, db_name) def fetch_sample_data_for_classification( self, table: SnowflakeTable, schema_name: str, db_name: str, dataset_name: str @@ -1013,8 +1026,6 @@ def _process_view( yield from self._process_tag(tag) yield from self.gen_dataset_workunits(view, schema_name, db_name) - elif self.config.parse_view_ddl: - self.gen_schema_metadata(view, schema_name, db_name) def _process_tag(self, tag: SnowflakeTag) -> Iterable[MetadataWorkUnit]: tag_identifier = tag.identifier() diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py b/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py index 1626f86b92545..8873038079bad 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py @@ -19,9 +19,9 @@ from sqlalchemy.types import BOOLEAN, DATE, DATETIME, INTEGER import datahub.emitter.mce_builder as builder -from datahub.configuration.pydantic_field_deprecation import pydantic_field_deprecated from datahub.configuration.source_common import DatasetLineageProviderConfigBase from 
datahub.configuration.time_window_config import BaseTimeWindowConfig +from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated from datahub.emitter import mce_builder from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.decorators import ( diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py index 112defe76d957..056be6c2e50ac 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py @@ -1,12 +1,10 @@ import datetime import logging import traceback -from collections import OrderedDict from dataclasses import dataclass, field from typing import ( TYPE_CHECKING, Any, - Callable, Dict, Iterable, List, @@ -103,52 +101,6 @@ MISSING_COLUMN_INFO = "missing column information" -def _platform_alchemy_uri_tester_gen( - platform: str, opt_starts_with: Optional[str] = None -) -> Tuple[str, Callable[[str], bool]]: - return platform, lambda x: x.startswith( - platform if not opt_starts_with else opt_starts_with - ) - - -PLATFORM_TO_SQLALCHEMY_URI_TESTER_MAP: Dict[str, Callable[[str], bool]] = OrderedDict( - [ - _platform_alchemy_uri_tester_gen("athena", "awsathena"), - _platform_alchemy_uri_tester_gen("bigquery"), - _platform_alchemy_uri_tester_gen("clickhouse"), - _platform_alchemy_uri_tester_gen("druid"), - _platform_alchemy_uri_tester_gen("hana"), - _platform_alchemy_uri_tester_gen("hive"), - _platform_alchemy_uri_tester_gen("mongodb"), - _platform_alchemy_uri_tester_gen("mssql"), - _platform_alchemy_uri_tester_gen("mysql"), - _platform_alchemy_uri_tester_gen("oracle"), - _platform_alchemy_uri_tester_gen("pinot"), - _platform_alchemy_uri_tester_gen("presto"), - ( - "redshift", - lambda x: ( - x.startswith(("jdbc:postgres:", "postgresql")) - and x.find("redshift.amazonaws") > 0 - ) - or x.startswith("redshift"), - ), - # Don't 
move this before redshift. - _platform_alchemy_uri_tester_gen("postgres", "postgresql"), - _platform_alchemy_uri_tester_gen("snowflake"), - _platform_alchemy_uri_tester_gen("trino"), - _platform_alchemy_uri_tester_gen("vertica"), - ] -) - - -def get_platform_from_sqlalchemy_uri(sqlalchemy_uri: str) -> str: - for platform, tester in PLATFORM_TO_SQLALCHEMY_URI_TESTER_MAP.items(): - if tester(sqlalchemy_uri): - return platform - return "external" - - @dataclass class SQLSourceReport(StaleEntityRemovalSourceReport): tables_scanned: int = 0 diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py index 8f1e04b915f3b..677d32c8bac08 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py @@ -7,8 +7,8 @@ from pydantic import Field from datahub.configuration.common import AllowDenyPattern, ConfigModel -from datahub.configuration.pydantic_field_deprecation import pydantic_field_deprecated from datahub.configuration.source_common import DatasetSourceConfigMixin +from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig from datahub.ingestion.source.state.stale_entity_removal_handler import ( StatefulStaleMetadataRemovalConfig, diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py index 344c114d464a9..aaeee5717a867 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py @@ -1,12 +1,15 @@ import logging +from abc import abstractmethod from dataclasses import dataclass, field from datetime import datetime, timedelta, timezone -from typing import Dict, Iterable, List, 
Optional, Tuple, Union, cast +from typing import Dict, Iterable, List, Optional, Union, cast from sqlalchemy import create_engine, inspect from sqlalchemy.engine.reflection import Inspector from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.ge_data_profiler import ( DatahubGEProfiler, GEProfilerRequest, @@ -16,7 +19,7 @@ from datahub.ingestion.source.sql.sql_generic import BaseTable, BaseView from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProfile -from datahub.metadata.schema_classes import DatasetProfileClass +from datahub.metadata.com.linkedin.pegasus2avro.timeseries import PartitionType from datahub.utilities.stats_collections import TopKDict, int_top_k_dict @@ -63,14 +66,14 @@ def __init__( self.platform = platform self.state_handler = state_handler - def generate_profiles( + def generate_profile_workunits( self, requests: List[TableProfilerRequest], max_workers: int, db_name: Optional[str] = None, platform: Optional[str] = None, profiler_args: Optional[Dict] = None, - ) -> Iterable[Tuple[GEProfilerRequest, Optional[DatasetProfileClass]]]: + ) -> Iterable[MetadataWorkUnit]: ge_profile_requests: List[GEProfilerRequest] = [ cast(GEProfilerRequest, request) for request in requests @@ -80,21 +83,109 @@ def generate_profiles( request for request in requests if request.profile_table_level_only ] for request in table_level_profile_requests: - profile = DatasetProfile( + table_level_profile = DatasetProfile( timestampMillis=int(datetime.now().timestamp() * 1000), columnCount=request.table.column_count, rowCount=request.table.rows_count, sizeInBytes=request.table.size_in_bytes, ) - yield (request, profile) + dataset_urn = self.dataset_urn_builder(request.pretty_name) + yield 
MetadataChangeProposalWrapper( + entityUrn=dataset_urn, aspect=table_level_profile + ).as_workunit() if not ge_profile_requests: return # Otherwise, if column level profiling is enabled, use GE profiler. ge_profiler = self.get_profiler_instance(db_name) - yield from ge_profiler.generate_profiles( + + for ge_profiler_request, profile in ge_profiler.generate_profiles( ge_profile_requests, max_workers, platform, profiler_args + ): + if profile is None: + continue + + request = cast(TableProfilerRequest, ge_profiler_request) + profile.sizeInBytes = request.table.size_in_bytes + + # If table is partitioned we profile only one partition (if nothing set then the last one) + # but for table level we can use the rows_count from the table metadata + # This way even though column statistics only reflects one partition data but the rows count + # shows the proper count. + if ( + profile.partitionSpec + and profile.partitionSpec.type != PartitionType.FULL_TABLE + ): + profile.rowCount = request.table.rows_count + + dataset_urn = self.dataset_urn_builder(request.pretty_name) + + # We don't add to the profiler state if we only do table level profiling as it always happens + if self.state_handler: + self.state_handler.add_to_state( + dataset_urn, int(datetime.now().timestamp() * 1000) + ) + yield MetadataChangeProposalWrapper( + entityUrn=dataset_urn, aspect=profile + ).as_workunit() + + def dataset_urn_builder(self, dataset_name: str) -> str: + return make_dataset_urn_with_platform_instance( + self.platform, + dataset_name, + self.config.platform_instance, + self.config.env, + ) + + @abstractmethod + def get_dataset_name(self, table_name: str, schema_name: str, db_name: str) -> str: + pass + + def get_profile_request( + self, table: BaseTable, schema_name: str, db_name: str + ) -> Optional[TableProfilerRequest]: + skip_profiling = False + profile_table_level_only = self.config.profiling.profile_table_level_only + dataset_name = self.get_dataset_name(table.name, schema_name, 
db_name) + if not self.is_dataset_eligible_for_profiling( + dataset_name, table.last_altered, table.size_in_bytes, table.rows_count + ): + # Profile only table level if dataset is filtered from profiling + # due to size limits alone + if self.is_dataset_eligible_for_profiling( + dataset_name, table.last_altered, 0, 0 + ): + profile_table_level_only = True + else: + skip_profiling = True + self.report.num_tables_not_eligible_profiling[ + f"{db_name}.{schema_name}" + ] += 1 + + if table.column_count == 0: + skip_profiling = True + + if skip_profiling: + if self.config.profiling.report_dropped_profiles: + self.report.report_dropped(f"profile of {dataset_name}") + return None + + logger.debug(f"Preparing profiling request for {dataset_name}") + profile_request = TableProfilerRequest( + pretty_name=dataset_name, + batch_kwargs=self.get_batch_kwargs(table, schema_name, db_name), + table=table, + profile_table_level_only=profile_table_level_only, + ) + return profile_request + + def get_batch_kwargs( + self, table: BaseTable, schema_name: str, db_name: str + ) -> dict: + return dict( + schema=schema_name, + table=table.name, ) def get_inspectors(self) -> Iterable[Inspector]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sqlalchemy_uri_mapper.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sqlalchemy_uri_mapper.py new file mode 100644 index 0000000000000..b6a463837228d --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sqlalchemy_uri_mapper.py @@ -0,0 +1,47 @@ +from collections import OrderedDict +from typing import Callable, Dict, Optional, Tuple + + +def _platform_alchemy_uri_tester_gen( + platform: str, opt_starts_with: Optional[str] = None +) -> Tuple[str, Callable[[str], bool]]: + return platform, lambda x: x.startswith(opt_starts_with or platform) + + +PLATFORM_TO_SQLALCHEMY_URI_TESTER_MAP: Dict[str, Callable[[str], bool]] = OrderedDict( + [ + _platform_alchemy_uri_tester_gen("athena", "awsathena"), + 
_platform_alchemy_uri_tester_gen("bigquery"), + _platform_alchemy_uri_tester_gen("clickhouse"), + _platform_alchemy_uri_tester_gen("druid"), + _platform_alchemy_uri_tester_gen("hana"), + _platform_alchemy_uri_tester_gen("hive"), + _platform_alchemy_uri_tester_gen("mongodb"), + _platform_alchemy_uri_tester_gen("mssql"), + _platform_alchemy_uri_tester_gen("mysql"), + _platform_alchemy_uri_tester_gen("oracle"), + _platform_alchemy_uri_tester_gen("pinot"), + _platform_alchemy_uri_tester_gen("presto"), + ( + "redshift", + lambda x: ( + x.startswith(("jdbc:postgres:", "postgresql")) + and x.find("redshift.amazonaws") > 0 + ) + or x.startswith("redshift"), + ), + # Don't move this before redshift. + _platform_alchemy_uri_tester_gen("postgres", "postgresql"), + _platform_alchemy_uri_tester_gen("snowflake"), + _platform_alchemy_uri_tester_gen("sqlite"), + _platform_alchemy_uri_tester_gen("trino"), + _platform_alchemy_uri_tester_gen("vertica"), + ] +) + + +def get_platform_from_sqlalchemy_uri(sqlalchemy_uri: str) -> str: + for platform, tester in PLATFORM_TO_SQLALCHEMY_URI_TESTER_MAP.items(): + if tester(sqlalchemy_uri): + return platform + return "external" diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py b/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py index 2fcc93292c2ef..bce4d1ec76e6e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py @@ -103,13 +103,12 @@ def __init__(self, ctx: PipelineContext, config: SqlQueriesSourceConfig): self.builder = SqlParsingBuilder(usage_config=self.config.usage) if self.config.use_schema_resolver: - schema_resolver, urns = self.graph.initialize_schema_resolver_from_datahub( + self.schema_resolver = self.graph.initialize_schema_resolver_from_datahub( platform=self.config.platform, platform_instance=self.config.platform_instance, env=self.config.env, ) - self.schema_resolver = schema_resolver - self.urns = 
urns + self.urns = self.schema_resolver.get_urns() else: self.schema_resolver = self.graph._make_schema_resolver( platform=self.config.platform, diff --git a/metadata-ingestion/src/datahub/ingestion/source/superset.py b/metadata-ingestion/src/datahub/ingestion/source/superset.py index 2a4563439b6ba..14bc4242d2a91 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/superset.py +++ b/metadata-ingestion/src/datahub/ingestion/source/superset.py @@ -21,7 +21,9 @@ ) from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.sql import sql_common +from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import ( + get_platform_from_sqlalchemy_uri, +) from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, StaleEntityRemovalSourceReport, @@ -202,7 +204,7 @@ def get_platform_from_database_id(self, database_id): sqlalchemy_uri = database_response.get("result", {}).get("sqlalchemy_uri") if sqlalchemy_uri is None: return database_response.get("result", {}).get("backend", "external") - return sql_common.get_platform_from_sqlalchemy_uri(sqlalchemy_uri) + return get_platform_from_sqlalchemy_uri(sqlalchemy_uri) @lru_cache(maxsize=None) def get_datasource_urn_from_id(self, datasource_id): diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau.py index 4cc00a66116e9..e347cd26d245a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau.py @@ -37,11 +37,11 @@ ConfigModel, ConfigurationError, ) -from datahub.configuration.pydantic_field_deprecation import pydantic_field_deprecated from datahub.configuration.source_common import ( DatasetLineageProviderConfigBase, DatasetSourceConfigMixin, ) +from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated 
from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import ( ContainerKey, @@ -1179,8 +1179,6 @@ def get_upstream_fields_of_field_in_datasource( def get_upstream_fields_from_custom_sql( self, datasource: dict, datasource_urn: str ) -> List[FineGrainedLineage]: - fine_grained_lineages: List[FineGrainedLineage] = [] - parsed_result = self.parse_custom_sql( datasource=datasource, datasource_urn=datasource_urn, @@ -1194,13 +1192,20 @@ def get_upstream_fields_from_custom_sql( logger.info( f"Failed to extract column level lineage from datasource {datasource_urn}" ) - return fine_grained_lineages + return [] + if parsed_result.debug_info.error: + logger.info( + f"Failed to extract column level lineage from datasource {datasource_urn}: {parsed_result.debug_info.error}" + ) + return [] cll: List[ColumnLineageInfo] = ( parsed_result.column_lineage if parsed_result.column_lineage is not None else [] ) + + fine_grained_lineages: List[FineGrainedLineage] = [] for cll_info in cll: downstream = ( [ diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/config.py b/metadata-ingestion/src/datahub/ingestion/source/unity/config.py index 94ff755e3b254..51390873712d3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/config.py @@ -1,3 +1,4 @@ +import logging import os from datetime import datetime, timedelta, timezone from typing import Any, Dict, Optional @@ -21,6 +22,9 @@ OperationConfig, is_profiling_enabled, ) +from datahub.utilities.global_warning_util import add_global_warning + +logger = logging.getLogger(__name__) class UnityCatalogProfilerConfig(ConfigModel): @@ -97,9 +101,25 @@ class UnityCatalogSourceConfig( description="Name of the workspace. 
Default to deployment name present in workspace_url", ) + include_metastore: bool = pydantic.Field( + default=True, + description=( + "Whether to ingest the workspace's metastore as a container and include it in all urns." + " Changing this will affect the urns of all entities in the workspace." + " This will be disabled by default in the future," + " so it is recommended to set this to `False` for new ingestions." + " If you have an existing unity catalog ingestion, you'll want to avoid duplicates by soft deleting existing data." + " If stateful ingestion is enabled, running with `include_metastore: false` should be sufficient." + " Otherwise, we recommend deleting via the cli: `datahub delete --platform databricks` and re-ingesting with `include_metastore: false`." + ), + ) + ingest_data_platform_instance_aspect: Optional[bool] = pydantic.Field( default=False, - description="Option to enable/disable ingestion of the data platform instance aspect. The default data platform instance id for a dataset is workspace_name", + description=( + "Option to enable/disable ingestion of the data platform instance aspect." + " The default data platform instance id for a dataset is workspace_name" + ), ) _only_ingest_assigned_metastore_removed = pydantic_removed_field( @@ -122,16 +142,31 @@ class UnityCatalogSourceConfig( default=AllowDenyPattern.allow_all(), description="Regex patterns for tables to filter in ingestion. Specify regex to match the entire table name in `catalog.schema.table` format. e.g. to match all tables starting with customer in Customer catalog and public schema, use the regex `Customer\\.public\\.customer.*`.", ) + + notebook_pattern: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description=( + "Regex patterns for notebooks to filter in ingestion, based on notebook *path*." + " Specify regex to match the entire notebook path in `//.../` format." + " e.g. to match all notebooks in the root Shared directory, use the regex `/Shared/.*`." 
+ ), + ) + domain: Dict[str, AllowDenyPattern] = Field( default=dict(), description='Attach domains to catalogs, schemas or tables during ingestion using regex patterns. Domain key can be a guid like *urn:li:domain:ec428203-ce86-4db3-985d-5a8ee6df32ba* or a string like "Marketing".) If you provide strings, then datahub will attempt to resolve this name to a guid, and will error out if this fails. There can be multiple domain keys specified.', ) - include_table_lineage: Optional[bool] = pydantic.Field( + include_table_lineage: bool = pydantic.Field( default=True, description="Option to enable/disable lineage generation.", ) + include_notebooks: bool = pydantic.Field( + default=False, + description="Ingest notebooks, represented as DataHub datasets.", + ) + include_ownership: bool = pydantic.Field( default=False, description="Option to enable/disable ownership generation for metastores, catalogs, schemas, and tables.", @@ -141,11 +176,22 @@ class UnityCatalogSourceConfig( "include_table_ownership", "include_ownership" ) - include_column_lineage: Optional[bool] = pydantic.Field( + include_column_lineage: bool = pydantic.Field( default=True, description="Option to enable/disable lineage generation. Currently we have to call a rest call per column to get column level lineage due to the Databrick api which can slow down ingestion. ", ) + column_lineage_column_limit: int = pydantic.Field( + default=300, + description="Limit the number of columns to get column level lineage. ", + ) + + lineage_max_workers: int = pydantic.Field( + default=5 * (os.cpu_count() or 4), + description="Number of worker threads to use for column lineage thread pool executor. Set to 1 to disable.", + hidden_from_docs=True, + ) + include_usage_statistics: bool = Field( default=True, description="Generate usage statistics.", @@ -177,3 +223,16 @@ def workspace_url_should_start_with_http_scheme(cls, workspace_url: str) -> str: "Workspace URL must start with http scheme. e.g. 
https://my-workspace.cloud.databricks.com" ) return workspace_url + + @pydantic.validator("include_metastore") + def include_metastore_warning(cls, v: bool) -> bool: + if v: + msg = ( + "`include_metastore` is enabled." + " This is not recommended and will be disabled by default in the future, which is a breaking change." + " All databricks urns will change if you re-ingest with this disabled." + " We recommend soft deleting all databricks data and re-ingesting with `include_metastore` set to `False`." + ) + logger.warning(msg) + add_global_warning(msg) + return v diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py index e92f4ff07b1ad..9bcdb200f180e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py @@ -23,6 +23,7 @@ QueryStatementType, QueryStatus, ) +from databricks.sdk.service.workspace import ObjectType import datahub from datahub.ingestion.source.unity.proxy_profiling import ( @@ -33,6 +34,7 @@ Catalog, Column, Metastore, + Notebook, Query, Schema, ServicePrincipal, @@ -95,14 +97,13 @@ def __init__( self.report = report def check_basic_connectivity(self) -> bool: - self._workspace_client.metastores.summary() - return True + return bool(self._workspace_client.catalogs.list()) def assigned_metastore(self) -> Metastore: response = self._workspace_client.metastores.summary() return self._create_metastore(response) - def catalogs(self, metastore: Metastore) -> Iterable[Catalog]: + def catalogs(self, metastore: Optional[Metastore]) -> Iterable[Catalog]: response = self._workspace_client.catalogs.list() if not response: logger.info("Catalogs not found") @@ -137,6 +138,21 @@ def service_principals(self) -> Iterable[ServicePrincipal]: for principal in self._workspace_client.service_principals.list(): yield self._create_service_principal(principal) + def workspace_notebooks(self) -> 
Iterable[Notebook]: + for obj in self._workspace_client.workspace.list("/", recursive=True): + if obj.object_type == ObjectType.NOTEBOOK: + yield Notebook( + id=obj.object_id, + path=obj.path, + language=obj.language, + created_at=datetime.fromtimestamp( + obj.created_at / 1000, tz=timezone.utc + ), + modified_at=datetime.fromtimestamp( + obj.modified_at / 1000, tz=timezone.utc + ), + ) + def query_history( self, start_time: datetime, @@ -153,7 +169,7 @@ def query_history( "start_time_ms": start_time.timestamp() * 1000, "end_time_ms": end_time.timestamp() * 1000, }, - "statuses": [QueryStatus.FINISHED.value], + "statuses": [QueryStatus.FINISHED], "statement_types": [typ.value for typ in ALLOWED_STATEMENT_TYPES], } ) @@ -196,64 +212,72 @@ def _query_history( method, path, body={**body, "page_token": response["next_page_token"]} ) - def list_lineages_by_table(self, table_name: str) -> dict: + def list_lineages_by_table( + self, table_name: str, include_entity_lineage: bool + ) -> dict: """List table lineage by table name.""" return self._workspace_client.api_client.do( method="GET", - path="/api/2.0/lineage-tracking/table-lineage/get", - body={"table_name": table_name}, + path="/api/2.0/lineage-tracking/table-lineage", + body={ + "table_name": table_name, + "include_entity_lineage": include_entity_lineage, + }, ) def list_lineages_by_column(self, table_name: str, column_name: str) -> dict: """List column lineage by table name and column name.""" return self._workspace_client.api_client.do( "GET", - "/api/2.0/lineage-tracking/column-lineage/get", + "/api/2.0/lineage-tracking/column-lineage", body={"table_name": table_name, "column_name": column_name}, ) - def table_lineage(self, table: Table) -> None: + def table_lineage(self, table: Table, include_entity_lineage: bool) -> None: # Lineage endpoint doesn't exists on 2.1 version try: response: dict = self.list_lineages_by_table( - table_name=f"{table.schema.catalog.name}.{table.schema.name}.{table.name}" + 
table_name=table.ref.qualified_table_name, + include_entity_lineage=include_entity_lineage, ) - table.upstreams = { - TableReference( - table.schema.catalog.metastore.id, - item["catalog_name"], - item["schema_name"], - item["name"], - ): {} - for item in response.get("upstream_tables", []) - } + + for item in response.get("upstreams") or []: + if "tableInfo" in item: + table_ref = TableReference.create_from_lineage( + item["tableInfo"], table.schema.catalog.metastore + ) + if table_ref: + table.upstreams[table_ref] = {} + for notebook in item.get("notebookInfos") or []: + table.upstream_notebooks.add(notebook["notebook_id"]) + + for item in response.get("downstreams") or []: + for notebook in item.get("notebookInfos") or []: + table.downstream_notebooks.add(notebook["notebook_id"]) except Exception as e: - logger.error(f"Error getting lineage: {e}") + logger.warning( + f"Error getting lineage on table {table.ref}: {e}", exc_info=True + ) - def get_column_lineage(self, table: Table) -> None: + def get_column_lineage(self, table: Table, column_name: str) -> None: try: - table_lineage_response: dict = self.list_lineages_by_table( - table_name=f"{table.schema.catalog.name}.{table.schema.name}.{table.name}" + response: dict = self.list_lineages_by_column( + table_name=table.ref.qualified_table_name, + column_name=column_name, ) - if table_lineage_response: - for column in table.columns: - response: dict = self.list_lineages_by_column( - table_name=f"{table.schema.catalog.name}.{table.schema.name}.{table.name}", - column_name=column.name, - ) - for item in response.get("upstream_cols", []): - table_ref = TableReference( - table.schema.catalog.metastore.id, - item["catalog_name"], - item["schema_name"], - item["table_name"], - ) - table.upstreams.setdefault(table_ref, {}).setdefault( - column.name, [] - ).append(item["name"]) - + for item in response.get("upstream_cols") or []: + table_ref = TableReference.create_from_lineage( + item, table.schema.catalog.metastore + ) + 
if table_ref: + table.upstreams.setdefault(table_ref, {}).setdefault( + column_name, [] + ).append(item["name"]) except Exception as e: - logger.error(f"Error getting lineage: {e}") + logger.warning( + f"Error getting column lineage on table {table.ref}, column {column_name}: {e}", + exc_info=True, + ) @staticmethod def _escape_sequence(value: str) -> str: @@ -274,10 +298,13 @@ def _create_metastore( comment=None, ) - def _create_catalog(self, metastore: Metastore, obj: CatalogInfo) -> Catalog: + def _create_catalog( + self, metastore: Optional[Metastore], obj: CatalogInfo + ) -> Catalog: + catalog_name = self._escape_sequence(obj.name) return Catalog( name=obj.name, - id=f"{metastore.id}.{self._escape_sequence(obj.name)}", + id=f"{metastore.id}.{catalog_name}" if metastore else catalog_name, metastore=metastore, comment=obj.comment, owner=obj.owner, diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py index 2b943d8c98e7d..18ac2475b51e0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py @@ -1,8 +1,10 @@ # Supported types are available at # https://api-docs.databricks.com/rest/latest/unity-catalog-api-specification-2-1.html?_ga=2.151019001.1795147704.1666247755-2119235717.1666247755 +import dataclasses +import logging from dataclasses import dataclass, field from datetime import datetime -from typing import Dict, List, Optional +from typing import Dict, FrozenSet, List, Optional, Set from databricks.sdk.service.catalog import ( CatalogType, @@ -11,6 +13,7 @@ TableType, ) from databricks.sdk.service.sql import QueryStatementType +from databricks.sdk.service.workspace import Language from datahub.metadata.schema_classes import ( ArrayTypeClass, @@ -26,6 +29,8 @@ TimeTypeClass, ) +logger = logging.getLogger(__name__) + DATA_TYPE_REGISTRY: dict = { 
ColumnTypeName.BOOLEAN: BooleanTypeClass, ColumnTypeName.BYTE: BytesTypeClass, @@ -66,6 +71,9 @@ ALLOWED_STATEMENT_TYPES = {*OPERATION_STATEMENT_TYPES.keys(), QueryStatementType.SELECT} +NotebookId = int + + @dataclass class CommonProperty: id: str @@ -84,7 +92,7 @@ class Metastore(CommonProperty): @dataclass class Catalog(CommonProperty): - metastore: Metastore + metastore: Optional[Metastore] owner: Optional[str] type: CatalogType @@ -122,7 +130,7 @@ class ServicePrincipal: @dataclass(frozen=True, order=True) class TableReference: - metastore: str + metastore: Optional[str] catalog: str schema: str table: str @@ -130,14 +138,34 @@ class TableReference: @classmethod def create(cls, table: "Table") -> "TableReference": return cls( - table.schema.catalog.metastore.id, + table.schema.catalog.metastore.id + if table.schema.catalog.metastore + else None, table.schema.catalog.name, table.schema.name, table.name, ) + @classmethod + def create_from_lineage( + cls, d: dict, metastore: Optional[Metastore] + ) -> Optional["TableReference"]: + try: + return cls( + metastore.id if metastore else None, + d["catalog_name"], + d["schema_name"], + d.get("table_name", d["name"]), # column vs table query output + ) + except Exception as e: + logger.warning(f"Failed to create TableReference from {d}: {e}") + return None + def __str__(self) -> str: - return f"{self.metastore}.{self.catalog}.{self.schema}.{self.table}" + if self.metastore: + return f"{self.metastore}.{self.catalog}.{self.schema}.{self.table}" + else: + return self.qualified_table_name @property def qualified_table_name(self) -> str: @@ -154,7 +182,6 @@ class Table(CommonProperty): columns: List[Column] storage_location: Optional[str] data_source_format: Optional[DataSourceFormat] - comment: Optional[str] table_type: TableType owner: Optional[str] generation: Optional[int] @@ -166,6 +193,8 @@ class Table(CommonProperty): view_definition: Optional[str] properties: Dict[str, str] upstreams: Dict[TableReference, Dict[str, 
List[str]]] = field(default_factory=dict) + upstream_notebooks: Set[NotebookId] = field(default_factory=set) + downstream_notebooks: Set[NotebookId] = field(default_factory=set) ref: TableReference = field(init=False) @@ -228,3 +257,23 @@ def __bool__(self): self.max is not None, ) ) + + +@dataclass +class Notebook: + id: NotebookId + path: str + language: Language + created_at: datetime + modified_at: datetime + + upstreams: FrozenSet[TableReference] = field(default_factory=frozenset) + + @classmethod + def add_upstream(cls, upstream: TableReference, notebook: "Notebook") -> "Notebook": + return cls( + **{ # type: ignore + **dataclasses.asdict(notebook), + "upstreams": frozenset([*notebook.upstreams, upstream]), + } + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/report.py b/metadata-ingestion/src/datahub/ingestion/source/unity/report.py index 8382b31a56add..fa61571fa92cb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/report.py @@ -5,21 +5,25 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalSourceReport, ) +from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport from datahub.utilities.lossy_collections import LossyDict, LossyList @dataclass -class UnityCatalogReport(StaleEntityRemovalSourceReport): +class UnityCatalogReport(IngestionStageReport, StaleEntityRemovalSourceReport): metastores: EntityFilterReport = EntityFilterReport.field(type="metastore") catalogs: EntityFilterReport = EntityFilterReport.field(type="catalog") schemas: EntityFilterReport = EntityFilterReport.field(type="schema") tables: EntityFilterReport = EntityFilterReport.field(type="table/view") table_profiles: EntityFilterReport = EntityFilterReport.field(type="table profile") + notebooks: EntityFilterReport = EntityFilterReport.field(type="notebook") + + num_column_lineage_skipped_column_count: int = 0 
num_queries: int = 0 num_queries_dropped_parse_failure: int = 0 - num_queries_dropped_missing_table: int = 0 # Can be due to pattern filter - num_queries_dropped_duplicate_table: int = 0 + num_queries_missing_table: int = 0 # Can be due to pattern filter + num_queries_duplicate_table: int = 0 num_queries_parsed_by_spark_plan: int = 0 # Distinguish from Operations emitted for created / updated timestamps diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py index 493acb939c3bb..27c1f341aa84d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py @@ -1,8 +1,9 @@ import logging import re import time +from concurrent.futures import ThreadPoolExecutor from datetime import timedelta -from typing import Dict, Iterable, List, Optional, Set +from typing import Dict, Iterable, List, Optional, Set, Union from urllib.parse import urljoin from datahub.emitter.mce_builder import ( @@ -16,9 +17,12 @@ from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import ( CatalogKey, + CatalogKeyWithMetastore, ContainerKey, MetastoreKey, + NotebookKey, UnitySchemaKey, + UnitySchemaKeyWithMetastore, add_dataset_to_container, gen_containers, ) @@ -56,6 +60,8 @@ Catalog, Column, Metastore, + Notebook, + NotebookId, Schema, ServicePrincipal, Table, @@ -69,6 +75,7 @@ ViewProperties, ) from datahub.metadata.schema_classes import ( + BrowsePathsClass, DataPlatformInstanceClass, DatasetLineageTypeClass, DatasetPropertiesClass, @@ -88,6 +95,7 @@ UpstreamClass, UpstreamLineageClass, ) +from datahub.utilities.file_backed_collections import FileBackedDict from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column from datahub.utilities.registries.domain_registry import DomainRegistry @@ -122,7 +130,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, 
TestableSource): config: UnityCatalogSourceConfig unity_catalog_api_proxy: UnityCatalogApiProxy platform: str = "databricks" - platform_instance_name: str + platform_instance_name: Optional[str] def get_report(self) -> UnityCatalogReport: return self.report @@ -141,11 +149,13 @@ def __init__(self, ctx: PipelineContext, config: UnityCatalogSourceConfig): self.external_url_base = urljoin(self.config.workspace_url, "/explore/data") # Determine the platform_instance_name - self.platform_instance_name = ( - config.workspace_name - if config.workspace_name is not None - else config.workspace_url.split("//")[1].split(".")[0] - ) + self.platform_instance_name = self.config.platform_instance + if self.config.include_metastore: + self.platform_instance_name = ( + config.workspace_name + if config.workspace_name is not None + else config.workspace_url.split("//")[1].split(".")[0] + ) if self.config.domain: self.domain_registry = DomainRegistry( @@ -157,6 +167,7 @@ def __init__(self, ctx: PipelineContext, config: UnityCatalogSourceConfig): # Global set of table refs self.table_refs: Set[TableReference] = set() self.view_refs: Set[TableReference] = set() + self.notebooks: FileBackedDict[Notebook] = FileBackedDict() @staticmethod def test_connection(config_dict: dict) -> TestConnectionReport: @@ -176,6 +187,7 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + self.report.report_ingestion_stage_start("Start warehouse") wait_on_warehouse = None if self.config.is_profiling_enabled(): # Can take several minutes, so start now and wait later @@ -187,10 +199,23 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: ) return + self.report.report_ingestion_stage_start("Ingest service principals") self.build_service_principal_map() + if self.config.include_notebooks: + self.report.report_ingestion_stage_start("Ingest notebooks") + yield from self.process_notebooks() + yield from 
self.process_metastores() + if self.config.include_notebooks: + self.report.report_ingestion_stage_start("Notebook lineage") + for notebook in self.notebooks.values(): + wu = self._gen_notebook_lineage(notebook) + if wu: + yield wu + if self.config.include_usage_statistics: + self.report.report_ingestion_stage_start("Ingest usage") usage_extractor = UnityCatalogUsageExtractor( config=self.config, report=self.report, @@ -203,6 +228,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: ) if self.config.is_profiling_enabled(): + self.report.report_ingestion_stage_start("Wait on warehouse") assert wait_on_warehouse timeout = timedelta(seconds=self.config.profiling.max_wait_secs) wait_on_warehouse.result(timeout) @@ -212,6 +238,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.unity_catalog_api_proxy, self.gen_dataset_urn, ) + self.report.report_ingestion_stage_start("Profiling") yield from profiling_extractor.get_workunits(self.table_refs) def build_service_principal_map(self) -> None: @@ -223,14 +250,72 @@ def build_service_principal_map(self) -> None: "service-principals", f"Unable to fetch service principals: {e}" ) + def process_notebooks(self) -> Iterable[MetadataWorkUnit]: + for notebook in self.unity_catalog_api_proxy.workspace_notebooks(): + if not self.config.notebook_pattern.allowed(notebook.path): + self.report.notebooks.dropped(notebook.path) + continue + + self.notebooks[str(notebook.id)] = notebook + yield from self._gen_notebook_workunits(notebook) + + def _gen_notebook_workunits(self, notebook: Notebook) -> Iterable[MetadataWorkUnit]: + mcps = MetadataChangeProposalWrapper.construct_many( + entityUrn=self.gen_notebook_urn(notebook), + aspects=[ + DatasetPropertiesClass( + name=notebook.path.rsplit("/", 1)[-1], + customProperties={ + "path": notebook.path, + "language": notebook.language.value, + }, + externalUrl=urljoin( + self.config.workspace_url, f"#notebook/{notebook.id}" + ), + 
created=TimeStampClass(int(notebook.created_at.timestamp() * 1000)), + lastModified=TimeStampClass( + int(notebook.modified_at.timestamp() * 1000) + ), + ), + SubTypesClass(typeNames=[DatasetSubTypes.NOTEBOOK]), + BrowsePathsClass(paths=notebook.path.split("/")), + self._create_data_platform_instance_aspect(), + ], + ) + for mcp in mcps: + yield mcp.as_workunit() + + self.report.notebooks.processed(notebook.path) + + def _gen_notebook_lineage(self, notebook: Notebook) -> Optional[MetadataWorkUnit]: + if not notebook.upstreams: + return None + + return MetadataChangeProposalWrapper( + entityUrn=self.gen_notebook_urn(notebook), + aspect=UpstreamLineageClass( + upstreams=[ + UpstreamClass( + dataset=self.gen_dataset_urn(upstream_ref), + type=DatasetLineageTypeClass.COPY, + ) + for upstream_ref in notebook.upstreams + ] + ), + ).as_workunit() + def process_metastores(self) -> Iterable[MetadataWorkUnit]: - metastore = self.unity_catalog_api_proxy.assigned_metastore() - yield from self.gen_metastore_containers(metastore) + metastore: Optional[Metastore] = None + if self.config.include_metastore: + metastore = self.unity_catalog_api_proxy.assigned_metastore() + yield from self.gen_metastore_containers(metastore) yield from self.process_catalogs(metastore) + if metastore and self.config.include_metastore: + self.report.metastores.processed(metastore.id) - self.report.metastores.processed(metastore.id) - - def process_catalogs(self, metastore: Metastore) -> Iterable[MetadataWorkUnit]: + def process_catalogs( + self, metastore: Optional[Metastore] + ) -> Iterable[MetadataWorkUnit]: for catalog in self.unity_catalog_api_proxy.catalogs(metastore=metastore): if not self.config.catalog_pattern.allowed(catalog.id): self.report.catalogs.dropped(catalog.id) @@ -247,6 +332,7 @@ def process_schemas(self, catalog: Catalog) -> Iterable[MetadataWorkUnit]: self.report.schemas.dropped(schema.id) continue + self.report.report_ingestion_stage_start(f"Ingest schema {schema.id}") yield from 
self.gen_schema_containers(schema) yield from self.process_tables(schema) @@ -280,15 +366,15 @@ def process_table(self, table: Table, schema: Schema) -> Iterable[MetadataWorkUn operation = self._create_table_operation_aspect(table) domain = self._get_domain_aspect(dataset_name=table.ref.qualified_table_name) ownership = self._create_table_ownership_aspect(table) - data_platform_instance = self._create_data_platform_instance_aspect(table) + data_platform_instance = self._create_data_platform_instance_aspect() - lineage: Optional[UpstreamLineageClass] = None - if self.config.include_column_lineage: - self.unity_catalog_api_proxy.get_column_lineage(table) - lineage = self._generate_column_lineage_aspect(dataset_urn, table) - elif self.config.include_table_lineage: - self.unity_catalog_api_proxy.table_lineage(table) - lineage = self._generate_lineage_aspect(dataset_urn, table) + lineage = self.ingest_lineage(table) + + if self.config.include_notebooks: + for notebook_id in table.downstream_notebooks: + self.notebooks[str(notebook_id)] = Notebook.add_upstream( + table.ref, self.notebooks[str(notebook_id)] + ) yield from [ mcp.as_workunit() @@ -308,7 +394,29 @@ def process_table(self, table: Table, schema: Schema) -> Iterable[MetadataWorkUn ) ] - def _generate_column_lineage_aspect( + def ingest_lineage(self, table: Table) -> Optional[UpstreamLineageClass]: + if self.config.include_table_lineage: + self.unity_catalog_api_proxy.table_lineage( + table, include_entity_lineage=self.config.include_notebooks + ) + + if self.config.include_column_lineage and table.upstreams: + if len(table.columns) > self.config.column_lineage_column_limit: + self.report.num_column_lineage_skipped_column_count += 1 + + with ThreadPoolExecutor( + max_workers=self.config.lineage_max_workers + ) as executor: + for column in table.columns[: self.config.column_lineage_column_limit]: + executor.submit( + self.unity_catalog_api_proxy.get_column_lineage, + table, + column.name, + ) + + return 
self._generate_lineage_aspect(self.gen_dataset_urn(table.ref), table) + + def _generate_lineage_aspect( self, dataset_urn: str, table: Table ) -> Optional[UpstreamLineageClass]: upstreams: List[UpstreamClass] = [] @@ -318,6 +426,7 @@ def _generate_column_lineage_aspect( ): upstream_urn = self.gen_dataset_urn(upstream_ref) + # Should be empty if config.include_column_lineage is False finegrained_lineages.extend( FineGrainedLineage( upstreamType=FineGrainedLineageUpstreamType.FIELD_SET, @@ -331,38 +440,28 @@ def _generate_column_lineage_aspect( for d_col, u_cols in sorted(downstream_to_upstream_cols.items()) ) - upstream_table = UpstreamClass( - upstream_urn, - DatasetLineageTypeClass.TRANSFORMED, - ) - upstreams.append(upstream_table) - - if upstreams: - return UpstreamLineageClass( - upstreams=upstreams, fineGrainedLineages=finegrained_lineages - ) - else: - return None - - def _generate_lineage_aspect( - self, dataset_urn: str, table: Table - ) -> Optional[UpstreamLineageClass]: - upstreams: List[UpstreamClass] = [] - for upstream in sorted(table.upstreams.keys()): - upstream_urn = make_dataset_urn_with_platform_instance( - self.platform, - f"{table.schema.catalog.metastore.id}.{upstream}", - self.platform_instance_name, + upstreams.append( + UpstreamClass( + dataset=upstream_urn, + type=DatasetLineageTypeClass.TRANSFORMED, + ) ) - upstream_table = UpstreamClass( - upstream_urn, - DatasetLineageTypeClass.TRANSFORMED, + for notebook in table.upstream_notebooks: + upstreams.append( + UpstreamClass( + dataset=self.gen_notebook_urn(notebook), + type=DatasetLineageTypeClass.TRANSFORMED, + ) ) - upstreams.append(upstream_table) if upstreams: - return UpstreamLineageClass(upstreams=upstreams) + return UpstreamLineageClass( + upstreams=upstreams, + fineGrainedLineages=finegrained_lineages + if self.config.include_column_lineage + else None, + ) else: return None @@ -389,6 +488,14 @@ def gen_dataset_urn(self, table_ref: TableReference) -> str: name=str(table_ref), ) + def 
gen_notebook_urn(self, notebook: Union[Notebook, NotebookId]) -> str: + notebook_id = notebook.id if isinstance(notebook, Notebook) else notebook + return NotebookKey( + notebook_id=notebook_id, + platform=self.platform, + instance=self.config.platform_instance, + ).as_urn() + def gen_schema_containers(self, schema: Schema) -> Iterable[MetadataWorkUnit]: domain_urn = self._gen_domain_urn(f"{schema.catalog.name}.{schema.name}") @@ -423,27 +530,37 @@ def gen_metastore_containers( def gen_catalog_containers(self, catalog: Catalog) -> Iterable[MetadataWorkUnit]: domain_urn = self._gen_domain_urn(catalog.name) - metastore_container_key = self.gen_metastore_key(catalog.metastore) catalog_container_key = self.gen_catalog_key(catalog) yield from gen_containers( container_key=catalog_container_key, name=catalog.name, sub_types=[DatasetContainerSubTypes.CATALOG], domain_urn=domain_urn, - parent_container_key=metastore_container_key, + parent_container_key=self.gen_metastore_key(catalog.metastore) + if self.config.include_metastore and catalog.metastore + else None, description=catalog.comment, owner_urn=self.get_owner_urn(catalog.owner), external_url=f"{self.external_url_base}/{catalog.name}", ) def gen_schema_key(self, schema: Schema) -> ContainerKey: - return UnitySchemaKey( - unity_schema=schema.name, - platform=self.platform, - instance=self.config.platform_instance, - catalog=schema.catalog.name, - metastore=schema.catalog.metastore.name, - ) + if self.config.include_metastore: + assert schema.catalog.metastore + return UnitySchemaKeyWithMetastore( + unity_schema=schema.name, + platform=self.platform, + instance=self.config.platform_instance, + catalog=schema.catalog.name, + metastore=schema.catalog.metastore.name, + ) + else: + return UnitySchemaKey( + unity_schema=schema.name, + platform=self.platform, + instance=self.config.platform_instance, + catalog=schema.catalog.name, + ) def gen_metastore_key(self, metastore: Metastore) -> MetastoreKey: return MetastoreKey( @@ 
-452,13 +569,21 @@ def gen_metastore_key(self, metastore: Metastore) -> MetastoreKey: instance=self.config.platform_instance, ) - def gen_catalog_key(self, catalog: Catalog) -> CatalogKey: - return CatalogKey( - catalog=catalog.name, - metastore=catalog.metastore.name, - platform=self.platform, - instance=self.config.platform_instance, - ) + def gen_catalog_key(self, catalog: Catalog) -> ContainerKey: + if self.config.include_metastore: + assert catalog.metastore + return CatalogKeyWithMetastore( + catalog=catalog.name, + metastore=catalog.metastore.name, + platform=self.platform, + instance=self.config.platform_instance, + ) + else: + return CatalogKey( + catalog=catalog.name, + platform=self.platform, + instance=self.config.platform_instance, + ) def _gen_domain_urn(self, dataset_name: str) -> Optional[str]: domain_urn: Optional[str] = None @@ -563,15 +688,16 @@ def _create_table_ownership_aspect(self, table: Table) -> Optional[OwnershipClas return None def _create_data_platform_instance_aspect( - self, table: Table + self, ) -> Optional[DataPlatformInstanceClass]: - # Only ingest the DPI aspect if the flag is true if self.config.ingest_data_platform_instance_aspect: return DataPlatformInstanceClass( platform=make_data_platform_urn(self.platform), instance=make_dataplatform_instance_urn( self.platform, self.platform_instance_name - ), + ) + if self.platform_instance_name + else None, ) return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py b/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py index 49f56b46fb012..ab21c1a318659 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py @@ -214,12 +214,15 @@ def _resolve_tables( self, tables: List[str], table_map: TableMap ) -> List[TableReference]: """Resolve tables to TableReferences, filtering out unrecognized or unresolvable table names.""" + + missing_table = False + duplicate_table 
= False output = [] for table in tables: table = str(table) if table not in table_map: logger.debug(f"Dropping query with unrecognized table: {table}") - self.report.num_queries_dropped_missing_table += 1 + missing_table = True else: refs = table_map[table] if len(refs) == 1: @@ -228,6 +231,11 @@ def _resolve_tables( logger.warning( f"Could not resolve table ref for {table}: {len(refs)} duplicates." ) - self.report.num_queries_dropped_duplicate_table += 1 + duplicate_table = True + + if missing_table: + self.report.num_queries_missing_table += 1 + if duplicate_table: + self.report.num_queries_duplicate_table += 1 return output diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/extract_ownership_from_tags.py b/metadata-ingestion/src/datahub/ingestion/transformer/extract_ownership_from_tags.py new file mode 100644 index 0000000000000..64f70988ea3a7 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/transformer/extract_ownership_from_tags.py @@ -0,0 +1,91 @@ +import re +from functools import lru_cache +from typing import List, Optional, cast + +from datahub.configuration.common import TransformerSemanticsConfigModel +from datahub.emitter.mce_builder import Aspect +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.transformer.dataset_transformer import DatasetTagsTransformer +from datahub.metadata.schema_classes import ( + GlobalTagsClass, + OwnerClass, + OwnershipClass, + OwnershipTypeClass, +) +from datahub.utilities.urns.corp_group_urn import CorpGroupUrn +from datahub.utilities.urns.corpuser_urn import CorpuserUrn +from datahub.utilities.urns.tag_urn import TagUrn + + +class ExtractOwnersFromTagsConfig(TransformerSemanticsConfigModel): + tag_prefix: str + is_user: bool = True + email_domain: Optional[str] = None + owner_type: str = "TECHNICAL_OWNER" + owner_type_urn: Optional[str] = None + + +@lru_cache(maxsize=10) +def get_owner_type(owner_type_str: str) -> str: + for item in dir(OwnershipTypeClass): + if 
str(item) == owner_type_str: + return item + return OwnershipTypeClass.CUSTOM + + +class ExtractOwnersFromTagsTransformer(DatasetTagsTransformer): + """Transformer that can be used to set extract ownership from entity tags (currently does not support column level tags)""" + + ctx: PipelineContext + config: ExtractOwnersFromTagsConfig + + def __init__(self, config: ExtractOwnersFromTagsConfig, ctx: PipelineContext): + super().__init__() + self.ctx = ctx + self.config = config + + @classmethod + def create( + cls, config_dict: dict, ctx: PipelineContext + ) -> "ExtractOwnersFromTagsTransformer": + config = ExtractOwnersFromTagsConfig.parse_obj(config_dict) + return cls(config, ctx) + + def get_owner_urn(self, owner_str: str) -> str: + if self.config.email_domain is not None: + return owner_str + "@" + self.config.email_domain + return owner_str + + def transform_aspect( + self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect] + ) -> Optional[Aspect]: + in_tags_aspect: Optional[GlobalTagsClass] = cast(GlobalTagsClass, aspect) + if in_tags_aspect is None: + return None + tags = in_tags_aspect.tags + owners: List[OwnerClass] = [] + for tag_class in tags: + tag_urn = TagUrn.create_from_string(tag_class.tag) + tag_str = tag_urn.get_entity_id()[0] + re_match = re.search(self.config.tag_prefix, tag_str) + if re_match: + owner_str = tag_str[re_match.end() :].strip() + owner_urn_str = self.get_owner_urn(owner_str) + if self.config.is_user: + owner_urn = str(CorpuserUrn.create_from_id(owner_urn_str)) + else: + owner_urn = str(CorpGroupUrn.create_from_id(owner_urn_str)) + owner_type = get_owner_type(self.config.owner_type) + if owner_type == OwnershipTypeClass.CUSTOM: + assert ( + self.config.owner_type_urn is not None + ), "owner_type_urn must be set if owner_type is CUSTOM" + owner = OwnerClass( + owner=owner_urn, + type=owner_type, + typeUrn=self.config.owner_type_urn, + ) + owners.append(owner) + + owner_aspect = OwnershipClass(owners=owners) + return 
cast(Aspect, owner_aspect) diff --git a/metadata-ingestion/src/datahub/integrations/great_expectations/action.py b/metadata-ingestion/src/datahub/integrations/great_expectations/action.py index eabf62a4cda2b..8b393a8f6f1c6 100644 --- a/metadata-ingestion/src/datahub/integrations/great_expectations/action.py +++ b/metadata-ingestion/src/datahub/integrations/great_expectations/action.py @@ -35,7 +35,10 @@ from datahub.cli.cli_utils import get_boolean_env_variable from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.rest_emitter import DatahubRestEmitter -from datahub.ingestion.source.sql.sql_common import get_platform_from_sqlalchemy_uri +from datahub.emitter.serialization_helper import pre_json_transform +from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import ( + get_platform_from_sqlalchemy_uri, +) from datahub.metadata.com.linkedin.pegasus2avro.assertion import ( AssertionInfo, AssertionResult, @@ -251,13 +254,15 @@ def get_assertions_with_results( # possibly for each validation run assertionUrn = builder.make_assertion_urn( builder.datahub_guid( - { - "platform": GE_PLATFORM_NAME, - "nativeType": expectation_type, - "nativeParameters": kwargs, - "dataset": assertion_datasets[0], - "fields": assertion_fields, - } + pre_json_transform( + { + "platform": GE_PLATFORM_NAME, + "nativeType": expectation_type, + "nativeParameters": kwargs, + "dataset": assertion_datasets[0], + "fields": assertion_fields, + } + ) ) ) logger.debug( @@ -636,7 +641,7 @@ def get_dataset_partitions(self, batch_identifier, data_asset): ].batch_request.runtime_parameters["query"] partitionSpec = PartitionSpecClass( type=PartitionTypeClass.QUERY, - partition=f"Query_{builder.datahub_guid(query)}", + partition=f"Query_{builder.datahub_guid(pre_json_transform(query))}", ) batchSpec = BatchSpec( diff --git a/metadata-ingestion/src/datahub/testing/compare_metadata_json.py b/metadata-ingestion/src/datahub/testing/compare_metadata_json.py index 
5c52e1ab4f0b3..54f6a6e984c00 100644 --- a/metadata-ingestion/src/datahub/testing/compare_metadata_json.py +++ b/metadata-ingestion/src/datahub/testing/compare_metadata_json.py @@ -40,6 +40,7 @@ def assert_metadata_files_equal( update_golden: bool, copy_output: bool, ignore_paths: Sequence[str] = (), + ignore_order: bool = True, ) -> None: golden_exists = os.path.isfile(golden_path) @@ -65,7 +66,7 @@ def assert_metadata_files_equal( write_metadata_file(pathlib.Path(temp.name), golden_metadata) golden = load_json_file(temp.name) - diff = diff_metadata_json(output, golden, ignore_paths) + diff = diff_metadata_json(output, golden, ignore_paths, ignore_order=ignore_order) if diff and update_golden: if isinstance(diff, MCPDiff): diff.apply_delta(golden) @@ -91,16 +92,19 @@ def diff_metadata_json( output: MetadataJson, golden: MetadataJson, ignore_paths: Sequence[str] = (), + ignore_order: bool = True, ) -> Union[DeepDiff, MCPDiff]: ignore_paths = (*ignore_paths, *default_exclude_paths, r"root\[\d+].delta_info") try: - golden_map = get_aspects_by_urn(golden) - output_map = get_aspects_by_urn(output) - return MCPDiff.create( - golden=golden_map, - output=output_map, - ignore_paths=ignore_paths, - ) + if ignore_order: + golden_map = get_aspects_by_urn(golden) + output_map = get_aspects_by_urn(output) + return MCPDiff.create( + golden=golden_map, + output=output_map, + ignore_paths=ignore_paths, + ) + # if ignore_order is False, always use DeepDiff except CannotCompareMCPs as e: logger.info(f"{e}, falling back to MCE diff") except AssertionError as e: @@ -111,5 +115,5 @@ def diff_metadata_json( golden, output, exclude_regex_paths=ignore_paths, - ignore_order=True, + ignore_order=ignore_order, ) diff --git a/metadata-ingestion/src/datahub/utilities/mapping.py b/metadata-ingestion/src/datahub/utilities/mapping.py index 793eccfb22c7e..eb2d975ee607f 100644 --- a/metadata-ingestion/src/datahub/utilities/mapping.py +++ b/metadata-ingestion/src/datahub/utilities/mapping.py @@ -2,12 
+2,16 @@ import logging import operator import re +import time from functools import reduce -from typing import Any, Dict, List, Match, Optional, Union +from typing import Any, Dict, List, Match, Optional, Union, cast from datahub.emitter import mce_builder from datahub.emitter.mce_builder import OwnerType from datahub.metadata.schema_classes import ( + AuditStampClass, + InstitutionalMemoryClass, + InstitutionalMemoryMetadataClass, OwnerClass, OwnershipClass, OwnershipSourceClass, @@ -39,6 +43,7 @@ def _insert_match_value(original_value: str, match_value: str) -> str: class Constants: + ADD_DOC_LINK_OPERATION = "add_doc_link" ADD_TAG_OPERATION = "add_tag" ADD_TERM_OPERATION = "add_term" ADD_TERMS_OPERATION = "add_terms" @@ -47,6 +52,8 @@ class Constants: OPERATION_CONFIG = "config" TAG = "tag" TERM = "term" + DOC_LINK = "link" + DOC_DESCRIPTION = "description" OWNER_TYPE = "owner_type" OWNER_CATEGORY = "owner_category" MATCH = "match" @@ -163,7 +170,6 @@ def process(self, raw_props: Dict[str, Any]) -> Dict[str, Any]: ) operations_value_list.append(operation) # type: ignore operations_map[operation_type] = operations_value_list - aspect_map = self.convert_to_aspects(operations_map) except Exception as e: logger.error(f"Error while processing operation defs over raw_props: {e}") @@ -173,6 +179,7 @@ def convert_to_aspects( self, operation_map: Dict[str, Union[set, list]] ) -> Dict[str, Any]: aspect_map: Dict[str, Any] = {} + if Constants.ADD_TAG_OPERATION in operation_map: tag_aspect = mce_builder.make_global_tag_aspect_with_tag_list( sorted(operation_map[Constants.ADD_TAG_OPERATION]) @@ -195,11 +202,57 @@ def convert_to_aspects( ] ) aspect_map[Constants.ADD_OWNER_OPERATION] = owner_aspect + if Constants.ADD_TERM_OPERATION in operation_map: term_aspect = mce_builder.make_glossary_terms_aspect_from_urn_list( sorted(operation_map[Constants.ADD_TERM_OPERATION]) ) aspect_map[Constants.ADD_TERM_OPERATION] = term_aspect + + if Constants.ADD_DOC_LINK_OPERATION in 
operation_map: + try: + if len( + operation_map[Constants.ADD_DOC_LINK_OPERATION] + ) == 1 and isinstance( + operation_map[Constants.ADD_DOC_LINK_OPERATION], list + ): + docs_dict = cast( + List[Dict], operation_map[Constants.ADD_DOC_LINK_OPERATION] + )[0] + if "description" not in docs_dict or "link" not in docs_dict: + raise Exception( + "Documentation_link meta_mapping config needs a description key and a link key" + ) + + now = int(time.time() * 1000) # milliseconds since epoch + institutional_memory_element = InstitutionalMemoryMetadataClass( + url=docs_dict["link"], + description=docs_dict["description"], + createStamp=AuditStampClass( + time=now, actor="urn:li:corpuser:ingestion" + ), + ) + + # create a new institutional memory aspect + institutional_memory_aspect = InstitutionalMemoryClass( + elements=[institutional_memory_element] + ) + + aspect_map[ + Constants.ADD_DOC_LINK_OPERATION + ] = institutional_memory_aspect + else: + raise Exception( + f"Expected 1 item of type list for the documentation_link meta_mapping config," + f" received type of {type(operation_map[Constants.ADD_DOC_LINK_OPERATION])}" + f", and size of {len(operation_map[Constants.ADD_DOC_LINK_OPERATION])}." 
+ ) + + except Exception as e: + logger.error( + f"Error while constructing aspect for documentation link and description : {e}" + ) + return aspect_map def get_operation_value( @@ -248,6 +301,16 @@ def get_operation_value( term = operation_config[Constants.TERM] term = _insert_match_value(term, _get_best_match(match, "term")) return mce_builder.make_term_urn(term) + elif ( + operation_type == Constants.ADD_DOC_LINK_OPERATION + and operation_config[Constants.DOC_LINK] + and operation_config[Constants.DOC_DESCRIPTION] + ): + link = operation_config[Constants.DOC_LINK] + link = _insert_match_value(link, _get_best_match(match, "link")) + description = operation_config[Constants.DOC_DESCRIPTION] + return {"link": link, "description": description} + elif operation_type == Constants.ADD_TERMS_OPERATION: separator = operation_config.get(Constants.SEPARATOR, ",") captured_terms = match.group(0) diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py index f18235af3d1fd..81c43884fdf7d 100644 --- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py @@ -231,6 +231,13 @@ def _table_level_lineage( # In some cases like "MERGE ... then INSERT (col1, col2) VALUES (col1, col2)", # the `this` on the INSERT part isn't a table. if isinstance(expr.this, sqlglot.exp.Table) + } | { + # For CREATE DDL statements, the table name is nested inside + # a Schema object. 
+ _TableName.from_sqlglot_table(expr.this.this) + for expr in statement.find_all(sqlglot.exp.Create) + if isinstance(expr.this, sqlglot.exp.Schema) + and isinstance(expr.this.this, sqlglot.exp.Table) } tables = ( @@ -242,7 +249,7 @@ def _table_level_lineage( - modified # ignore CTEs created in this statement - { - _TableName(database=None, schema=None, table=cte.alias_or_name) + _TableName(database=None, db_schema=None, table=cte.alias_or_name) for cte in statement.find_all(sqlglot.exp.CTE) } ) @@ -276,6 +283,9 @@ def __init__( shared_connection=shared_conn, ) + def get_urns(self) -> Set[str]: + return set(self._schema_cache.keys()) + def get_urn_for_table(self, table: _TableName, lower: bool = False) -> str: # TODO: Validate that this is the correct 2/3 layer hierarchy for the platform. @@ -390,8 +400,6 @@ def convert_graphql_schema_metadata_to_info( ) } - # TODO add a method to load all from graphql - def close(self) -> None: self._schema_cache.close() @@ -906,32 +914,39 @@ def create_lineage_sql_parsed_result( env: str, schema: Optional[str] = None, graph: Optional[DataHubGraph] = None, -) -> Optional["SqlParsingResult"]: - parsed_result: Optional["SqlParsingResult"] = None +) -> SqlParsingResult: + needs_close = False try: - schema_resolver = ( - graph._make_schema_resolver( + if graph: + schema_resolver = graph._make_schema_resolver( platform=platform, platform_instance=platform_instance, env=env, ) - if graph is not None - else SchemaResolver( + else: + needs_close = True + schema_resolver = SchemaResolver( platform=platform, platform_instance=platform_instance, env=env, graph=None, ) - ) - parsed_result = sqlglot_lineage( + return sqlglot_lineage( query, schema_resolver=schema_resolver, default_db=database, default_schema=schema, ) except Exception as e: - logger.debug(f"Fail to prase query {query}", exc_info=e) - logger.warning("Fail to parse custom SQL") - - return parsed_result + return SqlParsingResult( + in_tables=[], + out_tables=[], + 
column_lineage=None, + debug_info=SqlParsingDebugInfo( + table_error=e, + ), + ) + finally: + if needs_close: + schema_resolver.close() diff --git a/metadata-ingestion/tests/conftest.py b/metadata-ingestion/tests/conftest.py index 0eb9ab250339c..0f278ab1e1311 100644 --- a/metadata-ingestion/tests/conftest.py +++ b/metadata-ingestion/tests/conftest.py @@ -1,6 +1,8 @@ import logging import os +import pathlib import time +from typing import List import pytest @@ -49,3 +51,40 @@ def pytest_addoption(parser): default=False, ) parser.addoption("--copy-output-files", action="store_true", default=False) + + +def pytest_collection_modifyitems( + config: pytest.Config, items: List[pytest.Item] +) -> None: + # https://docs.pytest.org/en/latest/reference/reference.html#pytest.hookspec.pytest_collection_modifyitems + # Adapted from https://stackoverflow.com/a/57046943/5004662. + + root = pathlib.Path(config.rootpath) + integration_path = root / "tests/integration" + + for item in items: + test_path = pathlib.Path(item.fspath) + + if ( + "docker_compose_runner" in item.fixturenames # type: ignore[attr-defined] + or any( + marker.name == "integration_batch_2" for marker in item.iter_markers() + ) + ): + item.add_marker(pytest.mark.slow) + + is_already_integration = any( + marker.name == "integration" for marker in item.iter_markers() + ) + + if integration_path in test_path.parents or is_already_integration: + # If it doesn't have a marker yet, put it in integration_batch_0. + if not any( + marker.name.startswith("integration_batch_") + for marker in item.iter_markers() + ): + item.add_marker(pytest.mark.integration_batch_0) + + # Mark everything as an integration test. 
+ if not is_already_integration: + item.add_marker(pytest.mark.integration) diff --git a/metadata-ingestion/tests/integration/business-glossary/test_business_glossary.py b/metadata-ingestion/tests/integration/business-glossary/test_business_glossary.py index 11fed2a805565..b6e1aca4d4fed 100644 --- a/metadata-ingestion/tests/integration/business-glossary/test_business_glossary.py +++ b/metadata-ingestion/tests/integration/business-glossary/test_business_glossary.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List +from typing import Any, Dict import pytest from freezegun import freeze_time @@ -45,14 +45,6 @@ def test_glossary_ingest( ): test_resources_dir = pytestconfig.rootpath / "tests/integration/business-glossary" - # These paths change from one instance run of the clickhouse docker to the other, - # and the FROZEN_TIME does not apply to these. - ignore_paths: List[str] = [ - r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['metadata_modification_time'\]", - r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['data_paths'\]", - r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['metadata_path'\]", - ] - output_mces_path: str = f"{tmp_path}/glossary_events.json" golden_mces_path: str = f"{test_resources_dir}/{golden_file}" @@ -72,7 +64,6 @@ def test_glossary_ingest( # Verify the output. 
mce_helpers.check_golden_file( pytestconfig, - ignore_paths=ignore_paths, output_path=output_mces_path, golden_path=golden_mces_path, ) diff --git a/metadata-ingestion/tests/integration/delta_lake/test_delta_lake_minio.py b/metadata-ingestion/tests/integration/delta_lake/test_delta_lake_minio.py index 36ec1d317fec4..6146c6d1a948c 100644 --- a/metadata-ingestion/tests/integration/delta_lake/test_delta_lake_minio.py +++ b/metadata-ingestion/tests/integration/delta_lake/test_delta_lake_minio.py @@ -9,6 +9,8 @@ from tests.test_helpers import mce_helpers from tests.test_helpers.docker_helpers import wait_for_port +pytestmark = pytest.mark.integration_batch_2 + FROZEN_TIME = "2020-04-14 07:00:00" MINIO_PORT = 9000 @@ -64,7 +66,7 @@ def populate_minio(pytestconfig, s3_bkt): pytestconfig.rootpath / "tests/integration/delta_lake/test_data/" ) - for root, dirs, files in os.walk(test_resources_dir): + for root, _dirs, files in os.walk(test_resources_dir): for file in files: full_path = os.path.join(root, file) rel_path = os.path.relpath(full_path, test_resources_dir) @@ -72,7 +74,6 @@ def populate_minio(pytestconfig, s3_bkt): yield -@pytest.mark.slow_integration @freezegun.freeze_time("2023-01-01 00:00:00+00:00") def test_delta_lake_ingest(pytestconfig, tmp_path, test_resources_dir): # Run the metadata ingestion pipeline. 
diff --git a/metadata-ingestion/tests/integration/hana/test_hana.py b/metadata-ingestion/tests/integration/hana/test_hana.py index 0fa234d059e5e..726f8744167db 100644 --- a/metadata-ingestion/tests/integration/hana/test_hana.py +++ b/metadata-ingestion/tests/integration/hana/test_hana.py @@ -7,12 +7,12 @@ from tests.test_helpers.click_helpers import run_datahub_cmd from tests.test_helpers.docker_helpers import wait_for_port +pytestmark = pytest.mark.integration_batch_2 FROZEN_TIME = "2020-04-14 07:00:00" @freeze_time(FROZEN_TIME) @pytest.mark.xfail # TODO: debug the flakes for this test -@pytest.mark.slow_integration @pytest.mark.skipif( platform.machine().lower() == "aarch64", reason="The hdbcli dependency is not available for aarch64", diff --git a/metadata-ingestion/tests/integration/hive/test_hive.py b/metadata-ingestion/tests/integration/hive/test_hive.py index ce166c3b336ac..caffb761380dd 100644 --- a/metadata-ingestion/tests/integration/hive/test_hive.py +++ b/metadata-ingestion/tests/integration/hive/test_hive.py @@ -12,6 +12,8 @@ data_platform = "hive" +pytestmark = pytest.mark.integration_batch_1 + @pytest.fixture(scope="module") def hive_runner(docker_compose_runner, pytestconfig): @@ -54,7 +56,6 @@ def base_pipeline_config(events_file, db=None): @freeze_time(FROZEN_TIME) -@pytest.mark.integration_batch_1 def test_hive_ingest( loaded_hive, pytestconfig, test_resources_dir, tmp_path, mock_time ): @@ -110,7 +111,6 @@ def test_hive_ingest_all_db( @freeze_time(FROZEN_TIME) -@pytest.mark.integration_batch_1 def test_hive_instance_check(loaded_hive, test_resources_dir, tmp_path, pytestconfig): instance: str = "production_warehouse" diff --git a/metadata-ingestion/tests/integration/iceberg/test_iceberg.py b/metadata-ingestion/tests/integration/iceberg/test_iceberg.py index e2a86480672e5..65ede11c3f1c0 100644 --- a/metadata-ingestion/tests/integration/iceberg/test_iceberg.py +++ b/metadata-ingestion/tests/integration/iceberg/test_iceberg.py @@ -8,22 +8,31 @@ 
from tests.test_helpers import mce_helpers from tests.test_helpers.click_helpers import run_datahub_cmd -from tests.test_helpers.docker_helpers import wait_for_port +from tests.test_helpers.docker_helpers import cleanup_image, wait_for_port from tests.test_helpers.state_helpers import ( get_current_checkpoint_from_pipeline, run_and_get_pipeline, validate_all_providers_have_committed_successfully, ) +pytestmark = [ + pytest.mark.integration_batch_1, + # Skip tests if not on Python 3.8 or higher. + pytest.mark.skipif( + sys.version_info < (3, 8), reason="Requires python 3.8 or higher" + ), +] FROZEN_TIME = "2020-04-14 07:00:00" GMS_PORT = 8080 GMS_SERVER = f"http://localhost:{GMS_PORT}" -@pytest.fixture(autouse=True) -def skip_tests_if_python_before_3_8(): - if sys.version_info < (3, 8): - pytest.skip("Requires python 3.8 or higher") +@pytest.fixture(autouse=True, scope="module") +def remove_docker_image(): + yield + + # The tabulario/spark-iceberg image is pretty large, so we remove it after the test. 
+ cleanup_image("tabulario/spark-iceberg") def spark_submit(file_path: str, args: str = "") -> None: @@ -36,7 +45,6 @@ def spark_submit(file_path: str, args: str = "") -> None: @freeze_time(FROZEN_TIME) -@pytest.mark.integration def test_iceberg_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time): test_resources_dir = pytestconfig.rootpath / "tests/integration/iceberg/" @@ -69,7 +77,6 @@ def test_iceberg_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time @freeze_time(FROZEN_TIME) -@pytest.mark.integration def test_iceberg_stateful_ingest( docker_compose_runner, pytestconfig, tmp_path, mock_time, mock_datahub_graph ): @@ -189,7 +196,6 @@ def test_iceberg_stateful_ingest( @freeze_time(FROZEN_TIME) -@pytest.mark.integration def test_iceberg_profiling(docker_compose_runner, pytestconfig, tmp_path, mock_time): test_resources_dir = pytestconfig.rootpath / "tests/integration/iceberg/" diff --git a/metadata-ingestion/tests/integration/kafka-connect/test_kafka_connect.py b/metadata-ingestion/tests/integration/kafka-connect/test_kafka_connect.py index 48063908e624f..8cf76cfb26af7 100644 --- a/metadata-ingestion/tests/integration/kafka-connect/test_kafka_connect.py +++ b/metadata-ingestion/tests/integration/kafka-connect/test_kafka_connect.py @@ -1,5 +1,5 @@ import subprocess -from typing import Any, Dict, List, cast +from typing import Any, Dict, List, Optional, cast from unittest import mock import pytest @@ -16,6 +16,7 @@ validate_all_providers_have_committed_successfully, ) +pytestmark = pytest.mark.integration_batch_1 FROZEN_TIME = "2021-10-25 13:00:00" GMS_PORT = 8080 GMS_SERVER = f"http://localhost:{GMS_PORT}" @@ -345,7 +346,6 @@ def loaded_kafka_connect(kafka_connect_runner): @freeze_time(FROZEN_TIME) -@pytest.mark.integration_batch_1 def test_kafka_connect_ingest( loaded_kafka_connect, pytestconfig, tmp_path, test_resources_dir ): @@ -363,7 +363,6 @@ def test_kafka_connect_ingest( @freeze_time(FROZEN_TIME) -@pytest.mark.integration_batch_1 
def test_kafka_connect_mongosourceconnect_ingest( loaded_kafka_connect, pytestconfig, tmp_path, test_resources_dir ): @@ -381,7 +380,6 @@ def test_kafka_connect_mongosourceconnect_ingest( @freeze_time(FROZEN_TIME) -@pytest.mark.integration_batch_1 def test_kafka_connect_s3sink_ingest( loaded_kafka_connect, pytestconfig, tmp_path, test_resources_dir ): @@ -399,7 +397,6 @@ def test_kafka_connect_s3sink_ingest( @freeze_time(FROZEN_TIME) -@pytest.mark.integration_batch_1 def test_kafka_connect_ingest_stateful( loaded_kafka_connect, pytestconfig, tmp_path, mock_datahub_graph, test_resources_dir ): @@ -536,7 +533,7 @@ def test_kafka_connect_ingest_stateful( assert sorted(deleted_job_urns) == sorted(difference_job_urns) -def register_mock_api(request_mock: Any, override_data: dict = {}) -> None: +def register_mock_api(request_mock: Any, override_data: Optional[dict] = None) -> None: api_vs_response = { "http://localhost:28083": { "method": "GET", @@ -549,7 +546,7 @@ def register_mock_api(request_mock: Any, override_data: dict = {}) -> None: }, } - api_vs_response.update(override_data) + api_vs_response.update(override_data or {}) for url in api_vs_response.keys(): request_mock.register_uri( diff --git a/metadata-ingestion/tests/integration/nifi/test_nifi.py b/metadata-ingestion/tests/integration/nifi/test_nifi.py index 58efd32c6deb3..bf17ee7472258 100644 --- a/metadata-ingestion/tests/integration/nifi/test_nifi.py +++ b/metadata-ingestion/tests/integration/nifi/test_nifi.py @@ -7,7 +7,9 @@ from datahub.ingestion.run.pipeline import Pipeline from tests.test_helpers import fs_helpers, mce_helpers -from tests.test_helpers.docker_helpers import wait_for_port +from tests.test_helpers.docker_helpers import cleanup_image, wait_for_port + +pytestmark = pytest.mark.integration_batch_2 FROZEN_TIME = "2021-12-03 12:00:00" @@ -48,9 +50,11 @@ def loaded_nifi(docker_compose_runner, test_resources_dir): ) yield docker_services + # The nifi image is pretty large, so we remove it after 
the test. + cleanup_image("apache/nifi") + @freeze_time(FROZEN_TIME) -@pytest.mark.slow_integration def test_nifi_ingest_standalone( loaded_nifi, pytestconfig, tmp_path, test_resources_dir ): @@ -106,7 +110,6 @@ def test_nifi_ingest_standalone( @freeze_time(FROZEN_TIME) -@pytest.mark.slow_integration def test_nifi_ingest_cluster(loaded_nifi, pytestconfig, tmp_path, test_resources_dir): # Wait for nifi cluster to execute all lineage processors, max wait time 120 seconds url = "http://localhost:9080/nifi-api/flow/process-groups/root" diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_cll.json b/metadata-ingestion/tests/integration/powerbi/golden_test_cll.json new file mode 100644 index 0000000000000..5f92cdcfb5bde --- /dev/null +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_cll.json @@ -0,0 +1,1357 @@ +[ +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "dummy", + "viewLanguage": "m_query" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "datasetId": "05169CD2-E713-41E6-9600-1D8066D95445" + }, + "externalUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details", + "name": "public issue_history", + "description": "Library dataset description", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset Table", + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "let\n Source = Snowflake.Databases(\"hp123rt5.ap-southeast-2.fakecomputing.com\",\"PBI_TEST_WAREHOUSE_PROD\",[Role=\"PBI_TEST_MEMBER\"]),\n PBI_TEST_Database = Source{[Name=\"PBI_TEST\",Kind=\"Database\"]}[Data],\n TEST_Schema = PBI_TEST_Database{[Name=\"TEST\",Kind=\"Schema\"]}[Data],\n TESTTABLE_Table = TEST_Schema{[Name=\"TESTTABLE\",Kind=\"Table\"]}[Data]\nin\n TESTTABLE_Table", + "viewLanguage": "m_query" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "datasetId": "05169CD2-E713-41E6-9600-1D8066D95445" + }, + "externalUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details", + "name": "SNOWFLAKE_TESTTABLE", + "description": "Library dataset description", + "tags": [] + } + }, + "systemMetadata": { + 
"lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset Table", + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,PBI_TEST.TEST.TESTTABLE,PROD)", + "type": "TRANSFORMED" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"SELECT#(lf)concat((UPPER(REPLACE(SELLER,'-',''))), MONTHID) as AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), MONTHID) as CD_AGENT_KEY,#(lf) 
*#(lf)FROM#(lf)OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4\", null, [EnableFolding=true]),\n #\"Added Conditional Column\" = Table.AddColumn(Source, \"SME Units ENT\", each if [DEAL_TYPE] = \"SME Unit\" then [UNIT] else 0),\n #\"Added Conditional Column1\" = Table.AddColumn(#\"Added Conditional Column\", \"Banklink Units\", each if [DEAL_TYPE] = \"Banklink\" then [UNIT] else 0),\n #\"Removed Columns\" = Table.RemoveColumns(#\"Added Conditional Column1\",{\"Banklink Units\"}),\n #\"Added Custom\" = Table.AddColumn(#\"Removed Columns\", \"Banklink Units\", each if [DEAL_TYPE] = \"Banklink\" and [SALES_TYPE] = \"3 - Upsell\"\nthen [UNIT]\n\nelse if [SALES_TYPE] = \"Adjusted BL Migration\"\nthen [UNIT]\n\nelse 0),\n #\"Added Custom1\" = Table.AddColumn(#\"Added Custom\", \"SME Units in $ (*$361)\", each if [DEAL_TYPE] = \"SME Unit\" \nand [SALES_TYPE] <> \"4 - Renewal\"\n then [UNIT] * 361\nelse 0),\n #\"Added Custom2\" = Table.AddColumn(#\"Added Custom1\", \"Banklink in $ (*$148)\", each [Banklink Units] * 148)\nin\n #\"Added Custom2\"", + "viewLanguage": "m_query" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "datasetId": "05169CD2-E713-41E6-9600-1D8066D95445" + }, + "externalUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details", + "name": "snowflake native-query", + "description": "Library dataset description", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)", + "changeType": "UPSERT", + 
"aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset Table", + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_aps_sme_units_v4,PROD)", + "type": "TRANSFORMED" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_aps_sme_units_v4,PROD),monthid)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_aps_sme_units_v4,PROD),seller)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV),agent_key)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_aps_sme_units_v4,PROD),client_director)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_aps_sme_units_v4,PROD),monthid)" + ], + "downstreamType": "FIELD", + "downstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV),cd_agent_key)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.big-query-with-parameter,DEV)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "let\n Source = GoogleBigQuery.Database([BillingProject = #\"Parameter - Source\"]),\n#\"gcp-project\" = Source{[Name=#\"Parameter - Source\"]}[Data],\nuniversal_Schema = #\"gcp-project\"{[Name=\"universal\",Kind=\"Schema\"]}[Data],\nD_WH_DATE_Table = universal_Schema{[Name=\"D_WH_DATE\",Kind=\"Table\"]}[Data],\n#\"Filtered Rows\" = Table.SelectRows(D_WH_DATE_Table, each [D_DATE] > #datetime(2019, 9, 10, 0, 0, 0)),\n#\"Filtered Rows1\" = Table.SelectRows(#\"Filtered Rows\", each DateTime.IsInPreviousNHours([D_DATE], 87600))\n in \n#\"Filtered Rows1\"", + "viewLanguage": "m_query" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.big-query-with-parameter,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "datasetId": "05169CD2-E713-41E6-9600-1D8066D95445" + }, + "externalUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details", + "name": "big-query-with-parameter", + "description": "Library dataset description", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.big-query-with-parameter,DEV)", + 
"changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.big-query-with-parameter,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset Table", + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "let\n Source = Value.NativeQuery(Snowflake.Databases(\"xaa48144.snowflakecomputing.com\",\"GSL_TEST_WH\",[Role=\"ACCOUNTADMIN\"]){[Name=\"GSL_TEST_DB\"]}[Data], \"select A.name from GSL_TEST_DB.PUBLIC.SALES_ANALYST as A inner join GSL_TEST_DB.PUBLIC.SALES_FORECAST as B on A.name = B.name where startswith(A.name, 'mo')\", null, [EnableFolding=true])\nin\n Source", + "viewLanguage": "m_query" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "datasetId": "05169CD2-E713-41E6-9600-1D8066D95445" + }, + "externalUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details", + "name": "snowflake native-query-with-join", + "description": "Library dataset description", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + 
"entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.big-query-with-parameter,DEV)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,my-test-project.universal.D_WH_DATE,PROD)", + "type": "TRANSFORMED" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset Table", + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "let\n Source = Oracle.Database(\"localhost:1521/salesdb.GSLAB.COM\", [HierarchicalNavigation=true]), HR = Source{[Schema=\"HR\"]}[Data], EMPLOYEES1 = HR{[Name=\"EMPLOYEES\"]}[Data] \n in EMPLOYEES1", + "viewLanguage": "m_query" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", + 
"changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,gsl_test_db.public.sales_analyst,PROD)", + "type": "TRANSFORMED" + }, + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,gsl_test_db.public.sales_forecast,PROD)", + "type": "TRANSFORMED" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,gsl_test_db.public.sales_analyst,PROD),name)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV),name)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,gsl_test_db.public.sales_analyst,PROD),name)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV),name)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "datasetId": "05169CD2-E713-41E6-9600-1D8066D95445" + }, + "externalUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details", + "name": "job-history", + "description": "Library dataset description", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + 
"runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset Table", + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:oracle,salesdb.HR.EMPLOYEES,PROD)", + "type": "TRANSFORMED" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "let\n Source = PostgreSQL.Database(\"localhost\" , \"mics\" ),\n public_order_date = Source{[Schema=\"public\",Item=\"order_date\"]}[Data] \n in \n public_order_date", + "viewLanguage": "m_query" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + 
"json": { + "customProperties": { + "datasetId": "05169CD2-E713-41E6-9600-1D8066D95445" + }, + "externalUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details", + "name": "postgres_test_table", + "description": "Library dataset description", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset Table", + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,mics.public.order_date,PROD)", + "type": "TRANSFORMED" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "let\n Source = Sql.Database(\"localhost\", \"library\"),\n dbo_book_issue = 
Source{[Schema=\"dbo\",Item=\"book_issue\"]}[Data]\n in dbo_book_issue", + "viewLanguage": "m_query" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "datasetId": "ba0130a1-5b03-40de-9535-b34e778ea6ed" + }, + "externalUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/ba0130a1-5b03-40de-9535-b34e778ea6ed/details", + "name": "dbo_book_issue", + "description": "hr pbi test description", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset Table", + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"select *,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), MONTH_WID) as CD_AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,'-',''))), MONTH_WID) as 
AGENT_KEY#(lf)#(lf)from V_PS_CD_RETENTION\", CommandTimeout=#duration(0, 1, 30, 0)]),\n #\"Changed Type\" = Table.TransformColumnTypes(Source,{{\"mth_date\", type date}}),\n #\"Added Custom\" = Table.AddColumn(#\"Changed Type\", \"Month\", each Date.Month([mth_date])),\n #\"Added Custom1\" = Table.AddColumn(#\"Added Custom\", \"TPV Opening\", each if [Month] = 1 then [TPV_AMV_OPENING]\nelse if [Month] = 2 then 0\nelse if [Month] = 3 then 0\nelse if [Month] = 4 then [TPV_AMV_OPENING]\nelse if [Month] = 5 then 0\nelse if [Month] = 6 then 0\nelse if [Month] = 7 then [TPV_AMV_OPENING]\nelse if [Month] = 8 then 0\nelse if [Month] = 9 then 0\nelse if [Month] = 10 then [TPV_AMV_OPENING]\nelse if [Month] = 11 then 0\nelse if [Month] = 12 then 0\n\nelse 0)\nin\n #\"Added Custom1\"", + "viewLanguage": "m_query" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "datasetId": "ba0130a1-5b03-40de-9535-b34e778ea6ed" + }, + "externalUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/ba0130a1-5b03-40de-9535-b34e778ea6ed/details", + "name": "ms_sql_native_table", + "description": "hr pbi test description", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,library.dbo.book_issue,PROD)", + "type": "TRANSFORMED" + } + ] + } + }, + "systemMetadata": { + 
"lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset Table", + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "corpuser", + "entityUrn": "urn:li:corpuser:users.User1@foo.com", + "changeType": "UPSERT", + "aspectName": "corpUserKey", + "aspect": { + "json": { + "username": "User1@foo.com" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "corpuser", + "entityUrn": "urn:li:corpuser:users.User2@foo.com", + "changeType": "UPSERT", + "aspectName": "corpUserKey", + "aspect": { + "json": { + "username": "User2@foo.com" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "changeType": "UPSERT", + "aspectName": "chartInfo", + "aspect": { + "json": { + "customProperties": { + "createdFrom": "Dataset", + "datasetId": "05169CD2-E713-41E6-9600-1D8066D95445", + "datasetWebUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details" + }, + "title": "test_tile", + "description": "test_tile", + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": 
"urn:li:corpuser:unknown" + } + }, + "inputs": [ + { + "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)" + }, + { + "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)" + }, + { + "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)" + }, + { + "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.big-query-with-parameter,DEV)" + }, + { + "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)" + }, + { + "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)" + }, + { + "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "changeType": "UPSERT", + "aspectName": "chartKey", + "aspect": { + "json": { + "dashboardTool": "powerbi", + "chartId": "powerbi.linkedin.com/charts/B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "changeType": "UPSERT", + "aspectName": "browsePaths", + "aspect": { + "json": { + "paths": [ + "/powerbi/demo-workspace" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + 
"entityUrn": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "demo-workspace" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "chartInfo", + "aspect": { + "json": { + "customProperties": { + "createdFrom": "Dataset", + "datasetId": "ba0130a1-5b03-40de-9535-b34e778ea6ed", + "datasetWebUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/ba0130a1-5b03-40de-9535-b34e778ea6ed/details" + }, + "title": "yearly_sales", + "description": "yearly_sales", + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + }, + "inputs": [ + { + "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)" + }, + { + "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "chartKey", + "aspect": { + "json": { + "dashboardTool": "powerbi", + "chartId": "powerbi.linkedin.com/charts/23212598-23b5-4980-87cc-5fc0ecd84385" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": 
"powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "browsePaths", + "aspect": { + "json": { + "paths": [ + "/powerbi/demo-workspace" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "demo-workspace" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "browsePaths", + "aspect": { + "json": { + "paths": [ + "/powerbi/demo-workspace" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "dashboardInfo", + "aspect": { + "json": { + "customProperties": { + "chartCount": "2", + "workspaceName": "demo-workspace", + "workspaceId": "64ED5CAD-7C10-4684-8180-826122881108" + }, + "title": "test_dashboard", + "description": "Description of test dashboard", + "charts": [ + "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)" + ], + "datasets": [], + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + }, + "dashboardUrl": "https://localhost/dashboards/web/1" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + 
"entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "dashboardKey", + "aspect": { + "json": { + "dashboardTool": "powerbi", + "dashboardId": "powerbi.linkedin.com/dashboards/7D668CAD-7FFC-4505-9215-655BCA5BEBAE" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:users.User1@foo.com", + "type": "NONE" + }, + { + "owner": "urn:li:corpuser:users.User2@foo.com", + "type": "NONE" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "demo-workspace" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,employee-dataset.employee_ctc,DEV)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "dummy", + "viewLanguage": "m_query" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, 
+ "runId": "powerbi-test" + } +}, +{ + "entityType": "corpuser", + "entityUrn": "urn:li:corpuser:users.User1@foo.com", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,employee-dataset.employee_ctc,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,employee-dataset.employee_ctc,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset Table", + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,employee-dataset.employee_ctc,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "datasetId": "91580e0e-1680-4b1c-bbf9-4f6764d7a5ff" + }, + "externalUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/91580e0e-1680-4b1c-bbf9-4f6764d7a5ff/details", + "name": "employee_ctc", + "description": "Employee Management", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "corpuser", + "entityUrn": "urn:li:corpuser:users.User2@foo.com", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/powerbi/test_admin_only_api.py 
b/metadata-ingestion/tests/integration/powerbi/test_admin_only_api.py index f95fd81681a9a..6f45dcf97f1dd 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_admin_only_api.py +++ b/metadata-ingestion/tests/integration/powerbi/test_admin_only_api.py @@ -3,11 +3,14 @@ from typing import Any, Dict from unittest import mock +import pytest from freezegun import freeze_time from datahub.ingestion.run.pipeline import Pipeline from tests.test_helpers import mce_helpers +pytestmark = pytest.mark.integration_batch_2 + FROZEN_TIME = "2022-02-03 07:00:00" diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index e77a12aa4088e..e3cc6c8101650 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -15,8 +15,11 @@ AbstractDataPlatformInstanceResolver, create_dataplatform_instance_resolver, ) -from datahub.ingestion.source.powerbi.m_query import parser, tree_function -from datahub.ingestion.source.powerbi.m_query.resolver import DataPlatformTable +from datahub.ingestion.source.powerbi.m_query import parser, resolver, tree_function +from datahub.ingestion.source.powerbi.m_query.resolver import DataPlatformTable, Lineage +from datahub.utilities.sqlglot_lineage import ColumnLineageInfo, DownstreamColumnRef + +pytestmark = pytest.mark.integration_batch_2 M_QUERIES = [ 'let\n Source = Snowflake.Databases("bu10758.ap-unknown-2.fakecomputing.com","PBI_TEST_WAREHOUSE_PROD",[Role="PBI_TEST_MEMBER"]),\n PBI_TEST_Database = Source{[Name="PBI_TEST",Kind="Database"]}[Data],\n TEST_Schema = PBI_TEST_Database{[Name="TEST",Kind="Schema"]}[Data],\n TESTTABLE_Table = TEST_Schema{[Name="TESTTABLE",Kind="Table"]}[Data]\nin\n TESTTABLE_Table', @@ -68,6 +71,15 @@ def get_default_instances( return PipelineContext(run_id="fake"), config, platform_instance_resolver +def combine_upstreams_from_lineage(lineage: 
List[Lineage]) -> List[DataPlatformTable]: + data_platforms: List[DataPlatformTable] = [] + + for item in lineage: + data_platforms.extend(item.upstreams) + + return data_platforms + + @pytest.mark.integration def test_parse_m_query1(): expression: str = M_QUERIES[0] @@ -180,7 +192,7 @@ def test_snowflake_regular_case(): ctx=ctx, config=config, platform_instance_resolver=platform_instance_resolver, - ) + )[0].upstreams assert len(data_platform_tables) == 1 assert ( @@ -210,7 +222,7 @@ def test_postgres_regular_case(): ctx=ctx, config=config, platform_instance_resolver=platform_instance_resolver, - ) + )[0].upstreams assert len(data_platform_tables) == 1 assert ( @@ -240,7 +252,7 @@ def test_databricks_regular_case(): ctx=ctx, config=config, platform_instance_resolver=platform_instance_resolver, - ) + )[0].upstreams assert len(data_platform_tables) == 1 assert ( @@ -270,7 +282,7 @@ def test_oracle_regular_case(): ctx=ctx, config=config, platform_instance_resolver=platform_instance_resolver, - ) + )[0].upstreams assert len(data_platform_tables) == 1 assert ( @@ -300,7 +312,7 @@ def test_mssql_regular_case(): ctx=ctx, config=config, platform_instance_resolver=platform_instance_resolver, - ) + )[0].upstreams assert len(data_platform_tables) == 1 assert ( @@ -346,7 +358,7 @@ def test_mssql_with_query(): ctx=ctx, config=config, platform_instance_resolver=platform_instance_resolver, - ) + )[0].upstreams assert len(data_platform_tables) == 1 assert data_platform_tables[0].urn == expected_tables[index] @@ -386,7 +398,7 @@ def test_snowflake_native_query(): ctx=ctx, config=config, platform_instance_resolver=platform_instance_resolver, - ) + )[0].upstreams assert len(data_platform_tables) == 1 assert data_platform_tables[0].urn == expected_tables[index] @@ -408,7 +420,7 @@ def test_google_bigquery_1(): ctx=ctx, config=config, platform_instance_resolver=platform_instance_resolver, - ) + )[0].upstreams assert len(data_platform_tables) == 1 assert ( @@ -440,7 +452,7 @@ def 
test_google_bigquery_2(): ctx=ctx, config=config, platform_instance_resolver=platform_instance_resolver, - ) + )[0].upstreams assert len(data_platform_tables) == 1 assert ( @@ -470,7 +482,7 @@ def test_for_each_expression_1(): ctx=ctx, config=config, platform_instance_resolver=platform_instance_resolver, - ) + )[0].upstreams assert len(data_platform_tables) == 1 assert ( @@ -499,7 +511,7 @@ def test_for_each_expression_2(): ctx=ctx, config=config, platform_instance_resolver=platform_instance_resolver, - ) + )[0].upstreams assert len(data_platform_tables) == 1 assert ( @@ -521,15 +533,15 @@ def test_native_query_disabled(): reporter = PowerBiDashboardSourceReport() ctx, config, platform_instance_resolver = get_default_instances() - config.native_query_parsing = False - data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( + config.native_query_parsing = False # Disable native query parsing + lineage: List[Lineage] = parser.get_upstream_tables( table, reporter, ctx=ctx, config=config, platform_instance_resolver=platform_instance_resolver, ) - assert len(data_platform_tables) == 0 + assert len(lineage) == 0 @pytest.mark.integration @@ -546,12 +558,14 @@ def test_multi_source_table(): ctx, config, platform_instance_resolver = get_default_instances() - data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, - reporter, - ctx=ctx, - config=config, - platform_instance_resolver=platform_instance_resolver, + data_platform_tables: List[DataPlatformTable] = combine_upstreams_from_lineage( + parser.get_upstream_tables( + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, + ) ) assert len(data_platform_tables) == 2 @@ -579,12 +593,14 @@ def test_table_combine(): ctx, config, platform_instance_resolver = get_default_instances() - data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, - reporter, - ctx=ctx, - config=config, - 
platform_instance_resolver=platform_instance_resolver, + data_platform_tables: List[DataPlatformTable] = combine_upstreams_from_lineage( + parser.get_upstream_tables( + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, + ) ) assert len(data_platform_tables) == 2 @@ -622,7 +638,7 @@ def test_expression_is_none(): ctx, config, platform_instance_resolver = get_default_instances() - data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( + lineage: List[Lineage] = parser.get_upstream_tables( table, reporter, ctx=ctx, @@ -630,7 +646,7 @@ def test_expression_is_none(): platform_instance_resolver=platform_instance_resolver, ) - assert len(data_platform_tables) == 0 + assert len(lineage) == 0 def test_redshift_regular_case(): @@ -649,7 +665,7 @@ def test_redshift_regular_case(): ctx=ctx, config=config, platform_instance_resolver=platform_instance_resolver, - ) + )[0].upstreams assert len(data_platform_tables) == 1 assert ( @@ -676,7 +692,7 @@ def test_redshift_native_query(): ctx=ctx, config=config, platform_instance_resolver=platform_instance_resolver, - ) + )[0].upstreams assert len(data_platform_tables) == 1 assert ( @@ -706,7 +722,7 @@ def test_sqlglot_parser(): } ) - data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( + lineage: List[resolver.Lineage] = parser.get_upstream_tables( table, reporter, ctx=ctx, @@ -714,6 +730,8 @@ def test_sqlglot_parser(): platform_instance_resolver=platform_instance_resolver, ) + data_platform_tables: List[DataPlatformTable] = lineage[0].upstreams + assert len(data_platform_tables) == 2 assert ( data_platform_tables[0].urn @@ -723,3 +741,76 @@ def test_sqlglot_parser(): data_platform_tables[1].urn == "urn:li:dataset:(urn:li:dataPlatform:snowflake,sales_deployment.operations_analytics.transformed_prod.v_sme_unit_targets,PROD)" ) + + assert lineage[0].column_lineage == [ + ColumnLineageInfo( + downstream=DownstreamColumnRef(table=None, 
column="client_director"), + upstreams=[], + logic=None, + ), + ColumnLineageInfo( + downstream=DownstreamColumnRef(table=None, column="tier"), + upstreams=[], + logic=None, + ), + ColumnLineageInfo( + downstream=DownstreamColumnRef(table=None, column='upper("manager")'), + upstreams=[], + logic=None, + ), + ColumnLineageInfo( + downstream=DownstreamColumnRef(table=None, column="team_type"), + upstreams=[], + logic=None, + ), + ColumnLineageInfo( + downstream=DownstreamColumnRef(table=None, column="date_target"), + upstreams=[], + logic=None, + ), + ColumnLineageInfo( + downstream=DownstreamColumnRef(table=None, column="monthid"), + upstreams=[], + logic=None, + ), + ColumnLineageInfo( + downstream=DownstreamColumnRef(table=None, column="target_team"), + upstreams=[], + logic=None, + ), + ColumnLineageInfo( + downstream=DownstreamColumnRef(table=None, column="seller_email"), + upstreams=[], + logic=None, + ), + ColumnLineageInfo( + downstream=DownstreamColumnRef(table=None, column="agent_key"), + upstreams=[], + logic=None, + ), + ColumnLineageInfo( + downstream=DownstreamColumnRef(table=None, column="sme_quota"), + upstreams=[], + logic=None, + ), + ColumnLineageInfo( + downstream=DownstreamColumnRef(table=None, column="revenue_quota"), + upstreams=[], + logic=None, + ), + ColumnLineageInfo( + downstream=DownstreamColumnRef(table=None, column="service_quota"), + upstreams=[], + logic=None, + ), + ColumnLineageInfo( + downstream=DownstreamColumnRef(table=None, column="bl_target"), + upstreams=[], + logic=None, + ), + ColumnLineageInfo( + downstream=DownstreamColumnRef(table=None, column="software_quota"), + upstreams=[], + logic=None, + ), + ] diff --git a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py index 5036f758a7de9..7232d2a38da1d 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py +++ b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py @@ -1,4 +1,5 
@@ import logging +import re import sys from typing import Any, Dict, List, cast from unittest import mock @@ -20,6 +21,7 @@ ) from tests.test_helpers import mce_helpers +pytestmark = pytest.mark.integration_batch_2 FROZEN_TIME = "2022-02-03 07:00:00" @@ -1126,7 +1128,7 @@ def test_dataset_type_mapping_error( """ register_mock_api(request_mock=requests_mock) - try: + with pytest.raises(Exception, match=r"dataset_type_mapping is deprecated"): Pipeline.create( { "run_id": "powerbi-test", @@ -1149,11 +1151,6 @@ def test_dataset_type_mapping_error( }, } ) - except Exception as e: - assert ( - "dataset_type_mapping is deprecated. Use server_to_platform_instance only." - in str(e) - ) @freeze_time(FROZEN_TIME) @@ -1505,3 +1502,90 @@ def test_independent_datasets_extraction( output_path=tmp_path / "powerbi_independent_mces.json", golden_path=f"{test_resources_dir}/{golden_file}", ) + + +@freeze_time(FROZEN_TIME) +@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) +def test_cll_extraction(mock_msal, pytestconfig, tmp_path, mock_time, requests_mock): + + test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" + + register_mock_api( + request_mock=requests_mock, + ) + + default_conf: dict = default_source_config() + + del default_conf[ + "dataset_type_mapping" + ] # delete this key so that connector set it to default (all dataplatform) + + pipeline = Pipeline.create( + { + "run_id": "powerbi-test", + "source": { + "type": "powerbi", + "config": { + **default_conf, + "extract_lineage": True, + "extract_column_level_lineage": True, + "enable_advance_lineage_sql_construct": True, + "native_query_parsing": True, + "extract_independent_datasets": True, + }, + }, + "sink": { + "type": "file", + "config": { + "filename": f"{tmp_path}/powerbi_cll_mces.json", + }, + }, + } + ) + + pipeline.run() + pipeline.raise_from_status() + golden_file = "golden_test_cll.json" + + mce_helpers.check_golden_file( + pytestconfig, + output_path=tmp_path / 
"powerbi_cll_mces.json", + golden_path=f"{test_resources_dir}/{golden_file}", + ) + + +@freeze_time(FROZEN_TIME) +@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) +def test_cll_extraction_flags( + mock_msal, pytestconfig, tmp_path, mock_time, requests_mock +): + + register_mock_api( + request_mock=requests_mock, + ) + + default_conf: dict = default_source_config() + pattern: str = re.escape( + "Enable all these flags in recipe: ['native_query_parsing', 'enable_advance_lineage_sql_construct', 'extract_lineage']" + ) + + with pytest.raises(Exception, match=pattern): + + Pipeline.create( + { + "run_id": "powerbi-test", + "source": { + "type": "powerbi", + "config": { + **default_conf, + "extract_column_level_lineage": True, + }, + }, + "sink": { + "type": "file", + "config": { + "filename": f"{tmp_path}/powerbi_cll_mces.json", + }, + }, + } + ) diff --git a/metadata-ingestion/tests/integration/presto-on-hive/test_presto_on_hive.py b/metadata-ingestion/tests/integration/presto-on-hive/test_presto_on_hive.py index 17e21f3790070..31d801ccf7dee 100644 --- a/metadata-ingestion/tests/integration/presto-on-hive/test_presto_on_hive.py +++ b/metadata-ingestion/tests/integration/presto-on-hive/test_presto_on_hive.py @@ -10,6 +10,7 @@ from tests.test_helpers import fs_helpers, mce_helpers from tests.test_helpers.docker_helpers import wait_for_port +pytestmark = pytest.mark.integration_batch_1 FROZEN_TIME = "2021-09-23 12:00:00" data_platform = "presto-on-hive" @@ -51,7 +52,6 @@ def loaded_presto_on_hive(presto_on_hive_runner): @freeze_time(FROZEN_TIME) -@pytest.mark.integration_batch_1 @pytest.mark.parametrize( "mode,use_catalog_subtype,use_dataset_pascalcase_subtype,include_catalog_name_in_ids,simplify_nested_field_paths," "test_suffix", @@ -137,7 +137,6 @@ def test_presto_on_hive_ingest( @freeze_time(FROZEN_TIME) -@pytest.mark.integration_batch_1 def test_presto_on_hive_instance_ingest( loaded_presto_on_hive, test_resources_dir, pytestconfig, 
tmp_path, mock_time ): diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py index 20bed0d720ab9..60e93b58f477c 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py @@ -30,6 +30,8 @@ from tests.integration.snowflake.common import FROZEN_TIME, default_query_results from tests.test_helpers import mce_helpers +pytestmark = pytest.mark.integration_batch_2 + def random_email(): return ( @@ -55,7 +57,6 @@ def random_cloud_region(): ) -@pytest.mark.integration def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph): test_resources_dir = pytestconfig.rootpath / "tests/integration/snowflake" @@ -184,7 +185,6 @@ def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph): @freeze_time(FROZEN_TIME) -@pytest.mark.integration def test_snowflake_private_link(pytestconfig, tmp_path, mock_time, mock_datahub_graph): test_resources_dir = pytestconfig.rootpath / "tests/integration/snowflake" diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index 71428a7847953..c31867f5aa904 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -757,7 +757,7 @@ def test_tableau_no_verify(): @freeze_time(FROZEN_TIME) -@pytest.mark.slow_unit +@pytest.mark.integration_batch_2 def test_tableau_signout_timeout(pytestconfig, tmp_path, mock_datahub_graph): enable_logging() output_file_name: str = "tableau_signout_timeout_mces.json" diff --git a/metadata-ingestion/tests/integration/trino/test_trino.py b/metadata-ingestion/tests/integration/trino/test_trino.py index 22e5f6f91a06e..177c273c0d242 100644 --- a/metadata-ingestion/tests/integration/trino/test_trino.py +++ 
b/metadata-ingestion/tests/integration/trino/test_trino.py @@ -13,6 +13,8 @@ from tests.test_helpers import fs_helpers, mce_helpers from tests.test_helpers.docker_helpers import wait_for_port +pytestmark = pytest.mark.integration_batch_1 + FROZEN_TIME = "2021-09-23 12:00:00" data_platform = "trino" @@ -51,7 +53,6 @@ def loaded_trino(trino_runner): @freeze_time(FROZEN_TIME) -@pytest.mark.integration @pytest.mark.xfail def test_trino_ingest( loaded_trino, test_resources_dir, pytestconfig, tmp_path, mock_time @@ -111,7 +112,6 @@ def test_trino_ingest( @freeze_time(FROZEN_TIME) -@pytest.mark.integration def test_trino_hive_ingest( loaded_trino, test_resources_dir, pytestconfig, tmp_path, mock_time ): @@ -167,7 +167,6 @@ def test_trino_hive_ingest( @freeze_time(FROZEN_TIME) -@pytest.mark.integration def test_trino_instance_ingest( loaded_trino, test_resources_dir, pytestconfig, tmp_path, mock_time ): diff --git a/metadata-ingestion/tests/performance/bigquery/__init__.py b/metadata-ingestion/tests/performance/bigquery/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/metadata-ingestion/tests/performance/bigquery.py b/metadata-ingestion/tests/performance/bigquery/bigquery_events.py similarity index 100% rename from metadata-ingestion/tests/performance/bigquery.py rename to metadata-ingestion/tests/performance/bigquery/bigquery_events.py diff --git a/metadata-ingestion/tests/performance/test_bigquery_usage.py b/metadata-ingestion/tests/performance/bigquery/test_bigquery_usage.py similarity index 80% rename from metadata-ingestion/tests/performance/test_bigquery_usage.py rename to metadata-ingestion/tests/performance/bigquery/test_bigquery_usage.py index 7e05ef070b45d..bbc3378450bff 100644 --- a/metadata-ingestion/tests/performance/test_bigquery_usage.py +++ b/metadata-ingestion/tests/performance/bigquery/test_bigquery_usage.py @@ -2,13 +2,11 @@ import os import random from datetime import timedelta -from typing import Iterable, Tuple import 
humanfriendly import psutil from datahub.emitter.mce_builder import make_dataset_urn -from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.bigquery_v2.bigquery_config import ( BigQueryUsageConfig, BigQueryV2Config, @@ -16,12 +14,13 @@ from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor from datahub.utilities.perf_timer import PerfTimer -from tests.performance.bigquery import generate_events, ref_from_table +from tests.performance.bigquery.bigquery_events import generate_events, ref_from_table from tests.performance.data_generation import ( NormalDistribution, generate_data, generate_queries, ) +from tests.performance.helpers import workunit_sink def run_test(): @@ -33,7 +32,7 @@ def run_test(): num_views=2000, time_range=timedelta(days=7), ) - all_tables = seed_metadata.tables + seed_metadata.views + all_tables = seed_metadata.all_tables config = BigQueryV2Config( start_time=seed_metadata.start_time, @@ -88,21 +87,6 @@ def run_test(): print(f"Hash collisions: {report.num_usage_query_hash_collisions}") -def workunit_sink(workunits: Iterable[MetadataWorkUnit]) -> Tuple[int, int]: - peak_memory_usage = psutil.Process(os.getpid()).memory_info().rss - i: int = 0 - for i, wu in enumerate(workunits): - if i % 10_000 == 0: - peak_memory_usage = max( - peak_memory_usage, psutil.Process(os.getpid()).memory_info().rss - ) - peak_memory_usage = max( - peak_memory_usage, psutil.Process(os.getpid()).memory_info().rss - ) - - return i, peak_memory_usage - - if __name__ == "__main__": root_logger = logging.getLogger() root_logger.setLevel(logging.INFO) diff --git a/metadata-ingestion/tests/performance/data_generation.py b/metadata-ingestion/tests/performance/data_generation.py index c530848f27f5c..67b156896909a 100644 --- a/metadata-ingestion/tests/performance/data_generation.py +++ 
b/metadata-ingestion/tests/performance/data_generation.py @@ -11,11 +11,14 @@ import uuid from dataclasses import dataclass from datetime import datetime, timedelta, timezone -from typing import Iterable, List, TypeVar +from typing import Iterable, List, TypeVar, Union, cast from faker import Faker from tests.performance.data_model import ( + Column, + ColumnMapping, + ColumnType, Container, FieldAccess, Query, @@ -52,15 +55,21 @@ def sample_with_floor(self, floor: int = 1) -> int: @dataclass class SeedMetadata: - containers: List[Container] + # Each list is a layer of containers, e.g. [[databases], [schemas]] + containers: List[List[Container]] + tables: List[Table] views: List[View] start_time: datetime end_time: datetime + @property + def all_tables(self) -> List[Table]: + return self.tables + cast(List[Table], self.views) + def generate_data( - num_containers: int, + num_containers: Union[List[int], int], num_tables: int, num_views: int, columns_per_table: NormalDistribution = NormalDistribution(5, 2), @@ -68,32 +77,52 @@ def generate_data( view_definition_length: NormalDistribution = NormalDistribution(150, 50), time_range: timedelta = timedelta(days=14), ) -> SeedMetadata: - containers = [Container(f"container-{i}") for i in range(num_containers)] + # Assemble containers + if isinstance(num_containers, int): + num_containers = [num_containers] + + containers: List[List[Container]] = [] + for i, num_in_layer in enumerate(num_containers): + layer = [ + Container( + f"{i}-container-{j}", + parent=random.choice(containers[-1]) if containers else None, + ) + for j in range(num_in_layer) + ] + containers.append(layer) + + # Assemble tables tables = [ Table( f"table-{i}", - container=random.choice(containers), + container=random.choice(containers[-1]), columns=[ f"column-{j}-{uuid.uuid4()}" for j in range(columns_per_table.sample_with_floor()) ], + column_mapping=None, ) for i in range(num_tables) ] views = [ View( f"view-{i}", - container=random.choice(containers), 
+ container=random.choice(containers[-1]), columns=[ f"column-{j}-{uuid.uuid4()}" for j in range(columns_per_table.sample_with_floor()) ], + column_mapping=None, definition=f"{uuid.uuid4()}-{'*' * view_definition_length.sample_with_floor(10)}", parents=random.sample(tables, parents_per_view.sample_with_floor()), ) for i in range(num_views) ] + for table in tables + views: + _generate_column_mapping(table) + now = datetime.now(tz=timezone.utc) return SeedMetadata( containers=containers, @@ -162,6 +191,18 @@ def generate_queries( ) +def _generate_column_mapping(table: Table) -> ColumnMapping: + d = {} + for column in table.columns: + d[column] = Column( + name=column, + type=random.choice(list(ColumnType)), + nullable=random.random() < 0.1, # Fixed 10% chance for now + ) + table.column_mapping = d + return d + + def _sample_list(lst: List[T], dist: NormalDistribution, floor: int = 1) -> List[T]: return random.sample(lst, min(dist.sample_with_floor(floor), len(lst))) diff --git a/metadata-ingestion/tests/performance/data_model.py b/metadata-ingestion/tests/performance/data_model.py index c593e69ceb9a7..9425fa827070e 100644 --- a/metadata-ingestion/tests/performance/data_model.py +++ b/metadata-ingestion/tests/performance/data_model.py @@ -1,10 +1,10 @@ from dataclasses import dataclass from datetime import datetime -from typing import List, Optional +from enum import Enum +from typing import Dict, List, Optional from typing_extensions import Literal -Column = str StatementType = Literal[ # SELECT + values from OperationTypeClass "SELECT", "INSERT", @@ -21,13 +21,36 @@ @dataclass class Container: name: str + parent: Optional["Container"] = None + + +class ColumnType(str, Enum): + # Can add types that take parameters in the future + + INTEGER = "INTEGER" + FLOAT = "FLOAT" # Double precision (64 bit) + STRING = "STRING" + BOOLEAN = "BOOLEAN" + DATETIME = "DATETIME" + + +@dataclass +class Column: + name: str + type: ColumnType + nullable: bool + + +ColumnRef = str 
+ColumnMapping = Dict[ColumnRef, Column] @dataclass class Table: name: str container: Container - columns: List[Column] + columns: List[ColumnRef] + column_mapping: Optional[ColumnMapping] def is_view(self) -> bool: return False @@ -44,7 +67,7 @@ def is_view(self) -> bool: @dataclass class FieldAccess: - column: Column + column: ColumnRef table: Table diff --git a/metadata-ingestion/tests/performance/databricks/__init__.py b/metadata-ingestion/tests/performance/databricks/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/metadata-ingestion/tests/performance/databricks/test_unity.py b/metadata-ingestion/tests/performance/databricks/test_unity.py new file mode 100644 index 0000000000000..cc9558f0692ed --- /dev/null +++ b/metadata-ingestion/tests/performance/databricks/test_unity.py @@ -0,0 +1,71 @@ +import logging +import os +from unittest.mock import patch + +import humanfriendly +import psutil + +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.source.unity.config import UnityCatalogSourceConfig +from datahub.ingestion.source.unity.source import UnityCatalogSource +from datahub.utilities.perf_timer import PerfTimer +from tests.performance.data_generation import ( + NormalDistribution, + generate_data, + generate_queries, +) +from tests.performance.databricks.unity_proxy_mock import UnityCatalogApiProxyMock +from tests.performance.helpers import workunit_sink + + +def run_test(): + seed_metadata = generate_data( + num_containers=[1, 100, 5000], + num_tables=50000, + num_views=10000, + columns_per_table=NormalDistribution(100, 50), + parents_per_view=NormalDistribution(5, 5), + view_definition_length=NormalDistribution(1000, 300), + ) + queries = generate_queries( + seed_metadata, + num_selects=100000, + num_operations=100000, + num_unique_queries=10000, + num_users=1000, + ) + proxy_mock = UnityCatalogApiProxyMock( + seed_metadata, queries=queries, num_service_principals=10000 + ) + print("Data 
generated") + + config = UnityCatalogSourceConfig( + token="", workspace_url="http://localhost:1234", include_usage_statistics=False + ) + ctx = PipelineContext(run_id="test") + with patch( + "datahub.ingestion.source.unity.source.UnityCatalogApiProxy", + lambda *args, **kwargs: proxy_mock, + ): + source: UnityCatalogSource = UnityCatalogSource(ctx, config) + + pre_mem_usage = psutil.Process(os.getpid()).memory_info().rss + print(f"Test data size: {humanfriendly.format_size(pre_mem_usage)}") + + with PerfTimer() as timer: + workunits = source.get_workunits() + num_workunits, peak_memory_usage = workunit_sink(workunits) + print(f"Workunits Generated: {num_workunits}") + print(f"Seconds Elapsed: {timer.elapsed_seconds():.2f} seconds") + + print( + f"Peak Memory Used: {humanfriendly.format_size(peak_memory_usage - pre_mem_usage)}" + ) + print(source.report.aspects) + + +if __name__ == "__main__": + root_logger = logging.getLogger() + root_logger.setLevel(logging.INFO) + root_logger.addHandler(logging.StreamHandler()) + run_test() diff --git a/metadata-ingestion/tests/performance/databricks/unity_proxy_mock.py b/metadata-ingestion/tests/performance/databricks/unity_proxy_mock.py new file mode 100644 index 0000000000000..593163e12bf0a --- /dev/null +++ b/metadata-ingestion/tests/performance/databricks/unity_proxy_mock.py @@ -0,0 +1,183 @@ +import uuid +from collections import defaultdict +from datetime import datetime, timezone +from typing import Dict, Iterable, List + +from databricks.sdk.service.catalog import ColumnTypeName +from databricks.sdk.service.sql import QueryStatementType + +from datahub.ingestion.source.unity.proxy_types import ( + Catalog, + CatalogType, + Column, + Metastore, + Query, + Schema, + ServicePrincipal, + Table, + TableType, +) +from tests.performance import data_model +from tests.performance.data_generation import SeedMetadata +from tests.performance.data_model import ColumnType, StatementType + + +class UnityCatalogApiProxyMock: + """Mimics 
UnityCatalogApiProxy for performance testing.""" + + def __init__( + self, + seed_metadata: SeedMetadata, + queries: Iterable[data_model.Query] = (), + num_service_principals: int = 0, + ) -> None: + self.seed_metadata = seed_metadata + self.queries = queries + self.num_service_principals = num_service_principals + self.warehouse_id = "invalid-warehouse-id" + + # Cache for performance + self._schema_to_table: Dict[str, List[data_model.Table]] = defaultdict(list) + for table in seed_metadata.all_tables: + self._schema_to_table[table.container.name].append(table) + + def check_basic_connectivity(self) -> bool: + return True + + def assigned_metastore(self) -> Metastore: + container = self.seed_metadata.containers[0][0] + return Metastore( + id=container.name, + name=container.name, + global_metastore_id=container.name, + metastore_id=container.name, + comment=None, + owner=None, + cloud=None, + region=None, + ) + + def catalogs(self, metastore: Metastore) -> Iterable[Catalog]: + for container in self.seed_metadata.containers[1]: + if not container.parent or metastore.name != container.parent.name: + continue + + yield Catalog( + id=f"{metastore.id}.{container.name}", + name=container.name, + metastore=metastore, + comment=None, + owner=None, + type=CatalogType.MANAGED_CATALOG, + ) + + def schemas(self, catalog: Catalog) -> Iterable[Schema]: + for container in self.seed_metadata.containers[2]: + # Assumes all catalog names are unique + if not container.parent or catalog.name != container.parent.name: + continue + + yield Schema( + id=f"{catalog.id}.{container.name}", + name=container.name, + catalog=catalog, + comment=None, + owner=None, + ) + + def tables(self, schema: Schema) -> Iterable[Table]: + for table in self._schema_to_table[schema.name]: + columns = [] + if table.column_mapping: + for i, col_name in enumerate(table.columns): + column = table.column_mapping[col_name] + columns.append( + Column( + id=column.name, + name=column.name, + 
type_name=self._convert_column_type(column.type), + type_text=column.type.value, + nullable=column.nullable, + position=i, + comment=None, + type_precision=0, + type_scale=0, + ) + ) + + yield Table( + id=f"{schema.id}.{table.name}", + name=table.name, + schema=schema, + table_type=TableType.VIEW if table.is_view() else TableType.MANAGED, + columns=columns, + created_at=datetime.now(tz=timezone.utc), + comment=None, + owner=None, + storage_location=None, + data_source_format=None, + generation=None, + created_by="", + updated_at=None, + updated_by=None, + table_id="", + view_definition=table.definition + if isinstance(table, data_model.View) + else None, + properties={}, + ) + + def service_principals(self) -> Iterable[ServicePrincipal]: + for i in range(self.num_service_principals): + yield ServicePrincipal( + id=str(i), + application_id=str(uuid.uuid4()), + display_name=f"user-{i}", + active=True, + ) + + def query_history( + self, + start_time: datetime, + end_time: datetime, + ) -> Iterable[Query]: + for i, query in enumerate(self.queries): + yield Query( + query_id=str(i), + query_text=query.text, + statement_type=self._convert_statement_type(query.type), + start_time=query.timestamp, + end_time=query.timestamp, + user_id=hash(query.actor), + user_name=query.actor, + executed_as_user_id=hash(query.actor), + executed_as_user_name=None, + ) + + def table_lineage(self, table: Table) -> None: + pass + + def get_column_lineage(self, table: Table) -> None: + pass + + @staticmethod + def _convert_column_type(t: ColumnType) -> ColumnTypeName: + if t == ColumnType.INTEGER: + return ColumnTypeName.INT + elif t == ColumnType.FLOAT: + return ColumnTypeName.DOUBLE + elif t == ColumnType.STRING: + return ColumnTypeName.STRING + elif t == ColumnType.BOOLEAN: + return ColumnTypeName.BOOLEAN + elif t == ColumnType.DATETIME: + return ColumnTypeName.TIMESTAMP + else: + raise ValueError(f"Unknown column type: {t}") + + @staticmethod + def _convert_statement_type(t: StatementType) 
-> QueryStatementType: + if t == "CUSTOM" or t == "UNKNOWN": + return QueryStatementType.OTHER + else: + return QueryStatementType[t] diff --git a/metadata-ingestion/tests/performance/helpers.py b/metadata-ingestion/tests/performance/helpers.py new file mode 100644 index 0000000000000..eb98e53670c96 --- /dev/null +++ b/metadata-ingestion/tests/performance/helpers.py @@ -0,0 +1,21 @@ +import os +from typing import Iterable, Tuple + +import psutil + +from datahub.ingestion.api.workunit import MetadataWorkUnit + + +def workunit_sink(workunits: Iterable[MetadataWorkUnit]) -> Tuple[int, int]: + peak_memory_usage = psutil.Process(os.getpid()).memory_info().rss + i: int = 0 + for i, wu in enumerate(workunits): + if i % 10_000 == 0: + peak_memory_usage = max( + peak_memory_usage, psutil.Process(os.getpid()).memory_info().rss + ) + peak_memory_usage = max( + peak_memory_usage, psutil.Process(os.getpid()).memory_info().rss + ) + + return i, peak_memory_usage diff --git a/metadata-ingestion/tests/test_helpers/docker_helpers.py b/metadata-ingestion/tests/test_helpers/docker_helpers.py index f0db2d91e362c..30157c3a78094 100644 --- a/metadata-ingestion/tests/test_helpers/docker_helpers.py +++ b/metadata-ingestion/tests/test_helpers/docker_helpers.py @@ -73,3 +73,26 @@ def run( yield docker_services return run + + +def cleanup_image(image_name: str) -> None: + assert ":" not in image_name, "image_name should not contain a tag" + + images_proc = subprocess.run( + f"docker image ls --filter 'reference={image_name}*' -q", + shell=True, + capture_output=True, + text=True, + check=True, + ) + + if not images_proc.stdout: + logger.debug(f"No images to cleanup for {image_name}") + return + + image_ids = images_proc.stdout.splitlines() + subprocess.run( + f"docker image rm {' '.join(image_ids)}", + shell=True, + check=True, + ) diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_table_ddl.json 
b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_table_ddl.json new file mode 100644 index 0000000000000..4773974545bfa --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_table_ddl.json @@ -0,0 +1,8 @@ +{ + "query_type": "CREATE", + "in_tables": [], + "out_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,costs,PROD)" + ], + "column_lineage": null +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py index 483c1ac4cc7f9..2a965a9bb1e61 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py @@ -274,6 +274,21 @@ def test_expand_select_star_basic(): ) +def test_create_table_ddl(): + assert_sql_result( + """ +CREATE TABLE IF NOT EXISTS costs ( + id INTEGER PRIMARY KEY, + month TEXT NOT NULL, + total_cost REAL NOT NULL, + area REAL NOT NULL +) +""", + dialect="sqlite", + expected_file=RESOURCE_DIR / "test_create_table_ddl.json", + ) + + def test_snowflake_column_normalization(): # Technically speaking this is incorrect since the column names are different and both quoted. 
diff --git a/metadata-ingestion/tests/unit/test_bigquery_source.py b/metadata-ingestion/tests/unit/test_bigquery_source.py index 4fc6c31626ba8..e9e91361f49f4 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_source.py +++ b/metadata-ingestion/tests/unit/test_bigquery_source.py @@ -3,13 +3,14 @@ import os from datetime import datetime, timedelta, timezone from types import SimpleNamespace -from typing import Any, Dict, Optional, cast +from typing import Any, Dict, List, Optional, cast from unittest.mock import MagicMock, Mock, patch import pytest from google.api_core.exceptions import GoogleAPICallError from google.cloud.bigquery.table import Row, TableListItem +from datahub.configuration.common import AllowDenyPattern from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.bigquery_v2.bigquery import BigqueryV2Source from datahub.ingestion.source.bigquery_v2.bigquery_audit import ( @@ -17,9 +18,13 @@ BigqueryTableIdentifier, BigQueryTableRef, ) -from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config +from datahub.ingestion.source.bigquery_v2.bigquery_config import ( + BigQueryConnectionConfig, + BigQueryV2Config, +) from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( + BigqueryDataset, BigqueryProject, BigQuerySchemaApi, BigqueryView, @@ -854,3 +859,47 @@ def test_get_table_name(full_table_name: str, datahub_full_table_name: str) -> N BigqueryTableIdentifier.from_string_name(full_table_name).get_table_name() == datahub_full_table_name ) + + +def test_default_config_for_excluding_projects_and_datasets(): + config = BigQueryV2Config.parse_obj({}) + assert config.exclude_empty_projects is False + config = BigQueryV2Config.parse_obj({"exclude_empty_projects": True}) + assert config.exclude_empty_projects + + +@patch.object(BigQueryConnectionConfig, "get_bigquery_client", new=lambda self: None) 
+@patch.object(BigQuerySchemaApi, "get_datasets_for_project_id") +def test_excluding_empty_projects_from_ingestion( + get_datasets_for_project_id_mock, +): + project_id_with_datasets = "project-id-with-datasets" + project_id_without_datasets = "project-id-without-datasets" + + def get_datasets_for_project_id_side_effect( + project_id: str, + ) -> List[BigqueryDataset]: + return ( + [] + if project_id == project_id_without_datasets + else [BigqueryDataset("some-dataset")] + ) + + get_datasets_for_project_id_mock.side_effect = ( + get_datasets_for_project_id_side_effect + ) + + base_config = { + "project_ids": [project_id_with_datasets, project_id_without_datasets], + "schema_pattern": AllowDenyPattern(deny=[".*"]), + "include_usage_statistics": False, + "include_table_lineage": False, + } + + config = BigQueryV2Config.parse_obj(base_config) + source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test-1")) + assert len({wu.metadata.entityUrn for wu in source.get_workunits()}) == 2 # type: ignore + + config = BigQueryV2Config.parse_obj({**base_config, "exclude_empty_projects": True}) + source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test-2")) + assert len({wu.metadata.entityUrn for wu in source.get_workunits()}) == 1 # type: ignore diff --git a/metadata-ingestion/tests/unit/test_bigquery_usage.py b/metadata-ingestion/tests/unit/test_bigquery_usage.py index e06c6fb3fe7e5..1eb5d8b00e27c 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_usage.py +++ b/metadata-ingestion/tests/unit/test_bigquery_usage.py @@ -35,7 +35,7 @@ TimeWindowSizeClass, ) from datahub.testing.compare_metadata_json import diff_metadata_json -from tests.performance.bigquery import generate_events, ref_from_table +from tests.performance.bigquery.bigquery_events import generate_events, ref_from_table from tests.performance.data_generation import generate_data, generate_queries from tests.performance.data_model import Container, FieldAccess, Query, Table, View 
@@ -45,14 +45,15 @@ ACTOR_2, ACTOR_2_URN = "b@acryl.io", "urn:li:corpuser:b" DATABASE_1 = Container("database_1") DATABASE_2 = Container("database_2") -TABLE_1 = Table("table_1", DATABASE_1, ["id", "name", "age"]) -TABLE_2 = Table("table_2", DATABASE_1, ["id", "table_1_id", "value"]) +TABLE_1 = Table("table_1", DATABASE_1, ["id", "name", "age"], None) +TABLE_2 = Table("table_2", DATABASE_1, ["id", "table_1_id", "value"], None) VIEW_1 = View( name="view_1", container=DATABASE_1, columns=["id", "name", "total"], definition="VIEW DEFINITION 1", parents=[TABLE_1, TABLE_2], + column_mapping=None, ) ALL_TABLES = [TABLE_1, TABLE_2, VIEW_1] diff --git a/metadata-ingestion/tests/unit/test_mapping.py b/metadata-ingestion/tests/unit/test_mapping.py index d69dd4a8a96b0..5c258f16535f8 100644 --- a/metadata-ingestion/tests/unit/test_mapping.py +++ b/metadata-ingestion/tests/unit/test_mapping.py @@ -4,6 +4,7 @@ from datahub.metadata.schema_classes import ( GlobalTagsClass, GlossaryTermsClass, + InstitutionalMemoryClass, OwnerClass, OwnershipClass, OwnershipSourceTypeClass, @@ -233,6 +234,46 @@ def test_operation_processor_advanced_matching_tags(): assert tag_aspect.tags[0].tag == "urn:li:tag:case_4567" +def test_operation_processor_institutional_memory(): + raw_props = { + "documentation_link": "https://test.com/documentation#ignore-this", + } + processor = OperationProcessor( + operation_defs={ + "documentation_link": { + "match": r"(?:https?)?\:\/\/\w*[^#]*", + "operation": "add_doc_link", + "config": {"link": "{{ $match }}", "description": "test"}, + }, + }, + ) + aspect_map = processor.process(raw_props) + assert "add_doc_link" in aspect_map + + doc_link_aspect: InstitutionalMemoryClass = aspect_map["add_doc_link"] + + assert doc_link_aspect.elements[0].url == "https://test.com/documentation" + assert doc_link_aspect.elements[0].description == "test" + + +def test_operation_processor_institutional_memory_no_description(): + raw_props = { + "documentation_link": 
"test.com/documentation#ignore-this", + } + processor = OperationProcessor( + operation_defs={ + "documentation_link": { + "match": r"(?:https?)?\:\/\/\w*[^#]*", + "operation": "add_doc_link", + "config": {"link": "{{ $match }}"}, + }, + }, + ) + # we require a description, so this should stay empty + aspect_map = processor.process(raw_props) + assert aspect_map == {} + + def test_operation_processor_matching_nested_props(): raw_props = { "gdpr": { diff --git a/metadata-ingestion/tests/unit/test_mcp_builder.py b/metadata-ingestion/tests/unit/test_mcp_builder.py index 23f2bddc2084e..561b782ef9e46 100644 --- a/metadata-ingestion/tests/unit/test_mcp_builder.py +++ b/metadata-ingestion/tests/unit/test_mcp_builder.py @@ -1,5 +1,4 @@ import datahub.emitter.mcp_builder as builder -from datahub.emitter.mce_builder import datahub_guid def test_guid_generator(): @@ -80,7 +79,7 @@ def test_guid_generators(): key = builder.SchemaKey( database="test", schema="Test", platform="mysql", instance="TestInstance" ) - guid_datahub = datahub_guid(key.dict(by_alias=True)) + guid_datahub = key.guid() guid = key.guid() assert guid == guid_datahub diff --git a/metadata-ingestion/tests/unit/test_pydantic_validators.py b/metadata-ingestion/tests/unit/test_pydantic_validators.py index 07d86043a35bf..3e9ec6cbaf357 100644 --- a/metadata-ingestion/tests/unit/test_pydantic_validators.py +++ b/metadata-ingestion/tests/unit/test_pydantic_validators.py @@ -4,7 +4,7 @@ from pydantic import ValidationError from datahub.configuration.common import ConfigModel -from datahub.configuration.pydantic_field_deprecation import pydantic_field_deprecated +from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated from datahub.configuration.validate_field_removal import pydantic_removed_field from datahub.configuration.validate_field_rename import pydantic_renamed_field from datahub.utilities.global_warning_util import get_global_warnings diff --git 
a/metadata-ingestion/tests/unit/test_sql_common.py b/metadata-ingestion/tests/unit/test_sql_common.py index 95af0e623e991..808b38192411d 100644 --- a/metadata-ingestion/tests/unit/test_sql_common.py +++ b/metadata-ingestion/tests/unit/test_sql_common.py @@ -4,12 +4,11 @@ import pytest from sqlalchemy.engine.reflection import Inspector -from datahub.ingestion.source.sql.sql_common import ( - PipelineContext, - SQLAlchemySource, +from datahub.ingestion.source.sql.sql_common import PipelineContext, SQLAlchemySource +from datahub.ingestion.source.sql.sql_config import SQLCommonConfig +from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import ( get_platform_from_sqlalchemy_uri, ) -from datahub.ingestion.source.sql.sql_config import SQLCommonConfig class _TestSQLAlchemyConfig(SQLCommonConfig): diff --git a/metadata-ingestion/tests/unit/test_transform_dataset.py b/metadata-ingestion/tests/unit/test_transform_dataset.py index 8b2535eea1fe9..bc95451620d22 100644 --- a/metadata-ingestion/tests/unit/test_transform_dataset.py +++ b/metadata-ingestion/tests/unit/test_transform_dataset.py @@ -62,6 +62,9 @@ ) from datahub.ingestion.transformer.dataset_transformer import DatasetTransformer from datahub.ingestion.transformer.extract_dataset_tags import ExtractDatasetTags +from datahub.ingestion.transformer.extract_ownership_from_tags import ( + ExtractOwnersFromTagsTransformer, +) from datahub.ingestion.transformer.mark_dataset_status import MarkDatasetStatus from datahub.ingestion.transformer.remove_dataset_ownership import ( SimpleRemoveDatasetOwnership, @@ -72,6 +75,7 @@ GlobalTagsClass, MetadataChangeEventClass, OwnershipClass, + OwnershipTypeClass, StatusClass, TagAssociationClass, ) @@ -586,6 +590,91 @@ def test_mark_status_dataset(tmp_path): ) +def test_extract_owners_from_tags(): + def _test_owner( + tag: str, + config: Dict, + expected_owner: str, + expected_owner_type: Optional[str] = None, + ) -> None: + dataset = make_generic_dataset( + aspects=[ + 
models.GlobalTagsClass( + tags=[TagAssociationClass(tag=builder.make_tag_urn(tag))] + ) + ] + ) + transformer = ExtractOwnersFromTagsTransformer.create( + config, + PipelineContext(run_id="test"), + ) + transformed = list( + transformer.transform( + [ + RecordEnvelope(dataset, metadata={}), + ] + ) + ) + owners_aspect = transformed[0].record.proposedSnapshot.aspects[0] + owners = owners_aspect.owners + owner = owners[0] + if expected_owner_type is not None: + assert owner.type == expected_owner_type + assert owner.owner == expected_owner + + _test_owner( + tag="owner:foo", + config={ + "tag_prefix": "owner:", + }, + expected_owner="urn:li:corpuser:foo", + ) + _test_owner( + tag="abcdef-owner:foo", + config={ + "tag_prefix": ".*owner:", + }, + expected_owner="urn:li:corpuser:foo", + ) + _test_owner( + tag="owner:foo", + config={ + "tag_prefix": "owner:", + "is_user": False, + }, + expected_owner="urn:li:corpGroup:foo", + ) + _test_owner( + tag="owner:foo", + config={ + "tag_prefix": "owner:", + "email_domain": "example.com", + }, + expected_owner="urn:li:corpuser:foo@example.com", + ) + _test_owner( + tag="owner:foo", + config={ + "tag_prefix": "owner:", + "email_domain": "example.com", + "owner_type": "TECHNICAL_OWNER", + }, + expected_owner="urn:li:corpuser:foo@example.com", + expected_owner_type=OwnershipTypeClass.TECHNICAL_OWNER, + ) + _test_owner( + tag="owner:foo", + config={ + "tag_prefix": "owner:", + "email_domain": "example.com", + "owner_type": "AUTHOR", + "owner_type_urn": "urn:li:ownershipType:ad8557d6-dcb9-4d2a-83fc-b7d0d54f3e0f", + }, + expected_owner="urn:li:corpuser:foo@example.com", + expected_owner_type=OwnershipTypeClass.CUSTOM, + ) + + def test_add_dataset_browse_paths(): dataset = make_generic_dataset() diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionAction.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionAction.pdl new file mode 100644 index 0000000000000..df6620b66bfd8 --- /dev/null +++ 
b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionAction.pdl @@ -0,0 +1,22 @@ +namespace com.linkedin.assertion + +/** + * The Actions about an Assertion. + * In the future, we'll likely extend this model to support additional + * parameters or options related to the assertion actions. + */ +record AssertionAction { + /** + * The type of the Action + */ + type: enum AssertionActionType { + /** + * Raise an incident. + */ + RAISE_INCIDENT + /** + * Resolve open incidents related to the assertion. + */ + RESOLVE_INCIDENT + } +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionActions.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionActions.pdl new file mode 100644 index 0000000000000..61846c1ba9c12 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionActions.pdl @@ -0,0 +1,18 @@ +namespace com.linkedin.assertion + +/** + * The Actions about an Assertion + */ +@Aspect = { + "name": "assertionActions" +} +record AssertionActions { + /** + * Actions to be executed on successful assertion run. + */ + onSuccess: array[AssertionAction] = [] + /** + * Actions to be executed on failed assertion run. + */ + onFailure: array[AssertionAction] = [] +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionInfo.pdl index 77ee147a781e2..ae2a58028057b 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionInfo.pdl @@ -13,13 +13,58 @@ record AssertionInfo includes CustomProperties, ExternalReference { /** * Type of assertion. Assertion types can evolve to span Datasets, Flows (Pipelines), Models, Features etc. */ + @Searchable = { } type: enum AssertionType { - // A single-dataset assertion. 
When this is the value, the datasetAssertion field will be populated. + /** + * A single-dataset assertion. When this is the value, the datasetAssertion field will be populated. + */ DATASET + + /** + * A freshness assertion, or an assertion which indicates when a particular operation should occur + * to an asset. + */ + FRESHNESS + + /** + * A volume assertion, or an assertion which indicates how much data should be available for a + * particular asset. + */ + VOLUME + + /** + * A schema or structural assertion. + * + * Would have named this SCHEMA but the codegen for PDL does not allow this (reserved word). + */ + DATA_SCHEMA } /** - * Dataset Assertion information when type is DATASET + * A Dataset Assertion definition. This field is populated when the type is DATASET. */ datasetAssertion: optional DatasetAssertionInfo + + /** + * A Freshness Assertion definition. This field is populated when the type is FRESHNESS. + */ + freshnessAssertion: optional FreshnessAssertionInfo + + /** + * A Volume Assertion definition. This field is populated when the type is VOLUME. + */ + volumeAssertion: optional VolumeAssertionInfo + + /** + * A schema Assertion definition. This field is populated when the type is DATA_SCHEMA. + */ + schemaAssertion: optional SchemaAssertionInfo + + /** + * The source or origin of the Assertion definition. + * + * If the source type of the Assertion is EXTERNAL, it is expected to have a corresponding dataPlatformInstance aspect detailing + * the platform where it was ingested from. 
+ */ + source: optional AssertionSource } \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResult.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResult.pdl index decbfc08263de..ded84e1969153 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResult.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResult.pdl @@ -5,10 +5,15 @@ namespace com.linkedin.assertion */ record AssertionResult { /** - * The final result, e.g. either SUCCESS or FAILURE. + * The final result, e.g. either SUCCESS, FAILURE, or ERROR. */ @TimeseriesField = {} + @Searchable = {} type: enum AssertionResultType { + /** + * The Assertion has not yet been fully evaluated + */ + INIT /** * The Assertion Succeeded */ @@ -17,6 +22,10 @@ record AssertionResult { * The Assertion Failed */ FAILURE + /** + * The Assertion encountered an Error + */ + ERROR } /** @@ -45,8 +54,13 @@ record AssertionResult { nativeResults: optional map[string, string] /** - * URL where full results are available + * External URL where full results are available. Only present when assertion source is not native. 
*/ externalUrl: optional string + /** + * The error object if AssertionResultType is an Error + */ + error: optional AssertionResultError + } \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResultError.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResultError.pdl new file mode 100644 index 0000000000000..e768fe8521942 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResultError.pdl @@ -0,0 +1,45 @@ +namespace com.linkedin.assertion + +/** + * An error encountered when evaluating an AssertionResult + */ +record AssertionResultError { + /** + * The type of error encountered + */ + type: enum AssertionResultErrorType { + /** + * Source is unreachable + */ + SOURCE_CONNECTION_ERROR + /** + * Source query failed to execute + */ + SOURCE_QUERY_FAILED + /** + * Insufficient data to evaluate the assertion + */ + INSUFFICIENT_DATA + /** + * Invalid parameters were detected + */ + INVALID_PARAMETERS + /** + * Event type not supported by the specified source + */ + INVALID_SOURCE_TYPE + /** + * Unsupported platform + */ + UNSUPPORTED_PLATFORM + /** + * Unknown error + */ + UNKNOWN_ERROR + } + + /** + * Additional metadata depending on the type of error + */ + properties: optional map[string, string] +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionRunEvent.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionRunEvent.pdl index 9e75f96fafd06..14f1204232740 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionRunEvent.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionRunEvent.pdl @@ -1,6 +1,7 @@ namespace com.linkedin.assertion -import com.linkedin.timeseries.TimeseriesAspectBase +import com.linkedin.timeseries.PartitionSpec +import com.linkedin.timeseries.TimeWindowSize import com.linkedin.common.ExternalReference import 
com.linkedin.common.Urn @@ -12,36 +13,31 @@ import com.linkedin.common.Urn "name": "assertionRunEvent", "type": "timeseries", } -record AssertionRunEvent includes TimeseriesAspectBase { +record AssertionRunEvent { + + /** + * The event timestamp field as epoch at UTC in milli seconds. + */ + @Searchable = { + "fieldName": "lastCompletedTime", + "fieldType": "DATETIME" + } + timestampMillis: long /** * Native (platform-specific) identifier for this run */ - //Multiple assertions could occur in same evaluator run runId: string - /* - * Urn of assertion which is evaluated - */ - @TimeseriesField = {} - assertionUrn: Urn - /* * Urn of entity on which the assertion is applicable */ - //example - dataset urn, if dataset is being asserted @TimeseriesField = {} asserteeUrn: Urn - - /** - * Specification of the batch which this run is evaluating - */ - batchSpec: optional BatchSpec /** * The status of the assertion run as per this timeseries event. */ - // Currently just supports COMPLETE, but should evolve to support other statuses like STARTED, RUNNING, etc. @TimeseriesField = {} status: enum AssertionRunStatus { /** @@ -59,4 +55,33 @@ record AssertionRunEvent includes TimeseriesAspectBase { * Runtime parameters of evaluation */ runtimeContext: optional map[string, string] + + /** + * Specification of the batch which this run is evaluating + */ + batchSpec: optional BatchSpec + + /* + * Urn of assertion which is evaluated + */ + @TimeseriesField = {} + assertionUrn: Urn + + /** + * Granularity of the event if applicable + */ + eventGranularity: optional TimeWindowSize + + /** + * The optional partition specification. + */ + partitionSpec: optional PartitionSpec = { + "type":"FULL_TABLE", + "partition":"FULL_TABLE_SNAPSHOT" + } + + /** + * The optional messageId, if provided serves as a custom user-defined unique identifier for an aspect value. 
+ */ + messageId: optional string } \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionSource.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionSource.pdl new file mode 100644 index 0000000000000..d8892c0c71c6f --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionSource.pdl @@ -0,0 +1,27 @@ +namespace com.linkedin.assertion + +/** + * The source of an assertion + */ +record AssertionSource { + /** + * The type of the Assertion Source + */ + @Searchable = { + "fieldName": "sourceType" + } + type: enum AssertionSourceType { + /** + * The assertion was defined natively on DataHub by a user. + */ + NATIVE + /** + * The assertion was defined and managed externally of DataHub. + */ + EXTERNAL + /** + * The assertion was inferred, e.g. from offline AI / ML models. + */ + INFERRED + } +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionStdAggregation.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionStdAggregation.pdl index b79b96f9379b0..968944165a1c8 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionStdAggregation.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionStdAggregation.pdl @@ -4,6 +4,7 @@ namespace com.linkedin.assertion * The function that is applied to the aggregation input (schema, rows, column values) before evaluating an operator. */ enum AssertionStdAggregation { + /** * Assertion is applied on number of rows. */ @@ -20,7 +21,7 @@ enum AssertionStdAggregation { COLUMN_COUNT /** - * Assertion is applied on individual column value. + * Assertion is applied on individual column value. 
(No aggregation) */ IDENTITY @@ -42,6 +43,13 @@ enum AssertionStdAggregation { /** * Assertion is applied on proportion of distinct values in column */ + UNIQUE_PROPORTION + + /** + * Assertion is applied on proportion of distinct values in column + * + * Deprecated! Use UNIQUE_PROPORTION instead. + */ UNIQUE_PROPOTION /** diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionValueChangeType.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionValueChangeType.pdl new file mode 100644 index 0000000000000..5a1ff4fa73ffb --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionValueChangeType.pdl @@ -0,0 +1,16 @@ +namespace com.linkedin.assertion + +/** +* An enum to represent a type of change in an assertion value, metric, or measurement. +*/ +enum AssertionValueChangeType { + /** + * A change that is defined in absolute terms. + */ + ABSOLUTE + /** + * A change that is defined in relative terms using percentage change + * from the original value. + */ + PERCENTAGE +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AuditLogSpec.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AuditLogSpec.pdl new file mode 100644 index 0000000000000..4d5bf261cbf89 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AuditLogSpec.pdl @@ -0,0 +1,18 @@ +namespace com.linkedin.assertion + +import com.linkedin.schema.SchemaFieldDataType + +/** +* Information about the Audit Log operation to use in evaluating an assertion. +**/ +record AuditLogSpec { + /** + * The list of operation types that should be monitored. If not provided, a default set will be used. + */ + operationTypes: optional array [string] + + /** + * Optional: The user name associated with the operation. 
+ */ + userName: optional string +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/DatasetAssertionInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/DatasetAssertionInfo.pdl index c411c7ff8a572..2a8bf28f1ff11 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/assertion/DatasetAssertionInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/DatasetAssertionInfo.pdl @@ -18,9 +18,10 @@ record DatasetAssertionInfo { /** * Scope of the Assertion. What part of the dataset does this assertion apply to? **/ + @Searchable = {} scope: enum DatasetAssertionScope { /** - * This assertion applies to dataset columns + * This assertion applies to dataset column(s) */ DATASET_COLUMN @@ -29,6 +30,11 @@ record DatasetAssertionInfo { */ DATASET_ROWS + /** + * This assertion applies to the storage size of the dataset + */ + DATASET_STORAGE_SIZE + /** * This assertion applies to the schema of the dataset */ @@ -41,7 +47,9 @@ record DatasetAssertionInfo { } /** - * One or more dataset schema fields that are targeted by this assertion + * One or more dataset schema fields that are targeted by this assertion. + * + * This field is expected to be provided if the assertion scope is DATASET_COLUMN. */ @Relationship = { "/*": { @@ -49,11 +57,18 @@ record DatasetAssertionInfo { "entityTypes": [ "schemaField" ] } } + @Searchable = { + "/*": { + "fieldType": "URN" + } + } fields: optional array[Urn] /** * Standardized assertion operator + * This field is left blank if there is no selected aggregation or metric for a particular column. 
*/ + @Searchable = {} aggregation: optional AssertionStdAggregation /** diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/FixedIntervalSchedule.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/FixedIntervalSchedule.pdl new file mode 100644 index 0000000000000..c08c33ffb92d3 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/FixedIntervalSchedule.pdl @@ -0,0 +1,10 @@ +namespace com.linkedin.assertion + +import com.linkedin.common.Urn +import com.linkedin.timeseries.TimeWindowSize + +/** +* Attributes defining a relative fixed interval SLA schedule. +*/ +record FixedIntervalSchedule includes TimeWindowSize { +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessAssertionInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessAssertionInfo.pdl new file mode 100644 index 0000000000000..4445a11ff40a7 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessAssertionInfo.pdl @@ -0,0 +1,53 @@ +namespace com.linkedin.assertion + +import com.linkedin.common.Urn +import com.linkedin.dataset.DatasetFilter + +/** +* Attributes defining a Freshness Assertion. +**/ +record FreshnessAssertionInfo { + /** + * The type of the freshness assertion being monitored. + */ + @Searchable = {} + type: enum FreshnessAssertionType { + /** + * An Freshness based on Operations performed on a particular Dataset (insert, update, delete, etc) and sourced from an audit log, as + * opposed to based on the highest watermark in a timestamp column (e.g. a query). Only valid when entity is of type "dataset". + */ + DATASET_CHANGE + /** + * An Freshness based on a successful execution of a Data Job. + */ + DATA_JOB_RUN + } + + /** + * The entity targeted by this Freshness check. 
+ */ + @Searchable = { + "fieldType": "URN" + } + @Relationship = { + "name": "Asserts", + "entityTypes": [ "dataset", "dataJob" ] + } + entity: Urn + + /** + * Produce FAILURE Assertion Result if the asset is not updated on the cadence and within the time range described by the schedule. + */ + @Searchable = { + "/type": { + "fieldName": "scheduleType" + } + } + schedule: FreshnessAssertionSchedule + + /** + * A definition of the specific filters that should be applied, when performing monitoring. + * If not provided, there is no filter, and the full table is under consideration. + */ + filter: optional DatasetFilter +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessAssertionSchedule.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessAssertionSchedule.pdl new file mode 100644 index 0000000000000..a87342ad4f5ed --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessAssertionSchedule.pdl @@ -0,0 +1,66 @@ +namespace com.linkedin.assertion + +import com.linkedin.common.Urn + +/** +* Attributes defining a single Freshness schedule. +*/ +record FreshnessAssertionSchedule { + + /** + * The type of a Freshness Assertion Schedule. + * + * Once we support data-time-relative schedules (e.g. schedules relative to time partitions), + * we will add those schedule types here. + */ + type: enum FreshnessAssertionScheduleType { + /** + * An highly configurable recurring schedule which describes the times of events described + * by a CRON schedule, with the evaluation schedule assuming to be matching the cron schedule. + * + * In a CRON schedule type, we compute the look-back window to be the time between the last scheduled event + * and the current event (evaluation time). This means that the evaluation schedule must match exactly + * the schedule defined inside the cron schedule. 
+ * + * For example, a CRON schedule defined as "0 8 * * *" would represent a schedule of "every day by 8am". Assuming + * that the assertion evaluation schedule is defined to match this, the freshness assertion would be evaluated in the following way: + * + * 1. Compute the "last scheduled occurrence" of the event using the CRON schedule. For example, yesterday at 8am. + * 2. Compute the bounds of a time window between the "last scheduled occurrence" (yesterday at 8am) until the "current occurrence" (today at 8am) + * 3. Verify that the target event has occurred within the CRON-interval window. + * 4. If the target event has occurred within the time window, then the assertion passes. + * 5. If the target event has not occurred within the time window, then the assertion fails. + * + */ + CRON + /** + * A fixed interval which is used to compute a look-back window for use when evaluating the assertion relative + * to the Evaluation Time of the Assertion. + * + * To compute the valid look-back window, we subtract the fixed interval from the evaluation time. Then, we verify + * that the target event has occurred within that window. + * + * For example, a fixed interval of "24h" would represent a schedule of "in the last 24 hours". + * The 24 hour interval is relative to the evaluation time of the assertion. For example if we schedule the assertion + * to be evaluated each hour, we'd compute the result as follows: + * + * 1. Subtract the fixed interval from the current time (Evaluation time) to compute the bounds of a fixed look-back window. + * 2. Verify that the target event has occurred within the fixed look-back window. + * 3. If the target event has occurred within the time window, then the assertion passes. + * 4. If the target event has not occurred within the time window, then the assertion fails. + * + */ + FIXED_INTERVAL + } + + /** + * A cron schedule. This field is required when type is CRON. 
+ */ + cron: optional FreshnessCronSchedule + + /** + * A fixed interval schedule. This field is required when type is FIXED_INTERVAL. + */ + fixedInterval: optional FixedIntervalSchedule + +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessCronSchedule.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessCronSchedule.pdl new file mode 100644 index 0000000000000..d48900690c51d --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessCronSchedule.pdl @@ -0,0 +1,25 @@ +namespace com.linkedin.assertion + +/** +* Attributes defining a CRON-formatted schedule used for defining a freshness assertion. +*/ +record FreshnessCronSchedule { + /** + * A cron-formatted execution interval, as a cron string, e.g. 1 * * * * + */ + cron: string + + /** + * Timezone in which the cron interval applies, e.g. America/Los_Angeles + */ + timezone: string + + /** + * An optional offset in milliseconds to SUBTRACT from the timestamp generated by the cron schedule + * to generate the lower bounds of the "freshness window", or the window of time in which an event must have occurred in order for the Freshness check + * to be considered passing. + * + * If left empty, the start of the SLA window will be the _end_ of the previously evaluated Freshness window. + */ + windowStartOffsetMs: optional long +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessFieldKind.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessFieldKind.pdl new file mode 100644 index 0000000000000..7b25589e500da --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessFieldKind.pdl @@ -0,0 +1,17 @@ +namespace com.linkedin.assertion + +enum FreshnessFieldKind { + /** + * Determine that a change has occurred by inspecting a last modified field which + * represents the last time at which a row was changed. 
+ */ + LAST_MODIFIED, + /** + * Determine that a change has occurred by inspecting a field which should be tracked as the + * "high watermark" for the table. This should be an ascending number or date field. + * + * If rows with this column have not been added since the previous check + * then the Freshness Assertion will fail. + */ + HIGH_WATERMARK +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessFieldSpec.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessFieldSpec.pdl new file mode 100644 index 0000000000000..04acd1c71352d --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessFieldSpec.pdl @@ -0,0 +1,14 @@ +namespace com.linkedin.assertion + +import com.linkedin.schema.SchemaFieldSpec + + +/** +* Lightweight spec used for referencing a particular schema field. +**/ +record FreshnessFieldSpec includes SchemaFieldSpec { + /** + * The type of the field being used to verify the Freshness Assertion. + */ + kind: optional FreshnessFieldKind +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/IncrementingSegmentFieldTransformer.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/IncrementingSegmentFieldTransformer.pdl new file mode 100644 index 0000000000000..d1d3e7b23b666 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/IncrementingSegmentFieldTransformer.pdl @@ -0,0 +1,60 @@ +namespace com.linkedin.assertion + +/** +* The definition of the transformer function that should be applied to a given field / column value in a dataset +* in order to determine the segment or bucket that it belongs to, which in turn is used to evaluate +* volume assertions. +*/ +record IncrementingSegmentFieldTransformer { + /** + * A 'standard' transformer type. Note that not all source systems will support all operators. 
+ */ + type: enum IncrementingSegmentFieldTransformerType { + /** + * Rounds a timestamp (in milliseconds) down to the nearest minute. + */ + TIMESTAMP_MS_TO_MINUTE + + /** + * Rounds a timestamp (in milliseconds) down to the nearest hour. + */ + TIMESTAMP_MS_TO_HOUR + + /** + * Rounds a timestamp (in milliseconds) down to the start of the day. + */ + TIMESTAMP_MS_TO_DATE + + /** + * Rounds a timestamp (in milliseconds) down to the start of the month + */ + TIMESTAMP_MS_TO_MONTH + + /** + * Rounds a timestamp (in milliseconds) down to the start of the year + */ + TIMESTAMP_MS_TO_YEAR + + /** + * Rounds a numeric value down to the nearest integer. + */ + FLOOR + + /** + * Rounds a numeric value up to the nearest integer. + */ + CEILING + + /** + * A backdoor to provide a native operator type specific to a given source system like + * Snowflake, Redshift, BQ, etc. + */ + NATIVE + } + + /** + * The 'native' transformer type, useful as a back door if a custom operator is required. + * This field is required if the type is NATIVE. + */ + nativeType: optional string +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/IncrementingSegmentRowCountChange.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/IncrementingSegmentRowCountChange.pdl new file mode 100644 index 0000000000000..7c4c73f2ea887 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/IncrementingSegmentRowCountChange.pdl @@ -0,0 +1,33 @@ +namespace com.linkedin.assertion + + +/** +* Attributes defining an INCREMENTING_SEGMENT_ROW_COUNT_CHANGE volume assertion. +*/ +record IncrementingSegmentRowCountChange { + /** + * A specification of how the 'segment' can be derived using a column and an optional transformer function. + */ + segment: IncrementingSegmentSpec + + /** + * The type of the value used to evaluate the assertion: a fixed absolute value or a relative percentage. 
+ */ + type: AssertionValueChangeType + + /** + * The operator you'd like to apply to the row count value + * + * Note that only numeric operators are valid inputs: + * GREATER_THAN, GREATER_THAN_OR_EQUAL_TO, EQUAL_TO, LESS_THAN, LESS_THAN_OR_EQUAL_TO, + * BETWEEN. + */ + operator: AssertionStdOperator + + /** + * The parameters you'd like to provide as input to the operator. + * + * Note that only numeric parameter types are valid inputs: NUMBER. + */ + parameters: AssertionStdParameters +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/IncrementingSegmentRowCountTotal.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/IncrementingSegmentRowCountTotal.pdl new file mode 100644 index 0000000000000..6b035107aae09 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/IncrementingSegmentRowCountTotal.pdl @@ -0,0 +1,27 @@ +namespace com.linkedin.assertion + +/** +* Attributes defining an INCREMENTING_SEGMENT_ROW_COUNT_TOTAL volume assertion. +*/ +record IncrementingSegmentRowCountTotal { + /** + * A specification of how the 'segment' can be derived using a column and an optional transformer function. + */ + segment: IncrementingSegmentSpec + + /** + * The operator you'd like to apply. + * + * Note that only numeric operators are valid inputs: + * GREATER_THAN, GREATER_THAN_OR_EQUAL_TO, EQUAL_TO, LESS_THAN, LESS_THAN_OR_EQUAL_TO, + * BETWEEN. + */ + operator: AssertionStdOperator + + /** + * The parameters you'd like to provide as input to the operator. + * + * Note that only numeric parameter types are valid inputs: NUMBER. 
+ */ + parameters: AssertionStdParameters +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/IncrementingSegmentSpec.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/IncrementingSegmentSpec.pdl new file mode 100644 index 0000000000000..eddd0c3da3df7 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/IncrementingSegmentSpec.pdl @@ -0,0 +1,33 @@ +namespace com.linkedin.assertion + +import com.linkedin.schema.SchemaFieldSpec + +/** +* Core attributes required to identify an incrementing segment in a table. This type is mainly useful +* for tables that constantly increase with new rows being added on a particular cadence (e.g. fact or event tables) +* +* An incrementing segment represents a logical chunk of data which is INSERTED +* into a dataset on a regular interval, along with the presence of a constantly-incrementing column +* value such as an event time, date partition, or last modified column. +* +* An incrementing segment is principally identified by 2 key attributes combined: +* +* 1. A field or column that represents the incrementing value. New rows that are inserted will be identified using this column. +* Note that the value of this column may not by itself represent the "bucket" or the "segment" in which the row falls. +* +* 2. [Optional] A transformer function that may be applied to the selected column value in order +* to obtain the final "segment identifier" or "bucket identifier". Rows that have the same value after applying the transformation +* will be grouped into the same segment, using which the final value (e.g. row count) will be determined. +*/ +record IncrementingSegmentSpec { + /** + * The field to use to generate segments. It must be constantly incrementing as new rows are inserted. + */ + field: SchemaFieldSpec + + /** + * Optional transformer function to apply to the field in order to obtain the final segment or bucket identifier.
+ * If not provided, then no operator will be applied to the field. (identity function) + */ + transformer: optional IncrementingSegmentFieldTransformer +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/RowCountChange.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/RowCountChange.pdl new file mode 100644 index 0000000000000..85a915066f584 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/RowCountChange.pdl @@ -0,0 +1,27 @@ +namespace com.linkedin.assertion + +/** +* Attributes defining a ROW_COUNT_CHANGE volume assertion. +*/ +record RowCountChange { + /** + * The type of the value used to evaluate the assertion: a fixed absolute value or a relative percentage. + */ + type: AssertionValueChangeType + + /** + * The operator you'd like to apply. + * + * Note that only numeric operators are valid inputs: + * GREATER_THAN, GREATER_THAN_OR_EQUAL_TO, EQUAL_TO, LESS_THAN, LESS_THAN_OR_EQUAL_TO, + * BETWEEN. + */ + operator: AssertionStdOperator + + /** + * The parameters you'd like to provide as input to the operator. + * + * Note that only numeric parameter types are valid inputs: NUMBER. + */ + parameters: AssertionStdParameters +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/RowCountTotal.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/RowCountTotal.pdl new file mode 100644 index 0000000000000..f691f15f62e04 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/RowCountTotal.pdl @@ -0,0 +1,22 @@ +namespace com.linkedin.assertion + +/** +* Attributes defining a ROW_COUNT_TOTAL volume assertion. +*/ +record RowCountTotal { + /** + * The operator you'd like to apply. + * + * Note that only numeric operators are valid inputs: + * GREATER_THAN, GREATER_THAN_OR_EQUAL_TO, EQUAL_TO, LESS_THAN, LESS_THAN_OR_EQUAL_TO, + * BETWEEN. 
+ */ + operator: AssertionStdOperator + + /** + * The parameters you'd like to provide as input to the operator. + * + * Note that only numeric parameter types are valid inputs: NUMBER. + */ + parameters: AssertionStdParameters +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/SchemaAssertionInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/SchemaAssertionInfo.pdl new file mode 100644 index 0000000000000..fd246e0c7cfc4 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/SchemaAssertionInfo.pdl @@ -0,0 +1,29 @@ +namespace com.linkedin.assertion + +import com.linkedin.common.Urn +import com.linkedin.schema.SchemaMetadata + +/** +* Attributes that are applicable to schema assertions +**/ +record SchemaAssertionInfo { + /** + * The entity targeted by the assertion + */ + @Searchable = { + "fieldType": "URN" + } + @Relationship = { + "name": "Asserts", + "entityTypes": [ "dataset", "dataJob" ] + } + entity: Urn + + /** + * A definition of the expected structure for the asset + * + * Note that many of the fields of this model, especially those related to metadata (tags, terms) + * will go unused in this context. + */ + schema: SchemaMetadata +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/VolumeAssertionInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/VolumeAssertionInfo.pdl new file mode 100644 index 0000000000000..327b76f95762e --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/VolumeAssertionInfo.pdl @@ -0,0 +1,82 @@ +namespace com.linkedin.assertion + +import com.linkedin.common.Urn +import com.linkedin.dataset.DatasetFilter + +/** +* Attributes defining a dataset Volume Assertion +*/ +record VolumeAssertionInfo { + /** + * The type of the volume assertion being monitored.
+ */ + @Searchable = {} + type: enum VolumeAssertionType { + /** + * A volume assertion that is evaluated against the total row count of a dataset. + */ + ROW_COUNT_TOTAL + /** + * A volume assertion that is evaluated against an incremental row count of a dataset, + * or a row count change. + */ + ROW_COUNT_CHANGE + /** + * A volume assertion that checks the latest "segment" in a table based on an incrementing + * column to check whether its row count falls into a particular range. + * + * This can be used to monitor the row count of an incrementing date-partition column segment. + */ + INCREMENTING_SEGMENT_ROW_COUNT_TOTAL + /** + * A volume assertion that compares the row counts in neighboring "segments" or "partitions" + * of an incrementing column. + * This can be used to track changes between subsequent date partitions + * in a table, for example. + */ + INCREMENTING_SEGMENT_ROW_COUNT_CHANGE + } + + /** + * The entity targeted by this Volume check. + */ + @Searchable = { + "fieldType": "URN" + } + @Relationship = { + "name": "Asserts", + "entityTypes": [ "dataset" ] + } + entity: Urn + + /** + * Produce FAILURE Assertion Result if the row count of the asset does not meet specific requirements. + * Required if type is 'ROW_COUNT_TOTAL' + */ + rowCountTotal: optional RowCountTotal + + /** + * Produce FAILURE Assertion Result if the delta row count of the asset does not meet specific requirements + * within a given period of time. + * Required if type is 'ROW_COUNT_CHANGE' + */ + rowCountChange: optional RowCountChange + + /** + * Produce FAILURE Assertion Result if the asset's latest incrementing segment row count total + * does not meet specific requirements. Required if type is 'INCREMENTING_SEGMENT_ROW_COUNT_TOTAL' + */ + incrementingSegmentRowCountTotal: optional IncrementingSegmentRowCountTotal + + /** + * Produce FAILURE Assertion Result if the asset's incrementing segment row count delta + * does not meet specific requirements.
Required if type is 'INCREMENTING_SEGMENT_ROW_COUNT_CHANGE' + */ + incrementingSegmentRowCountChange: optional IncrementingSegmentRowCountChange + + /** + * A definition of the specific filters that should be applied, when performing monitoring. + * If not provided, there is no filter, and the full table is under consideration. + */ + filter: optional DatasetFilter +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/datacontract/DataContractProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/datacontract/DataContractProperties.pdl new file mode 100644 index 0000000000000..a623f585df30c --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/datacontract/DataContractProperties.pdl @@ -0,0 +1,59 @@ +namespace com.linkedin.datacontract + +import com.linkedin.common.Urn + +/** + * Information about a data contract + */ +@Aspect = { + "name": "dataContractProperties" +} +record DataContractProperties { + /** + * The entity that this contract is associated with. Currently, we only support Dataset contracts, but + * in the future we may also support Data Product level contracts. + */ + @Relationship = { + "name": "ContractFor", + "entityTypes": [ "dataset" ] + } + entity: Urn + + /** + * An optional set of schema contracts. If this is a dataset contract, there will only be one. + */ + @Relationship = { + "/*/assertion": { + "name": "IncludesSchemaAssertion", + "entityTypes": [ "assertion" ] + } + } + schema: optional array[SchemaContract] + + /** + * An optional set of FRESHNESS contracts. If this is a dataset contract, there will only be one. + */ + @Relationship = { + "/*/assertion": { + "name": "IncludesFreshnessAssertion", + "entityTypes": [ "assertion" ] + } + } + freshness: optional array[FreshnessContract] + + /** + * An optional set of Data Quality contracts, e.g. table and column level contract constraints. 
+ */ + @Relationship = { + "/*/assertion": { + "name": "IncludesDataQualityAssertion", + "entityTypes": [ "assertion" ] + } + } + dataQuality: optional array[DataQualityContract] + + /** + * YAML-formatted contract definition + */ + rawContract: optional string +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/datacontract/DataContractStatus.pdl b/metadata-models/src/main/pegasus/com/linkedin/datacontract/DataContractStatus.pdl new file mode 100644 index 0000000000000..d61fb191ae53d --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/datacontract/DataContractStatus.pdl @@ -0,0 +1,27 @@ +namespace com.linkedin.datacontract + +import com.linkedin.common.Urn +import com.linkedin.common.CustomProperties + +/** + * Information about the status of a data contract + */ +@Aspect = { + "name": "dataContractStatus" +} +record DataContractStatus includes CustomProperties { + /** + * The latest state of the data contract + */ + @Searchable = {} + state: enum DataContractState { + /** + * The data contract is active. + */ + ACTIVE + /** + * The data contract is pending implementation. + */ + PENDING + } +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/datacontract/DataQualityContract.pdl b/metadata-models/src/main/pegasus/com/linkedin/datacontract/DataQualityContract.pdl new file mode 100644 index 0000000000000..273d2c2a56f95 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/datacontract/DataQualityContract.pdl @@ -0,0 +1,16 @@ +namespace com.linkedin.datacontract + +import com.linkedin.common.Urn + + +/** + * A data quality contract pertaining to a physical data asset + * Data Quality contracts are used to make assertions about data quality metrics for a physical data asset + */ +record DataQualityContract { + /** + * The assertion representing the Data Quality contract. + * E.g. a table or column-level assertion. 
+ */ + assertion: Urn +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/datacontract/FreshnessContract.pdl b/metadata-models/src/main/pegasus/com/linkedin/datacontract/FreshnessContract.pdl new file mode 100644 index 0000000000000..8cfa66846d505 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/datacontract/FreshnessContract.pdl @@ -0,0 +1,13 @@ +namespace com.linkedin.datacontract + +import com.linkedin.common.Urn + +/** + * A contract pertaining to the operational SLAs of a physical data asset + */ +record FreshnessContract { + /** + * The assertion representing the SLA contract. + */ + assertion: Urn +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/datacontract/SchemaContract.pdl b/metadata-models/src/main/pegasus/com/linkedin/datacontract/SchemaContract.pdl new file mode 100644 index 0000000000000..6c11e0da5b128 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/datacontract/SchemaContract.pdl @@ -0,0 +1,13 @@ +namespace com.linkedin.datacontract + +import com.linkedin.common.Urn + +/** + * Expectations for a logical schema + */ +record SchemaContract { + /** + * The assertion representing the schema contract. + */ + assertion: Urn +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetFilter.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetFilter.pdl new file mode 100644 index 0000000000000..6823398f79f3d --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetFilter.pdl @@ -0,0 +1,30 @@ +namespace com.linkedin.dataset + +/** + * A definition of filters that should be used when + * querying an external Dataset or Table. + * + * Note that this model should NOT be used for working with + * search / filter on DataHub Platform itself. + */ +record DatasetFilter { + /** + * How the partition will be represented in this model. + * + * In the future, we'll likely add support for more structured + * predicates.
+ */ + type: enum DatasetFilterType { + /** + * The partition is represented as an opaque, raw SQL + * clause. + */ + SQL + } + + /** + * The raw where clause string which will be used for monitoring. + * Required if the type is SQL. + */ + sql: optional string +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataContractKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataContractKey.pdl new file mode 100644 index 0000000000000..f1d4a709cd6bf --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataContractKey.pdl @@ -0,0 +1,14 @@ +namespace com.linkedin.metadata.key + +/** + * Key for a Data Contract + */ +@Aspect = { + "name": "dataContractKey" +} +record DataContractKey { + /** + * Unique id for the contract + */ + id: string +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/schema/SchemaFieldSpec.pdl b/metadata-models/src/main/pegasus/com/linkedin/schema/SchemaFieldSpec.pdl new file mode 100644 index 0000000000000..e875ff7a84403 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/schema/SchemaFieldSpec.pdl @@ -0,0 +1,21 @@ +namespace com.linkedin.schema + +/** +* Lightweight spec used for referencing a particular schema field. +**/ +record SchemaFieldSpec { + /** + * The field path + */ + path: string + + /** + * The DataHub standard schema field type.
+ */ + type: string + + /** + * The native field type + */ + nativeType: string +} \ No newline at end of file diff --git a/metadata-models/src/main/resources/entity-registry.yml b/metadata-models/src/main/resources/entity-registry.yml index 56fc5f6568eb7..11d0f74305d7b 100644 --- a/metadata-models/src/main/resources/entity-registry.yml +++ b/metadata-models/src/main/resources/entity-registry.yml @@ -262,6 +262,7 @@ entities: - assertionInfo - dataPlatformInstance - assertionRunEvent + - assertionActions - status - name: dataHubRetention category: internal @@ -457,4 +458,12 @@ entities: aspects: - ownershipTypeInfo - status + - name: dataContract + category: core + keyAspect: dataContractKey + aspects: + - dataContractProperties + - dataContractStatus + - status + events: diff --git a/metadata-service/configuration/src/main/resources/application.yml b/metadata-service/configuration/src/main/resources/application.yml index 4be31b2b6bb15..4dfd96ac75c6c 100644 --- a/metadata-service/configuration/src/main/resources/application.yml +++ b/metadata-service/configuration/src/main/resources/application.yml @@ -276,6 +276,9 @@ bootstrap: enabled: ${UPGRADE_DEFAULT_BROWSE_PATHS_ENABLED:false} # enable to run the upgrade to migrate legacy default browse paths to new ones backfillBrowsePathsV2: enabled: ${BACKFILL_BROWSE_PATHS_V2:false} # Enables running the backfill of browsePathsV2 upgrade step. There are concerns about the load of this step so hiding it behind a flag. 
Deprecating in favor of running through SystemUpdate + servlets: + waitTimeout: ${BOOTSTRAP_SERVLETS_WAITTIMEOUT:60} # Total waiting time in seconds for servlets to initialize + systemUpdate: initialBackOffMs: ${BOOTSTRAP_SYSTEM_UPDATE_INITIAL_BACK_OFF_MILLIS:5000} diff --git a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/OnBootApplicationListener.java b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/OnBootApplicationListener.java index 980cafaceae27..032b934a7ba87 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/OnBootApplicationListener.java +++ b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/OnBootApplicationListener.java @@ -15,15 +15,18 @@ import org.apache.http.impl.client.HttpClients; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; +import org.springframework.beans.factory.annotation.Value; import org.springframework.context.event.ContextRefreshedEvent; import org.springframework.context.event.EventListener; import org.springframework.stereotype.Component; import org.springframework.web.context.WebApplicationContext; +import org.springframework.context.annotation.Configuration; /** * Responsible for coordinating starting steps that happen before the application starts up. 
*/ +@Configuration @Slf4j @Component public class OnBootApplicationListener { @@ -44,6 +47,8 @@ public class OnBootApplicationListener { @Qualifier("configurationProvider") private ConfigurationProvider provider; + @Value("${bootstrap.servlets.waitTimeout}") + private int _servletsWaitTimeout; @EventListener(ContextRefreshedEvent.class) public void onApplicationEvent(@Nonnull ContextRefreshedEvent event) { @@ -62,7 +67,7 @@ public void onApplicationEvent(@Nonnull ContextRefreshedEvent event) { public Runnable isSchemaRegistryAPIServletReady() { return () -> { final HttpGet request = new HttpGet(provider.getKafka().getSchemaRegistry().getUrl()); - int timeouts = 30; + int timeouts = _servletsWaitTimeout; boolean openAPIServeletReady = false; while (!openAPIServeletReady && timeouts > 0) { try { @@ -79,7 +84,7 @@ public Runnable isSchemaRegistryAPIServletReady() { timeouts--; } if (!openAPIServeletReady) { - log.error("Failed to bootstrap DataHub, OpenAPI servlet was not ready after 30 seconds"); + log.error("Failed to bootstrap DataHub, OpenAPI servlet was not ready after {} seconds", timeouts); System.exit(1); } else { _bootstrapManager.start();