diff --git a/.github/actions/docker-custom-build-and-push/action.yml b/.github/actions/docker-custom-build-and-push/action.yml index 96d4d759dbb84..bd6bb842b1fb8 100644 --- a/.github/actions/docker-custom-build-and-push/action.yml +++ b/.github/actions/docker-custom-build-and-push/action.yml @@ -30,6 +30,9 @@ inputs: # e.g. latest,head,sha12345 description: "List of tags to use for the Docker image" required: true + target: + description: "Sets the target stage to build" + required: false outputs: image_tag: description: "Docker image tags" @@ -62,6 +65,7 @@ runs: platforms: linux/amd64 build-args: ${{ inputs.build-args }} tags: ${{ steps.docker_meta.outputs.tags }} + target: ${{ inputs.target }} load: true push: false cache-from: type=registry,ref=${{ steps.docker_meta.outputs.tags }} @@ -94,6 +98,7 @@ runs: platforms: ${{ inputs.platforms }} build-args: ${{ inputs.build-args }} tags: ${{ steps.docker_meta.outputs.tags }} + target: ${{ inputs.target }} push: true cache-from: type=registry,ref=${{ steps.docker_meta.outputs.tags }} cache-to: type=inline diff --git a/.github/workflows/airflow-plugin.yml b/.github/workflows/airflow-plugin.yml new file mode 100644 index 0000000000000..63bab821cc398 --- /dev/null +++ b/.github/workflows/airflow-plugin.yml @@ -0,0 +1,85 @@ +name: Airflow Plugin +on: + push: + branches: + - master + paths: + - ".github/workflows/airflow-plugin.yml" + - "metadata-ingestion-modules/airflow-plugin/**" + - "metadata-ingestion/**" + - "metadata-models/**" + pull_request: + branches: + - master + paths: + - ".github/**" + - "metadata-ingestion-modules/airflow-plugin/**" + - "metadata-ingestion/**" + - "metadata-models/**" + release: + types: [published] + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + airflow-plugin: + runs-on: ubuntu-latest + env: + SPARK_VERSION: 3.0.3 + DATAHUB_TELEMETRY_ENABLED: false + strategy: + matrix: + include: + - python-version: "3.7" + extraPythonRequirement: "apache-airflow~=2.1.0" + - python-version: "3.7" + extraPythonRequirement: "apache-airflow~=2.2.0" + - python-version: "3.10" + extraPythonRequirement: "apache-airflow~=2.4.0" + - python-version: "3.10" + extraPythonRequirement: "apache-airflow~=2.6.0" + - python-version: "3.10" + extraPythonRequirement: "apache-airflow>2.6.0" + fail-fast: false + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + cache: "pip" + - name: Install dependencies + run: ./metadata-ingestion/scripts/install_deps.sh + - name: Install airflow package and test (extras ${{ matrix.extraPythonRequirement }}) + run: ./gradlew -Pextra_pip_requirements='${{ matrix.extraPythonRequirement }}' :metadata-ingestion-modules:airflow-plugin:lint :metadata-ingestion-modules:airflow-plugin:testQuick + - name: pip freeze show list installed + if: always() + run: source metadata-ingestion-modules/airflow-plugin/venv/bin/activate && pip freeze + - uses: actions/upload-artifact@v3 + if: ${{ always() && matrix.python-version == '3.10' && matrix.extraPythonRequirement == 'apache-airflow>2.6.0' }} + with: + name: Test Results (Airflow Plugin ${{ matrix.python-version}}) + path: | + **/build/reports/tests/test/** + **/build/test-results/test/** + **/junit.*.xml + - name: Upload coverage to Codecov + if: always() + uses: codecov/codecov-action@v3 + with: + token: ${{ secrets.CODECOV_TOKEN }} + directory: . 
+ fail_ci_if_error: false + flags: airflow-${{ matrix.python-version }}-${{ matrix.extraPythonRequirement }} + name: pytest-airflow + verbose: true + + event-file: + runs-on: ubuntu-latest + steps: + - name: Upload + uses: actions/upload-artifact@v3 + with: + name: Event File + path: ${{ github.event_path }} diff --git a/.github/workflows/docker-ingestion-base.yml b/.github/workflows/docker-ingestion-base.yml deleted file mode 100644 index 0d29f79aa5f6c..0000000000000 --- a/.github/workflows/docker-ingestion-base.yml +++ /dev/null @@ -1,45 +0,0 @@ -name: ingestion base -on: - release: - types: [published] - push: - branches: - - master - paths: - - ".github/workflows/docker-ingestion-base.yml" - - "docker/datahub-ingestion-base/**" - - "gradle*" - pull_request: - branches: - - master - paths: - - ".github/workflows/docker-ingestion-base.yml" - - "docker/datahub-ingestion-base/**" - - "gradle*" - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - build-base: - name: Build and Push Docker Image to Docker Hub - runs-on: ubuntu-latest - steps: - - name: Check out the repo - uses: actions/checkout@v3 - with: - fetch-depth: 800 - - name: Build and Push image - uses: ./.github/actions/docker-custom-build-and-push - with: - images: | - acryldata/datahub-ingestion-base - tags: latest - username: ${{ secrets.ACRYL_DOCKER_USERNAME }} - password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} - publish: ${{ github.ref == 'refs/heads/master' }} - context: . - file: ./docker/datahub-ingestion-base/Dockerfile - platforms: linux/amd64,linux/arm64/v8 diff --git a/.github/workflows/docker-ingestion.yml b/.github/workflows/docker-ingestion.yml deleted file mode 100644 index f3768cfde5002..0000000000000 --- a/.github/workflows/docker-ingestion.yml +++ /dev/null @@ -1,118 +0,0 @@ -name: datahub-ingestion docker -on: - push: - branches: - - master - paths-ignore: - - "docs/**" - - "**.md" - pull_request: - branches: - - master - paths: - - "metadata-ingestion/**" - - "metadata-models/**" - - "docker/datahub-ingestion/**" - - "docker/datahub-ingestion-slim/**" - - ".github/workflows/docker-ingestion.yml" - release: - types: [published] - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - setup: - runs-on: ubuntu-latest - outputs: - tag: ${{ steps.tag.outputs.tag }} - publish: ${{ steps.publish.outputs.publish }} - python_release_version: ${{ steps.python_release_version.outputs.release_version }} - steps: - - name: Checkout - uses: actions/checkout@v3 - - name: Compute Tag - id: tag - run: | - source .github/scripts/docker_helpers.sh - echo "tag=$(get_tag)" >> $GITHUB_OUTPUT - - name: Compute Python Release Version - id: python_release_version - run: | - source .github/scripts/docker_helpers.sh - echo "release_version=$(get_python_docker_release_v)" >> $GITHUB_OUTPUT - - name: Check whether publishing enabled - id: publish - env: - ENABLE_PUBLISH: ${{ secrets.DOCKER_PASSWORD }} - run: | - echo "Enable publish: ${{ env.ENABLE_PUBLISH != '' }}" - echo "publish=${{ env.ENABLE_PUBLISH != '' }}" >> $GITHUB_OUTPUT - push_to_registries: - name: Build and Push Docker Image to Docker Hub - runs-on: ubuntu-latest - needs: setup - steps: - - name: Check out the repo - uses: actions/checkout@v3 - with: - fetch-depth: 800 - - name: Build and push - uses: ./.github/actions/docker-custom-build-and-push - with: - images: | - 
linkedin/datahub-ingestion - tags: ${{ needs.setup.outputs.tag }} - username: ${{ secrets.DOCKER_USERNAME }} - password: ${{ secrets.DOCKER_PASSWORD }} - publish: ${{ needs.setup.outputs.publish == 'true' }} - context: . - file: ./docker/datahub-ingestion/Dockerfile - platforms: linux/amd64,linux/arm64/v8 - build-args: | - RELEASE_VERSION=${{ needs.setup.outputs.python_release_version }} - - name: Build and Push image (slim) - uses: ./.github/actions/docker-custom-build-and-push - with: - images: | - acryldata/datahub-ingestion-slim - tags: ${{ needs.setup.outputs.tag }} - username: ${{ secrets.ACRYL_DOCKER_USERNAME }} - password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} - publish: ${{ needs.setup.outputs.publish == 'true' }} - context: . - file: ./docker/datahub-ingestion-slim/Dockerfile - platforms: linux/amd64,linux/arm64/v8 - ingestion-slim_scan: - permissions: - contents: read # for actions/checkout to fetch code - security-events: write # for github/codeql-action/upload-sarif to upload SARIF results - actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status - name: "[Monitoring] Scan datahub-ingestion-slim images for vulnerabilities" - if: ${{ github.ref == 'refs/heads/master' }} - runs-on: ubuntu-latest - needs: [push_to_registries] - steps: - - name: Checkout # adding checkout step just to make trivy upload happy - uses: actions/checkout@v3 - - name: Download image - uses: ishworkh/docker-image-artifact-download@v1 - with: - image: acryldata/datahub-ingestion-slim:latest - - name: Run Trivy vulnerability scanner - uses: aquasecurity/trivy-action@0.8.0 - env: - TRIVY_OFFLINE_SCAN: true - with: - image-ref: acryldata/datahub-ingestion-slim:latest - format: "template" - template: "@/contrib/sarif.tpl" - output: "trivy-results.sarif" - severity: "CRITICAL,HIGH" - ignore-unfixed: true - vuln-type: "os,library" - - name: Upload Trivy scan results to GitHub Security tab - uses: github/codeql-action/upload-sarif@v2 - with: - sarif_file: "trivy-results.sarif" diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index 1eb2a393600d2..532669c44722c 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -31,13 +31,19 @@ env: DATAHUB_ELASTIC_SETUP_IMAGE: "linkedin/datahub-elasticsearch-setup" DATAHUB_MYSQL_SETUP_IMAGE: "acryldata/datahub-mysql-setup" DATAHUB_UPGRADE_IMAGE: "acryldata/datahub-upgrade" + DATAHUB_INGESTION_BASE_IMAGE: "acryldata/datahub-ingestion-base" + DATAHUB_INGESTION_IMAGE: "acryldata/datahub-ingestion" jobs: setup: runs-on: ubuntu-latest outputs: tag: ${{ steps.tag.outputs.tag }} + slim_tag: ${{ steps.tag.outputs.slim_tag }} + full_tag: ${{ steps.tag.outputs.full_tag }} unique_tag: ${{ steps.tag.outputs.unique_tag }} + unique_slim_tag: ${{ steps.tag.outputs.unique_slim_tag }} + unique_full_tag: ${{ steps.tag.outputs.unique_full_tag }} publish: ${{ steps.publish.outputs.publish }} steps: - name: Checkout @@ -47,14 +53,18 @@ jobs: run: | source .github/scripts/docker_helpers.sh echo "tag=$(get_tag)" >> $GITHUB_OUTPUT + echo "slim_tag=$(get_tag)-slim" >> $GITHUB_OUTPUT + echo "full_tag=$(get_tag)-full" >> $GITHUB_OUTPUT echo "unique_tag=$(get_unique_tag)" >> $GITHUB_OUTPUT + echo "unique_slim_tag=$(get_unique_tag)-slim" >> $GITHUB_OUTPUT + echo "unique_full_tag=$(get_unique_tag)-full" >> $GITHUB_OUTPUT - name: Check whether publishing enabled id: publish env: - ENABLE_PUBLISH: ${{ secrets.DOCKER_PASSWORD }} + ENABLE_PUBLISH: ${{ 
secrets.DOCKER_PASSWORD != '' && secrets.ACRYL_DOCKER_PASSWORD != '' }} run: | - echo "Enable publish: ${{ env.ENABLE_PUBLISH != '' }}" - echo "publish=${{ env.ENABLE_PUBLISH != '' }}" >> $GITHUB_OUTPUT + echo "Enable publish: ${{ env.ENABLE_PUBLISH }}" + echo "publish=${{ env.ENABLE_PUBLISH }}" >> $GITHUB_OUTPUT gms_build: name: Build and Push DataHub GMS Docker Image @@ -414,6 +424,289 @@ jobs: file: ./docker/elasticsearch-setup/Dockerfile platforms: linux/amd64,linux/arm64/v8 + datahub_ingestion_base_build: + name: Build and Push DataHub Ingestion (Base) Docker Image + runs-on: ubuntu-latest + outputs: + tag: ${{ steps.tag.outputs.tag }} + needs: setup + steps: + - name: Check out the repo + uses: actions/checkout@v3 + with: + fetch-depth: 800 + - uses: dorny/paths-filter@v2 + id: filter + with: + filters: | + datahub-ingestion-base: + - 'docker/datahub-ingestion-base/**' + - name: Build and push Base Image + if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' }} + uses: ./.github/actions/docker-custom-build-and-push + with: + target: base + images: | + ${{ env.DATAHUB_INGESTION_BASE_IMAGE }} + tags: ${{ needs.setup.outputs.tag }} + username: ${{ secrets.ACRYL_DOCKER_USERNAME }} + password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} + publish: ${{ needs.setup.outputs.publish }} + context: . + file: ./docker/datahub-ingestion-base/Dockerfile + platforms: linux/amd64,linux/arm64/v8 + - name: Compute DataHub Ingestion (Base) Tag + id: tag + run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.tag || 'head' }}" >> $GITHUB_OUTPUT + datahub_ingestion_base_slim_build: + name: Build and Push DataHub Ingestion (Base-Slim) Docker Image + runs-on: ubuntu-latest + outputs: + tag: ${{ steps.tag.outputs.tag }} + needs: [setup, datahub_ingestion_base_build] + steps: + - name: Check out the repo + uses: actions/checkout@v3 + with: + fetch-depth: 800 + - uses: dorny/paths-filter@v2 + id: filter + with: + filters: | + datahub-ingestion-base: + - 'docker/datahub-ingestion-base/**' + - name: Download Base Image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.setup.outputs.publish != 'true' && steps.filter.outputs.datahub-ingestion-base == 'true' }} + with: + image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_tag || 'head' }} + - name: Build and push Base-Slim Image + if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' }} + uses: ./.github/actions/docker-custom-build-and-push + with: + target: slim-install + images: | + ${{ env.DATAHUB_INGESTION_BASE_IMAGE }} + tags: ${{ needs.setup.outputs.slim_tag }} + username: ${{ secrets.ACRYL_DOCKER_USERNAME }} + password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} + build-args: | + APP_ENV=slim + BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_tag || 'head' }} + publish: ${{ needs.setup.outputs.publish }} + context: . 
+ file: ./docker/datahub-ingestion-base/Dockerfile + platforms: linux/amd64,linux/arm64/v8 + - name: Compute DataHub Ingestion (Base-Slim) Tag + id: tag + run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_slim_tag || 'head' }}" >> $GITHUB_OUTPUT + datahub_ingestion_base_full_build: + name: Build and Push DataHub Ingestion (Base-Full) Docker Image + runs-on: ubuntu-latest + outputs: + tag: ${{ steps.tag.outputs.tag }} + needs: [setup, datahub_ingestion_base_build] + steps: + - name: Check out the repo + uses: actions/checkout@v3 + with: + fetch-depth: 800 + - uses: dorny/paths-filter@v2 + id: filter + with: + filters: | + datahub-ingestion-base: + - 'docker/datahub-ingestion-base/**' + - name: Download Base Image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.setup.outputs.publish != 'true' && steps.filter.outputs.datahub-ingestion-base == 'true' }} + with: + image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_tag || 'head' }} + - name: Build and push Base-Full Image + if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' }} + uses: ./.github/actions/docker-custom-build-and-push + with: + target: full-install + images: | + ${{ env.DATAHUB_INGESTION_BASE_IMAGE }} + tags: ${{ needs.setup.outputs.unique_full_tag }} + username: ${{ secrets.ACRYL_DOCKER_USERNAME }} + password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} + build-args: | + APP_ENV=full + BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_tag || 'head' }} + publish: ${{ needs.setup.outputs.publish }} + context: . + file: ./docker/datahub-ingestion-base/Dockerfile + platforms: linux/amd64,linux/arm64/v8 + - name: Compute DataHub Ingestion (Base-Full) Tag + id: tag + run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }}" >> $GITHUB_OUTPUT + + + datahub_ingestion_slim_build: + name: Build and Push DataHub Ingestion Docker Images + runs-on: ubuntu-latest + outputs: + tag: ${{ steps.tag.outputs.tag }} + needs_artifact_download: ${{ (steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true') && needs.setup.outputs.publish != 'true' }} + needs: [setup, datahub_ingestion_base_slim_build] + steps: + - name: Check out the repo + uses: actions/checkout@v3 + with: + fetch-depth: 800 + - uses: dorny/paths-filter@v2 + id: filter + with: + filters: | + datahub-ingestion-base: + - 'docker/datahub-ingestion-base/**' + datahub-ingestion: + - 'docker/datahub-ingestion/**' + - name: Build codegen + if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true' }} + run: ./gradlew :metadata-ingestion:codegen + - name: Download Base Image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.setup.outputs.publish != 'true' && steps.filter.outputs.datahub-ingestion-base == 'true' }} + with: + image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_slim_tag || 'head' }} + - name: Build and push Slim Image + if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true' }} + uses: ./.github/actions/docker-custom-build-and-push + with: + target: final + images: | + ${{ env.DATAHUB_INGESTION_IMAGE }} + 
build-args: | + BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }} + DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_slim_tag || 'head' }} + APP_ENV=slim + tags: ${{ needs.setup.outputs.slim_tag }} + username: ${{ secrets.ACRYL_DOCKER_USERNAME }} + password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} + publish: ${{ needs.setup.outputs.publish }} + context: . + file: ./docker/datahub-ingestion/Dockerfile + platforms: linux/amd64,linux/arm64/v8 + - name: Compute Tag + id: tag + run: echo "tag=${{ (steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true') && needs.setup.outputs.unique_slim_tag || 'head' }}" >> $GITHUB_OUTPUT + datahub_ingestion_slim_scan: + permissions: + contents: read # for actions/checkout to fetch code + security-events: write # for github/codeql-action/upload-sarif to upload SARIF results + actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status + name: "[Monitoring] Scan Datahub Ingestion Slim images for vulnerabilities" + runs-on: ubuntu-latest + needs: [setup, datahub_ingestion_slim_build] + steps: + - name: Checkout # adding checkout step just to make trivy upload happy + uses: actions/checkout@v3 + - name: Download image Slim Image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.datahub_ingestion_slim_build.outputs.needs_artifact_download == 'true' }} + with: + image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }} + - name: Run Trivy vulnerability scanner Slim Image + uses: aquasecurity/trivy-action@0.8.0 + env: + TRIVY_OFFLINE_SCAN: true + with: + image-ref: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }} + format: "template" + template: "@/contrib/sarif.tpl" + output: "trivy-results.sarif" + severity: "CRITICAL,HIGH" + ignore-unfixed: true + vuln-type: "os,library" + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@v2 + with: + sarif_file: "trivy-results.sarif" + + datahub_ingestion_full_build: + name: Build and Push DataHub Ingestion (Full) Docker Images + runs-on: ubuntu-latest + outputs: + tag: ${{ steps.tag.outputs.tag }} + needs_artifact_download: ${{ (steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true') && needs.setup.outputs.publish != 'true' }} + needs: [setup, datahub_ingestion_base_full_build] + steps: + - name: Check out the repo + uses: actions/checkout@v3 + with: + fetch-depth: 800 + - uses: dorny/paths-filter@v2 + id: filter + with: + filters: | + datahub-ingestion-base: + - 'docker/datahub-ingestion-base/**' + datahub-ingestion: + - 'docker/datahub-ingestion/**' + - name: Build codegen + if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true' }} + run: ./gradlew :metadata-ingestion:codegen + - name: Download Base Image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.setup.outputs.publish != 'true' && steps.filter.outputs.datahub-ingestion-base == 'true' }} + with: + image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }} + - name: Build and push Full Image + if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true' }} + uses: 
./.github/actions/docker-custom-build-and-push + with: + target: final + images: | + ${{ env.DATAHUB_INGESTION_IMAGE }} + build-args: | + BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }} + DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }} + tags: ${{ needs.setup.outputs.unique_full_tag }} + username: ${{ secrets.ACRYL_DOCKER_USERNAME }} + password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} + publish: ${{ needs.setup.outputs.publish }} + context: . + file: ./docker/datahub-ingestion/Dockerfile + platforms: linux/amd64,linux/arm64/v8 + - name: Compute Tag (Full) + id: tag + run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }}" >> $GITHUB_OUTPUT + datahub_ingestion_full_scan: + permissions: + contents: read # for actions/checkout to fetch code + security-events: write # for github/codeql-action/upload-sarif to upload SARIF results + actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status + name: "[Monitoring] Scan Datahub Ingestion images for vulnerabilities" + runs-on: ubuntu-latest + needs: [setup, datahub_ingestion_full_build] + steps: + - name: Checkout # adding checkout step just to make trivy upload happy + uses: actions/checkout@v3 + - name: Download image Full Image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.datahub_ingestion_full_build.outputs.needs_artifact_download == 'true' }} + with: + image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_full_build.outputs.tag }} + - name: Run Trivy vulnerability scanner Full Image + uses: aquasecurity/trivy-action@0.8.0 + env: + TRIVY_OFFLINE_SCAN: true + with: + image-ref: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_full_build.outputs.tag }} + format: "template" + template: "@/contrib/sarif.tpl" + output: "trivy-results.sarif" + severity: "CRITICAL,HIGH" + ignore-unfixed: true + vuln-type: "os,library" + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@v2 + with: + sarif_file: "trivy-results.sarif" + smoke_test: name: Run Smoke Tests runs-on: ubuntu-latest @@ -432,8 +725,11 @@ jobs: mae_consumer_build, mce_consumer_build, datahub_upgrade_build, + datahub_ingestion_slim_build, ] steps: + - name: Disk Check + run: df -h . && docker images - name: Check out the repo uses: actions/checkout@v3 - name: Set up JDK 11 @@ -450,6 +746,12 @@ jobs: - name: Build datahub cli run: | ./gradlew :metadata-ingestion:install + - name: Disk Check + run: df -h . && docker images + - name: Remove images + run: docker image prune -a -f || true + - name: Disk Check + run: df -h . && docker images - name: Download GMS image uses: ishworkh/docker-image-artifact-download@v1 if: ${{ needs.setup.outputs.publish != 'true' }} @@ -490,13 +792,21 @@ jobs: if: ${{ needs.setup.outputs.publish != 'true' }} with: image: ${{ env.DATAHUB_UPGRADE_IMAGE }}:${{ needs.setup.outputs.unique_tag }} - - name: Disable datahub-actions - run: | - yq -i 'del(.services.datahub-actions)' docker/quickstart/docker-compose-without-neo4j.quickstart.yml + - name: Download datahub-ingestion-slim image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.datahub_ingestion_slim_build.outputs.needs_artifact_download == 'true' }} + with: + image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }} + - name: Disk Check + run: df -h . 
&& docker images - name: run quickstart env: DATAHUB_TELEMETRY_ENABLED: false DATAHUB_VERSION: ${{ needs.setup.outputs.unique_tag }} + DATAHUB_ACTIONS_IMAGE: ${{ env.DATAHUB_INGESTION_IMAGE }} + ACTIONS_VERSION: ${{ needs.datahub_ingestion_slim_build.outputs.tag }} + ACTIONS_EXTRA_PACKAGES: 'acryl-datahub-actions[executor] acryl-datahub-actions' + ACTIONS_CONFIG: 'https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml' run: | ./smoke-test/run-quickstart.sh - name: sleep 60s @@ -504,6 +814,8 @@ jobs: # we are doing this because gms takes time to get ready # and we don't have a better readiness check when bootstrap is done sleep 60s + - name: Disk Check + run: df -h . && docker images - name: Disable ES Disk Threshold run: | curl -XPUT "http://localhost:9200/_cluster/settings" \ @@ -518,6 +830,8 @@ jobs: }' - name: Remove Source Code run: find ./*/* ! -path "./metadata-ingestion*" ! -path "./smoke-test*" ! -path "./gradle*" -delete + - name: Disk Check + run: df -h . && docker images - name: Smoke test env: RUN_QUICKSTART: false @@ -528,11 +842,14 @@ jobs: run: | echo "$DATAHUB_VERSION" ./smoke-test/smoke.sh + - name: Disk Check + run: df -h . && docker images - name: store logs if: failure() run: | docker ps -a docker logs datahub-gms >& gms-${{ matrix.test_strategy }}.log + docker logs datahub-actions >& actions-${{ matrix.test_strategy }}.log - name: Upload logs uses: actions/upload-artifact@v3 if: failure() diff --git a/.github/workflows/metadata-ingestion.yml b/.github/workflows/metadata-ingestion.yml index 23d7ee9427f42..fff41e481c3cb 100644 --- a/.github/workflows/metadata-ingestion.yml +++ b/.github/workflows/metadata-ingestion.yml @@ -25,7 +25,7 @@ jobs: metadata-ingestion: runs-on: ubuntu-latest env: - SPARK_VERSION: 3.0.3 + SPARK_VERSION: 3.3.2 DATAHUB_TELEMETRY_ENABLED: false # TODO: Enable this once the test is fixed. 
# DATAHUB_LOOKML_GIT_TEST_SSH_KEY: ${{ secrets.DATAHUB_LOOKML_GIT_TEST_SSH_KEY }} @@ -42,9 +42,7 @@ jobs: ] include: - python-version: "3.7" - extraPythonRequirement: "sqlalchemy==1.3.24 apache-airflow~=2.2.0" - python-version: "3.10" - extraPythonRequirement: "sqlalchemy~=1.4.0 apache-airflow>=2.4.0" fail-fast: false steps: - uses: actions/checkout@v3 @@ -56,8 +54,8 @@ jobs: run: ./metadata-ingestion/scripts/install_deps.sh - name: Install package run: ./gradlew :metadata-ingestion:installPackageOnly - - name: Run metadata-ingestion tests (extras ${{ matrix.extraPythonRequirement }}) - run: ./gradlew -Pextra_pip_requirements='${{ matrix.extraPythonRequirement }}' :metadata-ingestion:${{ matrix.command }} + - name: Run metadata-ingestion tests + run: ./gradlew :metadata-ingestion:${{ matrix.command }} - name: pip freeze show list installed if: always() run: source metadata-ingestion/venv/bin/activate && pip freeze @@ -80,7 +78,6 @@ jobs: name: pytest-${{ matrix.command }} verbose: true - event-file: runs-on: ubuntu-latest steps: diff --git a/.github/workflows/spark-smoke-test.yml b/.github/workflows/spark-smoke-test.yml index 5f501780873f6..ac411d812deea 100644 --- a/.github/workflows/spark-smoke-test.yml +++ b/.github/workflows/spark-smoke-test.yml @@ -40,6 +40,8 @@ jobs: python-version: "3.7" - name: Install dependencies run: ./metadata-ingestion/scripts/install_deps.sh + - name: Remove images + run: docker image prune -a -f || true - name: Smoke test run: | ./gradlew :metadata-integration:java:spark-lineage:integrationTest \ diff --git a/.github/workflows/test-results.yml b/.github/workflows/test-results.yml index 656e4dcbc4e43..0153060692271 100644 --- a/.github/workflows/test-results.yml +++ b/.github/workflows/test-results.yml @@ -2,7 +2,7 @@ name: Test Results on: workflow_run: - workflows: ["build & test", "metadata ingestion"] + workflows: ["build & test", "metadata ingestion", "Airflow Plugin"] types: - completed diff --git a/.gitignore b/.gitignore index 858f560f0b842..49ab5c475096c 100644 --- a/.gitignore +++ b/.gitignore @@ -69,6 +69,8 @@ metadata-ingestion/generated/** # docs docs/generated/ +docs-website/versioned_docs/ +docs-website/versioned_sidebars/ tmp* temp/** diff --git a/README.md b/README.md index d2208cf6ced49..951dcebad6498 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,11 @@ Please follow the [DataHub Quickstart Guide](https://datahubproject.io/docs/quic If you're looking to build & modify datahub please take a look at our [Development Guide](https://datahubproject.io/docs/developers). -[![DataHub Demo GIF](docs/imgs/entity.png)](https://demo.datahubproject.io/) +

<p align="center">
+  <a href="https://demo.datahubproject.io/">
+    <img src="docs/imgs/entity.png" alt="DataHub Demo" />
+  </a>
+</p>

## Source Code and Repositories diff --git a/build.gradle b/build.gradle index 605b4fcc050e7..f73fe42d45956 100644 --- a/build.gradle +++ b/build.gradle @@ -1,16 +1,15 @@ buildscript { ext.junitJupiterVersion = '5.6.1' // Releases: https://github.com/linkedin/rest.li/blob/master/CHANGELOG.md - ext.pegasusVersion = '29.22.16' + ext.pegasusVersion = '29.45.0' ext.mavenVersion = '3.6.3' - ext.springVersion = '5.3.27' - ext.springBootVersion = '2.7.11' + ext.springVersion = '5.3.29' + ext.springBootVersion = '2.7.14' ext.openTelemetryVersion = '1.18.0' ext.neo4jVersion = '4.4.9' ext.testContainersVersion = '1.17.4' ext.elasticsearchVersion = '7.10.2' - // TODO: Change to final release version once it's out ETA Mid-April - ext.jacksonVersion = '2.15.0-rc2' + ext.jacksonVersion = '2.15.2' ext.jettyVersion = '9.4.46.v20220331' ext.playVersion = '2.8.18' ext.log4jVersion = '2.19.0' @@ -18,6 +17,7 @@ buildscript { ext.logbackClassic = '1.2.12' ext.hadoop3Version = '3.3.5' ext.kafkaVersion = '2.3.0' + ext.hazelcastVersion = '5.3.1' ext.docker_registry = 'linkedin' @@ -29,16 +29,16 @@ buildscript { classpath 'io.acryl.gradle.plugin:gradle-avro-plugin:0.8.1' classpath 'org.springframework.boot:spring-boot-gradle-plugin:' + springBootVersion classpath "io.codearte.gradle.nexus:gradle-nexus-staging-plugin:0.30.0" - classpath "com.palantir.gradle.gitversion:gradle-git-version:0.12.3" - classpath "org.gradle.playframework:gradle-playframework:0.12" - classpath "gradle.plugin.org.hidetake:gradle-swagger-generator-plugin:2.18.1" + classpath "com.palantir.gradle.gitversion:gradle-git-version:3.0.0" + classpath "org.gradle.playframework:gradle-playframework:0.14" + classpath "gradle.plugin.org.hidetake:gradle-swagger-generator-plugin:2.19.1" } } plugins { id 'com.gorylenko.gradle-git-properties' version '2.4.0-rc2' id 'com.github.johnrengelman.shadow' version '6.1.0' - id "com.palantir.docker" version "0.34.0" + id 'com.palantir.docker' version '0.35.0' // https://blog.ltgt.net/javax-jakarta-mess-and-gradle-solution/ // TODO id "org.gradlex.java-ecosystem-capabilities" version "1.0" } @@ -95,15 +95,15 @@ project.ext.externalDependency = [ 'graphqlJavaScalars': 'com.graphql-java:graphql-java-extended-scalars:19.1', 'gson': 'com.google.code.gson:gson:2.8.9', 'guice': 'com.google.inject:guice:4.2.3', - 'guava': 'com.google.guava:guava:27.0.1-jre', + 'guava': 'com.google.guava:guava:32.1.2-jre', 'h2': 'com.h2database:h2:2.1.214', 'hadoopCommon':'org.apache.hadoop:hadoop-common:2.7.2', 'hadoopMapreduceClient':'org.apache.hadoop:hadoop-mapreduce-client-core:2.7.2', "hadoopClient": "org.apache.hadoop:hadoop-client:$hadoop3Version", "hadoopCommon3":"org.apache.hadoop:hadoop-common:$hadoop3Version", - 'hazelcast':'com.hazelcast:hazelcast:5.2.3', - 'hazelcastSpring':'com.hazelcast:hazelcast-spring:5.2.1', - 'hazelcastTest':'com.hazelcast:hazelcast:5.2.1:tests', + 'hazelcast':"com.hazelcast:hazelcast:$hazelcastVersion", + 'hazelcastSpring':"com.hazelcast:hazelcast-spring:$hazelcastVersion", + 'hazelcastTest':"com.hazelcast:hazelcast:$hazelcastVersion:tests", 'hibernateCore': 'org.hibernate:hibernate-core:5.2.16.Final', 'httpClient': 'org.apache.httpcomponents:httpclient:4.5.9', 'httpAsyncClient': 'org.apache.httpcomponents:httpasyncclient:4.1.5', @@ -137,6 +137,7 @@ project.ext.externalDependency = [ 'kafkaAvroSerde': 'io.confluent:kafka-streams-avro-serde:5.5.1', 'kafkaAvroSerializer': 'io.confluent:kafka-avro-serializer:5.1.4', 'kafkaClients': "org.apache.kafka:kafka-clients:$kafkaVersion", + 'snappy': 
'org.xerial.snappy:snappy-java:1.1.10.3', 'logbackClassic': "ch.qos.logback:logback-classic:$logbackClassic", 'slf4jApi': "org.slf4j:slf4j-api:$slf4jVersion", 'log4jCore': "org.apache.logging.log4j:log4j-core:$log4jVersion", @@ -238,7 +239,7 @@ configure(subprojects.findAll {! it.name.startsWith('spark-lineage')}) { subprojects { - apply plugin: 'maven' + apply plugin: 'maven-publish' apply plugin: 'com.gorylenko.gradle-git-properties' gitProperties { @@ -252,7 +253,7 @@ subprojects { plugins.withType(JavaPlugin) { dependencies { - testCompile externalDependency.testng + testImplementation externalDependency.testng constraints { implementation('io.netty:netty-all:4.1.86.Final') implementation('org.apache.commons:commons-compress:1.21') diff --git a/buildSrc/build.gradle b/buildSrc/build.gradle index 981a0ab221217..f88d2bdb966ce 100644 --- a/buildSrc/build.gradle +++ b/buildSrc/build.gradle @@ -5,12 +5,12 @@ buildscript { } dependencies { - compile('io.acryl:json-schema-avro:0.1.5') { + implementation('io.acryl:json-schema-avro:0.1.5') { exclude group: 'com.fasterxml.jackson.core', module: 'jackson-databind' exclude group: 'com.google.guava', module: 'guava' } - compile 'com.google.guava:guava:27.0.1-jre' - compile 'com.fasterxml.jackson.core:jackson-databind:2.13.5' - compile 'com.fasterxml.jackson.dataformat:jackson-dataformat-yaml:2.13.5' - compile 'commons-io:commons-io:2.11.0' + implementation 'com.google.guava:guava:32.1.2-jre' + implementation 'com.fasterxml.jackson.core:jackson-databind:2.13.5' + implementation 'com.fasterxml.jackson.dataformat:jackson-dataformat-yaml:2.13.5' + implementation 'commons-io:commons-io:2.11.0' } \ No newline at end of file diff --git a/buildSrc/src/main/java/io/datahubproject/GenerateJsonSchemaTask.java b/buildSrc/src/main/java/io/datahubproject/GenerateJsonSchemaTask.java index a5a843d91b1eb..796d622860c15 100644 --- a/buildSrc/src/main/java/io/datahubproject/GenerateJsonSchemaTask.java +++ b/buildSrc/src/main/java/io/datahubproject/GenerateJsonSchemaTask.java @@ -21,10 +21,7 @@ import java.util.List; import java.util.stream.Collectors; import org.gradle.api.DefaultTask; -import org.gradle.api.tasks.CacheableTask; -import org.gradle.api.tasks.InputDirectory; -import org.gradle.api.tasks.OutputDirectory; -import org.gradle.api.tasks.TaskAction; +import org.gradle.api.tasks.*; import static com.github.fge.processing.ProcessingUtil.*; import static org.apache.commons.io.FilenameUtils.*; @@ -46,6 +43,7 @@ public void setInputDirectory(String inputDirectory) { } @InputDirectory + @PathSensitive(PathSensitivity.NAME_ONLY) public String getInputDirectory() { return inputDirectory; } diff --git a/datahub-frontend/build.gradle b/datahub-frontend/build.gradle index f21d10d8f3842..cf1f8ca3cdd84 100644 --- a/datahub-frontend/build.gradle +++ b/datahub-frontend/build.gradle @@ -1,7 +1,7 @@ plugins { id "io.github.kobylynskyi.graphql.codegen" version "4.1.1" - id 'com.palantir.docker' id 'scala' + id 'com.palantir.docker' } apply from: "../gradle/versioning/versioning.gradle" @@ -79,6 +79,8 @@ docker { files fileTree(rootProject.projectDir) { include 'docker/monitoring/*' include "docker/${docker_dir}/*" + }.exclude { + i -> i.file.isHidden() || i.file == buildDir } tag("Debug", "${docker_registry}/${docker_repo}:debug") @@ -98,7 +100,7 @@ tasks.getByName("docker").dependsOn(unversionZip) task cleanLocalDockerImages { doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}".toString()) + 
rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } dockerClean.finalizedBy(cleanLocalDockerImages) \ No newline at end of file diff --git a/datahub-frontend/play.gradle b/datahub-frontend/play.gradle index 57f64960033aa..e40f8e3eeb96d 100644 --- a/datahub-frontend/play.gradle +++ b/datahub-frontend/play.gradle @@ -4,7 +4,7 @@ apply plugin: "org.gradle.playframework" project.ext.httpPort = 9001 project.ext.playBinaryBaseName = "datahub-frontend" -tasks.withType(PlayRun) { +runPlay { httpPort = project.ext.httpPort } @@ -28,10 +28,13 @@ dependencies { implementation(externalDependency.commonsText) { because("previous versions are vulnerable to CVE-2022-42889") } + implementation(externalDependency.snappy) { + because("previous versions are vulnerable to CVE-2023-34453 through CVE-2023-34455") + } } - compile project(":metadata-service:restli-client") - compile project(":metadata-service:auth-config") + implementation project(":metadata-service:restli-client") + implementation project(":metadata-service:auth-config") implementation externalDependency.jettyJaas implementation externalDependency.graphqlJava @@ -67,15 +70,15 @@ dependencies { testImplementation 'no.nav.security:mock-oauth2-server:0.3.1' testImplementation 'org.junit-pioneer:junit-pioneer:1.9.1' testImplementation externalDependency.junitJupiterApi - testRuntime externalDependency.junitJupiterEngine + testRuntimeOnly externalDependency.junitJupiterEngine implementation externalDependency.slf4jApi compileOnly externalDependency.lombok - runtime externalDependency.guice - runtime (externalDependency.playDocs) { + runtimeOnly externalDependency.guice + runtimeOnly (externalDependency.playDocs) { exclude group: 'com.typesafe.akka', module: 'akka-http-core_2.12' } - runtime externalDependency.playGuice + runtimeOnly externalDependency.playGuice implementation externalDependency.log4j2Api implementation externalDependency.logbackClassic diff --git a/datahub-graphql-core/build.gradle b/datahub-graphql-core/build.gradle index 8fd45033373dc..89ba8f17b6aeb 100644 --- a/datahub-graphql-core/build.gradle +++ b/datahub-graphql-core/build.gradle @@ -4,25 +4,26 @@ plugins { apply plugin: 'java' dependencies { - compile project(':metadata-service:restli-client') - compile project(':metadata-service:auth-impl') - compile project(':metadata-service:auth-config') - compile project(':metadata-service:configuration') - compile project(':metadata-service:services') - compile project(':metadata-io') - compile project(':metadata-utils') + implementation project(':metadata-service:restli-client') + implementation project(':metadata-service:auth-impl') + implementation project(':metadata-service:auth-config') + implementation project(':metadata-service:configuration') + implementation project(':metadata-service:services') + implementation project(':metadata-io') + implementation project(':metadata-utils') implementation externalDependency.graphqlJava implementation externalDependency.graphqlJavaScalars - compile externalDependency.antlr4Runtime - compile externalDependency.antlr4 - compile externalDependency.guava + implementation externalDependency.antlr4Runtime + implementation externalDependency.antlr4 + implementation externalDependency.guava + implementation externalDependency.opentelemetryAnnotations implementation externalDependency.slf4jApi compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok - testCompile externalDependency.mockito + testImplementation externalDependency.mockito } 
graphqlCodegen { diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java index d6dd2de6d31e3..682710ad5d539 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java @@ -68,6 +68,7 @@ import com.linkedin.datahub.graphql.generated.ListQueriesResult; import com.linkedin.datahub.graphql.generated.ListTestsResult; import com.linkedin.datahub.graphql.generated.ListViewsResult; +import com.linkedin.datahub.graphql.generated.MatchedField; import com.linkedin.datahub.graphql.generated.MLFeature; import com.linkedin.datahub.graphql.generated.MLFeatureProperties; import com.linkedin.datahub.graphql.generated.MLFeatureTable; @@ -1008,6 +1009,10 @@ private void configureGenericEntityResolvers(final RuntimeWiring.Builder builder .dataFetcher("entity", new EntityTypeResolver(entityTypes, (env) -> ((SearchResult) env.getSource()).getEntity())) ) + .type("MatchedField", typeWiring -> typeWiring + .dataFetcher("entity", new EntityTypeResolver(entityTypes, + (env) -> ((MatchedField) env.getSource()).getEntity())) + ) .type("SearchAcrossLineageResult", typeWiring -> typeWiring .dataFetcher("entity", new EntityTypeResolver(entityTypes, (env) -> ((SearchAcrossLineageResult) env.getSource()).getEntity())) diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/authorization/AuthorizationUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/authorization/AuthorizationUtils.java index 94880c77d74bc..3089b8c8fc2db 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/authorization/AuthorizationUtils.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/authorization/AuthorizationUtils.java @@ -107,7 +107,31 @@ public static boolean canEditGroupMembers(@Nonnull String groupUrnStr, @Nonnull } public static boolean canCreateGlobalAnnouncements(@Nonnull QueryContext context) { - return isAuthorized(context, Optional.empty(), PoliciesConfig.CREATE_GLOBAL_ANNOUNCEMENTS_PRIVILEGE); + final DisjunctivePrivilegeGroup orPrivilegeGroups = new DisjunctivePrivilegeGroup( + ImmutableList.of( + new ConjunctivePrivilegeGroup(ImmutableList.of( + PoliciesConfig.CREATE_GLOBAL_ANNOUNCEMENTS_PRIVILEGE.getType())), + new ConjunctivePrivilegeGroup(ImmutableList.of( + PoliciesConfig.MANAGE_GLOBAL_ANNOUNCEMENTS_PRIVILEGE.getType())) + )); + + return AuthorizationUtils.isAuthorized( + context.getAuthorizer(), + context.getActorUrn(), + orPrivilegeGroups); + } + + public static boolean canManageGlobalAnnouncements(@Nonnull QueryContext context) { + final DisjunctivePrivilegeGroup orPrivilegeGroups = new DisjunctivePrivilegeGroup( + ImmutableList.of( + new ConjunctivePrivilegeGroup(ImmutableList.of( + PoliciesConfig.MANAGE_GLOBAL_ANNOUNCEMENTS_PRIVILEGE.getType())) + )); + + return AuthorizationUtils.isAuthorized( + context.getAuthorizer(), + context.getActorUrn(), + orPrivilegeGroups); } public static boolean canManageGlobalViews(@Nonnull QueryContext context) { diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/MeResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/MeResolver.java index d2a7b19857f95..02921b453e315 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/MeResolver.java +++ 
b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/MeResolver.java @@ -74,6 +74,7 @@ public CompletableFuture get(DataFetchingEnvironment environm platformPrivileges.setManageTags(AuthorizationUtils.canManageTags(context)); platformPrivileges.setManageGlobalViews(AuthorizationUtils.canManageGlobalViews(context)); platformPrivileges.setManageOwnershipTypes(AuthorizationUtils.canManageOwnershipTypes(context)); + platformPrivileges.setManageGlobalAnnouncements(AuthorizationUtils.canManageGlobalAnnouncements(context)); // Construct and return authenticated user object. final AuthenticatedUser authUser = new AuthenticatedUser(); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java index 2c55bc79fe501..90017f7b87997 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java @@ -18,6 +18,7 @@ import com.linkedin.datahub.graphql.generated.Privilege; import com.linkedin.datahub.graphql.generated.QueriesTabConfig; import com.linkedin.datahub.graphql.generated.ResourcePrivileges; +import com.linkedin.datahub.graphql.generated.SearchResultsVisualConfig; import com.linkedin.datahub.graphql.generated.TelemetryConfig; import com.linkedin.datahub.graphql.generated.TestsConfig; import com.linkedin.datahub.graphql.generated.ViewsConfig; @@ -144,6 +145,13 @@ public CompletableFuture get(final DataFetchingEnvironment environmen } visualConfig.setEntityProfiles(entityProfilesConfig); } + if (_visualConfiguration != null && _visualConfiguration.getSearchResult() != null) { + SearchResultsVisualConfig searchResultsVisualConfig = new SearchResultsVisualConfig(); + if (_visualConfiguration.getSearchResult().getEnableNameHighlight() != null) { + searchResultsVisualConfig.setEnableNameHighlight(_visualConfiguration.getSearchResult().getEnableNameHighlight()); + } + visualConfig.setSearchResult(searchResultsVisualConfig); + } appConfig.setVisualConfig(visualConfig); final TelemetryConfig telemetryConfig = new TelemetryConfig(); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/post/DeletePostResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/post/DeletePostResolver.java index cd2a3dda70033..d3cd0126fb852 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/post/DeletePostResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/post/DeletePostResolver.java @@ -23,7 +23,7 @@ public class DeletePostResolver implements DataFetcher get(final DataFetchingEnvironment environment) throws Exception { final QueryContext context = environment.getContext(); - if (!AuthorizationUtils.canCreateGlobalAnnouncements(context)) { + if (!AuthorizationUtils.canManageGlobalAnnouncements(context)) { throw new AuthorizationException( "Unauthorized to delete posts. 
Please contact your DataHub administrator if this needs corrective action."); } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchUtils.java index e40bbca56b416..fe5b79ba2ea3d 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchUtils.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchUtils.java @@ -73,7 +73,6 @@ private SearchUtils() { EntityType.CONTAINER, EntityType.DOMAIN, EntityType.DATA_PRODUCT, - EntityType.ROLE, EntityType.NOTEBOOK); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/SearchFlagsInputMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/SearchFlagsInputMapper.java index 6435d6ee4c8e5..f3ac008734339 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/SearchFlagsInputMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/SearchFlagsInputMapper.java @@ -39,6 +39,9 @@ public com.linkedin.metadata.query.SearchFlags apply(@Nonnull final SearchFlags if (searchFlags.getSkipAggregates() != null) { result.setSkipAggregates(searchFlags.getSkipAggregates()); } + if (searchFlags.getGetSuggestions() != null) { + result.setGetSuggestions(searchFlags.getGetSuggestions()); + } return result; } } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/MapperUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/MapperUtils.java index 0b292a373ea40..5ba32b0c2a77c 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/MapperUtils.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/MapperUtils.java @@ -1,12 +1,18 @@ package com.linkedin.datahub.graphql.types.mappers; +import com.linkedin.common.urn.Urn; import com.linkedin.datahub.graphql.generated.AggregationMetadata; import com.linkedin.datahub.graphql.generated.FacetMetadata; import com.linkedin.datahub.graphql.generated.MatchedField; import com.linkedin.datahub.graphql.generated.SearchResult; +import com.linkedin.datahub.graphql.generated.SearchSuggestion; import com.linkedin.datahub.graphql.resolvers.EntityTypeMapper; import com.linkedin.datahub.graphql.types.common.mappers.UrnToEntityMapper; import com.linkedin.metadata.search.SearchEntity; +import com.linkedin.metadata.search.utils.SearchUtils; +import lombok.extern.slf4j.Slf4j; + +import java.net.URISyntaxException; import java.util.List; import java.util.Optional; import java.util.stream.Collectors; @@ -16,6 +22,7 @@ import static com.linkedin.metadata.utils.SearchUtil.*; +@Slf4j public class MapperUtils { private MapperUtils() { @@ -54,7 +61,24 @@ public static String convertFilterValue(String filterValue, List isEnti public static List getMatchedFieldEntry(List highlightMetadata) { return highlightMetadata.stream() - .map(field -> new MatchedField(field.getName(), field.getValue())) + .map(field -> { + MatchedField matchedField = new MatchedField(); + matchedField.setName(field.getName()); + matchedField.setValue(field.getValue()); + if (SearchUtils.isUrn(field.getValue())) { + try { + Urn urn = Urn.createFromString(field.getValue()); + matchedField.setEntity(UrnToEntityMapper.map(urn)); + } catch 
(URISyntaxException e) { + log.warn("Failed to create urn from MatchedField value: {}", field.getValue(), e); + } + } + return matchedField; + }) .collect(Collectors.toList()); } + + public static SearchSuggestion mapSearchSuggestion(com.linkedin.metadata.search.SearchSuggestion suggestion) { + return new SearchSuggestion(suggestion.getText(), suggestion.getScore(), Math.toIntExact(suggestion.getFrequency())); + } } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/UrnSearchResultsMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/UrnSearchResultsMapper.java index 9f750820e3093..b16e2f10d1df7 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/UrnSearchResultsMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/UrnSearchResultsMapper.java @@ -27,6 +27,7 @@ public SearchResults apply(com.linkedin.metadata.search.SearchResult input) { final SearchResultMetadata searchResultMetadata = input.getMetadata(); result.setSearchResults(input.getEntities().stream().map(MapperUtils::mapResult).collect(Collectors.toList())); result.setFacets(searchResultMetadata.getAggregations().stream().map(MapperUtils::mapFacet).collect(Collectors.toList())); + result.setSuggestions(searchResultMetadata.getSuggestions().stream().map(MapperUtils::mapSearchSuggestion).collect(Collectors.toList())); return result; } diff --git a/datahub-graphql-core/src/main/resources/app.graphql b/datahub-graphql-core/src/main/resources/app.graphql index 37183bac13f0e..dbee24b4bf6f7 100644 --- a/datahub-graphql-core/src/main/resources/app.graphql +++ b/datahub-graphql-core/src/main/resources/app.graphql @@ -125,6 +125,11 @@ type PlatformPrivileges { Whether the user should be able to create, update, and delete ownership types. """ manageOwnershipTypes: Boolean! + + """ + Whether the user can create and delete posts pinned to the home page. + """ + manageGlobalAnnouncements: Boolean! } """ @@ -216,6 +221,11 @@ type VisualConfig { Configuration for the queries tab """ entityProfiles: EntityProfilesConfig + + """ + Configuration for search results + """ + searchResult: SearchResultsVisualConfig } """ @@ -250,6 +260,16 @@ type EntityProfileConfig { defaultTab: String } +""" +Configuration for a search result +""" +type SearchResultsVisualConfig { + """ + Whether a search result should highlight the name/description if it was matched on those fields. + """ + enableNameHighlight: Boolean +} + """ Configurations related to tracking users in the app """ diff --git a/datahub-graphql-core/src/main/resources/search.graphql b/datahub-graphql-core/src/main/resources/search.graphql index f15535bfb4eb8..4cabdb04afe77 100644 --- a/datahub-graphql-core/src/main/resources/search.graphql +++ b/datahub-graphql-core/src/main/resources/search.graphql @@ -138,6 +138,11 @@ input SearchFlags { Whether to skip aggregates/facets """ skipAggregates: Boolean + + """ + Whether to request for search suggestions on the _entityName virtualized field + """ + getSuggestions: Boolean } """ @@ -448,6 +453,11 @@ enum FilterOperator { * Represent the relation: String field is one of the array values to, e.g. name in ["Profile", "Event"] """ IN + + """ + Represents the relation: The field exists. If the field is an array, the field is either not present or empty. + """ + EXISTS } """ @@ -478,6 +488,11 @@ type SearchResults { Candidate facet aggregations used for search filtering """ facets: [FacetMetadata!] 
+ + """ + Search suggestions based on the query provided for alternate query texts + """ + suggestions: [SearchSuggestion!] } """ @@ -660,6 +675,11 @@ type MatchedField { Value of the field that matched """ value: String! + + """ + Entity if the value is an urn + """ + entity: Entity } """ @@ -717,6 +737,31 @@ type AggregationMetadata { entity: Entity } +""" +A suggestion for an alternate search query given an original query compared to all +of the entity names in our search index. +""" +type SearchSuggestion { + """ + The suggested text based on the provided query text compared to + the entity name field in the search index. + """ + text: String! + + """ + The "edit distance" for this suggestion. The closer this number is to 1, the + closer the suggested text is to the original text. The closer it is to 0, the + further from the original text it is. + """ + score: Float + + """ + The number of entities that would match on the name field given the suggested text + """ + frequency: Int +} + + """ Input for performing an auto completion query against a single Metadata Entity """ diff --git a/datahub-upgrade/build.gradle b/datahub-upgrade/build.gradle index ad2bf02bfdcc7..625cab6b354c5 100644 --- a/datahub-upgrade/build.gradle +++ b/datahub-upgrade/build.gradle @@ -12,14 +12,15 @@ ext { } dependencies { - compile project(':metadata-io') - compile project(':metadata-service:factories') - compile project(':metadata-service:restli-client') - compile project(':metadata-service:configuration') + implementation project(':metadata-io') + implementation project(':metadata-service:factories') + implementation project(':metadata-service:restli-client') + implementation project(':metadata-service:configuration') + implementation project(':metadata-dao-impl:kafka-producer') implementation externalDependency.charle - compile externalDependency.javaxInject - compile(externalDependency.hadoopClient) { + implementation externalDependency.javaxInject + implementation(externalDependency.hadoopClient) { exclude group: 'net.minidev', module: 'json-smart' exclude group: 'com.nimbusds', module: 'nimbus-jose-jwt' exclude group: "org.apache.htrace", module: "htrace-core4" @@ -52,18 +53,18 @@ dependencies { implementation externalDependency.slf4jApi compileOnly externalDependency.lombok - compile externalDependency.picocli - compile externalDependency.parquet + implementation externalDependency.picocli + implementation externalDependency.parquet implementation externalDependency.protobuf - compile externalDependency.springBeans - compile externalDependency.springBootAutoconfigure - compile externalDependency.springCore - compile externalDependency.springKafka + implementation externalDependency.springBeans + implementation externalDependency.springBootAutoconfigure + implementation externalDependency.springCore + implementation externalDependency.springKafka - runtime externalDependency.logbackClassic - runtime externalDependency.mariadbConnector - runtime externalDependency.mysqlConnector - runtime externalDependency.postgresql + runtimeOnly externalDependency.logbackClassic + runtimeOnly externalDependency.mariadbConnector + runtimeOnly externalDependency.mysqlConnector + runtimeOnly externalDependency.postgresql implementation externalDependency.awsMskIamAuth @@ -71,9 +72,9 @@ dependencies { annotationProcessor externalDependency.picocli testImplementation externalDependency.springBootTest - testCompile externalDependency.mockito - testCompile externalDependency.testng - testRuntime 
externalDependency.logbackClassic + testImplementation externalDependency.mockito + testImplementation externalDependency.testng + testRuntimeOnly externalDependency.logbackClassic } bootJar { @@ -89,6 +90,8 @@ docker { files fileTree(rootProject.projectDir) { include "docker/${docker_repo}/*" include 'metadata-models/src/main/resources/*' + }.exclude { + i -> i.file.isHidden() || i.file == buildDir } tag("Debug", "${docker_registry}/${docker_repo}:debug") @@ -101,7 +104,7 @@ tasks.getByName("docker").dependsOn([bootJar]) task cleanLocalDockerImages { doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}".toString()) + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } dockerClean.finalizedBy(cleanLocalDockerImages) diff --git a/datahub-web-react/README.md b/datahub-web-react/README.md index 6c91b169af858..8bf592b11a0ae 100644 --- a/datahub-web-react/README.md +++ b/datahub-web-react/README.md @@ -126,7 +126,9 @@ for functional configurability should reside. to render a view associated with a particular entity type (user, dataset, etc.). -![entity-registry](./entity-registry.png) +
<p align="center">
+  <img src="./entity-registry.png" />
+</p>
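As context for the `suggestions` field and `SearchSuggestion` type added to the GraphQL schema above, here is a minimal sketch of how a client could request them next to the usual results; the updated mocks later in this change pair such requests with `searchFlags: { getSuggestions: true }` on the input. Only the `suggestions` selection is taken from the schema change; the operation name, input type, and Apollo plumbing are assumptions for illustration.

```tsx
import { gql } from '@apollo/client';

// Sketch only: the `suggestions { text score frequency }` selection mirrors the
// SearchSuggestion type added to the schema above; the operation name, input
// type, and Apollo usage are illustrative assumptions, not part of this change.
export const SEARCH_WITH_SUGGESTIONS = gql`
    query searchWithSuggestions($input: SearchAcrossEntitiesInput!) {
        searchAcrossEntities(input: $input) {
            total
            suggestions {
                text
                score
                frequency
            }
        }
    }
`;

// Running `yarn run generate` (see the graphql section below) would then emit a
// typed hook for a document like this one.
```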
**graphql** - The React App talks to the `dathub-frontend` server using GraphQL. This module is where the *queries* issued against the server are defined. Once defined, running `yarn run generate` will code-gen TypeScript objects to make invoking diff --git a/datahub-web-react/public/robots.txt b/datahub-web-react/public/robots.txt index e9e57dc4d41b9..7a00656bc3073 100644 --- a/datahub-web-react/public/robots.txt +++ b/datahub-web-react/public/robots.txt @@ -1,3 +1,6 @@ # https://www.robotstxt.org/robotstxt.html User-agent: * -Disallow: +Disallow: /api +Disallow: /gms +Disallow: /search +Disallow: /logOut diff --git a/datahub-web-react/src/Mocks.tsx b/datahub-web-react/src/Mocks.tsx index dcefc7f70d785..a2e14308e8cee 100644 --- a/datahub-web-react/src/Mocks.tsx +++ b/datahub-web-react/src/Mocks.tsx @@ -1973,6 +1973,7 @@ export const mocks = [ count: 10, filters: [], orFilters: [], + searchFlags: { getSuggestions: true }, }, }, }, @@ -2033,6 +2034,7 @@ export const mocks = [ ], }, ], + suggestions: [], }, } as GetSearchResultsQuery, }, @@ -2059,6 +2061,7 @@ export const mocks = [ ], }, ], + searchFlags: { getSuggestions: true }, }, }, }, @@ -2112,6 +2115,7 @@ export const mocks = [ ], }, ], + suggestions: [], }, } as GetSearchResultsQuery, }, @@ -2230,6 +2234,7 @@ export const mocks = [ ], }, ], + searchFlags: { getSuggestions: true }, }, }, }, @@ -2251,6 +2256,7 @@ export const mocks = [ insights: [], }, ], + suggestions: [], facets: [ { field: 'origin', @@ -2772,6 +2778,7 @@ export const mocks = [ ], }, ], + searchFlags: { getSuggestions: true }, }, }, }, @@ -2794,6 +2801,7 @@ export const mocks = [ insights: [], }, ], + suggestions: [], facets: [ { __typename: 'FacetMetadata', @@ -2886,6 +2894,7 @@ export const mocks = [ ], }, ], + searchFlags: { getSuggestions: true }, }, }, }, @@ -2908,6 +2917,7 @@ export const mocks = [ }, ], facets: [], + suggestions: [], }, } as GetSearchResultsForMultipleQuery, }, @@ -2934,6 +2944,7 @@ export const mocks = [ ], }, ], + searchFlags: { getSuggestions: true }, }, }, }, @@ -2955,6 +2966,7 @@ export const mocks = [ insights: [], }, ], + suggestions: [], facets: [ { field: 'origin', @@ -3007,6 +3019,7 @@ export const mocks = [ ], }, ], + searchFlags: { getSuggestions: true }, }, }, }, @@ -3028,6 +3041,7 @@ export const mocks = [ insights: [], }, ], + suggestions: [], facets: [ { field: 'origin', @@ -3084,6 +3098,7 @@ export const mocks = [ ], }, ], + searchFlags: { getSuggestions: true }, }, }, }, @@ -3113,6 +3128,7 @@ export const mocks = [ insights: [], }, ], + suggestions: [], facets: [ { field: 'origin', @@ -3175,6 +3191,7 @@ export const mocks = [ ], }, ], + searchFlags: { getSuggestions: true }, }, }, }, @@ -3196,6 +3213,7 @@ export const mocks = [ insights: [], }, ], + suggestions: [], facets: [ { field: 'origin', @@ -3258,6 +3276,7 @@ export const mocks = [ ], }, ], + searchFlags: { getSuggestions: true }, }, }, }, @@ -3279,6 +3298,7 @@ export const mocks = [ insights: [], }, ], + suggestions: [], facets: [ { field: 'origin', @@ -3363,6 +3383,7 @@ export const mocks = [ generatePersonalAccessTokens: true, manageGlobalViews: true, manageOwnershipTypes: true, + manageGlobalAnnouncements: true, }, }, }, @@ -3450,6 +3471,7 @@ export const mocks = [ count: 10, filters: [], orFilters: [], + searchFlags: { getSuggestions: true }, }, }, }, @@ -3461,6 +3483,7 @@ export const mocks = [ total: 0, searchResults: [], facets: [], + suggestions: [], }, }, }, @@ -3609,4 +3632,5 @@ export const platformPrivileges: PlatformPrivileges = { createDomains: true, 
manageGlobalViews: true, manageOwnershipTypes: true, + manageGlobalAnnouncements: true, }; diff --git a/datahub-web-react/src/app/entity/EntityRegistry.tsx b/datahub-web-react/src/app/entity/EntityRegistry.tsx index a07fd02841197..56b085cf69f4a 100644 --- a/datahub-web-react/src/app/entity/EntityRegistry.tsx +++ b/datahub-web-react/src/app/entity/EntityRegistry.tsx @@ -1,5 +1,7 @@ +import React from 'react'; import { Entity as EntityInterface, EntityType, SearchResult } from '../../types.generated'; import { FetchedEntity } from '../lineage/types'; +import { SearchResultProvider } from '../search/context/SearchResultContext'; import { Entity, EntityCapabilityType, IconStyleType, PreviewType } from './Entity'; import { GLOSSARY_ENTITY_TYPES } from './shared/constants'; import { GenericEntityProperties } from './shared/types'; @@ -119,7 +121,9 @@ export default class EntityRegistry { renderSearchResult(type: EntityType, searchResult: SearchResult): JSX.Element { const entity = validatedGet(type, this.entityTypeToEntity); - return entity.renderSearch(searchResult); + return ( + {entity.renderSearch(searchResult)} + ); } renderBrowse(type: EntityType, data: T): JSX.Element { diff --git a/datahub-web-react/src/app/entity/chart/ChartEntity.tsx b/datahub-web-react/src/app/entity/chart/ChartEntity.tsx index 90032285cd35b..0f1b6dbf3d660 100644 --- a/datahub-web-react/src/app/entity/chart/ChartEntity.tsx +++ b/datahub-web-react/src/app/entity/chart/ChartEntity.tsx @@ -19,13 +19,14 @@ import { EntityMenuItems } from '../shared/EntityDropdown/EntityDropdown'; import { LineageTab } from '../shared/tabs/Lineage/LineageTab'; import { ChartStatsSummarySubHeader } from './profile/stats/ChartStatsSummarySubHeader'; import { InputFieldsTab } from '../shared/tabs/Entity/InputFieldsTab'; -import { ChartSnippet } from './ChartSnippet'; import { EmbedTab } from '../shared/tabs/Embed/EmbedTab'; import { capitalizeFirstLetterOnly } from '../../shared/textUtil'; import DataProductSection from '../shared/containers/profile/sidebar/DataProduct/DataProductSection'; import { getDataProduct } from '../shared/utils'; import EmbeddedProfile from '../shared/embed/EmbeddedProfile'; import { LOOKER_URN } from '../../ingest/source/builder/constants'; +import { MatchedFieldList } from '../../search/matches/MatchedFieldList'; +import { matchedInputFieldRenderer } from '../../search/matches/matchedInputFieldRenderer'; /** * Definition of the DataHub Chart entity. 
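`EntityRegistry.renderSearchResult` above now wraps each entity's `renderSearch` output in a `SearchResultProvider`, so components nested inside the preview (for example the new match highlighters) can read the current `SearchResult` from context instead of having it threaded through props. The provider module itself ships elsewhere in this change; the following is a minimal sketch of what such a context could look like, with every name and import path beyond `SearchResult` treated as an assumption.

```tsx
import React, { ReactNode, createContext, useContext } from 'react';
import { SearchResult } from '../../types.generated';

// Illustrative sketch only: the real SearchResultContext module is added
// elsewhere in this change; names and import paths here are assumptions.
const SearchResultContext = createContext<SearchResult | null>(null);

export const SearchResultProvider = ({ value, children }: { value: SearchResult; children: ReactNode }) => (
    <SearchResultContext.Provider value={value}>{children}</SearchResultContext.Provider>
);

// Anything rendered inside renderSearchResult (previews, match highlighters)
// can read the current result without prop drilling.
export const useSearchResult = () => useContext(SearchResultContext);
```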
@@ -203,7 +204,13 @@ export class ChartEntity implements Entity { lastUpdatedMs={data.properties?.lastModified?.time} createdMs={data.properties?.created?.time} externalUrl={data.properties?.externalUrl} - snippet={} + snippet={ + matchedInputFieldRenderer(matchedField, data)} + /> + } + degree={(result as any).degree} + paths={(result as any).paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/chart/ChartSnippet.tsx b/datahub-web-react/src/app/entity/chart/ChartSnippet.tsx deleted file mode 100644 index 27982d3037207..0000000000000 --- a/datahub-web-react/src/app/entity/chart/ChartSnippet.tsx +++ /dev/null @@ -1,53 +0,0 @@ -import React from 'react'; - -import { Typography } from 'antd'; -import { InputFields, MatchedField, Maybe } from '../../../types.generated'; -import TagTermGroup from '../../shared/tags/TagTermGroup'; -import { FIELDS_TO_HIGHLIGHT } from '../dataset/search/highlights'; -import { getMatchPrioritizingPrimary } from '../shared/utils'; - -type Props = { - matchedFields: MatchedField[]; - inputFields: Maybe | undefined; - isMatchingDashboard?: boolean; -}; - -const LABEL_INDEX_NAME = 'fieldLabels'; -const TYPE_PROPERTY_KEY_NAME = 'type'; - -export const ChartSnippet = ({ matchedFields, inputFields, isMatchingDashboard = false }: Props) => { - const matchedField = getMatchPrioritizingPrimary(matchedFields, 'fieldLabels'); - - if (matchedField?.name === LABEL_INDEX_NAME) { - const matchedSchemaField = inputFields?.fields?.find( - (field) => field?.schemaField?.label === matchedField.value, - ); - const matchedGlossaryTerm = matchedSchemaField?.schemaField?.glossaryTerms?.terms?.find( - (term) => term?.term?.name === matchedField.value, - ); - - if (matchedGlossaryTerm) { - let termType = 'term'; - const typeProperty = matchedGlossaryTerm.term.properties?.customProperties?.find( - (property) => property.key === TYPE_PROPERTY_KEY_NAME, - ); - if (typeProperty) { - termType = typeProperty.value || termType; - } - - return ( - - Matches {termType} {' '} - {isMatchingDashboard && 'on a contained Chart'} - - ); - } - } - - return matchedField ? 
( - - Matches {FIELDS_TO_HIGHLIGHT.get(matchedField.name)} {matchedField.value}{' '} - {isMatchingDashboard && 'on a contained Chart'} - - ) : null; -}; diff --git a/datahub-web-react/src/app/entity/chart/preview/ChartPreview.tsx b/datahub-web-react/src/app/entity/chart/preview/ChartPreview.tsx index 9b69d250e315a..7d0fc143043e2 100644 --- a/datahub-web-react/src/app/entity/chart/preview/ChartPreview.tsx +++ b/datahub-web-react/src/app/entity/chart/preview/ChartPreview.tsx @@ -12,6 +12,7 @@ import { Deprecation, ChartStatsSummary, DataProduct, + EntityPath, } from '../../../../types.generated'; import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; import { useEntityRegistry } from '../../../useEntityRegistry'; @@ -40,6 +41,8 @@ export const ChartPreview = ({ externalUrl, parentContainers, snippet, + degree, + paths, }: { urn: string; platform?: string; @@ -62,6 +65,8 @@ export const ChartPreview = ({ externalUrl?: string | null; parentContainers?: ParentContainersResult | null; snippet?: React.ReactNode | null; + degree?: number; + paths?: EntityPath[]; }): JSX.Element => { const entityRegistry = useEntityRegistry(); @@ -96,6 +101,8 @@ export const ChartPreview = ({ createdMs={createdMs} /> } + degree={degree} + paths={paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/container/ContainerEntity.tsx b/datahub-web-react/src/app/entity/container/ContainerEntity.tsx index 201dcb9e4487a..9aecf6900f634 100644 --- a/datahub-web-react/src/app/entity/container/ContainerEntity.tsx +++ b/datahub-web-react/src/app/entity/container/ContainerEntity.tsx @@ -154,6 +154,8 @@ export class ContainerEntity implements Entity { externalUrl={data.properties?.externalUrl} tags={data.tags} glossaryTerms={data.glossaryTerms} + degree={(result as any).degree} + paths={(result as any).paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/container/preview/Preview.tsx b/datahub-web-react/src/app/entity/container/preview/Preview.tsx index 0bcf59683c3f7..fb1bd8f567420 100644 --- a/datahub-web-react/src/app/entity/container/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/container/preview/Preview.tsx @@ -13,6 +13,7 @@ import { Deprecation, GlossaryTerms, DataProduct, + EntityPath, } from '../../../../types.generated'; import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; import { capitalizeFirstLetterOnly } from '../../../shared/textUtil'; @@ -44,6 +45,8 @@ export const Preview = ({ parentContainers, externalUrl, deprecation, + degree, + paths, }: { urn: string; name: string; @@ -64,6 +67,8 @@ export const Preview = ({ deprecation?: Deprecation | null; parentContainers?: ParentContainersResult | null; externalUrl?: string | null; + degree?: number; + paths?: EntityPath[]; }): JSX.Element => { const entityRegistry = useEntityRegistry(); const typeName = capitalizeFirstLetterOnly(subTypes?.typeNames?.[0]) || 'Container'; @@ -97,6 +102,8 @@ export const Preview = ({ ]) || undefined } + degree={degree} + paths={paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/dashboard/DashboardEntity.tsx b/datahub-web-react/src/app/entity/dashboard/DashboardEntity.tsx index d948b21a46262..0a36d0e5f1bfa 100644 --- a/datahub-web-react/src/app/entity/dashboard/DashboardEntity.tsx +++ b/datahub-web-react/src/app/entity/dashboard/DashboardEntity.tsx @@ -24,12 +24,13 @@ import { EntityMenuItems } from '../shared/EntityDropdown/EntityDropdown'; import { LineageTab } from '../shared/tabs/Lineage/LineageTab'; import { capitalizeFirstLetterOnly } from '../../shared/textUtil'; 
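The chart preview above (and the dashboard preview just below) replaces the bespoke `ChartSnippet` with the shared `MatchedFieldList`, passing a `customFieldRenderer` such as `matchedInputFieldRenderer` plus an optional `matchSuffix`. A sketch of the renderer contract implied by those props follows; only `MatchedField` (including its new optional `entity` field from the schema change above) comes from this diff, and the rest is assumed.

```tsx
import React from 'react';
import { MatchedField } from '../../../types.generated';

// Inferred contract, not the actual implementation: MatchedFieldList appears to
// invoke an optional renderer per matched field and fall back to its default
// text rendering when the renderer returns nothing. Names other than
// MatchedField are assumptions.
export type CustomFieldRenderer = (field: MatchedField) => React.ReactNode | undefined;

export const renderLabelMatch: CustomFieldRenderer = (field) => {
    if (field.name !== 'fieldLabels') return undefined; // leave other fields to the default rendering
    // The schema change above adds an optional `entity` to MatchedField when the
    // matched value is an urn, so a renderer can surface it instead of raw text.
    return <span>{field.entity ? field.entity.urn : field.value}</span>;
};
```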
import { DashboardStatsSummarySubHeader } from './profile/DashboardStatsSummarySubHeader'; -import { ChartSnippet } from '../chart/ChartSnippet'; import { EmbedTab } from '../shared/tabs/Embed/EmbedTab'; import EmbeddedProfile from '../shared/embed/EmbeddedProfile'; import DataProductSection from '../shared/containers/profile/sidebar/DataProduct/DataProductSection'; import { getDataProduct } from '../shared/utils'; import { LOOKER_URN } from '../../ingest/source/builder/constants'; +import { MatchedFieldList } from '../../search/matches/MatchedFieldList'; +import { matchedInputFieldRenderer } from '../../search/matches/matchedInputFieldRenderer'; /** * Definition of the DataHub Dashboard entity. @@ -227,13 +228,14 @@ export class DashboardEntity implements Entity { lastUpdatedMs={data.properties?.lastModified?.time} createdMs={data.properties?.created?.time} snippet={ - matchedInputFieldRenderer(matchedField, data)} + matchSuffix="on a contained chart" /> } subtype={data.subTypes?.typeNames?.[0]} + degree={(result as any).degree} + paths={(result as any).paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/dashboard/preview/DashboardPreview.tsx b/datahub-web-react/src/app/entity/dashboard/preview/DashboardPreview.tsx index a5536be9cca7c..d822fd1f613b3 100644 --- a/datahub-web-react/src/app/entity/dashboard/preview/DashboardPreview.tsx +++ b/datahub-web-react/src/app/entity/dashboard/preview/DashboardPreview.tsx @@ -12,6 +12,7 @@ import { Deprecation, DashboardStatsSummary, DataProduct, + EntityPath, } from '../../../../types.generated'; import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; import { useEntityRegistry } from '../../../useEntityRegistry'; @@ -43,6 +44,8 @@ export const DashboardPreview = ({ parentContainers, deprecation, snippet, + degree, + paths, }: { urn: string; platform?: string; @@ -67,6 +70,8 @@ export const DashboardPreview = ({ externalUrl?: string | null; parentContainers?: ParentContainersResult | null; snippet?: React.ReactNode | null; + degree?: number; + paths?: EntityPath[]; }): JSX.Element => { const entityRegistry = useEntityRegistry(); @@ -103,6 +108,8 @@ export const DashboardPreview = ({ createdMs={createdMs} /> } + degree={degree} + paths={paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/dataFlow/DataFlowEntity.tsx b/datahub-web-react/src/app/entity/dataFlow/DataFlowEntity.tsx index c6f7c8b6a6cf7..3bf24ac276c8e 100644 --- a/datahub-web-react/src/app/entity/dataFlow/DataFlowEntity.tsx +++ b/datahub-web-react/src/app/entity/dataFlow/DataFlowEntity.tsx @@ -158,6 +158,8 @@ export class DataFlowEntity implements Entity { externalUrl={data.properties?.externalUrl} jobCount={(data as any).childJobs?.total} deprecation={data.deprecation} + degree={(result as any).degree} + paths={(result as any).paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/dataFlow/preview/Preview.tsx b/datahub-web-react/src/app/entity/dataFlow/preview/Preview.tsx index 103e3bc0b83e4..c313171d2f241 100644 --- a/datahub-web-react/src/app/entity/dataFlow/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/dataFlow/preview/Preview.tsx @@ -5,6 +5,7 @@ import { DataProduct, Deprecation, Domain, + EntityPath, EntityType, GlobalTags, Owner, @@ -35,6 +36,8 @@ export const Preview = ({ insights, jobCount, deprecation, + degree, + paths, }: { urn: string; name: string; @@ -51,6 +54,8 @@ export const Preview = ({ snippet?: React.ReactNode | null; insights?: Array | null; jobCount?: number | null; + degree?: number; + paths?: EntityPath[]; }): 
JSX.Element => { const entityRegistry = useEntityRegistry(); return ( @@ -80,6 +85,8 @@ export const Preview = ({ ]) || undefined } + degree={degree} + paths={paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/dataJob/DataJobEntity.tsx b/datahub-web-react/src/app/entity/dataJob/DataJobEntity.tsx index a2a369ec53ecf..29741119ac52b 100644 --- a/datahub-web-react/src/app/entity/dataJob/DataJobEntity.tsx +++ b/datahub-web-react/src/app/entity/dataJob/DataJobEntity.tsx @@ -180,6 +180,8 @@ export class DataJobEntity implements Entity { lastRunTimeMs={ ((data as any).lastRun?.runs?.length && (data as any).lastRun?.runs[0]?.created?.time) || undefined } + degree={(result as any).degree} + paths={(result as any).paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/dataJob/preview/Preview.tsx b/datahub-web-react/src/app/entity/dataJob/preview/Preview.tsx index 00166964c8152..61963ff2dce6b 100644 --- a/datahub-web-react/src/app/entity/dataJob/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/dataJob/preview/Preview.tsx @@ -7,6 +7,7 @@ import { DataProduct, Deprecation, Domain, + EntityPath, EntityType, GlobalTags, Owner, @@ -38,6 +39,8 @@ export const Preview = ({ insights, lastRunTimeMs, externalUrl, + degree, + paths, }: { urn: string; name: string; @@ -54,6 +57,8 @@ export const Preview = ({ insights?: Array | null; lastRunTimeMs?: number | null; externalUrl?: string | null; + degree?: number; + paths?: EntityPath[]; }): JSX.Element => { const entityRegistry = useEntityRegistry(); return ( @@ -85,6 +90,8 @@ export const Preview = ({ ]) || undefined } + degree={degree} + paths={paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/dataProduct/DataProductEntity.tsx b/datahub-web-react/src/app/entity/dataProduct/DataProductEntity.tsx index faa254cce73a6..c3f1273681c19 100644 --- a/datahub-web-react/src/app/entity/dataProduct/DataProductEntity.tsx +++ b/datahub-web-react/src/app/entity/dataProduct/DataProductEntity.tsx @@ -151,6 +151,8 @@ export class DataProductEntity implements Entity { domain={data.domain?.domain} entityCount={data?.entities?.total || undefined} externalUrl={data.properties?.externalUrl} + degree={(result as any).degree} + paths={(result as any).paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/dataProduct/preview/Preview.tsx b/datahub-web-react/src/app/entity/dataProduct/preview/Preview.tsx index c938d6534c479..7f3b6d7042e8e 100644 --- a/datahub-web-react/src/app/entity/dataProduct/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/dataProduct/preview/Preview.tsx @@ -1,5 +1,5 @@ import React from 'react'; -import { EntityType, Owner, GlobalTags, GlossaryTerms, Domain } from '../../../../types.generated'; +import { EntityType, Owner, GlobalTags, GlossaryTerms, Domain, EntityPath } from '../../../../types.generated'; import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; import { useEntityRegistry } from '../../../useEntityRegistry'; import { IconStyleType } from '../../Entity'; @@ -14,6 +14,8 @@ interface Props { glossaryTerms?: GlossaryTerms | null; entityCount?: number; externalUrl?: string | null; + degree?: number; + paths?: EntityPath[]; } export const Preview = ({ @@ -26,6 +28,8 @@ export const Preview = ({ glossaryTerms, entityCount, externalUrl, + degree, + paths, }: Props): JSX.Element => { const entityRegistry = useEntityRegistry(); @@ -45,6 +49,8 @@ export const Preview = ({ entityCount={entityCount} externalUrl={externalUrl} displayAssetCount + degree={degree} + paths={paths} /> ); }; diff --git 
a/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx b/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx index b55b4c54951ef..535a3f569964c 100644 --- a/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx +++ b/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx @@ -25,11 +25,12 @@ import { OperationsTab } from './profile/OperationsTab'; import { EntityMenuItems } from '../shared/EntityDropdown/EntityDropdown'; import { SidebarSiblingsSection } from '../shared/containers/profile/sidebar/SidebarSiblingsSection'; import { DatasetStatsSummarySubHeader } from './profile/stats/stats/DatasetStatsSummarySubHeader'; -import { DatasetSearchSnippet } from './DatasetSearchSnippet'; +import { MatchedFieldList } from '../../search/matches/MatchedFieldList'; import { EmbedTab } from '../shared/tabs/Embed/EmbedTab'; import EmbeddedProfile from '../shared/embed/EmbeddedProfile'; import DataProductSection from '../shared/containers/profile/sidebar/DataProduct/DataProductSection'; import { getDataProduct } from '../shared/utils'; +import { matchedFieldPathsRenderer } from '../../search/matches/matchedFieldPathsRenderer'; const SUBTYPES = { VIEW: 'view', @@ -290,7 +291,7 @@ export class DatasetEntity implements Entity { subtype={data.subTypes?.typeNames?.[0]} container={data.container} parentContainers={data.parentContainers} - snippet={} + snippet={} insights={result.insights} externalUrl={data.properties?.externalUrl} statsSummary={data.statsSummary} @@ -301,6 +302,8 @@ export class DatasetEntity implements Entity { (data as any).lastOperation?.length && (data as any).lastOperation[0].lastUpdatedTimestamp } health={data.health} + degree={(result as any).degree} + paths={(result as any).paths} /> ); }; @@ -314,6 +317,7 @@ export class DatasetEntity implements Entity { subtype: entity?.subTypes?.typeNames?.[0] || undefined, icon: entity?.platform?.properties?.logoUrl || undefined, platform: entity?.platform, + health: entity?.health || undefined, }; }; diff --git a/datahub-web-react/src/app/entity/dataset/DatasetSearchSnippet.tsx b/datahub-web-react/src/app/entity/dataset/DatasetSearchSnippet.tsx deleted file mode 100644 index e4f88eb0fbbfa..0000000000000 --- a/datahub-web-react/src/app/entity/dataset/DatasetSearchSnippet.tsx +++ /dev/null @@ -1,39 +0,0 @@ -import React from 'react'; - -import { Typography } from 'antd'; -import { MatchedField } from '../../../types.generated'; -import { TagSummary } from './shared/TagSummary'; -import { TermSummary } from './shared/TermSummary'; -import { FIELDS_TO_HIGHLIGHT } from './search/highlights'; -import { getMatchPrioritizingPrimary } from '../shared/utils'; -import { downgradeV2FieldPath } from './profile/schema/utils/utils'; - -type Props = { - matchedFields: MatchedField[]; -}; - -const LABEL_INDEX_NAME = 'fieldLabels'; - -export const DatasetSearchSnippet = ({ matchedFields }: Props) => { - const matchedField = getMatchPrioritizingPrimary(matchedFields, LABEL_INDEX_NAME); - - let snippet: React.ReactNode; - - if (matchedField) { - if (matchedField.value.includes('urn:li:tag')) { - snippet = ; - } else if (matchedField.value.includes('urn:li:glossaryTerm')) { - snippet = ; - } else if (matchedField.name === 'fieldPaths') { - snippet = {downgradeV2FieldPath(matchedField.value)}; - } else { - snippet = {matchedField.value}; - } - } - - return matchedField ? 
( - - Matches {FIELDS_TO_HIGHLIGHT.get(matchedField.name)} {snippet}{' '} - - ) : null; -}; diff --git a/datahub-web-react/src/app/entity/dataset/preview/Preview.tsx b/datahub-web-react/src/app/entity/dataset/preview/Preview.tsx index fd2583e4f5982..15c54f86038c2 100644 --- a/datahub-web-react/src/app/entity/dataset/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/dataset/preview/Preview.tsx @@ -14,6 +14,7 @@ import { DatasetStatsSummary, DataProduct, Health, + EntityPath, } from '../../../../types.generated'; import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; import { useEntityRegistry } from '../../../useEntityRegistry'; @@ -49,6 +50,8 @@ export const Preview = ({ statsSummary, lastUpdatedMs, health, + degree, + paths, }: { urn: string; name: string; @@ -77,6 +80,8 @@ export const Preview = ({ statsSummary?: DatasetStatsSummary | null; lastUpdatedMs?: number | null; health?: Health[] | null; + degree?: number; + paths?: EntityPath[]; }): JSX.Element => { const entityRegistry = useEntityRegistry(); return ( @@ -114,6 +119,8 @@ export const Preview = ({ /> } health={health || undefined} + degree={degree} + paths={paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/dataset/search/highlights.ts b/datahub-web-react/src/app/entity/dataset/search/highlights.ts deleted file mode 100644 index 64505e0709c7b..0000000000000 --- a/datahub-web-react/src/app/entity/dataset/search/highlights.ts +++ /dev/null @@ -1,7 +0,0 @@ -export const FIELDS_TO_HIGHLIGHT = new Map(); -FIELDS_TO_HIGHLIGHT.set('fieldPaths', 'column'); -FIELDS_TO_HIGHLIGHT.set('fieldDescriptions', 'column description'); -FIELDS_TO_HIGHLIGHT.set('fieldTags', 'column tag'); -FIELDS_TO_HIGHLIGHT.set('editedFieldDescriptions', 'column description'); -FIELDS_TO_HIGHLIGHT.set('editedFieldTags', 'column tag'); -FIELDS_TO_HIGHLIGHT.set('fieldLabels', 'label'); diff --git a/datahub-web-react/src/app/entity/dataset/shared/TagSummary.tsx b/datahub-web-react/src/app/entity/dataset/shared/TagSummary.tsx deleted file mode 100644 index 106cc298fb58c..0000000000000 --- a/datahub-web-react/src/app/entity/dataset/shared/TagSummary.tsx +++ /dev/null @@ -1,38 +0,0 @@ -import React from 'react'; -import styled from 'styled-components'; -import { useGetTagQuery } from '../../../../graphql/tag.generated'; -import { EntityType, Tag } from '../../../../types.generated'; -import { HoverEntityTooltip } from '../../../recommendations/renderer/component/HoverEntityTooltip'; -import { useEntityRegistry } from '../../../useEntityRegistry'; -import { StyledTag } from '../../shared/components/styled/StyledTag'; - -const TagLink = styled.span` - display: inline-block; -`; - -type Props = { - urn: string; -}; - -export const TagSummary = ({ urn }: Props) => { - const entityRegistry = useEntityRegistry(); - const { data } = useGetTagQuery({ variables: { urn } }); - return ( - <> - {data && ( - - - - {entityRegistry.getDisplayName(EntityType.Tag, data?.tag)} - - - - )} - - ); -}; diff --git a/datahub-web-react/src/app/entity/dataset/shared/TermSummary.tsx b/datahub-web-react/src/app/entity/dataset/shared/TermSummary.tsx deleted file mode 100644 index cc1274693a342..0000000000000 --- a/datahub-web-react/src/app/entity/dataset/shared/TermSummary.tsx +++ /dev/null @@ -1,36 +0,0 @@ -import React from 'react'; -import { Tag } from 'antd'; -import { BookOutlined } from '@ant-design/icons'; -import styled from 'styled-components'; -import { useGetGlossaryTermQuery } from '../../../../graphql/glossaryTerm.generated'; -import { 
HoverEntityTooltip } from '../../../recommendations/renderer/component/HoverEntityTooltip'; -import { EntityType, GlossaryTerm } from '../../../../types.generated'; -import { useEntityRegistry } from '../../../useEntityRegistry'; - -const TermLink = styled.span` - display: inline-block; -`; - -type Props = { - urn: string; -}; - -export const TermSummary = ({ urn }: Props) => { - const entityRegistry = useEntityRegistry(); - const { data } = useGetGlossaryTermQuery({ variables: { urn } }); - - return ( - <> - {data && ( - - - - - {entityRegistry.getDisplayName(EntityType.GlossaryTerm, data?.glossaryTerm)} - - - - )} - - ); -}; diff --git a/datahub-web-react/src/app/entity/glossaryTerm/preview/Preview.tsx b/datahub-web-react/src/app/entity/glossaryTerm/preview/Preview.tsx index 26d3cf456ab7a..b6802e37652cb 100644 --- a/datahub-web-react/src/app/entity/glossaryTerm/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/glossaryTerm/preview/Preview.tsx @@ -4,6 +4,8 @@ import { Deprecation, Domain, EntityType, Owner, ParentNodesResult } from '../.. import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; import { useEntityRegistry } from '../../../useEntityRegistry'; import { IconStyleType, PreviewType } from '../../Entity'; +import UrlButton from '../../shared/UrlButton'; +import { getRelatedEntitiesUrl } from '../utils'; export const Preview = ({ urn, @@ -39,6 +41,9 @@ export const Preview = ({ deprecation={deprecation} parentNodes={parentNodes} domain={domain} + entityTitleSuffix={ + View Related Entities + } /> ); }; diff --git a/datahub-web-react/src/app/entity/glossaryTerm/profile/GlossaryRelatedEntity.tsx b/datahub-web-react/src/app/entity/glossaryTerm/profile/GlossaryRelatedEntity.tsx index d0e8de0928b48..098e97e526fd8 100644 --- a/datahub-web-react/src/app/entity/glossaryTerm/profile/GlossaryRelatedEntity.tsx +++ b/datahub-web-react/src/app/entity/glossaryTerm/profile/GlossaryRelatedEntity.tsx @@ -5,7 +5,7 @@ import { EmbeddedListSearchSection } from '../../shared/components/styled/search import { useEntityData } from '../../shared/EntityContext'; export default function GlossaryRelatedEntity() { - const { entityData }: any = useEntityData(); + const { entityData } = useEntityData(); const entityUrn = entityData?.urn; diff --git a/datahub-web-react/src/app/entity/glossaryTerm/utils.ts b/datahub-web-react/src/app/entity/glossaryTerm/utils.ts index 3a2a3d35a8126..cbfa76fa34866 100644 --- a/datahub-web-react/src/app/entity/glossaryTerm/utils.ts +++ b/datahub-web-react/src/app/entity/glossaryTerm/utils.ts @@ -6,3 +6,7 @@ export function sortGlossaryTerms(entityRegistry: EntityRegistry, nodeA?: Entity const nodeBName = entityRegistry.getDisplayName(EntityType.GlossaryTerm, nodeB) || ''; return nodeAName.localeCompare(nodeBName); } + +export function getRelatedEntitiesUrl(entityRegistry: EntityRegistry, urn: string) { + return `${entityRegistry.getEntityUrl(EntityType.GlossaryTerm, urn)}/${encodeURIComponent('Related Entities')}`; +} diff --git a/datahub-web-react/src/app/entity/group/preview/Preview.tsx b/datahub-web-react/src/app/entity/group/preview/Preview.tsx index dc83f6fe4f840..5b9a25e198cfe 100644 --- a/datahub-web-react/src/app/entity/group/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/group/preview/Preview.tsx @@ -8,6 +8,7 @@ import { useEntityRegistry } from '../../../useEntityRegistry'; import { ANTD_GRAY } from '../../shared/constants'; import { IconStyleType } from '../../Entity'; import NoMarkdownViewer from 
'../../shared/components/styled/StripMarkdownText'; +import SearchTextHighlighter from '../../../search/matches/SearchTextHighlighter'; const PreviewContainer = styled.div` margin-bottom: 4px; @@ -87,7 +88,7 @@ export const Preview = ({ {entityRegistry.getEntityName(EntityType.CorpGroup)} - {name || urn} + {name ? : urn} {membersCount} members @@ -96,7 +97,12 @@ export const Preview = ({ {description && description.length > 0 && ( - {description} + } + > + {description} + )} diff --git a/datahub-web-react/src/app/entity/mlFeature/MLFeatureEntity.tsx b/datahub-web-react/src/app/entity/mlFeature/MLFeatureEntity.tsx index 8fddae7c15186..a7f586c9108ee 100644 --- a/datahub-web-react/src/app/entity/mlFeature/MLFeatureEntity.tsx +++ b/datahub-web-react/src/app/entity/mlFeature/MLFeatureEntity.tsx @@ -145,6 +145,8 @@ export class MLFeatureEntity implements Entity { dataProduct={getDataProduct(genericProperties?.dataProduct)} platform={platform} platformInstanceId={data.dataPlatformInstance?.instanceId} + degree={(result as any).degree} + paths={(result as any).paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/mlFeature/preview/Preview.tsx b/datahub-web-react/src/app/entity/mlFeature/preview/Preview.tsx index 7572bdb08f702..57a8b375bd17b 100644 --- a/datahub-web-react/src/app/entity/mlFeature/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/mlFeature/preview/Preview.tsx @@ -1,5 +1,5 @@ import React from 'react'; -import { DataPlatform, DataProduct, EntityType, Owner } from '../../../../types.generated'; +import { DataPlatform, DataProduct, EntityPath, EntityType, Owner } from '../../../../types.generated'; import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; import { capitalizeFirstLetterOnly } from '../../../shared/textUtil'; import { useEntityRegistry } from '../../../useEntityRegistry'; @@ -14,6 +14,8 @@ export const Preview = ({ dataProduct, owners, platform, + degree, + paths, }: { urn: string; name: string; @@ -23,6 +25,8 @@ export const Preview = ({ dataProduct?: DataProduct | null; owners?: Array | null; platform?: DataPlatform | null | undefined; + degree?: number; + paths?: EntityPath[]; }): JSX.Element => { const entityRegistry = useEntityRegistry(); return ( @@ -40,6 +44,8 @@ export const Preview = ({ typeIcon={entityRegistry.getIcon(EntityType.Mlfeature, 14, IconStyleType.ACCENT)} owners={owners} dataProduct={dataProduct} + degree={degree} + paths={paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/mlFeatureTable/MLFeatureTableEntity.tsx b/datahub-web-react/src/app/entity/mlFeatureTable/MLFeatureTableEntity.tsx index 3bb54b739e749..b3e509decd29d 100644 --- a/datahub-web-react/src/app/entity/mlFeatureTable/MLFeatureTableEntity.tsx +++ b/datahub-web-react/src/app/entity/mlFeatureTable/MLFeatureTableEntity.tsx @@ -144,6 +144,8 @@ export class MLFeatureTableEntity implements Entity { platformName={data.platform?.properties?.displayName || capitalizeFirstLetterOnly(data.platform?.name)} platformInstanceId={data.dataPlatformInstance?.instanceId} dataProduct={getDataProduct(genericProperties?.dataProduct)} + degree={(result as any).degree} + paths={(result as any).paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/mlFeatureTable/preview/Preview.tsx b/datahub-web-react/src/app/entity/mlFeatureTable/preview/Preview.tsx index cf6d7bf5d19f7..97065d9f6dfe0 100644 --- a/datahub-web-react/src/app/entity/mlFeatureTable/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/mlFeatureTable/preview/Preview.tsx @@ -1,5 +1,5 @@ import React 
from 'react'; -import { DataProduct, EntityType, Owner } from '../../../../types.generated'; +import { DataProduct, EntityPath, EntityType, Owner } from '../../../../types.generated'; import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; import { useEntityRegistry } from '../../../useEntityRegistry'; import { IconStyleType } from '../../Entity'; @@ -13,6 +13,8 @@ export const Preview = ({ platformName, dataProduct, platformInstanceId, + degree, + paths, }: { urn: string; name: string; @@ -22,6 +24,8 @@ export const Preview = ({ platformName?: string | null; dataProduct?: DataProduct | null; platformInstanceId?: string; + degree?: number; + paths?: EntityPath[]; }): JSX.Element => { const entityRegistry = useEntityRegistry(); return ( @@ -38,6 +42,8 @@ export const Preview = ({ platformInstanceId={platformInstanceId} dataProduct={dataProduct} logoComponent={entityRegistry.getIcon(EntityType.MlfeatureTable, 20, IconStyleType.HIGHLIGHT)} + degree={degree} + paths={paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/mlModel/MLModelEntity.tsx b/datahub-web-react/src/app/entity/mlModel/MLModelEntity.tsx index 3e800f4f733d2..62690d611dcdd 100644 --- a/datahub-web-react/src/app/entity/mlModel/MLModelEntity.tsx +++ b/datahub-web-react/src/app/entity/mlModel/MLModelEntity.tsx @@ -127,7 +127,7 @@ export class MLModelEntity implements Entity { renderSearch = (result: SearchResult) => { const data = result.entity as MlModel; - return ; + return ; }; getLineageVizConfig = (entity: MlModel) => { diff --git a/datahub-web-react/src/app/entity/mlModel/preview/Preview.tsx b/datahub-web-react/src/app/entity/mlModel/preview/Preview.tsx index 6b01ad9ac2845..4b57976dfe1a2 100644 --- a/datahub-web-react/src/app/entity/mlModel/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/mlModel/preview/Preview.tsx @@ -1,12 +1,20 @@ import React from 'react'; -import { EntityType, MlModel } from '../../../../types.generated'; +import { EntityPath, EntityType, MlModel } from '../../../../types.generated'; import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; import { capitalizeFirstLetterOnly } from '../../../shared/textUtil'; import { useEntityRegistry } from '../../../useEntityRegistry'; import { IconStyleType } from '../../Entity'; import { getDataProduct } from '../../shared/utils'; -export const Preview = ({ model }: { model: MlModel }): JSX.Element => { +export const Preview = ({ + model, + degree, + paths, +}: { + model: MlModel; + degree?: number; + paths?: EntityPath[]; +}): JSX.Element => { const entityRegistry = useEntityRegistry(); const genericProperties = entityRegistry.getGenericEntityProperties(EntityType.Mlmodel, model); @@ -24,6 +32,8 @@ export const Preview = ({ model }: { model: MlModel }): JSX.Element => { tags={model.globalTags || undefined} owners={model?.ownership?.owners} dataProduct={getDataProduct(genericProperties?.dataProduct)} + degree={degree} + paths={paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/mlModelGroup/MLModelGroupEntity.tsx b/datahub-web-react/src/app/entity/mlModelGroup/MLModelGroupEntity.tsx index 1282eab47cefc..7adc7a6ee7e63 100644 --- a/datahub-web-react/src/app/entity/mlModelGroup/MLModelGroupEntity.tsx +++ b/datahub-web-react/src/app/entity/mlModelGroup/MLModelGroupEntity.tsx @@ -110,7 +110,7 @@ export class MLModelGroupEntity implements Entity { renderSearch = (result: SearchResult) => { const data = result.entity as MlModelGroup; - return ; + return ; }; getLineageVizConfig = (entity: MlModelGroup) => { diff --git 
a/datahub-web-react/src/app/entity/mlModelGroup/preview/Preview.tsx b/datahub-web-react/src/app/entity/mlModelGroup/preview/Preview.tsx index f1c25d1acadac..910397af899f5 100644 --- a/datahub-web-react/src/app/entity/mlModelGroup/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/mlModelGroup/preview/Preview.tsx @@ -1,11 +1,19 @@ import React from 'react'; -import { EntityType, MlModelGroup } from '../../../../types.generated'; +import { EntityPath, EntityType, MlModelGroup } from '../../../../types.generated'; import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; import { capitalizeFirstLetterOnly } from '../../../shared/textUtil'; import { useEntityRegistry } from '../../../useEntityRegistry'; import { getDataProduct } from '../../shared/utils'; -export const Preview = ({ group }: { group: MlModelGroup }): JSX.Element => { +export const Preview = ({ + group, + degree, + paths, +}: { + group: MlModelGroup; + degree?: number; + paths?: EntityPath[]; +}): JSX.Element => { const entityRegistry = useEntityRegistry(); const genericProperties = entityRegistry.getGenericEntityProperties(EntityType.MlmodelGroup, group); return ( @@ -21,6 +29,8 @@ export const Preview = ({ group }: { group: MlModelGroup }): JSX.Element => { qualifier={group?.origin} owners={group?.ownership?.owners} dataProduct={getDataProduct(genericProperties?.dataProduct)} + degree={degree} + paths={paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/mlPrimaryKey/MLPrimaryKeyEntity.tsx b/datahub-web-react/src/app/entity/mlPrimaryKey/MLPrimaryKeyEntity.tsx index c6b4bba46f331..2549f4f6a0047 100644 --- a/datahub-web-react/src/app/entity/mlPrimaryKey/MLPrimaryKeyEntity.tsx +++ b/datahub-web-react/src/app/entity/mlPrimaryKey/MLPrimaryKeyEntity.tsx @@ -143,6 +143,8 @@ export class MLPrimaryKeyEntity implements Entity { platform={platform} platformInstanceId={data.dataPlatformInstance?.instanceId} dataProduct={getDataProduct(genericProperties?.dataProduct)} + degree={(result as any).degree} + paths={(result as any).paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/mlPrimaryKey/preview/Preview.tsx b/datahub-web-react/src/app/entity/mlPrimaryKey/preview/Preview.tsx index e1207e8f98f09..e72062ea2ae03 100644 --- a/datahub-web-react/src/app/entity/mlPrimaryKey/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/mlPrimaryKey/preview/Preview.tsx @@ -1,5 +1,5 @@ import React from 'react'; -import { DataPlatform, DataProduct, EntityType, Owner } from '../../../../types.generated'; +import { DataPlatform, DataProduct, EntityPath, EntityType, Owner } from '../../../../types.generated'; import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; import { capitalizeFirstLetterOnly } from '../../../shared/textUtil'; import { useEntityRegistry } from '../../../useEntityRegistry'; @@ -14,6 +14,8 @@ export const Preview = ({ platform, dataProduct, platformInstanceId, + degree, + paths, }: { urn: string; name: string; @@ -23,6 +25,8 @@ export const Preview = ({ platform?: DataPlatform | null | undefined; dataProduct?: DataProduct | null; platformInstanceId?: string; + degree?: number; + paths?: EntityPath[]; }): JSX.Element => { const entityRegistry = useEntityRegistry(); return ( @@ -40,6 +44,8 @@ export const Preview = ({ owners={owners} dataProduct={dataProduct} platformInstanceId={platformInstanceId} + degree={degree} + paths={paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/shared/ExternalUrlButton.tsx b/datahub-web-react/src/app/entity/shared/ExternalUrlButton.tsx index 
9677af0776604..dce74c02cdb34 100644 --- a/datahub-web-react/src/app/entity/shared/ExternalUrlButton.tsx +++ b/datahub-web-react/src/app/entity/shared/ExternalUrlButton.tsx @@ -1,28 +1,11 @@ -import { ArrowRightOutlined } from '@ant-design/icons'; -import { Button } from 'antd'; import React from 'react'; -import styled from 'styled-components/macro'; import { EntityType } from '../../../types.generated'; import analytics, { EventType, EntityActionType } from '../../analytics'; +import UrlButton from './UrlButton'; const GITHUB_LINK = 'github.com'; const GITHUB = 'GitHub'; -const ExternalUrlWrapper = styled.span` - font-size: 12px; -`; - -const StyledButton = styled(Button)` - > :hover { - text-decoration: underline; - } - &&& { - padding-bottom: 0px; - } - padding-left: 12px; - padding-right: 12px; -`; - interface Props { externalUrl: string; platformName?: string; @@ -46,17 +29,8 @@ export default function ExternalUrlButton({ externalUrl, platformName, entityTyp } return ( - - - {displayedName ? `View in ${displayedName}` : 'View link'}{' '} - - - + + {displayedName ? `View in ${displayedName}` : 'View link'} + ); } diff --git a/datahub-web-react/src/app/entity/shared/UrlButton.tsx b/datahub-web-react/src/app/entity/shared/UrlButton.tsx new file mode 100644 index 0000000000000..a6f6da4a60ad5 --- /dev/null +++ b/datahub-web-react/src/app/entity/shared/UrlButton.tsx @@ -0,0 +1,37 @@ +import React, { ReactNode } from 'react'; +import { ArrowRightOutlined } from '@ant-design/icons'; +import { Button } from 'antd'; +import styled from 'styled-components/macro'; + +const UrlButtonContainer = styled.span` + font-size: 12px; +`; + +const StyledButton = styled(Button)` + > :hover { + text-decoration: underline; + } + &&& { + padding-bottom: 0px; + } + padding-left: 12px; + padding-right: 12px; +`; + +interface Props { + href: string; + children: ReactNode; + onClick?: () => void; +} + +const NOOP = () => {}; + +export default function UrlButton({ href, children, onClick = NOOP }: Props) { + return ( + + + {children} + + + ); +} diff --git a/datahub-web-react/src/app/entity/shared/__tests__/siblingsUtils.test.ts b/datahub-web-react/src/app/entity/shared/__tests__/siblingsUtils.test.ts index 6e23d5400ab77..00e89e5943c17 100644 --- a/datahub-web-react/src/app/entity/shared/__tests__/siblingsUtils.test.ts +++ b/datahub-web-react/src/app/entity/shared/__tests__/siblingsUtils.test.ts @@ -1,10 +1,6 @@ import { dataset3WithLineage, dataset3WithSchema, dataset4WithLineage } from '../../../../Mocks'; import { EntityType, SchemaFieldDataType } from '../../../../types.generated'; -import { - combineEntityDataWithSiblings, - combineSiblingsInSearchResults, - shouldEntityBeTreatedAsPrimary, -} from '../siblingUtils'; +import { combineEntityDataWithSiblings, shouldEntityBeTreatedAsPrimary } from '../siblingUtils'; const usageStats = { buckets: [ @@ -191,494 +187,6 @@ const datasetUnprimaryWithNoPrimarySiblings = { }, }; -const searchResultWithSiblings = [ - { - entity: { - urn: 'urn:li:dataset:(urn:li:dataPlatform:bigquery,cypress_project.jaffle_shop.raw_orders,PROD)', - exists: true, - type: 'DATASET', - name: 'cypress_project.jaffle_shop.raw_orders', - origin: 'PROD', - uri: null, - platform: { - urn: 'urn:li:dataPlatform:bigquery', - type: 'DATA_PLATFORM', - name: 'bigquery', - properties: { - type: 'RELATIONAL_DB', - displayName: 'BigQuery', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/bigquerylogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - 
__typename: 'DataPlatform', - }, - dataPlatformInstance: null, - editableProperties: null, - platformNativeType: null, - properties: { - name: 'raw_orders', - description: null, - qualifiedName: null, - customProperties: [], - __typename: 'DatasetProperties', - }, - ownership: null, - globalTags: null, - glossaryTerms: null, - subTypes: { - typeNames: ['table'], - __typename: 'SubTypes', - }, - domain: null, - container: { - urn: 'urn:li:container:348c96555971d3f5c1ffd7dd2e7446cb', - platform: { - urn: 'urn:li:dataPlatform:bigquery', - type: 'DATA_PLATFORM', - name: 'bigquery', - properties: { - type: 'RELATIONAL_DB', - displayName: 'BigQuery', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/bigquerylogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - __typename: 'DataPlatform', - }, - properties: { - name: 'jaffle_shop', - __typename: 'ContainerProperties', - }, - subTypes: { - typeNames: ['Dataset'], - __typename: 'SubTypes', - }, - deprecation: null, - __typename: 'Container', - }, - parentContainers: { - count: 2, - containers: [ - { - urn: 'urn:li:container:348c96555971d3f5c1ffd7dd2e7446cb', - platform: { - urn: 'urn:li:dataPlatform:bigquery', - type: 'DATA_PLATFORM', - name: 'bigquery', - properties: { - type: 'RELATIONAL_DB', - displayName: 'BigQuery', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/bigquerylogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - __typename: 'DataPlatform', - }, - properties: { - name: 'jaffle_shop', - __typename: 'ContainerProperties', - }, - subTypes: { - typeNames: ['Dataset'], - __typename: 'SubTypes', - }, - deprecation: null, - __typename: 'Container', - }, - { - urn: 'urn:li:container:b5e95fce839e7d78151ed7e0a7420d84', - platform: { - urn: 'urn:li:dataPlatform:bigquery', - type: 'DATA_PLATFORM', - name: 'bigquery', - properties: { - type: 'RELATIONAL_DB', - displayName: 'BigQuery', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/bigquerylogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - __typename: 'DataPlatform', - }, - properties: { - name: 'cypress_project', - __typename: 'ContainerProperties', - }, - subTypes: { - typeNames: ['Project'], - __typename: 'SubTypes', - }, - deprecation: null, - __typename: 'Container', - }, - ], - __typename: 'ParentContainersResult', - }, - deprecation: null, - siblings: { - isPrimary: false, - siblings: [ - { - urn: 'urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_orders,PROD)', - exists: true, - type: 'DATASET', - platform: { - urn: 'urn:li:dataPlatform:dbt', - type: 'DATA_PLATFORM', - name: 'dbt', - properties: { - type: 'OTHERS', - displayName: 'dbt', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/dbtlogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - __typename: 'DataPlatform', - }, - name: 'cypress_project.jaffle_shop.raw_orders', - properties: { - name: 'raw_orders', - description: '', - qualifiedName: null, - __typename: 'DatasetProperties', - }, - __typename: 'Dataset', - }, - ], - __typename: 'SiblingProperties', - }, - __typename: 'Dataset', - }, - matchedFields: [ - { - name: 'name', - value: 'raw_orders', - __typename: 'MatchedField', - }, - { - name: 'id', - value: 'cypress_project.jaffle_shop.raw_orders', - __typename: 'MatchedField', - }, - ], - insights: [], - __typename: 'SearchResult', - }, - { - entity: { - urn: 
'urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_orders,PROD)', - exists: true, - type: 'DATASET', - name: 'cypress_project.jaffle_shop.raw_orders', - origin: 'PROD', - uri: null, - platform: { - urn: 'urn:li:dataPlatform:dbt', - type: 'DATA_PLATFORM', - name: 'dbt', - properties: { - type: 'OTHERS', - displayName: 'dbt', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/dbtlogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - __typename: 'DataPlatform', - }, - dataPlatformInstance: null, - editableProperties: null, - platformNativeType: null, - properties: { - name: 'raw_orders', - description: '', - qualifiedName: null, - customProperties: [ - { - key: 'catalog_version', - value: '1.0.4', - __typename: 'StringMapEntry', - }, - { - key: 'node_type', - value: 'seed', - __typename: 'StringMapEntry', - }, - { - key: 'materialization', - value: 'seed', - __typename: 'StringMapEntry', - }, - { - key: 'dbt_file_path', - value: 'data/raw_orders.csv', - __typename: 'StringMapEntry', - }, - { - key: 'catalog_schema', - value: 'https://schemas.getdbt.com/dbt/catalog/v1.json', - __typename: 'StringMapEntry', - }, - { - key: 'catalog_type', - value: 'table', - __typename: 'StringMapEntry', - }, - { - key: 'manifest_version', - value: '1.0.4', - __typename: 'StringMapEntry', - }, - { - key: 'manifest_schema', - value: 'https://schemas.getdbt.com/dbt/manifest/v4.json', - __typename: 'StringMapEntry', - }, - ], - __typename: 'DatasetProperties', - }, - ownership: null, - globalTags: null, - glossaryTerms: null, - subTypes: { - typeNames: ['seed'], - __typename: 'SubTypes', - }, - domain: null, - container: null, - parentContainers: { - count: 0, - containers: [], - __typename: 'ParentContainersResult', - }, - deprecation: null, - siblings: { - isPrimary: true, - siblings: [ - { - urn: 'urn:li:dataset:(urn:li:dataPlatform:bigquery,cypress_project.jaffle_shop.raw_orders,PROD)', - type: 'DATASET', - platform: { - urn: 'urn:li:dataPlatform:bigquery', - type: 'DATA_PLATFORM', - name: 'bigquery', - properties: { - type: 'RELATIONAL_DB', - displayName: 'BigQuery', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/bigquerylogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - __typename: 'DataPlatform', - }, - name: 'cypress_project.jaffle_shop.raw_orders', - properties: { - name: 'raw_orders', - description: null, - qualifiedName: null, - __typename: 'DatasetProperties', - }, - __typename: 'Dataset', - }, - ], - __typename: 'SiblingProperties', - }, - __typename: 'Dataset', - }, - matchedFields: [ - { - name: 'name', - value: 'raw_orders', - __typename: 'MatchedField', - }, - { - name: 'id', - value: 'cypress_project.jaffle_shop.raw_orders', - __typename: 'MatchedField', - }, - ], - insights: [], - __typename: 'SearchResult', - }, -]; - -const searchResultWithGhostSiblings = [ - { - entity: { - urn: 'urn:li:dataset:(urn:li:dataPlatform:bigquery,cypress_project.jaffle_shop.raw_orders,PROD)', - exists: true, - type: 'DATASET', - name: 'cypress_project.jaffle_shop.raw_orders', - origin: 'PROD', - uri: null, - platform: { - urn: 'urn:li:dataPlatform:bigquery', - type: 'DATA_PLATFORM', - name: 'bigquery', - properties: { - type: 'RELATIONAL_DB', - displayName: 'BigQuery', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/bigquerylogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - __typename: 'DataPlatform', - }, - dataPlatformInstance: null, - 
editableProperties: null, - platformNativeType: null, - properties: { - name: 'raw_orders', - description: null, - qualifiedName: null, - customProperties: [], - __typename: 'DatasetProperties', - }, - ownership: null, - globalTags: null, - glossaryTerms: null, - subTypes: { - typeNames: ['table'], - __typename: 'SubTypes', - }, - domain: null, - container: { - urn: 'urn:li:container:348c96555971d3f5c1ffd7dd2e7446cb', - platform: { - urn: 'urn:li:dataPlatform:bigquery', - type: 'DATA_PLATFORM', - name: 'bigquery', - properties: { - type: 'RELATIONAL_DB', - displayName: 'BigQuery', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/bigquerylogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - __typename: 'DataPlatform', - }, - properties: { - name: 'jaffle_shop', - __typename: 'ContainerProperties', - }, - subTypes: { - typeNames: ['Dataset'], - __typename: 'SubTypes', - }, - deprecation: null, - __typename: 'Container', - }, - parentContainers: { - count: 2, - containers: [ - { - urn: 'urn:li:container:348c96555971d3f5c1ffd7dd2e7446cb', - platform: { - urn: 'urn:li:dataPlatform:bigquery', - type: 'DATA_PLATFORM', - name: 'bigquery', - properties: { - type: 'RELATIONAL_DB', - displayName: 'BigQuery', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/bigquerylogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - __typename: 'DataPlatform', - }, - properties: { - name: 'jaffle_shop', - __typename: 'ContainerProperties', - }, - subTypes: { - typeNames: ['Dataset'], - __typename: 'SubTypes', - }, - deprecation: null, - __typename: 'Container', - }, - { - urn: 'urn:li:container:b5e95fce839e7d78151ed7e0a7420d84', - platform: { - urn: 'urn:li:dataPlatform:bigquery', - type: 'DATA_PLATFORM', - name: 'bigquery', - properties: { - type: 'RELATIONAL_DB', - displayName: 'BigQuery', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/bigquerylogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - __typename: 'DataPlatform', - }, - properties: { - name: 'cypress_project', - __typename: 'ContainerProperties', - }, - subTypes: { - typeNames: ['Project'], - __typename: 'SubTypes', - }, - deprecation: null, - __typename: 'Container', - }, - ], - __typename: 'ParentContainersResult', - }, - deprecation: null, - siblings: { - isPrimary: false, - siblings: [ - { - urn: 'urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_orders,PROD)', - exists: false, - type: 'DATASET', - }, - ], - __typename: 'SiblingProperties', - }, - __typename: 'Dataset', - }, - matchedFields: [ - { - name: 'name', - value: 'raw_orders', - __typename: 'MatchedField', - }, - { - name: 'id', - value: 'cypress_project.jaffle_shop.raw_orders', - __typename: 'MatchedField', - }, - ], - insights: [], - __typename: 'SearchResult', - }, -]; - describe('siblingUtils', () => { describe('combineEntityDataWithSiblings', () => { it('combines my metadata with my siblings as primary', () => { @@ -719,32 +227,6 @@ describe('siblingUtils', () => { }); }); - describe('combineSiblingsInSearchResults', () => { - it('combines search results to deduplicate siblings', () => { - const result = combineSiblingsInSearchResults(searchResultWithSiblings as any); - - expect(result).toHaveLength(1); - expect(result?.[0]?.matchedEntities?.[0]?.urn).toEqual( - 'urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_orders,PROD)', - ); - expect(result?.[0]?.matchedEntities?.[1]?.urn).toEqual( - 
'urn:li:dataset:(urn:li:dataPlatform:bigquery,cypress_project.jaffle_shop.raw_orders,PROD)', - ); - - expect(result?.[0]?.matchedEntities).toHaveLength(2); - }); - - it('will not combine an entity with a ghost node', () => { - const result = combineSiblingsInSearchResults(searchResultWithGhostSiblings as any); - - expect(result).toHaveLength(1); - expect(result?.[0]?.matchedEntities?.[0]?.urn).toEqual( - 'urn:li:dataset:(urn:li:dataPlatform:bigquery,cypress_project.jaffle_shop.raw_orders,PROD)', - ); - expect(result?.[0]?.matchedEntities).toHaveLength(1); - }); - }); - describe('shouldEntityBeTreatedAsPrimary', () => { it('will say a primary entity is primary', () => { expect(shouldEntityBeTreatedAsPrimary(datasetPrimaryWithSiblings)).toBeTruthy(); diff --git a/datahub-web-react/src/app/entity/shared/__tests__/utils.test.ts b/datahub-web-react/src/app/entity/shared/__tests__/utils.test.ts deleted file mode 100644 index 86dec46528b49..0000000000000 --- a/datahub-web-react/src/app/entity/shared/__tests__/utils.test.ts +++ /dev/null @@ -1,37 +0,0 @@ -import { getMatchPrioritizingPrimary } from '../utils'; - -const MOCK_MATCHED_FIELDS = [ - { - name: 'fieldPaths', - value: 'rain', - }, - { - name: 'description', - value: 'rainbow', - }, - { - name: 'fieldPaths', - value: 'rainbow', - }, - { - name: 'fieldPaths', - value: 'rainbows', - }, -]; - -describe('utils', () => { - describe('getMatchPrioritizingPrimary', () => { - it('prioritizes exact match', () => { - global.window.location.search = 'query=rainbow'; - const match = getMatchPrioritizingPrimary(MOCK_MATCHED_FIELDS, 'fieldPaths'); - expect(match?.value).toEqual('rainbow'); - expect(match?.name).toEqual('fieldPaths'); - }); - it('will accept first contains match', () => { - global.window.location.search = 'query=bow'; - const match = getMatchPrioritizingPrimary(MOCK_MATCHED_FIELDS, 'fieldPaths'); - expect(match?.value).toEqual('rainbow'); - expect(match?.name).toEqual('fieldPaths'); - }); - }); -}); diff --git a/datahub-web-react/src/app/entity/shared/components/styled/StripMarkdownText.tsx b/datahub-web-react/src/app/entity/shared/components/styled/StripMarkdownText.tsx index 59293c2b0eee5..212813ffcb643 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/StripMarkdownText.tsx +++ b/datahub-web-react/src/app/entity/shared/components/styled/StripMarkdownText.tsx @@ -17,6 +17,7 @@ export type Props = { suffix?: JSX.Element; limit?: number; shouldWrap?: boolean; + customRender?: (text: string) => JSX.Element; }; export const removeMarkdown = (text: string) => { @@ -29,7 +30,7 @@ export const removeMarkdown = (text: string) => { .replace(/^•/, ''); // remove first • }; -export default function NoMarkdownViewer({ children, readMore, suffix, limit, shouldWrap }: Props) { +export default function NoMarkdownViewer({ children, customRender, readMore, suffix, limit, shouldWrap }: Props) { let plainText = removeMarkdown(children || ''); if (limit) { @@ -44,7 +45,8 @@ export default function NoMarkdownViewer({ children, readMore, suffix, limit, sh return ( - {plainText} {showReadMore && <>{readMore}} {suffix} + {customRender ? 
customRender(plainText) : plainText} + {showReadMore && <>{readMore}} {suffix} ); } diff --git a/datahub-web-react/src/app/entity/shared/components/styled/StyledTag.tsx b/datahub-web-react/src/app/entity/shared/components/styled/StyledTag.tsx index c1a23811fdd7e..08087bfd79b8e 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/StyledTag.tsx +++ b/datahub-web-react/src/app/entity/shared/components/styled/StyledTag.tsx @@ -6,7 +6,15 @@ export const generateColor = new ColorHash({ saturation: 0.9, }); -export const StyledTag = styled(Tag)<{ $color: any; $colorHash?: string; fontSize?: number }>` +export const StyledTag = styled(Tag)<{ $color: any; $colorHash?: string; fontSize?: number; highlightTag?: boolean }>` + &&& { + ${(props) => + props.highlightTag && + ` + background: ${props.theme.styles['highlight-color']}; + border: 1px solid ${props.theme.styles['highlight-border-color']}; + `} + } ${(props) => props.fontSize && `font-size: ${props.fontSize}px;`} ${(props) => props.$colorHash && diff --git a/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearch.tsx b/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearch.tsx index 649645532d2f5..4119a341c5f1b 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearch.tsx +++ b/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearch.tsx @@ -16,7 +16,6 @@ import { FilterSet, GetSearchResultsParams, SearchResultsInterface } from './typ import { isListSubset } from '../../../utils'; import { EntityAndType } from '../../../types'; import { Message } from '../../../../../shared/Message'; -import { EntityActionProps } from '../../../../../recommendations/renderer/component/EntityNameList'; import { generateOrFilters } from '../../../../../search/utils/generateOrFilters'; import { mergeFilterSets } from '../../../../../search/utils/filterUtils'; import { useDownloadScrollAcrossEntitiesSearchResults } from '../../../../../search/utils/useDownloadScrollAcrossEntitiesSearchResults'; @@ -26,6 +25,7 @@ import { DownloadSearchResults, } from '../../../../../search/utils/types'; import { useEntityContext } from '../../../EntityContext'; +import { EntityActionProps } from './EntitySearchResults'; import { useUserContext } from '../../../../../context/useUserContext'; const Container = styled.div` @@ -251,7 +251,7 @@ export const EmbeddedListSearch = ({ }, [isSelectMode]); useEffect(() => { - if (defaultFilters) { + if (defaultFilters && filters.length === 0) { onChangeFilters(defaultFilters); } // only want to run once on page load diff --git a/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearchModal.tsx b/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearchModal.tsx index d80ada885330f..f88972bbda6a6 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearchModal.tsx +++ b/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearchModal.tsx @@ -3,9 +3,9 @@ import { Button, Modal } from 'antd'; import styled from 'styled-components'; import { FacetFilterInput } from '../../../../../../types.generated'; import { EmbeddedListSearch } from './EmbeddedListSearch'; -import { EntityActionProps } from '../../../../../recommendations/renderer/component/EntityNameList'; import { UnionType } from '../../../../../search/utils/constants'; import { FilterSet } from './types'; +import { EntityActionProps } from 
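The StyledTag change above toggles a highlight purely through a styled-components prop plus theme variables. A small sketch of that prop-driven CSS pattern, using a generic HighlightSpan and hard-coded placeholder colours in place of the theme lookup:

import styled from 'styled-components';

// The highlight block is only emitted when $highlight is true; falsy interpolations are dropped.
export const HighlightSpan = styled.span<{ $highlight?: boolean }>`
    padding: 2px 4px;
    ${({ $highlight }) =>
        $highlight &&
        `
        background: #fff3cd;
        border: 1px solid #ffe58f;
    `}
`;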
'./EntitySearchResults'; const SearchContainer = styled.div` height: 500px; diff --git a/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearchResults.tsx b/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearchResults.tsx index bad7f32db5361..e4d43f34dcba7 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearchResults.tsx +++ b/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearchResults.tsx @@ -3,11 +3,11 @@ import { Pagination, Typography } from 'antd'; import styled from 'styled-components'; import { FacetFilterInput, FacetMetadata, SearchResults as SearchResultType } from '../../../../../../types.generated'; import { SearchCfg } from '../../../../../../conf'; -import { EntityNameList, EntityActionProps } from '../../../../../recommendations/renderer/component/EntityNameList'; import { ReactComponent as LoadingSvg } from '../../../../../../images/datahub-logo-color-loading_pendulum.svg'; import { EntityAndType } from '../../../types'; import { UnionType } from '../../../../../search/utils/constants'; import { SearchFiltersSection } from '../../../../../search/SearchFiltersSection'; +import { EntitySearchResults, EntityActionProps } from './EntitySearchResults'; import MatchingViewsLabel from './MatchingViewsLabel'; const SearchBody = styled.div` @@ -125,8 +125,8 @@ export const EmbeddedListSearchResults = ({ )} {!loading && ( - searchResult.entity) || []} + ({ // when we add impact analysis, we will want to pipe the path to each element to the result this diff --git a/datahub-web-react/src/app/entity/shared/components/styled/search/EntitySearchResults.tsx b/datahub-web-react/src/app/entity/shared/components/styled/search/EntitySearchResults.tsx new file mode 100644 index 0000000000000..05bbf01f40cf6 --- /dev/null +++ b/datahub-web-react/src/app/entity/shared/components/styled/search/EntitySearchResults.tsx @@ -0,0 +1,98 @@ +import React from 'react'; +import { Checkbox } from 'antd'; +import styled from 'styled-components'; +import { EntityPath, EntityType, SearchResult } from '../../../../../../types.generated'; +import { EntityAndType } from '../../../types'; +import { useEntityRegistry } from '../../../../../useEntityRegistry'; +import { ListItem, StyledList, ThinDivider } from '../../../../../recommendations/renderer/component/EntityNameList'; + +const StyledCheckbox = styled(Checkbox)` + margin-right: 12px; +`; + +export type EntityActionProps = { + urn: string; + type: EntityType; +}; + +type AdditionalProperties = { + degree?: number; + paths?: EntityPath[]; +}; + +type Props = { + // additional data about the search result that is not part of the entity used to enrich the + // presentation of the entity. 
For example, metadata about how the entity is related for the case + // of impact analysis + additionalPropertiesList?: Array; + searchResults: Array; + isSelectMode?: boolean; + selectedEntities?: EntityAndType[]; + setSelectedEntities?: (entities: EntityAndType[]) => any; + bordered?: boolean; + entityAction?: React.FC; +}; + +export const EntitySearchResults = ({ + additionalPropertiesList, + searchResults, + isSelectMode, + selectedEntities = [], + setSelectedEntities, + bordered = true, + entityAction, +}: Props) => { + const entityRegistry = useEntityRegistry(); + const selectedEntityUrns = selectedEntities?.map((entity) => entity.urn) || []; + + if ( + additionalPropertiesList?.length !== undefined && + additionalPropertiesList.length > 0 && + additionalPropertiesList?.length !== searchResults.length + ) { + console.warn( + 'Warning: additionalPropertiesList length provided to EntityNameList does not match entity array length', + { additionalPropertiesList, searchResults }, + ); + } + + /** + * Invoked when a new entity is selected. Simply updates the state of the list of selected entities. + */ + const onSelectEntity = (selectedEntity: EntityAndType, selected: boolean) => { + if (selected) { + setSelectedEntities?.([...selectedEntities, selectedEntity]); + } else { + setSelectedEntities?.(selectedEntities?.filter((entity) => entity.urn !== selectedEntity.urn) || []); + } + }; + + const EntityAction = entityAction as React.FC; + + return ( + { + const { entity } = searchResult; + return ( + <> + + {isSelectMode && ( + = 0} + onChange={(e) => + onSelectEntity({ urn: entity.urn, type: entity.type }, e.target.checked) + } + /> + )} + {entityRegistry.renderSearchResult(entity.type, searchResult)} + {entityAction && } + + + + ); + }} + /> + ); +}; diff --git a/datahub-web-react/src/app/entity/shared/constants.ts b/datahub-web-react/src/app/entity/shared/constants.ts index e14affc95b6f9..447780fb0d641 100644 --- a/datahub-web-react/src/app/entity/shared/constants.ts +++ b/datahub-web-react/src/app/entity/shared/constants.ts @@ -23,6 +23,7 @@ export const ANTD_GRAY = { export const ANTD_GRAY_V2 = { 2: '#F3F5F6', 5: '#DDE0E4', + 6: '#B2B8BD', 8: '#5E666E', 10: '#1B1E22', }; diff --git a/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHealth.tsx b/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHealth.tsx index baef67a3d1c88..30713afa888b8 100644 --- a/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHealth.tsx +++ b/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHealth.tsx @@ -2,7 +2,7 @@ import React from 'react'; import styled from 'styled-components'; import { Link } from 'react-router-dom'; import { Health } from '../../../../../../types.generated'; -import { getHealthSummaryIcon, isUnhealthy } from '../../../../../shared/health/healthUtils'; +import { getHealthSummaryIcon, HealthSummaryIconType, isUnhealthy } from '../../../../../shared/health/healthUtils'; import { EntityHealthPopover } from './EntityHealthPopover'; const Container = styled.div` @@ -14,17 +14,19 @@ const Container = styled.div` type Props = { health: Health[]; baseUrl: string; + fontSize?: number; + tooltipPlacement?: any; }; -export const EntityHealth = ({ health, baseUrl }: Props) => { +export const EntityHealth = ({ health, baseUrl, fontSize, tooltipPlacement }: Props) => { const unhealthy = isUnhealthy(health); - const icon = getHealthSummaryIcon(health); + const icon = getHealthSummaryIcon(health, 
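The checkbox handling in the new EntitySearchResults component toggles membership in a selected-entities array. The same toggle in isolation, with a simplified entity shape and an added duplicate guard that is not in the component itself:

type SelectableEntity = { urn: string; type: string };

// Add or remove an entity from the selection without mutating the original array.
function toggleSelection(
    selected: SelectableEntity[],
    entity: SelectableEntity,
    isSelected: boolean,
): SelectableEntity[] {
    if (isSelected) {
        // Guard against double-adding if the handler fires twice for the same entity.
        return selected.some((e) => e.urn === entity.urn) ? selected : [...selected, entity];
    }
    return selected.filter((e) => e.urn !== entity.urn);
}

// Example: select, then deselect, the same dataset.
const selected = toggleSelection([], { urn: 'urn:li:dataset:a', type: 'DATASET' }, true);
const cleared = toggleSelection(selected, { urn: 'urn:li:dataset:a', type: 'DATASET' }, false);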
HealthSummaryIconType.FILLED, fontSize); return ( <> {(unhealthy && ( - + {icon} diff --git a/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHealthPopover.tsx b/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHealthPopover.tsx index 0d327a54a62d1..4dde3ffcbb6a4 100644 --- a/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHealthPopover.tsx +++ b/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHealthPopover.tsx @@ -50,10 +50,12 @@ type Props = { health: Health[]; baseUrl: string; children: React.ReactNode; + fontSize?: number; + placement?: any; }; -export const EntityHealthPopover = ({ health, baseUrl, children }: Props) => { - const icon = getHealthSummaryIcon(health, HealthSummaryIconType.OUTLINED); +export const EntityHealthPopover = ({ health, baseUrl, children, fontSize, placement = 'right' }: Props) => { + const icon = getHealthSummaryIcon(health, HealthSummaryIconType.OUTLINED, fontSize); const message = getHealthSummaryMessage(health); return ( { } color="#262626" - placement="right" + placement={placement} zIndex={10000000} > {children} diff --git a/datahub-web-react/src/app/entity/shared/containers/profile/header/PlatformContent/PlatformContentView.tsx b/datahub-web-react/src/app/entity/shared/containers/profile/header/PlatformContent/PlatformContentView.tsx index 5605bacc1d4e4..51a422ba93418 100644 --- a/datahub-web-react/src/app/entity/shared/containers/profile/header/PlatformContent/PlatformContentView.tsx +++ b/datahub-web-react/src/app/entity/shared/containers/profile/header/PlatformContent/PlatformContentView.tsx @@ -14,6 +14,7 @@ import ParentNodesView, { const LogoIcon = styled.span` display: flex; + gap: 4px; margin-right: 8px; `; diff --git a/datahub-web-react/src/app/entity/shared/siblingUtils.ts b/datahub-web-react/src/app/entity/shared/siblingUtils.ts index 2cad28d754a80..66481051055ec 100644 --- a/datahub-web-react/src/app/entity/shared/siblingUtils.ts +++ b/datahub-web-react/src/app/entity/shared/siblingUtils.ts @@ -2,7 +2,8 @@ import merge from 'deepmerge'; import { unionBy, keyBy, values } from 'lodash'; import { useLocation } from 'react-router-dom'; import * as QueryString from 'query-string'; -import { Dataset, Entity, MatchedField, Maybe, SiblingProperties } from '../../../types.generated'; +import { Dataset, Entity, Maybe, SiblingProperties } from '../../../types.generated'; +import { GenericEntityProperties } from './types'; export function stripSiblingsFromEntity(entity: any) { return { @@ -169,23 +170,17 @@ export const shouldEntityBeTreatedAsPrimary = (extractedBaseEntity: { siblings?: return isPrimary; }; -export const combineEntityDataWithSiblings = (baseEntity: T): T => { - if (!baseEntity) { - return baseEntity; - } - const baseEntityKey = Object.keys(baseEntity)[0]; - const extractedBaseEntity = baseEntity[baseEntityKey]; - +const combineEntityWithSiblings = (entity: GenericEntityProperties) => { // eslint-disable-next-line @typescript-eslint/dot-notation - const siblingAspect = extractedBaseEntity.siblings; + const siblingAspect = entity.siblings; if ((siblingAspect?.siblings || []).length === 0) { - return baseEntity; + return entity; } // eslint-disable-next-line @typescript-eslint/dot-notation - const siblings: T[] = siblingAspect?.siblings || []; + const siblings = siblingAspect?.siblings || []; - const isPrimary = shouldEntityBeTreatedAsPrimary(extractedBaseEntity); + const isPrimary = shouldEntityBeTreatedAsPrimary(entity); const 
combinedBaseEntity: any = siblings.reduce( (prev, current) => @@ -193,62 +188,75 @@ export const combineEntityDataWithSiblings = (baseEntity: T): T => { arrayMerge: combineMerge, customMerge: customMerge.bind({}, isPrimary), }), - extractedBaseEntity, - ) as T; + entity, + ); // Force the urn of the combined entity to the current entity urn. - combinedBaseEntity.urn = extractedBaseEntity.urn; + combinedBaseEntity.urn = entity.urn; + + return combinedBaseEntity; +}; + +export const combineEntityDataWithSiblings = (baseEntity: T): T => { + if (!baseEntity) { + return baseEntity; + } + const baseEntityKey = Object.keys(baseEntity)[0]; + const extractedBaseEntity = baseEntity[baseEntityKey]; + + // eslint-disable-next-line @typescript-eslint/dot-notation + const siblingAspect = extractedBaseEntity.siblings; + if ((siblingAspect?.siblings || []).length === 0) { + return baseEntity; + } + + const combinedBaseEntity = combineEntityWithSiblings(extractedBaseEntity); return { [baseEntityKey]: combinedBaseEntity } as unknown as T; }; -export type CombinedSearchResult = { +export type CombinedEntity = { entity: Entity; - matchedFields: MatchedField[]; - matchedEntities?: Entity[]; + matchedEntities?: Array; }; -export function combineSiblingsInSearchResults( - results: - | { - entity: Entity; - matchedFields: MatchedField[]; - }[] - | undefined, -) { - const combinedResults: CombinedSearchResult[] | undefined = []; - const siblingsToPair: Record = {}; - - // set sibling associations - results?.forEach((result) => { - if (result.entity.urn in siblingsToPair) { - // filter from repeating - // const siblingsCombinedResult = siblingsToPair[result.entity.urn]; - // siblingsCombinedResult.matchedEntities?.push(result.entity); - return; - } - - const combinedResult: CombinedSearchResult = result; - const { entity }: { entity: any } = result; - const siblingUrns = entity?.siblings?.siblings?.map((sibling) => sibling.urn) || []; - if (siblingUrns.length > 0) { - combinedResult.matchedEntities = entity.siblings.isPrimary - ? [stripSiblingsFromEntity(entity), ...entity.siblings.siblings] - : [...entity.siblings.siblings, stripSiblingsFromEntity(entity)]; - - combinedResult.matchedEntities = combinedResult.matchedEntities.filter( - (resultToFilter) => (resultToFilter as Dataset).exists, - ); +type CombinedEntityResult = + | { + skipped: true; + } + | { + skipped: false; + combinedEntity: CombinedEntity; + }; + +export function combineSiblingsForEntity(entity: Entity, visitedSiblingUrns: Set): CombinedEntityResult { + if (visitedSiblingUrns.has(entity.urn)) return { skipped: true }; + + const combinedEntity: CombinedEntity = { entity: combineEntityWithSiblings({ ...entity }) }; + const siblings = (combinedEntity.entity as GenericEntityProperties).siblings?.siblings ?? []; + const isPrimary = (combinedEntity.entity as GenericEntityProperties).siblings?.isPrimary; + const siblingUrns = siblings.map((sibling) => sibling?.urn); + + if (siblingUrns.length > 0) { + combinedEntity.matchedEntities = isPrimary + ? 
[stripSiblingsFromEntity(combinedEntity.entity), ...siblings] + : [...siblings, stripSiblingsFromEntity(combinedEntity.entity)]; + + combinedEntity.matchedEntities = combinedEntity.matchedEntities.filter( + (resultToFilter) => (resultToFilter as Dataset).exists, + ); + + siblingUrns.forEach((urn) => urn && visitedSiblingUrns.add(urn)); + } - siblingUrns.forEach((urn) => { - siblingsToPair[urn] = combinedResult; - }); - } - combinedResults.push(combinedResult); - }); + return { combinedEntity, skipped: false }; +} - return combinedResults; +export function createSiblingEntityCombiner() { + const visitedSiblingUrns: Set = new Set(); + return (entity: Entity) => combineSiblingsForEntity(entity, visitedSiblingUrns); } + // used to determine whether sibling entities should be shown merged or not export const SEPARATE_SIBLINGS_URL_PARAM = 'separate_siblings'; diff --git a/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx b/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx index 1aef497ced57b..bcce994c3f0f8 100644 --- a/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx +++ b/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx @@ -33,7 +33,7 @@ type LinkListProps = { }; export const LinkList = ({ refetch }: LinkListProps) => { - const { entityData } = useEntityData(); + const { urn: entityUrn, entityData } = useEntityData(); const entityRegistry = useEntityRegistry(); const [removeLinkMutation] = useRemoveLinkMutation(); const links = entityData?.institutionalMemory?.elements || []; @@ -41,7 +41,7 @@ export const LinkList = ({ refetch }: LinkListProps) => { const handleDeleteLink = async (metadata: InstitutionalMemoryMetadata) => { try { await removeLinkMutation({ - variables: { input: { linkUrl: metadata.url, resourceUrn: metadata.associatedUrn } }, + variables: { input: { linkUrl: metadata.url, resourceUrn: metadata.associatedUrn || entityUrn } }, }); message.success({ content: 'Link Removed', duration: 2 }); } catch (e: unknown) { diff --git a/datahub-web-react/src/app/entity/shared/utils.ts b/datahub-web-react/src/app/entity/shared/utils.ts index 7ec604785d1ff..a158cc9b7c119 100644 --- a/datahub-web-react/src/app/entity/shared/utils.ts +++ b/datahub-web-react/src/app/entity/shared/utils.ts @@ -1,9 +1,7 @@ -import * as QueryString from 'query-string'; import { Maybe } from 'graphql/jsutils/Maybe'; -import { Entity, EntityType, MatchedField, EntityRelationshipsResult, DataProduct } from '../../../types.generated'; +import { Entity, EntityType, EntityRelationshipsResult, DataProduct } from '../../../types.generated'; import { capitalizeFirstLetterOnly } from '../../shared/textUtil'; -import { FIELDS_TO_HIGHLIGHT } from '../dataset/search/highlights'; import { GenericEntityProperties } from './types'; export function dictToQueryStringParams(params: Record) { @@ -87,46 +85,6 @@ export const isListSubset = (l1, l2): boolean => { return l1.every((result) => l2.indexOf(result) >= 0); }; -function normalize(value: string) { - return value.trim().toLowerCase(); -} - -function fromQueryGetBestMatch(selectedMatchedFields: MatchedField[], rawQuery: string) { - const query = normalize(rawQuery); - // first lets see if there's an exact match between a field value and the query - const exactMatch = selectedMatchedFields.find((field) => normalize(field.value) === query); - if (exactMatch) { - return exactMatch; - } - - // if no exact match exists, we'll see if the entire query is 
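createSiblingEntityCombiner above closes over a Set of visited sibling urns so that, while streaming through results, an entity whose sibling has already been emitted is skipped rather than duplicated. A stripped-down sketch of that closure pattern with a simplified entity shape:

type SimpleEntity = { urn: string; siblingUrns?: string[] };
type CombineResult = { skipped: true } | { skipped: false; entity: SimpleEntity };

// The returned function remembers every sibling urn it has seen across calls.
function createCombiner() {
    const visited = new Set<string>();
    return (entity: SimpleEntity): CombineResult => {
        if (visited.has(entity.urn)) return { skipped: true };
        (entity.siblingUrns ?? []).forEach((urn) => visited.add(urn));
        return { skipped: false, entity };
    };
}

// Usage: 'b' was already surfaced as a sibling of 'a', so it is dropped.
const combine = createCombiner();
const deduped = [{ urn: 'a', siblingUrns: ['b'] }, { urn: 'b' }]
    .map(combine)
    .filter((result) => !result.skipped);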
contained in any of the values - const containedMatch = selectedMatchedFields.find((field) => normalize(field.value).includes(query)); - if (containedMatch) { - return containedMatch; - } - - // otherwise, just return whichever is first - return selectedMatchedFields[0]; -} - -export const getMatchPrioritizingPrimary = ( - matchedFields: MatchedField[], - primaryField: string, -): MatchedField | undefined => { - const { location } = window; - const params = QueryString.parse(location.search, { arrayFormat: 'comma' }); - const query: string = decodeURIComponent(params.query ? (params.query as string) : ''); - - const primaryMatches = matchedFields.filter((field) => field.name === primaryField); - if (primaryMatches.length > 0) { - return fromQueryGetBestMatch(primaryMatches, query); - } - - const matchesThatShouldBeShownOnFE = matchedFields.filter((field) => FIELDS_TO_HIGHLIGHT.has(field.name)); - - return fromQueryGetBestMatch(matchesThatShouldBeShownOnFE, query); -}; - function getGraphqlErrorCode(e) { if (e.graphQLErrors && e.graphQLErrors.length) { const firstError = e.graphQLErrors[0]; diff --git a/datahub-web-react/src/app/entity/user/preview/Preview.tsx b/datahub-web-react/src/app/entity/user/preview/Preview.tsx index 01f68d9065523..05baefb295b98 100644 --- a/datahub-web-react/src/app/entity/user/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/user/preview/Preview.tsx @@ -7,6 +7,7 @@ import { useEntityRegistry } from '../../../useEntityRegistry'; import { ANTD_GRAY } from '../../shared/constants'; import { IconStyleType } from '../../Entity'; import { CustomAvatar } from '../../../shared/avatar'; +import SearchTextHighlighter from '../../../search/matches/SearchTextHighlighter'; const PreviewContainer = styled.div` display: flex; @@ -80,11 +81,15 @@ export const Preview = ({ {entityRegistry.getEntityName(EntityType.CorpUser)} - {name || urn} + {name ? : urn} - {title && {title}} + {title && ( + + + + )} diff --git a/datahub-web-react/src/app/entity/view/select/ViewSelect.tsx b/datahub-web-react/src/app/entity/view/select/ViewSelect.tsx index 03689460eb02b..eda9b7d7fe2a4 100644 --- a/datahub-web-react/src/app/entity/view/select/ViewSelect.tsx +++ b/datahub-web-react/src/app/entity/view/select/ViewSelect.tsx @@ -1,4 +1,4 @@ -import React, { useEffect, useRef, useState } from 'react'; +import React, { CSSProperties, useEffect, useRef, useState } from 'react'; import { useHistory } from 'react-router'; import { Select } from 'antd'; import styled from 'styled-components'; @@ -55,11 +55,21 @@ const ViewSelectContainer = styled.div` .ant-select-selection-item { font-weight: 700; font-size: 14px; + text-align: left; } } } `; +const SelectStyled = styled(Select)` + min-width: 90px; + max-width: 200px; +`; + +type Props = { + dropdownStyle?: CSSProperties; +}; + /** * The View Select component allows you to select a View to apply to query on the current page. For example, * search, recommendations, and browse. @@ -69,7 +79,7 @@ const ViewSelectContainer = styled.div` * * In the event that a user refreshes their browser, the state of the view should be saved as well. 
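The removed getMatchPrioritizingPrimary helper ranked matched fields by exact match on the query, then by substring containment, then fell back to the first candidate (the deleted utils.test.ts above exercised exactly this ordering). A compact restatement of that ranking, independent of the generated GraphQL types:

type Match = { name: string; value: string };

const normalize = (value: string) => value.trim().toLowerCase();

// Exact match wins, then any value containing the query, then whatever came first.
function pickBestMatch(matches: Match[], rawQuery: string): Match | undefined {
    const query = normalize(rawQuery);
    return (
        matches.find((m) => normalize(m.value) === query) ??
        matches.find((m) => normalize(m.value).includes(query)) ??
        matches[0]
    );
}

// For the query "rainbow", 'rainbow' beats both 'rain' and 'rainbows'.
const best = pickBestMatch(
    [
        { name: 'fieldPaths', value: 'rain' },
        { name: 'fieldPaths', value: 'rainbow' },
        { name: 'fieldPaths', value: 'rainbows' },
    ],
    'rainbow',
);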
*/ -export const ViewSelect = () => { +export const ViewSelect = ({ dropdownStyle = {} }: Props) => { const history = useHistory(); const userContext = useUserContext(); const [isOpen, setIsOpen] = useState(false); @@ -188,12 +198,11 @@ export const ViewSelect = () => { return ( - + {viewBuilderDisplayState.visible && ( { ref={clearButtonRef} onClick={onHandleClickClear} > - All Entities + View all ); diff --git a/datahub-web-react/src/app/home/HomePageHeader.tsx b/datahub-web-react/src/app/home/HomePageHeader.tsx index def413e13213f..5919d2dbf5b7e 100644 --- a/datahub-web-react/src/app/home/HomePageHeader.tsx +++ b/datahub-web-react/src/app/home/HomePageHeader.tsx @@ -273,6 +273,7 @@ export const HomePageHeader = () => { autoCompleteStyle={styles.searchBox} entityRegistry={entityRegistry} viewsEnabled={viewsEnabled} + combineSiblings showQuickFilters /> {searchResultsToShow && searchResultsToShow.length > 0 && ( diff --git a/datahub-web-react/src/app/lineage/LineageEntityNode.tsx b/datahub-web-react/src/app/lineage/LineageEntityNode.tsx index 4526e3a225ce2..f5be1d57db070 100644 --- a/datahub-web-react/src/app/lineage/LineageEntityNode.tsx +++ b/datahub-web-react/src/app/lineage/LineageEntityNode.tsx @@ -12,11 +12,12 @@ import { getShortenedTitle, nodeHeightFromTitleLength } from './utils/titleUtils import { LineageExplorerContext } from './utils/LineageExplorerContext'; import { useGetEntityLineageLazyQuery } from '../../graphql/lineage.generated'; import { useIsSeparateSiblingsMode } from '../entity/shared/siblingUtils'; -import { centerX, centerY, iconHeight, iconWidth, iconX, iconY, textX, width } from './constants'; +import { centerX, centerY, iconHeight, iconWidth, iconX, iconY, textX, width, healthX, healthY } from './constants'; import LineageEntityColumns from './LineageEntityColumns'; import { convertInputFieldsToSchemaFields } from './utils/columnLineageUtils'; import ManageLineageMenu from './manage/ManageLineageMenu'; import { useGetLineageTimeParams } from './utils/useGetLineageTimeParams'; +import { EntityHealth } from '../entity/shared/containers/profile/header/EntityHealth'; const CLICK_DELAY_THRESHOLD = 1000; const DRAG_DISTANCE_THRESHOLD = 20; @@ -136,6 +137,11 @@ export default function LineageEntityNode({ capitalizeFirstLetterOnly(node.data.subtype) || (node.data.type && entityRegistry.getEntityName(node.data.type)); + // Health + const { health } = node.data; + const baseUrl = node.data.type && node.data.urn && entityRegistry.getEntityUrl(node.data.type, node.data.urn); + const hasHealth = (health && baseUrl) || false; + return ( {unexploredHiddenChildren && (isHovered || isSelected) ? ( @@ -359,6 +365,16 @@ export default function LineageEntityNode({ {getShortenedTitle(node.data.name, width)} )} + + {hasHealth && ( + + )} + {unexploredHiddenChildren && isHovered ? 
( ; downstreamRelationships?: Array; + health?: Health[]; }; export type VizNode = { diff --git a/datahub-web-react/src/app/lineage/utils/constructFetchedNode.ts b/datahub-web-react/src/app/lineage/utils/constructFetchedNode.ts index 143b226bda687..778d0e325f7cb 100644 --- a/datahub-web-react/src/app/lineage/utils/constructFetchedNode.ts +++ b/datahub-web-react/src/app/lineage/utils/constructFetchedNode.ts @@ -67,6 +67,7 @@ export default function constructFetchedNode( canEditLineage: fetchedNode.canEditLineage, upstreamRelationships: fetchedNode?.upstreamRelationships || [], downstreamRelationships: fetchedNode?.downstreamRelationships || [], + health: fetchedNode?.health, }; // eslint-disable-next-line no-param-reassign diff --git a/datahub-web-react/src/app/lineage/utils/constructTree.ts b/datahub-web-react/src/app/lineage/utils/constructTree.ts index 8374509ad74eb..7da6fc56b57bd 100644 --- a/datahub-web-react/src/app/lineage/utils/constructTree.ts +++ b/datahub-web-react/src/app/lineage/utils/constructTree.ts @@ -100,6 +100,7 @@ export default function constructTree( canEditLineage: fetchedEntity?.canEditLineage, upstreamRelationships: fetchedEntity?.upstreamRelationships || [], downstreamRelationships: fetchedEntity?.downstreamRelationships || [], + health: fetchedEntity?.health, }; const lineageConfig = entityRegistry.getLineageVizConfig(entityAndType.type, entityAndType.entity); let updatedLineageConfig = { ...lineageConfig }; diff --git a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx index 0a7d16ade0ac0..319c8ed0a3e1d 100644 --- a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx +++ b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx @@ -34,6 +34,8 @@ import ExternalUrlButton from '../entity/shared/ExternalUrlButton'; import EntityPaths from './EntityPaths/EntityPaths'; import { DataProductLink } from '../shared/tags/DataProductLink'; import { EntityHealth } from '../entity/shared/containers/profile/header/EntityHealth'; +import SearchTextHighlighter from '../search/matches/SearchTextHighlighter'; +import { getUniqueOwners } from './utils'; const PreviewContainer = styled.div` display: flex; @@ -172,6 +174,7 @@ interface Props { deprecation?: Deprecation | null; topUsers?: Array | null; externalUrl?: string | null; + entityTitleSuffix?: React.ReactNode; subHeader?: React.ReactNode; snippet?: React.ReactNode; insights?: Array | null; @@ -224,6 +227,7 @@ export default function DefaultPreviewCard({ titleSizePx, dataTestID, externalUrl, + entityTitleSuffix, onClick, degree, parentContainers, @@ -260,6 +264,7 @@ export default function DefaultPreviewCard({ }; const shouldShowRightColumn = (topUsers && topUsers.length > 0) || (owners && owners.length > 0); + const uniqueOwners = getUniqueOwners(owners); return ( @@ -287,14 +292,14 @@ export default function DefaultPreviewCard({ ) : ( - {name || ' '} + )} {deprecation?.deprecated && ( )} - {health && health.length > 0 && } + {health && health.length > 0 ? : null} {externalUrl && ( )} + {entityTitleSuffix} - {degree !== undefined && degree !== null && ( ) : undefined } + customRender={(text) => } > {description} @@ -375,12 +381,14 @@ export default function DefaultPreviewCard({ )} - {(topUsers?.length || 0) > 0 && (owners?.length || 0) > 0 && } - {owners && owners?.length > 0 && ( + {(topUsers?.length || 0) > 0 && (uniqueOwners?.length || 0) > 0 && ( + + )} + {uniqueOwners && uniqueOwners?.length > 0 && ( Owners
- owner.owner)} max={2} /> + owner.owner)} max={2} />
)} diff --git a/datahub-web-react/src/app/preview/utils.ts b/datahub-web-react/src/app/preview/utils.ts new file mode 100644 index 0000000000000..f5a562dc2ffe7 --- /dev/null +++ b/datahub-web-react/src/app/preview/utils.ts @@ -0,0 +1,6 @@ +import { Owner } from '../../types.generated'; + +export function getUniqueOwners(owners?: Owner[] | null) { + const uniqueOwnerUrns = new Set(); + return owners?.filter((owner) => !uniqueOwnerUrns.has(owner.owner.urn) && uniqueOwnerUrns.add(owner.owner.urn)); +} diff --git a/datahub-web-react/src/app/recommendations/renderer/component/EntityNameList.tsx b/datahub-web-react/src/app/recommendations/renderer/component/EntityNameList.tsx index a0fea45c9ed2d..4ff78e64625b1 100644 --- a/datahub-web-react/src/app/recommendations/renderer/component/EntityNameList.tsx +++ b/datahub-web-react/src/app/recommendations/renderer/component/EntityNameList.tsx @@ -1,19 +1,14 @@ import React from 'react'; -import { Divider, List, Checkbox } from 'antd'; +import { Divider, List } from 'antd'; import styled from 'styled-components'; -import { Entity, EntityType, EntityPath } from '../../../../types.generated'; +import { Entity } from '../../../../types.generated'; import { useEntityRegistry } from '../../../useEntityRegistry'; import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; import { IconStyleType } from '../../../entity/Entity'; -import { EntityAndType } from '../../../entity/shared/types'; import { getPlatformName } from '../../../entity/shared/utils'; import { capitalizeFirstLetterOnly } from '../../../shared/textUtil'; -const StyledCheckbox = styled(Checkbox)` - margin-right: 12px; -`; - -const StyledList = styled(List)` +export const StyledList = styled(List)` overflow-y: auto; height: 100%; margin-top: -1px; @@ -45,7 +40,7 @@ const StyledList = styled(List)` } ` as typeof List; -const ListItem = styled.div<{ isSelectMode: boolean }>` +export const ListItem = styled.div<{ isSelectMode: boolean }>` padding-right: 40px; padding-left: ${(props) => (props.isSelectMode ? '20px' : '40px')}; padding-top: 16px; @@ -54,78 +49,23 @@ const ListItem = styled.div<{ isSelectMode: boolean }>` align-items: center; `; -const ThinDivider = styled(Divider)` +export const ThinDivider = styled(Divider)` padding: 0px; margin: 0px; `; -export type EntityActionProps = { - urn: string; - type: EntityType; -}; - -type AdditionalProperties = { - degree?: number; - paths?: EntityPath[]; -}; - type Props = { - // additional data about the search result that is not part of the entity used to enrich the - // presentation of the entity. 
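The new getUniqueOwners helper deduplicates the owner list by owner urn with a Set inside a filter callback (Set.add returns the Set, which is truthy, so only first occurrences pass the filter). The same idea in isolation with a minimal owner shape:

type SimpleOwner = { owner: { urn: string } };

// Keep only the first occurrence of each owner urn.
function uniqueByOwnerUrn(owners?: SimpleOwner[] | null): SimpleOwner[] | undefined {
    const seen = new Set<string>();
    return owners?.filter((o) => !seen.has(o.owner.urn) && !!seen.add(o.owner.urn));
}

const owners = uniqueByOwnerUrn([
    { owner: { urn: 'urn:li:corpuser:a' } },
    { owner: { urn: 'urn:li:corpuser:a' } }, // dropped as a duplicate
    { owner: { urn: 'urn:li:corpuser:b' } },
]);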
For example, metadata about how the entity is related for the case - // of impact analysis - additionalPropertiesList?: Array; entities: Array; onClick?: (index: number) => void; - isSelectMode?: boolean; - selectedEntities?: EntityAndType[]; - setSelectedEntities?: (entities: EntityAndType[]) => any; - bordered?: boolean; - entityAction?: React.FC; }; -export const EntityNameList = ({ - additionalPropertiesList, - entities, - onClick, - isSelectMode, - selectedEntities = [], - setSelectedEntities, - bordered = true, - entityAction, -}: Props) => { +export const EntityNameList = ({ entities, onClick }: Props) => { const entityRegistry = useEntityRegistry(); - const selectedEntityUrns = selectedEntities?.map((entity) => entity.urn) || []; - - if ( - additionalPropertiesList?.length !== undefined && - additionalPropertiesList.length > 0 && - additionalPropertiesList?.length !== entities.length - ) { - console.warn( - 'Warning: additionalPropertiesList length provided to EntityNameList does not match entity array length', - { additionalPropertiesList, entities }, - ); - } - - /** - * Invoked when a new entity is selected. Simply updates the state of the list of selected entities. - */ - const onSelectEntity = (selectedEntity: EntityAndType, selected: boolean) => { - if (selected) { - setSelectedEntities?.([...selectedEntities, selectedEntity]); - } else { - setSelectedEntities?.(selectedEntities?.filter((entity) => entity.urn !== selectedEntity.urn) || []); - } - }; - - const EntityAction = entityAction as React.FC; return ( { - const additionalProperties = additionalPropertiesList?.[index]; const genericProps = entityRegistry.getGenericEntityProperties(entity.type, entity); const platformLogoUrl = genericProps?.platform?.properties?.logoUrl; const platformName = getPlatformName(genericProps); @@ -140,15 +80,7 @@ export const EntityNameList = ({ return ( <> - - {isSelectMode && ( - = 0} - onChange={(e) => - onSelectEntity({ urn: entity.urn, type: entity.type }, e.target.checked) - } - /> - )} + onClick?.(index)} entityCount={entityCount} - degree={additionalProperties?.degree} deprecation={deprecation} - paths={additionalProperties?.paths} health={health || undefined} /> - {entityAction && } diff --git a/datahub-web-react/src/app/search/AdvancedFilterSelectValueModal.tsx b/datahub-web-react/src/app/search/AdvancedFilterSelectValueModal.tsx index e5f58a8662acc..c562fc6e8349a 100644 --- a/datahub-web-react/src/app/search/AdvancedFilterSelectValueModal.tsx +++ b/datahub-web-react/src/app/search/AdvancedFilterSelectValueModal.tsx @@ -23,9 +23,7 @@ import { REMOVED_FILTER_NAME, TAGS_FILTER_NAME, TYPE_NAMES_FILTER_NAME, - DATA_PRODUCTS_FILTER_NAME, } from './utils/constants'; -import SetDataProductModal from '../entity/shared/containers/profile/sidebar/DataProduct/SetDataProductModal'; type Props = { facet?: FacetMetadata | null; @@ -80,23 +78,6 @@ export const AdvancedFilterSelectValueModal = ({ ); } - if (filterField === DATA_PRODUCTS_FILTER_NAME) { - return ( - initialValues?.includes(agg?.entity?.urn || ''))?.entity || null - } - onModalClose={onCloseModal} - onOkOverride={(dataProductUrn) => { - onSelect([dataProductUrn]); - onCloseModal(); - }} - /> - ); - } - if (filterField === CONTAINER_FILTER_NAME) { return ( 0 ? 
suggestions[0].text : ''; + const refineSearchText = getRefineSearchText(filters, viewUrn); + + const onClickExploreAll = useCallback(() => { + analytics.event({ type: EventType.SearchResultsExploreAllClickEvent }); + navigateToSearchUrl({ query: '*', history }); + }, [history]); + + const searchForSuggestion = () => { + navigateToSearchUrl({ query: suggestText, history }); + }; + + const clearFiltersAndView = () => { + navigateToSearchUrl({ query, history }); + userContext.updateLocalState({ + ...userContext.localState, + selectedViewUrn: undefined, + }); + }; + + return ( + +
No results found for "{query}"
+ {refineSearchText && ( + <> + Try {refineSearchText}{' '} + {suggestText && ( + <> + or searching for {suggestText} + + )} + + )} + {!refineSearchText && suggestText && ( + <> + Did you mean {suggestText} + + )} + {!refineSearchText && !suggestText && ( + + )} +
+ ); +} diff --git a/datahub-web-react/src/app/search/EntityGroupSearchResults.tsx b/datahub-web-react/src/app/search/EntityGroupSearchResults.tsx deleted file mode 100644 index 9b577048145c5..0000000000000 --- a/datahub-web-react/src/app/search/EntityGroupSearchResults.tsx +++ /dev/null @@ -1,98 +0,0 @@ -import { ArrowRightOutlined } from '@ant-design/icons'; -import { Button, Card, Divider, List, Space, Typography } from 'antd'; -import { ListProps } from 'antd/lib/list'; -import * as React from 'react'; -import { useHistory } from 'react-router-dom'; -import styled from 'styled-components'; -import { EntityType, SearchResult } from '../../types.generated'; -import { IconStyleType } from '../entity/Entity'; -import { useEntityRegistry } from '../useEntityRegistry'; -import { navigateToSearchUrl } from './utils/navigateToSearchUrl'; -import analytics, { EventType } from '../analytics'; - -const styles = { - header: { marginBottom: 20 }, - resultHeaderCardBody: { padding: '16px 24px' }, - resultHeaderCard: { right: '52px', top: '-40px', position: 'absolute' }, - seeAllButton: { fontSize: 18 }, - resultsContainer: { width: '100%', padding: '40px 132px' }, -}; - -const ResultList = styled(List)` - &&& { - width: 100%; - border-color: ${(props) => props.theme.styles['border-color-base']}; - margin-top: 8px; - padding: 16px 48px; - box-shadow: ${(props) => props.theme.styles['box-shadow']}; - } -`; - -interface Props { - type: EntityType; - query: string; - searchResults: Array; -} - -export const EntityGroupSearchResults = ({ type, query, searchResults }: Props) => { - const history = useHistory(); - const entityRegistry = useEntityRegistry(); - - const onResultClick = (result: SearchResult, index: number) => { - analytics.event({ - type: EventType.SearchResultClickEvent, - query, - entityUrn: result.entity.urn, - entityType: result.entity.type, - index, - total: searchResults.length, - }); - }; - - return ( - - >> - header={ - - {entityRegistry.getCollectionName(type)} - - {entityRegistry.getIcon(type, 36, IconStyleType.ACCENT)} - - - } - footer={ - searchResults.length > 0 && ( - - ) - } - dataSource={searchResults as SearchResult[]} - split={false} - renderItem={(searchResult, index) => ( - <> - onResultClick(searchResult, index)}> - {entityRegistry.renderSearchResult(type, searchResult)} - - {index < searchResults.length - 1 && } - - )} - bordered - /> - - ); -}; diff --git a/datahub-web-react/src/app/search/PostLinkCard.tsx b/datahub-web-react/src/app/search/PostLinkCard.tsx index 04308632c61c9..2111c0b25ad84 100644 --- a/datahub-web-react/src/app/search/PostLinkCard.tsx +++ b/datahub-web-react/src/app/search/PostLinkCard.tsx @@ -39,12 +39,17 @@ const TextContainer = styled.div` flex: 2; `; -const TextWrapper = styled.div` - text-align: left; +const FlexWrapper = styled.div<{ alignCenter?: boolean }>` display: flex; flex-direction: column; justify-content: center; flex: 2; + ${(props) => props.alignCenter && 'align-items: center;'} +`; + +const TextWrapper = styled.div` + display: flex; + flex-direction: column; `; const HeaderText = styled(Typography.Text)` @@ -74,19 +79,21 @@ export const PostLinkCard = ({ linkPost }: Props) => { const link = linkPost?.content?.link || ''; return ( - + {hasMedia && ( )} - - Link - - {linkPost?.content?.title} - - + + + Link + + {linkPost?.content?.title} + + + diff --git a/datahub-web-react/src/app/search/PostTextCard.tsx b/datahub-web-react/src/app/search/PostTextCard.tsx index 1bba55425fe0d..15b34e37fc01c 100644 --- 
a/datahub-web-react/src/app/search/PostTextCard.tsx +++ b/datahub-web-react/src/app/search/PostTextCard.tsx @@ -7,7 +7,6 @@ import { Post } from '../../types.generated'; const CardContainer = styled.div` display: flex; flex-direction: row; - min-height: 140px; border: 1px solid ${ANTD_GRAY[4]}; border-radius: 12px; box-shadow: ${(props) => props.theme.styles['box-shadow']}; @@ -15,6 +14,7 @@ const CardContainer = styled.div` box-shadow: ${(props) => props.theme.styles['box-shadow-hover']}; } white-space: unset; + padding-bottom: 4px; `; const TextContainer = styled.div` @@ -28,6 +28,9 @@ const TextContainer = styled.div` const TitleText = styled(Typography.Title)` word-break: break-word; min-height: 20px; + &&& { + margin-top: 8px; + } `; const HeaderText = styled(Typography.Text)` diff --git a/datahub-web-react/src/app/search/SearchBar.tsx b/datahub-web-react/src/app/search/SearchBar.tsx index 97be6ab6b65e3..fb10e1ca0026e 100644 --- a/datahub-web-react/src/app/search/SearchBar.tsx +++ b/datahub-web-react/src/app/search/SearchBar.tsx @@ -3,7 +3,7 @@ import { Input, AutoComplete, Button } from 'antd'; import { CloseCircleFilled, SearchOutlined } from '@ant-design/icons'; import styled from 'styled-components/macro'; import { useHistory } from 'react-router'; -import { AutoCompleteResultForEntity, Entity, EntityType, FacetFilterInput, ScenarioType } from '../../types.generated'; +import { AutoCompleteResultForEntity, EntityType, FacetFilterInput, ScenarioType } from '../../types.generated'; import EntityRegistry from '../entity/EntityRegistry'; import filterSearchQuery from './utils/filterSearchQuery'; import { ANTD_GRAY, ANTD_GRAY_V2 } from '../entity/shared/constants'; @@ -23,6 +23,7 @@ import { navigateToSearchUrl } from './utils/navigateToSearchUrl'; import { getQuickFilterDetails } from './autoComplete/quickFilters/utils'; import ViewAllSearchItem from './ViewAllSearchItem'; import { ViewSelect } from '../entity/view/select/ViewSelect'; +import { combineSiblingsInAutoComplete } from './utils/combineSiblingsInAutoComplete'; const StyledAutoComplete = styled(AutoComplete)` width: 100%; @@ -88,15 +89,6 @@ const QUICK_FILTER_AUTO_COMPLETE_OPTION = { ], }; -const renderItem = (query: string, entity: Entity) => { - return { - value: entity.urn, - label: , - type: entity.type, - style: { padding: '12px 12px 12px 16px' }, - }; -}; - const renderRecommendedQuery = (query: string) => { return { value: query, @@ -123,6 +115,7 @@ interface Props { hideRecommendations?: boolean; showQuickFilters?: boolean; viewsEnabled?: boolean; + combineSiblings?: boolean; setIsSearchBarFocused?: (isSearchBarFocused: boolean) => void; onFocus?: () => void; onBlur?: () => void; @@ -149,6 +142,7 @@ export const SearchBar = ({ hideRecommendations, showQuickFilters, viewsEnabled = false, + combineSiblings = false, setIsSearchBarFocused, onFocus, onBlur, @@ -227,14 +221,26 @@ export const SearchBar = ({ ]; }, [showQuickFilters, suggestions.length, effectiveQuery, selectedQuickFilter, entityRegistry]); - const autoCompleteEntityOptions = useMemo( - () => - suggestions.map((entity: AutoCompleteResultForEntity) => ({ - label: , - options: [...entity.entities.map((e: Entity) => renderItem(effectiveQuery, e))], - })), - [effectiveQuery, suggestions], - ); + const autoCompleteEntityOptions = useMemo(() => { + return suggestions.map((suggestion: AutoCompleteResultForEntity) => { + const combinedSuggestion = combineSiblingsInAutoComplete(suggestion, { combineSiblings }); + return { + label: , + options: 
combinedSuggestion.combinedEntities.map((combinedEntity) => ({ + value: combinedEntity.entity.urn, + label: ( + + ), + type: combinedEntity.entity.type, + style: { padding: '12px 12px 12px 16px' }, + })), + }; + }); + }, [combineSiblings, effectiveQuery, suggestions]); const previousSelectedQuickFilterValue = usePrevious(selectedQuickFilter?.value); useEffect(() => { @@ -371,7 +377,15 @@ export const SearchBar = ({ onKeyUp={handleStopPropagation} onKeyDown={handleStopPropagation} > - +
)} diff --git a/datahub-web-react/src/app/search/SearchPage.tsx b/datahub-web-react/src/app/search/SearchPage.tsx index ce353640d8179..6387f0ef8c05e 100644 --- a/datahub-web-react/src/app/search/SearchPage.tsx +++ b/datahub-web-react/src/app/search/SearchPage.tsx @@ -59,6 +59,7 @@ export const SearchPage = () => { orFilters, viewUrn, sortInput, + searchFlags: { getSuggestions: true }, }, }, }); @@ -235,6 +236,7 @@ export const SearchPage = () => { error={error} searchResponse={data?.searchAcrossEntities} facets={data?.searchAcrossEntities?.facets} + suggestions={data?.searchAcrossEntities?.suggestions || []} selectedFilters={filters} loading={loading} onChangeFilters={onChangeFilters} diff --git a/datahub-web-react/src/app/search/SearchResultList.tsx b/datahub-web-react/src/app/search/SearchResultList.tsx index b860e7b670c33..386b22f34602b 100644 --- a/datahub-web-react/src/app/search/SearchResultList.tsx +++ b/datahub-web-react/src/app/search/SearchResultList.tsx @@ -1,17 +1,16 @@ -import React, { useCallback } from 'react'; -import { Button, Checkbox, Divider, Empty, List, ListProps } from 'antd'; +import React from 'react'; +import { Checkbox, Divider, List, ListProps } from 'antd'; import styled from 'styled-components'; -import { useHistory } from 'react-router'; -import { RocketOutlined } from '@ant-design/icons'; -import { navigateToSearchUrl } from './utils/navigateToSearchUrl'; import { ANTD_GRAY } from '../entity/shared/constants'; -import { CombinedSearchResult, SEPARATE_SIBLINGS_URL_PARAM } from '../entity/shared/siblingUtils'; +import { SEPARATE_SIBLINGS_URL_PARAM } from '../entity/shared/siblingUtils'; import { CompactEntityNameList } from '../recommendations/renderer/component/CompactEntityNameList'; import { useEntityRegistry } from '../useEntityRegistry'; -import { SearchResult } from '../../types.generated'; +import { SearchResult, SearchSuggestion } from '../../types.generated'; import analytics, { EventType } from '../analytics'; import { EntityAndType } from '../entity/shared/types'; import { useIsSearchV2 } from './useSearchAndBrowseVersion'; +import { CombinedSearchResult } from './utils/combineSiblingsInSearchResults'; +import EmptySearchResults from './EmptySearchResults'; const ResultList = styled(List)` &&& { @@ -27,13 +26,6 @@ const StyledCheckbox = styled(Checkbox)` margin-right: 12px; `; -const NoDataContainer = styled.div` - > div { - margin-top: 28px; - margin-bottom: 28px; - } -`; - const ThinDivider = styled(Divider)` margin-top: 16px; margin-bottom: 16px; @@ -69,6 +61,7 @@ type Props = { isSelectMode: boolean; selectedEntities: EntityAndType[]; setSelectedEntities: (entities: EntityAndType[]) => any; + suggestions: SearchSuggestion[]; }; export const SearchResultList = ({ @@ -78,17 +71,12 @@ export const SearchResultList = ({ isSelectMode, selectedEntities, setSelectedEntities, + suggestions, }: Props) => { - const history = useHistory(); const entityRegistry = useEntityRegistry(); const selectedEntityUrns = selectedEntities.map((entity) => entity.urn); const showSearchFiltersV2 = useIsSearchV2(); - const onClickExploreAll = useCallback(() => { - analytics.event({ type: EventType.SearchResultsExploreAllClickEvent }); - navigateToSearchUrl({ query: '*', history }); - }, [history]); - const onClickResult = (result: SearchResult, index: number) => { analytics.event({ type: EventType.SearchResultClickEvent, @@ -117,21 +105,9 @@ export const SearchResultList = ({ id="search-result-list" dataSource={searchResults} split={false} - locale={{ - emptyText: ( - - - 
- - ), - }} + locale={{ emptyText: }} renderItem={(item, index) => ( - + onClickResult(item, index)} diff --git a/datahub-web-react/src/app/search/SearchResults.tsx b/datahub-web-react/src/app/search/SearchResults.tsx index 4885715fe200f..d21213f462f54 100644 --- a/datahub-web-react/src/app/search/SearchResults.tsx +++ b/datahub-web-react/src/app/search/SearchResults.tsx @@ -2,11 +2,10 @@ import React from 'react'; import { Pagination, Typography } from 'antd'; import styled from 'styled-components/macro'; import { Message } from '../shared/Message'; -import { Entity, FacetFilterInput, FacetMetadata, MatchedField } from '../../types.generated'; +import { Entity, FacetFilterInput, FacetMetadata, MatchedField, SearchSuggestion } from '../../types.generated'; import { SearchCfg } from '../../conf'; import { SearchResultsRecommendations } from './SearchResultsRecommendations'; import SearchExtendedMenu from '../entity/shared/components/styled/search/SearchExtendedMenu'; -import { combineSiblingsInSearchResults } from '../entity/shared/siblingUtils'; import { SearchSelectBar } from '../entity/shared/components/styled/search/SearchSelectBar'; import { SearchResultList } from './SearchResultList'; import { isListSubset } from '../entity/shared/utils'; @@ -26,6 +25,8 @@ import { BrowseProvider } from './sidebar/BrowseContext'; import { useIsBrowseV2, useIsSearchV2 } from './useSearchAndBrowseVersion'; import useToggleSidebar from './useToggleSidebar'; import SearchSortSelect from './sorting/SearchSortSelect'; +import { combineSiblingsInSearchResults } from './utils/combineSiblingsInSearchResults'; +import SearchQuerySuggester from './suggestions/SearchQuerySugggester'; const SearchResultsWrapper = styled.div<{ v2Styles: boolean }>` display: flex; @@ -131,6 +132,7 @@ interface Props { setNumResultsPerPage: (numResults: number) => void; isSelectMode: boolean; selectedEntities: EntityAndType[]; + suggestions: SearchSuggestion[]; setSelectedEntities: (entities: EntityAndType[]) => void; setIsSelectMode: (showSelectMode: boolean) => any; onChangeSelectAll: (selected: boolean) => void; @@ -155,6 +157,7 @@ export const SearchResults = ({ setNumResultsPerPage, isSelectMode, selectedEntities, + suggestions, setIsSelectMode, setSelectedEntities, onChangeSelectAll, @@ -238,6 +241,7 @@ export const SearchResults = ({ {(error && ) || (!loading && ( + {totalResults > 0 && } - - SearchCfg.RESULTS_PER_PAGE} - onShowSizeChange={(_currNum, newNum) => setNumResultsPerPage(newNum)} - pageSizeOptions={['10', '20', '50', '100']} - /> - + {totalResults > 0 && ( + + SearchCfg.RESULTS_PER_PAGE} + onShowSizeChange={(_currNum, newNum) => setNumResultsPerPage(newNum)} + pageSizeOptions={['10', '20', '50', '100']} + /> + + )} {authenticatedUserUrn && ( ; hasParentTooltip: boolean; } -export default function AutoCompleteEntity({ query, entity, hasParentTooltip }: Props) { +export default function AutoCompleteEntity({ query, entity, siblings, hasParentTooltip }: Props) { const entityRegistry = useEntityRegistry(); const genericEntityProps = entityRegistry.getGenericEntityProperties(entity.type, entity); - const platformName = getPlatformName(genericEntityProps); - const platformLogoUrl = genericEntityProps?.platform?.properties?.logoUrl; const displayName = entityRegistry.getDisplayName(entity.type, entity); - const icon = - (platformLogoUrl && ) || - entityRegistry.getIcon(entity.type, 12, IconStyleType.ACCENT); const { matchedText, unmatchedText } = getAutoCompleteEntityText(displayName, query); + const entities = 
siblings?.length ? siblings : [entity]; + const platforms = + genericEntityProps?.siblingPlatforms + ?.map( + (platform) => + getPlatformName(entityRegistry.getGenericEntityProperties(EntityType.DataPlatform, platform)) || '', + ) + .filter(Boolean) ?? []; + const parentContainers = genericEntityProps?.parentContainers?.containers || []; // Need to reverse parentContainers since it returns direct parent first. const orderedParentContainers = [...parentContainers].reverse(); const subtype = genericEntityProps?.subTypes?.typeNames?.[0]; + const showPlatforms = !!platforms.length; + const showPlatformDivider = !!platforms.length && !!parentContainers.length; + const showParentContainers = !!parentContainers.length; + const showHeader = showPlatforms || showParentContainers; + return ( - {icon} - + {showHeader && ( + + + {entities.map((ent) => ( + + ))} + + {showPlatforms && } + {showPlatformDivider && } + {showParentContainers && } + + )} { + const entityRegistry = useEntityRegistry(); + + const genericEntityProps = entityRegistry.getGenericEntityProperties(entity.type, entity); + const platformLogoUrl = genericEntityProps?.platform?.properties?.logoUrl; + const platformName = getPlatformName(genericEntityProps); + return ( + (platformLogoUrl && ) || + entityRegistry.getIcon(entity.type, 12, IconStyleType.ACCENT) + ); +}; + +export default AutoCompleteEntityIcon; diff --git a/datahub-web-react/src/app/search/autoComplete/AutoCompleteItem.tsx b/datahub-web-react/src/app/search/autoComplete/AutoCompleteItem.tsx index c97d171b4c931..b8f5a2c7e4081 100644 --- a/datahub-web-react/src/app/search/autoComplete/AutoCompleteItem.tsx +++ b/datahub-web-react/src/app/search/autoComplete/AutoCompleteItem.tsx @@ -18,9 +18,10 @@ export const SuggestionContainer = styled.div` interface Props { query: string; entity: Entity; + siblings?: Array; } -export default function AutoCompleteItem({ query, entity }: Props) { +export default function AutoCompleteItem({ query, entity, siblings }: Props) { const entityRegistry = useEntityRegistry(); const displayTooltip = getShouldDisplayTooltip(entity, entityRegistry); let componentToRender: React.ReactNode = null; @@ -33,7 +34,14 @@ export default function AutoCompleteItem({ query, entity }: Props) { componentToRender = ; break; default: - componentToRender = ; + componentToRender = ( + + ); break; } diff --git a/datahub-web-react/src/app/search/autoComplete/AutoCompletePlatformNames.tsx b/datahub-web-react/src/app/search/autoComplete/AutoCompletePlatformNames.tsx new file mode 100644 index 0000000000000..61fe6bcae71d0 --- /dev/null +++ b/datahub-web-react/src/app/search/autoComplete/AutoCompletePlatformNames.tsx @@ -0,0 +1,22 @@ +import { Typography } from 'antd'; +import React from 'react'; +import styled from 'styled-components'; +import { ANTD_GRAY_V2 } from '../../entity/shared/constants'; + +const PlatformText = styled(Typography.Text)` + font-size: 12px; + line-height: 20px; + font-weight: 500; + color: ${ANTD_GRAY_V2[8]}; + white-space: nowrap; +`; + +type Props = { + platforms: Array; +}; + +const AutoCompletePlatformNames = ({ platforms }: Props) => { + return {platforms.join(' & ')}; +}; + +export default AutoCompletePlatformNames; diff --git a/datahub-web-react/src/app/search/autoComplete/AutoCompleteUser.tsx b/datahub-web-react/src/app/search/autoComplete/AutoCompleteUser.tsx index 1f88b94bb0cc7..53b4d53ef46d4 100644 --- a/datahub-web-react/src/app/search/autoComplete/AutoCompleteUser.tsx +++ 
b/datahub-web-react/src/app/search/autoComplete/AutoCompleteUser.tsx @@ -1,20 +1,10 @@ import { Typography } from 'antd'; import React from 'react'; -import styled from 'styled-components'; import { CorpUser, EntityType } from '../../../types.generated'; -import { ANTD_GRAY } from '../../entity/shared/constants'; import { CustomAvatar } from '../../shared/avatar'; import { useEntityRegistry } from '../../useEntityRegistry'; import { getAutoCompleteEntityText } from './utils'; - -export const SuggestionText = styled.div` - margin-left: 12px; - margin-top: 2px; - margin-bottom: 2px; - color: ${ANTD_GRAY[9]}; - font-size: 16px; - overflow: hidden; -`; +import { SuggestionText } from './styledComponents'; interface Props { query: string; diff --git a/datahub-web-react/src/app/search/autoComplete/ParentContainers.tsx b/datahub-web-react/src/app/search/autoComplete/ParentContainers.tsx index 77ccde06172c9..98a4f5aa214bb 100644 --- a/datahub-web-react/src/app/search/autoComplete/ParentContainers.tsx +++ b/datahub-web-react/src/app/search/autoComplete/ParentContainers.tsx @@ -4,20 +4,21 @@ import React, { Fragment } from 'react'; import styled from 'styled-components/macro'; import { Container, EntityType } from '../../../types.generated'; import { useEntityRegistry } from '../../useEntityRegistry'; -import { ANTD_GRAY } from '../../entity/shared/constants'; +import { ANTD_GRAY_V2 } from '../../entity/shared/constants'; const NUM_VISIBLE_CONTAINERS = 2; const ParentContainersWrapper = styled.div` font-size: 12px; - color: ${ANTD_GRAY[9]}; + color: ${ANTD_GRAY_V2[8]}; display: flex; align-items: center; - margin-bottom: 3px; `; const ParentContainer = styled(Typography.Text)` + color: ${ANTD_GRAY_V2[8]}; margin-left: 4px; + font-weight: 500; `; export const ArrowWrapper = styled.span` diff --git a/datahub-web-react/src/app/search/autoComplete/RecommendedOption.tsx b/datahub-web-react/src/app/search/autoComplete/RecommendedOption.tsx index 79743858b06d9..f4c31b18c99b2 100644 --- a/datahub-web-react/src/app/search/autoComplete/RecommendedOption.tsx +++ b/datahub-web-react/src/app/search/autoComplete/RecommendedOption.tsx @@ -1,7 +1,7 @@ import { SearchOutlined } from '@ant-design/icons'; import React from 'react'; import styled from 'styled-components/macro'; -import { SuggestionText } from './AutoCompleteUser'; +import { SuggestionText } from './styledComponents'; const TextWrapper = styled.span``; diff --git a/datahub-web-react/src/app/search/autoComplete/styledComponents.tsx b/datahub-web-react/src/app/search/autoComplete/styledComponents.tsx new file mode 100644 index 0000000000000..9e4b084ab3889 --- /dev/null +++ b/datahub-web-react/src/app/search/autoComplete/styledComponents.tsx @@ -0,0 +1,11 @@ +import styled from 'styled-components'; +import { ANTD_GRAY } from '../../entity/shared/constants'; + +export const SuggestionText = styled.div` + margin-left: 12px; + margin-top: 2px; + margin-bottom: 2px; + color: ${ANTD_GRAY[9]}; + font-size: 16px; + overflow: hidden; +`; diff --git a/datahub-web-react/src/app/search/context/SearchContext.tsx b/datahub-web-react/src/app/search/context/SearchContext.tsx index ec9a0c895e876..656c57b0b22d0 100644 --- a/datahub-web-react/src/app/search/context/SearchContext.tsx +++ b/datahub-web-react/src/app/search/context/SearchContext.tsx @@ -1,11 +1,13 @@ import React, { useContext } from 'react'; export type SearchContextType = { + query: string | undefined; selectedSortOption: string | undefined; setSelectedSortOption: (sortOption: string) => void; }; export 
const DEFAULT_CONTEXT = { + query: undefined, selectedSortOption: undefined, setSelectedSortOption: (_: string) => null, }; @@ -21,3 +23,7 @@ export function useSearchContext() { export function useSelectedSortOption() { return useSearchContext().selectedSortOption; } + +export function useSearchQuery() { + return useSearchContext().query; +} diff --git a/datahub-web-react/src/app/search/context/SearchContextProvider.tsx b/datahub-web-react/src/app/search/context/SearchContextProvider.tsx index bfb65c1d74d3e..5ad9667ab1fc0 100644 --- a/datahub-web-react/src/app/search/context/SearchContextProvider.tsx +++ b/datahub-web-react/src/app/search/context/SearchContextProvider.tsx @@ -8,6 +8,7 @@ export default function SearchContextProvider({ children }: { children: React.Re const history = useHistory(); const location = useLocation(); const params = useMemo(() => QueryString.parse(location.search, { arrayFormat: 'comma' }), [location.search]); + const query = (params.query ? decodeURIComponent(params.query as string) : undefined) as string | undefined; const selectedSortOption = params.sortOption as string | undefined; function setSelectedSortOption(selectedOption: string) { @@ -15,7 +16,7 @@ export default function SearchContextProvider({ children }: { children: React.Re } return ( - + {children} ); diff --git a/datahub-web-react/src/app/search/context/SearchResultContext.tsx b/datahub-web-react/src/app/search/context/SearchResultContext.tsx new file mode 100644 index 0000000000000..68adead005149 --- /dev/null +++ b/datahub-web-react/src/app/search/context/SearchResultContext.tsx @@ -0,0 +1,72 @@ +import React, { ReactNode, createContext, useContext, useMemo } from 'react'; +import { SearchResult } from '../../../types.generated'; +import { + getMatchedFieldsByUrn, + getMatchedFieldNames, + getMatchedFieldsByNames, + shouldShowInMatchedFieldList, + getMatchedFieldLabel, + getMatchesPrioritized, +} from '../matches/utils'; +import { MatchedFieldName } from '../matches/constants'; + +type SearchResultContextValue = { + searchResult: SearchResult; +} | null; + +const SearchResultContext = createContext(null); + +type Props = { + children: ReactNode; + searchResult: SearchResult; +}; + +export const SearchResultProvider = ({ children, searchResult }: Props) => { + const value = useMemo( + () => ({ + searchResult, + }), + [searchResult], + ); + return {children}; +}; + +const useSearchResultContext = () => { + return useContext(SearchResultContext); +}; + +export const useSearchResult = () => { + return useSearchResultContext()?.searchResult; +}; + +export const useEntityType = () => { + return useSearchResultContext()?.searchResult.entity.type; +}; + +export const useMatchedFields = () => { + return useSearchResult()?.matchedFields ?? []; +}; + +export const useMatchedFieldsForList = (primaryField: MatchedFieldName) => { + const entityType = useEntityType(); + const matchedFields = useMatchedFields(); + const showableFields = matchedFields.filter((field) => shouldShowInMatchedFieldList(entityType, field)); + return entityType ? 
getMatchesPrioritized(entityType, showableFields, primaryField) : []; +}; + +export const useMatchedFieldsByGroup = (fieldName: MatchedFieldName) => { + const entityType = useEntityType(); + const matchedFields = useMatchedFields(); + const matchedFieldNames = getMatchedFieldNames(entityType, fieldName); + return getMatchedFieldsByNames(matchedFields, matchedFieldNames); +}; + +export const useHasMatchedFieldByUrn = (urn: string, fieldName: MatchedFieldName) => { + const matchedFields = useMatchedFieldsByGroup(fieldName); + return getMatchedFieldsByUrn(matchedFields, urn).length > 0; +}; + +export const useMatchedFieldLabel = (fieldName: string) => { + const entityType = useEntityType(); + return getMatchedFieldLabel(entityType, fieldName); +}; diff --git a/datahub-web-react/src/app/search/context/constants.ts b/datahub-web-react/src/app/search/context/constants.ts index 372230db023e9..5f841b8536e19 100644 --- a/datahub-web-react/src/app/search/context/constants.ts +++ b/datahub-web-react/src/app/search/context/constants.ts @@ -1,15 +1,23 @@ import { SortOrder } from '../../../types.generated'; export const RELEVANCE = 'relevance'; -export const NAME_FIELD = 'name'; +export const ENTITY_NAME_FIELD = '_entityName'; export const LAST_OPERATION_TIME_FIELD = 'lastOperationTime'; export const DEFAULT_SORT_OPTION = RELEVANCE; export const SORT_OPTIONS = { [RELEVANCE]: { label: 'Relevance', field: RELEVANCE, sortOrder: SortOrder.Descending }, - [`${NAME_FIELD}_${SortOrder.Ascending}`]: { label: 'A to Z', field: NAME_FIELD, sortOrder: SortOrder.Ascending }, - [`${NAME_FIELD}_${SortOrder.Descending}`]: { label: 'Z to A', field: NAME_FIELD, sortOrder: SortOrder.Descending }, + [`${ENTITY_NAME_FIELD}_${SortOrder.Ascending}`]: { + label: 'A to Z', + field: ENTITY_NAME_FIELD, + sortOrder: SortOrder.Ascending, + }, + [`${ENTITY_NAME_FIELD}_${SortOrder.Descending}`]: { + label: 'Z to A', + field: ENTITY_NAME_FIELD, + sortOrder: SortOrder.Descending, + }, [`${LAST_OPERATION_TIME_FIELD}_${SortOrder.Descending}`]: { label: 'Last Modified in Platform', field: LAST_OPERATION_TIME_FIELD, diff --git a/datahub-web-react/src/app/search/matches/MatchedFieldList.tsx b/datahub-web-react/src/app/search/matches/MatchedFieldList.tsx new file mode 100644 index 0000000000000..0bfe000dea366 --- /dev/null +++ b/datahub-web-react/src/app/search/matches/MatchedFieldList.tsx @@ -0,0 +1,133 @@ +import React from 'react'; + +import { Tooltip, Typography } from 'antd'; +import styled from 'styled-components'; +import { useMatchedFieldLabel, useMatchedFieldsForList } from '../context/SearchResultContext'; +import { MatchedField } from '../../../types.generated'; +import { ANTD_GRAY_V2 } from '../../entity/shared/constants'; +import { useSearchQuery } from '../context/SearchContext'; +import { MatchesGroupedByFieldName } from './constants'; +import { useEntityRegistry } from '../../useEntityRegistry'; +import { getDescriptionSlice, isDescriptionField, isHighlightableEntityField } from './utils'; + +const MatchesContainer = styled.div` + display: flex; + flex-wrap: wrap; + gap: 8px; +`; + +const MatchText = styled(Typography.Text)` + color: ${ANTD_GRAY_V2[8]}; + background: ${(props) => props.theme.styles['highlight-color']}; + border-radius: 4px; + padding: 2px 4px 2px 4px; + padding-right: 4px; +`; + +const MATCH_GROUP_LIMIT = 3; +const TOOLTIP_MATCH_GROUP_LIMIT = 10; + +type CustomFieldRenderer = (field: MatchedField) => JSX.Element | null; + +type Props = { + customFieldRenderer?: CustomFieldRenderer; + matchSuffix?: string; 
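    // customFieldRenderer lets callers override how an individual matched value is displayed
    // (the matchedInputFieldRenderer / matchedFieldPathsRenderer helpers further down follow
    // this signature); matchSuffix is optional trailing text appended after the match list.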
+}; + +const RenderedField = ({ + customFieldRenderer, + field, +}: { + customFieldRenderer?: CustomFieldRenderer; + field: MatchedField; +}) => { + const entityRegistry = useEntityRegistry(); + const query = useSearchQuery()?.trim().toLowerCase(); + const customRenderedField = customFieldRenderer?.(field); + if (customRenderedField) return {customRenderedField}; + if (isHighlightableEntityField(field)) { + return field.entity ? <>{entityRegistry.getDisplayName(field.entity.type, field.entity)} : <>; + } + if (isDescriptionField(field) && query) return {getDescriptionSlice(field.value, query)}; + return {field.value}; +}; + +const MatchedFieldsList = ({ + groupedMatch, + limit, + tooltip, + matchSuffix = '', + customFieldRenderer, +}: { + groupedMatch: MatchesGroupedByFieldName; + limit: number; + tooltip?: JSX.Element; + matchSuffix?: string; + customFieldRenderer?: CustomFieldRenderer; +}) => { + const label = useMatchedFieldLabel(groupedMatch.fieldName); + const count = groupedMatch.matchedFields.length; + const moreCount = Math.max(count - limit, 0); + const andMore = ( + <> + {' '} + & more + + ); + return ( + <> + Matches {count > 1 && `${count} `} + {label} + {count > 1 && 's'}{' '} + {groupedMatch.matchedFields.slice(0, limit).map((field, index) => ( + <> + {index > 0 && ', '} + <> + + + + ))} + {moreCount > 0 && + (tooltip ? ( + + {andMore} + + ) : ( + <>{andMore} + ))}{' '} + {matchSuffix} + + ); +}; + +export const MatchedFieldList = ({ customFieldRenderer, matchSuffix = '' }: Props) => { + const groupedMatches = useMatchedFieldsForList('fieldLabels'); + + return ( + <> + {groupedMatches.length > 0 ? ( + + {groupedMatches.map((groupedMatch) => { + return ( + + + } + /> + + ); + })} + + ) : null} + + ); +}; diff --git a/datahub-web-react/src/app/search/matches/SearchTextHighlighter.tsx b/datahub-web-react/src/app/search/matches/SearchTextHighlighter.tsx new file mode 100644 index 0000000000000..d8da1088ea89d --- /dev/null +++ b/datahub-web-react/src/app/search/matches/SearchTextHighlighter.tsx @@ -0,0 +1,42 @@ +import React from 'react'; +import Highlight from 'react-highlighter'; +import styled from 'styled-components'; +import { useMatchedFieldsByGroup } from '../context/SearchResultContext'; +import { useSearchQuery } from '../context/SearchContext'; +import { MatchedFieldName } from './constants'; +import { useAppConfig } from '../../useAppConfig'; + +type Props = { + field: MatchedFieldName; + text: string; + enableFullHighlight?: boolean; +}; + +const HIGHLIGHT_ALL_PATTERN = /.*/; + +const StyledHighlight = styled(Highlight).attrs((props) => ({ + matchStyle: { background: props.theme.styles['highlight-color'] }, +}))``; + +const SearchTextHighlighter = ({ field, text, enableFullHighlight = false }: Props) => { + const appConfig = useAppConfig(); + const enableNameHighlight = appConfig.config.visualConfig.searchResult?.enableNameHighlight; + const matchedFields = useMatchedFieldsByGroup(field); + const hasMatchedField = !!matchedFields?.length; + const normalizedSearchQuery = useSearchQuery()?.trim().toLowerCase(); + const normalizedText = text.trim().toLowerCase(); + const hasSubstring = hasMatchedField && !!normalizedSearchQuery && normalizedText.includes(normalizedSearchQuery); + const pattern = enableFullHighlight ? HIGHLIGHT_ALL_PATTERN : undefined; + + return ( + <> + {enableNameHighlight && hasMatchedField ? 
( + {text} + ) : ( + text + )} + + ); +}; + +export default SearchTextHighlighter; diff --git a/datahub-web-react/src/app/search/matches/constants.ts b/datahub-web-react/src/app/search/matches/constants.ts new file mode 100644 index 0000000000000..25ca82eef9597 --- /dev/null +++ b/datahub-web-react/src/app/search/matches/constants.ts @@ -0,0 +1,129 @@ +import { EntityType, MatchedField } from '../../../types.generated'; + +export type MatchedFieldName = + | 'urn' + | 'name' + | 'displayName' + | 'title' + | 'description' + | 'editedDescription' + | 'editedFieldDescriptions' + | 'fieldDescriptions' + | 'tags' + | 'fieldTags' + | 'editedFieldTags' + | 'glossaryTerms' + | 'fieldGlossaryTerms' + | 'editedFieldGlossaryTerms' + | 'fieldLabels' + | 'fieldPaths'; + +export type MatchedFieldConfig = { + name: MatchedFieldName; + groupInto?: MatchedFieldName; + label: string; + showInMatchedFieldList?: boolean; +}; + +const DEFAULT_MATCHED_FIELD_CONFIG: Array = [ + { + name: 'urn', + label: 'urn', + }, + { + name: 'title', + label: 'title', + }, + { + name: 'displayName', + groupInto: 'name', + label: 'display name', + }, + { + name: 'name', + groupInto: 'name', + label: 'name', + }, + { + name: 'editedDescription', + groupInto: 'description', + label: 'description', + }, + { + name: 'description', + groupInto: 'description', + label: 'description', + }, + { + name: 'editedFieldDescriptions', + groupInto: 'fieldDescriptions', + label: 'column description', + showInMatchedFieldList: true, + }, + { + name: 'fieldDescriptions', + groupInto: 'fieldDescriptions', + label: 'column description', + showInMatchedFieldList: true, + }, + { + name: 'tags', + label: 'tag', + }, + { + name: 'editedFieldTags', + groupInto: 'fieldTags', + label: 'column tag', + showInMatchedFieldList: true, + }, + { + name: 'fieldTags', + groupInto: 'fieldTags', + label: 'column tag', + showInMatchedFieldList: true, + }, + { + name: 'glossaryTerms', + label: 'term', + }, + { + name: 'editedFieldGlossaryTerms', + groupInto: 'fieldGlossaryTerms', + label: 'column term', + showInMatchedFieldList: true, + }, + { + name: 'fieldGlossaryTerms', + groupInto: 'fieldGlossaryTerms', + label: 'column term', + showInMatchedFieldList: true, + }, + { + name: 'fieldLabels', + label: 'label', + showInMatchedFieldList: true, + }, + { + name: 'fieldPaths', + label: 'column', + showInMatchedFieldList: true, + }, +]; + +export const CHART_DASHBOARD_FIELD_CONFIG: Array = DEFAULT_MATCHED_FIELD_CONFIG.map((config) => { + if (config.name === 'title') return { ...config, groupInto: 'name' }; + return config; +}); + +export const MATCHED_FIELD_CONFIG = { + [EntityType.Chart]: CHART_DASHBOARD_FIELD_CONFIG, + [EntityType.Dashboard]: CHART_DASHBOARD_FIELD_CONFIG, + DEFAULT: DEFAULT_MATCHED_FIELD_CONFIG, +} as const; + +export type MatchesGroupedByFieldName = { + fieldName: string; + matchedFields: Array; +}; + +export const HIGHLIGHTABLE_ENTITY_TYPES = [EntityType.Tag, EntityType.GlossaryTerm]; diff --git a/datahub-web-react/src/app/search/matches/matchedFieldPathsRenderer.tsx b/datahub-web-react/src/app/search/matches/matchedFieldPathsRenderer.tsx new file mode 100644 index 0000000000000..0a33530552864 --- /dev/null +++ b/datahub-web-react/src/app/search/matches/matchedFieldPathsRenderer.tsx @@ -0,0 +1,8 @@ +import React from 'react'; + +import { MatchedField } from '../../../types.generated'; +import { downgradeV2FieldPath } from '../../entity/dataset/profile/schema/utils/utils'; + +export const matchedFieldPathsRenderer = (matchedField: MatchedField) => { + 
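    // Custom field renderer for dataset column matches: when the matched field is 'fieldPaths',
    // the value is passed through downgradeV2FieldPath so v2-style field paths display in their
    // plain dotted form; any other field falls through to null (default rendering).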
return matchedField?.name === 'fieldPaths' ? {downgradeV2FieldPath(matchedField.value)} : null; +}; diff --git a/datahub-web-react/src/app/search/matches/matchedInputFieldRenderer.tsx b/datahub-web-react/src/app/search/matches/matchedInputFieldRenderer.tsx new file mode 100644 index 0000000000000..25634c9e8b80e --- /dev/null +++ b/datahub-web-react/src/app/search/matches/matchedInputFieldRenderer.tsx @@ -0,0 +1,40 @@ +import React from 'react'; + +import { Chart, Dashboard, EntityType, GlossaryTerm, MatchedField } from '../../../types.generated'; +import { useEntityRegistry } from '../../useEntityRegistry'; + +const LABEL_INDEX_NAME = 'fieldLabels'; +const TYPE_PROPERTY_KEY_NAME = 'type'; + +const TermName = ({ term }: { term: GlossaryTerm }) => { + const entityRegistry = useEntityRegistry(); + return <>{entityRegistry.getDisplayName(EntityType.GlossaryTerm, term)}; +}; + +export const matchedInputFieldRenderer = (matchedField: MatchedField, entity: Chart | Dashboard) => { + if (matchedField?.name === LABEL_INDEX_NAME) { + const matchedSchemaField = entity.inputFields?.fields?.find( + (field) => field?.schemaField?.label === matchedField.value, + ); + const matchedGlossaryTerm = matchedSchemaField?.schemaField?.glossaryTerms?.terms?.find( + (term) => term?.term?.name === matchedField.value, + ); + + if (matchedGlossaryTerm) { + let termType = 'term'; + const typeProperty = matchedGlossaryTerm.term.properties?.customProperties?.find( + (property) => property.key === TYPE_PROPERTY_KEY_NAME, + ); + if (typeProperty) { + termType = typeProperty.value || termType; + } + + return ( + <> + {termType} + + ); + } + } + return null; +}; diff --git a/datahub-web-react/src/app/search/matches/utils.test.ts b/datahub-web-react/src/app/search/matches/utils.test.ts new file mode 100644 index 0000000000000..8b5ed27f5c2ad --- /dev/null +++ b/datahub-web-react/src/app/search/matches/utils.test.ts @@ -0,0 +1,110 @@ +import { EntityType } from '../../../types.generated'; +import { getMatchesPrioritized } from './utils'; + +const mapping = new Map(); +mapping.set('fieldPaths', 'column'); +mapping.set('fieldDescriptions', 'column description'); +mapping.set('fieldTags', 'column tag'); + +const MOCK_MATCHED_FIELDS = [ + { + name: 'fieldPaths', + value: 'rain', + }, + { + name: 'fieldDescriptions', + value: 'rainbow', + }, + { + name: 'fieldPaths', + value: 'rainbow', + }, + { + name: 'fieldPaths', + value: 'rainbows', + }, +]; + +const MOCK_MATCHED_DESCRIPTION_FIELDS = [ + { + name: 'editedDescription', + value: 'edited description value', + }, + { + name: 'description', + value: 'description value', + }, + { + name: 'fieldDescriptions', + value: 'field descriptions value', + }, + { + name: 'editedFieldDescriptions', + value: 'edited field descriptions value', + }, +]; + +describe('utils', () => { + describe('getMatchPrioritizingPrimary', () => { + it('prioritizes exact match', () => { + global.window.location.search = 'query=rainbow'; + const groupedMatches = getMatchesPrioritized(EntityType.Dataset, MOCK_MATCHED_FIELDS, 'fieldPaths'); + expect(groupedMatches).toEqual([ + { + fieldName: 'fieldPaths', + matchedFields: [ + { name: 'fieldPaths', value: 'rainbow' }, + { name: 'fieldPaths', value: 'rainbows' }, + { name: 'fieldPaths', value: 'rain' }, + ], + }, + { + fieldName: 'fieldDescriptions', + matchedFields: [{ name: 'fieldDescriptions', value: 'rainbow' }], + }, + ]); + }); + it('will accept first contains match', () => { + global.window.location.search = 'query=bow'; + const groupedMatches = 
getMatchesPrioritized(EntityType.Dataset, MOCK_MATCHED_FIELDS, 'fieldPaths'); + expect(groupedMatches).toEqual([ + { + fieldName: 'fieldPaths', + matchedFields: [ + { name: 'fieldPaths', value: 'rainbow' }, + { name: 'fieldPaths', value: 'rainbows' }, + { name: 'fieldPaths', value: 'rain' }, + ], + }, + { + fieldName: 'fieldDescriptions', + matchedFields: [{ name: 'fieldDescriptions', value: 'rainbow' }], + }, + ]); + }); + it('will group by field name', () => { + global.window.location.search = ''; + const groupedMatches = getMatchesPrioritized( + EntityType.Dataset, + MOCK_MATCHED_DESCRIPTION_FIELDS, + 'fieldPaths', + ); + expect(groupedMatches).toEqual([ + { + fieldName: 'description', + matchedFields: [ + { name: 'editedDescription', value: 'edited description value' }, + { name: 'description', value: 'description value' }, + ], + }, + { + fieldName: 'fieldDescriptions', + matchedFields: [ + { name: 'fieldDescriptions', value: 'field descriptions value' }, + { name: 'editedFieldDescriptions', value: 'edited field descriptions value' }, + ], + }, + ]); + }); + }); +}); diff --git a/datahub-web-react/src/app/search/matches/utils.ts b/datahub-web-react/src/app/search/matches/utils.ts new file mode 100644 index 0000000000000..78c62f7eef458 --- /dev/null +++ b/datahub-web-react/src/app/search/matches/utils.ts @@ -0,0 +1,136 @@ +import * as QueryString from 'query-string'; +import { EntityType, MatchedField } from '../../../types.generated'; +import { + HIGHLIGHTABLE_ENTITY_TYPES, + MATCHED_FIELD_CONFIG, + MatchedFieldConfig, + MatchedFieldName, + MatchesGroupedByFieldName, +} from './constants'; + +const getFieldConfigsByEntityType = (entityType: EntityType | undefined): Array => { + return entityType && entityType in MATCHED_FIELD_CONFIG + ? MATCHED_FIELD_CONFIG[entityType] + : MATCHED_FIELD_CONFIG.DEFAULT; +}; + +export const shouldShowInMatchedFieldList = (entityType: EntityType | undefined, field: MatchedField): boolean => { + const configs = getFieldConfigsByEntityType(entityType); + return configs.some((config) => config.name === field.name && config.showInMatchedFieldList); +}; + +export const getMatchedFieldLabel = (entityType: EntityType | undefined, fieldName: string): string => { + const configs = getFieldConfigsByEntityType(entityType); + return configs.find((config) => config.name === fieldName)?.label ?? 
''; +}; + +export const getGroupedFieldName = ( + entityType: EntityType | undefined, + fieldName: string, +): MatchedFieldName | undefined => { + const configs = getFieldConfigsByEntityType(entityType); + const fieldConfig = configs.find((config) => config.name === fieldName); + return fieldConfig?.groupInto; +}; + +export const getMatchedFieldNames = ( + entityType: EntityType | undefined, + fieldName: MatchedFieldName, +): Array => { + return getFieldConfigsByEntityType(entityType) + .filter((config) => fieldName === config.groupInto || fieldName === config.name) + .map((field) => field.name); +}; + +export const getMatchedFieldsByNames = (fields: Array, names: Array): Array => { + return fields.filter((field) => names.includes(field.name)); +}; + +export const getMatchedFieldsByUrn = (fields: Array, urn: string): Array => { + return fields.filter((field) => field.value === urn); +}; + +function normalize(value: string) { + return value.trim().toLowerCase(); +} + +function fromQueryGetBestMatch( + selectedMatchedFields: MatchedField[], + rawQuery: string, + prioritizedField: string, +): Array { + const query = normalize(rawQuery); + const priorityMatches: Array = selectedMatchedFields.filter( + (field) => field.name === prioritizedField, + ); + const nonPriorityMatches: Array = selectedMatchedFields.filter( + (field) => field.name !== prioritizedField, + ); + const exactMatches: Array = []; + const containedMatches: Array = []; + const rest: Array = []; + + [...priorityMatches, ...nonPriorityMatches].forEach((field) => { + const normalizedValue = normalize(field.value); + if (normalizedValue === query) exactMatches.push(field); + else if (normalizedValue.includes(query)) containedMatches.push(field); + else rest.push(field); + }); + + return [...exactMatches, ...containedMatches, ...rest]; +} + +const getMatchesGroupedByFieldName = ( + entityType: EntityType, + matchedFields: Array, +): Array => { + const fieldNameToMatches = new Map>(); + const fieldNames: Array = []; + matchedFields.forEach((field) => { + const groupedFieldName = getGroupedFieldName(entityType, field.name) || field.name; + const matchesInMap = fieldNameToMatches.get(groupedFieldName); + if (matchesInMap) { + matchesInMap.push(field); + } else { + fieldNameToMatches.set(groupedFieldName, [field]); + fieldNames.push(groupedFieldName); + } + }); + return fieldNames.map((fieldName) => ({ + fieldName, + matchedFields: fieldNameToMatches.get(fieldName) ?? [], + })); +}; + +export const getMatchesPrioritized = ( + entityType: EntityType, + matchedFields: MatchedField[], + prioritizedField: string, +): Array => { + const { location } = window; + const params = QueryString.parse(location.search, { arrayFormat: 'comma' }); + const query: string = decodeURIComponent(params.query ? 
(params.query as string) : ''); + const matches = fromQueryGetBestMatch(matchedFields, query, prioritizedField); + return getMatchesGroupedByFieldName(entityType, matches); +}; + +export const isHighlightableEntityField = (field: MatchedField) => + !!field.entity && HIGHLIGHTABLE_ENTITY_TYPES.includes(field.entity.type); + +export const isDescriptionField = (field: MatchedField) => field.name.toLowerCase().includes('description'); + +const SURROUNDING_DESCRIPTION_CHARS = 10; +const MAX_DESCRIPTION_CHARS = 50; + +export const getDescriptionSlice = (text: string, target: string) => { + const queryIndex = text.indexOf(target); + const start = Math.max(0, queryIndex - SURROUNDING_DESCRIPTION_CHARS); + const end = Math.min( + start + MAX_DESCRIPTION_CHARS, + text.length, + queryIndex + target.length + SURROUNDING_DESCRIPTION_CHARS, + ); + const startEllipsis = start > 0 ? '...' : ''; + const endEllipsis = end < text.length ? '...' : ''; + return `${startEllipsis}${text.slice(start, end)}${endEllipsis}`; +}; diff --git a/datahub-web-react/src/app/search/suggestions/SearchQuerySugggester.tsx b/datahub-web-react/src/app/search/suggestions/SearchQuerySugggester.tsx new file mode 100644 index 0000000000000..9dbd67883bf64 --- /dev/null +++ b/datahub-web-react/src/app/search/suggestions/SearchQuerySugggester.tsx @@ -0,0 +1,39 @@ +import styled from 'styled-components'; +import React from 'react'; +import { useHistory } from 'react-router'; +import { SearchSuggestion } from '../../../types.generated'; +import { navigateToSearchUrl } from '../utils/navigateToSearchUrl'; +import { ANTD_GRAY_V2 } from '../../entity/shared/constants'; + +const TextWrapper = styled.div` + font-size: 14px; + color: ${ANTD_GRAY_V2[8]}; + margin: 16px 0 -8px 32px; +`; + +export const SuggestedText = styled.span` + color: ${(props) => props.theme.styles['primary-color']}; + text-decoration: underline ${(props) => props.theme.styles['primary-color']}; + cursor: pointer; +`; + +interface Props { + suggestions: SearchSuggestion[]; +} + +export default function SearchQuerySuggester({ suggestions }: Props) { + const history = useHistory(); + + if (suggestions.length === 0) return null; + const suggestText = suggestions[0].text; + + function searchForSuggestion() { + navigateToSearchUrl({ query: suggestText, history }); + } + + return ( + + Did you mean {suggestText} + + ); +} diff --git a/datahub-web-react/src/app/search/utils/combineSiblingsInAutoComplete.ts b/datahub-web-react/src/app/search/utils/combineSiblingsInAutoComplete.ts new file mode 100644 index 0000000000000..e8e64559e67a0 --- /dev/null +++ b/datahub-web-react/src/app/search/utils/combineSiblingsInAutoComplete.ts @@ -0,0 +1,31 @@ +import { AutoCompleteResultForEntity, EntityType } from '../../../types.generated'; +import { CombinedEntity, createSiblingEntityCombiner } from '../../entity/shared/siblingUtils'; + +export type CombinedSuggestion = { + type: EntityType; + combinedEntities: Array; + suggestions?: AutoCompleteResultForEntity['suggestions']; +}; + +export function combineSiblingsInAutoComplete( + autoCompleteResultForEntity: AutoCompleteResultForEntity, + { combineSiblings = false } = {}, +): CombinedSuggestion { + const combine = createSiblingEntityCombiner(); + const combinedEntities: Array = []; + + autoCompleteResultForEntity.entities.forEach((entity) => { + if (!combineSiblings) { + combinedEntities.push({ entity }); + return; + } + const combinedResult = combine(entity); + if (!combinedResult.skipped) combinedEntities.push(combinedResult.combinedEntity); 
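        // With combineSiblings off, every entity is pushed through untouched (early return above).
        // Otherwise, entities whose sibling group was already emitted come back flagged as skipped
        // and are dropped here, collapsing each sibling set into a single suggestion (the
        // combineSiblingsInSearchResults tests further down exercise the same combiner).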
+ }); + + return { + type: autoCompleteResultForEntity.type, + suggestions: autoCompleteResultForEntity.suggestions, + combinedEntities, + }; +} diff --git a/datahub-web-react/src/app/search/utils/combineSiblingsInSearchResults.test.ts b/datahub-web-react/src/app/search/utils/combineSiblingsInSearchResults.test.ts new file mode 100644 index 0000000000000..4cf61c715b0e9 --- /dev/null +++ b/datahub-web-react/src/app/search/utils/combineSiblingsInSearchResults.test.ts @@ -0,0 +1,521 @@ +import { combineSiblingsInSearchResults } from './combineSiblingsInSearchResults'; + +const searchResultWithSiblings = [ + { + entity: { + urn: 'urn:li:dataset:(urn:li:dataPlatform:bigquery,cypress_project.jaffle_shop.raw_orders,PROD)', + exists: true, + type: 'DATASET', + name: 'cypress_project.jaffle_shop.raw_orders', + origin: 'PROD', + uri: null, + platform: { + urn: 'urn:li:dataPlatform:bigquery', + type: 'DATA_PLATFORM', + name: 'bigquery', + properties: { + type: 'RELATIONAL_DB', + displayName: 'BigQuery', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/bigquerylogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + dataPlatformInstance: null, + editableProperties: null, + platformNativeType: null, + properties: { + name: 'raw_orders', + description: null, + qualifiedName: null, + customProperties: [], + __typename: 'DatasetProperties', + }, + ownership: null, + globalTags: null, + glossaryTerms: null, + subTypes: { + typeNames: ['table'], + __typename: 'SubTypes', + }, + domain: null, + container: { + urn: 'urn:li:container:348c96555971d3f5c1ffd7dd2e7446cb', + platform: { + urn: 'urn:li:dataPlatform:bigquery', + type: 'DATA_PLATFORM', + name: 'bigquery', + properties: { + type: 'RELATIONAL_DB', + displayName: 'BigQuery', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/bigquerylogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + properties: { + name: 'jaffle_shop', + __typename: 'ContainerProperties', + }, + subTypes: { + typeNames: ['Dataset'], + __typename: 'SubTypes', + }, + deprecation: null, + __typename: 'Container', + }, + parentContainers: { + count: 2, + containers: [ + { + urn: 'urn:li:container:348c96555971d3f5c1ffd7dd2e7446cb', + platform: { + urn: 'urn:li:dataPlatform:bigquery', + type: 'DATA_PLATFORM', + name: 'bigquery', + properties: { + type: 'RELATIONAL_DB', + displayName: 'BigQuery', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/bigquerylogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + properties: { + name: 'jaffle_shop', + __typename: 'ContainerProperties', + }, + subTypes: { + typeNames: ['Dataset'], + __typename: 'SubTypes', + }, + deprecation: null, + __typename: 'Container', + }, + { + urn: 'urn:li:container:b5e95fce839e7d78151ed7e0a7420d84', + platform: { + urn: 'urn:li:dataPlatform:bigquery', + type: 'DATA_PLATFORM', + name: 'bigquery', + properties: { + type: 'RELATIONAL_DB', + displayName: 'BigQuery', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/bigquerylogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + properties: { + name: 'cypress_project', + __typename: 'ContainerProperties', + }, + subTypes: { + typeNames: ['Project'], + __typename: 'SubTypes', + }, + deprecation: null, + __typename: 'Container', + }, + ], + __typename: 
'ParentContainersResult', + }, + deprecation: null, + siblings: { + isPrimary: false, + siblings: [ + { + urn: 'urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_orders,PROD)', + exists: true, + type: 'DATASET', + platform: { + urn: 'urn:li:dataPlatform:dbt', + type: 'DATA_PLATFORM', + name: 'dbt', + properties: { + type: 'OTHERS', + displayName: 'dbt', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/dbtlogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + name: 'cypress_project.jaffle_shop.raw_orders', + properties: { + name: 'raw_orders', + description: '', + qualifiedName: null, + __typename: 'DatasetProperties', + }, + __typename: 'Dataset', + }, + ], + __typename: 'SiblingProperties', + }, + __typename: 'Dataset', + }, + matchedFields: [ + { + name: 'name', + value: 'raw_orders', + __typename: 'MatchedField', + }, + { + name: 'id', + value: 'cypress_project.jaffle_shop.raw_orders', + __typename: 'MatchedField', + }, + ], + insights: [], + __typename: 'SearchResult', + }, + { + entity: { + urn: 'urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_orders,PROD)', + exists: true, + type: 'DATASET', + name: 'cypress_project.jaffle_shop.raw_orders', + origin: 'PROD', + uri: null, + platform: { + urn: 'urn:li:dataPlatform:dbt', + type: 'DATA_PLATFORM', + name: 'dbt', + properties: { + type: 'OTHERS', + displayName: 'dbt', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/dbtlogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + dataPlatformInstance: null, + editableProperties: null, + platformNativeType: null, + properties: { + name: 'raw_orders', + description: '', + qualifiedName: null, + customProperties: [ + { + key: 'catalog_version', + value: '1.0.4', + __typename: 'StringMapEntry', + }, + { + key: 'node_type', + value: 'seed', + __typename: 'StringMapEntry', + }, + { + key: 'materialization', + value: 'seed', + __typename: 'StringMapEntry', + }, + { + key: 'dbt_file_path', + value: 'data/raw_orders.csv', + __typename: 'StringMapEntry', + }, + { + key: 'catalog_schema', + value: 'https://schemas.getdbt.com/dbt/catalog/v1.json', + __typename: 'StringMapEntry', + }, + { + key: 'catalog_type', + value: 'table', + __typename: 'StringMapEntry', + }, + { + key: 'manifest_version', + value: '1.0.4', + __typename: 'StringMapEntry', + }, + { + key: 'manifest_schema', + value: 'https://schemas.getdbt.com/dbt/manifest/v4.json', + __typename: 'StringMapEntry', + }, + ], + __typename: 'DatasetProperties', + }, + ownership: null, + globalTags: null, + glossaryTerms: null, + subTypes: { + typeNames: ['seed'], + __typename: 'SubTypes', + }, + domain: null, + container: null, + parentContainers: { + count: 0, + containers: [], + __typename: 'ParentContainersResult', + }, + deprecation: null, + siblings: { + isPrimary: true, + siblings: [ + { + urn: 'urn:li:dataset:(urn:li:dataPlatform:bigquery,cypress_project.jaffle_shop.raw_orders,PROD)', + type: 'DATASET', + platform: { + urn: 'urn:li:dataPlatform:bigquery', + type: 'DATA_PLATFORM', + name: 'bigquery', + properties: { + type: 'RELATIONAL_DB', + displayName: 'BigQuery', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/bigquerylogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + name: 'cypress_project.jaffle_shop.raw_orders', + properties: { + name: 'raw_orders', 
+ description: null, + qualifiedName: null, + __typename: 'DatasetProperties', + }, + __typename: 'Dataset', + }, + ], + __typename: 'SiblingProperties', + }, + __typename: 'Dataset', + }, + matchedFields: [ + { + name: 'name', + value: 'raw_orders', + __typename: 'MatchedField', + }, + { + name: 'id', + value: 'cypress_project.jaffle_shop.raw_orders', + __typename: 'MatchedField', + }, + ], + insights: [], + __typename: 'SearchResult', + }, +]; + +const searchResultWithGhostSiblings = [ + { + entity: { + urn: 'urn:li:dataset:(urn:li:dataPlatform:bigquery,cypress_project.jaffle_shop.raw_orders,PROD)', + exists: true, + type: 'DATASET', + name: 'cypress_project.jaffle_shop.raw_orders', + origin: 'PROD', + uri: null, + platform: { + urn: 'urn:li:dataPlatform:bigquery', + type: 'DATA_PLATFORM', + name: 'bigquery', + properties: { + type: 'RELATIONAL_DB', + displayName: 'BigQuery', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/bigquerylogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + dataPlatformInstance: null, + editableProperties: null, + platformNativeType: null, + properties: { + name: 'raw_orders', + description: null, + qualifiedName: null, + customProperties: [], + __typename: 'DatasetProperties', + }, + ownership: null, + globalTags: null, + glossaryTerms: null, + subTypes: { + typeNames: ['table'], + __typename: 'SubTypes', + }, + domain: null, + container: { + urn: 'urn:li:container:348c96555971d3f5c1ffd7dd2e7446cb', + platform: { + urn: 'urn:li:dataPlatform:bigquery', + type: 'DATA_PLATFORM', + name: 'bigquery', + properties: { + type: 'RELATIONAL_DB', + displayName: 'BigQuery', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/bigquerylogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + properties: { + name: 'jaffle_shop', + __typename: 'ContainerProperties', + }, + subTypes: { + typeNames: ['Dataset'], + __typename: 'SubTypes', + }, + deprecation: null, + __typename: 'Container', + }, + parentContainers: { + count: 2, + containers: [ + { + urn: 'urn:li:container:348c96555971d3f5c1ffd7dd2e7446cb', + platform: { + urn: 'urn:li:dataPlatform:bigquery', + type: 'DATA_PLATFORM', + name: 'bigquery', + properties: { + type: 'RELATIONAL_DB', + displayName: 'BigQuery', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/bigquerylogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + properties: { + name: 'jaffle_shop', + __typename: 'ContainerProperties', + }, + subTypes: { + typeNames: ['Dataset'], + __typename: 'SubTypes', + }, + deprecation: null, + __typename: 'Container', + }, + { + urn: 'urn:li:container:b5e95fce839e7d78151ed7e0a7420d84', + platform: { + urn: 'urn:li:dataPlatform:bigquery', + type: 'DATA_PLATFORM', + name: 'bigquery', + properties: { + type: 'RELATIONAL_DB', + displayName: 'BigQuery', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/bigquerylogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + properties: { + name: 'cypress_project', + __typename: 'ContainerProperties', + }, + subTypes: { + typeNames: ['Project'], + __typename: 'SubTypes', + }, + deprecation: null, + __typename: 'Container', + }, + ], + __typename: 'ParentContainersResult', + }, + deprecation: null, + siblings: { + isPrimary: false, + siblings: [ + { + urn: 
'urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_orders,PROD)', + exists: false, + type: 'DATASET', + }, + ], + __typename: 'SiblingProperties', + }, + __typename: 'Dataset', + }, + matchedFields: [ + { + name: 'name', + value: 'raw_orders', + __typename: 'MatchedField', + }, + { + name: 'id', + value: 'cypress_project.jaffle_shop.raw_orders', + __typename: 'MatchedField', + }, + ], + insights: [], + __typename: 'SearchResult', + }, +]; + +describe('siblingUtils', () => { + describe('combineSiblingsInSearchResults', () => { + it('combines search results to deduplicate siblings', () => { + const result = combineSiblingsInSearchResults(searchResultWithSiblings as any); + + expect(result).toHaveLength(1); + expect(result?.[0]?.matchedEntities?.[0]?.urn).toEqual( + 'urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_orders,PROD)', + ); + expect(result?.[0]?.matchedEntities?.[1]?.urn).toEqual( + 'urn:li:dataset:(urn:li:dataPlatform:bigquery,cypress_project.jaffle_shop.raw_orders,PROD)', + ); + + expect(result?.[0]?.matchedEntities).toHaveLength(2); + + expect(result?.[0]?.matchedFields).toHaveLength(2); + }); + + it('will not combine an entity with a ghost node', () => { + const result = combineSiblingsInSearchResults(searchResultWithGhostSiblings as any); + + expect(result).toHaveLength(1); + expect(result?.[0]?.matchedEntities?.[0]?.urn).toEqual( + 'urn:li:dataset:(urn:li:dataPlatform:bigquery,cypress_project.jaffle_shop.raw_orders,PROD)', + ); + expect(result?.[0]?.matchedEntities).toHaveLength(1); + + expect(result?.[0]?.matchedFields).toHaveLength(2); + }); + }); +}); diff --git a/datahub-web-react/src/app/search/utils/combineSiblingsInSearchResults.ts b/datahub-web-react/src/app/search/utils/combineSiblingsInSearchResults.ts new file mode 100644 index 0000000000000..4a5c8da6381b8 --- /dev/null +++ b/datahub-web-react/src/app/search/utils/combineSiblingsInSearchResults.ts @@ -0,0 +1,28 @@ +import { Entity, MatchedField } from '../../../types.generated'; +import { CombinedEntity, createSiblingEntityCombiner } from '../../entity/shared/siblingUtils'; + +type UncombinedSeaerchResults = { + entity: Entity; + matchedFields: Array; +}; + +export type CombinedSearchResult = CombinedEntity & Pick; + +export function combineSiblingsInSearchResults( + searchResults: Array | undefined = [], +): Array { + const combine = createSiblingEntityCombiner(); + const combinedSearchResults: Array = []; + + searchResults.forEach((searchResult) => { + const combinedResult = combine(searchResult.entity); + if (!combinedResult.skipped) { + combinedSearchResults.push({ + ...searchResult, + ...combinedResult.combinedEntity, + }); + } + }); + + return combinedSearchResults; +} diff --git a/datahub-web-react/src/app/search/utils/constants.ts b/datahub-web-react/src/app/search/utils/constants.ts index eecd18441e7a5..af45129022cc1 100644 --- a/datahub-web-react/src/app/search/utils/constants.ts +++ b/datahub-web-react/src/app/search/utils/constants.ts @@ -10,7 +10,6 @@ export const TAGS_FILTER_NAME = 'tags'; export const GLOSSARY_TERMS_FILTER_NAME = 'glossaryTerms'; export const CONTAINER_FILTER_NAME = 'container'; export const DOMAINS_FILTER_NAME = 'domains'; -export const DATA_PRODUCTS_FILTER_NAME = 'dataProducts'; export const OWNERS_FILTER_NAME = 'owners'; export const TYPE_NAMES_FILTER_NAME = 'typeNames'; export const PLATFORM_FILTER_NAME = 'platform'; @@ -57,7 +56,6 @@ export const ORDERED_FIELDS = [ TAGS_FILTER_NAME, GLOSSARY_TERMS_FILTER_NAME, DOMAINS_FILTER_NAME, - 
DATA_PRODUCTS_FILTER_NAME, FIELD_TAGS_FILTER_NAME, FIELD_GLOSSARY_TERMS_FILTER_NAME, FIELD_PATHS_FILTER_NAME, @@ -74,7 +72,6 @@ export const FIELD_TO_LABEL = { owners: 'Owner', tags: 'Tag', domains: 'Domain', - [DATA_PRODUCTS_FILTER_NAME]: 'Data Product', platform: 'Platform', fieldTags: 'Column Tag', glossaryTerms: 'Glossary Term', diff --git a/datahub-web-react/src/app/settings/SettingsPage.tsx b/datahub-web-react/src/app/settings/SettingsPage.tsx index bfec9b395cff2..339cc0cf44bac 100644 --- a/datahub-web-react/src/app/settings/SettingsPage.tsx +++ b/datahub-web-react/src/app/settings/SettingsPage.tsx @@ -7,6 +7,7 @@ import { ToolOutlined, FilterOutlined, TeamOutlined, + PushpinOutlined, } from '@ant-design/icons'; import { Redirect, Route, useHistory, useLocation, useRouteMatch, Switch } from 'react-router'; import styled from 'styled-components'; @@ -19,6 +20,7 @@ import { Preferences } from './Preferences'; import { ManageViews } from '../entity/view/ManageViews'; import { useUserContext } from '../context/useUserContext'; import { ManageOwnership } from '../entity/ownership/ManageOwnership'; +import ManagePosts from './posts/ManagePosts'; const PageContainer = styled.div` display: flex; @@ -62,6 +64,7 @@ const PATHS = [ { path: 'preferences', content: }, { path: 'views', content: }, { path: 'ownership', content: }, + { path: 'posts', content: }, ]; /** @@ -91,6 +94,7 @@ export const SettingsPage = () => { const showUsersGroups = (isIdentityManagementEnabled && me && me?.platformPrivileges?.manageIdentities) || false; const showViews = isViewsEnabled || false; const showOwnershipTypes = me && me?.platformPrivileges?.manageOwnershipTypes; + const showHomePagePosts = me && me?.platformPrivileges?.manageGlobalAnnouncements; return ( @@ -143,6 +147,11 @@ export const SettingsPage = () => { Ownership Types )} + {showHomePagePosts && ( + + Home Page Posts + + )} diff --git a/datahub-web-react/src/app/settings/posts/CreatePostForm.tsx b/datahub-web-react/src/app/settings/posts/CreatePostForm.tsx new file mode 100644 index 0000000000000..a8d6cfa64c9c1 --- /dev/null +++ b/datahub-web-react/src/app/settings/posts/CreatePostForm.tsx @@ -0,0 +1,91 @@ +import React, { useState } from 'react'; +import { Form, Input, Typography, FormInstance, Radio } from 'antd'; +import styled from 'styled-components'; +import { + DESCRIPTION_FIELD_NAME, + LINK_FIELD_NAME, + LOCATION_FIELD_NAME, + TITLE_FIELD_NAME, + TYPE_FIELD_NAME, +} from './constants'; +import { PostContentType } from '../../../types.generated'; + +const TopFormItem = styled(Form.Item)` + margin-bottom: 24px; +`; + +const SubFormItem = styled(Form.Item)` + margin-bottom: 0; +`; + +type Props = { + setCreateButtonEnabled: (isEnabled: boolean) => void; + form: FormInstance; +}; + +export default function CreatePostForm({ setCreateButtonEnabled, form }: Props) { + const [postType, setPostType] = useState(PostContentType.Text); + + return ( +
{ + setCreateButtonEnabled(!form.getFieldsError().some((field) => field.errors.length > 0)); + }} + > + Post Type}> + setPostType(e.target.value)} + value={postType} + defaultValue={postType} + optionType="button" + buttonStyle="solid" + > + Announcement + Link + + + + Title}> + The title for your new post. + + + + + {postType === PostContentType.Text && ( + Description}> + The main content for your new post. + + + + + )} + {postType === PostContentType.Link && ( + <> + Link URL}> + + Where users will be directed when they click this post. + + + + + + Image URL}> + + A URL to an image you want to display on your link post. + + + + + + + )} +
+ ); +} diff --git a/datahub-web-react/src/app/settings/posts/CreatePostModal.tsx b/datahub-web-react/src/app/settings/posts/CreatePostModal.tsx new file mode 100644 index 0000000000000..b4851ecb02969 --- /dev/null +++ b/datahub-web-react/src/app/settings/posts/CreatePostModal.tsx @@ -0,0 +1,107 @@ +import React, { useState } from 'react'; +import { Button, Form, message, Modal } from 'antd'; +import CreatePostForm from './CreatePostForm'; +import { + CREATE_POST_BUTTON_ID, + DESCRIPTION_FIELD_NAME, + LINK_FIELD_NAME, + LOCATION_FIELD_NAME, + TYPE_FIELD_NAME, + TITLE_FIELD_NAME, +} from './constants'; +import { useEnterKeyListener } from '../../shared/useEnterKeyListener'; +import { MediaType, PostContentType, PostType } from '../../../types.generated'; +import { useCreatePostMutation } from '../../../graphql/mutations.generated'; + +type Props = { + onClose: () => void; + onCreate: ( + contentType: string, + title: string, + description: string | undefined, + link: string | undefined, + location: string | undefined, + ) => void; +}; + +export default function CreatePostModal({ onClose, onCreate }: Props) { + const [createPostMutation] = useCreatePostMutation(); + const [createButtonEnabled, setCreateButtonEnabled] = useState(false); + const [form] = Form.useForm(); + const onCreatePost = () => { + const contentTypeValue = form.getFieldValue(TYPE_FIELD_NAME) ?? PostContentType.Text; + const mediaValue = + form.getFieldValue(TYPE_FIELD_NAME) && form.getFieldValue(LOCATION_FIELD_NAME) + ? { + type: MediaType.Image, + location: form.getFieldValue(LOCATION_FIELD_NAME) ?? null, + } + : null; + createPostMutation({ + variables: { + input: { + postType: PostType.HomePageAnnouncement, + content: { + contentType: contentTypeValue, + title: form.getFieldValue(TITLE_FIELD_NAME), + description: form.getFieldValue(DESCRIPTION_FIELD_NAME) ?? null, + link: form.getFieldValue(LINK_FIELD_NAME) ?? null, + media: mediaValue, + }, + }, + }, + }) + .then(({ errors }) => { + if (!errors) { + message.success({ + content: `Created Post!`, + duration: 3, + }); + onCreate( + form.getFieldValue(TYPE_FIELD_NAME) ?? PostContentType.Text, + form.getFieldValue(TITLE_FIELD_NAME), + form.getFieldValue(DESCRIPTION_FIELD_NAME), + form.getFieldValue(LINK_FIELD_NAME), + form.getFieldValue(LOCATION_FIELD_NAME), + ); + form.resetFields(); + } + }) + .catch((e) => { + message.destroy(); + message.error({ content: 'Failed to create Post! 
An unknown error occured.', duration: 3 }); + console.error('Failed to create Post:', e.message); + }); + onClose(); + }; + + // Handle the Enter press + useEnterKeyListener({ + querySelectorToExecuteClick: '#createPostButton', + }); + + return ( + + + + + } + > + + + ); +} diff --git a/datahub-web-react/src/app/settings/posts/ManagePosts.tsx b/datahub-web-react/src/app/settings/posts/ManagePosts.tsx new file mode 100644 index 0000000000000..e0f694c192c62 --- /dev/null +++ b/datahub-web-react/src/app/settings/posts/ManagePosts.tsx @@ -0,0 +1,40 @@ +import { Typography } from 'antd'; +import React from 'react'; +import styled from 'styled-components/macro'; +import { PostList } from './PostsList'; + +const PageContainer = styled.div` + padding-top: 20px; + width: 100%; + height: 100%; +`; + +const PageHeaderContainer = styled.div` + && { + padding-left: 24px; + } +`; + +const PageTitle = styled(Typography.Title)` + && { + margin-bottom: 12px; + } +`; + +const ListContainer = styled.div``; + +export default function ManagePosts() { + return ( + + + Home Page Posts + + View and manage pinned posts that appear to all users on the landing page. + + + + + + + ); +} diff --git a/datahub-web-react/src/app/settings/posts/PostItemMenu.tsx b/datahub-web-react/src/app/settings/posts/PostItemMenu.tsx new file mode 100644 index 0000000000000..e3fc424a47ef2 --- /dev/null +++ b/datahub-web-react/src/app/settings/posts/PostItemMenu.tsx @@ -0,0 +1,62 @@ +import React from 'react'; +import { DeleteOutlined } from '@ant-design/icons'; +import { Dropdown, Menu, message, Modal } from 'antd'; +import { MenuIcon } from '../../entity/shared/EntityDropdown/EntityDropdown'; +import { useDeletePostMutation } from '../../../graphql/post.generated'; + +type Props = { + urn: string; + title: string; + onDelete?: () => void; +}; + +export default function PostItemMenu({ title, urn, onDelete }: Props) { + const [deletePostMutation] = useDeletePostMutation(); + + const deletePost = () => { + deletePostMutation({ + variables: { + urn, + }, + }) + .then(({ errors }) => { + if (!errors) { + message.success('Deleted Post!'); + onDelete?.(); + } + }) + .catch(() => { + message.destroy(); + message.error({ content: `Failed to delete Post!: An unknown error occurred.`, duration: 3 }); + }); + }; + + const onConfirmDelete = () => { + Modal.confirm({ + title: `Delete Post '${title}'`, + content: `Are you sure you want to remove this Post?`, + onOk() { + deletePost(); + }, + onCancel() {}, + okText: 'Yes', + maskClosable: true, + closable: true, + }); + }; + + return ( + + +  Delete + + + } + > + + + ); +} diff --git a/datahub-web-react/src/app/settings/posts/PostsList.tsx b/datahub-web-react/src/app/settings/posts/PostsList.tsx new file mode 100644 index 0000000000000..5ae2be1547f9b --- /dev/null +++ b/datahub-web-react/src/app/settings/posts/PostsList.tsx @@ -0,0 +1,200 @@ +import React, { useEffect, useState } from 'react'; +import { Button, Empty, Pagination, Typography } from 'antd'; +import { useLocation } from 'react-router'; +import styled from 'styled-components'; +import * as QueryString from 'query-string'; +import { PlusOutlined } from '@ant-design/icons'; +import { AlignType } from 'rc-table/lib/interface'; +import CreatePostModal from './CreatePostModal'; +import { PostColumn, PostEntry, PostListMenuColumn } from './PostsListColumns'; +import { useEntityRegistry } from '../../useEntityRegistry'; +import { useListPostsQuery } from '../../../graphql/post.generated'; +import { scrollToTop } from 
'../../shared/searchUtils'; +import { addToListPostCache, removeFromListPostCache } from './utils'; +import { Message } from '../../shared/Message'; +import TabToolbar from '../../entity/shared/components/styled/TabToolbar'; +import { SearchBar } from '../../search/SearchBar'; +import { StyledTable } from '../../entity/shared/components/styled/StyledTable'; +import { POST_TYPE_TO_DISPLAY_TEXT } from './constants'; + +const PostsContainer = styled.div``; + +export const PostsPaginationContainer = styled.div` + display: flex; + justify-content: center; + padding: 12px; + padding-left: 16px; + border-bottom: 1px solid; + border-color: ${(props) => props.theme.styles['border-color-base']}; + display: flex; + justify-content: space-between; + align-items: center; +`; + +const PaginationInfo = styled(Typography.Text)` + padding: 0px; +`; + +const DEFAULT_PAGE_SIZE = 10; + +export const PostList = () => { + const entityRegistry = useEntityRegistry(); + const location = useLocation(); + const params = QueryString.parse(location.search, { arrayFormat: 'comma' }); + const paramsQuery = (params?.query as string) || undefined; + const [query, setQuery] = useState(undefined); + useEffect(() => setQuery(paramsQuery), [paramsQuery]); + + const [page, setPage] = useState(1); + const [isCreatingPost, setIsCreatingPost] = useState(false); + + const pageSize = DEFAULT_PAGE_SIZE; + const start = (page - 1) * pageSize; + + const { loading, error, data, client, refetch } = useListPostsQuery({ + variables: { + input: { + start, + count: pageSize, + query, + }, + }, + fetchPolicy: query && query.length > 0 ? 'no-cache' : 'cache-first', + }); + + const totalPosts = data?.listPosts?.total || 0; + const lastResultIndex = start + pageSize > totalPosts ? totalPosts : start + pageSize; + const posts = data?.listPosts?.posts || []; + + const onChangePage = (newPage: number) => { + scrollToTop(); + setPage(newPage); + }; + + const handleDelete = (urn: string) => { + removeFromListPostCache(client, urn, page, pageSize); + setTimeout(() => { + refetch?.(); + }, 2000); + }; + + const allColumns = [ + { + title: 'Title', + dataIndex: '', + key: 'title', + sorter: (sourceA, sourceB) => { + return sourceA.title.localeCompare(sourceB.title); + }, + render: (record: PostEntry) => PostColumn(record.title, 200), + width: '20%', + }, + { + title: 'Description', + dataIndex: '', + key: 'description', + render: (record: PostEntry) => PostColumn(record.description || ''), + }, + { + title: 'Type', + dataIndex: '', + key: 'type', + render: (record: PostEntry) => PostColumn(POST_TYPE_TO_DISPLAY_TEXT[record.contentType]), + style: { minWidth: 100 }, + width: '10%', + }, + { + title: '', + dataIndex: '', + width: '5%', + align: 'right' as AlignType, + key: 'menu', + render: PostListMenuColumn(handleDelete), + }, + ]; + + const tableData = posts.map((post) => { + return { + urn: post.urn, + title: post.content.title, + description: post.content.description, + contentType: post.content.contentType, + }; + }); + + return ( + <> + {!data && loading && } + {error && } + + + + null} + onQueryChange={(q) => setQuery(q && q.length > 0 ? q : undefined)} + entityRegistry={entityRegistry} + hideRecommendations + /> + + }} + /> + {totalPosts > pageSize && ( + + + + {lastResultIndex > 0 ? 
(page - 1) * pageSize + 1 : 0} - {lastResultIndex} + {' '} + of {totalPosts} + + + + + )} + {isCreatingPost && ( + setIsCreatingPost(false)} + onCreate={(urn, title, description) => { + addToListPostCache( + client, + { + urn, + properties: { + title, + description: description || null, + }, + }, + pageSize, + ); + setTimeout(() => refetch(), 2000); + }} + /> + )} + + + ); +}; diff --git a/datahub-web-react/src/app/settings/posts/PostsListColumns.tsx b/datahub-web-react/src/app/settings/posts/PostsListColumns.tsx new file mode 100644 index 0000000000000..38f910baf8f41 --- /dev/null +++ b/datahub-web-react/src/app/settings/posts/PostsListColumns.tsx @@ -0,0 +1,26 @@ +import React from 'react'; +// import { Typography } from 'antd'; +import styled from 'styled-components/macro'; +import { Maybe } from 'graphql/jsutils/Maybe'; +import PostItemMenu from './PostItemMenu'; + +export interface PostEntry { + title: string; + contentType: string; + description: Maybe; + urn: string; +} + +const PostText = styled.div<{ minWidth?: number }>` + ${(props) => props.minWidth !== undefined && `min-width: ${props.minWidth}px;`} +`; + +export function PostListMenuColumn(handleDelete: (urn: string) => void) { + return (record: PostEntry) => ( + handleDelete(record.urn)} /> + ); +} + +export function PostColumn(text: string, minWidth?: number) { + return {text}; +} diff --git a/datahub-web-react/src/app/settings/posts/constants.ts b/datahub-web-react/src/app/settings/posts/constants.ts new file mode 100644 index 0000000000000..5a164019fe2e5 --- /dev/null +++ b/datahub-web-react/src/app/settings/posts/constants.ts @@ -0,0 +1,13 @@ +import { PostContentType } from '../../../types.generated'; + +export const TITLE_FIELD_NAME = 'title'; +export const DESCRIPTION_FIELD_NAME = 'description'; +export const LINK_FIELD_NAME = 'link'; +export const LOCATION_FIELD_NAME = 'location'; +export const TYPE_FIELD_NAME = 'type'; +export const CREATE_POST_BUTTON_ID = 'createPostButton'; + +export const POST_TYPE_TO_DISPLAY_TEXT = { + [PostContentType.Link]: 'Link', + [PostContentType.Text]: 'Announcement', +}; diff --git a/datahub-web-react/src/app/settings/posts/utils.ts b/datahub-web-react/src/app/settings/posts/utils.ts new file mode 100644 index 0000000000000..ce48c7400738c --- /dev/null +++ b/datahub-web-react/src/app/settings/posts/utils.ts @@ -0,0 +1,77 @@ +import { ListPostsDocument, ListPostsQuery } from '../../../graphql/post.generated'; + +/** + * Add an entry to the list posts cache. + */ +export const addToListPostCache = (client, newPost, pageSize) => { + // Read the data from our cache for this query. + const currData: ListPostsQuery | null = client.readQuery({ + query: ListPostsDocument, + variables: { + input: { + start: 0, + count: pageSize, + }, + }, + }); + + // Add our new post into the existing list. + const newPosts = [newPost, ...(currData?.listPosts?.posts || [])]; + + // Write our data back to the cache. + client.writeQuery({ + query: ListPostsDocument, + variables: { + input: { + start: 0, + count: pageSize, + }, + }, + data: { + listPosts: { + start: 0, + count: (currData?.listPosts?.count || 0) + 1, + total: (currData?.listPosts?.total || 0) + 1, + posts: newPosts, + }, + }, + }); +}; + +/** + * Remove an entry from the list posts cache. + */ +export const removeFromListPostCache = (client, urn, page, pageSize) => { + // Read the data from our cache for this query. 
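    // (The cached page is keyed by the same { start, count } variables the list view queried
    // with, so callers pass the page currently being displayed. PostsList.tsx, for example,
    // calls removeFromListPostCache(client, urn, page, pageSize) from handleDelete and then
    // refetches after a short delay.)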
+ const currData: ListPostsQuery | null = client.readQuery({ + query: ListPostsDocument, + variables: { + input: { + start: (page - 1) * pageSize, + count: pageSize, + }, + }, + }); + + // Remove the post from the existing posts set. + const newPosts = [...(currData?.listPosts?.posts || []).filter((post) => post.urn !== urn)]; + + // Write our data back to the cache. + client.writeQuery({ + query: ListPostsDocument, + variables: { + input: { + start: (page - 1) * pageSize, + count: pageSize, + }, + }, + data: { + listPosts: { + start: currData?.listPosts?.start || 0, + count: (currData?.listPosts?.count || 1) - 1, + total: (currData?.listPosts?.total || 1) - 1, + posts: newPosts, + }, + }, + }); +}; diff --git a/datahub-web-react/src/app/shared/health/healthUtils.tsx b/datahub-web-react/src/app/shared/health/healthUtils.tsx index 823d77d7eabe9..ff7d9b417617c 100644 --- a/datahub-web-react/src/app/shared/health/healthUtils.tsx +++ b/datahub-web-react/src/app/shared/health/healthUtils.tsx @@ -11,13 +11,17 @@ import { HealthStatus, HealthStatusType, Health } from '../../../types.generated const HEALTH_INDICATOR_COLOR = '#d48806'; -const UnhealthyIconFilled = styled(ExclamationCircleTwoTone)` - font-size: 16px; +const UnhealthyIconFilled = styled(ExclamationCircleTwoTone)<{ fontSize: number }>` + && { + font-size: ${(props) => props.fontSize}px; + } `; -const UnhealthyIconOutlined = styled(ExclamationCircleOutlined)` +const UnhealthyIconOutlined = styled(ExclamationCircleOutlined)<{ fontSize: number }>` color: ${HEALTH_INDICATOR_COLOR}; - font-size: 16px; + && { + font-size: ${(props) => props.fontSize}px; + } `; export enum HealthSummaryIconType { @@ -32,12 +36,16 @@ export const isUnhealthy = (healths: Health[]) => { return isFailingAssertions; }; -export const getHealthSummaryIcon = (healths: Health[], type: HealthSummaryIconType = HealthSummaryIconType.FILLED) => { +export const getHealthSummaryIcon = ( + healths: Health[], + type: HealthSummaryIconType = HealthSummaryIconType.FILLED, + fontSize = 16, +) => { const unhealthy = isUnhealthy(healths); return unhealthy - ? (type === HealthSummaryIconType.FILLED && ) || ( - - ) + ? 
(type === HealthSummaryIconType.FILLED && ( + + )) || : undefined; }; diff --git a/datahub-web-react/src/app/shared/tags/tag/Tag.tsx b/datahub-web-react/src/app/shared/tags/tag/Tag.tsx index 2288238091776..ed2460b6eea3c 100644 --- a/datahub-web-react/src/app/shared/tags/tag/Tag.tsx +++ b/datahub-web-react/src/app/shared/tags/tag/Tag.tsx @@ -8,6 +8,7 @@ import { StyledTag } from '../../../entity/shared/components/styled/StyledTag'; import { HoverEntityTooltip } from '../../../recommendations/renderer/component/HoverEntityTooltip'; import { useEntityRegistry } from '../../../useEntityRegistry'; import { TagProfileDrawer } from '../TagProfileDrawer'; +import { useHasMatchedFieldByUrn } from '../../../search/context/SearchResultContext'; const TagLink = styled.span` display: inline-block; @@ -41,6 +42,7 @@ export default function Tag({ }: Props) { const entityRegistry = useEntityRegistry(); const [removeTagMutation] = useRemoveTagMutation(); + const highlightTag = useHasMatchedFieldByUrn(tag.tag.urn, 'tags'); const [tagProfileDrawerVisible, setTagProfileDrawerVisible] = useState(false); const [addTagUrn, setAddTagUrn] = useState(''); @@ -110,6 +112,7 @@ export default function Tag({ removeTag(tag); }} fontSize={fontSize} + highlightTag={highlightTag} > ` +const StyledTag = styled(Tag)<{ fontSize?: number; highlightTerm?: boolean }>` + &&& { + ${(props) => + props.highlightTerm && + ` + background: ${props.theme.styles['highlight-color']}; + border: 1px solid ${props.theme.styles['highlight-border-color']}; + `} + } ${(props) => props.fontSize && `font-size: ${props.fontSize}px;`} `; @@ -38,6 +47,7 @@ export default function TermContent({ }: Props) { const entityRegistry = useEntityRegistry(); const [removeTermMutation] = useRemoveTermMutation(); + const highlightTerm = useHasMatchedFieldByUrn(term.term.urn, 'glossaryTerms'); const removeTerm = (termToRemove: GlossaryTermAssociation) => { onOpenModal?.(); @@ -85,6 +95,7 @@ export default function TermContent({ removeTerm(term); }} fontSize={fontSize} + highlightTerm={highlightTerm} > diff --git a/datahub-web-react/src/appConfigContext.tsx b/datahub-web-react/src/appConfigContext.tsx index 3b34b108ecc93..807a17c4fd6a4 100644 --- a/datahub-web-react/src/appConfigContext.tsx +++ b/datahub-web-react/src/appConfigContext.tsx @@ -27,6 +27,9 @@ export const DEFAULT_APP_CONFIG = { entityProfile: { domainDefaultTab: null, }, + searchResult: { + enableNameHighlight: false, + }, }, authConfig: { tokenAuthEnabled: false, diff --git a/datahub-web-react/src/conf/Global.ts b/datahub-web-react/src/conf/Global.ts index b16dd1eaace57..e1220b8c81b53 100644 --- a/datahub-web-react/src/conf/Global.ts +++ b/datahub-web-react/src/conf/Global.ts @@ -28,6 +28,7 @@ export enum PageRoutes { SETTINGS_VIEWS = '/settings/views', EMBED = '/embed', EMBED_LOOKUP = '/embed/lookup/:url', + SETTINGS_POSTS = '/settings/posts', } /** diff --git a/datahub-web-react/src/conf/theme/theme_dark.config.json b/datahub-web-react/src/conf/theme/theme_dark.config.json index b648f3d997f21..9746c3ddde5f3 100644 --- a/datahub-web-react/src/conf/theme/theme_dark.config.json +++ b/datahub-web-react/src/conf/theme/theme_dark.config.json @@ -17,7 +17,9 @@ "disabled-color": "fade(white, 25%)", "steps-nav-arrow-color": "fade(white, 25%)", "homepage-background-upper-fade": "#FFFFFF", - "homepage-background-lower-fade": "#333E4C" + "homepage-background-lower-fade": "#333E4C", + "highlight-color": "#E6F4FF", + "highlight-border-color": "#BAE0FF" }, "assets": { "logoUrl": "/assets/logo.png" diff --git 
a/datahub-web-react/src/conf/theme/theme_light.config.json b/datahub-web-react/src/conf/theme/theme_light.config.json index e842fdb1bb8aa..906c04e38a1ba 100644 --- a/datahub-web-react/src/conf/theme/theme_light.config.json +++ b/datahub-web-react/src/conf/theme/theme_light.config.json @@ -20,7 +20,9 @@ "homepage-background-lower-fade": "#FFFFFF", "homepage-text-color": "#434343", "box-shadow": "0px 0px 30px 0px rgb(239 239 239)", - "box-shadow-hover": "0px 1px 0px 0.5px rgb(239 239 239)" + "box-shadow-hover": "0px 1px 0px 0.5px rgb(239 239 239)", + "highlight-color": "#E6F4FF", + "highlight-border-color": "#BAE0FF" }, "assets": { "logoUrl": "/assets/logo.png" diff --git a/datahub-web-react/src/conf/theme/types.ts b/datahub-web-react/src/conf/theme/types.ts index 98140cbbd553d..7d78230092700 100644 --- a/datahub-web-react/src/conf/theme/types.ts +++ b/datahub-web-react/src/conf/theme/types.ts @@ -18,6 +18,8 @@ export type Theme = { 'homepage-background-lower-fade': string; 'box-shadow': string; 'box-shadow-hover': string; + 'highlight-color': string; + 'highlight-border-color': string; }; assets: { logoUrl: string; diff --git a/datahub-web-react/src/graphql/app.graphql b/datahub-web-react/src/graphql/app.graphql index 4b1295f1024a2..bf15e5f757f8f 100644 --- a/datahub-web-react/src/graphql/app.graphql +++ b/datahub-web-react/src/graphql/app.graphql @@ -45,6 +45,9 @@ query appConfig { defaultTab } } + searchResult { + enableNameHighlight + } } telemetryConfig { enableThirdPartyLogging diff --git a/datahub-web-react/src/graphql/lineage.graphql b/datahub-web-react/src/graphql/lineage.graphql index 61c79abf929a0..52385dee8631a 100644 --- a/datahub-web-react/src/graphql/lineage.graphql +++ b/datahub-web-react/src/graphql/lineage.graphql @@ -198,6 +198,12 @@ fragment lineageNodeProperties on EntityWithRelationships { path } } + health { + type + status + message + causes + } } ... on MLModelGroup { urn diff --git a/datahub-web-react/src/graphql/me.graphql b/datahub-web-react/src/graphql/me.graphql index 2c693c747af56..af850c9c3ce28 100644 --- a/datahub-web-react/src/graphql/me.graphql +++ b/datahub-web-react/src/graphql/me.graphql @@ -46,6 +46,7 @@ query getMe { createTags manageGlobalViews manageOwnershipTypes + manageGlobalAnnouncements } } } diff --git a/datahub-web-react/src/graphql/post.graphql b/datahub-web-react/src/graphql/post.graphql index c19f38fc7751c..ee092ad4fba90 100644 --- a/datahub-web-react/src/graphql/post.graphql +++ b/datahub-web-react/src/graphql/post.graphql @@ -20,3 +20,11 @@ query listPosts($input: ListPostsInput!) { } } } + +mutation createPost($input: CreatePostInput!) { + createPost(input: $input) +} + +mutation deletePost($urn: String!) { + deletePost(urn: $urn) +} diff --git a/datahub-web-react/src/graphql/search.graphql b/datahub-web-react/src/graphql/search.graphql index f18b071705393..7cd868d7cd2b2 100644 --- a/datahub-web-react/src/graphql/search.graphql +++ b/datahub-web-react/src/graphql/search.graphql @@ -2,6 +2,7 @@ fragment autoCompleteFields on Entity { urn type ... on Dataset { + exists name platform { ...platformFields @@ -19,6 +20,29 @@ fragment autoCompleteFields on Entity { subTypes { typeNames } + siblings { + isPrimary + siblings { + urn + type + ... on Dataset { + exists + platform { + ...platformFields + } + parentContainers { + ...parentContainersFields + } + name + properties { + name + description + qualifiedName + externalUrl + } + } + } + } ...datasetStatsFields } ... 
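The new `createPost` and `deletePost` mutations above back the Posts settings page added earlier in this diff. As a rough sketch, they can be exercised directly against the GraphQL endpoint with curl, in the same style as the API tutorials later in this document; the endpoint, access token, and post URN below are placeholder assumptions for a local quickstart, not values taken from this change.

```shell
# List the first page of posts (selection set follows the listPosts query in post.graphql).
curl --location --request POST 'http://localhost:8080/api/graphql' \
--header 'Authorization: Bearer <access-token>' \
--header 'Content-Type: application/json' \
--data-raw '{ "query": "query listPosts($input: ListPostsInput!) { listPosts(input: $input) { start count total posts { urn } } }", "variables": { "input": { "start": 0, "count": 10 } } }'

# Delete one of the returned posts; the URN here is hypothetical.
curl --location --request POST 'http://localhost:8080/api/graphql' \
--header 'Authorization: Bearer <access-token>' \
--header 'Content-Type: application/json' \
--data-raw '{ "query": "mutation deletePost($urn: String!) { deletePost(urn: $urn) }", "variables": { "urn": "urn:li:post:example-post" } }'
```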
on CorpUser { @@ -250,83 +274,77 @@ fragment datasetStatsFields on Dataset { } } +fragment nonSiblingsDatasetSearchFields on Dataset { + exists + name + origin + uri + platform { + ...platformFields + } + dataPlatformInstance { + ...dataPlatformInstanceFields + } + editableProperties { + description + } + platformNativeType + properties { + name + description + qualifiedName + customProperties { + key + value + } + externalUrl + } + ownership { + ...ownershipFields + } + globalTags { + ...globalTagsFields + } + glossaryTerms { + ...glossaryTerms + } + subTypes { + typeNames + } + domain { + ...entityDomain + } + ...entityDataProduct + parentContainers { + ...parentContainersFields + } + deprecation { + ...deprecationFields + } + health { + type + status + message + causes + } + ...datasetStatsFields +} + fragment searchResultFields on Entity { urn type ... on Dataset { - exists - name - origin - uri - platform { - ...platformFields - } - dataPlatformInstance { - ...dataPlatformInstanceFields - } - editableProperties { - description - } - platformNativeType - properties { - name - description - qualifiedName - customProperties { - key - value - } - externalUrl - } - ownership { - ...ownershipFields - } - globalTags { - ...globalTagsFields - } - glossaryTerms { - ...glossaryTerms - } - subTypes { - typeNames - } - domain { - ...entityDomain - } - ...entityDataProduct - parentContainers { - ...parentContainersFields - } - deprecation { - ...deprecationFields - } - health { - type - status - message - causes - } + ...nonSiblingsDatasetSearchFields siblings { isPrimary siblings { urn type ... on Dataset { - exists - platform { - ...platformFields - } - name - properties { - name - description - qualifiedName - externalUrl - } + ...nonSiblingsDatasetSearchFields } } } - ...datasetStatsFields } ... on CorpUser { username @@ -814,6 +832,11 @@ fragment searchResults on SearchResults { matchedFields { name value + entity { + urn + type + ...entityDisplayNameFields + } } insights { text @@ -823,6 +846,11 @@ fragment searchResults on SearchResults { facets { ...facetFields } + suggestions { + text + frequency + score + } } fragment schemaFieldEntityFields on SchemaFieldEntity { diff --git a/docker/airflow/local_airflow.md b/docker/airflow/local_airflow.md index d0a2b18cff2d2..55a64f5c122c5 100644 --- a/docker/airflow/local_airflow.md +++ b/docker/airflow/local_airflow.md @@ -138,25 +138,57 @@ Successfully added `conn_id`=datahub_rest_default : datahub_rest://:@http://data Navigate the Airflow UI to find the sample Airflow dag we just brought in -![Find the DAG](../../docs/imgs/airflow/find_the_dag.png) + +

+ +

+ By default, Airflow loads all DAG-s in paused status. Unpause the sample DAG to use it. -![Paused DAG](../../docs/imgs/airflow/paused_dag.png) -![Unpaused DAG](../../docs/imgs/airflow/unpaused_dag.png) + +

+ +

+ + +

+ +

+ Then trigger the DAG to run. -![Trigger the DAG](../../docs/imgs/airflow/trigger_dag.png) + +
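These two steps can also be done from the command line instead of the UI. A rough sketch, where the container name and DAG id are assumptions; substitute the DAG id shown in your Airflow UI:

```shell
# Hypothetical container name and DAG id; adjust both to your local setup.
docker exec -it airflow_scheduler airflow dags unpause my_sample_dag
docker exec -it airflow_scheduler airflow dags trigger my_sample_dag
```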

+ +

+ After the DAG runs successfully, go over to your DataHub instance to see the Pipeline and navigate its lineage. -![DataHub Pipeline View](../../docs/imgs/airflow/datahub_pipeline_view.png) -![DataHub Pipeline Entity](../../docs/imgs/airflow/datahub_pipeline_entity.png) +

+ +

+ + + +

+ +

-![DataHub Task View](../../docs/imgs/airflow/datahub_task_view.png) -![DataHub Lineage View](../../docs/imgs/airflow/datahub_lineage_view.png) + +

+ +

+ + + +

+ +

+ ## TroubleShooting @@ -164,9 +196,17 @@ Most issues are related to connectivity between Airflow and DataHub. Here is how you can debug them. -![Find the Task Log](../../docs/imgs/airflow/finding_failed_log.png) -![Inspect the Log](../../docs/imgs/airflow/connection_error.png) +

+ +

+ + + +

+ +

+ In this case, clearly the connection `datahub-rest` has not been registered. Looks like we forgot to register the connection with Airflow! Let's execute Step 4 to register the datahub connection with Airflow. @@ -175,4 +215,8 @@ In case the connection was registered successfully but you are still seeing `Fai After re-running the DAG, we see success! -![Pipeline Success](../../docs/imgs/airflow/successful_run.png) + +

+ +
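For reference, a sketch of what that registration step looks like from the command line; the container name and GMS address are assumptions for the local quickstart, while the connection id matches the `datahub_rest_default` shown earlier in this guide:

```shell
# Hypothetical container name; the DataHub Airflow plugin looks up this conn id by default.
docker exec -it airflow_scheduler \
    airflow connections add 'datahub_rest_default' \
    --conn-type 'datahub_rest' \
    --conn-host 'http://datahub-gms:8080'
```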

+ diff --git a/docker/build.gradle b/docker/build.gradle index f33e06f383240..ae101fe1defc5 100644 --- a/docker/build.gradle +++ b/docker/build.gradle @@ -35,8 +35,31 @@ task quickstart(type: Exec, dependsOn: ':metadata-ingestion:install') { environment "DATAHUB_TELEMETRY_ENABLED", "false" environment "DOCKER_COMPOSE_BASE", "file://${rootProject.projectDir}" - environment "ACTIONS_VERSION", 'alpine3.17-slim' - environment "DATAHUB_ACTIONS_IMAGE", 'nginx' + // environment "ACTIONS_VERSION", 'alpine3.17-slim' + // environment "DATAHUB_ACTIONS_IMAGE", 'nginx' + + def cmd = [ + 'source ../metadata-ingestion/venv/bin/activate && ', + 'datahub docker quickstart', + '--no-pull-images', + '--standalone_consumers', + '--version', "v${version}", + '--dump-logs-on-failure' + ] + + commandLine 'bash', '-c', cmd.join(" ") +} + +task quickstartSlim(type: Exec, dependsOn: ':metadata-ingestion:install') { + dependsOn(([':docker:datahub-ingestion'] + quickstart_modules).collect { it + ':dockerTag' }) + shouldRunAfter ':metadata-ingestion:clean', 'quickstartNuke' + + environment "DATAHUB_TELEMETRY_ENABLED", "false" + environment "DOCKER_COMPOSE_BASE", "file://${rootProject.projectDir}" + environment "DATAHUB_ACTIONS_IMAGE", "acryldata/datahub-ingestion" + environment "ACTIONS_VERSION", "v${version}-slim" + environment "ACTIONS_EXTRA_PACKAGES", 'acryl-datahub-actions[executor] acryl-datahub-actions' + environment "ACTIONS_CONFIG", 'https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml' def cmd = [ 'source ../metadata-ingestion/venv/bin/activate && ', @@ -64,6 +87,7 @@ task quickstartDebug(type: Exec, dependsOn: ':metadata-ingestion:install') { dependsOn(debug_modules.collect { it + ':dockerTagDebug' }) shouldRunAfter ':metadata-ingestion:clean', 'quickstartNuke' + environment "DATAHUB_PRECREATE_TOPICS", "true" environment "DATAHUB_TELEMETRY_ENABLED", "false" environment "DOCKER_COMPOSE_BASE", "file://${rootProject.projectDir}" diff --git a/docker/datahub-frontend/Dockerfile b/docker/datahub-frontend/Dockerfile index 23c04972209ed..9efc0d2ce8753 100644 --- a/docker/datahub-frontend/Dockerfile +++ b/docker/datahub-frontend/Dockerfile @@ -29,6 +29,8 @@ FROM base as dev-install VOLUME [ "/datahub-frontend" ] FROM ${APP_ENV}-install as final +COPY ./docker/datahub-frontend/start.sh / +RUN chown datahub:datahub /start.sh && chmod 755 /start.sh USER datahub ARG SERVER_PORT=9002 @@ -37,5 +39,4 @@ RUN echo $SERVER_PORT EXPOSE $SERVER_PORT HEALTHCHECK --start-period=2m --retries=4 CMD curl --fail http://localhost:$SERVER_PORT/admin || exit 1 -COPY ./docker/datahub-frontend/start.sh / CMD ./start.sh diff --git a/docker/datahub-frontend/start.sh b/docker/datahub-frontend/start.sh index a1548670309b5..9dc1514144bb1 100755 --- a/docker/datahub-frontend/start.sh +++ b/docker/datahub-frontend/start.sh @@ -26,6 +26,21 @@ if [[ ! -z ${SSL_TRUSTSTORE_PASSWORD:-} ]]; then TRUSTSTORE_PASSWORD="-Djavax.net.ssl.trustStorePassword=$SSL_TRUSTSTORE_PASSWORD" fi +HTTP_PROXY="" +if [[ ! -z ${HTTP_PROXY_HOST:-} ]] && [[ ! -z ${HTTP_PROXY_PORT:-} ]]; then + HTTP_PROXY="-Dhttp.proxyHost=$HTTP_PROXY_HOST -Dhttp.proxyPort=$HTTP_PROXY_PORT" +fi + +HTTPS_PROXY="" +if [[ ! -z ${HTTPS_PROXY_HOST:-} ]] && [[ ! -z ${HTTPS_PROXY_PORT:-} ]]; then + HTTPS_PROXY="-Dhttps.proxyHost=$HTTPS_PROXY_HOST -Dhttps.proxyPort=$HTTPS_PROXY_PORT" +fi + +NO_PROXY="" +if [[ ! 
-z ${HTTP_NON_PROXY_HOSTS:-} ]]; then + NO_PROXY="-Dhttp.nonProxyHosts='$HTTP_NON_PROXY_HOSTS'" +fi + # make sure there is no whitespace at the beginning and the end of # this string export JAVA_OPTS="-Xms512m \ @@ -37,6 +52,7 @@ export JAVA_OPTS="-Xms512m \ -Dlogback.debug=false \ ${PROMETHEUS_AGENT:-} ${OTEL_AGENT:-} \ ${TRUSTSTORE_FILE:-} ${TRUSTSTORE_TYPE:-} ${TRUSTSTORE_PASSWORD:-} \ + ${HTTP_PROXY:-} ${HTTPS_PROXY:-} ${NO_PROXY:-} \ -Dpidfile.path=/dev/null" exec ./datahub-frontend/bin/datahub-frontend diff --git a/docker/datahub-ingestion-base/Dockerfile b/docker/datahub-ingestion-base/Dockerfile index 9893d44caf460..3d47f79617370 100644 --- a/docker/datahub-ingestion-base/Dockerfile +++ b/docker/datahub-ingestion-base/Dockerfile @@ -1,3 +1,6 @@ +ARG APP_ENV=full +ARG BASE_IMAGE=base + FROM golang:1-alpine3.17 AS binary ENV DOCKERIZE_VERSION v0.6.1 @@ -16,9 +19,7 @@ ENV CONFLUENT_KAFKA_VERSION=1.6.1 ENV DEBIAN_FRONTEND noninteractive -RUN apt-get update && apt-get install -y \ - && apt-get install -y -qq \ - # gcc \ +RUN apt-get update && apt-get install -y -qq \ make \ python3-ldap \ libldap2-dev \ @@ -31,15 +32,34 @@ RUN apt-get update && apt-get install -y \ zip \ unzip \ ldap-utils \ - openjdk-11-jre-headless \ - && python -m pip install --upgrade pip wheel setuptools==57.5.0 \ - && curl -Lk -o /root/librdkafka-${LIBRDKAFKA_VERSION}.tar.gz https://github.com/edenhill/librdkafka/archive/v${LIBRDKAFKA_VERSION}.tar.gz \ - && tar -xzf /root/librdkafka-${LIBRDKAFKA_VERSION}.tar.gz -C /root \ - && cd /root/librdkafka-${LIBRDKAFKA_VERSION} \ - && ./configure --prefix /usr && make && make install && make clean && ./configure --clean \ - && apt-get remove -y make + && python -m pip install --no-cache --upgrade pip wheel setuptools \ + && wget -q https://github.com/edenhill/librdkafka/archive/v${LIBRDKAFKA_VERSION}.tar.gz -O - | \ + tar -xz -C /root \ + && cd /root/librdkafka-${LIBRDKAFKA_VERSION} \ + && ./configure --prefix /usr && make && make install && cd .. 
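The proxy handling added to `datahub-frontend/start.sh` above only assembles the corresponding JVM flags when the environment variables are present. A minimal sketch of passing them to the frontend container, with illustrative image, tag, and proxy values:

```shell
# start.sh maps these to -Dhttp.proxyHost/-Dhttp.proxyPort, -Dhttps.proxyHost/-Dhttps.proxyPort
# and -Dhttp.nonProxyHosts; all values below are examples only.
docker run --rm -p 9002:9002 \
    -e HTTP_PROXY_HOST=proxy.corp.example.com \
    -e HTTP_PROXY_PORT=3128 \
    -e HTTPS_PROXY_HOST=proxy.corp.example.com \
    -e HTTPS_PROXY_PORT=3128 \
    -e HTTP_NON_PROXY_HOSTS='localhost|datahub-gms' \
    acryldata/datahub-frontend-react:head
```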
&& rm -rf /root/librdkafka-${LIBRDKAFKA_VERSION} \ + && apt-get remove -y make \ + && rm -rf /var/lib/apt/lists/* /var/cache/apk/* + +# compiled against newer golang for security fixes COPY --from=binary /go/bin/dockerize /usr/local/bin +COPY ./docker/datahub-ingestion-base/base-requirements.txt requirements.txt +COPY ./docker/datahub-ingestion-base/entrypoint.sh /entrypoint.sh + +RUN pip install --no-cache -r requirements.txt && \ + pip uninstall -y acryl-datahub && \ + chmod +x /entrypoint.sh && \ + addgroup --gid 1000 datahub && \ + adduser --disabled-password --uid 1000 --gid 1000 --home /datahub-ingestion datahub + +ENTRYPOINT [ "/entrypoint.sh" ] + +FROM ${BASE_IMAGE} as full-install + +RUN apt-get update && apt-get install -y -qq \ + default-jre-headless \ + && rm -rf /var/lib/apt/lists/* /var/cache/apk/* + RUN if [ $(arch) = "x86_64" ]; then \ mkdir /opt/oracle && \ cd /opt/oracle && \ @@ -58,7 +78,10 @@ RUN if [ $(arch) = "x86_64" ]; then \ ldconfig; \ fi; -COPY ./docker/datahub-ingestion-base/base-requirements.txt requirements.txt +FROM ${BASE_IMAGE} as slim-install +# Do nothing else on top of base + +FROM ${APP_ENV}-install -RUN pip install -r requirements.txt && \ - pip uninstall -y acryl-datahub +USER datahub +ENV PATH="/datahub-ingestion/.local/bin:$PATH" \ No newline at end of file diff --git a/docker/datahub-ingestion-base/base-requirements.txt b/docker/datahub-ingestion-base/base-requirements.txt index 3d9e0777e5ce0..82d9a93a9a2c3 100644 --- a/docker/datahub-ingestion-base/base-requirements.txt +++ b/docker/datahub-ingestion-base/base-requirements.txt @@ -1,3 +1,7 @@ +# Excluded for slim +# pyspark==3.0.3 +# pydeequ==1.0.1 + acryl-datahub-classify==0.0.6 acryl-iceberg-legacy==0.0.4 acryl-PyHive==0.6.13 @@ -253,7 +257,6 @@ pycryptodome==3.18.0 pycryptodomex==3.18.0 pydantic==1.10.8 pydash==7.0.3 -pydeequ==1.0.1 pydruid==0.6.5 Pygments==2.15.1 pymongo==4.3.3 @@ -261,7 +264,6 @@ PyMySQL==1.0.3 pyOpenSSL==22.0.0 pyparsing==3.0.9 pyrsistent==0.19.3 -pyspark==3.0.3 pyspnego==0.9.0 python-daemon==3.0.1 python-dateutil==2.8.2 diff --git a/docker/datahub-ingestion-base/build.gradle b/docker/datahub-ingestion-base/build.gradle index fe3c12a59886f..10cd2ee71cce3 100644 --- a/docker/datahub-ingestion-base/build.gradle +++ b/docker/datahub-ingestion-base/build.gradle @@ -12,14 +12,17 @@ ext { } docker { - name "${docker_registry}/${docker_repo}:v${version}" - version "v${version}" + name "${docker_registry}/${docker_repo}:v${version}-slim" + version "v${version}-slim" dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { include "docker/${docker_dir}/*" + }.exclude { + i -> i.file.isHidden() || i.file == buildDir } + buildArgs([APP_ENV: 'slim']) } -tasks.getByPath('docker').dependsOn('build') +tasks.getByName('docker').dependsOn('build') task mkdirBuildDocker { doFirst { @@ -27,10 +30,11 @@ task mkdirBuildDocker { } } dockerClean.finalizedBy(mkdirBuildDocker) +dockerClean.dependsOn([':docker:datahub-ingestion:dockerClean']) task cleanLocalDockerImages { doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}".toString()) + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } dockerClean.finalizedBy(cleanLocalDockerImages) \ No newline at end of file diff --git a/docker/datahub-ingestion-base/entrypoint.sh b/docker/datahub-ingestion-base/entrypoint.sh new file mode 100644 index 0000000000000..518bb21561467 --- /dev/null +++ 
b/docker/datahub-ingestion-base/entrypoint.sh @@ -0,0 +1,14 @@ +#!/usr/bin/bash + +if [ ! -z "$ACTIONS_EXTRA_PACKAGES" ]; then + pip install --user $ACTIONS_EXTRA_PACKAGES +fi + +if [[ ! -z "$ACTIONS_CONFIG" && ! -z "$ACTIONS_EXTRA_PACKAGES" ]]; then + mkdir -p /tmp/datahub/logs + curl -q "$ACTIONS_CONFIG" -o config.yaml + exec dockerize -wait ${DATAHUB_GMS_PROTOCOL:-http}://$DATAHUB_GMS_HOST:$DATAHUB_GMS_PORT/health -timeout 240s \ + datahub actions --config config.yaml +else + exec datahub $@ +fi diff --git a/docker/datahub-ingestion-slim/Dockerfile b/docker/datahub-ingestion-slim/Dockerfile deleted file mode 100644 index 580dcc4277124..0000000000000 --- a/docker/datahub-ingestion-slim/Dockerfile +++ /dev/null @@ -1,9 +0,0 @@ -# Defining environment -ARG APP_ENV=prod -ARG DOCKER_VERSION=latest - -FROM acryldata/datahub-ingestion:$DOCKER_VERSION as base - -USER 0 -RUN pip uninstall -y pyspark -USER datahub diff --git a/docker/datahub-ingestion-slim/build.gradle b/docker/datahub-ingestion-slim/build.gradle deleted file mode 100644 index f21b66b576a0c..0000000000000 --- a/docker/datahub-ingestion-slim/build.gradle +++ /dev/null @@ -1,39 +0,0 @@ -plugins { - id 'com.palantir.docker' - id 'java' // required for versioning -} - -apply from: "../../gradle/versioning/versioning.gradle" - -ext { - docker_registry = rootProject.ext.docker_registry == 'linkedin' ? 'acryldata' : docker_registry - docker_repo = 'datahub-ingestion-slim' - docker_dir = 'datahub-ingestion-slim' -} - -docker { - name "${docker_registry}/${docker_repo}:v${version}" - version "v${version}" - dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") - files fileTree(rootProject.projectDir) { - include "docker/${docker_dir}/*" - } - buildArgs([DOCKER_VERSION: version]) - - buildx(false) -} -tasks.getByPath('docker').dependsOn(['build', ':docker:datahub-ingestion:docker']) - -task mkdirBuildDocker { - doFirst { - mkdir "${project.buildDir}/docker" - } -} -dockerClean.finalizedBy(mkdirBuildDocker) - -task cleanLocalDockerImages { - doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}".toString()) - } -} -dockerClean.finalizedBy(cleanLocalDockerImages) \ No newline at end of file diff --git a/docker/datahub-ingestion/Dockerfile b/docker/datahub-ingestion/Dockerfile index 45a98efb7f6fb..0ecc30d02ac3f 100644 --- a/docker/datahub-ingestion/Dockerfile +++ b/docker/datahub-ingestion/Dockerfile @@ -1,42 +1,27 @@ # Defining environment -ARG APP_ENV=prod +ARG APP_ENV=full +ARG BASE_IMAGE=acryldata/datahub-ingestion-base ARG DOCKER_VERSION=latest -FROM acryldata/datahub-ingestion-base:$DOCKER_VERSION as base - -FROM eclipse-temurin:11 as prod-build -COPY . /datahub-src -WORKDIR /datahub-src -# We noticed that the gradle wrapper download failed frequently on in CI on arm64 machines. -# I suspect this was due because of the QEMU emulation slowdown, combined with the arm64 -# build being starved for CPU by the x86_64 build's codegen step. -# -# The middle step will attempt to download gradle wrapper 5 times with exponential backoff. -# The ./gradlew --version will force the download of the gradle wrapper but is otherwise a no-op. -# Note that the retry logic will always return success, so we should always attempt to run codegen. -# Inspired by https://github.com/gradle/gradle/issues/18124#issuecomment-958182335. -# and https://unix.stackexchange.com/a/82610/378179. -# This is a workaround for https://github.com/gradle/gradle/issues/18124. 
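The entrypoint above switches into `datahub actions` mode only when both `ACTIONS_CONFIG` and `ACTIONS_EXTRA_PACKAGES` are set; otherwise it falls through to the plain `datahub` CLI. A hedged sketch of exercising both paths with the ingestion image (the image tag, Docker network, and GMS address are assumptions; the `ACTIONS_*` values mirror the `quickstartSlim` task above):

```shell
# Default path: arguments are passed through to the datahub CLI (runs "datahub version").
docker run --rm acryldata/datahub-ingestion:head version

# Actions path: installs the extra packages, downloads the executor config,
# waits for GMS via dockerize, then execs "datahub actions --config config.yaml".
docker run --rm --network datahub_network \
    -e DATAHUB_GMS_HOST=datahub-gms -e DATAHUB_GMS_PORT=8080 \
    -e ACTIONS_EXTRA_PACKAGES='acryl-datahub-actions[executor] acryl-datahub-actions' \
    -e ACTIONS_CONFIG='https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml' \
    acryldata/datahub-ingestion:head
```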
-RUN (for attempt in 1 2 3 4 5; do ./gradlew --version && break ; echo "Failed to download gradle wrapper (attempt $attempt)" && sleep $((2<<$attempt)) ; done ) && \ - ./gradlew :metadata-events:mxe-schemas:build - -FROM base as prod-codegen -COPY --from=prod-build /datahub-src /datahub-src -RUN cd /datahub-src/metadata-ingestion && \ - pip install -e ".[base]" && \ - ./scripts/codegen.sh - -FROM base as prod-install -COPY --from=prod-codegen /datahub-src/metadata-ingestion /datahub-ingestion -COPY --from=prod-codegen /root/.cache/pip /root/.cache/pip +FROM $BASE_IMAGE:$DOCKER_VERSION as base +USER 0 + +COPY ./metadata-ingestion /datahub-ingestion + ARG RELEASE_VERSION -RUN cd /datahub-ingestion && \ - sed -i.bak "s/__version__ = \"0.0.0.dev0\"/__version__ = \"$RELEASE_VERSION\"/" src/datahub/__init__.py && \ +WORKDIR /datahub-ingestion +RUN sed -i.bak "s/__version__ = \"0.0.0.dev0\"/__version__ = \"$RELEASE_VERSION\"/" src/datahub/__init__.py && \ cat src/datahub/__init__.py && \ - pip install ".[all]" && \ - pip freeze && \ - # This is required to fix security vulnerability in htrace-core4 - rm -f /usr/local/lib/python3.10/site-packages/pyspark/jars/htrace-core4-4.1.0-incubating.jar + chown -R datahub /datahub-ingestion + +USER datahub +ENV PATH="/datahub-ingestion/.local/bin:$PATH" + +FROM base as slim-install +RUN pip install --no-cache --user ".[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary]" + +FROM base as full-install +RUN pip install --no-cache --user ".[all]" FROM base as dev-install # Dummy stage for development. Assumes code is built on your machine and mounted to this image. @@ -44,7 +29,5 @@ FROM base as dev-install FROM ${APP_ENV}-install as final -RUN addgroup --system datahub && adduser --system datahub --ingroup datahub USER datahub - -ENTRYPOINT [ "datahub" ] +ENV PATH="/datahub-ingestion/.local/bin:$PATH" diff --git a/docker/datahub-ingestion/build.gradle b/docker/datahub-ingestion/build.gradle index 7a24d87794c0e..22531c0c4fd0e 100644 --- a/docker/datahub-ingestion/build.gradle +++ b/docker/datahub-ingestion/build.gradle @@ -11,24 +11,30 @@ ext { docker_dir = 'datahub-ingestion' } +dependencies { + project(':docker:datahub-ingestion-base') + project(':metadata-ingestion') +} + docker { - name "${docker_registry}/${docker_repo}:v${version}" - version "v${version}" + name "${docker_registry}/${docker_repo}:v${version}-slim" + version "v${version}-slim" dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { include "docker/${docker_dir}/*" include "metadata-ingestion/**" - include "metadata-events/**" - include "metadata-models/**" - include "li-utils/**" - include "docs/**" - include "gradle/**" - include "buildSrc/**" - include "*" + }.exclude { + i -> i.file.isHidden() || + i.file == buildDir || + i.file == project(':metadata-ingestion').buildDir } - buildArgs([DOCKER_VERSION: version]) + buildArgs([DOCKER_VERSION: version, + RELEASE_VERSION: version.replace('-SNAPSHOT', '').replace('v', '').replace('-slim', ''), + APP_ENV: 'slim']) } -tasks.getByPath('docker').dependsOn(['build', ':docker:datahub-ingestion-base:docker']) +tasks.getByName('docker').dependsOn(['build', + ':docker:datahub-ingestion-base:docker', + ':metadata-ingestion:codegen']) task mkdirBuildDocker { doFirst { @@ -39,7 +45,7 @@ dockerClean.finalizedBy(mkdirBuildDocker) task cleanLocalDockerImages { doLast { - 
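With the `APP_ENV`/`BASE_IMAGE`/`RELEASE_VERSION` build args now wired through Gradle, the slim image and a quickstart that uses it can be produced from the repository root. Task paths below are inferred from the build files in this diff and may differ slightly in your checkout:

```shell
# Build the slim ingestion image (tagged v<version>-slim per the build.gradle above).
./gradlew :docker:datahub-ingestion:docker

# Bring up a quickstart that uses it as the actions image (see the quickstartSlim task above).
./gradlew :docker:quickstartSlim
```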
rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}".toString()) + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } dockerClean.finalizedBy(cleanLocalDockerImages) \ No newline at end of file diff --git a/docker/docker-compose-with-cassandra.yml b/docker/docker-compose-with-cassandra.yml index 5ea364dd31ca7..08f8cc1ec9c45 100644 --- a/docker/docker-compose-with-cassandra.yml +++ b/docker/docker-compose-with-cassandra.yml @@ -26,6 +26,9 @@ services: hostname: actions image: ${DATAHUB_ACTIONS_IMAGE:-acryldata/datahub-actions}:${ACTIONS_VERSION:-head} env_file: datahub-actions/env/docker.env + environment: + - ACTIONS_EXTRA_PACKAGES=${ACTIONS_EXTRA_PACKAGES:-} + - ACTIONS_CONFIG=${ACTIONS_CONFIG:-} depends_on: datahub-gms: condition: service_healthy diff --git a/docker/docker-compose-without-neo4j.yml b/docker/docker-compose-without-neo4j.yml index 10b3f3c0eca5e..a755eda21cbf5 100644 --- a/docker/docker-compose-without-neo4j.yml +++ b/docker/docker-compose-without-neo4j.yml @@ -27,6 +27,9 @@ services: hostname: actions image: ${DATAHUB_ACTIONS_IMAGE:-acryldata/datahub-actions}:${ACTIONS_VERSION:-head} env_file: datahub-actions/env/docker.env + environment: + - ACTIONS_EXTRA_PACKAGES=${ACTIONS_EXTRA_PACKAGES:-} + - ACTIONS_CONFIG=${ACTIONS_CONFIG:-} depends_on: datahub-gms: condition: service_healthy diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 9228c11446ddf..d07ea5fa88f8b 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -26,6 +26,9 @@ services: hostname: actions image: ${DATAHUB_ACTIONS_IMAGE:-acryldata/datahub-actions}:${ACTIONS_VERSION:-head} env_file: datahub-actions/env/docker.env + environment: + - ACTIONS_EXTRA_PACKAGES=${ACTIONS_EXTRA_PACKAGES:-} + - ACTIONS_CONFIG=${ACTIONS_CONFIG:-} depends_on: datahub-gms: condition: service_healthy diff --git a/docker/elasticsearch-setup/build.gradle b/docker/elasticsearch-setup/build.gradle index cc2fe1ec5c4db..ffee3b9c65cf4 100644 --- a/docker/elasticsearch-setup/build.gradle +++ b/docker/elasticsearch-setup/build.gradle @@ -17,6 +17,8 @@ docker { files fileTree(rootProject.projectDir) { include "docker/${docker_dir}/*" include "metadata-service/restli-servlet-impl/src/main/resources/index/**" + }.exclude { + i -> i.file.isHidden() || i.file == buildDir } tag("Debug", "${docker_registry}/${docker_repo}:debug") @@ -25,7 +27,7 @@ docker { load(true) push(false) } -tasks.getByPath('docker').dependsOn('build') +tasks.getByName('docker').dependsOn('build') task mkdirBuildDocker { doFirst { @@ -36,7 +38,7 @@ dockerClean.finalizedBy(mkdirBuildDocker) task cleanLocalDockerImages { doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}".toString()) + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } dockerClean.finalizedBy(cleanLocalDockerImages) \ No newline at end of file diff --git a/docker/kafka-setup/Dockerfile b/docker/kafka-setup/Dockerfile index 8cf9d0869dc9b..a9c75521fead1 100644 --- a/docker/kafka-setup/Dockerfile +++ b/docker/kafka-setup/Dockerfile @@ -1,5 +1,7 @@ +ARG KAFKA_DOCKER_VERSION=7.4.1 + # Using as a base image because to get the needed jars for confluent utils -FROM confluentinc/cp-base-new@sha256:ac4e0f9bcaecdab728740529f37452231fa40760fcf561759fc3b219f46d2cc9 as confluent_base +FROM confluentinc/cp-base-new:$KAFKA_DOCKER_VERSION as confluent_base ARG MAVEN_REPO="https://repo1.maven.org/maven2" ARG SNAKEYAML_VERSION="2.0" @@ -13,15 +15,6 @@ 
FROM python:3-alpine ENV KAFKA_VERSION 3.4.1 ENV SCALA_VERSION 2.13 -# Set the classpath for JARs required by `cub` -ENV CUB_CLASSPATH='"/usr/share/java/cp-base-new/*"' - -# Confluent Docker Utils Version (Namely the tag or branch to grab from git to install) -ARG PYTHON_CONFLUENT_DOCKER_UTILS_VERSION="v0.0.60" - -# This can be overriden for an offline/air-gapped builds -ARG PYTHON_CONFLUENT_DOCKER_UTILS_INSTALL_SPEC="git+https://github.com/confluentinc/confluent-docker-utils@${PYTHON_CONFLUENT_DOCKER_UTILS_VERSION}" - LABEL name="kafka" version=${KAFKA_VERSION} RUN apk add --no-cache bash coreutils @@ -35,11 +28,6 @@ RUN mkdir -p /opt \ && mv /opt/kafka_${SCALA_VERSION}-${KAFKA_VERSION} /opt/kafka \ && adduser -DH -s /sbin/nologin kafka \ && chown -R kafka: /opt/kafka \ - && echo "===> Installing python packages ..." \ - && pip install --no-cache-dir --upgrade pip wheel setuptools \ - && pip install jinja2 requests \ - && pip install "Cython<3.0" "PyYAML<6" --no-build-isolation \ - && pip install --prefer-binary --prefix=/usr/local --upgrade "${PYTHON_CONFLUENT_DOCKER_UTILS_INSTALL_SPEC}" \ && rm -rf /tmp/* \ && apk del --purge .build-deps @@ -69,7 +57,8 @@ ENV USE_CONFLUENT_SCHEMA_REGISTRY="TRUE" COPY docker/kafka-setup/kafka-setup.sh ./kafka-setup.sh COPY docker/kafka-setup/kafka-config.sh ./kafka-config.sh COPY docker/kafka-setup/kafka-topic-workers.sh ./kafka-topic-workers.sh +COPY docker/kafka-setup/kafka-ready.sh ./kafka-ready.sh -RUN chmod +x ./kafka-setup.sh && chmod +x ./kafka-topic-workers.sh +RUN chmod +x ./kafka-setup.sh ./kafka-topic-workers.sh ./kafka-ready.sh CMD ./kafka-setup.sh diff --git a/docker/kafka-setup/build.gradle b/docker/kafka-setup/build.gradle index a5d33457e45f7..573ef21c88bf9 100644 --- a/docker/kafka-setup/build.gradle +++ b/docker/kafka-setup/build.gradle @@ -16,6 +16,8 @@ docker { dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { include "docker/${docker_dir}/*" + }.exclude { + i -> i.file.isHidden() || i.file == buildDir } tag("Debug", "${docker_registry}/${docker_repo}:debug") @@ -24,7 +26,7 @@ docker { load(true) push(false) } -tasks.getByPath('docker').dependsOn('build') +tasks.getByName('docker').dependsOn('build') task mkdirBuildDocker { doFirst { @@ -35,7 +37,7 @@ dockerClean.finalizedBy(mkdirBuildDocker) task cleanLocalDockerImages { doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}".toString()) + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } dockerClean.finalizedBy(cleanLocalDockerImages) diff --git a/docker/kafka-setup/kafka-ready.sh b/docker/kafka-setup/kafka-ready.sh new file mode 100755 index 0000000000000..ba87bde047ef5 --- /dev/null +++ b/docker/kafka-setup/kafka-ready.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +for i in {1..60} +do + kafka-broker-api-versions.sh --command-config $CONNECTION_PROPERTIES_PATH --bootstrap-server $KAFKA_BOOTSTRAP_SERVER + if [ $? -eq 0 ]; then + break + fi + if [ $i -eq 60 ]; then + echo "Kafka bootstrap server $KAFKA_BOOTSTRAP_SERVER not ready." 
+ exit 1 + fi + sleep 5s +done diff --git a/docker/kafka-setup/kafka-setup.sh b/docker/kafka-setup/kafka-setup.sh old mode 100644 new mode 100755 index 7b015421b7963..629e9bc9484ee --- a/docker/kafka-setup/kafka-setup.sh +++ b/docker/kafka-setup/kafka-setup.sh @@ -49,8 +49,8 @@ if [[ -n "$KAFKA_PROPERTIES_SASL_CLIENT_CALLBACK_HANDLER_CLASS" ]]; then echo "sasl.client.callback.handler.class=$KAFKA_PROPERTIES_SASL_CLIENT_CALLBACK_HANDLER_CLASS" >> $CONNECTION_PROPERTIES_PATH fi -cub kafka-ready -c $CONNECTION_PROPERTIES_PATH -b $KAFKA_BOOTSTRAP_SERVER 1 180 - +# cub kafka-ready -c $CONNECTION_PROPERTIES_PATH -b $KAFKA_BOOTSTRAP_SERVER 1 180 +. kafka-ready.sh ############################################################ # Start Topic Creation Logic diff --git a/docker/mysql-setup/build.gradle b/docker/mysql-setup/build.gradle index 48a28f15a581d..0d8941cce4833 100644 --- a/docker/mysql-setup/build.gradle +++ b/docker/mysql-setup/build.gradle @@ -17,6 +17,8 @@ docker { dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { include "docker/${docker_dir}/*" + }.exclude { + i -> i.file.isHidden() || i.file == buildDir } tag("Debug", "${docker_registry}/${docker_repo}:debug") @@ -25,7 +27,7 @@ docker { load(true) push(false) } -tasks.getByPath('docker').dependsOn('build') +tasks.getByName('docker').dependsOn('build') task mkdirBuildDocker { doFirst { @@ -36,7 +38,7 @@ dockerClean.finalizedBy(mkdirBuildDocker) task cleanLocalDockerImages { doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}") + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } dockerClean.finalizedBy(cleanLocalDockerImages) diff --git a/docker/postgres-setup/build.gradle b/docker/postgres-setup/build.gradle index a5b0413ec4be8..8a026be09d2b4 100644 --- a/docker/postgres-setup/build.gradle +++ b/docker/postgres-setup/build.gradle @@ -17,6 +17,8 @@ docker { dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { include "docker/${docker_dir}/*" + }.exclude { + i -> i.file.isHidden() || i.file == buildDir } tag("Debug", "${docker_registry}/${docker_repo}:debug") @@ -25,7 +27,7 @@ docker { load(true) push(false) } -tasks.getByPath('docker').dependsOn('build') +tasks.getByName('docker').dependsOn('build') task mkdirBuildDocker { doFirst { @@ -36,7 +38,7 @@ dockerClean.finalizedBy(mkdirBuildDocker) task cleanLocalDockerImages { doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}") + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } dockerClean.finalizedBy(cleanLocalDockerImages) diff --git a/docker/quickstart/docker-compose-m1.quickstart.yml b/docker/quickstart/docker-compose-m1.quickstart.yml index 5a8edd6eacf19..38418bc8c41b9 100644 --- a/docker/quickstart/docker-compose-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-m1.quickstart.yml @@ -34,6 +34,8 @@ services: datahub-gms: condition: service_healthy environment: + - ACTIONS_CONFIG=${ACTIONS_CONFIG:-} + - ACTIONS_EXTRA_PACKAGES=${ACTIONS_EXTRA_PACKAGES:-} - DATAHUB_GMS_HOST=datahub-gms - DATAHUB_GMS_PORT=8080 - DATAHUB_GMS_PROTOCOL=http diff --git a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml index 6d51f2efcfcf2..cf879faa6a3f0 100644 --- a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml +++ 
b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml @@ -34,6 +34,8 @@ services: datahub-gms: condition: service_healthy environment: + - ACTIONS_CONFIG=${ACTIONS_CONFIG:-} + - ACTIONS_EXTRA_PACKAGES=${ACTIONS_EXTRA_PACKAGES:-} - DATAHUB_GMS_HOST=datahub-gms - DATAHUB_GMS_PORT=8080 - DATAHUB_GMS_PROTOCOL=http diff --git a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml index 48f2d797bd8a4..007830078d2b4 100644 --- a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml @@ -34,6 +34,8 @@ services: datahub-gms: condition: service_healthy environment: + - ACTIONS_CONFIG=${ACTIONS_CONFIG:-} + - ACTIONS_EXTRA_PACKAGES=${ACTIONS_EXTRA_PACKAGES:-} - DATAHUB_GMS_HOST=datahub-gms - DATAHUB_GMS_PORT=8080 - DATAHUB_GMS_PROTOCOL=http diff --git a/docker/quickstart/docker-compose.quickstart.yml b/docker/quickstart/docker-compose.quickstart.yml index bd30c359a2a76..390543b92123f 100644 --- a/docker/quickstart/docker-compose.quickstart.yml +++ b/docker/quickstart/docker-compose.quickstart.yml @@ -34,6 +34,8 @@ services: datahub-gms: condition: service_healthy environment: + - ACTIONS_CONFIG=${ACTIONS_CONFIG:-} + - ACTIONS_EXTRA_PACKAGES=${ACTIONS_EXTRA_PACKAGES:-} - DATAHUB_GMS_HOST=datahub-gms - DATAHUB_GMS_PORT=8080 - DATAHUB_GMS_PROTOCOL=http diff --git a/docs-website/build.gradle b/docs-website/build.gradle index 12f37033efc2f..851c10d9ea97f 100644 --- a/docs-website/build.gradle +++ b/docs-website/build.gradle @@ -77,7 +77,12 @@ task yarnGenerate(type: YarnTask, dependsOn: [yarnInstall, args = ['run', 'generate'] } -task yarnStart(type: YarnTask, dependsOn: [yarnInstall, yarnGenerate]) { +task downloadHistoricalVersions(type: Exec) { + workingDir '.' 
+ commandLine 'python3', 'download_historical_versions.py' +} + +task yarnStart(type: YarnTask, dependsOn: [yarnInstall, yarnGenerate, downloadHistoricalVersions]) { args = ['run', 'start'] } task fastReload(type: YarnTask) { @@ -105,7 +110,7 @@ task serve(type: YarnTask, dependsOn: [yarnInstall] ) { } -task yarnBuild(type: YarnTask, dependsOn: [yarnLint, yarnGenerate]) { +task yarnBuild(type: YarnTask, dependsOn: [yarnLint, yarnGenerate, downloadHistoricalVersions]) { inputs.files(projectMdFiles) inputs.file("package.json").withPathSensitivity(PathSensitivity.RELATIVE) inputs.dir("src").withPathSensitivity(PathSensitivity.RELATIVE) diff --git a/docs-website/docusaurus.config.js b/docs-website/docusaurus.config.js index c10c178424b53..9bdba5f317542 100644 --- a/docs-website/docusaurus.config.js +++ b/docs-website/docusaurus.config.js @@ -69,6 +69,11 @@ module.exports = { label: "Roadmap", position: "right", }, + { + type: 'docsVersionDropdown', + position: 'right', + dropdownActiveClassDisabled: true, + }, { href: "https://slack.datahubproject.io", "aria-label": "Slack", @@ -173,8 +178,8 @@ module.exports = { appId: "RK0UG797F3", apiKey: "39d7eb90d8b31d464e309375a52d674f", indexName: "datahubproject", - // contextualSearch: true, - // searchParameters: {}, + insights: true, + contextualSearch: true, // debug: true, }, }, diff --git a/docs-website/download_historical_versions.py b/docs-website/download_historical_versions.py new file mode 100644 index 0000000000000..83157edc1972c --- /dev/null +++ b/docs-website/download_historical_versions.py @@ -0,0 +1,78 @@ +import json +import os +import tarfile +import urllib.request + +repo_url = "https://api.github.com/repos/datahub-project/static-assets" + + +def download_file(url, destination): + with urllib.request.urlopen(url) as response: + with open(destination, "wb") as f: + while True: + chunk = response.read(8192) + if not chunk: + break + f.write(chunk) + + +def fetch_urls(repo_url: str, folder_path: str, file_format: str): + api_url = f"{repo_url}/contents/{folder_path}" + response = urllib.request.urlopen(api_url) + data = response.read().decode("utf-8") + urls = [ + file["download_url"] + for file in json.loads(data) + if file["name"].endswith(file_format) + ] + print(urls) + return urls + + +def extract_tar_file(destination_path): + with tarfile.open(destination_path, "r:gz") as tar: + tar.extractall() + os.remove(destination_path) + + +def download_versioned_docs(folder_path: str, destination_dir: str, file_format: str): + if not os.path.exists(destination_dir): + os.makedirs(destination_dir) + + urls = fetch_urls(repo_url, folder_path, file_format) + + for url in urls: + filename = os.path.basename(url) + destination_path = os.path.join(destination_dir, filename) + + version = ".".join(filename.split(".")[:3]) + extracted_path = os.path.join(destination_dir, version) + print("extracted_path", extracted_path) + if os.path.exists(extracted_path): + print(f"{extracted_path} already exists, skipping downloads") + continue + try: + download_file(url, destination_path) + print(f"Downloaded {filename} to {destination_dir}") + if file_format == ".tar.gz": + extract_tar_file(destination_path) + except urllib.error.URLError as e: + print(f"Error while downloading {filename}: {e}") + continue + + +def main(): + download_versioned_docs( + folder_path="versioned_docs", + destination_dir="versioned_docs", + file_format=".tar.gz", + ) + download_versioned_docs( + folder_path="versioned_sidebars", + destination_dir="versioned_sidebars", + 
file_format=".json", + ) + + +if __name__ == "__main__": + main() diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index 51a57fc41dd36..fcf82b786a1b9 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -138,7 +138,15 @@ module.exports = { ], }, { - Deployment: [ + type: "category", + label: "Deployment", + link: { + type: "generated-index", + title: "Deployment Guides", + description: + "Learn how to deploy DataHub to your environment, set up authentication, manage upgrades, and more.", + }, + items: [ // The purpose of this section is to provide the minimum steps required to deploy DataHub to the vendor of your choosing "docs/deploy/aws", "docs/deploy/gcp", @@ -160,6 +168,7 @@ module.exports = { "docs/authentication/guides/sso/configure-oidc-react-google", "docs/authentication/guides/sso/configure-oidc-react-okta", "docs/authentication/guides/sso/configure-oidc-react-azure", + "docs/authentication/guides/sso/configure-oidc-behind-proxy", ], }, ], @@ -395,7 +404,14 @@ module.exports = { ], }, { - Features: [ + type: "category", + label: "Features", + link: { + type: "generated-index", + title: "Feature Guides", + description: "Learn about the features of DataHub.", + }, + items: [ "docs/ui-ingestion", "docs/how/search", "docs/schema-history", @@ -418,7 +434,10 @@ module.exports = { }, "docs/act-on-metadata/impact-analysis", { - Observability: ["docs/managed-datahub/observe/freshness-assertions"], + Observability: [ + "docs/managed-datahub/observe/freshness-assertions", + "docs/managed-datahub/observe/volume-assertions", + ], }, ], }, diff --git a/docs-website/src/pages/docs/_components/SearchBar/index.jsx b/docs-website/src/pages/docs/_components/SearchBar/index.jsx index 37f8a5c252aee..054c041d8a9e5 100644 --- a/docs-website/src/pages/docs/_components/SearchBar/index.jsx +++ b/docs-website/src/pages/docs/_components/SearchBar/index.jsx @@ -303,11 +303,16 @@ function SearchBar() { strokeLinejoin="round" > - - {docsSearchVersionsHelpers.versioningEnabled && } - -
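The `downloadHistoricalVersions` Gradle task above simply shells out to this script, so the archived documentation versions can also be fetched by hand before a local docs build. A sketch, assuming Python 3 on the path and a run from the repository root:

```shell
# Populates docs-website/versioned_docs/ and versioned_sidebars/ from the static-assets repo,
# then starts the docs site; yarnStart and yarnBuild now depend on downloadHistoricalVersions.
cd docs-website
python3 download_historical_versions.py
cd ..
./gradlew :docs-website:yarnStart
```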
{!!searchResultState.totalResults && documentsFoundPlural(searchResultState.totalResults)}
+ {docsSearchVersionsHelpers.versioningEnabled && ( + + )} +
+ {!!searchResultState.totalResults && + documentsFoundPlural(searchResultState.totalResults)} +
{searchResultState.items.length > 0 ? (
@@ -369,4 +374,4 @@ function SearchBar() { ); } -export default SearchBar; +export default SearchBar; \ No newline at end of file diff --git a/docs-website/src/pages/docs/_components/SearchBar/search.module.scss b/docs-website/src/pages/docs/_components/SearchBar/search.module.scss index 17e5f22490664..30a2973384ba6 100644 --- a/docs-website/src/pages/docs/_components/SearchBar/search.module.scss +++ b/docs-website/src/pages/docs/_components/SearchBar/search.module.scss @@ -21,13 +21,21 @@ height: 1.5rem; } +.searchQueryInput { + padding: 0.8rem 0.8rem 0.8rem 3rem; +} + +.searchVersionInput { + padding: 0.8rem 2rem 0.8rem 2rem; + text-align: center; +} + .searchQueryInput, .searchVersionInput { border-radius: 1000em; border-style: solid; border-color: transparent; font: var(--ifm-font-size-base) var(--ifm-font-family-base); - padding: 0.8rem 0.8rem 0.8rem 3rem; width: 100%; background: var(--docsearch-searchbox-background); color: var(--docsearch-text-color); @@ -93,6 +101,7 @@ @media only screen and (max-width: 996px) { .searchVersionColumn { max-width: 40% !important; + margin: auto; } .searchResultsColumn { @@ -113,9 +122,15 @@ .searchVersionColumn { max-width: 100% !important; padding-left: var(--ifm-spacing-horizontal) !important; + margin: auto; } } +.searchVersionColumn { + margin: auto; +} + + .loadingSpinner { width: 3rem; height: 3rem; diff --git a/docs-website/versions.json b/docs-website/versions.json new file mode 100644 index 0000000000000..0b79ac9498e06 --- /dev/null +++ b/docs-website/versions.json @@ -0,0 +1,3 @@ +[ + "0.10.5" +] diff --git a/docs/actions/concepts.md b/docs/actions/concepts.md index 381f2551d2237..5b05a0c586a5d 100644 --- a/docs/actions/concepts.md +++ b/docs/actions/concepts.md @@ -40,7 +40,11 @@ The Actions Framework consists of a few core concepts-- Each of these will be described in detail below. -![](imgs/actions.png) + +

+ +

+ **In the Actions Framework, Events flow continuously from left-to-right.** ### Pipelines diff --git a/docs/advanced/no-code-modeling.md b/docs/advanced/no-code-modeling.md index e1fadee6d371a..d76b776d3dddb 100644 --- a/docs/advanced/no-code-modeling.md +++ b/docs/advanced/no-code-modeling.md @@ -159,11 +159,19 @@ along with simplifying the number of raw data models that need defined, includin From an architectural PoV, we will move from a before that looks something like this: -![no-code-before](../imgs/no-code-before.png) + +

+ +

+ to an after that looks like this -![no-code-after](../imgs/no-code-after.png) + +

+ +

+ That is, a move away from patterns of strong-typing-everywhere to a more generic + flexible world. @@ -211,7 +219,7 @@ record ServiceKey { * Name of the service */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true } name: string diff --git a/docs/api/graphql/how-to-set-up-graphql.md b/docs/api/graphql/how-to-set-up-graphql.md index 562e8edb9f5d9..584bf34ad3f92 100644 --- a/docs/api/graphql/how-to-set-up-graphql.md +++ b/docs/api/graphql/how-to-set-up-graphql.md @@ -62,7 +62,11 @@ Postman is a popular API client that provides a graphical user interface for sen Within Postman, you can create a `POST` request and set the request URL to the `/api/graphql` endpoint. In the request body, select the `GraphQL` option and enter your GraphQL query in the request body. -![postman-graphql](../../imgs/apis/postman-graphql.png) + +

+ +

+ Please refer to [Querying with GraphQL](https://learning.postman.com/docs/sending-requests/graphql/graphql/) in the Postman documentation for more information. diff --git a/docs/api/tutorials/custom-properties.md b/docs/api/tutorials/custom-properties.md index dbc07bfaa712e..fe0d7e62dcde8 100644 --- a/docs/api/tutorials/custom-properties.md +++ b/docs/api/tutorials/custom-properties.md @@ -34,7 +34,11 @@ In this example, we will add some custom properties `cluster_name` and `retentio After you have ingested sample data, the dataset `fct_users_deleted` should have a custom properties section with `encoding` set to `utf-8`. -![dataset-properties-before](../../imgs/apis/tutorials/dataset-properties-before.png) + +

+ +

+ ```shell datahub get --urn "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD)" --aspect datasetProperties @@ -80,7 +84,11 @@ The following code adds custom properties `cluster_name` and `retention_time` to You can now see the two new properties are added to `fct_users_deleted` and the previous property `encoding` is unchanged. -![dataset-properties-added](../../imgs/apis/tutorials/dataset-properties-added.png) + +

+ +

+ We can also verify this operation by programmatically checking the `datasetProperties` aspect after running this code using the `datahub` cli. @@ -130,7 +138,11 @@ The following code shows you how can add and remove custom properties in the sam You can now see the `cluster_name` property is added to `fct_users_deleted` and the `retention_time` property is removed. -![dataset-properties-added-removed](../../imgs/apis/tutorials/dataset-properties-added-removed.png) + +

+ +

+ We can also verify this operation programmatically by checking the `datasetProperties` aspect using the `datahub` cli. @@ -179,7 +191,11 @@ The following code replaces the current custom properties with a new properties You can now see the `cluster_name` and `retention_time` properties are added to `fct_users_deleted` but the previous `encoding` property is no longer present. -![dataset-properties-replaced](../../imgs/apis/tutorials/dataset-properties-replaced.png) + +

+ +

+ We can also verify this operation programmatically by checking the `datasetProperties` aspect using the `datahub` cli. diff --git a/docs/api/tutorials/datasets.md b/docs/api/tutorials/datasets.md index 62b30e97c8020..7c6d4a88d4190 100644 --- a/docs/api/tutorials/datasets.md +++ b/docs/api/tutorials/datasets.md @@ -42,7 +42,11 @@ For detailed steps, please refer to [Datahub Quickstart Guide](/docs/quickstart. You can now see `realestate_db.sales` dataset has been created. -![dataset-created](../../imgs/apis/tutorials/dataset-created.png) + +

+ +

+ ## Delete Dataset @@ -110,4 +114,8 @@ Expected Response: The dataset `fct_users_deleted` has now been deleted, so if you search for a hive dataset named `fct_users_delete`, you will no longer be able to see it. -![dataset-deleted](../../imgs/apis/tutorials/dataset-deleted.png) + +

+ +

+ diff --git a/docs/api/tutorials/deprecation.md b/docs/api/tutorials/deprecation.md index 6a8f7c8a1d2be..73e73f5224cbc 100644 --- a/docs/api/tutorials/deprecation.md +++ b/docs/api/tutorials/deprecation.md @@ -155,4 +155,8 @@ Expected Response: You can now see the dataset `fct_users_created` has been marked as `Deprecated.` -![tag-removed](../../imgs/apis/tutorials/deprecation-updated.png) + +

+ +

+ diff --git a/docs/api/tutorials/descriptions.md b/docs/api/tutorials/descriptions.md index 46f42b7a05be6..27c57309ba76a 100644 --- a/docs/api/tutorials/descriptions.md +++ b/docs/api/tutorials/descriptions.md @@ -275,7 +275,11 @@ Expected Response: You can now see the description is added to `fct_users_deleted`. -![dataset-description-added](../../imgs/apis/tutorials/dataset-description-added.png) + +

+ +

+ ## Add Description on Column @@ -357,4 +361,8 @@ Expected Response: You can now see column description is added to `user_name` column of `fct_users_deleted`. -![column-description-added](../../imgs/apis/tutorials/column-description-added.png) + +

+ +

+ diff --git a/docs/api/tutorials/domains.md b/docs/api/tutorials/domains.md index c8c47f85c570f..617864d233b7a 100644 --- a/docs/api/tutorials/domains.md +++ b/docs/api/tutorials/domains.md @@ -74,7 +74,11 @@ Expected Response: You can now see `Marketing` domain has been created under `Govern > Domains`. -![domain-created](../../imgs/apis/tutorials/domain-created.png) + +

+ +

+ ## Read Domains @@ -209,7 +213,11 @@ Expected Response: You can now see `Marketing` domain has been added to the dataset. -![domain-added](../../imgs/apis/tutorials/domain-added.png) + +

+ +

+ ## Remove Domains @@ -259,4 +267,8 @@ curl --location --request POST 'http://localhost:8080/api/graphql' \ You can now see a domain `Marketing` has been removed from the `fct_users_created` dataset. -![domain-removed](../../imgs/apis/tutorials/domain-removed.png) + +

+ +

+ diff --git a/docs/api/tutorials/lineage.md b/docs/api/tutorials/lineage.md index e37986af7bbbd..dc43cb178f949 100644 --- a/docs/api/tutorials/lineage.md +++ b/docs/api/tutorials/lineage.md @@ -112,7 +112,11 @@ Expected Response: You can now see the lineage between `fct_users_deleted` and `logging_events`. -![lineage-added](../../imgs/apis/tutorials/lineage-added.png) + +

+ +

+ ## Add Column-level Lineage @@ -130,15 +134,19 @@ You can now see the lineage between `fct_users_deleted` and `logging_events`. You can now see the column-level lineage between datasets. Note that you have to enable `Show Columns` to be able to see the column-level lineage. -![column-level-lineage-added](../../imgs/apis/tutorials/column-level-lineage-added.png) + +

+ +

+ ## Read Lineage -```json -mutation searchAcrossLineage { +```graphql +query searchAcrossLineage { searchAcrossLineage( input: { query: "*" diff --git a/docs/api/tutorials/ml.md b/docs/api/tutorials/ml.md index b16f2669b30c7..cb77556d48ebf 100644 --- a/docs/api/tutorials/ml.md +++ b/docs/api/tutorials/ml.md @@ -94,9 +94,17 @@ Please note that an MlModelGroup serves as a container for all the runs of a sin You can search the entities in DataHub UI. -![feature-table-created](../../imgs/apis/tutorials/feature-table-created.png) -![model-group-created](../../imgs/apis/tutorials/model-group-created.png) +

+ +

+ + + +

+ +

+ ## Read ML Entities @@ -499,6 +507,14 @@ Expected Response: (Note that this entity does not exist in the sample ingestion You can access to `Features` or `Group` Tab of each entity to view the added entities. -![feature-added-to-model](../../imgs/apis/tutorials/feature-added-to-model.png) -![model-group-added-to-model](../../imgs/apis/tutorials/model-group-added-to-model.png) +

+ +

+ + + +

+ +

+ diff --git a/docs/api/tutorials/owners.md b/docs/api/tutorials/owners.md index 3c7a46b136d76..5bc3b95cb5631 100644 --- a/docs/api/tutorials/owners.md +++ b/docs/api/tutorials/owners.md @@ -77,7 +77,11 @@ Update succeeded for urn urn:li:corpuser:datahub. ### Expected Outcomes of Upserting User You can see the user `The bar` has been created and the user `Datahub` has been updated under `Settings > Access > Users & Groups` -![user-upserted](../../imgs/apis/tutorials/user-upserted.png) + +

+ +

+ ## Upsert Group @@ -125,7 +129,11 @@ Update succeeded for group urn:li:corpGroup:foogroup@acryl.io. ### Expected Outcomes of Upserting Group You can see the group `Foo Group` has been created under `Settings > Access > Users & Groups` -![group-upserted](../../imgs/apis/tutorials/group-upserted.png) + +

+ +

+ ## Read Owners @@ -272,7 +280,11 @@ curl --location --request POST 'http://localhost:8080/api/graphql' \ You can now see `bfoo` has been added as an owner to the `fct_users_created` dataset. -![ownership-added](../../imgs/apis/tutorials/owner-added.png) + +

+ +

+ ## Remove Owners @@ -340,4 +352,8 @@ curl --location --request POST 'http://localhost:8080/api/graphql' \ You can now see `John Doe` has been removed as an owner from the `fct_users_created` dataset. -![ownership-removed](../../imgs/apis/tutorials/owner-removed.png) + +

+ +

+ diff --git a/docs/api/tutorials/tags.md b/docs/api/tutorials/tags.md index 2f80a833136c1..b2234bf00bcb9 100644 --- a/docs/api/tutorials/tags.md +++ b/docs/api/tutorials/tags.md @@ -91,7 +91,11 @@ Expected Response: You can now see the new tag `Deprecated` has been created. -![tag-created](../../imgs/apis/tutorials/tag-created.png) + +

+ +

+ We can also verify this operation by programmatically searching `Deprecated` tag after running this code using the `datahub` cli. @@ -307,7 +311,11 @@ Expected Response: You can now see `Deprecated` tag has been added to `user_name` column. -![tag-added](../../imgs/apis/tutorials/tag-added.png) + +

+ +

+ We can also verify this operation programmatically by checking the `globalTags` aspect using the `datahub` cli. @@ -359,7 +367,11 @@ curl --location --request POST 'http://localhost:8080/api/graphql' \ You can now see `Deprecated` tag has been removed from `user_name` column. -![tag-removed](../../imgs/apis/tutorials/tag-removed.png) + +

+ +

+ We can also verify this operation programmatically by checking the `globalTags` aspect using the `datahub` cli. diff --git a/docs/api/tutorials/terms.md b/docs/api/tutorials/terms.md index 207e14ea4afe8..99acf77d26ab0 100644 --- a/docs/api/tutorials/terms.md +++ b/docs/api/tutorials/terms.md @@ -95,7 +95,11 @@ Expected Response: You can now see the new term `Rate of Return` has been created. -![term-created](../../imgs/apis/tutorials/term-created.png) + +

+ +

+ We can also verify this operation by programmatically searching `Rate of Return` term after running this code using the `datahub` cli. @@ -289,7 +293,11 @@ Expected Response: You can now see `Rate of Return` term has been added to `user_name` column. -![term-added](../../imgs/apis/tutorials/term-added.png) + +

+ +

+ ## Remove Terms @@ -361,4 +369,8 @@ curl --location --request POST 'http://localhost:8080/api/graphql' \ You can now see `Rate of Return` term has been removed from `user_name` column. -![term-removed](../../imgs/apis/tutorials/term-removed.png) + +

+ +

+ diff --git a/docs/architecture/architecture.md b/docs/architecture/architecture.md index 6b76b995cc427..6a9c1860d71b0 100644 --- a/docs/architecture/architecture.md +++ b/docs/architecture/architecture.md @@ -10,8 +10,16 @@ disparate tools & systems. The figures below describe the high-level architecture of DataHub. -![datahub-architecture](../imgs/datahub-architecture.png) -![Acryl DataHub System Architecture ](../managed-datahub/imgs/saas/DataHub-Architecture.png) + +

+ +

+ + +

+ +

+ For a more detailed look at the components that make up the Architecture, check out [Components](../components.md). diff --git a/docs/architecture/metadata-ingestion.md b/docs/architecture/metadata-ingestion.md index 2b60383319c68..abf8fc24d1385 100644 --- a/docs/architecture/metadata-ingestion.md +++ b/docs/architecture/metadata-ingestion.md @@ -6,7 +6,11 @@ title: "Ingestion Framework" DataHub supports an extremely flexible ingestion architecture that can support push, pull, asynchronous and synchronous models. The figure below describes all the options possible for connecting your favorite system to DataHub. -![Ingestion Architecture](../imgs/ingestion-architecture.png) + +

+ +

+ ## Metadata Change Proposal: The Center Piece diff --git a/docs/architecture/metadata-serving.md b/docs/architecture/metadata-serving.md index ada41179af4e0..57194f49d5ea4 100644 --- a/docs/architecture/metadata-serving.md +++ b/docs/architecture/metadata-serving.md @@ -6,7 +6,11 @@ title: "Serving Tier" The figure below shows the high-level system diagram for DataHub's Serving Tier. -![datahub-serving](../imgs/datahub-serving.png) + +

+ +

+ The primary component is called [the Metadata Service](../../metadata-service) and exposes a REST API and a GraphQL API for performing CRUD operations on metadata. The service also exposes search and graph query APIs to support secondary-index style queries, full-text search queries as well as relationship queries like lineage. In addition, the [datahub-frontend](../../datahub-frontend) service exposes a GraphQL API on top of the metadata graph. diff --git a/docs/authentication/concepts.md b/docs/authentication/concepts.md index 715e94c7e0380..0940f86a805f1 100644 --- a/docs/authentication/concepts.md +++ b/docs/authentication/concepts.md @@ -11,7 +11,11 @@ We introduced a few important concepts to the Metadata Service to make authentic In the following sections, we'll take a closer look at each individually. -![](../imgs/metadata-service-auth.png) + +

+ +

+ *High level overview of Metadata Service Authentication* ## What is an Actor? diff --git a/docs/authentication/guides/sso/configure-oidc-behind-proxy.md b/docs/authentication/guides/sso/configure-oidc-behind-proxy.md new file mode 100644 index 0000000000000..c998816e04735 --- /dev/null +++ b/docs/authentication/guides/sso/configure-oidc-behind-proxy.md @@ -0,0 +1,64 @@ +# Configuring Frontend to use a Proxy when communicating with SSO Provider +*Authored on 22/08/2023* + +The `datahub-frontend-react` server can be configured to use an http proxy when retrieving the openid-configuration. +This can be needed if your infrastructure is locked down and disallows connectivity by default, using proxies for fine-grained egress control. + +## Configure http proxy and non proxy hosts + +To do this, you will need to pass a set of environment variables to the datahub-frontend-react container (e.g. in the `docker-compose.yml` file or your kubernetes manifest). + +``` +HTTP_PROXY_HOST=host of your http proxy +HTTP_PROXY_PORT=port of your http proxy +HTTPS_PROXY_HOST=host of your http(s) proxy used for https connections (often the same as the http proxy) +HTTPS_PROXY_PORT=port of your http(s) proxy used for https connections (often the same as the http proxy) +HTTP_NON_PROXY_HOSTS=localhost|datahub-gms (or any other hosts that you would like to bypass the proxy for, delimited by pipe) +``` + +## Optional: provide custom truststore +If your upstream proxy performs SSL termination to inspect traffic, this will result in different (self-signed) certificates for HTTPS connections. +The default truststore used in the `datahub-frontend-react` docker image will not trust these kinds of connections. +To address this, you can copy or mount your own truststore (provided by the proxy or network administrators) into the docker container. + +Depending on your setup, you have a few options to achieve this: + +### Make truststore available in the frontend + +#### Option a) Build frontend docker image with your own truststore included + +To build a custom image for your frontend, with the certificates built-in, you can use the official frontend image as a base, then copy in your required files. + +Example Dockerfile: + +```dockerfile +FROM linkedin/datahub-frontend-react: +COPY /truststore-directory /certificates +``` + +Building this Dockerfile will result in your own custom docker image on your local machine. +You will then be able to tag it, publish it to your own registry, etc. + +#### Option b) Mount truststore from your host machine using a docker volume + +Adapt your docker-compose.yml to include a new volume mount in the `datahub-frontend-react` container + +```docker + datahub-frontend-react: + # ... + volumes: + # ... + - /truststore-directory:/certificates +``` + +### Reference new truststore + +Add the following environment values to the `datahub-frontend-react` container: + +``` +SSL_TRUSTSTORE_FILE=path/to/truststore.jks (e.g. /certificates) +SSL_TRUSTSTORE_TYPE=jks +SSL_TRUSTSTORE_PASSWORD=MyTruststorePassword +``` + +Once these steps are done, your frontend container will use the new truststore when validating SSL/HTTPS connections. 
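For reviewers who want to see how the pieces of the new proxy guide fit together, here is a minimal sketch of a docker-compose override that combines the proxy and truststore settings introduced above. Only the variable names come from the guide; the service layout, proxy host/port, bypass list, and truststore path/password are illustrative assumptions.

```yaml
# Hypothetical docker-compose override; replace the proxy host/port, bypass list,
# and truststore details with values from your own environment.
services:
  datahub-frontend-react:
    environment:
      - HTTP_PROXY_HOST=proxy.corp.internal        # assumed proxy hostname
      - HTTP_PROXY_PORT=3128                       # assumed proxy port
      - HTTPS_PROXY_HOST=proxy.corp.internal
      - HTTPS_PROXY_PORT=3128
      - HTTP_NON_PROXY_HOSTS=localhost|datahub-gms
      - SSL_TRUSTSTORE_FILE=/certificates/truststore.jks
      - SSL_TRUSTSTORE_TYPE=jks
      - SSL_TRUSTSTORE_PASSWORD=MyTruststorePassword
    volumes:
      - /truststore-directory:/certificates        # mount described in Option b above
```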
diff --git a/docs/authentication/guides/sso/configure-oidc-react-azure.md b/docs/authentication/guides/sso/configure-oidc-react-azure.md index d185957967882..177387327c0e8 100644 --- a/docs/authentication/guides/sso/configure-oidc-react-azure.md +++ b/docs/authentication/guides/sso/configure-oidc-react-azure.md @@ -32,7 +32,11 @@ Azure supports more than one redirect URI, so both can be configured at the same At this point, your app registration should look like the following: -![azure-setup-app-registration](img/azure-setup-app-registration.png) + +

+ +

+ e. Click **Register**. @@ -40,7 +44,11 @@ e. Click **Register**. Once registration is done, you will land on the app registration **Overview** tab. On the left-side navigation bar, click on **Authentication** under **Manage** and add extra redirect URIs if need be (if you want to support both local testing and Azure deployments). -![azure-setup-authentication](img/azure-setup-authentication.png) + +

+ +

+ Click **Save**. @@ -51,7 +59,11 @@ Select **Client secrets**, then **New client secret**. Type in a meaningful des **IMPORTANT:** Copy the `value` of your newly created secret since Azure will never display its value afterwards. -![azure-setup-certificates-secrets](img/azure-setup-certificates-secrets.png) + +

+ +

+ ### 4. Configure API permissions @@ -66,7 +78,11 @@ Click on **Add a permission**, then from the **Microsoft APIs** tab select **Mic At this point, you should be looking at a screen like the following: -![azure-setup-api-permissions](img/azure-setup-api-permissions.png) + +

+ +

+ ### 5. Obtain Application (Client) ID diff --git a/docs/authentication/guides/sso/configure-oidc-react-google.md b/docs/authentication/guides/sso/configure-oidc-react-google.md index 474538097aae2..af62185e6e787 100644 --- a/docs/authentication/guides/sso/configure-oidc-react-google.md +++ b/docs/authentication/guides/sso/configure-oidc-react-google.md @@ -31,7 +31,11 @@ Note that in order to complete this step you should be logged into a Google acco c. Fill out the details in the App Information & Domain sections. Make sure the 'Application Home Page' provided matches where DataHub is deployed at your organization. -![google-setup-1](img/google-setup-1.png) + +

+ +

+ Once you've completed this, **Save & Continue**. @@ -70,7 +74,11 @@ f. You will now receive a pair of values, a client id and a client secret. Bookm At this point, you should be looking at a screen like the following: -![google-setup-2](img/google-setup-2.png) + +

+ +

+ Success! diff --git a/docs/authentication/guides/sso/configure-oidc-react-okta.md b/docs/authentication/guides/sso/configure-oidc-react-okta.md index cfede999f1e70..320b887a28f16 100644 --- a/docs/authentication/guides/sso/configure-oidc-react-okta.md +++ b/docs/authentication/guides/sso/configure-oidc-react-okta.md @@ -69,8 +69,16 @@ for example, `https://dev-33231928.okta.com/.well-known/openid-configuration`. At this point, you should be looking at a screen like the following: -![okta-setup-1](img/okta-setup-1.png) -![okta-setup-2](img/okta-setup-2.png) + +

+ +

+ + +

+ +

+ Success! @@ -96,7 +104,11 @@ Replacing the placeholders above with the client id & client secret received fro > > By default, we assume that the groups will appear in a claim named "groups". This can be customized using the `AUTH_OIDC_GROUPS_CLAIM` container configuration. > -> ![okta-setup-2](img/okta-setup-groups-claim.png) +> +

+ +

+ ### 5. Restart `datahub-frontend-react` docker container diff --git a/docs/authentication/guides/sso/img/azure-setup-api-permissions.png b/docs/authentication/guides/sso/img/azure-setup-api-permissions.png deleted file mode 100755 index 4964b7d48ffec..0000000000000 Binary files a/docs/authentication/guides/sso/img/azure-setup-api-permissions.png and /dev/null differ diff --git a/docs/authentication/guides/sso/img/azure-setup-app-registration.png b/docs/authentication/guides/sso/img/azure-setup-app-registration.png deleted file mode 100755 index ffb23a7e3ddec..0000000000000 Binary files a/docs/authentication/guides/sso/img/azure-setup-app-registration.png and /dev/null differ diff --git a/docs/authentication/guides/sso/img/azure-setup-authentication.png b/docs/authentication/guides/sso/img/azure-setup-authentication.png deleted file mode 100755 index 2d27ec88fb40b..0000000000000 Binary files a/docs/authentication/guides/sso/img/azure-setup-authentication.png and /dev/null differ diff --git a/docs/authentication/guides/sso/img/azure-setup-certificates-secrets.png b/docs/authentication/guides/sso/img/azure-setup-certificates-secrets.png deleted file mode 100755 index db6585d84d8ee..0000000000000 Binary files a/docs/authentication/guides/sso/img/azure-setup-certificates-secrets.png and /dev/null differ diff --git a/docs/authentication/guides/sso/img/google-setup-1.png b/docs/authentication/guides/sso/img/google-setup-1.png deleted file mode 100644 index 88c674146f1e4..0000000000000 Binary files a/docs/authentication/guides/sso/img/google-setup-1.png and /dev/null differ diff --git a/docs/authentication/guides/sso/img/google-setup-2.png b/docs/authentication/guides/sso/img/google-setup-2.png deleted file mode 100644 index 850512b891d5f..0000000000000 Binary files a/docs/authentication/guides/sso/img/google-setup-2.png and /dev/null differ diff --git a/docs/authentication/guides/sso/img/okta-setup-1.png b/docs/authentication/guides/sso/img/okta-setup-1.png deleted file mode 100644 index 3949f18657c5e..0000000000000 Binary files a/docs/authentication/guides/sso/img/okta-setup-1.png and /dev/null differ diff --git a/docs/authentication/guides/sso/img/okta-setup-2.png b/docs/authentication/guides/sso/img/okta-setup-2.png deleted file mode 100644 index fa6ea4d991894..0000000000000 Binary files a/docs/authentication/guides/sso/img/okta-setup-2.png and /dev/null differ diff --git a/docs/authentication/guides/sso/img/okta-setup-groups-claim.png b/docs/authentication/guides/sso/img/okta-setup-groups-claim.png deleted file mode 100644 index ed35426685e46..0000000000000 Binary files a/docs/authentication/guides/sso/img/okta-setup-groups-claim.png and /dev/null differ diff --git a/docs/authentication/personal-access-tokens.md b/docs/authentication/personal-access-tokens.md index 0188aab49444e..dc57a989a4e0c 100644 --- a/docs/authentication/personal-access-tokens.md +++ b/docs/authentication/personal-access-tokens.md @@ -71,7 +71,11 @@ curl 'http://localhost:8080/entities/urn:li:corpuser:datahub' -H 'Authorization: Since authorization happens at the GMS level, this means that ingestion is also protected behind access tokens, to use them simply add a `token` to the sink config property as seen below: -![](../imgs/ingestion-with-token.png) + +

+ +
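Because the screenshot that used to illustrate this is being replaced, a minimal recipe sketch may help: the `token` property under the sink config is what the sentence above refers to. The source block, server URL, and token value here are placeholders, not part of this change.

```yaml
# Illustrative ingestion recipe fragment; substitute your own source, server, and token.
source:
  type: mysql                                 # placeholder source
  config:
    host_port: localhost:3306
sink:
  type: datahub-rest
  config:
    server: http://localhost:8080
    token: <your-personal-access-token>       # the `token` property mentioned above
```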

+ :::note diff --git a/docs/authorization/access-policies-guide.md b/docs/authorization/access-policies-guide.md index 5820e513a83e3..1eabb64d2878f 100644 --- a/docs/authorization/access-policies-guide.md +++ b/docs/authorization/access-policies-guide.md @@ -110,10 +110,13 @@ In the second step, we can simply select the Privileges that this Platform Polic | Manage Tags | Allow the actor to create and remove any Tags | | Manage Public Views | Allow the actor to create, edit, and remove any public (shared) Views. | | Manage Ownership Types | Allow the actor to create, edit, and remove any Ownership Types. | +| Manage Platform Settings | (Acryl DataHub only) Allow the actor to manage global integrations and notification settings | +| Manage Monitors | (Acryl DataHub only) Allow the actor to create, remove, start, or stop any entity assertion monitors | | Restore Indices API[^1] | Allow the actor to restore indices for a set of entities via API | | Enable/Disable Writeability API[^1] | Allow the actor to enable or disable GMS writeability for use in data migrations | | Apply Retention API[^1] | Allow the actor to apply aspect retention via API | + [^1]: Only active if REST_API_AUTHORIZATION_ENABLED environment flag is enabled #### Step 3: Choose Policy Actors @@ -204,8 +207,15 @@ The common Metadata Privileges, which span across entity types, include: | Edit Status | Allow actor to edit the status of an entity (soft deleted or not). | | Edit Domain | Allow actor to edit the Domain of an entity. | | Edit Deprecation | Allow actor to edit the Deprecation status of an entity. | -| Edit Assertions | Allow actor to add and remove assertions from an entity. | -| Edit All | Allow actor to edit any information about an entity. Super user privileges. Controls the ability to ingest using API when REST API Authorization is enabled. | +| Edit Lineage | Allow actor to edit custom lineage edges for the entity. | +| Edit Data Product | Allow actor to edit the data product that an entity is part of | +| Propose Tags | (Acryl DataHub only) Allow actor to propose new Tags for the entity. | +| Propose Glossary Terms | (Acryl DataHub only) Allow actor to propose new Glossary Terms for the entity. | +| Propose Documentation | (Acryl DataHub only) Allow actor to propose new Documentation for the entity. | +| Manage Tag Proposals | (Acryl DataHub only) Allow actor to accept or reject proposed Tags for the entity. | +| Manage Glossary Terms Proposals | (Acryl DataHub only) Allow actor to accept or reject proposed Glossary Terms for the entity. | +| Manage Documentation Proposals | (Acryl DataHub only) Allow actor to accept or reject proposed Documentation for the entity | +| Edit Entity | Allow actor to edit any information about an entity. Super user privileges. Controls the ability to ingest using API when REST API Authorization is enabled. | | Get Timeline API[^1] | Allow actor to get the timeline of an entity via API. | | Get Entity API[^1] | Allow actor to get an entity via API. | | Get Timeseries Aspect API[^1] | Allow actor to get a timeseries aspect via API. | @@ -225,10 +235,19 @@ The common Metadata Privileges, which span across entity types, include: | Dataset | Edit Dataset Queries | Allow actor to edit the Highlighted Queries on the Queries tab of the dataset. | | Dataset | View Dataset Usage | Allow actor to access usage metadata about a dataset both in the UI and in the GraphQL API. This includes example queries, number of queries, etc. Also applies to REST APIs when REST API Authorization is enabled. 
| | Dataset | View Dataset Profile | Allow actor to access a dataset's profile both in the UI and in the GraphQL API. This includes snapshot statistics like #rows, #columns, null percentage per field, etc. | +| Dataset | Edit Assertions | Allow actor to change the assertions associated with a dataset. | +| Dataset | Edit Incidents | (Acryl DataHub only) Allow actor to change the incidents associated with a dataset. | +| Dataset | Edit Monitors | (Acryl DataHub only) Allow actor to change the assertion monitors associated with a dataset. | | Tag | Edit Tag Color | Allow actor to change the color of a Tag. | | Group | Edit Group Members | Allow actor to add and remove members to a group. | +| Group | Edit Contact Information | Allow actor to change email, slack handle associated with the group. | +| Group | Manage Group Subscriptions | (Acryl DataHub only) Allow actor to subscribe the group to entities. | +| Group | Manage Group Notifications | (Acryl DataHub only) Allow actor to change notification settings for the group. | | User | Edit User Profile | Allow actor to change the user's profile including display name, bio, title, profile image, etc. | | User + Group | Edit Contact Information | Allow actor to change the contact information such as email & chat handles. | +| Term Group | Manage Direct Glossary Children | Allow actor to change the direct child Term Groups or Terms of the group. | +| Term Group | Manage All Glossary Children | Allow actor to change any direct or indirect child Term Groups or Terms of the group. | + > **Still have questions about Privileges?** Let us know in [Slack](https://slack.datahubproject.io)! diff --git a/docs/components.md b/docs/components.md index ef76729bb37fb..b59dabcf999cc 100644 --- a/docs/components.md +++ b/docs/components.md @@ -6,7 +6,11 @@ title: "Components" The DataHub platform consists of the components shown in the following diagram. -![DataHub Component Overview](./imgs/datahub-components.png) + +

+ +

+ ## Metadata Store diff --git a/docs/demo/DataHub-UIOverview.pdf b/docs/demo/DataHub-UIOverview.pdf deleted file mode 100644 index cd6106e84ac23..0000000000000 Binary files a/docs/demo/DataHub-UIOverview.pdf and /dev/null differ diff --git a/docs/demo/DataHub_-_Powering_LinkedIn_Metadata.pdf b/docs/demo/DataHub_-_Powering_LinkedIn_Metadata.pdf deleted file mode 100644 index 71498045f9b5b..0000000000000 Binary files a/docs/demo/DataHub_-_Powering_LinkedIn_Metadata.pdf and /dev/null differ diff --git a/docs/demo/Data_Discoverability_at_SpotHero.pdf b/docs/demo/Data_Discoverability_at_SpotHero.pdf deleted file mode 100644 index 83e37d8606428..0000000000000 Binary files a/docs/demo/Data_Discoverability_at_SpotHero.pdf and /dev/null differ diff --git a/docs/demo/Datahub_-_Strongly_Consistent_Secondary_Indexing.pdf b/docs/demo/Datahub_-_Strongly_Consistent_Secondary_Indexing.pdf deleted file mode 100644 index 2d6a33a464650..0000000000000 Binary files a/docs/demo/Datahub_-_Strongly_Consistent_Secondary_Indexing.pdf and /dev/null differ diff --git a/docs/demo/Datahub_at_Grofers.pdf b/docs/demo/Datahub_at_Grofers.pdf deleted file mode 100644 index c29cece9e250a..0000000000000 Binary files a/docs/demo/Datahub_at_Grofers.pdf and /dev/null differ diff --git a/docs/demo/Designing_the_next_generation_of_metadata_events_for_scale.pdf b/docs/demo/Designing_the_next_generation_of_metadata_events_for_scale.pdf deleted file mode 100644 index 0d067eef28d03..0000000000000 Binary files a/docs/demo/Designing_the_next_generation_of_metadata_events_for_scale.pdf and /dev/null differ diff --git a/docs/demo/Metadata_Use-Cases_at_LinkedIn_-_Lightning_Talk.pdf b/docs/demo/Metadata_Use-Cases_at_LinkedIn_-_Lightning_Talk.pdf deleted file mode 100644 index 382754f863c8a..0000000000000 Binary files a/docs/demo/Metadata_Use-Cases_at_LinkedIn_-_Lightning_Talk.pdf and /dev/null differ diff --git a/docs/demo/Saxo Bank Data Workbench.pdf b/docs/demo/Saxo Bank Data Workbench.pdf deleted file mode 100644 index c43480d32b8f2..0000000000000 Binary files a/docs/demo/Saxo Bank Data Workbench.pdf and /dev/null differ diff --git a/docs/demo/Taming the Data Beast Using DataHub.pdf b/docs/demo/Taming the Data Beast Using DataHub.pdf deleted file mode 100644 index d0062465d9220..0000000000000 Binary files a/docs/demo/Taming the Data Beast Using DataHub.pdf and /dev/null differ diff --git a/docs/demo/Town_Hall_Presentation_-_12-2020_-_UI_Development_Part_2.pdf b/docs/demo/Town_Hall_Presentation_-_12-2020_-_UI_Development_Part_2.pdf deleted file mode 100644 index fb7bd2b693e87..0000000000000 Binary files a/docs/demo/Town_Hall_Presentation_-_12-2020_-_UI_Development_Part_2.pdf and /dev/null differ diff --git a/docs/demo/ViasatMetadataJourney.pdf b/docs/demo/ViasatMetadataJourney.pdf deleted file mode 100644 index ccffd18a06d18..0000000000000 Binary files a/docs/demo/ViasatMetadataJourney.pdf and /dev/null differ diff --git a/docs/deploy/aws.md b/docs/deploy/aws.md index 7b01ffa02a744..228fcb51d1a28 100644 --- a/docs/deploy/aws.md +++ b/docs/deploy/aws.md @@ -201,7 +201,11 @@ Provision a MySQL database in AWS RDS that shares the VPC with the kubernetes cl the VPC of the kubernetes cluster. Once the database is provisioned, you should be able to see the following page. Take a note of the endpoint marked by the red box. -![AWS RDS](../imgs/aws/aws-rds.png) + +

+ +

+ First, add the DB password to kubernetes by running the following. @@ -234,7 +238,11 @@ Provision an elasticsearch domain running elasticsearch version 7.10 or above th cluster or has VPC peering set up between the VPC of the kubernetes cluster. Once the domain is provisioned, you should be able to see the following page. Take a note of the endpoint marked by the red box. -![AWS Elasticsearch Service](../imgs/aws/aws-elasticsearch.png) + +

+ +

+ Update the elasticsearch settings under global in the values.yaml as follows. @@ -330,7 +338,11 @@ Provision an MSK cluster that shares the VPC with the kubernetes cluster or has the kubernetes cluster. Once the domain is provisioned, click on the “View client information” button in the ‘Cluster Summary” section. You should see a page like below. Take a note of the endpoints marked by the red boxes. -![AWS MSK](../imgs/aws/aws-msk.png) + +

+ +

+ Update the kafka settings under global in the values.yaml as follows. diff --git a/docs/deploy/confluent-cloud.md b/docs/deploy/confluent-cloud.md index d93ffcceaecee..794b55d4686bf 100644 --- a/docs/deploy/confluent-cloud.md +++ b/docs/deploy/confluent-cloud.md @@ -24,7 +24,11 @@ decommissioned. To create the topics, navigate to your **Cluster** and click "Create Topic". Feel free to tweak the default topic configurations to match your preferences. -![CreateTopic](../imgs/confluent-create-topic.png) + +

+ +

+ ## Step 2: Configure DataHub Container to use Confluent Cloud Topics @@ -140,12 +144,20 @@ and another for the user info used for connecting to the schema registry. You'll select "Clients" -> "Configure new Java Client". You should see a page like the following: -![Config](../imgs/confluent-cloud-config.png) + +

+ +

+ You'll want to generate both a Kafka Cluster API Key & a Schema Registry key. Once you do so, you should see the config automatically populate with your new secrets: -![Config](../imgs/confluent-cloud-config-2.png) + +

+ +

+ You'll need to copy the values of `sasl.jaas.config` and `basic.auth.user.info` for the next step. diff --git a/docs/deploy/gcp.md b/docs/deploy/gcp.md index 3713d69f90636..0cd3d92a8f3cd 100644 --- a/docs/deploy/gcp.md +++ b/docs/deploy/gcp.md @@ -65,16 +65,28 @@ the GKE page on [GCP website](https://console.cloud.google.com/kubernetes/discov Once the deployment is successful, you should see a page like below in the "Services & Ingress" tab on the left. -![Services and Ingress](../imgs/gcp/services_ingress.png) + +

+ +

+ Tick the checkbox for datahub-datahub-frontend and click "CREATE INGRESS" button. You should land on the following page. -![Ingress1](../imgs/gcp/ingress1.png) + +

+ +

+ Type in an arbitrary name for the ingress and click on the second step "Host and path rules". You should land on the following page. -![Ingress2](../imgs/gcp/ingress2.png) + +

+ +

+ Select "datahub-datahub-frontend" in the dropdown menu for backends, and then click on "ADD HOST AND PATH RULE" button. In the second row that got created, add in the host name of choice (here gcp.datahubproject.io) and select @@ -83,14 +95,22 @@ In the second row that got created, add in the host name of choice (here gcp.dat This step adds the rule allowing requests from the host name of choice to get routed to datahub-frontend service. Click on step 3 "Frontend configuration". You should land on the following page. -![Ingress3](../imgs/gcp/ingress3.png) + +

+ +

+ Choose HTTPS in the dropdown menu for protocol. To enable SSL, you need to add a certificate. If you do not have one, you can click "CREATE A NEW CERTIFICATE" and input the host name of choice. GCP will create a certificate for you. Now press "CREATE" button on the left to create ingress! After around 5 minutes, you should see the following. -![Ingress Ready](../imgs/gcp/ingress_ready.png) + +

+ +

+ In your domain provider, add an A record for the host name set above using the IP address on the ingress page (noted with the red box). Once DNS updates, you should be able to access DataHub through the host name!! @@ -98,5 +118,9 @@ with the red box). Once DNS updates, you should be able to access DataHub throug Note, ignore the warning icon next to ingress. It takes about ten minutes for ingress to check that the backend service is ready and show a check mark as follows. However, ingress is fully functional once you see the above page. -![Ingress Final](../imgs/gcp/ingress_final.png) + +

+ +

+ diff --git a/docs/dev-guides/timeline.md b/docs/dev-guides/timeline.md index 966e659b90991..829aef1d3eefa 100644 --- a/docs/dev-guides/timeline.md +++ b/docs/dev-guides/timeline.md @@ -14,7 +14,11 @@ The Timeline API is available in server versions `0.8.28` and higher. The `cli` ## Entity Timeline Conceptually For the visually inclined, here is a conceptual diagram that illustrates how to think about the entity timeline with categorical changes overlaid on it. -![../imgs/timeline/timeline-conceptually.png](../imgs/timeline/timeline-conceptually.png) + +

+ +

+ ## Change Event Each modification is modeled as a @@ -228,8 +232,16 @@ http://localhost:8080/openapi/timeline/v1/urn%3Ali%3Adataset%3A%28urn%3Ali%3Adat The API is browsable via the UI through the dropdown. Here are a few screenshots showing how to navigate to it. You can try out the API and send example requests. -![../imgs/timeline/dropdown-apis.png](../imgs/timeline/dropdown-apis.png) -![../imgs/timeline/swagger-ui.png](../imgs/timeline/swagger-ui.png) +

+ +

+ + +

+ +

+ # Future Work diff --git a/docs/docker/development.md b/docs/docker/development.md index 2153aa9dc613f..91a303744a03b 100644 --- a/docs/docker/development.md +++ b/docs/docker/development.md @@ -92,7 +92,11 @@ Environment variables control the debugging ports for GMS and the frontend. The screenshot shows an example configuration for IntelliJ using the default GMS debugging port of 5001. -![](../imgs/development/intellij-remote-debug.png) + +

+ +

+ ## Tips for People New To Docker diff --git a/docs/glossary/business-glossary.md b/docs/glossary/business-glossary.md index faab6f12fc55e..e10cbed30b913 100644 --- a/docs/glossary/business-glossary.md +++ b/docs/glossary/business-glossary.md @@ -31,59 +31,103 @@ In order to view a Business Glossary, users must have the Platform Privilege cal Once granted this privilege, you can access your Glossary by clicking the dropdown at the top of the page called **Govern** and then click **Glossary**: -![](../imgs/glossary/glossary-button.png) + +

+ +

+ You are now at the root of your Glossary and should see all Terms and Term Groups with no parents assigned to them. You should also notice a hierarchy navigator on the left where you can easily check out the structure of your Glossary! -![](../imgs/glossary/root-glossary.png) + +

+ +

+ ## Creating a Term or Term Group There are two ways to create Terms and Term Groups through the UI. First, you can create directly from the Glossary home page by clicking the menu dots on the top right and selecting your desired option: -![](../imgs/glossary/root-glossary-create.png) + +

+ +

+ You can also create Terms or Term Groups directly from a Term Group's page. In order to do that you need to click the menu dots on the top right and select what you want: -![](../imgs/glossary/create-from-node.png) + +

+ +

+ Note that the modal that pops up will automatically set the current Term Group you are in as the **Parent**. You can easily change this by selecting the input and navigating through your Glossary to find your desired Term Group. In addition, you could start typing the name of a Term Group to see it appear by searching. You can also leave this input blank in order to create a Term or Term Group with no parent. -![](../imgs/glossary/create-modal.png) + +

+ +

+ ## Editing a Term or Term Group In order to edit a Term or Term Group, you first need to go to the page of the Term or Term Group you want to edit. Then simply click the edit icon right next to the name to open up an inline editor. Change the text and it will save when you click outside or hit Enter. -![](../imgs/glossary/edit-term.png) + +

+ +

+ ## Moving a Term or Term Group Once a Term or Term Group has been created, you can always move it to be under a different Term Group parent. In order to do this, click the menu dots on the top right of either entity and select **Move**. -![](../imgs/glossary/move-term-button.png) + +

+ +

+ This will open a modal where you can navigate through your Glossary to find your desired Term Group. -![](../imgs/glossary/move-term-modal.png) + +

+ +

+ ## Deleting a Term or Term Group In order to delete a Term or Term Group, you need to go to the entity page of what you want to delete then click the menu dots on the top right. From here you can select **Delete** followed by confirming through a separate modal. **Note**: at the moment we only support deleting Term Groups that do not have any children. Until cascade deleting is supported, you will have to delete all children first, then delete the Term Group. -![](../imgs/glossary/delete-button.png) + +

+ +

+ ## Adding a Term to an Entity Once you've defined your Glossary, you can begin attaching terms to data assets. To add a Glossary Term to an asset, go to the entity page of your asset and find the **Add Terms** button on the right sidebar. -![](../imgs/glossary/add-term-to-entity.png) + +

+ +

+ In the modal that pops up you can select the Term you care about in one of two ways: - Search for the Term by name in the input - Navigate through the Glossary dropdown that appears after clicking into the input -![](../imgs/glossary/add-term-modal.png) + +

+ +

+ ## Privileges diff --git a/docs/how/configuring-authorization-with-apache-ranger.md b/docs/how/configuring-authorization-with-apache-ranger.md index 26d3be6d358b2..46f9432e6c18a 100644 --- a/docs/how/configuring-authorization-with-apache-ranger.md +++ b/docs/how/configuring-authorization-with-apache-ranger.md @@ -67,7 +67,11 @@ Now, you should have the DataHub plugin registered with Apache Ranger. Next, we' **DATAHUB** plugin and **ranger_datahub** service is shown in below screenshot:
- ![Privacera Portal DATAHUB screenshot](../imgs/apache-ranger/datahub-plugin.png) + +

+ +

+ 4. Create a new policy under service **ranger_datahub** - this will be used to control DataHub authorization. 5. Create a test user & assign them to a policy. We'll use the `datahub` user, which is the default root user inside DataHub. @@ -80,7 +84,11 @@ Now, you should have the DataHub plugin registered with Apache Ranger. Next, we' DataHub platform access policy screenshot:
- ![Privacera Portal DATAHUB screenshot](../imgs/apache-ranger/datahub-platform-access-policy.png) + +

+ +

+ Once we've created our first policy, we can set up DataHub to start authorizing requests using Ranger policies. @@ -178,7 +186,11 @@ then follow the below sections to undo the configuration steps you have performe **ranger_datahub** service is shown in below screenshot:
- ![Privacera Portal DATAHUB screenshot](../imgs/apache-ranger/datahub-plugin.png) + +

+ +

+ 2. Delete **datahub** plugin: Execute below curl command to delete **datahub** plugin Replace variables with corresponding values in curl command diff --git a/docs/how/delete-metadata.md b/docs/how/delete-metadata.md index acbb573020be0..f720a66ce5765 100644 --- a/docs/how/delete-metadata.md +++ b/docs/how/delete-metadata.md @@ -43,6 +43,9 @@ datahub delete --platform snowflake # Filters can be combined, which will select entities that match all filters. datahub delete --platform looker --entity-type chart datahub delete --platform bigquery --env PROD + +# You can also do recursive deletes for container and dataPlatformInstance entities. +datahub delete --urn "urn:li:container:f76..." --recursive ``` When performing hard deletes, you can optionally add the `--only-soft-deleted` flag to only hard delete entities that were previously soft deleted. @@ -122,6 +125,14 @@ datahub delete --urn "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted datahub delete --platform snowflake --env DEV ``` +#### Delete everything within a specific Snowflake DB + +```shell +# You can find your container urn by navigating to the relevant +# DB in the DataHub UI and clicking the "copy urn" button. +datahub delete --urn "urn:li:container:77644901c4f574845578ebd18b7c14fa" --recursive +``` + #### Delete all BigQuery datasets in the PROD environment ```shell @@ -129,6 +140,13 @@ datahub delete --platform snowflake --env DEV datahub delete --env PROD --entity-type dataset --platform bigquery ``` +#### Delete everything within a MySQL platform instance + +```shell +# The instance name comes from the `platform_instance` config option in the ingestion recipe. +datahub delete --urn 'urn:li:dataPlatformInstance:(urn:li:dataPlatform:mysql,my_instance_name)' --recursive +``` + #### Delete all pipelines and tasks from Airflow ```shell @@ -138,6 +156,7 @@ datahub delete --platform "airflow" #### Delete all containers for a particular platform ```shell +# Note: this will leave S3 datasets intact. datahub delete --entity-type container --platform s3 ``` diff --git a/docs/how/search.md b/docs/how/search.md index bf1c8e8632e24..6a5e85e547fc5 100644 --- a/docs/how/search.md +++ b/docs/how/search.md @@ -2,14 +2,6 @@ import FeatureAvailability from '@site/src/components/FeatureAvailability'; # About DataHub Search - - - - The **search bar** is an important mechanism for discovering data assets in DataHub. From the search bar, you can find Datasets, Columns, Dashboards, Charts, Data Pipelines, and more. Simply type in a term and press 'enter'. diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index 2b6fd5571cc9e..7ba516c82cf1b 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -15,6 +15,9 @@ This file documents any backwards-incompatible changes in DataHub and assists pe - #8300: Clickhouse source now inherited from TwoTierSQLAlchemy. In old way we have platform_instance -> container -> co container db (None) -> container schema and now we have platform_instance -> container database. - #8300: Added `uri_opts` argument; now we can add any options for clickhouse client. +- #8659: BigQuery ingestion no longer creates DataPlatformInstance aspects by default. + This will only affect users that were depending on this aspect for custom functionality, + and can be enabled via the `include_data_platform_instance` config option. 
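For users relying on the behavior described in #8659, a minimal recipe sketch of opting back in might look like the following; only the `include_data_platform_instance` option name comes from the changelog entry above, and the rest is a placeholder.

```yaml
# Hypothetical BigQuery recipe fragment re-enabling DataPlatformInstance aspects.
source:
  type: bigquery
  config:
    include_data_platform_instance: true      # option referenced in the note above
```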
## 0.10.5 diff --git a/docs/imgs/add-schema-tag.png b/docs/imgs/add-schema-tag.png deleted file mode 100644 index b6fd273389c90..0000000000000 Binary files a/docs/imgs/add-schema-tag.png and /dev/null differ diff --git a/docs/imgs/add-tag-search.png b/docs/imgs/add-tag-search.png deleted file mode 100644 index a129f5eba4271..0000000000000 Binary files a/docs/imgs/add-tag-search.png and /dev/null differ diff --git a/docs/imgs/add-tag.png b/docs/imgs/add-tag.png deleted file mode 100644 index 386b4cdcd9911..0000000000000 Binary files a/docs/imgs/add-tag.png and /dev/null differ diff --git a/docs/imgs/added-tag.png b/docs/imgs/added-tag.png deleted file mode 100644 index 96ae48318a35a..0000000000000 Binary files a/docs/imgs/added-tag.png and /dev/null differ diff --git a/docs/imgs/airflow/connection_error.png b/docs/imgs/airflow/connection_error.png deleted file mode 100644 index c2f3344b8cc45..0000000000000 Binary files a/docs/imgs/airflow/connection_error.png and /dev/null differ diff --git a/docs/imgs/airflow/datahub_lineage_view.png b/docs/imgs/airflow/datahub_lineage_view.png deleted file mode 100644 index c7c774c203d2f..0000000000000 Binary files a/docs/imgs/airflow/datahub_lineage_view.png and /dev/null differ diff --git a/docs/imgs/airflow/datahub_pipeline_entity.png b/docs/imgs/airflow/datahub_pipeline_entity.png deleted file mode 100644 index 715baefd784ca..0000000000000 Binary files a/docs/imgs/airflow/datahub_pipeline_entity.png and /dev/null differ diff --git a/docs/imgs/airflow/datahub_pipeline_view.png b/docs/imgs/airflow/datahub_pipeline_view.png deleted file mode 100644 index 5b3afd13c4ce6..0000000000000 Binary files a/docs/imgs/airflow/datahub_pipeline_view.png and /dev/null differ diff --git a/docs/imgs/airflow/datahub_task_view.png b/docs/imgs/airflow/datahub_task_view.png deleted file mode 100644 index 66b3487d87319..0000000000000 Binary files a/docs/imgs/airflow/datahub_task_view.png and /dev/null differ diff --git a/docs/imgs/airflow/entity_page_screenshot.png b/docs/imgs/airflow/entity_page_screenshot.png deleted file mode 100644 index a782969a1f17b..0000000000000 Binary files a/docs/imgs/airflow/entity_page_screenshot.png and /dev/null differ diff --git a/docs/imgs/airflow/find_the_dag.png b/docs/imgs/airflow/find_the_dag.png deleted file mode 100644 index 37cda041e4b75..0000000000000 Binary files a/docs/imgs/airflow/find_the_dag.png and /dev/null differ diff --git a/docs/imgs/airflow/finding_failed_log.png b/docs/imgs/airflow/finding_failed_log.png deleted file mode 100644 index 96552ba1e1983..0000000000000 Binary files a/docs/imgs/airflow/finding_failed_log.png and /dev/null differ diff --git a/docs/imgs/airflow/paused_dag.png b/docs/imgs/airflow/paused_dag.png deleted file mode 100644 index c314de5d38d75..0000000000000 Binary files a/docs/imgs/airflow/paused_dag.png and /dev/null differ diff --git a/docs/imgs/airflow/successful_run.png b/docs/imgs/airflow/successful_run.png deleted file mode 100644 index b997cc7210ff6..0000000000000 Binary files a/docs/imgs/airflow/successful_run.png and /dev/null differ diff --git a/docs/imgs/airflow/trigger_dag.png b/docs/imgs/airflow/trigger_dag.png deleted file mode 100644 index a44999c929d4e..0000000000000 Binary files a/docs/imgs/airflow/trigger_dag.png and /dev/null differ diff --git a/docs/imgs/airflow/unpaused_dag.png b/docs/imgs/airflow/unpaused_dag.png deleted file mode 100644 index 8462562f31d97..0000000000000 Binary files a/docs/imgs/airflow/unpaused_dag.png and /dev/null differ diff --git 
a/docs/imgs/apache-ranger/datahub-platform-access-policy.png b/docs/imgs/apache-ranger/datahub-platform-access-policy.png deleted file mode 100644 index 7e3ff6fd372a9..0000000000000 Binary files a/docs/imgs/apache-ranger/datahub-platform-access-policy.png and /dev/null differ diff --git a/docs/imgs/apache-ranger/datahub-plugin.png b/docs/imgs/apache-ranger/datahub-plugin.png deleted file mode 100644 index 5dd044c014657..0000000000000 Binary files a/docs/imgs/apache-ranger/datahub-plugin.png and /dev/null differ diff --git a/docs/imgs/apis/postman-graphql.png b/docs/imgs/apis/postman-graphql.png deleted file mode 100644 index 1cffd226fdf77..0000000000000 Binary files a/docs/imgs/apis/postman-graphql.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/column-description-added.png b/docs/imgs/apis/tutorials/column-description-added.png deleted file mode 100644 index ed8cbd3bf5622..0000000000000 Binary files a/docs/imgs/apis/tutorials/column-description-added.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/column-level-lineage-added.png b/docs/imgs/apis/tutorials/column-level-lineage-added.png deleted file mode 100644 index 6092436e0a6a8..0000000000000 Binary files a/docs/imgs/apis/tutorials/column-level-lineage-added.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/custom-properties-added.png b/docs/imgs/apis/tutorials/custom-properties-added.png deleted file mode 100644 index a7e85d875045c..0000000000000 Binary files a/docs/imgs/apis/tutorials/custom-properties-added.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/datahub-main-ui.png b/docs/imgs/apis/tutorials/datahub-main-ui.png deleted file mode 100644 index b058e2683a851..0000000000000 Binary files a/docs/imgs/apis/tutorials/datahub-main-ui.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/dataset-created.png b/docs/imgs/apis/tutorials/dataset-created.png deleted file mode 100644 index 086dd8b7c9b16..0000000000000 Binary files a/docs/imgs/apis/tutorials/dataset-created.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/dataset-deleted.png b/docs/imgs/apis/tutorials/dataset-deleted.png deleted file mode 100644 index d94ad7e85195f..0000000000000 Binary files a/docs/imgs/apis/tutorials/dataset-deleted.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/dataset-description-added.png b/docs/imgs/apis/tutorials/dataset-description-added.png deleted file mode 100644 index 41aa9f109115b..0000000000000 Binary files a/docs/imgs/apis/tutorials/dataset-description-added.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/dataset-properties-added-removed.png b/docs/imgs/apis/tutorials/dataset-properties-added-removed.png deleted file mode 100644 index 9eb0284776f13..0000000000000 Binary files a/docs/imgs/apis/tutorials/dataset-properties-added-removed.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/dataset-properties-added.png b/docs/imgs/apis/tutorials/dataset-properties-added.png deleted file mode 100644 index e0d2acbb66eb5..0000000000000 Binary files a/docs/imgs/apis/tutorials/dataset-properties-added.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/dataset-properties-before.png b/docs/imgs/apis/tutorials/dataset-properties-before.png deleted file mode 100644 index b4915121a8c65..0000000000000 Binary files a/docs/imgs/apis/tutorials/dataset-properties-before.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/dataset-properties-replaced.png b/docs/imgs/apis/tutorials/dataset-properties-replaced.png deleted 
file mode 100644 index 8624689c20ada..0000000000000 Binary files a/docs/imgs/apis/tutorials/dataset-properties-replaced.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/deprecation-updated.png b/docs/imgs/apis/tutorials/deprecation-updated.png deleted file mode 100644 index 06fedf746f694..0000000000000 Binary files a/docs/imgs/apis/tutorials/deprecation-updated.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/domain-added.png b/docs/imgs/apis/tutorials/domain-added.png deleted file mode 100644 index cb2002ec9ab4d..0000000000000 Binary files a/docs/imgs/apis/tutorials/domain-added.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/domain-created.png b/docs/imgs/apis/tutorials/domain-created.png deleted file mode 100644 index cafab2a5e8d5c..0000000000000 Binary files a/docs/imgs/apis/tutorials/domain-created.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/domain-removed.png b/docs/imgs/apis/tutorials/domain-removed.png deleted file mode 100644 index 1b21172be11d2..0000000000000 Binary files a/docs/imgs/apis/tutorials/domain-removed.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/feature-added-to-model.png b/docs/imgs/apis/tutorials/feature-added-to-model.png deleted file mode 100644 index 311506e4b2783..0000000000000 Binary files a/docs/imgs/apis/tutorials/feature-added-to-model.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/feature-table-created.png b/docs/imgs/apis/tutorials/feature-table-created.png deleted file mode 100644 index 0541cbe572435..0000000000000 Binary files a/docs/imgs/apis/tutorials/feature-table-created.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/group-upserted.png b/docs/imgs/apis/tutorials/group-upserted.png deleted file mode 100644 index 5283f6273f02a..0000000000000 Binary files a/docs/imgs/apis/tutorials/group-upserted.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/lineage-added.png b/docs/imgs/apis/tutorials/lineage-added.png deleted file mode 100644 index b381498bad5ac..0000000000000 Binary files a/docs/imgs/apis/tutorials/lineage-added.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/model-group-added-to-model.png b/docs/imgs/apis/tutorials/model-group-added-to-model.png deleted file mode 100644 index 360b7fbb2d922..0000000000000 Binary files a/docs/imgs/apis/tutorials/model-group-added-to-model.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/model-group-created.png b/docs/imgs/apis/tutorials/model-group-created.png deleted file mode 100644 index 2e0fdcea4803f..0000000000000 Binary files a/docs/imgs/apis/tutorials/model-group-created.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/owner-added.png b/docs/imgs/apis/tutorials/owner-added.png deleted file mode 100644 index 6508c231cfb4b..0000000000000 Binary files a/docs/imgs/apis/tutorials/owner-added.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/owner-removed.png b/docs/imgs/apis/tutorials/owner-removed.png deleted file mode 100644 index a7b6567888caf..0000000000000 Binary files a/docs/imgs/apis/tutorials/owner-removed.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/sample-ingestion.png b/docs/imgs/apis/tutorials/sample-ingestion.png deleted file mode 100644 index 40aa046904841..0000000000000 Binary files a/docs/imgs/apis/tutorials/sample-ingestion.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/tag-added.png b/docs/imgs/apis/tutorials/tag-added.png deleted file mode 100644 index fd99a04f6cceb..0000000000000 
Binary files a/docs/imgs/apis/tutorials/tag-added.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/tag-created.png b/docs/imgs/apis/tutorials/tag-created.png deleted file mode 100644 index 99e3fea8a14e1..0000000000000 Binary files a/docs/imgs/apis/tutorials/tag-created.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/tag-removed.png b/docs/imgs/apis/tutorials/tag-removed.png deleted file mode 100644 index 31a267549843e..0000000000000 Binary files a/docs/imgs/apis/tutorials/tag-removed.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/term-added.png b/docs/imgs/apis/tutorials/term-added.png deleted file mode 100644 index 62e285a92e7af..0000000000000 Binary files a/docs/imgs/apis/tutorials/term-added.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/term-created.png b/docs/imgs/apis/tutorials/term-created.png deleted file mode 100644 index deff0179b155e..0000000000000 Binary files a/docs/imgs/apis/tutorials/term-created.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/term-removed.png b/docs/imgs/apis/tutorials/term-removed.png deleted file mode 100644 index dbf9f35f09339..0000000000000 Binary files a/docs/imgs/apis/tutorials/term-removed.png and /dev/null differ diff --git a/docs/imgs/apis/tutorials/user-upserted.png b/docs/imgs/apis/tutorials/user-upserted.png deleted file mode 100644 index 38c5bbb9ad828..0000000000000 Binary files a/docs/imgs/apis/tutorials/user-upserted.png and /dev/null differ diff --git a/docs/imgs/aws/aws-elasticsearch.png b/docs/imgs/aws/aws-elasticsearch.png deleted file mode 100644 index e16d5eee26fd8..0000000000000 Binary files a/docs/imgs/aws/aws-elasticsearch.png and /dev/null differ diff --git a/docs/imgs/aws/aws-msk.png b/docs/imgs/aws/aws-msk.png deleted file mode 100644 index 96a3173747007..0000000000000 Binary files a/docs/imgs/aws/aws-msk.png and /dev/null differ diff --git a/docs/imgs/aws/aws-rds.png b/docs/imgs/aws/aws-rds.png deleted file mode 100644 index ab329952c7756..0000000000000 Binary files a/docs/imgs/aws/aws-rds.png and /dev/null differ diff --git a/docs/imgs/browse-domains.png b/docs/imgs/browse-domains.png deleted file mode 100644 index 41444470517d2..0000000000000 Binary files a/docs/imgs/browse-domains.png and /dev/null differ diff --git a/docs/imgs/cancelled-ingestion.png b/docs/imgs/cancelled-ingestion.png deleted file mode 100644 index 0c4af7b66a8ff..0000000000000 Binary files a/docs/imgs/cancelled-ingestion.png and /dev/null differ diff --git a/docs/imgs/confluent-cloud-config-2.png b/docs/imgs/confluent-cloud-config-2.png deleted file mode 100644 index 543101154f42c..0000000000000 Binary files a/docs/imgs/confluent-cloud-config-2.png and /dev/null differ diff --git a/docs/imgs/confluent-cloud-config.png b/docs/imgs/confluent-cloud-config.png deleted file mode 100644 index a2490eab5c6a7..0000000000000 Binary files a/docs/imgs/confluent-cloud-config.png and /dev/null differ diff --git a/docs/imgs/confluent-create-topic.png b/docs/imgs/confluent-create-topic.png deleted file mode 100644 index 1972bb3770388..0000000000000 Binary files a/docs/imgs/confluent-create-topic.png and /dev/null differ diff --git a/docs/imgs/create-domain.png b/docs/imgs/create-domain.png deleted file mode 100644 index 1db2090fca6b8..0000000000000 Binary files a/docs/imgs/create-domain.png and /dev/null differ diff --git a/docs/imgs/create-new-ingestion-source-button.png b/docs/imgs/create-new-ingestion-source-button.png deleted file mode 100644 index c425f0837c51d..0000000000000 Binary files 
a/docs/imgs/create-new-ingestion-source-button.png and /dev/null differ diff --git a/docs/imgs/create-secret.png b/docs/imgs/create-secret.png deleted file mode 100644 index a0cc63e3b4892..0000000000000 Binary files a/docs/imgs/create-secret.png and /dev/null differ diff --git a/docs/imgs/custom-ingestion-cli-version.png b/docs/imgs/custom-ingestion-cli-version.png deleted file mode 100644 index 43d4736684abb..0000000000000 Binary files a/docs/imgs/custom-ingestion-cli-version.png and /dev/null differ diff --git a/docs/imgs/datahub-architecture.png b/docs/imgs/datahub-architecture.png deleted file mode 100644 index 236f939f74198..0000000000000 Binary files a/docs/imgs/datahub-architecture.png and /dev/null differ diff --git a/docs/imgs/datahub-architecture.svg b/docs/imgs/datahub-architecture.svg deleted file mode 100644 index 842194a5e377c..0000000000000 --- a/docs/imgs/datahub-architecture.svg +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/docs/imgs/datahub-components.png b/docs/imgs/datahub-components.png deleted file mode 100644 index 8b7d0e5330275..0000000000000 Binary files a/docs/imgs/datahub-components.png and /dev/null differ diff --git a/docs/imgs/datahub-logo-color-mark.svg b/docs/imgs/datahub-logo-color-mark.svg deleted file mode 100644 index a984092952bae..0000000000000 --- a/docs/imgs/datahub-logo-color-mark.svg +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/docs/imgs/datahub-metadata-ingestion-framework.png b/docs/imgs/datahub-metadata-ingestion-framework.png deleted file mode 100644 index 1319329710906..0000000000000 Binary files a/docs/imgs/datahub-metadata-ingestion-framework.png and /dev/null differ diff --git a/docs/imgs/datahub-metadata-model.png b/docs/imgs/datahub-metadata-model.png deleted file mode 100644 index 59449cd0d4ef5..0000000000000 Binary files a/docs/imgs/datahub-metadata-model.png and /dev/null differ diff --git a/docs/imgs/datahub-sequence-diagram.png b/docs/imgs/datahub-sequence-diagram.png deleted file mode 100644 index b5a8f8a9c25ce..0000000000000 Binary files a/docs/imgs/datahub-sequence-diagram.png and /dev/null differ diff --git a/docs/imgs/datahub-serving.png b/docs/imgs/datahub-serving.png deleted file mode 100644 index 67a2f8eb3f085..0000000000000 Binary files a/docs/imgs/datahub-serving.png and /dev/null differ diff --git a/docs/imgs/development/intellij-remote-debug.png b/docs/imgs/development/intellij-remote-debug.png deleted file mode 100644 index 32a41a75d1dc3..0000000000000 Binary files a/docs/imgs/development/intellij-remote-debug.png and /dev/null differ diff --git a/docs/imgs/domain-entities.png b/docs/imgs/domain-entities.png deleted file mode 100644 index 5766d051fa209..0000000000000 Binary files a/docs/imgs/domain-entities.png and /dev/null differ diff --git a/docs/imgs/domains-tab.png b/docs/imgs/domains-tab.png deleted file mode 100644 index 20be5b103fdca..0000000000000 Binary files a/docs/imgs/domains-tab.png and /dev/null differ diff --git a/docs/imgs/entity-registry-diagram.png b/docs/imgs/entity-registry-diagram.png deleted file mode 100644 index 08cb5edd8e13f..0000000000000 Binary files a/docs/imgs/entity-registry-diagram.png and /dev/null differ diff --git a/docs/imgs/entity.png b/docs/imgs/entity.png deleted file mode 100644 index cfe9eb38b2921..0000000000000 Binary files a/docs/imgs/entity.png and /dev/null differ diff --git a/docs/imgs/example-mysql-recipe.png b/docs/imgs/example-mysql-recipe.png deleted file mode 100644 index 9cb2cbb169a56..0000000000000 Binary files 
a/docs/imgs/example-mysql-recipe.png and /dev/null differ diff --git a/docs/imgs/failed-ingestion.png b/docs/imgs/failed-ingestion.png deleted file mode 100644 index 4f9de8eb002d2..0000000000000 Binary files a/docs/imgs/failed-ingestion.png and /dev/null differ diff --git a/docs/imgs/feature-create-new-tag.gif b/docs/imgs/feature-create-new-tag.gif deleted file mode 100644 index 57b8ad852dd5b..0000000000000 Binary files a/docs/imgs/feature-create-new-tag.gif and /dev/null differ diff --git a/docs/imgs/feature-datahub-analytics.png b/docs/imgs/feature-datahub-analytics.png deleted file mode 100644 index 7fe66b84682f9..0000000000000 Binary files a/docs/imgs/feature-datahub-analytics.png and /dev/null differ diff --git a/docs/imgs/feature-rich-documentation.gif b/docs/imgs/feature-rich-documentation.gif deleted file mode 100644 index 48ad795670022..0000000000000 Binary files a/docs/imgs/feature-rich-documentation.gif and /dev/null differ diff --git a/docs/imgs/feature-tag-browse.gif b/docs/imgs/feature-tag-browse.gif deleted file mode 100644 index e70a30db7d3ba..0000000000000 Binary files a/docs/imgs/feature-tag-browse.gif and /dev/null differ diff --git a/docs/imgs/feature-validation-timeseries.png b/docs/imgs/feature-validation-timeseries.png deleted file mode 100644 index 28ce1daec5f32..0000000000000 Binary files a/docs/imgs/feature-validation-timeseries.png and /dev/null differ diff --git a/docs/imgs/feature-view-entitiy-details-via-lineage-vis.gif b/docs/imgs/feature-view-entitiy-details-via-lineage-vis.gif deleted file mode 100644 index aad77df373574..0000000000000 Binary files a/docs/imgs/feature-view-entitiy-details-via-lineage-vis.gif and /dev/null differ diff --git a/docs/imgs/gcp/ingress1.png b/docs/imgs/gcp/ingress1.png deleted file mode 100644 index 4cb49834af5b6..0000000000000 Binary files a/docs/imgs/gcp/ingress1.png and /dev/null differ diff --git a/docs/imgs/gcp/ingress2.png b/docs/imgs/gcp/ingress2.png deleted file mode 100644 index cdf2446b0e923..0000000000000 Binary files a/docs/imgs/gcp/ingress2.png and /dev/null differ diff --git a/docs/imgs/gcp/ingress3.png b/docs/imgs/gcp/ingress3.png deleted file mode 100644 index cc3745ad97f5b..0000000000000 Binary files a/docs/imgs/gcp/ingress3.png and /dev/null differ diff --git a/docs/imgs/gcp/ingress_final.png b/docs/imgs/gcp/ingress_final.png deleted file mode 100644 index a30ca744c49f7..0000000000000 Binary files a/docs/imgs/gcp/ingress_final.png and /dev/null differ diff --git a/docs/imgs/gcp/ingress_ready.png b/docs/imgs/gcp/ingress_ready.png deleted file mode 100644 index d14016e420fd3..0000000000000 Binary files a/docs/imgs/gcp/ingress_ready.png and /dev/null differ diff --git a/docs/imgs/gcp/services_ingress.png b/docs/imgs/gcp/services_ingress.png deleted file mode 100644 index 1d9ff2b313715..0000000000000 Binary files a/docs/imgs/gcp/services_ingress.png and /dev/null differ diff --git a/docs/imgs/glossary/add-term-modal.png b/docs/imgs/glossary/add-term-modal.png deleted file mode 100644 index e32a9cb8d648c..0000000000000 Binary files a/docs/imgs/glossary/add-term-modal.png and /dev/null differ diff --git a/docs/imgs/glossary/add-term-to-entity.png b/docs/imgs/glossary/add-term-to-entity.png deleted file mode 100644 index 7487a68c0d755..0000000000000 Binary files a/docs/imgs/glossary/add-term-to-entity.png and /dev/null differ diff --git a/docs/imgs/glossary/create-from-node.png b/docs/imgs/glossary/create-from-node.png deleted file mode 100644 index 70638d083343c..0000000000000 Binary files 
a/docs/imgs/glossary/create-from-node.png and /dev/null differ diff --git a/docs/imgs/glossary/create-modal.png b/docs/imgs/glossary/create-modal.png deleted file mode 100644 index e84fb5a36e2d4..0000000000000 Binary files a/docs/imgs/glossary/create-modal.png and /dev/null differ diff --git a/docs/imgs/glossary/delete-button.png b/docs/imgs/glossary/delete-button.png deleted file mode 100644 index 3e0cc2a5b0a54..0000000000000 Binary files a/docs/imgs/glossary/delete-button.png and /dev/null differ diff --git a/docs/imgs/glossary/edit-term.png b/docs/imgs/glossary/edit-term.png deleted file mode 100644 index 62b0e425c8c4f..0000000000000 Binary files a/docs/imgs/glossary/edit-term.png and /dev/null differ diff --git a/docs/imgs/glossary/glossary-button.png b/docs/imgs/glossary/glossary-button.png deleted file mode 100644 index e4b8fd2393587..0000000000000 Binary files a/docs/imgs/glossary/glossary-button.png and /dev/null differ diff --git a/docs/imgs/glossary/move-term-button.png b/docs/imgs/glossary/move-term-button.png deleted file mode 100644 index df03c820340ef..0000000000000 Binary files a/docs/imgs/glossary/move-term-button.png and /dev/null differ diff --git a/docs/imgs/glossary/move-term-modal.png b/docs/imgs/glossary/move-term-modal.png deleted file mode 100644 index 0fda501911b2b..0000000000000 Binary files a/docs/imgs/glossary/move-term-modal.png and /dev/null differ diff --git a/docs/imgs/glossary/root-glossary-create.png b/docs/imgs/glossary/root-glossary-create.png deleted file mode 100644 index c91f397eb6213..0000000000000 Binary files a/docs/imgs/glossary/root-glossary-create.png and /dev/null differ diff --git a/docs/imgs/glossary/root-glossary.png b/docs/imgs/glossary/root-glossary.png deleted file mode 100644 index 1296c16b0dc3d..0000000000000 Binary files a/docs/imgs/glossary/root-glossary.png and /dev/null differ diff --git a/docs/imgs/ingestion-architecture.png b/docs/imgs/ingestion-architecture.png deleted file mode 100644 index fc7bc74acacfa..0000000000000 Binary files a/docs/imgs/ingestion-architecture.png and /dev/null differ diff --git a/docs/imgs/ingestion-logs.png b/docs/imgs/ingestion-logs.png deleted file mode 100644 index 42211be7379d6..0000000000000 Binary files a/docs/imgs/ingestion-logs.png and /dev/null differ diff --git a/docs/imgs/ingestion-privileges.png b/docs/imgs/ingestion-privileges.png deleted file mode 100644 index 8e23868309676..0000000000000 Binary files a/docs/imgs/ingestion-privileges.png and /dev/null differ diff --git a/docs/imgs/ingestion-tab.png b/docs/imgs/ingestion-tab.png deleted file mode 100644 index 046068c63bdb7..0000000000000 Binary files a/docs/imgs/ingestion-tab.png and /dev/null differ diff --git a/docs/imgs/ingestion-with-token.png b/docs/imgs/ingestion-with-token.png deleted file mode 100644 index 5e1a2cce036f7..0000000000000 Binary files a/docs/imgs/ingestion-with-token.png and /dev/null differ diff --git a/docs/imgs/invite-users-button.png b/docs/imgs/invite-users-button.png deleted file mode 100644 index a5d07a1c1e7e7..0000000000000 Binary files a/docs/imgs/invite-users-button.png and /dev/null differ diff --git a/docs/imgs/invite-users-popup.png b/docs/imgs/invite-users-popup.png deleted file mode 100644 index 621b1521eae75..0000000000000 Binary files a/docs/imgs/invite-users-popup.png and /dev/null differ diff --git a/docs/imgs/lineage.png b/docs/imgs/lineage.png deleted file mode 100644 index 7488c1e04c31b..0000000000000 Binary files a/docs/imgs/lineage.png and /dev/null differ diff --git a/docs/imgs/list-domains.png 
b/docs/imgs/list-domains.png deleted file mode 100644 index 98a28130f8c99..0000000000000 Binary files a/docs/imgs/list-domains.png and /dev/null differ diff --git a/docs/imgs/locust-example.png b/docs/imgs/locust-example.png deleted file mode 100644 index bbae3e0ca19d0..0000000000000 Binary files a/docs/imgs/locust-example.png and /dev/null differ diff --git a/docs/imgs/metadata-model-chart.png b/docs/imgs/metadata-model-chart.png deleted file mode 100644 index 2fb7483654906..0000000000000 Binary files a/docs/imgs/metadata-model-chart.png and /dev/null differ diff --git a/docs/imgs/metadata-model-to-fork-or-not-to.png b/docs/imgs/metadata-model-to-fork-or-not-to.png deleted file mode 100644 index f9d89d555196d..0000000000000 Binary files a/docs/imgs/metadata-model-to-fork-or-not-to.png and /dev/null differ diff --git a/docs/imgs/metadata-modeling.png b/docs/imgs/metadata-modeling.png deleted file mode 100644 index cbad7613e04e4..0000000000000 Binary files a/docs/imgs/metadata-modeling.png and /dev/null differ diff --git a/docs/imgs/metadata-service-auth.png b/docs/imgs/metadata-service-auth.png deleted file mode 100644 index 15a3ac51876c2..0000000000000 Binary files a/docs/imgs/metadata-service-auth.png and /dev/null differ diff --git a/docs/imgs/metadata-serving.png b/docs/imgs/metadata-serving.png deleted file mode 100644 index 54b928a0cff52..0000000000000 Binary files a/docs/imgs/metadata-serving.png and /dev/null differ diff --git a/docs/imgs/metadata.png b/docs/imgs/metadata.png deleted file mode 100644 index 45bb0cdce12e9..0000000000000 Binary files a/docs/imgs/metadata.png and /dev/null differ diff --git a/docs/imgs/name-ingestion-source.png b/docs/imgs/name-ingestion-source.png deleted file mode 100644 index bde1208248473..0000000000000 Binary files a/docs/imgs/name-ingestion-source.png and /dev/null differ diff --git a/docs/imgs/no-code-after.png b/docs/imgs/no-code-after.png deleted file mode 100644 index c0eee88625ace..0000000000000 Binary files a/docs/imgs/no-code-after.png and /dev/null differ diff --git a/docs/imgs/no-code-before.png b/docs/imgs/no-code-before.png deleted file mode 100644 index 50315578b1804..0000000000000 Binary files a/docs/imgs/no-code-before.png and /dev/null differ diff --git a/docs/imgs/platform-instances-for-ingestion.png b/docs/imgs/platform-instances-for-ingestion.png deleted file mode 100644 index 740249a805fb8..0000000000000 Binary files a/docs/imgs/platform-instances-for-ingestion.png and /dev/null differ diff --git a/docs/imgs/quickstart-ingestion-config.png b/docs/imgs/quickstart-ingestion-config.png deleted file mode 100644 index de51777ccddc3..0000000000000 Binary files a/docs/imgs/quickstart-ingestion-config.png and /dev/null differ diff --git a/docs/imgs/reset-credentials-screen.png b/docs/imgs/reset-credentials-screen.png deleted file mode 100644 index 4b680837b77ab..0000000000000 Binary files a/docs/imgs/reset-credentials-screen.png and /dev/null differ diff --git a/docs/imgs/reset-user-password-button.png b/docs/imgs/reset-user-password-button.png deleted file mode 100644 index 5b1f3ee153d07..0000000000000 Binary files a/docs/imgs/reset-user-password-button.png and /dev/null differ diff --git a/docs/imgs/reset-user-password-popup.png b/docs/imgs/reset-user-password-popup.png deleted file mode 100644 index ac2456dde4d4d..0000000000000 Binary files a/docs/imgs/reset-user-password-popup.png and /dev/null differ diff --git a/docs/imgs/running-ingestion.png b/docs/imgs/running-ingestion.png deleted file mode 100644 index 
a03fb444a029e..0000000000000 Binary files a/docs/imgs/running-ingestion.png and /dev/null differ diff --git a/docs/imgs/s3-ingestion/10_outputs.png b/docs/imgs/s3-ingestion/10_outputs.png deleted file mode 100644 index e0d1ed3376ade..0000000000000 Binary files a/docs/imgs/s3-ingestion/10_outputs.png and /dev/null differ diff --git a/docs/imgs/s3-ingestion/1_crawler-info.png b/docs/imgs/s3-ingestion/1_crawler-info.png deleted file mode 100644 index 1288247392047..0000000000000 Binary files a/docs/imgs/s3-ingestion/1_crawler-info.png and /dev/null differ diff --git a/docs/imgs/s3-ingestion/2_crawler-type.png b/docs/imgs/s3-ingestion/2_crawler-type.png deleted file mode 100644 index 4898438417913..0000000000000 Binary files a/docs/imgs/s3-ingestion/2_crawler-type.png and /dev/null differ diff --git a/docs/imgs/s3-ingestion/3_data-store.png b/docs/imgs/s3-ingestion/3_data-store.png deleted file mode 100644 index d29e4b1be05d6..0000000000000 Binary files a/docs/imgs/s3-ingestion/3_data-store.png and /dev/null differ diff --git a/docs/imgs/s3-ingestion/4_data-store-2.png b/docs/imgs/s3-ingestion/4_data-store-2.png deleted file mode 100644 index c0a6f140bedb2..0000000000000 Binary files a/docs/imgs/s3-ingestion/4_data-store-2.png and /dev/null differ diff --git a/docs/imgs/s3-ingestion/5_iam.png b/docs/imgs/s3-ingestion/5_iam.png deleted file mode 100644 index 73a631cb74f56..0000000000000 Binary files a/docs/imgs/s3-ingestion/5_iam.png and /dev/null differ diff --git a/docs/imgs/s3-ingestion/6_schedule.png b/docs/imgs/s3-ingestion/6_schedule.png deleted file mode 100644 index c5df59348fbc6..0000000000000 Binary files a/docs/imgs/s3-ingestion/6_schedule.png and /dev/null differ diff --git a/docs/imgs/s3-ingestion/7_output.png b/docs/imgs/s3-ingestion/7_output.png deleted file mode 100644 index 6201fa40bcfb3..0000000000000 Binary files a/docs/imgs/s3-ingestion/7_output.png and /dev/null differ diff --git a/docs/imgs/s3-ingestion/8_review.png b/docs/imgs/s3-ingestion/8_review.png deleted file mode 100644 index 2d27e79c2128b..0000000000000 Binary files a/docs/imgs/s3-ingestion/8_review.png and /dev/null differ diff --git a/docs/imgs/s3-ingestion/9_run.png b/docs/imgs/s3-ingestion/9_run.png deleted file mode 100644 index 2b0644f6ad038..0000000000000 Binary files a/docs/imgs/s3-ingestion/9_run.png and /dev/null differ diff --git a/docs/imgs/schedule-ingestion.png b/docs/imgs/schedule-ingestion.png deleted file mode 100644 index 0e6ec8e268c32..0000000000000 Binary files a/docs/imgs/schedule-ingestion.png and /dev/null differ diff --git a/docs/imgs/schema-blame-blame-activated.png b/docs/imgs/schema-blame-blame-activated.png deleted file mode 100644 index 363466c39aedf..0000000000000 Binary files a/docs/imgs/schema-blame-blame-activated.png and /dev/null differ diff --git a/docs/imgs/schema-history-audit-activated.png b/docs/imgs/schema-history-audit-activated.png deleted file mode 100644 index f59676b9b8a8f..0000000000000 Binary files a/docs/imgs/schema-history-audit-activated.png and /dev/null differ diff --git a/docs/imgs/schema-history-latest-version.png b/docs/imgs/schema-history-latest-version.png deleted file mode 100644 index 0a54df4d520d5..0000000000000 Binary files a/docs/imgs/schema-history-latest-version.png and /dev/null differ diff --git a/docs/imgs/schema-history-older-version.png b/docs/imgs/schema-history-older-version.png deleted file mode 100644 index 8d295f176104f..0000000000000 Binary files a/docs/imgs/schema-history-older-version.png and /dev/null differ diff --git 
a/docs/imgs/search-by-domain.png b/docs/imgs/search-by-domain.png deleted file mode 100644 index 4b92e58959187..0000000000000 Binary files a/docs/imgs/search-by-domain.png and /dev/null differ diff --git a/docs/imgs/search-domain.png b/docs/imgs/search-domain.png deleted file mode 100644 index b1359e07d5fc2..0000000000000 Binary files a/docs/imgs/search-domain.png and /dev/null differ diff --git a/docs/imgs/search-tag.png b/docs/imgs/search-tag.png deleted file mode 100644 index cf4b6b629d1e2..0000000000000 Binary files a/docs/imgs/search-tag.png and /dev/null differ diff --git a/docs/imgs/select-platform-template.png b/docs/imgs/select-platform-template.png deleted file mode 100644 index 4f78e2b7309ed..0000000000000 Binary files a/docs/imgs/select-platform-template.png and /dev/null differ diff --git a/docs/imgs/set-domain-id.png b/docs/imgs/set-domain-id.png deleted file mode 100644 index 3e1dde4ae51ee..0000000000000 Binary files a/docs/imgs/set-domain-id.png and /dev/null differ diff --git a/docs/imgs/set-domain.png b/docs/imgs/set-domain.png deleted file mode 100644 index 1c4460e747835..0000000000000 Binary files a/docs/imgs/set-domain.png and /dev/null differ diff --git a/docs/imgs/successful-ingestion.png b/docs/imgs/successful-ingestion.png deleted file mode 100644 index fa8dbdff7501e..0000000000000 Binary files a/docs/imgs/successful-ingestion.png and /dev/null differ diff --git a/docs/imgs/timeline/dropdown-apis.png b/docs/imgs/timeline/dropdown-apis.png deleted file mode 100644 index f7aba08bbc061..0000000000000 Binary files a/docs/imgs/timeline/dropdown-apis.png and /dev/null differ diff --git a/docs/imgs/timeline/swagger-ui.png b/docs/imgs/timeline/swagger-ui.png deleted file mode 100644 index e52a57e8ca670..0000000000000 Binary files a/docs/imgs/timeline/swagger-ui.png and /dev/null differ diff --git a/docs/imgs/timeline/timeline-conceptually.png b/docs/imgs/timeline/timeline-conceptually.png deleted file mode 100644 index 70bd843bf8aed..0000000000000 Binary files a/docs/imgs/timeline/timeline-conceptually.png and /dev/null differ diff --git a/docs/imgs/user-sign-up-screen.png b/docs/imgs/user-sign-up-screen.png deleted file mode 100644 index 88c2589203bd1..0000000000000 Binary files a/docs/imgs/user-sign-up-screen.png and /dev/null differ diff --git a/docs/lineage/airflow.md b/docs/lineage/airflow.md index ef4071f89c585..49de5352f6d58 100644 --- a/docs/lineage/airflow.md +++ b/docs/lineage/airflow.md @@ -62,9 +62,10 @@ lazy_load_plugins = False | datahub.cluster | prod | name of the airflow cluster | | datahub.capture_ownership_info | true | If true, the owners field of the DAG will be capture as a DataHub corpuser. | | datahub.capture_tags_info | true | If true, the tags field of the DAG will be captured as DataHub tags. | + | datahub.capture_executions | true | If true, we'll capture task runs in DataHub in addition to DAG definitions. | | datahub.graceful_exceptions | true | If set to true, most runtime errors in the lineage backend will be suppressed and will not cause the overall task to fail. Note that configuration issues will still throw exceptions. | -5. Configure `inlets` and `outlets` for your Airflow operators. 
For reference, look at the sample DAG in [`lineage_backend_demo.py`](../../metadata-ingestion/src/datahub_provider/example_dags/lineage_backend_demo.py), or reference [`lineage_backend_taskflow_demo.py`](../../metadata-ingestion/src/datahub_provider/example_dags/lineage_backend_taskflow_demo.py) if you're using the [TaskFlow API](https://airflow.apache.org/docs/apache-airflow/stable/concepts/taskflow.html). +5. Configure `inlets` and `outlets` for your Airflow operators. For reference, look at the sample DAG in [`lineage_backend_demo.py`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_demo.py), or reference [`lineage_backend_taskflow_demo.py`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_taskflow_demo.py) if you're using the [TaskFlow API](https://airflow.apache.org/docs/apache-airflow/stable/concepts/taskflow.html). 6. [optional] Learn more about [Airflow lineage](https://airflow.apache.org/docs/apache-airflow/stable/lineage.html), including shorthand notation and some automation. ### How to validate installation @@ -80,9 +81,7 @@ Emitting DataHub ... If you have created a custom Airflow operator [docs](https://airflow.apache.org/docs/apache-airflow/stable/howto/custom-operator.html) that inherits from the BaseOperator class, when overriding the `execute` function, set inlets and outlets via `context['ti'].task.inlets` and `context['ti'].task.outlets`. -The DataHub Airflow plugin will then pick up those inlets and outlets after the task runs. - - +The DataHub Airflow plugin will then pick up those inlets and outlets after the task runs. ```python class DbtOperator(BaseOperator): @@ -97,8 +96,8 @@ class DbtOperator(BaseOperator): def _get_lineage(self): # Do some processing to get inlets/outlets - - return inlets, outlets + + return inlets, outlets ``` If you override the `pre_execute` and `post_execute` function, ensure they include the `@prepare_lineage` and `@apply_lineage` decorators respectively. [source](https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/lineage.html#lineage) @@ -161,18 +160,17 @@ pip install acryl-datahub[airflow,datahub-kafka] - `capture_executions` (defaults to false): If true, it captures task runs as DataHub DataProcessInstances. - `graceful_exceptions` (defaults to true): If set to true, most runtime errors in the lineage backend will be suppressed and will not cause the overall task to fail. Note that configuration issues will still throw exceptions. -4. Configure `inlets` and `outlets` for your Airflow operators. For reference, look at the sample DAG in [`lineage_backend_demo.py`](../../metadata-ingestion/src/datahub_provider/example_dags/lineage_backend_demo.py), or reference [`lineage_backend_taskflow_demo.py`](../../metadata-ingestion/src/datahub_provider/example_dags/lineage_backend_taskflow_demo.py) if you're using the [TaskFlow API](https://airflow.apache.org/docs/apache-airflow/stable/concepts/taskflow.html). +4. Configure `inlets` and `outlets` for your Airflow operators. 
For reference, look at the sample DAG in [`lineage_backend_demo.py`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_demo.py), or reference [`lineage_backend_taskflow_demo.py`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_taskflow_demo.py) if you're using the [TaskFlow API](https://airflow.apache.org/docs/apache-airflow/stable/concepts/taskflow.html). 5. [optional] Learn more about [Airflow lineage](https://airflow.apache.org/docs/apache-airflow/stable/lineage.html), including shorthand notation and some automation. ## Emitting lineage via a separate operator Take a look at this sample DAG: -- [`lineage_emission_dag.py`](../../metadata-ingestion/src/datahub_provider/example_dags/lineage_emission_dag.py) - emits lineage using the DatahubEmitterOperator. +- [`lineage_emission_dag.py`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_emission_dag.py) - emits lineage using the DatahubEmitterOperator. In order to use this example, you must first configure the Datahub hook. Like in ingestion, we support a Datahub REST hook and a Kafka-based hook. See step 1 above for details. - ## Debugging ### Incorrect URLs diff --git a/docs/links.md b/docs/links.md index f175262b9b5d9..45ba391e557cd 100644 --- a/docs/links.md +++ b/docs/links.md @@ -39,7 +39,7 @@ * [Creating Notebook-based Dynamic Dashboards](https://towardsdatascience.com/creating-notebook-based-dynamic-dashboards-91f936adc6f3) ## Talks & Presentations -* [DataHub: Powering LinkedIn's Metadata](demo/DataHub_-_Powering_LinkedIn_Metadata.pdf) @ [Budapest Data Forum 2020](https://budapestdata.hu/2020/en/) +* [DataHub: Powering LinkedIn's Metadata](https://github.com/acryldata/static-assets-test/raw/master/imgs/demo/DataHub_-_Powering_LinkedIn_Metadata.pdf) @ [Budapest Data Forum 2020](https://budapestdata.hu/2020/en/) * [Taming the Data Beast Using DataHub](https://www.youtube.com/watch?v=bo4OhiPro7Y) @ [Data Engineering Melbourne Meetup November 2020](https://www.meetup.com/Data-Engineering-Melbourne/events/kgnvlrybcpbjc/) * [Metadata Management And Integration At LinkedIn With DataHub](https://www.dataengineeringpodcast.com/datahub-metadata-management-episode-147/) @ [Data Engineering Podcast](https://www.dataengineeringpodcast.com) * [The evolution of metadata: LinkedIn’s story](https://speakerdeck.com/shirshanka/the-evolution-of-metadata-linkedins-journey-strata-nyc-2019) @ [Strata Data Conference 2019](https://conferences.oreilly.com/strata/strata-ny-2019.html) diff --git a/docs/managed-datahub/chrome-extension.md b/docs/managed-datahub/chrome-extension.md index a614327c7fd29..0aa0860d03b67 100644 --- a/docs/managed-datahub/chrome-extension.md +++ b/docs/managed-datahub/chrome-extension.md @@ -10,7 +10,11 @@ import FeatureAvailability from '@site/src/components/FeatureAvailability'; In order to use the Acryl DataHub Chrome extension, you need to download it onto your browser from the Chrome web store [here](https://chrome.google.com/webstore/detail/datahub-chrome-extension/aoenebhmfokhglijmoacfjcnebdpchfj). -![](imgs/saas/chrome-store-extension-screenshot.png) + +

+ +

+ Simply click "Add to Chrome" then "Add extension" on the ensuing popup. @@ -20,11 +24,19 @@ Once you have your extension installed, you'll need to configure it to work with 1. Click the extension button on the right of your browser's address bar to view all of your installed extensions. Click on the newly installed DataHub extension. -![](imgs/saas/extension_open_popup.png) + +

+ +

+ 2. Fill in your DataHub domain and click "Continue" in the extension popup that appears. -![](imgs/saas/extension_enter_domain.png) + +

+ +

+ If your organization uses standard SaaS domains for Looker, you should be ready to go! @@ -34,11 +46,19 @@ Some organizations have custom SaaS domains for Looker and some Acryl DataHub de 1. Click on the extension button and select your DataHub extension to open the popup again. Now click the settings icon in order to open the configurations page. -![](imgs/saas/extension_open_options_page.png) + +

+ +

+ +2. Fill out and save any custom configurations you have in the **TOOL CONFIGURATIONS** section. Here you can configure a custom domain, a Platform Instance associated with that domain, and the Environment set on your DataHub assets. If you don't have a custom domain but do have a custom Platform Instance or Environment, feel free to leave the domain field empty. -![](imgs/saas/extension_custom_configs.png) + +

+ +

+ ## Using the Extension @@ -52,7 +72,11 @@ Once you have everything configured on your extension, it's time to use it! 4. Click the Acryl DataHub extension button on the bottom right of your page to open a drawer where you can now see additional information about this asset right from your DataHub instance. -![](imgs/saas/extension_view_in_looker.png) + +

+ +

+ ## Advanced: Self-Hosted DataHub diff --git a/docs/managed-datahub/datahub-api/graphql-api/getting-started.md b/docs/managed-datahub/datahub-api/graphql-api/getting-started.md index 3c57b0a21d96e..736bf6fea6811 100644 --- a/docs/managed-datahub/datahub-api/graphql-api/getting-started.md +++ b/docs/managed-datahub/datahub-api/graphql-api/getting-started.md @@ -10,7 +10,11 @@ For a full reference to the Queries & Mutations available for consumption, check ### Connecting to the API -![](../../imgs/saas/image-(3).png) + +

+ +

+ When you generate the token you will see an example of `curl` command which you can use to connect to the GraphQL API. diff --git a/docs/managed-datahub/datahub-api/graphql-api/incidents-api-beta.md b/docs/managed-datahub/datahub-api/graphql-api/incidents-api-beta.md index 89bacb2009e49..16d83d2f57575 100644 --- a/docs/managed-datahub/datahub-api/graphql-api/incidents-api-beta.md +++ b/docs/managed-datahub/datahub-api/graphql-api/incidents-api-beta.md @@ -404,7 +404,11 @@ You can configure Acryl to send slack notifications to a specific channel when i These notifications are also able to tag the immediate asset's owners, along with the owners of downstream assets consuming it. -![](../../imgs/saas/Screen-Shot-2022-03-22-at-6.46.41-PM.png) + +

+ +

+ To do so, simply follow the [Slack Integration Guide](docs/managed-datahub/saas-slack-setup.md) and contact your Acryl customer success team to enable the feature! diff --git a/docs/managed-datahub/imgs/saas/DataHub-Architecture.png b/docs/managed-datahub/imgs/saas/DataHub-Architecture.png deleted file mode 100644 index 95b3ab0b06ad6..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/DataHub-Architecture.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-01-13-at-7.45.56-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-01-13-at-7.45.56-PM.png deleted file mode 100644 index 721989a6c37e1..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-01-13-at-7.45.56-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-01-24-at-4.35.17-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-01-24-at-4.35.17-PM.png deleted file mode 100644 index dffac92f257c7..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-01-24-at-4.35.17-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-01-24-at-4.37.22-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-01-24-at-4.37.22-PM.png deleted file mode 100644 index ff0c29de1fbad..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-01-24-at-4.37.22-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-07-at-10.23.31-AM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-07-at-10.23.31-AM.png deleted file mode 100644 index 070bfd9f6b897..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-07-at-10.23.31-AM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-22-at-6.43.25-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-22-at-6.43.25-PM.png deleted file mode 100644 index b4bb4e2ba60ed..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-22-at-6.43.25-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-22-at-6.44.15-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-22-at-6.44.15-PM.png deleted file mode 100644 index b0397afd1b3a4..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-22-at-6.44.15-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-22-at-6.46.41-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-22-at-6.46.41-PM.png deleted file mode 100644 index 9258badb6f088..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-22-at-6.46.41-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-4.52.55-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-4.52.55-PM.png deleted file mode 100644 index 386b4cdcd9911..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-4.52.55-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-4.56.50-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-4.56.50-PM.png deleted file mode 100644 index a129f5eba4271..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-4.56.50-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-4.58.46-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-4.58.46-PM.png 
deleted file mode 100644 index 96ae48318a35a..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-4.58.46-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-5.01.16-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-5.01.16-PM.png deleted file mode 100644 index b6fd273389c90..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-5.01.16-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-5.03.36-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-5.03.36-PM.png deleted file mode 100644 index 0acd4e75bc6d2..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-5.03.36-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-13-at-2.34.24-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-13-at-2.34.24-PM.png deleted file mode 100644 index 364b9292cfaab..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-13-at-2.34.24-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-13-at-7.56.16-AM-(1).png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-13-at-7.56.16-AM-(1).png deleted file mode 100644 index 6a12dc545ec62..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-13-at-7.56.16-AM-(1).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-13-at-7.56.16-AM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-13-at-7.56.16-AM.png deleted file mode 100644 index 6a12dc545ec62..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-13-at-7.56.16-AM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-13-at-8.02.55-AM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-13-at-8.02.55-AM.png deleted file mode 100644 index 83645e00d724a..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-13-at-8.02.55-AM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-24-at-11.02.47-AM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-24-at-11.02.47-AM.png deleted file mode 100644 index a2f239ce847e0..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-24-at-11.02.47-AM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-24-at-12.59.38-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-24-at-12.59.38-PM.png deleted file mode 100644 index e31d4b089d929..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-24-at-12.59.38-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.21.42-AM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.21.42-AM.png deleted file mode 100644 index c003581c9d1b6..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.21.42-AM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.22.23-AM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.22.23-AM.png deleted file mode 100644 index 660dd121dd0a4..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.22.23-AM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.23.08-AM.png 
b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.23.08-AM.png deleted file mode 100644 index 07e3c71dba262..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.23.08-AM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.47.57-AM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.47.57-AM.png deleted file mode 100644 index 579e7f62af708..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.47.57-AM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-29-at-6.07.25-PM-(1).png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-29-at-6.07.25-PM-(1).png deleted file mode 100644 index f85f4d5c79bfb..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-29-at-6.07.25-PM-(1).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-29-at-6.07.25-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-29-at-6.07.25-PM.png deleted file mode 100644 index f85f4d5c79bfb..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-29-at-6.07.25-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-4.16.52-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-4.16.52-PM.png deleted file mode 100644 index cb8b7470cd957..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-4.16.52-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-4.23.32-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-4.23.32-PM.png deleted file mode 100644 index 1de51e33d87c2..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-4.23.32-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-5.12.47-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-5.12.47-PM.png deleted file mode 100644 index df687dabe345c..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-5.12.47-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-5.12.56-PM-(1).png b/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-5.12.56-PM-(1).png deleted file mode 100644 index a8d9ee37c7a55..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-5.12.56-PM-(1).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-5.12.56-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-5.12.56-PM.png deleted file mode 100644 index a8d9ee37c7a55..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-5.12.56-PM.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Untitled(1).png b/docs/managed-datahub/imgs/saas/Untitled(1).png deleted file mode 100644 index 87846e7897f6e..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Untitled(1).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Untitled-(2)-(1).png b/docs/managed-datahub/imgs/saas/Untitled-(2)-(1).png deleted file mode 100644 index 7715bf4a51fbe..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Untitled-(2)-(1).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Untitled-(2).png b/docs/managed-datahub/imgs/saas/Untitled-(2).png deleted file mode 100644 index 
a01a1af370442..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Untitled-(2).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Untitled-(3).png b/docs/managed-datahub/imgs/saas/Untitled-(3).png deleted file mode 100644 index 02d84b326896c..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Untitled-(3).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Untitled-(4).png b/docs/managed-datahub/imgs/saas/Untitled-(4).png deleted file mode 100644 index a01a1af370442..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Untitled-(4).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/Untitled.png b/docs/managed-datahub/imgs/saas/Untitled.png deleted file mode 100644 index a01a1af370442..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/Untitled.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/chrome-store-extension-screenshot.png b/docs/managed-datahub/imgs/saas/chrome-store-extension-screenshot.png deleted file mode 100644 index e00a4d57f32dd..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/chrome-store-extension-screenshot.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/extension_custom_configs.png b/docs/managed-datahub/imgs/saas/extension_custom_configs.png deleted file mode 100644 index b3d70dfac00ff..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/extension_custom_configs.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/extension_developer_mode.png b/docs/managed-datahub/imgs/saas/extension_developer_mode.png deleted file mode 100644 index e740d15912e17..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/extension_developer_mode.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/extension_enter_domain.png b/docs/managed-datahub/imgs/saas/extension_enter_domain.png deleted file mode 100644 index 3304fa168beaf..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/extension_enter_domain.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/extension_load_unpacked.png b/docs/managed-datahub/imgs/saas/extension_load_unpacked.png deleted file mode 100644 index 8f56705cd9176..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/extension_load_unpacked.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/extension_open_options_page.png b/docs/managed-datahub/imgs/saas/extension_open_options_page.png deleted file mode 100644 index c1366d5673b59..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/extension_open_options_page.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/extension_open_popup.png b/docs/managed-datahub/imgs/saas/extension_open_popup.png deleted file mode 100644 index 216056b847fb5..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/extension_open_popup.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/extension_view_in_looker.png b/docs/managed-datahub/imgs/saas/extension_view_in_looker.png deleted file mode 100644 index bf854b3e840f7..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/extension_view_in_looker.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/home-(1).png b/docs/managed-datahub/imgs/saas/home-(1).png deleted file mode 100644 index 88cf2017dd7e7..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/home-(1).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/home.png b/docs/managed-datahub/imgs/saas/home.png deleted 
file mode 100644 index 8ad63deec75c9..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/home.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(1).png b/docs/managed-datahub/imgs/saas/image-(1).png deleted file mode 100644 index c1a249125fcf7..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(1).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(10).png b/docs/managed-datahub/imgs/saas/image-(10).png deleted file mode 100644 index a580fdc3d6730..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(10).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(11).png b/docs/managed-datahub/imgs/saas/image-(11).png deleted file mode 100644 index ee95eb4384272..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(11).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(12).png b/docs/managed-datahub/imgs/saas/image-(12).png deleted file mode 100644 index bbd8e6a66cf85..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(12).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(13).png b/docs/managed-datahub/imgs/saas/image-(13).png deleted file mode 100644 index bbd8e6a66cf85..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(13).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(14).png b/docs/managed-datahub/imgs/saas/image-(14).png deleted file mode 100644 index a580fdc3d6730..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(14).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(15).png b/docs/managed-datahub/imgs/saas/image-(15).png deleted file mode 100644 index f282e2d92c1a1..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(15).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(16).png b/docs/managed-datahub/imgs/saas/image-(16).png deleted file mode 100644 index 1340c77bd648c..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(16).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(17).png b/docs/managed-datahub/imgs/saas/image-(17).png deleted file mode 100644 index 6eee2fb2d821f..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(17).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(2).png b/docs/managed-datahub/imgs/saas/image-(2).png deleted file mode 100644 index cf475edd7b95d..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(2).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(3).png b/docs/managed-datahub/imgs/saas/image-(3).png deleted file mode 100644 index b08818ff3e97c..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(3).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(4).png b/docs/managed-datahub/imgs/saas/image-(4).png deleted file mode 100644 index a580fdc3d6730..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(4).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(5).png b/docs/managed-datahub/imgs/saas/image-(5).png deleted file mode 100644 index 48438c6001e4f..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(5).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(6).png b/docs/managed-datahub/imgs/saas/image-(6).png deleted file mode 100644 index 54e569e853f24..0000000000000 Binary files 
a/docs/managed-datahub/imgs/saas/image-(6).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(7).png b/docs/managed-datahub/imgs/saas/image-(7).png deleted file mode 100644 index 6e89e5881cfa7..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(7).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(8).png b/docs/managed-datahub/imgs/saas/image-(8).png deleted file mode 100644 index ee0a3c89d58fa..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(8).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image-(9).png b/docs/managed-datahub/imgs/saas/image-(9).png deleted file mode 100644 index 301ca98593ef9..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image-(9).png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/image.png b/docs/managed-datahub/imgs/saas/image.png deleted file mode 100644 index a1cfc3e74c5dd..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/image.png and /dev/null differ diff --git a/docs/managed-datahub/imgs/saas/settings.png b/docs/managed-datahub/imgs/saas/settings.png deleted file mode 100644 index ca99984abbbc9..0000000000000 Binary files a/docs/managed-datahub/imgs/saas/settings.png and /dev/null differ diff --git a/docs/managed-datahub/integrations/oidc-sso-integration.md b/docs/managed-datahub/integrations/oidc-sso-integration.md index 6a9e085186b44..ec4ca311a0de5 100644 --- a/docs/managed-datahub/integrations/oidc-sso-integration.md +++ b/docs/managed-datahub/integrations/oidc-sso-integration.md @@ -42,4 +42,8 @@ To enable the OIDC integration, start by navigating to **Settings > Platform > S 4. If there are any advanced settings you would like to configure, click on the **Advanced** button. These come with defaults, so only input settings here if there is something you need changed from the default configuration. 5. Click **Update** to save your settings. -![](../imgs/saas/image-(10).png) + +

+ +

+ diff --git a/docs/managed-datahub/metadata-ingestion-with-acryl/ingestion.md b/docs/managed-datahub/metadata-ingestion-with-acryl/ingestion.md index 95ca6e5e33e16..0444d15b3627c 100644 --- a/docs/managed-datahub/metadata-ingestion-with-acryl/ingestion.md +++ b/docs/managed-datahub/metadata-ingestion-with-acryl/ingestion.md @@ -56,9 +56,17 @@ In Acryl DataHub deployments, you _must_ use a sink of type `datahub-rest`, whic 2. **token**: a unique API key used to authenticate requests to your instance's REST API The token can be retrieved by logging in as admin. You can go to Settings page and generate a Personal Access Token with your desired expiration date. -![](../imgs/saas/home-(1).png) -![](../imgs/saas/settings.png) +

+ +

+ + + +

+ +

+ To configure your instance of DataHub as the destination for ingestion, set the "server" field of your recipe to point to your Acryl instance's domain suffixed by the path `/gms`, as shown below. A complete example of a DataHub recipe file, which reads from MySQL and writes into a DataHub instance: diff --git a/docs/managed-datahub/observe/freshness-assertions.md b/docs/managed-datahub/observe/freshness-assertions.md index 54b3134151d3a..c5d4ca9081b43 100644 --- a/docs/managed-datahub/observe/freshness-assertions.md +++ b/docs/managed-datahub/observe/freshness-assertions.md @@ -59,7 +59,7 @@ Tables. For example, imagine that we work for a company with a Snowflake Table that stores user clicks collected from our e-commerce website. This table is updated with new data on a specific cadence: once per hour (In practice, daily or even weekly are also common). In turn, there is a downstream Business Analytics Dashboard in Looker that shows important metrics like -the number of people clicking our "Daily Sale" banners, and this dashboard pulls is generated from data stored in our "clicks" table. +the number of people clicking our "Daily Sale" banners, and this dashboard is generated from data stored in our "clicks" table. It is important that our clicks Table continues to be updated each hour because if it stops being updated, it could mean that our downstream metrics dashboard becomes incorrect. And the risk of this situation is obvious: our organization may make bad decisions based on incomplete information. @@ -122,8 +122,12 @@ Change Source types vary by the platform, but generally fall into these categori is higher than the previously observed value, in order to determine whether the Table has been changed within a given period of time. Note that this approach is only supported if the Change Window does not use a fixed interval. - Using the final 2 approaches - column value queries - to determine whether a Table has changed useful because it can be customized to determine whether - specific types of important changes have been made to a given Table. + - **DataHub Operation**: A DataHub "Operation" aspect contains timeseries information used to describe changes made to an entity. Using this + option avoids contacting your data platform, and instead uses the DataHub Operation metadata to evaluate Freshness Assertions. + This relies on Operations being reported to DataHub, either via ingestion or via use of the DataHub APIs (see [Report Operation via API](#reporting-operations-via-api)). + Note if you have not configured an ingestion source through DataHub, then this may be the only option available. + + Using either of the column value approaches (**Last Modified Column** or **High Watermark Column**) to determine whether a Table has changed can be useful because it can be customized to determine whether specific types of important changes have been made to a given Table. Because it does not involve system warehouse tables, it is also easily portable across Data Warehouse and Data Lake providers. Freshness Assertions also have an off switch: they can be started or stopped at any time with the click of button. @@ -178,7 +182,7 @@ _Check whether the table has changed in a specific window of time_ 7. (Optional) Click **Advanced** to customize the evaluation **source**. This is the mechanism that will be used to evaluate -the check. Each Data Platform supports different options including Audit Log, Information Schema, Last Modified Column, and High Watermark Column. +the check. 
Each Data Platform supports different options including Audit Log, Information Schema, Last Modified Column, High Watermark Column, and DataHub Operation.

@@ -189,11 +193,12 @@ the check. Each Data Platform supports different options including Audit Log, In - **Last Modified Column**: Check for the presence of rows using a "Last Modified Time" column, which should reflect the time at which a given row was last changed in the table, to determine whether the table changed within the evaluation period. - **High Watermark Column**: Monitor changes to a continuously-increasing "high watermark" column value to determine whether a table - has been changed. This option is particularly useful for tables that grow consistently with time, for example fact or event (e.g. click-strea) tables. It is not available + has been changed. This option is particularly useful for tables that grow consistently with time, for example fact or event (e.g. click-stream) tables. It is not available when using a fixed lookback period. +- **DataHub Operation**: Use DataHub Operations to determine whether the table changed within the evaluation period. -8. Click **Next** -9. Configure actions that should be taken when the Freshness Assertion passes or fails +1. Click **Next** +2. Configure actions that should be taken when the Freshness Assertion passes or fails

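For intuition only, here is a minimal, hypothetical sketch of the kind of query a **Last Modified Column** source conceptually issues when evaluating a Freshness Assertion. The table and column names are made up, and this is not Acryl DataHub's actual implementation:

```python
from datetime import datetime, timedelta, timezone

# Hypothetical illustration: a "Last Modified Column"-style freshness check.
# Table/column names are placeholders; the real evaluation is handled by Acryl DataHub.
EVALUATION_WINDOW = timedelta(hours=8)

def last_modified_check_sql(table: str, last_modified_col: str) -> str:
    """Build SQL that counts rows whose last-modified time falls inside the evaluation window."""
    window_start = datetime.now(timezone.utc) - EVALUATION_WINDOW
    return (
        f"SELECT COUNT(*) AS changed_rows FROM {table} "
        f"WHERE {last_modified_col} >= TIMESTAMP '{window_start:%Y-%m-%d %H:%M:%S}'"
    )

# A non-zero changed_rows count means the table changed inside the window (assertion passes);
# zero means nothing was modified in time (assertion fails).
print(last_modified_check_sql("analytics.clicks", "last_modified"))
```

The High Watermark Column source follows the same idea, except it compares the latest watermark value against the value observed at the previous evaluation rather than using a time window.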
@@ -280,7 +285,7 @@ Note that to create or delete Assertions and Monitors for a specific entity on D In order to create a Freshness Assertion that is being monitored on a specific **Evaluation Schedule**, you'll need to use 2 GraphQL mutation queries to create a Freshness Assertion entity and create an Assertion Monitor entity responsible for evaluating it. -Start by creating the Freshness Assertion entity using the `createFreshnessAssertion` query and hang on to the 'urn' field of the Assertion entit y +Start by creating the Freshness Assertion entity using the `createFreshnessAssertion` query and hang on to the 'urn' field of the Assertion entity you get back. Then continue by creating a Monitor entity using the `createAssertionMonitor`. ##### Examples @@ -291,10 +296,10 @@ To create a Freshness Assertion Entity that checks whether a table has been upda mutation createFreshnessAssertion { createFreshnessAssertion( input: { - entityUrn: "" - type: DATASET_CHANGE + entityUrn: "", + type: DATASET_CHANGE, schedule: { - type: FIXED_INTERVAL + type: FIXED_INTERVAL, fixedInterval: { unit: HOUR, multiple: 8 } } } @@ -337,6 +342,28 @@ After creating the monitor, the new assertion will start to be evaluated every 8 You can delete assertions along with their monitors using GraphQL mutations: `deleteAssertion` and `deleteMonitor`. +### Reporting Operations via API + +DataHub Operations can be used to capture changes made to entities. This is useful for cases where the underlying data platform does not provide a mechanism +to capture changes, or where the data platform's mechanism is not reliable. In order to report an operation, you can use the `reportOperation` GraphQL mutation. + + +##### Examples +```json +mutation reportOperation { + reportOperation( + input: { + urn: "", + operationType: INSERT, + sourceType: DATA_PLATFORM, + timestampMillis: 1693252366489 + } + ) +} +``` + +Use the `timestampMillis` field to specify the time at which the operation occurred. If no value is provided, the current time will be used. + ### Tips :::info diff --git a/docs/managed-datahub/observe/volume-assertions.md b/docs/managed-datahub/observe/volume-assertions.md new file mode 100644 index 0000000000000..5f5aff33a5ce2 --- /dev/null +++ b/docs/managed-datahub/observe/volume-assertions.md @@ -0,0 +1,355 @@ +--- +description: This page provides an overview of working with DataHub Volume Assertions +--- +import FeatureAvailability from '@site/src/components/FeatureAvailability'; + + +# Volume Assertions + + + + +> ⚠️ The **Volume Assertions** feature is currently in private beta, part of the **Acryl Observe** module, and may only be available to a +> limited set of design partners. +> +> If you are interested in trying it and providing feedback, please reach out to your Acryl Customer Success +> representative. + +## Introduction + +Can you remember a time when the meaning of Data Warehouse Table that you depended on fundamentally changed, with little or no notice? +If the answer is yes, how did you find out? We'll take a guess - someone looking at an internal reporting dashboard or worse, a user using your your product, sounded an alarm when +a number looked a bit out of the ordinary. Perhaps your table initially tracked purchases made on your company's e-commerce web store, but suddenly began to include purchases made +through your company's new mobile app. 
+ +There are many reasons why an important Table on Snowflake, Redshift, or BigQuery may change in its meaning - application code bugs, new feature rollouts, +changes to key metric definitions, etc. Often times, these changes break important assumptions made about the data used in building key downstream data products +like reporting dashboards or data-driven product features. + +What if you could reduce the time to detect these incidents, so that the people responsible for the data were made aware of data +issues _before_ anyone else? With Acryl DataHub **Volume Assertions**, you can. + +Acryl DataHub allows users to define expectations about the normal volume, or size, of a particular warehouse Table, +and then monitor those expectations over time as the table grows and changes. + +In this article, we'll cover the basics of monitoring Volume Assertions - what they are, how to configure them, and more - so that you and your team can +start building trust in your most important data assets. + +Let's get started! + +## Support + +Volume Assertions are currently supported for: + +1. Snowflake +2. Redshift +3. BigQuery + +Note that an Ingestion Source _must_ be configured with the data platform of your choice in Acryl DataHub's **Ingestion** +tab. + +> Note that Volume Assertions are not yet supported if you are connecting to your warehouse +> using the DataHub CLI or a Remote Ingestion Executor. + +## What is a Volume Assertion? + +A **Volume Assertion** is a configurable Data Quality rule used to monitor a Data Warehouse Table +for unexpected or sudden changes in "volume", or row count. Volume Assertions can be particularly useful when you have frequently-changing +Tables which have a relatively stable pattern of growth or decline. + +For example, imagine that we work for a company with a Snowflake Table that stores user clicks collected from our e-commerce website. +This table is updated with new data on a specific cadence: once per hour (In practice, daily or even weekly are also common). +In turn, there is a downstream Business Analytics Dashboard in Looker that shows important metrics like +the number of people clicking our "Daily Sale" banners, and this dashboard is generated from data stored in our "clicks" table. +It is important that our clicks Table is updated with the correct number of rows each hour, else it could mean +that our downstream metrics dashboard becomes incorrect. The risk of this situation is obvious: our organization +may make bad decisions based on incomplete information. + +In such cases, we can use a **Volume Assertion** that checks whether the Snowflake "clicks" Table is growing in an expected +way, and that there are no sudden increases or sudden decreases in the rows being added or removed from the table. +If too many rows are added or removed within an hour, we can notify key stakeholders and begin to root cause before the problem impacts stakeholders of the data. + +### Anatomy of a Volume Assertion + +At the most basic level, **Volume Assertions** consist of a few important parts: + +1. An **Evaluation Schedule** +2. A **Volume Condition** +2. A **Volume Source** + +In this section, we'll give an overview of each. + +#### 1. Evaluation Schedule + +The **Evaluation Schedule**: This defines how often to check a given warehouse Table for its volume. This should usually +be configured to match the expected change frequency of the Table, although it can also be less frequently depending +on the requirements. 
You can also specify specific days of the week, hours in the day, or even +minutes in an hour. + + +#### 2. Volume Condition + +The **Volume Condition**: This defines the type of condition that we'd like to monitor, or when the Assertion +should result in failure. + +There are a 2 different categories of conditions: **Total** Volume and **Change** Volume. + +_Total_ volume conditions are those which are defined against the point-in-time total row count for a table. They allow you to specify conditions like: + +1. **Table has too many rows**: The table should always have less than 1000 rows +2. **Table has too few rows**: The table should always have more than 1000 rows +3. **Table row count is outside a range**: The table should always have between 1000 and 2000 rows. + +_Change_ volume conditions are those which are defined against the growth or decline rate of a table, measured between subsequent checks +of the table volume. They allow you to specify conditions like: + +1. **Table growth is too fast**: When the table volume is checked, it should have < 1000 more rows than it had during the previous check. +2. **Table growth is too slow**: When the table volume is checked, it should have > 1000 more rows than it had during the previous check. +3. **Table growth is outside a range**: When the table volume is checked, it should have between 1000 and 2000 more rows than it had during the previous check. + +For change volume conditions, both _absolute_ row count deltas and relative percentage deltas are supported for identifying +table that are following an abnormal pattern of growth. + + +#### 3. Volume Source + +The **Volume Source**: This is the mechanism that Acryl DataHub should use to determine the table volume (row count). The supported +source types vary by the platform, but generally fall into these categories: + +- **Information Schema**: A system Table that is exposed by the Data Warehouse which contains live information about the Databases + and Tables stored inside the Data Warehouse, including their row count. It is usually efficient to check, but can in some cases be slightly delayed to update + once a change has been made to a table. + +- **Query**: A `COUNT(*)` query is used to retrieve the latest row count for a table, with optional SQL filters applied (depending on platform). + This can be less efficient to check depending on the size of the table. This approach is more portable, as it does not involve + system warehouse tables, it is also easily portable across Data Warehouse and Data Lake providers. + +- **DataHub Dataset Profile**: The DataHub Dataset Profile aspect is used to retrieve the latest row count information for a table. + Using this option avoids contacting your data platform, and instead uses the DataHub Dataset Profile metadata to evaluate Volume Assertions. + Note if you have not configured an ingestion source through DataHub, then this may be the only option available. + +Volume Assertions also have an off switch: they can be started or stopped at any time with the click of button. + + +## Creating a Volume Assertion + +### Prerequisites + +1. **Permissions**: To create or delete Volume Assertions for a specific entity on DataHub, you'll need to be granted the + `Edit Assertions` and `Edit Monitors` privileges for the entity. This is granted to Entity owners by default. + +2. 
**Data Platform Connection**: In order to create a Volume Assertion, you'll need to have an **Ingestion Source** configured to your
+   Data Platform: Snowflake, BigQuery, or Redshift under the **Integrations** tab.
+
+Once these are in place, you're ready to create your Volume Assertions!
+
+### Steps
+
+1. Navigate to the Table that you want to monitor for volume
+2. Click the **Validations** tab
+
+

+ +

+ +3. Click **+ Create Assertion** + +

+ +

+ +4. Choose **Volume** + +5. Configure the evaluation **schedule**. This is the frequency at which the assertion will be evaluated to produce a pass or fail result, and the times + when the table volume will be checked. + +6. Configure the evaluation **condition type**. This determines the cases in which the new assertion will fail when it is evaluated. + +

+ +

+ +7. (Optional) Click **Advanced** to customize the volume **source**. This is the mechanism that will be used to obtain the table + row count metric. Each Data Platform supports different options including Information Schema, Query, and DataHub Dataset Profile. + +

+ +

+ +- **Information Schema**: Check the Data Platform system metadata tables to determine the table row count. +- **Query**: Issue a `COUNT(*)` query to the table to determine the row count. +- **DataHub Dataset Profile**: Use the DataHub Dataset Profile metadata to determine the row count. + +8. Click **Next** +9. Configure actions that should be taken when the Volume Assertion passes or fails + +

+ +

+
+- **Raise incident**: Automatically raise a new DataHub `Volume` Incident for the Table whenever the Volume Assertion is failing. This
+  may indicate that the Table is unfit for consumption. Configure Slack Notifications under **Settings** to be notified when
+  an incident is created due to an Assertion failure.
+- **Resolve incident**: Automatically resolve any incidents that were raised due to failures in this Volume Assertion. Note that
+  any other incidents will not be impacted.
+
+10. Click **Save**.
+
+And that's it! DataHub will now begin to monitor your Volume Assertion for the table.
+
+To view the time of the next Volume Assertion evaluation, simply click **Volume** and then click on your
+new Assertion:
+
+

+ +

+ +Once your assertion has run, you will begin to see Success or Failure status for the Table + +

+ +

+ + +## Stopping a Volume Assertion + +In order to temporarily stop the evaluation of a Volume Assertion: + +1. Navigate to the **Validations** tab of the Table with the assertion +2. Click **Volume** to open the Volume Assertions list +3. Click the three-dot menu on the right side of the assertion you want to disable +4. Click **Stop** + +

+ +

+ +To resume the Volume Assertion, simply click **Turn On**. + +

+ +

+ + +## Smart Assertions ⚡ + +As part of the **Acryl Observe** module, Acryl DataHub also provides **Smart Assertions** out of the box. These are +dynamic, AI-powered Volume Assertions that you can use to monitor the volume of important warehouse Tables, without +requiring any manual setup. + +If Acryl DataHub is able to detect a pattern in the volume of a Snowflake, Redshift, or BigQuery Table, you'll find +a recommended Smart Assertion under the `Validations` tab on the Table profile page: + +

+ +

+
+In order to enable it, simply click **Turn On**. From this point forward, the Smart Assertion will check for changes on a cadence
+based on the Table history.
+
+Don't need it anymore? Smart Assertions can just as easily be turned off by clicking the three-dot "more" button and then **Stop**.
+
+
+## Creating Volume Assertions via API
+
+Under the hood, Acryl DataHub implements Volume Assertion Monitoring using two "entity" concepts:
+
+- **Assertion**: The specific expectation for volume, e.g. "The table should always contain between 1000 and 2000 rows"
+  or "The table should not grow by more than 1000 rows between checks". This is the "what".
+
+- **Monitor**: The process responsible for evaluating the Assertion on a given evaluation schedule and using specific
+  mechanisms. This is the "how".
+
+Note that to create or delete Assertions and Monitors for a specific entity on DataHub, you'll need the
+`Edit Assertions` and `Edit Monitors` privileges for it.
+
+#### GraphQL
+
+In order to create a Volume Assertion that is monitored on a specific **Evaluation Schedule**, you'll need to use two
+GraphQL mutations: one to create the Volume Assertion entity, and one to create the Assertion Monitor entity responsible for evaluating it.
+
+Start by creating the Volume Assertion entity using the `createVolumeAssertion` mutation and hang on to the 'urn' field of the Assertion entity
+you get back. Then continue by creating a Monitor entity using the `createAssertionMonitor` mutation.
+
+##### Examples
+
+To create a Volume Assertion entity that checks whether a table's total row count always falls within an expected range:
+
+```graphql
+mutation createVolumeAssertion {
+  createVolumeAssertion(
+    input: {
+      entityUrn: "<urn of the table to be monitored>",
+      type: ROW_COUNT_TOTAL,
+      rowCountTotal: {
+        operator: BETWEEN,
+        parameters: {
+          minValue: {
+            value: "10",
+            type: NUMBER
+          },
+          maxValue: {
+            value: "20",
+            type: NUMBER
+          }
+        }
+      }
+    }
+  ) {
+    urn
+  }
+}
+```
+
+This creates an assertion specifying that the total row count of the table should always fall between 10 and 20.
+
+The supported volume assertion types are `ROW_COUNT_TOTAL` and `ROW_COUNT_CHANGE`. Other (e.g. incrementing segment) types are not yet supported.
+The supported operator types are `GREATER_THAN`, `GREATER_THAN_OR_EQUAL_TO`, `LESS_THAN`, `LESS_THAN_OR_EQUAL_TO`, and `BETWEEN` (requires minValue, maxValue).
+The supported parameter type is `NUMBER`.
+
+To create an Assertion Monitor entity that evaluates the volume assertion every 8 hours using the Information Schema:
+
+```graphql
+mutation createAssertionMonitor {
+  createAssertionMonitor(
+    input: {
+      entityUrn: "<urn of the table to be monitored>",
+      assertionUrn: "<urn of the assertion created above>",
+      schedule: {
+        cron: "0 */8 * * *",
+        timezone: "America/Los_Angeles"
+      },
+      parameters: {
+        type: DATASET_VOLUME,
+        datasetVolumeParameters: {
+          sourceType: INFORMATION_SCHEMA
+        }
+      }
+    }
+  ) {
+    urn
+  }
+}
+```
+
+This entity defines _when_ to run the check (using CRON format - every 8th hour) and _how_ to run the check (using the Information Schema).
+
+After creating the monitor, the new assertion will start to be evaluated every 8 hours in your selected timezone.
+
+You can delete assertions along with their monitors using the GraphQL mutations `deleteAssertion` and `deleteMonitor`.
+
+### Tips
+
+:::info
+**Authorization**
+
+Remember to always provide a DataHub Personal Access Token when calling the GraphQL API.
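+For instance, here is a minimal, non-authoritative Python sketch of such an authenticated call, reusing the `createVolumeAssertion` mutation above. It assumes the `requests` library and that the GraphQL endpoint is served at `/api/graphql` on your Acryl instance; the URL, urn, and token values are placeholders to adjust for your deployment (the raw header format it relies on is spelled out just below).
+
+```python
+import requests
+
+# Placeholders: substitute your Acryl account URL and a valid Personal Access Token.
+GRAPHQL_URL = "https://your-account-id.acryl.io/api/graphql"  # assumed endpoint path
+ACCESS_TOKEN = "<your-personal-access-token>"
+
+# The same mutation shown above, sent as a standard GraphQL HTTP request body.
+MUTATION = """
+mutation createVolumeAssertion {
+  createVolumeAssertion(
+    input: {
+      entityUrn: "<urn of the table to be monitored>"
+      type: ROW_COUNT_TOTAL
+      rowCountTotal: {
+        operator: BETWEEN
+        parameters: {
+          minValue: { value: "10", type: NUMBER }
+          maxValue: { value: "20", type: NUMBER }
+        }
+      }
+    }
+  ) {
+    urn
+  }
+}
+"""
+
+# The Personal Access Token is passed via the standard Bearer Authorization header.
+response = requests.post(
+    GRAPHQL_URL,
+    json={"query": MUTATION},
+    headers={"Authorization": f"Bearer {ACCESS_TOKEN}"},
+)
+response.raise_for_status()
+print(response.json())  # e.g. {"data": {"createVolumeAssertion": {"urn": "..."}}}
+```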
To do so, just add the 'Authorization' header as follows: + +``` +Authorization: Bearer +``` + +**Exploring GraphQL API** + +Also, remember that you can play with an interactive version of the Acryl GraphQL API at `https://your-account-id.acryl.io/api/graphiql` +::: diff --git a/docs/managed-datahub/operator-guide/setting-up-remote-ingestion-executor-on-aws.md b/docs/managed-datahub/operator-guide/setting-up-remote-ingestion-executor-on-aws.md index d389ec97d0550..b8fb0ea9e80f1 100644 --- a/docs/managed-datahub/operator-guide/setting-up-remote-ingestion-executor-on-aws.md +++ b/docs/managed-datahub/operator-guide/setting-up-remote-ingestion-executor-on-aws.md @@ -17,11 +17,19 @@ Acryl DataHub comes packaged with an Acryl-managed ingestion executor, which is For example, if an ingestion source is not publicly accessible via the internet, e.g. hosted privately within a specific AWS account, then the Acryl executor will be unable to extract metadata from it. -![Option 1: Acryl-hosted ingestion runner](../imgs/saas/image-(12).png) + +

+ +

+ To accommodate these cases, Acryl supports configuring a remote ingestion executor which can be deployed inside of your AWS account. This setup allows you to continue leveraging the Acryl DataHub console to create, schedule, and run metadata ingestion, all while retaining network and credential isolation. -![Option 2: Customer-hosted ingestion runner](../imgs/saas/image-(6).png) + +

+ +

+ ## Deploying a Remote Ingestion Executor 1. **Provide AWS Account Id**: Provide Acryl Team with the id of the AWS in which the remote executor will be hosted. This will be used to grant access to private Acryl containers and create a unique SQS queue which your remote agent will subscribe to. The account id can be provided to your Acryl representative via Email or [One Time Secret](https://onetimesecret.com/). @@ -40,23 +48,39 @@ To accommodate these cases, Acryl supports configuring a remote ingestion execut Note that the only external secret provider that is currently supported is AWS Secrets Manager. -![](../imgs/saas/Screen-Shot-2023-01-19-at-5.12.47-PM.png) -![](../imgs/saas/Screen-Shot-2023-01-19-at-5.12.56-PM.png) +

+ +

+ + + +

+ +

+ 3. **Test the Executor:** To test your remote executor: 1. Create a new Ingestion Source by clicking '**Create new Source**' the '**Ingestion**' tab of the DataHub console. Configure your Ingestion Recipe as though you were running it from inside of your environment. 2. When working with "secret" fields (passwords, keys, etc), you can refer to any "self-managed" secrets by name: `${SECRET_NAME}:` - ![Using a secret called BQ_DEPLOY_KEY which is managed in AWS secrets manager](../imgs/saas/Screen-Shot-2023-01-19-at-4.16.52-PM.png) + +

+ +

+ 3. In the 'Finish Up' step, click '**Advanced'**. 4. Update the '**Executor Id**' form field to be '**remote**'. This indicates that you'd like to use the remote executor. 5. Click '**Done**'. Now, simple click '**Execute**' to test out the remote executor. If your remote executor is configured properly, you should promptly see the ingestion task state change to 'Running'. -![](../imgs/saas/Screen-Shot-2022-03-07-at-10.23.31-AM.png) + +

+ +

+ ## Updating a Remote Ingestion Executor In order to update the executor, ie. to deploy a new container version, you'll need to update the CloudFormation Stack to re-deploy the CloudFormation template with a new set of parameters. ### Steps - AWS Console @@ -66,7 +90,11 @@ In order to update the executor, ie. to deploy a new container version, you'll n 4. Select **Replace Current Template** 5. Select **Upload a template file** 6. Upload a copy of the Acryl Remote Executor [CloudFormation Template](https://raw.githubusercontent.com/acryldata/datahub-cloudformation/master/Ingestion/templates/python.ecs.template.yaml) -![](../imgs/saas/Screen-Shot-2023-01-19-at-4.23.32-PM.png) + +

+ +

+ 7. Click **Next** 8. Change parameters based on your modifications (e.g. ImageTag, etc) 9. Click **Next** diff --git a/docs/modeling/extending-the-metadata-model.md b/docs/modeling/extending-the-metadata-model.md index 32951ab2e41eb..98f70f6d933e4 100644 --- a/docs/modeling/extending-the-metadata-model.md +++ b/docs/modeling/extending-the-metadata-model.md @@ -11,7 +11,11 @@ these two concepts prior to making changes. ## To fork or not to fork? An important question that will arise once you've decided to extend the metadata model is whether you need to fork the main repo or not. Use the diagram below to understand how to make this decision. -![Metadata Model To Fork or Not](../imgs/metadata-model-to-fork-or-not-to.png) + +

+ +

+ The green lines represent pathways that will lead to lesser friction for you to maintain your code long term. The red lines represent higher risk of conflicts in the future. We are working hard to move the majority of model extension use-cases to no-code / low-code pathways to ensure that you can extend the core metadata model without having to maintain a custom fork of DataHub. @@ -323,7 +327,7 @@ It takes the following parameters: annotations. To customize the set of analyzers used to index a certain field, you must add a new field type and define the set of mappings to be applied in the MappingsBuilder. - Thus far, we have implemented 10 fieldTypes: + Thus far, we have implemented 11 fieldTypes: 1. *KEYWORD* - Short text fields that only support exact matches, often used only for filtering @@ -332,20 +336,25 @@ It takes the following parameters: 3. *TEXT_PARTIAL* - Text fields delimited by spaces/slashes/periods with partial matching support. Note, partial matching is expensive, so this field type should not be applied to fields with long values (like description) - 4. *BROWSE_PATH* - Field type for browse paths. Applies specific mappings for slash delimited paths. + 4. *WORD_GRAM* - Text fields delimited by spaces, slashes, periods, dashes, or underscores with partial matching AND + word gram support. That is, the text will be split by the delimiters and can be matched with delimited queries + matching two, three, or four length tokens in addition to single tokens. As with partial match, this type is + expensive, so should not be applied to fields with long values such as description. + + 5. *BROWSE_PATH* - Field type for browse paths. Applies specific mappings for slash delimited paths. - 5. *URN* - Urn fields where each sub-component inside the urn is indexed. For instance, for a data platform urn like + 6. *URN* - Urn fields where each sub-component inside the urn is indexed. For instance, for a data platform urn like "urn:li:dataplatform:kafka", it will index the platform name "kafka" and ignore the common components - 6. *URN_PARTIAL* - Urn fields where each sub-component inside the urn is indexed with partial matching support. + 7. *URN_PARTIAL* - Urn fields where each sub-component inside the urn is indexed with partial matching support. - 7. *BOOLEAN* - Boolean fields used for filtering. + 8. *BOOLEAN* - Boolean fields used for filtering. - 8. *COUNT* - Count fields used for filtering. + 9. *COUNT* - Count fields used for filtering. - 9. *DATETIME* - Datetime fields used to represent timestamps. + 10. *DATETIME* - Datetime fields used to represent timestamps. - 10. *OBJECT* - Each property in an object will become an extra column in Elasticsearch and can be referenced as + 11. *OBJECT* - Each property in an object will become an extra column in Elasticsearch and can be referenced as `field.property` in queries. You should be careful to not use it on objects with many properties as it can cause a mapping explosion in Elasticsearch. diff --git a/docs/modeling/metadata-model.md b/docs/modeling/metadata-model.md index 704fce1412329..037c9c7108a6e 100644 --- a/docs/modeling/metadata-model.md +++ b/docs/modeling/metadata-model.md @@ -30,7 +30,11 @@ Conceptually, metadata is modeled using the following abstractions Here is an example graph consisting of 3 types of entity (CorpUser, Chart, Dashboard), 2 types of relationship (OwnedBy, Contains), and 3 types of metadata aspect (Ownership, ChartInfo, and DashboardInfo). -![metadata-modeling](../imgs/metadata-model-chart.png) + +

+ +

+ ## The Core Entities @@ -73,7 +77,11 @@ to the YAML configuration, instead of creating new Snapshot / Aspect files. ## Exploring DataHub's Metadata Model To explore the current DataHub metadata model, you can inspect this high-level picture that shows the different entities and edges between them showing the relationships between them. -![Metadata Model Graph](../imgs/datahub-metadata-model.png) + +

+ +

+ To navigate the aspect model for specific entities and explore relationships using the `foreign-key` concept, you can view them in our demo environment or navigate the auto-generated docs in the **Metadata Modeling/Entities** section on the left. diff --git a/docs/platform-instances.md b/docs/platform-instances.md index c6bfe3315de98..0f4515aedae54 100644 --- a/docs/platform-instances.md +++ b/docs/platform-instances.md @@ -1,44 +1,48 @@ -# Working With Platform Instances - -DataHub's metadata model for Datasets supports a three-part key currently: -- Data Platform (e.g. urn:li:dataPlatform:mysql) -- Name (e.g. db.schema.name) -- Env or Fabric (e.g. DEV, PROD, etc.) - -This naming scheme unfortunately does not allow for easy representation of the multiplicity of platforms (or technologies) that might be deployed at an organization within the same environment or fabric. For example, an organization might have multiple Redshift instances in Production and would want to see all the data assets located in those instances inside the DataHub metadata repository. - -As part of the `v0.8.24+` releases, we are unlocking the first phase of supporting Platform Instances in the metadata model. This is done via two main additions: -- The `dataPlatformInstance` aspect that has been added to Datasets which allows datasets to be associated to an instance of a platform -- Enhancements to all ingestion sources that allow them to attach a platform instance to the recipe that changes the generated urns to go from `urn:li:dataset:(urn:li:dataPlatform:,,ENV)` format to `urn:li:dataset:(urn:li:dataPlatform:,,ENV)` format. Sources that produce lineage to datasets in other platforms (e.g. Looker, Superset etc) also have specific configuration additions that allow the recipe author to specify the mapping between a platform and the instance name that it should be mapped to. - -![./imgs/platform-instances-for-ingestion.png](./imgs/platform-instances-for-ingestion.png) - -## Naming Platform Instances - -When configuring a platform instance, choose an instance name that is understandable and will be stable for the foreseeable future. e.g. `core_warehouse` or `finance_redshift` are allowed names, as are pure guids like `a37dc708-c512-4fe4-9829-401cd60ed789`. Remember that whatever instance name you choose, you will need to specify it in more than one recipe to ensure that the identifiers produced by different sources will line up. - -## Enabling Platform Instances - -Read the Ingestion source specific guides for how to enable platform instances in each of them. -The general pattern is to add an additional optional configuration parameter called `platform_instance`. - -e.g. here is how you would configure a recipe to ingest a mysql instance that you want to call `core_finance` -```yaml -source: - type: mysql - config: - # Coordinates - host_port: localhost:3306 - platform_instance: core_finance - database: dbname - - # Credentials - username: root - password: example - -sink: - # sink configs -``` - - -## +# Working With Platform Instances + +DataHub's metadata model for Datasets supports a three-part key currently: +- Data Platform (e.g. urn:li:dataPlatform:mysql) +- Name (e.g. db.schema.name) +- Env or Fabric (e.g. DEV, PROD, etc.) + +This naming scheme unfortunately does not allow for easy representation of the multiplicity of platforms (or technologies) that might be deployed at an organization within the same environment or fabric. 
For example, an organization might have multiple Redshift instances in Production and would want to see all the data assets located in those instances inside the DataHub metadata repository.
+
+As part of the `v0.8.24+` releases, we are unlocking the first phase of supporting Platform Instances in the metadata model. This is done via two main additions:
+- The `dataPlatformInstance` aspect that has been added to Datasets, which allows datasets to be associated with an instance of a platform
+- Enhancements to all ingestion sources that allow them to attach a platform instance to the recipe, changing the generated urns from the `urn:li:dataset:(urn:li:dataPlatform:<platform>,<table_name>,ENV)` format to the `urn:li:dataset:(urn:li:dataPlatform:<platform>,<platform_instance>.<table_name>,ENV)` format (see the sketch below). Sources that produce lineage to datasets in other platforms (e.g. Looker, Superset, etc.) also have specific configuration additions that allow the recipe author to specify the mapping between a platform and the instance name that it should be mapped to.
+
+
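To make the urn change concrete, here is a small illustrative Python sketch. It assumes the `acryl-datahub` package is installed and exposes the `make_dataset_urn_with_platform_instance` helper in `datahub.emitter.mce_builder`; the instance name `finance_redshift` and the table name are made up for the example.

```python
# Illustrative sketch only: how the generated dataset urn changes once a
# platform instance is attached. The names below are invented for the example.
from datahub.emitter.mce_builder import (
    make_dataset_urn,
    make_dataset_urn_with_platform_instance,
)

# Without a platform instance (the original three-part key).
print(make_dataset_urn(platform="redshift", name="db.schema.table", env="PROD"))
# urn:li:dataset:(urn:li:dataPlatform:redshift,db.schema.table,PROD)

# With a platform instance, the instance name is folded into the dataset name.
print(
    make_dataset_urn_with_platform_instance(
        platform="redshift",
        name="db.schema.table",
        platform_instance="finance_redshift",
        env="PROD",
    )
)
# urn:li:dataset:(urn:li:dataPlatform:redshift,finance_redshift.db.schema.table,PROD)
```

If these helpers differ in your version of the package, the takeaway is the same: the chosen instance name becomes part of the dataset name inside the urn, which is why the same instance name must be used consistently across recipes.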

+ +

+
+
+## Naming Platform Instances
+
+When configuring a platform instance, choose an instance name that is understandable and will be stable for the foreseeable future. e.g. `core_warehouse` or `finance_redshift` are allowed names, as are pure guids like `a37dc708-c512-4fe4-9829-401cd60ed789`. Remember that whatever instance name you choose, you will need to specify it in more than one recipe to ensure that the identifiers produced by different sources will line up.
+
+## Enabling Platform Instances
+
+Read the ingestion source-specific guides for how to enable platform instances in each of them.
+The general pattern is to add an additional optional configuration parameter called `platform_instance`.
+
+e.g. here is how you would configure a recipe to ingest a mysql instance that you want to call `core_finance`:
+```yaml
+source:
+  type: mysql
+  config:
+    # Coordinates
+    host_port: localhost:3306
+    platform_instance: core_finance
+    database: dbname
+
+    # Credentials
+    username: root
+    password: example
+
+sink:
+  # sink configs
+```
+
+
+##
diff --git a/docs/quickstart.md b/docs/quickstart.md
index b93713c4efa5c..cd91dc8d1ac84 100644
--- a/docs/quickstart.md
+++ b/docs/quickstart.md
@@ -145,6 +145,27 @@ Please refer to [Change the default user datahub in quickstart](authentication/c
We recommend deploying DataHub to production using Kubernetes. We provide helpful [Helm Charts](https://artifacthub.io/packages/helm/datahub/datahub) to help you quickly get up and running. Check out [Deploying DataHub to Kubernetes](./deploy/kubernetes.md) for a step-by-step walkthrough.
+The `quickstart` method of running DataHub is intended for local development and a quick way to experience the features that DataHub has to offer. It is not
+intended for a production environment. This recommendation is based on the following points.
+
+#### Default Credentials
+
+`quickstart` uses a docker-compose configuration which includes default credentials for both DataHub and its underlying
+prerequisite data stores, such as MySQL. Additionally, other components are unauthenticated out of the box. This is a
+design choice to make development easier and is not best practice for a production environment.
+
+#### Exposed Ports
+
+DataHub's services and its backend data stores use the docker default behavior of binding to all interface addresses.
+This makes it useful for development but is not recommended in a production environment.
+
+#### Performance & Management
+
+* `quickstart` is limited by the resources available on a single host; there is no ability to scale horizontally.
+* Rollout of new versions requires downtime.
+* The configuration is largely pre-determined and not easily managed.
+* `quickstart`, by default, follows the most recent builds, forcing updates to the latest released and unreleased builds.
+
## Other Common Operations

### Stopping DataHub
diff --git a/docs/schema-history.md b/docs/schema-history.md
index 9fc9ec1af52bb..120d041960186 100644
--- a/docs/schema-history.md
+++ b/docs/schema-history.md
@@ -23,20 +23,32 @@ must have the **View Entity Page** privilege, or be assigned to **any** DataHub
You can view the Schema History for a Dataset by navigating to that Dataset's Schema Tab. As long as that Dataset has more than one version, you can view what a Dataset looked like at any given version by using the version selector.
Here's an example from DataHub's official Demo environment with the -[Snowflake pets dataset](https://demo.datahubproject.io/dataset/urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.adoption.pets,PROD)/Schema?is_lineage_mode=false). +Snowflake pets dataset. + + +

+ +

-![](./imgs/schema-history-latest-version.png) If you click on an older version in the selector, you'll be able to see what the schema looked like back then. Notice the changes here to the glossary terms for the `status` field, and to the descriptions for the `created_at` and `updated_at` fields. -![](./imgs/schema-history-older-version.png) + +

+ +

+ In addition to this, you can also toggle the Audit view that shows you when the most recent changes were made to each field. You can active this by clicking on the Audit icon you see above the top right of the table. -![](./imgs/schema-history-audit-activated.png) + +

+ +

+ You can see here that some of these fields were added at the oldest dataset version, while some were added only at this latest version. Some fields were even modified and had a type change at the latest version! diff --git a/docs/townhall-history.md b/docs/townhall-history.md index 1da490ca6fa69..e235a70c5d7b9 100644 --- a/docs/townhall-history.md +++ b/docs/townhall-history.md @@ -343,8 +343,7 @@ Agenda - Announcements - 2 mins - Community Updates ([video](https://youtu.be/r862MZTLAJ0?t=99)) - 10 mins -- Use-Case: DataHub at Viasat ([slides](demo/ViasatMetadataJourney.pdf),[video](https://youtu.be/2SrDAJnzkjE)) by [Anna Kepler](https://www.linkedin.com/in/akepler) - 15 mins -- Tech Deep Dive: GraphQL + React RFCs readout and discussion ([slides](https://docs.google.com/presentation/d/e/2PACX-1vRtnINnpi6PvFw7-5iW8PSQoT9Kdf1O_0YW7QAr1_mSdJMNftYFTVCjKL-e3fpe8t6IGkha8UpdmoOI/pub?start=false&loop=false&delayms=3000) ,[video](https://www.youtube.com/watch?v=PrBaFrb7pqA)) by [John Joyce](https://www.linkedin.com/in/john-joyce-759883aa) and [Arun Vasudevan](https://www.linkedin.com/in/arun-vasudevan-55117368/) - 15 mins +- Use-Case: DataHub at Viasat ([slides](https://github.com/acryldata/static-assets-test/raw/master/imgs/demo/ViasatMetadataJourney.pdf),[video](https://youtu.be/2SrDAJnzkjE)) by [Anna Kepler](https://www.linkedin.com/in/akepler) - 15 mins- Tech Deep Dive: GraphQL + React RFCs readout and discussion ([slides](https://docs.google.com/presentation/d/e/2PACX-1vRtnINnpi6PvFw7-5iW8PSQoT9Kdf1O_0YW7QAr1_mSdJMNftYFTVCjKL-e3fpe8t6IGkha8UpdmoOI/pub?start=false&loop=false&delayms=3000) ,[video](https://www.youtube.com/watch?v=PrBaFrb7pqA)) by [John Joyce](https://www.linkedin.com/in/john-joyce-759883aa) and [Arun Vasudevan](https://www.linkedin.com/in/arun-vasudevan-55117368/) - 15 mins - General Q&A from sign up sheet, slack, and participants - 15 mins - Closing remarks - 3 mins - General Q&A from sign up sheet, slack, and participants - 15 mins @@ -356,8 +355,8 @@ Agenda Agenda - Quick intro - 5 mins -- [Why did Grofers choose DataHub for their data catalog?](demo/Datahub_at_Grofers.pdf) by [Shubham Gupta](https://www.linkedin.com/in/shubhamg931/) - 15 minutes -- [DataHub UI development - Part 2](demo/Town_Hall_Presentation_-_12-2020_-_UI_Development_Part_2.pdf) by [Charlie Tran](https://www.linkedin.com/in/charlie-tran/) (LinkedIn) - 20 minutes +- [Why did Grofers choose DataHub for their data catalog?](https://github.com/acryldata/static-assets-test/raw/master/imgs/demo/Datahub_at_Grofers.pdf) by [Shubham Gupta](https://www.linkedin.com/in/shubhamg931/) - 15 minutes +- [DataHub UI development - Part 2](https://github.com/acryldata/static-assets-test/raw/master/imgs/demo/Town_Hall_Presentation_-_12-2020_-_UI_Development_Part_2.pdf) by [Charlie Tran](https://www.linkedin.com/in/charlie-tran/) (LinkedIn) - 20 minutes - General Q&A from sign up sheet, slack, and participants - 15 mins - Closing remarks - 5 minutes @@ -368,9 +367,9 @@ Agenda Agenda - Quick intro - 5 mins -- [Lightning talk on Metadata use-cases at LinkedIn](demo/Metadata_Use-Cases_at_LinkedIn_-_Lightning_Talk.pdf) by [Shirshanka Das](https://www.linkedin.com/in/shirshankadas/) (LinkedIn) - 5 mins -- [Strongly Consistent Secondary Index (SCSI) in GMA](demo/Datahub_-_Strongly_Consistent_Secondary_Indexing.pdf), an upcoming feature by [Jyoti Wadhwani](https://www.linkedin.com/in/jyotiwadhwani/) (LinkedIn) - 15 minutes -- [DataHub UI overview](demo/DataHub-UIOverview.pdf) by [Ignacio 
Bona](https://www.linkedin.com/in/ignaciobona) (LinkedIn) - 20 minutes +- [Lightning talk on Metadata use-cases at LinkedIn](https://github.com/acryldata/static-assets-test/raw/master/imgs/demo/Metadata_Use-Cases_at_LinkedIn_-_Lightning_Talk.pdf) by [Shirshanka Das](https://www.linkedin.com/in/shirshankadas/) (LinkedIn) - 5 mins +- [Strongly Consistent Secondary Index (SCSI) in GMA](https://github.com/acryldata/static-assets-test/raw/master/imgs/demo/Datahub_-_Strongly_Consistent_Secondary_Indexing.pdf), an upcoming feature by [Jyoti Wadhwani](https://www.linkedin.com/in/jyotiwadhwani/) (LinkedIn) - 15 minutes +- [DataHub UI overview](https://github.com/acryldata/static-assets-test/raw/master/imgs/demo/DataHub-UIOverview.pdf) by [Ignacio Bona](https://www.linkedin.com/in/ignaciobona) (LinkedIn) - 20 minutes - General Q&A from sign up sheet, slack, and participants - 10 mins - Closing remarks - 5 minutes @@ -382,8 +381,8 @@ Agenda Agenda - Quick intro - 5 mins -- [Data Discoverability at SpotHero](demo/Data_Discoverability_at_SpotHero.pdf) by [Maggie Hays](https://www.linkedin.com/in/maggie-hays/) (SpotHero) - 20 mins -- [Designing the next generation of metadata events for scale](demo/Designing_the_next_generation_of_metadata_events_for_scale.pdf) by [Chris Lee](https://www.linkedin.com/in/chrisleecmu/) (LinkedIn) - 15 mins +- [Data Discoverability at SpotHero](https://github.com/acryldata/static-assets-test/raw/master/imgs/demo/Data_Discoverability_at_SpotHero.pdf) by [Maggie Hays](https://www.linkedin.com/in/maggie-hays/) (SpotHero) - 20 mins +- [Designing the next generation of metadata events for scale](https://github.com/acryldata/static-assets-test/raw/master/imgs/demo/Designing_the_next_generation_of_metadata_events_for_scale.pdf) by [Chris Lee](https://www.linkedin.com/in/chrisleecmu/) (LinkedIn) - 15 mins - General Q&A from sign up sheet, slack, and participants - 15 mins - Closing remarks - 5 mins diff --git a/docs/ui-ingestion.md b/docs/ui-ingestion.md index 4435f66e514f3..2ecb1e634c79f 100644 --- a/docs/ui-ingestion.md +++ b/docs/ui-ingestion.md @@ -14,11 +14,19 @@ This document will describe the steps required to configure, schedule, and execu To view & manage UI-based metadata ingestion, you must have the `Manage Metadata Ingestion` & `Manage Secrets` privileges assigned to your account. These can be granted by a [Platform Policy](authorization/policies.md). -![](./imgs/ingestion-privileges.png) + +

+ +

+ Once you have these privileges, you can begin to manage ingestion by navigating to the 'Ingestion' tab in DataHub. -![](./imgs/ingestion-tab.png) + +

+ +

+ On this page, you'll see a list of active **Ingestion Sources**. An Ingestion Sources is a unique source of metadata ingested into DataHub from an external source like Snowflake, Redshift, or BigQuery. @@ -33,7 +41,11 @@ your first **Ingestion Source**. Before ingesting any metadata, you need to create a new Ingestion Source. Start by clicking **+ Create new source**. -![](./imgs/create-new-ingestion-source-button.png) + +

+ +

+ #### Step 1: Select a Platform Template @@ -41,7 +53,11 @@ In the first step, select a **Recipe Template** corresponding to the source type a variety of natively supported integrations, from Snowflake to Postgres to Kafka. Select `Custom` to construct an ingestion recipe from scratch. -![](./imgs/select-platform-template.png) + +

+ +

+ Next, you'll configure an ingestion **Recipe**, which defines _how_ and _what_ to extract from the source system. @@ -68,7 +84,11 @@ used by DataHub to extract metadata from a 3rd party system. It most often consi A sample of a full recipe configured to ingest metadata from MySQL can be found in the image below. -![](./imgs/example-mysql-recipe.png) + +

+ +

+ Detailed configuration examples & documentation for each source type can be found on the [DataHub Docs](https://datahubproject.io/docs/metadata-ingestion/) website. @@ -80,7 +100,11 @@ that are encrypted and stored within DataHub's storage layer. To create a secret, first navigate to the 'Secrets' tab. Then click `+ Create new secret`. -![](./imgs/create-secret.png) + +

+ +

+ _Creating a Secret to store the username for a MySQL database_ @@ -123,7 +147,11 @@ Secret values are not persisted to disk beyond execution time, and are never tra Next, you can optionally configure a schedule on which to execute your new Ingestion Source. This enables to schedule metadata extraction on a monthly, weekly, daily, or hourly cadence depending on the needs of your organization. Schedules are defined using CRON format. -![](./imgs/schedule-ingestion.png) + +

+ +

+ _An Ingestion Source that is executed at 9:15am every day, Los Angeles time_ @@ -136,7 +164,11 @@ you can always come back and change this. Finally, give your Ingestion Source a name. -![](./imgs/name-ingestion-source.png) + +

+ +

+ Once you're happy with your configurations, click 'Done' to save your changes. @@ -149,7 +181,11 @@ with the server. However, you can override the default package version using the To do so, simply click 'Advanced', then change the 'CLI Version' text box to contain the exact version of the DataHub CLI you'd like to use. -![](./imgs/custom-ingestion-cli-version.png) + +

+ +

+ _Pinning the CLI version to version `0.8.23.2`_ Once you're happy with your changes, simply click 'Done' to save. @@ -200,11 +236,19 @@ Once you've created your Ingestion Source, you can run it by clicking 'Execute'. you should see the 'Last Status' column of the ingestion source change from `N/A` to `Running`. This means that the request to execute ingestion has been successfully picked up by the DataHub ingestion executor. -![](./imgs/running-ingestion.png) + +

+ +

+ If ingestion has executed successfully, you should see it's state shown in green as `Succeeded`. -![](./imgs/successful-ingestion.png) + +

+ +

+ ### Cancelling an Ingestion Run @@ -212,14 +256,22 @@ If ingestion has executed successfully, you should see it's state shown in green If your ingestion run is hanging, there may a bug in the ingestion source, or another persistent issue like exponential timeouts. If these situations, you can cancel ingestion by clicking **Cancel** on the problematic run. -![](./imgs/cancelled-ingestion.png) + +

+ +

+ Once cancelled, you can view the output of the ingestion run by clicking **Details**. ### Debugging a Failed Ingestion Run -![](./imgs/failed-ingestion.png) + +

+ +

+ A variety of things can cause an ingestion run to fail. Common reasons for failure include: @@ -235,12 +287,20 @@ A variety of things can cause an ingestion run to fail. Common reasons for failu 4. **Authentication**: If you've enabled [Metadata Service Authentication](authentication/introducing-metadata-service-authentication.md), you'll need to provide a Personal Access Token in your Recipe Configuration. To so this, set the 'token' field of the sink configuration to contain a Personal Access Token: - ![](./imgs/ingestion-with-token.png) + +

+ +

+ The output of each run is captured and available to view in the UI for easier debugging. To view output logs, click **DETAILS** on the corresponding ingestion run. -![](./imgs/ingestion-logs.png) + +

+ +

+ ## FAQ @@ -250,7 +310,11 @@ If not due to one of the reasons outlined above, this may be because the executo to reach DataHub's backend using the default configurations. Try changing your ingestion recipe to make the `sink.config.server` variable point to the Docker DNS name for the `datahub-gms` pod: -![](./imgs/quickstart-ingestion-config.png) + +

+ +

+ ### I see 'N/A' when I try to run ingestion. What do I do? diff --git a/docs/what/relationship.md b/docs/what/relationship.md index 1908bbd6ce75f..dcfe093a1b124 100644 --- a/docs/what/relationship.md +++ b/docs/what/relationship.md @@ -2,7 +2,11 @@ A relationship is a named associate between exactly two [entities](entity.md), a source and a destination. -![metadata-modeling](../imgs/metadata-modeling.png) + +

+ +

+ From the above graph, a `Group` entity can be linked to a `User` entity via a `HasMember` relationship. Note that the name of the relationship reflects the direction, i.e. pointing from `Group` to `User`. diff --git a/entity-registry/build.gradle b/entity-registry/build.gradle index af742d240d1e6..3da0bf5bb4fb8 100644 --- a/entity-registry/build.gradle +++ b/entity-registry/build.gradle @@ -1,16 +1,17 @@ apply plugin: 'pegasus' +apply plugin: 'java-library' dependencies { - compile spec.product.pegasus.data - compile spec.product.pegasus.generator - compile project(path: ':metadata-models') + implementation spec.product.pegasus.data + implementation spec.product.pegasus.generator + api project(path: ':metadata-models') implementation externalDependency.slf4jApi compileOnly externalDependency.lombok - compile externalDependency.guava - compile externalDependency.jacksonDataBind - compile externalDependency.jacksonDataFormatYaml - compile externalDependency.reflections - compile externalDependency.jsonPatch + implementation externalDependency.guava + implementation externalDependency.jacksonDataBind + implementation externalDependency.jacksonDataFormatYaml + implementation externalDependency.reflections + implementation externalDependency.jsonPatch constraints { implementation(externalDependency.snakeYaml) { because("previous versions are vulnerable to CVE-2022-25857") @@ -19,12 +20,13 @@ dependencies { dataModel project(':li-utils') annotationProcessor externalDependency.lombok - compile externalDependency.mavenArtifact + api externalDependency.mavenArtifact - testCompile project(':test-models') - testCompile externalDependency.testng - testCompile externalDependency.mockito - testCompile externalDependency.mockitoInline + testImplementation project(':test-models') + testImplementation project(path: ':test-models', configuration: 'testDataTemplate') + testImplementation externalDependency.testng + testImplementation externalDependency.mockito + testImplementation externalDependency.mockitoInline } compileTestJava.dependsOn tasks.getByPath(':entity-registry:custom-test-model:modelDeploy') diff --git a/entity-registry/custom-test-model/build.gradle b/entity-registry/custom-test-model/build.gradle index 90f50fe1f2992..778e2e42b95c4 100644 --- a/entity-registry/custom-test-model/build.gradle +++ b/entity-registry/custom-test-model/build.gradle @@ -23,11 +23,11 @@ if (project.hasProperty('projVersion')) { dependencies { - compile spec.product.pegasus.data + implementation spec.product.pegasus.data // Uncomment these if you want to depend on models defined in core datahub - //compile project(':li-utils') + //implementation project(':li-utils') //dataModel project(':li-utils') - //compile project(':metadata-models') + //implementation project(':metadata-models') //dataModel project(':metadata-models') } diff --git a/entity-registry/src/main/java/com/linkedin/metadata/models/SearchableFieldSpecExtractor.java b/entity-registry/src/main/java/com/linkedin/metadata/models/SearchableFieldSpecExtractor.java index 2ffd9283ed456..8f2f42cd69cae 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/models/SearchableFieldSpecExtractor.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/models/SearchableFieldSpecExtractor.java @@ -155,7 +155,8 @@ private void extractSearchableAnnotation(final Object annotationObj, final DataS annotation.getBoostScore(), annotation.getHasValuesFieldName(), annotation.getNumValuesFieldName(), - annotation.getWeightsPerFieldValue()); + 
annotation.getWeightsPerFieldValue(), + annotation.getFieldNameAliases()); } } log.debug("Searchable annotation for field: {} : {}", schemaPathSpec, annotation); diff --git a/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java b/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java index f2e65c771c6eb..d5e5044f95c23 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java @@ -4,7 +4,10 @@ import com.google.common.collect.ImmutableSet; import com.linkedin.data.schema.DataSchema; import com.linkedin.metadata.models.ModelValidationException; + +import java.util.ArrayList; import java.util.Arrays; +import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Set; @@ -19,9 +22,10 @@ @Value public class SearchableAnnotation { + public static final String FIELD_NAME_ALIASES = "fieldNameAliases"; public static final String ANNOTATION_NAME = "Searchable"; private static final Set DEFAULT_QUERY_FIELD_TYPES = - ImmutableSet.of(FieldType.TEXT, FieldType.TEXT_PARTIAL, FieldType.URN, FieldType.URN_PARTIAL); + ImmutableSet.of(FieldType.TEXT, FieldType.TEXT_PARTIAL, FieldType.WORD_GRAM, FieldType.URN, FieldType.URN_PARTIAL); // Name of the field in the search index. Defaults to the field name in the schema String fieldName; @@ -47,6 +51,8 @@ public class SearchableAnnotation { Optional numValuesFieldName; // (Optional) Weights to apply to score for a given value Map weightsPerFieldValue; + // (Optional) Aliases for this given field that can be used for sorting etc. + List fieldNameAliases; public enum FieldType { KEYWORD, @@ -59,7 +65,8 @@ public enum FieldType { COUNT, DATETIME, OBJECT, - BROWSE_PATH_V2 + BROWSE_PATH_V2, + WORD_GRAM } @Nonnull @@ -93,6 +100,7 @@ public static SearchableAnnotation fromPegasusAnnotationObject(@Nonnull final Ob final Optional numValuesFieldName = AnnotationUtils.getField(map, "numValuesFieldName", String.class); final Optional weightsPerFieldValueMap = AnnotationUtils.getField(map, "weightsPerFieldValue", Map.class).map(m -> (Map) m); + final List fieldNameAliases = getFieldNameAliases(map); final FieldType resolvedFieldType = getFieldType(fieldType, schemaDataType); return new SearchableAnnotation( @@ -107,7 +115,8 @@ public static SearchableAnnotation fromPegasusAnnotationObject(@Nonnull final Ob boostScore.orElse(1.0), hasValuesFieldName, numValuesFieldName, - weightsPerFieldValueMap.orElse(ImmutableMap.of())); + weightsPerFieldValueMap.orElse(ImmutableMap.of()), + fieldNameAliases); } private static FieldType getFieldType(Optional maybeFieldType, DataSchema.Type schemaDataType) { @@ -155,4 +164,15 @@ private static String capitalizeFirstLetter(String str) { return str.substring(0, 1).toUpperCase() + str.substring(1); } } + + private static List getFieldNameAliases(Map map) { + final List aliases = new ArrayList<>(); + final Optional fieldNameAliases = AnnotationUtils.getField(map, FIELD_NAME_ALIASES, List.class); + if (fieldNameAliases.isPresent()) { + for (Object alias : fieldNameAliases.get()) { + aliases.add((String) alias); + } + } + return aliases; + } } diff --git a/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java b/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java index 1ab5ff640ce32..3618108970afa 100644 --- 
a/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java +++ b/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java @@ -142,7 +142,7 @@ private void validateTestEntityInfo(final AspectSpec testEntityInfo) { assertEquals(new TestEntityInfo().schema().getFullName(), testEntityInfo.getPegasusSchema().getFullName()); // Assert on Searchable Fields - assertEquals(9, testEntityInfo.getSearchableFieldSpecs().size()); + assertEquals(testEntityInfo.getSearchableFieldSpecs().size(), 10); assertEquals("customProperties", testEntityInfo.getSearchableFieldSpecMap().get( new PathSpec("customProperties").toString()).getSearchableAnnotation().getFieldName()); assertEquals(SearchableAnnotation.FieldType.KEYWORD, testEntityInfo.getSearchableFieldSpecMap().get( @@ -158,6 +158,11 @@ private void validateTestEntityInfo(final AspectSpec testEntityInfo) { assertEquals(SearchableAnnotation.FieldType.TEXT_PARTIAL, testEntityInfo.getSearchableFieldSpecMap().get( new PathSpec("textArrayField", "*").toString()) .getSearchableAnnotation().getFieldType()); + assertEquals("wordGramField", testEntityInfo.getSearchableFieldSpecMap().get( + new PathSpec("wordGramField").toString()).getSearchableAnnotation().getFieldName()); + assertEquals(SearchableAnnotation.FieldType.WORD_GRAM, testEntityInfo.getSearchableFieldSpecMap().get( + new PathSpec("wordGramField").toString()) + .getSearchableAnnotation().getFieldType()); assertEquals("nestedIntegerField", testEntityInfo.getSearchableFieldSpecMap().get( new PathSpec("nestedRecordField", "nestedIntegerField").toString()).getSearchableAnnotation().getFieldName()); assertEquals(SearchableAnnotation.FieldType.COUNT, testEntityInfo.getSearchableFieldSpecMap().get( diff --git a/gradle/docker/docker.gradle b/gradle/docker/docker.gradle index f0bb4a5500b33..db2979a8ff6dc 100644 --- a/gradle/docker/docker.gradle +++ b/gradle/docker/docker.gradle @@ -21,6 +21,7 @@ ext.getDockerContainers = { ext.cleanLocalDockerImages = { String docker_registry, String docker_repo, String docker_tag -> + println("Docker image string: ${docker_registry}/${docker_repo}:${docker_tag}") def containers = getDockerContainers(docker_registry, docker_repo, docker_tag) if(!containers.isEmpty()) { println "Stopping containers: $containers" @@ -35,6 +36,7 @@ ext.cleanLocalDockerImages = { if(!images.isEmpty()) { println "Removing images: $images" exec { + ignoreExitValue true // may not work if used by downstream image commandLine = ["docker", "rmi", "-f"] + images } } diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties index ec991f9aa12cb..98debb84d51de 100644 --- a/gradle/wrapper/gradle-wrapper.properties +++ b/gradle/wrapper/gradle-wrapper.properties @@ -1,5 +1,5 @@ distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-6.9.2-bin.zip +distributionUrl=https\://services.gradle.org/distributions/gradle-7.6.2-bin.zip zipStoreBase=GRADLE_USER_HOME zipStorePath=wrapper/dists diff --git a/ingestion-scheduler/build.gradle b/ingestion-scheduler/build.gradle index b15b5b8c52673..dc9887406b8b4 100644 --- a/ingestion-scheduler/build.gradle +++ b/ingestion-scheduler/build.gradle @@ -1,16 +1,17 @@ apply plugin: 'java' dependencies { - compile project(path: ':metadata-models') - compile project(path: ':metadata-io') - compile project(path: ':metadata-service:restli-client') - compile project(':metadata-service:configuration') + implementation 
project(path: ':metadata-models') + implementation project(path: ':metadata-io') + implementation project(path: ':metadata-service:restli-client') + implementation project(':metadata-service:configuration') + implementation externalDependency.slf4jApi compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok - testCompile externalDependency.mockito - testCompile externalDependency.testng + testImplementation externalDependency.mockito + testImplementation externalDependency.testng constraints { implementation(externalDependency.log4jCore) { diff --git a/li-utils/build.gradle b/li-utils/build.gradle index d11cd86659605..e8b672a3a21fa 100644 --- a/li-utils/build.gradle +++ b/li-utils/build.gradle @@ -1,4 +1,4 @@ -apply plugin: 'java' +apply plugin: 'java-library' apply plugin: 'pegasus' tasks.withType(JavaCompile).configureEach { @@ -13,19 +13,21 @@ tasks.withType(Test).configureEach { } dependencies { - compile spec.product.pegasus.data - compile externalDependency.commonsLang - compile(externalDependency.reflections) { + api spec.product.pegasus.data + implementation externalDependency.commonsLang + implementation(externalDependency.reflections) { exclude group: 'com.google.guava', module: 'guava' } - compile externalDependency.guava + implementation externalDependency.guava implementation externalDependency.slf4jApi compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok - testCompile externalDependency.assertJ - testCompile project(':test-models') + testImplementation externalDependency.assertJ + testImplementation externalDependency.commonsIo + testImplementation project(':test-models') + testImplementation project(path: ':test-models', configuration: 'testDataTemplate') } idea { diff --git a/metadata-auth/auth-api/build.gradle b/metadata-auth/auth-api/build.gradle index f82f488b6f182..2bf9e5243e152 100644 --- a/metadata-auth/auth-api/build.gradle +++ b/metadata-auth/auth-api/build.gradle @@ -3,7 +3,7 @@ plugins { } apply plugin: 'com.github.johnrengelman.shadow' -apply plugin: 'java' +apply plugin: 'java-library' apply plugin: 'signing' apply plugin: 'maven-publish' apply plugin: 'io.codearte.nexus-staging' @@ -28,14 +28,14 @@ shadowJar { dependencies() { implementation spec.product.pegasus.data implementation project(path: ':li-utils') - implementation project(path: ':metadata-utils') + api project(path: ':metadata-utils') - compile externalDependency.guava - compile externalDependency.lombok + implementation externalDependency.guava + implementation externalDependency.lombok annotationProcessor externalDependency.lombok - - testCompile externalDependency.testng + + testImplementation externalDependency.testng } task sourcesJar(type: Jar) { diff --git a/metadata-dao-impl/kafka-producer/build.gradle b/metadata-dao-impl/kafka-producer/build.gradle index 5b40eb5f32232..393b10b0e9d24 100644 --- a/metadata-dao-impl/kafka-producer/build.gradle +++ b/metadata-dao-impl/kafka-producer/build.gradle @@ -1,20 +1,23 @@ apply plugin: 'java' dependencies { - compile project(':metadata-events:mxe-avro-1.7') - compile project(':metadata-events:mxe-registration') - compile project(':metadata-events:mxe-utils-avro-1.7') - compile project(':entity-registry') - compile project(':metadata-io') + implementation project(':metadata-events:mxe-avro-1.7') + implementation project(':metadata-events:mxe-registration') + implementation project(':metadata-events:mxe-utils-avro-1.7') + implementation project(':entity-registry') + implementation 
project(':metadata-io') - compile externalDependency.kafkaClients + implementation externalDependency.kafkaClients + implementation externalDependency.springBeans + implementation externalDependency.springContext + implementation externalDependency.opentelemetryAnnotations implementation externalDependency.slf4jApi compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok - testCompile externalDependency.mockito + testImplementation externalDependency.mockito constraints { implementation(externalDependency.log4jCore) { @@ -23,5 +26,8 @@ dependencies { implementation(externalDependency.log4jApi) { because("previous versions are vulnerable to CVE-2021-45105") } + implementation(externalDependency.snappy) { + because("previous versions are vulnerable to CVE-2023-34453 through CVE-2023-34455") + } } } \ No newline at end of file diff --git a/metadata-events/mxe-avro-1.7/build.gradle b/metadata-events/mxe-avro-1.7/build.gradle index 6bde1511bf280..e30406644913c 100644 --- a/metadata-events/mxe-avro-1.7/build.gradle +++ b/metadata-events/mxe-avro-1.7/build.gradle @@ -3,11 +3,11 @@ configurations { } apply plugin: 'io.acryl.gradle.plugin.avro' -apply plugin: 'java' +apply plugin: 'java-library' dependencies { - compile externalDependency.avro_1_7 - compile(externalDependency.avroCompiler_1_7) { + api externalDependency.avro_1_7 + implementation(externalDependency.avroCompiler_1_7) { exclude group: 'org.apache.velocity', module: 'velocity' } constraints { diff --git a/metadata-events/mxe-registration/build.gradle b/metadata-events/mxe-registration/build.gradle index aa5fad09f3fec..60e0da59616d9 100644 --- a/metadata-events/mxe-registration/build.gradle +++ b/metadata-events/mxe-registration/build.gradle @@ -5,11 +5,12 @@ configurations { } dependencies { - compile project(':metadata-events:mxe-avro-1.7') - compile project(':metadata-models') - compile spec.product.pegasus.dataAvro1_6 + implementation project(':metadata-events:mxe-avro-1.7') + implementation project(':metadata-models') + implementation spec.product.pegasus.dataAvro1_6 - testCompile project(':test-models') + testImplementation project(':test-models') + testImplementation project(path: ':test-models', configuration: 'testDataTemplate') avroOriginal project(path: ':metadata-models', configuration: 'avroSchema') diff --git a/metadata-events/mxe-schemas/build.gradle b/metadata-events/mxe-schemas/build.gradle index 0b3e621b8db15..fe46601fb68b7 100644 --- a/metadata-events/mxe-schemas/build.gradle +++ b/metadata-events/mxe-schemas/build.gradle @@ -11,6 +11,10 @@ task copyMetadataModels(type: Copy) { } generateAvroSchema.dependsOn copyMetadataModels +validateSchemaAnnotation.dependsOn copyMetadataModels +mainTranslateSchemas.dependsOn copyMetadataModels +generateDataTemplate.dependsOn copyMetadataModels +mainCopySchemas.dependsOn copyMetadataModels pegasus.main.generationModes = [PegasusGenerationMode.PEGASUS, PegasusGenerationMode.AVRO] task copyOriginalAvsc(type: Copy, dependsOn: generateAvroSchema) { diff --git a/metadata-events/mxe-utils-avro-1.7/build.gradle b/metadata-events/mxe-utils-avro-1.7/build.gradle index f8474e21daa0b..82249d393578c 100644 --- a/metadata-events/mxe-utils-avro-1.7/build.gradle +++ b/metadata-events/mxe-utils-avro-1.7/build.gradle @@ -1,11 +1,12 @@ -apply plugin: 'java' +apply plugin: 'java-library' dependencies { - compile project(':metadata-events:mxe-avro-1.7') - compile project(':metadata-models') - compile spec.product.pegasus.dataAvro1_6 + api 
project(':metadata-events:mxe-avro-1.7') + api project(':metadata-models') + api spec.product.pegasus.dataAvro1_6 - testCompile project(':test-models') + testImplementation project(':test-models') + testImplementation project(path: ':test-models', configuration: 'testDataTemplate') constraints { implementation(externalDependency.log4jCore) { diff --git a/metadata-ingestion-modules/airflow-plugin/build.gradle b/metadata-ingestion-modules/airflow-plugin/build.gradle index 336be8fc94d44..d1e6f2f646491 100644 --- a/metadata-ingestion-modules/airflow-plugin/build.gradle +++ b/metadata-ingestion-modules/airflow-plugin/build.gradle @@ -7,6 +7,10 @@ ext { venv_name = 'venv' } +if (!project.hasProperty("extra_pip_requirements")) { + ext.extra_pip_requirements = "" +} + def pip_install_command = "${venv_name}/bin/pip install -e ../../metadata-ingestion" task checkPythonVersion(type: Exec) { @@ -14,30 +18,37 @@ task checkPythonVersion(type: Exec) { } task environmentSetup(type: Exec, dependsOn: checkPythonVersion) { + def sentinel_file = "${venv_name}/.venv_environment_sentinel" inputs.file file('setup.py') - outputs.dir("${venv_name}") - commandLine 'bash', '-c', "${python_executable} -m venv ${venv_name} && ${venv_name}/bin/python -m pip install --upgrade pip wheel 'setuptools>=63.0.0'" + outputs.file(sentinel_file) + commandLine 'bash', '-c', + "${python_executable} -m venv ${venv_name} &&" + + "${venv_name}/bin/python -m pip install --upgrade pip wheel 'setuptools>=63.0.0' && " + + "touch ${sentinel_file}" } -task installPackage(type: Exec, dependsOn: environmentSetup) { +task installPackage(type: Exec, dependsOn: [environmentSetup, ':metadata-ingestion:codegen']) { + def sentinel_file = "${venv_name}/.build_install_package_sentinel" inputs.file file('setup.py') - outputs.dir("${venv_name}") + outputs.file(sentinel_file) // Workaround for https://github.com/yaml/pyyaml/issues/601. // See https://github.com/yaml/pyyaml/issues/601#issuecomment-1638509577. // and https://github.com/datahub-project/datahub/pull/8435. commandLine 'bash', '-x', '-c', "${pip_install_command} install 'Cython<3.0' 'PyYAML<6' --no-build-isolation && " + - "${pip_install_command} -e ." + "${pip_install_command} -e . ${extra_pip_requirements} &&" + + "touch ${sentinel_file}" } task install(dependsOn: [installPackage]) task installDev(type: Exec, dependsOn: [install]) { + def sentinel_file = "${venv_name}/.build_install_dev_sentinel" inputs.file file('setup.py') - outputs.dir("${venv_name}") - outputs.file("${venv_name}/.build_install_dev_sentinel") + outputs.file("${sentinel_file}") commandLine 'bash', '-x', '-c', - "${pip_install_command} -e .[dev] && touch ${venv_name}/.build_install_dev_sentinel" + "${pip_install_command} -e .[dev] ${extra_pip_requirements} && " + + "touch ${sentinel_file}" } task lint(type: Exec, dependsOn: installDev) { @@ -45,9 +56,13 @@ task lint(type: Exec, dependsOn: installDev) { The find/sed combo below is a temporary work-around for the following mypy issue with airflow 2.2.0: "venv/lib/python3.8/site-packages/airflow/_vendor/connexion/spec.py:169: error: invalid syntax". 
*/ - commandLine 'bash', '-x', '-c', + commandLine 'bash', '-c', "find ${venv_name}/lib -path *airflow/_vendor/connexion/spec.py -exec sed -i.bak -e '169,169s/ # type: List\\[str\\]//g' {} \\; && " + - "source ${venv_name}/bin/activate && black --check --diff src/ tests/ && isort --check --diff src/ tests/ && flake8 --count --statistics src/ tests/ && mypy src/ tests/" + "source ${venv_name}/bin/activate && set -x && " + + "black --check --diff src/ tests/ && " + + "isort --check --diff src/ tests/ && " + + "flake8 --count --statistics src/ tests/ && " + + "mypy --show-traceback --show-error-codes src/ tests/" } task lintFix(type: Exec, dependsOn: installDev) { commandLine 'bash', '-x', '-c', @@ -58,21 +73,13 @@ task lintFix(type: Exec, dependsOn: installDev) { "mypy src/ tests/ " } -task testQuick(type: Exec, dependsOn: installDev) { - // We can't enforce the coverage requirements if we run a subset of the tests. - inputs.files(project.fileTree(dir: "src/", include: "**/*.py")) - inputs.files(project.fileTree(dir: "tests/")) - outputs.dir("${venv_name}") - commandLine 'bash', '-x', '-c', - "source ${venv_name}/bin/activate && pytest -vv --continue-on-collection-errors --junit-xml=junit.quick.xml" -} - task installDevTest(type: Exec, dependsOn: [installDev]) { + def sentinel_file = "${venv_name}/.build_install_dev_test_sentinel" inputs.file file('setup.py') outputs.dir("${venv_name}") - outputs.file("${venv_name}/.build_install_dev_test_sentinel") + outputs.file("${sentinel_file}") commandLine 'bash', '-x', '-c', - "${pip_install_command} -e .[dev,integration-tests] && touch ${venv_name}/.build_install_dev_test_sentinel" + "${pip_install_command} -e .[dev,integration-tests] && touch ${sentinel_file}" } def testFile = hasProperty('testFile') ? testFile : 'unknown' @@ -89,6 +96,16 @@ task testSingle(dependsOn: [installDevTest]) { } } +task testQuick(type: Exec, dependsOn: installDevTest) { + // We can't enforce the coverage requirements if we run a subset of the tests. 
+ inputs.files(project.fileTree(dir: "src/", include: "**/*.py")) + inputs.files(project.fileTree(dir: "tests/")) + outputs.dir("${venv_name}") + commandLine 'bash', '-x', '-c', + "source ${venv_name}/bin/activate && pytest -vv --continue-on-collection-errors --junit-xml=junit.quick.xml" +} + + task testFull(type: Exec, dependsOn: [testQuick, installDevTest]) { commandLine 'bash', '-x', '-c', "source ${venv_name}/bin/activate && pytest -m 'not slow_integration' -vv --continue-on-collection-errors --junit-xml=junit.full.xml" diff --git a/metadata-ingestion-modules/airflow-plugin/pyproject.toml b/metadata-ingestion-modules/airflow-plugin/pyproject.toml index 83b79e3146176..fba81486b9f67 100644 --- a/metadata-ingestion-modules/airflow-plugin/pyproject.toml +++ b/metadata-ingestion-modules/airflow-plugin/pyproject.toml @@ -9,7 +9,6 @@ extend-exclude = ''' ^/tmp ''' include = '\.pyi?$' -target-version = ['py36', 'py37', 'py38'] [tool.isort] indent = ' ' diff --git a/metadata-ingestion-modules/airflow-plugin/setup.cfg b/metadata-ingestion-modules/airflow-plugin/setup.cfg index c9a2ba93e9933..157bcce1c298d 100644 --- a/metadata-ingestion-modules/airflow-plugin/setup.cfg +++ b/metadata-ingestion-modules/airflow-plugin/setup.cfg @@ -69,4 +69,6 @@ exclude_lines = pragma: no cover @abstract if TYPE_CHECKING: -#omit = +omit = + # omit example dags + src/datahub_airflow_plugin/example_dags/* diff --git a/metadata-ingestion-modules/airflow-plugin/setup.py b/metadata-ingestion-modules/airflow-plugin/setup.py index c2571916ca5d0..c5bdc7ea329cd 100644 --- a/metadata-ingestion-modules/airflow-plugin/setup.py +++ b/metadata-ingestion-modules/airflow-plugin/setup.py @@ -13,16 +13,21 @@ def get_long_description(): return pathlib.Path(os.path.join(root, "README.md")).read_text() +rest_common = {"requests", "requests_file"} + base_requirements = { # Compatibility. "dataclasses>=0.6; python_version < '3.7'", - "typing_extensions>=3.10.0.2", + # Typing extension should be >=3.10.0.2 ideally but we can't restrict due to Airflow 2.0.2 dependency conflict + "typing_extensions>=3.7.4.3 ; python_version < '3.8'", + "typing_extensions>=3.10.0.2,<4.6.0 ; python_version >= '3.8'", "mypy_extensions>=0.4.3", # Actual dependencies. "typing-inspect", "pydantic>=1.5.1", "apache-airflow >= 2.0.2", - f"acryl-datahub[airflow] == {package_metadata['__version__']}", + *rest_common, + f"acryl-datahub == {package_metadata['__version__']}", } @@ -47,19 +52,18 @@ def get_long_description(): base_dev_requirements = { *base_requirements, *mypy_stubs, - "black>=21.12b0", + "black==22.12.0", "coverage>=5.1", "flake8>=3.8.3", "flake8-tidy-imports>=4.3.0", "isort>=5.7.0", - "mypy>=0.920", + "mypy>=1.4.0", # pydantic 1.8.2 is incompatible with mypy 0.910. # See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910. - "pydantic>=1.9.0", + "pydantic>=1.10", "pytest>=6.2.2", "pytest-asyncio>=0.16.0", "pytest-cov>=2.8.1", - "pytest-docker>=0.10.3,<0.12", "tox", "deepdiff", "requests-mock", @@ -127,5 +131,13 @@ def get_long_description(): "datahub-kafka": [ f"acryl-datahub[datahub-kafka] == {package_metadata['__version__']}" ], + "integration-tests": [ + f"acryl-datahub[datahub-kafka] == {package_metadata['__version__']}", + # Extra requirements for Airflow. + "apache-airflow[snowflake]>=2.0.2", # snowflake is used in example dags + # Because of https://github.com/snowflakedb/snowflake-sqlalchemy/issues/350 we need to restrict SQLAlchemy's max version. 
+ "SQLAlchemy<1.4.42", + "virtualenv", # needed by PythonVirtualenvOperator + ], }, ) diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_airflow_compat.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_airflow_compat.py new file mode 100644 index 0000000000000..67c3348ec987c --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_airflow_compat.py @@ -0,0 +1,12 @@ +# This module must be imported before any Airflow imports in any of our files. +# The AIRFLOW_PATCHED just helps avoid flake8 errors. + +from datahub.utilities._markupsafe_compat import MARKUPSAFE_PATCHED + +assert MARKUPSAFE_PATCHED + +AIRFLOW_PATCHED = True + +__all__ = [ + "AIRFLOW_PATCHED", +] diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_airflow_shims.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_airflow_shims.py new file mode 100644 index 0000000000000..5ad20e1f72551 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_airflow_shims.py @@ -0,0 +1,29 @@ +from airflow.models.baseoperator import BaseOperator + +from datahub_airflow_plugin._airflow_compat import AIRFLOW_PATCHED + +try: + from airflow.models.mappedoperator import MappedOperator + from airflow.models.operator import Operator + from airflow.operators.empty import EmptyOperator +except ModuleNotFoundError: + # Operator isn't a real class, but rather a type alias defined + # as the union of BaseOperator and MappedOperator. + # Since older versions of Airflow don't have MappedOperator, we can just use BaseOperator. + Operator = BaseOperator # type: ignore + MappedOperator = None # type: ignore + from airflow.operators.dummy import DummyOperator as EmptyOperator # type: ignore + +try: + from airflow.sensors.external_task import ExternalTaskSensor +except ImportError: + from airflow.sensors.external_task_sensor import ExternalTaskSensor # type: ignore + +assert AIRFLOW_PATCHED + +__all__ = [ + "Operator", + "MappedOperator", + "EmptyOperator", + "ExternalTaskSensor", +] diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_lineage_core.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_lineage_core.py new file mode 100644 index 0000000000000..d91c039ffa718 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_lineage_core.py @@ -0,0 +1,115 @@ +from datetime import datetime +from typing import TYPE_CHECKING, Dict, List + +import datahub.emitter.mce_builder as builder +from datahub.api.entities.dataprocess.dataprocess_instance import InstanceRunResult +from datahub.configuration.common import ConfigModel +from datahub.utilities.urns.dataset_urn import DatasetUrn + +from datahub_airflow_plugin.client.airflow_generator import AirflowGenerator +from datahub_airflow_plugin.entities import _Entity + +if TYPE_CHECKING: + from airflow import DAG + from airflow.models.dagrun import DagRun + from airflow.models.taskinstance import TaskInstance + + from datahub_airflow_plugin._airflow_shims import Operator + from datahub_airflow_plugin.hooks.datahub import DatahubGenericHook + + +def _entities_to_urn_list(iolets: List[_Entity]) -> List[DatasetUrn]: + return [DatasetUrn.create_from_string(let.urn) for let in iolets] + + +class DatahubBasicLineageConfig(ConfigModel): + enabled: bool = True + + # DataHub hook connection ID. + datahub_conn_id: str + + # Cluster to associate with the pipelines and tasks. 
Defaults to "prod". + cluster: str = builder.DEFAULT_FLOW_CLUSTER + + # If true, the owners field of the DAG will be captured as a DataHub corpuser. + capture_ownership_info: bool = True + + # If true, the tags field of the DAG will be captured as DataHub tags. + capture_tags_info: bool = True + + capture_executions: bool = False + + def make_emitter_hook(self) -> "DatahubGenericHook": + # This is necessary to avoid issues with circular imports. + from datahub_airflow_plugin.hooks.datahub import DatahubGenericHook + + return DatahubGenericHook(self.datahub_conn_id) + + +def send_lineage_to_datahub( + config: DatahubBasicLineageConfig, + operator: "Operator", + inlets: List[_Entity], + outlets: List[_Entity], + context: Dict, +) -> None: + if not config.enabled: + return + + dag: "DAG" = context["dag"] + task: "Operator" = context["task"] + ti: "TaskInstance" = context["task_instance"] + + hook = config.make_emitter_hook() + emitter = hook.make_emitter() + + dataflow = AirflowGenerator.generate_dataflow( + cluster=config.cluster, + dag=dag, + capture_tags=config.capture_tags_info, + capture_owner=config.capture_ownership_info, + ) + dataflow.emit(emitter) + operator.log.info(f"Emitted from Lineage: {dataflow}") + + datajob = AirflowGenerator.generate_datajob( + cluster=config.cluster, + task=task, + dag=dag, + capture_tags=config.capture_tags_info, + capture_owner=config.capture_ownership_info, + ) + datajob.inlets.extend(_entities_to_urn_list(inlets)) + datajob.outlets.extend(_entities_to_urn_list(outlets)) + + datajob.emit(emitter) + operator.log.info(f"Emitted from Lineage: {datajob}") + + if config.capture_executions: + dag_run: "DagRun" = context["dag_run"] + + dpi = AirflowGenerator.run_datajob( + emitter=emitter, + cluster=config.cluster, + ti=ti, + dag=dag, + dag_run=dag_run, + datajob=datajob, + emit_templates=False, + ) + + operator.log.info(f"Emitted from Lineage: {dpi}") + + dpi = AirflowGenerator.complete_datajob( + emitter=emitter, + cluster=config.cluster, + ti=ti, + dag=dag, + dag_run=dag_run, + datajob=datajob, + result=InstanceRunResult.SUCCESS, + end_timestamp_millis=int(datetime.utcnow().timestamp() * 1000), + ) + operator.log.info(f"Emitted from Lineage: {dpi}") + + emitter.flush() diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure/__init__.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/client/__init__.py similarity index 100% rename from metadata-ingestion/src/datahub/ingestion/source/azure/__init__.py rename to metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/client/__init__.py diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/client/airflow_generator.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/client/airflow_generator.py new file mode 100644 index 0000000000000..b5e86e14d85d0 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/client/airflow_generator.py @@ -0,0 +1,512 @@ +from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union, cast + +from airflow.configuration import conf +from datahub.api.entities.datajob import DataFlow, DataJob +from datahub.api.entities.dataprocess.dataprocess_instance import ( + DataProcessInstance, + InstanceRunResult, +) +from datahub.metadata.schema_classes import DataProcessTypeClass +from datahub.utilities.urns.data_flow_urn import DataFlowUrn +from datahub.utilities.urns.data_job_urn import DataJobUrn + +from datahub_airflow_plugin._airflow_compat import 
AIRFLOW_PATCHED + +assert AIRFLOW_PATCHED + +if TYPE_CHECKING: + from airflow import DAG + from airflow.models import DagRun, TaskInstance + from datahub.emitter.kafka_emitter import DatahubKafkaEmitter + from datahub.emitter.rest_emitter import DatahubRestEmitter + + from datahub_airflow_plugin._airflow_shims import Operator + + +def _task_downstream_task_ids(operator: "Operator") -> Set[str]: + if hasattr(operator, "downstream_task_ids"): + return operator.downstream_task_ids + return operator._downstream_task_id # type: ignore[attr-defined,union-attr] + + +class AirflowGenerator: + @staticmethod + def _get_dependencies( + task: "Operator", dag: "DAG", flow_urn: DataFlowUrn + ) -> List[DataJobUrn]: + from datahub_airflow_plugin._airflow_shims import ExternalTaskSensor + + # resolve URNs for upstream nodes in subdags upstream of the current task. + upstream_subdag_task_urns: List[DataJobUrn] = [] + + for upstream_task_id in task.upstream_task_ids: + upstream_task = dag.task_dict[upstream_task_id] + + # if upstream task is not a subdag, then skip it + upstream_subdag = getattr(upstream_task, "subdag", None) + if upstream_subdag is None: + continue + + # else, link the leaf tasks of the upstream subdag as upstream tasks + for upstream_subdag_task_id in upstream_subdag.task_dict: + upstream_subdag_task = upstream_subdag.task_dict[ + upstream_subdag_task_id + ] + + upstream_subdag_task_urn = DataJobUrn.create_from_ids( + job_id=upstream_subdag_task_id, data_flow_urn=str(flow_urn) + ) + + # if subdag task is a leaf task, then link it as an upstream task + if len(_task_downstream_task_ids(upstream_subdag_task)) == 0: + upstream_subdag_task_urns.append(upstream_subdag_task_urn) + + # resolve URNs for upstream nodes that trigger the subdag containing the current task. + # (if it is in a subdag at all) + upstream_subdag_triggers: List[DataJobUrn] = [] + + # subdags are always named with 'parent.child' style or Airflow won't run them + # add connection from subdag trigger(s) if subdag task has no upstreams + if ( + dag.is_subdag + and dag.parent_dag is not None + and len(task.upstream_task_ids) == 0 + ): + # filter through the parent dag's tasks and find the subdag trigger(s) + subdags = [ + x for x in dag.parent_dag.task_dict.values() if x.subdag is not None + ] + matched_subdags = [ + x for x in subdags if x.subdag and x.subdag.dag_id == dag.dag_id + ] + + # id of the task containing the subdag + subdag_task_id = matched_subdags[0].task_id + + # iterate through the parent dag's tasks and find the ones that trigger the subdag + for upstream_task_id in dag.parent_dag.task_dict: + upstream_task = dag.parent_dag.task_dict[upstream_task_id] + upstream_task_urn = DataJobUrn.create_from_ids( + data_flow_urn=str(flow_urn), job_id=upstream_task_id + ) + + # if the task triggers the subdag, link it to this node in the subdag + if subdag_task_id in _task_downstream_task_ids(upstream_task): + upstream_subdag_triggers.append(upstream_task_urn) + + # If the operator is an ExternalTaskSensor then we set the remote task as upstream. + # It is possible to tie an external sensor to a DAG if external_task_id is omitted, but currently we can't tie + # a jobflow to another jobflow. 
+ external_task_upstreams = [] + if task.task_type == "ExternalTaskSensor": + task = cast(ExternalTaskSensor, task) + if hasattr(task, "external_task_id") and task.external_task_id is not None: + external_task_upstreams = [ + DataJobUrn.create_from_ids( + job_id=task.external_task_id, + data_flow_urn=str( + DataFlowUrn.create_from_ids( + orchestrator=flow_urn.get_orchestrator_name(), + flow_id=task.external_dag_id, + env=flow_urn.get_env(), + ) + ), + ) + ] + # exclude subdag operator tasks since these are not emitted, resulting in empty metadata + upstream_tasks = ( + [ + DataJobUrn.create_from_ids(job_id=task_id, data_flow_urn=str(flow_urn)) + for task_id in task.upstream_task_ids + if getattr(dag.task_dict[task_id], "subdag", None) is None + ] + + upstream_subdag_task_urns + + upstream_subdag_triggers + + external_task_upstreams + ) + return upstream_tasks + + @staticmethod + def generate_dataflow( + cluster: str, + dag: "DAG", + capture_owner: bool = True, + capture_tags: bool = True, + ) -> DataFlow: + """ + Generates a Dataflow object from an Airflow DAG + :param cluster: str - name of the cluster + :param dag: DAG - + :param capture_tags: + :param capture_owner: + :return: DataFlow - Data generated dataflow + """ + id = dag.dag_id + orchestrator = "airflow" + description = f"{dag.description}\n\n{dag.doc_md or ''}" + data_flow = DataFlow( + env=cluster, id=id, orchestrator=orchestrator, description=description + ) + + flow_property_bag: Dict[str, str] = {} + + allowed_flow_keys = [ + "_access_control", + "_concurrency", + "_default_view", + "catchup", + "fileloc", + "is_paused_upon_creation", + "start_date", + "tags", + "timezone", + ] + + for key in allowed_flow_keys: + if hasattr(dag, key): + flow_property_bag[key] = repr(getattr(dag, key)) + + data_flow.properties = flow_property_bag + base_url = conf.get("webserver", "base_url") + data_flow.url = f"{base_url}/tree?dag_id={dag.dag_id}" + + if capture_owner and dag.owner: + data_flow.owners.add(dag.owner) + + if capture_tags and dag.tags: + data_flow.tags.update(dag.tags) + + return data_flow + + @staticmethod + def _get_description(task: "Operator") -> Optional[str]: + from airflow.models.baseoperator import BaseOperator + + if not isinstance(task, BaseOperator): + # TODO: Get docs for mapped operators. 
+ return None + + if hasattr(task, "doc") and task.doc: + return task.doc + elif hasattr(task, "doc_md") and task.doc_md: + return task.doc_md + elif hasattr(task, "doc_json") and task.doc_json: + return task.doc_json + elif hasattr(task, "doc_yaml") and task.doc_yaml: + return task.doc_yaml + elif hasattr(task, "doc_rst") and task.doc_rst: + return task.doc_rst + return None + + @staticmethod + def generate_datajob( + cluster: str, + task: "Operator", + dag: "DAG", + set_dependencies: bool = True, + capture_owner: bool = True, + capture_tags: bool = True, + ) -> DataJob: + """ + + :param cluster: str + :param task: TaskInstance + :param dag: DAG + :param set_dependencies: bool - whether to extract dependencies from airflow task + :param capture_owner: bool - whether to extract owner from airflow task + :param capture_tags: bool - whether to set tags automatically from airflow task + :return: DataJob - returns the generated DataJob object + """ + dataflow_urn = DataFlowUrn.create_from_ids( + orchestrator="airflow", env=cluster, flow_id=dag.dag_id + ) + datajob = DataJob(id=task.task_id, flow_urn=dataflow_urn) + + # TODO add support for MappedOperator + datajob.description = AirflowGenerator._get_description(task) + + job_property_bag: Dict[str, str] = {} + + allowed_task_keys = [ + "_downstream_task_ids", + "_inlets", + "_outlets", + "_task_type", + "_task_module", + "depends_on_past", + "email", + "label", + "execution_timeout", + "sla", + "sql", + "task_id", + "trigger_rule", + "wait_for_downstream", + # In Airflow 2.3, _downstream_task_ids was renamed to downstream_task_ids + "downstream_task_ids", + # In Airflow 2.4, _inlets and _outlets were removed in favor of non-private versions. + "inlets", + "outlets", + ] + + for key in allowed_task_keys: + if hasattr(task, key): + job_property_bag[key] = repr(getattr(task, key)) + + datajob.properties = job_property_bag + base_url = conf.get("webserver", "base_url") + datajob.url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={datajob.flow_urn.get_flow_id()}&_flt_3_task_id={task.task_id}" + + if capture_owner and dag.owner: + datajob.owners.add(dag.owner) + + if capture_tags and dag.tags: + datajob.tags.update(dag.tags) + + if set_dependencies: + datajob.upstream_urns.extend( + AirflowGenerator._get_dependencies( + task=task, dag=dag, flow_urn=datajob.flow_urn + ) + ) + + return datajob + + @staticmethod + def create_datajob_instance( + cluster: str, + task: "Operator", + dag: "DAG", + data_job: Optional[DataJob] = None, + ) -> DataProcessInstance: + if data_job is None: + data_job = AirflowGenerator.generate_datajob(cluster, task=task, dag=dag) + dpi = DataProcessInstance.from_datajob( + datajob=data_job, id=task.task_id, clone_inlets=True, clone_outlets=True + ) + return dpi + + @staticmethod + def run_dataflow( + emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], + cluster: str, + dag_run: "DagRun", + start_timestamp_millis: Optional[int] = None, + dataflow: Optional[DataFlow] = None, + ) -> None: + if dataflow is None: + assert dag_run.dag + dataflow = AirflowGenerator.generate_dataflow(cluster, dag_run.dag) + + if start_timestamp_millis is None: + assert dag_run.execution_date + start_timestamp_millis = int(dag_run.execution_date.timestamp() * 1000) + + assert dag_run.run_id + dpi = DataProcessInstance.from_dataflow(dataflow=dataflow, id=dag_run.run_id) + + # This property only exists in Airflow2 + if hasattr(dag_run, "run_type"): + from airflow.utils.types import DagRunType + + if dag_run.run_type == DagRunType.SCHEDULED: 
+ dpi.type = DataProcessTypeClass.BATCH_SCHEDULED + elif dag_run.run_type == DagRunType.MANUAL: + dpi.type = DataProcessTypeClass.BATCH_AD_HOC + else: + if dag_run.run_id.startswith("scheduled__"): + dpi.type = DataProcessTypeClass.BATCH_SCHEDULED + else: + dpi.type = DataProcessTypeClass.BATCH_AD_HOC + + property_bag: Dict[str, str] = {} + property_bag["run_id"] = str(dag_run.run_id) + property_bag["execution_date"] = str(dag_run.execution_date) + property_bag["end_date"] = str(dag_run.end_date) + property_bag["start_date"] = str(dag_run.start_date) + property_bag["creating_job_id"] = str(dag_run.creating_job_id) + # These properties only exists in Airflow>=2.2.0 + if hasattr(dag_run, "data_interval_start") and hasattr( + dag_run, "data_interval_end" + ): + property_bag["data_interval_start"] = str(dag_run.data_interval_start) + property_bag["data_interval_end"] = str(dag_run.data_interval_end) + property_bag["external_trigger"] = str(dag_run.external_trigger) + dpi.properties.update(property_bag) + + dpi.emit_process_start( + emitter=emitter, start_timestamp_millis=start_timestamp_millis + ) + + @staticmethod + def complete_dataflow( + emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], + cluster: str, + dag_run: "DagRun", + end_timestamp_millis: Optional[int] = None, + dataflow: Optional[DataFlow] = None, + ) -> None: + """ + + :param emitter: DatahubRestEmitter - the datahub rest emitter to emit the generated mcps + :param cluster: str - name of the cluster + :param dag_run: DagRun + :param end_timestamp_millis: Optional[int] - the completion time in milliseconds if not set the current time will be used. + :param dataflow: Optional[Dataflow] + """ + if dataflow is None: + assert dag_run.dag + dataflow = AirflowGenerator.generate_dataflow(cluster, dag_run.dag) + + assert dag_run.run_id + dpi = DataProcessInstance.from_dataflow(dataflow=dataflow, id=dag_run.run_id) + if end_timestamp_millis is None: + if dag_run.end_date is None: + raise Exception( + f"Dag {dag_run.dag_id}_{dag_run.run_id} is still running and unable to get end_date..." 
+ ) + end_timestamp_millis = int(dag_run.end_date.timestamp() * 1000) + + # We should use DagRunState but it is not available in Airflow 1 + if dag_run.state == "success": + result = InstanceRunResult.SUCCESS + elif dag_run.state == "failed": + result = InstanceRunResult.FAILURE + else: + raise Exception( + f"Result should be either success or failure and it was {dag_run.state}" + ) + + dpi.emit_process_end( + emitter=emitter, + end_timestamp_millis=end_timestamp_millis, + result=result, + result_type="airflow", + ) + + @staticmethod + def run_datajob( + emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], + cluster: str, + ti: "TaskInstance", + dag: "DAG", + dag_run: "DagRun", + start_timestamp_millis: Optional[int] = None, + datajob: Optional[DataJob] = None, + attempt: Optional[int] = None, + emit_templates: bool = True, + ) -> DataProcessInstance: + if datajob is None: + datajob = AirflowGenerator.generate_datajob(cluster, ti.task, dag) + + assert dag_run.run_id + dpi = DataProcessInstance.from_datajob( + datajob=datajob, + id=f"{dag.dag_id}_{ti.task_id}_{dag_run.run_id}", + clone_inlets=True, + clone_outlets=True, + ) + job_property_bag: Dict[str, str] = {} + job_property_bag["run_id"] = str(dag_run.run_id) + job_property_bag["duration"] = str(ti.duration) + job_property_bag["start_date"] = str(ti.start_date) + job_property_bag["end_date"] = str(ti.end_date) + job_property_bag["execution_date"] = str(ti.execution_date) + job_property_bag["try_number"] = str(ti.try_number - 1) + job_property_bag["hostname"] = str(ti.hostname) + job_property_bag["max_tries"] = str(ti.max_tries) + # Not compatible with Airflow 1 + if hasattr(ti, "external_executor_id"): + job_property_bag["external_executor_id"] = str(ti.external_executor_id) + job_property_bag["pid"] = str(ti.pid) + job_property_bag["state"] = str(ti.state) + job_property_bag["operator"] = str(ti.operator) + job_property_bag["priority_weight"] = str(ti.priority_weight) + job_property_bag["unixname"] = str(ti.unixname) + job_property_bag["log_url"] = ti.log_url + dpi.properties.update(job_property_bag) + dpi.url = ti.log_url + + # This property only exists in Airflow2 + if hasattr(ti, "dag_run") and hasattr(ti.dag_run, "run_type"): + from airflow.utils.types import DagRunType + + if ti.dag_run.run_type == DagRunType.SCHEDULED: + dpi.type = DataProcessTypeClass.BATCH_SCHEDULED + elif ti.dag_run.run_type == DagRunType.MANUAL: + dpi.type = DataProcessTypeClass.BATCH_AD_HOC + else: + if dag_run.run_id.startswith("scheduled__"): + dpi.type = DataProcessTypeClass.BATCH_SCHEDULED + else: + dpi.type = DataProcessTypeClass.BATCH_AD_HOC + + if start_timestamp_millis is None: + assert ti.start_date + start_timestamp_millis = int(ti.start_date.timestamp() * 1000) + + if attempt is None: + attempt = ti.try_number + + dpi.emit_process_start( + emitter=emitter, + start_timestamp_millis=start_timestamp_millis, + attempt=attempt, + emit_template=emit_templates, + ) + return dpi + + @staticmethod + def complete_datajob( + emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], + cluster: str, + ti: "TaskInstance", + dag: "DAG", + dag_run: "DagRun", + end_timestamp_millis: Optional[int] = None, + result: Optional[InstanceRunResult] = None, + datajob: Optional[DataJob] = None, + ) -> DataProcessInstance: + """ + + :param emitter: DatahubRestEmitter + :param cluster: str + :param ti: TaskInstance + :param dag: DAG + :param dag_run: DagRun + :param end_timestamp_millis: Optional[int] + :param result: Optional[str] One of the result from 
datahub.metadata.schema_class.RunResultTypeClass + :param datajob: Optional[DataJob] + :return: DataProcessInstance + """ + if datajob is None: + datajob = AirflowGenerator.generate_datajob(cluster, ti.task, dag) + + if end_timestamp_millis is None: + assert ti.end_date + end_timestamp_millis = int(ti.end_date.timestamp() * 1000) + + if result is None: + # We should use TaskInstanceState but it is not available in Airflow 1 + if ti.state == "success": + result = InstanceRunResult.SUCCESS + elif ti.state == "failed": + result = InstanceRunResult.FAILURE + else: + raise Exception( + f"Result should be either success or failure and it was {ti.state}" + ) + + dpi = DataProcessInstance.from_datajob( + datajob=datajob, + id=f"{dag.dag_id}_{ti.task_id}_{dag_run.run_id}", + clone_inlets=True, + clone_outlets=True, + ) + dpi.emit_process_end( + emitter=emitter, + end_timestamp_millis=end_timestamp_millis, + result=result, + result_type="airflow", + ) + return dpi diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin.py index 226a7382f7595..d1cec9e5c1b54 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin.py @@ -1,4 +1,367 @@ -# This package serves as a shim, but the actual implementation lives in datahub_provider -# from the acryl-datahub package. We leave this shim here to avoid breaking existing -# Airflow installs. -from datahub_provider._plugin import DatahubPlugin # noqa: F401 +import contextlib +import logging +import traceback +from typing import Any, Callable, Iterable, List, Optional, Union + +from airflow.configuration import conf +from airflow.lineage import PIPELINE_OUTLETS +from airflow.models.baseoperator import BaseOperator +from airflow.plugins_manager import AirflowPlugin +from airflow.utils.module_loading import import_string +from cattr import structure +from datahub.api.entities.dataprocess.dataprocess_instance import InstanceRunResult + +from datahub_airflow_plugin._airflow_compat import AIRFLOW_PATCHED +from datahub_airflow_plugin._airflow_shims import MappedOperator, Operator +from datahub_airflow_plugin.client.airflow_generator import AirflowGenerator +from datahub_airflow_plugin.hooks.datahub import DatahubGenericHook +from datahub_airflow_plugin.lineage.datahub import DatahubLineageConfig + +assert AIRFLOW_PATCHED +logger = logging.getLogger(__name__) + +TASK_ON_FAILURE_CALLBACK = "on_failure_callback" +TASK_ON_SUCCESS_CALLBACK = "on_success_callback" + + +def get_lineage_config() -> DatahubLineageConfig: + """Load the lineage config from airflow.cfg.""" + + enabled = conf.get("datahub", "enabled", fallback=True) + datahub_conn_id = conf.get("datahub", "conn_id", fallback="datahub_rest_default") + cluster = conf.get("datahub", "cluster", fallback="prod") + graceful_exceptions = conf.get("datahub", "graceful_exceptions", fallback=True) + capture_tags_info = conf.get("datahub", "capture_tags_info", fallback=True) + capture_ownership_info = conf.get( + "datahub", "capture_ownership_info", fallback=True + ) + capture_executions = conf.get("datahub", "capture_executions", fallback=True) + return DatahubLineageConfig( + enabled=enabled, + datahub_conn_id=datahub_conn_id, + cluster=cluster, + graceful_exceptions=graceful_exceptions, + capture_ownership_info=capture_ownership_info, + 
capture_tags_info=capture_tags_info, + capture_executions=capture_executions, + ) + + +def _task_inlets(operator: "Operator") -> List: + # From Airflow 2.4 _inlets is dropped and inlets used consistently. Earlier it was not the case, so we have to stick there to _inlets + if hasattr(operator, "_inlets"): + return operator._inlets # type: ignore[attr-defined, union-attr] + return operator.inlets + + +def _task_outlets(operator: "Operator") -> List: + # From Airflow 2.4 _outlets is dropped and inlets used consistently. Earlier it was not the case, so we have to stick there to _outlets + # We have to use _outlets because outlets is empty in Airflow < 2.4.0 + if hasattr(operator, "_outlets"): + return operator._outlets # type: ignore[attr-defined, union-attr] + return operator.outlets + + +def get_inlets_from_task(task: BaseOperator, context: Any) -> Iterable[Any]: + # TODO: Fix for https://github.com/apache/airflow/commit/1b1f3fabc5909a447a6277cafef3a0d4ef1f01ae + # in Airflow 2.4. + # TODO: ignore/handle airflow's dataset type in our lineage + + inlets: List[Any] = [] + task_inlets = _task_inlets(task) + # From Airflow 2.3 this should be AbstractOperator but due to compatibility reason lets use BaseOperator + if isinstance(task_inlets, (str, BaseOperator)): + inlets = [ + task_inlets, + ] + + if task_inlets and isinstance(task_inlets, list): + inlets = [] + task_ids = ( + {o for o in task_inlets if isinstance(o, str)} + .union(op.task_id for op in task_inlets if isinstance(op, BaseOperator)) + .intersection(task.get_flat_relative_ids(upstream=True)) + ) + + from airflow.lineage import AUTO + + # pick up unique direct upstream task_ids if AUTO is specified + if AUTO.upper() in task_inlets or AUTO.lower() in task_inlets: + print("Picking up unique direct upstream task_ids as AUTO is specified") + task_ids = task_ids.union( + task_ids.symmetric_difference(task.upstream_task_ids) + ) + + inlets = task.xcom_pull( + context, task_ids=list(task_ids), dag_id=task.dag_id, key=PIPELINE_OUTLETS + ) + + # re-instantiate the obtained inlets + inlets = [ + structure(item["data"], import_string(item["type_name"])) + # _get_instance(structure(item, Metadata)) + for sublist in inlets + if sublist + for item in sublist + ] + + for inlet in task_inlets: + if not isinstance(inlet, str): + inlets.append(inlet) + + return inlets + + +def _make_emit_callback( + logger: logging.Logger, +) -> Callable[[Optional[Exception], str], None]: + def emit_callback(err: Optional[Exception], msg: str) -> None: + if err: + logger.error(f"Error sending metadata to datahub: {msg}", exc_info=err) + + return emit_callback + + +def datahub_task_status_callback(context, status): + ti = context["ti"] + task: "BaseOperator" = ti.task + dag = context["dag"] + + # This code is from the original airflow lineage code -> + # https://github.com/apache/airflow/blob/main/airflow/lineage/__init__.py + inlets = get_inlets_from_task(task, context) + + emitter = ( + DatahubGenericHook(context["_datahub_config"].datahub_conn_id) + .get_underlying_hook() + .make_emitter() + ) + + dataflow = AirflowGenerator.generate_dataflow( + cluster=context["_datahub_config"].cluster, + dag=dag, + capture_tags=context["_datahub_config"].capture_tags_info, + capture_owner=context["_datahub_config"].capture_ownership_info, + ) + task.log.info(f"Emitting Datahub Dataflow: {dataflow}") + dataflow.emit(emitter, callback=_make_emit_callback(task.log)) + + datajob = AirflowGenerator.generate_datajob( + cluster=context["_datahub_config"].cluster, + task=task, + dag=dag, 
+ capture_tags=context["_datahub_config"].capture_tags_info, + capture_owner=context["_datahub_config"].capture_ownership_info, + ) + + for inlet in inlets: + datajob.inlets.append(inlet.urn) + + task_outlets = _task_outlets(task) + for outlet in task_outlets: + datajob.outlets.append(outlet.urn) + + task.log.info(f"Emitting Datahub Datajob: {datajob}") + datajob.emit(emitter, callback=_make_emit_callback(task.log)) + + if context["_datahub_config"].capture_executions: + dpi = AirflowGenerator.run_datajob( + emitter=emitter, + cluster=context["_datahub_config"].cluster, + ti=context["ti"], + dag=dag, + dag_run=context["dag_run"], + datajob=datajob, + start_timestamp_millis=int(ti.start_date.timestamp() * 1000), + ) + + task.log.info(f"Emitted Start Datahub Dataprocess Instance: {dpi}") + + dpi = AirflowGenerator.complete_datajob( + emitter=emitter, + cluster=context["_datahub_config"].cluster, + ti=context["ti"], + dag_run=context["dag_run"], + result=status, + dag=dag, + datajob=datajob, + end_timestamp_millis=int(ti.end_date.timestamp() * 1000), + ) + task.log.info(f"Emitted Completed Data Process Instance: {dpi}") + + emitter.flush() + + +def datahub_pre_execution(context): + ti = context["ti"] + task: "BaseOperator" = ti.task + dag = context["dag"] + + task.log.info("Running Datahub pre_execute method") + + emitter = ( + DatahubGenericHook(context["_datahub_config"].datahub_conn_id) + .get_underlying_hook() + .make_emitter() + ) + + # This code is from the original airflow lineage code -> + # https://github.com/apache/airflow/blob/main/airflow/lineage/__init__.py + inlets = get_inlets_from_task(task, context) + + datajob = AirflowGenerator.generate_datajob( + cluster=context["_datahub_config"].cluster, + task=context["ti"].task, + dag=dag, + capture_tags=context["_datahub_config"].capture_tags_info, + capture_owner=context["_datahub_config"].capture_ownership_info, + ) + + for inlet in inlets: + datajob.inlets.append(inlet.urn) + + task_outlets = _task_outlets(task) + + for outlet in task_outlets: + datajob.outlets.append(outlet.urn) + + task.log.info(f"Emitting Datahub dataJob {datajob}") + datajob.emit(emitter, callback=_make_emit_callback(task.log)) + + if context["_datahub_config"].capture_executions: + dpi = AirflowGenerator.run_datajob( + emitter=emitter, + cluster=context["_datahub_config"].cluster, + ti=context["ti"], + dag=dag, + dag_run=context["dag_run"], + datajob=datajob, + start_timestamp_millis=int(ti.start_date.timestamp() * 1000), + ) + + task.log.info(f"Emitting Datahub Dataprocess Instance: {dpi}") + + emitter.flush() + + +def _wrap_pre_execution(pre_execution): + def custom_pre_execution(context): + config = get_lineage_config() + if config.enabled: + context["_datahub_config"] = config + datahub_pre_execution(context) + + # Call original policy + if pre_execution: + pre_execution(context) + + return custom_pre_execution + + +def _wrap_on_failure_callback(on_failure_callback): + def custom_on_failure_callback(context): + config = get_lineage_config() + if config.enabled: + context["_datahub_config"] = config + try: + datahub_task_status_callback(context, status=InstanceRunResult.FAILURE) + except Exception as e: + if not config.graceful_exceptions: + raise e + else: + print(f"Exception: {traceback.format_exc()}") + + # Call original policy + if on_failure_callback: + on_failure_callback(context) + + return custom_on_failure_callback + + +def _wrap_on_success_callback(on_success_callback): + def custom_on_success_callback(context): + config = get_lineage_config() + 
if config.enabled: + context["_datahub_config"] = config + try: + datahub_task_status_callback(context, status=InstanceRunResult.SUCCESS) + except Exception as e: + if not config.graceful_exceptions: + raise e + else: + print(f"Exception: {traceback.format_exc()}") + + # Call original policy + if on_success_callback: + on_success_callback(context) + + return custom_on_success_callback + + +def task_policy(task: Union[BaseOperator, MappedOperator]) -> None: + task.log.debug(f"Setting task policy for Dag: {task.dag_id} Task: {task.task_id}") + # task.add_inlets(["auto"]) + # task.pre_execute = _wrap_pre_execution(task.pre_execute) + + # MappedOperator's callbacks don't have setters until Airflow 2.X.X + # https://github.com/apache/airflow/issues/24547 + # We can bypass this by going through partial_kwargs for now + if MappedOperator and isinstance(task, MappedOperator): # type: ignore + on_failure_callback_prop: property = getattr( + MappedOperator, TASK_ON_FAILURE_CALLBACK + ) + on_success_callback_prop: property = getattr( + MappedOperator, TASK_ON_SUCCESS_CALLBACK + ) + if not on_failure_callback_prop.fset or not on_success_callback_prop.fset: + task.log.debug( + "Using MappedOperator's partial_kwargs instead of callback properties" + ) + task.partial_kwargs[TASK_ON_FAILURE_CALLBACK] = _wrap_on_failure_callback( + task.on_failure_callback + ) + task.partial_kwargs[TASK_ON_SUCCESS_CALLBACK] = _wrap_on_success_callback( + task.on_success_callback + ) + return + + task.on_failure_callback = _wrap_on_failure_callback(task.on_failure_callback) # type: ignore + task.on_success_callback = _wrap_on_success_callback(task.on_success_callback) # type: ignore + # task.pre_execute = _wrap_pre_execution(task.pre_execute) + + +def _wrap_task_policy(policy): + if policy and hasattr(policy, "_task_policy_patched_by"): + return policy + + def custom_task_policy(task): + policy(task) + task_policy(task) + + # Add a flag to the policy to indicate that we've patched it. 
+ custom_task_policy._task_policy_patched_by = "datahub_plugin" # type: ignore[attr-defined] + return custom_task_policy + + +def _patch_policy(settings): + if hasattr(settings, "task_policy"): + datahub_task_policy = _wrap_task_policy(settings.task_policy) + settings.task_policy = datahub_task_policy + + +def _patch_datahub_policy(): + with contextlib.suppress(ImportError): + import airflow_local_settings + + _patch_policy(airflow_local_settings) + + from airflow.models.dagbag import settings + + _patch_policy(settings) + + +_patch_datahub_policy() + + +class DatahubPlugin(AirflowPlugin): + name = "datahub_plugin" diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/entities.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/entities.py new file mode 100644 index 0000000000000..69f667cad3241 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/entities.py @@ -0,0 +1,47 @@ +from abc import abstractmethod +from typing import Optional + +import attr +import datahub.emitter.mce_builder as builder +from datahub.utilities.urns.urn import guess_entity_type + + +class _Entity: + @property + @abstractmethod + def urn(self) -> str: + pass + + +@attr.s(auto_attribs=True, str=True) +class Dataset(_Entity): + platform: str + name: str + env: str = builder.DEFAULT_ENV + platform_instance: Optional[str] = None + + @property + def urn(self): + return builder.make_dataset_urn_with_platform_instance( + platform=self.platform, + name=self.name, + platform_instance=self.platform_instance, + env=self.env, + ) + + +@attr.s(str=True) +class Urn(_Entity): + _urn: str = attr.ib() + + @_urn.validator + def _validate_urn(self, attribute, value): + if not value.startswith("urn:"): + raise ValueError("invalid urn provided: urns must start with 'urn:'") + if guess_entity_type(value) != "dataset": + # This is because DataJobs only support Dataset lineage. 
+ raise ValueError("Airflow lineage currently only supports datasets") + + @property + def urn(self): + return self._urn diff --git a/metadata-ingestion/src/datahub_provider/example_dags/.airflowignore b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/.airflowignore similarity index 100% rename from metadata-ingestion/src/datahub_provider/example_dags/.airflowignore rename to metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/.airflowignore diff --git a/metadata-ingestion/src/datahub/ingestion/source_report/sql/__init__.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/__init__.py similarity index 100% rename from metadata-ingestion/src/datahub/ingestion/source_report/sql/__init__.py rename to metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/__init__.py diff --git a/metadata-ingestion/src/datahub_provider/example_dags/generic_recipe_sample_dag.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/generic_recipe_sample_dag.py similarity index 98% rename from metadata-ingestion/src/datahub_provider/example_dags/generic_recipe_sample_dag.py rename to metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/generic_recipe_sample_dag.py index d0e4aa944e840..ff8dba457066f 100644 --- a/metadata-ingestion/src/datahub_provider/example_dags/generic_recipe_sample_dag.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/generic_recipe_sample_dag.py @@ -9,7 +9,6 @@ from airflow import DAG from airflow.operators.python import PythonOperator from airflow.utils.dates import days_ago - from datahub.configuration.config_loader import load_config_file from datahub.ingestion.run.pipeline import Pipeline @@ -41,6 +40,7 @@ def datahub_recipe(): schedule_interval=timedelta(days=1), start_date=days_ago(2), catchup=False, + default_view="tree", ) as dag: ingest_task = PythonOperator( task_id="ingest_using_recipe", diff --git a/metadata-ingestion/src/datahub_provider/example_dags/lineage_backend_demo.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_demo.py similarity index 94% rename from metadata-ingestion/src/datahub_provider/example_dags/lineage_backend_demo.py rename to metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_demo.py index 95b594e4052a5..3caea093b932d 100644 --- a/metadata-ingestion/src/datahub_provider/example_dags/lineage_backend_demo.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_demo.py @@ -9,7 +9,7 @@ from airflow.operators.bash import BashOperator from airflow.utils.dates import days_ago -from datahub_provider.entities import Dataset, Urn +from datahub_airflow_plugin.entities import Dataset, Urn default_args = { "owner": "airflow", @@ -28,6 +28,7 @@ start_date=days_ago(2), tags=["example_tag"], catchup=False, + default_view="tree", ) as dag: task1 = BashOperator( task_id="run_data_task", diff --git a/metadata-ingestion/src/datahub_provider/example_dags/lineage_backend_taskflow_demo.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_taskflow_demo.py similarity index 94% rename from metadata-ingestion/src/datahub_provider/example_dags/lineage_backend_taskflow_demo.py rename to 
metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_taskflow_demo.py index 1fe321eb5c80a..ceb0f452b540a 100644 --- a/metadata-ingestion/src/datahub_provider/example_dags/lineage_backend_taskflow_demo.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_taskflow_demo.py @@ -8,7 +8,7 @@ from airflow.decorators import dag, task from airflow.utils.dates import days_ago -from datahub_provider.entities import Dataset, Urn +from datahub_airflow_plugin.entities import Dataset, Urn default_args = { "owner": "airflow", @@ -26,6 +26,7 @@ start_date=days_ago(2), tags=["example_tag"], catchup=False, + default_view="tree", ) def datahub_lineage_backend_taskflow_demo(): @task( diff --git a/metadata-ingestion/src/datahub_provider/example_dags/lineage_emission_dag.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_emission_dag.py similarity index 96% rename from metadata-ingestion/src/datahub_provider/example_dags/lineage_emission_dag.py rename to metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_emission_dag.py index 153464246cef7..f40295c6bb883 100644 --- a/metadata-ingestion/src/datahub_provider/example_dags/lineage_emission_dag.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_emission_dag.py @@ -5,12 +5,12 @@ from datetime import timedelta +import datahub.emitter.mce_builder as builder from airflow import DAG from airflow.providers.snowflake.operators.snowflake import SnowflakeOperator from airflow.utils.dates import days_ago -import datahub.emitter.mce_builder as builder -from datahub_provider.operators.datahub import DatahubEmitterOperator +from datahub_airflow_plugin.operators.datahub import DatahubEmitterOperator default_args = { "owner": "airflow", @@ -31,6 +31,7 @@ schedule_interval=timedelta(days=1), start_date=days_ago(2), catchup=False, + default_view="tree", ) as dag: # This example shows a SnowflakeOperator followed by a lineage emission. However, the # same DatahubEmitterOperator can be used to emit lineage in any context. 
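For orientation, the example DAGs being relocated here all follow the same pattern: ordinary Airflow operators annotated with the plugin's lineage entities (Dataset and Urn from datahub_airflow_plugin.entities, added earlier in this patch). A minimal sketch of that pattern, using hypothetical DAG, task, and table names:

from datetime import datetime

from airflow import DAG
from airflow.operators.bash import BashOperator

from datahub_airflow_plugin.entities import Dataset, Urn

# Hypothetical names, for illustration only.
with DAG(
    "lineage_entities_demo",
    start_date=datetime(2023, 1, 1),
    schedule_interval=None,
    catchup=False,
) as dag:
    transform = BashOperator(
        task_id="transform_orders",
        bash_command="echo transform",
        # The lineage backend (or the plugin's task policy wrapper) reads these
        # inlets/outlets and emits them as dataset lineage on the corresponding DataJob.
        inlets=[Dataset("snowflake", "mydb.schema.raw_orders")],
        outlets=[
            Dataset("snowflake", "mydb.schema.clean_orders"),
            Urn("urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.orders_agg,PROD)"),
        ],
    )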
diff --git a/metadata-ingestion/src/datahub_provider/example_dags/mysql_sample_dag.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/mysql_sample_dag.py similarity index 98% rename from metadata-ingestion/src/datahub_provider/example_dags/mysql_sample_dag.py rename to metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/mysql_sample_dag.py index 2c833e1425634..77b29711d7688 100644 --- a/metadata-ingestion/src/datahub_provider/example_dags/mysql_sample_dag.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/mysql_sample_dag.py @@ -47,6 +47,7 @@ def ingest_from_mysql(): start_date=datetime(2022, 1, 1), schedule_interval=timedelta(days=1), catchup=False, + default_view="tree", ) as dag: # While it is also possible to use the PythonOperator, we recommend using # the PythonVirtualenvOperator to ensure that there are no dependency diff --git a/metadata-ingestion/src/datahub_provider/example_dags/snowflake_sample_dag.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/snowflake_sample_dag.py similarity index 99% rename from metadata-ingestion/src/datahub_provider/example_dags/snowflake_sample_dag.py rename to metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/snowflake_sample_dag.py index c107bb479262c..30e63b68e459f 100644 --- a/metadata-ingestion/src/datahub_provider/example_dags/snowflake_sample_dag.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/snowflake_sample_dag.py @@ -57,6 +57,7 @@ def ingest_from_snowflake(snowflake_credentials, datahub_gms_server): start_date=datetime(2022, 1, 1), schedule_interval=timedelta(days=1), catchup=False, + default_view="tree", ) as dag: # This example pulls credentials from Airflow's connection store. # For this to work, you must have previously configured these connections in Airflow. diff --git a/metadata-ingestion/src/datahub/ingestion/source_report/usage/__init__.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/hooks/__init__.py similarity index 100% rename from metadata-ingestion/src/datahub/ingestion/source_report/usage/__init__.py rename to metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/hooks/__init__.py diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/hooks/datahub.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/hooks/datahub.py new file mode 100644 index 0000000000000..aed858c6c4df0 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/hooks/datahub.py @@ -0,0 +1,214 @@ +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union + +from airflow.exceptions import AirflowException +from airflow.hooks.base import BaseHook +from datahub.metadata.com.linkedin.pegasus2avro.mxe import ( + MetadataChangeEvent, + MetadataChangeProposal, +) + +if TYPE_CHECKING: + from airflow.models.connection import Connection + from datahub.emitter.kafka_emitter import DatahubKafkaEmitter + from datahub.emitter.rest_emitter import DatahubRestEmitter + from datahub.ingestion.sink.datahub_kafka import KafkaSinkConfig + + +class DatahubRestHook(BaseHook): + """ + Creates a DataHub Rest API connection used to send metadata to DataHub. + Takes the endpoint for your DataHub Rest API in the Server Endpoint(host) field. 
+ + URI example: :: + + AIRFLOW_CONN_DATAHUB_REST_DEFAULT='datahub-rest://rest-endpoint' + + :param datahub_rest_conn_id: Reference to the DataHub Rest connection. + :type datahub_rest_conn_id: str + """ + + conn_name_attr = "datahub_rest_conn_id" + default_conn_name = "datahub_rest_default" + conn_type = "datahub_rest" + hook_name = "DataHub REST Server" + + def __init__(self, datahub_rest_conn_id: str = default_conn_name) -> None: + super().__init__() + self.datahub_rest_conn_id = datahub_rest_conn_id + + @staticmethod + def get_connection_form_widgets() -> Dict[str, Any]: + return {} + + @staticmethod + def get_ui_field_behaviour() -> Dict: + """Returns custom field behavior""" + return { + "hidden_fields": ["port", "schema", "login"], + "relabeling": { + "host": "Server Endpoint", + }, + } + + def _get_config(self) -> Tuple[str, Optional[str], Optional[int]]: + conn: "Connection" = self.get_connection(self.datahub_rest_conn_id) + + host = conn.host + if not host: + raise AirflowException("host parameter is required") + if conn.port: + if ":" in host: + raise AirflowException( + "host parameter should not contain a port number if the port is specified separately" + ) + host = f"{host}:{conn.port}" + password = conn.password + timeout_sec = conn.extra_dejson.get("timeout_sec") + return (host, password, timeout_sec) + + def make_emitter(self) -> "DatahubRestEmitter": + import datahub.emitter.rest_emitter + + return datahub.emitter.rest_emitter.DatahubRestEmitter(*self._get_config()) + + def emit_mces(self, mces: List[MetadataChangeEvent]) -> None: + emitter = self.make_emitter() + + for mce in mces: + emitter.emit_mce(mce) + + def emit_mcps(self, mcps: List[MetadataChangeProposal]) -> None: + emitter = self.make_emitter() + + for mce in mcps: + emitter.emit_mcp(mce) + + +class DatahubKafkaHook(BaseHook): + """ + Creates a DataHub Kafka connection used to send metadata to DataHub. + Takes your kafka broker in the Kafka Broker(host) field. + + URI example: :: + + AIRFLOW_CONN_DATAHUB_KAFKA_DEFAULT='datahub-kafka://kafka-broker' + + :param datahub_kafka_conn_id: Reference to the DataHub Kafka connection. 
+ :type datahub_kafka_conn_id: str + """ + + conn_name_attr = "datahub_kafka_conn_id" + default_conn_name = "datahub_kafka_default" + conn_type = "datahub_kafka" + hook_name = "DataHub Kafka Sink" + + def __init__(self, datahub_kafka_conn_id: str = default_conn_name) -> None: + super().__init__() + self.datahub_kafka_conn_id = datahub_kafka_conn_id + + @staticmethod + def get_connection_form_widgets() -> Dict[str, Any]: + return {} + + @staticmethod + def get_ui_field_behaviour() -> Dict: + """Returns custom field behavior""" + return { + "hidden_fields": ["port", "schema", "login", "password"], + "relabeling": { + "host": "Kafka Broker", + }, + } + + def _get_config(self) -> "KafkaSinkConfig": + import datahub.ingestion.sink.datahub_kafka + + conn = self.get_connection(self.datahub_kafka_conn_id) + obj = conn.extra_dejson + obj.setdefault("connection", {}) + if conn.host is not None: + if "bootstrap" in obj["connection"]: + raise AirflowException( + "Kafka broker specified twice (present in host and extra)" + ) + obj["connection"]["bootstrap"] = ":".join( + map(str, filter(None, [conn.host, conn.port])) + ) + config = datahub.ingestion.sink.datahub_kafka.KafkaSinkConfig.parse_obj(obj) + return config + + def make_emitter(self) -> "DatahubKafkaEmitter": + import datahub.emitter.kafka_emitter + + sink_config = self._get_config() + return datahub.emitter.kafka_emitter.DatahubKafkaEmitter(sink_config) + + def emit_mces(self, mces: List[MetadataChangeEvent]) -> None: + emitter = self.make_emitter() + errors = [] + + def callback(exc, msg): + if exc: + errors.append(exc) + + for mce in mces: + emitter.emit_mce_async(mce, callback) + + emitter.flush() + + if errors: + raise AirflowException(f"failed to push some MCEs: {errors}") + + def emit_mcps(self, mcps: List[MetadataChangeProposal]) -> None: + emitter = self.make_emitter() + errors = [] + + def callback(exc, msg): + if exc: + errors.append(exc) + + for mcp in mcps: + emitter.emit_mcp_async(mcp, callback) + + emitter.flush() + + if errors: + raise AirflowException(f"failed to push some MCPs: {errors}") + + +class DatahubGenericHook(BaseHook): + """ + Emits Metadata Change Events using either the DatahubRestHook or the + DatahubKafkaHook. Set up a DataHub Rest or Kafka connection to use. + + :param datahub_conn_id: Reference to the DataHub connection. + :type datahub_conn_id: str + """ + + def __init__(self, datahub_conn_id: str) -> None: + super().__init__() + self.datahub_conn_id = datahub_conn_id + + def get_underlying_hook(self) -> Union[DatahubRestHook, DatahubKafkaHook]: + conn = self.get_connection(self.datahub_conn_id) + + # We need to figure out the underlying hook type. First check the + # conn_type. If that fails, attempt to guess using the conn id name. 
+ if conn.conn_type == DatahubRestHook.conn_type: + return DatahubRestHook(self.datahub_conn_id) + elif conn.conn_type == DatahubKafkaHook.conn_type: + return DatahubKafkaHook(self.datahub_conn_id) + elif "rest" in self.datahub_conn_id: + return DatahubRestHook(self.datahub_conn_id) + elif "kafka" in self.datahub_conn_id: + return DatahubKafkaHook(self.datahub_conn_id) + else: + raise AirflowException( + f"DataHub cannot handle conn_type {conn.conn_type} in {conn}" + ) + + def make_emitter(self) -> Union["DatahubRestEmitter", "DatahubKafkaEmitter"]: + return self.get_underlying_hook().make_emitter() + + def emit_mces(self, mces: List[MetadataChangeEvent]) -> None: + return self.get_underlying_hook().emit_mces(mces) diff --git a/metadata-ingestion/src/datahub_provider/example_dags/__init__.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/lineage/__init__.py similarity index 100% rename from metadata-ingestion/src/datahub_provider/example_dags/__init__.py rename to metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/lineage/__init__.py diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/lineage/datahub.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/lineage/datahub.py new file mode 100644 index 0000000000000..c41bb2b2a1e37 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/lineage/datahub.py @@ -0,0 +1,91 @@ +import json +from typing import TYPE_CHECKING, Dict, List, Optional + +from airflow.configuration import conf +from airflow.lineage.backend import LineageBackend + +from datahub_airflow_plugin._lineage_core import ( + DatahubBasicLineageConfig, + send_lineage_to_datahub, +) + +if TYPE_CHECKING: + from airflow.models.baseoperator import BaseOperator + + +class DatahubLineageConfig(DatahubBasicLineageConfig): + # If set to true, most runtime errors in the lineage backend will be + # suppressed and will not cause the overall task to fail. Note that + # configuration issues will still throw exceptions. + graceful_exceptions: bool = True + + +def get_lineage_config() -> DatahubLineageConfig: + """Load the lineage config from airflow.cfg.""" + + # The kwargs pattern is also used for secret backends. + kwargs_str = conf.get("lineage", "datahub_kwargs", fallback="{}") + kwargs = json.loads(kwargs_str) + + # Continue to support top-level datahub_conn_id config. + datahub_conn_id = conf.get("lineage", "datahub_conn_id", fallback=None) + if datahub_conn_id: + kwargs["datahub_conn_id"] = datahub_conn_id + + return DatahubLineageConfig.parse_obj(kwargs) + + +class DatahubLineageBackend(LineageBackend): + """ + Sends lineage data from tasks to DataHub. + + Configurable via ``airflow.cfg`` as follows: :: + + # For REST-based: + airflow connections add --conn-type 'datahub_rest' 'datahub_rest_default' --conn-host 'http://localhost:8080' + # For Kafka-based (standard Kafka sink config can be passed via extras): + airflow connections add --conn-type 'datahub_kafka' 'datahub_kafka_default' --conn-host 'broker:9092' --conn-extra '{}' + + [lineage] + backend = datahub_provider.lineage.datahub.DatahubLineageBackend + datahub_kwargs = { + "datahub_conn_id": "datahub_rest_default", + "capture_ownership_info": true, + "capture_tags_info": true, + "graceful_exceptions": true } + # The above indentation is important! + """ + + def __init__(self) -> None: + super().__init__() + + # By attempting to get and parse the config, we can detect configuration errors + # ahead of time. 
The init method is only called in Airflow 2.x. + _ = get_lineage_config() + + # With Airflow 2.0, this can be an instance method. However, with Airflow 1.10.x, this + # method is used statically, even though LineageBackend declares it as an instance variable. + @staticmethod + def send_lineage( + operator: "BaseOperator", + inlets: Optional[List] = None, # unused + outlets: Optional[List] = None, # unused + context: Optional[Dict] = None, + ) -> None: + config = get_lineage_config() + if not config.enabled: + return + + try: + context = context or {} # ensure not None to satisfy mypy + send_lineage_to_datahub( + config, operator, operator.inlets, operator.outlets, context + ) + except Exception as e: + if config.graceful_exceptions: + operator.log.error(e) + operator.log.info( + "Suppressing error because graceful_exceptions is set" + ) + else: + raise diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/__init__.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub.py new file mode 100644 index 0000000000000..109e7ddfe4dfa --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub.py @@ -0,0 +1,63 @@ +from typing import List, Union + +from airflow.models import BaseOperator +from airflow.utils.decorators import apply_defaults +from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent + +from datahub_airflow_plugin.hooks.datahub import ( + DatahubGenericHook, + DatahubKafkaHook, + DatahubRestHook, +) + + +class DatahubBaseOperator(BaseOperator): + """ + The DatahubBaseOperator is used as a base operator all DataHub operators. + """ + + ui_color = "#4398c8" + + hook: Union[DatahubRestHook, DatahubKafkaHook] + + # mypy is not a fan of this. Newer versions of Airflow support proper typing for the decorator + # using PEP 612. However, there is not yet a good way to inherit the types of the kwargs from + # the superclass. + @apply_defaults # type: ignore[misc] + def __init__( # type: ignore[no-untyped-def] + self, + *, + datahub_conn_id: str, + **kwargs, + ): + super().__init__(**kwargs) + + self.datahub_conn_id = datahub_conn_id + self.generic_hook = DatahubGenericHook(datahub_conn_id) + + +class DatahubEmitterOperator(DatahubBaseOperator): + """ + Emits a Metadata Change Event to DataHub using either a DataHub + Rest or Kafka connection. + + :param datahub_conn_id: Reference to the DataHub Rest or Kafka Connection. + :type datahub_conn_id: str + """ + + # See above for why these mypy type issues are ignored here. 
+ @apply_defaults # type: ignore[misc] + def __init__( # type: ignore[no-untyped-def] + self, + mces: List[MetadataChangeEvent], + datahub_conn_id: str, + **kwargs, + ): + super().__init__( + datahub_conn_id=datahub_conn_id, + **kwargs, + ) + self.mces = mces + + def execute(self, context): + self.generic_hook.get_underlying_hook().emit_mces(self.mces) diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub_assertion_operator.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub_assertion_operator.py new file mode 100644 index 0000000000000..6f93c09a9e287 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub_assertion_operator.py @@ -0,0 +1,78 @@ +import datetime +from typing import Any, List, Optional, Sequence, Union + +from airflow.models import BaseOperator +from datahub.api.circuit_breaker import ( + AssertionCircuitBreaker, + AssertionCircuitBreakerConfig, +) + +from datahub_airflow_plugin.hooks.datahub import DatahubRestHook + + +class DataHubAssertionOperator(BaseOperator): + r""" + DataHub Assertion Circuit Breaker Operator. + + :param urn: The DataHub dataset unique identifier. (templated) + :param datahub_rest_conn_id: The REST datahub connection id to communicate with DataHub + which is set as Airflow connection. + :param check_last_assertion_time: If set it checks assertions after the last operation was set on the dataset. + By default it is True. + :param time_delta: If verify_after_last_update is False it checks for assertion within the time delta. + """ + + template_fields: Sequence[str] = ("urn",) + circuit_breaker: AssertionCircuitBreaker + urn: Union[List[str], str] + + def __init__( # type: ignore[no-untyped-def] + self, + *, + urn: Union[List[str], str], + datahub_rest_conn_id: Optional[str] = None, + check_last_assertion_time: bool = True, + time_delta: Optional[datetime.timedelta] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + hook: DatahubRestHook + if datahub_rest_conn_id is not None: + hook = DatahubRestHook(datahub_rest_conn_id=datahub_rest_conn_id) + else: + hook = DatahubRestHook() + + host, password, timeout_sec = hook._get_config() + self.urn = urn + config: AssertionCircuitBreakerConfig = AssertionCircuitBreakerConfig( + datahub_host=host, + datahub_token=password, + timeout=timeout_sec, + verify_after_last_update=check_last_assertion_time, + time_delta=time_delta if time_delta else datetime.timedelta(days=1), + ) + + self.circuit_breaker = AssertionCircuitBreaker(config=config) + + def execute(self, context: Any) -> bool: + if "datahub_silence_circuit_breakers" in context["dag_run"].conf: + self.log.info( + "Circuit breaker is silenced because datahub_silence_circuit_breakers config is set" + ) + return True + + self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") + if isinstance(self.urn, str): + urns = [self.urn] + elif isinstance(self.urn, list): + urns = self.urn + else: + raise Exception(f"urn parameter has invalid type {type(self.urn)}") + + for urn in urns: + self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") + ret = self.circuit_breaker.is_circuit_breaker_active(urn=urn) + if ret: + raise Exception(f"Dataset {self.urn} is not in consumable state") + + return True diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub_assertion_sensor.py 
b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub_assertion_sensor.py new file mode 100644 index 0000000000000..16e5d1cbe8b1f --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub_assertion_sensor.py @@ -0,0 +1,78 @@ +import datetime +from typing import Any, List, Optional, Sequence, Union + +from airflow.sensors.base import BaseSensorOperator +from datahub.api.circuit_breaker import ( + AssertionCircuitBreaker, + AssertionCircuitBreakerConfig, +) + +from datahub_airflow_plugin.hooks.datahub import DatahubRestHook + + +class DataHubAssertionSensor(BaseSensorOperator): + r""" + DataHub Assertion Circuit Breaker Sensor. + + :param urn: The DataHub dataset unique identifier. (templated) + :param datahub_rest_conn_id: The REST datahub connection id to communicate with DataHub + which is set as Airflow connection. + :param check_last_assertion_time: If set it checks assertions after the last operation was set on the dataset. + By default it is True. + :param time_delta: If verify_after_last_update is False it checks for assertion within the time delta. + """ + + template_fields: Sequence[str] = ("urn",) + circuit_breaker: AssertionCircuitBreaker + urn: Union[List[str], str] + + def __init__( # type: ignore[no-untyped-def] + self, + *, + urn: Union[List[str], str], + datahub_rest_conn_id: Optional[str] = None, + check_last_assertion_time: bool = True, + time_delta: datetime.timedelta = datetime.timedelta(days=1), + **kwargs, + ) -> None: + super().__init__(**kwargs) + hook: DatahubRestHook + if datahub_rest_conn_id is not None: + hook = DatahubRestHook(datahub_rest_conn_id=datahub_rest_conn_id) + else: + hook = DatahubRestHook() + + host, password, timeout_sec = hook._get_config() + self.urn = urn + config: AssertionCircuitBreakerConfig = AssertionCircuitBreakerConfig( + datahub_host=host, + datahub_token=password, + timeout=timeout_sec, + verify_after_last_update=check_last_assertion_time, + time_delta=time_delta, + ) + self.circuit_breaker = AssertionCircuitBreaker(config=config) + + def poke(self, context: Any) -> bool: + if "datahub_silence_circuit_breakers" in context["dag_run"].conf: + self.log.info( + "Circuit breaker is silenced because datahub_silence_circuit_breakers config is set" + ) + return True + + self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") + if isinstance(self.urn, str): + urns = [self.urn] + elif isinstance(self.urn, list): + urns = self.urn + else: + raise Exception(f"urn parameter has invalid type {type(self.urn)}") + + for urn in urns: + self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") + ret = self.circuit_breaker.is_circuit_breaker_active(urn=urn) + if ret: + self.log.info(f"Dataset {self.urn} is not in consumable state") + return False + + return True diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub_operation_operator.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub_operation_operator.py new file mode 100644 index 0000000000000..94e105309537b --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub_operation_operator.py @@ -0,0 +1,97 @@ +import datetime +from typing import Any, List, Optional, Sequence, Union + +from airflow.sensors.base import BaseSensorOperator +from datahub.api.circuit_breaker import ( + OperationCircuitBreaker, + OperationCircuitBreakerConfig, +) + +from 
datahub_airflow_plugin.hooks.datahub import DatahubRestHook + + +class DataHubOperationCircuitBreakerOperator(BaseSensorOperator): + r""" + DataHub Operation Circuit Breaker Operator. + + :param urn: The DataHub dataset unique identifier. (templated) + :param datahub_rest_conn_id: The REST datahub connection id to communicate with DataHub + which is set as Airflow connection. + :param partition: The partition to check the operation. + :param source_type: The partition to check the operation. :ref:`https://datahubproject.io/docs/graphql/enums#operationsourcetype` + + """ + + template_fields: Sequence[str] = ( + "urn", + "partition", + "source_type", + "operation_type", + ) + circuit_breaker: OperationCircuitBreaker + urn: Union[List[str], str] + partition: Optional[str] + source_type: Optional[str] + operation_type: Optional[str] + + def __init__( # type: ignore[no-untyped-def] + self, + *, + urn: Union[List[str], str], + datahub_rest_conn_id: Optional[str] = None, + time_delta: Optional[datetime.timedelta] = datetime.timedelta(days=1), + partition: Optional[str] = None, + source_type: Optional[str] = None, + operation_type: Optional[str] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + hook: DatahubRestHook + if datahub_rest_conn_id is not None: + hook = DatahubRestHook(datahub_rest_conn_id=datahub_rest_conn_id) + else: + hook = DatahubRestHook() + + host, password, timeout_sec = hook._get_config() + + self.urn = urn + self.partition = partition + self.operation_type = operation_type + self.source_type = source_type + + config: OperationCircuitBreakerConfig = OperationCircuitBreakerConfig( + datahub_host=host, + datahub_token=password, + timeout=timeout_sec, + time_delta=time_delta, + ) + + self.circuit_breaker = OperationCircuitBreaker(config=config) + + def execute(self, context: Any) -> bool: + if "datahub_silence_circuit_breakers" in context["dag_run"].conf: + self.log.info( + "Circuit breaker is silenced because datahub_silence_circuit_breakers config is set" + ) + return True + + self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") + if isinstance(self.urn, str): + urns = [self.urn] + elif isinstance(self.urn, list): + urns = self.urn + else: + raise Exception(f"urn parameter has invalid type {type(self.urn)}") + + for urn in urns: + self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") + ret = self.circuit_breaker.is_circuit_breaker_active( + urn=urn, + partition=self.partition, + operation_type=self.operation_type, + source_type=self.source_type, + ) + if ret: + raise Exception(f"Dataset {self.urn} is not in consumable state") + + return True diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub_operation_sensor.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub_operation_sensor.py new file mode 100644 index 0000000000000..434c60754064d --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub_operation_sensor.py @@ -0,0 +1,100 @@ +import datetime +from typing import Any, List, Optional, Sequence, Union + +from airflow.sensors.base import BaseSensorOperator +from datahub.api.circuit_breaker import ( + OperationCircuitBreaker, + OperationCircuitBreakerConfig, +) + +from datahub_airflow_plugin.hooks.datahub import DatahubRestHook + + +class DataHubOperationCircuitBreakerSensor(BaseSensorOperator): + r""" + DataHub Operation Circuit Breaker Sensor. 
+ + :param urn: The DataHub dataset unique identifier. (templated) + :param datahub_rest_conn_id: The REST datahub connection id to communicate with DataHub + which is set as Airflow connection. + :param partition: The partition to check the operation. + :param source_type: The source type to filter on. If not set it will accept any source type. + See valid values at: https://datahubproject.io/docs/graphql/enums#operationsourcetype + :param operation_type: The operation type to filter on. If not set it will accept any source type. + See valid values at: https://datahubproject.io/docs/graphql/enums/#operationtype + """ + + template_fields: Sequence[str] = ( + "urn", + "partition", + "source_type", + "operation_type", + ) + circuit_breaker: OperationCircuitBreaker + urn: Union[List[str], str] + partition: Optional[str] + source_type: Optional[str] + operation_type: Optional[str] + + def __init__( # type: ignore[no-untyped-def] + self, + *, + urn: Union[List[str], str], + datahub_rest_conn_id: Optional[str] = None, + time_delta: Optional[datetime.timedelta] = datetime.timedelta(days=1), + partition: Optional[str] = None, + source_type: Optional[str] = None, + operation_type: Optional[str] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + hook: DatahubRestHook + if datahub_rest_conn_id is not None: + hook = DatahubRestHook(datahub_rest_conn_id=datahub_rest_conn_id) + else: + hook = DatahubRestHook() + + host, password, timeout_sec = hook._get_config() + + self.urn = urn + self.partition = partition + self.operation_type = operation_type + self.source_type = source_type + + config: OperationCircuitBreakerConfig = OperationCircuitBreakerConfig( + datahub_host=host, + datahub_token=password, + timeout=timeout_sec, + time_delta=time_delta, + ) + + self.circuit_breaker = OperationCircuitBreaker(config=config) + + def poke(self, context: Any) -> bool: + if "datahub_silence_circuit_breakers" in context["dag_run"].conf: + self.log.info( + "Circuit breaker is silenced because datahub_silence_circuit_breakers config is set" + ) + return True + + self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") + if isinstance(self.urn, str): + urns = [self.urn] + elif isinstance(self.urn, list): + urns = self.urn + else: + raise Exception(f"urn parameter has invalid type {type(self.urn)}") + + for urn in urns: + self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") + ret = self.circuit_breaker.is_circuit_breaker_active( + urn=urn, + partition=self.partition, + operation_type=self.operation_type, + source_type=self.source_type, + ) + if ret: + self.log.info(f"Dataset {self.urn} is not in consumable state") + return False + + return True diff --git a/metadata-ingestion/tests/unit/test_airflow.py b/metadata-ingestion-modules/airflow-plugin/tests/unit/test_airflow.py similarity index 97% rename from metadata-ingestion/tests/unit/test_airflow.py rename to metadata-ingestion-modules/airflow-plugin/tests/unit/test_airflow.py index 980dc5550fafa..9aa901171cfa6 100644 --- a/metadata-ingestion/tests/unit/test_airflow.py +++ b/metadata-ingestion-modules/airflow-plugin/tests/unit/test_airflow.py @@ -9,12 +9,11 @@ import airflow.configuration import airflow.version +import datahub.emitter.mce_builder as builder import packaging.version import pytest from airflow.lineage import apply_lineage, prepare_lineage from airflow.models import DAG, Connection, DagBag, DagRun, TaskInstance - -import datahub.emitter.mce_builder as builder from datahub_provider import get_provider_info 
from datahub_provider._airflow_shims import AIRFLOW_PATCHED, EmptyOperator from datahub_provider.entities import Dataset, Urn @@ -23,7 +22,7 @@ assert AIRFLOW_PATCHED -pytestmark = pytest.mark.airflow +# TODO: Remove default_view="tree" arg. Figure out why is default_view being picked as "grid" and how to fix it ? # Approach suggested by https://stackoverflow.com/a/11887885/5004662. AIRFLOW_VERSION = packaging.version.parse(airflow.version.version) @@ -75,7 +74,7 @@ def test_airflow_provider_info(): @pytest.mark.filterwarnings("ignore:.*is deprecated.*") def test_dags_load_with_no_errors(pytestconfig: pytest.Config) -> None: airflow_examples_folder = ( - pytestconfig.rootpath / "src/datahub_provider/example_dags" + pytestconfig.rootpath / "src/datahub_airflow_plugin/example_dags" ) # Note: the .airflowignore file skips the snowflake DAG. @@ -233,7 +232,11 @@ def test_lineage_backend(mock_emit, inlets, outlets, capture_executions): func = mock.Mock() func.__name__ = "foo" - dag = DAG(dag_id="test_lineage_is_sent_to_backend", start_date=DEFAULT_DATE) + dag = DAG( + dag_id="test_lineage_is_sent_to_backend", + start_date=DEFAULT_DATE, + default_view="tree", + ) with dag: op1 = EmptyOperator( @@ -252,6 +255,7 @@ def test_lineage_backend(mock_emit, inlets, outlets, capture_executions): # versions do not require it, but will attempt to find the associated # run_id in the database if execution_date is provided. As such, we # must fake the run_id parameter for newer Airflow versions. + # We need to add type:ignore in else to suppress mypy error in Airflow < 2.2 if AIRFLOW_VERSION < packaging.version.parse("2.2.0"): ti = TaskInstance(task=op2, execution_date=DEFAULT_DATE) # Ignoring type here because DagRun state is just a sring at Airflow 1 @@ -259,7 +263,7 @@ def test_lineage_backend(mock_emit, inlets, outlets, capture_executions): else: from airflow.utils.state import DagRunState - ti = TaskInstance(task=op2, run_id=f"test_airflow-{DEFAULT_DATE}") + ti = TaskInstance(task=op2, run_id=f"test_airflow-{DEFAULT_DATE}") # type: ignore[call-arg] dag_run = DagRun( state=DagRunState.SUCCESS, run_id=f"scheduled_{DEFAULT_DATE.isoformat()}", diff --git a/metadata-ingestion/adding-source.md b/metadata-ingestion/adding-source.md index 50e6a1cd5fcc6..e4fc950a7cdbd 100644 --- a/metadata-ingestion/adding-source.md +++ b/metadata-ingestion/adding-source.md @@ -44,7 +44,11 @@ class LookerAPIConfig(ConfigModel): ``` generates the following documentation: -![Generated Config Documentation](./docs/images/generated_config_docs.png) + +

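The generated config documentation referenced above is driven by pydantic `Field` descriptions on a `ConfigModel` subclass. A minimal sketch of that pattern, with illustrative field names and descriptions rather than the exact Looker config:

```python
from pydantic import Field

from datahub.configuration.common import ConfigModel


class LookerAPIConfig(ConfigModel):
    # Each Field description below is rendered as the field-level
    # documentation in the generated config table.
    client_id: str = Field(description="Looker API client id.")
    client_secret: str = Field(description="Looker API client secret.")
    base_url: str = Field(
        description="URL to your Looker instance, e.g. https://company.looker.com:19999."
    )
```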

+ :::note Inline markdown or code snippets are not yet supported for field level documentation. diff --git a/metadata-ingestion/build.gradle b/metadata-ingestion/build.gradle index f636cf25c67f7..199ccc59c21e0 100644 --- a/metadata-ingestion/build.gradle +++ b/metadata-ingestion/build.gradle @@ -21,11 +21,13 @@ task checkPythonVersion(type: Exec) { } task environmentSetup(type: Exec, dependsOn: checkPythonVersion) { + def sentinel_file = "${venv_name}/.venv_environment_sentinel" inputs.file file('setup.py') - outputs.dir("${venv_name}") + outputs.file(sentinel_file) commandLine 'bash', '-c', "${python_executable} -m venv ${venv_name} && " + - "${venv_name}/bin/python -m pip install --upgrade pip wheel 'setuptools>=63.0.0'" + "${venv_name}/bin/python -m pip install --upgrade pip wheel 'setuptools>=63.0.0' && " + + "touch ${sentinel_file}" } task runPreFlightScript(type: Exec, dependsOn: environmentSetup) { @@ -39,7 +41,6 @@ task runPreFlightScript(type: Exec, dependsOn: environmentSetup) { task installPackageOnly(type: Exec, dependsOn: runPreFlightScript) { def sentinel_file = "${venv_name}/.build_install_package_only_sentinel" inputs.file file('setup.py') - outputs.dir("${venv_name}") outputs.file(sentinel_file) commandLine 'bash', '-x', '-c', "${venv_name}/bin/pip install -e . &&" + @@ -47,9 +48,12 @@ task installPackageOnly(type: Exec, dependsOn: runPreFlightScript) { } task installPackage(type: Exec, dependsOn: installPackageOnly) { + def sentinel_file = "${venv_name}/.build_install_package_sentinel" inputs.file file('setup.py') - outputs.dir("${venv_name}") - commandLine 'bash', '-x', '-c', "${venv_name}/bin/pip install -e . ${extra_pip_requirements}" + outputs.file(sentinel_file) + commandLine 'bash', '-x', '-c', + "${venv_name}/bin/pip install -e . ${extra_pip_requirements} && " + + "touch ${sentinel_file}" } task codegen(type: Exec, dependsOn: [environmentSetup, installPackage, ':metadata-events:mxe-schemas:build']) { @@ -63,7 +67,6 @@ task install(dependsOn: [installPackage, codegen]) task installDev(type: Exec, dependsOn: [install]) { def sentinel_file = "${venv_name}/.build_install_dev_sentinel" inputs.file file('setup.py') - outputs.dir("${venv_name}") outputs.file(sentinel_file) commandLine 'bash', '-c', "source ${venv_name}/bin/activate && set -x && " + @@ -75,7 +78,6 @@ task installDev(type: Exec, dependsOn: [install]) { task installAll(type: Exec, dependsOn: [install]) { def sentinel_file = "${venv_name}/.build_install_all_sentinel" inputs.file file('setup.py') - outputs.dir("${venv_name}") outputs.file(sentinel_file) commandLine 'bash', '-c', "source ${venv_name}/bin/activate && set -x && " + diff --git a/metadata-ingestion/developing.md b/metadata-ingestion/developing.md index 67041d23a21b1..f529590e2ab39 100644 --- a/metadata-ingestion/developing.md +++ b/metadata-ingestion/developing.md @@ -26,6 +26,16 @@ source venv/bin/activate datahub version # should print "DataHub CLI version: unavailable (installed in develop mode)" ``` +### (Optional) Set up your Python environment for developing on Airflow Plugin + +From the repository root: + +```shell +cd metadata-ingestion-modules/airflow-plugin +../../gradlew :metadata-ingestion-modules:airflow-plugin:installDev +source venv/bin/activate +datahub version # should print "DataHub CLI version: unavailable (installed in develop mode)" +``` ### Common setup issues Common issues (click to expand): @@ -74,7 +84,9 @@ The syntax for installing plugins is slightly different in development. 
For exam ## Architecture -![metadata ingestion framework layout](../docs/imgs/datahub-metadata-ingestion-framework.png) +


The architecture of this metadata ingestion framework is heavily inspired by [Apache Gobblin](https://gobblin.apache.org/) (also originally a LinkedIn project!). We have a standardized format - the MetadataChangeEvent - and sources and sinks which respectively produce and consume these objects. The sources pull metadata from a variety of data systems, while the sinks are primarily for moving this metadata into DataHub. @@ -181,7 +193,7 @@ pytest -m 'slow_integration' ../gradlew :metadata-ingestion:testFull ../gradlew :metadata-ingestion:check # Run all tests in a single file -../gradlew :metadata-ingestion:testSingle -PtestFile=tests/unit/test_airflow.py +../gradlew :metadata-ingestion:testSingle -PtestFile=tests/unit/test_bigquery_source.py # Run all tests under tests/unit ../gradlew :metadata-ingestion:testSingle -PtestFile=tests/unit ``` diff --git a/metadata-ingestion/docs/dev_guides/add_stateful_ingestion_to_source.md b/metadata-ingestion/docs/dev_guides/add_stateful_ingestion_to_source.md index 0e3ead9a7adf8..6a1204fb0f2b3 100644 --- a/metadata-ingestion/docs/dev_guides/add_stateful_ingestion_to_source.md +++ b/metadata-ingestion/docs/dev_guides/add_stateful_ingestion_to_source.md @@ -252,7 +252,7 @@ Example code: def get_workunits(self) -> Iterable[MetadataWorkUnit]: # Skip a redundant run if self.redundant_run_skip_handler.should_skip_this_run( - cur_start_time_millis=datetime_to_ts_millis(self.config.start_time) + cur_start_time_millis=self.config.start_time ): return @@ -260,7 +260,7 @@ Example code: # # Update checkpoint state for this run. self.redundant_run_skip_handler.update_state( - start_time_millis=datetime_to_ts_millis(self.config.start_time), - end_time_millis=datetime_to_ts_millis(self.config.end_time), + start_time_millis=self.config.start_time, + end_time_millis=self.config.end_time, ) ``` \ No newline at end of file diff --git a/metadata-ingestion/docs/dev_guides/stateful.md b/metadata-ingestion/docs/dev_guides/stateful.md index eccacbb416714..b3a409e965c62 100644 --- a/metadata-ingestion/docs/dev_guides/stateful.md +++ b/metadata-ingestion/docs/dev_guides/stateful.md @@ -38,7 +38,9 @@ Following is the list of current use-cases powered by stateful ingestion in data Stateful ingestion can be used to automatically soft-delete the tables and views that are seen in a previous run but absent in the current run (they are either deleted or no longer desired). -![Stale Metadata Deletion](./stale_metadata_deletion.png) +

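Stale-entity removal is controlled by the `stateful_ingestion` block of a recipe. A minimal sketch using the Python `Pipeline` API, assuming a MySQL source and a local DataHub REST sink (the pipeline name and connection details are illustrative):

```python
from datahub.ingestion.run.pipeline import Pipeline

# A stable pipeline_name is required so the checkpoint from the previous
# run can be matched against the current run.
pipeline = Pipeline.create(
    {
        "pipeline_name": "mysql_prod_ingestion",  # illustrative name
        "source": {
            "type": "mysql",
            "config": {
                "host_port": "localhost:3306",  # illustrative connection details
                "database": "my_db",
                "username": "datahub",
                "password": "datahub",
                "stateful_ingestion": {
                    "enabled": True,
                    # Entities seen in the previous run but absent now are soft-deleted.
                    "remove_stale_metadata": True,
                },
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()
pipeline.raise_from_status()
```

The same `stateful_ingestion` block can be set in a YAML recipe; the source compares the entities emitted in this run against the previous checkpoint and soft-deletes the ones that have disappeared.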

#### Supported sources * All sql based sources. diff --git a/metadata-ingestion/docs/sources/azure-ad/azure-ad.md b/metadata-ingestion/docs/sources/azure-ad/azure-ad.md index 8b375fbee4f33..d2677d7e4fc7a 100644 --- a/metadata-ingestion/docs/sources/azure-ad/azure-ad.md +++ b/metadata-ingestion/docs/sources/azure-ad/azure-ad.md @@ -5,6 +5,15 @@ to read your organization's Users and Groups. The following permissions are requ - `GroupMember.Read.All` - `User.Read.All` -You can add a permission by navigating to the permissions tab in your DataHub application on the Azure AD portal. ![Azure AD API Permissions](./azure_ad_api_permissions.png) +You can add a permission by navigating to the permissions tab in your DataHub application on the Azure AD portal. +


-You can view the necessary endpoints to configure by clicking on the Endpoints button in the Overview tab. ![Azure AD Endpoints](./azure_ad_endpoints.png) + +You can view the necessary endpoints to configure by clicking on the Endpoints button in the Overview tab. + + +


diff --git a/metadata-ingestion/docs/sources/databricks/README.md b/metadata-ingestion/docs/sources/databricks/README.md index 01aee3236e01c..b380a892c22b9 100644 --- a/metadata-ingestion/docs/sources/databricks/README.md +++ b/metadata-ingestion/docs/sources/databricks/README.md @@ -15,8 +15,11 @@ To complete the picture, we recommend adding push-based ingestion from your Spar ## Watch the DataHub Talk at the Data and AI Summit 2022 For a deeper look at how to think about DataHub within and across your Databricks ecosystem, watch the recording of our talk at the Data and AI Summit 2022. - -[![IMAGE_ALT](../../images/databricks/data_and_ai_summit_2022.png)](https://www.youtube.com/watch?v=SCP0PR3t7dc) +


diff --git a/metadata-ingestion/docs/sources/datahub/README.md b/metadata-ingestion/docs/sources/datahub/README.md new file mode 100644 index 0000000000000..45afc6e166889 --- /dev/null +++ b/metadata-ingestion/docs/sources/datahub/README.md @@ -0,0 +1,4 @@ +Migrate data from one DataHub instance to another. + +Requires direct access to the database, kafka broker, and kafka schema registry +of the source DataHub instance. diff --git a/metadata-ingestion/docs/sources/datahub/datahub_pre.md b/metadata-ingestion/docs/sources/datahub/datahub_pre.md new file mode 100644 index 0000000000000..c98cce7047836 --- /dev/null +++ b/metadata-ingestion/docs/sources/datahub/datahub_pre.md @@ -0,0 +1,66 @@ +### Overview + +This source pulls data from two locations: +- The DataHub database, containing a single table holding all versioned aspects +- The DataHub Kafka cluster, reading from the [MCL Log](../../../../docs/what/mxe.md#metadata-change-log-mcl) +topic for timeseries aspects. + +All data is first read from the database, before timeseries data is ingested from kafka. +To prevent this source from potentially running forever, it will not ingest data produced after the +datahub_source ingestion job is started. This `stop_time` is reflected in the report. + +Data from the database and kafka are read in chronological order, specifically by the +createdon timestamp in the database and by kafka offset per partition. In order to +properly read from the database, please ensure that the `createdon` column is indexed. +Newly created databases should have this index, named `timeIndex`, by default, but older +ones you may have to create yourself, with the statement: + +``` +CREATE INDEX timeIndex ON metadata_aspect_v2 (createdon); +``` + +*If you do not have this index, the source may run incredibly slowly and produce +significant database load.* + +#### Stateful Ingestion +On first run, the source will read from the earliest data in the database and the earliest +kafka offsets. Every `commit_state_interval` (default 1000) records, the source will store +a checkpoint to remember its place, i.e. the last createdon timestamp and kafka offsets. +This allows you to stop and restart the source without losing much progress, but note that +you will re-ingest some data at the start of the new run. + +If any errors are encountered in the ingestion process, e.g. we are unable to emit an aspect +due to network errors, the source will keep running, but will stop committing checkpoints, +unless `commit_with_parse_errors` (default `false`) is set. Thus, if you re-run the ingestion, +you can re-ingest the data that was missed, but note it will all re-ingest all subsequent data. + +If you want to re-ingest all data, you can set a different `pipeline_name` in your recipe, +or set `stateful_ingestion.ignore_old_state`: + +```yaml +source: + config: + # ... connection config, etc. + stateful_ingestion: + enabled: true + ignore_old_state: true +``` + +#### Limitations +- Can only pull timeseries aspects retained by Kafka, which by default lasts 90 days. +- Does not detect hard timeseries deletions, e.g. if via a `datahub delete` command using the CLI. +Therefore, if you deleted data in this way, it will still exist in the destination instance. +- If you have a significant amount of aspects with the exact same `createdon` timestamp, +stateful ingestion will not be able to save checkpoints partially through that timestamp. +On a subsequent run, all aspects for that timestamp will be ingested. 
+ +#### Performance +On your destination DataHub instance, we suggest the following settings: +- Enable [async ingestion](../../../../docs/deploy/environment-vars.md#ingestion) +- Use standalone consumers +([mae-consumer](../../../../metadata-jobs/mae-consumer-job/README.md) +and [mce-consumer](../../../../metadata-jobs/mce-consumer-job/README.md)) + * If you are migrating large amounts of data, consider scaling consumer replicas. +- Increase the number of gms pods to add redundancy and increase resilience to node evictions + * If you are migrating large amounts of data, consider increasing elasticsearch's + thread count via the `ELASTICSEARCH_THREAD_COUNT` environment variable. diff --git a/metadata-ingestion/docs/sources/datahub/datahub_recipe.yml b/metadata-ingestion/docs/sources/datahub/datahub_recipe.yml new file mode 100644 index 0000000000000..cb7fc97a39b9f --- /dev/null +++ b/metadata-ingestion/docs/sources/datahub/datahub_recipe.yml @@ -0,0 +1,30 @@ +pipeline_name: datahub_source_1 +datahub_api: + server: "http://localhost:8080" # Migrate data from DataHub instance on localhost:8080 + token: "" +source: + type: datahub + config: + include_all_versions: false + database_connection: + scheme: "mysql+pymysql" # or "postgresql+psycopg2" for Postgres + host_port: ":" + username: "" + password: "" + database: "" + kafka_connection: + bootstrap: ":9092" + schema_registry_url: ":8081" + stateful_ingestion: + enabled: true + ignore_old_state: false + extractor_config: + set_system_metadata: false # Replicate system metadata + +# Here, we write to a DataHub instance +# You can also use a different sink, e.g. to write the data to a file instead +sink: + type: datahub + config: + server: "" + token: "" diff --git a/metadata-ingestion/docs/sources/iceberg/iceberg_recipe.yml b/metadata-ingestion/docs/sources/iceberg/iceberg_recipe.yml index 28bce8a478211..8caedafbea50e 100644 --- a/metadata-ingestion/docs/sources/iceberg/iceberg_recipe.yml +++ b/metadata-ingestion/docs/sources/iceberg/iceberg_recipe.yml @@ -2,14 +2,17 @@ source: type: "iceberg" config: env: PROD - adls: - # Will be translated to https://{account_name}.dfs.core.windows.net - account_name: my_adls_account - # Can use sas_token or account_key - sas_token: "${SAS_TOKEN}" - # account_key: "${ACCOUNT_KEY}" - container_name: warehouse - base_path: iceberg + catalog: + name: my_iceberg_catalog + type: rest + # Catalog configuration follows pyiceberg's documentation (https://py.iceberg.apache.org/configuration) + config: + uri: http://localhost:8181 + s3.access-key-id: admin + s3.secret-access-key: password + s3.region: us-east-1 + warehouse: s3a://warehouse/wh/ + s3.endpoint: http://localhost:9000 platform_instance: my_iceberg_catalog table_pattern: allow: diff --git a/metadata-ingestion/docs/sources/kafka-connect/kafka-connect.md b/metadata-ingestion/docs/sources/kafka-connect/kafka-connect.md index 9d400460407c8..03bcef70e1860 100644 --- a/metadata-ingestion/docs/sources/kafka-connect/kafka-connect.md +++ b/metadata-ingestion/docs/sources/kafka-connect/kafka-connect.md @@ -1,5 +1,60 @@ ## Advanced Configurations +### Working with Platform Instances +If you've multiple instances of kafka OR source/sink systems that are referred in your `kafka-connect` setup, you'd need to configure platform instance for these systems in `kafka-connect` recipe to generate correct lineage edges. You must have already set `platform_instance` in recipes of original source/sink systems. 
Refer the document [Working with Platform Instances](https://datahubproject.io/docs/platform-instances) to understand more about this. + +There are two options available to declare source/sink system's `platform_instance` in `kafka-connect` recipe. If single instance of platform is used across all `kafka-connect` connectors, you can use `platform_instance_map` to specify platform_instance to use for a platform when constructing URNs for lineage. + +Example: +```yml + # Map of platform name to platform instance + platform_instance_map: + snowflake: snowflake_platform_instance + mysql: mysql_platform_instance + +``` +If multiple instances of platform are used across `kafka-connect` connectors, you'd need to specify platform_instance to use for platform for every connector. + +#### Example - Multiple MySQL Source Connectors each reading from different mysql instance +```yml + # Map of platform name to platform instance per connector + connect_to_platform_map: + mysql_connector1: + mysql: mysql_instance1 + + mysql_connector2: + mysql: mysql_instance2 +``` +Here mysql_connector1 and mysql_connector2 are names of MySQL source connectors as defined in `kafka-connect` connector config. + +#### Example - Multiple MySQL Source Connectors each reading from difference mysql instance and writing to different kafka cluster +```yml + connect_to_platform_map: + mysql_connector1: + mysql: mysql_instance1 + kafka: kafka_instance1 + + mysql_connector2: + mysql: mysql_instance2 + kafka: kafka_instance2 +``` +You can also use combination of `platform_instance_map` and `connect_to_platform_map` in your recipe. Note that, the platform_instance specified for the connector in `connect_to_platform_map` will always take higher precedance even if platform_instance for same platform is set in `platform_instance_map`. + +If you do not use `platform_instance` in original source/sink recipes, you do not need to specify them in above configurations. + +Note that, you do not need to specify platform_instance for BigQuery. + +#### Example - Multiple BigQuery Sink Connectors each writing to different kafka cluster +```yml + connect_to_platform_map: + bigquery_connector1: + kafka: kafka_instance1 + + bigquery_connector2: + kafka: kafka_instance2 +``` + +### Provided Configurations from External Sources Kafka Connect supports pluggable configuration providers which can load configuration data from external sources at runtime. These values are not available to DataHub ingestion source through Kafka Connect APIs. If you are using such provided configurations to specify connection url (database, etc) in Kafka Connect connector configuration then you will need also add these in `provided_configs` section in recipe for DataHub to generate correct lineage. ```yml diff --git a/metadata-ingestion/docs/sources/kafka-connect/kafka-connect_recipe.yml b/metadata-ingestion/docs/sources/kafka-connect/kafka-connect_recipe.yml index f5e33e661622d..cacbda5ca078a 100644 --- a/metadata-ingestion/docs/sources/kafka-connect/kafka-connect_recipe.yml +++ b/metadata-ingestion/docs/sources/kafka-connect/kafka-connect_recipe.yml @@ -3,14 +3,16 @@ source: config: # Coordinates connect_uri: "http://localhost:8083" - + # Credentials username: admin password: password # Optional - platform_instance_map: - bigquery: bigquery_platform_instance_id - + # Platform instance mapping to use when constructing URNs. + # Use if single instance of platform is referred across connectors. 
+ platform_instance_map: + mysql: mysql_platform_instance + sink: - # sink configs \ No newline at end of file + # sink configs diff --git a/metadata-ingestion/docs/sources/looker/looker_datahub_permission_set.png b/metadata-ingestion/docs/sources/looker/looker_datahub_permission_set.png deleted file mode 100644 index 7227dc04fb8a0..0000000000000 Binary files a/metadata-ingestion/docs/sources/looker/looker_datahub_permission_set.png and /dev/null differ diff --git a/metadata-ingestion/docs/sources/looker/looker_pre.md b/metadata-ingestion/docs/sources/looker/looker_pre.md index ad7fff9c0daaf..6798103d66e99 100644 --- a/metadata-ingestion/docs/sources/looker/looker_pre.md +++ b/metadata-ingestion/docs/sources/looker/looker_pre.md @@ -19,7 +19,10 @@ see_user_dashboards see_users ``` Here is an example permission set after configuration. -![Looker DataHub Permission Set](./looker_datahub_permission_set.png) + +


#### Get an API key diff --git a/metadata-ingestion/docs/sources/mssql/mssql_pre.md b/metadata-ingestion/docs/sources/mssql/mssql_pre.md new file mode 100644 index 0000000000000..396581966e691 --- /dev/null +++ b/metadata-ingestion/docs/sources/mssql/mssql_pre.md @@ -0,0 +1,14 @@ +### Prerequisites + +If you want to ingest MSSQL Jobs and stored procedures (with code) the user credentials needs the proper privileges. + +Script for granting the privileges: +``` +USE MSDB +GRANT SELECT ON OBJECT::msdb.dbo.sysjobsteps TO 'USERNAME' +GRANT SELECT ON OBJECT::msdb.dbo.sysjobs TO 'USERNAME' + +USE 'DATA_DB_NAME' +GRANT VIEW DEFINITION TO 'USERNAME' +GRANT SELECT ON OBJECT::sys.sql_expression_dependencies TO 'USERNAME' +``` \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/s3/s3.md b/metadata-ingestion/docs/sources/s3/s3.md index 93715629d0b8e..9484cd8de6666 100644 --- a/metadata-ingestion/docs/sources/s3/s3.md +++ b/metadata-ingestion/docs/sources/s3/s3.md @@ -196,3 +196,9 @@ If you are ingesting datasets from AWS S3, we recommend running the ingestion on Profiles are computed with PyDeequ, which relies on PySpark. Therefore, for computing profiles, we currently require Spark 3.0.3 with Hadoop 3.2 to be installed and the `SPARK_HOME` and `SPARK_VERSION` environment variables to be set. The Spark+Hadoop binary can be downloaded [here](https://www.apache.org/dyn/closer.lua/spark/spark-3.0.3/spark-3.0.3-bin-hadoop3.2.tgz). For an example guide on setting up PyDeequ on AWS, see [this guide](https://aws.amazon.com/blogs/big-data/testing-data-quality-at-scale-with-pydeequ/). + +:::caution + +From Spark 3.2.0+, Avro reader fails on column names that don't start with a letter and contains other character than letters, number, and underscore. [https://github.com/apache/spark/blob/72c62b6596d21e975c5597f8fff84b1a9d070a02/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroFileFormat.scala#L158] +Avro files that contain such columns won't be profiled. +::: \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/snowflake/snowflake_pre.md b/metadata-ingestion/docs/sources/snowflake/snowflake_pre.md index 9a381fb351aec..75bd579417a48 100644 --- a/metadata-ingestion/docs/sources/snowflake/snowflake_pre.md +++ b/metadata-ingestion/docs/sources/snowflake/snowflake_pre.md @@ -99,6 +99,24 @@ The steps slightly differ based on which you decide to use. including `client_id` and `client_secret`, plus your Okta user's `Username` and `Password` * Note: the `username` and `password` config options are not nested under `oauth_config` +### Snowflake Shares +If you are using [Snowflake Shares](https://docs.snowflake.com/en/user-guide/data-sharing-provider) to share data across different snowflake accounts, and you have set up DataHub recipes for ingesting metadata from all these accounts, you may end up having multiple similar dataset entities corresponding to virtual versions of same table in different snowflake accounts. DataHub Snowflake connector can automatically link such tables together through Siblings and Lineage relationship if user provides information necessary to establish the relationship using configuration `shares` in recipe. + +#### Example +- Snowflake account `account1` (ingested as platform_instance `instance1`) owns a database `db1`. A share `X` is created in `account1` that includes database `db1` along with schemas and tables inside it. +- Now, `X` is shared with snowflake account `account2` (ingested as platform_instance `instance2`). 
A database `db1_from_X` is created from inbound share `X` in `account2`. In this case, all tables and views included in share `X` will also be present in `instance2`.`db1_from_X`. +- This can be represented in `shares` configuration section as + ```yaml + shares: + X: # name of the share + database_name: db1 + platform_instance: instance1 + consumers: # list of all databases created from share X + - database_name: db1_from_X + platform_instance: instance2 + + ``` +- If share `X` is shared with more snowflake accounts and database is created from share `X` in those account then additional entries need to be added in `consumers` list for share `X`, one per snowflake account. The same `shares` config can then be copied across recipes of all accounts. ### Caveats - Some of the features are only available in the Snowflake Enterprise Edition. This doc has notes mentioning where this applies. diff --git a/metadata-ingestion/examples/bootstrap_data/business_glossary.yml b/metadata-ingestion/examples/bootstrap_data/business_glossary.yml index f2c70110326fc..de6ba8731c878 100644 --- a/metadata-ingestion/examples/bootstrap_data/business_glossary.yml +++ b/metadata-ingestion/examples/bootstrap_data/business_glossary.yml @@ -70,7 +70,7 @@ nodes: - Shipping.CountryCode - Shipping.StreetAddress custom_properties: - - is_used_for_compliance_tracking: true + is_used_for_compliance_tracking: true knowledge_links: - url: "https://en.wikipedia.org/wiki/Address" label: Wiki link diff --git a/metadata-ingestion/schedule_docs/airflow.md b/metadata-ingestion/schedule_docs/airflow.md index e48710964b01c..95393c3cc9919 100644 --- a/metadata-ingestion/schedule_docs/airflow.md +++ b/metadata-ingestion/schedule_docs/airflow.md @@ -4,9 +4,9 @@ If you are using Apache Airflow for your scheduling then you might want to also We've provided a few examples of how to configure your DAG: -- [`mysql_sample_dag`](../src/datahub_provider/example_dags/mysql_sample_dag.py) embeds the full MySQL ingestion configuration inside the DAG. +- [`mysql_sample_dag`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/mysql_sample_dag.py) embeds the full MySQL ingestion configuration inside the DAG. -- [`snowflake_sample_dag`](../src/datahub_provider/example_dags/snowflake_sample_dag.py) avoids embedding credentials inside the recipe, and instead fetches them from Airflow's [Connections](https://airflow.apache.org/docs/apache-airflow/stable/howto/connection/index.html) feature. You must configure your connections in Airflow to use this approach. +- [`snowflake_sample_dag`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/snowflake_sample_dag.py) avoids embedding credentials inside the recipe, and instead fetches them from Airflow's [Connections](https://airflow.apache.org/docs/apache-airflow/stable/howto/connection/index.html) feature. You must configure your connections in Airflow to use this approach. :::tip @@ -37,6 +37,6 @@ In more advanced cases, you might want to store your ingestion recipe in a file - Create a DAG task to read your DataHub ingestion recipe file and run it. See the example below for reference. - Deploy the DAG file into airflow for scheduling. Typically this involves checking in the DAG file into your dags folder which is accessible to your Airflow instance. 
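As a rough sketch of what such a DAG task can look like (the recipe path, schedule, and DAG/task ids below are illustrative; see the linked example DAG for the maintained version):

```python
from datetime import datetime, timedelta

from airflow.models import DAG
from airflow.operators.python import PythonOperator

from datahub.configuration.config_loader import load_config_file
from datahub.ingestion.run.pipeline import Pipeline


def ingest_from_recipe(recipe_path: str) -> None:
    # Load the recipe YAML (environment variables inside it are resolved),
    # then run the ingestion pipeline and fail the task on errors.
    config = load_config_file(recipe_path)
    pipeline = Pipeline.create(config)
    pipeline.run()
    pipeline.raise_from_status()


with DAG(
    dag_id="datahub_ingest_from_recipe",  # illustrative
    start_date=datetime(2023, 1, 1),
    schedule_interval=timedelta(days=1),
    catchup=False,
) as dag:
    PythonOperator(
        task_id="ingest_using_recipe",
        python_callable=ingest_from_recipe,
        op_kwargs={"recipe_path": "/opt/airflow/recipes/mysql_to_datahub.yml"},  # illustrative path
    )
```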
-Example: [`generic_recipe_sample_dag`](../src/datahub_provider/example_dags/generic_recipe_sample_dag.py) +Example: [`generic_recipe_sample_dag`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/generic_recipe_sample_dag.py) diff --git a/metadata-ingestion/setup.cfg b/metadata-ingestion/setup.cfg index 59d847395ec47..fad55b99ec938 100644 --- a/metadata-ingestion/setup.cfg +++ b/metadata-ingestion/setup.cfg @@ -75,7 +75,6 @@ disallow_untyped_defs = yes asyncio_mode = auto addopts = --cov=src --cov-report= --cov-config setup.cfg --strict-markers markers = - airflow: marks tests related to airflow (deselect with '-m not airflow') slow_unit: marks tests to only run slow unit tests (deselect with '-m not slow_unit') integration: marks tests to only run in integration (deselect with '-m "not integration"') integration_batch_1: mark tests to only run in batch 1 of integration tests. This is done mainly for parallelisation (deselect with '-m not integration_batch_1') @@ -112,5 +111,3 @@ exclude_lines = omit = # omit codegen src/datahub/metadata/* - # omit example dags - src/datahub_provider/example_dags/* diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 4339e3eb099c8..32e1cf926cc68 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -229,8 +229,8 @@ def get_long_description(): iceberg_common = { # Iceberg Python SDK - "acryl-iceberg-legacy==0.0.4", - "azure-identity==1.10.0", + "pyiceberg", + "pyarrow>=9.0.0, <13.0.0", } s3_base = { @@ -247,8 +247,8 @@ def get_long_description(): } data_lake_profiling = { - "pydeequ>=1.0.1, <1.1", - "pyspark==3.0.3", + "pydeequ==1.1.0", + "pyspark~=3.3.0", } delta_lake = { @@ -269,6 +269,8 @@ def get_long_description(): "requests", } +mysql = sql_common | {"pymysql>=1.0.2"} + # Note: for all of these, framework_common will be added. plugins: Dict[str, Set[str]] = { # Sink plugins. @@ -281,13 +283,13 @@ def get_long_description(): }, # Integrations. "airflow": { - "apache-airflow >= 2.0.2", - *rest_common, + f"acryl-datahub-airflow-plugin == {package_metadata['__version__']}", }, "circuit-breaker": { "gql>=3.3.0", "gql[requests]>=3.3.0", }, + "datahub": mysql | kafka_common, "great-expectations": sql_common | sqllineage_lib, # Misc plugins. 
"sql-parser": sqlglot_lib, @@ -342,7 +344,7 @@ def get_long_description(): }, "iceberg": iceberg_common, "json-schema": set(), - "kafka": {*kafka_common, *kafka_protobuf}, + "kafka": kafka_common | kafka_protobuf, "kafka-connect": sql_common | {"requests", "JPype1"}, "ldap": {"python-ldap>=2.4"}, "looker": looker_common, @@ -352,10 +354,10 @@ def get_long_description(): "mongodb": {"pymongo[srv]>=3.11", "packaging"}, "mssql": sql_common | {"sqlalchemy-pytds>=0.3"}, "mssql-odbc": sql_common | {"pyodbc"}, - "mysql": sql_common | {"pymysql>=1.0.2"}, + "mysql": mysql, # mariadb should have same dependency as mysql "mariadb": sql_common | {"pymysql>=1.0.2"}, - "okta": {"okta~=1.7.0"}, + "okta": {"okta~=1.7.0", "nest-asyncio"}, "oracle": sql_common | {"cx_Oracle"}, "postgres": sql_common | {"psycopg2-binary", "GeoAlchemy2"}, "presto": sql_common | pyhive_common | trino, @@ -373,6 +375,7 @@ def get_long_description(): "salesforce": {"simple-salesforce"}, "snowflake": snowflake_common | usage_common | sqlglot_lib, "sqlalchemy": sql_common, + "sql-queries": usage_common | sqlglot_lib, "superset": { "requests", "sqlalchemy", @@ -385,7 +388,7 @@ def get_long_description(): "trino": sql_common | trino, "starburst-trino-usage": sql_common | usage_common | trino, "nifi": {"requests", "packaging", "requests-gssapi"}, - "powerbi": microsoft_common | {"lark[regex]==1.1.4", "sqlparse"}, + "powerbi": microsoft_common | {"lark[regex]==1.1.4", "sqlparse"} | sqlglot_lib, "powerbi-report-server": powerbi_report_server, "vertica": sql_common | {"vertica-sqlalchemy-dialect[vertica-python]==0.0.8"}, "unity-catalog": databricks | sqllineage_lib, @@ -417,6 +420,7 @@ def get_long_description(): # The boto3-stubs package seems to have regularly breaking minor releases, # we pin to a specific version to avoid this. "boto3-stubs[s3,glue,sagemaker,sts]==1.28.15", + "mypy-boto3-sagemaker==1.28.15", # For some reason, above pin only restricts `mypy-boto3-sagemaker<1.29.0,>=1.28.0` "types-tabulate", # avrogen package requires this "types-pytz", @@ -451,7 +455,7 @@ def get_long_description(): "mypy==1.0.0", # pydantic 1.8.2 is incompatible with mypy 0.910. # See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910. - "pydantic>=1.9.0", + "pydantic>=1.10.0", *test_api_requirements, pytest_dep, "pytest-asyncio>=0.16.0", @@ -473,7 +477,7 @@ def get_long_description(): "druid", "elasticsearch", "feast" if sys.version_info >= (3, 8) else None, - "iceberg", + "iceberg" if sys.version_info >= (3, 8) else None, "json-schema", "ldap", "looker", @@ -503,8 +507,8 @@ def get_long_description(): "salesforce", "unity-catalog", "nifi", - "vertica" - # airflow is added below + "vertica", + "mode", ] if plugin for dependency in plugins[plugin] @@ -513,9 +517,6 @@ def get_long_description(): dev_requirements = { *base_dev_requirements, - # Extra requirements for Airflow. 
- "apache-airflow[snowflake]>=2.0.2", # snowflake is used in example dags - "virtualenv", # needed by PythonVirtualenvOperator } full_test_dev_requirements = { @@ -529,7 +530,7 @@ def get_long_description(): "druid", "hana", "hive", - "iceberg", + "iceberg" if sys.version_info >= (3, 8) else None, "kafka-connect", "ldap", "mongodb", @@ -539,6 +540,7 @@ def get_long_description(): "redash", "vertica", ] + if plugin for dependency in plugins[plugin] ), } @@ -548,6 +550,7 @@ def get_long_description(): "datahub.ingestion.source.plugins": [ "csv-enricher = datahub.ingestion.source.csv_enricher:CSVEnricherSource", "file = datahub.ingestion.source.file:GenericFileSource", + "datahub = datahub.ingestion.source.datahub.datahub_source:DataHubSource", "sqlalchemy = datahub.ingestion.source.sql.sql_generic:SQLAlchemyGenericSource", "athena = datahub.ingestion.source.sql.athena:AthenaSource", "azure-ad = datahub.ingestion.source.identity.azure_ad:AzureADSource", @@ -604,6 +607,7 @@ def get_long_description(): "demo-data = datahub.ingestion.source.demo_data.DemoDataSource", "unity-catalog = datahub.ingestion.source.unity.source:UnityCatalogSource", "gcs = datahub.ingestion.source.gcs.gcs_source:GCSSource", + "sql-queries = datahub.ingestion.source.sql_queries:SqlQueriesSource", ], "datahub.ingestion.transformer.plugins": [ "simple_remove_dataset_ownership = datahub.ingestion.transformer.remove_dataset_ownership:SimpleRemoveDatasetOwnership", diff --git a/metadata-ingestion/src/datahub/cli/delete_cli.py b/metadata-ingestion/src/datahub/cli/delete_cli.py index 0d3c35e933e25..7ab7605ef6363 100644 --- a/metadata-ingestion/src/datahub/cli/delete_cli.py +++ b/metadata-ingestion/src/datahub/cli/delete_cli.py @@ -37,6 +37,11 @@ "glossaryNode", } +_RECURSIVE_DELETE_TYPES = { + "container", + "dataPlatformInstance", +} + @click.group(cls=DefaultGroup, default="by-filter") def delete() -> None: @@ -252,6 +257,12 @@ def references(urn: str, dry_run: bool, force: bool) -> None: help="Entity type filter (e.g. dataset)", ) @click.option("--query", required=False, type=str, help="Elasticsearch query string") +@click.option( + "--recursive", + required=False, + is_flag=True, + help="Recursively delete all contained entities (only for containers and dataPlatformInstances)", +) @click.option( "--start-time", required=False, @@ -298,6 +309,7 @@ def by_filter( platform: Optional[str], entity_type: Optional[str], query: Optional[str], + recursive: bool, start_time: Optional[datetime], end_time: Optional[datetime], batch_size: int, @@ -308,7 +320,12 @@ def by_filter( # Validate the cli arguments. _validate_user_urn_and_filters( - urn=urn, entity_type=entity_type, platform=platform, env=env, query=query + urn=urn, + entity_type=entity_type, + platform=platform, + env=env, + query=query, + recursive=recursive, ) soft_delete_filter = _validate_user_soft_delete_flags( soft=soft, aspect=aspect, only_soft_deleted=only_soft_deleted @@ -327,11 +344,29 @@ def by_filter( logger.info(f"Using {graph}") # Determine which urns to delete. + delete_by_urn = bool(urn) and not recursive if urn: - delete_by_urn = True urns = [urn] + + if recursive: + # Add children urns to the list. 
+ if guess_entity_type(urn) == "dataPlatformInstance": + urns.extend( + graph.get_urns_by_filter( + platform_instance=urn, + status=soft_delete_filter, + batch_size=batch_size, + ) + ) + else: + urns.extend( + graph.get_urns_by_filter( + container=urn, + status=soft_delete_filter, + batch_size=batch_size, + ) + ) else: - delete_by_urn = False urns = list( graph.get_urns_by_filter( entity_types=[entity_type] if entity_type else None, @@ -348,20 +383,22 @@ def by_filter( ) return + # Print out a summary of the urns to be deleted and confirm with the user. + if not delete_by_urn: urns_by_type: Dict[str, List[str]] = {} for urn in urns: entity_type = guess_entity_type(urn) urns_by_type.setdefault(entity_type, []).append(urn) if len(urns_by_type) > 1: # Display a breakdown of urns by entity type if there's multiple. - click.echo("Filter matched urns of multiple entity types") + click.echo("Found urns of multiple entity types") for entity_type, entity_urns in urns_by_type.items(): click.echo( f"- {len(entity_urns)} {entity_type} urn(s). Sample: {choices(entity_urns, k=min(5, len(entity_urns)))}" ) else: click.echo( - f"Filter matched {len(urns)} {entity_type} urn(s). Sample: {choices(urns, k=min(5, len(urns)))}" + f"Found {len(urns)} {entity_type} urn(s). Sample: {choices(urns, k=min(5, len(urns)))}" ) if not force and not dry_run: @@ -403,6 +440,7 @@ def _validate_user_urn_and_filters( platform: Optional[str], env: Optional[str], query: Optional[str], + recursive: bool, ) -> None: # Check urn / filters options. if urn: @@ -423,6 +461,21 @@ def _validate_user_urn_and_filters( f"Using --env without other filters will delete all metadata in the {env} environment. Please use with caution." ) + # Check recursive flag. + if recursive: + if not urn: + raise click.UsageError( + "The --recursive flag can only be used with a single urn." + ) + elif guess_entity_type(urn) not in _RECURSIVE_DELETE_TYPES: + raise click.UsageError( + f"The --recursive flag can only be used with these entity types: {_RECURSIVE_DELETE_TYPES}." + ) + elif urn and guess_entity_type(urn) in _RECURSIVE_DELETE_TYPES: + logger.warning( + f"This will only delete {urn}. Use --recursive to delete all contained entities." 
+ ) + def _validate_user_soft_delete_flags( soft: bool, aspect: Optional[str], only_soft_deleted: bool diff --git a/metadata-ingestion/src/datahub/cli/docker_cli.py b/metadata-ingestion/src/datahub/cli/docker_cli.py index 918f610ce4635..9fde47c82873c 100644 --- a/metadata-ingestion/src/datahub/cli/docker_cli.py +++ b/metadata-ingestion/src/datahub/cli/docker_cli.py @@ -893,6 +893,7 @@ def download_compose_files( tmp_file.write(quickstart_download_response.content) logger.debug(f"Copied to {path}") if kafka_setup: + base_url = get_docker_compose_base_url(compose_git_ref) kafka_setup_github_file = f"{base_url}/{KAFKA_SETUP_QUICKSTART_COMPOSE_FILE}" default_kafka_compose_file = ( diff --git a/metadata-ingestion/src/datahub/configuration/config_loader.py b/metadata-ingestion/src/datahub/configuration/config_loader.py index 78bee21d1bda4..30ca4ff6aed2d 100644 --- a/metadata-ingestion/src/datahub/configuration/config_loader.py +++ b/metadata-ingestion/src/datahub/configuration/config_loader.py @@ -2,6 +2,7 @@ import pathlib import re import sys +import tempfile import unittest.mock from typing import Any, Dict, Set, Union from urllib import parse @@ -14,7 +15,7 @@ from datahub.configuration.yaml import YamlConfigurationMechanism -def resolve_element(element: str) -> str: +def _resolve_element(element: str) -> str: if re.search(r"(\$\{).+(\})", element): return expandvars(element, nounset=True) elif element.startswith("$"): @@ -30,7 +31,7 @@ def _resolve_list(ele_list: list) -> list: new_v: list = [] for ele in ele_list: if isinstance(ele, str): - new_v.append(resolve_element(ele)) + new_v.append(_resolve_element(ele)) elif isinstance(ele, list): new_v.append(_resolve_list(ele)) elif isinstance(ele, dict): @@ -48,7 +49,7 @@ def resolve_env_variables(config: dict) -> dict: elif isinstance(v, list): new_dict[k] = _resolve_list(v) elif isinstance(v, str): - new_dict[k] = resolve_element(v) + new_dict[k] = _resolve_element(v) else: new_dict[k] = v return new_dict @@ -67,12 +68,40 @@ def list_referenced_env_variables(config: dict) -> Set[str]: return set([call[1][0] for call in calls]) +WRITE_TO_FILE_DIRECTIVE_PREFIX = "__DATAHUB_TO_FILE_" + + +def _process_directives(config: dict) -> dict: + def _process(obj: Any) -> Any: + if isinstance(obj, dict): + new_obj = {} + for k, v in obj.items(): + if isinstance(k, str) and k.startswith(WRITE_TO_FILE_DIRECTIVE_PREFIX): + # This writes the value to a temporary file and replaces the value with the path to the file. 
+ config_option = k[len(WRITE_TO_FILE_DIRECTIVE_PREFIX) :] + + with tempfile.NamedTemporaryFile("w", delete=False) as f: + filepath = f.name + f.write(v) + + new_obj[config_option] = filepath + else: + new_obj[k] = _process(v) + + return new_obj + else: + return obj + + return _process(config) + + def load_config_file( config_file: Union[str, pathlib.Path], squirrel_original_config: bool = False, squirrel_field: str = "__orig_config", allow_stdin: bool = False, resolve_env_vars: bool = True, + process_directives: bool = True, ) -> dict: config_mech: ConfigurationMechanism if allow_stdin and config_file == "-": @@ -105,10 +134,13 @@ def load_config_file( config_fp = io.StringIO(raw_config_file) raw_config = config_mech.load_config(config_fp) + + config = raw_config.copy() if resolve_env_vars: - config = resolve_env_variables(raw_config) - else: - config = raw_config + config = resolve_env_variables(config) + if process_directives: + config = _process_directives(config) + if squirrel_original_config: config[squirrel_field] = raw_config return config diff --git a/metadata-ingestion/src/datahub/configuration/datetimes.py b/metadata-ingestion/src/datahub/configuration/datetimes.py index 55f5c6fbd6155..41af7565593d9 100644 --- a/metadata-ingestion/src/datahub/configuration/datetimes.py +++ b/metadata-ingestion/src/datahub/configuration/datetimes.py @@ -43,24 +43,28 @@ def parse_user_datetime(input: str) -> datetime: # Then try parsing as a relative time. with contextlib.suppress(humanfriendly.InvalidTimespan): - delta = _parse_relative_timespan(input) + delta = parse_relative_timespan(input) return datetime.now(tz=timezone.utc) + delta # Finally, try parsing as an absolute time. with contextlib.suppress(dateutil.parser.ParserError): - dt = dateutil.parser.parse(input) - if dt.tzinfo is None: - # Assume that the user meant to specify a time in UTC. - dt = dt.replace(tzinfo=timezone.utc) - else: - # Convert to UTC. - dt = dt.astimezone(timezone.utc) - return dt + return parse_absolute_time(input) raise ValueError(f"Could not parse {input} as a datetime or relative time.") -def _parse_relative_timespan(input: str) -> timedelta: +def parse_absolute_time(input: str) -> datetime: + dt = dateutil.parser.parse(input) + if dt.tzinfo is None: + # Assume that the user meant to specify a time in UTC. + dt = dt.replace(tzinfo=timezone.utc) + else: + # Convert to UTC. + dt = dt.astimezone(timezone.utc) + return dt + + +def parse_relative_timespan(input: str) -> timedelta: neg = False input = input.strip() diff --git a/metadata-ingestion/src/datahub/configuration/time_window_config.py b/metadata-ingestion/src/datahub/configuration/time_window_config.py index a4b451f0cdfbd..15de7470e4d82 100644 --- a/metadata-ingestion/src/datahub/configuration/time_window_config.py +++ b/metadata-ingestion/src/datahub/configuration/time_window_config.py @@ -2,10 +2,12 @@ from datetime import datetime, timedelta, timezone from typing import Any, Dict, List +import humanfriendly import pydantic from pydantic.fields import Field from datahub.configuration.common import ConfigModel +from datahub.configuration.datetimes import parse_absolute_time, parse_relative_timespan from datahub.metadata.schema_classes import CalendarIntervalClass @@ -42,25 +44,46 @@ class BaseTimeWindowConfig(ConfigModel): # if those fields are not set by the user. end_time: datetime = Field( default_factory=lambda: datetime.now(tz=timezone.utc), - description="Latest date of usage to consider. 
Default: Current time in UTC", + description="Latest date of lineage/usage to consider. Default: Current time in UTC", ) - start_time: datetime = Field(default=None, description="Earliest date of usage to consider. Default: Last full day in UTC (or hour, depending on `bucket_duration`)") # type: ignore + start_time: datetime = Field(default=None, description="Earliest date of lineage/usage to consider. Default: Last full day in UTC (or hour, depending on `bucket_duration`). You can also specify relative time with respect to end_time such as '-7 days' Or '-7d'.") # type: ignore @pydantic.validator("start_time", pre=True, always=True) def default_start_time( - cls, v: Any, *, values: Dict[str, Any], **kwargs: Any + cls, v: Any, values: Dict[str, Any], **kwargs: Any ) -> datetime: - return v or get_time_bucket( - values["end_time"] - get_bucket_duration_delta(values["bucket_duration"]), - values["bucket_duration"], - ) + if v is None: + return get_time_bucket( + values["end_time"] + - get_bucket_duration_delta(values["bucket_duration"]), + values["bucket_duration"], + ) + elif isinstance(v, str): + # This is where start_time str is resolved to datetime + try: + delta = parse_relative_timespan(v) + assert delta < timedelta( + 0 + ), "Relative start time should start with minus sign (-) e.g. '-2 days'." + assert abs(delta) >= get_bucket_duration_delta( + values["bucket_duration"] + ), "Relative start time should be in terms of configured bucket duration. e.g '-2 days' or '-2 hours'." + return get_time_bucket( + values["end_time"] + delta, values["bucket_duration"] + ) + except humanfriendly.InvalidTimespan: + # We do not floor start_time to the bucket start time if absolute start time is specified. + # If user has specified absolute start time in recipe, it's most likely that he means it. + return parse_absolute_time(v) + + return v @pydantic.validator("start_time", "end_time") def ensure_timestamps_in_utc(cls, v: datetime) -> datetime: - if v.tzinfo != timezone.utc: - raise ValueError( - 'timezone is not UTC; try adding a "Z" to the value e.g. "2021-07-20T00:00:00Z"' - ) + assert ( + v.tzinfo == timezone.utc + ), 'timezone is not UTC; try adding a "Z" to the value e.g. 
"2021-07-20T00:00:00Z"' + return v def buckets(self) -> List[datetime]: diff --git a/metadata-ingestion/src/datahub/emitter/aspect.py b/metadata-ingestion/src/datahub/emitter/aspect.py index 9118967a07273..0be2b3336980c 100644 --- a/metadata-ingestion/src/datahub/emitter/aspect.py +++ b/metadata-ingestion/src/datahub/emitter/aspect.py @@ -1,10 +1,12 @@ -from datahub.metadata.schema_classes import ASPECT_CLASSES +from typing import Dict, Type -ASPECT_MAP = { +from datahub.metadata.schema_classes import ASPECT_CLASSES, _Aspect + +ASPECT_MAP: Dict[str, Type[_Aspect]] = { AspectClass.get_aspect_name(): AspectClass for AspectClass in ASPECT_CLASSES } -TIMESERIES_ASPECT_MAP = { +TIMESERIES_ASPECT_MAP: Dict[str, Type[_Aspect]] = { name: klass for name, klass in ASPECT_MAP.items() if klass.get_aspect_type() == "timeseries" diff --git a/metadata-ingestion/src/datahub/emitter/mcp.py b/metadata-ingestion/src/datahub/emitter/mcp.py index 6f9a22bffd085..9085ac152ea0b 100644 --- a/metadata-ingestion/src/datahub/emitter/mcp.py +++ b/metadata-ingestion/src/datahub/emitter/mcp.py @@ -9,6 +9,7 @@ DictWrapper, GenericAspectClass, KafkaAuditHeaderClass, + MetadataChangeLogClass, MetadataChangeProposalClass, SystemMetadataClass, _Aspect, @@ -214,6 +215,22 @@ def try_from_mcpc( else: return None + @classmethod + def try_from_mcl( + cls, mcl: MetadataChangeLogClass + ) -> Union["MetadataChangeProposalWrapper", MetadataChangeProposalClass]: + mcpc = MetadataChangeProposalClass( + entityUrn=mcl.entityUrn, + entityType=mcl.entityType, + entityKeyAspect=mcl.entityKeyAspect, + aspect=mcl.aspect, + aspectName=mcl.aspectName, + changeType=mcl.changeType, + auditHeader=mcl.auditHeader, + systemMetadata=mcl.systemMetadata, + ) + return cls.try_from_mcpc(mcpc) or mcpc + @classmethod def from_obj_require_wrapper( cls, obj: dict, tuples: bool = False diff --git a/metadata-ingestion/src/datahub/emitter/rest_emitter.py b/metadata-ingestion/src/datahub/emitter/rest_emitter.py index cf4d46cf18ba8..acb5763280905 100644 --- a/metadata-ingestion/src/datahub/emitter/rest_emitter.py +++ b/metadata-ingestion/src/datahub/emitter/rest_emitter.py @@ -31,13 +31,14 @@ ) _DEFAULT_RETRY_STATUS_CODES = [ # Additional status codes to retry on 429, + 500, 502, 503, 504, ] _DEFAULT_RETRY_METHODS = ["HEAD", "GET", "POST", "PUT", "DELETE", "OPTIONS", "TRACE"] _DEFAULT_RETRY_MAX_TIMES = int( - os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "3") + os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4") ) diff --git a/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py b/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py new file mode 100644 index 0000000000000..071d590f270f8 --- /dev/null +++ b/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py @@ -0,0 +1,289 @@ +import logging +import time +from collections import defaultdict +from dataclasses import dataclass, field +from datetime import datetime +from typing import Collection, Dict, Iterable, List, Optional, Set + +from datahub.emitter.mce_builder import make_schema_field_urn +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.usage.usage_common import BaseUsageConfig, UsageAggregator +from datahub.metadata.schema_classes import ( + AuditStampClass, + DatasetLineageTypeClass, + FineGrainedLineageClass, + FineGrainedLineageDownstreamTypeClass, + FineGrainedLineageUpstreamTypeClass, + OperationClass, + OperationTypeClass, + UpstreamClass, + 
UpstreamLineageClass, +) +from datahub.utilities.sqlglot_lineage import ColumnLineageInfo, SqlParsingResult + +logger = logging.getLogger(__name__) + +# TODO: Use this over other sources' equivalent code, if possible + +DatasetUrn = str +FieldUrn = str +UserUrn = str + + +@dataclass +class LineageEdge: + """Stores information about a single lineage edge, from an upstream table to a downstream table.""" + + downstream_urn: DatasetUrn + upstream_urn: DatasetUrn + audit_stamp: Optional[datetime] + actor: Optional[UserUrn] + type: str = DatasetLineageTypeClass.TRANSFORMED + + # Maps downstream_col -> {upstream_col} + column_map: Dict[str, Set[str]] = field(default_factory=lambda: defaultdict(set)) + + def gen_upstream_aspect(self) -> UpstreamClass: + return UpstreamClass( + auditStamp=AuditStampClass( + time=int(self.audit_stamp.timestamp() * 1000), actor=self.actor or "" + ) + if self.audit_stamp + else None, + dataset=self.upstream_urn, + type=self.type, + ) + + def gen_fine_grained_lineage_aspects(self) -> Iterable[FineGrainedLineageClass]: + for downstream_col, upstream_cols in self.column_map.items(): + yield FineGrainedLineageClass( + upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET, + # Sort to avoid creating multiple aspects in backend with same lineage but different order + upstreams=sorted( + make_schema_field_urn(self.upstream_urn, col) + for col in upstream_cols + ), + downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD, + downstreams=[ + make_schema_field_urn(self.downstream_urn, downstream_col) + ], + ) + + +@dataclass +class SqlParsingBuilder: + # Open question: does it make sense to iterate over out_tables? When will we have multiple? + + generate_lineage: bool = True + generate_usage_statistics: bool = True + generate_operations: bool = True + usage_config: Optional[BaseUsageConfig] = None + + # TODO: Make inner dict a FileBackedDict and make LineageEdge frozen + # Builds up a single LineageEdge for each upstream -> downstream pair + _lineage_map: Dict[DatasetUrn, Dict[DatasetUrn, LineageEdge]] = field( + default_factory=lambda: defaultdict(dict), init=False + ) + + # TODO: Replace with FileBackedDict approach like in BigQuery usage + _usage_aggregator: UsageAggregator[DatasetUrn] = field(init=False) + + def __post_init__(self) -> None: + if self.usage_config: + self._usage_aggregator = UsageAggregator(self.usage_config) + else: + logger.info("No usage config provided, not generating usage statistics") + self.generate_usage_statistics = False + + def process_sql_parsing_result( + self, + result: SqlParsingResult, + *, + query: str, + query_timestamp: Optional[datetime] = None, + is_view_ddl: bool = False, + user: Optional[UserUrn] = None, + custom_operation_type: Optional[str] = None, + include_urns: Optional[Set[DatasetUrn]] = None, + ) -> Iterable[MetadataWorkUnit]: + """Process a single query and yield any generated workunits. + + Args: + result: The result of parsing the query, or a mock result if parsing failed. + query: The SQL query to parse and process. + query_timestamp: When the query was run. + is_view_ddl: Whether the query is a DDL statement that creates a view. + user: The urn of the user who ran the query. + custom_operation_type: Platform-specific operation type, used if the operation type can't be parsed. + include_urns: If provided, only generate workunits for these urns. 
+ """ + downstreams_to_ingest = result.out_tables + upstreams_to_ingest = result.in_tables + if include_urns: + logger.debug(f"Skipping urns {set(downstreams_to_ingest) - include_urns}") + downstreams_to_ingest = list(set(downstreams_to_ingest) & include_urns) + upstreams_to_ingest = list(set(upstreams_to_ingest) & include_urns) + + if self.generate_lineage: + for downstream_urn in downstreams_to_ingest: + _merge_lineage_data( + downstream_urn=downstream_urn, + upstream_urns=result.in_tables, + column_lineage=result.column_lineage, + upstream_edges=self._lineage_map[downstream_urn], + query_timestamp=query_timestamp, + is_view_ddl=is_view_ddl, + user=user, + ) + + if self.generate_usage_statistics and query_timestamp is not None: + upstream_fields = _compute_upstream_fields(result) + for upstream_urn in upstreams_to_ingest: + self._usage_aggregator.aggregate_event( + resource=upstream_urn, + start_time=query_timestamp, + query=query, + user=user, + fields=sorted(upstream_fields.get(upstream_urn, [])), + ) + + if self.generate_operations and query_timestamp is not None: + for downstream_urn in downstreams_to_ingest: + yield from _gen_operation_workunit( + result, + downstream_urn=downstream_urn, + query_timestamp=query_timestamp, + user=user, + custom_operation_type=custom_operation_type, + ) + + def add_lineage( + self, + downstream_urn: DatasetUrn, + upstream_urns: Collection[DatasetUrn], + timestamp: Optional[datetime] = None, + is_view_ddl: bool = False, + user: Optional[UserUrn] = None, + ) -> None: + """Manually add a single upstream -> downstream lineage edge, e.g. if sql parsing fails.""" + _merge_lineage_data( + downstream_urn=downstream_urn, + upstream_urns=upstream_urns, + column_lineage=None, + upstream_edges=self._lineage_map[downstream_urn], + query_timestamp=timestamp, + is_view_ddl=is_view_ddl, + user=user, + ) + + def gen_workunits(self) -> Iterable[MetadataWorkUnit]: + if self.generate_lineage: + yield from self._gen_lineage_workunits() + if self.generate_usage_statistics: + yield from self._gen_usage_statistics_workunits() + + def _gen_lineage_workunits(self) -> Iterable[MetadataWorkUnit]: + for downstream_urn in self._lineage_map: + upstreams: List[UpstreamClass] = [] + fine_upstreams: List[FineGrainedLineageClass] = [] + for upstream_urn, edge in self._lineage_map[downstream_urn].items(): + upstreams.append(edge.gen_upstream_aspect()) + fine_upstreams.extend(edge.gen_fine_grained_lineage_aspects()) + + upstream_lineage = UpstreamLineageClass( + upstreams=sorted(upstreams, key=lambda x: x.dataset), + fineGrainedLineages=sorted( + fine_upstreams, + key=lambda x: (x.downstreams, x.upstreams), + ) + or None, + ) + yield MetadataChangeProposalWrapper( + entityUrn=downstream_urn, aspect=upstream_lineage + ).as_workunit() + + def _gen_usage_statistics_workunits(self) -> Iterable[MetadataWorkUnit]: + yield from self._usage_aggregator.generate_workunits( + resource_urn_builder=lambda urn: urn, user_urn_builder=lambda urn: urn + ) + + +def _merge_lineage_data( + downstream_urn: DatasetUrn, + *, + upstream_urns: Collection[DatasetUrn], + column_lineage: Optional[List[ColumnLineageInfo]], + upstream_edges: Dict[DatasetUrn, LineageEdge], + query_timestamp: Optional[datetime], + is_view_ddl: bool, + user: Optional[UserUrn], +) -> None: + for upstream_urn in upstream_urns: + edge = upstream_edges.setdefault( + upstream_urn, + LineageEdge( + downstream_urn=downstream_urn, + upstream_urn=upstream_urn, + audit_stamp=query_timestamp, + actor=user, + type=DatasetLineageTypeClass.VIEW + if 
is_view_ddl + else DatasetLineageTypeClass.TRANSFORMED, + ), + ) + if query_timestamp and ( # Use the most recent query + edge.audit_stamp is None or query_timestamp > edge.audit_stamp + ): + edge.audit_stamp = query_timestamp + if user: + edge.actor = user + + # Note: Inefficient as we loop through all column_lineage entries for each downstream table + for cl in column_lineage or []: + if cl.downstream.table == downstream_urn: + for upstream_column_info in cl.upstreams: + if upstream_column_info.table not in upstream_urns: + continue + column_map = upstream_edges[upstream_column_info.table].column_map + column_map[cl.downstream.column].add(upstream_column_info.column) + + +def _compute_upstream_fields( + result: SqlParsingResult, +) -> Dict[DatasetUrn, Set[DatasetUrn]]: + upstream_fields: Dict[DatasetUrn, Set[DatasetUrn]] = defaultdict(set) + for cl in result.column_lineage or []: + for upstream in cl.upstreams: + upstream_fields[upstream.table].add(upstream.column) + return upstream_fields + + +def _gen_operation_workunit( + result: SqlParsingResult, + *, + downstream_urn: DatasetUrn, + query_timestamp: datetime, + user: Optional[UserUrn], + custom_operation_type: Optional[str], +) -> Iterable[MetadataWorkUnit]: + operation_type = result.query_type.to_operation_type() + # Filter out SELECT and other undesired statements + if operation_type is None: + return + elif operation_type == OperationTypeClass.UNKNOWN: + if custom_operation_type is None: + return + else: + operation_type = OperationTypeClass.CUSTOM + + aspect = OperationClass( + timestampMillis=int(time.time() * 1000), + operationType=operation_type, + lastUpdatedTimestamp=int(query_timestamp.timestamp() * 1000), + actor=user, + customOperationType=custom_operation_type, + ) + yield MetadataChangeProposalWrapper( + entityUrn=downstream_urn, aspect=aspect + ).as_workunit() diff --git a/metadata-ingestion/src/datahub/ingestion/api/registry.py b/metadata-ingestion/src/datahub/ingestion/api/registry.py index ec4884e7e805f..56ea716948199 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/registry.py +++ b/metadata-ingestion/src/datahub/ingestion/api/registry.py @@ -127,7 +127,7 @@ def _ensure_not_lazy(self, key: str) -> Union[Type[T], Exception]: plugin_class = import_path(path) self.register(key, plugin_class, override=True) return plugin_class - except (AssertionError, ImportError) as e: + except Exception as e: self.register_disabled(key, e, override=True) return e diff --git a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py index 0eabd22e77334..7fc15cf829678 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py +++ b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py @@ -314,7 +314,7 @@ def auto_empty_dataset_usage_statistics( logger.warning( f"Usage statistics with unexpected timestamps, bucket_duration={config.bucket_duration}:\n" ", ".join( - str(datetime.fromtimestamp(ts, tz=timezone.utc)) + str(datetime.fromtimestamp(ts / 1000, tz=timezone.utc)) for ts in invalid_timestamps ) ) diff --git a/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py b/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py index 8e313e92cbf84..c943b83a887ed 100644 --- a/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py +++ b/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py @@ -435,6 +435,7 @@ def _field_from_complex_type( 
field_path._set_parent_type_if_not_exists( DataHubType(type=MapTypeClass, nested_type=value_type) ) + # FIXME: description not set. This is present in schema["description"]. yield from JsonSchemaTranslator.get_fields( JsonSchemaTranslator._get_type_from_schema( schema["additionalProperties"] diff --git a/metadata-ingestion/src/datahub/ingestion/extractor/mce_extractor.py b/metadata-ingestion/src/datahub/ingestion/extractor/mce_extractor.py index 62e880a2e5334..36450dda153d7 100644 --- a/metadata-ingestion/src/datahub/ingestion/extractor/mce_extractor.py +++ b/metadata-ingestion/src/datahub/ingestion/extractor/mce_extractor.py @@ -27,6 +27,9 @@ def _try_reformat_with_black(code: str) -> str: class WorkUnitRecordExtractorConfig(ConfigModel): set_system_metadata = True + set_system_metadata_pipeline_name = ( + False # false for now until the models are available in OSS + ) unpack_mces_into_mcps = False @@ -66,6 +69,10 @@ def get_records( workunit.metadata.systemMetadata = SystemMetadata( lastObserved=get_sys_time(), runId=self.ctx.run_id ) + if self.config.set_system_metadata_pipeline_name: + workunit.metadata.systemMetadata.pipelineName = ( + self.ctx.pipeline_name + ) if ( isinstance(workunit.metadata, MetadataChangeEvent) and len(workunit.metadata.proposedSnapshot.aspects) == 0 diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index 243c1848279c7..b371ab181e133 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -7,7 +7,7 @@ from dataclasses import dataclass from datetime import datetime from json.decoder import JSONDecodeError -from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Type +from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Tuple, Type from avro.schema import RecordSchema from deprecated import deprecated @@ -16,7 +16,12 @@ from datahub.cli.cli_utils import get_url_and_token from datahub.configuration.common import ConfigModel, GraphError, OperationalError from datahub.emitter.aspect import TIMESERIES_ASPECT_MAP -from datahub.emitter.mce_builder import DEFAULT_ENV, Aspect, make_data_platform_urn +from datahub.emitter.mce_builder import ( + DEFAULT_ENV, + Aspect, + make_data_platform_urn, + make_dataplatform_instance_urn, +) from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.rest_emitter import DatahubRestEmitter from datahub.emitter.serialization_helper import post_json_transform @@ -38,6 +43,8 @@ SystemMetadataClass, TelemetryClientIdClass, ) +from datahub.utilities.perf_timer import PerfTimer +from datahub.utilities.urns.dataset_urn import DatasetUrn from datahub.utilities.urns.urn import Urn, guess_entity_type if TYPE_CHECKING: @@ -541,8 +548,10 @@ def get_urns_by_filter( *, entity_types: Optional[List[str]] = None, platform: Optional[str] = None, + platform_instance: Optional[str] = None, env: Optional[str] = None, query: Optional[str] = None, + container: Optional[str] = None, status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED, batch_size: int = 10000, extraFilters: Optional[List[SearchFilterRule]] = None, @@ -555,15 +564,25 @@ def get_urns_by_filter( :param entity_types: List of entity types to include. If None, all entity types will be returned. :param platform: Platform to filter on. If None, all platforms will be returned. + :param platform_instance: Platform instance to filter on. 
If None, all platform instances will be returned. :param env: Environment (e.g. PROD, DEV) to filter on. If None, all environments will be returned. + :param query: Query string to filter on. If None, all entities will be returned. + :param container: A container urn that entities must be within. + This works recursively, so it will include entities within sub-containers as well. + If None, all entities will be returned. + Note that this requires browsePathV2 aspects (added in 0.10.4+). :param status: Filter on the deletion status of the entity. The default is only return non-soft-deleted entities. :param extraFilters: Additional filters to apply. If specified, the results will match all of the filters. + + :return: An iterable of urns that match the filters. """ types: Optional[List[str]] = None if entity_types is not None: if not entity_types: - raise ValueError("entity_types cannot be an empty list") + raise ValueError( + "entity_types cannot be an empty list; use None for all entities" + ) types = [_graphql_entity_type(entity_type) for entity_type in entity_types] @@ -582,6 +601,44 @@ def get_urns_by_filter( } ] + # Platform instance filter. + if platform_instance: + if platform: + # Massage the platform instance into a fully qualified urn, if necessary. + platform_instance = make_dataplatform_instance_urn( + platform, platform_instance + ) + + # Warn if platform_instance is not a fully qualified urn. + # TODO: Change this once we have a first-class data platform instance urn type. + if guess_entity_type(platform_instance) != "dataPlatformInstance": + raise ValueError( + f"Invalid data platform instance urn: {platform_instance}" + ) + + andFilters += [ + { + "field": "platformInstance", + "values": [platform_instance], + "condition": "EQUAL", + } + ] + + # Browse path v2 filter. + if container: + # Warn if container is not a fully qualified urn. + # TODO: Change this once we have a first-class container urn type. + if guess_entity_type(container) != "container": + raise ValueError(f"Invalid container urn: {container}") + + andFilters += [ + { + "field": "browsePathV2", + "values": [container], + "condition": "CONTAIN", + } + ] + # Status filter. if status == RemovedStatusFilter.NOT_SOFT_DELETED: # Subtle: in some cases (e.g. when the dataset doesn't have a status aspect), the @@ -957,7 +1014,11 @@ def delete_references_to_urn( @functools.lru_cache() def _make_schema_resolver( - self, platform: str, platform_instance: Optional[str], env: str + self, + platform: str, + platform_instance: Optional[str], + env: str, + include_graph: bool = True, ) -> "SchemaResolver": from datahub.utilities.sqlglot_lineage import SchemaResolver @@ -965,8 +1026,50 @@ def _make_schema_resolver( platform=platform, platform_instance=platform_instance, env=env, - graph=self, + graph=self if include_graph else None, + ) + + def initialize_schema_resolver_from_datahub( + self, platform: str, platform_instance: Optional[str], env: str + ) -> Tuple["SchemaResolver", Set[str]]: + logger.info("Initializing schema resolver") + + # TODO: Filter on platform instance? 
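# Illustrative usage of the new get_urns_by_filter parameters introduced above (a sketch, not
# part of the patch; the server URL, container urn, and platform instance name are hypothetical):
#
#     graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))
#     # Everything inside a container, recursively via browsePathV2:
#     in_container = list(graph.get_urns_by_filter(container="urn:li:container:abc123"))
#     # Datasets belonging to one BigQuery platform instance:
#     in_instance = list(
#         graph.get_urns_by_filter(
#             entity_types=["dataset"],
#             platform="bigquery",
#             platform_instance="my-gcp-project",
#         )
#     )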
+ logger.info(f"Fetching urns for platform {platform}, env {env}") + with PerfTimer() as timer: + urns = set( + self.get_urns_by_filter( + entity_types=[DatasetUrn.ENTITY_TYPE], + platform=platform, + env=env, + batch_size=3000, + ) + ) + logger.info( + f"Fetched {len(urns)} urns in {timer.elapsed_seconds()} seconds" + ) + + schema_resolver = self._make_schema_resolver( + platform, platform_instance, env, include_graph=False ) + with PerfTimer() as timer: + count = 0 + for i, urn in enumerate(urns): + if i % 1000 == 0: + logger.debug(f"Loaded {i} schema metadata") + try: + schema_metadata = self.get_aspect(urn, SchemaMetadataClass) + if schema_metadata: + schema_resolver.add_schema_metadata(urn, schema_metadata) + count += 1 + except Exception: + logger.warning("Failed to load schema metadata", exc_info=True) + logger.info( + f"Loaded {count} schema metadata in {timer.elapsed_seconds()} seconds" + ) + + logger.info("Finished initializing schema resolver") + return schema_resolver, urns def parse_sql_lineage( self, @@ -982,9 +1085,7 @@ def parse_sql_lineage( # Cache the schema resolver to make bulk parsing faster. schema_resolver = self._make_schema_resolver( - platform=platform, - platform_instance=platform_instance, - env=env, + platform=platform, platform_instance=platform_instance, env=env ) return sqlglot_lineage( diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure/azure_common.py b/metadata-ingestion/src/datahub/ingestion/source/azure/azure_common.py deleted file mode 100644 index 1a48725330df9..0000000000000 --- a/metadata-ingestion/src/datahub/ingestion/source/azure/azure_common.py +++ /dev/null @@ -1,88 +0,0 @@ -from typing import Dict, Optional, Union - -from azure.identity import ClientSecretCredential -from azure.storage.filedatalake import DataLakeServiceClient, FileSystemClient -from pydantic import Field, root_validator - -from datahub.configuration import ConfigModel -from datahub.configuration.common import ConfigurationError - - -class AdlsSourceConfig(ConfigModel): - """ - Common Azure credentials config. - - https://docs.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-directory-file-acl-python - """ - - base_path: str = Field( - default="/", - description="Base folder in hierarchical namespaces to start from.", - ) - container_name: str = Field( - description="Azure storage account container name.", - ) - account_name: str = Field( - description="Name of the Azure storage account. See [Microsoft official documentation on how to create a storage account.](https://docs.microsoft.com/en-us/azure/storage/blobs/create-data-lake-storage-account)", - ) - account_key: Optional[str] = Field( - description="Azure storage account access key that can be used as a credential. **An account key, a SAS token or a client secret is required for authentication.**", - default=None, - ) - sas_token: Optional[str] = Field( - description="Azure storage account Shared Access Signature (SAS) token that can be used as a credential. **An account key, a SAS token or a client secret is required for authentication.**", - default=None, - ) - client_secret: Optional[str] = Field( - description="Azure client secret that can be used as a credential. 
**An account key, a SAS token or a client secret is required for authentication.**", - default=None, - ) - client_id: Optional[str] = Field( - description="Azure client (Application) ID required when a `client_secret` is used as a credential.", - default=None, - ) - tenant_id: Optional[str] = Field( - description="Azure tenant (Directory) ID required when a `client_secret` is used as a credential.", - default=None, - ) - - def get_abfss_url(self, folder_path: str = "") -> str: - if not folder_path.startswith("/"): - folder_path = f"/{folder_path}" - return f"abfss://{self.container_name}@{self.account_name}.dfs.core.windows.net{folder_path}" - - def get_filesystem_client(self) -> FileSystemClient: - return self.get_service_client().get_file_system_client(self.container_name) - - def get_service_client(self) -> DataLakeServiceClient: - return DataLakeServiceClient( - account_url=f"https://{self.account_name}.dfs.core.windows.net", - credential=self.get_credentials(), - ) - - def get_credentials( - self, - ) -> Union[Optional[str], ClientSecretCredential]: - if self.client_id and self.client_secret and self.tenant_id: - return ClientSecretCredential( - tenant_id=self.tenant_id, - client_id=self.client_id, - client_secret=self.client_secret, - ) - return self.sas_token if self.sas_token is not None else self.account_key - - @root_validator() - def _check_credential_values(cls, values: Dict) -> Dict: - if ( - values.get("account_key") - or values.get("sas_token") - or ( - values.get("client_id") - and values.get("client_secret") - and values.get("tenant_id") - ) - ): - return values - raise ConfigurationError( - "credentials missing, requires one combination of account_key or sas_token or (client_id and client_secret and tenant_id)" - ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index d1f39a3ba1ba6..1107a54a1896b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -1,5 +1,4 @@ import atexit -import hashlib import logging import os import re @@ -74,7 +73,8 @@ ) from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler from datahub.ingestion.source.state.redundant_run_skip_handler import ( - RedundantRunSkipHandler, + RedundantLineageRunSkipHandler, + RedundantUsageRunSkipHandler, ) from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, @@ -82,6 +82,11 @@ from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulIngestionSourceBase, ) +from datahub.ingestion.source_report.ingestion_stage import ( + LINEAGE_EXTRACTION, + METADATA_EXTRACTION, + PROFILING, +) from datahub.metadata.com.linkedin.pegasus2avro.common import ( Status, SubTypes, @@ -122,13 +127,13 @@ from datahub.utilities.perf_timer import PerfTimer from datahub.utilities.registries.domain_registry import DomainRegistry from datahub.utilities.sqlglot_lineage import SchemaResolver, sqlglot_lineage -from datahub.utilities.time import datetime_to_ts_millis logger: logging.Logger = logging.getLogger(__name__) # Handle table snapshots # See https://cloud.google.com/bigquery/docs/table-snapshots-intro. 
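# Illustrative (not in the original source): a snapshot reference looks like
#     my_table@1624046611000
# i.e. a table name followed by "@" and a 13-digit epoch-millisecond timestamp,
# which is the shape that SNAPSHOT_TABLE_REGEX below matches.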
SNAPSHOT_TABLE_REGEX = re.compile(r"^(.+)@(\d{13})$") +CLUSTERING_COLUMN_TAG = "CLUSTERING_COLUMN" # We can't use close as it is not called if the ingestion is not successful @@ -140,10 +145,6 @@ def cleanup(config: BigQueryV2Config) -> None: os.unlink(config._credentials_path) -def _generate_sql_id(sql: str) -> str: - return hashlib.md5(sql.encode("utf-8")).hexdigest() - - @platform_name("BigQuery", doc_order=1) @config_class(BigQueryV2Config) @support_status(SupportStatus.CERTIFIED) @@ -227,10 +228,36 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): set_dataset_urn_to_lower(self.config.convert_urns_to_lowercase) + self.redundant_lineage_run_skip_handler: Optional[ + RedundantLineageRunSkipHandler + ] = None + if self.config.enable_stateful_lineage_ingestion: + self.redundant_lineage_run_skip_handler = RedundantLineageRunSkipHandler( + source=self, + config=self.config, + pipeline_name=self.ctx.pipeline_name, + run_id=self.ctx.run_id, + ) + # For database, schema, tables, views, etc - self.lineage_extractor = BigqueryLineageExtractor(config, self.report) + self.lineage_extractor = BigqueryLineageExtractor( + config, self.report, self.redundant_lineage_run_skip_handler + ) + + redundant_usage_run_skip_handler: Optional[RedundantUsageRunSkipHandler] = None + if self.config.enable_stateful_usage_ingestion: + redundant_usage_run_skip_handler = RedundantUsageRunSkipHandler( + source=self, + config=self.config, + pipeline_name=self.ctx.pipeline_name, + run_id=self.ctx.run_id, + ) + self.usage_extractor = BigQueryUsageExtractor( - config, self.report, dataset_urn_builder=self.gen_dataset_urn_from_ref + config, + self.report, + dataset_urn_builder=self.gen_dataset_urn_from_ref, + redundant_run_skip_handler=redundant_usage_run_skip_handler, ) self.domain_registry: Optional[DomainRegistry] = None @@ -239,15 +266,8 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): cached_domains=[k for k in self.config.domain], graph=self.ctx.graph ) - self.redundant_run_skip_handler = RedundantRunSkipHandler( - source=self, - config=self.config, - pipeline_name=self.ctx.pipeline_name, - run_id=self.ctx.run_id, - ) - self.profiling_state_handler: Optional[ProfilingHandler] = None - if self.config.store_last_profiling_timestamps: + if self.config.enable_stateful_profiling: self.profiling_state_handler = ProfilingHandler( source=self, config=self.config, @@ -261,16 +281,15 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): # Global store of table identifiers for lineage filtering self.table_refs: Set[str] = set() - # We do this so that the SQL is stored in a file-backed dict, but the sql IDs are stored in memory. 
- # Maps project -> view_ref -> sql ID (will be used when generating lineage) - self.view_definition_ids: Dict[str, Dict[str, str]] = defaultdict(dict) - # Maps sql ID -> actual sql + # Maps project -> view_ref, so we can find all views in a project + self.view_refs_by_project: Dict[str, Set[str]] = defaultdict(set) + # Maps view ref -> actual sql self.view_definitions: FileBackedDict[str] = FileBackedDict() self.sql_parser_schema_resolver = SchemaResolver( platform=self.platform, env=self.config.env ) - + self.add_config_to_report() atexit.register(cleanup, config) @classmethod @@ -428,7 +447,9 @@ def get_dataplatform_instance_aspect( ) -> MetadataWorkUnit: aspect = DataPlatformInstanceClass( platform=make_data_platform_urn(self.platform), - instance=make_dataplatform_instance_urn(self.platform, project_id), + instance=make_dataplatform_instance_urn(self.platform, project_id) + if self.config.include_data_platform_instance + else None, ) return MetadataChangeProposalWrapper( entityUrn=dataset_urn, aspect=aspect @@ -499,68 +520,50 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: conn: bigquery.Client = get_bigquery_client(self.config) - self.add_config_to_report() projects = self._get_projects(conn) if not projects: return for project_id in projects: - self.report.set_ingestion_stage(project_id.id, "Metadata Extraction") + self.report.set_ingestion_stage(project_id.id, METADATA_EXTRACTION) logger.info(f"Processing project: {project_id.id}") yield from self._process_project(conn, project_id) - if self._should_ingest_usage(): + if self.config.include_usage_statistics: yield from self.usage_extractor.get_usage_workunits( [p.id for p in projects], self.table_refs ) if self._should_ingest_lineage(): for project in projects: - self.report.set_ingestion_stage(project.id, "Lineage Extraction") + self.report.set_ingestion_stage(project.id, LINEAGE_EXTRACTION) yield from self.generate_lineage(project.id) - def _should_ingest_usage(self) -> bool: - if not self.config.include_usage_statistics: - return False - - if self.config.store_last_usage_extraction_timestamp: - if self.redundant_run_skip_handler.should_skip_this_run( - cur_start_time_millis=datetime_to_ts_millis(self.config.start_time) - ): - self.report.report_warning( - "usage-extraction", - f"Skip this run as there was a run later than the current start time: {self.config.start_time}", - ) - return False - else: + if self.redundant_lineage_run_skip_handler: # Update the checkpoint state for this run. - self.redundant_run_skip_handler.update_state( - start_time_millis=datetime_to_ts_millis(self.config.start_time), - end_time_millis=datetime_to_ts_millis(self.config.end_time), + self.redundant_lineage_run_skip_handler.update_state( + self.config.start_time, self.config.end_time ) - return True def _should_ingest_lineage(self) -> bool: if not self.config.include_table_lineage: return False - if self.config.store_last_lineage_extraction_timestamp: - if self.redundant_run_skip_handler.should_skip_this_run( - cur_start_time_millis=datetime_to_ts_millis(self.config.start_time) - ): - # Skip this run - self.report.report_warning( - "lineage-extraction", - f"Skip this run as there was a run later than the current start time: {self.config.start_time}", - ) - return False - else: - # Update the checkpoint state for this run. 
- self.redundant_run_skip_handler.update_state( - start_time_millis=datetime_to_ts_millis(self.config.start_time), - end_time_millis=datetime_to_ts_millis(self.config.end_time), - ) + if ( + self.redundant_lineage_run_skip_handler + and self.redundant_lineage_run_skip_handler.should_skip_this_run( + cur_start_time=self.config.start_time, + cur_end_time=self.config.end_time, + ) + ): + # Skip this run + self.report.report_warning( + "lineage-extraction", + "Skip this run as there was already a run for current ingestion window.", + ) + return False + return True def _get_projects(self, conn: bigquery.Client) -> List[BigqueryProject]: @@ -661,7 +664,7 @@ def _process_project( if self.config.is_profiling_enabled(): logger.info(f"Starting profiling project {project_id}") - self.report.set_ingestion_stage(project_id, "Profiling") + self.report.set_ingestion_stage(project_id, PROFILING) yield from self.profiler.get_workunits( project_id=project_id, tables=db_tables, @@ -675,10 +678,8 @@ def generate_lineage(self, project_id: str) -> Iterable[MetadataWorkUnit]: ) if self.config.lineage_parse_view_ddl: - for view, view_definition_id in self.view_definition_ids[ - project_id - ].items(): - view_definition = self.view_definitions[view_definition_id] + for view in self.view_refs_by_project[project_id]: + view_definition = self.view_definitions[view] raw_view_lineage = sqlglot_lineage( view_definition, schema_resolver=self.sql_parser_schema_resolver, @@ -887,10 +888,9 @@ def _process_view( BigQueryTableRef(table_identifier).get_sanitized_table_ref() ) self.table_refs.add(table_ref) - if self.config.lineage_parse_view_ddl: - view_definition_id = _generate_sql_id(view.view_definition) - self.view_definition_ids[project_id][table_ref] = view_definition_id - self.view_definitions[view_definition_id] = view.view_definition + if self.config.lineage_parse_view_ddl and view.view_definition: + self.view_refs_by_project[project_id].add(table_ref) + self.view_definitions[table_ref] = view.view_definition view.column_count = len(columns) if not view.column_count: @@ -980,7 +980,7 @@ def gen_view_dataset_workunits( view_properties_aspect = ViewProperties( materialized=view.materialized, viewLanguage="SQL", - viewLogic=view_definition_string, + viewLogic=view_definition_string or "", ) yield MetadataChangeProposalWrapper( entityUrn=self.gen_dataset_urn( @@ -1151,6 +1151,21 @@ def gen_schema_fields(self, columns: List[BigqueryColumn]) -> List[SchemaField]: field.description = col.comment schema_fields[idx] = field else: + tags = [] + if col.is_partition_column: + tags.append( + TagAssociationClass(make_tag_urn(Constants.TAG_PARTITION_KEY)) + ) + + if col.cluster_column_position is not None: + tags.append( + TagAssociationClass( + make_tag_urn( + f"{CLUSTERING_COLUMN_TAG}_{col.cluster_column_position}" + ) + ) + ) + field = SchemaField( fieldPath=col.name, type=SchemaFieldDataType( @@ -1160,15 +1175,7 @@ def gen_schema_fields(self, columns: List[BigqueryColumn]) -> List[SchemaField]: nativeDataType=col.data_type, description=col.comment, nullable=col.is_nullable, - globalTags=GlobalTagsClass( - tags=[ - TagAssociationClass( - make_tag_urn(Constants.TAG_PARTITION_KEY) - ) - ] - ) - if col.is_partition_column - else GlobalTagsClass(tags=[]), + globalTags=GlobalTagsClass(tags=tags), ) schema_fields.append(field) last_id = col.ordinal_position @@ -1318,3 +1325,13 @@ def add_config_to_report(self): self.report.use_exported_bigquery_audit_metadata = ( self.config.use_exported_bigquery_audit_metadata ) + 
self.report.stateful_lineage_ingestion_enabled = ( + self.config.enable_stateful_lineage_ingestion + ) + self.report.stateful_usage_ingestion_enabled = ( + self.config.enable_stateful_usage_ingestion + ) + self.report.window_start_time, self.report.window_end_time = ( + self.config.start_time, + self.config.end_time, + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index 7287dc1b67d73..0f2082c5e53bf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -8,7 +8,7 @@ from datahub.configuration.common import AllowDenyPattern from datahub.configuration.validate_field_removal import pydantic_removed_field -from datahub.ingestion.source.sql.sql_config import SQLAlchemyConfig +from datahub.ingestion.source.sql.sql_config import SQLCommonConfig from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulLineageConfigMixin, StatefulProfilingConfigMixin, @@ -37,7 +37,7 @@ class BigQueryUsageConfig(BaseUsageConfig): class BigQueryV2Config( BigQueryBaseConfig, - SQLAlchemyConfig, + SQLCommonConfig, StatefulUsageConfigMixin, StatefulLineageConfigMixin, StatefulProfilingConfigMixin, @@ -81,6 +81,13 @@ class BigQueryV2Config( description="Whether to populate BigQuery Console url to Datasets/Tables", ) + include_data_platform_instance: bool = Field( + default=False, + description="Whether to create a DataPlatformInstance aspect, equal to the BigQuery project id." + " If enabled, will cause redundancy in the browse path for BigQuery entities in the UI," + " because the project id is represented as the top-level container.", + ) + debug_include_full_payloads: bool = Field( default=False, description="Include full payload into events. 
It is only for debugging and internal use.", diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py index b57e691411f75..8c46d8f675259 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py @@ -2,21 +2,22 @@ import dataclasses import logging from dataclasses import dataclass, field -from datetime import datetime, timezone +from datetime import datetime from typing import Counter, Dict, List, Optional import pydantic from datahub.ingestion.source.sql.sql_generic_profiler import ProfilingSqlReport +from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport +from datahub.ingestion.source_report.time_window import BaseTimeWindowReport from datahub.utilities.lossy_collections import LossyDict, LossyList -from datahub.utilities.perf_timer import PerfTimer from datahub.utilities.stats_collections import TopKDict, int_top_k_dict logger: logging.Logger = logging.getLogger(__name__) @dataclass -class BigQueryV2Report(ProfilingSqlReport): +class BigQueryV2Report(ProfilingSqlReport, IngestionStageReport, BaseTimeWindowReport): num_total_lineage_entries: TopKDict[str, int] = field(default_factory=TopKDict) num_skipped_lineage_entries_missing_data: TopKDict[str, int] = field( default_factory=int_top_k_dict @@ -52,7 +53,6 @@ class BigQueryV2Report(ProfilingSqlReport): use_date_sharded_audit_log_tables: Optional[bool] = None log_page_size: Optional[pydantic.PositiveInt] = None use_exported_bigquery_audit_metadata: Optional[bool] = None - end_time: Optional[datetime] = None log_entry_start_time: Optional[str] = None log_entry_end_time: Optional[str] = None audit_start_time: Optional[str] = None @@ -88,23 +88,14 @@ class BigQueryV2Report(ProfilingSqlReport): default_factory=collections.Counter ) usage_state_size: Optional[str] = None - ingestion_stage: Optional[str] = None - ingestion_stage_durations: TopKDict[str, float] = field(default_factory=TopKDict) - _timer: Optional[PerfTimer] = field( - default=None, init=False, repr=False, compare=False - ) + lineage_start_time: Optional[datetime] = None + lineage_end_time: Optional[datetime] = None + stateful_lineage_ingestion_enabled: bool = False - def set_ingestion_stage(self, project: str, stage: str) -> None: - if self._timer: - elapsed = round(self._timer.elapsed_seconds(), 2) - logger.info( - f"Time spent in stage <{self.ingestion_stage}>: {elapsed} seconds" - ) - if self.ingestion_stage: - self.ingestion_stage_durations[self.ingestion_stage] = elapsed - else: - self._timer = PerfTimer() + usage_start_time: Optional[datetime] = None + usage_end_time: Optional[datetime] = None + stateful_usage_ingestion_enabled: bool = False - self.ingestion_stage = f"{project}: {stage} at {datetime.now(timezone.utc)}" - self._timer.start() + def set_ingestion_stage(self, project_id: str, stage: str) -> None: + self.report_ingestion_stage_start(f"{project_id}: {stage}") diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py index 2450dbd0e2391..f8256f8e6fed6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py @@ -33,6 +33,7 @@ class BigqueryTableType: class 
BigqueryColumn(BaseColumn): field_path: str is_partition_column: bool + cluster_column_position: Optional[int] RANGE_PARTITION_NAME: str = "RANGE" @@ -285,7 +286,8 @@ class BigqueryQuery: CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type, description as comment, c.is_hidden as is_hidden, - c.is_partitioning_column as is_partitioning_column + c.is_partitioning_column as is_partitioning_column, + c.clustering_ordinal_position as clustering_ordinal_position, from `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMNS c join `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS as cfp on cfp.table_name = c.table_name @@ -307,6 +309,7 @@ class BigqueryQuery: description as comment, c.is_hidden as is_hidden, c.is_partitioning_column as is_partitioning_column, + c.clustering_ordinal_position as clustering_ordinal_position, -- We count the columns to be able limit it later row_number() over (partition by c.table_catalog, c.table_schema, c.table_name order by c.ordinal_position asc, c.data_type DESC) as column_num, -- Getting the maximum shard for each table @@ -333,6 +336,7 @@ class BigqueryQuery: CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type, c.is_hidden as is_hidden, c.is_partitioning_column as is_partitioning_column, + c.clustering_ordinal_position as clustering_ordinal_position, description as comment from `{table_identifier.project_id}`.`{table_identifier.dataset}`.INFORMATION_SCHEMA.COLUMNS as c @@ -583,6 +587,7 @@ def get_columns_for_dataset( data_type=column.data_type, comment=column.comment, is_partition_column=column.is_partitioning_column == "YES", + cluster_column_position=column.clustering_ordinal_position, ) ) @@ -621,6 +626,7 @@ def get_columns_for_table( data_type=column.data_type, comment=column.comment, is_partition_column=column.is_partitioning_column == "YES", + cluster_column_position=column.clustering_ordinal_position, ) ) last_seen_table = column.table_name diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index 255a673026252..341952d95e7d7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -4,7 +4,18 @@ import textwrap from dataclasses import dataclass from datetime import datetime, timezone -from typing import Any, Callable, Dict, FrozenSet, Iterable, List, Optional, Set, Union +from typing import ( + Any, + Callable, + Dict, + FrozenSet, + Iterable, + List, + Optional, + Set, + Tuple, + Union, +) import humanfriendly from google.cloud.bigquery import Client as BigQueryClient @@ -29,6 +40,9 @@ _make_gcp_logging_client, get_bigquery_client, ) +from datahub.ingestion.source.state.redundant_run_skip_handler import ( + RedundantLineageRunSkipHandler, +) from datahub.metadata.schema_classes import ( AuditStampClass, DatasetLineageTypeClass, @@ -133,7 +147,6 @@ def _follow_column_lineage( def make_lineage_edges_from_parsing_result( sql_lineage: SqlParsingResult, audit_stamp: datetime, lineage_type: str ) -> List[LineageEdge]: - # Note: This ignores the out_tables section of the sql parsing result. 
audit_stamp = datetime.now(timezone.utc) @@ -170,6 +183,7 @@ def make_lineage_edges_from_parsing_result( column_mapping=frozenset( LineageEdgeColumnMapping(out_column=out_column, in_columns=in_columns) for out_column, in_columns in column_mapping.items() + if in_columns ), auditStamp=audit_stamp, type=lineage_type, @@ -215,10 +229,29 @@ class BigqueryLineageExtractor: timestamp < "{end_time}" """.strip() - def __init__(self, config: BigQueryV2Config, report: BigQueryV2Report): + def __init__( + self, + config: BigQueryV2Config, + report: BigQueryV2Report, + redundant_run_skip_handler: Optional[RedundantLineageRunSkipHandler] = None, + ): self.config = config self.report = report + self.redundant_run_skip_handler = redundant_run_skip_handler + self.start_time, self.end_time = ( + self.report.lineage_start_time, + self.report.lineage_end_time, + ) = self.get_time_window() + + def get_time_window(self) -> Tuple[datetime, datetime]: + if self.redundant_run_skip_handler: + return self.redundant_run_skip_handler.suggest_run_time_window( + self.config.start_time, self.config.end_time + ) + else: + return self.config.start_time, self.config.end_time + def error(self, log: logging.Logger, key: str, reason: str) -> None: self.report.report_warning(key, reason) log.error(f"{key} => {reason}") @@ -406,7 +439,7 @@ def _get_bigquery_log_entries( ) -> Iterable[AuditLogEntry]: self.report.num_total_log_entries[client.project] = 0 # Add a buffer to start and end time to account for delays in logging events. - start_time = (self.config.start_time - self.config.max_query_duration).strftime( + start_time = (self.start_time - self.config.max_query_duration).strftime( BQ_DATETIME_FORMAT ) self.report.log_entry_start_time = start_time @@ -462,12 +495,12 @@ def _get_exported_bigquery_audit_metadata( self.report.bigquery_audit_metadata_datasets_missing = True return - corrected_start_time = self.config.start_time - self.config.max_query_duration + corrected_start_time = self.start_time - self.config.max_query_duration start_time = corrected_start_time.strftime(BQ_DATETIME_FORMAT) start_date = corrected_start_time.strftime(BQ_DATE_SHARD_FORMAT) self.report.audit_start_time = start_time - corrected_end_time = self.config.end_time + self.config.max_query_duration + corrected_end_time = self.end_time + self.config.max_query_duration end_time = corrected_end_time.strftime(BQ_DATETIME_FORMAT) end_date = corrected_end_time.strftime(BQ_DATE_SHARD_FORMAT) self.report.audit_end_time = end_time @@ -663,6 +696,7 @@ def _compute_bigquery_lineage( "lineage", f"{project_id}: {e}", ) + self.report_status(f"{project_id}-lineage", False) lineage_metadata = {} self.report.lineage_mem_size[project_id] = humanfriendly.format_size( @@ -832,3 +866,7 @@ def test_capability(self, project_id: str) -> None: ) for entry in self._get_bigquery_log_entries(gcp_logging_client, limit=1): logger.debug(f"Connection test got one audit metadata entry {entry}") + + def report_status(self, step: str, status: bool) -> None: + if self.redundant_run_skip_handler: + self.redundant_run_skip_handler.report_current_run_status(step, status) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py index 1081dd8eec1ec..e112db31c5c63 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py @@ -25,7 +25,10 @@ from google.cloud.logging_v2.client import Client as 
GCPLoggingClient from ratelimiter import RateLimiter -from datahub.configuration.time_window_config import get_time_bucket +from datahub.configuration.time_window_config import ( + BaseTimeWindowConfig, + get_time_bucket, +) from datahub.emitter.mce_builder import make_user_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.closeable import Closeable @@ -50,9 +53,14 @@ _make_gcp_logging_client, get_bigquery_client, ) -from datahub.ingestion.source.usage.usage_common import ( - TOTAL_BUDGET_FOR_QUERY_LIST, - make_usage_workunit, +from datahub.ingestion.source.state.redundant_run_skip_handler import ( + RedundantUsageRunSkipHandler, +) +from datahub.ingestion.source.usage.usage_common import make_usage_workunit +from datahub.ingestion.source_report.ingestion_stage import ( + USAGE_EXTRACTION_INGESTION, + USAGE_EXTRACTION_OPERATIONAL_STATS, + USAGE_EXTRACTION_USAGE_AGGREGATION, ) from datahub.metadata.schema_classes import OperationClass, OperationTypeClass from datahub.utilities.bigquery_sql_parser import BigQuerySQLParser @@ -90,7 +98,6 @@ READ_STATEMENT_TYPES: List[str] = ["SELECT"] STRING_ENCODING = "utf-8" -MAX_QUERY_LENGTH = TOTAL_BUDGET_FOR_QUERY_LIST @dataclass(frozen=True, order=True) @@ -377,6 +384,7 @@ def __init__( config: BigQueryV2Config, report: BigQueryV2Report, dataset_urn_builder: Callable[[BigQueryTableRef], str], + redundant_run_skip_handler: Optional[RedundantUsageRunSkipHandler] = None, ): self.config: BigQueryV2Config = config self.report: BigQueryV2Report = report @@ -384,6 +392,20 @@ def __init__( # Replace hash of query with uuid if there are hash conflicts self.uuid_to_query: Dict[str, str] = {} + self.redundant_run_skip_handler = redundant_run_skip_handler + self.start_time, self.end_time = ( + self.report.usage_start_time, + self.report.usage_end_time, + ) = self.get_time_window() + + def get_time_window(self) -> Tuple[datetime, datetime]: + if self.redundant_run_skip_handler: + return self.redundant_run_skip_handler.suggest_run_time_window( + self.config.start_time, self.config.end_time + ) + else: + return self.config.start_time, self.config.end_time + def _is_table_allowed(self, table_ref: Optional[BigQueryTableRef]) -> bool: return ( table_ref is not None @@ -391,12 +413,39 @@ def _is_table_allowed(self, table_ref: Optional[BigQueryTableRef]) -> bool: and self.config.table_pattern.allowed(table_ref.table_identifier.table) ) + def _should_ingest_usage(self) -> bool: + if ( + self.redundant_run_skip_handler + and self.redundant_run_skip_handler.should_skip_this_run( + cur_start_time=self.config.start_time, + cur_end_time=self.config.end_time, + ) + ): + # Skip this run + self.report.report_warning( + "usage-extraction", + "Skip this run as there was already a run for current ingestion window.", + ) + return False + + return True + def get_usage_workunits( self, projects: Iterable[str], table_refs: Collection[str] ) -> Iterable[MetadataWorkUnit]: + if not self._should_ingest_usage(): + return events = self._get_usage_events(projects) yield from self._get_workunits_internal(events, table_refs) + if self.redundant_run_skip_handler: + # Update the checkpoint state for this run. 
+ self.redundant_run_skip_handler.update_state( + self.config.start_time, + self.config.end_time, + self.config.bucket_duration, + ) + def _get_workunits_internal( self, events: Iterable[AuditEvent], table_refs: Collection[str] ) -> Iterable[MetadataWorkUnit]: @@ -413,7 +462,11 @@ def _get_workunits_internal( yield from auto_empty_dataset_usage_statistics( self._generate_usage_workunits(usage_state), - config=self.config, + config=BaseTimeWindowConfig( + start_time=self.start_time, + end_time=self.end_time, + bucket_duration=self.config.bucket_duration, + ), dataset_urns={ self.dataset_urn_builder(BigQueryTableRef.from_string_name(ref)) for ref in table_refs @@ -423,6 +476,7 @@ def _get_workunits_internal( except Exception as e: logger.error("Error processing usage", exc_info=True) self.report.report_warning("usage-ingestion", str(e)) + self.report_status("usage-ingestion", False) def generate_read_events_from_query( self, query_event_on_view: QueryEvent @@ -496,7 +550,7 @@ def _ingest_events( def _generate_operational_workunits( self, usage_state: BigQueryUsageState, table_refs: Collection[str] ) -> Iterable[MetadataWorkUnit]: - self.report.set_ingestion_stage("*", "Usage Extraction Operational Stats") + self.report.set_ingestion_stage("*", USAGE_EXTRACTION_OPERATIONAL_STATS) for audit_event in usage_state.standalone_events(): try: operational_wu = self._create_operation_workunit( @@ -515,7 +569,7 @@ def _generate_operational_workunits( def _generate_usage_workunits( self, usage_state: BigQueryUsageState ) -> Iterable[MetadataWorkUnit]: - self.report.set_ingestion_stage("*", "Usage Extraction Usage Aggregation") + self.report.set_ingestion_stage("*", USAGE_EXTRACTION_USAGE_AGGREGATION) top_n = ( self.config.usage.top_n_queries if self.config.usage.include_top_n_queries @@ -543,6 +597,7 @@ def _generate_usage_workunits( resource_urn_builder=self.dataset_urn_builder, top_n_queries=self.config.usage.top_n_queries, format_sql_queries=self.config.usage.format_sql_queries, + queries_character_limit=self.config.usage.queries_character_limit, ) self.report.num_usage_workunits_emitted += 1 except Exception as e: @@ -560,7 +615,7 @@ def _get_usage_events(self, projects: Iterable[str]) -> Iterable[AuditEvent]: with PerfTimer() as timer: try: self.report.set_ingestion_stage( - project_id, "Usage Extraction Ingestion" + project_id, USAGE_EXTRACTION_INGESTION ) yield from self._get_parsed_bigquery_log_events(project_id) except Exception as e: @@ -570,6 +625,7 @@ def _get_usage_events(self, projects: Iterable[str]) -> Iterable[AuditEvent]: ) self.report.usage_failed_extraction.append(project_id) self.report.report_warning(f"usage-extraction-{project_id}", str(e)) + self.report_status(f"usage-extraction-{project_id}", False) self.report.usage_extraction_sec[project_id] = round( timer.elapsed_seconds(), 2 @@ -583,7 +639,7 @@ def _store_usage_event( ) -> bool: """Stores a usage event in `usage_state` and returns if an event was successfully processed.""" if event.read_event and ( - self.config.start_time <= event.read_event.timestamp < self.config.end_time + self.start_time <= event.read_event.timestamp < self.end_time ): resource = event.read_event.resource if str(resource) not in table_refs: @@ -603,7 +659,8 @@ def _store_usage_event( usage_state.column_accesses[str(uuid.uuid4())] = key, field_read return True elif event.query_event and event.query_event.job_name: - query = event.query_event.query[:MAX_QUERY_LENGTH] + max_query_length = self.config.usage.queries_character_limit + query = 
event.query_event.query[:max_query_length] query_hash = hashlib.md5(query.encode(STRING_ENCODING)).hexdigest() if usage_state.queries.get(query_hash, query) != query: key = str(uuid.uuid4()) @@ -623,14 +680,15 @@ def _get_exported_bigquery_audit_metadata( limit: Optional[int] = None, ) -> Iterable[BigQueryAuditMetadata]: if self.config.bigquery_audit_metadata_datasets is None: + self.report.bigquery_audit_metadata_datasets_missing = True return - corrected_start_time = self.config.start_time - self.config.max_query_duration + corrected_start_time = self.start_time - self.config.max_query_duration start_time = corrected_start_time.strftime(BQ_DATETIME_FORMAT) start_date = corrected_start_time.strftime(BQ_DATE_SHARD_FORMAT) self.report.audit_start_time = start_time - corrected_end_time = self.config.end_time + self.config.max_query_duration + corrected_end_time = self.end_time + self.config.max_query_duration end_time = corrected_end_time.strftime(BQ_DATETIME_FORMAT) end_date = corrected_end_time.strftime(BQ_DATE_SHARD_FORMAT) self.report.audit_end_time = end_time @@ -664,7 +722,6 @@ def _get_exported_bigquery_audit_metadata( def _get_bigquery_log_entries_via_gcp_logging( self, client: GCPLoggingClient, limit: Optional[int] = None ) -> Iterable[AuditLogEntry]: - filter = self._generate_filter(BQ_AUDIT_V2) logger.debug(filter) @@ -707,11 +764,11 @@ def _generate_filter(self, audit_templates: Dict[str, str]) -> str: # handle the case where the read happens within our time range but the query # completion event is delayed and happens after the configured end time. - start_time = (self.config.start_time - self.config.max_query_duration).strftime( + start_time = (self.start_time - self.config.max_query_duration).strftime( BQ_DATETIME_FORMAT ) self.report.log_entry_start_time = start_time - end_time = (self.config.end_time + self.config.max_query_duration).strftime( + end_time = (self.end_time + self.config.max_query_duration).strftime( BQ_DATETIME_FORMAT ) self.report.log_entry_end_time = end_time @@ -1046,3 +1103,7 @@ def test_capability(self, project_id: str) -> None: for entry in self._get_parsed_bigquery_log_events(project_id, limit=1): logger.debug(f"Connection test got one {entry}") return + + def report_status(self, step: str, status: bool) -> None: + if self.redundant_run_skip_handler: + self.redundant_run_skip_handler.report_current_run_status(step, status) diff --git a/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py b/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py index dcaec4e45737f..0bdcb115b377c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py +++ b/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py @@ -113,7 +113,7 @@ def get_schema_str_replace_confluent_ref_avro( schema_seen = set() schema_str = self._compact_schema(schema.schema_str) for schema_ref in schema.references: - ref_subject = schema_ref["subject"] + ref_subject = schema_ref.subject if ref_subject in schema_seen: continue @@ -132,7 +132,7 @@ def get_schema_str_replace_confluent_ref_avro( # Replace only external type references with the reference schema recursively. # NOTE: The type pattern is dependent on _compact_schema. 
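The schema-registry changes above swap dict-style lookups (`schema_ref["subject"]`) for attribute access (`schema_ref.subject`), matching the `SchemaReference` objects returned by newer `confluent-kafka` clients. A hedged sketch of resolving referenced schemas with that attribute-based API (the helper function is illustrative; error handling and depth limits are omitted):

```python
from confluent_kafka.schema_registry import SchemaRegistryClient


def collect_referenced_schemas(client: SchemaRegistryClient, schema, seen=None):
    """Yield (name, schema) for every schema referenced by `schema`, transitively."""
    seen = set() if seen is None else seen
    for schema_ref in schema.references or []:
        if schema_ref.subject in seen:
            continue
        seen.add(schema_ref.subject)
        registered = client.get_version(
            subject_name=schema_ref.subject, version=schema_ref.version
        )
        yield schema_ref.name, registered.schema
        # Referenced schemas may themselves carry further references.
        yield from collect_referenced_schemas(client, registered.schema, seen)
```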
avro_type_kwd = '"type"' - ref_name = schema_ref["name"] + ref_name = schema_ref.name # Try by name first pattern_to_replace = f'{avro_type_kwd}:"{ref_name}"' if pattern_to_replace not in schema_str: @@ -164,7 +164,7 @@ def get_schemas_from_confluent_ref_protobuf( schema_ref: SchemaReference for schema_ref in schema.references: - ref_subject: str = schema_ref["subject"] + ref_subject: str = schema_ref.subject if ref_subject in schema_seen: continue reference_schema: RegisteredSchema = ( @@ -173,7 +173,7 @@ def get_schemas_from_confluent_ref_protobuf( schema_seen.add(ref_subject) all_schemas.append( ProtobufSchema( - name=schema_ref["name"], content=reference_schema.schema.schema_str + name=schema_ref.name, content=reference_schema.schema.schema_str ) ) return all_schemas @@ -192,19 +192,19 @@ def get_schemas_from_confluent_ref_json( schema_ref: SchemaReference for schema_ref in schema.references: - ref_subject: str = schema_ref["subject"] + ref_subject: str = schema_ref.subject if ref_subject in schema_seen: continue reference_schema: RegisteredSchema = ( self.schema_registry_client.get_version( - subject_name=ref_subject, version=schema_ref["version"] + subject_name=ref_subject, version=schema_ref.version ) ) schema_seen.add(ref_subject) all_schemas.extend( self.get_schemas_from_confluent_ref_json( reference_schema.schema, - name=schema_ref["name"], + name=schema_ref.name, subject=ref_subject, schema_seen=schema_seen, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py new file mode 100644 index 0000000000000..053d136305527 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py @@ -0,0 +1,77 @@ +from typing import Optional + +from pydantic import Field, root_validator + +from datahub.configuration.kafka import KafkaConsumerConnectionConfig +from datahub.ingestion.source.sql.sql_config import SQLAlchemyConnectionConfig +from datahub.ingestion.source.state.stateful_ingestion_base import ( + StatefulIngestionConfig, + StatefulIngestionConfigBase, +) + +DEFAULT_DATABASE_TABLE_NAME = "metadata_aspect_v2" +DEFAULT_KAFKA_TOPIC_NAME = "MetadataChangeLog_Timeseries_v1" +DEFAULT_DATABASE_BATCH_SIZE = 10_000 + + +class DataHubSourceConfig(StatefulIngestionConfigBase): + database_connection: Optional[SQLAlchemyConnectionConfig] = Field( + default=None, + description="Database connection config", + ) + + kafka_connection: Optional[KafkaConsumerConnectionConfig] = Field( + default=None, + description="Kafka connection config", + ) + + include_all_versions: bool = Field( + default=False, + description=( + "If enabled, include all versions of each aspect. " + "Otherwise, only include the latest version of each aspect. 
" + ), + ) + + database_query_batch_size: int = Field( + default=DEFAULT_DATABASE_BATCH_SIZE, + description="Number of records to fetch from the database at a time", + ) + + database_table_name: str = Field( + default=DEFAULT_DATABASE_TABLE_NAME, + description="Name of database table containing all versioned aspects", + ) + + kafka_topic_name: str = Field( + default=DEFAULT_KAFKA_TOPIC_NAME, + description="Name of kafka topic containing timeseries MCLs", + ) + + # Override from base class to make this enabled by default + stateful_ingestion: StatefulIngestionConfig = Field( + default=StatefulIngestionConfig(enabled=True), + description="Stateful Ingestion Config", + ) + + commit_state_interval: Optional[int] = Field( + default=1000, + description="Number of records to process before committing state", + ) + + commit_with_parse_errors: bool = Field( + default=False, + description=( + "Whether to update createdon timestamp and kafka offset despite parse errors. " + "Enable if you want to ignore the errors." + ), + ) + + @root_validator + def check_ingesting_data(cls, values): + if not values.get("database_connection") and not values.get("kafka_connection"): + raise ValueError( + "Your current config will not ingest any data." + " Please specify at least one of `database_connection` or `kafka_connection`, ideally both." + ) + return values diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py new file mode 100644 index 0000000000000..39702ba3ce347 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py @@ -0,0 +1,105 @@ +import json +import logging +from datetime import datetime +from typing import Dict, Iterable, Optional, Tuple + +from sqlalchemy import create_engine + +from datahub.emitter.aspect import ASPECT_MAP +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.emitter.serialization_helper import post_json_transform +from datahub.ingestion.source.datahub.config import DataHubSourceConfig +from datahub.ingestion.source.datahub.report import DataHubSourceReport +from datahub.ingestion.source.sql.sql_config import SQLAlchemyConnectionConfig +from datahub.metadata.schema_classes import ChangeTypeClass, SystemMetadataClass +from datahub.utilities.lossy_collections import LossyDict, LossyList + +logger = logging.getLogger(__name__) + +# Should work for at least mysql, mariadb, postgres +DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S.%f" + + +class DataHubDatabaseReader: + def __init__( + self, + config: DataHubSourceConfig, + connection_config: SQLAlchemyConnectionConfig, + report: DataHubSourceReport, + ): + self.config = config + self.report = report + self.engine = create_engine( + url=connection_config.get_sql_alchemy_url(), + **connection_config.options, + ) + + @property + def query(self) -> str: + # May repeat rows for the same date + # Offset is generally 0, unless we repeat the same createdon twice + return f""" + SELECT urn, aspect, metadata, systemmetadata, createdon + FROM `{self.config.database_table_name}` + WHERE createdon >= %(since_createdon)s + {"" if self.config.include_all_versions else "AND version = 0"} + ORDER BY createdon, urn, aspect, # Ensure stable order, chronological per (urn, aspect) + CASE WHEN version = 0 THEN 1 ELSE 0 END, version + # Version 0 last, only when createdon is the same. 
Otherwise relies on createdon order + LIMIT %(limit)s + OFFSET %(offset)s + """ + + def get_aspects( + self, from_createdon: datetime, stop_time: datetime + ) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]: + with self.engine.connect() as conn: + ts = from_createdon + offset = 0 + while ts.timestamp() <= stop_time.timestamp(): + logger.debug(f"Polling database aspects from {ts}") + rows = conn.execute( + self.query, + since_createdon=ts.strftime(DATETIME_FORMAT), + limit=self.config.database_query_batch_size, + offset=offset, + ) + if not rows.rowcount: + return + + for i, row in enumerate(rows): + # TODO: Replace with namedtuple usage once we drop sqlalchemy 1.3 + if hasattr(row, "_asdict"): + row_dict = row._asdict() + else: + row_dict = dict(row) + mcp = self._parse_row(row_dict) + if mcp: + yield mcp, row_dict["createdon"] + + if ts == row_dict["createdon"]: + offset += i + else: + ts = row_dict["createdon"] + offset = 0 + + def _parse_row(self, d: Dict) -> Optional[MetadataChangeProposalWrapper]: + try: + json_aspect = post_json_transform(json.loads(d["metadata"])) + json_metadata = post_json_transform(json.loads(d["systemmetadata"] or "{}")) + system_metadata = SystemMetadataClass.from_obj(json_metadata) + return MetadataChangeProposalWrapper( + entityUrn=d["urn"], + aspect=ASPECT_MAP[d["aspect"]].from_obj(json_aspect), + systemMetadata=system_metadata, + changeType=ChangeTypeClass.UPSERT, + ) + except Exception as e: + logger.warning( + f"Failed to parse metadata for {d['urn']}: {e}", exc_info=True + ) + self.report.num_database_parse_errors += 1 + self.report.database_parse_errors.setdefault( + str(e), LossyDict() + ).setdefault(d["aspect"], LossyList()).append(d["urn"]) + return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_kafka_reader.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_kafka_reader.py new file mode 100644 index 0000000000000..d9e53e87c2cea --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_kafka_reader.py @@ -0,0 +1,102 @@ +import logging +from datetime import datetime +from typing import Dict, Iterable, List, Tuple + +from confluent_kafka import ( + OFFSET_BEGINNING, + Consumer, + DeserializingConsumer, + TopicPartition, +) +from confluent_kafka.schema_registry import SchemaRegistryClient +from confluent_kafka.schema_registry.avro import AvroDeserializer + +from datahub.configuration.kafka import KafkaConsumerConnectionConfig +from datahub.ingestion.api.closeable import Closeable +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.source.datahub.config import DataHubSourceConfig +from datahub.ingestion.source.datahub.report import DataHubSourceReport +from datahub.ingestion.source.datahub.state import PartitionOffset +from datahub.metadata.schema_classes import MetadataChangeLogClass + +logger = logging.getLogger(__name__) + +KAFKA_GROUP_PREFIX = "datahub_source" + + +class DataHubKafkaReader(Closeable): + def __init__( + self, + config: DataHubSourceConfig, + connection_config: KafkaConsumerConnectionConfig, + report: DataHubSourceReport, + ctx: PipelineContext, + ): + self.config = config + self.connection_config = connection_config + self.report = report + self.group_id = f"{KAFKA_GROUP_PREFIX}-{ctx.pipeline_name}" + + def __enter__(self) -> "DataHubKafkaReader": + self.consumer = DeserializingConsumer( + { + "group.id": self.group_id, + "bootstrap.servers": self.connection_config.bootstrap, + 
**self.connection_config.consumer_config, + "auto.offset.reset": "earliest", + "enable.auto.commit": False, + "value.deserializer": AvroDeserializer( + schema_registry_client=SchemaRegistryClient( + {"url": self.connection_config.schema_registry_url} + ), + return_record_name=True, + ), + } + ) + return self + + def get_mcls( + self, from_offsets: Dict[int, int], stop_time: datetime + ) -> Iterable[Tuple[MetadataChangeLogClass, PartitionOffset]]: + # Based on https://github.com/confluentinc/confluent-kafka-python/issues/145#issuecomment-284843254 + def on_assign(consumer: Consumer, partitions: List[TopicPartition]) -> None: + for p in partitions: + p.offset = from_offsets.get(p.partition, OFFSET_BEGINNING) + logger.debug(f"Set partition {p.partition} offset to {p.offset}") + consumer.assign(partitions) + + self.consumer.subscribe([self.config.kafka_topic_name], on_assign=on_assign) + try: + yield from self._poll_partition(stop_time) + finally: + self.consumer.unsubscribe() + + def _poll_partition( + self, stop_time: datetime + ) -> Iterable[Tuple[MetadataChangeLogClass, PartitionOffset]]: + while True: + msg = self.consumer.poll(10) + if msg is None: + break + + try: + mcl = MetadataChangeLogClass.from_obj(msg.value(), True) + except Exception as e: + logger.warning(f"Error deserializing MCL: {e}") + self.report.num_kafka_parse_errors += 1 + self.report.kafka_parse_errors.setdefault(str(e), 0) + self.report.kafka_parse_errors[str(e)] += 1 + continue + + if mcl.created and mcl.created.time > stop_time.timestamp() * 1000: + logger.info( + f"Stopped reading from kafka, reached MCL " + f"with audit stamp {datetime.fromtimestamp(mcl.created.time / 1000)}" + ) + break + + # TODO: Consider storing state in kafka instead, via consumer.commit() + yield mcl, PartitionOffset(partition=msg.partition(), offset=msg.offset()) + + def close(self) -> None: + self.consumer.close() diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py new file mode 100644 index 0000000000000..2368febe1ff57 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py @@ -0,0 +1,155 @@ +import logging +from datetime import datetime, timezone +from functools import partial +from typing import Dict, Iterable, List, Optional + +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SupportStatus, + config_class, + platform_name, + support_status, +) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport +from datahub.ingestion.api.source_helpers import auto_workunit_reporter +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.datahub.config import DataHubSourceConfig +from datahub.ingestion.source.datahub.datahub_database_reader import ( + DataHubDatabaseReader, +) +from datahub.ingestion.source.datahub.datahub_kafka_reader import DataHubKafkaReader +from datahub.ingestion.source.datahub.report import DataHubSourceReport +from datahub.ingestion.source.datahub.state import StatefulDataHubIngestionHandler +from datahub.ingestion.source.state.stateful_ingestion_base import ( + StatefulIngestionSourceBase, +) +from datahub.metadata.schema_classes import ChangeTypeClass + +logger = logging.getLogger(__name__) + + +@platform_name("DataHub") +@config_class(DataHubSourceConfig) +@support_status(SupportStatus.TESTING) 
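The Kafka reader above resumes from checkpointed offsets by overriding partition offsets in an `on_assign` callback rather than relying on committed consumer-group offsets. A standalone sketch of that pattern with `confluent_kafka`; the broker address, group id, and the `stored_offsets` map are placeholders:

```python
from confluent_kafka import OFFSET_BEGINNING, Consumer

stored_offsets = {0: 1200, 1: 980}  # partition -> next offset, e.g. from checkpoint state


def on_assign(consumer: Consumer, partitions: list) -> None:
    for p in partitions:
        # Fall back to the beginning of partitions we have never read from.
        p.offset = stored_offsets.get(p.partition, OFFSET_BEGINNING)
    consumer.assign(partitions)


consumer = Consumer(
    {
        "group.id": "datahub_source-example",
        "bootstrap.servers": "localhost:9092",
        "auto.offset.reset": "earliest",
        "enable.auto.commit": False,
    }
)
consumer.subscribe(["MetadataChangeLog_Timeseries_v1"], on_assign=on_assign)
while True:
    msg = consumer.poll(10)
    if msg is None:
        break  # no message within the poll timeout
    print(msg.partition(), msg.offset())
consumer.close()
```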
+class DataHubSource(StatefulIngestionSourceBase): + platform: str = "datahub" + + def __init__(self, config: DataHubSourceConfig, ctx: PipelineContext): + super().__init__(config, ctx) + self.config = config + self.report: DataHubSourceReport = DataHubSourceReport() + self.stateful_ingestion_handler = StatefulDataHubIngestionHandler(self) + + @classmethod + def create(cls, config_dict: Dict, ctx: PipelineContext) -> "DataHubSource": + config: DataHubSourceConfig = DataHubSourceConfig.parse_obj(config_dict) + return cls(config, ctx) + + def get_report(self) -> SourceReport: + return self.report + + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: + # Exactly replicate data from DataHub source + return [partial(auto_workunit_reporter, self.get_report())] + + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + self.report.stop_time = datetime.now(tz=timezone.utc) + logger.info(f"Ingesting DataHub metadata up until {self.report.stop_time}") + state = self.stateful_ingestion_handler.get_last_run_state() + + if self.config.database_connection is not None: + yield from self._get_database_workunits( + from_createdon=state.database_createdon_datetime + ) + self._commit_progress() + else: + logger.info( + "Skipping ingestion of versioned aspects as no database_connection provided" + ) + + if self.config.kafka_connection is not None: + yield from self._get_kafka_workunits(from_offsets=state.kafka_offsets) + self._commit_progress() + else: + logger.info( + "Skipping ingestion of timeseries aspects as no kafka_connection provided" + ) + + def _get_database_workunits( + self, from_createdon: datetime + ) -> Iterable[MetadataWorkUnit]: + if self.config.database_connection is None: + return + + logger.info(f"Fetching database aspects starting from {from_createdon}") + reader = DataHubDatabaseReader( + self.config, self.config.database_connection, self.report + ) + mcps = reader.get_aspects(from_createdon, self.report.stop_time) + for i, (mcp, createdon) in enumerate(mcps): + yield mcp.as_workunit() + self.report.num_database_aspects_ingested += 1 + + if ( + self.config.commit_with_parse_errors + or not self.report.num_database_parse_errors + ): + self.stateful_ingestion_handler.update_checkpoint( + last_createdon=createdon + ) + self._commit_progress(i) + + def _get_kafka_workunits( + self, from_offsets: Dict[int, int] + ) -> Iterable[MetadataWorkUnit]: + if self.config.kafka_connection is None: + return + + logger.info("Fetching timeseries aspects from kafka") + with DataHubKafkaReader( + self.config, self.config.kafka_connection, self.report, self.ctx + ) as reader: + mcls = reader.get_mcls( + from_offsets=from_offsets, stop_time=self.report.stop_time + ) + for i, (mcl, offset) in enumerate(mcls): + mcp = MetadataChangeProposalWrapper.try_from_mcl(mcl) + if mcp.changeType == ChangeTypeClass.DELETE: + self.report.num_timeseries_deletions_dropped += 1 + logger.debug( + f"Dropping timeseries deletion of {mcp.aspectName} on {mcp.entityUrn}" + ) + continue + + if isinstance(mcp, MetadataChangeProposalWrapper): + yield mcp.as_workunit() + else: + yield MetadataWorkUnit( + id=f"{mcp.entityUrn}-{mcp.aspectName}-{i}", mcp_raw=mcp + ) + self.report.num_kafka_aspects_ingested += 1 + + if ( + self.config.commit_with_parse_errors + or not self.report.num_kafka_parse_errors + ): + self.stateful_ingestion_handler.update_checkpoint( + last_offset=offset + ) + self._commit_progress(i) + + def _commit_progress(self, i: Optional[int] = None) -> None: + """Commit progress to 
stateful storage, if there have been no errors. + + If an index `i` is provided, only commit if we are at the appropriate interval + as per `config.commit_state_interval`. + """ + on_interval = ( + i + and self.config.commit_state_interval + and i % self.config.commit_state_interval == 0 + ) + + if i is None or on_interval: + self.stateful_ingestion_handler.commit_checkpoint() diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/report.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/report.py new file mode 100644 index 0000000000000..73e5a798a1553 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/report.py @@ -0,0 +1,25 @@ +from dataclasses import dataclass, field +from datetime import datetime, timezone + +from datahub.ingestion.source.state.stateful_ingestion_base import ( + StatefulIngestionReport, +) +from datahub.utilities.lossy_collections import LossyDict, LossyList + + +@dataclass +class DataHubSourceReport(StatefulIngestionReport): + stop_time: datetime = field(default_factory=lambda: datetime.now(tz=timezone.utc)) + + num_database_aspects_ingested: int = 0 + num_database_parse_errors: int = 0 + # error -> aspect -> [urn] + database_parse_errors: LossyDict[str, LossyDict[str, LossyList[str]]] = field( + default_factory=LossyDict + ) + + num_kafka_aspects_ingested: int = 0 + num_kafka_parse_errors: int = 0 + kafka_parse_errors: LossyDict[str, int] = field(default_factory=LossyDict) + + num_timeseries_deletions_dropped: int = 0 diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/state.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/state.py new file mode 100644 index 0000000000000..4bedd331a9aea --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/state.py @@ -0,0 +1,97 @@ +from datetime import datetime, timezone +from functools import lru_cache +from typing import TYPE_CHECKING, Dict, NamedTuple, Optional, cast + +from pydantic import Field +from pydantic.types import NonNegativeInt + +from datahub.ingestion.api.ingestion_job_checkpointing_provider_base import JobId +from datahub.ingestion.source.state.checkpoint import Checkpoint, CheckpointStateBase +from datahub.ingestion.source.state.use_case_handler import ( + StatefulIngestionUsecaseHandlerBase, +) + +if TYPE_CHECKING: + from datahub.ingestion.source.datahub.datahub_source import DataHubSource + + +class DataHubIngestionState(CheckpointStateBase): + database_createdon_ts: NonNegativeInt = 0 + + # Maps partition -> offset + kafka_offsets: Dict[int, NonNegativeInt] = Field(default_factory=dict) + + @property + def database_createdon_datetime(self) -> datetime: + return datetime.fromtimestamp( + self.database_createdon_ts / 1000, tz=timezone.utc + ) + + +class PartitionOffset(NamedTuple): + partition: int + offset: int + + +class StatefulDataHubIngestionHandler( + StatefulIngestionUsecaseHandlerBase[DataHubIngestionState] +): + def __init__(self, source: "DataHubSource"): + self.state_provider = source.state_provider + self.config = source.config.stateful_ingestion + self.run_id = source.ctx.run_id + self.pipeline_name = source.ctx.pipeline_name + self.state_provider.register_stateful_ingestion_usecase_handler(self) + + @lru_cache(maxsize=1) + def is_checkpointing_enabled(self) -> bool: + return self.state_provider.is_stateful_ingestion_configured() + + def get_last_run_state(self) -> DataHubIngestionState: + if self.is_checkpointing_enabled() and not self.config.ignore_old_state: + last_checkpoint = 
self.state_provider.get_last_checkpoint( + self.job_id, DataHubIngestionState + ) + if last_checkpoint and last_checkpoint.state: + return last_checkpoint.state + + return DataHubIngestionState() + + def create_checkpoint(self) -> Optional[Checkpoint[DataHubIngestionState]]: + if not self.is_checkpointing_enabled() or self.config.ignore_new_state: + return None + + if self.pipeline_name is None: + raise ValueError( + "Pipeline name must be set to use stateful datahub ingestion" + ) + + return Checkpoint( + job_name=self.job_id, + pipeline_name=self.pipeline_name, + run_id=self.run_id, + state=self.get_last_run_state(), + ) + + def update_checkpoint( + self, + *, + last_createdon: Optional[datetime] = None, + last_offset: Optional[PartitionOffset] = None, + ) -> None: + cur_checkpoint = self.state_provider.get_current_checkpoint(self.job_id) + if cur_checkpoint: + cur_state = cast(DataHubIngestionState, cur_checkpoint.state) + if last_createdon: + cur_state.database_createdon_ts = int(last_createdon.timestamp() * 1000) + if last_offset: + cur_state.kafka_offsets[last_offset.partition] = last_offset.offset + 1 + + def commit_checkpoint(self) -> None: + if self.state_provider.ingestion_checkpointing_state_provider: + self.state_provider.prepare_for_commit() + self.state_provider.ingestion_checkpointing_state_provider.commit() + + @property + def job_id(self) -> JobId: + return JobId("datahub_ingestion") diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py index 1cd5ed8164854..af9769bc9d94c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py @@ -162,9 +162,11 @@ class DBTCloudConfig(DBTCommonConfig): } _DBT_GRAPHQL_QUERY = """ -query DatahubMetadataQuery_{type}($jobId: Int!, $runId: Int) {{ - {type}(jobId: $jobId, runId: $runId) {{ +query DatahubMetadataQuery_{type}($jobId: BigInt!, $runId: BigInt) {{ + job(id: $jobId, runId: $runId) {{ + {type} {{ {fields} + }} }} }} """ @@ -218,7 +220,7 @@ def load_nodes(self) -> Tuple[List[DBTNode], Dict[str, Optional[str]]]: }, ) - raw_nodes.extend(data[node_type]) + raw_nodes.extend(data["job"][node_type]) nodes = [self._parse_into_dbt_node(node) for node in raw_nodes] diff --git a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py index b7ae50eb766af..cc7f646dcb884 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py +++ b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py @@ -1,14 +1,37 @@ +import sys + +if sys.version_info < (3, 8): + raise ImportError("Iceberg is only supported on Python 3.8+") + import json import logging import uuid -from typing import Any, Dict, Iterable, List, Optional, Tuple - -from iceberg.api import types as IcebergTypes -from iceberg.api.table import Table -from iceberg.api.types.types import NestedField -from iceberg.core.base_table import BaseTable -from iceberg.core.filesystem.filesystem_tables import FilesystemTables -from iceberg.exceptions import NoSuchTableException +from typing import Any, Dict, Iterable, List, Optional + +from pyiceberg.catalog import Catalog +from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit +from pyiceberg.table import Table +from pyiceberg.typedef import Identifier +from pyiceberg.types import ( + BinaryType, + BooleanType, + DateType, + DecimalType, + DoubleType, 
+ FixedType, + FloatType, + IntegerType, + ListType, + LongType, + MapType, + NestedField, + StringType, + StructType, + TimestampType, + TimestamptzType, + TimeType, + UUIDType, +) from datahub.emitter.mce_builder import ( make_data_platform_urn, @@ -59,23 +82,13 @@ LOGGER = logging.getLogger(__name__) -_all_atomic_types = { - IcebergTypes.BooleanType: "boolean", - IcebergTypes.IntegerType: "int", - IcebergTypes.LongType: "long", - IcebergTypes.FloatType: "float", - IcebergTypes.DoubleType: "double", - IcebergTypes.BinaryType: "bytes", - IcebergTypes.StringType: "string", -} - @platform_name("Iceberg") @support_status(SupportStatus.TESTING) @config_class(IcebergSourceConfig) @capability( SourceCapability.PLATFORM_INSTANCE, - "Optionally enabled via configuration, an Iceberg instance represents the datalake name where the table is stored.", + "Optionally enabled via configuration, an Iceberg instance represents the catalog name where the table is stored.", ) @capability(SourceCapability.DOMAINS, "Currently not supported.", supported=False) @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration.") @@ -95,16 +108,7 @@ class IcebergSource(StatefulIngestionSourceBase): The DataHub Iceberg source plugin extracts metadata from [Iceberg tables](https://iceberg.apache.org/spec/) stored in a distributed or local file system. Typically, Iceberg tables are stored in a distributed file system like S3 or Azure Data Lake Storage (ADLS) and registered in a catalog. There are various catalog implementations like Filesystem-based, RDBMS-based or even REST-based catalogs. This Iceberg source plugin relies on the - [Iceberg python_legacy library](https://github.com/apache/iceberg/tree/master/python_legacy) and its support for catalogs is limited at the moment. - A new version of the [Iceberg Python library](https://github.com/apache/iceberg/tree/master/python) is currently in development and should fix this. - Because of this limitation, this source plugin **will only ingest HadoopCatalog-based tables that have a `version-hint.text` metadata file**. - - Ingestion of tables happens in 2 steps: - 1. Discover Iceberg tables stored in file system. - 2. Load discovered tables using Iceberg python_legacy library - - The current implementation of the Iceberg source plugin will only discover tables stored in a local file system or in ADLS. Support for S3 could - be added fairly easily. + [pyiceberg library](https://py.iceberg.apache.org/). """ def __init__(self, config: IcebergSourceConfig, ctx: PipelineContext) -> None: @@ -112,7 +116,6 @@ def __init__(self, config: IcebergSourceConfig, ctx: PipelineContext) -> None: self.platform: str = "iceberg" self.report: IcebergSourceReport = IcebergSourceReport() self.config: IcebergSourceConfig = config - self.iceberg_client: FilesystemTables = config.filesystem_tables @classmethod def create(cls, config_dict: Dict, ctx: PipelineContext) -> "IcebergSource": @@ -127,23 +130,31 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: ).workunit_processor, ] + def _get_datasets(self, catalog: Catalog) -> Iterable[Identifier]: + for namespace in catalog.list_namespaces(): + yield from catalog.list_tables(namespace) + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: - for dataset_path, dataset_name in self.config.get_paths(): # Tuple[str, str] - try: - if not self.config.table_pattern.allowed(dataset_name): - # Path contained a valid Iceberg table, but is rejected by pattern. 
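The rewritten Iceberg source delegates table discovery to a pyiceberg catalog instead of crawling a filesystem: list namespaces, list tables per namespace, then `load_table` each identifier. A minimal sketch of that flow; the catalog type and connection properties below are illustrative placeholders, not values from this diff:

```python
from pyiceberg.catalog import load_catalog

# Catalog type and properties are assumptions for the example; see the pyiceberg
# configuration docs for the options supported by each catalog type.
catalog = load_catalog(name="default", **{"type": "rest", "uri": "http://localhost:8181"})

for namespace in catalog.list_namespaces():
    for identifier in catalog.list_tables(namespace):
        table = catalog.load_table(identifier)
        dataset_name = ".".join(identifier)
        print(dataset_name, table.metadata.location)
```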
- self.report.report_dropped(dataset_name) - continue + try: + catalog = self.config.get_catalog() + except Exception as e: + LOGGER.error("Failed to get catalog", exc_info=True) + self.report.report_failure( + "get-catalog", f"Failed to get catalog {self.config.catalog.name}: {e}" + ) + return + + for dataset_path in self._get_datasets(catalog): + dataset_name = ".".join(dataset_path) + if not self.config.table_pattern.allowed(dataset_name): + # Dataset name is rejected by pattern, report as dropped. + self.report.report_dropped(dataset_name) + continue - # Try to load an Iceberg table. Might not contain one, this will be caught by NoSuchTableException. - table: Table = self.iceberg_client.load(dataset_path) + try: + # Try to load an Iceberg table. Might not contain one, this will be caught by NoSuchIcebergTableError. + table = catalog.load_table(dataset_path) yield from self._create_iceberg_workunit(dataset_name, table) - except NoSuchTableException: - # Path did not contain a valid Iceberg table. Silently ignore this. - LOGGER.debug( - f"Path {dataset_path} does not contain table {dataset_name}" - ) - pass except Exception as e: self.report.report_failure("general", f"Failed to create workunit: {e}") LOGGER.exception( @@ -165,26 +176,21 @@ def _create_iceberg_workunit( aspects=[Status(removed=False)], ) - custom_properties: Dict = dict(table.properties()) - custom_properties["location"] = table.location() - try: - if isinstance(table, BaseTable) and table.current_snapshot(): - custom_properties["snapshot-id"] = str( - table.current_snapshot().snapshot_id - ) - custom_properties[ - "manifest-list" - ] = table.current_snapshot().manifest_location - except KeyError: - # The above API is not well implemented, and can throw KeyError when there is no data. - pass + # Dataset properties aspect. + custom_properties = table.metadata.properties.copy() + custom_properties["location"] = table.metadata.location + custom_properties["format-version"] = str(table.metadata.format_version) + if table.current_snapshot(): + custom_properties["snapshot-id"] = str(table.current_snapshot().snapshot_id) + custom_properties["manifest-list"] = table.current_snapshot().manifest_list dataset_properties = DatasetPropertiesClass( tags=[], - description=table.properties().get("comment", None), + description=table.metadata.properties.get("comment", None), customProperties=custom_properties, ) dataset_snapshot.aspects.append(dataset_properties) + # Dataset ownership aspect. 
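With pyiceberg, the dataset-properties aspect above is assembled from `table.metadata` and the current snapshot rather than the legacy `table.properties()` API. A small hedged sketch of building that properties dict for a loaded table (the helper name is illustrative):

```python
def build_custom_properties(table) -> dict:
    """Mirror of the custom-properties construction in this diff for a pyiceberg Table."""
    props = dict(table.metadata.properties)
    props["location"] = table.metadata.location
    props["format-version"] = str(table.metadata.format_version)
    snapshot = table.current_snapshot()
    if snapshot:
        props["snapshot-id"] = str(snapshot.snapshot_id)
        props["manifest-list"] = snapshot.manifest_list
    return props
```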
dataset_ownership = self._get_ownership_aspect(table) if dataset_ownership: dataset_snapshot.aspects.append(dataset_ownership) @@ -206,8 +212,10 @@ def _create_iceberg_workunit( def _get_ownership_aspect(self, table: Table) -> Optional[OwnershipClass]: owners = [] if self.config.user_ownership_property: - if self.config.user_ownership_property in table.properties(): - user_owner = table.properties()[self.config.user_ownership_property] + if self.config.user_ownership_property in table.metadata.properties: + user_owner = table.metadata.properties[ + self.config.user_ownership_property + ] owners.append( OwnerClass( owner=make_user_urn(user_owner), @@ -216,8 +224,10 @@ def _get_ownership_aspect(self, table: Table) -> Optional[OwnershipClass]: ) ) if self.config.group_ownership_property: - if self.config.group_ownership_property in table.properties(): - group_owner = table.properties()[self.config.group_ownership_property] + if self.config.group_ownership_property in table.metadata.properties: + group_owner = table.metadata.properties[ + self.config.group_ownership_property + ] owners.append( OwnerClass( owner=make_group_urn(group_owner), @@ -225,9 +235,7 @@ def _get_ownership_aspect(self, table: Table) -> Optional[OwnershipClass]: source=None, ) ) - if owners: - return OwnershipClass(owners=owners) - return None + return OwnershipClass(owners=owners) if owners else None def _get_dataplatform_instance_aspect( self, dataset_urn: str @@ -249,191 +257,171 @@ def _get_dataplatform_instance_aspect( def _create_schema_metadata( self, dataset_name: str, table: Table ) -> SchemaMetadata: - schema_fields: List[SchemaField] = self._get_schema_fields( - table.schema().columns() - ) + schema_fields = self._get_schema_fields_for_schema(table.schema()) schema_metadata = SchemaMetadata( schemaName=dataset_name, platform=make_data_platform_urn(self.platform), version=0, hash="", - platformSchema=OtherSchema(rawSchema=repr(table.schema())), + platformSchema=OtherSchema(rawSchema=str(table.schema())), fields=schema_fields, ) return schema_metadata - def _get_schema_fields(self, columns: Tuple) -> List[SchemaField]: - canonical_schema: List[SchemaField] = [] - for column in columns: - fields = self._get_schema_fields_for_column(column) - canonical_schema.extend(fields) - return canonical_schema - - def _get_schema_fields_for_column( + def _get_schema_fields_for_schema( self, - column: NestedField, + schema: Schema, ) -> List[SchemaField]: - field_type: IcebergTypes.Type = column.type - if field_type.is_primitive_type() or field_type.is_nested_type(): - avro_schema: Dict = self._get_avro_schema_from_data_type(column) - schema_fields: List[SchemaField] = schema_util.avro_schema_to_mce_fields( - json.dumps(avro_schema), default_nullable=column.is_optional - ) - return schema_fields + avro_schema = visit(schema, ToAvroSchemaIcebergVisitor()) + schema_fields = schema_util.avro_schema_to_mce_fields( + json.dumps(avro_schema), default_nullable=False + ) + return schema_fields + + def get_report(self) -> SourceReport: + return self.report + + +class ToAvroSchemaIcebergVisitor(SchemaVisitorPerPrimitiveType[Dict[str, Any]]): + """Implementation of a visitor to build an Avro schema as a dictionary from an Iceberg schema.""" - raise ValueError(f"Invalid Iceberg field type: {field_type}") + @staticmethod + def _gen_name(prefix: str) -> str: + return f"{prefix}{str(uuid.uuid4()).replace('-', '')}" - def _get_avro_schema_from_data_type(self, column: NestedField) -> Dict[str, Any]: - """ - See Iceberg documentation for Avro 
mapping: - https://iceberg.apache.org/#spec/#appendix-a-format-specific-requirements - """ - # The record structure represents the dataset level. - # The inner fields represent the complex field (struct/array/map/union). + def schema(self, schema: Schema, struct_result: Dict[str, Any]) -> Dict[str, Any]: + return struct_result + + def struct( + self, struct: StructType, field_results: List[Dict[str, Any]] + ) -> Dict[str, Any]: + nullable = True return { "type": "record", - "name": "__struct_", - "fields": [ - { - "name": column.name, - "type": _parse_datatype(column.type, column.is_optional), - "doc": column.doc, - } - ], + "name": self._gen_name("__struct_"), + "fields": field_results, + "native_data_type": str(struct), + "_nullable": nullable, } - def get_report(self) -> SourceReport: - return self.report - + def field(self, field: NestedField, field_result: Dict[str, Any]) -> Dict[str, Any]: + field_result["_nullable"] = not field.required + return { + "name": field.name, + "type": field_result, + "doc": field.doc, + } -def _parse_datatype(type: IcebergTypes.Type, nullable: bool = False) -> Dict[str, Any]: - # Check for complex types: struct, list, map - if type.is_list_type(): - list_type: IcebergTypes.ListType = type + def list( + self, list_type: ListType, element_result: Dict[str, Any] + ) -> Dict[str, Any]: return { "type": "array", - "items": _parse_datatype(list_type.element_type), - "native_data_type": str(type), - "_nullable": nullable, + "items": element_result, + "native_data_type": str(list_type), + "_nullable": not list_type.element_required, } - elif type.is_map_type(): + + def map( + self, + map_type: MapType, + key_result: Dict[str, Any], + value_result: Dict[str, Any], + ) -> Dict[str, Any]: # The Iceberg Map type will be handled differently. The idea is to translate the map # similar to the Map.Entry struct of Java i.e. as an array of map_entry struct, where # the map_entry struct has a key field and a value field. The key and value type can # be complex or primitive types. 
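To make the map translation described in the comment above concrete: an Iceberg map with string keys and optional long values ends up as an Avro array of key/value entry records, roughly like the sketch below. The generated record name carries a random suffix, and the exact `native_data_type` string depends on pyiceberg's type rendering, so both are approximations here.

```python
# Approximate Avro node produced by the visitor for map<string, long> with optional values.
map_as_avro = {
    "type": "array",
    "items": {
        "type": "record",
        "name": "__map_entry_<random-suffix>",
        "fields": [
            {"name": "key", "type": {"type": "string", "native_data_type": "string", "_nullable": False}},
            {"name": "value", "type": {"type": "long", "native_data_type": "long", "_nullable": True}},
        ],
    },
    "native_data_type": "map<string, long>",
}
```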
- map_type: IcebergTypes.MapType = type - map_entry: Dict[str, Any] = { + key_result["_nullable"] = False + value_result["_nullable"] = not map_type.value_required + map_entry = { "type": "record", - "name": _gen_name("__map_entry_"), + "name": self._gen_name("__map_entry_"), "fields": [ { "name": "key", - "type": _parse_datatype(map_type.key_type(), False), + "type": key_result, }, { "name": "value", - "type": _parse_datatype(map_type.value_type(), True), + "type": value_result, }, ], } return { "type": "array", "items": map_entry, - "native_data_type": str(type), - "_nullable": nullable, + "native_data_type": str(map_type), } - elif type.is_struct_type(): - structType: IcebergTypes.StructType = type - return _parse_struct_fields(structType.fields, nullable) - else: - # Primitive types - return _parse_basic_datatype(type, nullable) - - -def _parse_struct_fields(parts: Tuple[NestedField], nullable: bool) -> Dict[str, Any]: - fields = [] - for nested_field in parts: # type: NestedField - field_name = nested_field.name - field_type = _parse_datatype(nested_field.type, nested_field.is_optional) - fields.append({"name": field_name, "type": field_type, "doc": nested_field.doc}) - return { - "type": "record", - "name": _gen_name("__struct_"), - "fields": fields, - "native_data_type": "struct<{}>".format(parts), - "_nullable": nullable, - } - - -def _parse_basic_datatype( - type: IcebergTypes.PrimitiveType, nullable: bool -) -> Dict[str, Any]: - """ - See https://iceberg.apache.org/#spec/#avro - """ - # Check for an atomic types. - for iceberg_type in _all_atomic_types.keys(): - if isinstance(type, iceberg_type): - return { - "type": _all_atomic_types[iceberg_type], - "native_data_type": repr(type), - "_nullable": nullable, - } - - # Fixed is a special case where it is not an atomic type and not a logical type. - if isinstance(type, IcebergTypes.FixedType): - fixed_type: IcebergTypes.FixedType = type + + def visit_fixed(self, fixed_type: FixedType) -> Dict[str, Any]: return { "type": "fixed", - "name": _gen_name("__fixed_"), - "size": fixed_type.length, - "native_data_type": repr(fixed_type), - "_nullable": nullable, + "name": self._gen_name("__fixed_"), + "size": len(fixed_type), + "native_data_type": str(fixed_type), } - # Not an atomic type, so check for a logical type. - if isinstance(type, IcebergTypes.DecimalType): + def visit_decimal(self, decimal_type: DecimalType) -> Dict[str, Any]: # Also of interest: https://avro.apache.org/docs/current/spec.html#Decimal - decimal_type: IcebergTypes.DecimalType = type return { # "type": "bytes", # when using bytes, avro drops _nullable attribute and others. See unit test. 
"type": "fixed", # to fix avro bug ^ resolved by using a fixed type - "name": _gen_name( + "name": self._gen_name( "__fixed_" ), # to fix avro bug ^ resolved by using a fixed type "size": 1, # to fix avro bug ^ resolved by using a fixed type "logicalType": "decimal", "precision": decimal_type.precision, "scale": decimal_type.scale, - "native_data_type": repr(decimal_type), - "_nullable": nullable, + "native_data_type": str(decimal_type), + } + + def visit_boolean(self, boolean_type: BooleanType) -> Dict[str, Any]: + return { + "type": "boolean", + "native_data_type": str(boolean_type), } - elif isinstance(type, IcebergTypes.UUIDType): - uuid_type: IcebergTypes.UUIDType = type + + def visit_integer(self, integer_type: IntegerType) -> Dict[str, Any]: return { - "type": "string", - "logicalType": "uuid", - "native_data_type": repr(uuid_type), - "_nullable": nullable, + "type": "int", + "native_data_type": str(integer_type), + } + + def visit_long(self, long_type: LongType) -> Dict[str, Any]: + return { + "type": "long", + "native_data_type": str(long_type), } - elif isinstance(type, IcebergTypes.DateType): - date_type: IcebergTypes.DateType = type + + def visit_float(self, float_type: FloatType) -> Dict[str, Any]: + return { + "type": "float", + "native_data_type": str(float_type), + } + + def visit_double(self, double_type: DoubleType) -> Dict[str, Any]: + return { + "type": "double", + "native_data_type": str(double_type), + } + + def visit_date(self, date_type: DateType) -> Dict[str, Any]: return { "type": "int", "logicalType": "date", - "native_data_type": repr(date_type), - "_nullable": nullable, + "native_data_type": str(date_type), } - elif isinstance(type, IcebergTypes.TimeType): - time_type: IcebergTypes.TimeType = type + + def visit_time(self, time_type: TimeType) -> Dict[str, Any]: return { "type": "long", "logicalType": "time-micros", - "native_data_type": repr(time_type), - "_nullable": nullable, + "native_data_type": str(time_type), } - elif isinstance(type, IcebergTypes.TimestampType): - timestamp_type: IcebergTypes.TimestampType = type + + def visit_timestamp(self, timestamp_type: TimestampType) -> Dict[str, Any]: # Avro supports 2 types of timestamp: # - Timestamp: independent of a particular timezone or calendar (TZ information is lost) # - Local Timestamp: represents a timestamp in a local timezone, regardless of what specific time zone is considered local @@ -446,12 +434,40 @@ def _parse_basic_datatype( # "logicalType": "timestamp-micros" # if timestamp_type.adjust_to_utc # else "local-timestamp-micros", - "native_data_type": repr(timestamp_type), - "_nullable": nullable, + "native_data_type": str(timestamp_type), } - return {"type": "null", "native_data_type": repr(type)} + def visit_timestampz(self, timestamptz_type: TimestamptzType) -> Dict[str, Any]: + # Avro supports 2 types of timestamp: + # - Timestamp: independent of a particular timezone or calendar (TZ information is lost) + # - Local Timestamp: represents a timestamp in a local timezone, regardless of what specific time zone is considered local + # utcAdjustment: bool = True + return { + "type": "long", + "logicalType": "timestamp-micros", + # Commented out since Avro's Python implementation (1.11.0) does not support local-timestamp-micros, even though it exists in the spec. 
+ # See bug report: https://issues.apache.org/jira/browse/AVRO-3476 and PR https://github.com/apache/avro/pull/1634 + # "logicalType": "timestamp-micros" + # if timestamp_type.adjust_to_utc + # else "local-timestamp-micros", + "native_data_type": str(timestamptz_type), + } + def visit_string(self, string_type: StringType) -> Dict[str, Any]: + return { + "type": "string", + "native_data_type": str(string_type), + } -def _gen_name(prefix: str) -> str: - return f"{prefix}{str(uuid.uuid4()).replace('-', '')}" + def visit_uuid(self, uuid_type: UUIDType) -> Dict[str, Any]: + return { + "type": "string", + "logicalType": "uuid", + "native_data_type": str(uuid_type), + } + + def visit_binary(self, binary_type: BinaryType) -> Dict[str, Any]: + return { + "type": "bytes", + "native_data_type": str(binary_type), + } diff --git a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_common.py b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_common.py index d5b9092912d4e..f4d93f67b27af 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_common.py @@ -1,20 +1,11 @@ -import os from dataclasses import dataclass, field -from typing import Dict, Iterable, List, Optional, Tuple - -import pydantic -from azure.storage.filedatalake import FileSystemClient, PathProperties -from iceberg.core.filesystem.abfss_filesystem import AbfssFileSystem -from iceberg.core.filesystem.filesystem_tables import FilesystemTables -from pydantic import Field, root_validator - -from datahub.configuration.common import ( - AllowDenyPattern, - ConfigModel, - ConfigurationError, -) +from typing import Dict, List, Optional + +from pydantic import Field +from pyiceberg.catalog import Catalog, load_catalog + +from datahub.configuration.common import AllowDenyPattern, ConfigModel from datahub.configuration.source_common import DatasetSourceConfigMixin -from datahub.ingestion.source.azure.azure_common import AdlsSourceConfig from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalSourceReport, StatefulStaleMetadataRemovalConfig, @@ -59,22 +50,32 @@ class IcebergProfilingConfig(ConfigModel): # include_field_sample_values: bool = True +class IcebergCatalogConfig(ConfigModel): + """ + Iceberg catalog config. + + https://py.iceberg.apache.org/configuration/ + """ + + name: str = Field( + default="default", + description="Name of catalog", + ) + type: str = Field( + description="Type of catalog. See [PyIceberg](https://py.iceberg.apache.org/configuration/) for list of possible values.", + ) + config: Dict[str, str] = Field( + description="Catalog specific configuration. See [PyIceberg documentation](https://py.iceberg.apache.org/configuration/) for details.", + ) + + class IcebergSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin): # Override the stateful_ingestion config param with the Iceberg custom stateful ingestion config in the IcebergSourceConfig - stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = pydantic.Field( + stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = Field( default=None, description="Iceberg Stateful Ingestion Config." ) - adls: Optional[AdlsSourceConfig] = Field( - default=None, - description="[Azure Data Lake Storage](https://docs.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction) to crawl for Iceberg tables. 
This is one filesystem type supported by this source and **only one can be configured**.", - ) - localfs: Optional[str] = Field( - default=None, - description="Local path to crawl for Iceberg tables. This is one filesystem type supported by this source and **only one can be configured**.", - ) - max_path_depth: int = Field( - default=2, - description="Maximum folder depth to crawl for Iceberg tables. Folders deeper than this value will be silently ignored.", + catalog: IcebergCatalogConfig = Field( + description="Catalog configuration where to find Iceberg tables. See [pyiceberg's catalog configuration details](https://py.iceberg.apache.org/configuration/).", ) table_pattern: AllowDenyPattern = Field( default=AllowDenyPattern.allow_all(), @@ -95,92 +96,15 @@ def is_profiling_enabled(self) -> bool: self.profiling.operation_config ) - @root_validator() - def _ensure_one_filesystem_is_configured( - cls: "IcebergSourceConfig", values: Dict - ) -> Dict: - if values.get("adls") and values.get("localfs"): - raise ConfigurationError( - "Only one filesystem can be configured: adls or localfs" - ) - elif not values.get("adls") and not values.get("localfs"): - raise ConfigurationError( - "One filesystem (adls or localfs) needs to be configured." - ) - return values - - @property - def adls_filesystem_client(self) -> FileSystemClient: - """Azure Filesystem client if configured. - - Raises: - ConfigurationError: If ADLS is not configured. - - Returns: - FileSystemClient: Azure Filesystem client instance to access storage account files and folders. - """ - if self.adls: # TODO Use local imports for abfss - AbfssFileSystem.get_instance().set_conf(self.adls.dict()) - return self.adls.get_filesystem_client() - raise ConfigurationError("No ADLS filesystem client configured") - - @property - def filesystem_tables(self) -> FilesystemTables: - """Iceberg FilesystemTables abstraction to access tables on a filesystem. - Currently supporting ADLS (Azure Storage Account) and local filesystem. - - Raises: - ConfigurationError: If no filesystem was configured. + def get_catalog(self) -> Catalog: + """Returns the Iceberg catalog instance as configured by the `catalog` dictionary. Returns: - FilesystemTables: An Iceberg FilesystemTables abstraction instance to access tables on a filesystem + Catalog: Iceberg catalog instance. """ - if self.adls: - return FilesystemTables(self.adls.dict()) - elif self.localfs: - return FilesystemTables() - raise ConfigurationError("No filesystem client configured") - - def _get_adls_paths(self, root_path: str, depth: int) -> Iterable[Tuple[str, str]]: - if self.adls and depth < self.max_path_depth: - sub_paths = self.adls_filesystem_client.get_paths( - path=root_path, recursive=False - ) - sub_path: PathProperties - for sub_path in sub_paths: - if sub_path.is_directory: - dataset_name = ".".join( - sub_path.name[len(self.adls.base_path) + 1 :].split("/") - ) - yield self.adls.get_abfss_url(sub_path.name), dataset_name - yield from self._get_adls_paths(sub_path.name, depth + 1) - - def _get_localfs_paths( - self, root_path: str, depth: int - ) -> Iterable[Tuple[str, str]]: - if self.localfs and depth < self.max_path_depth: - for f in os.scandir(root_path): - if f.is_dir(): - dataset_name = ".".join(f.path[len(self.localfs) + 1 :].split("/")) - yield f.path, dataset_name - yield from self._get_localfs_paths(f.path, depth + 1) - - def get_paths(self) -> Iterable[Tuple[str, str]]: - """Generates a sequence of data paths and dataset names. 
- - Raises: - ConfigurationError: If no filesystem configured. - - Yields: - Iterator[Iterable[Tuple[str, str]]]: A sequence of tuples where the first item is the location of the dataset - and the second item is the associated dataset name. - """ - if self.adls: - yield from self._get_adls_paths(self.adls.base_path, 0) - elif self.localfs: - yield from self._get_localfs_paths(self.localfs, 0) - else: - raise ConfigurationError("No filesystem client configured") + return load_catalog( + name=self.catalog.name, **{"type": self.catalog.type, **self.catalog.config} + ) @dataclass diff --git a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_profiler.py index 1437847ee4343..e1d52752d779a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_profiler.py @@ -1,17 +1,26 @@ -from datetime import datetime, timedelta from typing import Any, Callable, Dict, Iterable, Union, cast -from iceberg.api import types as IcebergTypes -from iceberg.api.data_file import DataFile -from iceberg.api.manifest_file import ManifestFile -from iceberg.api.schema import Schema -from iceberg.api.snapshot import Snapshot -from iceberg.api.table import Table -from iceberg.api.types import Conversions, NestedField, Type, TypeID -from iceberg.core.base_table import BaseTable -from iceberg.core.filesystem import FileSystemInputFile -from iceberg.core.manifest_reader import ManifestReader -from iceberg.exceptions.exceptions import FileSystemNotFound +from pyiceberg.conversions import from_bytes +from pyiceberg.schema import Schema +from pyiceberg.table import Table +from pyiceberg.types import ( + DateType, + DecimalType, + DoubleType, + FloatType, + IcebergType, + IntegerType, + LongType, + TimestampType, + TimestamptzType, + TimeType, +) +from pyiceberg.utils.datetime import ( + days_to_date, + to_human_time, + to_human_timestamp, + to_human_timestamptz, +) from datahub.emitter.mce_builder import get_sys_time from datahub.emitter.mcp import MetadataChangeProposalWrapper @@ -51,15 +60,18 @@ def _aggregate_bounds( schema: Schema, aggregator: Callable, aggregated_values: Dict[int, Any], - manifest_values: Dict[int, Any], + manifest_values: Dict[int, bytes], ) -> None: for field_id, value_encoded in manifest_values.items(): # type: int, Any - field: NestedField = schema.find_field(field_id) - # Bounds in manifests can reference historical field IDs that are not part of the current schema. - # We simply not profile those since we only care about the current snapshot. - if field and IcebergProfiler._is_numeric_type(field.type): - value_decoded = Conversions.from_byte_buffer(field.type, value_encoded) - if value_decoded: + try: + field = schema.find_field(field_id) + except ValueError: + # Bounds in manifests can reference historical field IDs that are not part of the current schema. + # We simply not profile those since we only care about the current snapshot. + continue + if IcebergProfiler._is_numeric_type(field.field_type): + value_decoded = from_bytes(field.field_type, value_encoded) + if value_decoded is not None: agg_value = aggregated_values.get(field_id) aggregated_values[field_id] = ( aggregator(agg_value, value_decoded) @@ -97,12 +109,23 @@ def profile_table( Yields: Iterator[Iterable[MetadataWorkUnit]]: Workunits related to datasetProfile. 
""" - if not table.snapshots() or not isinstance(table, BaseTable): + current_snapshot = table.current_snapshot() + if not current_snapshot: # Table has no data, cannot profile, or we can't get current_snapshot. return - row_count: int = int(table.current_snapshot().summary["total-records"]) - column_count: int = len(table.schema()._id_to_name) + row_count = ( + int(current_snapshot.summary.additional_properties["total-records"]) + if current_snapshot.summary + else 0 + ) + column_count = len( + [ + field.field_id + for field in table.schema().fields + if field.field_type.is_primitive + ] + ) dataset_profile = DatasetProfileClass( timestampMillis=get_sys_time(), rowCount=row_count, @@ -110,47 +133,44 @@ def profile_table( ) dataset_profile.fieldProfiles = [] - field_paths: Dict[int, str] = table.schema()._id_to_name - current_snapshot: Snapshot = table.current_snapshot() - total_count: int = 0 + total_count = 0 null_counts: Dict[int, int] = {} min_bounds: Dict[int, Any] = {} max_bounds: Dict[int, Any] = {} - manifest: ManifestFile try: - for manifest in current_snapshot.manifests: - manifest_input_file = FileSystemInputFile.from_location( - manifest.manifest_path, table.ops.conf - ) - manifest_reader = ManifestReader.read(manifest_input_file) - data_file: DataFile - for data_file in manifest_reader.iterator(): + for manifest in current_snapshot.manifests(table.io): + for manifest_entry in manifest.fetch_manifest_entry(table.io): + data_file = manifest_entry.data_file if self.config.include_field_null_count: null_counts = self._aggregate_counts( - null_counts, data_file.null_value_counts() + null_counts, data_file.null_value_counts ) if self.config.include_field_min_value: self._aggregate_bounds( table.schema(), min, min_bounds, - data_file.lower_bounds(), + data_file.lower_bounds, ) if self.config.include_field_max_value: self._aggregate_bounds( table.schema(), max, max_bounds, - data_file.upper_bounds(), + data_file.upper_bounds, ) - total_count += data_file.record_count() - # TODO Work on error handling to provide better feedback. Iceberg exceptions are weak... - except FileSystemNotFound as e: - raise Exception("Error loading table manifests") from e + total_count += data_file.record_count + except Exception as e: + # Catch any errors that arise from attempting to read the Iceberg table's manifests + # This will prevent stateful ingestion from being blocked by an error (profiling is not critical) + self.report.report_warning( + "profiling", + f"Error while profiling dataset {dataset_name}: {e}", + ) if row_count: # Iterating through fieldPaths introduces unwanted stats for list element fields... 
- for field_id, field_path in field_paths.items(): - field: NestedField = table.schema().find_field(field_id) + for field_path, field_id in table.schema()._name_to_id.items(): + field = table.schema().find_field(field_id) column_profile = DatasetFieldProfileClass(fieldPath=field_path) if self.config.include_field_null_count: column_profile.nullCount = cast(int, null_counts.get(field_id, 0)) @@ -160,16 +180,16 @@ def profile_table( if self.config.include_field_min_value: column_profile.min = ( - self._renderValue( - dataset_name, field.type, min_bounds.get(field_id) + self._render_value( + dataset_name, field.field_type, min_bounds.get(field_id) ) if field_id in min_bounds else None ) if self.config.include_field_max_value: column_profile.max = ( - self._renderValue( - dataset_name, field.type, max_bounds.get(field_id) + self._render_value( + dataset_name, field.field_type, max_bounds.get(field_id) ) if field_id in max_bounds else None @@ -181,24 +201,18 @@ def profile_table( aspect=dataset_profile, ).as_workunit() - # The following will eventually be done by the Iceberg API (in the new Python refactored API). - def _renderValue( - self, dataset_name: str, value_type: Type, value: Any + def _render_value( + self, dataset_name: str, value_type: IcebergType, value: Any ) -> Union[str, None]: try: - if value_type.type_id == TypeID.TIMESTAMP: - if value_type.adjust_to_utc: - # TODO Deal with utc when required - microsecond_unix_ts = value - else: - microsecond_unix_ts = value - return datetime.fromtimestamp(microsecond_unix_ts / 1000000.0).strftime( - "%Y-%m-%d %H:%M:%S" - ) - elif value_type.type_id == TypeID.DATE: - return (datetime(1970, 1, 1, 0, 0) + timedelta(value - 1)).strftime( - "%Y-%m-%d" - ) + if isinstance(value_type, TimestampType): + return to_human_timestamp(value) + if isinstance(value_type, TimestamptzType): + return to_human_timestamptz(value) + elif isinstance(value_type, DateType): + return days_to_date(value).strftime("%Y-%m-%d") + elif isinstance(value_type, TimeType): + return to_human_time(value) return str(value) except Exception as e: self.report.report_warning( @@ -208,17 +222,18 @@ def _renderValue( return None @staticmethod - def _is_numeric_type(type: Type) -> bool: + def _is_numeric_type(type: IcebergType) -> bool: return isinstance( type, ( - IcebergTypes.DateType, - IcebergTypes.DecimalType, - IcebergTypes.DoubleType, - IcebergTypes.FloatType, - IcebergTypes.IntegerType, - IcebergTypes.LongType, - IcebergTypes.TimestampType, - IcebergTypes.TimeType, + DateType, + DecimalType, + DoubleType, + FloatType, + IntegerType, + LongType, + TimestampType, + TimestamptzType, + TimeType, ), ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py b/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py index a299023b88e64..5e8413bbb6f30 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py +++ b/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py @@ -7,6 +7,7 @@ from time import sleep from typing import Dict, Iterable, List, Optional, Union +import nest_asyncio from okta.client import Client as OktaClient from okta.exceptions import OktaAPIException from okta.models import Group, GroupProfile, User, UserProfile, UserStatus @@ -51,6 +52,7 @@ ) logger = logging.getLogger(__name__) +nest_asyncio.apply() class OktaConfig(StatefulIngestionConfigBase, ConfigModel): @@ -301,11 +303,13 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: # This method can be called on the main thread or an async 
thread, so we must create a new loop if one doesn't exist # See https://docs.python.org/3/library/asyncio-eventloop.html for more info. + created_event_loop = False try: event_loop: asyncio.AbstractEventLoop = asyncio.get_event_loop() except RuntimeError: event_loop = asyncio.new_event_loop() asyncio.set_event_loop(event_loop) + created_event_loop = True # Step 1: Produce MetadataWorkUnits for CorpGroups. okta_groups: Optional[Iterable[Group]] = None @@ -406,7 +410,8 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: ).as_workunit() # Step 4: Close the event loop - event_loop.close() + if created_event_loop: + event_loop.close() def get_report(self): return self.report diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py index c8a4c7a6ab8fa..b3fa5e3401c07 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py @@ -626,12 +626,17 @@ def _extract_lineages(self): @dataclass class DebeziumSourceConnector: connector_manifest: ConnectorManifest + report: KafkaConnectSourceReport def __init__( - self, connector_manifest: ConnectorManifest, config: KafkaConnectSourceConfig + self, + connector_manifest: ConnectorManifest, + config: KafkaConnectSourceConfig, + report: KafkaConnectSourceReport, ) -> None: self.connector_manifest = connector_manifest self.config = config + self.report = report self._extract_lineages() @dataclass @@ -683,10 +688,19 @@ def get_parser( database_name=connector_manifest.config.get("database.dbname"), ) elif connector_class == "io.debezium.connector.sqlserver.SqlServerConnector": + database_name = connector_manifest.config.get( + "database.names" + ) or connector_manifest.config.get("database.dbname") + + if "," in str(database_name): + raise Exception( + f"Only one database is supported for Debezium's SQL Server connector. 
Found: {database_name}" + ) + parser = self.DebeziumParser( source_platform="mssql", server_name=self.get_server_name(connector_manifest), - database_name=connector_manifest.config.get("database.dbname"), + database_name=database_name, ) elif connector_class == "io.debezium.connector.db2.Db2Connector": parser = self.DebeziumParser( @@ -707,29 +721,37 @@ def get_parser( def _extract_lineages(self): lineages: List[KafkaConnectLineage] = list() - parser = self.get_parser(self.connector_manifest) - source_platform = parser.source_platform - server_name = parser.server_name - database_name = parser.database_name - topic_naming_pattern = r"({0})\.(\w+\.\w+)".format(server_name) - if not self.connector_manifest.topic_names: - return lineages + try: + parser = self.get_parser(self.connector_manifest) + source_platform = parser.source_platform + server_name = parser.server_name + database_name = parser.database_name + topic_naming_pattern = r"({0})\.(\w+\.\w+)".format(server_name) - for topic in self.connector_manifest.topic_names: - found = re.search(re.compile(topic_naming_pattern), topic) + if not self.connector_manifest.topic_names: + return lineages - if found: - table_name = get_dataset_name(database_name, found.group(2)) + for topic in self.connector_manifest.topic_names: + found = re.search(re.compile(topic_naming_pattern), topic) - lineage = KafkaConnectLineage( - source_dataset=table_name, - source_platform=source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - lineages.append(lineage) - self.connector_manifest.lineages = lineages + if found: + table_name = get_dataset_name(database_name, found.group(2)) + + lineage = KafkaConnectLineage( + source_dataset=table_name, + source_platform=source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + lineages.append(lineage) + self.connector_manifest.lineages = lineages + except Exception as e: + self.report.report_warning( + self.connector_manifest.name, f"Error resolving lineage: {e}" + ) + + return @dataclass @@ -1061,7 +1083,9 @@ def get_connectors_manifest(self) -> List[ConnectorManifest]: "io.debezium.connector" ): connector_manifest = DebeziumSourceConnector( - connector_manifest=connector_manifest, config=self.config + connector_manifest=connector_manifest, + config=self.config, + report=self.report, ).connector_manifest elif ( connector_manifest.config.get(CONNECTOR_CLASS, "") diff --git a/metadata-ingestion/src/datahub/ingestion/source/ldap.py b/metadata-ingestion/src/datahub/ingestion/source/ldap.py index 085878245c60d..e1d035a96d42f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ldap.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ldap.py @@ -146,6 +146,11 @@ class LDAPSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin): description="Use pagination while do search query (enabled by default).", ) + use_email_as_username: bool = Field( + default=False, + description="Use email for users' usernames instead of username (disabled by default). 
\ + If enabled, the user and group urn would be having email as the id part of the urn.", + ) # default mapping for attrs user_attrs_map: Dict[str, Any] = {} group_attrs_map: Dict[str, Any] = {} @@ -266,10 +271,11 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: if dn is None: continue - if not attrs: + if not attrs or "objectClass" not in attrs: self.report.report_warning( "", - f"skipping {dn} because attrs is empty; check your permissions if this is unexpected", + f"skipping {dn} because attrs ({attrs}) does not contain expected data; " + f"check your permissions if this is unexpected", ) continue @@ -306,6 +312,7 @@ def handle_user(self, dn: str, attrs: Dict[str, Any]) -> Iterable[MetadataWorkUn work unit based on the information. """ manager_ldap = None + make_manager_urn = None if self.config.user_attrs_map["managerUrn"] in attrs: try: m_cn = attrs[self.config.user_attrs_map["managerUrn"]][0].decode() @@ -322,10 +329,19 @@ def handle_user(self, dn: str, attrs: Dict[str, Any]) -> Iterable[MetadataWorkUn result = self.ldap_client.result3(manager_msgid) if result[1]: _m_dn, m_attrs = result[1][0] + manager_ldap = guess_person_ldap(m_attrs, self.config, self.report) + + m_email = get_attr_or_none( + m_attrs, self.config.user_attrs_map["email"], manager_ldap + ) + make_manager_urn = ( + m_email if self.config.use_email_as_username else manager_ldap + ) + except ldap.LDAPError as e: self.report.report_warning(dn, f"manager LDAP search failed: {e}") - mce = self.build_corp_user_mce(dn, attrs, manager_ldap) + mce = self.build_corp_user_mce(dn, attrs, make_manager_urn) if mce: yield MetadataWorkUnit(dn, mce) else: @@ -387,8 +403,10 @@ def build_corp_user_mce( manager_urn = f"urn:li:corpuser:{manager_ldap}" if manager_ldap else None + make_user_urn = email if self.config.use_email_as_username else ldap_user + user_snapshot = CorpUserSnapshotClass( - urn=f"urn:li:corpuser:{ldap_user}", + urn=f"urn:li:corpuser:{make_user_urn}", aspects=[ CorpUserInfoClass( active=True, @@ -429,8 +447,10 @@ def build_corp_group_mce(self, attrs: dict) -> Optional[MetadataChangeEvent]: attrs, self.config.group_attrs_map["displayName"] ) + make_group_urn = email if self.config.use_email_as_username else full_name + group_snapshot = CorpGroupSnapshotClass( - urn=f"urn:li:corpGroup:{full_name}", + urn=f"urn:li:corpGroup:{make_group_urn}", aspects=[ CorpGroupInfoClass( email=email, diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index d568ddcb02afa..40b90d216348c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -34,6 +34,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalSourceReport, ) +from datahub.metadata.com.linkedin.pegasus2avro.common import AuditStamp from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( DatasetLineageTypeClass, FineGrainedLineageDownstreamType, @@ -76,6 +77,8 @@ from datahub.utilities.lossy_collections import LossyList, LossySet from datahub.utilities.url_util import remove_port_from_url +CORPUSER_DATAHUB = "urn:li:corpuser:datahub" + if TYPE_CHECKING: from datahub.ingestion.source.looker.lookml_source import ( LookerViewFileLoader, @@ -786,6 +789,7 @@ def _to_metadata_events( # noqa: C901 if self.upstream_views is not None: assert self.project_name is not None upstreams = [] + 
observed_lineage_ts = datetime.datetime.now(tz=datetime.timezone.utc) for view_ref in sorted(self.upstream_views): view_urn = LookerViewId( project_name=view_ref.project @@ -799,6 +803,10 @@ def _to_metadata_events( # noqa: C901 UpstreamClass( dataset=view_urn, type=DatasetLineageTypeClass.VIEW, + auditStamp=AuditStamp( + time=int(observed_lineage_ts.timestamp() * 1000), + actor=CORPUSER_DATAHUB, + ), ) ) view_name_to_urn_map[view_ref.include] = view_urn diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py index 362b4e5530638..1a32afa2b7fdd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py @@ -6,7 +6,7 @@ import re import tempfile from dataclasses import dataclass, field as dataclass_field, replace -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone from typing import ( Any, ClassVar, @@ -50,6 +50,7 @@ from datahub.ingestion.source.common.subtypes import DatasetSubTypes from datahub.ingestion.source.git.git_import import GitClone from datahub.ingestion.source.looker.looker_common import ( + CORPUSER_DATAHUB, LookerCommonConfig, LookerExplore, LookerUtil, @@ -83,6 +84,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent from datahub.metadata.schema_classes import ( + AuditStampClass, DatasetPropertiesClass, FineGrainedLineageClass, FineGrainedLineageUpstreamTypeClass, @@ -1615,11 +1617,16 @@ def _get_upstream_lineage( # Generate the upstream + fine grained lineage objects. upstreams = [] + observed_lineage_ts = datetime.now(tz=timezone.utc) fine_grained_lineages: List[FineGrainedLineageClass] = [] for upstream_dataset_urn in upstream_dataset_urns: upstream = UpstreamClass( dataset=upstream_dataset_urn, type=DatasetLineageTypeClass.VIEW, + auditStamp=AuditStampClass( + time=int(observed_lineage_ts.timestamp() * 1000), + actor=CORPUSER_DATAHUB, + ), ) upstreams.append(upstream) diff --git a/metadata-ingestion/src/datahub/ingestion/source/metabase.py b/metadata-ingestion/src/datahub/ingestion/source/metabase.py index 54c5888ee3312..fb4512893feb1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/metabase.py +++ b/metadata-ingestion/src/datahub/ingestion/source/metabase.py @@ -1,6 +1,6 @@ from datetime import datetime, timezone from functools import lru_cache -from typing import Dict, Iterable, List, Optional, Union +from typing import Dict, Iterable, List, Optional, Tuple, Union import dateutil.parser as dp import pydantic @@ -43,6 +43,8 @@ ) from datahub.utilities import config_clean +DATASOURCE_URN_RECURSION_LIMIT = 5 + class MetabaseConfig(DatasetLineageProviderConfigBase): # See the Metabase /api/session endpoint for details @@ -327,18 +329,43 @@ def emit_card_mces(self) -> Iterable[MetadataWorkUnit]: ) return None - def construct_card_from_api_data(self, card_data: dict) -> Optional[ChartSnapshot]: - card_id = card_data.get("id", "") + def get_card_details_by_id(self, card_id: Union[int, str]) -> dict: + """ + Method will attempt to get detailed information on card + from Metabase API by card ID and return this info as dict. + If information can't be retrieved, an empty dict is returned + to unify return value of failed call with successful call of the method. 
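Both the Looker explore and LookML view hunks above now attach an audit stamp to each upstream edge, recording when the lineage was observed and attributing it to the datahub system actor. A minimal sketch of building one such edge; the upstream dataset URN is purely illustrative:

    from datetime import datetime, timezone

    from datahub.metadata.schema_classes import (
        AuditStampClass,
        DatasetLineageTypeClass,
        UpstreamClass,
    )

    CORPUSER_DATAHUB = "urn:li:corpuser:datahub"
    observed_lineage_ts = datetime.now(tz=timezone.utc)

    upstream = UpstreamClass(
        # Hypothetical upstream view URN, for illustration only.
        dataset="urn:li:dataset:(urn:li:dataPlatform:looker,my_project.view.users,PROD)",
        type=DatasetLineageTypeClass.VIEW,
        auditStamp=AuditStampClass(
            time=int(observed_lineage_ts.timestamp() * 1000),
            actor=CORPUSER_DATAHUB,
        ),
    )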
+ :param Union[int, str] card_id: ID of card (question) in Metabase + :param int datasource_id: Numeric datasource ID received from Metabase API + :return: dict with info or empty dict + """ card_url = f"{self.config.connect_uri}/api/card/{card_id}" try: card_response = self.session.get(card_url) card_response.raise_for_status() - card_details = card_response.json() + return card_response.json() except HTTPError as http_error: self.report.report_failure( key=f"metabase-card-{card_id}", reason=f"Unable to retrieve Card info. " f"Reason: {str(http_error)}", ) + return {} + + def construct_card_from_api_data(self, card_data: dict) -> Optional[ChartSnapshot]: + card_id = card_data.get("id") + if card_id is None: + self.report.report_failure( + key="metabase-card", + reason=f"Unable to get Card id from card data {str(card_data)}", + ) + return None + + card_details = self.get_card_details_by_id(card_id) + if not card_details: + self.report.report_failure( + key=f"metabase-card-{card_id}", + reason="Unable to construct Card due to empty card details", + ) return None chart_urn = builder.make_chart_urn(self.platform, card_id) @@ -357,7 +384,7 @@ def construct_card_from_api_data(self, card_data: dict) -> Optional[ChartSnapsho lastModified=AuditStamp(time=modified_ts, actor=modified_actor), ) - chart_type = self._get_chart_type(card_id, card_details.get("display")) + chart_type = self._get_chart_type(card_id, card_details.get("display") or "") description = card_details.get("description") or "" title = card_details.get("name") or "" datasource_urn = self.get_datasource_urn(card_details) @@ -448,13 +475,30 @@ def construct_card_custom_properties(self, card_details: dict) -> Dict: return custom_properties - def get_datasource_urn(self, card_details: dict) -> Optional[List]: + def get_datasource_urn( + self, card_details: dict, recursion_depth: int = 0 + ) -> Optional[List]: + if recursion_depth > DATASOURCE_URN_RECURSION_LIMIT: + self.report.report_failure( + key=f"metabase-card-{card_details.get('id')}", + reason="Unable to retrieve Card info. Reason: source table recursion depth exceeded", + ) + return None + + datasource_id = card_details.get("database_id") or "" ( platform, database_name, database_schema, platform_instance, - ) = self.get_datasource_from_id(card_details.get("database_id", "")) + ) = self.get_datasource_from_id(datasource_id) + if not platform: + self.report.report_failure( + key=f"metabase-datasource-{datasource_id}", + reason=f"Unable to detect platform for database id {datasource_id}", + ) + return None + query_type = card_details.get("dataset_query", {}).get("type", {}) source_tables = set() @@ -463,8 +507,19 @@ def get_datasource_urn(self, card_details: dict) -> Optional[List]: card_details.get("dataset_query", {}) .get("query", {}) .get("source-table") + or "" ) - if source_table_id is not None: + if str(source_table_id).startswith("card__"): + # question is built not directly from table in DB but from results of other question in Metabase + # trying to get source table from source question. 
Recursion depth is limited + return self.get_datasource_urn( + card_details=self.get_card_details_by_id( + source_table_id.replace("card__", "") + ), + recursion_depth=recursion_depth + 1, + ) + elif source_table_id != "": + # the question is built directly from table in DB schema_name, table_name = self.get_source_table_from_id(source_table_id) if table_name: source_tables.add( @@ -520,7 +575,9 @@ def get_datasource_urn(self, card_details: dict) -> Optional[List]: return dataset_urn @lru_cache(maxsize=None) - def get_source_table_from_id(self, table_id): + def get_source_table_from_id( + self, table_id: Union[int, str] + ) -> Tuple[Optional[str], Optional[str]]: try: dataset_response = self.session.get( f"{self.config.connect_uri}/api/table/{table_id}" @@ -542,8 +599,8 @@ def get_source_table_from_id(self, table_id): @lru_cache(maxsize=None) def get_platform_instance( - self, platform: Union[str, None] = None, datasource_id: Union[int, None] = None - ) -> Union[str, None]: + self, platform: Optional[str] = None, datasource_id: Optional[int] = None + ) -> Optional[str]: """ Method will attempt to detect `platform_instance` by checking `database_id_to_instance_map` and `platform_instance_map` mappings. @@ -571,7 +628,9 @@ def get_platform_instance( return platform_instance @lru_cache(maxsize=None) - def get_datasource_from_id(self, datasource_id): + def get_datasource_from_id( + self, datasource_id: Union[int, str] + ) -> Tuple[str, Optional[str], Optional[str], Optional[str]]: try: dataset_response = self.session.get( f"{self.config.connect_uri}/api/database/{datasource_id}" @@ -583,7 +642,9 @@ def get_datasource_from_id(self, datasource_id): key=f"metabase-datasource-{datasource_id}", reason=f"Unable to retrieve Datasource. " f"Reason: {str(http_error)}", ) - return None, None + # returning empty string as `platform` because + # `make_dataset_urn_with_platform_instance()` only accepts `str` + return "", None, None, None # Map engine names to what datahub expects in # https://github.com/datahub-project/datahub/blob/master/metadata-service/war/src/main/resources/boot/data_platforms.json diff --git a/metadata-ingestion/src/datahub/ingestion/source/openapi.py b/metadata-ingestion/src/datahub/ingestion/source/openapi.py index 42924a09a39e9..3925ba51c16dd 100755 --- a/metadata-ingestion/src/datahub/ingestion/source/openapi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/openapi.py @@ -108,7 +108,7 @@ class ApiWorkUnit(MetadataWorkUnit): @platform_name("OpenAPI", id="openapi") @config_class(OpenApiConfig) -@support_status(SupportStatus.CERTIFIED) +@support_status(SupportStatus.INCUBATING) @capability(SourceCapability.PLATFORM_INSTANCE, supported=False, description="") class APISource(Source, ABC): """ diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index 31d067f984d2d..ffa685fb25826 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -121,6 +121,12 @@ class DataPlatformPair: powerbi_data_platform_name: str +@dataclass +class PowerBIPlatformDetail: + data_platform_pair: DataPlatformPair + data_platform_server: str + + class SupportedDataPlatform(Enum): POSTGRES_SQL = DataPlatformPair( powerbi_data_platform_name="PostgreSQL", datahub_data_platform_name="postgres" @@ -382,6 +388,15 @@ class PowerBiDashboardSourceConfig( description="The instance of the platform that all assets produced by 
this recipe belong to", ) + # Enable advance sql construct + enable_advance_lineage_sql_construct: bool = pydantic.Field( + default=False, + description="Whether to enable advance native sql construct for parsing like join, sub-queries. " + "along this flag , the native_query_parsing should be enabled. " + "By default convert_lineage_urns_to_lowercase is enabled, in-case if you have disabled it in previous ingestion execution then it may break lineage " + "as this option generates the upstream datasets URN in lowercase.", + ) + @validator("dataset_type_mapping") @classmethod def map_data_platform(cls, value): diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py index 396da2d79e3b7..baaa8d5b85ae1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py @@ -5,8 +5,8 @@ from datahub.ingestion.source.powerbi.config import ( PlatformDetail, PowerBiDashboardSourceConfig, + PowerBIPlatformDetail, ) -from datahub.ingestion.source.powerbi.m_query.resolver import DataPlatformTable logger = logging.getLogger(__name__) @@ -14,7 +14,7 @@ class AbstractDataPlatformInstanceResolver(ABC): @abstractmethod def get_platform_instance( - self, dataplatform_table: DataPlatformTable + self, data_platform_detail: PowerBIPlatformDetail ) -> PlatformDetail: pass @@ -32,10 +32,10 @@ class ResolvePlatformInstanceFromDatasetTypeMapping( BaseAbstractDataPlatformInstanceResolver ): def get_platform_instance( - self, dataplatform_table: DataPlatformTable + self, data_platform_detail: PowerBIPlatformDetail ) -> PlatformDetail: platform: Union[str, PlatformDetail] = self.config.dataset_type_mapping[ - dataplatform_table.data_platform_pair.powerbi_data_platform_name + data_platform_detail.data_platform_pair.powerbi_data_platform_name ] if isinstance(platform, PlatformDetail): @@ -48,13 +48,13 @@ class ResolvePlatformInstanceFromServerToPlatformInstance( BaseAbstractDataPlatformInstanceResolver ): def get_platform_instance( - self, dataplatform_table: DataPlatformTable + self, data_platform_detail: PowerBIPlatformDetail ) -> PlatformDetail: return ( self.config.server_to_platform_instance[ - dataplatform_table.datasource_server + data_platform_detail.data_platform_server ] - if dataplatform_table.datasource_server + if data_platform_detail.data_platform_server in self.config.server_to_platform_instance else PlatformDetail.parse_obj({}) ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py index 640bc4bd60d80..021c429c3c633 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py @@ -1,8 +1,12 @@ import logging -from typing import List +from typing import List, Optional import sqlparse +import datahub.utilities.sqlglot_lineage as sqlglot_l +from datahub.ingestion.api.common import PipelineContext +from datahub.utilities.sqlglot_lineage import SqlParsingResult + SPECIAL_CHARACTERS = ["#(lf)", "(lf)"] logger = logging.getLogger() @@ -45,3 +49,30 @@ def get_tables(native_query: str) -> List[str]: from_index = from_index + 1 return tables + + +def parse_custom_sql( + ctx: PipelineContext, + 
query: str, + schema: Optional[str], + database: Optional[str], + platform: str, + env: str, + platform_instance: Optional[str], +) -> Optional["SqlParsingResult"]: + + logger.debug("Using sqlglot_lineage to parse custom sql") + + sql_query = remove_special_characters(query) + + logger.debug(f"Parsing sql={sql_query}") + + return sqlglot_l.create_lineage_sql_parsed_result( + query=sql_query, + schema=schema, + database=database, + platform=platform, + platform_instance=platform_instance, + env=env, + graph=ctx.graph, + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py index 83106c04529d1..8cc38c366c42a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py @@ -6,7 +6,14 @@ import lark from lark import Lark, Tree -from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.source.powerbi.config import ( + PowerBiDashboardSourceConfig, + PowerBiDashboardSourceReport, +) +from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import ( + AbstractDataPlatformInstanceResolver, +) from datahub.ingestion.source.powerbi.m_query import resolver, validator from datahub.ingestion.source.powerbi.m_query.data_classes import ( TRACE_POWERBI_MQUERY_PARSER, @@ -45,7 +52,9 @@ def _parse_expression(expression: str) -> Tree: def get_upstream_tables( table: Table, reporter: PowerBiDashboardSourceReport, - native_query_enabled: bool = True, + platform_instance_resolver: AbstractDataPlatformInstanceResolver, + ctx: PipelineContext, + config: PowerBiDashboardSourceConfig, parameters: Dict[str, str] = {}, ) -> List[resolver.DataPlatformTable]: if table.expression is None: @@ -58,7 +67,7 @@ def get_upstream_tables( parse_tree: Tree = _parse_expression(table.expression) valid, message = validator.validate_parse_tree( - parse_tree, native_query_enabled=native_query_enabled + parse_tree, native_query_enabled=config.native_query_parsing ) if valid is False: assert message is not None @@ -84,7 +93,11 @@ def get_upstream_tables( parse_tree=parse_tree, reporter=reporter, parameters=parameters, - ).resolve_to_data_platform_table_list() + ).resolve_to_data_platform_table_list( + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, + ) except BaseException as e: reporter.report_warning(table.full_name, "Failed to process m-query expression") diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py index e2b448124c89d..479f1decff903 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py @@ -6,11 +6,19 @@ from lark import Tree +import datahub.emitter.mce_builder as builder +from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.powerbi.config import ( DataPlatformPair, + PlatformDetail, + PowerBiDashboardSourceConfig, PowerBiDashboardSourceReport, + PowerBIPlatformDetail, SupportedDataPlatform, ) +from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import ( + AbstractDataPlatformInstanceResolver, +) from datahub.ingestion.source.powerbi.m_query import native_sql_parser, tree_function from 
datahub.ingestion.source.powerbi.m_query.data_classes import ( TRACE_POWERBI_MQUERY_PARSER, @@ -19,19 +27,98 @@ IdentifierAccessor, ) from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table +from datahub.utilities.sqlglot_lineage import SqlParsingResult logger = logging.getLogger(__name__) @dataclass class DataPlatformTable: - name: str - full_name: str - datasource_server: str data_platform_pair: DataPlatformPair + urn: str + + +def urn_to_lowercase(value: str, flag: bool) -> str: + if flag is True: + return value.lower() + + return value + + +def urn_creator( + config: PowerBiDashboardSourceConfig, + platform_instance_resolver: AbstractDataPlatformInstanceResolver, + data_platform_pair: DataPlatformPair, + server: str, + qualified_table_name: str, +) -> str: + + platform_detail: PlatformDetail = platform_instance_resolver.get_platform_instance( + PowerBIPlatformDetail( + data_platform_pair=data_platform_pair, + data_platform_server=server, + ) + ) + + return builder.make_dataset_urn_with_platform_instance( + platform=data_platform_pair.datahub_data_platform_name, + platform_instance=platform_detail.platform_instance, + env=platform_detail.env, + name=urn_to_lowercase( + qualified_table_name, config.convert_lineage_urns_to_lowercase + ), + ) class AbstractDataPlatformTableCreator(ABC): + """ + Base class to share common functionalities among different dataplatform for M-Query parsing. + + To create qualified table name we need to parse M-Query data-access-functions(https://learn.microsoft.com/en-us/powerquery-m/accessing-data-functions) and + the data-access-functions has some define pattern to access database-name, schema-name and table-name, for example see below M-Query. + + let + Source = Sql.Database("localhost", "library"), + dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data] + in + dbo_book_issue + + It is MSSQL M-Query and Sql.Database is the data-access-function to access MSSQL. If this function is available in M-Query then database name is available in second argument + of first statement and schema-name and table-name is available in second statement. second statement can be repeated to access different tables from MSSQL. + + DefaultTwoStepDataAccessSources extends the AbstractDataPlatformTableCreator and provides the common functionalities for data-platform which has above type of M-Query pattern + + data-access-function varies as per data-platform for example for MySQL.Database for MySQL, PostgreSQL.Database for Postgres and Oracle.Database for Oracle and number of statement to + find out database-name , schema-name and table-name also varies as per dataplatform. + + Value.NativeQuery is one of the function which is used to execute native query inside M-Query, for example see below M-Query + + let + Source = Value.NativeQuery(AmazonRedshift.Database("redshift-url","dev"), "select * from dev.public.category", null, [EnableFolding=true]) + in + Source + + In this M-Query database-name is available in first argument and rest of the detail i.e database & schema is available in native query. + + NativeQueryDataPlatformTableCreator extends AbstractDataPlatformTableCreator to support Redshift and Snowflake native query parsing. 
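With DataPlatformTable reduced to a platform pair plus a fully formed dataset URN, URN construction is centralized in urn_creator: the server parsed out of the M-Query is resolved to a PlatformDetail (platform instance and env) and combined with the qualified table name. A rough sketch of that last step, using placeholder values in place of the resolved detail:

    import datahub.emitter.mce_builder as builder

    # Placeholder values; in the source they come from the resolved PlatformDetail
    # and the qualified table name extracted from the M-Query expression.
    qualified_table_name = "library.dbo.book_issue"
    urn = builder.make_dataset_urn_with_platform_instance(
        platform="mssql",
        platform_instance="my_sql_server_instance",  # may be None when no mapping exists
        env="PROD",
        # Lowercased when convert_lineage_urns_to_lowercase is enabled.
        name=qualified_table_name.lower(),
    )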
+ + """ + + ctx: PipelineContext + config: PowerBiDashboardSourceConfig + platform_instance_resolver: AbstractDataPlatformInstanceResolver + + def __init__( + self, + ctx: PipelineContext, + config: PowerBiDashboardSourceConfig, + platform_instance_resolver: AbstractDataPlatformInstanceResolver, + ) -> None: + super().__init__() + self.ctx = ctx + self.config = config + self.platform_instance_resolver = platform_instance_resolver + @abstractmethod def create_dataplatform_tables( self, data_access_func_detail: DataAccessFunctionDetail @@ -58,6 +145,49 @@ def get_db_detail_from_argument( return arguments[0], arguments[1] + def parse_custom_sql( + self, query: str, server: str, database: Optional[str], schema: Optional[str] + ) -> List[DataPlatformTable]: + + dataplatform_tables: List[DataPlatformTable] = [] + + platform_detail: PlatformDetail = ( + self.platform_instance_resolver.get_platform_instance( + PowerBIPlatformDetail( + data_platform_pair=self.get_platform_pair(), + data_platform_server=server, + ) + ) + ) + + parsed_result: Optional[ + "SqlParsingResult" + ] = native_sql_parser.parse_custom_sql( + ctx=self.ctx, + query=query, + platform=self.get_platform_pair().datahub_data_platform_name, + platform_instance=platform_detail.platform_instance, + env=platform_detail.env, + database=database, + schema=schema, + ) + + if parsed_result is None: + logger.debug("Failed to parse query") + return dataplatform_tables + + for urn in parsed_result.in_tables: + dataplatform_tables.append( + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ) + + logger.debug(f"Generated dataplatform_tables={dataplatform_tables}") + + return dataplatform_tables + class AbstractDataAccessMQueryResolver(ABC): table: Table @@ -80,11 +210,29 @@ def __init__( self.data_access_functions = SupportedResolver.get_function_names() @abstractmethod - def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: + def resolve_to_data_platform_table_list( + self, + ctx: PipelineContext, + config: PowerBiDashboardSourceConfig, + platform_instance_resolver: AbstractDataPlatformInstanceResolver, + ) -> List[DataPlatformTable]: pass class MQueryResolver(AbstractDataAccessMQueryResolver, ABC): + """ + This class parses the M-Query recursively to generate DataAccessFunctionDetail (see method create_data_access_functional_detail). + + This class has generic code to process M-Query tokens and create instance of DataAccessFunctionDetail. + + Once DataAccessFunctionDetail instance is initialized thereafter MQueryResolver generates the DataPlatformTable with the help of AbstractDataPlatformTableCreator + (see method resolve_to_data_platform_table_list). + + Classes which extended from AbstractDataPlatformTableCreator knows how to convert generated DataAccessFunctionDetail instance + to respective DataPlatformTable instance as per dataplatform. 
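When enable_advance_lineage_sql_construct is turned on, parse_custom_sql hands the cleaned native query to DataHub's sqlglot-based lineage parser and the table creators simply reuse the URNs returned in in_tables. A rough sketch of that call path, assuming a Redshift native query; the query, database and schema values are illustrative:

    from datahub.ingestion.api.common import PipelineContext
    from datahub.utilities.sqlglot_lineage import create_lineage_sql_parsed_result

    ctx = PipelineContext(run_id="powerbi-lineage-sketch")  # no DataHub graph attached here

    result = create_lineage_sql_parsed_result(
        query="select * from dev.public.category",
        schema="public",
        database="dev",
        platform="redshift",
        platform_instance=None,
        env="PROD",
        graph=ctx.graph,  # when a graph is available, schema-aware resolution kicks in
    )
    if result:
        print(result.in_tables)  # upstream dataset URNs consumed by the resolver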
+ + """ + def get_item_selector_tokens( self, expression_tree: Tree, @@ -318,9 +466,15 @@ def internal( return table_links - def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: + def resolve_to_data_platform_table_list( + self, + ctx: PipelineContext, + config: PowerBiDashboardSourceConfig, + platform_instance_resolver: AbstractDataPlatformInstanceResolver, + ) -> List[DataPlatformTable]: data_platform_tables: List[DataPlatformTable] = [] + # Find out output variable as we are doing backtracking in M-Query output_variable: Optional[str] = tree_function.get_output_variable( self.parse_tree ) @@ -332,12 +486,14 @@ def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: ) return data_platform_tables + # Parse M-Query and use output_variable as root of tree and create instance of DataAccessFunctionDetail table_links: List[ DataAccessFunctionDetail ] = self.create_data_access_functional_detail(output_variable) # Each item is data-access function for f_detail in table_links: + # Get & Check if we support data-access-function available in M-Query supported_resolver = SupportedResolver.get_resolver( f_detail.data_access_function_name ) @@ -351,8 +507,14 @@ def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: ) continue + # From supported_resolver enum get respective resolver like AmazonRedshift or Snowflake or Oracle or NativeQuery and create instance of it + # & also pass additional information that will be need to generate urn table_full_name_creator: AbstractDataPlatformTableCreator = ( - supported_resolver.get_table_full_name_creator()() + supported_resolver.get_table_full_name_creator()( + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, + ) ) data_platform_tables.extend( @@ -393,18 +555,24 @@ def two_level_access_pattern( IdentifierAccessor, data_access_func_detail.identifier_accessor ).items["Item"] - full_table_name: str = f"{db_name}.{schema_name}.{table_name}" + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" logger.debug( - f"Platform({self.get_platform_pair().datahub_data_platform_name}) full_table_name= {full_table_name}" + f"Platform({self.get_platform_pair().datahub_data_platform_name}) qualified_table_name= {qualified_table_name}" + ) + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, ) return [ DataPlatformTable( - name=table_name, - full_name=full_table_name, - datasource_server=server, data_platform_pair=self.get_platform_pair(), + urn=urn, ) ] @@ -420,9 +588,48 @@ def get_platform_pair(self) -> DataPlatformPair: class MSSqlDataPlatformTableCreator(DefaultTwoStepDataAccessSources): + # https://learn.microsoft.com/en-us/sql/relational-databases/security/authentication-access/ownership-and-user-schema-separation?view=sql-server-ver16 + DEFAULT_SCHEMA = "dbo" # Default schema name in MS-SQL is dbo + def get_platform_pair(self) -> DataPlatformPair: return SupportedDataPlatform.MS_SQL.value + def create_urn_using_old_parser( + self, query: str, db_name: str, server: str + ) -> List[DataPlatformTable]: + dataplatform_tables: List[DataPlatformTable] = [] + + tables: List[str] = native_sql_parser.get_tables(query) + + for table in tables: + schema_and_table: List[str] = table.split(".") + if len(schema_and_table) == 1: + # schema name is not present. 
set default schema + schema_and_table.insert(0, MSSqlDataPlatformTableCreator.DEFAULT_SCHEMA) + + qualified_table_name = ( + f"{db_name}.{schema_and_table[0]}.{schema_and_table[1]}" + ) + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) + + dataplatform_tables.append( + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ) + + logger.debug(f"Generated upstream tables = {dataplatform_tables}") + + return dataplatform_tables + def create_dataplatform_tables( self, data_access_func_detail: DataAccessFunctionDetail ) -> List[DataPlatformTable]: @@ -442,28 +649,20 @@ def create_dataplatform_tables( logger.debug("Unsupported case is found. Second index is not the Query") return dataplatform_tables - db_name: str = arguments[1] - - tables: List[str] = native_sql_parser.get_tables(arguments[3]) - for table in tables: - schema_and_table: List[str] = table.split(".") - if len(schema_and_table) == 1: - # schema name is not present. Default schema name in MS-SQL is dbo - # https://learn.microsoft.com/en-us/sql/relational-databases/security/authentication-access/ownership-and-user-schema-separation?view=sql-server-ver16 - schema_and_table.insert(0, "dbo") - - dataplatform_tables.append( - DataPlatformTable( - name=schema_and_table[1], - full_name=f"{db_name}.{schema_and_table[0]}.{schema_and_table[1]}", - datasource_server=arguments[0], - data_platform_pair=self.get_platform_pair(), - ) + if self.config.enable_advance_lineage_sql_construct is False: + # Use previous parser to generate URN to keep backward compatibility + return self.create_urn_using_old_parser( + query=arguments[3], + db_name=arguments[1], + server=arguments[0], ) - logger.debug("MS-SQL full-table-names %s", dataplatform_tables) - - return dataplatform_tables + return self.parse_custom_sql( + query=arguments[3], + database=arguments[1], + server=arguments[0], + schema=MSSqlDataPlatformTableCreator.DEFAULT_SCHEMA, + ) class OracleDataPlatformTableCreator(AbstractDataPlatformTableCreator): @@ -510,12 +709,20 @@ def create_dataplatform_tables( cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next, ).items["Name"] + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) + return [ DataPlatformTable( - name=table_name, - full_name=f"{db_name}.{schema_name}.{table_name}", - datasource_server=server, data_platform_pair=self.get_platform_pair(), + urn=urn, ) ] @@ -547,14 +754,28 @@ def create_dataplatform_tables( db_name: str = value_dict["Database"] schema_name: str = value_dict["Schema"] table_name: str = value_dict["Table"] + + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" + server, _ = self.get_db_detail_from_argument(data_access_func_detail.arg_list) + if server is None: + logger.info( + f"server information is not available for {qualified_table_name}. 
Skipping upstream table" + ) + return [] + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) return [ DataPlatformTable( - name=table_name, - full_name=f"{db_name}.{schema_name}.{table_name}", - datasource_server=server if server else "", data_platform_pair=self.get_platform_pair(), + urn=urn, ) ] @@ -589,20 +810,26 @@ def create_dataplatform_tables( IdentifierAccessor, data_access_func_detail.identifier_accessor.next.next # type: ignore ).items["Name"] - full_table_name: str = f"{db_name}.{schema_name}.{table_name}" + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" logger.debug( - f"{self.get_platform_pair().datahub_data_platform_name} full-table-name {full_table_name}" + f"{self.get_platform_pair().datahub_data_platform_name} qualified_table_name {qualified_table_name}" + ) + + server: str = self.get_datasource_server(arguments, data_access_func_detail) + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, ) return [ DataPlatformTable( - name=table_name, - full_name=full_table_name, - datasource_server=self.get_datasource_server( - arguments, data_access_func_detail - ), data_platform_pair=self.get_platform_pair(), + urn=urn, ) ] @@ -654,12 +881,20 @@ def create_dataplatform_tables( cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next, ).items["Name"] + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) + return [ DataPlatformTable( - name=table_name, - full_name=f"{db_name}.{schema_name}.{table_name}", - datasource_server=server, data_platform_pair=self.get_platform_pair(), + urn=urn, ) ] @@ -681,6 +916,39 @@ def is_native_parsing_supported(data_access_function_name: str) -> bool: in NativeQueryDataPlatformTableCreator.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM ) + def create_urn_using_old_parser( + self, query: str, server: str + ) -> List[DataPlatformTable]: + dataplatform_tables: List[DataPlatformTable] = [] + + tables: List[str] = native_sql_parser.get_tables(query) + + for qualified_table_name in tables: + if len(qualified_table_name.split(".")) != 3: + logger.debug( + f"Skipping table {qualified_table_name} as it is not as per qualified_table_name format" + ) + continue + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) + + dataplatform_tables.append( + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ) + + logger.debug(f"Generated dataplatform_tables {dataplatform_tables}") + + return dataplatform_tables + def create_dataplatform_tables( self, data_access_func_detail: DataAccessFunctionDetail ) -> List[DataPlatformTable]: @@ -727,25 +995,21 @@ def create_dataplatform_tables( 0 ] # Remove any whitespaces and double quotes character - for table in native_sql_parser.get_tables(sql_query): - if len(table.split(".")) != 3: - logger.debug( - f"Skipping table {table} as it is not as per 
full_table_name format" - ) - continue + server = tree_function.strip_char_from_list([data_access_tokens[2]])[0] - dataplatform_tables.append( - DataPlatformTable( - name=table.split(".")[2], - full_name=table, - datasource_server=tree_function.strip_char_from_list( - [data_access_tokens[2]] - )[0], - data_platform_pair=self.get_platform_pair(), - ) + if self.config.enable_advance_lineage_sql_construct is False: + # Use previous parser to generate URN to keep backward compatibility + return self.create_urn_using_old_parser( + query=sql_query, + server=server, ) - return dataplatform_tables + return self.parse_custom_sql( + query=sql_query, + server=server, + database=None, # database and schema is available inside custom sql as per PowerBI Behavior + schema=None, + ) class FunctionName(Enum): diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index 919cb83e4d832..5d477ee090e7e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -28,7 +28,6 @@ ) from datahub.ingestion.source.powerbi.config import ( Constant, - PlatformDetail, PowerBiDashboardSourceConfig, PowerBiDashboardSourceReport, ) @@ -96,10 +95,12 @@ def __hash__(self): def __init__( self, + ctx: PipelineContext, config: PowerBiDashboardSourceConfig, reporter: PowerBiDashboardSourceReport, dataplatform_instance_resolver: AbstractDataPlatformInstanceResolver, ): + self.__ctx = ctx self.__config = config self.__reporter = reporter self.__dataplatform_instance_resolver = dataplatform_instance_resolver @@ -172,43 +173,40 @@ def extract_lineage( # table.dataset should always be set, but we check it just in case. parameters = table.dataset.parameters if table.dataset else {} - upstreams: List[UpstreamClass] = [] - upstream_tables: List[resolver.DataPlatformTable] = parser.get_upstream_tables( - table, self.__reporter, parameters=parameters + upstream: List[UpstreamClass] = [] + + upstream_dpts: List[resolver.DataPlatformTable] = parser.get_upstream_tables( + table=table, + reporter=self.__reporter, + platform_instance_resolver=self.__dataplatform_instance_resolver, + ctx=self.__ctx, + config=self.__config, + parameters=parameters, ) + logger.debug( - f"PowerBI virtual table {table.full_name} and it's upstream dataplatform tables = {upstream_tables}" + f"PowerBI virtual table {table.full_name} and it's upstream dataplatform tables = {upstream_dpts}" ) - for upstream_table in upstream_tables: + + for upstream_dpt in upstream_dpts: if ( - upstream_table.data_platform_pair.powerbi_data_platform_name + upstream_dpt.data_platform_pair.powerbi_data_platform_name not in self.__config.dataset_type_mapping.keys() ): logger.debug( - f"Skipping upstream table for {ds_urn}. The platform {upstream_table.data_platform_pair.powerbi_data_platform_name} is not part of dataset_type_mapping", + f"Skipping upstream table for {ds_urn}. 
The platform {upstream_dpt.data_platform_pair.powerbi_data_platform_name} is not part of dataset_type_mapping", ) continue - platform_detail: PlatformDetail = ( - self.__dataplatform_instance_resolver.get_platform_instance( - upstream_table - ) - ) - upstream_urn = builder.make_dataset_urn_with_platform_instance( - platform=upstream_table.data_platform_pair.datahub_data_platform_name, - platform_instance=platform_detail.platform_instance, - env=platform_detail.env, - name=self.lineage_urn_to_lowercase(upstream_table.full_name), - ) - upstream_table_class = UpstreamClass( - upstream_urn, + upstream_dpt.urn, DatasetLineageTypeClass.TRANSFORMED, ) - upstreams.append(upstream_table_class) - if len(upstreams) > 0: - upstream_lineage = UpstreamLineageClass(upstreams=upstreams) + upstream.append(upstream_table_class) + + if len(upstream) > 0: + upstream_lineage = UpstreamLineageClass(upstreams=upstream) logger.debug(f"Dataset urn = {ds_urn} and its lineage = {upstream_lineage}") mcp = MetadataChangeProposalWrapper( entityType=Constant.DATASET, @@ -1107,7 +1105,9 @@ def __init__(self, config: PowerBiDashboardSourceConfig, ctx: PipelineContext): ) # Exit pipeline as we are not able to connect to PowerBI API Service. This exit will avoid raising # unwanted stacktrace on console - self.mapper = Mapper(config, self.reporter, self.dataplatform_instance_resolver) + self.mapper = Mapper( + ctx, config, self.reporter, self.dataplatform_instance_resolver + ) # Create and register the stateful ingestion use-case handler. self.stale_entity_removal_handler = StaleEntityRemovalHandler.create( diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py index 2d2d9f527788f..0d41ab00c66f5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py @@ -1,3 +1,4 @@ +import dataclasses from dataclasses import dataclass from enum import Enum from typing import Any, Dict, List, Optional, Union @@ -105,7 +106,7 @@ class Measure: dataType: str = "measure" datahubDataType: Union[ BooleanTypeClass, DateTypeClass, NullTypeClass, NumberTypeClass, StringTypeClass - ] = NullTypeClass() + ] = dataclasses.field(default_factory=NullTypeClass) description: Optional[str] = None diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py index cf4e3a5b0135a..c8623798f6937 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py @@ -2,6 +2,7 @@ import traceback from collections import defaultdict from dataclasses import dataclass, field +from datetime import datetime from enum import Enum from typing import Dict, List, Optional, Set, Tuple, Union from urllib.parse import urlparse @@ -24,6 +25,9 @@ RedshiftView, ) from datahub.ingestion.source.redshift.report import RedshiftReport +from datahub.ingestion.source.state.redundant_run_skip_handler import ( + RedundantLineageRunSkipHandler, +) from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage from datahub.metadata.schema_classes import ( DatasetLineageTypeClass, @@ -79,11 +83,27 @@ def __init__( self, config: RedshiftConfig, report: RedshiftReport, + redundant_run_skip_handler: 
Optional[RedundantLineageRunSkipHandler] = None, ): self.config = config self.report = report self._lineage_map: Dict[str, LineageItem] = defaultdict() + self.redundant_run_skip_handler = redundant_run_skip_handler + self.start_time, self.end_time = ( + self.report.lineage_start_time, + self.report.lineage_end_time, + ) = self.get_time_window() + + def get_time_window(self) -> Tuple[datetime, datetime]: + if self.redundant_run_skip_handler: + self.report.stateful_lineage_ingestion_enabled = True + return self.redundant_run_skip_handler.suggest_run_time_window( + self.config.start_time, self.config.end_time + ) + else: + return self.config.start_time, self.config.end_time + def warn(self, log: logging.Logger, key: str, reason: str) -> None: self.report.report_warning(key, reason) log.warning(f"{key} => {reason}") @@ -263,6 +283,7 @@ def _populate_lineage_map( f"extract-{lineage_type.name}", f"Error was {e}, {traceback.format_exc()}", ) + self.report_status(f"extract-{lineage_type.name}", False) def _get_target_lineage( self, @@ -343,31 +364,33 @@ def populate_lineage( if self.config.table_lineage_mode == LineageMode.STL_SCAN_BASED: # Populate table level lineage by getting upstream tables from stl_scan redshift table query = RedshiftQuery.stl_scan_based_lineage_query( - self.config.database, self.config.start_time, self.config.end_time + self.config.database, + self.config.start_time, + self.config.end_time, ) populate_calls.append((query, LineageCollectorType.QUERY_SCAN)) elif self.config.table_lineage_mode == LineageMode.SQL_BASED: # Populate table level lineage by parsing table creating sqls query = RedshiftQuery.list_insert_create_queries_sql( db_name=database, - start_time=self.config.start_time, - end_time=self.config.end_time, + start_time=self.start_time, + end_time=self.end_time, ) populate_calls.append((query, LineageCollectorType.QUERY_SQL_PARSER)) elif self.config.table_lineage_mode == LineageMode.MIXED: # Populate table level lineage by parsing table creating sqls query = RedshiftQuery.list_insert_create_queries_sql( db_name=database, - start_time=self.config.start_time, - end_time=self.config.end_time, + start_time=self.start_time, + end_time=self.end_time, ) populate_calls.append((query, LineageCollectorType.QUERY_SQL_PARSER)) # Populate table level lineage by getting upstream tables from stl_scan redshift table query = RedshiftQuery.stl_scan_based_lineage_query( db_name=database, - start_time=self.config.start_time, - end_time=self.config.end_time, + start_time=self.start_time, + end_time=self.end_time, ) populate_calls.append((query, LineageCollectorType.QUERY_SCAN)) @@ -383,16 +406,16 @@ def populate_lineage( if self.config.include_copy_lineage: query = RedshiftQuery.list_copy_commands_sql( db_name=database, - start_time=self.config.start_time, - end_time=self.config.end_time, + start_time=self.start_time, + end_time=self.end_time, ) populate_calls.append((query, LineageCollectorType.COPY)) if self.config.include_unload_lineage: query = RedshiftQuery.list_unload_commands_sql( db_name=database, - start_time=self.config.start_time, - end_time=self.config.end_time, + start_time=self.start_time, + end_time=self.end_time, ) populate_calls.append((query, LineageCollectorType.UNLOAD)) @@ -467,3 +490,7 @@ def get_lineage( return None return UpstreamLineage(upstreams=upstream_lineage), {} + + def report_status(self, step: str, status: bool) -> None: + if self.redundant_run_skip_handler: + self.redundant_run_skip_handler.report_current_run_status(step, status) diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py index 29f0808a6ca7d..e8a8ff976afa6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py @@ -63,7 +63,8 @@ ) from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler from datahub.ingestion.source.state.redundant_run_skip_handler import ( - RedundantRunSkipHandler, + RedundantLineageRunSkipHandler, + RedundantUsageRunSkipHandler, ) from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, @@ -71,6 +72,11 @@ from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulIngestionSourceBase, ) +from datahub.ingestion.source_report.ingestion_stage import ( + LINEAGE_EXTRACTION, + METADATA_EXTRACTION, + PROFILING, +) from datahub.metadata.com.linkedin.pegasus2avro.common import SubTypes, TimeStamp from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( DatasetProperties, @@ -95,7 +101,6 @@ from datahub.utilities.mapping import Constants from datahub.utilities.perf_timer import PerfTimer from datahub.utilities.registries.domain_registry import DomainRegistry -from datahub.utilities.time import datetime_to_ts_millis logger: logging.Logger = logging.getLogger(__name__) @@ -297,15 +302,19 @@ def __init__(self, config: RedshiftConfig, ctx: PipelineContext): cached_domains=list(self.config.domain.keys()), graph=self.ctx.graph ) - self.redundant_run_skip_handler = RedundantRunSkipHandler( - source=self, - config=self.config, - pipeline_name=self.ctx.pipeline_name, - run_id=self.ctx.run_id, - ) + self.redundant_lineage_run_skip_handler: Optional[ + RedundantLineageRunSkipHandler + ] = None + if self.config.enable_stateful_lineage_ingestion: + self.redundant_lineage_run_skip_handler = RedundantLineageRunSkipHandler( + source=self, + config=self.config, + pipeline_name=self.ctx.pipeline_name, + run_id=self.ctx.run_id, + ) self.profiling_state_handler: Optional[ProfilingHandler] = None - if self.config.store_last_profiling_timestamps: + if self.config.enable_stateful_profiling: self.profiling_state_handler = ProfilingHandler( source=self, config=self.config, @@ -317,6 +326,8 @@ def __init__(self, config: RedshiftConfig, ctx: PipelineContext): self.db_views: Dict[str, Dict[str, List[RedshiftView]]] = {} self.db_schemas: Dict[str, Dict[str, RedshiftSchema]] = {} + self.add_config_to_report() + @classmethod def create(cls, config_dict, ctx): config = RedshiftConfig.parse_obj(config_dict) @@ -367,7 +378,7 @@ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit connection = RedshiftSource.get_redshift_connection(self.config) database = get_db_name(self.config) logger.info(f"Processing db {self.config.database} with name {database}") - # self.add_config_to_report() + self.report.report_ingestion_stage_start(METADATA_EXTRACTION) self.db_tables[database] = defaultdict() self.db_views[database] = defaultdict() self.db_schemas.setdefault(database, {}) @@ -388,17 +399,8 @@ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit all_tables = self.get_all_tables() - if ( - self.config.store_last_lineage_extraction_timestamp - or self.config.store_last_usage_extraction_timestamp - ): - # Update the checkpoint state for this run. 
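To make the stateful-ingestion change above easier to follow, here is a minimal, self-contained sketch of the redundant-run-skip flow the new Redshift lineage extractor uses: ask the handler for a (possibly narrowed) time window, bail out if the window was already covered, and persist the processed window afterwards. FakeSkipHandler is a stand-in written only for this sketch; just the call shapes (suggest_run_time_window, should_skip_this_run, update_state) mirror the handler calls visible in the diff.

from datetime import datetime, timedelta, timezone
from typing import Optional, Tuple


class FakeSkipHandler:
    """Stand-in for RedundantLineageRunSkipHandler; only the call surface used above is sketched."""

    def suggest_run_time_window(self, start: datetime, end: datetime) -> Tuple[datetime, datetime]:
        # A real handler may narrow the window based on the last successful run.
        return start, end

    def should_skip_this_run(self, cur_start_time: datetime, cur_end_time: datetime) -> bool:
        # A real handler compares the requested window against stored checkpoint state.
        return False

    def update_state(self, start: datetime, end: datetime) -> None:
        # A real handler persists the processed window as checkpoint state.
        pass


def extract_lineage(handler: Optional[FakeSkipHandler], cfg_start: datetime, cfg_end: datetime) -> None:
    if handler and handler.should_skip_this_run(cur_start_time=cfg_start, cur_end_time=cfg_end):
        return  # window already covered by a previous run
    start, end = handler.suggest_run_time_window(cfg_start, cfg_end) if handler else (cfg_start, cfg_end)
    print(f"extracting lineage for window {start} .. {end}")
    if handler:
        handler.update_state(cfg_start, cfg_end)  # checkpoint the configured window


extract_lineage(FakeSkipHandler(), datetime.now(timezone.utc) - timedelta(days=1), datetime.now(timezone.utc))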
- self.redundant_run_skip_handler.update_state( - start_time_millis=datetime_to_ts_millis(self.config.start_time), - end_time_millis=datetime_to_ts_millis(self.config.end_time), - ) - if self.config.include_table_lineage or self.config.include_copy_lineage: + self.report.report_ingestion_stage_start(LINEAGE_EXTRACTION) yield from self.extract_lineage( connection=connection, all_tables=all_tables, database=database ) @@ -409,6 +411,7 @@ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit ) if self.config.is_profiling_enabled(): + self.report.report_ingestion_stage_start(PROFILING) profiler = RedshiftProfiler( config=self.config, report=self.report, @@ -841,26 +844,26 @@ def extract_usage( database: str, all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]], ) -> Iterable[MetadataWorkUnit]: - if ( - self.config.store_last_usage_extraction_timestamp - and self.redundant_run_skip_handler.should_skip_this_run( - cur_start_time_millis=datetime_to_ts_millis(self.config.start_time) - ) - ): - # Skip this run - self.report.report_warning( - "usage-extraction", - f"Skip this run as there was a run later than the current start time: {self.config.start_time}", - ) - return - with PerfTimer() as timer: - yield from RedshiftUsageExtractor( + redundant_usage_run_skip_handler: Optional[ + RedundantUsageRunSkipHandler + ] = None + if self.config.enable_stateful_usage_ingestion: + redundant_usage_run_skip_handler = RedundantUsageRunSkipHandler( + source=self, + config=self.config, + pipeline_name=self.ctx.pipeline_name, + run_id=self.ctx.run_id, + ) + usage_extractor = RedshiftUsageExtractor( config=self.config, connection=connection, report=self.report, dataset_urn_builder=self.gen_dataset_urn, - ).get_usage_workunits(all_tables=all_tables) + redundant_run_skip_handler=redundant_usage_run_skip_handler, + ) + + yield from usage_extractor.get_usage_workunits(all_tables=all_tables) self.report.usage_extraction_sec[database] = round( timer.elapsed_seconds(), 2 @@ -872,22 +875,13 @@ def extract_lineage( database: str, all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]], ) -> Iterable[MetadataWorkUnit]: - if ( - self.config.store_last_lineage_extraction_timestamp - and self.redundant_run_skip_handler.should_skip_this_run( - cur_start_time_millis=datetime_to_ts_millis(self.config.start_time) - ) - ): - # Skip this run - self.report.report_warning( - "lineage-extraction", - f"Skip this run as there was a run later than the current start time: {self.config.start_time}", - ) + if not self._should_ingest_lineage(): return self.lineage_extractor = RedshiftLineageExtractor( config=self.config, report=self.report, + redundant_run_skip_handler=self.redundant_lineage_run_skip_handler, ) with PerfTimer() as timer: @@ -900,6 +894,29 @@ def extract_lineage( ) yield from self.generate_lineage(database) + if self.redundant_lineage_run_skip_handler: + # Update the checkpoint state for this run. 
+ self.redundant_lineage_run_skip_handler.update_state( + self.config.start_time, self.config.end_time + ) + + def _should_ingest_lineage(self) -> bool: + if ( + self.redundant_lineage_run_skip_handler + and self.redundant_lineage_run_skip_handler.should_skip_this_run( + cur_start_time=self.config.start_time, + cur_end_time=self.config.end_time, + ) + ): + # Skip this run + self.report.report_warning( + "lineage-extraction", + "Skip this run as there was already a run for current ingestion window.", + ) + return False + + return True + def generate_lineage(self, database: str) -> Iterable[MetadataWorkUnit]: assert self.lineage_extractor @@ -940,3 +957,15 @@ def generate_lineage(self, database: str) -> Iterable[MetadataWorkUnit]: yield from gen_lineage( dataset_urn, lineage_info, self.config.incremental_lineage ) + + def add_config_to_report(self): + self.report.stateful_lineage_ingestion_enabled = ( + self.config.enable_stateful_lineage_ingestion + ) + self.report.stateful_usage_ingestion_enabled = ( + self.config.enable_stateful_usage_ingestion + ) + self.report.window_start_time, self.report.window_end_time = ( + self.config.start_time, + self.config.end_time, + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/report.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/report.py index 319a731a14cef..b845580f35939 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/report.py @@ -1,13 +1,16 @@ from dataclasses import dataclass, field +from datetime import datetime from typing import Dict, Optional from datahub.ingestion.source.sql.sql_generic_profiler import ProfilingSqlReport +from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport +from datahub.ingestion.source_report.time_window import BaseTimeWindowReport from datahub.utilities.lossy_collections import LossyDict from datahub.utilities.stats_collections import TopKDict @dataclass -class RedshiftReport(ProfilingSqlReport): +class RedshiftReport(ProfilingSqlReport, IngestionStageReport, BaseTimeWindowReport): num_usage_workunits_emitted: Optional[int] = None num_operational_stats_workunits_emitted: Optional[int] = None upstream_lineage: LossyDict = field(default_factory=LossyDict) @@ -32,5 +35,13 @@ class RedshiftReport(ProfilingSqlReport): num_lineage_dropped_query_parser: int = 0 num_lineage_dropped_not_support_copy_path: int = 0 + lineage_start_time: Optional[datetime] = None + lineage_end_time: Optional[datetime] = None + stateful_lineage_ingestion_enabled: bool = False + + usage_start_time: Optional[datetime] = None + usage_end_time: Optional[datetime] = None + stateful_usage_ingestion_enabled: bool = False + def report_dropped(self, key: str) -> None: self.filtered.append(key) diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py index 653b41d690e48..bbb1876102578 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py @@ -2,7 +2,7 @@ import logging import time from datetime import datetime -from typing import Callable, Dict, Iterable, List, Optional, Union +from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union import pydantic.error_wrappers import redshift_connector @@ -10,7 +10,10 @@ from pydantic.main import BaseModel import datahub.emitter.mce_builder as builder -from 
datahub.configuration.time_window_config import get_time_bucket +from datahub.configuration.time_window_config import ( + BaseTimeWindowConfig, + get_time_bucket, +) from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.source_helpers import auto_empty_dataset_usage_statistics from datahub.ingestion.api.workunit import MetadataWorkUnit @@ -20,7 +23,14 @@ RedshiftView, ) from datahub.ingestion.source.redshift.report import RedshiftReport +from datahub.ingestion.source.state.redundant_run_skip_handler import ( + RedundantUsageRunSkipHandler, +) from datahub.ingestion.source.usage.usage_common import GenericAggregatedDataset +from datahub.ingestion.source_report.ingestion_stage import ( + USAGE_EXTRACTION_OPERATIONAL_STATS, + USAGE_EXTRACTION_USAGE_AGGREGATION, +) from datahub.metadata.schema_classes import OperationClass, OperationTypeClass from datahub.utilities.perf_timer import PerfTimer @@ -170,18 +180,56 @@ def __init__( connection: redshift_connector.Connection, report: RedshiftReport, dataset_urn_builder: Callable[[str], str], + redundant_run_skip_handler: Optional[RedundantUsageRunSkipHandler] = None, ): self.config = config self.report = report self.connection = connection self.dataset_urn_builder = dataset_urn_builder + self.redundant_run_skip_handler = redundant_run_skip_handler + self.start_time, self.end_time = ( + self.report.usage_start_time, + self.report.usage_end_time, + ) = self.get_time_window() + + def get_time_window(self) -> Tuple[datetime, datetime]: + if self.redundant_run_skip_handler: + return self.redundant_run_skip_handler.suggest_run_time_window( + self.config.start_time, self.config.end_time + ) + else: + return self.config.start_time, self.config.end_time + + def _should_ingest_usage(self): + if ( + self.redundant_run_skip_handler + and self.redundant_run_skip_handler.should_skip_this_run( + cur_start_time=self.config.start_time, + cur_end_time=self.config.end_time, + ) + ): + # Skip this run + self.report.report_warning( + "usage-extraction", + "Skip this run as there was already a run for current ingestion window.", + ) + return False + + return True + def get_usage_workunits( self, all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]] ) -> Iterable[MetadataWorkUnit]: + if not self._should_ingest_usage(): + return yield from auto_empty_dataset_usage_statistics( self._get_workunits_internal(all_tables), - config=self.config, + config=BaseTimeWindowConfig( + start_time=self.start_time, + end_time=self.end_time, + bucket_duration=self.config.bucket_duration, + ), dataset_urns={ self.dataset_urn_builder(f"{database}.{schema}.{table.name}") for database in all_tables @@ -190,6 +238,14 @@ def get_usage_workunits( }, ) + if self.redundant_run_skip_handler: + # Update the checkpoint state for this run. 
+ self.redundant_run_skip_handler.update_state( + self.config.start_time, + self.config.end_time, + self.config.bucket_duration, + ) + def _get_workunits_internal( self, all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]] ) -> Iterable[MetadataWorkUnit]: @@ -198,6 +254,7 @@ def _get_workunits_internal( self.report.num_operational_stats_skipped = 0 if self.config.include_operational_stats: + self.report.report_ingestion_stage_start(USAGE_EXTRACTION_OPERATIONAL_STATS) with PerfTimer() as timer: # Generate operation aspect workunits yield from self._gen_operation_aspect_workunits( @@ -208,9 +265,10 @@ def _get_workunits_internal( ] = round(timer.elapsed_seconds(), 2) # Generate aggregate events + self.report.report_ingestion_stage_start(USAGE_EXTRACTION_USAGE_AGGREGATION) query: str = REDSHIFT_USAGE_QUERY_TEMPLATE.format( - start_time=self.config.start_time.strftime(REDSHIFT_DATETIME_FORMAT), - end_time=self.config.end_time.strftime(REDSHIFT_DATETIME_FORMAT), + start_time=self.start_time.strftime(REDSHIFT_DATETIME_FORMAT), + end_time=self.end_time.strftime(REDSHIFT_DATETIME_FORMAT), database=self.config.database, ) access_events_iterable: Iterable[ @@ -236,8 +294,8 @@ def _gen_operation_aspect_workunits( ) -> Iterable[MetadataWorkUnit]: # Generate access events query: str = REDSHIFT_OPERATION_ASPECT_QUERY_TEMPLATE.format( - start_time=self.config.start_time.strftime(REDSHIFT_DATETIME_FORMAT), - end_time=self.config.end_time.strftime(REDSHIFT_DATETIME_FORMAT), + start_time=self.start_time.strftime(REDSHIFT_DATETIME_FORMAT), + end_time=self.end_time.strftime(REDSHIFT_DATETIME_FORMAT), ) access_events_iterable: Iterable[ RedshiftAccessEvent @@ -391,4 +449,9 @@ def _make_usage_stat(self, agg: AggregatedDataset) -> MetadataWorkUnit: self.config.top_n_queries, self.config.format_sql_queries, self.config.include_top_n_queries, + self.config.queries_character_limit, ) + + def report_status(self, step: str, status: bool) -> None: + if self.redundant_run_skip_handler: + self.redundant_run_skip_handler.report_current_run_status(step, status) diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/config.py b/metadata-ingestion/src/datahub/ingestion/source/s3/config.py index a989dc2f2fcbe..f1dd622efb746 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/config.py @@ -66,6 +66,11 @@ class DataLakeSourceConfig( default="4g", description="Max amount of memory to grant Spark." ) + spark_config: Dict[str, Any] = Field( + description='Spark configuration properties to set on the SparkSession. Put config property names into quotes. 
For example: \'"spark.executor.memory": "2g"\'', + default={}, + ) + max_rows: int = Field( default=100, description="Maximum number of rows to use when inferring schemas for TSV and CSV files.", diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py index 61f9f88c3fb05..ab5d3a4e007ac 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py @@ -37,6 +37,7 @@ from datahub.emitter.mce_builder import ( make_data_platform_urn, + make_dataplatform_instance_urn, make_dataset_urn_with_platform_instance, ) from datahub.emitter.mcp import MetadataChangeProposalWrapper @@ -81,6 +82,7 @@ TimeTypeClass, ) from datahub.metadata.schema_classes import ( + DataPlatformInstanceClass, DatasetPropertiesClass, MapTypeClass, OperationClass, @@ -259,13 +261,14 @@ def init_spark(self): import pydeequ conf = SparkConf() - + spark_version = os.getenv("SPARK_VERSION", "3.3") conf.set( "spark.jars.packages", ",".join( [ "org.apache.hadoop:hadoop-aws:3.0.3", - "org.apache.spark:spark-avro_2.12:3.0.3", + # Spark's avro version needs to be matched with the Spark version + f"org.apache.spark:spark-avro_2.12:{spark_version}{'.0' if spark_version.count('.') == 1 else ''}", pydeequ.deequ_maven_coord, ] ), @@ -329,6 +332,9 @@ def init_spark(self): conf.set("spark.jars.excludes", pydeequ.f2j_maven_coord) conf.set("spark.driver.memory", self.source_config.spark_driver_memory) + if self.source_config.spark_config: + for key, value in self.source_config.spark_config.items(): + conf.set(key, value) self.spark = SparkSession.builder.config(conf=conf).getOrCreate() @classmethod @@ -369,10 +375,10 @@ def read_file_spark(self, file: str, ext: str) -> Optional[DataFrame]: elif ext.endswith(".avro"): try: df = self.spark.read.format("avro").load(file) - except AnalysisException: + except AnalysisException as e: self.report.report_warning( file, - "To ingest avro files, please install the spark-avro package: https://mvnrepository.com/artifact/org.apache.spark/spark-avro_2.12/3.0.3", + f"Avro file reading failed with exception. 
The error was: {e}", ) return None @@ -559,6 +565,15 @@ def ingest_table( self.source_config.env, ) + if self.source_config.platform_instance: + data_platform_instance = DataPlatformInstanceClass( + platform=data_platform_urn, + instance=make_dataplatform_instance_urn( + self.source_config.platform, self.source_config.platform_instance + ), + ) + aspects.append(data_platform_instance) + customProperties = {"schema_inferred_from": str(table_data.full_path)} if not path_spec.sample_files: diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index e8e80e172a9ce..af99faf6e6396 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -1,10 +1,12 @@ import logging +from collections import defaultdict +from dataclasses import dataclass from enum import Enum -from typing import Dict, List, Optional, cast +from typing import Dict, List, Optional, Set, cast from pydantic import Field, SecretStr, root_validator, validator -from datahub.configuration.common import AllowDenyPattern +from datahub.configuration.common import AllowDenyPattern, ConfigModel from datahub.configuration.pattern_utils import UUID_REGEX from datahub.configuration.validate_field_removal import pydantic_removed_field from datahub.configuration.validate_field_rename import pydantic_renamed_field @@ -12,6 +14,7 @@ ClassificationSourceConfigMixin, ) from datahub.ingestion.source.state.stateful_ingestion_base import ( + StatefulLineageConfigMixin, StatefulProfilingConfigMixin, StatefulUsageConfigMixin, ) @@ -42,9 +45,35 @@ class TagOption(str, Enum): skip = "skip" +@dataclass(frozen=True) +class DatabaseId: + database: str = Field( + description="Database created from share in consumer account." + ) + platform_instance: str = Field( + description="Platform instance of consumer snowflake account." + ) + + +class SnowflakeShareConfig(ConfigModel): + database: str = Field(description="Database from which share is created.") + platform_instance: str = Field( + description="Platform instance for snowflake account in which share is created." + ) + + consumers: Set[DatabaseId] = Field( + description="List of databases created in consumer accounts." + ) + + @property + def source_database(self) -> DatabaseId: + return DatabaseId(self.database, self.platform_instance) + + class SnowflakeV2Config( SnowflakeConfig, SnowflakeUsageConfig, + StatefulLineageConfigMixin, StatefulUsageConfigMixin, StatefulProfilingConfigMixin, ClassificationSourceConfigMixin, @@ -91,13 +120,8 @@ class SnowflakeV2Config( description="Whether `schema_pattern` is matched against fully qualified schema name `.`.", ) - use_legacy_lineage_method: bool = Field( - default=False, - description=( - "Whether to use the legacy lineage computation method. " - "By default, uses new optimised lineage extraction method that requires less ingestion process memory. " - "Table-to-view and view-to-view column-level lineage are not supported with the legacy method." - ), + _use_legacy_lineage_method_removed = pydantic_removed_field( + "use_legacy_lineage_method" ) validate_upstreams_against_patterns: bool = Field( @@ -113,13 +137,20 @@ class SnowflakeV2Config( # This is required since access_history table does not capture whether the table was temporary table. 
temporary_tables_pattern: List[str] = Field( default=DEFAULT_TABLES_DENY_LIST, - description="[Advanced] Regex patterns for temporary tables to filter in lineage ingestion. Specify regex to match the entire table name in database.schema.table format. Defaults are to set in such a way to ignore the temporary staging tables created by known ETL tools. Not used if `use_legacy_lineage_method=True`", + description="[Advanced] Regex patterns for temporary tables to filter in lineage ingestion. Specify regex to match the entire table name in database.schema.table format. Defaults are to set in such a way to ignore the temporary staging tables created by known ETL tools.", ) rename_upstreams_deny_pattern_to_temporary_table_pattern = pydantic_renamed_field( "upstreams_deny_pattern", "temporary_tables_pattern" ) + shares: Optional[Dict[str, SnowflakeShareConfig]] = Field( + default=None, + description="Required if current account owns or consumes snowflake share." + " If specified, connector creates lineage and siblings relationship between current account's database tables and consumer/producer account's database tables." + " Map of share name -> details of share.", + ) + email_as_user_identifier: bool = Field( default=True, description="Format user urns as an email, if the snowflake user's email is set. If `email_domain` is provided, generates email addresses for snowflake users with unset emails, based on their username.", @@ -197,3 +228,77 @@ def get_sql_alchemy_url( @property def parse_view_ddl(self) -> bool: return self.include_view_column_lineage + + @validator("shares") + def validate_shares( + cls, shares: Optional[Dict[str, SnowflakeShareConfig]], values: Dict + ) -> Optional[Dict[str, SnowflakeShareConfig]]: + current_platform_instance = values.get("platform_instance") + + if shares: + # Check: platform_instance should be present + assert current_platform_instance is not None, ( + "Did you forget to set `platform_instance` for current ingestion ? " + "It is required to use `platform_instance` when ingesting from multiple snowflake accounts." + ) + + databases_included_in_share: List[DatabaseId] = [] + databases_created_from_share: List[DatabaseId] = [] + + for share_details in shares.values(): + shared_db = DatabaseId( + share_details.database, share_details.platform_instance + ) + assert all( + consumer.platform_instance != share_details.platform_instance + for consumer in share_details.consumers + ), "Share's platform_instance can not be same as consumer's platform instance. Self-sharing not supported in Snowflake." + + databases_included_in_share.append(shared_db) + databases_created_from_share.extend(share_details.consumers) + + for db_from_share in databases_created_from_share: + assert ( + db_from_share not in databases_included_in_share + ), "Database included in a share can not be present as consumer in any share." + assert ( + databases_created_from_share.count(db_from_share) == 1 + ), "Same database can not be present as consumer in more than one share." + + return shares + + def outbounds(self) -> Dict[str, Set[DatabaseId]]: + """ + Returns mapping of + database included in current account's outbound share -> all databases created from this share in other accounts + """ + outbounds: Dict[str, Set[DatabaseId]] = defaultdict(set) + if self.shares: + for share_name, share_details in self.shares.items(): + if share_details.platform_instance == self.platform_instance: + logger.debug( + f"database {share_details.database} is included in outbound share(s) {share_name}." 
+ ) + outbounds[share_details.database].update(share_details.consumers) + return outbounds + + def inbounds(self) -> Dict[str, DatabaseId]: + """ + Returns mapping of + database created from an current account's inbound share -> other-account database from which this share was created + """ + inbounds: Dict[str, DatabaseId] = {} + if self.shares: + for share_name, share_details in self.shares.items(): + for consumer in share_details.consumers: + if consumer.platform_instance == self.platform_instance: + logger.debug( + f"database {consumer.database} is created from inbound share {share_name}." + ) + inbounds[consumer.database] = share_details.source_database + break + else: + logger.info( + f"Skipping Share {share_name}, as it does not include current platform instance {self.platform_instance}", + ) + return inbounds diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_legacy.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_legacy.py deleted file mode 100644 index 832a072c619f8..0000000000000 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_legacy.py +++ /dev/null @@ -1,664 +0,0 @@ -import json -import logging -from collections import defaultdict -from dataclasses import dataclass, field -from typing import Any, Callable, Dict, FrozenSet, Iterable, List, Optional, Set - -from pydantic import Field -from pydantic.error_wrappers import ValidationError -from snowflake.connector import SnowflakeConnection - -import datahub.emitter.mce_builder as builder -from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.aws.s3_util import make_s3_urn -from datahub.ingestion.source.snowflake.constants import ( - LINEAGE_PERMISSION_ERROR, - SnowflakeEdition, - SnowflakeObjectDomain, -) -from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config -from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery -from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report -from datahub.ingestion.source.snowflake.snowflake_usage_v2 import ( - SnowflakeColumnReference, -) -from datahub.ingestion.source.snowflake.snowflake_utils import ( - SnowflakeCommonMixin, - SnowflakeConnectionMixin, - SnowflakePermissionError, - SnowflakeQueryMixin, -) -from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( - FineGrainedLineage, - FineGrainedLineageDownstreamType, - FineGrainedLineageUpstreamType, - UpstreamLineage, -) -from datahub.metadata.schema_classes import DatasetLineageTypeClass, UpstreamClass -from datahub.utilities.perf_timer import PerfTimer - -logger: logging.Logger = logging.getLogger(__name__) - - -class SnowflakeColumnWithLineage(SnowflakeColumnReference): - class Config: - # This is for backward compatibility and can be removed later - allow_population_by_field_name = True - - directSourceColumns: Optional[List[SnowflakeColumnReference]] = Field( - default=None, alias="directSources" - ) - - -@dataclass(frozen=True) -class SnowflakeColumnId: - columnName: str - objectName: str - objectDomain: Optional[str] = None - - -@dataclass(frozen=True) -class SnowflakeColumnFineGrainedLineage: - """ - Fie grained upstream of column, - which represents a transformation applied on input columns""" - - inputColumns: FrozenSet[SnowflakeColumnId] - # Transform function, query etc can be added here - - -@dataclass -class SnowflakeColumnUpstreams: - """All 
upstreams of a column""" - - upstreams: Set[SnowflakeColumnFineGrainedLineage] = field( - default_factory=set, init=False - ) - - def update_column_lineage( - self, directSourceColumns: List[SnowflakeColumnReference] - ) -> None: - input_columns = frozenset( - [ - SnowflakeColumnId( - upstream_col.columnName, - upstream_col.objectName, - upstream_col.objectDomain, - ) - for upstream_col in directSourceColumns - if upstream_col.objectName - ] - ) - if not input_columns: - return - upstream = SnowflakeColumnFineGrainedLineage(inputColumns=input_columns) - if upstream not in self.upstreams: - self.upstreams.add(upstream) - - -@dataclass -class SnowflakeUpstreamTable: - upstreamDataset: str - upstreamColumns: List[SnowflakeColumnReference] - downstreamColumns: List[SnowflakeColumnWithLineage] - - @classmethod - def from_dict( - cls, - dataset: str, - upstreams_columns_json: Optional[str], - downstream_columns_json: Optional[str], - ) -> "SnowflakeUpstreamTable": - try: - upstreams_columns_list = [] - downstream_columns_list = [] - if upstreams_columns_json is not None: - upstreams_columns_list = json.loads(upstreams_columns_json) - if downstream_columns_json is not None: - downstream_columns_list = json.loads(downstream_columns_json) - - table_with_upstreams = cls( - dataset, - [ - SnowflakeColumnReference.parse_obj(col) - for col in upstreams_columns_list - ], - [ - SnowflakeColumnWithLineage.parse_obj(col) - for col in downstream_columns_list - ], - ) - except ValidationError: - # Earlier versions of column lineage did not include columnName, only columnId - table_with_upstreams = cls(dataset, [], []) - return table_with_upstreams - - -@dataclass -class SnowflakeTableLineage: - # key: upstream table name - upstreamTables: Dict[str, SnowflakeUpstreamTable] = field( - default_factory=dict, init=False - ) - - # key: downstream column name - columnLineages: Dict[str, SnowflakeColumnUpstreams] = field( - default_factory=lambda: defaultdict(SnowflakeColumnUpstreams), init=False - ) - - def update_lineage( - self, table: SnowflakeUpstreamTable, include_column_lineage: bool = True - ) -> None: - if table.upstreamDataset not in self.upstreamTables.keys(): - self.upstreamTables[table.upstreamDataset] = table - - if include_column_lineage and table.downstreamColumns: - for col in table.downstreamColumns: - if col.directSourceColumns: - self.columnLineages[col.columnName].update_column_lineage( - col.directSourceColumns - ) - - -class SnowflakeLineageExtractor( - SnowflakeQueryMixin, SnowflakeConnectionMixin, SnowflakeCommonMixin -): - """ - Extracts Lineage from Snowflake. - Following lineage edges are considered. - - 1. "Table to View" lineage via `snowflake.account_usage.object_dependencies` view - 2. "S3 to Table" lineage via `show external tables` query. - 3. "View to Table" lineage via `snowflake.account_usage.access_history` view (requires Snowflake Enterprise Edition or above) - 4. "Table to Table" lineage via `snowflake.account_usage.access_history` view (requires Snowflake Enterprise Edition or above) - 5. "S3 to Table" lineage via `snowflake.account_usage.access_history` view (requires Snowflake Enterprise Edition or above) - - Edition Note - Snowflake Standard Edition does not have Access History Feature. So it does not support lineage extraction for edges 3, 4, 5 mentioned above. 
- """ - - def __init__( - self, - config: SnowflakeV2Config, - report: SnowflakeV2Report, - dataset_urn_builder: Callable[[str], str], - ) -> None: - self._lineage_map: Dict[str, SnowflakeTableLineage] = defaultdict( - SnowflakeTableLineage - ) - self._external_lineage_map: Dict[str, Set[str]] = defaultdict(set) - self.config = config - self.report = report - self.logger = logger - self.dataset_urn_builder = dataset_urn_builder - self.connection: Optional[SnowflakeConnection] = None - - # Kwargs used by new snowflake lineage extractor need to be ignored here - def get_workunits( - self, discovered_tables: List[str], discovered_views: List[str], **_kwargs: Any - ) -> Iterable[MetadataWorkUnit]: - self.connection = self.create_connection() - if self.connection is None: - return - - self._populate_table_lineage() - - if self.config.include_view_lineage: - if len(discovered_views) > 0: - self._populate_view_lineage() - else: - logger.info("No views found. Skipping View Lineage Extraction.") - - self._populate_external_lineage() - - if ( - len(self._lineage_map.keys()) == 0 - and len(self._external_lineage_map.keys()) == 0 - ): - logger.debug("No lineage found.") - return - - yield from self.get_table_upstream_workunits(discovered_tables) - yield from self.get_view_upstream_workunits(discovered_views) - - def _populate_table_lineage(self): - if self.report.edition == SnowflakeEdition.STANDARD: - logger.info( - "Snowflake Account is Standard Edition. Table to Table Lineage Feature is not supported." - ) # See Edition Note above for why - else: - with PerfTimer() as timer: - self._populate_lineage() - self.report.table_lineage_query_secs = timer.elapsed_seconds() - - def get_table_upstream_workunits(self, discovered_tables): - if self.config.include_table_lineage: - for dataset_name in discovered_tables: - upstream_lineage = self._get_upstream_lineage_info(dataset_name) - if upstream_lineage is not None: - yield MetadataChangeProposalWrapper( - entityUrn=self.dataset_urn_builder(dataset_name), - aspect=upstream_lineage, - ).as_workunit() - - def get_view_upstream_workunits(self, discovered_views): - if self.config.include_view_lineage: - for view_name in discovered_views: - upstream_lineage = self._get_upstream_lineage_info(view_name) - if upstream_lineage is not None: - yield MetadataChangeProposalWrapper( - entityUrn=self.dataset_urn_builder(view_name), - aspect=upstream_lineage, - ).as_workunit() - - def _get_upstream_lineage_info( - self, dataset_name: str - ) -> Optional[UpstreamLineage]: - lineage = self._lineage_map[dataset_name] - external_lineage = self._external_lineage_map[dataset_name] - if not (lineage.upstreamTables or lineage.columnLineages or external_lineage): - logger.debug(f"No lineage found for {dataset_name}") - return None - - upstream_tables: List[UpstreamClass] = [] - finegrained_lineages: List[FineGrainedLineage] = [] - - # Populate the table-lineage in aspect - self.update_upstream_tables_lineage(upstream_tables, lineage) - - # Populate the column-lineage in aspect - self.update_upstream_columns_lineage( - self.dataset_urn_builder(dataset_name), finegrained_lineages, lineage - ) - - # Populate the external-table-lineage(s3->snowflake) in aspect - self.update_external_tables_lineage(upstream_tables, external_lineage) - - if len(upstream_tables) > 0: - logger.debug( - f"Upstream lineage of '{dataset_name}': {[u.dataset for u in upstream_tables]}" - ) - if self.config.upstream_lineage_in_report: - self.report.upstream_lineage[dataset_name] = [ - u.dataset for u in 
upstream_tables - ] - return UpstreamLineage( - upstreams=upstream_tables, - fineGrainedLineages=sorted( - finegrained_lineages, key=lambda x: (x.downstreams, x.upstreams) - ) - or None, - ) - else: - return None - - def _populate_view_lineage(self) -> None: - with PerfTimer() as timer: - self._populate_view_upstream_lineage() - self.report.view_upstream_lineage_query_secs = timer.elapsed_seconds() - - if self.report.edition == SnowflakeEdition.STANDARD: - logger.info( - "Snowflake Account is Standard Edition. View to Table Lineage Feature is not supported." - ) # See Edition Note above for why - else: - with PerfTimer() as timer: - self._populate_view_downstream_lineage() - self.report.view_downstream_lineage_query_secs = timer.elapsed_seconds() - - def _populate_external_lineage(self) -> None: - with PerfTimer() as timer: - self.report.num_external_table_edges_scanned = 0 - - if self.report.edition == SnowflakeEdition.STANDARD: - logger.info( - "Snowflake Account is Standard Edition. External Lineage Feature via Access History is not supported." - ) # See Edition Note above for why - else: - self._populate_external_lineage_from_access_history() - - self._populate_external_lineage_from_show_query() - - logger.info( - f"Found {self.report.num_external_table_edges_scanned} external lineage edges." - ) - - self.report.external_lineage_queries_secs = timer.elapsed_seconds() - - # Handles the case for explicitly created external tables. - # NOTE: Snowflake does not log this information to the access_history table. - def _populate_external_lineage_from_show_query(self): - external_tables_query: str = SnowflakeQuery.show_external_tables() - try: - for db_row in self.query(external_tables_query): - key = self.get_dataset_identifier( - db_row["name"], db_row["schema_name"], db_row["database_name"] - ) - - if not self._is_dataset_pattern_allowed( - key, SnowflakeObjectDomain.TABLE - ): - continue - self._external_lineage_map[key].add(db_row["location"]) - logger.debug( - f"ExternalLineage[Table(Down)={key}]:External(Up)={self._external_lineage_map[key]} via show external tables" - ) - self.report.num_external_table_edges_scanned += 1 - except Exception as e: - logger.debug(e, exc_info=e) - self.report_warning( - "external_lineage", - f"Populating external table lineage from Snowflake failed due to error {e}.", - ) - - # Handles the case where a table is populated from an external location via copy. - # Eg: copy into category_english from 's3://acryl-snow-demo-olist/olist_raw_data/category_english'credentials=(aws_key_id='...' aws_secret_key='...') pattern='.*.csv'; - def _populate_external_lineage_from_access_history(self): - query: str = SnowflakeQuery.external_table_lineage_history( - start_time_millis=int(self.config.start_time.timestamp() * 1000) - if not self.config.ignore_start_time_lineage - else 0, - end_time_millis=int(self.config.end_time.timestamp() * 1000), - ) - - try: - for db_row in self.query(query): - self._process_external_lineage_result_row(db_row) - except Exception as e: - if isinstance(e, SnowflakePermissionError): - error_msg = "Failed to get external lineage. Please grant imported privileges on SNOWFLAKE database. 
" - self.warn_if_stateful_else_error(LINEAGE_PERMISSION_ERROR, error_msg) - else: - logger.debug(e, exc_info=e) - self.report_warning( - "external_lineage", - f"Populating table external lineage from Snowflake failed due to error {e}.", - ) - - def _process_external_lineage_result_row(self, db_row): - # key is the down-stream table name - key: str = self.get_dataset_identifier_from_qualified_name( - db_row["DOWNSTREAM_TABLE_NAME"] - ) - if not self._is_dataset_pattern_allowed(key, SnowflakeObjectDomain.TABLE): - return - - if db_row["UPSTREAM_LOCATIONS"] is not None: - external_locations = json.loads(db_row["UPSTREAM_LOCATIONS"]) - - for loc in external_locations: - if loc not in self._external_lineage_map[key]: - self._external_lineage_map[key].add(loc) - self.report.num_external_table_edges_scanned += 1 - - logger.debug( - f"ExternalLineage[Table(Down)={key}]:External(Up)={self._external_lineage_map[key]} via access_history" - ) - - def _populate_lineage(self) -> None: - query: str = SnowflakeQuery.table_to_table_lineage_history( - start_time_millis=int(self.config.start_time.timestamp() * 1000) - if not self.config.ignore_start_time_lineage - else 0, - end_time_millis=int(self.config.end_time.timestamp() * 1000), - include_column_lineage=self.config.include_column_lineage, - ) - self.report.num_table_to_table_edges_scanned = 0 - try: - for db_row in self.query(query): - self._process_table_lineage_row(db_row) - except Exception as e: - if isinstance(e, SnowflakePermissionError): - error_msg = "Failed to get table to table lineage. Please grant imported privileges on SNOWFLAKE database. " - self.warn_if_stateful_else_error(LINEAGE_PERMISSION_ERROR, error_msg) - else: - logger.debug(e, exc_info=e) - self.report_warning( - "table-lineage", - f"Extracting lineage from Snowflake failed due to error {e}.", - ) - logger.info( - f"A total of {self.report.num_table_to_table_edges_scanned} Table->Table edges found" - f" for {len(self._lineage_map)} downstream tables.", - ) - - def _process_table_lineage_row(self, db_row): - # key is the down-stream table name - key: str = self.get_dataset_identifier_from_qualified_name( - db_row["DOWNSTREAM_TABLE_NAME"] - ) - upstream_table_name = self.get_dataset_identifier_from_qualified_name( - db_row["UPSTREAM_TABLE_NAME"] - ) - if not self._is_dataset_pattern_allowed( - key, SnowflakeObjectDomain.TABLE - ) or not ( - self._is_dataset_pattern_allowed( - upstream_table_name, SnowflakeObjectDomain.TABLE, is_upstream=True - ) - ): - return - self._lineage_map[key].update_lineage( - # (, , ) - SnowflakeUpstreamTable.from_dict( - upstream_table_name, - db_row["UPSTREAM_TABLE_COLUMNS"], - db_row["DOWNSTREAM_TABLE_COLUMNS"], - ), - self.config.include_column_lineage, - ) - self.report.num_table_to_table_edges_scanned += 1 - logger.debug(f"Lineage[Table(Down)={key}]:Table(Up)={self._lineage_map[key]}") - - def _populate_view_upstream_lineage(self) -> None: - # NOTE: This query captures only the upstream lineage of a view (with no column lineage). - # For more details see: https://docs.snowflake.com/en/user-guide/object-dependencies.html#object-dependencies - # and also https://docs.snowflake.com/en/sql-reference/account-usage/access_history.html#usage-notes for current limitations on capturing the lineage for views. 
- view_upstream_lineage_query: str = SnowflakeQuery.view_dependencies() - - self.report.num_table_to_view_edges_scanned = 0 - - try: - for db_row in self.query(view_upstream_lineage_query): - self._process_view_upstream_lineage_row(db_row) - except Exception as e: - if isinstance(e, SnowflakePermissionError): - error_msg = "Failed to get table to view lineage. Please grant imported privileges on SNOWFLAKE database." - self.warn_if_stateful_else_error(LINEAGE_PERMISSION_ERROR, error_msg) - else: - logger.debug(e, exc_info=e) - self.report_warning( - "view-upstream-lineage", - f"Extracting the upstream view lineage from Snowflake failed due to error {e}.", - ) - logger.info( - f"A total of {self.report.num_table_to_view_edges_scanned} View upstream edges found." - ) - - def _process_view_upstream_lineage_row(self, db_row): - # Process UpstreamTable/View/ExternalTable/Materialized View->View edge. - view_upstream: str = self.get_dataset_identifier_from_qualified_name( - db_row["VIEW_UPSTREAM"] - ) - view_name: str = self.get_dataset_identifier_from_qualified_name( - db_row["DOWNSTREAM_VIEW"] - ) - - if not self._is_dataset_pattern_allowed( - dataset_name=view_name, - dataset_type=db_row["REFERENCING_OBJECT_DOMAIN"], - ) or not self._is_dataset_pattern_allowed( - view_upstream, db_row["REFERENCED_OBJECT_DOMAIN"], is_upstream=True - ): - return - # key is the downstream view name - self._lineage_map[view_name].update_lineage( - # (, , ) - SnowflakeUpstreamTable.from_dict(view_upstream, None, None), - self.config.include_column_lineage, - ) - self.report.num_table_to_view_edges_scanned += 1 - logger.debug( - f"Upstream->View: Lineage[View(Down)={view_name}]:Upstream={view_upstream}" - ) - - def _populate_view_downstream_lineage(self) -> None: - # This query captures the downstream table lineage for views. - # See https://docs.snowflake.com/en/sql-reference/account-usage/access_history.html#usage-notes for current limitations on capturing the lineage for views. - # Eg: For viewA->viewB->ViewC->TableD, snowflake does not yet log intermediate view logs, resulting in only the viewA->TableD edge. - view_lineage_query: str = SnowflakeQuery.view_lineage_history( - start_time_millis=int(self.config.start_time.timestamp() * 1000) - if not self.config.ignore_start_time_lineage - else 0, - end_time_millis=int(self.config.end_time.timestamp() * 1000), - include_column_lineage=self.config.include_column_lineage, - ) - - self.report.num_view_to_table_edges_scanned = 0 - - try: - for db_row in self.query(view_lineage_query): - self._process_view_downstream_lineage_row(db_row) - except Exception as e: - if isinstance(e, SnowflakePermissionError): - error_msg = "Failed to get view to table lineage. Please grant imported privileges on SNOWFLAKE database. " - self.warn_if_stateful_else_error(LINEAGE_PERMISSION_ERROR, error_msg) - else: - logger.debug(e, exc_info=e) - self.report_warning( - "view-downstream-lineage", - f"Extracting the view lineage from Snowflake failed due to error {e}.", - ) - - logger.info( - f"Found {self.report.num_view_to_table_edges_scanned} View->Table edges." 
- ) - - def _process_view_downstream_lineage_row(self, db_row): - view_name: str = self.get_dataset_identifier_from_qualified_name( - db_row["VIEW_NAME"] - ) - downstream_table: str = self.get_dataset_identifier_from_qualified_name( - db_row["DOWNSTREAM_TABLE_NAME"] - ) - if not self._is_dataset_pattern_allowed( - view_name, db_row["VIEW_DOMAIN"], is_upstream=True - ) or not self._is_dataset_pattern_allowed( - downstream_table, db_row["DOWNSTREAM_TABLE_DOMAIN"] - ): - return - - # Capture view->downstream table lineage. - self._lineage_map[downstream_table].update_lineage( - # (, , ) - SnowflakeUpstreamTable.from_dict( - view_name, - db_row["VIEW_COLUMNS"], - db_row["DOWNSTREAM_TABLE_COLUMNS"], - ), - self.config.include_column_lineage, - ) - self.report.num_view_to_table_edges_scanned += 1 - - logger.debug( - f"View->Table: Lineage[Table(Down)={downstream_table}]:View(Up)={self._lineage_map[downstream_table]}" - ) - - def update_upstream_tables_lineage( - self, upstream_tables: List[UpstreamClass], lineage: SnowflakeTableLineage - ) -> None: - for lineage_entry in sorted( - lineage.upstreamTables.values(), key=lambda x: x.upstreamDataset - ): - upstream_table_name = lineage_entry.upstreamDataset - upstream_table = UpstreamClass( - dataset=self.dataset_urn_builder(upstream_table_name), - type=DatasetLineageTypeClass.TRANSFORMED, - ) - upstream_tables.append(upstream_table) - - def update_upstream_columns_lineage( - self, - dataset_urn: str, - finegrained_lineages: List[FineGrainedLineage], - lineage: SnowflakeTableLineage, - ) -> None: - # For every column for which upstream lineage is available - for col, col_upstreams in lineage.columnLineages.items(): - # For every upstream of column - self.update_upstream_columns_lineage_of_column( - dataset_urn, col, finegrained_lineages, col_upstreams - ) - - def update_upstream_columns_lineage_of_column( - self, - dataset_urn: str, - col: str, - finegrained_lineages: List[FineGrainedLineage], - col_upstreams: SnowflakeColumnUpstreams, - ) -> None: - for fine_upstream in col_upstreams.upstreams: - finegrained_lineage_entry = self.build_finegrained_lineage( - dataset_urn, col, fine_upstream - ) - if finegrained_lineage_entry.upstreams: - finegrained_lineages.append(finegrained_lineage_entry) - - def build_finegrained_lineage( - self, - dataset_urn: str, - col: str, - fine_upstream: SnowflakeColumnFineGrainedLineage, - ) -> FineGrainedLineage: - fieldPath = col - - column_upstreams = self.build_finegrained_lineage_upstreams(fine_upstream) - finegrained_lineage_entry = FineGrainedLineage( - upstreamType=FineGrainedLineageUpstreamType.FIELD_SET, - # Sorting the list of upstream lineage events in order to avoid creating multiple aspects in backend - # even if the lineage is same but the order is different. 
- upstreams=sorted(column_upstreams), - downstreamType=FineGrainedLineageDownstreamType.FIELD, - downstreams=[ - builder.make_schema_field_urn( - dataset_urn, self.snowflake_identifier(fieldPath) - ) - ], - ) - - return finegrained_lineage_entry - - def build_finegrained_lineage_upstreams( - self, fine_upstream: SnowflakeColumnFineGrainedLineage - ) -> List[str]: - column_upstreams = [] - for upstream_col in fine_upstream.inputColumns: - if ( - upstream_col.objectName - and upstream_col.columnName - and self._is_dataset_pattern_allowed( - upstream_col.objectName, upstream_col.objectDomain, is_upstream=True - ) - ): - upstream_dataset_name = self.get_dataset_identifier_from_qualified_name( - upstream_col.objectName - ) - column_upstreams.append( - builder.make_schema_field_urn( - self.dataset_urn_builder(upstream_dataset_name), - self.snowflake_identifier(upstream_col.columnName), - ) - ) - return column_upstreams - - def update_external_tables_lineage( - self, upstream_tables: List[UpstreamClass], external_lineage: Set[str] - ) -> None: - for external_lineage_entry in sorted(external_lineage): - # For now, populate only for S3 - if external_lineage_entry.startswith("s3://"): - external_upstream_table = UpstreamClass( - dataset=make_s3_urn(external_lineage_entry, self.config.env), - type=DatasetLineageTypeClass.COPY, - ) - upstream_tables.append(external_upstream_table) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index c338c427aefbf..cee3a2926520f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -2,6 +2,7 @@ import logging from collections import defaultdict from dataclasses import dataclass +from datetime import datetime from typing import ( Callable, Collection, @@ -35,6 +36,9 @@ SnowflakePermissionError, SnowflakeQueryMixin, ) +from datahub.ingestion.source.state.redundant_run_skip_handler import ( + RedundantLineageRunSkipHandler, +) from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( FineGrainedLineage, FineGrainedLineageDownstreamType, @@ -48,10 +52,15 @@ SqlParsingResult, sqlglot_lineage, ) +from datahub.utilities.time import ts_millis_to_datetime from datahub.utilities.urns.dataset_urn import DatasetUrn logger: logging.Logger = logging.getLogger(__name__) +EXTERNAL_LINEAGE = "external_lineage" +TABLE_LINEAGE = "table_lineage" +VIEW_LINEAGE = "view_lineage" + @dataclass(frozen=True) class SnowflakeColumnId: @@ -81,6 +90,7 @@ def __init__( config: SnowflakeV2Config, report: SnowflakeV2Report, dataset_urn_builder: Callable[[str], str], + redundant_run_skip_handler: Optional[RedundantLineageRunSkipHandler], ) -> None: self._external_lineage_map: Dict[str, Set[str]] = defaultdict(set) self.config = config @@ -89,6 +99,28 @@ def __init__( self.dataset_urn_builder = dataset_urn_builder self.connection: Optional[SnowflakeConnection] = None + self.redundant_run_skip_handler = redundant_run_skip_handler + self.start_time, self.end_time = ( + self.report.lineage_start_time, + self.report.lineage_end_time, + ) = self.get_time_window() + + def get_time_window(self) -> Tuple[datetime, datetime]: + if self.redundant_run_skip_handler: + return self.redundant_run_skip_handler.suggest_run_time_window( + self.config.start_time + if not self.config.ignore_start_time_lineage + else ts_millis_to_datetime(0), + 
self.config.end_time, + ) + else: + return ( + self.config.start_time + if not self.config.ignore_start_time_lineage + else ts_millis_to_datetime(0), + self.config.end_time, + ) + def get_workunits( self, discovered_tables: List[str], @@ -96,6 +128,9 @@ def get_workunits( schema_resolver: SchemaResolver, view_definitions: MutableMapping[str, str], ) -> Iterable[MetadataWorkUnit]: + if not self._should_ingest_lineage(): + return + self.connection = self.create_connection() if self.connection is None: return @@ -117,6 +152,15 @@ def get_workunits( if self._external_lineage_map: # Some external lineage is yet to be emitted yield from self.get_table_external_upstream_workunits() + if self.redundant_run_skip_handler: + # Update the checkpoint state for this run. + self.redundant_run_skip_handler.update_state( + self.config.start_time + if not self.config.ignore_start_time_lineage + else ts_millis_to_datetime(0), + self.config.end_time, + ) + def get_table_external_upstream_workunits(self) -> Iterable[MetadataWorkUnit]: for ( dataset_name, @@ -140,12 +184,14 @@ def get_table_upstream_workunits( else: with PerfTimer() as timer: results = self._fetch_upstream_lineages_for_tables() - self.report.table_lineage_query_secs = timer.elapsed_seconds() - if not results: - return + if not results: + return - yield from self._gen_workunits_from_query_result(discovered_tables, results) + yield from self._gen_workunits_from_query_result( + discovered_tables, results + ) + self.report.table_lineage_query_secs = timer.elapsed_seconds() logger.info( f"Upstream lineage detected for {self.report.num_tables_with_upstreams} tables.", ) @@ -212,12 +258,14 @@ def get_view_upstream_workunits( with PerfTimer() as timer: results = self._fetch_upstream_lineages_for_views() - self.report.view_upstream_lineage_query_secs = timer.elapsed_seconds() - if results: - yield from self._gen_workunits_from_query_result( - set(discovered_views) - views_processed, results, upstream_for_view=True - ) + if results: + yield from self._gen_workunits_from_query_result( + set(discovered_views) - views_processed, + results, + upstream_for_view=True, + ) + self.report.view_upstream_lineage_query_secs = timer.elapsed_seconds() logger.info( f"Upstream lineage detected for {self.report.num_views_with_upstreams} views.", ) @@ -377,6 +425,7 @@ def _populate_external_lineage_from_show_query(self, discovered_tables): "external_lineage", f"Populating external table lineage from Snowflake failed due to error {e}.", ) + self.report_status(EXTERNAL_LINEAGE, False) # Handles the case where a table is populated from an external stage/s3 location via copy. 
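A minimal sketch of how the v2 lineage extractor above derives its effective time window when ignore_start_time_lineage is set; datetime.fromtimestamp(0, ...) stands in for ts_millis_to_datetime(0) used in the real code.

from datetime import datetime, timedelta, timezone


def effective_window(start_time: datetime, end_time: datetime, ignore_start_time_lineage: bool):
    # With ignore_start_time_lineage=True the window opens at the epoch,
    # i.e. all available lineage history is considered.
    start = start_time if not ignore_start_time_lineage else datetime.fromtimestamp(0, tz=timezone.utc)
    # A RedundantLineageRunSkipHandler, if configured, may further narrow
    # this window via suggest_run_time_window(start, end_time).
    return start, end_time


now = datetime.now(timezone.utc)
print(effective_window(now - timedelta(days=1), now, ignore_start_time_lineage=True))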
# Eg: copy into category_english from @external_s3_stage; @@ -386,10 +435,8 @@ def _populate_external_lineage_from_copy_history( self, discovered_tables: List[str] ) -> None: query: str = SnowflakeQuery.copy_lineage_history( - start_time_millis=int(self.config.start_time.timestamp() * 1000) - if not self.config.ignore_start_time_lineage - else 0, - end_time_millis=int(self.config.end_time.timestamp() * 1000), + start_time_millis=int(self.start_time.timestamp() * 1000), + end_time_millis=int(self.end_time.timestamp() * 1000), downstreams_deny_pattern=self.config.temporary_tables_pattern, ) @@ -406,6 +453,7 @@ def _populate_external_lineage_from_copy_history( "external_lineage", f"Populating table external lineage from Snowflake failed due to error {e}.", ) + self.report_status(EXTERNAL_LINEAGE, False) def _process_external_lineage_result_row(self, db_row, discovered_tables): # key is the down-stream table name @@ -429,10 +477,8 @@ def _process_external_lineage_result_row(self, db_row, discovered_tables): def _fetch_upstream_lineages_for_tables(self): query: str = SnowflakeQuery.table_to_table_lineage_history_v2( - start_time_millis=int(self.config.start_time.timestamp() * 1000) - if not self.config.ignore_start_time_lineage - else 0, - end_time_millis=int(self.config.end_time.timestamp() * 1000), + start_time_millis=int(self.start_time.timestamp() * 1000), + end_time_millis=int(self.end_time.timestamp() * 1000), upstreams_deny_pattern=self.config.temporary_tables_pattern, include_view_lineage=self.config.include_view_lineage, include_column_lineage=self.config.include_column_lineage, @@ -450,6 +496,7 @@ def _fetch_upstream_lineages_for_tables(self): "table-upstream-lineage", f"Extracting lineage from Snowflake failed due to error {e}.", ) + self.report_status(TABLE_LINEAGE, False) def map_query_result_upstreams(self, upstream_tables): if not upstream_tables: @@ -535,6 +582,7 @@ def _fetch_upstream_lineages_for_views(self): "view-upstream-lineage", f"Extracting the upstream view lineage from Snowflake failed due to error {e}.", ) + self.report_status(VIEW_LINEAGE, False) def build_finegrained_lineage( self, @@ -596,3 +644,25 @@ def get_external_upstreams(self, external_lineage: Set[str]) -> List[UpstreamCla ) external_upstreams.append(external_upstream_table) return external_upstreams + + def _should_ingest_lineage(self) -> bool: + if ( + self.redundant_run_skip_handler + and self.redundant_run_skip_handler.should_skip_this_run( + cur_start_time=self.config.start_time + if not self.config.ignore_start_time_lineage + else ts_millis_to_datetime(0), + cur_end_time=self.config.end_time, + ) + ): + # Skip this run + self.report.report_warning( + "lineage-extraction", + "Skip this run as there was already a run for current ingestion window.", + ) + return False + return True + + def report_status(self, step: str, status: bool) -> None: + if self.redundant_run_skip_handler: + self.redundant_run_skip_handler.report_current_run_status(step, status) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py index 039eac1e93819..0f89324f5efc6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py @@ -1,5 +1,6 @@ from typing import List, Optional +from datahub.configuration.time_window_config import BucketDuration from datahub.ingestion.source.snowflake.constants import 
SnowflakeObjectDomain from datahub.ingestion.source.snowflake.snowflake_config import DEFAULT_TABLES_DENY_LIST @@ -505,35 +506,6 @@ def view_dependencies_v2() -> str: def show_external_tables() -> str: return "show external tables in account" - # Note - This method should be removed once legacy lineage is removed - @staticmethod - def external_table_lineage_history( - start_time_millis: int, end_time_millis: int - ) -> str: - return f""" - WITH external_table_lineage_history AS ( - SELECT - r.value:"locations" AS upstream_locations, - w.value:"objectName"::varchar AS downstream_table_name, - w.value:"objectDomain"::varchar AS downstream_table_domain, - w.value:"columns" AS downstream_table_columns, - t.query_start_time AS query_start_time - FROM - (SELECT * from snowflake.account_usage.access_history) t, - lateral flatten(input => t.BASE_OBJECTS_ACCESSED) r, - lateral flatten(input => t.OBJECTS_MODIFIED) w - WHERE r.value:"locations" IS NOT NULL - AND w.value:"objectId" IS NOT NULL - AND t.query_start_time >= to_timestamp_ltz({start_time_millis}, 3) - AND t.query_start_time < to_timestamp_ltz({end_time_millis}, 3)) - SELECT - upstream_locations AS "UPSTREAM_LOCATIONS", - downstream_table_name AS "DOWNSTREAM_TABLE_NAME", - downstream_table_columns AS "DOWNSTREAM_TABLE_COLUMNS" - FROM external_table_lineage_history - WHERE downstream_table_domain = '{SnowflakeObjectDomain.TABLE.capitalize()}' - QUALIFY ROW_NUMBER() OVER (PARTITION BY downstream_table_name ORDER BY query_start_time DESC) = 1""" - @staticmethod def copy_lineage_history( start_time_millis: int, @@ -575,14 +547,17 @@ def get_access_history_date_range() -> str: def usage_per_object_per_time_bucket_for_time_window( start_time_millis: int, end_time_millis: int, - time_bucket_size: str, + time_bucket_size: BucketDuration, use_base_objects: bool, top_n_queries: int, include_top_n_queries: bool, ) -> str: if not include_top_n_queries: top_n_queries = 0 - assert time_bucket_size == "DAY" or time_bucket_size == "HOUR" + assert ( + time_bucket_size == BucketDuration.DAY + or time_bucket_size == BucketDuration.HOUR + ) objects_column = ( "BASE_OBJECTS_ACCESSED" if use_base_objects else "DIRECT_OBJECTS_ACCESSED" ) @@ -629,7 +604,7 @@ def usage_per_object_per_time_bucket_for_time_window( SELECT object_name, ANY_VALUE(object_domain) AS object_domain, - DATE_TRUNC('{time_bucket_size}', CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time, + DATE_TRUNC('{time_bucket_size.value}', CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time, count(distinct(query_id)) AS total_queries, count( distinct(user_name) ) AS total_users FROM @@ -644,7 +619,7 @@ def usage_per_object_per_time_bucket_for_time_window( SELECT object_name, column_name, - DATE_TRUNC('{time_bucket_size}', CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time, + DATE_TRUNC('{time_bucket_size.value}', CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time, count(distinct(query_id)) AS total_queries FROM field_access_history @@ -658,7 +633,7 @@ def usage_per_object_per_time_bucket_for_time_window( ( SELECT object_name, - DATE_TRUNC('{time_bucket_size}', CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time, + DATE_TRUNC('{time_bucket_size.value}', CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time, count(distinct(query_id)) AS total_queries, user_name, ANY_VALUE(users.email) AS user_email @@ -677,7 +652,7 @@ def usage_per_object_per_time_bucket_for_time_window( ( SELECT object_name, - DATE_TRUNC('{time_bucket_size}', 
CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time, + DATE_TRUNC('{time_bucket_size.value}', CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time, query_history.query_text AS query_text, count(distinct(access_history.query_id)) AS total_queries FROM diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_report.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_report.py index 8003de8286288..f67b359dedb11 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_report.py @@ -1,16 +1,71 @@ from dataclasses import dataclass, field -from typing import Dict, MutableSet, Optional +from datetime import datetime +from typing import Dict, List, MutableSet, Optional from datahub.ingestion.glossary.classification_mixin import ClassificationReportMixin from datahub.ingestion.source.snowflake.constants import SnowflakeEdition from datahub.ingestion.source.sql.sql_generic_profiler import ProfilingSqlReport -from datahub.ingestion.source_report.sql.snowflake import SnowflakeReport -from datahub.ingestion.source_report.usage.snowflake_usage import SnowflakeUsageReport +from datahub.ingestion.source.state.stateful_ingestion_base import ( + StatefulIngestionReport, +) +from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport +from datahub.ingestion.source_report.time_window import BaseTimeWindowReport + + +@dataclass +class SnowflakeUsageReport: + min_access_history_time: Optional[datetime] = None + max_access_history_time: Optional[datetime] = None + access_history_range_query_secs: float = -1 + access_history_query_secs: float = -1 + + rows_processed: int = 0 + rows_missing_query_text: int = 0 + rows_zero_base_objects_accessed: int = 0 + rows_zero_direct_objects_accessed: int = 0 + rows_missing_email: int = 0 + rows_parsing_error: int = 0 + + usage_start_time: Optional[datetime] = None + usage_end_time: Optional[datetime] = None + stateful_usage_ingestion_enabled: bool = False + + +@dataclass +class SnowflakeReport(ProfilingSqlReport, BaseTimeWindowReport): + num_table_to_table_edges_scanned: int = 0 + num_table_to_view_edges_scanned: int = 0 + num_view_to_table_edges_scanned: int = 0 + num_external_table_edges_scanned: int = 0 + ignore_start_time_lineage: Optional[bool] = None + upstream_lineage_in_report: Optional[bool] = None + upstream_lineage: Dict[str, List[str]] = field(default_factory=dict) + + lineage_start_time: Optional[datetime] = None + lineage_end_time: Optional[datetime] = None + stateful_lineage_ingestion_enabled: bool = False + + cleaned_account_id: str = "" + run_ingestion: bool = False + + # https://community.snowflake.com/s/topic/0TO0Z000000Unu5WAC/releases + saas_version: Optional[str] = None + default_warehouse: Optional[str] = None + default_db: Optional[str] = None + default_schema: Optional[str] = None + role: str = "" + + profile_if_updated_since: Optional[datetime] = None + profile_candidates: Dict[str, List[str]] = field(default_factory=dict) @dataclass class SnowflakeV2Report( - SnowflakeReport, SnowflakeUsageReport, ProfilingSqlReport, ClassificationReportMixin + SnowflakeReport, + SnowflakeUsageReport, + StatefulIngestionReport, + ClassificationReportMixin, + IngestionStageReport, ): account_locator: Optional[str] = None region: Optional[str] = None @@ -94,3 +149,6 @@ def _is_tag_scanned(self, tag_name: str) -> bool: def report_tag_processed(self, tag_name: str) -> None: 
self._processed_tags.add(tag_name) + + def set_ingestion_stage(self, database: str, stage: str) -> None: + self.report_ingestion_stage_start(f"{database}: {stage}") diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py index dab46645bffcc..e5b214ba35e4b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py @@ -261,6 +261,7 @@ def get_tables_for_database( for table in cur: if table["TABLE_SCHEMA"] not in tables: tables[table["TABLE_SCHEMA"]] = [] + tables[table["TABLE_SCHEMA"]].append( SnowflakeTable( name=table["TABLE_NAME"], diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py new file mode 100644 index 0000000000000..6f7520bbf1988 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py @@ -0,0 +1,158 @@ +import logging +from typing import Callable, Iterable, List + +from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.snowflake.snowflake_config import ( + DatabaseId, + SnowflakeV2Config, +) +from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report +from datahub.ingestion.source.snowflake.snowflake_schema import SnowflakeDatabase +from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin +from datahub.metadata.com.linkedin.pegasus2avro.common import Siblings +from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( + DatasetLineageType, + Upstream, + UpstreamLineage, +) + +logger: logging.Logger = logging.getLogger(__name__) + + +class SnowflakeSharesHandler(SnowflakeCommonMixin): + def __init__( + self, + config: SnowflakeV2Config, + report: SnowflakeV2Report, + dataset_urn_builder: Callable[[str], str], + ) -> None: + self.config = config + self.report = report + self.logger = logger + self.dataset_urn_builder = dataset_urn_builder + + def get_shares_workunits( + self, databases: List[SnowflakeDatabase] + ) -> Iterable[MetadataWorkUnit]: + inbounds = self.config.inbounds() + outbounds = self.config.outbounds() + # None of the databases are shared + if not (inbounds or outbounds): + return + + logger.debug("Checking databases for inbound or outbound shares.") + for db in databases: + is_inbound = db.name in inbounds + is_outbound = db.name in outbounds + + if not (is_inbound or is_outbound): + logger.debug(f"database {db.name} is not shared.") + continue + + sibling_dbs = ( + list(outbounds[db.name]) if is_outbound else [inbounds[db.name]] + ) + + for schema in db.schemas: + for table_name in schema.tables + schema.views: + # TODO: If this is outbound database, + # 1. attempt listing shares using `show shares` to identify name of share associated with this database (cache query result). + # 2. if corresponding share is listed, then run `show grants to share ` to identify exact tables, views included in share. + # 3. emit siblings only for the objects listed above. + # This will work only if the configured role has accountadmin role access OR is owner of share. 
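For shared databases, the new SnowflakeSharesHandler reduces to two aspects per table: a Siblings aspect linking the inbound copy to its outbound source, and, on the inbound side only, an UpstreamLineage edge of type COPY back to the producer. Below is a condensed sketch using the same builders this file imports; the database, schema, and platform-instance names are illustrative, not taken from the patch.

from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.com.linkedin.pegasus2avro.common import Siblings
from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
    DatasetLineageType,
    Upstream,
    UpstreamLineage,
)

# Illustrative identifiers: a table shared from an outbound (producer) account
# into an inbound (consumer) account, possibly on different platform instances.
consumer_urn = make_dataset_urn_with_platform_instance(
    "snowflake", "inbound_db.public.orders", "consumer_instance"
)
producer_urn = make_dataset_urn_with_platform_instance(
    "snowflake", "outbound_db.public.orders", "producer_instance"
)

# Siblings aspect on the consumer side; primary=False marks it as the copy.
siblings_mcp = MetadataChangeProposalWrapper(
    entityUrn=consumer_urn,
    aspect=Siblings(primary=False, siblings=[producer_urn]),
)

# COPY-type lineage from the producer table to its inbound counterpart.
lineage_mcp = MetadataChangeProposalWrapper(
    entityUrn=consumer_urn,
    aspect=UpstreamLineage(
        upstreams=[Upstream(dataset=producer_urn, type=DatasetLineageType.COPY)]
    ),
)

As the handler code that follows notes, this lineage edge lives here rather than in the lineage extractor because the extractor has no database -> schema -> table view of share relationships.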
+ # Otherwise ghost nodes may be shown in "Composed Of" section for tables/views in original database which are not granted to share. + yield from self.gen_siblings( + db.name, + schema.name, + table_name, + is_outbound, + sibling_dbs, + ) + + if is_inbound: + assert len(sibling_dbs) == 1 + # SnowflakeLineageExtractor is unaware of database->schema->table hierarchy + # hence this lineage code is not written in SnowflakeLineageExtractor + # also this is not governed by configs include_table_lineage and include_view_lineage + yield self.get_upstream_lineage_with_primary_sibling( + db.name, schema.name, table_name, sibling_dbs[0] + ) + + self.report_missing_databases( + databases, list(inbounds.keys()), list(outbounds.keys()) + ) + + def report_missing_databases( + self, + databases: List[SnowflakeDatabase], + inbounds: List[str], + outbounds: List[str], + ) -> None: + db_names = [db.name for db in databases] + missing_dbs = [db for db in inbounds + outbounds if db not in db_names] + + if missing_dbs: + self.report_warning( + "snowflake-shares", + f"Databases {missing_dbs} were not ingested. Siblings/Lineage will not be set for these.", + ) + + def gen_siblings( + self, + database_name: str, + schema_name: str, + table_name: str, + primary: bool, + sibling_databases: List[DatabaseId], + ) -> Iterable[MetadataWorkUnit]: + if not sibling_databases: + return + dataset_identifier = self.get_dataset_identifier( + table_name, schema_name, database_name + ) + urn = self.dataset_urn_builder(dataset_identifier) + + sibling_urns = [ + make_dataset_urn_with_platform_instance( + self.platform, + self.get_dataset_identifier( + table_name, schema_name, sibling_db.database + ), + sibling_db.platform_instance, + ) + for sibling_db in sibling_databases + ] + + yield MetadataChangeProposalWrapper( + entityUrn=urn, + aspect=Siblings(primary=primary, siblings=sorted(sibling_urns)), + ).as_workunit() + + def get_upstream_lineage_with_primary_sibling( + self, + database_name: str, + schema_name: str, + table_name: str, + primary_sibling_db: DatabaseId, + ) -> MetadataWorkUnit: + dataset_identifier = self.get_dataset_identifier( + table_name, schema_name, database_name + ) + urn = self.dataset_urn_builder(dataset_identifier) + + upstream_urn = make_dataset_urn_with_platform_instance( + self.platform, + self.get_dataset_identifier( + table_name, schema_name, primary_sibling_db.database + ), + primary_sibling_db.platform_instance, + ) + + return MetadataChangeProposalWrapper( + entityUrn=urn, + aspect=UpstreamLineage( + upstreams=[Upstream(dataset=upstream_urn, type=DatasetLineageType.COPY)] + ), + ).as_workunit() diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py index 3605205b6055c..a64921ea01759 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py @@ -2,11 +2,12 @@ import logging import time from datetime import datetime, timezone -from typing import Any, Callable, Dict, Iterable, List, Optional +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple import pydantic from snowflake.connector import SnowflakeConnection +from datahub.configuration.time_window_config import BaseTimeWindowConfig from datahub.emitter.mce_builder import make_user_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.source_helpers import 
auto_empty_dataset_usage_statistics @@ -21,7 +22,13 @@ SnowflakePermissionError, SnowflakeQueryMixin, ) -from datahub.ingestion.source.usage.usage_common import TOTAL_BUDGET_FOR_QUERY_LIST +from datahub.ingestion.source.state.redundant_run_skip_handler import ( + RedundantUsageRunSkipHandler, +) +from datahub.ingestion.source_report.ingestion_stage import ( + USAGE_EXTRACTION_OPERATIONAL_STATS, + USAGE_EXTRACTION_USAGE_AGGREGATION, +) from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( DatasetFieldUsageCounts, DatasetUsageStatistics, @@ -107,6 +114,7 @@ def __init__( config: SnowflakeV2Config, report: SnowflakeV2Report, dataset_urn_builder: Callable[[str], str], + redundant_run_skip_handler: Optional[RedundantUsageRunSkipHandler], ) -> None: self.config: SnowflakeV2Config = config self.report: SnowflakeV2Report = report @@ -114,9 +122,28 @@ def __init__( self.logger = logger self.connection: Optional[SnowflakeConnection] = None + self.redundant_run_skip_handler = redundant_run_skip_handler + self.start_time, self.end_time = ( + self.report.usage_start_time, + self.report.usage_end_time, + ) = self.get_time_window() + + def get_time_window(self) -> Tuple[datetime, datetime]: + if self.redundant_run_skip_handler: + return self.redundant_run_skip_handler.suggest_run_time_window( + self.config.start_time, self.config.end_time + ) + else: + return self.config.start_time, self.config.end_time + def get_usage_workunits( self, discovered_datasets: List[str] ) -> Iterable[MetadataWorkUnit]: + if not self._should_ingest_usage(): + return + + self.report.set_ingestion_stage("*", USAGE_EXTRACTION_USAGE_AGGREGATION) + self.connection = self.create_connection() if self.connection is None: return @@ -144,13 +171,19 @@ def get_usage_workunits( if self.config.include_usage_stats: yield from auto_empty_dataset_usage_statistics( self._get_workunits_internal(discovered_datasets), - config=self.config, + config=BaseTimeWindowConfig( + start_time=self.start_time, + end_time=self.end_time, + bucket_duration=self.config.bucket_duration, + ), dataset_urns={ self.dataset_urn_builder(dataset_identifier) for dataset_identifier in discovered_datasets }, ) + self.report.set_ingestion_stage("*", USAGE_EXTRACTION_OPERATIONAL_STATS) + if self.config.include_operational_stats: # Generate the operation workunits. access_events = self._get_snowflake_history() @@ -159,6 +192,14 @@ def get_usage_workunits( event, discovered_datasets ) + if self.redundant_run_skip_handler: + # Update the checkpoint state for this run. 
+ self.redundant_run_skip_handler.update_state( + self.config.start_time, + self.config.end_time, + self.config.bucket_duration, + ) + def _get_workunits_internal( self, discovered_datasets: List[str] ) -> Iterable[MetadataWorkUnit]: @@ -167,10 +208,8 @@ def _get_workunits_internal( try: results = self.query( SnowflakeQuery.usage_per_object_per_time_bucket_for_time_window( - start_time_millis=int( - self.config.start_time.timestamp() * 1000 - ), - end_time_millis=int(self.config.end_time.timestamp() * 1000), + start_time_millis=int(self.start_time.timestamp() * 1000), + end_time_millis=int(self.end_time.timestamp() * 1000), time_bucket_size=self.config.bucket_duration, use_base_objects=self.config.apply_view_usage_to_tables, top_n_queries=self.config.top_n_queries, @@ -179,11 +218,13 @@ def _get_workunits_internal( ) except Exception as e: logger.debug(e, exc_info=e) - self.report_warning( + self.warn_if_stateful_else_error( "usage-statistics", f"Populating table usage statistics from Snowflake failed due to error {e}.", ) + self.report_status(USAGE_EXTRACTION_USAGE_AGGREGATION, False) return + self.report.usage_aggregation_query_secs = timer.elapsed_seconds() for row in results: @@ -238,7 +279,7 @@ def build_usage_statistics_for_dataset(self, dataset_identifier, row): def _map_top_sql_queries(self, top_sql_queries: Dict) -> List[str]: budget_per_query: int = int( - TOTAL_BUDGET_FOR_QUERY_LIST / self.config.top_n_queries + self.config.queries_character_limit / self.config.top_n_queries ) return sorted( [ @@ -300,10 +341,11 @@ def _get_snowflake_history(self) -> Iterable[SnowflakeJoinedAccessEvent]: results = self.query(query) except Exception as e: logger.debug(e, exc_info=e) - self.report_warning( + self.warn_if_stateful_else_error( "operation", f"Populating table operation history from Snowflake failed due to error {e}.", ) + self.report_status(USAGE_EXTRACTION_OPERATIONAL_STATS, False) return self.report.access_history_query_secs = round(timer.elapsed_seconds(), 2) @@ -311,8 +353,8 @@ def _get_snowflake_history(self) -> Iterable[SnowflakeJoinedAccessEvent]: yield from self._process_snowflake_history_row(row) def _make_operations_query(self) -> str: - start_time = int(self.config.start_time.timestamp() * 1000) - end_time = int(self.config.end_time.timestamp() * 1000) + start_time = int(self.start_time.timestamp() * 1000) + end_time = int(self.end_time.timestamp() * 1000) return SnowflakeQuery.operational_data_for_time_window(start_time, end_time) def _check_usage_date_ranges(self) -> Any: @@ -331,6 +373,7 @@ def _check_usage_date_ranges(self) -> Any: "usage", f"Extracting the date range for usage data from Snowflake failed due to error {e}.", ) + self.report_status("date-range-check", False) else: for db_row in results: if ( @@ -356,7 +399,6 @@ def _check_usage_date_ranges(self) -> Any: def _get_operation_aspect_work_unit( self, event: SnowflakeJoinedAccessEvent, discovered_datasets: List[str] ) -> Iterable[MetadataWorkUnit]: - if event.query_start_time and event.query_type: start_time = event.query_start_time query_type = event.query_type @@ -494,3 +536,24 @@ def _is_object_valid(self, obj: Dict[str, Any]) -> bool: ): return False return True + + def _should_ingest_usage(self) -> bool: + if ( + self.redundant_run_skip_handler + and self.redundant_run_skip_handler.should_skip_this_run( + cur_start_time=self.config.start_time, + cur_end_time=self.config.end_time, + ) + ): + # Skip this run + self.report.report_warning( + "usage-extraction", + "Skip this run as there was already a run for 
current ingestion window.", + ) + return False + + return True + + def report_status(self, step: str, status: bool) -> None: + if self.redundant_run_skip_handler: + self.redundant_run_skip_handler.report_current_run_status(step, status) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 7dd51d5b20e8e..e561ed0e2d146 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -51,9 +51,6 @@ SnowflakeV2Config, TagOption, ) -from datahub.ingestion.source.snowflake.snowflake_lineage_legacy import ( - SnowflakeLineageExtractor as SnowflakeLineageLegacyExtractor, -) from datahub.ingestion.source.snowflake.snowflake_lineage_v2 import ( SnowflakeLineageExtractor, ) @@ -71,6 +68,7 @@ SnowflakeTag, SnowflakeView, ) +from datahub.ingestion.source.snowflake.snowflake_shares import SnowflakeSharesHandler from datahub.ingestion.source.snowflake.snowflake_tag import SnowflakeTagExtractor from datahub.ingestion.source.snowflake.snowflake_usage_v2 import ( SnowflakeUsageExtractor, @@ -92,7 +90,8 @@ ) from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler from datahub.ingestion.source.state.redundant_run_skip_handler import ( - RedundantRunSkipHandler, + RedundantLineageRunSkipHandler, + RedundantUsageRunSkipHandler, ) from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, @@ -100,6 +99,11 @@ from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulIngestionSourceBase, ) +from datahub.ingestion.source_report.ingestion_stage import ( + LINEAGE_EXTRACTION, + METADATA_EXTRACTION, + PROFILING, +) from datahub.metadata.com.linkedin.pegasus2avro.common import ( GlobalTags, Status, @@ -132,7 +136,6 @@ from datahub.utilities.perf_timer import PerfTimer from datahub.utilities.registries.domain_registry import DomainRegistry from datahub.utilities.sqlglot_lineage import SchemaResolver -from datahub.utilities.time import datetime_to_ts_millis logger: logging.Logger = logging.getLogger(__name__) @@ -224,13 +227,6 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): self.snowsight_base_url: Optional[str] = None self.connection: Optional[SnowflakeConnection] = None - self.redundant_run_skip_handler = RedundantRunSkipHandler( - source=self, - config=self.config, - pipeline_name=self.ctx.pipeline_name, - run_id=self.ctx.run_id, - ) - self.domain_registry: Optional[DomainRegistry] = None if self.config.domain: self.domain_registry = DomainRegistry( @@ -240,23 +236,42 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): # For database, schema, tables, views, etc self.data_dictionary = SnowflakeDataDictionary() - self.lineage_extractor: Union[ - SnowflakeLineageExtractor, SnowflakeLineageLegacyExtractor - ] - if config.include_table_lineage: - # For lineage - if self.config.use_legacy_lineage_method: - self.lineage_extractor = SnowflakeLineageLegacyExtractor( - config, self.report, dataset_urn_builder=self.gen_dataset_urn - ) - else: - self.lineage_extractor = SnowflakeLineageExtractor( - config, self.report, dataset_urn_builder=self.gen_dataset_urn + self.lineage_extractor: Optional[SnowflakeLineageExtractor] = None + if self.config.include_table_lineage: + redundant_lineage_run_skip_handler: Optional[ + RedundantLineageRunSkipHandler + ] = None + if 
self.config.enable_stateful_lineage_ingestion: + redundant_lineage_run_skip_handler = RedundantLineageRunSkipHandler( + source=self, + config=self.config, + pipeline_name=self.ctx.pipeline_name, + run_id=self.ctx.run_id, ) + self.lineage_extractor = SnowflakeLineageExtractor( + config, + self.report, + dataset_urn_builder=self.gen_dataset_urn, + redundant_run_skip_handler=redundant_lineage_run_skip_handler, + ) - if config.include_usage_stats or config.include_operational_stats: + self.usage_extractor: Optional[SnowflakeUsageExtractor] = None + if self.config.include_usage_stats or self.config.include_operational_stats: + redundant_usage_run_skip_handler: Optional[ + RedundantUsageRunSkipHandler + ] = None + if self.config.enable_stateful_usage_ingestion: + redundant_usage_run_skip_handler = RedundantUsageRunSkipHandler( + source=self, + config=self.config, + pipeline_name=self.ctx.pipeline_name, + run_id=self.ctx.run_id, + ) self.usage_extractor = SnowflakeUsageExtractor( - config, self.report, dataset_urn_builder=self.gen_dataset_urn + config, + self.report, + dataset_urn_builder=self.gen_dataset_urn, + redundant_run_skip_handler=redundant_usage_run_skip_handler, ) self.tag_extractor = SnowflakeTagExtractor( @@ -264,7 +279,7 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): ) self.profiling_state_handler: Optional[ProfilingHandler] = None - if self.config.store_last_profiling_timestamps: + if self.config.enable_stateful_profiling: self.profiling_state_handler = ProfilingHandler( source=self, config=self.config, @@ -292,6 +307,7 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): env=self.config.env, ) self.view_definitions: FileBackedDict[str] = FileBackedDict() + self.add_config_to_report() @classmethod def create(cls, config_dict: dict, ctx: PipelineContext) -> "Source": @@ -492,7 +508,6 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: if self.connection is None: return - self.add_config_to_report() self.inspect_session_metadata() if self.config.include_external_url: @@ -503,13 +518,21 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: return self.data_dictionary.set_connection(self.connection) - databases = self.get_databases() + databases: List[SnowflakeDatabase] = [] + + for database in self.get_databases() or []: + self.report.report_entity_scanned(database.name, "database") + if not self.config.database_pattern.allowed(database.name): + self.report.report_dropped(f"{database.name}.*") + else: + databases.append(database) - if databases is None or len(databases) == 0: + if len(databases) == 0: return for snowflake_db in databases: try: + self.report.set_ingestion_stage(snowflake_db.name, METADATA_EXTRACTION) yield from self._process_database(snowflake_db) except SnowflakePermissionError as e: @@ -532,25 +555,22 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: # TODO: The checkpoint state for stale entity detection can be committed here. 
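The hunk above also moves database filtering out of _process_database and up into get_workunits_internal: every database is reported as scanned, checked once against database_pattern, and either dropped or kept in the list that the later steps (shares, lineage, usage) reuse. A small self-contained illustration of that filter-first step; the database names and deny patterns are made up, and the report calls appear only as comments.

from datahub.configuration.common import AllowDenyPattern

# Illustrative pattern and database list.
database_pattern = AllowDenyPattern(deny=["^SNOWFLAKE$", "^SNOWFLAKE_SAMPLE_DATA$"])
all_databases = ["ANALYTICS", "SNOWFLAKE", "RAW"]

databases = []
for name in all_databases:
    # self.report.report_entity_scanned(name, "database")
    if not database_pattern.allowed(name):
        # self.report.report_dropped(f"{name}.*")
        continue
    databases.append(name)

assert databases == ["ANALYTICS", "RAW"]

The same hoisting of pattern checks happens for schemas, tables, and views later in the patch, so discovered_tables and discovered_views no longer need a second _is_dataset_pattern_allowed pass.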
+ if self.config.shares: + yield from SnowflakeSharesHandler( + self.config, self.report, self.gen_dataset_urn + ).get_shares_workunits(databases) + discovered_tables: List[str] = [ self.get_dataset_identifier(table_name, schema.name, db.name) for db in databases for schema in db.schemas for table_name in schema.tables - if self._is_dataset_pattern_allowed( - self.get_dataset_identifier(table_name, schema.name, db.name), - SnowflakeObjectDomain.TABLE, - ) ] discovered_views: List[str] = [ self.get_dataset_identifier(table_name, schema.name, db.name) for db in databases for schema in db.schemas for table_name in schema.views - if self._is_dataset_pattern_allowed( - self.get_dataset_identifier(table_name, schema.name, db.name), - SnowflakeObjectDomain.VIEW, - ) ] if len(discovered_tables) == 0 and len(discovered_views) == 0: @@ -562,7 +582,8 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: discovered_datasets = discovered_tables + discovered_views - if self.config.include_table_lineage: + if self.config.include_table_lineage and self.lineage_extractor: + self.report.set_ingestion_stage("*", LINEAGE_EXTRACTION) yield from self.lineage_extractor.get_workunits( discovered_tables=discovered_tables, discovered_views=discovered_views, @@ -570,27 +591,9 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: view_definitions=self.view_definitions, ) - if self.config.include_usage_stats or self.config.include_operational_stats: - if ( - self.config.store_last_usage_extraction_timestamp - and self.redundant_run_skip_handler.should_skip_this_run( - cur_start_time_millis=datetime_to_ts_millis(self.config.start_time) - ) - ): - # Skip this run - self.report.report_warning( - "usage-extraction", - f"Skip this run as there was a run later than the current start time: {self.config.start_time}", - ) - return - - if self.config.store_last_usage_extraction_timestamp: - # Update the checkpoint state for this run. 
- self.redundant_run_skip_handler.update_state( - start_time_millis=datetime_to_ts_millis(self.config.start_time), - end_time_millis=datetime_to_ts_millis(self.config.end_time), - ) - + if ( + self.config.include_usage_stats or self.config.include_operational_stats + ) and self.usage_extractor: yield from self.usage_extractor.get_usage_workunits(discovered_datasets) def report_warehouse_failure(self): @@ -654,11 +657,6 @@ def get_databases_from_ischema(self, databases): def _process_database( self, snowflake_db: SnowflakeDatabase ) -> Iterable[MetadataWorkUnit]: - self.report.report_entity_scanned(snowflake_db.name, "database") - if not self.config.database_pattern.allowed(snowflake_db.name): - self.report.report_dropped(f"{snowflake_db.name}.*") - return - db_name = snowflake_db.name try: @@ -702,13 +700,25 @@ def _process_database( yield from self._process_schema(snowflake_schema, db_name) if self.config.is_profiling_enabled() and self.db_tables: + self.report.set_ingestion_stage(snowflake_db.name, PROFILING) yield from self.profiler.get_workunits(snowflake_db, self.db_tables) - def fetch_schemas_for_database(self, snowflake_db, db_name): + def fetch_schemas_for_database( + self, snowflake_db: SnowflakeDatabase, db_name: str + ) -> None: + schemas: List[SnowflakeSchema] = [] try: - snowflake_db.schemas = self.data_dictionary.get_schemas_for_database( - db_name - ) + for schema in self.data_dictionary.get_schemas_for_database(db_name): + self.report.report_entity_scanned(schema.name, "schema") + if not is_schema_allowed( + self.config.schema_pattern, + schema.name, + db_name, + self.config.match_fully_qualified_names, + ): + self.report.report_dropped(f"{db_name}.{schema.name}.*") + else: + schemas.append(schema) except Exception as e: if isinstance(e, SnowflakePermissionError): error_msg = f"Failed to get schemas for database {db_name}. Please check permissions." @@ -724,25 +734,17 @@ def fetch_schemas_for_database(self, snowflake_db, db_name): db_name, ) - if not snowflake_db.schemas: + if not schemas: self.report_warning( "No schemas found in database. 
If schemas exist, please grant USAGE permissions on them.", db_name, ) + else: + snowflake_db.schemas = schemas def _process_schema( self, snowflake_schema: SnowflakeSchema, db_name: str ) -> Iterable[MetadataWorkUnit]: - self.report.report_entity_scanned(snowflake_schema.name, "schema") - if not is_schema_allowed( - self.config.schema_pattern, - snowflake_schema.name, - db_name, - self.config.match_fully_qualified_names, - ): - self.report.report_dropped(f"{db_name}.{snowflake_schema.name}.*") - return - schema_name = snowflake_schema.name if self.config.extract_tags != TagOption.skip: @@ -768,7 +770,8 @@ def _process_schema( if self.config.parse_view_ddl: for view in views: key = self.get_dataset_identifier(view.name, schema_name, db_name) - self.view_definitions[key] = view.view_definition + if view.view_definition: + self.view_definitions[key] = view.view_definition if self.config.include_technical_schema or self.config.parse_view_ddl: for view in views: @@ -784,9 +787,20 @@ def _process_schema( f"{db_name}.{schema_name}", ) - def fetch_views_for_schema(self, snowflake_schema, db_name, schema_name): + def fetch_views_for_schema( + self, snowflake_schema: SnowflakeSchema, db_name: str, schema_name: str + ) -> List[SnowflakeView]: try: - views = self.get_views_for_schema(schema_name, db_name) + views: List[SnowflakeView] = [] + for view in self.get_views_for_schema(schema_name, db_name): + view_name = self.get_dataset_identifier(view.name, schema_name, db_name) + + self.report.report_entity_scanned(view_name, "view") + + if not self.config.view_pattern.allowed(view_name): + self.report.report_dropped(view_name) + else: + views.append(view) snowflake_schema.views = [view.name for view in views] return views except Exception as e: @@ -804,10 +818,22 @@ def fetch_views_for_schema(self, snowflake_schema, db_name, schema_name): "Failed to get views for schema", f"{db_name}.{schema_name}", ) + return [] - def fetch_tables_for_schema(self, snowflake_schema, db_name, schema_name): + def fetch_tables_for_schema( + self, snowflake_schema: SnowflakeSchema, db_name: str, schema_name: str + ) -> List[SnowflakeTable]: try: - tables = self.get_tables_for_schema(schema_name, db_name) + tables: List[SnowflakeTable] = [] + for table in self.get_tables_for_schema(schema_name, db_name): + table_identifier = self.get_dataset_identifier( + table.name, schema_name, db_name + ) + self.report.report_entity_scanned(table_identifier) + if not self.config.table_pattern.allowed(table_identifier): + self.report.report_dropped(table_identifier) + else: + tables.append(table) snowflake_schema.tables = [table.name for table in tables] return tables except Exception as e: @@ -824,6 +850,7 @@ def fetch_tables_for_schema(self, snowflake_schema, db_name, schema_name): "Failed to get tables for schema", f"{db_name}.{schema_name}", ) + return [] def _process_table( self, @@ -833,12 +860,6 @@ def _process_table( ) -> Iterable[MetadataWorkUnit]: table_identifier = self.get_dataset_identifier(table.name, schema_name, db_name) - self.report.report_entity_scanned(table_identifier) - - if not self.config.table_pattern.allowed(table_identifier): - self.report.report_dropped(table_identifier) - return - self.fetch_columns_for_table(table, schema_name, db_name, table_identifier) self.fetch_pk_for_table(table, schema_name, db_name, table_identifier) @@ -950,12 +971,6 @@ def _process_view( ) -> Iterable[MetadataWorkUnit]: view_name = self.get_dataset_identifier(view.name, schema_name, db_name) - 
self.report.report_entity_scanned(view_name, "view") - - if not self.config.view_pattern.allowed(view_name): - self.report.report_dropped(view_name) - return - try: view.columns = self.get_columns_for_table(view.name, schema_name, db_name) if self.config.extract_tags != TagOption.skip: @@ -1417,16 +1432,20 @@ def add_config_to_report(self): self.report.cleaned_account_id = self.config.get_account() self.report.ignore_start_time_lineage = self.config.ignore_start_time_lineage self.report.upstream_lineage_in_report = self.config.upstream_lineage_in_report - if not self.report.ignore_start_time_lineage: - self.report.lineage_start_time = self.config.start_time - self.report.lineage_end_time = self.config.end_time self.report.include_technical_schema = self.config.include_technical_schema self.report.include_usage_stats = self.config.include_usage_stats self.report.include_operational_stats = self.config.include_operational_stats self.report.include_column_lineage = self.config.include_column_lineage - if self.report.include_usage_stats or self.config.include_operational_stats: - self.report.window_start_time = self.config.start_time - self.report.window_end_time = self.config.end_time + self.report.stateful_lineage_ingestion_enabled = ( + self.config.enable_stateful_lineage_ingestion + ) + self.report.stateful_usage_ingestion_enabled = ( + self.config.enable_stateful_usage_ingestion + ) + self.report.window_start_time, self.report.window_end_time = ( + self.config.start_time, + self.config.end_time, + ) def inspect_session_metadata(self) -> None: try: @@ -1608,7 +1627,7 @@ def close(self) -> None: StatefulIngestionSourceBase.close(self) self.view_definitions.close() self.sql_parser_schema_resolver.close() - if hasattr(self, "lineage_extractor"): + if self.lineage_extractor: self.lineage_extractor.close() - if hasattr(self, "usage_extractor"): + if self.usage_extractor: self.usage_extractor.close() diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py index 8b2eed36ac6b3..9cb613bde1e9f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py @@ -9,7 +9,7 @@ from sqlalchemy.engine.reflection import Inspector from datahub.configuration.validate_field_rename import pydantic_renamed_field -from datahub.emitter.mcp_builder import ContainerKey +from datahub.emitter.mcp_builder import ContainerKey, DatabaseKey from datahub.ingestion.api.decorators import ( SourceCapability, SupportStatus, @@ -22,10 +22,7 @@ from datahub.ingestion.source.aws.s3_util import make_s3_urn from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes from datahub.ingestion.source.sql.sql_common import SQLAlchemySource -from datahub.ingestion.source.sql.sql_config import ( - SQLAlchemyConfig, - make_sqlalchemy_uri, -) +from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri from datahub.ingestion.source.sql.sql_utils import ( add_table_to_schema_container, gen_database_container, @@ -33,7 +30,7 @@ ) -class AthenaConfig(SQLAlchemyConfig): +class AthenaConfig(SQLCommonConfig): scheme: str = "awsathena+rest" username: Optional[str] = pydantic.Field( default=None, @@ -195,15 +192,12 @@ def gen_schema_containers( database: str, extra_properties: Optional[Dict[str, Any]] = None, ) -> Iterable[MetadataWorkUnit]: - database_container_key = gen_database_key( - database, - platform=self.platform, - 
platform_instance=self.config.platform_instance, - env=self.config.env, + database_container_key = self.get_database_container_key( + db_name=database, schema=schema ) yield from gen_database_container( - database=database, + database=database_container_key.database, database_container_key=database_container_key, sub_types=[DatasetContainerSubTypes.DATABASE], domain_registry=self.domain_registry, @@ -211,7 +205,7 @@ def gen_schema_containers( extra_properties=extra_properties, ) - def get_database_container_key(self, db_name: str, schema: str) -> ContainerKey: + def get_database_container_key(self, db_name: str, schema: str) -> DatabaseKey: # Because our overridden get_allowed_schemas method returns db_name as the schema name, # the db_name and schema here will be the same. Hence, we just ignore the schema parameter. # Based on community feedback, db_name only available if it is explicitly specified in the connection string. diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql.py deleted file mode 100644 index a9afd40fd45b6..0000000000000 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql.py +++ /dev/null @@ -1,278 +0,0 @@ -import logging -import urllib.parse -from typing import Any, Dict, Iterable, List, Optional, Tuple - -import pydantic -import sqlalchemy.dialects.mssql - -# This import verifies that the dependencies are available. -import sqlalchemy_pytds # noqa: F401 -from pydantic.fields import Field -from sqlalchemy import create_engine, inspect -from sqlalchemy.engine.base import Connection -from sqlalchemy.engine.reflection import Inspector - -from datahub.configuration.common import AllowDenyPattern -from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.api.decorators import ( - SourceCapability, - SupportStatus, - capability, - config_class, - platform_name, - support_status, -) -from datahub.ingestion.source.sql.sql_common import ( - SQLAlchemySource, - register_custom_type, -) -from datahub.ingestion.source.sql.sql_config import ( - BasicSQLAlchemyConfig, - make_sqlalchemy_uri, -) -from datahub.metadata.schema_classes import BooleanTypeClass, UnionTypeClass - -logger: logging.Logger = logging.getLogger(__name__) - -register_custom_type(sqlalchemy.dialects.mssql.BIT, BooleanTypeClass) -register_custom_type(sqlalchemy.dialects.mssql.SQL_VARIANT, UnionTypeClass) - - -class SQLServerConfig(BasicSQLAlchemyConfig): - # defaults - host_port: str = Field(default="localhost:1433", description="MSSQL host URL.") - scheme: str = Field(default="mssql+pytds", description="", hidden_from_docs=True) - use_odbc: bool = Field( - default=False, - description="See https://docs.sqlalchemy.org/en/14/dialects/mssql.html#module-sqlalchemy.dialects.mssql.pyodbc.", - ) - uri_args: Dict[str, str] = Field( - default={}, - description="Arguments to URL-encode when connecting. See https://docs.microsoft.com/en-us/sql/connect/odbc/dsn-connection-string-attribute?view=sql-server-ver15.", - ) - database_pattern: AllowDenyPattern = Field( - default=AllowDenyPattern.allow_all(), - description="Regex patterns for databases to filter in ingestion.", - ) - database: Optional[str] = Field( - default=None, - description="database (catalog). 
If set to Null, all databases will be considered for ingestion.", - ) - convert_urns_to_lowercase: bool = Field( - default=False, - description="Enable to convert the SQL Server assets urns to lowercase", - ) - - @pydantic.validator("uri_args") - def passwords_match(cls, v, values, **kwargs): - if values["use_odbc"] and "driver" not in v: - raise ValueError("uri_args must contain a 'driver' option") - elif not values["use_odbc"] and v: - raise ValueError("uri_args is not supported when ODBC is disabled") - return v - - def get_sql_alchemy_url( - self, - uri_opts: Optional[Dict[str, Any]] = None, - current_db: Optional[str] = None, - ) -> str: - if self.use_odbc: - # Ensure that the import is available. - import pyodbc # noqa: F401 - - self.scheme = "mssql+pyodbc" - - uri: str = self.sqlalchemy_uri or make_sqlalchemy_uri( - self.scheme, # type: ignore - self.username, - self.password.get_secret_value() if self.password else None, - self.host_port, # type: ignore - current_db if current_db else self.database, - uri_opts=uri_opts, - ) - if self.use_odbc: - uri = f"{uri}?{urllib.parse.urlencode(self.uri_args)}" - return uri - - -@platform_name("Microsoft SQL Server", id="mssql") -@config_class(SQLServerConfig) -@support_status(SupportStatus.CERTIFIED) -@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") -@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field") -@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") -@capability(SourceCapability.DESCRIPTIONS, "Enabled by default") -@capability( - SourceCapability.USAGE_STATS, - "Not provided by this module, use `bigquery-usage` for that.", - supported=False, -) -@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") -class SQLServerSource(SQLAlchemySource): - """ - This plugin extracts the following: - - - Metadata for databases, schemas, views and tables - - Column types associated with each table/view - - Table, row, and column statistics via optional SQL profiling - - We have two options for the underlying library used to connect to SQL Server: (1) [python-tds](https://github.com/denisenkom/pytds) and (2) [pyodbc](https://github.com/mkleehammer/pyodbc). The TDS library is pure Python and hence easier to install, but only PyODBC supports encrypted connections. 
- """ - - def __init__(self, config: SQLServerConfig, ctx: PipelineContext): - super().__init__(config, ctx, "mssql") - # Cache the table and column descriptions - self.config: SQLServerConfig = config - self.current_database = None - self.table_descriptions: Dict[str, str] = {} - self.column_descriptions: Dict[str, str] = {} - for inspector in self.get_inspectors(): - db_name: str = self.get_db_name(inspector) - with inspector.engine.connect() as conn: - if self.config.use_odbc: - self._add_output_converters(conn) - self._populate_table_descriptions(conn, db_name) - self._populate_column_descriptions(conn, db_name) - - @staticmethod - def _add_output_converters(conn: Connection) -> None: - def handle_sql_variant_as_string(value): - return value.decode("utf-16le") - - # see https://stackoverflow.com/questions/45677374/pandas-pyodbc-odbc-sql-type-150-is-not-yet-supported - # and https://stackoverflow.com/questions/11671170/adding-output-converter-to-pyodbc-connection-in-sqlalchemy - try: - conn.connection.add_output_converter(-150, handle_sql_variant_as_string) - except AttributeError as e: - logger.debug( - f"Failed to mount output converter for MSSQL data type -150 due to {e}" - ) - - def _populate_table_descriptions(self, conn: Connection, db_name: str) -> None: - # see https://stackoverflow.com/questions/5953330/how-do-i-map-the-id-in-sys-extended-properties-to-an-object-name - # also see https://www.mssqltips.com/sqlservertip/5384/working-with-sql-server-extended-properties/ - table_metadata = conn.execute( - """ - SELECT - SCHEMA_NAME(T.SCHEMA_ID) AS schema_name, - T.NAME AS table_name, - EP.VALUE AS table_description - FROM sys.tables AS T - INNER JOIN sys.extended_properties AS EP - ON EP.MAJOR_ID = T.[OBJECT_ID] - AND EP.MINOR_ID = 0 - AND EP.NAME = 'MS_Description' - AND EP.CLASS = 1 - """ - ) - for row in table_metadata: - self.table_descriptions[ - f"{db_name}.{row['schema_name']}.{row['table_name']}" - ] = row["table_description"] - - def _populate_column_descriptions(self, conn: Connection, db_name: str) -> None: - column_metadata = conn.execute( - """ - SELECT - SCHEMA_NAME(T.SCHEMA_ID) AS schema_name, - T.NAME AS table_name, - C.NAME AS column_name , - EP.VALUE AS column_description - FROM sys.tables AS T - INNER JOIN sys.all_columns AS C - ON C.OBJECT_ID = T.[OBJECT_ID] - INNER JOIN sys.extended_properties AS EP - ON EP.MAJOR_ID = T.[OBJECT_ID] - AND EP.MINOR_ID = C.COLUMN_ID - AND EP.NAME = 'MS_Description' - AND EP.CLASS = 1 - """ - ) - for row in column_metadata: - self.column_descriptions[ - f"{db_name}.{row['schema_name']}.{row['table_name']}.{row['column_name']}" - ] = row["column_description"] - - @classmethod - def create(cls, config_dict: Dict, ctx: PipelineContext) -> "SQLServerSource": - config = SQLServerConfig.parse_obj(config_dict) - return cls(config, ctx) - - # override to get table descriptions - def get_table_properties( - self, inspector: Inspector, schema: str, table: str - ) -> Tuple[Optional[str], Dict[str, str], Optional[str]]: - description, properties, location_urn = super().get_table_properties( - inspector, schema, table - ) - # Update description if available. 
- db_name: str = self.get_db_name(inspector) - description = self.table_descriptions.get( - f"{db_name}.{schema}.{table}", description - ) - return description, properties, location_urn - - # override to get column descriptions - def _get_columns( - self, dataset_name: str, inspector: Inspector, schema: str, table: str - ) -> List[Dict]: - columns: List[Dict] = super()._get_columns( - dataset_name, inspector, schema, table - ) - # Update column description if available. - db_name: str = self.get_db_name(inspector) - for column in columns: - description: Optional[str] = self.column_descriptions.get( - f"{db_name}.{schema}.{table}.{column['name']}", - ) - if description: - column["comment"] = description - return columns - - def get_inspectors(self) -> Iterable[Inspector]: - # This method can be overridden in the case that you want to dynamically - # run on multiple databases. - url = self.config.get_sql_alchemy_url() - logger.debug(f"sql_alchemy_url={url}") - engine = create_engine(url, **self.config.options) - with engine.connect() as conn: - if self.config.database and self.config.database != "": - inspector = inspect(conn) - yield inspector - else: - databases = conn.execute( - "SELECT name FROM master.sys.databases WHERE name NOT IN \ - ('master', 'model', 'msdb', 'tempdb', 'Resource', \ - 'distribution' , 'reportserver', 'reportservertempdb'); " - ) - for db in databases: - if self.config.database_pattern.allowed(db["name"]): - url = self.config.get_sql_alchemy_url(current_db=db["name"]) - with create_engine( - url, **self.config.options - ).connect() as conn: - inspector = inspect(conn) - self.current_database = db["name"] - yield inspector - - def get_identifier( - self, *, schema: str, entity: str, inspector: Inspector, **kwargs: Any - ) -> str: - regular = f"{schema}.{entity}" - - qualified_table_name = regular - - if self.config.database: - if self.config.database_alias: - qualified_table_name = f"{self.config.database_alias}.{regular}" - else: - qualified_table_name = f"{self.config.database}.{regular}" - - if self.current_database: - qualified_table_name = f"{self.current_database}.{regular}" - - return ( - qualified_table_name.lower() - if self.config.convert_urns_to_lowercase - else qualified_table_name - ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/__init__.py new file mode 100644 index 0000000000000..8db89505a9cf6 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/__init__.py @@ -0,0 +1 @@ +from datahub.ingestion.source.sql.mssql.source import SQLServerConfig, SQLServerSource diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py new file mode 100644 index 0000000000000..8aeb5421891aa --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py @@ -0,0 +1,239 @@ +from dataclasses import dataclass, field +from typing import Dict, List, Optional, Union + +from datahub.emitter.mce_builder import make_data_flow_urn, make_data_job_urn +from datahub.metadata.schema_classes import ( + DataFlowInfoClass, + DataJobInfoClass, + DataJobInputOutputClass, +) + + +@dataclass +class ProcedureDependency: + db: str + schema: str + name: str + type: str + env: str + server: str + source: str = "mssql" + + +@dataclass +class ProcedureLineageStream: + dependencies: List[ProcedureDependency] + + @property + def as_property(self) -> 
Dict[str, str]: + return { + f"{dep.db}.{dep.schema}.{dep.name}": dep.type for dep in self.dependencies + } + + +@dataclass +class MSSQLJob: + db: str + platform_instance: str + name: str + env: str + source: str = "mssql" + type: str = "JOB" + + @property + def formatted_name(self) -> str: + return f"{self.formatted_platform_instance}.{self.name.replace(',', '-')}" + + @property + def full_type(self) -> str: + return f"({self.source},{self.formatted_name},{self.env})" + + @property + def orchestrator(self) -> str: + return self.source + + @property + def formatted_platform_instance(self) -> str: + return self.platform_instance.replace(".", "/") + + @property + def cluster(self) -> str: + return f"{self.env}" + + +@dataclass +class MSSQLProceduresContainer: + db: str + platform_instance: str + name: str + env: str + source: str = "mssql" + type: str = "JOB" + + @property + def formatted_name(self) -> str: + return f"{self.formatted_platform_instance}.{self.name.replace(',', '-')}" + + @property + def orchestrator(self) -> str: + return self.source + + @property + def formatted_platform_instance(self) -> str: + return self.platform_instance.replace(".", "/") + + @property + def cluster(self) -> str: + return f"{self.env}" + + @property + def full_type(self) -> str: + return f"({self.source},{self.name},{self.env})" + + +@dataclass +class ProcedureParameter: + name: str + type: str + + @property + def properties(self) -> Dict[str, str]: + return {"type": self.type} + + +@dataclass +class StoredProcedure: + db: str + schema: str + name: str + flow: Union[MSSQLJob, MSSQLProceduresContainer] + type: str = "STORED_PROCEDURE" + source: str = "mssql" + + @property + def full_type(self) -> str: + return self.source.upper() + "_" + self.type + + @property + def formatted_name(self) -> str: + return self.name.replace(",", "-") + + @property + def full_name(self) -> str: + return f"{self.db}.{self.schema}.{self.formatted_name}" + + @property + def escape_full_name(self) -> str: + return f"[{self.db}].[{self.schema}].[{self.formatted_name}]" + + +@dataclass +class JobStep: + job_name: str + step_name: str + flow: MSSQLJob + type: str = "JOB_STEP" + source: str = "mssql" + + @property + def formatted_step(self) -> str: + return self.step_name.replace(",", "-").replace(" ", "_").lower() + + @property + def formatted_name(self) -> str: + return self.job_name.replace(",", "-") + + @property + def full_type(self) -> str: + return self.source.upper() + "_" + self.type + + @property + def full_name(self) -> str: + return f"{self.formatted_name}.{self.formatted_name}" + + +@dataclass +class MSSQLDataJob: + entity: Union[StoredProcedure, JobStep] + type: str = "dataJob" + source: str = "mssql" + external_url: str = "" + description: Optional[str] = None + status: Optional[str] = None + incoming: List[str] = field(default_factory=list) + outgoing: List[str] = field(default_factory=list) + input_jobs: List[str] = field(default_factory=list) + job_properties: Dict[str, str] = field(default_factory=dict) + + @property + def urn(self) -> str: + return make_data_job_urn( + orchestrator=self.entity.flow.orchestrator, + flow_id=self.entity.flow.formatted_name, + job_id=self.entity.formatted_name, + cluster=self.entity.flow.cluster, + ) + + def add_property( + self, + name: str, + value: str, + ) -> None: + self.job_properties[name] = value + + @property + def valued_properties(self) -> Dict[str, str]: + if self.job_properties: + return {k: v for k, v in self.job_properties.items() if v is not None} + return 
self.job_properties + + @property + def as_datajob_input_output_aspect(self) -> DataJobInputOutputClass: + return DataJobInputOutputClass( + inputDatasets=sorted(self.incoming), + outputDatasets=sorted(self.outgoing), + inputDatajobs=sorted(self.input_jobs), + ) + + @property + def as_datajob_info_aspect(self) -> DataJobInfoClass: + return DataJobInfoClass( + name=self.entity.full_name, + type=self.entity.full_type, + description=self.description, + customProperties=self.valued_properties, + externalUrl=self.external_url, + status=self.status, + ) + + +@dataclass +class MSSQLDataFlow: + entity: Union[MSSQLJob, MSSQLProceduresContainer] + type: str = "dataFlow" + source: str = "mssql" + external_url: str = "" + flow_properties: Dict[str, str] = field(default_factory=dict) + + def add_property( + self, + name: str, + value: str, + ) -> None: + self.flow_properties[name] = value + + @property + def urn(self) -> str: + return make_data_flow_urn( + orchestrator=self.entity.orchestrator, + flow_id=self.entity.formatted_name, + cluster=self.entity.cluster, + ) + + @property + def as_dataflow_info_aspect(self) -> DataFlowInfoClass: + return DataFlowInfoClass( + name=self.entity.formatted_name, + customProperties=self.flow_properties, + externalUrl=self.external_url, + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py new file mode 100644 index 0000000000000..3c7701d93edeb --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py @@ -0,0 +1,665 @@ +import logging +import re +import urllib.parse +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union + +import pydantic +import sqlalchemy.dialects.mssql + +# This import verifies that the dependencies are available. 
+import sqlalchemy_pytds # noqa: F401 +from pydantic.fields import Field +from sqlalchemy import create_engine, inspect +from sqlalchemy.engine.base import Connection +from sqlalchemy.engine.reflection import Inspector +from sqlalchemy.exc import ProgrammingError, ResourceClosedError + +from datahub.configuration.common import AllowDenyPattern +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.sql.mssql.job_models import ( + JobStep, + MSSQLDataFlow, + MSSQLDataJob, + MSSQLJob, + MSSQLProceduresContainer, + ProcedureDependency, + ProcedureLineageStream, + ProcedureParameter, + StoredProcedure, +) +from datahub.ingestion.source.sql.sql_common import ( + SQLAlchemySource, + SqlWorkUnit, + register_custom_type, +) +from datahub.ingestion.source.sql.sql_config import ( + BasicSQLAlchemyConfig, + make_sqlalchemy_uri, +) +from datahub.metadata.schema_classes import BooleanTypeClass, UnionTypeClass + +logger: logging.Logger = logging.getLogger(__name__) + +register_custom_type(sqlalchemy.dialects.mssql.BIT, BooleanTypeClass) +register_custom_type(sqlalchemy.dialects.mssql.SQL_VARIANT, UnionTypeClass) + + +class SQLServerConfig(BasicSQLAlchemyConfig): + # defaults + host_port: str = Field(default="localhost:1433", description="MSSQL host URL.") + scheme: str = Field(default="mssql+pytds", description="", hidden_from_docs=True) + include_stored_procedures: bool = Field( + default=True, + description="Include ingest of stored procedures. Requires access to the 'sys' schema.", + ) + include_stored_procedures_code: bool = Field( + default=True, description="Include information about object code." + ) + include_jobs: bool = Field( + default=True, + description="Include ingest of MSSQL Jobs. Requires access to the 'msdb' and 'sys' schema.", + ) + include_descriptions: bool = Field( + default=True, description="Include table descriptions information." + ) + use_odbc: bool = Field( + default=False, + description="See https://docs.sqlalchemy.org/en/14/dialects/mssql.html#module-sqlalchemy.dialects.mssql.pyodbc.", + ) + uri_args: Dict[str, str] = Field( + default={}, + description="Arguments to URL-encode when connecting. See https://docs.microsoft.com/en-us/sql/connect/odbc/dsn-connection-string-attribute?view=sql-server-ver15.", + ) + database_pattern: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="Regex patterns for databases to filter in ingestion.", + ) + database: Optional[str] = Field( + default=None, + description="database (catalog). If set to Null, all databases will be considered for ingestion.", + ) + convert_urns_to_lowercase: bool = Field( + default=False, + description="Enable to convert the SQL Server assets urns to lowercase", + ) + + @pydantic.validator("uri_args") + def passwords_match(cls, v, values, **kwargs): + if values["use_odbc"] and "driver" not in v: + raise ValueError("uri_args must contain a 'driver' option") + elif not values["use_odbc"] and v: + raise ValueError("uri_args is not supported when ODBC is disabled") + return v + + def get_sql_alchemy_url( + self, + uri_opts: Optional[Dict[str, Any]] = None, + current_db: Optional[str] = None, + ) -> str: + if self.use_odbc: + # Ensure that the import is available. 
+ import pyodbc # noqa: F401 + + self.scheme = "mssql+pyodbc" + + uri: str = self.sqlalchemy_uri or make_sqlalchemy_uri( + self.scheme, # type: ignore + self.username, + self.password.get_secret_value() if self.password else None, + self.host_port, # type: ignore + current_db if current_db else self.database, + uri_opts=uri_opts, + ) + if self.use_odbc: + uri = f"{uri}?{urllib.parse.urlencode(self.uri_args)}" + return uri + + @property + def host(self): + return self.platform_instance or self.host_port.split(":")[0] + + @property + def db(self): + return self.database_alias or self.database + + +@platform_name("Microsoft SQL Server", id="mssql") +@config_class(SQLServerConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") +@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field") +@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") +@capability(SourceCapability.DESCRIPTIONS, "Enabled by default") +@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") +class SQLServerSource(SQLAlchemySource): + """ + This plugin extracts the following: + - Metadata for databases, schemas, views and tables + - Column types associated with each table/view + - Table, row, and column statistics via optional SQL profiling + We have two options for the underlying library used to connect to SQL Server: (1) [python-tds](https://github.com/denisenkom/pytds) and (2) [pyodbc](https://github.com/mkleehammer/pyodbc). The TDS library is pure Python and hence easier to install, but only PyODBC supports encrypted connections. + """ + + def __init__(self, config: SQLServerConfig, ctx: PipelineContext): + super().__init__(config, ctx, "mssql") + # Cache the table and column descriptions + self.config: SQLServerConfig = config + self.current_database = None + self.table_descriptions: Dict[str, str] = {} + self.column_descriptions: Dict[str, str] = {} + if self.config.include_descriptions: + for inspector in self.get_inspectors(): + db_name: str = self.get_db_name(inspector) + with inspector.engine.connect() as conn: + if self.config.use_odbc: + self._add_output_converters(conn) + self._populate_table_descriptions(conn, db_name) + self._populate_column_descriptions(conn, db_name) + + @staticmethod + def _add_output_converters(conn: Connection) -> None: + def handle_sql_variant_as_string(value): + try: + return value.decode("utf-16le") + except UnicodeDecodeError: + return value.decode("Windows-1251") + + # see https://stackoverflow.com/questions/45677374/pandas-pyodbc-odbc-sql-type-150-is-not-yet-supported + # and https://stackoverflow.com/questions/11671170/adding-output-converter-to-pyodbc-connection-in-sqlalchemy + try: + conn.connection.add_output_converter(-150, handle_sql_variant_as_string) + except AttributeError as e: + logger.debug( + f"Failed to mount output converter for MSSQL data type -150 due to {e}" + ) + + def _populate_table_descriptions(self, conn: Connection, db_name: str) -> None: + # see https://stackoverflow.com/questions/5953330/how-do-i-map-the-id-in-sys-extended-properties-to-an-object-name + # also see https://www.mssqltips.com/sqlservertip/5384/working-with-sql-server-extended-properties/ + table_metadata = conn.execute( + """ + SELECT + SCHEMA_NAME(T.SCHEMA_ID) AS schema_name, + T.NAME AS table_name, + EP.VALUE AS table_description + FROM sys.tables AS T + INNER JOIN sys.extended_properties AS EP + ON EP.MAJOR_ID = T.[OBJECT_ID] + AND EP.MINOR_ID = 0 
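# --- Editor's illustrative sketch (not part of this patch) ---
# The decode fallback used by the SQL_VARIANT output converter above, shown as a
# standalone function so it can be exercised without a live pyodbc connection.
def decode_sql_variant(value: bytes) -> str:
    try:
        return value.decode("utf-16le")
    except UnicodeDecodeError:
        # Some servers hand back single-byte encoded payloads instead of UTF-16.
        return value.decode("windows-1251")


assert decode_sql_variant("table description".encode("utf-16le")) == "table description"
assert decode_sql_variant(b"abc") == "abc"  # odd-length bytes fall back to windows-1251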
+ AND EP.NAME = 'MS_Description' + AND EP.CLASS = 1 + """ + ) + for row in table_metadata: + self.table_descriptions[ + f"{db_name}.{row['schema_name']}.{row['table_name']}" + ] = row["table_description"] + + def _populate_column_descriptions(self, conn: Connection, db_name: str) -> None: + column_metadata = conn.execute( + """ + SELECT + SCHEMA_NAME(T.SCHEMA_ID) AS schema_name, + T.NAME AS table_name, + C.NAME AS column_name , + EP.VALUE AS column_description + FROM sys.tables AS T + INNER JOIN sys.all_columns AS C + ON C.OBJECT_ID = T.[OBJECT_ID] + INNER JOIN sys.extended_properties AS EP + ON EP.MAJOR_ID = T.[OBJECT_ID] + AND EP.MINOR_ID = C.COLUMN_ID + AND EP.NAME = 'MS_Description' + AND EP.CLASS = 1 + """ + ) + for row in column_metadata: + self.column_descriptions[ + f"{db_name}.{row['schema_name']}.{row['table_name']}.{row['column_name']}" + ] = row["column_description"] + + @classmethod + def create(cls, config_dict: Dict, ctx: PipelineContext) -> "SQLServerSource": + config = SQLServerConfig.parse_obj(config_dict) + return cls(config, ctx) + + # override to get table descriptions + def get_table_properties( + self, inspector: Inspector, schema: str, table: str + ) -> Tuple[Optional[str], Dict[str, str], Optional[str]]: + description, properties, location_urn = super().get_table_properties( + inspector, schema, table + ) + # Update description if available. + db_name: str = self.get_db_name(inspector) + description = self.table_descriptions.get( + f"{db_name}.{schema}.{table}", description + ) + return description, properties, location_urn + + # override to get column descriptions + def _get_columns( + self, dataset_name: str, inspector: Inspector, schema: str, table: str + ) -> List[Dict]: + columns: List[Dict] = super()._get_columns( + dataset_name, inspector, schema, table + ) + # Update column description if available. 
+ db_name: str = self.get_db_name(inspector) + for column in columns: + description: Optional[str] = self.column_descriptions.get( + f"{db_name}.{schema}.{table}.{column['name']}", + ) + if description: + column["comment"] = description + return columns + + def get_database_level_workunits( + self, + inspector: Inspector, + database: str, + ) -> Iterable[MetadataWorkUnit]: + yield from super().get_database_level_workunits( + inspector=inspector, + database=database, + ) + if self.config.include_jobs: + try: + yield from self.loop_jobs(inspector, self.config) + except Exception as e: + self.report.report_failure( + "jobs", + f"Failed to list jobs due to error {e}", + ) + + def get_schema_level_workunits( + self, + inspector: Inspector, + schema: str, + database: str, + ) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]: + yield from super().get_schema_level_workunits( + inspector=inspector, + schema=schema, + database=database, + ) + if self.config.include_stored_procedures: + try: + yield from self.loop_stored_procedures(inspector, schema, self.config) + except Exception as e: + self.report.report_failure( + "jobs", + f"Failed to list jobs due to error {e}", + ) + + def _get_jobs(self, conn: Connection, db_name: str) -> Dict[str, Dict[str, Any]]: + jobs_data = conn.execute( + f""" + SELECT + job.job_id, + job.name, + job.description, + job.date_created, + job.date_modified, + steps.step_id, + steps.step_name, + steps.subsystem, + steps.command, + steps.database_name + FROM + msdb.dbo.sysjobs job + INNER JOIN + msdb.dbo.sysjobsteps steps + ON + job.job_id = steps.job_id + where database_name = '{db_name}' + """ + ) + jobs: Dict[str, Dict[str, Any]] = {} + for row in jobs_data: + step_data = dict( + job_id=row["job_id"], + job_name=row["name"], + description=row["description"], + date_created=row["date_created"], + date_modified=row["date_modified"], + step_id=row["step_id"], + step_name=row["step_name"], + subsystem=row["subsystem"], + command=row["command"], + ) + if row["name"] in jobs: + jobs[row["name"]][row["step_id"]] = step_data + else: + jobs[row["name"]] = {row["step_id"]: step_data} + return jobs + + def loop_jobs( + self, + inspector: Inspector, + sql_config: SQLServerConfig, + ) -> Iterable[MetadataWorkUnit]: + """ + Loop MS SQL jobs as dataFlow-s. + :return: + """ + db_name = self.get_db_name(inspector) + with inspector.engine.connect() as conn: + jobs = self._get_jobs(conn, db_name) + for job_name, job_steps in jobs.items(): + job = MSSQLJob( + name=job_name, + env=sql_config.env, + db=db_name, + platform_instance=sql_config.host, + ) + data_flow = MSSQLDataFlow(entity=job) + yield from self.construct_flow_workunits(data_flow=data_flow) + yield from self.loop_job_steps(job, job_steps) + + def loop_job_steps( + self, job: MSSQLJob, job_steps: Dict[str, Any] + ) -> Iterable[MetadataWorkUnit]: + for step_id, step_data in job_steps.items(): + step = JobStep( + job_name=job.formatted_name, + step_name=step_data["step_name"], + flow=job, + ) + data_job = MSSQLDataJob(entity=step) + for data_name, data_value in step_data.items(): + data_job.add_property(name=data_name, value=str(data_value)) + yield from self.construct_job_workunits(data_job) + + def loop_stored_procedures( # noqa: C901 + self, + inspector: Inspector, + schema: str, + sql_config: SQLServerConfig, + ) -> Iterable[MetadataWorkUnit]: + """ + Loop schema data for get stored procedures as dataJob-s. 
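# --- Editor's illustrative sketch (not part of this patch) ---
# The grouping performed by _get_jobs() above: one result row per (job, step) is
# collapsed into {job_name: {step_id: step_row}}. The sample rows are hypothetical.
from typing import Any, Dict, List

rows: List[Dict[str, Any]] = [
    {"name": "Nightly ETL", "step_id": 1, "step_name": "extract", "command": "EXEC dbo.Extract"},
    {"name": "Nightly ETL", "step_id": 2, "step_name": "load", "command": "EXEC dbo.Load"},
]

jobs: Dict[str, Dict[int, Dict[str, Any]]] = {}
for row in rows:
    jobs.setdefault(row["name"], {})[row["step_id"]] = row

assert sorted(jobs["Nightly ETL"]) == [1, 2]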
+ """ + db_name = self.get_db_name(inspector) + procedure_flow_name = f"{db_name}.{schema}.stored_procedures" + mssql_default_job = MSSQLProceduresContainer( + name=procedure_flow_name, + env=sql_config.env, + db=db_name, + platform_instance=sql_config.host, + ) + data_flow = MSSQLDataFlow(entity=mssql_default_job) + with inspector.engine.connect() as conn: + procedures_data_list = self._get_stored_procedures(conn, db_name, schema) + procedures = [ + StoredProcedure(flow=mssql_default_job, **procedure_data) + for procedure_data in procedures_data_list + ] + if procedures: + yield from self.construct_flow_workunits(data_flow=data_flow) + for procedure in procedures: + upstream = self._get_procedure_upstream(conn, procedure) + downstream = self._get_procedure_downstream(conn, procedure) + data_job = MSSQLDataJob( + entity=procedure, + ) + # TODO: because of this upstream and downstream are more dependencies, + # can't be used as DataJobInputOutput. + # Should be reorganized into lineage. + data_job.add_property("procedure_depends_on", str(upstream.as_property)) + data_job.add_property( + "depending_on_procedure", str(downstream.as_property) + ) + procedure_definition, procedure_code = self._get_procedure_code( + conn, procedure + ) + if procedure_definition: + data_job.add_property("definition", procedure_definition) + if sql_config.include_stored_procedures_code and procedure_code: + data_job.add_property("code", procedure_code) + procedure_inputs = self._get_procedure_inputs(conn, procedure) + properties = self._get_procedure_properties(conn, procedure) + data_job.add_property( + "input parameters", str([param.name for param in procedure_inputs]) + ) + for param in procedure_inputs: + data_job.add_property( + f"parameter {param.name}", str(param.properties) + ) + for property_name, property_value in properties.items(): + data_job.add_property(property_name, str(property_value)) + yield from self.construct_job_workunits(data_job) + + @staticmethod + def _get_procedure_downstream( + conn: Connection, procedure: StoredProcedure + ) -> ProcedureLineageStream: + downstream_data = conn.execute( + f""" + SELECT DISTINCT OBJECT_SCHEMA_NAME ( referencing_id ) AS [schema], + OBJECT_NAME(referencing_id) AS [name], + o.type_desc AS [type] + FROM sys.sql_expression_dependencies AS sed + INNER JOIN sys.objects AS o ON sed.referencing_id = o.object_id + left join sys.objects o1 on sed.referenced_id = o1.object_id + WHERE referenced_id = OBJECT_ID(N'{procedure.escape_full_name}') + AND o.type_desc in ('TABLE_TYPE', 'VIEW', 'USER_TABLE') + """ + ) + downstream_dependencies = [] + for row in downstream_data: + downstream_dependencies.append( + ProcedureDependency( + db=procedure.db, + schema=row["schema"], + name=row["name"], + type=row["type"], + env=procedure.flow.env, + server=procedure.flow.platform_instance, + ) + ) + return ProcedureLineageStream(dependencies=downstream_dependencies) + + @staticmethod + def _get_procedure_upstream( + conn: Connection, procedure: StoredProcedure + ) -> ProcedureLineageStream: + upstream_data = conn.execute( + f""" + SELECT DISTINCT + coalesce(lower(referenced_database_name), db_name()) AS db, + referenced_schema_name AS [schema], + referenced_entity_name AS [name], + o1.type_desc AS [type] + FROM sys.sql_expression_dependencies AS sed + INNER JOIN sys.objects AS o ON sed.referencing_id = o.object_id + left join sys.objects o1 on sed.referenced_id = o1.object_id + WHERE referencing_id = OBJECT_ID(N'{procedure.escape_full_name}') + AND referenced_schema_name is not null 
+ AND o1.type_desc in ('TABLE_TYPE', 'VIEW', 'SQL_STORED_PROCEDURE', 'USER_TABLE') + """ + ) + upstream_dependencies = [] + for row in upstream_data: + upstream_dependencies.append( + ProcedureDependency( + db=row["db"], + schema=row["schema"], + name=row["name"], + type=row["type"], + env=procedure.flow.env, + server=procedure.flow.platform_instance, + ) + ) + return ProcedureLineageStream(dependencies=upstream_dependencies) + + @staticmethod + def _get_procedure_inputs( + conn: Connection, procedure: StoredProcedure + ) -> List[ProcedureParameter]: + inputs_data = conn.execute( + f""" + SELECT + name, + type_name(user_type_id) AS 'type' + FROM sys.parameters + WHERE object_id = object_id('{procedure.escape_full_name}') + """ + ) + inputs_list = [] + for row in inputs_data: + inputs_list.append(ProcedureParameter(name=row["name"], type=row["type"])) + return inputs_list + + @staticmethod + def _get_procedure_code( + conn: Connection, procedure: StoredProcedure + ) -> Tuple[Optional[str], Optional[str]]: + query = f"EXEC [{procedure.db}].dbo.sp_helptext '{procedure.full_name}'" + try: + code_data = conn.execute(query) + except ProgrammingError: + logger.warning( + "Denied permission for read text from procedure '%s'", + procedure.full_name, + ) + return None, None + code_list = [] + code_slice_index = 0 + code_slice_text = "create procedure" + try: + for index, row in enumerate(code_data): + code_list.append(row["Text"]) + if code_slice_text in re.sub(" +", " ", row["Text"].lower()).strip(): + code_slice_index = index + definition = "\n".join(code_list[:code_slice_index]) + code = "\n".join(code_list[code_slice_index:]) + except ResourceClosedError: + logger.warning( + "Connection was closed from procedure '%s'", + procedure.full_name, + ) + return None, None + return definition, code + + @staticmethod + def _get_procedure_properties( + conn: Connection, procedure: StoredProcedure + ) -> Dict[str, Any]: + properties_data = conn.execute( + f""" + SELECT + create_date as date_created, + modify_date as date_modified + FROM sys.procedures + WHERE object_id = object_id('{procedure.full_name}') + """ + ) + properties = {} + for row in properties_data: + properties = dict( + date_created=row["date_created"], date_modified=row["date_modified"] + ) + return properties + + @staticmethod + def _get_stored_procedures( + conn: Connection, db_name: str, schema: str + ) -> List[Dict[str, str]]: + stored_procedures_data = conn.execute( + f""" + SELECT + pr.name as procedure_name, + s.name as schema_name + FROM + [{db_name}].[sys].[procedures] pr + INNER JOIN + [{db_name}].[sys].[schemas] s ON pr.schema_id = s.schema_id + where s.name = '{schema}' + """ + ) + procedures_list = [] + for row in stored_procedures_data: + procedures_list.append( + dict(db=db_name, schema=row["schema_name"], name=row["procedure_name"]) + ) + return procedures_list + + def construct_job_workunits( + self, + data_job: MSSQLDataJob, + ) -> Iterable[MetadataWorkUnit]: + yield MetadataChangeProposalWrapper( + entityUrn=data_job.urn, + aspect=data_job.as_datajob_info_aspect, + ).as_workunit() + + yield MetadataChangeProposalWrapper( + entityUrn=data_job.urn, + aspect=data_job.as_datajob_input_output_aspect, + ).as_workunit() + # TODO: Add SubType when it appear + + def construct_flow_workunits( + self, + data_flow: MSSQLDataFlow, + ) -> Iterable[MetadataWorkUnit]: + yield MetadataChangeProposalWrapper( + entityUrn=data_flow.urn, + aspect=data_flow.as_dataflow_info_aspect, + ).as_workunit() + # TODO: Add SubType when it appear + + def 
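# --- Editor's illustrative sketch (not part of this patch) ---
# The split applied by _get_procedure_code() to sp_helptext output: everything before
# the last line matching "create procedure" is kept as the definition header, the rest
# as the code body. The sample procedure text is hypothetical.
import re

lines = [
    "-- Author: data team",
    "-- Loads the Orders fact table",
    "CREATE   PROCEDURE dbo.LoadOrders",
    "AS",
    "BEGIN",
    "    SELECT 1;",
    "END",
]

split_at = 0
for index, text in enumerate(lines):
    if "create procedure" in re.sub(" +", " ", text.lower()).strip():
        split_at = index

definition = "\n".join(lines[:split_at])   # leading comment header
code = "\n".join(lines[split_at:])         # CREATE PROCEDURE ... body
print(definition)
print(code)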
get_inspectors(self) -> Iterable[Inspector]: + # This method can be overridden in the case that you want to dynamically + # run on multiple databases. + url = self.config.get_sql_alchemy_url() + logger.debug(f"sql_alchemy_url={url}") + engine = create_engine(url, **self.config.options) + with engine.connect() as conn: + if self.config.database and self.config.database != "": + inspector = inspect(conn) + yield inspector + else: + databases = conn.execute( + "SELECT name FROM master.sys.databases WHERE name NOT IN \ + ('master', 'model', 'msdb', 'tempdb', 'Resource', \ + 'distribution' , 'reportserver', 'reportservertempdb'); " + ) + for db in databases: + if self.config.database_pattern.allowed(db["name"]): + url = self.config.get_sql_alchemy_url(current_db=db["name"]) + with create_engine( + url, **self.config.options + ).connect() as conn: + inspector = inspect(conn) + self.current_database = db["name"] + yield inspector + + def get_identifier( + self, *, schema: str, entity: str, inspector: Inspector, **kwargs: Any + ) -> str: + regular = f"{schema}.{entity}" + qualified_table_name = regular + if self.config.database: + if self.config.database_alias: + qualified_table_name = f"{self.config.database_alias}.{regular}" + else: + qualified_table_name = f"{self.config.database}.{regular}" + if self.current_database: + qualified_table_name = f"{self.current_database}.{regular}" + return ( + qualified_table_name.lower() + if self.config.convert_urns_to_lowercase + else qualified_table_name + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mysql.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mysql.py index 3bec07f6a13d5..e4969ce946f78 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mysql.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mysql.py @@ -19,6 +19,7 @@ make_sqlalchemy_type, register_custom_type, ) +from datahub.ingestion.source.sql.sql_config import SQLAlchemyConnectionConfig from datahub.ingestion.source.sql.two_tier_sql_source import ( TwoTierSQLAlchemyConfig, TwoTierSQLAlchemySource, @@ -45,11 +46,13 @@ base.ischema_names["decimal128"] = DECIMAL128 -class MySQLConfig(TwoTierSQLAlchemyConfig): +class MySQLConnectionConfig(SQLAlchemyConnectionConfig): # defaults host_port = Field(default="localhost:3306", description="MySQL host URL.") scheme = "mysql+pymysql" + +class MySQLConfig(MySQLConnectionConfig, TwoTierSQLAlchemyConfig): def get_identifier(self, *, schema: str, table: str) -> str: regular = f"{schema}.{table}" if self.database_alias: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/presto_on_hive.py b/metadata-ingestion/src/datahub/ingestion/source/sql/presto_on_hive.py index 1f3092888054e..ceb9ecacb25d2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/presto_on_hive.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/presto_on_hive.py @@ -38,7 +38,7 @@ ) from datahub.ingestion.source.sql.sql_config import ( BasicSQLAlchemyConfig, - SQLAlchemyConfig, + SQLCommonConfig, make_sqlalchemy_uri, ) from datahub.ingestion.source.sql.sql_utils import ( @@ -453,7 +453,7 @@ def loop_tables( self, inspector: Inspector, schema: str, - sql_config: SQLAlchemyConfig, + sql_config: SQLCommonConfig, ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: # In mysql we get tables for all databases and we should filter out the non metastore one if ( @@ -718,7 +718,7 @@ def loop_views( self, inspector: Inspector, schema: str, - sql_config: SQLAlchemyConfig, + sql_config: SQLCommonConfig, ) -> 
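# --- Editor's illustrative sketch (not part of this patch) ---
# The qualification rules implemented by get_identifier() above, written as a pure
# function. Parameters mirror the config fields; the sample values are hypothetical.
from typing import Optional


def qualify(
    schema: str,
    entity: str,
    configured_db: Optional[str],
    database_alias: Optional[str],
    current_db: Optional[str],
    lowercase: bool,
) -> str:
    name = f"{schema}.{entity}"
    if configured_db:
        name = f"{database_alias or configured_db}.{schema}.{entity}"
    if current_db:
        # Multi-database mode: the database currently being inspected wins.
        name = f"{current_db}.{schema}.{entity}"
    return name.lower() if lowercase else name


assert qualify("dbo", "Orders", None, None, "DemoData", lowercase=True) == "demodata.dbo.orders"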
Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: assert isinstance(sql_config, PrestoOnHiveConfig) @@ -904,7 +904,7 @@ def _set_partition_key(self, columns, schema_fields): class SQLAlchemyClient: - def __init__(self, config: SQLAlchemyConfig): + def __init__(self, config: SQLCommonConfig): self.config = config self.connection = self._get_connection() diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py index 64dca217e694d..b5458a42192fc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py @@ -38,7 +38,7 @@ DatasetContainerSubTypes, DatasetSubTypes, ) -from datahub.ingestion.source.sql.sql_config import SQLAlchemyConfig +from datahub.ingestion.source.sql.sql_config import SQLCommonConfig from datahub.ingestion.source.sql.sql_utils import ( add_table_to_schema_container, downgrade_schema_from_v2, @@ -331,7 +331,7 @@ class ProfileMetadata: class SQLAlchemySource(StatefulIngestionSourceBase): """A Base class for all SQL Sources that use SQLAlchemy to extend""" - def __init__(self, config: SQLAlchemyConfig, ctx: PipelineContext, platform: str): + def __init__(self, config: SQLCommonConfig, ctx: PipelineContext, platform: str): super(SQLAlchemySource, self).__init__(config, ctx) self.config = config self.platform = platform @@ -478,6 +478,27 @@ def add_table_to_schema_container( parent_container_key=schema_container_key, ) + def get_database_level_workunits( + self, + inspector: Inspector, + database: str, + ) -> Iterable[MetadataWorkUnit]: + yield from self.gen_database_containers(database=database) + + def get_schema_level_workunits( + self, + inspector: Inspector, + schema: str, + database: str, + ) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]: + yield from self.gen_schema_containers(schema=schema, database=database) + + if self.config.include_tables: + yield from self.loop_tables(inspector, schema, self.config) + + if self.config.include_views: + yield from self.loop_views(inspector, schema, self.config) + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), @@ -516,27 +537,20 @@ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit ) db_name = self.get_db_name(inspector) - yield from self.gen_database_containers( + yield from self.get_database_level_workunits( + inspector=inspector, database=db_name, ) for schema in self.get_allowed_schemas(inspector, db_name): self.add_information_for_schema(inspector, schema) - yield from self.gen_schema_containers( - database=db_name, + yield from self.get_schema_level_workunits( + inspector=inspector, schema=schema, - extra_properties=self.get_schema_properties( - inspector=inspector, schema=schema, database=db_name - ), + database=db_name, ) - if sql_config.include_tables: - yield from self.loop_tables(inspector, schema, sql_config) - - if sql_config.include_views: - yield from self.loop_views(inspector, schema, sql_config) - if profiler: profile_requests += list( self.loop_profiler_requests(inspector, schema, sql_config) @@ -599,7 +613,7 @@ def loop_tables( # noqa: C901 self, inspector: Inspector, schema: str, - sql_config: SQLAlchemyConfig, + sql_config: SQLCommonConfig, ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: tables_seen: Set[str] = set() try: @@ -647,7 +661,7 @@ def _process_table( inspector: Inspector, schema: str, table: str, - sql_config: 
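# --- Editor's illustrative sketch (not part of this patch) ---
# The template-method shape of the new get_database_level_workunits /
# get_schema_level_workunits hooks: the base source yields the generic container,
# table and view work units, and a platform source extends the same generator with
# extra items (e.g. MSSQL jobs and stored procedures). Class names are hypothetical.
from typing import Iterator


class BaseSqlSourceSketch:
    def get_schema_level_workunits(self, schema: str) -> Iterator[str]:
        yield f"schema-container:{schema}"
        yield f"tables:{schema}"
        yield f"views:{schema}"


class MssqlLikeSourceSketch(BaseSqlSourceSketch):
    def get_schema_level_workunits(self, schema: str) -> Iterator[str]:
        yield from super().get_schema_level_workunits(schema)
        yield f"stored-procedures:{schema}"  # platform-specific extras


assert list(MssqlLikeSourceSketch().get_schema_level_workunits("dbo"))[-1] == "stored-procedures:dbo"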
SQLAlchemyConfig, + sql_config: SQLCommonConfig, ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: columns = self._get_columns(dataset_name, inspector, schema, table) dataset_urn = make_dataset_urn_with_platform_instance( @@ -867,7 +881,7 @@ def loop_views( self, inspector: Inspector, schema: str, - sql_config: SQLAlchemyConfig, + sql_config: SQLCommonConfig, ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: try: for view in inspector.get_view_names(schema): @@ -904,7 +918,7 @@ def _process_view( inspector: Inspector, schema: str, view: str, - sql_config: SQLAlchemyConfig, + sql_config: SQLCommonConfig, ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: try: columns = inspector.get_columns(view, schema) @@ -1026,7 +1040,7 @@ def generate_profile_candidates( def is_dataset_eligible_for_profiling( self, dataset_name: str, - sql_config: SQLAlchemyConfig, + sql_config: SQLCommonConfig, inspector: Inspector, profile_candidates: Optional[List[str]], ) -> bool: @@ -1042,7 +1056,7 @@ def loop_profiler_requests( self, inspector: Inspector, schema: str, - sql_config: SQLAlchemyConfig, + sql_config: SQLCommonConfig, ) -> Iterable["GEProfilerRequest"]: from datahub.ingestion.source.ge_data_profiler import GEProfilerRequest diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py index 76d1dbd14a7db..8f1e04b915f3b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py @@ -6,7 +6,7 @@ import pydantic from pydantic import Field -from datahub.configuration.common import AllowDenyPattern +from datahub.configuration.common import AllowDenyPattern, ConfigModel from datahub.configuration.pydantic_field_deprecation import pydantic_field_deprecated from datahub.configuration.source_common import DatasetSourceConfigMixin from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig @@ -21,7 +21,7 @@ logger: logging.Logger = logging.getLogger(__name__) -class SQLAlchemyConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin): +class SQLCommonConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin): options: dict = pydantic.Field( default_factory=dict, description="Any options specified here will be passed to [SQLAlchemy.create_engine](https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine) as kwargs.", @@ -97,7 +97,7 @@ def get_sql_alchemy_url(self): pass -class BasicSQLAlchemyConfig(SQLAlchemyConfig): +class SQLAlchemyConnectionConfig(ConfigModel): username: Optional[str] = Field(default=None, description="username") password: Optional[pydantic.SecretStr] = Field( default=None, exclude=True, description="password" @@ -115,6 +115,12 @@ class BasicSQLAlchemyConfig(SQLAlchemyConfig): description="URI of database to connect to. See https://docs.sqlalchemy.org/en/14/core/engines.html#database-urls. Takes precedence over other connection parameters.", ) + # Duplicate of SQLCommonConfig.options + options: dict = pydantic.Field( + default_factory=dict, + description="Any options specified here will be passed to [SQLAlchemy.create_engine](https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine) as kwargs.", + ) + _database_alias_deprecation = pydantic_field_deprecated( "database_alias", message="database_alias is deprecated. 
Use platform_instance instead.", @@ -136,6 +142,10 @@ def get_sql_alchemy_url( ) +class BasicSQLAlchemyConfig(SQLAlchemyConnectionConfig, SQLCommonConfig): + pass + + def make_sqlalchemy_uri( scheme: str, username: Optional[str], diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic.py index a31a5ac64e5fb..345f5bd57b44c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic.py @@ -14,7 +14,7 @@ support_status, ) from datahub.ingestion.source.sql.sql_common import SQLAlchemySource -from datahub.ingestion.source.sql.sql_config import SQLAlchemyConfig +from datahub.ingestion.source.sql.sql_config import SQLCommonConfig @dataclass @@ -44,13 +44,13 @@ class BaseView: comment: Optional[str] created: Optional[datetime] last_altered: Optional[datetime] - view_definition: str + view_definition: Optional[str] size_in_bytes: Optional[int] = None rows_count: Optional[int] = None column_count: Optional[int] = None -class SQLAlchemyGenericConfig(SQLAlchemyConfig): +class SQLAlchemyGenericConfig(SQLCommonConfig): platform: str = Field( description="Name of platform being ingested, used in constructing URNs." ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py index 63403c265598b..344c114d464a9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py @@ -12,7 +12,7 @@ GEProfilerRequest, ) from datahub.ingestion.source.sql.sql_common import SQLSourceReport -from datahub.ingestion.source.sql.sql_config import SQLAlchemyConfig +from datahub.ingestion.source.sql.sql_config import SQLCommonConfig from datahub.ingestion.source.sql.sql_generic import BaseTable, BaseView from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProfile @@ -53,7 +53,7 @@ class TableProfilerRequest(GEProfilerRequest): class GenericProfiler: def __init__( self, - config: SQLAlchemyConfig, + config: SQLCommonConfig, report: ProfilingSqlReport, platform: str, state_handler: Optional[ProfilingHandler] = None, diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_utils.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_utils.py index c5baf148b0e5e..723a8c5fd8669 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_utils.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_utils.py @@ -35,7 +35,7 @@ def gen_schema_key( platform: str, platform_instance: Optional[str], env: Optional[str], -) -> ContainerKey: +) -> SchemaKey: return SchemaKey( database=db_name, schema=schema, @@ -48,7 +48,7 @@ def gen_schema_key( def gen_database_key( database: str, platform: str, platform_instance: Optional[str], env: Optional[str] -) -> ContainerKey: +) -> DatabaseKey: return DatabaseKey( database=database, platform=platform, diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py b/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py index 764f26c256893..a417cae2b1ab0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py @@ -32,7 +32,7 @@ ) from 
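# --- Editor's illustrative sketch (not part of this patch) ---
# The intent of splitting connection settings (SQLAlchemyConnectionConfig) from the
# common ingestion settings (SQLCommonConfig): sources that only need connection
# details can reuse the former, while full sources compose both, as
# BasicSQLAlchemyConfig now does. Modelled with plain pydantic v1-style classes;
# the class and field names below are hypothetical.
import pydantic


class ConnectionSketch(pydantic.BaseModel):
    host_port: str = "localhost:1433"
    username: str = ""
    options: dict = {}


class CommonSketch(pydantic.BaseModel):
    include_tables: bool = True
    include_views: bool = True
    options: dict = {}  # duplicated field, kept identical in both bases


class BasicSketch(ConnectionSketch, CommonSketch):
    """Composition mirroring BasicSQLAlchemyConfig(SQLAlchemyConnectionConfig, SQLCommonConfig)."""


cfg = BasicSketch(host_port="db:1433", include_views=False)
assert (cfg.host_port, cfg.include_views) == ("db:1433", False)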
datahub.ingestion.source.sql.sql_config import ( BasicSQLAlchemyConfig, - SQLAlchemyConfig, + SQLCommonConfig, ) from datahub.ingestion.source.sql.sql_utils import get_domain_wu from datahub.metadata.com.linkedin.pegasus2avro.common import StatusClass @@ -220,7 +220,7 @@ def _process_table( inspector: VerticaInspector, schema: str, table: str, - sql_config: SQLAlchemyConfig, + sql_config: SQLCommonConfig, ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: dataset_urn = make_dataset_urn_with_platform_instance( self.platform, @@ -242,7 +242,7 @@ def loop_views( self, inspector: VerticaInspector, schema: str, - sql_config: SQLAlchemyConfig, + sql_config: SQLCommonConfig, ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: try: for view in inspector.get_view_names(schema): @@ -314,7 +314,7 @@ def _process_view( inspector: VerticaInspector, schema: str, view: str, - sql_config: SQLAlchemyConfig, + sql_config: SQLCommonConfig, ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: """ This function is used for performing operation and gets data for every view inside a schema @@ -324,7 +324,7 @@ def _process_view( inspector (Inspector) schema (str): schema name view (str): name of the view to inspect - sql_config (SQLAlchemyConfig) + sql_config (SQLCommonConfig) table_tags (Dict[str, str], optional) Defaults to dict(). Returns: @@ -356,7 +356,7 @@ def loop_projections( self, inspector: VerticaInspector, schema: str, - sql_config: SQLAlchemyConfig, + sql_config: SQLCommonConfig, ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: """ this function loop through all the projection in the given schema. @@ -366,7 +366,7 @@ def loop_projections( Args: inspector (Inspector): inspector obj from reflection schema (str): schema name - sql_config (SQLAlchemyConfig): config + sql_config (SQLCommonConfig): config Returns: Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: [description] @@ -438,7 +438,7 @@ def _process_projections( inspector: VerticaInspector, schema: str, projection: str, - sql_config: SQLAlchemyConfig, + sql_config: SQLCommonConfig, ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: columns = inspector.get_projection_columns(projection, schema) dataset_urn = make_dataset_urn_with_platform_instance( @@ -512,7 +512,7 @@ def loop_profiler_requests( self, inspector: VerticaInspector, schema: str, - sql_config: SQLAlchemyConfig, + sql_config: SQLCommonConfig, ) -> Iterable["GEProfilerRequest"]: """Function is used for collecting profiling related information for every projections inside an schema. 
@@ -590,7 +590,7 @@ def loop_models( self, inspector: VerticaInspector, schema: str, - sql_config: SQLAlchemyConfig, + sql_config: SQLCommonConfig, ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: """ This function is for iterating over the ml models in vertica db @@ -598,7 +598,7 @@ def loop_models( Args: inspector (Inspector) : inspector obj from reflection engine schema (str): schema name - sql_config (SQLAlchemyConfig): config + sql_config (SQLCommonConfig): config Returns: Iterable[Union[SqlWorkUnit, MetadataWorkUnit]] @@ -646,7 +646,7 @@ def _process_models( inspector: VerticaInspector, schema: str, table: str, - sql_config: SQLAlchemyConfig, + sql_config: SQLCommonConfig, ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: """ To fetch ml models related information of ml_model from vertica db @@ -655,7 +655,7 @@ def _process_models( inspector (Inspector): inspector obj from reflection schema (str): schema name entity table (str): name of ml model - sql_config (SQLAlchemyConfig) + sql_config (SQLCommonConfig) Returns: Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: [description] diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py b/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py new file mode 100644 index 0000000000000..2fcc93292c2ef --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py @@ -0,0 +1,223 @@ +import json +import logging +import os +from dataclasses import dataclass +from datetime import datetime, timezone +from functools import partial +from typing import Iterable, List, Optional, Set + +from pydantic import Field + +from datahub.configuration.source_common import ( + EnvConfigMixin, + PlatformInstanceConfigMixin, +) +from datahub.emitter.mce_builder import ( + make_dataset_urn_with_platform_instance, + make_user_urn, +) +from datahub.emitter.sql_parsing_builder import SqlParsingBuilder +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SupportStatus, + config_class, + platform_name, + support_status, +) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source, SourceReport +from datahub.ingestion.api.source_helpers import auto_workunit_reporter +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.graph.client import DataHubGraph +from datahub.ingestion.source.usage.usage_common import BaseUsageConfig +from datahub.utilities.sqlglot_lineage import SchemaResolver, sqlglot_lineage + +logger = logging.getLogger(__name__) + + +class SqlQueriesSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin): + query_file: str = Field(description="Path to file to ingest") + + platform: str = Field( + description="The platform for which to generate data, e.g. snowflake" + ) + + usage: BaseUsageConfig = Field( + description="The usage config to use when generating usage statistics", + default=BaseUsageConfig(), + ) + + use_schema_resolver: bool = Field( + description="Read SchemaMetadata aspects from DataHub to aid in SQL parsing. 
Turn off only for testing.", + default=True, + hidden_from_docs=True, + ) + default_db: Optional[str] = Field( + description="The default database to use for unqualified table names", + default=None, + ) + default_schema: Optional[str] = Field( + description="The default schema to use for unqualified table names", + default=None, + ) + + +class SqlQueriesSourceReport(SourceReport): + num_queries_parsed: int = 0 + num_table_parse_failures: int = 0 + num_column_parse_failures: int = 0 + + def compute_stats(self) -> None: + super().compute_stats() + self.table_failure_rate = ( + f"{self.num_table_parse_failures / self.num_queries_parsed:.4f}" + if self.num_queries_parsed + else "0" + ) + self.column_failure_rate = ( + f"{self.num_column_parse_failures / self.num_queries_parsed:.4f}" + if self.num_queries_parsed + else "0" + ) + + +@platform_name("SQL Queries") +@config_class(SqlQueriesSourceConfig) +@support_status(SupportStatus.TESTING) +class SqlQueriesSource(Source): + # TODO: Documentation + urns: Optional[Set[str]] + schema_resolver: SchemaResolver + builder: SqlParsingBuilder + + def __init__(self, ctx: PipelineContext, config: SqlQueriesSourceConfig): + if not ctx.graph: + raise ValueError( + "SqlQueriesSource needs a datahub_api from which to pull schema metadata" + ) + + self.graph: DataHubGraph = ctx.graph + self.ctx = ctx + self.config = config + self.report = SqlQueriesSourceReport() + + self.builder = SqlParsingBuilder(usage_config=self.config.usage) + + if self.config.use_schema_resolver: + schema_resolver, urns = self.graph.initialize_schema_resolver_from_datahub( + platform=self.config.platform, + platform_instance=self.config.platform_instance, + env=self.config.env, + ) + self.schema_resolver = schema_resolver + self.urns = urns + else: + self.schema_resolver = self.graph._make_schema_resolver( + platform=self.config.platform, + platform_instance=self.config.platform_instance, + env=self.config.env, + ) + self.urns = None + + @classmethod + def create(cls, config_dict: dict, ctx: PipelineContext) -> "SqlQueriesSource": + config = SqlQueriesSourceConfig.parse_obj(config_dict) + return cls(ctx, config) + + def get_report(self) -> SqlQueriesSourceReport: + return self.report + + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: + return [partial(auto_workunit_reporter, self.get_report())] + + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + logger.info(f"Parsing queries from {os.path.basename(self.config.query_file)}") + with open(self.config.query_file) as f: + for line in f: + try: + query_dict = json.loads(line, strict=False) + entry = QueryEntry.create(query_dict, config=self.config) + yield from self._process_query(entry) + except Exception as e: + logger.warning("Error processing query", exc_info=True) + self.report.report_warning("process-query", str(e)) + + logger.info("Generating workunits") + yield from self.builder.gen_workunits() + + def _process_query(self, entry: "QueryEntry") -> Iterable[MetadataWorkUnit]: + self.report.num_queries_parsed += 1 + if self.report.num_queries_parsed % 1000 == 0: + logger.info(f"Parsed {self.report.num_queries_parsed} queries") + + result = sqlglot_lineage( + sql=entry.query, + schema_resolver=self.schema_resolver, + default_db=self.config.default_db, + default_schema=self.config.default_schema, + ) + if result.debug_info.table_error: + logger.info(f"Error parsing table lineage, {result.debug_info.table_error}") + self.report.num_table_parse_failures += 1 + for downstream_urn in 
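# --- Editor's illustrative sketch (not part of this patch) ---
# The query_file format consumed by the new SQL Queries source: one JSON object per
# line, with only "query" required; timestamp/user/operation_type and the
# upstream/downstream table lists are optional. The path and values are hypothetical.
import json
import tempfile

entries = [
    {"query": "INSERT INTO db.sink SELECT * FROM db.src", "timestamp": 1689936000, "user": "etl_bot"},
    {"query": "SELECT a, b FROM db.src", "operation_type": "CUSTOM_READ"},
]

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    for entry in entries:
        f.write(json.dumps(entry) + "\n")
    query_file = f.name

with open(query_file) as f:
    for line in f:
        parsed = json.loads(line, strict=False)
        print(parsed["query"], parsed.get("user"))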
set(entry.downstream_tables): + self.builder.add_lineage( + downstream_urn=downstream_urn, + upstream_urns=entry.upstream_tables, + timestamp=entry.timestamp, + user=entry.user, + ) + return + elif result.debug_info.column_error: + logger.debug( + f"Error parsing column lineage, {result.debug_info.column_error}" + ) + self.report.num_column_parse_failures += 1 + + yield from self.builder.process_sql_parsing_result( + result, + query=entry.query, + query_timestamp=entry.timestamp, + user=entry.user, + custom_operation_type=entry.operation_type, + include_urns=self.urns, + ) + + +@dataclass +class QueryEntry: + query: str + timestamp: Optional[datetime] + user: Optional[str] + operation_type: Optional[str] + downstream_tables: List[str] + upstream_tables: List[str] + + @classmethod + def create( + cls, entry_dict: dict, *, config: SqlQueriesSourceConfig + ) -> "QueryEntry": + return cls( + query=entry_dict["query"], + timestamp=datetime.fromtimestamp(entry_dict["timestamp"], tz=timezone.utc) + if "timestamp" in entry_dict + else None, + user=make_user_urn(entry_dict["user"]) if "user" in entry_dict else None, + operation_type=entry_dict.get("operation_type"), + downstream_tables=[ + make_dataset_urn_with_platform_instance( + name=table, + platform=config.platform, + platform_instance=config.platform_instance, + env=config.env, + ) + for table in entry_dict.get("downstream_tables", []) + ], + upstream_tables=[ + make_dataset_urn_with_platform_instance( + name=table, + platform=config.platform, + platform_instance=config.platform_instance, + env=config.env, + ) + for table in entry_dict.get("upstream_tables", []) + ], + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/state/checkpoint.py b/metadata-ingestion/src/datahub/ingestion/source/state/checkpoint.py index eace13368897e..5bfd48eb754d5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/state/checkpoint.py +++ b/metadata-ingestion/src/datahub/ingestion/source/state/checkpoint.py @@ -6,7 +6,7 @@ import logging import pickle from dataclasses import dataclass -from datetime import datetime +from datetime import datetime, timezone from typing import Callable, Generic, Optional, Type, TypeVar import pydantic @@ -144,7 +144,7 @@ def create_from_checkpoint_aspect( ) logger.info( f"Successfully constructed last checkpoint state for job {job_name} " - f"with timestamp {datetime.utcfromtimestamp(checkpoint_aspect.timestampMillis/1000)}" + f"with timestamp {datetime.fromtimestamp(checkpoint_aspect.timestampMillis/1000, tz=timezone.utc)}" ) return checkpoint return None @@ -213,7 +213,7 @@ def to_checkpoint_aspect( ), ) checkpoint_aspect = DatahubIngestionCheckpointClass( - timestampMillis=int(datetime.utcnow().timestamp() * 1000), + timestampMillis=int(datetime.now(tz=timezone.utc).timestamp() * 1000), pipelineName=self.pipeline_name, platformInstanceId="", runId=self.run_id, diff --git a/metadata-ingestion/src/datahub/ingestion/source/state/redundant_run_skip_handler.py b/metadata-ingestion/src/datahub/ingestion/source/state/redundant_run_skip_handler.py index 459dbe0ce0af7..a2e078f233f1d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/state/redundant_run_skip_handler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/state/redundant_run_skip_handler.py @@ -1,8 +1,11 @@ import logging -from typing import Optional, cast +from abc import ABCMeta, abstractmethod +from datetime import datetime +from typing import Dict, Optional, Tuple, cast import pydantic +from datahub.configuration.time_window_config import 
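# --- Editor's illustrative sketch (not part of this patch) ---
# Why the checkpoint code above switches from datetime.utcnow()/utcfromtimestamp() to
# timezone-aware calls: the naive variants drop the UTC offset, which makes later
# comparisons and round-trips ambiguous. The millisecond value is hypothetical.
from datetime import datetime, timezone

timestamp_millis = 1_689_936_000_000

naive = datetime.utcfromtimestamp(timestamp_millis / 1000)                # tzinfo is None
aware = datetime.fromtimestamp(timestamp_millis / 1000, tz=timezone.utc)  # tzinfo is UTC

assert naive.tzinfo is None and aware.tzinfo is timezone.utc
assert int(aware.timestamp() * 1000) == timestamp_millis  # round-trips exactly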
BucketDuration, get_time_bucket from datahub.ingestion.api.ingestion_job_checkpointing_provider_base import JobId from datahub.ingestion.source.state.checkpoint import Checkpoint from datahub.ingestion.source.state.stateful_ingestion_base import ( @@ -10,26 +13,24 @@ StatefulIngestionConfigBase, StatefulIngestionSourceBase, ) -from datahub.ingestion.source.state.usage_common_state import BaseUsageCheckpointState +from datahub.ingestion.source.state.usage_common_state import ( + BaseTimeWindowCheckpointState, +) from datahub.ingestion.source.state.use_case_handler import ( StatefulIngestionUsecaseHandlerBase, ) -from datahub.utilities.time import get_datetime_from_ts_millis_in_utc +from datahub.utilities.time import ( + TimeWindow, + datetime_to_ts_millis, + ts_millis_to_datetime, +) logger: logging.Logger = logging.getLogger(__name__) -class StatefulRedundantRunSkipConfig(StatefulIngestionConfig): - """ - Base specialized config of Stateful Ingestion to skip redundant runs. - """ - - # Defines the alias 'force_rerun' for ignore_old_state field. - ignore_old_state = pydantic.Field(False, alias="force_rerun") - - class RedundantRunSkipHandler( - StatefulIngestionUsecaseHandlerBase[BaseUsageCheckpointState] + StatefulIngestionUsecaseHandlerBase[BaseTimeWindowCheckpointState], + metaclass=ABCMeta, ): """ The stateful ingestion helper class that handles skipping redundant runs. @@ -41,38 +42,28 @@ class RedundantRunSkipHandler( def __init__( self, source: StatefulIngestionSourceBase, - config: StatefulIngestionConfigBase[StatefulRedundantRunSkipConfig], + config: StatefulIngestionConfigBase[StatefulIngestionConfig], pipeline_name: Optional[str], run_id: str, ): self.source = source self.state_provider = source.state_provider self.stateful_ingestion_config: Optional[ - StatefulRedundantRunSkipConfig + StatefulIngestionConfig ] = config.stateful_ingestion self.pipeline_name = pipeline_name self.run_id = run_id - self.checkpointing_enabled: bool = ( - self.state_provider.is_stateful_ingestion_configured() - ) self._job_id = self._init_job_id() self.state_provider.register_stateful_ingestion_usecase_handler(self) - def _ignore_old_state(self) -> bool: - if ( - self.stateful_ingestion_config is not None - and self.stateful_ingestion_config.ignore_old_state - ): - return True - return False + # step -> step status + self.status: Dict[str, bool] = {} def _ignore_new_state(self) -> bool: - if ( + return ( self.stateful_ingestion_config is not None and self.stateful_ingestion_config.ignore_new_state - ): - return True - return False + ) def _init_job_id(self) -> JobId: platform: Optional[str] = None @@ -80,22 +71,26 @@ def _init_job_id(self) -> JobId: if hasattr(source_class, "get_platform_name"): platform = source_class.get_platform_name() # type: ignore - # Handle backward-compatibility for existing sources. 
- if platform == "Snowflake": - return JobId("snowflake_usage_ingestion") - # Default name for everything else - job_name_suffix = "skip_redundant_run" - return JobId(f"{platform}_{job_name_suffix}" if platform else job_name_suffix) + job_name_suffix = self.get_job_name_suffix() + return JobId( + f"{platform}_skip_redundant_run{job_name_suffix}" + if platform + else f"skip_redundant_run{job_name_suffix}" + ) + + @abstractmethod + def get_job_name_suffix(self): + raise NotImplementedError("Sub-classes must override this method.") @property def job_id(self) -> JobId: return self._job_id def is_checkpointing_enabled(self) -> bool: - return self.checkpointing_enabled + return self.state_provider.is_stateful_ingestion_configured() - def create_checkpoint(self) -> Optional[Checkpoint[BaseUsageCheckpointState]]: + def create_checkpoint(self) -> Optional[Checkpoint[BaseTimeWindowCheckpointState]]: if not self.is_checkpointing_enabled() or self._ignore_new_state(): return None @@ -104,46 +99,150 @@ def create_checkpoint(self) -> Optional[Checkpoint[BaseUsageCheckpointState]]: job_name=self.job_id, pipeline_name=self.pipeline_name, run_id=self.run_id, - state=BaseUsageCheckpointState( + state=BaseTimeWindowCheckpointState( begin_timestamp_millis=self.INVALID_TIMESTAMP_VALUE, end_timestamp_millis=self.INVALID_TIMESTAMP_VALUE, ), ) - def update_state( + def report_current_run_status(self, step: str, status: bool) -> None: + """ + A helper to track status of all steps of current run. + This will be used to decide overall status of the run. + Checkpoint state will not be updated/committed for current run if there are any failures. + """ + self.status[step] = status + + def is_current_run_successful(self) -> bool: + return all(self.status.values()) + + def get_current_checkpoint( self, - start_time_millis: pydantic.PositiveInt, - end_time_millis: pydantic.PositiveInt, - ) -> None: - if not self.is_checkpointing_enabled() or self._ignore_new_state(): - return + ) -> Optional[Checkpoint]: + if ( + not self.is_checkpointing_enabled() + or self._ignore_new_state() + or not self.is_current_run_successful() + ): + return None cur_checkpoint = self.state_provider.get_current_checkpoint(self.job_id) assert cur_checkpoint is not None - cur_state = cast(BaseUsageCheckpointState, cur_checkpoint.state) - cur_state.begin_timestamp_millis = start_time_millis - cur_state.end_timestamp_millis = end_time_millis - - def should_skip_this_run(self, cur_start_time_millis: int) -> bool: - if not self.is_checkpointing_enabled() or self._ignore_old_state(): - return False - # Determine from the last check point state - last_successful_pipeline_run_end_time_millis: Optional[int] = None + return cur_checkpoint + + def should_skip_this_run( + self, cur_start_time: datetime, cur_end_time: datetime + ) -> bool: + skip: bool = False + last_checkpoint = self.state_provider.get_last_checkpoint( - self.job_id, BaseUsageCheckpointState + self.job_id, BaseTimeWindowCheckpointState ) - if last_checkpoint and last_checkpoint.state: - state = cast(BaseUsageCheckpointState, last_checkpoint.state) - last_successful_pipeline_run_end_time_millis = state.end_timestamp_millis - if ( - last_successful_pipeline_run_end_time_millis is not None - and cur_start_time_millis <= last_successful_pipeline_run_end_time_millis + if last_checkpoint: + last_run_time_window = TimeWindow( + ts_millis_to_datetime(last_checkpoint.state.begin_timestamp_millis), + ts_millis_to_datetime(last_checkpoint.state.end_timestamp_millis), + ) + + logger.debug( + 
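# --- Editor's illustrative sketch (not part of this patch) ---
# The per-step status tracking added above: each ingestion step reports success or
# failure, and the checkpoint is only advanced when every reported step succeeded.
# Step names are hypothetical.
from typing import Dict

status: Dict[str, bool] = {}


def report_current_run_status(step: str, ok: bool) -> None:
    status[step] = ok


def is_current_run_successful() -> bool:
    return all(status.values())


report_current_run_status("lineage", True)
report_current_run_status("usage", False)
assert not is_current_run_successful()  # a failed step keeps the previous checkpoint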
f"{self.job_id} : Last run start, end times:" + f"({last_run_time_window})" + ) + + # If current run's time window is subset of last run's time window, then skip. + # Else there is at least some part in current time window that was not covered in past run's time window + if last_run_time_window.contains(TimeWindow(cur_start_time, cur_end_time)): + skip = True + + return skip + + def suggest_run_time_window( + self, + cur_start_time: datetime, + cur_end_time: datetime, + allow_reduce: int = True, + allow_expand: int = False, + ) -> Tuple[datetime, datetime]: + # If required in future, allow_reduce, allow_expand can be accepted as user input + # as part of stateful ingestion configuration. It is likely that they may cause + # more confusion than help to most users hence not added to start with. + last_checkpoint = self.state_provider.get_last_checkpoint( + self.job_id, BaseTimeWindowCheckpointState + ) + if (last_checkpoint is None) or self.should_skip_this_run( + cur_start_time, cur_end_time ): - warn_msg = ( - f"Skippig this run, since the last run's bucket duration end: " - f"{get_datetime_from_ts_millis_in_utc(last_successful_pipeline_run_end_time_millis)}" - f" is later than the current start_time: {get_datetime_from_ts_millis_in_utc(cur_start_time_millis)}" + return cur_start_time, cur_end_time + + suggested_start_time, suggested_end_time = cur_start_time, cur_end_time + + last_run = last_checkpoint.state.to_time_interval() + self.log(f"Last run start, end times:{last_run}") + cur_run = TimeWindow(cur_start_time, cur_end_time) + + if cur_run.starts_after(last_run): + # scenario of time gap between past successful run window and current run window - maybe due to failed past run + # Should we keep some configurable limits here to decide how much increase in time window is fine ? + if allow_expand: + suggested_start_time = last_run.end_time + self.log( + f"Expanding time window. Updating start time to {suggested_start_time}." + ) + else: + self.log( + f"Observed gap in last run end time({last_run.end_time}) and current run start time({cur_start_time})." + ) + elif allow_reduce and cur_run.left_intersects(last_run): + # scenario of scheduled ingestions with default start, end times + suggested_start_time = last_run.end_time + self.log( + f"Reducing time window. Updating start time to {suggested_start_time}." + ) + elif allow_reduce and cur_run.right_intersects(last_run): + # a manual backdated run + suggested_end_time = last_run.start_time + self.log( + f"Reducing time window. Updating end time to {suggested_end_time}." 
) - logger.warning(warn_msg) - return True - return False + + # make sure to consider complete time bucket for usage + if last_checkpoint.state.bucket_duration: + suggested_start_time = get_time_bucket( + suggested_start_time, last_checkpoint.state.bucket_duration + ) + + self.log( + "Adjusted start, end times: " + f"({suggested_start_time}, {suggested_end_time})" + ) + return (suggested_start_time, suggested_end_time) + + def log(self, msg: str) -> None: + logger.info(f"{self.job_id} : {msg}") + + +class RedundantLineageRunSkipHandler(RedundantRunSkipHandler): + def get_job_name_suffix(self): + return "_lineage" + + def update_state(self, start_time: datetime, end_time: datetime) -> None: + cur_checkpoint = self.get_current_checkpoint() + if cur_checkpoint: + cur_state = cast(BaseTimeWindowCheckpointState, cur_checkpoint.state) + cur_state.begin_timestamp_millis = datetime_to_ts_millis(start_time) + cur_state.end_timestamp_millis = datetime_to_ts_millis(end_time) + + +class RedundantUsageRunSkipHandler(RedundantRunSkipHandler): + def get_job_name_suffix(self): + return "_usage" + + def update_state( + self, start_time: datetime, end_time: datetime, bucket_duration: BucketDuration + ) -> None: + cur_checkpoint = self.get_current_checkpoint() + if cur_checkpoint: + cur_state = cast(BaseTimeWindowCheckpointState, cur_checkpoint.state) + cur_state.begin_timestamp_millis = datetime_to_ts_millis(start_time) + cur_state.end_timestamp_millis = datetime_to_ts_millis(end_time) + cur_state.bucket_duration = bucket_duration diff --git a/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py b/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py index 9dd6d27d56ea9..be97e9380f1f5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py +++ b/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py @@ -14,6 +14,7 @@ LineageConfig, ) from datahub.configuration.time_window_config import BaseTimeWindowConfig +from datahub.configuration.validate_field_rename import pydantic_renamed_field from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.ingestion_job_checkpointing_provider_base import ( IngestionCheckpointingProviderBase, @@ -100,57 +101,75 @@ class StatefulIngestionConfigBase(GenericModel, Generic[CustomConfig]): class StatefulLineageConfigMixin(LineageConfig): - store_last_lineage_extraction_timestamp: bool = Field( - default=False, - description="Enable checking last lineage extraction date in store.", + enable_stateful_lineage_ingestion: bool = Field( + default=True, + description="Enable stateful lineage ingestion." + " This will store lineage window timestamps after successful lineage ingestion. " + "and will not run lineage ingestion for same timestamps in subsequent run. 
", + ) + + _store_last_lineage_extraction_timestamp = pydantic_renamed_field( + "store_last_lineage_extraction_timestamp", "enable_stateful_lineage_ingestion" ) @root_validator(pre=False) def lineage_stateful_option_validator(cls, values: Dict) -> Dict: sti = values.get("stateful_ingestion") if not sti or not sti.enabled: - if values.get("store_last_lineage_extraction_timestamp"): + if values.get("enable_stateful_lineage_ingestion"): logger.warning( - "Stateful ingestion is disabled, disabling store_last_lineage_extraction_timestamp config option as well" + "Stateful ingestion is disabled, disabling enable_stateful_lineage_ingestion config option as well" ) - values["store_last_lineage_extraction_timestamp"] = False + values["enable_stateful_lineage_ingestion"] = False return values class StatefulProfilingConfigMixin(ConfigModel): - store_last_profiling_timestamps: bool = Field( - default=False, - description="Enable storing last profile timestamp in store.", + enable_stateful_profiling: bool = Field( + default=True, + description="Enable stateful profiling." + " This will store profiling timestamps per dataset after successful profiling. " + "and will not run profiling again in subsequent run if table has not been updated. ", + ) + + _store_last_profiling_timestamps = pydantic_renamed_field( + "store_last_profiling_timestamps", "enable_stateful_profiling" ) @root_validator(pre=False) def profiling_stateful_option_validator(cls, values: Dict) -> Dict: sti = values.get("stateful_ingestion") if not sti or not sti.enabled: - if values.get("store_last_profiling_timestamps"): + if values.get("enable_stateful_profiling"): logger.warning( - "Stateful ingestion is disabled, disabling store_last_profiling_timestamps config option as well" + "Stateful ingestion is disabled, disabling enable_stateful_profiling config option as well" ) - values["store_last_profiling_timestamps"] = False + values["enable_stateful_profiling"] = False return values class StatefulUsageConfigMixin(BaseTimeWindowConfig): - store_last_usage_extraction_timestamp: bool = Field( + enable_stateful_usage_ingestion: bool = Field( default=True, - description="Enable checking last usage timestamp in store.", + description="Enable stateful lineage ingestion." + " This will store usage window timestamps after successful usage ingestion. " + "and will not run usage ingestion for same timestamps in subsequent run. 
", + ) + + _store_last_usage_extraction_timestamp = pydantic_renamed_field( + "store_last_usage_extraction_timestamp", "enable_stateful_usage_ingestion" ) @root_validator(pre=False) def last_usage_extraction_stateful_option_validator(cls, values: Dict) -> Dict: sti = values.get("stateful_ingestion") if not sti or not sti.enabled: - if values.get("store_last_usage_extraction_timestamp"): + if values.get("enable_stateful_usage_ingestion"): logger.warning( - "Stateful ingestion is disabled, disabling store_last_usage_extraction_timestamp config option as well" + "Stateful ingestion is disabled, disabling enable_stateful_usage_ingestion config option as well" ) - values["store_last_usage_extraction_timestamp"] = False + values["enable_stateful_usage_ingestion"] = False return values diff --git a/metadata-ingestion/src/datahub/ingestion/source/state/usage_common_state.py b/metadata-ingestion/src/datahub/ingestion/source/state/usage_common_state.py index 5ecd9946d3602..b8d44796e4b69 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/state/usage_common_state.py +++ b/metadata-ingestion/src/datahub/ingestion/source/state/usage_common_state.py @@ -1,14 +1,27 @@ +from typing import Optional + import pydantic +from datahub.configuration.time_window_config import BucketDuration from datahub.ingestion.source.state.checkpoint import CheckpointStateBase +from datahub.utilities.time import TimeWindow, ts_millis_to_datetime -class BaseUsageCheckpointState(CheckpointStateBase): +class BaseTimeWindowCheckpointState(CheckpointStateBase): """ - Base class for representing the checkpoint state for all usage based sources. + Base class for representing the checkpoint state for all time window based ingestion stages. Stores the last successful run's begin and end timestamps. Subclasses can define additional state as appropriate. """ - begin_timestamp_millis: pydantic.PositiveInt - end_timestamp_millis: pydantic.PositiveInt + begin_timestamp_millis: pydantic.NonNegativeInt + end_timestamp_millis: pydantic.NonNegativeInt + + # Required for time bucket based aggregations - e.g. 
Usage + bucket_duration: Optional[BucketDuration] = None + + def to_time_interval(self) -> TimeWindow: + return TimeWindow( + ts_millis_to_datetime(self.begin_timestamp_millis), + ts_millis_to_datetime(self.end_timestamp_millis), + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py b/metadata-ingestion/src/datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py index 874ee08cc78f9..d7ebcba2c6695 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +++ b/metadata-ingestion/src/datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py @@ -104,7 +104,7 @@ def commit(self) -> None: for job_name, checkpoint in self.state_to_commit.items(): # Emit the ingestion state for each job - logger.info( + logger.debug( f"Committing ingestion checkpoint for pipeline:'{checkpoint.pipelineName}', " f"job:'{job_name}'" ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/superset.py b/metadata-ingestion/src/datahub/ingestion/source/superset.py index 7bf19db25e3bb..2a4563439b6ba 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/superset.py +++ b/metadata-ingestion/src/datahub/ingestion/source/superset.py @@ -200,6 +200,8 @@ def get_platform_from_database_id(self, database_id): f"{self.config.connect_uri}/api/v1/database/{database_id}" ).json() sqlalchemy_uri = database_response.get("result", {}).get("sqlalchemy_uri") + if sqlalchemy_uri is None: + return database_response.get("result", {}).get("backend", "external") return sql_common.get_platform_from_sqlalchemy_uri(sqlalchemy_uri) @lru_cache(maxsize=None) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau.py index 6752bdf519830..ec0af37089b1d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau.py @@ -31,6 +31,7 @@ from tableauserverclient.server.endpoint.exceptions import NonXMLResponseError import datahub.emitter.mce_builder as builder +import datahub.utilities.sqlglot_lineage as sqlglot_l from datahub.configuration.common import ( AllowDenyPattern, ConfigModel, @@ -136,12 +137,7 @@ ViewPropertiesClass, ) from datahub.utilities import config_clean -from datahub.utilities.sqlglot_lineage import ( - ColumnLineageInfo, - SchemaResolver, - SqlParsingResult, - sqlglot_lineage, -) +from datahub.utilities.sqlglot_lineage import ColumnLineageInfo, SqlParsingResult logger: logging.Logger = logging.getLogger(__name__) @@ -1585,42 +1581,14 @@ def parse_custom_sql( f"Overridden info upstream_db={upstream_db}, platform_instance={platform_instance}, platform={platform}" ) - parsed_result: Optional["SqlParsingResult"] = None - try: - schema_resolver = ( - self.ctx.graph._make_schema_resolver( - platform=platform, - platform_instance=platform_instance, - env=env, - ) - if self.ctx.graph is not None - else SchemaResolver( - platform=platform, - platform_instance=platform_instance, - env=env, - graph=None, - ) - ) - - if schema_resolver.graph is None: - logger.warning( - "Column Level Lineage extraction would not work as DataHub graph client is None." - ) - - parsed_result = sqlglot_lineage( - query, - schema_resolver=schema_resolver, - default_db=upstream_db, - ) - except Exception as e: - self.report.report_warning( - key="csql-lineage", - reason=f"Unable to retrieve lineage from query. 
" - f"Query: {query} " - f"Reason: {str(e)} ", - ) - - return parsed_result + return sqlglot_l.create_lineage_sql_parsed_result( + query=query, + database=upstream_db, + platform=platform, + platform_instance=platform_instance, + env=env, + graph=self.ctx.graph, + ) def _create_lineage_from_unsupported_csql( self, csql_urn: str, csql: dict diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/config.py b/metadata-ingestion/src/datahub/ingestion/source/unity/config.py index cef4b763fea57..94ff755e3b254 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/config.py @@ -97,6 +97,11 @@ class UnityCatalogSourceConfig( description="Name of the workspace. Default to deployment name present in workspace_url", ) + ingest_data_platform_instance_aspect: Optional[bool] = pydantic.Field( + default=False, + description="Option to enable/disable ingestion of the data platform instance aspect. The default data platform instance id for a dataset is workspace_name", + ) + _only_ingest_assigned_metastore_removed = pydantic_removed_field( "only_ingest_assigned_metastore" ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py index 69e1cac79380d..493acb939c3bb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py @@ -7,6 +7,7 @@ from datahub.emitter.mce_builder import ( make_data_platform_urn, + make_dataplatform_instance_urn, make_dataset_urn_with_platform_instance, make_domain_urn, make_schema_field_urn, @@ -68,6 +69,7 @@ ViewProperties, ) from datahub.metadata.schema_classes import ( + DataPlatformInstanceClass, DatasetLineageTypeClass, DatasetPropertiesClass, DomainsClass, @@ -278,6 +280,7 @@ def process_table(self, table: Table, schema: Schema) -> Iterable[MetadataWorkUn operation = self._create_table_operation_aspect(table) domain = self._get_domain_aspect(dataset_name=table.ref.qualified_table_name) ownership = self._create_table_ownership_aspect(table) + data_platform_instance = self._create_data_platform_instance_aspect(table) lineage: Optional[UpstreamLineageClass] = None if self.config.include_column_lineage: @@ -299,6 +302,7 @@ def process_table(self, table: Table, schema: Schema) -> Iterable[MetadataWorkUn operation, domain, ownership, + data_platform_instance, lineage, ], ) @@ -558,6 +562,19 @@ def _create_table_ownership_aspect(self, table: Table) -> Optional[OwnershipClas ) return None + def _create_data_platform_instance_aspect( + self, table: Table + ) -> Optional[DataPlatformInstanceClass]: + # Only ingest the DPI aspect if the flag is true + if self.config.ingest_data_platform_instance_aspect: + return DataPlatformInstanceClass( + platform=make_data_platform_urn(self.platform), + instance=make_dataplatform_instance_urn( + self.platform, self.platform_instance_name + ), + ) + return None + def _create_table_sub_type_aspect(self, table: Table) -> SubTypesClass: return SubTypesClass( typeNames=[DatasetSubTypes.VIEW if table.is_view else DatasetSubTypes.TABLE] diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py b/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py index d5da93c7be35e..49f56b46fb012 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py @@ -176,10 +176,8 @@ def 
_parse_query_via_lineage_runner(self, query: str) -> Optional[StringTableInf for table in runner.target_tables ], ) - except Exception: - logger.info( - f"Could not parse query via lineage runner, {query}", exc_info=True - ) + except Exception as e: + logger.info(f"Could not parse query via lineage runner, {query}: {e!r}") return None @staticmethod @@ -202,8 +200,8 @@ def _parse_query_via_spark_sql_plan(self, query: str) -> Optional[StringTableInf return GenericTableInfo( source_tables=[t for t in tables if t], target_tables=[] ) - except Exception: - logger.info(f"Could not parse query via spark plan, {query}", exc_info=True) + except Exception as e: + logger.info(f"Could not parse query via spark plan, {query}: {e!r}") return None @staticmethod diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/clickhouse_usage.py b/metadata-ingestion/src/datahub/ingestion/source/usage/clickhouse_usage.py index ffa08752070dd..855958f0755e1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/usage/clickhouse_usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/usage/clickhouse_usage.py @@ -248,6 +248,7 @@ def _make_usage_stat(self, agg: AggregatedDataset) -> MetadataWorkUnit: self.config.top_n_queries, self.config.format_sql_queries, self.config.include_top_n_queries, + self.config.queries_character_limit, ) def get_report(self) -> SourceReport: diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/redshift_usage.py b/metadata-ingestion/src/datahub/ingestion/source/usage/redshift_usage.py index ea817f40f6a2b..99a980b326e53 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/usage/redshift_usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/usage/redshift_usage.py @@ -392,6 +392,7 @@ def _make_usage_stat(self, agg: AggregatedDataset) -> MetadataWorkUnit: self.config.top_n_queries, self.config.format_sql_queries, self.config.include_top_n_queries, + self.config.queries_character_limit, ) def get_report(self) -> RedshiftUsageSourceReport: diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py b/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py index 7dd66fd1e3d0c..9394a8bba5e0b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py @@ -282,6 +282,7 @@ def _make_usage_stat(self, agg: AggregatedDataset) -> MetadataWorkUnit: self.config.top_n_queries, self.config.format_sql_queries, self.config.include_top_n_queries, + self.config.queries_character_limit, ) def get_report(self) -> SourceReport: diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/usage_common.py b/metadata-ingestion/src/datahub/ingestion/source/usage/usage_common.py index 92f8223f34d14..4547f9f368198 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/usage/usage_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/usage/usage_common.py @@ -47,7 +47,7 @@ ResourceType = TypeVar("ResourceType") # The total number of characters allowed across all queries in a single workunit. 
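For reference, a minimal sketch (not part of the patch) of how this character budget is spent once it becomes the configurable queries_character_limit introduced below; the values used here are just the defaults shown in this hunk:

queries_character_limit = 24000  # DEFAULT_QUERIES_CHARACTER_LIMIT, overridable via BaseUsageConfig.queries_character_limit
top_n_queries = 10  # BaseUsageConfig default
budget_per_query = int(queries_character_limit / top_n_queries)  # 2400 characters kept per stored query
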
-TOTAL_BUDGET_FOR_QUERY_LIST = 24000 +DEFAULT_QUERIES_CHARACTER_LIMIT = 24000 def default_user_urn_builder(email: str) -> str: @@ -65,8 +65,8 @@ def make_usage_workunit( resource_urn_builder: Callable[[ResourceType], str], top_n_queries: int, format_sql_queries: bool, + queries_character_limit: int, user_urn_builder: Optional[Callable[[str], str]] = None, - total_budget_for_query_list: int = TOTAL_BUDGET_FOR_QUERY_LIST, query_trimmer_string: str = " ...", ) -> MetadataWorkUnit: if user_urn_builder is None: @@ -74,7 +74,7 @@ def make_usage_workunit( top_sql_queries: Optional[List[str]] = None if query_freq is not None: - budget_per_query: int = int(total_budget_for_query_list / top_n_queries) + budget_per_query: int = int(queries_character_limit / top_n_queries) top_sql_queries = [ trim_query( format_sql_query(query, keyword_case="upper", reindent_aligned=True) @@ -154,8 +154,8 @@ def make_usage_workunit( top_n_queries: int, format_sql_queries: bool, include_top_n_queries: bool, + queries_character_limit: int, user_urn_builder: Optional[Callable[[str], str]] = None, - total_budget_for_query_list: int = TOTAL_BUDGET_FOR_QUERY_LIST, query_trimmer_string: str = " ...", ) -> MetadataWorkUnit: query_freq = ( @@ -173,12 +173,21 @@ def make_usage_workunit( user_urn_builder=user_urn_builder, top_n_queries=top_n_queries, format_sql_queries=format_sql_queries, - total_budget_for_query_list=total_budget_for_query_list, + queries_character_limit=queries_character_limit, query_trimmer_string=query_trimmer_string, ) class BaseUsageConfig(BaseTimeWindowConfig): + queries_character_limit: int = Field( + default=DEFAULT_QUERIES_CHARACTER_LIMIT, + description=( + "Total character limit for all queries in a single usage aspect." + " Queries will be truncated to length `queries_character_limit / top_n_queries`." + ), + hidden_from_docs=True, # Don't want to encourage people to break elasticsearch + ) + top_n_queries: pydantic.PositiveInt = Field( default=10, description="Number of top queries to save to each table." 
) @@ -203,10 +212,10 @@ class BaseUsageConfig(BaseTimeWindowConfig): ) @pydantic.validator("top_n_queries") - def ensure_top_n_queries_is_not_too_big(cls, v: int) -> int: + def ensure_top_n_queries_is_not_too_big(cls, v: int, values: dict) -> int: minimum_query_size = 20 - max_queries = int(TOTAL_BUDGET_FOR_QUERY_LIST / minimum_query_size) + max_queries = int(values["queries_character_limit"] / minimum_query_size) if v > max_queries: raise ValueError( f"top_n_queries is set to {v} but it can be maximum {max_queries}" @@ -259,6 +268,7 @@ def generate_workunits( include_top_n_queries=self.config.include_top_n_queries, resource_urn_builder=resource_urn_builder, user_urn_builder=user_urn_builder, + queries_character_limit=self.config.queries_character_limit, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py index 27ac2472bef93..0d72fc52da0ca 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py @@ -21,10 +21,7 @@ CLIENT_SESSION_KEEP_ALIVE, ) from datahub.ingestion.source.sql.oauth_generator import OAuthTokenGenerator -from datahub.ingestion.source.sql.sql_config import ( - SQLAlchemyConfig, - make_sqlalchemy_uri, -) +from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri from datahub.utilities.config_clean import ( remove_protocol, remove_suffix, @@ -261,7 +258,7 @@ def get_connect_args(self) -> dict: return connect_args -class SnowflakeConfig(BaseSnowflakeConfig, SQLAlchemyConfig): +class SnowflakeConfig(BaseSnowflakeConfig, SQLCommonConfig): database_pattern: AllowDenyPattern = AllowDenyPattern( deny=[r"^UTIL_DB$", r"^SNOWFLAKE$", r"^SNOWFLAKE_SAMPLE_DATA$"] ) diff --git a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py new file mode 100644 index 0000000000000..e7da7eb6e701a --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py @@ -0,0 +1,41 @@ +import logging +from dataclasses import dataclass, field +from datetime import datetime, timezone +from typing import Optional + +from datahub.utilities.perf_timer import PerfTimer +from datahub.utilities.stats_collections import TopKDict + +logger: logging.Logger = logging.getLogger(__name__) + + +METADATA_EXTRACTION = "Metadata Extraction" +LINEAGE_EXTRACTION = "Lineage Extraction" +USAGE_EXTRACTION_INGESTION = "Usage Extraction Ingestion" +USAGE_EXTRACTION_OPERATIONAL_STATS = "Usage Extraction Operational Stats" +USAGE_EXTRACTION_USAGE_AGGREGATION = "Usage Extraction Usage Aggregation" +PROFILING = "Profiling" + + +@dataclass +class IngestionStageReport: + ingestion_stage: Optional[str] = None + ingestion_stage_durations: TopKDict[str, float] = field(default_factory=TopKDict) + + _timer: Optional[PerfTimer] = field( + default=None, init=False, repr=False, compare=False + ) + + def report_ingestion_stage_start(self, stage: str) -> None: + if self._timer: + elapsed = round(self._timer.elapsed_seconds(), 2) + logger.info( + f"Time spent in stage <{self.ingestion_stage}>: {elapsed} seconds" + ) + if self.ingestion_stage: + self.ingestion_stage_durations[self.ingestion_stage] = elapsed + else: + self._timer = PerfTimer() + + self.ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}" + self._timer.start() diff --git 
a/metadata-ingestion/src/datahub/ingestion/source_report/sql/snowflake.py b/metadata-ingestion/src/datahub/ingestion/source_report/sql/snowflake.py deleted file mode 100644 index 8ad583686f061..0000000000000 --- a/metadata-ingestion/src/datahub/ingestion/source_report/sql/snowflake.py +++ /dev/null @@ -1,37 +0,0 @@ -from dataclasses import dataclass, field -from datetime import datetime -from typing import Dict, List, Optional - -from datahub.ingestion.source.sql.sql_generic_profiler import ProfilingSqlReport -from datahub.ingestion.source_report.time_window import BaseTimeWindowReport - - -@dataclass -class BaseSnowflakeReport(BaseTimeWindowReport): - pass - - -@dataclass -class SnowflakeReport(BaseSnowflakeReport, ProfilingSqlReport): - num_table_to_table_edges_scanned: int = 0 - num_table_to_view_edges_scanned: int = 0 - num_view_to_table_edges_scanned: int = 0 - num_external_table_edges_scanned: int = 0 - ignore_start_time_lineage: Optional[bool] = None - upstream_lineage_in_report: Optional[bool] = None - upstream_lineage: Dict[str, List[str]] = field(default_factory=dict) - lineage_start_time: Optional[datetime] = None - lineage_end_time: Optional[datetime] = None - - cleaned_account_id: str = "" - run_ingestion: bool = False - - # https://community.snowflake.com/s/topic/0TO0Z000000Unu5WAC/releases - saas_version: Optional[str] = None - default_warehouse: Optional[str] = None - default_db: Optional[str] = None - default_schema: Optional[str] = None - role: str = "" - - profile_if_updated_since: Optional[datetime] = None - profile_candidates: Dict[str, List[str]] = field(default_factory=dict) diff --git a/metadata-ingestion/src/datahub/ingestion/source_report/usage/snowflake_usage.py b/metadata-ingestion/src/datahub/ingestion/source_report/usage/snowflake_usage.py deleted file mode 100644 index 5f7962fc36710..0000000000000 --- a/metadata-ingestion/src/datahub/ingestion/source_report/usage/snowflake_usage.py +++ /dev/null @@ -1,23 +0,0 @@ -from dataclasses import dataclass -from datetime import datetime -from typing import Optional - -from datahub.ingestion.source.state.stateful_ingestion_base import ( - StatefulIngestionReport, -) -from datahub.ingestion.source_report.sql.snowflake import BaseSnowflakeReport - - -@dataclass -class SnowflakeUsageReport(BaseSnowflakeReport, StatefulIngestionReport): - min_access_history_time: Optional[datetime] = None - max_access_history_time: Optional[datetime] = None - access_history_range_query_secs: float = -1 - access_history_query_secs: float = -1 - - rows_processed: int = 0 - rows_missing_query_text: int = 0 - rows_zero_base_objects_accessed: int = 0 - rows_zero_direct_objects_accessed: int = 0 - rows_missing_email: int = 0 - rows_parsing_error: int = 0 diff --git a/metadata-ingestion/src/datahub/testing/compare_metadata_json.py b/metadata-ingestion/src/datahub/testing/compare_metadata_json.py index b017afc8c1448..5c52e1ab4f0b3 100644 --- a/metadata-ingestion/src/datahub/testing/compare_metadata_json.py +++ b/metadata-ingestion/src/datahub/testing/compare_metadata_json.py @@ -55,7 +55,6 @@ def assert_metadata_files_equal( output = load_json_file(output_path) if update_golden and not golden_exists: - golden = load_json_file(output_path) shutil.copyfile(str(output_path), str(golden_path)) return else: diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py index e5a9954802019..d677b0874b985 100644 --- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py +++ 
b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py @@ -7,7 +7,6 @@ from collections import defaultdict from typing import Dict, List, Optional, Set, Tuple, Union -import pydantic import pydantic.dataclasses import sqlglot import sqlglot.errors @@ -23,7 +22,7 @@ from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.graph.client import DataHubGraph from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier -from datahub.metadata.schema_classes import SchemaMetadataClass +from datahub.metadata.schema_classes import OperationTypeClass, SchemaMetadataClass from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedDict from datahub.utilities.urns.dataset_urn import DatasetUrn @@ -34,6 +33,8 @@ # A lightweight table schema: column -> type mapping. SchemaInfo = Dict[str, str] +SQL_PARSE_RESULT_CACHE_SIZE = 1000 + class QueryType(enum.Enum): CREATE = "CREATE" @@ -45,6 +46,22 @@ class QueryType(enum.Enum): UNKNOWN = "UNKNOWN" + def to_operation_type(self) -> Optional[str]: + if self == QueryType.CREATE: + return OperationTypeClass.CREATE + elif self == QueryType.INSERT: + return OperationTypeClass.INSERT + elif self == QueryType.UPDATE: + return OperationTypeClass.UPDATE + elif self == QueryType.DELETE: + return OperationTypeClass.DELETE + elif self == QueryType.MERGE: + return OperationTypeClass.UPDATE + elif self == QueryType.SELECT: + return None + else: + return OperationTypeClass.UNKNOWN + def get_query_type_of_sql(expression: sqlglot.exp.Expression) -> QueryType: # UPGRADE: Once we use Python 3.10, replace this with a match expression. @@ -442,6 +459,19 @@ def _sqlglot_force_column_normalizer( # statement.sql(pretty=True, dialect=dialect), # ) + def _schema_aware_fuzzy_column_resolve( + table: Optional[_TableName], sqlglot_column: str + ) -> str: + default_col_name = ( + sqlglot_column.lower() if use_case_insensitive_cols else sqlglot_column + ) + if table: + return table_schema_normalized_mapping[table].get( + sqlglot_column, default_col_name + ) + else: + return default_col_name + # Optimize the statement + qualify column references. logger.debug( "Prior to qualification sql %s", statement.sql(pretty=True, dialect=dialect) @@ -523,10 +553,8 @@ def _sqlglot_force_column_normalizer( normalized_col = sqlglot.parse_one(node.name).this.name if node.subfield: normalized_col = f"{normalized_col}.{node.subfield}" - col = table_schema_normalized_mapping[table_ref].get( - normalized_col, normalized_col - ) + col = _schema_aware_fuzzy_column_resolve(table_ref, normalized_col) direct_col_upstreams.add(_ColumnRef(table=table_ref, column=col)) else: # This branch doesn't matter. For example, a count(*) column would go here, and @@ -540,6 +568,9 @@ def _sqlglot_force_column_normalizer( # This is a bit jank since we're relying on sqlglot internals, but it seems to be # the best way to do it. 
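Aside: a small illustrative sketch of the QueryType.to_operation_type() helper added in this file, assuming QueryType can be imported from datahub.utilities.sqlglot_lineage; per the mapping above, MERGE is reported as an UPDATE and SELECT produces no operation aspect:

from datahub.metadata.schema_classes import OperationTypeClass
from datahub.utilities.sqlglot_lineage import QueryType

assert QueryType.MERGE.to_operation_type() == OperationTypeClass.UPDATE  # MERGE reported as UPDATE
assert QueryType.SELECT.to_operation_type() is None  # reads produce no operation aspect
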
output_col = original_col_expression.this.sql(dialect=dialect) + + output_col = _schema_aware_fuzzy_column_resolve(output_table, output_col) + if not direct_col_upstreams: logger.debug(f' "{output_col}" has no upstreams') column_lineage.append( @@ -623,16 +654,21 @@ def _translate_internal_column_lineage( ) +def _get_dialect(platform: str) -> str: + # TODO: convert datahub platform names to sqlglot dialect + if platform == "presto-on-hive": + return "hive" + else: + return platform + + def _sqlglot_lineage_inner( sql: str, schema_resolver: SchemaResolver, default_db: Optional[str] = None, default_schema: Optional[str] = None, ) -> SqlParsingResult: - # TODO: convert datahub platform names to sqlglot dialect - # TODO: Pull the platform name from the schema resolver? - dialect = schema_resolver.platform - + dialect = _get_dialect(schema_resolver.platform) if dialect == "snowflake": # in snowflake, table identifiers must be uppercased to match sqlglot's behavior. if default_db: @@ -677,10 +713,7 @@ def _sqlglot_lineage_inner( # Fetch schema info for the relevant tables. table_name_urn_mapping: Dict[_TableName, str] = {} table_name_schema_mapping: Dict[_TableName, SchemaInfo] = {} - for table, is_input in itertools.chain( - [(table, True) for table in tables], - [(table, False) for table in modified], - ): + for table in itertools.chain(tables, modified): # For select statements, qualification will be a no-op. For other statements, this # is where the qualification actually happens. qualified_table = table.qualified( @@ -690,19 +723,21 @@ def _sqlglot_lineage_inner( urn, schema_info = schema_resolver.resolve_table(qualified_table) table_name_urn_mapping[qualified_table] = urn - if is_input and schema_info: + if schema_info: table_name_schema_mapping[qualified_table] = schema_info # Also include the original, non-qualified table name in the urn mapping. table_name_urn_mapping[table] = urn + total_tables_discovered = len(tables) + len(modified) + total_schemas_resolved = len(table_name_schema_mapping) debug_info = SqlParsingDebugInfo( - confidence=0.9 if len(tables) == len(table_name_schema_mapping) + confidence=0.9 if total_tables_discovered == total_schemas_resolved # If we're missing any schema info, our confidence will be in the 0.2-0.5 range depending # on how many tables we were able to resolve. - else 0.2 + 0.3 * len(table_name_schema_mapping) / len(tables), - tables_discovered=len(tables), - table_schemas_resolved=len(table_name_schema_mapping), + else 0.2 + 0.3 * total_schemas_resolved / total_tables_discovered, + tables_discovered=total_tables_discovered, + table_schemas_resolved=total_schemas_resolved, ) logger.debug( f"Resolved {len(table_name_schema_mapping)} of {len(tables)} table schemas" @@ -755,6 +790,7 @@ def _sqlglot_lineage_inner( ) +@functools.lru_cache(maxsize=SQL_PARSE_RESULT_CACHE_SIZE) def sqlglot_lineage( sql: str, schema_resolver: SchemaResolver, @@ -766,7 +802,8 @@ def sqlglot_lineage( This is a schema-aware lineage generator, meaning that it will use the schema information for the tables involved to generate lineage information for the columns involved. The schema_resolver is responsible for providing - the table schema information. + the table schema information. In most cases, the DataHubGraph can be used + to construct a schema_resolver that will fetch schemas from DataHub. The parser supports most types of DML statements (SELECT, INSERT, UPDATE, DELETE, MERGE) as well as CREATE TABLE AS SELECT (CTAS) statements. 
It @@ -825,3 +862,42 @@ def sqlglot_lineage( table_error=e, ), ) + + +def create_lineage_sql_parsed_result( + query: str, + database: Optional[str], + platform: str, + platform_instance: Optional[str], + env: str, + schema: Optional[str] = None, + graph: Optional[DataHubGraph] = None, +) -> Optional["SqlParsingResult"]: + parsed_result: Optional["SqlParsingResult"] = None + try: + schema_resolver = ( + graph._make_schema_resolver( + platform=platform, + platform_instance=platform_instance, + env=env, + ) + if graph is not None + else SchemaResolver( + platform=platform, + platform_instance=platform_instance, + env=env, + graph=None, + ) + ) + + parsed_result = sqlglot_lineage( + query, + schema_resolver=schema_resolver, + default_db=database, + default_schema=schema, + ) + except Exception as e: + logger.debug(f"Fail to prase query {query}", exc_info=e) + logger.warning("Fail to parse custom SQL") + + return parsed_result diff --git a/metadata-ingestion/src/datahub/utilities/time.py b/metadata-ingestion/src/datahub/utilities/time.py index d9e643b6bccc2..0df7afb19935f 100644 --- a/metadata-ingestion/src/datahub/utilities/time.py +++ b/metadata-ingestion/src/datahub/utilities/time.py @@ -1,4 +1,5 @@ import time +from dataclasses import dataclass from datetime import datetime, timezone @@ -6,9 +7,37 @@ def get_current_time_in_seconds() -> int: return int(time.time()) -def get_datetime_from_ts_millis_in_utc(ts_millis: int) -> datetime: +def ts_millis_to_datetime(ts_millis: int) -> datetime: + """Converts input timestamp in milliseconds to a datetime object with UTC timezone""" return datetime.fromtimestamp(ts_millis / 1000, tz=timezone.utc) def datetime_to_ts_millis(dt: datetime) -> int: + """Converts a datetime object to timestamp in milliseconds""" return int(round(dt.timestamp() * 1000)) + + +@dataclass +class TimeWindow: + start_time: datetime + end_time: datetime + + def contains(self, other: "TimeWindow") -> bool: + """Whether current window contains other window completely""" + return self.start_time <= other.start_time <= other.end_time <= self.end_time + + def left_intersects(self, other: "TimeWindow") -> bool: + """Whether only left part of current window overlaps other window.""" + return other.start_time <= self.start_time < other.end_time < self.end_time + + def right_intersects(self, other: "TimeWindow") -> bool: + """Whether only right part of current window overlaps other window.""" + return self.start_time < other.start_time < self.end_time <= other.end_time + + def starts_after(self, other: "TimeWindow") -> bool: + """Whether current window starts after other window ends""" + return other.start_time <= other.end_time < self.start_time + + def ends_after(self, other: "TimeWindow") -> bool: + """Whether current window ends after other window ends.""" + return self.end_time > other.end_time diff --git a/metadata-ingestion/src/datahub/utilities/urns/data_platform_urn.py b/metadata-ingestion/src/datahub/utilities/urns/data_platform_urn.py index 8732592b80020..79cf54dfe920a 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/data_platform_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/data_platform_urn.py @@ -29,3 +29,6 @@ def _validate_entity_type(entity_type: str) -> None: raise InvalidUrnError( f"Entity type should be {DataPlatformUrn.ENTITY_TYPE} but found {entity_type}" ) + + def get_platform_name(self) -> str: + return self.get_entity_id()[0] diff --git a/metadata-ingestion/src/datahub_provider/__init__.py 
b/metadata-ingestion/src/datahub_provider/__init__.py index 4c0b2bd8e714e..306076dadf82b 100644 --- a/metadata-ingestion/src/datahub_provider/__init__.py +++ b/metadata-ingestion/src/datahub_provider/__init__.py @@ -1,28 +1 @@ -import datahub - - -# This is needed to allow Airflow to pick up specific metadata fields it needs for -# certain features. We recognize it's a bit unclean to define these in multiple places, -# but at this point it's the only workaround if you'd like your custom conn type to -# show up in the Airflow UI. -def get_provider_info(): - return { - "name": "DataHub", - "description": "`DataHub `__\n", - "connection-types": [ - { - "hook-class-name": "datahub_provider.hooks.datahub.DatahubRestHook", - "connection-type": "datahub_rest", - }, - { - "hook-class-name": "datahub_provider.hooks.datahub.DatahubKafkaHook", - "connection-type": "datahub_kafka", - }, - ], - "hook-class-names": [ - "datahub_provider.hooks.datahub.DatahubRestHook", - "datahub_provider.hooks.datahub.DatahubKafkaHook", - ], - "package-name": datahub.__package_name__, - "versions": [datahub.__version__], - } +from datahub_airflow_plugin import get_provider_info diff --git a/metadata-ingestion/src/datahub_provider/_airflow_compat.py b/metadata-ingestion/src/datahub_provider/_airflow_compat.py index 67c3348ec987c..98b96e32fee78 100644 --- a/metadata-ingestion/src/datahub_provider/_airflow_compat.py +++ b/metadata-ingestion/src/datahub_provider/_airflow_compat.py @@ -1,12 +1,3 @@ -# This module must be imported before any Airflow imports in any of our files. -# The AIRFLOW_PATCHED just helps avoid flake8 errors. +from datahub_airflow_plugin._airflow_compat import AIRFLOW_PATCHED -from datahub.utilities._markupsafe_compat import MARKUPSAFE_PATCHED - -assert MARKUPSAFE_PATCHED - -AIRFLOW_PATCHED = True - -__all__ = [ - "AIRFLOW_PATCHED", -] +__all__ = ["AIRFLOW_PATCHED"] diff --git a/metadata-ingestion/src/datahub_provider/_airflow_shims.py b/metadata-ingestion/src/datahub_provider/_airflow_shims.py index 31e1237c0d21d..d5e4a019a4b81 100644 --- a/metadata-ingestion/src/datahub_provider/_airflow_shims.py +++ b/metadata-ingestion/src/datahub_provider/_airflow_shims.py @@ -1,29 +1,15 @@ -from datahub_provider._airflow_compat import AIRFLOW_PATCHED - -from airflow.models.baseoperator import BaseOperator - -try: - from airflow.models.mappedoperator import MappedOperator - from airflow.models.operator import Operator - from airflow.operators.empty import EmptyOperator -except ModuleNotFoundError: - # Operator isn't a real class, but rather a type alias defined - # as the union of BaseOperator and MappedOperator. - # Since older versions of Airflow don't have MappedOperator, we can just use BaseOperator. 
- Operator = BaseOperator # type: ignore - MappedOperator = None # type: ignore - from airflow.operators.dummy import DummyOperator as EmptyOperator # type: ignore - -try: - from airflow.sensors.external_task import ExternalTaskSensor -except ImportError: - from airflow.sensors.external_task_sensor import ExternalTaskSensor # type: ignore - -assert AIRFLOW_PATCHED +from datahub_airflow_plugin._airflow_shims import ( + AIRFLOW_PATCHED, + EmptyOperator, + ExternalTaskSensor, + MappedOperator, + Operator, +) __all__ = [ - "Operator", - "MappedOperator", + "AIRFLOW_PATCHED", "EmptyOperator", "ExternalTaskSensor", + "Operator", + "MappedOperator", ] diff --git a/metadata-ingestion/src/datahub_provider/_lineage_core.py b/metadata-ingestion/src/datahub_provider/_lineage_core.py index 07c70eeca4e6d..4305b39cac684 100644 --- a/metadata-ingestion/src/datahub_provider/_lineage_core.py +++ b/metadata-ingestion/src/datahub_provider/_lineage_core.py @@ -1,114 +1,3 @@ -from datetime import datetime -from typing import TYPE_CHECKING, Dict, List +from datahub_airflow_plugin._lineage_core import DatahubBasicLineageConfig -import datahub.emitter.mce_builder as builder -from datahub.api.entities.dataprocess.dataprocess_instance import InstanceRunResult -from datahub.configuration.common import ConfigModel -from datahub.utilities.urns.dataset_urn import DatasetUrn -from datahub_provider.client.airflow_generator import AirflowGenerator -from datahub_provider.entities import _Entity - -if TYPE_CHECKING: - from airflow import DAG - from airflow.models.dagrun import DagRun - from airflow.models.taskinstance import TaskInstance - - from datahub_provider._airflow_shims import Operator - from datahub_provider.hooks.datahub import DatahubGenericHook - - -def _entities_to_urn_list(iolets: List[_Entity]) -> List[DatasetUrn]: - return [DatasetUrn.create_from_string(let.urn) for let in iolets] - - -class DatahubBasicLineageConfig(ConfigModel): - enabled: bool = True - - # DataHub hook connection ID. - datahub_conn_id: str - - # Cluster to associate with the pipelines and tasks. Defaults to "prod". - cluster: str = builder.DEFAULT_FLOW_CLUSTER - - # If true, the owners field of the DAG will be capture as a DataHub corpuser. - capture_ownership_info: bool = True - - # If true, the tags field of the DAG will be captured as DataHub tags. - capture_tags_info: bool = True - - capture_executions: bool = False - - def make_emitter_hook(self) -> "DatahubGenericHook": - # This is necessary to avoid issues with circular imports. 
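Aside: the datahub_provider modules in this part of the patch are reduced to thin re-export shims over the new datahub_airflow_plugin package, so existing user imports keep resolving. A minimal illustrative sketch of what that means for callers:

from datahub_airflow_plugin._lineage_core import DatahubBasicLineageConfig as NewConfig
from datahub_provider._lineage_core import DatahubBasicLineageConfig  # old path, now a re-export

assert DatahubBasicLineageConfig is NewConfig  # same object under both import paths
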
- from datahub_provider.hooks.datahub import DatahubGenericHook - - return DatahubGenericHook(self.datahub_conn_id) - - -def send_lineage_to_datahub( - config: DatahubBasicLineageConfig, - operator: "Operator", - inlets: List[_Entity], - outlets: List[_Entity], - context: Dict, -) -> None: - if not config.enabled: - return - - dag: "DAG" = context["dag"] - task: "Operator" = context["task"] - ti: "TaskInstance" = context["task_instance"] - - hook = config.make_emitter_hook() - emitter = hook.make_emitter() - - dataflow = AirflowGenerator.generate_dataflow( - cluster=config.cluster, - dag=dag, - capture_tags=config.capture_tags_info, - capture_owner=config.capture_ownership_info, - ) - dataflow.emit(emitter) - operator.log.info(f"Emitted from Lineage: {dataflow}") - - datajob = AirflowGenerator.generate_datajob( - cluster=config.cluster, - task=task, - dag=dag, - capture_tags=config.capture_tags_info, - capture_owner=config.capture_ownership_info, - ) - datajob.inlets.extend(_entities_to_urn_list(inlets)) - datajob.outlets.extend(_entities_to_urn_list(outlets)) - - datajob.emit(emitter) - operator.log.info(f"Emitted from Lineage: {datajob}") - - if config.capture_executions: - dag_run: "DagRun" = context["dag_run"] - - dpi = AirflowGenerator.run_datajob( - emitter=emitter, - cluster=config.cluster, - ti=ti, - dag=dag, - dag_run=dag_run, - datajob=datajob, - emit_templates=False, - ) - - operator.log.info(f"Emitted from Lineage: {dpi}") - - dpi = AirflowGenerator.complete_datajob( - emitter=emitter, - cluster=config.cluster, - ti=ti, - dag=dag, - dag_run=dag_run, - datajob=datajob, - result=InstanceRunResult.SUCCESS, - end_timestamp_millis=int(datetime.utcnow().timestamp() * 1000), - ) - operator.log.info(f"Emitted from Lineage: {dpi}") - - emitter.flush() +__all__ = ["DatahubBasicLineageConfig"] diff --git a/metadata-ingestion/src/datahub_provider/_plugin.py b/metadata-ingestion/src/datahub_provider/_plugin.py index 6f6c7c9ab71b7..3d74e715bd644 100644 --- a/metadata-ingestion/src/datahub_provider/_plugin.py +++ b/metadata-ingestion/src/datahub_provider/_plugin.py @@ -1,368 +1,3 @@ -from datahub_provider._airflow_compat import AIRFLOW_PATCHED +from datahub_airflow_plugin.datahub_plugin import DatahubPlugin -import contextlib -import logging -import traceback -from typing import Any, Callable, Iterable, List, Optional, Union - -from airflow.configuration import conf -from airflow.lineage import PIPELINE_OUTLETS -from airflow.models.baseoperator import BaseOperator -from airflow.plugins_manager import AirflowPlugin -from airflow.utils.module_loading import import_string -from cattr import structure - -from datahub.api.entities.dataprocess.dataprocess_instance import InstanceRunResult -from datahub_provider._airflow_shims import MappedOperator, Operator -from datahub_provider.client.airflow_generator import AirflowGenerator -from datahub_provider.hooks.datahub import DatahubGenericHook -from datahub_provider.lineage.datahub import DatahubLineageConfig - -assert AIRFLOW_PATCHED -logger = logging.getLogger(__name__) - -TASK_ON_FAILURE_CALLBACK = "on_failure_callback" -TASK_ON_SUCCESS_CALLBACK = "on_success_callback" - - -def get_lineage_config() -> DatahubLineageConfig: - """Load the lineage config from airflow.cfg.""" - - enabled = conf.get("datahub", "enabled", fallback=True) - datahub_conn_id = conf.get("datahub", "conn_id", fallback="datahub_rest_default") - cluster = conf.get("datahub", "cluster", fallback="prod") - graceful_exceptions = conf.get("datahub", "graceful_exceptions", 
fallback=True) - capture_tags_info = conf.get("datahub", "capture_tags_info", fallback=True) - capture_ownership_info = conf.get( - "datahub", "capture_ownership_info", fallback=True - ) - capture_executions = conf.get("datahub", "capture_executions", fallback=True) - return DatahubLineageConfig( - enabled=enabled, - datahub_conn_id=datahub_conn_id, - cluster=cluster, - graceful_exceptions=graceful_exceptions, - capture_ownership_info=capture_ownership_info, - capture_tags_info=capture_tags_info, - capture_executions=capture_executions, - ) - - -def _task_inlets(operator: "Operator") -> List: - # From Airflow 2.4 _inlets is dropped and inlets used consistently. Earlier it was not the case, so we have to stick there to _inlets - if hasattr(operator, "_inlets"): - return operator._inlets # type: ignore[attr-defined, union-attr] - return operator.inlets - - -def _task_outlets(operator: "Operator") -> List: - # From Airflow 2.4 _outlets is dropped and inlets used consistently. Earlier it was not the case, so we have to stick there to _outlets - # We have to use _outlets because outlets is empty in Airflow < 2.4.0 - if hasattr(operator, "_outlets"): - return operator._outlets # type: ignore[attr-defined, union-attr] - return operator.outlets - - -def get_inlets_from_task(task: BaseOperator, context: Any) -> Iterable[Any]: - # TODO: Fix for https://github.com/apache/airflow/commit/1b1f3fabc5909a447a6277cafef3a0d4ef1f01ae - # in Airflow 2.4. - # TODO: ignore/handle airflow's dataset type in our lineage - - inlets: List[Any] = [] - task_inlets = _task_inlets(task) - # From Airflow 2.3 this should be AbstractOperator but due to compatibility reason lets use BaseOperator - if isinstance(task_inlets, (str, BaseOperator)): - inlets = [ - task_inlets, - ] - - if task_inlets and isinstance(task_inlets, list): - inlets = [] - task_ids = ( - {o for o in task_inlets if isinstance(o, str)} - .union(op.task_id for op in task_inlets if isinstance(op, BaseOperator)) - .intersection(task.get_flat_relative_ids(upstream=True)) - ) - - from airflow.lineage import AUTO - - # pick up unique direct upstream task_ids if AUTO is specified - if AUTO.upper() in task_inlets or AUTO.lower() in task_inlets: - print("Picking up unique direct upstream task_ids as AUTO is specified") - task_ids = task_ids.union( - task_ids.symmetric_difference(task.upstream_task_ids) - ) - - inlets = task.xcom_pull( - context, task_ids=list(task_ids), dag_id=task.dag_id, key=PIPELINE_OUTLETS - ) - - # re-instantiate the obtained inlets - inlets = [ - structure(item["data"], import_string(item["type_name"])) - # _get_instance(structure(item, Metadata)) - for sublist in inlets - if sublist - for item in sublist - ] - - for inlet in task_inlets: - if isinstance(inlet, str): - inlets.append(inlet) - - return inlets - - -def _make_emit_callback( - logger: logging.Logger, -) -> Callable[[Optional[Exception], str], None]: - def emit_callback(err: Optional[Exception], msg: str) -> None: - if err: - logger.error(f"Error sending metadata to datahub: {msg}", exc_info=err) - - return emit_callback - - -def datahub_task_status_callback(context, status): - ti = context["ti"] - task: "BaseOperator" = ti.task - dag = context["dag"] - - # This code is from the original airflow lineage code -> - # https://github.com/apache/airflow/blob/main/airflow/lineage/__init__.py - inlets = get_inlets_from_task(task, context) - - emitter = ( - DatahubGenericHook(context["_datahub_config"].datahub_conn_id) - .get_underlying_hook() - .make_emitter() - ) - - dataflow = 
AirflowGenerator.generate_dataflow( - cluster=context["_datahub_config"].cluster, - dag=dag, - capture_tags=context["_datahub_config"].capture_tags_info, - capture_owner=context["_datahub_config"].capture_ownership_info, - ) - task.log.info(f"Emitting Datahub Dataflow: {dataflow}") - dataflow.emit(emitter, callback=_make_emit_callback(task.log)) - - datajob = AirflowGenerator.generate_datajob( - cluster=context["_datahub_config"].cluster, - task=task, - dag=dag, - capture_tags=context["_datahub_config"].capture_tags_info, - capture_owner=context["_datahub_config"].capture_ownership_info, - ) - - for inlet in inlets: - datajob.inlets.append(inlet.urn) - - task_outlets = _task_outlets(task) - for outlet in task_outlets: - datajob.outlets.append(outlet.urn) - - task.log.info(f"Emitting Datahub Datajob: {datajob}") - datajob.emit(emitter, callback=_make_emit_callback(task.log)) - - if context["_datahub_config"].capture_executions: - dpi = AirflowGenerator.run_datajob( - emitter=emitter, - cluster=context["_datahub_config"].cluster, - ti=context["ti"], - dag=dag, - dag_run=context["dag_run"], - datajob=datajob, - start_timestamp_millis=int(ti.start_date.timestamp() * 1000), - ) - - task.log.info(f"Emitted Start Datahub Dataprocess Instance: {dpi}") - - dpi = AirflowGenerator.complete_datajob( - emitter=emitter, - cluster=context["_datahub_config"].cluster, - ti=context["ti"], - dag_run=context["dag_run"], - result=status, - dag=dag, - datajob=datajob, - end_timestamp_millis=int(ti.end_date.timestamp() * 1000), - ) - task.log.info(f"Emitted Completed Data Process Instance: {dpi}") - - emitter.flush() - - -def datahub_pre_execution(context): - ti = context["ti"] - task: "BaseOperator" = ti.task - dag = context["dag"] - - task.log.info("Running Datahub pre_execute method") - - emitter = ( - DatahubGenericHook(context["_datahub_config"].datahub_conn_id) - .get_underlying_hook() - .make_emitter() - ) - - # This code is from the original airflow lineage code -> - # https://github.com/apache/airflow/blob/main/airflow/lineage/__init__.py - inlets = get_inlets_from_task(task, context) - - datajob = AirflowGenerator.generate_datajob( - cluster=context["_datahub_config"].cluster, - task=context["ti"].task, - dag=dag, - capture_tags=context["_datahub_config"].capture_tags_info, - capture_owner=context["_datahub_config"].capture_ownership_info, - ) - - for inlet in inlets: - datajob.inlets.append(inlet.urn) - - task_outlets = _task_outlets(task) - - for outlet in task_outlets: - datajob.outlets.append(outlet.urn) - - task.log.info(f"Emitting Datahub dataJob {datajob}") - datajob.emit(emitter, callback=_make_emit_callback(task.log)) - - if context["_datahub_config"].capture_executions: - dpi = AirflowGenerator.run_datajob( - emitter=emitter, - cluster=context["_datahub_config"].cluster, - ti=context["ti"], - dag=dag, - dag_run=context["dag_run"], - datajob=datajob, - start_timestamp_millis=int(ti.start_date.timestamp() * 1000), - ) - - task.log.info(f"Emitting Datahub Dataprocess Instance: {dpi}") - - emitter.flush() - - -def _wrap_pre_execution(pre_execution): - def custom_pre_execution(context): - config = get_lineage_config() - if config.enabled: - context["_datahub_config"] = config - datahub_pre_execution(context) - - # Call original policy - if pre_execution: - pre_execution(context) - - return custom_pre_execution - - -def _wrap_on_failure_callback(on_failure_callback): - def custom_on_failure_callback(context): - config = get_lineage_config() - if config.enabled: - context["_datahub_config"] = config 
- try: - datahub_task_status_callback(context, status=InstanceRunResult.FAILURE) - except Exception as e: - if not config.graceful_exceptions: - raise e - else: - print(f"Exception: {traceback.format_exc()}") - - # Call original policy - if on_failure_callback: - on_failure_callback(context) - - return custom_on_failure_callback - - -def _wrap_on_success_callback(on_success_callback): - def custom_on_success_callback(context): - config = get_lineage_config() - if config.enabled: - context["_datahub_config"] = config - try: - datahub_task_status_callback(context, status=InstanceRunResult.SUCCESS) - except Exception as e: - if not config.graceful_exceptions: - raise e - else: - print(f"Exception: {traceback.format_exc()}") - - # Call original policy - if on_success_callback: - on_success_callback(context) - - return custom_on_success_callback - - -def task_policy(task: Union[BaseOperator, MappedOperator]) -> None: - task.log.debug(f"Setting task policy for Dag: {task.dag_id} Task: {task.task_id}") - # task.add_inlets(["auto"]) - # task.pre_execute = _wrap_pre_execution(task.pre_execute) - - # MappedOperator's callbacks don't have setters until Airflow 2.X.X - # https://github.com/apache/airflow/issues/24547 - # We can bypass this by going through partial_kwargs for now - if MappedOperator and isinstance(task, MappedOperator): # type: ignore - on_failure_callback_prop: property = getattr( - MappedOperator, TASK_ON_FAILURE_CALLBACK - ) - on_success_callback_prop: property = getattr( - MappedOperator, TASK_ON_SUCCESS_CALLBACK - ) - if not on_failure_callback_prop.fset or not on_success_callback_prop.fset: - task.log.debug( - "Using MappedOperator's partial_kwargs instead of callback properties" - ) - task.partial_kwargs[TASK_ON_FAILURE_CALLBACK] = _wrap_on_failure_callback( - task.on_failure_callback - ) - task.partial_kwargs[TASK_ON_SUCCESS_CALLBACK] = _wrap_on_success_callback( - task.on_success_callback - ) - return - - task.on_failure_callback = _wrap_on_failure_callback(task.on_failure_callback) # type: ignore - task.on_success_callback = _wrap_on_success_callback(task.on_success_callback) # type: ignore - # task.pre_execute = _wrap_pre_execution(task.pre_execute) - - -def _wrap_task_policy(policy): - if policy and hasattr(policy, "_task_policy_patched_by"): - return policy - - def custom_task_policy(task): - policy(task) - task_policy(task) - - # Add a flag to the policy to indicate that we've patched it. 
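Aside: a simplified, hypothetical sketch of the callback-wrapping pattern used by the (now relocated) plugin code above; the names are illustrative and this is not the plugin's exact implementation:

import logging

def wrap_on_success(original_callback, emit_metadata):
    # Run the DataHub emission first, swallowing errors (the "graceful exceptions" behaviour),
    # then always invoke the user's original callback so existing DAG behaviour is preserved.
    def wrapped(context):
        try:
            emit_metadata(context)
        except Exception:
            logging.exception("DataHub emission failed; continuing")
        if original_callback:
            original_callback(context)
    return wrapped
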
- custom_task_policy._task_policy_patched_by = "datahub_plugin" # type: ignore[attr-defined] - return custom_task_policy - - -def _patch_policy(settings): - if hasattr(settings, "task_policy"): - datahub_task_policy = _wrap_task_policy(settings.task_policy) - settings.task_policy = datahub_task_policy - - -def _patch_datahub_policy(): - with contextlib.suppress(ImportError): - import airflow_local_settings - - _patch_policy(airflow_local_settings) - - from airflow.models.dagbag import settings - - _patch_policy(settings) - - -_patch_datahub_policy() - - -class DatahubPlugin(AirflowPlugin): - name = "datahub_plugin" +__all__ = ["DatahubPlugin"] diff --git a/metadata-ingestion/src/datahub_provider/client/airflow_generator.py b/metadata-ingestion/src/datahub_provider/client/airflow_generator.py index d2d29b00d244f..d50ae152f2b1e 100644 --- a/metadata-ingestion/src/datahub_provider/client/airflow_generator.py +++ b/metadata-ingestion/src/datahub_provider/client/airflow_generator.py @@ -1,509 +1,3 @@ -from datahub_provider._airflow_compat import AIRFLOW_PATCHED +from datahub_airflow_plugin.client.airflow_generator import AirflowGenerator -from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union, cast - -from airflow.configuration import conf - -from datahub.api.entities.datajob import DataFlow, DataJob -from datahub.api.entities.dataprocess.dataprocess_instance import ( - DataProcessInstance, - InstanceRunResult, -) -from datahub.metadata.schema_classes import DataProcessTypeClass -from datahub.utilities.urns.data_flow_urn import DataFlowUrn -from datahub.utilities.urns.data_job_urn import DataJobUrn - -assert AIRFLOW_PATCHED - -if TYPE_CHECKING: - from airflow import DAG - from airflow.models import DagRun, TaskInstance - - from datahub.emitter.kafka_emitter import DatahubKafkaEmitter - from datahub.emitter.rest_emitter import DatahubRestEmitter - from datahub_provider._airflow_shims import Operator - - -def _task_downstream_task_ids(operator: "Operator") -> Set[str]: - if hasattr(operator, "downstream_task_ids"): - return operator.downstream_task_ids - return operator._downstream_task_id # type: ignore[attr-defined,union-attr] - - -class AirflowGenerator: - @staticmethod - def _get_dependencies( - task: "Operator", dag: "DAG", flow_urn: DataFlowUrn - ) -> List[DataJobUrn]: - from datahub_provider._airflow_shims import ExternalTaskSensor - - # resolve URNs for upstream nodes in subdags upstream of the current task. - upstream_subdag_task_urns: List[DataJobUrn] = [] - - for upstream_task_id in task.upstream_task_ids: - upstream_task = dag.task_dict[upstream_task_id] - - # if upstream task is not a subdag, then skip it - upstream_subdag = getattr(upstream_task, "subdag", None) - if upstream_subdag is None: - continue - - # else, link the leaf tasks of the upstream subdag as upstream tasks - for upstream_subdag_task_id in upstream_subdag.task_dict: - upstream_subdag_task = upstream_subdag.task_dict[ - upstream_subdag_task_id - ] - - upstream_subdag_task_urn = DataJobUrn.create_from_ids( - job_id=upstream_subdag_task_id, data_flow_urn=str(flow_urn) - ) - - # if subdag task is a leaf task, then link it as an upstream task - if len(_task_downstream_task_ids(upstream_subdag_task)) == 0: - upstream_subdag_task_urns.append(upstream_subdag_task_urn) - - # resolve URNs for upstream nodes that trigger the subdag containing the current task. 
- # (if it is in a subdag at all) - upstream_subdag_triggers: List[DataJobUrn] = [] - - # subdags are always named with 'parent.child' style or Airflow won't run them - # add connection from subdag trigger(s) if subdag task has no upstreams - if ( - dag.is_subdag - and dag.parent_dag is not None - and len(task.upstream_task_ids) == 0 - ): - # filter through the parent dag's tasks and find the subdag trigger(s) - subdags = [ - x for x in dag.parent_dag.task_dict.values() if x.subdag is not None - ] - matched_subdags = [ - x for x in subdags if x.subdag and x.subdag.dag_id == dag.dag_id - ] - - # id of the task containing the subdag - subdag_task_id = matched_subdags[0].task_id - - # iterate through the parent dag's tasks and find the ones that trigger the subdag - for upstream_task_id in dag.parent_dag.task_dict: - upstream_task = dag.parent_dag.task_dict[upstream_task_id] - upstream_task_urn = DataJobUrn.create_from_ids( - data_flow_urn=str(flow_urn), job_id=upstream_task_id - ) - - # if the task triggers the subdag, link it to this node in the subdag - if subdag_task_id in _task_downstream_task_ids(upstream_task): - upstream_subdag_triggers.append(upstream_task_urn) - - # If the operator is an ExternalTaskSensor then we set the remote task as upstream. - # It is possible to tie an external sensor to DAG if external_task_id is omitted but currently we can't tie - # jobflow to anothet jobflow. - external_task_upstreams = [] - if task.task_type == "ExternalTaskSensor": - task = cast(ExternalTaskSensor, task) - if hasattr(task, "external_task_id") and task.external_task_id is not None: - external_task_upstreams = [ - DataJobUrn.create_from_ids( - job_id=task.external_task_id, - data_flow_urn=str( - DataFlowUrn.create_from_ids( - orchestrator=flow_urn.get_orchestrator_name(), - flow_id=task.external_dag_id, - env=flow_urn.get_env(), - ) - ), - ) - ] - # exclude subdag operator tasks since these are not emitted, resulting in empty metadata - upstream_tasks = ( - [ - DataJobUrn.create_from_ids(job_id=task_id, data_flow_urn=str(flow_urn)) - for task_id in task.upstream_task_ids - if getattr(dag.task_dict[task_id], "subdag", None) is None - ] - + upstream_subdag_task_urns - + upstream_subdag_triggers - + external_task_upstreams - ) - return upstream_tasks - - @staticmethod - def generate_dataflow( - cluster: str, - dag: "DAG", - capture_owner: bool = True, - capture_tags: bool = True, - ) -> DataFlow: - """ - Generates a Dataflow object from an Airflow DAG - :param cluster: str - name of the cluster - :param dag: DAG - - :param capture_tags: - :param capture_owner: - :return: DataFlow - Data generated dataflow - """ - id = dag.dag_id - orchestrator = "airflow" - description = f"{dag.description}\n\n{dag.doc_md or ''}" - data_flow = DataFlow( - env=cluster, id=id, orchestrator=orchestrator, description=description - ) - - flow_property_bag: Dict[str, str] = {} - - allowed_flow_keys = [ - "_access_control", - "_concurrency", - "_default_view", - "catchup", - "fileloc", - "is_paused_upon_creation", - "start_date", - "tags", - "timezone", - ] - - for key in allowed_flow_keys: - if hasattr(dag, key): - flow_property_bag[key] = repr(getattr(dag, key)) - - data_flow.properties = flow_property_bag - base_url = conf.get("webserver", "base_url") - data_flow.url = f"{base_url}/tree?dag_id={dag.dag_id}" - - if capture_owner and dag.owner: - data_flow.owners.add(dag.owner) - - if capture_tags and dag.tags: - data_flow.tags.update(dag.tags) - - return data_flow - - @staticmethod - def _get_description(task: 
"Operator") -> Optional[str]: - from airflow.models.baseoperator import BaseOperator - - if not isinstance(task, BaseOperator): - # TODO: Get docs for mapped operators. - return None - - if hasattr(task, "doc") and task.doc: - return task.doc - elif hasattr(task, "doc_md") and task.doc_md: - return task.doc_md - elif hasattr(task, "doc_json") and task.doc_json: - return task.doc_json - elif hasattr(task, "doc_yaml") and task.doc_yaml: - return task.doc_yaml - elif hasattr(task, "doc_rst") and task.doc_yaml: - return task.doc_yaml - return None - - @staticmethod - def generate_datajob( - cluster: str, - task: "Operator", - dag: "DAG", - set_dependencies: bool = True, - capture_owner: bool = True, - capture_tags: bool = True, - ) -> DataJob: - """ - - :param cluster: str - :param task: TaskIntance - :param dag: DAG - :param set_dependencies: bool - whether to extract dependencies from airflow task - :param capture_owner: bool - whether to extract owner from airflow task - :param capture_tags: bool - whether to set tags automatically from airflow task - :return: DataJob - returns the generated DataJob object - """ - dataflow_urn = DataFlowUrn.create_from_ids( - orchestrator="airflow", env=cluster, flow_id=dag.dag_id - ) - datajob = DataJob(id=task.task_id, flow_urn=dataflow_urn) - - # TODO add support for MappedOperator - datajob.description = AirflowGenerator._get_description(task) - - job_property_bag: Dict[str, str] = {} - - allowed_task_keys = [ - "_downstream_task_ids", - "_inlets", - "_outlets", - "_task_type", - "_task_module", - "depends_on_past", - "email", - "label", - "execution_timeout", - "sla", - "sql", - "task_id", - "trigger_rule", - "wait_for_downstream", - # In Airflow 2.3, _downstream_task_ids was renamed to downstream_task_ids - "downstream_task_ids", - # In Airflow 2.4, _inlets and _outlets were removed in favor of non-private versions. 
- "inlets", - "outlets", - ] - - for key in allowed_task_keys: - if hasattr(task, key): - job_property_bag[key] = repr(getattr(task, key)) - - datajob.properties = job_property_bag - base_url = conf.get("webserver", "base_url") - datajob.url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={datajob.flow_urn.get_flow_id()}&_flt_3_task_id={task.task_id}" - - if capture_owner and dag.owner: - datajob.owners.add(dag.owner) - - if capture_tags and dag.tags: - datajob.tags.update(dag.tags) - - if set_dependencies: - datajob.upstream_urns.extend( - AirflowGenerator._get_dependencies( - task=task, dag=dag, flow_urn=datajob.flow_urn - ) - ) - - return datajob - - @staticmethod - def create_datajob_instance( - cluster: str, - task: "Operator", - dag: "DAG", - data_job: Optional[DataJob] = None, - ) -> DataProcessInstance: - if data_job is None: - data_job = AirflowGenerator.generate_datajob(cluster, task=task, dag=dag) - dpi = DataProcessInstance.from_datajob( - datajob=data_job, id=task.task_id, clone_inlets=True, clone_outlets=True - ) - return dpi - - @staticmethod - def run_dataflow( - emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], - cluster: str, - dag_run: "DagRun", - start_timestamp_millis: Optional[int] = None, - dataflow: Optional[DataFlow] = None, - ) -> None: - if dataflow is None: - assert dag_run.dag - dataflow = AirflowGenerator.generate_dataflow(cluster, dag_run.dag) - - if start_timestamp_millis is None: - assert dag_run.execution_date - start_timestamp_millis = int(dag_run.execution_date.timestamp() * 1000) - - assert dag_run.run_id - dpi = DataProcessInstance.from_dataflow(dataflow=dataflow, id=dag_run.run_id) - - # This property only exists in Airflow2 - if hasattr(dag_run, "run_type"): - from airflow.utils.types import DagRunType - - if dag_run.run_type == DagRunType.SCHEDULED: - dpi.type = DataProcessTypeClass.BATCH_SCHEDULED - elif dag_run.run_type == DagRunType.MANUAL: - dpi.type = DataProcessTypeClass.BATCH_AD_HOC - else: - if dag_run.run_id.startswith("scheduled__"): - dpi.type = DataProcessTypeClass.BATCH_SCHEDULED - else: - dpi.type = DataProcessTypeClass.BATCH_AD_HOC - - property_bag: Dict[str, str] = {} - property_bag["run_id"] = str(dag_run.run_id) - property_bag["execution_date"] = str(dag_run.execution_date) - property_bag["end_date"] = str(dag_run.end_date) - property_bag["start_date"] = str(dag_run.start_date) - property_bag["creating_job_id"] = str(dag_run.creating_job_id) - property_bag["data_interval_start"] = str(dag_run.data_interval_start) - property_bag["data_interval_end"] = str(dag_run.data_interval_end) - property_bag["external_trigger"] = str(dag_run.external_trigger) - dpi.properties.update(property_bag) - - dpi.emit_process_start( - emitter=emitter, start_timestamp_millis=start_timestamp_millis - ) - - @staticmethod - def complete_dataflow( - emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], - cluster: str, - dag_run: "DagRun", - end_timestamp_millis: Optional[int] = None, - dataflow: Optional[DataFlow] = None, - ) -> None: - """ - - :param emitter: DatahubRestEmitter - the datahub rest emitter to emit the generated mcps - :param cluster: str - name of the cluster - :param dag_run: DagRun - :param end_timestamp_millis: Optional[int] - the completion time in milliseconds if not set the current time will be used. 
- :param dataflow: Optional[Dataflow] - """ - if dataflow is None: - assert dag_run.dag - dataflow = AirflowGenerator.generate_dataflow(cluster, dag_run.dag) - - assert dag_run.run_id - dpi = DataProcessInstance.from_dataflow(dataflow=dataflow, id=dag_run.run_id) - if end_timestamp_millis is None: - if dag_run.end_date is None: - raise Exception( - f"Dag {dag_run.dag_id}_{dag_run.run_id} is still running and unable to get end_date..." - ) - end_timestamp_millis = int(dag_run.end_date.timestamp() * 1000) - - # We should use DagRunState but it is not available in Airflow 1 - if dag_run.state == "success": - result = InstanceRunResult.SUCCESS - elif dag_run.state == "failed": - result = InstanceRunResult.FAILURE - else: - raise Exception( - f"Result should be either success or failure and it was {dag_run.state}" - ) - - dpi.emit_process_end( - emitter=emitter, - end_timestamp_millis=end_timestamp_millis, - result=result, - result_type="airflow", - ) - - @staticmethod - def run_datajob( - emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], - cluster: str, - ti: "TaskInstance", - dag: "DAG", - dag_run: "DagRun", - start_timestamp_millis: Optional[int] = None, - datajob: Optional[DataJob] = None, - attempt: Optional[int] = None, - emit_templates: bool = True, - ) -> DataProcessInstance: - if datajob is None: - datajob = AirflowGenerator.generate_datajob(cluster, ti.task, dag) - - assert dag_run.run_id - dpi = DataProcessInstance.from_datajob( - datajob=datajob, - id=f"{dag.dag_id}_{ti.task_id}_{dag_run.run_id}", - clone_inlets=True, - clone_outlets=True, - ) - job_property_bag: Dict[str, str] = {} - job_property_bag["run_id"] = str(dag_run.run_id) - job_property_bag["duration"] = str(ti.duration) - job_property_bag["start_date"] = str(ti.start_date) - job_property_bag["end_date"] = str(ti.end_date) - job_property_bag["execution_date"] = str(ti.execution_date) - job_property_bag["try_number"] = str(ti.try_number - 1) - job_property_bag["hostname"] = str(ti.hostname) - job_property_bag["max_tries"] = str(ti.max_tries) - # Not compatible with Airflow 1 - if hasattr(ti, "external_executor_id"): - job_property_bag["external_executor_id"] = str(ti.external_executor_id) - job_property_bag["pid"] = str(ti.pid) - job_property_bag["state"] = str(ti.state) - job_property_bag["operator"] = str(ti.operator) - job_property_bag["priority_weight"] = str(ti.priority_weight) - job_property_bag["unixname"] = str(ti.unixname) - job_property_bag["log_url"] = ti.log_url - dpi.properties.update(job_property_bag) - dpi.url = ti.log_url - - # This property only exists in Airflow2 - if hasattr(ti, "dag_run") and hasattr(ti.dag_run, "run_type"): - from airflow.utils.types import DagRunType - - if ti.dag_run.run_type == DagRunType.SCHEDULED: - dpi.type = DataProcessTypeClass.BATCH_SCHEDULED - elif ti.dag_run.run_type == DagRunType.MANUAL: - dpi.type = DataProcessTypeClass.BATCH_AD_HOC - else: - if dag_run.run_id.startswith("scheduled__"): - dpi.type = DataProcessTypeClass.BATCH_SCHEDULED - else: - dpi.type = DataProcessTypeClass.BATCH_AD_HOC - - if start_timestamp_millis is None: - assert ti.start_date - start_timestamp_millis = int(ti.start_date.timestamp() * 1000) - - if attempt is None: - attempt = ti.try_number - - dpi.emit_process_start( - emitter=emitter, - start_timestamp_millis=start_timestamp_millis, - attempt=attempt, - emit_template=emit_templates, - ) - return dpi - - @staticmethod - def complete_datajob( - emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], - cluster: str, - ti: 
"TaskInstance", - dag: "DAG", - dag_run: "DagRun", - end_timestamp_millis: Optional[int] = None, - result: Optional[InstanceRunResult] = None, - datajob: Optional[DataJob] = None, - ) -> DataProcessInstance: - """ - - :param emitter: DatahubRestEmitter - :param cluster: str - :param ti: TaskInstance - :param dag: DAG - :param dag_run: DagRun - :param end_timestamp_millis: Optional[int] - :param result: Optional[str] One of the result from datahub.metadata.schema_class.RunResultTypeClass - :param datajob: Optional[DataJob] - :return: DataProcessInstance - """ - if datajob is None: - datajob = AirflowGenerator.generate_datajob(cluster, ti.task, dag) - - if end_timestamp_millis is None: - assert ti.end_date - end_timestamp_millis = int(ti.end_date.timestamp() * 1000) - - if result is None: - # We should use TaskInstanceState but it is not available in Airflow 1 - if ti.state == "success": - result = InstanceRunResult.SUCCESS - elif ti.state == "failed": - result = InstanceRunResult.FAILURE - else: - raise Exception( - f"Result should be either success or failure and it was {ti.state}" - ) - - dpi = DataProcessInstance.from_datajob( - datajob=datajob, - id=f"{dag.dag_id}_{ti.task_id}_{dag_run.run_id}", - clone_inlets=True, - clone_outlets=True, - ) - dpi.emit_process_end( - emitter=emitter, - end_timestamp_millis=end_timestamp_millis, - result=result, - result_type="airflow", - ) - return dpi +__all__ = ["AirflowGenerator"] diff --git a/metadata-ingestion/src/datahub_provider/entities.py b/metadata-ingestion/src/datahub_provider/entities.py index bfccc2f22eeb8..13be4ecdad655 100644 --- a/metadata-ingestion/src/datahub_provider/entities.py +++ b/metadata-ingestion/src/datahub_provider/entities.py @@ -1,48 +1,3 @@ -from abc import abstractmethod -from typing import Optional +from datahub_airflow_plugin.entities import Dataset, Urn, _Entity -import attr - -import datahub.emitter.mce_builder as builder -from datahub.utilities.urns.urn import guess_entity_type - - -class _Entity: - @property - @abstractmethod - def urn(self) -> str: - pass - - -@attr.s(auto_attribs=True, str=True) -class Dataset(_Entity): - platform: str - name: str - env: str = builder.DEFAULT_ENV - platform_instance: Optional[str] = None - - @property - def urn(self): - return builder.make_dataset_urn_with_platform_instance( - platform=self.platform, - name=self.name, - platform_instance=self.platform_instance, - env=self.env, - ) - - -@attr.s(str=True) -class Urn(_Entity): - _urn: str = attr.ib() - - @_urn.validator - def _validate_urn(self, attribute, value): - if not value.startswith("urn:"): - raise ValueError("invalid urn provided: urns must start with 'urn:'") - if guess_entity_type(value) != "dataset": - # This is because DataJobs only support Dataset lineage. 
- raise ValueError("Airflow lineage currently only supports datasets") - - @property - def urn(self): - return self._urn +__all__ = ["_Entity", "Dataset", "Urn"] diff --git a/metadata-ingestion/src/datahub_provider/hooks/datahub.py b/metadata-ingestion/src/datahub_provider/hooks/datahub.py index e2e523fc5d6af..949d98ce631ed 100644 --- a/metadata-ingestion/src/datahub_provider/hooks/datahub.py +++ b/metadata-ingestion/src/datahub_provider/hooks/datahub.py @@ -1,216 +1,8 @@ -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union - -from airflow.exceptions import AirflowException -from airflow.hooks.base import BaseHook - -from datahub.metadata.com.linkedin.pegasus2avro.mxe import ( - MetadataChangeEvent, - MetadataChangeProposal, +from datahub_airflow_plugin.hooks.datahub import ( + BaseHook, + DatahubGenericHook, + DatahubKafkaHook, + DatahubRestHook, ) -if TYPE_CHECKING: - from airflow.models.connection import Connection - - from datahub.emitter.kafka_emitter import DatahubKafkaEmitter - from datahub.emitter.rest_emitter import DatahubRestEmitter - from datahub.ingestion.sink.datahub_kafka import KafkaSinkConfig - - -class DatahubRestHook(BaseHook): - """ - Creates a DataHub Rest API connection used to send metadata to DataHub. - Takes the endpoint for your DataHub Rest API in the Server Endpoint(host) field. - - URI example: :: - - AIRFLOW_CONN_DATAHUB_REST_DEFAULT='datahub-rest://rest-endpoint' - - :param datahub_rest_conn_id: Reference to the DataHub Rest connection. - :type datahub_rest_conn_id: str - """ - - conn_name_attr = "datahub_rest_conn_id" - default_conn_name = "datahub_rest_default" - conn_type = "datahub_rest" - hook_name = "DataHub REST Server" - - def __init__(self, datahub_rest_conn_id: str = default_conn_name) -> None: - super().__init__() - self.datahub_rest_conn_id = datahub_rest_conn_id - - @staticmethod - def get_connection_form_widgets() -> Dict[str, Any]: - return {} - - @staticmethod - def get_ui_field_behaviour() -> Dict: - """Returns custom field behavior""" - return { - "hidden_fields": ["port", "schema", "login"], - "relabeling": { - "host": "Server Endpoint", - }, - } - - def _get_config(self) -> Tuple[str, Optional[str], Optional[int]]: - conn: "Connection" = self.get_connection(self.datahub_rest_conn_id) - - host = conn.host - if not host: - raise AirflowException("host parameter is required") - if conn.port: - if ":" in host: - raise AirflowException( - "host parameter should not contain a port number if the port is specified separately" - ) - host = f"{host}:{conn.port}" - password = conn.password - timeout_sec = conn.extra_dejson.get("timeout_sec") - return (host, password, timeout_sec) - - def make_emitter(self) -> "DatahubRestEmitter": - import datahub.emitter.rest_emitter - - return datahub.emitter.rest_emitter.DatahubRestEmitter(*self._get_config()) - - def emit_mces(self, mces: List[MetadataChangeEvent]) -> None: - emitter = self.make_emitter() - - for mce in mces: - emitter.emit_mce(mce) - - def emit_mcps(self, mcps: List[MetadataChangeProposal]) -> None: - emitter = self.make_emitter() - - for mce in mcps: - emitter.emit_mcp(mce) - - -class DatahubKafkaHook(BaseHook): - """ - Creates a DataHub Kafka connection used to send metadata to DataHub. - Takes your kafka broker in the Kafka Broker(host) field. - - URI example: :: - - AIRFLOW_CONN_DATAHUB_KAFKA_DEFAULT='datahub-kafka://kafka-broker' - - :param datahub_kafka_conn_id: Reference to the DataHub Kafka connection. 
- :type datahub_kafka_conn_id: str - """ - - conn_name_attr = "datahub_kafka_conn_id" - default_conn_name = "datahub_kafka_default" - conn_type = "datahub_kafka" - hook_name = "DataHub Kafka Sink" - - def __init__(self, datahub_kafka_conn_id: str = default_conn_name) -> None: - super().__init__() - self.datahub_kafka_conn_id = datahub_kafka_conn_id - - @staticmethod - def get_connection_form_widgets() -> Dict[str, Any]: - return {} - - @staticmethod - def get_ui_field_behaviour() -> Dict: - """Returns custom field behavior""" - return { - "hidden_fields": ["port", "schema", "login", "password"], - "relabeling": { - "host": "Kafka Broker", - }, - } - - def _get_config(self) -> "KafkaSinkConfig": - import datahub.ingestion.sink.datahub_kafka - - conn = self.get_connection(self.datahub_kafka_conn_id) - obj = conn.extra_dejson - obj.setdefault("connection", {}) - if conn.host is not None: - if "bootstrap" in obj["connection"]: - raise AirflowException( - "Kafka broker specified twice (present in host and extra)" - ) - obj["connection"]["bootstrap"] = ":".join( - map(str, filter(None, [conn.host, conn.port])) - ) - config = datahub.ingestion.sink.datahub_kafka.KafkaSinkConfig.parse_obj(obj) - return config - - def make_emitter(self) -> "DatahubKafkaEmitter": - import datahub.emitter.kafka_emitter - - sink_config = self._get_config() - return datahub.emitter.kafka_emitter.DatahubKafkaEmitter(sink_config) - - def emit_mces(self, mces: List[MetadataChangeEvent]) -> None: - emitter = self.make_emitter() - errors = [] - - def callback(exc, msg): - if exc: - errors.append(exc) - - for mce in mces: - emitter.emit_mce_async(mce, callback) - - emitter.flush() - - if errors: - raise AirflowException(f"failed to push some MCEs: {errors}") - - def emit_mcps(self, mcps: List[MetadataChangeProposal]) -> None: - emitter = self.make_emitter() - errors = [] - - def callback(exc, msg): - if exc: - errors.append(exc) - - for mcp in mcps: - emitter.emit_mcp_async(mcp, callback) - - emitter.flush() - - if errors: - raise AirflowException(f"failed to push some MCPs: {errors}") - - -class DatahubGenericHook(BaseHook): - """ - Emits Metadata Change Events using either the DatahubRestHook or the - DatahubKafkaHook. Set up a DataHub Rest or Kafka connection to use. - - :param datahub_conn_id: Reference to the DataHub connection. - :type datahub_conn_id: str - """ - - def __init__(self, datahub_conn_id: str) -> None: - super().__init__() - self.datahub_conn_id = datahub_conn_id - - def get_underlying_hook(self) -> Union[DatahubRestHook, DatahubKafkaHook]: - conn = self.get_connection(self.datahub_conn_id) - - # We need to figure out the underlying hook type. First check the - # conn_type. If that fails, attempt to guess using the conn id name. 
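Editor's note, a minimal sketch rather than part of the diff: the connection URIs shown in the hook docstrings above can be supplied through Airflow's standard `AIRFLOW_CONN_*` environment variables; the hostnames below are placeholders, not values from this repository.

```python
# Sketch only: register the REST and Kafka connections the DataHub hooks expect.
# With connection ids like these, DatahubGenericHook can resolve the underlying
# hook from conn_type, or from the "rest"/"kafka" substring fallback described
# in the comment above. Hostnames here are assumptions.
import os

os.environ["AIRFLOW_CONN_DATAHUB_REST_DEFAULT"] = "datahub-rest://datahub-gms:8080"
os.environ["AIRFLOW_CONN_DATAHUB_KAFKA_DEFAULT"] = "datahub-kafka://broker:9092"
```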
- if conn.conn_type == DatahubRestHook.conn_type: - return DatahubRestHook(self.datahub_conn_id) - elif conn.conn_type == DatahubKafkaHook.conn_type: - return DatahubKafkaHook(self.datahub_conn_id) - elif "rest" in self.datahub_conn_id: - return DatahubRestHook(self.datahub_conn_id) - elif "kafka" in self.datahub_conn_id: - return DatahubKafkaHook(self.datahub_conn_id) - else: - raise AirflowException( - f"DataHub cannot handle conn_type {conn.conn_type} in {conn}" - ) - - def make_emitter(self) -> Union["DatahubRestEmitter", "DatahubKafkaEmitter"]: - return self.get_underlying_hook().make_emitter() - - def emit_mces(self, mces: List[MetadataChangeEvent]) -> None: - return self.get_underlying_hook().emit_mces(mces) +__all__ = ["DatahubRestHook", "DatahubKafkaHook", "DatahubGenericHook", "BaseHook"] diff --git a/metadata-ingestion/src/datahub_provider/lineage/datahub.py b/metadata-ingestion/src/datahub_provider/lineage/datahub.py index 009ce4bb29a97..ffe1adb8255b2 100644 --- a/metadata-ingestion/src/datahub_provider/lineage/datahub.py +++ b/metadata-ingestion/src/datahub_provider/lineage/datahub.py @@ -1,91 +1,6 @@ -import json -from typing import TYPE_CHECKING, Dict, List, Optional - -from airflow.configuration import conf -from airflow.lineage.backend import LineageBackend - -from datahub_provider._lineage_core import ( - DatahubBasicLineageConfig, - send_lineage_to_datahub, +from datahub_airflow_plugin.lineage.datahub import ( + DatahubLineageBackend, + DatahubLineageConfig, ) -if TYPE_CHECKING: - from airflow.models.baseoperator import BaseOperator - - -class DatahubLineageConfig(DatahubBasicLineageConfig): - # If set to true, most runtime errors in the lineage backend will be - # suppressed and will not cause the overall task to fail. Note that - # configuration issues will still throw exceptions. - graceful_exceptions: bool = True - - -def get_lineage_config() -> DatahubLineageConfig: - """Load the lineage config from airflow.cfg.""" - - # The kwargs pattern is also used for secret backends. - kwargs_str = conf.get("lineage", "datahub_kwargs", fallback="{}") - kwargs = json.loads(kwargs_str) - - # Continue to support top-level datahub_conn_id config. - datahub_conn_id = conf.get("lineage", "datahub_conn_id", fallback=None) - if datahub_conn_id: - kwargs["datahub_conn_id"] = datahub_conn_id - - return DatahubLineageConfig.parse_obj(kwargs) - - -class DatahubLineageBackend(LineageBackend): - """ - Sends lineage data from tasks to DataHub. - - Configurable via ``airflow.cfg`` as follows: :: - - # For REST-based: - airflow connections add --conn-type 'datahub_rest' 'datahub_rest_default' --conn-host 'http://localhost:8080' - # For Kafka-based (standard Kafka sink config can be passed via extras): - airflow connections add --conn-type 'datahub_kafka' 'datahub_kafka_default' --conn-host 'broker:9092' --conn-extra '{}' - - [lineage] - backend = datahub_provider.lineage.datahub.DatahubLineageBackend - datahub_kwargs = { - "datahub_conn_id": "datahub_rest_default", - "capture_ownership_info": true, - "capture_tags_info": true, - "graceful_exceptions": true } - # The above indentation is important! - """ - - def __init__(self) -> None: - super().__init__() - - # By attempting to get and parse the config, we can detect configuration errors - # ahead of time. The init method is only called in Airflow 2.x. - _ = get_lineage_config() - - # With Airflow 2.0, this can be an instance method. 
However, with Airflow 1.10.x, this - # method is used statically, even though LineageBackend declares it as an instance variable. - @staticmethod - def send_lineage( - operator: "BaseOperator", - inlets: Optional[List] = None, # unused - outlets: Optional[List] = None, # unused - context: Optional[Dict] = None, - ) -> None: - config = get_lineage_config() - if not config.enabled: - return - - try: - context = context or {} # ensure not None to satisfy mypy - send_lineage_to_datahub( - config, operator, operator.inlets, operator.outlets, context - ) - except Exception as e: - if config.graceful_exceptions: - operator.log.error(e) - operator.log.info( - "Suppressing error because graceful_exceptions is set" - ) - else: - raise +__all__ = ["DatahubLineageBackend", "DatahubLineageConfig"] diff --git a/metadata-ingestion/src/datahub_provider/operators/datahub.py b/metadata-ingestion/src/datahub_provider/operators/datahub.py index cd1d5187e6d85..08b1807cd4614 100644 --- a/metadata-ingestion/src/datahub_provider/operators/datahub.py +++ b/metadata-ingestion/src/datahub_provider/operators/datahub.py @@ -1,63 +1,6 @@ -from typing import List, Union - -from airflow.models import BaseOperator -from airflow.utils.decorators import apply_defaults - -from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent -from datahub_provider.hooks.datahub import ( - DatahubGenericHook, - DatahubKafkaHook, - DatahubRestHook, +from datahub_airflow_plugin.operators.datahub import ( + DatahubBaseOperator, + DatahubEmitterOperator, ) - -class DatahubBaseOperator(BaseOperator): - """ - The DatahubBaseOperator is used as a base operator all DataHub operators. - """ - - ui_color = "#4398c8" - - hook: Union[DatahubRestHook, DatahubKafkaHook] - - # mypy is not a fan of this. Newer versions of Airflow support proper typing for the decorator - # using PEP 612. However, there is not yet a good way to inherit the types of the kwargs from - # the superclass. - @apply_defaults # type: ignore[misc] - def __init__( # type: ignore[no-untyped-def] - self, - *, - datahub_conn_id: str, - **kwargs, - ): - super().__init__(**kwargs) - - self.datahub_conn_id = datahub_conn_id - self.generic_hook = DatahubGenericHook(datahub_conn_id) - - -class DatahubEmitterOperator(DatahubBaseOperator): - """ - Emits a Metadata Change Event to DataHub using either a DataHub - Rest or Kafka connection. - - :param datahub_conn_id: Reference to the DataHub Rest or Kafka Connection. - :type datahub_conn_id: str - """ - - # See above for why these mypy type issues are ignored here. 
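For orientation, a hedged usage sketch of the operator this hunk turns into a re-export. The dataset names and connection id are illustrative; `make_lineage_mce` and `make_dataset_urn` are helpers from `datahub.emitter.mce_builder`.

```python
# Sketch only: emit a simple lineage MCE with the re-exported operator.
# Dataset names and the connection id are placeholders, and this would
# normally sit inside a DAG definition.
import datahub.emitter.mce_builder as builder
from datahub_airflow_plugin.operators.datahub import DatahubEmitterOperator

emit_lineage = DatahubEmitterOperator(
    task_id="emit_lineage",
    datahub_conn_id="datahub_rest_default",
    mces=[
        builder.make_lineage_mce(
            upstream_urns=[builder.make_dataset_urn("snowflake", "mydb.schema.src_table")],
            downstream_urn=builder.make_dataset_urn("snowflake", "mydb.schema.dst_table"),
        )
    ],
)
```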
- @apply_defaults # type: ignore[misc] - def __init__( # type: ignore[no-untyped-def] - self, - mces: List[MetadataChangeEvent], - datahub_conn_id: str, - **kwargs, - ): - super().__init__( - datahub_conn_id=datahub_conn_id, - **kwargs, - ) - self.mces = mces - - def execute(self, context): - self.generic_hook.get_underlying_hook().emit_mces(self.mces) +__all__ = ["DatahubEmitterOperator", "DatahubBaseOperator"] diff --git a/metadata-ingestion/src/datahub_provider/operators/datahub_assertion_operator.py b/metadata-ingestion/src/datahub_provider/operators/datahub_assertion_operator.py index 28be8ad860179..85469c10f271c 100644 --- a/metadata-ingestion/src/datahub_provider/operators/datahub_assertion_operator.py +++ b/metadata-ingestion/src/datahub_provider/operators/datahub_assertion_operator.py @@ -1,78 +1,5 @@ -import datetime -from typing import Any, List, Optional, Sequence, Union - -from airflow.models import BaseOperator - -from datahub.api.circuit_breaker import ( - AssertionCircuitBreaker, - AssertionCircuitBreakerConfig, +from datahub_airflow_plugin.operators.datahub_assertion_operator import ( + DataHubAssertionOperator, ) -from datahub_provider.hooks.datahub import DatahubRestHook - - -class DataHubAssertionOperator(BaseOperator): - r""" - DataHub Assertion Circuit Breaker Operator. - - :param urn: The DataHub dataset unique identifier. (templated) - :param datahub_rest_conn_id: The REST datahub connection id to communicate with DataHub - which is set as Airflow connection. - :param check_last_assertion_time: If set it checks assertions after the last operation was set on the dataset. - By default it is True. - :param time_delta: If verify_after_last_update is False it checks for assertion within the time delta. - """ - - template_fields: Sequence[str] = ("urn",) - circuit_breaker: AssertionCircuitBreaker - urn: Union[List[str], str] - - def __init__( # type: ignore[no-untyped-def] - self, - *, - urn: Union[List[str], str], - datahub_rest_conn_id: Optional[str] = None, - check_last_assertion_time: bool = True, - time_delta: Optional[datetime.timedelta] = None, - **kwargs, - ) -> None: - super().__init__(**kwargs) - hook: DatahubRestHook - if datahub_rest_conn_id is not None: - hook = DatahubRestHook(datahub_rest_conn_id=datahub_rest_conn_id) - else: - hook = DatahubRestHook() - - host, password, timeout_sec = hook._get_config() - self.urn = urn - config: AssertionCircuitBreakerConfig = AssertionCircuitBreakerConfig( - datahub_host=host, - datahub_token=password, - timeout=timeout_sec, - verify_after_last_update=check_last_assertion_time, - time_delta=time_delta if time_delta else datetime.timedelta(days=1), - ) - - self.circuit_breaker = AssertionCircuitBreaker(config=config) - - def execute(self, context: Any) -> bool: - if "datahub_silence_circuit_breakers" in context["dag_run"].conf: - self.log.info( - "Circuit breaker is silenced because datahub_silence_circuit_breakers config is set" - ) - return True - - self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") - if isinstance(self.urn, str): - urns = [self.urn] - elif isinstance(self.urn, list): - urns = self.urn - else: - raise Exception(f"urn parameter has invalid type {type(self.urn)}") - - for urn in urns: - self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") - ret = self.circuit_breaker.is_circuit_breaker_active(urn=urn) - if ret: - raise Exception(f"Dataset {self.urn} is not in consumable state") - return True +__all__ = ["DataHubAssertionOperator"] diff --git 
a/metadata-ingestion/src/datahub_provider/operators/datahub_assertion_sensor.py b/metadata-ingestion/src/datahub_provider/operators/datahub_assertion_sensor.py index ceb970dd8dc7f..e560ecb6145e0 100644 --- a/metadata-ingestion/src/datahub_provider/operators/datahub_assertion_sensor.py +++ b/metadata-ingestion/src/datahub_provider/operators/datahub_assertion_sensor.py @@ -1,78 +1,5 @@ -import datetime -from typing import Any, List, Optional, Sequence, Union - -from airflow.sensors.base import BaseSensorOperator - -from datahub.api.circuit_breaker import ( - AssertionCircuitBreaker, - AssertionCircuitBreakerConfig, +from datahub_airflow_plugin.operators.datahub_assertion_sensor import ( + DataHubAssertionSensor, ) -from datahub_provider.hooks.datahub import DatahubRestHook - - -class DataHubAssertionSensor(BaseSensorOperator): - r""" - DataHub Assertion Circuit Breaker Sensor. - - :param urn: The DataHub dataset unique identifier. (templated) - :param datahub_rest_conn_id: The REST datahub connection id to communicate with DataHub - which is set as Airflow connection. - :param check_last_assertion_time: If set it checks assertions after the last operation was set on the dataset. - By default it is True. - :param time_delta: If verify_after_last_update is False it checks for assertion within the time delta. - """ - - template_fields: Sequence[str] = ("urn",) - circuit_breaker: AssertionCircuitBreaker - urn: Union[List[str], str] - - def __init__( # type: ignore[no-untyped-def] - self, - *, - urn: Union[List[str], str], - datahub_rest_conn_id: Optional[str] = None, - check_last_assertion_time: bool = True, - time_delta: datetime.timedelta = datetime.timedelta(days=1), - **kwargs, - ) -> None: - super().__init__(**kwargs) - hook: DatahubRestHook - if datahub_rest_conn_id is not None: - hook = DatahubRestHook(datahub_rest_conn_id=datahub_rest_conn_id) - else: - hook = DatahubRestHook() - - host, password, timeout_sec = hook._get_config() - self.urn = urn - config: AssertionCircuitBreakerConfig = AssertionCircuitBreakerConfig( - datahub_host=host, - datahub_token=password, - timeout=timeout_sec, - verify_after_last_update=check_last_assertion_time, - time_delta=time_delta, - ) - self.circuit_breaker = AssertionCircuitBreaker(config=config) - - def poke(self, context: Any) -> bool: - if "datahub_silence_circuit_breakers" in context["dag_run"].conf: - self.log.info( - "Circuit breaker is silenced because datahub_silence_circuit_breakers config is set" - ) - return True - - self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") - if isinstance(self.urn, str): - urns = [self.urn] - elif isinstance(self.urn, list): - urns = self.urn - else: - raise Exception(f"urn parameter has invalid type {type(self.urn)}") - - for urn in urns: - self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") - ret = self.circuit_breaker.is_circuit_breaker_active(urn=urn) - if ret: - self.log.info(f"Dataset {self.urn} is not in consumable state") - return False - return True +__all__ = ["DataHubAssertionSensor"] diff --git a/metadata-ingestion/src/datahub_provider/operators/datahub_operation_operator.py b/metadata-ingestion/src/datahub_provider/operators/datahub_operation_operator.py index 6b2535994c101..6107e70c9eddd 100644 --- a/metadata-ingestion/src/datahub_provider/operators/datahub_operation_operator.py +++ b/metadata-ingestion/src/datahub_provider/operators/datahub_operation_operator.py @@ -1,97 +1,5 @@ -import datetime -from typing import Any, List, Optional, Sequence, Union - 
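As a hedged illustration of the circuit-breaker sensors re-exported in these hunks (the urn and connection id below are placeholders):

```python
# Sketch only: gate downstream tasks on DataHub assertion results. The
# constructor arguments mirror the removed implementation above
# (urn, datahub_rest_conn_id, check_last_assertion_time, time_delta);
# the concrete values here are assumptions.
from datahub_airflow_plugin.operators.datahub_assertion_sensor import (
    DataHubAssertionSensor,
)

wait_for_assertions = DataHubAssertionSensor(
    task_id="wait_for_assertions",
    urn="urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.src_table,PROD)",
    datahub_rest_conn_id="datahub_rest_default",
)
```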
-from airflow.sensors.base import BaseSensorOperator - -from datahub.api.circuit_breaker import ( - OperationCircuitBreaker, - OperationCircuitBreakerConfig, +from datahub_airflow_plugin.operators.datahub_operation_operator import ( + DataHubOperationCircuitBreakerOperator, ) -from datahub_provider.hooks.datahub import DatahubRestHook - - -class DataHubOperationCircuitBreakerOperator(BaseSensorOperator): - r""" - DataHub Operation Circuit Breaker Operator. - - :param urn: The DataHub dataset unique identifier. (templated) - :param datahub_rest_conn_id: The REST datahub connection id to communicate with DataHub - which is set as Airflow connection. - :param partition: The partition to check the operation. - :param source_type: The partition to check the operation. :ref:`https://datahubproject.io/docs/graphql/enums#operationsourcetype` - - """ - - template_fields: Sequence[str] = ( - "urn", - "partition", - "source_type", - "operation_type", - ) - circuit_breaker: OperationCircuitBreaker - urn: Union[List[str], str] - partition: Optional[str] - source_type: Optional[str] - operation_type: Optional[str] - - def __init__( # type: ignore[no-untyped-def] - self, - *, - urn: Union[List[str], str], - datahub_rest_conn_id: Optional[str] = None, - time_delta: Optional[datetime.timedelta] = datetime.timedelta(days=1), - partition: Optional[str] = None, - source_type: Optional[str] = None, - operation_type: Optional[str] = None, - **kwargs, - ) -> None: - super().__init__(**kwargs) - hook: DatahubRestHook - if datahub_rest_conn_id is not None: - hook = DatahubRestHook(datahub_rest_conn_id=datahub_rest_conn_id) - else: - hook = DatahubRestHook() - - host, password, timeout_sec = hook._get_config() - - self.urn = urn - self.partition = partition - self.operation_type = operation_type - self.source_type = source_type - - config: OperationCircuitBreakerConfig = OperationCircuitBreakerConfig( - datahub_host=host, - datahub_token=password, - timeout=timeout_sec, - time_delta=time_delta, - ) - - self.circuit_breaker = OperationCircuitBreaker(config=config) - - def execute(self, context: Any) -> bool: - if "datahub_silence_circuit_breakers" in context["dag_run"].conf: - self.log.info( - "Circuit breaker is silenced because datahub_silence_circuit_breakers config is set" - ) - return True - - self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") - if isinstance(self.urn, str): - urns = [self.urn] - elif isinstance(self.urn, list): - urns = self.urn - else: - raise Exception(f"urn parameter has invalid type {type(self.urn)}") - - for urn in urns: - self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") - ret = self.circuit_breaker.is_circuit_breaker_active( - urn=urn, - partition=self.partition, - operation_type=self.operation_type, - source_type=self.source_type, - ) - if ret: - raise Exception(f"Dataset {self.urn} is not in consumable state") - return True +__all__ = ["DataHubOperationCircuitBreakerOperator"] diff --git a/metadata-ingestion/src/datahub_provider/operators/datahub_operation_sensor.py b/metadata-ingestion/src/datahub_provider/operators/datahub_operation_sensor.py index 8796215453500..902a342081490 100644 --- a/metadata-ingestion/src/datahub_provider/operators/datahub_operation_sensor.py +++ b/metadata-ingestion/src/datahub_provider/operators/datahub_operation_sensor.py @@ -1,100 +1,5 @@ -import datetime -from typing import Any, List, Optional, Sequence, Union - -from airflow.sensors.base import BaseSensorOperator - -from datahub.api.circuit_breaker import ( - 
OperationCircuitBreaker, - OperationCircuitBreakerConfig, +from datahub_airflow_plugin.operators.datahub_operation_sensor import ( + DataHubOperationCircuitBreakerSensor, ) -from datahub_provider.hooks.datahub import DatahubRestHook - - -class DataHubOperationCircuitBreakerSensor(BaseSensorOperator): - r""" - DataHub Operation Circuit Breaker Sensor. - - :param urn: The DataHub dataset unique identifier. (templated) - :param datahub_rest_conn_id: The REST datahub connection id to communicate with DataHub - which is set as Airflow connection. - :param partition: The partition to check the operation. - :param source_type: The source type to filter on. If not set it will accept any source type. - See valid values at: https://datahubproject.io/docs/graphql/enums#operationsourcetype - :param operation_type: The operation type to filter on. If not set it will accept any source type. - See valid values at: https://datahubproject.io/docs/graphql/enums/#operationtype - """ - - template_fields: Sequence[str] = ( - "urn", - "partition", - "source_type", - "operation_type", - ) - circuit_breaker: OperationCircuitBreaker - urn: Union[List[str], str] - partition: Optional[str] - source_type: Optional[str] - operation_type: Optional[str] - - def __init__( # type: ignore[no-untyped-def] - self, - *, - urn: Union[List[str], str], - datahub_rest_conn_id: Optional[str] = None, - time_delta: Optional[datetime.timedelta] = datetime.timedelta(days=1), - partition: Optional[str] = None, - source_type: Optional[str] = None, - operation_type: Optional[str] = None, - **kwargs, - ) -> None: - super().__init__(**kwargs) - hook: DatahubRestHook - if datahub_rest_conn_id is not None: - hook = DatahubRestHook(datahub_rest_conn_id=datahub_rest_conn_id) - else: - hook = DatahubRestHook() - - host, password, timeout_sec = hook._get_config() - - self.urn = urn - self.partition = partition - self.operation_type = operation_type - self.source_type = source_type - - config: OperationCircuitBreakerConfig = OperationCircuitBreakerConfig( - datahub_host=host, - datahub_token=password, - timeout=timeout_sec, - time_delta=time_delta, - ) - - self.circuit_breaker = OperationCircuitBreaker(config=config) - - def poke(self, context: Any) -> bool: - if "datahub_silence_circuit_breakers" in context["dag_run"].conf: - self.log.info( - "Circuit breaker is silenced because datahub_silence_circuit_breakers config is set" - ) - return True - - self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") - if isinstance(self.urn, str): - urns = [self.urn] - elif isinstance(self.urn, list): - urns = self.urn - else: - raise Exception(f"urn parameter has invalid type {type(self.urn)}") - - for urn in urns: - self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") - ret = self.circuit_breaker.is_circuit_breaker_active( - urn=urn, - partition=self.partition, - operation_type=self.operation_type, - source_type=self.source_type, - ) - if ret: - self.log.info(f"Dataset {self.urn} is not in consumable state") - return False - return True +__all__ = ["DataHubOperationCircuitBreakerSensor"] diff --git a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py index 3bda6c5cce84b..cc3ee1f6ceaa4 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py +++ b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py @@ -61,6 +61,7 @@ def test_bigquery_v2_ingest( "project_ids": ["project-id-1"], "include_usage_statistics": 
False, "include_table_lineage": False, + "include_data_platform_instance": True, } pipeline_config_dict: Dict[str, Any] = { diff --git a/metadata-ingestion/tests/integration/iceberg/.gitignore b/metadata-ingestion/tests/integration/iceberg/.gitignore new file mode 100644 index 0000000000000..a7dfcf56788b4 --- /dev/null +++ b/metadata-ingestion/tests/integration/iceberg/.gitignore @@ -0,0 +1,3 @@ +# Folders created by Iceberg's docker-compose +notebooks/ +warehouse/ \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/docker-compose.yml b/metadata-ingestion/tests/integration/iceberg/docker-compose.yml new file mode 100644 index 0000000000000..ab5c534e7289b --- /dev/null +++ b/metadata-ingestion/tests/integration/iceberg/docker-compose.yml @@ -0,0 +1,74 @@ +version: "3" + +services: + spark-iceberg: + image: tabulario/spark-iceberg:3.3.2_1.3.0 + container_name: spark-iceberg + networks: + iceberg_net: + depends_on: + - rest + - minio + volumes: + - ./warehouse:/home/iceberg/warehouse + - ./notebooks:/home/iceberg/notebooks/notebooks + - ./setup:/home/iceberg/setup + environment: + - AWS_ACCESS_KEY_ID=admin + - AWS_SECRET_ACCESS_KEY=password + - AWS_REGION=us-east-1 + ports: + - 8888:8888 + - 8080:8080 + - 10000:10000 + - 10001:10001 + rest: + image: tabulario/iceberg-rest:0.5.0 + container_name: iceberg-rest + networks: + iceberg_net: + ports: + - 8181:8181 + environment: + - AWS_ACCESS_KEY_ID=admin + - AWS_SECRET_ACCESS_KEY=password + - AWS_REGION=us-east-1 + - CATALOG_WAREHOUSE=s3a://warehouse/wh/ + - CATALOG_IO__IMPL=org.apache.iceberg.aws.s3.S3FileIO + - CATALOG_S3_ENDPOINT=http://minio:9000 + minio: + image: minio/minio + container_name: minio + environment: + - MINIO_ROOT_USER=admin + - MINIO_ROOT_PASSWORD=password + - MINIO_DOMAIN=minio + networks: + iceberg_net: + aliases: + - warehouse.minio + ports: + - 9001:9001 + - 9000:9000 + command: ["server", "/data", "--console-address", ":9001"] + mc: + depends_on: + - minio + image: minio/mc + container_name: mc + networks: + iceberg_net: + environment: + - AWS_ACCESS_KEY_ID=admin + - AWS_SECRET_ACCESS_KEY=password + - AWS_REGION=us-east-1 + entrypoint: > + /bin/sh -c " + until (/usr/bin/mc config host add minio http://minio:9000 admin password) do echo '...waiting...' 
&& sleep 1; done; + /usr/bin/mc rm -r --force minio/warehouse; + /usr/bin/mc mb minio/warehouse; + /usr/bin/mc policy set public minio/warehouse; + exit 0; + " +networks: + iceberg_net: diff --git a/metadata-ingestion/tests/integration/iceberg/iceberg_deleted_table_mces_golden.json b/metadata-ingestion/tests/integration/iceberg/iceberg_deleted_table_mces_golden.json new file mode 100644 index 0000000000000..cc94625560a43 --- /dev/null +++ b/metadata-ingestion/tests/integration/iceberg/iceberg_deleted_table_mces_golden.json @@ -0,0 +1,184 @@ +[ + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,test_platform_instance.nyc.another_taxis,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "owner": "root", + "created-at": "2023-07-04T14:23:10.457317300Z", + "write.format.default": "parquet", + "location": "s3a://warehouse/wh/nyc/another_taxis", + "format-version": "1", + "snapshot-id": "6904764113937987369", + "manifest-list": "s3a://warehouse/wh/nyc/another_taxis/metadata/snap-6904764113937987369-1-f18ce54a-d59c-461a-a066-9d3085ccf2f2.avro" + }, + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:root", + "type": "TECHNICAL_OWNER" + }, + { + "owner": "urn:li:corpGroup:root", + "type": "TECHNICAL_OWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "nyc.another_taxis", + "platform": "urn:li:dataPlatform:iceberg", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "table {\n 1: vendor_id: optional long\n 2: trip_date: optional timestamptz\n 3: trip_id: optional long\n 4: trip_distance: optional float\n 5: fare_amount: optional double\n 6: store_and_fwd_flag: optional string\n}" + } + }, + "fields": [ + { + "fieldPath": "[version=2.0].[type=struct].[type=long].vendor_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=long].trip_date", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.TimeType": {} + } + }, + "nativeDataType": "timestamptz", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"logicalType\": \"timestamp-micros\", \"native_data_type\": \"timestamptz\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=long].trip_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=float].trip_distance", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "float", + "recursive": false, + "isPartOfKey": false, + 
"jsonProps": "{\"native_data_type\": \"float\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=double].fare_amount", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "double", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=string].store_and_fwd_flag", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "iceberg-2020_04_14-07_00_00" + } + }, + { + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,test_platform_instance.nyc.another_taxis,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:iceberg", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:iceberg,test_platform_instance)" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "iceberg-2020_04_14-07_00_00" + } + }, + { + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,test_platform_instance.nyc.taxis,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": true + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "iceberg-2020_04_14-07_00_00" + } + } + ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/iceberg_ingest_mces_golden.json b/metadata-ingestion/tests/integration/iceberg/iceberg_ingest_mces_golden.json new file mode 100644 index 0000000000000..163911623470e --- /dev/null +++ b/metadata-ingestion/tests/integration/iceberg/iceberg_ingest_mces_golden.json @@ -0,0 +1,153 @@ +[ + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,nyc.taxis,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "owner": "root", + "created-at": "2023-06-12T17:32:17.227545005Z", + "write.format.default": "parquet", + "location": "s3a://warehouse/wh/nyc/taxis", + "format-version": "1", + "snapshot-id": "2505818429184337337", + "manifest-list": "s3a://warehouse/wh/nyc/taxis/metadata/snap-2505818429184337337-1-a64915c4-afc8-40e3-97a7-98b072b42e10.avro" + }, + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:root", + "type": "TECHNICAL_OWNER" + }, + { + "owner": "urn:li:corpGroup:root", + "type": "TECHNICAL_OWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "nyc.taxis", + "platform": "urn:li:dataPlatform:iceberg", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "table {\n 1: vendor_id: optional long\n 2: 
trip_date: optional timestamptz\n 3: trip_id: optional long\n 4: trip_distance: optional float\n 5: fare_amount: optional double\n 6: store_and_fwd_flag: optional string\n}" + } + }, + "fields": [ + { + "fieldPath": "[version=2.0].[type=struct].[type=long].vendor_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=long].trip_date", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.TimeType": {} + } + }, + "nativeDataType": "timestamptz", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"logicalType\": \"timestamp-micros\", \"native_data_type\": \"timestamptz\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=long].trip_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=float].trip_distance", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "float", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"float\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=double].fare_amount", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "double", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=string].store_and_fwd_flag", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "iceberg-test" + } + } + ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/iceberg_profile_mces_golden.json b/metadata-ingestion/tests/integration/iceberg/iceberg_profile_mces_golden.json new file mode 100644 index 0000000000000..bdb7091014626 --- /dev/null +++ b/metadata-ingestion/tests/integration/iceberg/iceberg_profile_mces_golden.json @@ -0,0 +1,216 @@ +[ + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,nyc.taxis,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "owner": "root", + "created-at": "2023-06-12T17:33:25.422993540Z", + "write.format.default": "parquet", + "location": "s3a://warehouse/wh/nyc/taxis", + "format-version": "1", + "snapshot-id": "2585047006374307840", + "manifest-list": "s3a://warehouse/wh/nyc/taxis/metadata/snap-2585047006374307840-1-2e2bef19-40d1-4ad1-8fad-e57783477710.avro" + }, + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": 
"urn:li:corpuser:root", + "type": "TECHNICAL_OWNER" + }, + { + "owner": "urn:li:corpGroup:root", + "type": "TECHNICAL_OWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "nyc.taxis", + "platform": "urn:li:dataPlatform:iceberg", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "table {\n 1: vendor_id: optional long\n 2: trip_date: optional timestamptz\n 3: trip_id: optional long\n 4: trip_distance: optional float\n 5: fare_amount: optional double\n 6: store_and_fwd_flag: optional string\n}" + } + }, + "fields": [ + { + "fieldPath": "[version=2.0].[type=struct].[type=long].vendor_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=long].trip_date", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.TimeType": {} + } + }, + "nativeDataType": "timestamptz", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"logicalType\": \"timestamp-micros\", \"native_data_type\": \"timestamptz\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=long].trip_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=float].trip_distance", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "float", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"float\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=double].fare_amount", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "double", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=string].store_and_fwd_flag", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "iceberg-test" + } + }, + { + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,nyc.taxis,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProfile", + "aspect": { + "json": { + "timestampMillis": 1586847600000, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "rowCount": 5, + "columnCount": 6, + "fieldProfiles": [ + { + "fieldPath": "vendor_id", + "nullCount": 0, + "nullProportion": 0.0, + "min": "1", + "max": "3" + }, + { + "fieldPath": "trip_date", + "nullCount": 0, + 
"nullProportion": 0.0, + "min": "2000-01-01T12:00:00+00:00", + "max": "2000-01-04T12:00:00+00:00" + }, + { + "fieldPath": "trip_id", + "nullCount": 0, + "nullProportion": 0.0, + "min": "1000371", + "max": "1000375" + }, + { + "fieldPath": "trip_distance", + "nullCount": 0, + "nullProportion": 0.0, + "min": "0.0", + "max": "8.399999618530273" + }, + { + "fieldPath": "fare_amount", + "nullCount": 0, + "nullProportion": 0.0, + "min": "0.0", + "max": "42.13" + }, + { + "fieldPath": "store_and_fwd_flag", + "nullCount": 0, + "nullProportion": 0.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "iceberg-test" + } + } + ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/iceberg_profile_to_file.yml b/metadata-ingestion/tests/integration/iceberg/iceberg_profile_to_file.yml new file mode 100644 index 0000000000000..197c03bf2ee8d --- /dev/null +++ b/metadata-ingestion/tests/integration/iceberg/iceberg_profile_to_file.yml @@ -0,0 +1,25 @@ +run_id: iceberg-test + +source: + type: iceberg + config: + catalog: + name: default + type: rest + config: + uri: http://localhost:8181 + s3.access-key-id: admin + s3.secret-access-key: password + s3.region: us-east-1 + warehouse: s3a://warehouse/wh/ + py-io-impl: pyiceberg.io.pyarrow.PyArrowFileIO + s3.endpoint: http://localhost:9000 + user_ownership_property: owner + group_ownership_property: owner + profiling: + enabled: true + +sink: + type: file + config: + filename: "./iceberg_mces.json" diff --git a/metadata-ingestion/tests/integration/iceberg/iceberg_to_file.yml b/metadata-ingestion/tests/integration/iceberg/iceberg_to_file.yml new file mode 100644 index 0000000000000..8b5d035aed259 --- /dev/null +++ b/metadata-ingestion/tests/integration/iceberg/iceberg_to_file.yml @@ -0,0 +1,22 @@ +run_id: iceberg-test + +source: + type: iceberg + config: + catalog: + name: default + type: rest + config: + uri: http://localhost:8181 + s3.access-key-id: admin + s3.secret-access-key: password + s3.region: us-east-1 + warehouse: s3a://warehouse/wh/ + s3.endpoint: http://localhost:9000 + user_ownership_property: owner + group_ownership_property: owner + +sink: + type: file + config: + filename: "./iceberg_mces.json" diff --git a/metadata-ingestion/tests/integration/iceberg/setup/create.py b/metadata-ingestion/tests/integration/iceberg/setup/create.py new file mode 100644 index 0000000000000..0799ce9c93916 --- /dev/null +++ b/metadata-ingestion/tests/integration/iceberg/setup/create.py @@ -0,0 +1,46 @@ +import sys +from datetime import datetime + +from pyspark.sql import SparkSession +from pyspark.sql.types import ( + DoubleType, + FloatType, + LongType, + StringType, + StructField, + StructType, + TimestampType, +) + + +def main(table_name: str) -> None: + spark = SparkSession.builder.getOrCreate() + + schema = StructType( + [ + StructField("vendor_id", LongType(), True), + StructField("trip_date", TimestampType(), True), + StructField("trip_id", LongType(), True), + StructField("trip_distance", FloatType(), True), + StructField("fare_amount", DoubleType(), True), + StructField("store_and_fwd_flag", StringType(), True), + ] + ) + + data = [ + (1, datetime(2000, 1, 1, 12, 0), 1000371, 1.8, 15.32, "N"), + (2, datetime(2000, 1, 2, 12, 0), 1000372, 2.5, 22.15, "N"), + (2, datetime(2000, 1, 3, 12, 0), 1000373, 0.9, 9.01, "N"), + (1, datetime(2000, 1, 4, 12, 0), 1000374, 8.4, 42.13, "Y"), + # Following entry will test profiling values at 0 + (3, datetime(2000, 1, 4, 12, 0), 1000375, 0.0, 0.0, "Y"), + ] + + df = 
spark.createDataFrame(data, schema) + df.write.partitionBy("trip_date").saveAsTable(table_name) + + +if __name__ == "__main__": + if len(sys.argv) != 2: + raise ValueError("Missing required parameter ") + main(sys.argv[1]) diff --git a/metadata-ingestion/tests/integration/iceberg/setup/delete.py b/metadata-ingestion/tests/integration/iceberg/setup/delete.py new file mode 100644 index 0000000000000..b00306982f517 --- /dev/null +++ b/metadata-ingestion/tests/integration/iceberg/setup/delete.py @@ -0,0 +1,5 @@ +from pyspark.sql import SparkSession + +spark = SparkSession.builder.getOrCreate() + +spark.sql("DROP TABLE nyc.taxis PURGE") diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/ingest_test/iceberg_mces_golden.json b/metadata-ingestion/tests/integration/iceberg/test_data/ingest_test/iceberg_mces_golden.json deleted file mode 100644 index b106b91275835..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/ingest_test/iceberg_mces_golden.json +++ /dev/null @@ -1,131 +0,0 @@ -[ -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,namespace.iceberg_test,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "owner": "new_owner", - "provider": "ICEBERG", - "location": "/namespace/iceberg_test" - }, - "tags": [] - } - }, - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:new_owner", - "type": "TECHNICAL_OWNER" - }, - { - "owner": "urn:li:corpGroup:new_owner", - "type": "TECHNICAL_OWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "namespace.iceberg_test", - "platform": "urn:li:dataPlatform:iceberg", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.OtherSchema": { - "rawSchema": "Schema(1: level: required string(level documentation),2: event_time: required timestamptz(event_time documentation),3: message: required string(message documentation),4: call_stack: optional list(call_stack documentation))" - } - }, - "fields": [ - { - "fieldPath": "[version=2.0].[type=struct].[type=string].level", - "nullable": false, - "description": "level documentation", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": false}" - }, - { - "fieldPath": "[version=2.0].[type=struct].[type=long].event_time", - "nullable": false, - "description": "event_time documentation", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.TimeType": {} - } - }, - "nativeDataType": "timestamptz", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"logicalType\": \"timestamp-micros\", \"native_data_type\": \"timestamptz\", \"_nullable\": false}" - }, - { - "fieldPath": "[version=2.0].[type=struct].[type=string].message", - "nullable": false, - "description": "message documentation", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - 
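Editor's note on the Iceberg recipes added earlier in this change (`iceberg_to_file.yml` / `iceberg_profile_to_file.yml`): they can be run with `datahub ingest -c <recipe>.yml`, or programmatically, which is roughly what the integration test does. A minimal sketch, with the catalog config abbreviated:

```python
# Sketch only: execute an iceberg-to-file recipe via the Pipeline API. The
# config mirrors iceberg_to_file.yml above but omits the S3 settings, so treat
# it as an abbreviated assumption rather than a ready-to-run test setup.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "run_id": "iceberg-test",
        "source": {
            "type": "iceberg",
            "config": {
                "catalog": {
                    "name": "default",
                    "type": "rest",
                    "config": {"uri": "http://localhost:8181"},
                },
            },
        },
        "sink": {"type": "file", "config": {"filename": "./iceberg_mces.json"}},
    }
)
pipeline.run()
pipeline.raise_from_status()
```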
"isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": false}" - }, - { - "fieldPath": "[version=2.0].[type=struct].[type=array].[type=string].call_stack", - "nullable": true, - "description": "call_stack documentation", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.ArrayType": { - "nestedType": [ - "string" - ] - } - } - }, - "nativeDataType": "list", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"list\", \"_nullable\": true}" - } - ] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "iceberg-test" - } -} -] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/ingest_test/namespace/iceberg_test/metadata/v1.metadata.json b/metadata-ingestion/tests/integration/iceberg/test_data/ingest_test/namespace/iceberg_test/metadata/v1.metadata.json deleted file mode 100644 index e4ac0b9685ddc..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/ingest_test/namespace/iceberg_test/metadata/v1.metadata.json +++ /dev/null @@ -1,105 +0,0 @@ -{ - "format-version" : 1, - "table-uuid" : "11bbe5de-5ef6-4074-80db-f041065f9862", - "location" : "/namespace/iceberg_test", - "last-updated-ms" : 1648729616724, - "last-column-id" : 5, - "schema" : { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz" - }, { - "id" : 3, - "name" : "message", - "required" : true, - "type" : "string" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - } - } ] - }, - "current-schema-id" : 0, - "schemas" : [ { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz" - }, { - "id" : 3, - "name" : "message", - "required" : true, - "type" : "string" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - } - } ] - } ], - "partition-spec" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - "field-id" : 1001 - } ], - "default-spec-id" : 0, - "partition-specs" : [ { - "spec-id" : 0, - "fields" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - "field-id" : 1001 - } ] - } ], - "last-partition-id" : 1001, - "default-sort-order-id" : 0, - "sort-orders" : [ { - "order-id" : 0, - "fields" : [ ] - } ], - "properties" : { }, - "current-snapshot-id" : -1, - "snapshots" : [ ], - "snapshot-log" : [ ], - "metadata-log" : [ ] -} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/ingest_test/namespace/iceberg_test/metadata/v2.metadata.json b/metadata-ingestion/tests/integration/iceberg/test_data/ingest_test/namespace/iceberg_test/metadata/v2.metadata.json deleted file mode 100644 index 02221330b0665..0000000000000 --- 
a/metadata-ingestion/tests/integration/iceberg/test_data/ingest_test/namespace/iceberg_test/metadata/v2.metadata.json +++ /dev/null @@ -1,118 +0,0 @@ -{ - "format-version" : 1, - "table-uuid" : "16e6ecee-cd5d-470f-a7a6-a197944fa4db", - "location" : "/namespace/iceberg_test", - "last-updated-ms" : 1649086837695, - "last-column-id" : 5, - "schema" : { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string", - "doc" : "level documentation" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz", - "doc" : "event_time documentation" - }, { - "id" : 3, - "name" : "message", - "required" : true, - "type" : "string", - "doc" : "message documentation" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - }, - "doc" : "call_stack documentation" - } ] - }, - "current-schema-id" : 0, - "schemas" : [ { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string", - "doc" : "level documentation" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz", - "doc" : "event_time documentation" - }, { - "id" : 3, - "name" : "message", - "required" : true, - "type" : "string", - "doc" : "message documentation" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - }, - "doc" : "call_stack documentation" - } ] - } ], - "partition-spec" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - "field-id" : 1001 - } ], - "default-spec-id" : 0, - "partition-specs" : [ { - "spec-id" : 0, - "fields" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - "field-id" : 1001 - } ] - } ], - "last-partition-id" : 1001, - "default-sort-order-id" : 0, - "sort-orders" : [ { - "order-id" : 0, - "fields" : [ ] - } ], - "properties" : { - "owner" : "new_owner" - }, - "current-snapshot-id" : -1, - "snapshots" : [ ], - "snapshot-log" : [ ], - "metadata-log" : [ { - "timestamp-ms" : 1649086837511, - "metadata-file" : "/namespace/iceberg_test/metadata/v1.metadata.json" - } ] -} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/ingest_test/namespace/iceberg_test/metadata/version-hint.text b/metadata-ingestion/tests/integration/iceberg/test_data/ingest_test/namespace/iceberg_test/metadata/version-hint.text deleted file mode 100644 index d8263ee986059..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/ingest_test/namespace/iceberg_test/metadata/version-hint.text +++ /dev/null @@ -1 +0,0 @@ -2 \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/data/00000-0-72133c37-bb5c-4ffd-8ead-08f33fa2675d-00001.parquet b/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/data/00000-0-72133c37-bb5c-4ffd-8ead-08f33fa2675d-00001.parquet deleted file mode 100644 index 48e75a030f1ca..0000000000000 Binary files 
a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/data/00000-0-72133c37-bb5c-4ffd-8ead-08f33fa2675d-00001.parquet and /dev/null differ diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/data/00000-3-c638dd0f-498a-4ce9-b525-8242758d18f8-00001.parquet b/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/data/00000-3-c638dd0f-498a-4ce9-b525-8242758d18f8-00001.parquet deleted file mode 100644 index c70b94612db64..0000000000000 Binary files a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/data/00000-3-c638dd0f-498a-4ce9-b525-8242758d18f8-00001.parquet and /dev/null differ diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/data/00001-1-5f69f6ed-191f-4a11-9953-09435ffce01d-00001.parquet b/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/data/00001-1-5f69f6ed-191f-4a11-9953-09435ffce01d-00001.parquet deleted file mode 100644 index 4c95fceed72e6..0000000000000 Binary files a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/data/00001-1-5f69f6ed-191f-4a11-9953-09435ffce01d-00001.parquet and /dev/null differ diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/data/00001-4-b21a5375-b547-40b9-89ca-caf4fcfe6685-00001.parquet b/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/data/00001-4-b21a5375-b547-40b9-89ca-caf4fcfe6685-00001.parquet deleted file mode 100644 index d33a3fd0d8a07..0000000000000 Binary files a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/data/00001-4-b21a5375-b547-40b9-89ca-caf4fcfe6685-00001.parquet and /dev/null differ diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/00000-331b9f67-e02b-44b1-8ec8-4dfa287c3bd5.metadata.json b/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/00000-331b9f67-e02b-44b1-8ec8-4dfa287c3bd5.metadata.json deleted file mode 100644 index b6ffcfdc55daf..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/00000-331b9f67-e02b-44b1-8ec8-4dfa287c3bd5.metadata.json +++ /dev/null @@ -1,73 +0,0 @@ -{ - "format-version" : 1, - "table-uuid" : "e54626bf-c7ab-4f36-a3d0-3e13eec0824f", - "location" : "/home/iceberg/warehouse/datahub/integration/profiling", - "last-updated-ms" : 1651614148692, - "last-column-id" : 3, - "schema" : { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "field_int", - "required" : false, - "type" : "long", - "doc" : "An integer field" - }, { - "id" : 2, - "name" : "field_str", - "required" : false, - "type" : "string", - "doc" : "A string field" - }, { - "id" : 3, - "name" : "field_timestamp", - "required" : false, - "type" : "timestamptz", - "doc" : "A timestamp field" - } ] - }, - "current-schema-id" : 0, - "schemas" : [ { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "field_int", - "required" : false, - "type" : "long", - "doc" : "An integer field" - }, { - "id" : 2, - "name" : "field_str", - "required" : false, - "type" : "string", - "doc" 
: "A string field" - }, { - "id" : 3, - "name" : "field_timestamp", - "required" : false, - "type" : "timestamptz", - "doc" : "A timestamp field" - } ] - } ], - "partition-spec" : [ ], - "default-spec-id" : 0, - "partition-specs" : [ { - "spec-id" : 0, - "fields" : [ ] - } ], - "last-partition-id" : 999, - "default-sort-order-id" : 0, - "sort-orders" : [ { - "order-id" : 0, - "fields" : [ ] - } ], - "properties" : { - "owner" : "root" - }, - "current-snapshot-id" : -1, - "snapshots" : [ ], - "snapshot-log" : [ ], - "metadata-log" : [ ] -} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/00001-fb50681e-5f25-4180-99e2-065ef0b9791b.metadata.json b/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/00001-fb50681e-5f25-4180-99e2-065ef0b9791b.metadata.json deleted file mode 100644 index da2afa6569f11..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/00001-fb50681e-5f25-4180-99e2-065ef0b9791b.metadata.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "format-version" : 1, - "table-uuid" : "e54626bf-c7ab-4f36-a3d0-3e13eec0824f", - "location" : "/home/iceberg/warehouse/datahub/integration/profiling", - "last-updated-ms" : 1651614151056, - "last-column-id" : 3, - "schema" : { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "field_int", - "required" : false, - "type" : "long", - "doc" : "An integer field" - }, { - "id" : 2, - "name" : "field_str", - "required" : false, - "type" : "string", - "doc" : "A string field" - }, { - "id" : 3, - "name" : "field_timestamp", - "required" : false, - "type" : "timestamptz", - "doc" : "A timestamp field" - } ] - }, - "current-schema-id" : 0, - "schemas" : [ { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "field_int", - "required" : false, - "type" : "long", - "doc" : "An integer field" - }, { - "id" : 2, - "name" : "field_str", - "required" : false, - "type" : "string", - "doc" : "A string field" - }, { - "id" : 3, - "name" : "field_timestamp", - "required" : false, - "type" : "timestamptz", - "doc" : "A timestamp field" - } ] - } ], - "partition-spec" : [ ], - "default-spec-id" : 0, - "partition-specs" : [ { - "spec-id" : 0, - "fields" : [ ] - } ], - "last-partition-id" : 999, - "default-sort-order-id" : 0, - "sort-orders" : [ { - "order-id" : 0, - "fields" : [ ] - } ], - "properties" : { - "owner" : "root" - }, - "current-snapshot-id" : 4437197002876030991, - "snapshots" : [ { - "snapshot-id" : 4437197002876030991, - "timestamp-ms" : 1651614151056, - "summary" : { - "operation" : "append", - "spark.app.id" : "local-1651614127284", - "added-data-files" : "2", - "added-records" : "2", - "added-files-size" : "2114", - "changed-partition-count" : "1", - "total-records" : "2", - "total-files-size" : "2114", - "total-data-files" : "2", - "total-delete-files" : "0", - "total-position-deletes" : "0", - "total-equality-deletes" : "0" - }, - "manifest-list" : "/home/iceberg/warehouse/datahub/integration/profiling/metadata/snap-4437197002876030991-1-23acaffc-9bed-4d97-8ddd-0ea1ea15a2b8.avro", - "schema-id" : 0 - } ], - "snapshot-log" : [ { - "timestamp-ms" : 1651614151056, - "snapshot-id" : 4437197002876030991 - } ], - "metadata-log" : [ { - "timestamp-ms" : 1651614148692, - "metadata-file" : 
"/home/iceberg/warehouse/datahub/integration/profiling/metadata/00000-331b9f67-e02b-44b1-8ec8-4dfa287c3bd5.metadata.json" - } ] -} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/00002-cc241948-4c12-46d0-9a75-ce3578ec03d4.metadata.json b/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/00002-cc241948-4c12-46d0-9a75-ce3578ec03d4.metadata.json deleted file mode 100644 index ab028a647de4c..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/00002-cc241948-4c12-46d0-9a75-ce3578ec03d4.metadata.json +++ /dev/null @@ -1,124 +0,0 @@ -{ - "format-version" : 1, - "table-uuid" : "e54626bf-c7ab-4f36-a3d0-3e13eec0824f", - "location" : "/home/iceberg/warehouse/datahub/integration/profiling", - "last-updated-ms" : 1651614244732, - "last-column-id" : 3, - "schema" : { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "field_int", - "required" : false, - "type" : "long", - "doc" : "An integer field" - }, { - "id" : 2, - "name" : "field_str", - "required" : false, - "type" : "string", - "doc" : "A string field" - }, { - "id" : 3, - "name" : "field_timestamp", - "required" : false, - "type" : "timestamptz", - "doc" : "A timestamp field" - } ] - }, - "current-schema-id" : 0, - "schemas" : [ { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "field_int", - "required" : false, - "type" : "long", - "doc" : "An integer field" - }, { - "id" : 2, - "name" : "field_str", - "required" : false, - "type" : "string", - "doc" : "A string field" - }, { - "id" : 3, - "name" : "field_timestamp", - "required" : false, - "type" : "timestamptz", - "doc" : "A timestamp field" - } ] - } ], - "partition-spec" : [ ], - "default-spec-id" : 0, - "partition-specs" : [ { - "spec-id" : 0, - "fields" : [ ] - } ], - "last-partition-id" : 999, - "default-sort-order-id" : 0, - "sort-orders" : [ { - "order-id" : 0, - "fields" : [ ] - } ], - "properties" : { - "owner" : "root" - }, - "current-snapshot-id" : 4220723025353071767, - "snapshots" : [ { - "snapshot-id" : 4437197002876030991, - "timestamp-ms" : 1651614151056, - "summary" : { - "operation" : "append", - "spark.app.id" : "local-1651614127284", - "added-data-files" : "2", - "added-records" : "2", - "added-files-size" : "2114", - "changed-partition-count" : "1", - "total-records" : "2", - "total-files-size" : "2114", - "total-data-files" : "2", - "total-delete-files" : "0", - "total-position-deletes" : "0", - "total-equality-deletes" : "0" - }, - "manifest-list" : "/home/iceberg/warehouse/datahub/integration/profiling/metadata/snap-4437197002876030991-1-23acaffc-9bed-4d97-8ddd-0ea1ea15a2b8.avro", - "schema-id" : 0 - }, { - "snapshot-id" : 4220723025353071767, - "parent-snapshot-id" : 4437197002876030991, - "timestamp-ms" : 1651614244732, - "summary" : { - "operation" : "append", - "spark.app.id" : "local-1651614127284", - "added-data-files" : "2", - "added-records" : "2", - "added-files-size" : "2111", - "changed-partition-count" : "1", - "total-records" : "4", - "total-files-size" : "4225", - "total-data-files" : "4", - "total-delete-files" : "0", - "total-position-deletes" : "0", - "total-equality-deletes" : "0" - }, - "manifest-list" : "/home/iceberg/warehouse/datahub/integration/profiling/metadata/snap-4220723025353071767-1-ec0bd970-e5ef-4843-abcb-e96a35a8f14d.avro", - "schema-id" : 0 - 
} ], - "snapshot-log" : [ { - "timestamp-ms" : 1651614151056, - "snapshot-id" : 4437197002876030991 - }, { - "timestamp-ms" : 1651614244732, - "snapshot-id" : 4220723025353071767 - } ], - "metadata-log" : [ { - "timestamp-ms" : 1651614148692, - "metadata-file" : "/home/iceberg/warehouse/datahub/integration/profiling/metadata/00000-331b9f67-e02b-44b1-8ec8-4dfa287c3bd5.metadata.json" - }, { - "timestamp-ms" : 1651614151056, - "metadata-file" : "/home/iceberg/warehouse/datahub/integration/profiling/metadata/00001-fb50681e-5f25-4180-99e2-065ef0b9791b.metadata.json" - } ] -} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/23acaffc-9bed-4d97-8ddd-0ea1ea15a2b8-m0.avro b/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/23acaffc-9bed-4d97-8ddd-0ea1ea15a2b8-m0.avro deleted file mode 100644 index 3019df4adae30..0000000000000 Binary files a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/23acaffc-9bed-4d97-8ddd-0ea1ea15a2b8-m0.avro and /dev/null differ diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/ec0bd970-e5ef-4843-abcb-e96a35a8f14d-m0.avro b/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/ec0bd970-e5ef-4843-abcb-e96a35a8f14d-m0.avro deleted file mode 100644 index 1b51cd60d136a..0000000000000 Binary files a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/ec0bd970-e5ef-4843-abcb-e96a35a8f14d-m0.avro and /dev/null differ diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/snap-4220723025353071767-1-ec0bd970-e5ef-4843-abcb-e96a35a8f14d.avro b/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/snap-4220723025353071767-1-ec0bd970-e5ef-4843-abcb-e96a35a8f14d.avro deleted file mode 100644 index 0dd50d23037e9..0000000000000 Binary files a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/snap-4220723025353071767-1-ec0bd970-e5ef-4843-abcb-e96a35a8f14d.avro and /dev/null differ diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/snap-4437197002876030991-1-23acaffc-9bed-4d97-8ddd-0ea1ea15a2b8.avro b/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/snap-4437197002876030991-1-23acaffc-9bed-4d97-8ddd-0ea1ea15a2b8.avro deleted file mode 100644 index 93f69f0ac1540..0000000000000 Binary files a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/snap-4437197002876030991-1-23acaffc-9bed-4d97-8ddd-0ea1ea15a2b8.avro and /dev/null differ diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/version-hint.text b/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/version-hint.text deleted file mode 100755 index d8263ee986059..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/version-hint.text +++ /dev/null @@ -1 +0,0 @@ -2 \ No newline at end 
of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/iceberg_mces_golden.json b/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/iceberg_mces_golden.json deleted file mode 100644 index edfa8f80670cf..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/iceberg_mces_golden.json +++ /dev/null @@ -1,129 +0,0 @@ -[ -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,datahub.integration.profiling,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "owner": "root", - "provider": "ICEBERG", - "location": "/home/iceberg/warehouse/datahub/integration/profiling", - "snapshot-id": "4220723025353071767", - "manifest-list": "/home/iceberg/warehouse/datahub/integration/profiling/metadata/snap-4220723025353071767-1-ec0bd970-e5ef-4843-abcb-e96a35a8f14d.avro" - }, - "tags": [] - } - }, - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:root", - "type": "TECHNICAL_OWNER" - }, - { - "owner": "urn:li:corpGroup:root", - "type": "TECHNICAL_OWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "datahub.integration.profiling", - "platform": "urn:li:dataPlatform:iceberg", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.OtherSchema": { - "rawSchema": "Schema(1: field_int: optional long(An integer field),2: field_str: optional string(A string field),3: field_timestamp: optional timestamptz(A timestamp field))" - } - }, - "fields": [ - { - "fieldPath": "[version=2.0].[type=struct].[type=long].field_int", - "nullable": true, - "description": "An integer field", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "long", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" - }, - { - "fieldPath": "[version=2.0].[type=struct].[type=string].field_str", - "nullable": true, - "description": "A string field", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" - }, - { - "fieldPath": "[version=2.0].[type=struct].[type=long].field_timestamp", - "nullable": true, - "description": "A timestamp field", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.TimeType": {} - } - }, - "nativeDataType": "timestamptz", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"logicalType\": \"timestamp-micros\", \"native_data_type\": \"timestamptz\", \"_nullable\": true}" - } - ] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "iceberg-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,datahub.integration.profiling,PROD)", - "changeType": "UPSERT", - "aspectName": "datasetProfile", - "aspect": { - "value": "{\"timestampMillis\": 1586847600000, \"partitionSpec\": 
{\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"rowCount\": 4, \"columnCount\": 3, \"fieldProfiles\": [{\"fieldPath\": \"field_int\", \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"1\", \"max\": \"4\"}, {\"fieldPath\": \"field_str\", \"nullCount\": 0, \"nullProportion\": 0.0}, {\"fieldPath\": \"field_timestamp\", \"nullCount\": 2, \"nullProportion\": 0.5, \"min\": \"2022-05-03 21:42:29\", \"max\": \"2022-05-03 21:44:04\"}]}", - "contentType": "application/json" - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "iceberg-test" - } -} -] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/iceberg_deleted_table_mces_golden.json b/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/iceberg_deleted_table_mces_golden.json deleted file mode 100644 index d376d8b645d66..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/iceberg_deleted_table_mces_golden.json +++ /dev/null @@ -1,159 +0,0 @@ -[ -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,test_platform_instance.namespace.iceberg_test,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "owner": "new_owner", - "provider": "ICEBERG", - "location": "/namespace/iceberg_test" - }, - "tags": [] - } - }, - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:new_owner", - "type": "TECHNICAL_OWNER" - }, - { - "owner": "urn:li:corpGroup:new_owner", - "type": "TECHNICAL_OWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "namespace.iceberg_test", - "platform": "urn:li:dataPlatform:iceberg", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.OtherSchema": { - "rawSchema": "Schema(1: level: required string(level documentation),2: event_time: required timestamptz(event_time documentation),3: message: required string(message documentation),4: call_stack: optional list(call_stack documentation))" - } - }, - "fields": [ - { - "fieldPath": "[version=2.0].[type=struct].[type=string].level", - "nullable": false, - "description": "level documentation", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": false}" - }, - { - "fieldPath": "[version=2.0].[type=struct].[type=long].event_time", - "nullable": false, - "description": "event_time documentation", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.TimeType": {} - } - }, - "nativeDataType": "timestamptz", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"logicalType\": \"timestamp-micros\", \"native_data_type\": \"timestamptz\", \"_nullable\": false}" - }, - { - "fieldPath": "[version=2.0].[type=struct].[type=string].message", - "nullable": false, - "description": "message documentation", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - 
"nativeDataType": "string", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": false}" - }, - { - "fieldPath": "[version=2.0].[type=struct].[type=array].[type=string].call_stack", - "nullable": true, - "description": "call_stack documentation", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.ArrayType": { - "nestedType": [ - "string" - ] - } - } - }, - "nativeDataType": "list", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"list\", \"_nullable\": true}" - } - ] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "iceberg-2020_04_14-07_00_00" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,test_platform_instance.namespace.iceberg_test,PROD)", - "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", - "aspect": { - "value": "{\"platform\": \"urn:li:dataPlatform:iceberg\", \"instance\": \"urn:li:dataPlatformInstance:(urn:li:dataPlatform:iceberg,test_platform_instance)\"}", - "contentType": "application/json" - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "iceberg-2020_04_14-07_00_00" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,test_platform_instance.namespace.iceberg_test_2,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "value": "{\"removed\": true}", - "contentType": "application/json" - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "iceberg-2020_04_14-07_00_00" - } -} -] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test/metadata/v1.metadata.json b/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test/metadata/v1.metadata.json deleted file mode 100644 index e4ac0b9685ddc..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test/metadata/v1.metadata.json +++ /dev/null @@ -1,105 +0,0 @@ -{ - "format-version" : 1, - "table-uuid" : "11bbe5de-5ef6-4074-80db-f041065f9862", - "location" : "/namespace/iceberg_test", - "last-updated-ms" : 1648729616724, - "last-column-id" : 5, - "schema" : { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz" - }, { - "id" : 3, - "name" : "message", - "required" : true, - "type" : "string" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - } - } ] - }, - "current-schema-id" : 0, - "schemas" : [ { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz" - }, { - "id" : 3, - "name" : "message", - "required" : true, - "type" : "string" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - } - } ] - } ], - "partition-spec" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - 
"field-id" : 1001 - } ], - "default-spec-id" : 0, - "partition-specs" : [ { - "spec-id" : 0, - "fields" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - "field-id" : 1001 - } ] - } ], - "last-partition-id" : 1001, - "default-sort-order-id" : 0, - "sort-orders" : [ { - "order-id" : 0, - "fields" : [ ] - } ], - "properties" : { }, - "current-snapshot-id" : -1, - "snapshots" : [ ], - "snapshot-log" : [ ], - "metadata-log" : [ ] -} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test/metadata/v2.metadata.json b/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test/metadata/v2.metadata.json deleted file mode 100644 index 02221330b0665..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test/metadata/v2.metadata.json +++ /dev/null @@ -1,118 +0,0 @@ -{ - "format-version" : 1, - "table-uuid" : "16e6ecee-cd5d-470f-a7a6-a197944fa4db", - "location" : "/namespace/iceberg_test", - "last-updated-ms" : 1649086837695, - "last-column-id" : 5, - "schema" : { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string", - "doc" : "level documentation" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz", - "doc" : "event_time documentation" - }, { - "id" : 3, - "name" : "message", - "required" : true, - "type" : "string", - "doc" : "message documentation" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - }, - "doc" : "call_stack documentation" - } ] - }, - "current-schema-id" : 0, - "schemas" : [ { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string", - "doc" : "level documentation" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz", - "doc" : "event_time documentation" - }, { - "id" : 3, - "name" : "message", - "required" : true, - "type" : "string", - "doc" : "message documentation" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - }, - "doc" : "call_stack documentation" - } ] - } ], - "partition-spec" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - "field-id" : 1001 - } ], - "default-spec-id" : 0, - "partition-specs" : [ { - "spec-id" : 0, - "fields" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - "field-id" : 1001 - } ] - } ], - "last-partition-id" : 1001, - "default-sort-order-id" : 0, - "sort-orders" : [ { - "order-id" : 0, - "fields" : [ ] - } ], - "properties" : { - "owner" : "new_owner" - }, - "current-snapshot-id" : -1, - "snapshots" : [ ], - "snapshot-log" : [ ], - "metadata-log" : [ { - "timestamp-ms" : 1649086837511, - "metadata-file" : "/namespace/iceberg_test/metadata/v1.metadata.json" - } ] -} \ No newline at end of file diff --git 
a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test/metadata/version-hint.text b/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test/metadata/version-hint.text deleted file mode 100644 index d8263ee986059..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test/metadata/version-hint.text +++ /dev/null @@ -1 +0,0 @@ -2 \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test_2/metadata/v1.metadata.json b/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test_2/metadata/v1.metadata.json deleted file mode 100644 index e4ac0b9685ddc..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test_2/metadata/v1.metadata.json +++ /dev/null @@ -1,105 +0,0 @@ -{ - "format-version" : 1, - "table-uuid" : "11bbe5de-5ef6-4074-80db-f041065f9862", - "location" : "/namespace/iceberg_test", - "last-updated-ms" : 1648729616724, - "last-column-id" : 5, - "schema" : { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz" - }, { - "id" : 3, - "name" : "message", - "required" : true, - "type" : "string" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - } - } ] - }, - "current-schema-id" : 0, - "schemas" : [ { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz" - }, { - "id" : 3, - "name" : "message", - "required" : true, - "type" : "string" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - } - } ] - } ], - "partition-spec" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - "field-id" : 1001 - } ], - "default-spec-id" : 0, - "partition-specs" : [ { - "spec-id" : 0, - "fields" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - "field-id" : 1001 - } ] - } ], - "last-partition-id" : 1001, - "default-sort-order-id" : 0, - "sort-orders" : [ { - "order-id" : 0, - "fields" : [ ] - } ], - "properties" : { }, - "current-snapshot-id" : -1, - "snapshots" : [ ], - "snapshot-log" : [ ], - "metadata-log" : [ ] -} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test_2/metadata/v2.metadata.json b/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test_2/metadata/v2.metadata.json deleted file mode 100644 index 02221330b0665..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test_2/metadata/v2.metadata.json +++ /dev/null @@ -1,118 +0,0 @@ -{ - "format-version" : 1, - "table-uuid" : 
"16e6ecee-cd5d-470f-a7a6-a197944fa4db", - "location" : "/namespace/iceberg_test", - "last-updated-ms" : 1649086837695, - "last-column-id" : 5, - "schema" : { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string", - "doc" : "level documentation" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz", - "doc" : "event_time documentation" - }, { - "id" : 3, - "name" : "message", - "required" : true, - "type" : "string", - "doc" : "message documentation" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - }, - "doc" : "call_stack documentation" - } ] - }, - "current-schema-id" : 0, - "schemas" : [ { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string", - "doc" : "level documentation" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz", - "doc" : "event_time documentation" - }, { - "id" : 3, - "name" : "message", - "required" : true, - "type" : "string", - "doc" : "message documentation" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - }, - "doc" : "call_stack documentation" - } ] - } ], - "partition-spec" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - "field-id" : 1001 - } ], - "default-spec-id" : 0, - "partition-specs" : [ { - "spec-id" : 0, - "fields" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - "field-id" : 1001 - } ] - } ], - "last-partition-id" : 1001, - "default-sort-order-id" : 0, - "sort-orders" : [ { - "order-id" : 0, - "fields" : [ ] - } ], - "properties" : { - "owner" : "new_owner" - }, - "current-snapshot-id" : -1, - "snapshots" : [ ], - "snapshot-log" : [ ], - "metadata-log" : [ { - "timestamp-ms" : 1649086837511, - "metadata-file" : "/namespace/iceberg_test/metadata/v1.metadata.json" - } ] -} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test_2/metadata/version-hint.text b/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test_2/metadata/version-hint.text deleted file mode 100644 index d8263ee986059..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test_2/metadata/version-hint.text +++ /dev/null @@ -1 +0,0 @@ -2 \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run2/namespace/iceberg_test/metadata/v1.metadata.json b/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run2/namespace/iceberg_test/metadata/v1.metadata.json deleted file mode 100644 index e4ac0b9685ddc..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run2/namespace/iceberg_test/metadata/v1.metadata.json +++ /dev/null @@ -1,105 +0,0 @@ -{ - "format-version" : 1, - "table-uuid" : "11bbe5de-5ef6-4074-80db-f041065f9862", - "location" : "/namespace/iceberg_test", - "last-updated-ms" : 1648729616724, - 
"last-column-id" : 5, - "schema" : { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz" - }, { - "id" : 3, - "name" : "message", - "required" : true, - "type" : "string" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - } - } ] - }, - "current-schema-id" : 0, - "schemas" : [ { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz" - }, { - "id" : 3, - "name" : "message", - "required" : true, - "type" : "string" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - } - } ] - } ], - "partition-spec" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - "field-id" : 1001 - } ], - "default-spec-id" : 0, - "partition-specs" : [ { - "spec-id" : 0, - "fields" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - "field-id" : 1001 - } ] - } ], - "last-partition-id" : 1001, - "default-sort-order-id" : 0, - "sort-orders" : [ { - "order-id" : 0, - "fields" : [ ] - } ], - "properties" : { }, - "current-snapshot-id" : -1, - "snapshots" : [ ], - "snapshot-log" : [ ], - "metadata-log" : [ ] -} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run2/namespace/iceberg_test/metadata/v2.metadata.json b/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run2/namespace/iceberg_test/metadata/v2.metadata.json deleted file mode 100644 index 02221330b0665..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run2/namespace/iceberg_test/metadata/v2.metadata.json +++ /dev/null @@ -1,118 +0,0 @@ -{ - "format-version" : 1, - "table-uuid" : "16e6ecee-cd5d-470f-a7a6-a197944fa4db", - "location" : "/namespace/iceberg_test", - "last-updated-ms" : 1649086837695, - "last-column-id" : 5, - "schema" : { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string", - "doc" : "level documentation" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz", - "doc" : "event_time documentation" - }, { - "id" : 3, - "name" : "message", - "required" : true, - "type" : "string", - "doc" : "message documentation" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - }, - "doc" : "call_stack documentation" - } ] - }, - "current-schema-id" : 0, - "schemas" : [ { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string", - "doc" : "level documentation" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz", - "doc" : "event_time documentation" - }, { - "id" : 3, - "name" : "message", - "required" : true, - 
"type" : "string", - "doc" : "message documentation" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - }, - "doc" : "call_stack documentation" - } ] - } ], - "partition-spec" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - "field-id" : 1001 - } ], - "default-spec-id" : 0, - "partition-specs" : [ { - "spec-id" : 0, - "fields" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - "field-id" : 1001 - } ] - } ], - "last-partition-id" : 1001, - "default-sort-order-id" : 0, - "sort-orders" : [ { - "order-id" : 0, - "fields" : [ ] - } ], - "properties" : { - "owner" : "new_owner" - }, - "current-snapshot-id" : -1, - "snapshots" : [ ], - "snapshot-log" : [ ], - "metadata-log" : [ { - "timestamp-ms" : 1649086837511, - "metadata-file" : "/namespace/iceberg_test/metadata/v1.metadata.json" - } ] -} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run2/namespace/iceberg_test/metadata/version-hint.text b/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run2/namespace/iceberg_test/metadata/version-hint.text deleted file mode 100644 index d8263ee986059..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run2/namespace/iceberg_test/metadata/version-hint.text +++ /dev/null @@ -1 +0,0 @@ -2 \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_iceberg.py b/metadata-ingestion/tests/integration/iceberg/test_iceberg.py index b26b574e54c47..e2a86480672e5 100644 --- a/metadata-ingestion/tests/integration/iceberg/test_iceberg.py +++ b/metadata-ingestion/tests/integration/iceberg/test_iceberg.py @@ -1,14 +1,14 @@ -from pathlib import PosixPath -from typing import Any, Dict, Union +import subprocess +import sys +from typing import Any, Dict, List from unittest.mock import patch import pytest from freezegun import freeze_time -from iceberg.core.filesystem.file_status import FileStatus -from iceberg.core.filesystem.local_filesystem import LocalFileSystem -from datahub.ingestion.run.pipeline import Pipeline from tests.test_helpers import mce_helpers +from tests.test_helpers.click_helpers import run_datahub_cmd +from tests.test_helpers.docker_helpers import wait_for_port from tests.test_helpers.state_helpers import ( get_current_checkpoint_from_pipeline, run_and_get_pipeline, @@ -20,89 +20,92 @@ GMS_SERVER = f"http://localhost:{GMS_PORT}" +@pytest.fixture(autouse=True) +def skip_tests_if_python_before_3_8(): + if sys.version_info < (3, 8): + pytest.skip("Requires python 3.8 or higher") + + +def spark_submit(file_path: str, args: str = "") -> None: + docker = "docker" + command = f"{docker} exec spark-iceberg spark-submit {file_path} {args}" + ret = subprocess.run( + command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + assert ret.returncode == 0 + + @freeze_time(FROZEN_TIME) @pytest.mark.integration -def test_iceberg_ingest(pytestconfig, tmp_path, mock_time): +def test_iceberg_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time): test_resources_dir = pytestconfig.rootpath / "tests/integration/iceberg/" - # Run the metadata ingestion pipeline. 
- pipeline = Pipeline.create( - { - "run_id": "iceberg-test", - "source": { - "type": "iceberg", - "config": { - "localfs": str(test_resources_dir / "test_data/ingest_test"), - "user_ownership_property": "owner", - "group_ownership_property": "owner", - }, - }, - "sink": { - "type": "file", - "config": { - "filename": f"{tmp_path}/iceberg_mces.json", - }, - }, - } - ) - pipeline.run() - pipeline.raise_from_status() - - # Verify the output. - mce_helpers.check_golden_file( - pytestconfig, - output_path=tmp_path / "iceberg_mces.json", - golden_path=test_resources_dir - / "test_data/ingest_test/iceberg_mces_golden.json", - ) + with docker_compose_runner( + test_resources_dir / "docker-compose.yml", "iceberg" + ) as docker_services: + wait_for_port(docker_services, "spark-iceberg", 8888, timeout=120) + + # Run the create.py pyspark file to populate the table. + spark_submit("/home/iceberg/setup/create.py", "nyc.taxis") + + # Run the metadata ingestion pipeline. + config_file = (test_resources_dir / "iceberg_to_file.yml").resolve() + run_datahub_cmd( + ["ingest", "--strict-warnings", "-c", f"{config_file}"], tmp_path=tmp_path + ) + # These paths change from one instance run of the clickhouse docker to the other, and the FROZEN_TIME does not apply to these. + ignore_paths: List[str] = [ + r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['created-at'\]", + r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['snapshot-id'\]", + r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['manifest-list'\]", + ] + # Verify the output. + mce_helpers.check_golden_file( + pytestconfig, + ignore_paths=ignore_paths, + output_path=tmp_path / "iceberg_mces.json", + golden_path=test_resources_dir / "iceberg_ingest_mces_golden.json", + ) @freeze_time(FROZEN_TIME) @pytest.mark.integration -def test_iceberg_stateful_ingest(pytestconfig, tmp_path, mock_time, mock_datahub_graph): - test_resources_dir = ( - pytestconfig.rootpath / "tests/integration/iceberg/test_data/stateful_test" - ) +def test_iceberg_stateful_ingest( + docker_compose_runner, pytestconfig, tmp_path, mock_time, mock_datahub_graph +): + test_resources_dir = pytestconfig.rootpath / "tests/integration/iceberg" platform_instance = "test_platform_instance" - scd_before_deletion: Dict[str, Any] = { - "localfs": str(test_resources_dir / "run1"), - "user_ownership_property": "owner", - "group_ownership_property": "owner", - "platform_instance": f"{platform_instance}", - # enable stateful ingestion - "stateful_ingestion": { - "enabled": True, - "remove_stale_metadata": True, - "fail_safe_threshold": 100.0, - "state_provider": { - "type": "datahub", - "config": {"datahub_api": {"server": GMS_SERVER}}, - }, - }, - } - - scd_after_deletion: Dict[str, Any] = { - "localfs": str(test_resources_dir / "run2"), - "user_ownership_property": "owner", - "group_ownership_property": "owner", - "platform_instance": f"{platform_instance}", - # enable stateful ingestion - "stateful_ingestion": { - "enabled": True, - "remove_stale_metadata": True, - "fail_safe_threshold": 100.0, - "state_provider": { - "type": "datahub", - "config": {"datahub_api": {"server": GMS_SERVER}}, - }, - }, - } - pipeline_config_dict: Dict[str, Any] = { "source": { "type": "iceberg", - "config": scd_before_deletion, + "config": { + "catalog": { + "name": "default", + "type": "rest", + "config": { + "uri": "http://localhost:8181", + "s3.access-key-id": "admin", + "s3.secret-access-key": "password", + "s3.region": "us-east-1", + 
"warehouse": "s3a://warehouse/wh/", + "s3.endpoint": "http://localhost:9000", + }, + }, + "user_ownership_property": "owner", + "group_ownership_property": "owner", + "platform_instance": f"{platform_instance}", + # enable stateful ingestion + "stateful_ingestion": { + "enabled": True, + "remove_stale_metadata": True, + "fail_safe_threshold": 100.0, + "state_provider": { + "type": "datahub", + "config": {"datahub_api": {"server": GMS_SERVER}}, + }, + }, + }, }, "sink": { # we are not really interested in the resulting events for this test @@ -111,10 +114,18 @@ def test_iceberg_stateful_ingest(pytestconfig, tmp_path, mock_time, mock_datahub "pipeline_name": "test_pipeline", } - with patch( + with docker_compose_runner( + test_resources_dir / "docker-compose.yml", "iceberg" + ) as docker_services, patch( "datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph", mock_datahub_graph, ) as mock_checkpoint: + wait_for_port(docker_services, "spark-iceberg", 8888, timeout=120) + + # Run the create.py pyspark file to populate two tables. + spark_submit("/home/iceberg/setup/create.py", "nyc.taxis") + spark_submit("/home/iceberg/setup/create.py", "nyc.another_taxis") + # Both checkpoint and reporting will use the same mocked graph instance. mock_checkpoint.return_value = mock_datahub_graph @@ -125,13 +136,14 @@ def test_iceberg_stateful_ingest(pytestconfig, tmp_path, mock_time, mock_datahub assert checkpoint1 assert checkpoint1.state - # Set iceberg config where a table is deleted. - pipeline_config_dict["source"]["config"] = scd_after_deletion # Capture MCEs of second run to validate Status(removed=true) deleted_mces_path = f"{tmp_path}/iceberg_deleted_mces.json" pipeline_config_dict["sink"]["type"] = "file" pipeline_config_dict["sink"]["config"] = {"filename": deleted_mces_path} + # Run the delete.py pyspark file to delete the table. + spark_submit("/home/iceberg/setup/delete.py") + # Do the second run of the pipeline. pipeline_run2 = run_and_get_pipeline(pipeline_config_dict) checkpoint2 = get_current_checkpoint_from_pipeline(pipeline_run2) @@ -149,7 +161,7 @@ def test_iceberg_stateful_ingest(pytestconfig, tmp_path, mock_time, mock_datahub assert len(difference_urns) == 1 - urn1 = "urn:li:dataset:(urn:li:dataPlatform:iceberg,test_platform_instance.namespace.iceberg_test_2,PROD)" + urn1 = "urn:li:dataset:(urn:li:dataPlatform:iceberg,test_platform_instance.nyc.taxis,PROD)" assert urn1 in difference_urns @@ -161,9 +173,16 @@ def test_iceberg_stateful_ingest(pytestconfig, tmp_path, mock_time, mock_datahub pipeline=pipeline_run2, expected_providers=1 ) + ignore_paths: List[str] = [ + r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['created-at'\]", + r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['snapshot-id'\]", + r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['manifest-list'\]", + ] + # Verify the output. mce_helpers.check_golden_file( pytestconfig, + ignore_paths=ignore_paths, output_path=deleted_mces_path, golden_path=test_resources_dir / "iceberg_deleted_table_mces_golden.json", ) @@ -171,117 +190,32 @@ def test_iceberg_stateful_ingest(pytestconfig, tmp_path, mock_time, mock_datahub @freeze_time(FROZEN_TIME) @pytest.mark.integration -def test_iceberg_profiling(pytestconfig, tmp_path, mock_time): - """ - This test is using a table created using https://github.com/tabular-io/docker-spark-iceberg. 
- Here are the DDL statements that you can execute with `spark-sql`: - ```SQL - CREATE TABLE datahub.integration.profiling ( - field_int bigint COMMENT 'An integer field', - field_str string COMMENT 'A string field', - field_timestamp timestamp COMMENT 'A timestamp field') - USING iceberg; - - INSERT INTO datahub.integration.profiling VALUES (1, 'row1', current_timestamp()), (2, 'row2', null); - INSERT INTO datahub.integration.profiling VALUES (3, 'row3', current_timestamp()), (4, 'row4', null); - ``` - - When importing the metadata files into this test, we need to create a `version-hint.text` with a value that - reflects the version of the table, and then change the code in `TestLocalFileSystem._replace_path()` accordingly. - """ - test_resources_dir = ( - pytestconfig.rootpath / "tests/integration/iceberg/test_data/profiling_test" - ) +def test_iceberg_profiling(docker_compose_runner, pytestconfig, tmp_path, mock_time): + test_resources_dir = pytestconfig.rootpath / "tests/integration/iceberg/" - # Run the metadata ingestion pipeline. - pipeline = Pipeline.create( - { - "run_id": "iceberg-test", - "source": { - "type": "iceberg", - "config": { - "localfs": str(test_resources_dir), - "user_ownership_property": "owner", - "group_ownership_property": "owner", - "max_path_depth": 3, - "profiling": { - "enabled": True, - }, - "table_pattern": {"allow": ["datahub.integration.profiling"]}, - }, - }, - "sink": { - "type": "file", - "config": { - "filename": f"{tmp_path}/iceberg_mces.json", - }, - }, - } - ) + with docker_compose_runner( + test_resources_dir / "docker-compose.yml", "iceberg" + ) as docker_services: + wait_for_port(docker_services, "spark-iceberg", 8888, timeout=120) - class TestLocalFileSystem(LocalFileSystem): - # This class acts as a wrapper on LocalFileSystem to intercept calls using a path location. - # The wrapper will normalize those paths to be usable by the test. - fs: LocalFileSystem - - @staticmethod - def _replace_path(path: Union[str, PosixPath]) -> str: - # When the Iceberg table was created, its warehouse folder was '/home/iceberg/warehouse'. Iceberg tables - # are not portable, so we need to replace the warehouse folder by the test location at runtime. - normalized_path: str = str(path).replace( - "/home/iceberg/warehouse", str(test_resources_dir) - ) - - # When the Iceberg table was created, a postgres catalog was used instead of a HadoopCatalog. The HadoopCatalog - # expects a file named 'v{}.metadata.json' where {} is the version number from 'version-hint.text'. Since - # 'v2.metadata.json' does not exist, we will redirect the call to '00002-02782173-8364-4caf-a3c4-9567c1d6608f.metadata.json'. 
- if normalized_path.endswith("v2.metadata.json"): - return normalized_path.replace( - "v2.metadata.json", - "00002-cc241948-4c12-46d0-9a75-ce3578ec03d4.metadata.json", - ) - return normalized_path - - def __init__(self, fs: LocalFileSystem) -> None: - self.fs = fs - - def open(self, path: str, mode: str = "rb") -> object: - return self.fs.open(TestLocalFileSystem._replace_path(path), mode) - - def delete(self, path: str) -> None: - self.fs.delete(TestLocalFileSystem._replace_path(path)) - - def stat(self, path: str) -> FileStatus: - return self.fs.stat(TestLocalFileSystem._replace_path(path)) - - @staticmethod - def fix_path(path: str) -> str: - return TestLocalFileSystem.fs.fix_path( - TestLocalFileSystem._replace_path(path) - ) - - def create(self, path: str, overwrite: bool = False) -> object: - return self.fs.create(TestLocalFileSystem._replace_path(path), overwrite) - - def rename(self, src: str, dest: str) -> bool: - return self.fs.rename( - TestLocalFileSystem._replace_path(src), - TestLocalFileSystem._replace_path(dest), - ) - - def exists(self, path: str) -> bool: - return self.fs.exists(TestLocalFileSystem._replace_path(path)) - - local_fs_wrapper: TestLocalFileSystem = TestLocalFileSystem( - LocalFileSystem.get_instance() - ) - with patch.object(LocalFileSystem, "get_instance", return_value=local_fs_wrapper): - pipeline.run() - pipeline.raise_from_status() - - # Verify the output. - mce_helpers.check_golden_file( - pytestconfig, - output_path=tmp_path / "iceberg_mces.json", - golden_path=test_resources_dir / "iceberg_mces_golden.json", - ) + # Run the create.py pyspark file to populate the table. + spark_submit("/home/iceberg/setup/create.py", "nyc.taxis") + + # Run the metadata ingestion pipeline. + config_file = (test_resources_dir / "iceberg_profile_to_file.yml").resolve() + run_datahub_cmd( + ["ingest", "--strict-warnings", "-c", f"{config_file}"], tmp_path=tmp_path + ) + # These paths change from one run of the iceberg docker instance to another, and FROZEN_TIME does not apply to them. + ignore_paths: List[str] = [ + r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['created-at'\]", + r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['snapshot-id'\]", + r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['manifest-list'\]", + ] + # Verify the output. + mce_helpers.check_golden_file( + pytestconfig, + ignore_paths=ignore_paths, + output_path=tmp_path / "iceberg_mces.json", + golden_path=test_resources_dir / "iceberg_profile_mces_golden.json", + ) diff --git a/metadata-ingestion/tests/integration/ldap/test_ldap.py b/metadata-ingestion/tests/integration/ldap/test_ldap.py index 148a3a6128013..3e76f13fc823d 100644 --- a/metadata-ingestion/tests/integration/ldap/test_ldap.py +++ b/metadata-ingestion/tests/integration/ldap/test_ldap.py @@ -100,3 +100,54 @@ def test_ldap_memberof_ingest(docker_compose_runner, pytestconfig, tmp_path, moc output_path=tmp_path / "ldap_memberof_mces.json", golden_path=test_resources_dir / "ldap_memberof_mces_golden.json", ) + + +@pytest.mark.integration +def test_ldap_ingest_with_email_as_username( + docker_compose_runner, pytestconfig, tmp_path, mock_time +): + test_resources_dir = pytestconfig.rootpath / "tests/integration/ldap" + + with docker_compose_runner( + test_resources_dir / "docker-compose.yml", "ldap" + ) as docker_services: + # The openldap container loads the sample data after exposing the port publicly.
As such, + # we must wait a little bit extra to ensure that the sample data is loaded. + wait_for_port(docker_services, "openldap", 389) + time.sleep(5) + + pipeline = Pipeline.create( + { + "run_id": "ldap-test", + "source": { + "type": "ldap", + "config": { + "ldap_server": "ldap://localhost", + "ldap_user": "cn=admin,dc=example,dc=org", + "ldap_password": "admin", + "base_dn": "dc=example,dc=org", + "user_attrs_map": {"email": "mail"}, + "group_attrs_map": { + "members": "memberUid", + "email": "mail", + }, + "use_email_as_username": True, + "custom_props_list": ["givenName"], + }, + }, + "sink": { + "type": "file", + "config": { + "filename": f"{tmp_path}/ldap_mces.json", + }, + }, + } + ) + pipeline.run() + pipeline.raise_from_status() + + mce_helpers.check_golden_file( + pytestconfig, + output_path=tmp_path / "ldap_mces.json", + golden_path=test_resources_dir / "ldap_mces_golden.json", + ) diff --git a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json index 6167c63e6c9b8..dee85b40bb7a8 100644 --- a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json @@ -262,8 +262,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" @@ -412,8 +412,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json index e66ec4bb89d8c..72db36e63daf7 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json @@ -206,8 +206,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json index 11e0760decae3..e5508bdb06b9e 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json @@ -206,8 +206,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,datahub-demo.view.faa_flights,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json index ddfd102cb15b0..91e13debfa028 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json +++ 
b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json @@ -279,8 +279,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" @@ -429,8 +429,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json index 54624986216b8..e93079119e4f4 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json @@ -206,8 +206,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json index 6cab0db8c33cf..a9c8efa7cdb98 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json @@ -206,32 +206,32 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_joined_view,PROD)", "type": "VIEW" }, { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_joined_view_original_name,PROD)", "type": "VIEW" }, { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view_has_no_fields,PROD)", "type": "VIEW" }, { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json index 9a088a7a8baef..edd15624a14cd 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json @@ -206,24 +206,24 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_joined_view_original_name,PROD)", "type": "VIEW" }, { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": 
"urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" }, { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view_has_no_fields,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json index f8e2565e492e1..aebc89b609a08 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json @@ -206,8 +206,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json index 32d4f7bc64ab4..34bded3cf691e 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json @@ -158,8 +158,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/expected_output.json b/metadata-ingestion/tests/integration/lookml/expected_output.json index cdf520cc23a30..b53d5857f1d66 100644 --- a/metadata-ingestion/tests/integration/lookml/expected_output.json +++ b/metadata-ingestion/tests/integration/lookml/expected_output.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..my_table,PROD)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" @@ -478,8 +478,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.include_able,PROD)", "type": "VIEW" @@ -585,8 +585,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.events,PROD)", "type": "VIEW" @@ -692,8 +692,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.events,PROD)", "type": "VIEW" @@ -844,8 +844,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": 
"urn:li:dataset:(urn:li:dataPlatform:conn,..autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -951,8 +951,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.include_able,PROD)", "type": "VIEW" @@ -1058,8 +1058,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..fragment_derived_view,PROD)", "type": "VIEW" @@ -1240,8 +1240,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..order,PROD)", "type": "VIEW" @@ -1347,8 +1347,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.ecommerce.ability,PROD)", "type": "VIEW" @@ -1533,8 +1533,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..owners,PROD)", "type": "VIEW" @@ -1732,8 +1732,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -1971,8 +1971,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.flightstats.accidents,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json index 73edecbe62205..238f4c2580cdf 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.my_table,PROD)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" @@ -478,8 +478,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.looker_schema.include_able,PROD)", "type": "VIEW" @@ -585,8 +585,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.looker_schema.events,PROD)", "type": "VIEW" @@ -692,8 +692,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" 
+ "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.looker_schema.events,PROD)", "type": "VIEW" @@ -844,8 +844,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -951,8 +951,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.looker_schema.include_able,PROD)", "type": "VIEW" @@ -1058,8 +1058,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.fragment_derived_view,PROD)", "type": "VIEW" @@ -1240,8 +1240,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.order,PROD)", "type": "VIEW" @@ -1347,8 +1347,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.ecommerce.ability,PROD)", "type": "VIEW" @@ -1533,8 +1533,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.owners,PROD)", "type": "VIEW" @@ -1732,8 +1732,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -1971,8 +1971,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.flightstats.accidents,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json index 9aa6a952c40b4..45d5d839e9d21 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.my_table,PROD)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" @@ -478,8 +478,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": 
"urn:li:dataset:(urn:li:dataPlatform:hive,looker_schema.include_able,PROD)", "type": "VIEW" @@ -585,8 +585,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,looker_schema.events,PROD)", "type": "VIEW" @@ -692,8 +692,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,looker_schema.events,PROD)", "type": "VIEW" @@ -844,8 +844,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -951,8 +951,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,looker_schema.include_able,PROD)", "type": "VIEW" @@ -1058,8 +1058,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.fragment_derived_view,PROD)", "type": "VIEW" @@ -1240,8 +1240,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.order,PROD)", "type": "VIEW" @@ -1347,8 +1347,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,ecommerce.ability,PROD)", "type": "VIEW" @@ -1533,8 +1533,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.owners,PROD)", "type": "VIEW" @@ -1732,8 +1732,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -1971,8 +1971,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,flightstats.accidents,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json index 6ce6d809ae8f5..187cedaefb6b2 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json @@ -450,8 +450,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" @@ -557,8 +557,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": 
"urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -664,8 +664,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -816,8 +816,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -923,8 +923,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" @@ -1123,8 +1123,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD)", "type": "VIEW" @@ -1230,8 +1230,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.ecommerce.ability,PROD)", "type": "VIEW" @@ -1416,8 +1416,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.owners,PROD)", "type": "VIEW" @@ -1615,8 +1615,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -1854,8 +1854,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.flightstats.accidents,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_golden_deleted_stateful.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_golden_deleted_stateful.json index 1016d4e211458..a323118666940 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_golden_deleted_stateful.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_golden_deleted_stateful.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..my_table,PROD)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..owners,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json 
b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json index fc91c97a53003..c2c879e38f37b 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" @@ -478,8 +478,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" @@ -585,8 +585,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -692,8 +692,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -844,8 +844,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -951,8 +951,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" @@ -1058,8 +1058,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.fragment_derived_view,PROD)", "type": "VIEW" @@ -1240,8 +1240,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD)", "type": "VIEW" @@ -1347,8 +1347,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.ecommerce.ability,PROD)", "type": "VIEW" @@ -1533,8 +1533,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.owners,PROD)", "type": "VIEW" @@ -1732,8 +1732,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": 
"urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -1971,8 +1971,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.flightstats.accidents,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_deny_pattern.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_deny_pattern.json index 8635a570c0621..c1ac54b0fb588 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_deny_pattern.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_deny_pattern.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" @@ -128,8 +128,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -235,8 +235,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -387,8 +387,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -494,8 +494,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json index 19168aa323142..f602ca37b3160 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" @@ -478,8 +478,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.looker_schema.include_able,DEV)", "type": "VIEW" @@ -585,8 
+585,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.looker_schema.events,DEV)", "type": "VIEW" @@ -692,8 +692,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.looker_schema.events,DEV)", "type": "VIEW" @@ -844,8 +844,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.autodetect_sql_name_based_on_view_name,DEV)", "type": "VIEW" @@ -951,8 +951,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.looker_schema.include_able,DEV)", "type": "VIEW" @@ -1058,8 +1058,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.fragment_derived_view,DEV)", "type": "VIEW" @@ -1240,8 +1240,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.order,DEV)", "type": "VIEW" @@ -1347,8 +1347,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.ecommerce.ability,DEV)", "type": "VIEW" @@ -1533,8 +1533,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.owners,DEV)", "type": "VIEW" @@ -1732,8 +1732,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -1971,8 +1971,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.flightstats.accidents,DEV)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json index d4ced76a7475d..104bd365669e3 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD)", "type": "VIEW" @@ -261,8 +261,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" @@ -480,8 +480,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" @@ -588,8 +588,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -696,8 +696,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -849,8 +849,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -957,8 +957,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" @@ -1065,8 +1065,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.fragment_derived_view,PROD)", "type": "VIEW" @@ -1248,8 +1248,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD)", "type": "VIEW" @@ -1356,8 +1356,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.ecommerce.ability,PROD)", "type": "VIEW" @@ -1543,8 +1543,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.owners,PROD)", "type": "VIEW" @@ -1743,8 +1743,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -1983,8 +1983,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.flightstats.accidents,PROD)", "type": "VIEW" diff --git 
a/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json b/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json index 2bae6452145df..37a6c94c6952e 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.owners,DEV)", "type": "VIEW" @@ -459,8 +459,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,rs_warehouse.default_db.default_schema.my_table,DEV)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/refinement_include_order_golden.json b/metadata-ingestion/tests/integration/lookml/refinement_include_order_golden.json index a5c316f365d4b..49831ee554ab1 100644 --- a/metadata-ingestion/tests/integration/lookml/refinement_include_order_golden.json +++ b/metadata-ingestion/tests/integration/lookml/refinement_include_order_golden.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.public.book,PROD)", "type": "VIEW" @@ -303,8 +303,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.public.book,PROD)", "type": "VIEW" @@ -410,8 +410,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.public.order,PROD)", "type": "VIEW" @@ -607,8 +607,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.public.issue_history,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json b/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json index de303d50e7acd..dc5e1aa9096f8 100644 --- a/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json +++ b/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..my_table,PROD)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": 
"VIEW" @@ -478,8 +478,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.include_able,PROD)", "type": "VIEW" @@ -585,8 +585,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.events,PROD)", "type": "VIEW" @@ -692,8 +692,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.events,PROD)", "type": "VIEW" @@ -844,8 +844,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -951,8 +951,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.include_able,PROD)", "type": "VIEW" @@ -1058,8 +1058,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..fragment_derived_view,PROD)", "type": "VIEW" @@ -1240,8 +1240,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..order,PROD)", "type": "VIEW" @@ -1347,8 +1347,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.ecommerce.ability,PROD)", "type": "VIEW" @@ -1533,8 +1533,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..owners,PROD)", "type": "VIEW" @@ -1764,8 +1764,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -2003,8 +2003,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.flightstats.accidents,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/metabase/metabase_mces_golden.json b/metadata-ingestion/tests/integration/metabase/metabase_mces_golden.json index 6e57dfaae0ce0..0ba6afbd04fc9 100644 --- a/metadata-ingestion/tests/integration/metabase/metabase_mces_golden.json +++ b/metadata-ingestion/tests/integration/metabase/metabase_mces_golden.json @@ -115,6 +115,61 @@ "runId": "metabase-test" } }, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.ChartSnapshot": { + "urn": "urn:li:chart:(metabase,3)", + "aspects": [ + { + 
"com.linkedin.pegasus2avro.chart.ChartInfo": { + "customProperties": { + "Metrics": "Distinct values of order_number, Sum of nominal_total", + "Filters": "['time-interval', ['field', 'completed_at', {'base-type': 'type/DateTimeWithTZ'}], -8, 'day', {'include-current': False}]", + "Dimensions": "completed_at" + }, + "title": "Question with data from other question", + "description": "", + "lastModified": { + "created": { + "time": 1685628119636, + "actor": "urn:li:corpuser:john.doe@example.com" + }, + "lastModified": { + "time": 1685628119636, + "actor": "urn:li:corpuser:john.doe@example.com" + } + }, + "chartUrl": "http://localhost:3000/card/3", + "inputs": [ + { + "string": "urn:li:dataset:(urn:li:dataPlatform:bigquery,acryl-data.public.payment,PROD)" + } + ], + "type": "TABLE" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:admin@metabase.com", + "type": "DATAOWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1636614000000, + "runId": "metabase-test" + } +}, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DashboardSnapshot": { @@ -195,6 +250,21 @@ "runId": "metabase-test" } }, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(metabase,3)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1636614000000, + "runId": "metabase-test" + } +}, { "entityType": "dashboard", "entityUrn": "urn:li:dashboard:(metabase,1)", diff --git a/metadata-ingestion/tests/integration/metabase/setup/card.json b/metadata-ingestion/tests/integration/metabase/setup/card.json index 439edbf60014f..83bff66e6c9f3 100644 --- a/metadata-ingestion/tests/integration/metabase/setup/card.json +++ b/metadata-ingestion/tests/integration/metabase/setup/card.json @@ -304,4 +304,196 @@ "favorite": false, "created_at": "2021-12-13T17:48:37.102", "public_uuid": null -}] \ No newline at end of file +}, { + "description": null, + "archived": false, + "collection_position": null, + "table_id": null, + "result_metadata": [ + { + "name": "completed_at", + "display_name": "completed_at", + "base_type": "type/Date", + "special_type": null, + "field_ref": [ + "field", + "completed_at", + { + "base-type": "type/DateTimeWithTZ", + "temporal-unit": "day" + } + ], + "unit": "day", + "fingerprint": { + "global": { + "distinct-count": 1916, + "nil%": 0.0385 + } + } + }, + { + "name": "count", + "display_name": "Distinct values of order_number", + "base_type": "type/BigInteger", + "special_type": "type/Quantity", + "field_ref": [ + "aggregation", + 0 + ], + "fingerprint": { + "global": { + "distinct-count": 8, + "nil%": 0.0 + }, + "type": { + "type/Number": { + "min": 44098.0, + "q1": 46911.0, + "q3": 51276.0, + "max": 52228.0, + "sd": 2797.3306887357558, + "avg": 48557.125 + } + } + } + }, + { + "name": "sum", + "display_name": "Sum of nominal_total", + "base_type": "type/Float", + "special_type": null, + "field_ref": [ + "aggregation", + 1 + ], + "fingerprint": { + "global": { + "distinct-count": 8, + "nil%": 0.0 + }, + "type": { + "type/Number": { + "min": 1.256807007034278E8, + "q1": 1.277180884245776E8, + "q3": 1.4257821803491282E8, + "max": 1.4887777502074698E8, + "sd": 8966928.163419789, + "avg": 1.3526486656272435E8 + } + } + } + } + ], + "creator": { + "email": "john.doe@example.com", + "first_name": "John", + "last_login": "2023-08-03T09:33:25.157021Z", + 
"is_qbnewb": false, + "is_superuser": false, + "id": 1, + "last_name": "Doe", + "date_joined": "2020-07-13T07:29:31.805765Z", + "common_name": "John Doe" + }, + "can_write": true, + "database_id": 2, + "enable_embedding": false, + "collection_id": 1135, + "query_type": "query", + "name": "Question with data from other question", + "last_query_start": null, + "dashboard_count": 1, + "average_query_time": null, + "creator_id": 31337, + "moderation_reviews": [], + "updated_at": "2023-06-01T14:01:59.592811Z", + "made_public_by_id": null, + "embedding_params": null, + "cache_ttl": null, + "dataset_query": { + "database": 2, + "query": { + "source-table": "card__1", + "filter": [ + "time-interval", + [ + "field", + "completed_at", + { + "base-type": "type/DateTimeWithTZ" + } + ], + -8, + "day", + { + "include-current": false + } + ], + "aggregation": [ + [ + "distinct", + [ + "field", + "order_number", + { + "base-type": "type/Text" + } + ] + ], + [ + "sum", + [ + "field", + "nominal_total", + { + "base-type": "type/Float" + } + ] + ] + ], + "breakout": [ + [ + "field", + "completed_at", + { + "base-type": "type/DateTimeWithTZ", + "temporal-unit": "day" + } + ] + ] + }, + "type": "query" + }, + "id": 3, + "parameter_mappings": null, + "display": "table", + "entity_id": null, + "collection_preview": true, + "last-edit-info": { + "id": 1, + "email": "john.doe@example.com", + "first_name": "John", + "last_name": "Doe", + "timestamp": "2023-06-01T14:01:59.636581Z" + }, + "visualization_settings": {}, + "collection": { + "authority_level": null, + "description": null, + "archived": false, + "slug": "group", + "color": "#509EE3", + "name": "Group", + "personal_owner_id": null, + "id": 1135, + "entity_id": null, + "location": "/3/373/", + "namespace": null, + "created_at": "2020-07-17T19:28:39.513365Z" + }, + "parameters": null, + "dataset": false, + "created_at": "2020-07-17T19:28:39.513365Z", + "parameter_usage_count": 0, + "public_uuid": null +}] diff --git a/metadata-ingestion/tests/integration/metabase/setup/card_3.json b/metadata-ingestion/tests/integration/metabase/setup/card_3.json new file mode 100644 index 0000000000000..3f928cd2e8f69 --- /dev/null +++ b/metadata-ingestion/tests/integration/metabase/setup/card_3.json @@ -0,0 +1,193 @@ +{ + "description": null, + "archived": false, + "collection_position": null, + "table_id": null, + "result_metadata": [ + { + "name": "completed_at", + "display_name": "completed_at", + "base_type": "type/Date", + "special_type": null, + "field_ref": [ + "field", + "completed_at", + { + "base-type": "type/DateTimeWithTZ", + "temporal-unit": "day" + } + ], + "unit": "day", + "fingerprint": { + "global": { + "distinct-count": 1916, + "nil%": 0.0385 + } + } + }, + { + "name": "count", + "display_name": "Distinct values of order_number", + "base_type": "type/BigInteger", + "special_type": "type/Quantity", + "field_ref": [ + "aggregation", + 0 + ], + "fingerprint": { + "global": { + "distinct-count": 8, + "nil%": 0.0 + }, + "type": { + "type/Number": { + "min": 44098.0, + "q1": 46911.0, + "q3": 51276.0, + "max": 52228.0, + "sd": 2797.3306887357558, + "avg": 48557.125 + } + } + } + }, + { + "name": "sum", + "display_name": "Sum of nominal_total", + "base_type": "type/Float", + "special_type": null, + "field_ref": [ + "aggregation", + 1 + ], + "fingerprint": { + "global": { + "distinct-count": 8, + "nil%": 0.0 + }, + "type": { + "type/Number": { + "min": 1.256807007034278E8, + "q1": 1.277180884245776E8, + "q3": 1.4257821803491282E8, + "max": 1.4887777502074698E8, + 
"sd": 8966928.163419789, + "avg": 1.3526486656272435E8 + } + } + } + } + ], + "creator": { + "email": "john.doe@example.com", + "first_name": "John", + "last_login": "2023-08-03T09:33:25.157021Z", + "is_qbnewb": false, + "is_superuser": false, + "id": 1, + "last_name": "Doe", + "date_joined": "2020-07-13T07:29:31.805765Z", + "common_name": "John Doe" + }, + "can_write": true, + "database_id": 2, + "enable_embedding": false, + "collection_id": 1135, + "query_type": "query", + "name": "Question with data from other question", + "last_query_start": null, + "dashboard_count": 1, + "average_query_time": null, + "creator_id": 1, + "moderation_reviews": [], + "updated_at": "2023-06-01T14:01:59.592811Z", + "made_public_by_id": null, + "embedding_params": null, + "cache_ttl": null, + "dataset_query": { + "database": 2, + "query": { + "source-table": "card__1", + "filter": [ + "time-interval", + [ + "field", + "completed_at", + { + "base-type": "type/DateTimeWithTZ" + } + ], + -8, + "day", + { + "include-current": false + } + ], + "aggregation": [ + [ + "distinct", + [ + "field", + "order_number", + { + "base-type": "type/Text" + } + ] + ], + [ + "sum", + [ + "field", + "nominal_total", + { + "base-type": "type/Float" + } + ] + ] + ], + "breakout": [ + [ + "field", + "completed_at", + { + "base-type": "type/DateTimeWithTZ", + "temporal-unit": "day" + } + ] + ] + }, + "type": "query" + }, + "id": 3, + "parameter_mappings": null, + "display": "table", + "entity_id": null, + "collection_preview": true, + "last-edit-info": { + "id": 1, + "email": "john.doe@example.com", + "first_name": "John", + "last_name": "Doe", + "timestamp": "2023-06-01T14:01:59.636581Z" + }, + "visualization_settings": {}, + "collection": { + "authority_level": null, + "description": null, + "archived": false, + "slug": "group", + "color": "#509EE3", + "name": "Group", + "personal_owner_id": null, + "id": 1135, + "entity_id": null, + "location": "/3/373/", + "namespace": null, + "created_at": "2020-07-17T19:28:39.513365Z" + }, + "parameters": null, + "dataset": false, + "created_at": "2020-07-17T19:28:39.513365Z", + "parameter_usage_count": 0, + "public_uuid": null +} diff --git a/metadata-ingestion/tests/integration/metabase/test_metabase.py b/metadata-ingestion/tests/integration/metabase/test_metabase.py index 5f5c8efedbfeb..24d254fc8469e 100644 --- a/metadata-ingestion/tests/integration/metabase/test_metabase.py +++ b/metadata-ingestion/tests/integration/metabase/test_metabase.py @@ -23,6 +23,7 @@ "http://localhost:3000/api/card/1": "card_1.json", "http://localhost:3000/api/card/2": "card_2.json", "http://localhost:3000/api/table/21": "table_21.json", + "http://localhost:3000/api/card/3": "card_3.json", } RESPONSE_ERROR_LIST = ["http://localhost:3000/api/dashboard"] diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index 5c9553402a8c4..e77a12aa4088e 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -1,17 +1,22 @@ import logging import sys -from typing import List +from typing import List, Tuple import pytest from lark import Tree import datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes as powerbi_data_classes -from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport -from datahub.ingestion.source.powerbi.m_query import parser, tree_function -from datahub.ingestion.source.powerbi.m_query.resolver import ( - 
DataPlatformTable, - SupportedDataPlatform, +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.source.powerbi.config import ( + PowerBiDashboardSourceConfig, + PowerBiDashboardSourceReport, +) +from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import ( + AbstractDataPlatformInstanceResolver, + create_dataplatform_instance_resolver, ) +from datahub.ingestion.source.powerbi.m_query import parser, tree_function +from datahub.ingestion.source.powerbi.m_query.resolver import DataPlatformTable M_QUERIES = [ 'let\n Source = Snowflake.Databases("bu10758.ap-unknown-2.fakecomputing.com","PBI_TEST_WAREHOUSE_PROD",[Role="PBI_TEST_MEMBER"]),\n PBI_TEST_Database = Source{[Name="PBI_TEST",Kind="Database"]}[Data],\n TEST_Schema = PBI_TEST_Database{[Name="TEST",Kind="Schema"]}[Data],\n TESTTABLE_Table = TEST_Schema{[Name="TESTTABLE",Kind="Table"]}[Data]\nin\n TESTTABLE_Table', @@ -38,9 +43,31 @@ 'let\n Source = AmazonRedshift.Database("redshift-url","dev"),\n public = Source{[Name="public"]}[Data],\n category1 = public{[Name="category"]}[Data]\nin\n category1', 'let\n Source = Value.NativeQuery(AmazonRedshift.Database("redshift-url","dev"), "select * from dev.public.category", null, [EnableFolding=true]) \n in Source', 'let\n Source = Databricks.Catalogs("adb-123.azuredatabricks.net", "/sql/1.0/endpoints/12345dc91aa25844", [Catalog=null, Database=null]),\n hive_metastore_Database = Source{[Name="hive_metastore",Kind="Database"]}[Data],\n sandbox_revenue_Schema = hive_metastore_Database{[Name="sandbox_revenue",Kind="Schema"]}[Data],\n public_consumer_price_index_Table = sandbox_revenue_Schema{[Name="public_consumer_price_index",Kind="Table"]}[Data],\n #"Renamed Columns" = Table.RenameColumns(public_consumer_price_index_Table,{{"Country", "country"}, {"Metric", "metric"}}),\n #"Inserted Year" = Table.AddColumn(#"Renamed Columns", "ID", each Date.Year([date_id]) + Date.Month([date_id]), Text.Type),\n #"Added Custom" = Table.AddColumn(#"Inserted Year", "Custom", each Text.Combine({Number.ToText(Date.Year([date_id])), Number.ToText(Date.Month([date_id])), [country]})),\n #"Removed Columns" = Table.RemoveColumns(#"Added Custom",{"ID"}),\n #"Renamed Columns1" = Table.RenameColumns(#"Removed Columns",{{"Custom", "ID"}}),\n #"Filtered Rows" = Table.SelectRows(#"Renamed Columns1", each ([metric] = "Consumer Price Index") and (not Number.IsNaN([value])))\nin\n #"Filtered Rows"', + "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu10758.ap-unknown-2.fakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select #(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS CLIENT_DIRECTOR,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS inner join OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT #(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Accounting'#(lf)and TARGET_TEAM = 'Enterprise'#(lf)AND TIER = 'Client Director'\", null, [EnableFolding=true])\nin\n Source", ] +def get_default_instances( + override_config: dict = {}, +) -> Tuple[ + PipelineContext, PowerBiDashboardSourceConfig, AbstractDataPlatformInstanceResolver +]: + config: 
PowerBiDashboardSourceConfig = PowerBiDashboardSourceConfig.parse_obj( + { + "tenant_id": "fake", + "client_id": "foo", + "client_secret": "bar", + **override_config, + } + ) + + platform_instance_resolver: AbstractDataPlatformInstanceResolver = ( + create_dataplatform_instance_resolver(config) + ) + + return PipelineContext(run_id="fake"), config, platform_instance_resolver + + @pytest.mark.integration def test_parse_m_query1(): expression: str = M_QUERIES[0] @@ -145,20 +172,20 @@ def test_snowflake_regular_case(): reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "TESTTABLE" - assert data_platform_tables[0].full_name == "PBI_TEST.TEST.TESTTABLE" assert ( - data_platform_tables[0].datasource_server - == "bu10758.ap-unknown-2.fakecomputing.com" - ) - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,pbi_test.test.testtable,PROD)" ) @@ -174,17 +201,21 @@ def test_postgres_regular_case(): ) reporter = PowerBiDashboardSourceReport() + + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "order_date" - assert data_platform_tables[0].full_name == "mics.public.order_date" - assert data_platform_tables[0].datasource_server == "localhost" assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.POSTGRES_SQL.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:postgres,mics.public.order_date,PROD)" ) @@ -200,19 +231,21 @@ def test_databricks_regular_case(): ) reporter = PowerBiDashboardSourceReport() + + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "public_consumer_price_index" assert ( - data_platform_tables[0].full_name - == "hive_metastore.sandbox_revenue.public_consumer_price_index" - ) - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.DATABRICK_SQL.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:databricks,hive_metastore.sandbox_revenue.public_consumer_price_index,PROD)" ) @@ -228,17 +261,21 @@ def test_oracle_regular_case(): ) reporter = PowerBiDashboardSourceReport() + + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) 
== 1 - assert data_platform_tables[0].name == "EMPLOYEES" - assert data_platform_tables[0].full_name == "salesdb.HR.EMPLOYEES" - assert data_platform_tables[0].datasource_server == "localhost:1521" assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.ORACLE.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:oracle,salesdb.hr.employees,PROD)" ) @@ -255,17 +292,20 @@ def test_mssql_regular_case(): reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "book_issue" - assert data_platform_tables[0].full_name == "library.dbo.book_issue" - assert data_platform_tables[0].datasource_server == "localhost" assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.MS_SQL.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:mssql,library.dbo.book_issue,PROD)" ) @@ -280,14 +320,16 @@ def test_mssql_with_query(): M_QUERIES[11], ] expected_tables = [ - "COMMOPSDB.dbo.V_OIP_ENT_2022", - "COMMOPSDB.dbo.V_INVOICE_BOOKING_2022", - "COMMOPSDB.dbo.V_ARR_ADDS", - "COMMOPSDB.dbo.V_PS_CD_RETENTION", - "COMMOPSDB.dbo.V_TPV_LEADERBOARD", - "COMMOPSDB.dbo.V_ENTERPRISE_INVOICED_REVENUE", + "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_oip_ent_2022,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_invoice_booking_2022,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_arr_adds,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_ps_cd_retention,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_tpv_leaderboard,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_enterprise_invoiced_revenue,PROD)", ] + ctx, config, platform_instance_resolver = get_default_instances() + for index, query in enumerate(mssql_queries): table: powerbi_data_classes.Table = powerbi_data_classes.Table( columns=[], @@ -299,17 +341,15 @@ def test_mssql_with_query(): reporter = PowerBiDashboardSourceReport() data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter, native_query_enabled=False + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == expected_tables[index].split(".")[2] - assert data_platform_tables[0].full_name == expected_tables[index] - assert data_platform_tables[0].datasource_server == "AUPRDWHDB" - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.MS_SQL.value.powerbi_data_platform_name - ) + assert data_platform_tables[0].urn == expected_tables[index] @pytest.mark.integration @@ -322,12 +362,14 @@ def test_snowflake_native_query(): ] expected_tables = [ - "OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4", - "OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS", - "OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS", - "OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS", + 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_aps_sme_units_v4,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_sme_unit_targets,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_sme_unit_targets,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_sme_unit_targets,PROD)", ] + ctx, config, platform_instance_resolver = get_default_instances() + for index, query in enumerate(snowflake_queries): table: powerbi_data_classes.Table = powerbi_data_classes.Table( columns=[], @@ -339,20 +381,15 @@ def test_snowflake_native_query(): reporter = PowerBiDashboardSourceReport() data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == expected_tables[index].split(".")[2] - assert data_platform_tables[0].full_name == expected_tables[index] - assert ( - data_platform_tables[0].datasource_server - == "bu10758.ap-unknown-2.fakecomputing.com" - ) - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name - ) + assert data_platform_tables[0].urn == expected_tables[index] def test_google_bigquery_1(): @@ -363,16 +400,20 @@ def test_google_bigquery_1(): ) reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter, native_query_enabled=False + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) + assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == table.full_name.split(".")[2] - assert data_platform_tables[0].full_name == table.full_name - assert data_platform_tables[0].datasource_server == "seraphic-music-344307" assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.GOOGLE_BIGQUERY.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:bigquery,seraphic-music-344307.school_dataset.first,PROD)" ) @@ -387,23 +428,24 @@ def test_google_bigquery_2(): ) reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( table, reporter, - native_query_enabled=False, parameters={ "Parameter - Source": "my-test-project", "My bq project": "gcp_billing", }, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == table.full_name.split(".")[2] - assert data_platform_tables[0].full_name == table.full_name - assert data_platform_tables[0].datasource_server == "my-test-project" assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.GOOGLE_BIGQUERY.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:bigquery,my-test-project.gcp_billing.gcp_table,PROD)" ) @@ -416,23 +458,24 @@ def test_for_each_expression_1(): reporter = PowerBiDashboardSourceReport() + ctx, config, 
platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( table, reporter, - native_query_enabled=False, parameters={ "Parameter - Source": "my-test-project", "My bq project": "gcp_billing", }, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == table.full_name.split(".")[2] - assert data_platform_tables[0].datasource_server == "my-test-project" - assert data_platform_tables[0].full_name == table.full_name assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.GOOGLE_BIGQUERY.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:bigquery,my-test-project.universal.d_wh_date,PROD)" ) @@ -445,22 +488,23 @@ def test_for_each_expression_2(): reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( table, reporter, - native_query_enabled=False, parameters={ "dwh-prod": "originally-not-a-variable-ref-and-not-resolved", }, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == table.full_name.split(".")[2] - assert data_platform_tables[0].full_name == table.full_name - assert data_platform_tables[0].datasource_server == "dwh-prod" assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.GOOGLE_BIGQUERY.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:bigquery,dwh-prod.gcp_billing.d_gcp_custom_label,PROD)" ) @@ -476,8 +520,14 @@ def test_native_query_disabled(): reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + config.native_query_parsing = False data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter, native_query_enabled=False + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 0 @@ -493,26 +543,25 @@ def test_multi_source_table(): ) reporter = PowerBiDashboardSourceReport() + + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter, native_query_enabled=False + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 2 - assert data_platform_tables[0].full_name == "mics.public.order_date" - assert data_platform_tables[0].datasource_server == "localhost" - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.POSTGRES_SQL.value.powerbi_data_platform_name - ) - - assert data_platform_tables[1].full_name == "GSL_TEST_DB.PUBLIC.SALES_ANALYST_VIEW" assert ( - data_platform_tables[1].datasource_server - == "ghh48144.snowflakefakecomputing.com" + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:postgres,mics.public.order_date,PROD)" ) assert ( - data_platform_tables[1].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name + data_platform_tables[1].urn + == 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,gsl_test_db.public.sales_analyst_view,PROD)" ) @@ -521,36 +570,33 @@ def test_table_combine(): table: powerbi_data_classes.Table = powerbi_data_classes.Table( columns=[], measures=[], - expression=M_QUERIES[16], # 1st index has the native query + expression=M_QUERIES[16], name="virtual_order_table", full_name="OrderDataSet.virtual_order_table", ) reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 2 - assert data_platform_tables[0].full_name == "GSL_TEST_DB.PUBLIC.SALES_FORECAST" - assert ( - data_platform_tables[0].datasource_server - == "ghh48144.snowflakefakecomputing.com" - ) - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name - ) - assert data_platform_tables[1].full_name == "GSL_TEST_DB.PUBLIC.SALES_ANALYST" assert ( - data_platform_tables[1].datasource_server - == "ghh48144.snowflakefakecomputing.com" + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,gsl_test_db.public.sales_forecast,PROD)" ) + assert ( - data_platform_tables[1].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name + data_platform_tables[1].urn + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,gsl_test_db.public.sales_analyst,PROD)" ) @@ -574,8 +620,14 @@ def test_expression_is_none(): reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 0 @@ -589,15 +641,20 @@ def test_redshift_regular_case(): ) reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter, native_query_enabled=False + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) + assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == table.full_name.split(".")[2] - assert data_platform_tables[0].full_name == table.full_name assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.AMAZON_REDSHIFT.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.category,PROD)" ) @@ -609,13 +666,60 @@ def test_redshift_native_query(): ) reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + + config.native_query_parsing = True + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter, native_query_enabled=True + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) + assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == table.full_name.split(".")[2] - assert data_platform_tables[0].full_name == table.full_name assert ( - 
data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.AMAZON_REDSHIFT.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.category,PROD)" + ) + + +def test_sqlglot_parser(): + table: powerbi_data_classes.Table = powerbi_data_classes.Table( + expression=M_QUERIES[24], + name="SALES_TARGET", + full_name="dev.public.sales", + ) + reporter = PowerBiDashboardSourceReport() + + ctx, config, platform_instance_resolver = get_default_instances( + override_config={ + "server_to_platform_instance": { + "bu10758.ap-unknown-2.fakecomputing.com": { + "platform_instance": "sales_deployment", + "env": "PROD", + } + }, + "native_query_parsing": True, + "enable_advance_lineage_sql_construct": True, + } + ) + + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, + ) + + assert len(data_platform_tables) == 2 + assert ( + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,sales_deployment.operations_analytics.transformed_prod.v_sme_unit,PROD)" + ) + assert ( + data_platform_tables[1].urn + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,sales_deployment.operations_analytics.transformed_prod.v_sme_unit_targets,PROD)" ) diff --git a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_file_without_extension.json b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_file_without_extension.json index d042c3fbb158b..63efc79941d82 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_file_without_extension.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_file_without_extension.json @@ -1,4 +1,20 @@ [ +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/no_extension/small,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:file", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:file,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "file_without_extension.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/no_extension/small,DEV)", diff --git a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_files.json b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_files.json index 8e4fcb80ff855..d59fce788c95e 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_files.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_files.json @@ -1,4 +1,20 @@ [ +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/NPS.7.1.package_data_NPS.6.1_ARCN_Lakes_ChemistryData_v1_csv.csv,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:file", + "instance": 
"urn:li:dataPlatformInstance:(urn:li:dataPlatform:file,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/NPS.7.1.package_data_NPS.6.1_ARCN_Lakes_ChemistryData_v1_csv.csv,DEV)", @@ -2740,6 +2756,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:file", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:file,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro,DEV)", @@ -2750,7 +2782,7 @@ "customProperties": { "schema_inferred_from": "tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro", "number_of_files": "1", - "size_in_bytes": "1024" + "size_in_bytes": "619" }, "name": "chord_progressions_avro.avro", "description": "", @@ -2788,62 +2820,62 @@ }, "fields": [ { - "fieldPath": "[version=2.0].[type=Root].[type=double].Progression Quality", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FirstChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "double", + "nativeDataType": "FirstChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].1st chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FourthChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "FourthChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].2nd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].SecondChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "SecondChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].3rd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].ThirdChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "ThirdChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=string].4th chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=string].ProgressionQuality", + "nullable": false, "type": { "type": { "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "string", + "nativeDataType": "ProgressionQuality", "recursive": false, "isPartOfKey": false } @@ -2907,7 +2939,58 @@ "columnCount": 5, "fieldProfiles": [ { - "fieldPath": "1st chord", + "fieldPath": "FirstChord", + "uniqueCount": 5, + "uniqueProportion": 
0.17857142857142858, + "nullCount": 0, + "nullProportion": 0.0, + "distinctValueFrequencies": [ + { + "value": "1", + "frequency": 19 + }, + { + "value": "2", + "frequency": 3 + }, + { + "value": "4", + "frequency": 2 + }, + { + "value": "5", + "frequency": 1 + }, + { + "value": "6", + "frequency": 3 + } + ], + "sampleValues": [ + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "2", + "4", + "5", + "6", + "6", + "6" + ] + }, + { + "fieldPath": "SecondChord", "uniqueCount": 5, "uniqueProportion": 0.17857142857142858, "nullCount": 0, @@ -2958,7 +3041,7 @@ ] }, { - "fieldPath": "2nd chord", + "fieldPath": "ThirdChord", "uniqueCount": 7, "uniqueProportion": 0.25, "nullCount": 0, @@ -3017,7 +3100,7 @@ ] }, { - "fieldPath": "3rd chord", + "fieldPath": "FourthChord", "uniqueCount": 6, "uniqueProportion": 0.21428571428571427, "nullCount": 0, @@ -3072,7 +3155,7 @@ ] }, { - "fieldPath": "4th chord", + "fieldPath": "ProgressionQuality", "uniqueCount": 20, "uniqueProportion": 0.7142857142857143, "nullCount": 0, @@ -3181,41 +3264,6 @@ "Sweet", "Wistful" ] - }, - { - "fieldPath": "Progression Quality", - "uniqueCount": 1, - "uniqueProportion": 0.03571428571428571, - "nullCount": 0, - "nullProportion": 0.0, - "distinctValueFrequencies": [ - { - "value": "NaN", - "frequency": 28 - } - ], - "sampleValues": [ - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan" - ] } ] } @@ -3277,6 +3325,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_csv.csv,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:file", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:file,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_csv.csv,DEV)", @@ -3852,6 +3916,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/countries_json.json,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:file", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:file,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/countries_json.json,DEV)", @@ -4178,6 +4258,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/food_parquet.parquet,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + 
"platform": "urn:li:dataPlatform:file", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:file,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/food_parquet.parquet,DEV)", @@ -4571,6 +4667,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/small.csv,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:file", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:file,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/small.csv,DEV)", @@ -7590,6 +7702,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/wa_fn_usec_hr_employee_attrition_csv.csv,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:file", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:file,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/NPS.7.1.package_data_NPS.6.1_ARCN_Lakes_ChemistryData_v1_csv.csv,DEV)", diff --git a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_spec_for_files.json b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_spec_for_files.json index 1bd75ae457cb4..ed2c992655a89 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_spec_for_files.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_spec_for_files.json @@ -9,7 +9,7 @@ "customProperties": { "schema_inferred_from": "tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro", "number_of_files": "1", - "size_in_bytes": "1024" + "size_in_bytes": "619" }, "name": "chord_progressions_avro.avro", "description": "", @@ -47,62 +47,62 @@ }, "fields": [ { - "fieldPath": "[version=2.0].[type=Root].[type=double].Progression Quality", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FirstChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "double", + "nativeDataType": "FirstChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].1st chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FourthChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + 
"nativeDataType": "FourthChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].2nd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].SecondChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "SecondChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].3rd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].ThirdChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "ThirdChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=string].4th chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=string].ProgressionQuality", + "nullable": false, "type": { "type": { "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "string", + "nativeDataType": "ProgressionQuality", "recursive": false, "isPartOfKey": false } @@ -1046,7 +1046,58 @@ "columnCount": 5, "fieldProfiles": [ { - "fieldPath": "1st chord", + "fieldPath": "FirstChord", + "uniqueCount": 5, + "uniqueProportion": 0.17857142857142858, + "nullCount": 0, + "nullProportion": 0.0, + "distinctValueFrequencies": [ + { + "value": "1", + "frequency": 19 + }, + { + "value": "2", + "frequency": 3 + }, + { + "value": "4", + "frequency": 2 + }, + { + "value": "5", + "frequency": 1 + }, + { + "value": "6", + "frequency": 3 + } + ], + "sampleValues": [ + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "2", + "4", + "5", + "6", + "6", + "6" + ] + }, + { + "fieldPath": "SecondChord", "uniqueCount": 5, "uniqueProportion": 0.17857142857142858, "nullCount": 0, @@ -1097,7 +1148,7 @@ ] }, { - "fieldPath": "2nd chord", + "fieldPath": "ThirdChord", "uniqueCount": 7, "uniqueProportion": 0.25, "nullCount": 0, @@ -1156,7 +1207,7 @@ ] }, { - "fieldPath": "3rd chord", + "fieldPath": "FourthChord", "uniqueCount": 6, "uniqueProportion": 0.21428571428571427, "nullCount": 0, @@ -1211,7 +1262,7 @@ ] }, { - "fieldPath": "4th chord", + "fieldPath": "ProgressionQuality", "uniqueCount": 20, "uniqueProportion": 0.7142857142857143, "nullCount": 0, @@ -1320,41 +1371,6 @@ "Sweet", "Wistful" ] - }, - { - "fieldPath": "Progression Quality", - "uniqueCount": 1, - "uniqueProportion": 0.03571428571428571, - "nullCount": 0, - "nullProportion": 0.0, - "distinctValueFrequencies": [ - { - "value": "NaN", - "frequency": 28 - } - ], - "sampleValues": [ - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan" - ] } ] } diff --git a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_specs_of_different_buckets.json b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_specs_of_different_buckets.json index b9687b97571cb..f7793140fe033 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_specs_of_different_buckets.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_specs_of_different_buckets.json @@ -9,7 +9,7 @@ "customProperties": { "schema_inferred_from": "tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro", "number_of_files": 
"1", - "size_in_bytes": "1024" + "size_in_bytes": "619" }, "name": "chord_progressions_avro.avro", "description": "", @@ -47,62 +47,62 @@ }, "fields": [ { - "fieldPath": "[version=2.0].[type=Root].[type=double].Progression Quality", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FirstChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "double", + "nativeDataType": "FirstChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].1st chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FourthChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "FourthChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].2nd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].SecondChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "SecondChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].3rd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].ThirdChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "ThirdChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=string].4th chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=string].ProgressionQuality", + "nullable": false, "type": { "type": { "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "string", + "nativeDataType": "ProgressionQuality", "recursive": false, "isPartOfKey": false } @@ -1046,7 +1046,58 @@ "columnCount": 5, "fieldProfiles": [ { - "fieldPath": "1st chord", + "fieldPath": "FirstChord", + "uniqueCount": 5, + "uniqueProportion": 0.17857142857142858, + "nullCount": 0, + "nullProportion": 0.0, + "distinctValueFrequencies": [ + { + "value": "1", + "frequency": 19 + }, + { + "value": "2", + "frequency": 3 + }, + { + "value": "4", + "frequency": 2 + }, + { + "value": "5", + "frequency": 1 + }, + { + "value": "6", + "frequency": 3 + } + ], + "sampleValues": [ + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "2", + "4", + "5", + "6", + "6", + "6" + ] + }, + { + "fieldPath": "SecondChord", "uniqueCount": 5, "uniqueProportion": 0.17857142857142858, "nullCount": 0, @@ -1097,7 +1148,7 @@ ] }, { - "fieldPath": "2nd chord", + "fieldPath": "ThirdChord", "uniqueCount": 7, "uniqueProportion": 0.25, "nullCount": 0, @@ -1156,7 +1207,7 @@ ] }, { - "fieldPath": "3rd chord", + "fieldPath": "FourthChord", "uniqueCount": 6, "uniqueProportion": 0.21428571428571427, "nullCount": 0, @@ -1211,7 +1262,7 @@ ] }, { - "fieldPath": "4th chord", + "fieldPath": "ProgressionQuality", "uniqueCount": 20, "uniqueProportion": 0.7142857142857143, "nullCount": 0, @@ -1320,41 +1371,6 @@ "Sweet", "Wistful" ] - }, - { - "fieldPath": "Progression Quality", - "uniqueCount": 1, - "uniqueProportion": 0.03571428571428571, - "nullCount": 0, - "nullProportion": 0.0, - "distinctValueFrequencies": [ - { - "value": "NaN", - "frequency": 28 - } - ], - "sampleValues": [ - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", 
- "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan" - ] } ] } diff --git a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_single_file.json b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_single_file.json index a5a68777cad5c..f54c62865bcde 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_single_file.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_single_file.json @@ -9,7 +9,7 @@ "customProperties": { "schema_inferred_from": "tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro", "number_of_files": "1", - "size_in_bytes": "1024" + "size_in_bytes": "619" }, "name": "chord_progressions_avro.avro", "description": "", @@ -47,62 +47,62 @@ }, "fields": [ { - "fieldPath": "[version=2.0].[type=Root].[type=double].Progression Quality", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FirstChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "double", + "nativeDataType": "FirstChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].1st chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FourthChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "FourthChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].2nd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].SecondChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "SecondChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].3rd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].ThirdChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "ThirdChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=string].4th chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=string].ProgressionQuality", + "nullable": false, "type": { "type": { "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "string", + "nativeDataType": "ProgressionQuality", "recursive": false, "isPartOfKey": false } @@ -1046,7 +1046,58 @@ "columnCount": 5, "fieldProfiles": [ { - "fieldPath": "1st chord", + "fieldPath": "FirstChord", + "uniqueCount": 5, + "uniqueProportion": 0.17857142857142858, + "nullCount": 0, + "nullProportion": 0.0, + "distinctValueFrequencies": [ + { + "value": "1", + "frequency": 19 + }, + { + "value": "2", + "frequency": 3 + }, + { + "value": "4", + "frequency": 2 + }, + { + "value": "5", + "frequency": 1 + }, + { + "value": "6", + "frequency": 3 + } + ], + "sampleValues": [ + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "2", + "4", + "5", + "6", + "6", + "6" + ] + }, + { + "fieldPath": "SecondChord", "uniqueCount": 5, "uniqueProportion": 0.17857142857142858, "nullCount": 0, @@ -1097,7 +1148,7 @@ ] }, { - "fieldPath": "2nd chord", + "fieldPath": "ThirdChord", "uniqueCount": 7, "uniqueProportion": 0.25, "nullCount": 0, @@ -1156,7 +1207,7 @@ ] }, { - "fieldPath": "3rd 
chord", + "fieldPath": "FourthChord", "uniqueCount": 6, "uniqueProportion": 0.21428571428571427, "nullCount": 0, @@ -1211,7 +1262,7 @@ ] }, { - "fieldPath": "4th chord", + "fieldPath": "ProgressionQuality", "uniqueCount": 20, "uniqueProportion": 0.7142857142857143, "nullCount": 0, @@ -1320,41 +1371,6 @@ "Sweet", "Wistful" ] - }, - { - "fieldPath": "Progression Quality", - "uniqueCount": 1, - "uniqueProportion": 0.03571428571428571, - "nullCount": 0, - "nullProportion": 0.0, - "distinctValueFrequencies": [ - { - "value": "NaN", - "frequency": 28 - } - ], - "sampleValues": [ - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan" - ] } ] } diff --git a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_file_without_extension.json b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_file_without_extension.json index 58b81065c190f..d50f00efacaa0 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_file_without_extension.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_file_without_extension.json @@ -1,4 +1,20 @@ [ +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/no_extension/small,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:s3", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:s3,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "file_without_extension.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/no_extension/small,DEV)", diff --git a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_files.json b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_files.json index 0c1d92ed58e3d..58c225e1ec4c9 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_files.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_files.json @@ -1,4 +1,20 @@ [ +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/NPS.7.1.package_data_NPS.6.1_ARCN_Lakes_ChemistryData_v1_csv.csv,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:s3", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:s3,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/NPS.7.1.package_data_NPS.6.1_ARCN_Lakes_ChemistryData_v1_csv.csv,DEV)", @@ -933,7 +949,7 @@ "customProperties": { "schema_inferred_from": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro", "number_of_files": "1", - "size_in_bytes": "1024" + "size_in_bytes": "619" }, "name": "chord_progressions_avro.avro", "description": "", @@ -945,6 +961,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:s3", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:s3,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro,DEV)", @@ -971,62 +1003,62 @@ }, "fields": [ { - "fieldPath": "[version=2.0].[type=Root].[type=double].Progression Quality", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FirstChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "double", + "nativeDataType": "FirstChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].1st chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FourthChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "FourthChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].2nd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].SecondChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "SecondChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].3rd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].ThirdChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "ThirdChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=string].4th chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=string].ProgressionQuality", + "nullable": false, "type": { "type": { "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "string", + "nativeDataType": "ProgressionQuality", "recursive": false, "isPartOfKey": false } @@ -1110,6 +1142,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/chord_progressions_csv.csv,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:s3", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:s3,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/chord_progressions_csv.csv,DEV)", @@ -1319,6 +1367,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/countries_json.json,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + 
"platform": "urn:li:dataPlatform:s3", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:s3,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/countries_json.json,DEV)", @@ -1482,6 +1546,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/food_parquet.parquet,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:s3", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:s3,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/food_parquet.parquet,DEV)", @@ -1647,6 +1727,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/small.csv,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:s3", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:s3,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/small.csv,DEV)", @@ -2282,6 +2378,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/wa_fn_usec_hr_employee_attrition_csv.csv,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:s3", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:s3,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/wa_fn_usec_hr_employee_attrition_csv.csv,DEV)", diff --git a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_spec_for_files.json b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_spec_for_files.json index 84ace7d673676..9c41bbdc80c49 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_spec_for_files.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_spec_for_files.json @@ -9,7 +9,7 @@ "customProperties": { "schema_inferred_from": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro", "number_of_files": "1", - "size_in_bytes": "1024" + "size_in_bytes": "619" }, "name": "chord_progressions_avro.avro", "description": "", @@ -47,62 +47,62 @@ }, "fields": [ { - "fieldPath": "[version=2.0].[type=Root].[type=double].Progression Quality", - "nullable": true, + "fieldPath": 
"[version=2.0].[type=Record].[type=long].FirstChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "double", + "nativeDataType": "FirstChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].1st chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FourthChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "FourthChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].2nd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].SecondChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "SecondChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].3rd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].ThirdChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "ThirdChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=string].4th chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=string].ProgressionQuality", + "nullable": false, "type": { "type": { "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "string", + "nativeDataType": "ProgressionQuality", "recursive": false, "isPartOfKey": false } diff --git a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_specs_of_different_buckets.json b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_specs_of_different_buckets.json index f7f3cb8fb743e..985140f774ab4 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_specs_of_different_buckets.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_specs_of_different_buckets.json @@ -9,7 +9,7 @@ "customProperties": { "schema_inferred_from": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro", "number_of_files": "1", - "size_in_bytes": "1024" + "size_in_bytes": "619" }, "name": "chord_progressions_avro.avro", "description": "", @@ -47,62 +47,62 @@ }, "fields": [ { - "fieldPath": "[version=2.0].[type=Root].[type=double].Progression Quality", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FirstChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "double", + "nativeDataType": "FirstChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].1st chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FourthChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "FourthChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].2nd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].SecondChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "SecondChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": 
"[version=2.0].[type=Root].[type=long].3rd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].ThirdChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "ThirdChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=string].4th chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=string].ProgressionQuality", + "nullable": false, "type": { "type": { "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "string", + "nativeDataType": "ProgressionQuality", "recursive": false, "isPartOfKey": false } diff --git a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_single_file.json b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_single_file.json index 5353d95ada8f7..5d87d423a6a67 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_single_file.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_single_file.json @@ -9,7 +9,7 @@ "customProperties": { "schema_inferred_from": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro", "number_of_files": "1", - "size_in_bytes": "1024" + "size_in_bytes": "619" }, "name": "chord_progressions_avro.avro", "description": "", @@ -47,62 +47,62 @@ }, "fields": [ { - "fieldPath": "[version=2.0].[type=Root].[type=double].Progression Quality", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FirstChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "double", + "nativeDataType": "FirstChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].1st chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FourthChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "FourthChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].2nd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].SecondChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "SecondChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].3rd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].ThirdChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "ThirdChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=string].4th chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=string].ProgressionQuality", + "nullable": false, "type": { "type": { "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "string", + "nativeDataType": "ProgressionQuality", "recursive": false, "isPartOfKey": false } diff --git a/metadata-ingestion/tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro b/metadata-ingestion/tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro index 8a6d9df66bb79..79c329b3f8dca 100644 Binary files 
a/metadata-ingestion/tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro and b/metadata-ingestion/tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro differ diff --git a/metadata-ingestion/tests/integration/s3/test_s3.py b/metadata-ingestion/tests/integration/s3/test_s3.py index 98ae2eaa393ab..462ca88b7c123 100644 --- a/metadata-ingestion/tests/integration/s3/test_s3.py +++ b/metadata-ingestion/tests/integration/s3/test_s3.py @@ -140,7 +140,7 @@ def test_data_lake_s3_ingest( def test_data_lake_local_ingest( pytestconfig, touch_local_files, source_file, tmp_path, mock_time ): - os.environ["SPARK_VERSION"] = "3.0.3" + os.environ["SPARK_VERSION"] = "3.3.2" test_resources_dir = pytestconfig.rootpath / "tests/integration/s3/" f = open(os.path.join(SOURCE_FILES_PATH, source_file)) source = json.load(f) diff --git a/metadata-ingestion/tests/integration/snowflake/common.py b/metadata-ingestion/tests/integration/snowflake/common.py index b08a14d0805c6..81e307a78ae9e 100644 --- a/metadata-ingestion/tests/integration/snowflake/common.py +++ b/metadata-ingestion/tests/integration/snowflake/common.py @@ -144,7 +144,7 @@ def default_query_results( # noqa: C901 } ] elif query == snowflake_query.SnowflakeQuery.operational_data_for_time_window( - 1654499820000, + 1654473600000, 1654586220000, ): return [ @@ -257,7 +257,7 @@ def default_query_results( # noqa: C901 elif ( query == snowflake_query.SnowflakeQuery.usage_per_object_per_time_bucket_for_time_window( - 1654499820000, + 1654473600000, 1654586220000, use_base_objects=False, top_n_queries=10, @@ -268,11 +268,11 @@ def default_query_results( # noqa: C901 return [] elif query in ( snowflake_query.SnowflakeQuery.table_to_table_lineage_history( - 1654499820000, + 1654473600000, 1654586220000, ), snowflake_query.SnowflakeQuery.table_to_table_lineage_history( - 1654499820000, 1654586220000, False + 1654473600000, 1654586220000, False ), ): return [ @@ -331,7 +331,7 @@ def default_query_results( # noqa: C901 ] elif query in ( snowflake_query.SnowflakeQuery.table_to_table_lineage_history_v2( - start_time_millis=1654499820000, + start_time_millis=1654473600000, end_time_millis=1654586220000, include_view_lineage=True, include_column_lineage=True, @@ -403,7 +403,7 @@ def default_query_results( # noqa: C901 ] elif query in ( snowflake_query.SnowflakeQuery.table_to_table_lineage_history_v2( - start_time_millis=1654499820000, + start_time_millis=1654473600000, end_time_millis=1654586220000, include_view_lineage=False, include_column_lineage=False, @@ -434,11 +434,6 @@ def default_query_results( # noqa: C901 } for op_idx in range(1, num_ops + 1) ] - elif query == snowflake_query.SnowflakeQuery.external_table_lineage_history( - 1654499820000, - 1654586220000, - ): - return [] elif query in [ snowflake_query.SnowflakeQuery.view_dependencies(), ]: @@ -470,11 +465,11 @@ def default_query_results( # noqa: C901 ] elif query in [ snowflake_query.SnowflakeQuery.view_lineage_history( - 1654499820000, + 1654473600000, 1654586220000, ), snowflake_query.SnowflakeQuery.view_lineage_history( - 1654499820000, 1654586220000, False + 1654473600000, 1654586220000, False ), ]: return [ @@ -509,10 +504,6 @@ def default_query_results( # noqa: C901 } ] elif query in [ - snowflake_query.SnowflakeQuery.external_table_lineage_history( - 1654499820000, - 1654586220000, - ), snowflake_query.SnowflakeQuery.view_dependencies_v2(), snowflake_query.SnowflakeQuery.view_dependencies(), 
snowflake_query.SnowflakeQuery.show_external_tables(), diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py index 53b2bcb236cd9..dec50aefd19f0 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py @@ -121,11 +121,10 @@ def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph): include_table_lineage=True, include_view_lineage=True, include_usage_stats=True, - use_legacy_lineage_method=False, validate_upstreams_against_patterns=False, include_operational_stats=True, email_as_user_identifier=True, - start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace( + start_time=datetime(2022, 6, 6, 0, 0, 0, 0).replace( tzinfo=timezone.utc ), end_time=datetime(2022, 6, 7, 7, 17, 0, 0).replace( @@ -213,10 +212,9 @@ def test_snowflake_private_link(pytestconfig, tmp_path, mock_time, mock_datahub_ include_column_lineage=False, include_views=False, include_view_lineage=False, - use_legacy_lineage_method=False, include_usage_stats=False, include_operational_stats=False, - start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace( + start_time=datetime(2022, 6, 6, 0, 0, 0, 0).replace( tzinfo=timezone.utc ), end_time=datetime(2022, 6, 7, 7, 17, 0, 0).replace( diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py index ed3bea49f0179..bba53c1e97a47 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py @@ -55,8 +55,7 @@ def snowflake_pipeline_config(tmp_path): schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), include_view_lineage=False, include_usage_stats=False, - use_legacy_lineage_method=False, - start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace( + start_time=datetime(2022, 6, 6, 0, 0, 0, 0).replace( tzinfo=timezone.utc ), end_time=datetime(2022, 6, 7, 7, 17, 0, 0).replace(tzinfo=timezone.utc), @@ -228,7 +227,7 @@ def test_snowflake_missing_snowflake_lineage_permission_causes_pipeline_failure( default_query_results, [ snowflake_query.SnowflakeQuery.table_to_table_lineage_history_v2( - start_time_millis=1654499820000, + start_time_millis=1654473600000, end_time_millis=1654586220000, include_view_lineage=False, include_column_lineage=True, diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures_legacy_lineage.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures_legacy_lineage.py deleted file mode 100644 index 18779bd564f0d..0000000000000 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures_legacy_lineage.py +++ /dev/null @@ -1,291 +0,0 @@ -from datetime import datetime, timezone -from typing import cast -from unittest import mock - -from freezegun import freeze_time -from pytest import fixture - -from datahub.configuration.common import AllowDenyPattern, DynamicTypedConfig -from datahub.ingestion.run.pipeline import Pipeline -from datahub.ingestion.run.pipeline_config import PipelineConfig, SourceConfig -from datahub.ingestion.source.snowflake import snowflake_query -from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config -from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery -from tests.integration.snowflake.common import ( - FROZEN_TIME, - NUM_TABLES, - 
default_query_results, -) - - -def query_permission_error_override(fn, override_for_query, error_msg): - def my_function(query): - if query in override_for_query: - raise Exception(error_msg) - else: - return fn(query) - - return my_function - - -def query_permission_response_override(fn, override_for_query, response): - def my_function(query): - if query in override_for_query: - return response - else: - return fn(query) - - return my_function - - -@fixture(scope="function") -def snowflake_pipeline_legacy_lineage_config(tmp_path): - output_file = tmp_path / "snowflake_test_events_permission_error.json" - config = PipelineConfig( - source=SourceConfig( - type="snowflake", - config=SnowflakeV2Config( - account_id="ABC12345.ap-south-1.aws", - username="TST_USR", - password="TST_PWD", - role="TEST_ROLE", - warehouse="TEST_WAREHOUSE", - include_technical_schema=True, - match_fully_qualified_names=True, - schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), - include_view_lineage=False, - include_usage_stats=False, - use_legacy_lineage_method=True, - start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace( - tzinfo=timezone.utc - ), - end_time=datetime(2022, 6, 7, 7, 17, 0, 0).replace(tzinfo=timezone.utc), - ), - ), - sink=DynamicTypedConfig(type="file", config={"filename": str(output_file)}), - ) - return config - - -@freeze_time(FROZEN_TIME) -def test_snowflake_missing_role_access_causes_pipeline_failure( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - # Snowflake connection fails role not granted error - mock_connect.side_effect = Exception( - "250001 (08001): Failed to connect to DB: abc12345.ap-south-1.snowflakecomputing.com:443. Role 'TEST_ROLE' specified in the connect string is not granted to this user. Contact your local system administrator, or attempt to login with another role, e.g. 
PUBLIC" - ) - - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - assert "permission-error" in pipeline.source.get_report().failures.keys() - - -@freeze_time(FROZEN_TIME) -def test_snowflake_missing_warehouse_access_causes_pipeline_failure( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Current warehouse query leads to blank result - sf_cursor.execute.side_effect = query_permission_response_override( - default_query_results, - [SnowflakeQuery.current_warehouse()], - [(None,)], - ) - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - assert "permission-error" in pipeline.source.get_report().failures.keys() - - -@freeze_time(FROZEN_TIME) -def test_snowflake_no_databases_with_access_causes_pipeline_failure( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in listing databases - sf_cursor.execute.side_effect = query_permission_error_override( - default_query_results, - [SnowflakeQuery.get_databases("TEST_DB")], - "Database 'TEST_DB' does not exist or not authorized.", - ) - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - assert "permission-error" in pipeline.source.get_report().failures.keys() - - -@freeze_time(FROZEN_TIME) -def test_snowflake_no_tables_causes_pipeline_failure( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in listing databases - no_tables_fn = query_permission_response_override( - default_query_results, - [SnowflakeQuery.tables_for_schema("TEST_SCHEMA", "TEST_DB")], - [], - ) - sf_cursor.execute.side_effect = query_permission_response_override( - no_tables_fn, - [SnowflakeQuery.show_views_for_schema("TEST_SCHEMA", "TEST_DB")], - [], - ) - - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - assert "permission-error" in pipeline.source.get_report().failures.keys() - - -@freeze_time(FROZEN_TIME) -def test_snowflake_list_columns_error_causes_pipeline_warning( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in listing columns - sf_cursor.execute.side_effect = query_permission_error_override( - default_query_results, - [ - SnowflakeQuery.columns_for_table( - "TABLE_{}".format(tbl_idx), "TEST_SCHEMA", "TEST_DB" - ) - for tbl_idx in range(1, NUM_TABLES + 1) - ], - "Database 'TEST_DB' does not exist or not authorized.", - ) - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - pipeline.raise_from_status() # pipeline should not fail - assert ( - "Failed to get columns for table" - in pipeline.source.get_report().warnings.keys() - ) - - 
-@freeze_time(FROZEN_TIME) -def test_snowflake_list_primary_keys_error_causes_pipeline_warning( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in listing keys leads to warning - sf_cursor.execute.side_effect = query_permission_error_override( - default_query_results, - [SnowflakeQuery.show_primary_keys_for_schema("TEST_SCHEMA", "TEST_DB")], - "Insufficient privileges to operate on TEST_DB", - ) - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - pipeline.raise_from_status() # pipeline should not fail - assert ( - "Failed to get primary key for table" - in pipeline.source.get_report().warnings.keys() - ) - - -@freeze_time(FROZEN_TIME) -def test_snowflake_missing_snowflake_lineage_permission_causes_pipeline_failure( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in getting lineage - sf_cursor.execute.side_effect = query_permission_error_override( - default_query_results, - [ - snowflake_query.SnowflakeQuery.table_to_table_lineage_history( - 1654499820000, 1654586220000, True - ), - ], - "Database 'SNOWFLAKE' does not exist or not authorized.", - ) - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - assert ( - "lineage-permission-error" in pipeline.source.get_report().failures.keys() - ) - - -@freeze_time(FROZEN_TIME) -def test_snowflake_missing_snowflake_operations_permission_causes_pipeline_failure( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in getting access history date range - sf_cursor.execute.side_effect = query_permission_error_override( - default_query_results, - [snowflake_query.SnowflakeQuery.get_access_history_date_range()], - "Database 'SNOWFLAKE' does not exist or not authorized.", - ) - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - assert "usage-permission-error" in pipeline.source.get_report().failures.keys() - - -@freeze_time(FROZEN_TIME) -def test_snowflake_unexpected_snowflake_view_lineage_error_causes_pipeline_warning( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in getting view lineage - sf_cursor.execute.side_effect = query_permission_error_override( - default_query_results, - [snowflake_query.SnowflakeQuery.view_dependencies()], - "Unexpected Error", - ) - - snowflake_pipeline_config1 = snowflake_pipeline_legacy_lineage_config.copy() - cast( - SnowflakeV2Config, - cast(PipelineConfig, snowflake_pipeline_config1).source.config, - ).include_view_lineage = True - pipeline = Pipeline(snowflake_pipeline_config1) - pipeline.run() - pipeline.raise_from_status() # pipeline should not 
fail - assert "view-upstream-lineage" in pipeline.source.get_report().warnings.keys() diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_legacy_lineage.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_legacy_lineage.py deleted file mode 100644 index 59da7ddf695d8..0000000000000 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_legacy_lineage.py +++ /dev/null @@ -1,207 +0,0 @@ -import random -from datetime import datetime, timezone -from unittest import mock - -import pandas as pd -import pytest -from freezegun import freeze_time - -from datahub.configuration.common import AllowDenyPattern, DynamicTypedConfig -from datahub.ingestion.glossary.classifier import ( - ClassificationConfig, - DynamicTypedClassifierConfig, -) -from datahub.ingestion.glossary.datahub_classifier import ( - DataHubClassifierConfig, - InfoTypeConfig, - PredictionFactorsAndWeights, - ValuesFactorConfig, -) -from datahub.ingestion.run.pipeline import Pipeline -from datahub.ingestion.run.pipeline_config import PipelineConfig, SourceConfig -from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig -from datahub.ingestion.source.snowflake.snowflake_config import ( - SnowflakeV2Config, - TagOption, -) -from tests.integration.snowflake.common import FROZEN_TIME, default_query_results -from tests.integration.snowflake.test_snowflake import random_cloud_region, random_email -from tests.test_helpers import mce_helpers - - -@pytest.mark.integration -def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph): - test_resources_dir = pytestconfig.rootpath / "tests/integration/snowflake" - - # Run the metadata ingestion pipeline. - output_file = tmp_path / "snowflake_test_events.json" - golden_file = test_resources_dir / "snowflake_golden.json" - - with mock.patch("snowflake.connector.connect") as mock_connect, mock.patch( - "datahub.ingestion.source.snowflake.snowflake_v2.SnowflakeV2Source.get_sample_values_for_table" - ) as mock_sample_values: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - sf_cursor.execute.side_effect = default_query_results - - mock_sample_values.return_value = pd.DataFrame( - data={ - "col_1": [random.randint(1, 80) for i in range(20)], - "col_2": [random_email() for i in range(20)], - "col_3": [random_cloud_region() for i in range(20)], - } - ) - - datahub_classifier_config = DataHubClassifierConfig( - minimum_values_threshold=10, - confidence_level_threshold=0.58, - info_types_config={ - "Age": InfoTypeConfig( - Prediction_Factors_and_Weights=PredictionFactorsAndWeights( - Name=0, Values=1, Description=0, Datatype=0 - ) - ), - "CloudRegion": InfoTypeConfig( - Prediction_Factors_and_Weights=PredictionFactorsAndWeights( - Name=0, - Description=0, - Datatype=0, - Values=1, - ), - Values=ValuesFactorConfig( - prediction_type="regex", - regex=[ - r"(af|ap|ca|eu|me|sa|us)-(central|north|(north(?:east|west))|south|south(?:east|west)|east|west)-\d+" - ], - ), - ), - }, - ) - - pipeline = Pipeline( - config=PipelineConfig( - source=SourceConfig( - type="snowflake", - config=SnowflakeV2Config( - account_id="ABC12345.ap-south-1.aws", - username="TST_USR", - password="TST_PWD", - match_fully_qualified_names=True, - schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), - include_technical_schema=True, - include_table_lineage=True, - include_view_lineage=True, - include_usage_stats=True, - 
use_legacy_lineage_method=True, - validate_upstreams_against_patterns=False, - include_operational_stats=True, - start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace( - tzinfo=timezone.utc - ), - end_time=datetime(2022, 6, 7, 7, 17, 0, 0).replace( - tzinfo=timezone.utc - ), - classification=ClassificationConfig( - enabled=True, - classifiers=[ - DynamicTypedClassifierConfig( - type="datahub", config=datahub_classifier_config - ) - ], - ), - profiling=GEProfilingConfig( - enabled=True, - profile_if_updated_since_days=None, - profile_table_row_limit=None, - profile_table_size_limit=None, - profile_table_level_only=True, - ), - extract_tags=TagOption.without_lineage, - ), - ), - sink=DynamicTypedConfig( - type="file", config={"filename": str(output_file)} - ), - ) - ) - pipeline.run() - pipeline.pretty_print_summary() - pipeline.raise_from_status() - - # Verify the output. - - mce_helpers.check_golden_file( - pytestconfig, - output_path=output_file, - golden_path=golden_file, - ignore_paths=[ - r"root\[\d+\]\['aspect'\]\['json'\]\['timestampMillis'\]", - r"root\[\d+\]\['aspect'\]\['json'\]\['created'\]", - r"root\[\d+\]\['aspect'\]\['json'\]\['lastModified'\]", - r"root\[\d+\]\['aspect'\]\['json'\]\['fields'\]\[\d+\]\['glossaryTerms'\]\['auditStamp'\]\['time'\]", - r"root\[\d+\]\['systemMetadata'\]", - ], - ) - - -@freeze_time(FROZEN_TIME) -@pytest.mark.integration -def test_snowflake_private_link(pytestconfig, tmp_path, mock_time, mock_datahub_graph): - test_resources_dir = pytestconfig.rootpath / "tests/integration/snowflake" - - # Run the metadata ingestion pipeline. - output_file = tmp_path / "snowflake_privatelink_test_events.json" - golden_file = test_resources_dir / "snowflake_privatelink_golden.json" - - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - sf_cursor.execute.side_effect = default_query_results - - pipeline = Pipeline( - config=PipelineConfig( - source=SourceConfig( - type="snowflake", - config=SnowflakeV2Config( - account_id="ABC12345.ap-south-1.privatelink", - username="TST_USR", - password="TST_PWD", - schema_pattern=AllowDenyPattern(allow=["test_schema"]), - include_technical_schema=True, - include_table_lineage=True, - include_column_lineage=False, - include_views=False, - include_view_lineage=False, - use_legacy_lineage_method=True, - include_usage_stats=False, - include_operational_stats=False, - start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace( - tzinfo=timezone.utc - ), - end_time=datetime(2022, 6, 7, 7, 17, 0, 0).replace( - tzinfo=timezone.utc - ), - ), - ), - sink=DynamicTypedConfig( - type="file", config={"filename": str(output_file)} - ), - ) - ) - pipeline.run() - pipeline.pretty_print_summary() - pipeline.raise_from_status() - - # Verify the output. 
- - mce_helpers.check_golden_file( - pytestconfig, - output_path=output_file, - golden_path=golden_file, - ignore_paths=[], - ) diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_stateful.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_stateful.py new file mode 100644 index 0000000000000..f72bd5b72d2cd --- /dev/null +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_stateful.py @@ -0,0 +1,119 @@ +from unittest import mock + +from freezegun import freeze_time + +from datahub.configuration.common import AllowDenyPattern, DynamicTypedConfig +from datahub.ingestion.run.pipeline import Pipeline +from datahub.ingestion.run.pipeline_config import PipelineConfig, SourceConfig +from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config +from datahub.ingestion.source.state.stale_entity_removal_handler import ( + StatefulStaleMetadataRemovalConfig, +) +from tests.integration.snowflake.common import FROZEN_TIME, default_query_results +from tests.test_helpers.state_helpers import ( + get_current_checkpoint_from_pipeline, + validate_all_providers_have_committed_successfully, +) + +GMS_PORT = 8080 +GMS_SERVER = f"http://localhost:{GMS_PORT}" + + +def stateful_pipeline_config(include_tables: bool) -> PipelineConfig: + return PipelineConfig( + pipeline_name="test_snowflake", + source=SourceConfig( + type="snowflake", + config=SnowflakeV2Config( + account_id="ABC12345.ap-south-1.aws", + username="TST_USR", + password="TST_PWD", + match_fully_qualified_names=True, + schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), + include_tables=include_tables, + stateful_ingestion=StatefulStaleMetadataRemovalConfig.parse_obj( + { + "enabled": True, + "remove_stale_metadata": True, + "fail_safe_threshold": 100.0, + "state_provider": { + "type": "datahub", + "config": {"datahub_api": {"server": GMS_SERVER}}, + }, + } + ), + ), + ), + sink=DynamicTypedConfig(type="blackhole"), + ) + + +@freeze_time(FROZEN_TIME) +def test_tableau_stateful(mock_datahub_graph): + with mock.patch( + "datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph", + mock_datahub_graph, + ) as mock_checkpoint, mock.patch("snowflake.connector.connect") as mock_connect: + sf_connection = mock.MagicMock() + sf_cursor = mock.MagicMock() + mock_connect.return_value = sf_connection + sf_connection.cursor.return_value = sf_cursor + + sf_cursor.execute.side_effect = default_query_results + mock_checkpoint.return_value = mock_datahub_graph + pipeline_run1 = Pipeline(config=stateful_pipeline_config(True)) + pipeline_run1.run() + pipeline_run1.raise_from_status() + checkpoint1 = get_current_checkpoint_from_pipeline(pipeline_run1) + + assert checkpoint1 + assert checkpoint1.state + + with mock.patch( + "datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph", + mock_datahub_graph, + ) as mock_checkpoint, mock.patch("snowflake.connector.connect") as mock_connect: + sf_connection = mock.MagicMock() + sf_cursor = mock.MagicMock() + mock_connect.return_value = sf_connection + sf_connection.cursor.return_value = sf_cursor + + sf_cursor.execute.side_effect = default_query_results + + mock_checkpoint.return_value = mock_datahub_graph + pipeline_run2 = Pipeline(config=stateful_pipeline_config(False)) + pipeline_run2.run() + pipeline_run2.raise_from_status() + checkpoint2 = get_current_checkpoint_from_pipeline(pipeline_run2) + + assert checkpoint2 + assert checkpoint2.state + + # Validate that all 
providers have committed successfully. + validate_all_providers_have_committed_successfully( + pipeline=pipeline_run1, expected_providers=1 + ) + validate_all_providers_have_committed_successfully( + pipeline=pipeline_run2, expected_providers=1 + ) + + # Perform all assertions on the states. The deleted table should not be + # part of the second state + state1 = checkpoint1.state + state2 = checkpoint2.state + + difference_dataset_urns = list( + state1.get_urns_not_in(type="dataset", other_checkpoint_state=state2) + ) + assert sorted(difference_dataset_urns) == [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_10,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_3,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_4,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_5,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_6,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_7,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_8,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_9,PROD)", + ] diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json index be4ae9e047aea..67a563baa561c 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json @@ -66,6 +66,70 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.Weekly Demo Data Backup" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "job_id": "1df94c0f-15fd-4b68-8ca3-6053a0332362", + "job_name": "Weekly Demo Data Backup", + "description": "No description available.", + "date_created": "2023-03-10 16:27:54.970000", + "date_modified": "2023-03-10 16:27:55.097000", + "step_id": "1", + "step_name": "Set database to read only", + "subsystem": "TSQL", + "command": "ALTER DATABASE DemoData SET READ_ONLY" + }, + "externalUrl": "", + "name": "localhost.Weekly Demo Data Backup.localhost.Weekly Demo Data Backup", + "type": { + "string": "MSSQL_JOB_STEP" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, + 
"systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5", @@ -1740,6 +1804,68 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.demodata.Foo.stored_procedures" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "procedure_depends_on": "{}", + "depending_on_procedure": "{}", + "code": "CREATE PROCEDURE Foo.DBs @ID INT\nAS\n SELECT @ID AS ThatDB;\n", + "input parameters": "['@ID']", + "parameter @ID": "{'type': 'int'}", + "date_created": "2023-03-10 16:27:54.907000", + "date_modified": "2023-03-10 16:27:54.907000" + }, + "externalUrl": "", + "name": "demodata.Foo.DBs", + "type": { + "string": "MSSQL_STORED_PROCEDURE" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.SalesReason,PROD)", @@ -3985,6 +4111,66 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:c6627af82d44de89492e1a9315ae9f4b", diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json 
b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json index bc81ce9633432..ef6033dd91943 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json @@ -66,6 +66,70 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.Weekly Demo Data Backup" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "job_id": "1df94c0f-15fd-4b68-8ca3-6053a0332362", + "job_name": "Weekly Demo Data Backup", + "description": "No description available.", + "date_created": "2023-03-10 16:27:54.970000", + "date_modified": "2023-03-10 16:27:55.097000", + "step_id": "1", + "step_name": "Set database to read only", + "subsystem": "TSQL", + "command": "ALTER DATABASE DemoData SET READ_ONLY" + }, + "externalUrl": "", + "name": "localhost.Weekly Demo Data Backup.localhost.Weekly Demo Data Backup", + "type": { + "string": "MSSQL_JOB_STEP" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5", @@ -1740,6 +1804,68 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.demodata.Foo.stored_procedures" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "procedure_depends_on": "{}", + "depending_on_procedure": "{}", + "code": "CREATE PROCEDURE Foo.DBs @ID INT\nAS\n SELECT @ID AS ThatDB;\n", + "input parameters": "['@ID']", + "parameter @ID": "{'type': 'int'}", + "date_created": "2023-03-10 16:27:54.907000", + "date_modified": "2023-03-10 16:27:54.907000" + }, + "externalUrl": "", + "name": "demodata.Foo.DBs", + "type": { + "string": "MSSQL_STORED_PROCEDURE" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + 
"changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.SalesReason,PROD)", @@ -2053,6 +2179,66 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:3f157d8292fb473142f19e2250af537f", diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json index 8be2fe134dca1..8098accebb424 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json @@ -66,6 +66,70 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.Weekly Demo Data Backup" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "job_id": "1df94c0f-15fd-4b68-8ca3-6053a0332362", + "job_name": "Weekly Demo Data Backup", + "description": "No description available.", + "date_created": "2023-03-10 16:27:54.970000", + "date_modified": "2023-03-10 16:27:55.097000", + "step_id": "1", + "step_name": "Set database to read only", + "subsystem": "TSQL", + "command": "ALTER DATABASE DemoData SET READ_ONLY" + }, + "externalUrl": "", + "name": "localhost.Weekly Demo Data Backup.localhost.Weekly Demo Data Backup", + "type": { + "string": "MSSQL_JOB_STEP" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + 
"runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5", @@ -1740,6 +1804,68 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.demodata.Foo.stored_procedures" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "procedure_depends_on": "{}", + "depending_on_procedure": "{}", + "code": "CREATE PROCEDURE Foo.DBs @ID INT\nAS\n SELECT @ID AS ThatDB;\n", + "input parameters": "['@ID']", + "parameter @ID": "{'type': 'int'}", + "date_created": "2023-03-10 16:27:54.907000", + "date_modified": "2023-03-10 16:27:54.907000" + }, + "externalUrl": "", + "name": "demodata.Foo.DBs", + "type": { + "string": "MSSQL_STORED_PROCEDURE" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoDataAlias.Foo.SalesReason,PROD)", @@ -2053,6 +2179,66 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { 
+ "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:3f157d8292fb473142f19e2250af537f", diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json index ba2ab7330fded..d32002fb5648c 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json @@ -81,6 +81,70 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.Weekly Demo Data Backup" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "job_id": "b6a0c1e2-f90a-4c86-a226-bf7ca59ad79f", + "job_name": "Weekly Demo Data Backup", + "description": "No description available.", + "date_created": "2023-08-06 21:01:05.157000", + "date_modified": "2023-08-06 21:01:05.283000", + "step_id": "1", + "step_name": "Set database to read only", + "subsystem": "TSQL", + "command": "ALTER DATABASE DemoData SET READ_ONLY" + }, + "externalUrl": "", + "name": "localhost.Weekly Demo Data Backup.localhost.Weekly Demo Data Backup", + "type": { + "string": "MSSQL_JOB_STEP" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:f1b4c0e379c4b2e2e09a8ecd6c1b6dec", @@ -1764,6 +1828,68 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.demodata.Foo.stored_procedures" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "procedure_depends_on": "{}", + "depending_on_procedure": "{}", + "code": "CREATE PROCEDURE Foo.DBs @ID INT\nAS\n SELECT @ID AS ThatDB;\n", + "input parameters": "['@ID']", + "parameter @ID": "{'type': 'int'}", + "date_created": "2023-08-06 21:01:05.093000", + "date_modified": "2023-08-06 21:01:05.093000" + }, + "externalUrl": "", + "name": "demodata.Foo.DBs", + "type": { + 
"string": "MSSQL_STORED_PROCEDURE" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:a6bea84fba7b05fb5d12630c8e6306ac", @@ -2072,5 +2198,65 @@ "lastObserved": 1615443388097, "runId": "mssql-test" } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/sql_server/setup/setup.sql b/metadata-ingestion/tests/integration/sql_server/setup/setup.sql index 612de3eb1583c..2ff46e249007a 100644 --- a/metadata-ingestion/tests/integration/sql_server/setup/setup.sql +++ b/metadata-ingestion/tests/integration/sql_server/setup/setup.sql @@ -44,6 +44,10 @@ CREATE TABLE Foo.SalesReason ) ; GO +CREATE PROCEDURE Foo.DBs @ID INT +AS + SELECT @ID AS ThatDB; +GO GO EXEC sys.sp_addextendedproperty @@ -59,5 +63,31 @@ EXEC sys.sp_addextendedproperty @value = N'Description for column LastName of table Persons of schema Foo.', @level0type = N'SCHEMA', @level0name = 'Foo', @level1type = N'TABLE', @level1name = 'Persons', -@level2type = N'COLUMN',@level2name = 'LastName'; -GO \ No newline at end of file +@level2type = N'COLUMN',@level2name = 'LastName'; +GO +USE msdb ; +GO +EXEC dbo.sp_add_job + @job_name = N'Weekly Demo Data Backup' ; +GO +EXEC sp_add_jobstep + @job_name = N'Weekly Demo Data Backup', + @step_name = N'Set database to read only', + @database_name = N'DemoData', + @subsystem = N'TSQL', + @command = N'ALTER DATABASE DemoData SET READ_ONLY', + @retry_attempts = 5, + @retry_interval = 5 ; +GO +EXEC dbo.sp_add_schedule + @schedule_name = N'RunOnce', + @freq_type = 1, + @active_start_time = 233000 ; +GO +EXEC sp_attach_schedule + @job_name = N'Weekly Demo Data Backup', + @schedule_name = N'RunOnce'; +GO +EXEC dbo.sp_add_jobserver + @job_name = N'Weekly Demo Data Backup' +GO diff --git 
a/metadata-ingestion/tests/integration/sql_server/test_sql_server.py b/metadata-ingestion/tests/integration/sql_server/test_sql_server.py index 3e7b75edd4878..099690fed34c2 100644 --- a/metadata-ingestion/tests/integration/sql_server/test_sql_server.py +++ b/metadata-ingestion/tests/integration/sql_server/test_sql_server.py @@ -50,4 +50,9 @@ def test_mssql_ingest(mssql_runner, pytestconfig, tmp_path, mock_time, config_fi output_path=tmp_path / "mssql_mces.json", golden_path=test_resources_dir / f"golden_files/golden_mces_{config_file.replace('yml','json')}", + ignore_paths=[ + r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['job_id'\]", + r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['date_created'\]", + r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['date_modified'\]", + ], ) diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index d04c8d905b439..71428a7847953 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -791,11 +791,9 @@ def test_tableau_unsupported_csql(mock_datahub_graph): database_override_map={"production database": "prod"} ) - with mock.patch( - "datahub.ingestion.source.tableau.sqlglot_lineage" - ) as sqlglot_lineage: + with mock.patch("datahub.ingestion.source.tableau.sqlglot_l") as sqlglot_lineage: - sqlglot_lineage.return_value = SqlParsingResult( # type:ignore + sqlglot_lineage.create_lineage_sql_parsed_result.return_value = SqlParsingResult( # type:ignore in_tables=[ "urn:li:dataset:(urn:li:dataPlatform:bigquery,my_bigquery_project.invent_dw.userdetail,PROD)" ], diff --git a/metadata-ingestion/tests/integration/vertica/docker-compose.yml b/metadata-ingestion/tests/integration/vertica/docker-compose.yml index ddaf206f236cf..84af5c32a60e3 100644 --- a/metadata-ingestion/tests/integration/vertica/docker-compose.yml +++ b/metadata-ingestion/tests/integration/vertica/docker-compose.yml @@ -1,6 +1,7 @@ version: "3.9" services: vertica: + platform: linux/amd64 environment: APP_DB_USER: "dbadmin" APP_DB_PASSWORD: "abc123" @@ -18,6 +19,3 @@ services: volumes: vertica-data: - - - diff --git a/metadata-ingestion/tests/integration/vertica/test_vertica.py b/metadata-ingestion/tests/integration/vertica/test_vertica.py index db8bfd247313b..fe306d1d0b2b8 100644 --- a/metadata-ingestion/tests/integration/vertica/test_vertica.py +++ b/metadata-ingestion/tests/integration/vertica/test_vertica.py @@ -58,6 +58,7 @@ def vertica_runner(docker_compose_runner, test_resources_dir): # Test needs more work to be done , currently it is working fine. 
@freeze_time(FROZEN_TIME) +@pytest.mark.skip("Failing in CI, cmd failing with exit code 1") @pytest.mark.integration def test_vertica_ingest_with_db(vertica_runner, pytestconfig, tmp_path): test_resources_dir = pytestconfig.rootpath / "tests/integration/vertica" diff --git a/metadata-ingestion/tests/unit/config/basic.yml b/metadata-ingestion/tests/unit/config/basic.yml index cc5372a05d84a..ce9f3b3f8cf94 100644 --- a/metadata-ingestion/tests/unit/config/basic.yml +++ b/metadata-ingestion/tests/unit/config/basic.yml @@ -5,3 +5,7 @@ nested: array: - one - two + numbers: + 4: "four" + 6: "six" + "8": "eight" diff --git a/metadata-ingestion/tests/unit/config/test_config_loader.py b/metadata-ingestion/tests/unit/config/test_config_loader.py index e29aa3b0b582c..3253c96b876aa 100644 --- a/metadata-ingestion/tests/unit/config/test_config_loader.py +++ b/metadata-ingestion/tests/unit/config/test_config_loader.py @@ -1,6 +1,9 @@ import os +import pathlib +import textwrap from unittest import mock +import deepdiff import expandvars import pytest import yaml @@ -18,7 +21,14 @@ ( # Basic YAML load "tests/unit/config/basic.yml", - {"foo": "bar", "nested": {"array": ["one", "two"], "hi": "hello"}}, + { + "foo": "bar", + "nested": { + "array": ["one", "two"], + "hi": "hello", + "numbers": {4: "four", 6: "six", "8": "eight"}, + }, + }, {}, set(), ), @@ -165,3 +175,46 @@ def test_load_error(pytestconfig, filename, env, error_type): with mock.patch.dict(os.environ, env): with pytest.raises(error_type): _ = load_config_file(filepath) + + +def test_write_file_directive(pytestconfig): + filepath = pytestconfig.rootpath / "tests/unit/config/write_to_file_directive.yml" + + fake_ssl_key = "my-secret-key-value" + + with mock.patch.dict(os.environ, {"DATAHUB_SSL_KEY": fake_ssl_key}): + loaded_config = load_config_file(filepath, squirrel_original_config=False) + + # Check that the rest of the dict is unmodified. + diff = deepdiff.DeepDiff( + loaded_config, + { + "foo": "bar", + "nested": { + "hi": "hello", + "another-key": "final-value", + }, + }, + exclude_paths=[ + "root['nested']['ssl_cert']", + "root['nested']['ssl_key']", + ], + ) + assert not diff + + # Check that the ssl_cert was written to a file. + ssl_cert_path = loaded_config["nested"]["ssl_cert"] + assert ( + pathlib.Path(ssl_cert_path).read_text() + == textwrap.dedent( + """ + -----BEGIN CERTIFICATE----- + thisisnotarealcert + -----END CERTIFICATE----- + """ + ).lstrip() + ) + + # Check that the ssl_key was written to a file. 
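+    # As with ssl_cert above, the __DATAHUB_TO_FILE_<key> directive in write_to_file_directive.yml
+    # appears to make the config loader write the value out to a temporary file and replace <key>
+    # with that file's path, which is what these path-based assertions check.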
+ ssl_key_path = loaded_config["nested"]["ssl_key"] + assert pathlib.Path(ssl_key_path).read_text() == fake_ssl_key diff --git a/metadata-ingestion/tests/unit/config/write_to_file_directive.yml b/metadata-ingestion/tests/unit/config/write_to_file_directive.yml new file mode 100644 index 0000000000000..e47f192096309 --- /dev/null +++ b/metadata-ingestion/tests/unit/config/write_to_file_directive.yml @@ -0,0 +1,11 @@ +foo: bar +nested: + hi: hello + __DATAHUB_TO_FILE_ssl_cert: | + -----BEGIN CERTIFICATE----- + thisisnotarealcert + -----END CERTIFICATE----- + + __DATAHUB_TO_FILE_ssl_key: ${DATAHUB_SSL_KEY} + + another-key: final-value diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_expand_select_star_basic.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_expand_select_star_basic.json index e456e4450c50a..e241bdd08e243 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_expand_select_star_basic.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_expand_select_star_basic.json @@ -8,7 +8,7 @@ { "downstream": { "table": null, - "column": "TOTAL_AGG" + "column": "total_agg" }, "upstreams": [ { @@ -20,7 +20,7 @@ { "downstream": { "table": null, - "column": "ORDERKEY" + "column": "orderkey" }, "upstreams": [ { @@ -32,7 +32,7 @@ { "downstream": { "table": null, - "column": "CUSTKEY" + "column": "custkey" }, "upstreams": [ { @@ -44,7 +44,7 @@ { "downstream": { "table": null, - "column": "ORDERSTATUS" + "column": "orderstatus" }, "upstreams": [ { @@ -56,7 +56,7 @@ { "downstream": { "table": null, - "column": "TOTALPRICE" + "column": "totalprice" }, "upstreams": [ { @@ -68,7 +68,7 @@ { "downstream": { "table": null, - "column": "ORDERDATE" + "column": "orderdate" }, "upstreams": [ { @@ -80,7 +80,7 @@ { "downstream": { "table": null, - "column": "ORDERPRIORITY" + "column": "orderpriority" }, "upstreams": [ { @@ -92,7 +92,7 @@ { "downstream": { "table": null, - "column": "CLERK" + "column": "clerk" }, "upstreams": [ { @@ -104,7 +104,7 @@ { "downstream": { "table": null, - "column": "SHIPPRIORITY" + "column": "shippriority" }, "upstreams": [ { @@ -116,7 +116,7 @@ { "downstream": { "table": null, - "column": "COMMENT" + "column": "comment" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_union.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_union.json index 8e1fd453ce09d..2340b2e95b0d0 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_union.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_union.json @@ -9,14 +9,14 @@ { "downstream": { "table": null, - "column": "LABEL" + "column": "label" }, "upstreams": [] }, { "downstream": { "table": null, - "column": "TOTAL_AGG" + "column": "total_agg" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_case_statement.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_case_statement.json index 7d1a4f2039b10..64cd80e9a2d69 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_case_statement.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_case_statement.json @@ -8,7 +8,7 @@ { "downstream": { "table": null, - "column": "TOTAL_PRICE_CATEGORY" + "column": "total_price_category" }, "upstreams": [ { @@ -20,7 +20,7 @@ { "downstream": { "table": null, - "column": "TOTAL_PRICE_SUCCESS" + "column": "total_price_success" }, "upstreams": [ { diff --git 
a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_column_normalization.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_column_normalization.json index 694bec3800dbf..7b22a46757e39 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_column_normalization.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_column_normalization.json @@ -8,7 +8,7 @@ { "downstream": { "table": null, - "column": "TOTAL_AGG" + "column": "total_agg" }, "upstreams": [ { @@ -20,7 +20,7 @@ { "downstream": { "table": null, - "column": "TOTAL_AVG" + "column": "total_avg" }, "upstreams": [ { @@ -32,7 +32,7 @@ { "downstream": { "table": null, - "column": "TOTAL_MIN" + "column": "total_min" }, "upstreams": [ { @@ -44,7 +44,7 @@ { "downstream": { "table": null, - "column": "TOTAL_MAX" + "column": "total_max" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_ctas_column_normalization.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_ctas_column_normalization.json new file mode 100644 index 0000000000000..c912d99a3a8a3 --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_ctas_column_normalization.json @@ -0,0 +1,59 @@ +{ + "query_type": "CREATE", + "in_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)" + ], + "out_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders_normalized,PROD)" + ], + "column_lineage": [ + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders_normalized,PROD)", + "column": "Total_Agg" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)", + "column": "TotalPrice" + } + ] + }, + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders_normalized,PROD)", + "column": "total_avg" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)", + "column": "TotalPrice" + } + ] + }, + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders_normalized,PROD)", + "column": "TOTAL_MIN" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)", + "column": "TotalPrice" + } + ] + }, + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders_normalized,PROD)", + "column": "total_max" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)", + "column": "TotalPrice" + } + ] + } + ] +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_default_normalization.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_default_normalization.json index 157745854128f..2af308ec60623 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_default_normalization.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_default_normalization.json @@ -11,7 +11,7 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.active_customer_ltv,PROD)", - "column": "USER_FK" + 
"column": "user_fk" }, "upstreams": [ { @@ -23,7 +23,7 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.active_customer_ltv,PROD)", - "column": "EMAIL" + "column": "email" }, "upstreams": [ { @@ -35,7 +35,7 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.active_customer_ltv,PROD)", - "column": "LAST_PURCHASE_DATE" + "column": "last_purchase_date" }, "upstreams": [ { @@ -47,7 +47,7 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.active_customer_ltv,PROD)", - "column": "LIFETIME_PURCHASE_AMOUNT" + "column": "lifetime_purchase_amount" }, "upstreams": [ { @@ -59,7 +59,7 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.active_customer_ltv,PROD)", - "column": "LIFETIME_PURCHASE_COUNT" + "column": "lifetime_purchase_count" }, "upstreams": [ { @@ -71,7 +71,7 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.active_customer_ltv,PROD)", - "column": "AVERAGE_PURCHASE_AMOUNT" + "column": "average_purchase_amount" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py index 5a294be150fa0..7581d3bac010e 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py @@ -287,6 +287,40 @@ def test_snowflake_column_normalization(): ) +def test_snowflake_ctas_column_normalization(): + # For CTAS statements, we also should try to match the output table's + # column name casing. This is technically incorrect since we have the + # exact column names from the query, but necessary to match our column + # name normalization behavior in the Snowflake source. + + assert_sql_result( + """ +CREATE TABLE snowflake_sample_data.tpch_sf1.orders_normalized +AS +SELECT + SUM(o."totalprice") as Total_Agg, + AVG("TotalPrice") as TOTAL_AVG, + MIN("TOTALPRICE") as TOTAL_MIN, + MAX(TotalPrice) as Total_Max +FROM snowflake_sample_data.tpch_sf1.orders o +""", + dialect="snowflake", + schemas={ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)": { + "orderkey": "NUMBER", + "TotalPrice": "FLOAT", + }, + "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders_normalized,PROD)": { + "Total_Agg": "FLOAT", + "total_avg": "FLOAT", + "TOTAL_MIN": "FLOAT", + # Purposely excluding total_max to test out the fallback behavior. 
+ }, + }, + expected_file=RESOURCE_DIR / "test_snowflake_ctas_column_normalization.json", + ) + + def test_snowflake_case_statement(): assert_sql_result( """ diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/provider/test_datahub_ingestion_checkpointing_provider.py b/metadata-ingestion/tests/unit/stateful_ingestion/provider/test_datahub_ingestion_checkpointing_provider.py index 65a348026e852..600985266043b 100644 --- a/metadata-ingestion/tests/unit/stateful_ingestion/provider/test_datahub_ingestion_checkpointing_provider.py +++ b/metadata-ingestion/tests/unit/stateful_ingestion/provider/test_datahub_ingestion_checkpointing_provider.py @@ -15,7 +15,9 @@ from datahub.ingestion.source.state.sql_common_state import ( BaseSQLAlchemyCheckpointState, ) -from datahub.ingestion.source.state.usage_common_state import BaseUsageCheckpointState +from datahub.ingestion.source.state.usage_common_state import ( + BaseTimeWindowCheckpointState, +) from datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider import ( DatahubIngestionCheckpointingProvider, ) @@ -113,8 +115,8 @@ def test_provider(self): run_id=self.run_id, state=job1_state_obj, ) - # Job2 - Checkpoint with a BaseUsageCheckpointState state - job2_state_obj = BaseUsageCheckpointState( + # Job2 - Checkpoint with a BaseTimeWindowCheckpointState state + job2_state_obj = BaseTimeWindowCheckpointState( begin_timestamp_millis=10, end_timestamp_millis=100 ) job2_checkpoint = Checkpoint( diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_checkpoint.py b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_checkpoint.py index c691711890aff..712ae2066b728 100644 --- a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_checkpoint.py +++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_checkpoint.py @@ -1,4 +1,4 @@ -from datetime import datetime +from datetime import datetime, timezone from typing import Dict, List import pydantic @@ -9,7 +9,9 @@ from datahub.ingestion.source.state.sql_common_state import ( BaseSQLAlchemyCheckpointState, ) -from datahub.ingestion.source.state.usage_common_state import BaseUsageCheckpointState +from datahub.ingestion.source.state.usage_common_state import ( + BaseTimeWindowCheckpointState, +) from datahub.metadata.schema_classes import ( DatahubIngestionCheckpointClass, IngestionCheckpointStateClass, @@ -27,7 +29,7 @@ def _assert_checkpoint_deserialization( ) -> Checkpoint: # Serialize a checkpoint aspect with the previous state. checkpoint_aspect = DatahubIngestionCheckpointClass( - timestampMillis=int(datetime.utcnow().timestamp() * 1000), + timestampMillis=int(datetime.now(tz=timezone.utc).timestamp() * 1000), pipelineName=test_pipeline_name, platformInstanceId="this-can-be-anything-and-will-be-ignored", config="this-is-also-ignored", @@ -67,8 +69,8 @@ def _make_sql_alchemy_checkpoint_state() -> BaseSQLAlchemyCheckpointState: return base_sql_alchemy_checkpoint_state_obj -def _make_usage_checkpoint_state() -> BaseUsageCheckpointState: - base_usage_checkpoint_state_obj = BaseUsageCheckpointState( +def _make_usage_checkpoint_state() -> BaseTimeWindowCheckpointState: + base_usage_checkpoint_state_obj = BaseTimeWindowCheckpointState( version="2.0", begin_timestamp_millis=1, end_timestamp_millis=100 ) return base_usage_checkpoint_state_obj @@ -77,8 +79,8 @@ def _make_usage_checkpoint_state() -> BaseUsageCheckpointState: _checkpoint_aspect_test_cases: Dict[str, CheckpointStateBase] = { # An instance of BaseSQLAlchemyCheckpointState. 
"BaseSQLAlchemyCheckpointState": _make_sql_alchemy_checkpoint_state(), - # An instance of BaseUsageCheckpointState. - "BaseUsageCheckpointState": _make_usage_checkpoint_state(), + # An instance of BaseTimeWindowCheckpointState. + "BaseTimeWindowCheckpointState": _make_usage_checkpoint_state(), } @@ -141,7 +143,7 @@ def test_supported_encodings(): """ Tests utf-8 and base85-bz2-json encodings """ - test_state = BaseUsageCheckpointState( + test_state = BaseTimeWindowCheckpointState( version="1.0", begin_timestamp_millis=1, end_timestamp_millis=100 ) diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py new file mode 100644 index 0000000000000..0400bd6a72aa5 --- /dev/null +++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py @@ -0,0 +1,273 @@ +from datetime import datetime, timezone +from unittest import mock + +import pytest + +from datahub.configuration.time_window_config import BucketDuration, get_time_bucket +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.graph.client import DataHubGraph +from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config +from datahub.ingestion.source.snowflake.snowflake_v2 import SnowflakeV2Source +from datahub.ingestion.source.state.stale_entity_removal_handler import ( + StatefulStaleMetadataRemovalConfig, +) +from datahub.ingestion.source.state.stateful_ingestion_base import ( + DynamicTypedStateProviderConfig, +) +from datahub.ingestion.source.state.usage_common_state import ( + BaseTimeWindowCheckpointState, +) +from datahub.utilities.time import datetime_to_ts_millis + +GMS_PORT = 8080 +GMS_SERVER = f"http://localhost:{GMS_PORT}" + + +@pytest.fixture +def stateful_source(mock_datahub_graph: DataHubGraph) -> SnowflakeV2Source: + pipeline_name = "test_redundant_run_lineage" + run_id = "test_redundant_run" + ctx = PipelineContext( + pipeline_name=pipeline_name, + run_id=run_id, + graph=mock_datahub_graph, + ) + config = SnowflakeV2Config( + account_id="ABC12345.ap-south-1", + username="TST_USR", + password="TST_PWD", + stateful_ingestion=StatefulStaleMetadataRemovalConfig( + enabled=True, + state_provider=DynamicTypedStateProviderConfig( + type="datahub", config={"datahub_api": {"server": GMS_SERVER}} + ), + ), + ) + source = SnowflakeV2Source(ctx=ctx, config=config) + return source + + +def test_redundant_run_job_ids(stateful_source: SnowflakeV2Source) -> None: + assert stateful_source.lineage_extractor is not None + assert stateful_source.lineage_extractor.redundant_run_skip_handler is not None + assert ( + stateful_source.lineage_extractor.redundant_run_skip_handler.job_id + == "Snowflake_skip_redundant_run_lineage" + ) + + assert stateful_source.usage_extractor is not None + assert stateful_source.usage_extractor.redundant_run_skip_handler is not None + assert ( + stateful_source.usage_extractor.redundant_run_skip_handler.job_id + == "Snowflake_skip_redundant_run_usage" + ) + + +# last run +last_run_start_time = datetime(2023, 7, 2, tzinfo=timezone.utc) +last_run_end_time = datetime(2023, 7, 3, 12, tzinfo=timezone.utc) + + +@pytest.mark.parametrize( + "start_time,end_time,should_skip,suggested_start_time,suggested_end_time", + [ + # Case = current run time window is same as of last run time window + [ + datetime(2023, 7, 2, tzinfo=timezone.utc), + datetime(2023, 7, 3, 12, tzinfo=timezone.utc), + True, + None, + None, + 
        ],
+        # Case = current run time window starts at the same time as the last run time window but ends later
+        [
+            datetime(2023, 7, 2, tzinfo=timezone.utc),
+            datetime(2023, 7, 3, 18, tzinfo=timezone.utc),
+            False,
+            datetime(2023, 7, 3, 12, tzinfo=timezone.utc),
+            datetime(2023, 7, 3, 18, tzinfo=timezone.utc),
+        ],
+        # Case = current run time window is a subset of the last run time window
+        [
+            datetime(2023, 7, 2, tzinfo=timezone.utc),
+            datetime(2023, 7, 3, tzinfo=timezone.utc),
+            True,
+            None,
+            None,
+        ],
+        # Case = current run time window is after the last run time window but has some overlap with the last run
+        # Scenario for next day's run for scheduled daily ingestions
+        [
+            datetime(2023, 7, 3, tzinfo=timezone.utc),
+            datetime(2023, 7, 4, 12, tzinfo=timezone.utc),
+            False,
+            datetime(2023, 7, 3, 12, tzinfo=timezone.utc),
+            datetime(2023, 7, 4, 12, tzinfo=timezone.utc),
+        ],
+        # Case = current run time window is after the last run time window and has no overlap with the last run
+        [
+            datetime(2023, 7, 5, tzinfo=timezone.utc),
+            datetime(2023, 7, 7, 12, tzinfo=timezone.utc),
+            False,
+            datetime(2023, 7, 5, tzinfo=timezone.utc),
+            datetime(2023, 7, 7, 12, tzinfo=timezone.utc),
+        ],
+        # Case = current run time window is before the last run time window but has some overlap with the last run
+        # Scenario for manual run for past dates
+        [
+            datetime(2023, 6, 30, tzinfo=timezone.utc),
+            datetime(2023, 7, 2, 12, tzinfo=timezone.utc),
+            False,
+            datetime(2023, 6, 30, tzinfo=timezone.utc),
+            datetime(2023, 7, 2, tzinfo=timezone.utc),
+        ],
+        # Case = current run time window starts before the last run time window and ends exactly at the last run's end time
+        # Scenario for manual run for past dates
+        [
+            datetime(2023, 6, 30, tzinfo=timezone.utc),
+            datetime(2023, 7, 3, 12, tzinfo=timezone.utc),
+            False,
+            datetime(2023, 6, 30, tzinfo=timezone.utc),
+            datetime(2023, 7, 2, tzinfo=timezone.utc),
+        ],
+        # Case = current run time window is before the last run time window and has no overlap with the last run
+        # Scenario for manual run for past dates
+        [
+            datetime(2023, 6, 20, tzinfo=timezone.utc),
+            datetime(2023, 6, 30, tzinfo=timezone.utc),
+            False,
+            datetime(2023, 6, 20, tzinfo=timezone.utc),
+            datetime(2023, 6, 30, tzinfo=timezone.utc),
+        ],
+        # Case = current run time window subsumes the last run time window and extends it on both sides
+        # Scenario for manual run
+        [
+            datetime(2023, 6, 20, tzinfo=timezone.utc),
+            datetime(2023, 7, 20, tzinfo=timezone.utc),
+            False,
+            datetime(2023, 6, 20, tzinfo=timezone.utc),
+            datetime(2023, 7, 20, tzinfo=timezone.utc),
+        ],
+    ],
+)
+def test_redundant_run_skip_handler(
+    stateful_source: SnowflakeV2Source,
+    start_time: datetime,
+    end_time: datetime,
+    should_skip: bool,
+    suggested_start_time: datetime,
+    suggested_end_time: datetime,
+) -> None:
+    # mock_datahub_graph
+
+    # mocked_source = mock.MagicMock()
+    # mocked_config = mock.MagicMock()
+
+    with mock.patch(
+        "datahub.ingestion.source.state.stateful_ingestion_base.StateProviderWrapper.get_last_checkpoint"
+    ) as mocked_fn:
+        set_mock_last_run_time_window(
+            mocked_fn,
+            last_run_start_time,
+            last_run_end_time,
+        )
+
+        # Redundant Lineage Skip Handler
+        assert stateful_source.lineage_extractor is not None
+        assert stateful_source.lineage_extractor.redundant_run_skip_handler is not None
+        assert (
+            stateful_source.lineage_extractor.redundant_run_skip_handler.should_skip_this_run(
+                start_time, end_time
+            )
+            == should_skip
+        )
+
+        if not should_skip:
+            suggested_time_window = 
stateful_source.lineage_extractor.redundant_run_skip_handler.suggest_run_time_window( + start_time, end_time + ) + assert suggested_time_window == (suggested_start_time, suggested_end_time) + + set_mock_last_run_time_window_usage( + mocked_fn, last_run_start_time, last_run_end_time + ) + # Redundant Usage Skip Handler + assert stateful_source.usage_extractor is not None + assert stateful_source.usage_extractor.redundant_run_skip_handler is not None + assert ( + stateful_source.usage_extractor.redundant_run_skip_handler.should_skip_this_run( + start_time, end_time + ) + == should_skip + ) + + if not should_skip: + suggested_time_window = stateful_source.usage_extractor.redundant_run_skip_handler.suggest_run_time_window( + start_time, end_time + ) + assert suggested_time_window == ( + get_time_bucket(suggested_start_time, BucketDuration.DAY), + suggested_end_time, + ) + + +def set_mock_last_run_time_window(mocked_fn, start_time, end_time): + mock_checkpoint = mock.MagicMock() + mock_checkpoint.state = BaseTimeWindowCheckpointState( + begin_timestamp_millis=datetime_to_ts_millis(start_time), + end_timestamp_millis=datetime_to_ts_millis(end_time), + ) + mocked_fn.return_value = mock_checkpoint + + +def set_mock_last_run_time_window_usage(mocked_fn, start_time, end_time): + mock_checkpoint = mock.MagicMock() + mock_checkpoint.state = BaseTimeWindowCheckpointState( + begin_timestamp_millis=datetime_to_ts_millis(start_time), + end_timestamp_millis=datetime_to_ts_millis(end_time), + bucket_duration=BucketDuration.DAY, + ) + mocked_fn.return_value = mock_checkpoint + + +def test_successful_run_creates_checkpoint(stateful_source: SnowflakeV2Source) -> None: + assert stateful_source.lineage_extractor is not None + assert stateful_source.lineage_extractor.redundant_run_skip_handler is not None + with mock.patch( + "datahub.ingestion.source.state.stateful_ingestion_base.StateProviderWrapper.create_checkpoint" + ) as mocked_create_checkpoint_fn, mock.patch( + "datahub.ingestion.source.state.stateful_ingestion_base.StateProviderWrapper.get_last_checkpoint" + ) as mocked_fn: + set_mock_last_run_time_window( + mocked_fn, + last_run_start_time, + last_run_end_time, + ) + stateful_source.lineage_extractor.redundant_run_skip_handler.update_state( + datetime.now(tz=timezone.utc), datetime.now(tz=timezone.utc) + ) + mocked_create_checkpoint_fn.assert_called_once() + + +def test_failed_run_does_not_create_checkpoint( + stateful_source: SnowflakeV2Source, +) -> None: + assert stateful_source.lineage_extractor is not None + assert stateful_source.lineage_extractor.redundant_run_skip_handler is not None + stateful_source.lineage_extractor.redundant_run_skip_handler.report_current_run_status( + "some_step", False + ) + with mock.patch( + "datahub.ingestion.source.state.stateful_ingestion_base.StateProviderWrapper.create_checkpoint" + ) as mocked_create_checkpoint_fn, mock.patch( + "datahub.ingestion.source.state.stateful_ingestion_base.StateProviderWrapper.get_last_checkpoint" + ) as mocked_fn: + set_mock_last_run_time_window( + mocked_fn, + last_run_start_time, + last_run_end_time, + ) + stateful_source.lineage_extractor.redundant_run_skip_handler.update_state( + datetime.now(tz=timezone.utc), datetime.now(tz=timezone.utc) + ) + mocked_create_checkpoint_fn.assert_not_called() diff --git a/metadata-ingestion/tests/unit/test_bigquery_lineage.py b/metadata-ingestion/tests/unit/test_bigquery_lineage.py index c9308fd89ef72..9b09fa36ba586 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_lineage.py +++ 
b/metadata-ingestion/tests/unit/test_bigquery_lineage.py @@ -1,6 +1,8 @@ import datetime from typing import Dict, List, Set +import pytest + from datahub.ingestion.source.bigquery_v2.bigquery_audit import ( BigQueryTableRef, QueryEvent, @@ -14,15 +16,17 @@ from datahub.utilities.sqlglot_lineage import SchemaResolver -def test_lineage_with_timestamps(): - config = BigQueryV2Config() - report = BigQueryV2Report() - extractor: BigqueryLineageExtractor = BigqueryLineageExtractor(config, report) - lineage_entries: List[QueryEvent] = [ +@pytest.fixture +def lineage_entries() -> List[QueryEvent]: + return [ QueryEvent( timestamp=datetime.datetime.now(tz=datetime.timezone.utc), actor_email="bla@bla.com", - query="testQuery", + query=""" + INSERT INTO `my_project.my_dataset.my_table` + SELECT first.a, second.b FROM `my_project.my_dataset.my_source_table1` first + LEFT JOIN `my_project.my_dataset.my_source_table2` second ON first.id = second.id + """, statementType="SELECT", project_id="proj_12344", end_time=None, @@ -73,6 +77,12 @@ def test_lineage_with_timestamps(): ), ] + +def test_lineage_with_timestamps(lineage_entries: List[QueryEvent]) -> None: + config = BigQueryV2Config() + report = BigQueryV2Report() + extractor: BigqueryLineageExtractor = BigqueryLineageExtractor(config, report) + bq_table = BigQueryTableRef.from_string_name( "projects/my_project/datasets/my_dataset/tables/my_table" ) @@ -90,3 +100,31 @@ def test_lineage_with_timestamps(): ) assert upstream_lineage assert len(upstream_lineage.upstreams) == 4 + + +def test_column_level_lineage(lineage_entries: List[QueryEvent]) -> None: + config = BigQueryV2Config(extract_column_lineage=True, incremental_lineage=False) + report = BigQueryV2Report() + extractor: BigqueryLineageExtractor = BigqueryLineageExtractor(config, report) + + bq_table = BigQueryTableRef.from_string_name( + "projects/my_project/datasets/my_dataset/tables/my_table" + ) + + lineage_map: Dict[str, Set[LineageEdge]] = extractor._create_lineage_map( + lineage_entries[:1], + sql_parser_schema_resolver=SchemaResolver(platform="bigquery"), + ) + + upstream_lineage = extractor.get_lineage_for_table( + bq_table=bq_table, + bq_table_urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,my_project.my_dataset.my_table,PROD)", + lineage_metadata=lineage_map, + platform="bigquery", + ) + assert upstream_lineage + assert len(upstream_lineage.upstreams) == 2 + assert ( + upstream_lineage.fineGrainedLineages + and len(upstream_lineage.fineGrainedLineages) == 2 + ) diff --git a/metadata-ingestion/tests/unit/test_bigquery_profiler.py b/metadata-ingestion/tests/unit/test_bigquery_profiler.py index a2aec8df93d09..44ce5f0a02e37 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_profiler.py +++ b/metadata-ingestion/tests/unit/test_bigquery_profiler.py @@ -37,6 +37,7 @@ def test_generate_day_partitioned_partition_profiler_query(): ordinal_position=1, data_type="TIMESTAMP", is_partition_column=True, + cluster_column_position=None, comment=None, is_nullable=False, ) @@ -79,6 +80,7 @@ def test_generate_day_partitioned_partition_profiler_query_with_set_partition_ti ordinal_position=1, data_type="TIMESTAMP", is_partition_column=True, + cluster_column_position=None, comment=None, is_nullable=False, ) @@ -120,6 +122,7 @@ def test_generate_hour_partitioned_partition_profiler_query(): ordinal_position=1, data_type="TIMESTAMP", is_partition_column=True, + cluster_column_position=None, comment=None, is_nullable=False, ) diff --git a/metadata-ingestion/tests/unit/test_bigquery_source.py 
b/metadata-ingestion/tests/unit/test_bigquery_source.py index fc8ca166b105a..47418d9a989bb 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_source.py +++ b/metadata-ingestion/tests/unit/test_bigquery_source.py @@ -138,13 +138,12 @@ def test_get_dataplatform_instance_aspect_returns_project_id(): f"urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,{project_id})" ) - config = BigQueryV2Config.parse_obj({}) + config = BigQueryV2Config.parse_obj({"include_data_platform_instance": True}) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) data_platform_instance = source.get_dataplatform_instance_aspect( "urn:li:test", project_id ) - metadata = data_platform_instance.get_metadata()["metadata"] assert data_platform_instance is not None @@ -152,6 +151,20 @@ def test_get_dataplatform_instance_aspect_returns_project_id(): assert metadata.aspect.instance == expected_instance +def test_get_dataplatform_instance_default_no_instance(): + config = BigQueryV2Config.parse_obj({}) + source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) + + data_platform_instance = source.get_dataplatform_instance_aspect( + "urn:li:test", "project_id" + ) + metadata = data_platform_instance.get_metadata()["metadata"] + + assert data_platform_instance is not None + assert metadata.aspectName == "dataPlatformInstance" + assert metadata.aspect.instance is None + + @patch("google.cloud.bigquery.client.Client") def test_get_projects_with_single_project_id(client_mock): config = BigQueryV2Config.parse_obj({"project_id": "test-3"}) diff --git a/metadata-ingestion/tests/unit/test_confluent_schema_registry.py b/metadata-ingestion/tests/unit/test_confluent_schema_registry.py index a71e07b68d898..b047cd16c52a9 100644 --- a/metadata-ingestion/tests/unit/test_confluent_schema_registry.py +++ b/metadata-ingestion/tests/unit/test_confluent_schema_registry.py @@ -4,6 +4,7 @@ from confluent_kafka.schema_registry.schema_registry_client import ( RegisteredSchema, Schema, + SchemaReference, ) from datahub.ingestion.source.confluent_schema_registry import ConfluentSchemaRegistry @@ -90,7 +91,9 @@ def new_get_latest_version(subject_name: str) -> RegisteredSchema: schema_str=schema_str_orig, schema_type="AVRO", references=[ - dict(name="TestTopic1", subject="schema_subject_1", version=1) + SchemaReference( + name="TestTopic1", subject="schema_subject_1", version=1 + ) ], ) ) @@ -109,7 +112,9 @@ def new_get_latest_version(subject_name: str) -> RegisteredSchema: schema_str=schema_str_orig, schema_type="AVRO", references=[ - dict(name="schema_subject_1", subject="TestTopic1", version=1) + SchemaReference( + name="schema_subject_1", subject="TestTopic1", version=1 + ) ], ) ) diff --git a/metadata-ingestion/tests/unit/test_iceberg.py b/metadata-ingestion/tests/unit/test_iceberg.py index f3ea071d76400..768d4f958af1f 100644 --- a/metadata-ingestion/tests/unit/test_iceberg.py +++ b/metadata-ingestion/tests/unit/test_iceberg.py @@ -1,405 +1,482 @@ +import sys +import uuid +from decimal import Decimal from typing import Any, Optional import pytest -from iceberg.api import types as IcebergTypes -from iceberg.api.types.types import NestedField - -from datahub.configuration.common import ConfigurationError -from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.source.azure.azure_common import AdlsSourceConfig -from datahub.ingestion.source.iceberg.iceberg import IcebergSource, IcebergSourceConfig -from datahub.metadata.com.linkedin.pegasus2avro.schema import ArrayType, 
SchemaField -from datahub.metadata.schema_classes import ( - ArrayTypeClass, - BooleanTypeClass, - BytesTypeClass, - DateTypeClass, - FixedTypeClass, - NumberTypeClass, - RecordTypeClass, - StringTypeClass, - TimeTypeClass, -) - +from pydantic import ValidationError + +if sys.version_info >= (3, 8): + from pyiceberg.schema import Schema + from pyiceberg.types import ( + BinaryType, + BooleanType, + DateType, + DecimalType, + DoubleType, + FixedType, + FloatType, + IcebergType, + IntegerType, + ListType, + LongType, + MapType, + NestedField, + PrimitiveType, + StringType, + StructType, + TimestampType, + TimestamptzType, + TimeType, + UUIDType, + ) -def with_iceberg_source() -> IcebergSource: - adls: AdlsSourceConfig = AdlsSourceConfig( - account_name="test", account_key="test", container_name="test" + from datahub.ingestion.api.common import PipelineContext + from datahub.ingestion.source.iceberg.iceberg import ( + IcebergProfiler, + IcebergSource, + IcebergSourceConfig, ) - return IcebergSource( - ctx=PipelineContext(run_id="iceberg-source-test"), - config=IcebergSourceConfig(adls=adls), + from datahub.ingestion.source.iceberg.iceberg_common import IcebergCatalogConfig + from datahub.metadata.com.linkedin.pegasus2avro.schema import ArrayType, SchemaField + from datahub.metadata.schema_classes import ( + ArrayTypeClass, + BooleanTypeClass, + BytesTypeClass, + DateTypeClass, + FixedTypeClass, + NumberTypeClass, + RecordTypeClass, + StringTypeClass, + TimeTypeClass, ) - -def assert_field( - schema_field: SchemaField, - expected_description: Optional[str], - expected_nullable: bool, - expected_type: Any, -) -> None: - assert ( - schema_field.description == expected_description - ), f"Field description '{schema_field.description}' is different from expected description '{expected_description}'" - assert ( - schema_field.nullable == expected_nullable - ), f"Field nullable '{schema_field.nullable}' is different from expected nullable '{expected_nullable}'" - assert isinstance( - schema_field.type.type, expected_type - ), f"Field type {schema_field.type.type} is different from expected type {expected_type}" - - -def test_adls_config_no_credential(): - """ - Test when no ADLS credential information is provided (SAS token, Account key). - """ - with pytest.raises(ConfigurationError): - AdlsSourceConfig(account_name="test", container_name="test") - - -def test_adls_config_with_sas_credential(): - """ - Test when a SAS token is used as an ADLS credential. - """ - AdlsSourceConfig(account_name="test", sas_token="test", container_name="test") - - -def test_adls_config_with_key_credential(): - """ - Test when an account key is used as an ADLS credential. - """ - AdlsSourceConfig(account_name="test", account_key="test", container_name="test") - - -def test_adls_config_with_client_secret_credential(): - """ - Test when a client secret is used as an ADLS credential. 
- """ - AdlsSourceConfig( - account_name="test", - tenant_id="test", - client_id="test", - client_secret="test", - container_name="test", + pytestmark = pytest.mark.skipif( + sys.version_info < (3, 8), reason="requires python 3.8 or higher" ) - # Test when tenant_id is missing - with pytest.raises(ConfigurationError): - AdlsSourceConfig( - account_name="test", - client_id="test", - client_secret="test", - container_name="test", + def with_iceberg_source() -> IcebergSource: + catalog: IcebergCatalogConfig = IcebergCatalogConfig( + name="test", type="rest", config={} ) - - # Test when client_id is missing - with pytest.raises(ConfigurationError): - AdlsSourceConfig( - account_name="test", - tenant_id="test", - client_secret="test", - container_name="test", - ) - - # Test when client_secret is missing - with pytest.raises(ConfigurationError): - AdlsSourceConfig( - account_name="test", - tenant_id="test", - client_id="test", - container_name="test", - ) - - -def test_config_for_tests(): - """ - Test valid iceberg source that will be used in unit tests. - """ - with_iceberg_source() - - -def test_config_no_filesystem(): - """ - Test when a SAS token is used as an ADLS credential. - """ - with pytest.raises(ConfigurationError): - IcebergSource( + return IcebergSource( ctx=PipelineContext(run_id="iceberg-source-test"), - config=IcebergSourceConfig(), + config=IcebergSourceConfig(catalog=catalog), ) - -def test_config_multiple_filesystems(): - """ - Test when more than 1 filesystem is configured. - """ - with pytest.raises(ConfigurationError): - adls: AdlsSourceConfig = AdlsSourceConfig( - account_name="test", container_name="test" - ) - IcebergSource( - ctx=PipelineContext(run_id="iceberg-source-test"), - config=IcebergSourceConfig(adls=adls, localfs="/tmp"), + def with_iceberg_profiler() -> IcebergProfiler: + iceberg_source_instance = with_iceberg_source() + return IcebergProfiler( + iceberg_source_instance.report, iceberg_source_instance.config.profiling ) - -@pytest.mark.parametrize( - "iceberg_type, expected_schema_field_type", - [ - (IcebergTypes.BinaryType.get(), BytesTypeClass), - (IcebergTypes.BooleanType.get(), BooleanTypeClass), - (IcebergTypes.DateType.get(), DateTypeClass), - ( - IcebergTypes.DecimalType.of(3, 2), - NumberTypeClass, - ), - (IcebergTypes.DoubleType.get(), NumberTypeClass), - (IcebergTypes.FixedType.of_length(4), FixedTypeClass), - (IcebergTypes.FloatType.get(), NumberTypeClass), - (IcebergTypes.IntegerType.get(), NumberTypeClass), - (IcebergTypes.LongType.get(), NumberTypeClass), - (IcebergTypes.StringType.get(), StringTypeClass), - ( - IcebergTypes.TimestampType.with_timezone(), - TimeTypeClass, - ), - ( - IcebergTypes.TimestampType.without_timezone(), - TimeTypeClass, - ), - (IcebergTypes.TimeType.get(), TimeTypeClass), - ( - IcebergTypes.UUIDType.get(), - StringTypeClass, - ), - ], -) -def test_iceberg_primitive_type_to_schema_field( - iceberg_type: IcebergTypes.PrimitiveType, expected_schema_field_type: Any -) -> None: - """ - Test converting a primitive typed Iceberg field to a SchemaField - """ - iceberg_source_instance = with_iceberg_source() - for column in [ - NestedField.required( - 1, "required_field", iceberg_type, "required field documentation" - ), - NestedField.optional( - 1, "optional_field", iceberg_type, "optional field documentation" - ), - ]: - schema_fields = iceberg_source_instance._get_schema_fields_for_column(column) + def assert_field( + schema_field: SchemaField, + expected_description: Optional[str], + expected_nullable: bool, + 
expected_type: Any, + ) -> None: assert ( - len(schema_fields) == 1 - ), f"Expected 1 field, but got {len(schema_fields)}" - assert_field( - schema_fields[0], column.doc, column.is_optional, expected_schema_field_type - ) - - -@pytest.mark.parametrize( - "iceberg_type, expected_array_nested_type", - [ - (IcebergTypes.BinaryType.get(), "bytes"), - (IcebergTypes.BooleanType.get(), "boolean"), - (IcebergTypes.DateType.get(), "date"), - ( - IcebergTypes.DecimalType.of(3, 2), - "decimal", - ), - (IcebergTypes.DoubleType.get(), "double"), - (IcebergTypes.FixedType.of_length(4), "fixed"), - (IcebergTypes.FloatType.get(), "float"), - (IcebergTypes.IntegerType.get(), "int"), - (IcebergTypes.LongType.get(), "long"), - (IcebergTypes.StringType.get(), "string"), - ( - IcebergTypes.TimestampType.with_timezone(), - "timestamp-micros", - ), - ( - IcebergTypes.TimestampType.without_timezone(), - "timestamp-micros", - ), - (IcebergTypes.TimeType.get(), "time-micros"), - ( - IcebergTypes.UUIDType.get(), - "uuid", - ), - ], -) -def test_iceberg_list_to_schema_field( - iceberg_type: IcebergTypes.PrimitiveType, expected_array_nested_type: Any -) -> None: - """ - Test converting a list typed Iceberg field to an ArrayType SchemaField, including the list nested type. - """ - list_column: NestedField = NestedField.required( - 1, - "listField", - IcebergTypes.ListType.of_required(2, iceberg_type), - "documentation", + schema_field.description == expected_description + ), f"Field description '{schema_field.description}' is different from expected description '{expected_description}'" + assert ( + schema_field.nullable == expected_nullable + ), f"Field nullable '{schema_field.nullable}' is different from expected nullable '{expected_nullable}'" + assert isinstance( + schema_field.type.type, expected_type + ), f"Field type {schema_field.type.type} is different from expected type {expected_type}" + + def test_config_no_catalog(): + """ + Test when no Iceberg catalog is provided. + """ + with pytest.raises(ValidationError, match="catalog"): + IcebergSourceConfig() # type: ignore + + def test_config_catalog_not_configured(): + """ + Test when an Iceberg catalog is provided, but not properly configured. + """ + with pytest.raises(ValidationError): + IcebergCatalogConfig() # type: ignore + + with pytest.raises(ValidationError, match="conf"): + IcebergCatalogConfig(type="a type") # type: ignore + + with pytest.raises(ValidationError, match="type"): + IcebergCatalogConfig(conf={}) # type: ignore + + def test_config_for_tests(): + """ + Test valid iceberg source that will be used in unit tests. 
+ """ + with_iceberg_source() + + @pytest.mark.parametrize( + "iceberg_type, expected_schema_field_type", + [ + (BinaryType(), BytesTypeClass), + (BooleanType(), BooleanTypeClass), + (DateType(), DateTypeClass), + ( + DecimalType(3, 2), + NumberTypeClass, + ), + (DoubleType(), NumberTypeClass), + (FixedType(4), FixedTypeClass), + (FloatType(), NumberTypeClass), + (IntegerType(), NumberTypeClass), + (LongType(), NumberTypeClass), + (StringType(), StringTypeClass), + ( + TimestampType(), + TimeTypeClass, + ), + ( + TimestamptzType(), + TimeTypeClass, + ), + (TimeType(), TimeTypeClass), + ( + UUIDType(), + StringTypeClass, + ), + ], ) - iceberg_source_instance = with_iceberg_source() - schema_fields = iceberg_source_instance._get_schema_fields_for_column(list_column) - assert len(schema_fields) == 1, f"Expected 1 field, but got {len(schema_fields)}" - assert_field( - schema_fields[0], list_column.doc, list_column.is_optional, ArrayTypeClass + def test_iceberg_primitive_type_to_schema_field( + iceberg_type: PrimitiveType, expected_schema_field_type: Any + ) -> None: + """ + Test converting a primitive typed Iceberg field to a SchemaField + """ + iceberg_source_instance = with_iceberg_source() + for column in [ + NestedField( + 1, "required_field", iceberg_type, True, "required field documentation" + ), + NestedField( + 1, "optional_field", iceberg_type, False, "optional field documentation" + ), + ]: + schema = Schema(column) + schema_fields = iceberg_source_instance._get_schema_fields_for_schema( + schema + ) + assert ( + len(schema_fields) == 1 + ), f"Expected 1 field, but got {len(schema_fields)}" + assert_field( + schema_fields[0], + column.doc, + column.optional, + expected_schema_field_type, + ) + + @pytest.mark.parametrize( + "iceberg_type, expected_array_nested_type", + [ + (BinaryType(), "bytes"), + (BooleanType(), "boolean"), + (DateType(), "date"), + ( + DecimalType(3, 2), + "decimal", + ), + (DoubleType(), "double"), + (FixedType(4), "fixed"), + (FloatType(), "float"), + (IntegerType(), "int"), + (LongType(), "long"), + (StringType(), "string"), + ( + TimestampType(), + "timestamp-micros", + ), + ( + TimestamptzType(), + "timestamp-micros", + ), + (TimeType(), "time-micros"), + ( + UUIDType(), + "uuid", + ), + ], ) - assert isinstance( - schema_fields[0].type.type, ArrayType - ), f"Field type {schema_fields[0].type.type} was expected to be {ArrayType}" - arrayType: ArrayType = schema_fields[0].type.type - assert arrayType.nestedType == [ - expected_array_nested_type - ], f"List Field nested type {arrayType.nestedType} was expected to be {expected_array_nested_type}" - - -@pytest.mark.parametrize( - "iceberg_type, expected_map_type", - [ - (IcebergTypes.BinaryType.get(), BytesTypeClass), - (IcebergTypes.BooleanType.get(), BooleanTypeClass), - (IcebergTypes.DateType.get(), DateTypeClass), - ( - IcebergTypes.DecimalType.of(3, 2), - NumberTypeClass, - ), - (IcebergTypes.DoubleType.get(), NumberTypeClass), - (IcebergTypes.FixedType.of_length(4), FixedTypeClass), - (IcebergTypes.FloatType.get(), NumberTypeClass), - (IcebergTypes.IntegerType.get(), NumberTypeClass), - (IcebergTypes.LongType.get(), NumberTypeClass), - (IcebergTypes.StringType.get(), StringTypeClass), - ( - IcebergTypes.TimestampType.with_timezone(), - TimeTypeClass, - ), - ( - IcebergTypes.TimestampType.without_timezone(), - TimeTypeClass, - ), - (IcebergTypes.TimeType.get(), TimeTypeClass), - ( - IcebergTypes.UUIDType.get(), - StringTypeClass, - ), - ], -) -def test_iceberg_map_to_schema_field( - iceberg_type: 
IcebergTypes.PrimitiveType, expected_map_type: Any -) -> None: - """ - Test converting a map typed Iceberg field to a MapType SchemaField, where the key is the same type as the value. - """ - map_column: NestedField = NestedField.required( - 1, - "mapField", - IcebergTypes.MapType.of_required(11, 12, iceberg_type, iceberg_type), - "documentation", + def test_iceberg_list_to_schema_field( + iceberg_type: PrimitiveType, expected_array_nested_type: Any + ) -> None: + """ + Test converting a list typed Iceberg field to an ArrayType SchemaField, including the list nested type. + """ + for list_column in [ + NestedField( + 1, + "listField", + ListType(2, iceberg_type, True), + True, + "required field, required element documentation", + ), + NestedField( + 1, + "listField", + ListType(2, iceberg_type, False), + True, + "required field, optional element documentation", + ), + NestedField( + 1, + "listField", + ListType(2, iceberg_type, True), + False, + "optional field, required element documentation", + ), + NestedField( + 1, + "listField", + ListType(2, iceberg_type, False), + False, + "optional field, optional element documentation", + ), + ]: + iceberg_source_instance = with_iceberg_source() + schema = Schema(list_column) + schema_fields = iceberg_source_instance._get_schema_fields_for_schema( + schema + ) + assert ( + len(schema_fields) == 1 + ), f"Expected 1 field, but got {len(schema_fields)}" + assert_field( + schema_fields[0], list_column.doc, list_column.optional, ArrayTypeClass + ) + assert isinstance( + schema_fields[0].type.type, ArrayType + ), f"Field type {schema_fields[0].type.type} was expected to be {ArrayType}" + arrayType: ArrayType = schema_fields[0].type.type + assert arrayType.nestedType == [ + expected_array_nested_type + ], f"List Field nested type {arrayType.nestedType} was expected to be {expected_array_nested_type}" + + @pytest.mark.parametrize( + "iceberg_type, expected_map_type", + [ + (BinaryType(), BytesTypeClass), + (BooleanType(), BooleanTypeClass), + (DateType(), DateTypeClass), + ( + DecimalType(3, 2), + NumberTypeClass, + ), + (DoubleType(), NumberTypeClass), + (FixedType(4), FixedTypeClass), + (FloatType(), NumberTypeClass), + (IntegerType(), NumberTypeClass), + (LongType(), NumberTypeClass), + (StringType(), StringTypeClass), + ( + TimestampType(), + TimeTypeClass, + ), + ( + TimestamptzType(), + TimeTypeClass, + ), + (TimeType(), TimeTypeClass), + ( + UUIDType(), + StringTypeClass, + ), + ], ) - iceberg_source_instance = with_iceberg_source() - schema_fields = iceberg_source_instance._get_schema_fields_for_column(map_column) - # Converting an Iceberg Map type will be done by creating an array of struct(key, value) records. - # The first field will be the array. - assert len(schema_fields) == 3, f"Expected 3 fields, but got {len(schema_fields)}" - assert_field( - schema_fields[0], map_column.doc, map_column.is_optional, ArrayTypeClass + def test_iceberg_map_to_schema_field( + iceberg_type: PrimitiveType, expected_map_type: Any + ) -> None: + """ + Test converting a map typed Iceberg field to a MapType SchemaField, where the key is the same type as the value. 
+ """ + for map_column in [ + NestedField( + 1, + "mapField", + MapType(11, iceberg_type, 12, iceberg_type, True), + True, + "required field, required value documentation", + ), + NestedField( + 1, + "mapField", + MapType(11, iceberg_type, 12, iceberg_type, False), + True, + "required field, optional value documentation", + ), + NestedField( + 1, + "mapField", + MapType(11, iceberg_type, 12, iceberg_type, True), + False, + "optional field, required value documentation", + ), + NestedField( + 1, + "mapField", + MapType(11, iceberg_type, 12, iceberg_type, False), + False, + "optional field, optional value documentation", + ), + ]: + iceberg_source_instance = with_iceberg_source() + schema = Schema(map_column) + schema_fields = iceberg_source_instance._get_schema_fields_for_schema( + schema + ) + # Converting an Iceberg Map type will be done by creating an array of struct(key, value) records. + # The first field will be the array. + assert ( + len(schema_fields) == 3 + ), f"Expected 3 fields, but got {len(schema_fields)}" + assert_field( + schema_fields[0], map_column.doc, map_column.optional, ArrayTypeClass + ) + + # The second field will be the key type + assert_field(schema_fields[1], None, False, expected_map_type) + + # The third field will be the value type + assert_field( + schema_fields[2], + None, + not map_column.field_type.value_required, + expected_map_type, + ) + + @pytest.mark.parametrize( + "iceberg_type, expected_schema_field_type", + [ + (BinaryType(), BytesTypeClass), + (BooleanType(), BooleanTypeClass), + (DateType(), DateTypeClass), + ( + DecimalType(3, 2), + NumberTypeClass, + ), + (DoubleType(), NumberTypeClass), + (FixedType(4), FixedTypeClass), + (FloatType(), NumberTypeClass), + (IntegerType(), NumberTypeClass), + (LongType(), NumberTypeClass), + (StringType(), StringTypeClass), + ( + TimestampType(), + TimeTypeClass, + ), + ( + TimestamptzType(), + TimeTypeClass, + ), + (TimeType(), TimeTypeClass), + ( + UUIDType(), + StringTypeClass, + ), + ], ) + def test_iceberg_struct_to_schema_field( + iceberg_type: PrimitiveType, expected_schema_field_type: Any + ) -> None: + """ + Test converting a struct typed Iceberg field to a RecordType SchemaField. 
+ """ + field1 = NestedField(11, "field1", iceberg_type, True, "field documentation") + struct_column = NestedField( + 1, "structField", StructType(field1), True, "struct documentation" + ) + iceberg_source_instance = with_iceberg_source() + schema = Schema(struct_column) + schema_fields = iceberg_source_instance._get_schema_fields_for_schema(schema) + assert ( + len(schema_fields) == 2 + ), f"Expected 2 fields, but got {len(schema_fields)}" + assert_field( + schema_fields[0], struct_column.doc, struct_column.optional, RecordTypeClass + ) + assert_field( + schema_fields[1], field1.doc, field1.optional, expected_schema_field_type + ) - # The second field will be the key type - assert_field(schema_fields[1], None, False, expected_map_type) - - # The third field will be the value type - assert_field(schema_fields[2], None, True, expected_map_type) - - -@pytest.mark.parametrize( - "iceberg_type, expected_schema_field_type", - [ - (IcebergTypes.BinaryType.get(), BytesTypeClass), - (IcebergTypes.BooleanType.get(), BooleanTypeClass), - (IcebergTypes.DateType.get(), DateTypeClass), - ( - IcebergTypes.DecimalType.of(3, 2), - NumberTypeClass, - ), - (IcebergTypes.DoubleType.get(), NumberTypeClass), - (IcebergTypes.FixedType.of_length(4), FixedTypeClass), - (IcebergTypes.FloatType.get(), NumberTypeClass), - (IcebergTypes.IntegerType.get(), NumberTypeClass), - (IcebergTypes.LongType.get(), NumberTypeClass), - (IcebergTypes.StringType.get(), StringTypeClass), - ( - IcebergTypes.TimestampType.with_timezone(), - TimeTypeClass, - ), - ( - IcebergTypes.TimestampType.without_timezone(), - TimeTypeClass, - ), - (IcebergTypes.TimeType.get(), TimeTypeClass), - ( - IcebergTypes.UUIDType.get(), - StringTypeClass, - ), - ], -) -def test_iceberg_struct_to_schema_field( - iceberg_type: IcebergTypes.PrimitiveType, expected_schema_field_type: Any -) -> None: - """ - Test converting a struct typed Iceberg field to a RecordType SchemaField. 
- """ - field1: NestedField = NestedField.required( - 11, "field1", iceberg_type, "field documentation" - ) - struct_column: NestedField = NestedField.required( - 1, "structField", IcebergTypes.StructType.of([field1]), "struct documentation" + @pytest.mark.parametrize( + "value_type, value, expected_value", + [ + (BinaryType(), bytes([1, 2, 3, 4, 5]), "b'\\x01\\x02\\x03\\x04\\x05'"), + (BooleanType(), True, "True"), + (DateType(), 19543, "2023-07-05"), + (DecimalType(3, 2), Decimal((0, (3, 1, 4), -2)), "3.14"), + (DoubleType(), 3.4, "3.4"), + (FixedType(4), bytes([1, 2, 3, 4]), "b'\\x01\\x02\\x03\\x04'"), + (FloatType(), 3.4, "3.4"), + (IntegerType(), 3, "3"), + (LongType(), 4294967295000, "4294967295000"), + (StringType(), "a string", "a string"), + ( + TimestampType(), + 1688559488157000, + "2023-07-05T12:18:08.157000", + ), + ( + TimestamptzType(), + 1688559488157000, + "2023-07-05T12:18:08.157000+00:00", + ), + (TimeType(), 40400000000, "11:13:20"), + ( + UUIDType(), + uuid.UUID("00010203-0405-0607-0809-0a0b0c0d0e0f"), + "00010203-0405-0607-0809-0a0b0c0d0e0f", + ), + ], ) - iceberg_source_instance = with_iceberg_source() - schema_fields = iceberg_source_instance._get_schema_fields_for_column(struct_column) - assert len(schema_fields) == 2, f"Expected 2 fields, but got {len(schema_fields)}" - assert_field( - schema_fields[0], struct_column.doc, struct_column.is_optional, RecordTypeClass - ) - assert_field( - schema_fields[1], field1.doc, field1.is_optional, expected_schema_field_type - ) - - -def test_avro_decimal_bytes_nullable(): - """ - The following test exposes a problem with decimal (bytes) not preserving extra attributes like _nullable. Decimal (fixed) and Boolean for example do. - NOTE: This bug was by-passed by mapping the Decimal type to fixed instead of bytes. - """ - import avro.schema + def test_iceberg_profiler_value_render( + value_type: IcebergType, value: Any, expected_value: Optional[str] + ) -> None: + iceberg_profiler_instance = with_iceberg_profiler() + assert ( + iceberg_profiler_instance._render_value("a.dataset", value_type, value) + == expected_value + ) - decimal_avro_schema_string = """{"type": "record", "name": "__struct_", "fields": [{"type": {"type": "bytes", "precision": 3, "scale": 2, "logicalType": "decimal", "native_data_type": "decimal(3, 2)", "_nullable": false}, "name": "required_field", "doc": "required field documentation"}]}""" - decimal_avro_schema = avro.schema.parse(decimal_avro_schema_string) - print("\nDecimal (bytes)") - print( - f"Original avro schema string: {decimal_avro_schema_string}" - ) - print(f"After avro parsing, _nullable attribute is missing: {decimal_avro_schema}") + def test_avro_decimal_bytes_nullable() -> None: + """ + The following test exposes a problem with decimal (bytes) not preserving extra attributes like _nullable. Decimal (fixed) and Boolean for example do. + NOTE: This bug was by-passed by mapping the Decimal type to fixed instead of bytes. 
+ """ + import avro.schema + + decimal_avro_schema_string = """{"type": "record", "name": "__struct_", "fields": [{"type": {"type": "bytes", "precision": 3, "scale": 2, "logicalType": "decimal", "native_data_type": "decimal(3, 2)", "_nullable": false}, "name": "required_field", "doc": "required field documentation"}]}""" + decimal_avro_schema = avro.schema.parse(decimal_avro_schema_string) + print("\nDecimal (bytes)") + print( + f"Original avro schema string: {decimal_avro_schema_string}" + ) + print( + f"After avro parsing, _nullable attribute is missing: {decimal_avro_schema}" + ) - decimal_fixed_avro_schema_string = """{"type": "record", "name": "__struct_", "fields": [{"type": {"type": "fixed", "logicalType": "decimal", "precision": 3, "scale": 2, "native_data_type": "decimal(3, 2)", "_nullable": false, "name": "bogusName", "size": 16}, "name": "required_field", "doc": "required field documentation"}]}""" - decimal_fixed_avro_schema = avro.schema.parse(decimal_fixed_avro_schema_string) - print("\nDecimal (fixed)") - print( - f"Original avro schema string: {decimal_fixed_avro_schema_string}" - ) - print( - f"After avro parsing, _nullable attribute is preserved: {decimal_fixed_avro_schema}" - ) + decimal_fixed_avro_schema_string = """{"type": "record", "name": "__struct_", "fields": [{"type": {"type": "fixed", "logicalType": "decimal", "precision": 3, "scale": 2, "native_data_type": "decimal(3, 2)", "_nullable": false, "name": "bogusName", "size": 16}, "name": "required_field", "doc": "required field documentation"}]}""" + decimal_fixed_avro_schema = avro.schema.parse(decimal_fixed_avro_schema_string) + print("\nDecimal (fixed)") + print( + f"Original avro schema string: {decimal_fixed_avro_schema_string}" + ) + print( + f"After avro parsing, _nullable attribute is preserved: {decimal_fixed_avro_schema}" + ) - boolean_avro_schema_string = """{"type": "record", "name": "__struct_", "fields": [{"type": {"type": "boolean", "native_data_type": "boolean", "_nullable": false}, "name": "required_field", "doc": "required field documentation"}]}""" - boolean_avro_schema = avro.schema.parse(boolean_avro_schema_string) - print("\nBoolean") - print( - f"Original avro schema string: {boolean_avro_schema_string}" - ) - print( - f"After avro parsing, _nullable attribute is preserved: {boolean_avro_schema}" - ) + boolean_avro_schema_string = """{"type": "record", "name": "__struct_", "fields": [{"type": {"type": "boolean", "native_data_type": "boolean", "_nullable": false}, "name": "required_field", "doc": "required field documentation"}]}""" + boolean_avro_schema = avro.schema.parse(boolean_avro_schema_string) + print("\nBoolean") + print( + f"Original avro schema string: {boolean_avro_schema_string}" + ) + print( + f"After avro parsing, _nullable attribute is preserved: {boolean_avro_schema}" + ) diff --git a/metadata-ingestion/tests/unit/test_snowflake_shares.py b/metadata-ingestion/tests/unit/test_snowflake_shares.py new file mode 100644 index 0000000000000..7de86139baf39 --- /dev/null +++ b/metadata-ingestion/tests/unit/test_snowflake_shares.py @@ -0,0 +1,348 @@ +from typing import List + +import pytest + +from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.source.snowflake.snowflake_config import ( + DatabaseId, + SnowflakeShareConfig, + SnowflakeV2Config, +) +from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report +from 
datahub.ingestion.source.snowflake.snowflake_schema import ( + SnowflakeDatabase, + SnowflakeSchema, +) +from datahub.ingestion.source.snowflake.snowflake_shares import SnowflakeSharesHandler +from datahub.metadata.com.linkedin.pegasus2avro.common import Siblings +from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage +from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeProposal + + +@pytest.fixture(scope="module") +def snowflake_databases() -> List[SnowflakeDatabase]: + return [ + SnowflakeDatabase( + name="db1", + created=None, + comment=None, + last_altered=None, + schemas=[ + SnowflakeSchema( + name="schema11", + created=None, + comment=None, + last_altered=None, + tables=["table111", "table112"], + views=["view111"], + ), + SnowflakeSchema( + name="schema12", + created=None, + comment=None, + last_altered=None, + tables=["table121", "table122"], + views=["view121"], + ), + ], + ), + SnowflakeDatabase( + name="db2", + created=None, + comment=None, + last_altered=None, + schemas=[ + SnowflakeSchema( + name="schema21", + created=None, + comment=None, + last_altered=None, + tables=["table211", "table212"], + views=["view211"], + ), + SnowflakeSchema( + name="schema22", + created=None, + comment=None, + last_altered=None, + tables=["table221", "table222"], + views=["view221"], + ), + ], + ), + SnowflakeDatabase( + name="db3", + created=None, + comment=None, + last_altered=None, + schemas=[ + SnowflakeSchema( + name="schema31", + created=None, + comment=None, + last_altered=None, + tables=["table311", "table312"], + views=["view311"], + ) + ], + ), + ] + + +def make_snowflake_urn(table_name, instance_name=None): + return make_dataset_urn_with_platform_instance( + "snowflake", table_name, instance_name + ) + + +def test_snowflake_shares_workunit_no_shares( + snowflake_databases: List[SnowflakeDatabase], +) -> None: + config = SnowflakeV2Config(account_id="abc12345", platform_instance="instance1") + + report = SnowflakeV2Report() + shares_handler = SnowflakeSharesHandler( + config, report, lambda x: make_snowflake_urn(x) + ) + + wus = list(shares_handler.get_shares_workunits(snowflake_databases)) + + assert len(wus) == 0 + + +def test_same_database_inbound_and_outbound_invalid_config() -> None: + with pytest.raises( + ValueError, + match="Same database can not be present as consumer in more than one share", + ): + SnowflakeV2Config( + account_id="abc12345", + platform_instance="instance1", + shares={ + "share1": SnowflakeShareConfig( + database="db1", + platform_instance="instance2", + consumers=[ + DatabaseId(database="db1", platform_instance="instance1") + ], + ), + "share2": SnowflakeShareConfig( + database="db1", + platform_instance="instance3", + consumers=[ + DatabaseId(database="db1", platform_instance="instance1") + ], + ), + }, + ) + + with pytest.raises( + ValueError, + match="Database included in a share can not be present as consumer in any share", + ): + SnowflakeV2Config( + account_id="abc12345", + platform_instance="instance1", + shares={ + "share1": SnowflakeShareConfig( + database="db1", + platform_instance="instance2", + consumers=[ + DatabaseId(database="db1", platform_instance="instance1") + ], + ), + "share2": SnowflakeShareConfig( + database="db1", + platform_instance="instance1", + consumers=[ + DatabaseId(database="db1", platform_instance="instance3") + ], + ), + }, + ) + + with pytest.raises( + ValueError, + match="Database included in a share can not be present as consumer in any share", + ): + SnowflakeV2Config( + 
account_id="abc12345", + platform_instance="instance1", + shares={ + "share2": SnowflakeShareConfig( + database="db1", + platform_instance="instance1", + consumers=[ + DatabaseId(database="db1", platform_instance="instance3") + ], + ), + "share1": SnowflakeShareConfig( + database="db1", + platform_instance="instance2", + consumers=[ + DatabaseId(database="db1", platform_instance="instance1") + ], + ), + }, + ) + + +def test_snowflake_shares_workunit_inbound_share( + snowflake_databases: List[SnowflakeDatabase], +) -> None: + config = SnowflakeV2Config( + account_id="abc12345", + platform_instance="instance1", + shares={ + "share1": SnowflakeShareConfig( + database="db1", + platform_instance="instance2", + consumers=[DatabaseId(database="db1", platform_instance="instance1")], + ) + }, + ) + + report = SnowflakeV2Report() + shares_handler = SnowflakeSharesHandler( + config, report, lambda x: make_snowflake_urn(x, "instance1") + ) + + wus = list(shares_handler.get_shares_workunits(snowflake_databases)) + + # 2 schemas - 2 tables and 1 view in each schema making total 6 datasets + # Hence 6 Sibling and 6 upstreamLineage aspects + assert len(wus) == 12 + upstream_lineage_aspect_entity_urns = set() + sibling_aspect_entity_urns = set() + + for wu in wus: + assert isinstance( + wu.metadata, (MetadataChangeProposal, MetadataChangeProposalWrapper) + ) + if wu.metadata.aspectName == "upstreamLineage": + upstream_aspect = wu.get_aspect_of_type(UpstreamLineage) + assert upstream_aspect is not None + assert len(upstream_aspect.upstreams) == 1 + assert upstream_aspect.upstreams[0].dataset == wu.get_urn().replace( + "instance1.db1", "instance2.db1" + ) + upstream_lineage_aspect_entity_urns.add(wu.get_urn()) + else: + siblings_aspect = wu.get_aspect_of_type(Siblings) + assert siblings_aspect is not None + assert len(siblings_aspect.siblings) == 1 + assert siblings_aspect.siblings == [ + wu.get_urn().replace("instance1.db1", "instance2.db1") + ] + sibling_aspect_entity_urns.add(wu.get_urn()) + + assert upstream_lineage_aspect_entity_urns == sibling_aspect_entity_urns + + +def test_snowflake_shares_workunit_outbound_share( + snowflake_databases: List[SnowflakeDatabase], +) -> None: + config = SnowflakeV2Config( + account_id="abc12345", + platform_instance="instance1", + shares={ + "share2": SnowflakeShareConfig( + database="db2", + platform_instance="instance1", + consumers=[ + DatabaseId( + database="db2_from_share", platform_instance="instance2" + ), + DatabaseId(database="db2", platform_instance="instance3"), + ], + ) + }, + ) + + report = SnowflakeV2Report() + shares_handler = SnowflakeSharesHandler( + config, report, lambda x: make_snowflake_urn(x, "instance1") + ) + + wus = list(shares_handler.get_shares_workunits(snowflake_databases)) + + # 2 schemas - 2 tables and 1 view in each schema making total 6 datasets + # Hence 6 Sibling aspects + assert len(wus) == 6 + entity_urns = set() + + for wu in wus: + siblings_aspect = wu.get_aspect_of_type(Siblings) + assert siblings_aspect is not None + assert len(siblings_aspect.siblings) == 2 + assert siblings_aspect.siblings == [ + wu.get_urn().replace("instance1.db2", "instance2.db2_from_share"), + wu.get_urn().replace("instance1.db2", "instance3.db2"), + ] + entity_urns.add(wu.get_urn()) + + assert len((entity_urns)) == 6 + + +def test_snowflake_shares_workunit_inbound_and_outbound_share( + snowflake_databases: List[SnowflakeDatabase], +) -> None: + config = SnowflakeV2Config( + account_id="abc12345", + platform_instance="instance1", + shares={ + "share1": 
SnowflakeShareConfig( + database="db1", + platform_instance="instance2", + consumers=[DatabaseId(database="db1", platform_instance="instance1")], + ), + "share2": SnowflakeShareConfig( + database="db2", + platform_instance="instance1", + consumers=[ + DatabaseId( + database="db2_from_share", platform_instance="instance2" + ), + DatabaseId(database="db2", platform_instance="instance3"), + ], + ), + }, + ) + + report = SnowflakeV2Report() + shares_handler = SnowflakeSharesHandler( + config, report, lambda x: make_snowflake_urn(x, "instance1") + ) + + wus = list(shares_handler.get_shares_workunits(snowflake_databases)) + + # 6 Sibling and 6 upstreamLineage aspects for db1 tables + # 6 Sibling aspects for db2 tables + assert len(wus) == 18 + + for wu in wus: + assert isinstance( + wu.metadata, (MetadataChangeProposal, MetadataChangeProposalWrapper) + ) + if wu.metadata.aspectName == "upstreamLineage": + upstream_aspect = wu.get_aspect_of_type(UpstreamLineage) + assert upstream_aspect is not None + assert len(upstream_aspect.upstreams) == 1 + assert upstream_aspect.upstreams[0].dataset == wu.get_urn().replace( + "instance1.db1", "instance2.db1" + ) + else: + siblings_aspect = wu.get_aspect_of_type(Siblings) + assert siblings_aspect is not None + if "db1" in wu.get_urn(): + assert len(siblings_aspect.siblings) == 1 + assert siblings_aspect.siblings == [ + wu.get_urn().replace("instance1.db1", "instance2.db1") + ] + else: + assert len(siblings_aspect.siblings) == 2 + assert siblings_aspect.siblings == [ + wu.get_urn().replace("instance1.db2", "instance2.db2_from_share"), + wu.get_urn().replace("instance1.db2", "instance3.db2"), + ] diff --git a/metadata-ingestion/tests/unit/test_sql_common.py b/metadata-ingestion/tests/unit/test_sql_common.py index db14b8f6de738..95af0e623e991 100644 --- a/metadata-ingestion/tests/unit/test_sql_common.py +++ b/metadata-ingestion/tests/unit/test_sql_common.py @@ -9,10 +9,10 @@ SQLAlchemySource, get_platform_from_sqlalchemy_uri, ) -from datahub.ingestion.source.sql.sql_config import SQLAlchemyConfig +from datahub.ingestion.source.sql.sql_config import SQLCommonConfig -class _TestSQLAlchemyConfig(SQLAlchemyConfig): +class _TestSQLAlchemyConfig(SQLCommonConfig): def get_sql_alchemy_url(self): pass @@ -22,7 +22,7 @@ class _TestSQLAlchemySource(SQLAlchemySource): def test_generate_foreign_key(): - config: SQLAlchemyConfig = _TestSQLAlchemyConfig() + config: SQLCommonConfig = _TestSQLAlchemyConfig() ctx: PipelineContext = PipelineContext(run_id="test_ctx") platform: str = "TEST" inspector: Inspector = Mock() @@ -49,7 +49,7 @@ def test_generate_foreign_key(): def test_use_source_schema_for_foreign_key_if_not_specified(): - config: SQLAlchemyConfig = _TestSQLAlchemyConfig() + config: SQLCommonConfig = _TestSQLAlchemyConfig() ctx: PipelineContext = PipelineContext(run_id="test_ctx") platform: str = "TEST" inspector: Inspector = Mock() diff --git a/metadata-ingestion/tests/unit/test_time_window_config.py b/metadata-ingestion/tests/unit/test_time_window_config.py new file mode 100644 index 0000000000000..847bda2511a0c --- /dev/null +++ b/metadata-ingestion/tests/unit/test_time_window_config.py @@ -0,0 +1,80 @@ +from datetime import datetime, timezone + +import pytest +from freezegun import freeze_time + +from datahub.configuration.time_window_config import BaseTimeWindowConfig + +FROZEN_TIME = "2023-08-03 09:00:00" +FROZEN_TIME2 = "2023-08-03 09:10:00" + + +@freeze_time(FROZEN_TIME) +def test_default_start_end_time(): + config = BaseTimeWindowConfig.parse_obj({}) + assert 
config.start_time == datetime(2023, 8, 2, 0, tzinfo=timezone.utc) + assert config.end_time == datetime(2023, 8, 3, 9, tzinfo=timezone.utc) + + +@freeze_time(FROZEN_TIME2) +def test_default_start_end_time_hour_bucket_duration(): + config = BaseTimeWindowConfig.parse_obj({"bucket_duration": "HOUR"}) + assert config.start_time == datetime(2023, 8, 3, 8, tzinfo=timezone.utc) + assert config.end_time == datetime(2023, 8, 3, 9, 10, tzinfo=timezone.utc) + + +@freeze_time(FROZEN_TIME) +def test_relative_start_time(): + config = BaseTimeWindowConfig.parse_obj({"start_time": "-2 days"}) + assert config.start_time == datetime(2023, 8, 1, 0, tzinfo=timezone.utc) + assert config.end_time == datetime(2023, 8, 3, 9, tzinfo=timezone.utc) + + config = BaseTimeWindowConfig.parse_obj({"start_time": "-2d"}) + assert config.start_time == datetime(2023, 8, 1, 0, tzinfo=timezone.utc) + assert config.end_time == datetime(2023, 8, 3, 9, tzinfo=timezone.utc) + + config = BaseTimeWindowConfig.parse_obj( + {"start_time": "-2 days", "end_time": "2023-07-07T09:00:00Z"} + ) + assert config.start_time == datetime(2023, 7, 5, 0, tzinfo=timezone.utc) + assert config.end_time == datetime(2023, 7, 7, 9, tzinfo=timezone.utc) + + config = BaseTimeWindowConfig.parse_obj( + {"start_time": "-2 days", "end_time": "2023-07-07T09:00:00Z"} + ) + assert config.start_time == datetime(2023, 7, 5, 0, tzinfo=timezone.utc) + assert config.end_time == datetime(2023, 7, 7, 9, tzinfo=timezone.utc) + + +@freeze_time(FROZEN_TIME) +def test_absolute_start_time(): + config = BaseTimeWindowConfig.parse_obj({"start_time": "2023-07-01T00:00:00Z"}) + assert config.start_time == datetime(2023, 7, 1, 0, tzinfo=timezone.utc) + assert config.end_time == datetime(2023, 8, 3, 9, tzinfo=timezone.utc) + + config = BaseTimeWindowConfig.parse_obj({"start_time": "2023-07-01T09:00:00Z"}) + assert config.start_time == datetime(2023, 7, 1, 9, tzinfo=timezone.utc) + assert config.end_time == datetime(2023, 8, 3, 9, tzinfo=timezone.utc) + + +@freeze_time(FROZEN_TIME) +def test_invalid_relative_start_time(): + with pytest.raises(ValueError, match="Unknown string format"): + BaseTimeWindowConfig.parse_obj({"start_time": "-2 das"}) + + with pytest.raises( + ValueError, + match="Relative start time should be in terms of configured bucket duration", + ): + BaseTimeWindowConfig.parse_obj({"start_time": "-2"}) + + with pytest.raises( + ValueError, match="Relative start time should start with minus sign" + ): + BaseTimeWindowConfig.parse_obj({"start_time": "2d"}) + + with pytest.raises( + ValueError, + match="Relative start time should be in terms of configured bucket duration", + ): + BaseTimeWindowConfig.parse_obj({"start_time": "-2m"}) diff --git a/metadata-ingestion/tests/unit/test_usage_common.py b/metadata-ingestion/tests/unit/test_usage_common.py index 8c9c25593afa8..1e2b2b6999177 100644 --- a/metadata-ingestion/tests/unit/test_usage_common.py +++ b/metadata-ingestion/tests/unit/test_usage_common.py @@ -1,6 +1,5 @@ import time from datetime import datetime -from unittest import mock import pytest from freezegun import freeze_time @@ -12,6 +11,7 @@ from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.usage.usage_common import ( + DEFAULT_QUERIES_CHARACTER_LIMIT, BaseUsageConfig, GenericAggregatedDataset, convert_usage_aggregation_class, @@ -183,6 +183,7 @@ def test_make_usage_workunit(): top_n_queries=10, format_sql_queries=False, include_top_n_queries=True, + 
queries_character_limit=DEFAULT_QUERIES_CHARACTER_LIMIT, ) ts_timestamp = int(floored_ts.timestamp() * 1000) @@ -218,6 +219,7 @@ def test_query_formatting(): top_n_queries=10, format_sql_queries=True, include_top_n_queries=True, + queries_character_limit=DEFAULT_QUERIES_CHARACTER_LIMIT, ) ts_timestamp = int(floored_ts.timestamp() * 1000) assert ( @@ -234,7 +236,7 @@ def test_query_trimming(): test_email: str = "test_email@test.com" test_query: str = "select * from test where a > 10 and b > 20 order by a asc" top_n_queries: int = 10 - total_budget_for_query_list: int = 200 + queries_character_limit: int = 200 event_time = datetime(2020, 1, 1) floored_ts = get_time_bucket(event_time, BucketDuration.DAY) resource = "test_db.test_schema.test_table" @@ -251,7 +253,7 @@ def test_query_trimming(): top_n_queries=top_n_queries, format_sql_queries=False, include_top_n_queries=True, - total_budget_for_query_list=total_budget_for_query_list, + queries_character_limit=queries_character_limit, ) ts_timestamp = int(floored_ts.timestamp() * 1000) @@ -267,11 +269,7 @@ def test_query_trimming(): def test_top_n_queries_validator_fails(): with pytest.raises(ValidationError) as excinfo: - with mock.patch( - "datahub.ingestion.source.usage.usage_common.TOTAL_BUDGET_FOR_QUERY_LIST", - 20, - ): - BaseUsageConfig(top_n_queries=2) + BaseUsageConfig(top_n_queries=2, queries_character_limit=20) assert "top_n_queries is set to 2 but it can be maximum 1" in str(excinfo.value) @@ -294,6 +292,7 @@ def test_make_usage_workunit_include_top_n_queries(): top_n_queries=10, format_sql_queries=False, include_top_n_queries=False, + queries_character_limit=DEFAULT_QUERIES_CHARACTER_LIMIT, ) ts_timestamp = int(floored_ts.timestamp() * 1000) diff --git a/metadata-integration/java/datahub-client/build.gradle b/metadata-integration/java/datahub-client/build.gradle index 025273fc9263e..e304bb5329c62 100644 --- a/metadata-integration/java/datahub-client/build.gradle +++ b/metadata-integration/java/datahub-client/build.gradle @@ -26,12 +26,11 @@ tasks.withType(Test).configureEach { } dependencies { - implementation project(':metadata-models') implementation(externalDependency.kafkaAvroSerializer) { exclude group: "org.apache.avro" } - compile externalDependency.avro_1_7 + implementation externalDependency.avro_1_7 constraints { implementation('commons-collections:commons-collections:3.2.2') { because 'Vulnerability Issue' @@ -48,12 +47,14 @@ dependencies { implementation externalDependency.slf4jApi compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok - testCompile externalDependency.mockito - testCompile externalDependency.mockServer - testCompile externalDependency.mockServerClient - testCompile externalDependency.testContainers - testCompile externalDependency.httpAsyncClient - testRuntime externalDependency.logbackClassic + // VisibleForTesting + compileOnly externalDependency.guava + testImplementation externalDependency.mockito + testImplementation externalDependency.mockServer + testImplementation externalDependency.mockServerClient + testImplementation externalDependency.testContainers + testImplementation externalDependency.httpAsyncClient + testRuntimeOnly externalDependency.logbackClassic swaggerCodegen 'io.swagger.codegen.v3:swagger-codegen-cli:3.0.33' } @@ -139,6 +140,7 @@ checkShadowJar { assemble { dependsOn shadowJar } +compileTestJava.dependsOn shadowJar task sourcesJar(type: Jar) { archiveClassifier = 'sources' @@ -231,7 +233,12 @@ tasks.register('generateOpenApiPojos', 
GenerateSwaggerCode) { } compileJava.dependsOn generateOpenApiPojos +processResources.dependsOn generateOpenApiPojos sourceSets.main.java.srcDir "${generateOpenApiPojos.outputDir}/src/main/java" sourceSets.main.resources.srcDir "${generateOpenApiPojos.outputDir}/src/main/resources" checkstyleMain.exclude '**/generated/**' + +clean { + project.delete("$projectDir/generated") +} \ No newline at end of file diff --git a/metadata-integration/java/datahub-protobuf/build.gradle b/metadata-integration/java/datahub-protobuf/build.gradle index fa33e6baab534..bc919119f8fac 100644 --- a/metadata-integration/java/datahub-protobuf/build.gradle +++ b/metadata-integration/java/datahub-protobuf/build.gradle @@ -30,6 +30,7 @@ dependencies { implementation project(':metadata-models') implementation project(path: ':metadata-integration:java:datahub-client', configuration: 'shadow') + implementation externalDependency.guava implementation externalDependency.protobuf implementation externalDependency.jgrapht implementation externalDependency.gson diff --git a/metadata-integration/java/examples/build.gradle b/metadata-integration/java/examples/build.gradle index b9e8e253dc359..581e9f82da0dc 100644 --- a/metadata-integration/java/examples/build.gradle +++ b/metadata-integration/java/examples/build.gradle @@ -24,7 +24,7 @@ dependencies { implementation project(path: ':li-utils') implementation project(path: ':metadata-models') - compile project(path: ':metadata-integration:java:datahub-client', configuration: 'shadow') + implementation project(path: ':metadata-integration:java:datahub-client', configuration: 'shadow') implementation externalDependency.httpAsyncClient // Tests need a concrete log4j available. Providing it here diff --git a/metadata-integration/java/spark-lineage/build.gradle b/metadata-integration/java/spark-lineage/build.gradle index 7257cf0aabc35..7143ac4833143 100644 --- a/metadata-integration/java/spark-lineage/build.gradle +++ b/metadata-integration/java/spark-lineage/build.gradle @@ -145,7 +145,7 @@ assemble { dependsOn shadowJar } -task integrationTest(type: Exec, dependsOn: [shadowJar, ':docker:quickstart'] ) { +task integrationTest(type: Exec, dependsOn: [shadowJar, ':docker:quickstartSlim'] ) { environment "RUN_QUICKSTART", "false" commandLine "spark-smoke-test/smoke.sh" } diff --git a/metadata-io/build.gradle b/metadata-io/build.gradle index ae56cd4cb8a96..e8ef0b3d6819d 100644 --- a/metadata-io/build.gradle +++ b/metadata-io/build.gradle @@ -1,4 +1,4 @@ -apply plugin: 'java' +apply plugin: 'java-library' apply plugin: 'org.hidetake.swagger.generator' configurations { @@ -6,64 +6,69 @@ configurations { } dependencies { + implementation project(':entity-registry') + api project(':metadata-utils') + api project(':metadata-events:mxe-avro-1.7') + api project(':metadata-events:mxe-registration') + api project(':metadata-events:mxe-utils-avro-1.7') + api project(':metadata-models') + api project(':metadata-service:restli-client') + api project(':metadata-service:configuration') + api project(':metadata-service:services') + + implementation spec.product.pegasus.data + implementation spec.product.pegasus.generator + + implementation externalDependency.guava + implementation externalDependency.reflections implementation externalDependency.jsonPatch - compile project(':entity-registry') - compile project(':metadata-utils') - compile project(':metadata-events:mxe-avro-1.7') - compile project(':metadata-events:mxe-registration') - compile project(':metadata-events:mxe-utils-avro-1.7') - compile 
project(':metadata-models') - compile project(':metadata-service:restli-client') - compile project(':metadata-service:configuration') - compile project(':metadata-service:services') - - compile spec.product.pegasus.data - compile spec.product.pegasus.generator - - compile externalDependency.dgraph4j exclude group: 'com.google.guava', module: 'guava' + api externalDependency.dgraph4j exclude group: 'com.google.guava', module: 'guava' implementation externalDependency.slf4jApi - testImplementation project(':metadata-integration:java:datahub-client') - runtime externalDependency.logbackClassic + runtimeOnly externalDependency.logbackClassic compileOnly externalDependency.lombok implementation externalDependency.commonsCollections - compile externalDependency.datastaxOssNativeProtocol - compile externalDependency.datastaxOssCore - compile externalDependency.datastaxOssQueryBuilder - compile externalDependency.elasticSearchRest - compile externalDependency.elasticSearchTransport - compile externalDependency.javatuples - compile externalDependency.javaxValidation + api externalDependency.datastaxOssNativeProtocol + api externalDependency.datastaxOssCore + api externalDependency.datastaxOssQueryBuilder + api externalDependency.elasticSearchRest + api externalDependency.elasticSearchTransport + implementation externalDependency.javatuples + api externalDependency.javaxValidation runtimeOnly externalDependency.jna - compile externalDependency.kafkaClients - compile externalDependency.ebean + api externalDependency.kafkaClients + api externalDependency.ebean enhance externalDependency.ebeanAgent - compile externalDependency.opentelemetryAnnotations - compile externalDependency.resilience4j - compile externalDependency.springContext - compile externalDependency.swaggerAnnotations + implementation externalDependency.opentelemetryAnnotations + implementation externalDependency.resilience4j + api externalDependency.springContext + implementation externalDependency.swaggerAnnotations swaggerCodegen 'io.swagger.codegen.v3:swagger-codegen-cli:3.0.33' - compile (externalDependency.mixpanel) { + implementation(externalDependency.mixpanel) { exclude group: 'org.json', module: 'json' } annotationProcessor externalDependency.lombok - testCompile externalDependency.testng - testCompile externalDependency.h2 - testCompile externalDependency.mysqlConnector - testCompile externalDependency.neo4jHarness - testCompile externalDependency.mockito - testCompile externalDependency.mockitoInline - testCompile externalDependency.iStackCommons - testCompile externalDependency.resilience4j - testCompile externalDependency.testContainers - testCompile externalDependency.testContainersJunit - testCompile externalDependency.testContainersElasticsearch - testCompile externalDependency.testContainersCassandra - testCompile externalDependency.lombok - testCompile project(':test-models') - testImplementation externalDependency.springBootTest + testImplementation project(':test-models') + testImplementation project(path: ':test-models', configuration: 'testDataTemplate') testImplementation project(':datahub-graphql-core') + testImplementation project(path: ':metadata-integration:java:datahub-client', configuration: 'shadow') + testImplementation externalDependency.testng + testImplementation externalDependency.h2 + testImplementation externalDependency.mysqlConnector + testImplementation externalDependency.neo4jHarness + testImplementation externalDependency.mockito + testImplementation externalDependency.mockitoInline + 
testImplementation externalDependency.iStackCommons + testImplementation externalDependency.resilience4j + testImplementation externalDependency.testContainers + testImplementation externalDependency.testContainersJunit + testImplementation externalDependency.testContainersElasticsearch + testImplementation externalDependency.testContainersCassandra + testImplementation externalDependency.lombok + testImplementation externalDependency.springBootTest + testImplementation spec.product.pegasus.restliServer + // logback >=1.3 required due to `testcontainers` only testImplementation 'ch.qos.logback:logback-classic:1.4.7' @@ -88,6 +93,9 @@ dependencies { implementation(externalDependency.jettison) { because("previous versions are vulnerable") } + implementation(externalDependency.snappy) { + because("previous versions are vulnerable to CVE-2023-34453 through CVE-2023-34455") + } } } @@ -132,6 +140,7 @@ tasks.register('generateOpenApiPojos', GenerateSwaggerCode) { } compileJava.dependsOn generateOpenApiPojos +processResources.dependsOn generateOpenApiPojos sourceSets.main.java.srcDir "${generateOpenApiPojos.outputDir}/src/main/java" sourceSets.main.resources.srcDir "${generateOpenApiPojos.outputDir}/src/main/resources" diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/LineageSearchService.java b/metadata-io/src/main/java/com/linkedin/metadata/search/LineageSearchService.java index c561ddd38b919..9b8e9bce7e670 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/LineageSearchService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/LineageSearchService.java @@ -598,7 +598,9 @@ public LineageScrollResult scrollAcrossLineage(@Nonnull Urn sourceUrn, @Nonnull List lineageRelationships = filterRelationships(lineageResult, new HashSet<>(entities), inputFilters); - return getScrollResultInBatches(lineageRelationships, input != null ? input : "*", inputFilters, sortCriterion, + Filter reducedFilters = + SearchUtils.removeCriteria(inputFilters, criterion -> criterion.getField().equals(DEGREE_FILTER_INPUT)); + return getScrollResultInBatches(lineageRelationships, input != null ? 
input : "*", reducedFilters, sortCriterion, scrollId, keepAlive, size, searchFlags); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java index 555acb2ffdd3b..4bbff3915aca9 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java @@ -6,6 +6,7 @@ import com.linkedin.metadata.models.SearchableFieldSpec; import com.linkedin.metadata.models.annotation.SearchableAnnotation.FieldType; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -42,6 +43,13 @@ public static Map getPartialNgramConfigWithOverrides(Map getMappingsForField(@Nonnull final Searchable mappingForField.put(NORMALIZER, KEYWORD_NORMALIZER); // Add keyword subfield without lowercase filter mappingForField.put(FIELDS, ImmutableMap.of(KEYWORD, KEYWORD_TYPE_MAP)); - } else if (fieldType == FieldType.TEXT || fieldType == FieldType.TEXT_PARTIAL) { + } else if (fieldType == FieldType.TEXT || fieldType == FieldType.TEXT_PARTIAL || fieldType == FieldType.WORD_GRAM) { mappingForField.put(TYPE, KEYWORD); mappingForField.put(NORMALIZER, KEYWORD_NORMALIZER); Map subFields = new HashMap<>(); - if (fieldType == FieldType.TEXT_PARTIAL) { + if (fieldType == FieldType.TEXT_PARTIAL || fieldType == FieldType.WORD_GRAM) { subFields.put(NGRAM, getPartialNgramConfigWithOverrides( ImmutableMap.of( ANALYZER, PARTIAL_ANALYZER ) )); + if (fieldType == FieldType.WORD_GRAM) { + for (Map.Entry entry : Map.of( + WORD_GRAMS_LENGTH_2, WORD_GRAM_2_ANALYZER, + WORD_GRAMS_LENGTH_3, WORD_GRAM_3_ANALYZER, + WORD_GRAMS_LENGTH_4, WORD_GRAM_4_ANALYZER).entrySet()) { + String fieldName = entry.getKey(); + String analyzerName = entry.getValue(); + subFields.put(fieldName, ImmutableMap.of( + TYPE, TEXT, + ANALYZER, analyzerName, + SEARCH_ANALYZER, analyzerName + )); + } + } } subFields.put(DELIMITED, ImmutableMap.of( TYPE, TEXT, @@ -163,6 +185,7 @@ private static Map getMappingsForField(@Nonnull final Searchable searchableFieldSpec.getSearchableAnnotation() .getNumValuesFieldName() .ifPresent(fieldName -> mappings.put(fieldName, ImmutableMap.of(TYPE, LONG))); + mappings.putAll(getMappingsForFieldNameAliases(searchableFieldSpec)); return mappings; } @@ -172,4 +195,16 @@ private static Map getMappingsForSearchScoreField( return ImmutableMap.of(searchScoreFieldSpec.getSearchScoreAnnotation().getFieldName(), ImmutableMap.of(TYPE, DOUBLE)); } + + private static Map getMappingsForFieldNameAliases(@Nonnull final SearchableFieldSpec searchableFieldSpec) { + Map mappings = new HashMap<>(); + List fieldNameAliases = searchableFieldSpec.getSearchableAnnotation().getFieldNameAliases(); + fieldNameAliases.forEach(alias -> { + Map aliasMappings = new HashMap<>(); + aliasMappings.put(TYPE, ALIAS); + aliasMappings.put(PATH, searchableFieldSpec.getSearchableAnnotation().getFieldName()); + mappings.put(alias, aliasMappings); + }); + return mappings; + } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java index 5b3e396837aa7..e180c8296b48d 100644 --- 
a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java @@ -66,6 +66,9 @@ public class SettingsBuilder { public static final String KEYWORD_ANALYZER = "keyword"; public static final String URN_ANALYZER = "urn_component"; public static final String URN_SEARCH_ANALYZER = "query_urn_component"; + public static final String WORD_GRAM_2_ANALYZER = "word_gram_2"; + public static final String WORD_GRAM_3_ANALYZER = "word_gram_3"; + public static final String WORD_GRAM_4_ANALYZER = "word_gram_4"; // Filters public static final String ALPHANUM_SPACE_ONLY = "alpha_num_space"; @@ -80,6 +83,10 @@ public class SettingsBuilder { public static final String MULTIFILTER = "multifilter"; public static final String MULTIFILTER_GRAPH = "multifilter_graph"; public static final String PARTIAL_URN_COMPONENT = "partial_urn_component"; + public static final String SHINGLE = "shingle"; + public static final String WORD_GRAM_2_FILTER = "word_gram_2_filter"; + public static final String WORD_GRAM_3_FILTER = "word_gram_3_filter"; + public static final String WORD_GRAM_4_FILTER = "word_gram_4_filter"; public static final String SNOWBALL = "snowball"; public static final String STEM_OVERRIDE = "stem_override"; public static final String STOP = "stop"; @@ -108,6 +115,7 @@ public class SettingsBuilder { public static final String SLASH_TOKENIZER = "slash_tokenizer"; public static final String UNIT_SEPARATOR_PATH_TOKENIZER = "unit_separator_path_tokenizer"; public static final String UNIT_SEPARATOR_TOKENIZER = "unit_separator_tokenizer"; + public static final String WORD_GRAM_TOKENIZER = "word_gram_tokenizer"; // Do not remove the space, needed for multi-term synonyms public static final List ALPHANUM_SPACE_PATTERNS = ImmutableList.of( "([a-z0-9 _-]{2,})", @@ -161,6 +169,13 @@ public class SettingsBuilder { AUTOCOMPLETE_CUSTOM_DELIMITER, LOWERCASE); + public static final List WORD_GRAM_TOKEN_FILTERS = ImmutableList.of( + ASCII_FOLDING, + LOWERCASE, + TRIM, + REMOVE_QUOTES + ); + public final Map settings; public SettingsBuilder(String mainTokenizer) { @@ -275,6 +290,17 @@ private static Map buildFilters() throws IOException { .collect(Collectors.toList())) .build()); } + + for (Map.Entry entry : Map.of(WORD_GRAM_2_FILTER, 2, WORD_GRAM_3_FILTER, 3, WORD_GRAM_4_FILTER, 4).entrySet()) { + String filterName = entry.getKey(); + Integer gramSize = entry.getValue(); + filters.put(filterName, ImmutableMap.builder() + .put(TYPE, SHINGLE) + .put("min_shingle_size", gramSize) + .put("max_shingle_size", gramSize) + .put("output_unigrams", false) + .build()); + } } return filters.build(); @@ -302,13 +328,24 @@ private static Map buildTokenizers() { .put(DELIMITER, "␟") .build()); - // Tokenize by whitespace and most special chars + // Tokenize by most special chars + // Do NOT tokenize by whitespace to keep multi-word synonyms in the same token + // The split by whitespace is done later in the token filters phase tokenizers.put(MAIN_TOKENIZER, ImmutableMap.builder() .put(TYPE, PATTERN) .put(PATTERN, "[(),./:]") .build()); + // Tokenize by whitespace and most special chars for wordgrams + // only split on - when not preceded by a whitespace to preserve exclusion functionality + // i.e. 
"logging-events-bkcp" and "logging-events -bckp" should be handled differently + tokenizers.put(WORD_GRAM_TOKENIZER, + ImmutableMap.builder() + .put(TYPE, PATTERN) + .put(PATTERN, "[(),./:\\s_]|(?<=\\S)(-)") + .build()); + return tokenizers.build(); } @@ -382,6 +419,21 @@ private static Map buildAnalyzers(String mainTokenizer) { .put(FILTER, SEARCH_TOKEN_FILTERS) .build()); + // Support word grams + for (Map.Entry entry : Map.of( + WORD_GRAM_2_ANALYZER, WORD_GRAM_2_FILTER, + WORD_GRAM_3_ANALYZER, WORD_GRAM_3_FILTER, + WORD_GRAM_4_ANALYZER, WORD_GRAM_4_FILTER).entrySet()) { + String analyzerName = entry.getKey(); + String filterName = entry.getValue(); + analyzers.put(analyzerName, ImmutableMap.builder() + .put(TOKENIZER, WORD_GRAM_TOKENIZER) + .put(FILTER, ImmutableList.builder() + .addAll(WORD_GRAM_TOKEN_FILTERS) + .add(filterName).build()) + .build()); + } + // For special analysis, the substitution can be read from the configuration (chinese tokenizer: ik_smart / smartCN) // Analyzer for partial matching (i.e. autocomplete) - Prefix matching of each token analyzers.put(PARTIAL_ANALYZER, ImmutableMap.builder() @@ -395,6 +447,7 @@ private static Map buildAnalyzers(String mainTokenizer) { .put(FILTER, PARTIAL_AUTOCOMPLETE_TOKEN_FILTERS) .build()); + return analyzers.build(); } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchFieldConfig.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchFieldConfig.java index fb7e19a5d67bc..a75ed40ffca52 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchFieldConfig.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchFieldConfig.java @@ -11,11 +11,8 @@ import java.util.Set; -import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.BROWSE_PATH_HIERARCHY_ANALYZER; -import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.BROWSE_PATH_V2_HIERARCHY_ANALYZER; -import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.KEYWORD_ANALYZER; -import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.TEXT_SEARCH_ANALYZER; -import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.URN_SEARCH_ANALYZER; +import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.*; + @Builder @Getter @@ -33,7 +30,8 @@ public class SearchFieldConfig { private static final Set TYPES_WITH_DELIMITED_SUBFIELD = Set.of( SearchableAnnotation.FieldType.TEXT, - SearchableAnnotation.FieldType.TEXT_PARTIAL + SearchableAnnotation.FieldType.TEXT_PARTIAL, + SearchableAnnotation.FieldType.WORD_GRAM // NOT URN_PARTIAL (urn field is special) ); // NOT comprehensive @@ -56,6 +54,7 @@ public class SearchFieldConfig { SearchableAnnotation.FieldType.TEXT, SearchableAnnotation.FieldType.TEXT_PARTIAL, SearchableAnnotation.FieldType.KEYWORD, + SearchableAnnotation.FieldType.WORD_GRAM, // not analyzed SearchableAnnotation.FieldType.BOOLEAN, SearchableAnnotation.FieldType.COUNT, @@ -69,6 +68,11 @@ public class SearchFieldConfig { SearchableAnnotation.FieldType.URN_PARTIAL ); + public static final Set TYPES_WITH_WORD_GRAM = + Set.of( + SearchableAnnotation.FieldType.WORD_GRAM + ); + @Nonnull private final String fieldName; @Nonnull @@ -78,9 +82,11 @@ public class SearchFieldConfig { private final String analyzer; private boolean hasKeywordSubfield; 
private boolean hasDelimitedSubfield; + private boolean hasWordGramSubfields; private boolean isQueryByDefault; private boolean isDelimitedSubfield; private boolean isKeywordSubfield; + private boolean isWordGramSubfield; public static SearchFieldConfig detectSubFieldType(@Nonnull SearchableFieldSpec fieldSpec) { final SearchableAnnotation searchableAnnotation = fieldSpec.getSearchableAnnotation(); @@ -106,6 +112,7 @@ public static SearchFieldConfig detectSubFieldType(String fieldName, .analyzer(getAnalyzer(fieldName, fieldType)) .hasKeywordSubfield(hasKeywordSubfield(fieldName, fieldType)) .hasDelimitedSubfield(hasDelimitedSubfield(fieldName, fieldType)) + .hasWordGramSubfields(hasWordGramSubfields(fieldName, fieldType)) .isQueryByDefault(isQueryByDefault) .build(); } @@ -118,6 +125,11 @@ private static boolean hasDelimitedSubfield(String fieldName, SearchableAnnotati return !fieldName.contains(".") && ("urn".equals(fieldName) || TYPES_WITH_DELIMITED_SUBFIELD.contains(fieldType)); } + + private static boolean hasWordGramSubfields(String fieldName, SearchableAnnotation.FieldType fieldType) { + return !fieldName.contains(".") + && (TYPES_WITH_WORD_GRAM.contains(fieldType)); + } private static boolean hasKeywordSubfield(String fieldName, SearchableAnnotation.FieldType fieldType) { return !"urn".equals(fieldName) && !fieldName.contains(".") @@ -155,6 +167,7 @@ public SearchFieldConfigBuilder fieldName(@Nonnull String fieldName) { this.fieldName = fieldName; isDelimitedSubfield(fieldName.endsWith(".delimited")); isKeywordSubfield(fieldName.endsWith(".keyword")); + isWordGramSubfield(fieldName.contains("wordGrams")); shortName(fieldName.split("[.]")[0]); return this; } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java index 289c6f1f84e32..a00882cfde240 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java @@ -3,6 +3,7 @@ import com.linkedin.metadata.config.search.ExactMatchConfiguration; import com.linkedin.metadata.config.search.PartialConfiguration; import com.linkedin.metadata.config.search.SearchConfiguration; +import com.linkedin.metadata.config.search.WordGramConfiguration; import com.linkedin.metadata.config.search.custom.BoolQueryConfiguration; import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration; import com.linkedin.metadata.config.search.custom.QueryConfiguration; @@ -11,6 +12,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.linkedin.metadata.Constants; import com.linkedin.metadata.models.EntitySpec; +import com.linkedin.metadata.models.SearchScoreFieldSpec; import com.linkedin.metadata.models.SearchableFieldSpec; import com.linkedin.metadata.models.annotation.SearchScoreAnnotation; import com.linkedin.metadata.models.annotation.SearchableAnnotation; @@ -51,6 +53,9 @@ import org.elasticsearch.search.SearchModule; import static com.linkedin.metadata.models.SearchableFieldSpecExtractor.PRIMARY_URN_SEARCH_PROPERTIES; +import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.*; +import static com.linkedin.metadata.search.elasticsearch.query.request.SearchFieldConfig.*; + @Slf4j public class SearchQueryBuilder { @@ -69,6 +74,7 @@ public class SearchQueryBuilder { 
public static final String STRUCTURED_QUERY_PREFIX = "\\\\/q "; private final ExactMatchConfiguration exactMatchConfiguration; private final PartialConfiguration partialConfiguration; + private final WordGramConfiguration wordGramConfiguration; private final CustomizedQueryHandler customizedQueryHandler; @@ -76,6 +82,7 @@ public SearchQueryBuilder(@Nonnull SearchConfiguration searchConfiguration, @Nullable CustomSearchConfiguration customSearchConfiguration) { this.exactMatchConfiguration = searchConfiguration.getExactMatch(); this.partialConfiguration = searchConfiguration.getPartial(); + this.wordGramConfiguration = searchConfiguration.getWordGram(); this.customizedQueryHandler = CustomizedQueryHandler.builder(customSearchConfiguration).build(); } @@ -148,6 +155,36 @@ private Set getStandardFields(@Nonnull EntitySpec entitySpec) fields.add(SearchFieldConfig.detectSubFieldType(searchFieldConfig.fieldName() + ".delimited", searchFieldConfig.boost() * partialConfiguration.getFactor(), searchableAnnotation.getFieldType(), searchableAnnotation.isQueryByDefault())); + + if (SearchFieldConfig.detectSubFieldType(fieldSpec).hasWordGramSubfields()) { + fields.add(SearchFieldConfig.builder() + .fieldName(searchFieldConfig.fieldName() + ".wordGrams2") + .boost(searchFieldConfig.boost() * wordGramConfiguration.getTwoGramFactor()) + .analyzer(WORD_GRAM_2_ANALYZER) + .hasKeywordSubfield(true) + .hasDelimitedSubfield(true) + .hasWordGramSubfields(true) + .isQueryByDefault(true) + .build()); + fields.add(SearchFieldConfig.builder() + .fieldName(searchFieldConfig.fieldName() + ".wordGrams3") + .boost(searchFieldConfig.boost() * wordGramConfiguration.getThreeGramFactor()) + .analyzer(WORD_GRAM_3_ANALYZER) + .hasKeywordSubfield(true) + .hasDelimitedSubfield(true) + .hasWordGramSubfields(true) + .isQueryByDefault(true) + .build()); + fields.add(SearchFieldConfig.builder() + .fieldName(searchFieldConfig.fieldName() + ".wordGrams4") + .boost(searchFieldConfig.boost() * wordGramConfiguration.getFourGramFactor()) + .analyzer(WORD_GRAM_4_ANALYZER) + .hasKeywordSubfield(true) + .hasDelimitedSubfield(true) + .hasWordGramSubfields(true) + .isQueryByDefault(true) + .build()); + } } } @@ -188,7 +225,7 @@ private Optional getSimpleQuery(@Nullable QueryConfiguration custo .filter(SearchFieldConfig::isQueryByDefault) .collect(Collectors.groupingBy(SearchFieldConfig::analyzer)); - analyzerGroup.keySet().stream().sorted().forEach(analyzer -> { + analyzerGroup.keySet().stream().sorted().filter(str -> !str.contains("word_gram")).forEach(analyzer -> { List fieldConfigs = analyzerGroup.get(analyzer); SimpleQueryStringBuilder simpleBuilder = QueryBuilders.simpleQueryStringQuery(sanitizedQuery); simpleBuilder.analyzer(analyzer); @@ -253,6 +290,13 @@ private Optional getPrefixAndExactMatchQuery(@Nullable QueryConfig * exactMatchConfiguration.getCaseSensitivityFactor()) .queryName(searchFieldConfig.fieldName())); } + + if (searchFieldConfig.isWordGramSubfield() && isPrefixQuery) { + finalQuery.should(QueryBuilders + .matchPhraseQuery(ESUtils.toKeywordField(searchFieldConfig.fieldName(), false), unquotedQuery) + .boost(searchFieldConfig.boost() * getWordGramFactor(searchFieldConfig.fieldName())) + .queryName(searchFieldConfig.shortName())); + } }); return finalQuery.should().size() > 0 ? 
Optional.of(finalQuery) : Optional.empty(); @@ -279,23 +323,31 @@ private static FunctionScoreQueryBuilder.FilterFunctionBuilder[] buildAnnotation finalScoreFunctions.add( new FunctionScoreQueryBuilder.FilterFunctionBuilder(ScoreFunctionBuilders.weightFactorFunction(1.0f))); - entitySpecs.stream() + Map annotations = entitySpecs.stream() .map(EntitySpec::getSearchableFieldSpecs) .flatMap(List::stream) .map(SearchableFieldSpec::getSearchableAnnotation) - .flatMap(annotation -> annotation - .getWeightsPerFieldValue() - .entrySet() - .stream() - .map(entry -> buildWeightFactorFunction(annotation.getFieldName(), entry.getKey(), - entry.getValue()))) - .forEach(finalScoreFunctions::add); + .collect(Collectors.toMap(SearchableAnnotation::getFieldName, annotation -> annotation, (annotation1, annotation2) -> annotation1)); + + for (Map.Entry annotationEntry : annotations.entrySet()) { + SearchableAnnotation annotation = annotationEntry.getValue(); + annotation + .getWeightsPerFieldValue() + .entrySet() + .stream() + .map(entry -> buildWeightFactorFunction(annotation.getFieldName(), entry.getKey(), + entry.getValue())).forEach(finalScoreFunctions::add); + } - entitySpecs.stream() + Map searchScoreAnnotationMap = entitySpecs.stream() .map(EntitySpec::getSearchScoreFieldSpecs) .flatMap(List::stream) - .map(fieldSpec -> buildScoreFunctionFromSearchScoreAnnotation(fieldSpec.getSearchScoreAnnotation())) - .forEach(finalScoreFunctions::add); + .map(SearchScoreFieldSpec::getSearchScoreAnnotation) + .collect(Collectors.toMap(SearchScoreAnnotation::getFieldName, annotation -> annotation, (annotation1, annotation2) -> annotation1)); + for (Map.Entry searchScoreAnnotationEntry : searchScoreAnnotationMap.entrySet()) { + SearchScoreAnnotation annotation = searchScoreAnnotationEntry.getValue(); + finalScoreFunctions.add(buildScoreFunctionFromSearchScoreAnnotation(annotation)); + } return finalScoreFunctions.toArray(new FunctionScoreQueryBuilder.FilterFunctionBuilder[0]); } @@ -377,4 +429,15 @@ private FunctionScoreQueryBuilder toFunctionScoreQueryBuilder(QueryBuilder query throw new RuntimeException(e); } } + + public float getWordGramFactor(String fieldName) { + if (fieldName.endsWith("Grams2")) { + return wordGramConfiguration.getTwoGramFactor(); + } else if (fieldName.endsWith("Grams3")) { + return wordGramConfiguration.getThreeGramFactor(); + } else if (fieldName.endsWith("Grams4")) { + return wordGramConfiguration.getFourGramFactor(); + } + throw new IllegalArgumentException(fieldName + " does not end with Grams[2-4]"); + } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java index bd1e6037ec0c5..5973f77da28aa 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java @@ -28,6 +28,8 @@ import com.linkedin.metadata.search.SearchEntityArray; import com.linkedin.metadata.search.SearchResult; import com.linkedin.metadata.search.SearchResultMetadata; +import com.linkedin.metadata.search.SearchSuggestion; +import com.linkedin.metadata.search.SearchSuggestionArray; import com.linkedin.metadata.search.features.Features; import com.linkedin.metadata.search.utils.ESUtils; import com.linkedin.metadata.utils.SearchUtil; @@ -68,7 +70,9 @@ import 
org.elasticsearch.search.builder.SearchSourceBuilder; import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder; import org.elasticsearch.search.fetch.subphase.highlight.HighlightField; +import org.elasticsearch.search.suggest.term.TermSuggestion; +import static com.linkedin.metadata.search.utils.ESUtils.NAME_SUGGESTION; import static com.linkedin.metadata.search.utils.ESUtils.toFacetField; import static com.linkedin.metadata.search.utils.SearchUtils.applyDefaultSearchFlags; import static com.linkedin.metadata.utils.SearchUtil.*; @@ -199,6 +203,11 @@ public SearchRequest getSearchRequest(@Nonnull String input, @Nullable Filter fi searchSourceBuilder.highlighter(_highlights); } ESUtils.buildSortOrder(searchSourceBuilder, sortCriterion); + + if (finalSearchFlags.isGetSuggestions()) { + ESUtils.buildNameSuggestions(searchSourceBuilder, input); + } + searchRequest.source(searchSourceBuilder); log.debug("Search request is: " + searchRequest.toString()); @@ -471,6 +480,9 @@ private SearchResultMetadata extractSearchResultMetadata(@Nonnull SearchResponse final List aggregationMetadataList = extractAggregationMetadata(searchResponse, filter); searchResultMetadata.setAggregations(new AggregationMetadataArray(aggregationMetadataList)); + final List searchSuggestions = extractSearchSuggestions(searchResponse); + searchResultMetadata.setSuggestions(new SearchSuggestionArray(searchSuggestions)); + return searchResultMetadata; } @@ -517,6 +529,23 @@ public static Map extractTermAggregations(@Nonnull SearchResponse return extractTermAggregations((ParsedTerms) aggregation, aggregationName.equals("_entityType")); } + private List extractSearchSuggestions(@Nonnull SearchResponse searchResponse) { + final List searchSuggestions = new ArrayList<>(); + if (searchResponse.getSuggest() != null) { + TermSuggestion termSuggestion = searchResponse.getSuggest().getSuggestion(NAME_SUGGESTION); + if (termSuggestion != null && termSuggestion.getEntries().size() > 0) { + termSuggestion.getEntries().get(0).getOptions().forEach(suggestOption -> { + SearchSuggestion searchSuggestion = new SearchSuggestion(); + searchSuggestion.setText(String.valueOf(suggestOption.getText())); + searchSuggestion.setFrequency(suggestOption.getFreq()); + searchSuggestion.setScore(suggestOption.getScore()); + searchSuggestions.add(searchSuggestion); + }); + } + } + return searchSuggestions; + } + /** * Adds nested sub-aggregation values to the aggregated results * @param aggs The aggregations to traverse. 
Could be null (base case) diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java index 8a385e4ab2b54..741eb5568d2ea 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java @@ -27,6 +27,10 @@ import org.elasticsearch.search.sort.FieldSortBuilder; import org.elasticsearch.search.sort.ScoreSortBuilder; import org.elasticsearch.search.sort.SortOrder; +import org.elasticsearch.search.suggest.SuggestBuilder; +import org.elasticsearch.search.suggest.SuggestBuilders; +import org.elasticsearch.search.suggest.SuggestionBuilder; +import org.elasticsearch.search.suggest.term.TermSuggestionBuilder; import static com.linkedin.metadata.search.elasticsearch.query.request.SearchFieldConfig.KEYWORD_FIELDS; import static com.linkedin.metadata.search.elasticsearch.query.request.SearchFieldConfig.PATH_HIERARCHY_FIELDS; @@ -45,6 +49,9 @@ public class ESUtils { public static final int MAX_RESULT_SIZE = 10000; public static final String OPAQUE_ID_HEADER = "X-Opaque-Id"; public static final String HEADER_VALUE_DELIMITER = "|"; + public static final String KEYWORD_TYPE = "keyword"; + public static final String ENTITY_NAME_FIELD = "_entityName"; + public static final String NAME_SUGGESTION = "nameSuggestion"; // we use this to make sure we filter for editable & non-editable fields. Also expands out top-level properties // to field level properties @@ -174,6 +181,8 @@ public static QueryBuilder getQueryBuilderFromCriterion(@Nonnull final Criterion * If no sort criterion is provided then the default sorting criterion is chosen which is descending order of score * Furthermore to resolve conflicts, the results are further sorted by ascending order of urn * If the input sort criterion is urn itself, then no additional sort criterion is applied as there will be no conflicts. + * When sorting, set the unmappedType param to arbitrary "keyword" so we essentially ignore sorting where indices do not + * have the field we are sorting on. *

* * @param searchSourceBuilder {@link SearchSourceBuilder} that needs to be populated with sort order @@ -187,13 +196,24 @@ public static void buildSortOrder(@Nonnull SearchSourceBuilder searchSourceBuild final SortOrder esSortOrder = (sortCriterion.getOrder() == com.linkedin.metadata.query.filter.SortOrder.ASCENDING) ? SortOrder.ASC : SortOrder.DESC; - searchSourceBuilder.sort(new FieldSortBuilder(sortCriterion.getField()).order(esSortOrder)); + searchSourceBuilder.sort(new FieldSortBuilder(sortCriterion.getField()).order(esSortOrder).unmappedType(KEYWORD_TYPE)); } if (sortCriterion == null || !sortCriterion.getField().equals(DEFAULT_SEARCH_RESULTS_SORT_BY_FIELD)) { searchSourceBuilder.sort(new FieldSortBuilder(DEFAULT_SEARCH_RESULTS_SORT_BY_FIELD).order(SortOrder.ASC)); } } + /** + * Populates source field of search query with the suggestions query so that we get search suggestions back. + * Right now we are only supporting suggestions based on the virtual _entityName field alias. + */ + public static void buildNameSuggestions(@Nonnull SearchSourceBuilder searchSourceBuilder, @Nullable String textInput) { + SuggestionBuilder builder = SuggestBuilders.termSuggestion(ENTITY_NAME_FIELD).text(textInput); + SuggestBuilder suggestBuilder = new SuggestBuilder(); + suggestBuilder.addSuggestion(NAME_SUGGESTION, builder); + searchSourceBuilder.suggest(suggestBuilder); + } + /** * Escapes the Elasticsearch reserved characters in the given input string. * diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/SearchUtils.java b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/SearchUtils.java index 35a322d37b2fd..8b56ae0beb3f1 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/SearchUtils.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/SearchUtils.java @@ -78,7 +78,7 @@ public static Map getRequestMap(@Nullable Filter requestParams) return criterionArray.stream().collect(Collectors.toMap(Criterion::getField, Criterion::getValue)); } - static boolean isUrn(@Nonnull String value) { + public static boolean isUrn(@Nonnull String value) { // TODO(https://github.com/datahub-project/datahub-gma/issues/51): This method is a bit of a hack to support searching for // URNs that have commas in them, while also using commas a delimiter for search. We should stop supporting commas // as delimiter, and then we can stop using this hack. diff --git a/metadata-io/src/test/java/com/linkedin/metadata/ESSampleDataFixture.java b/metadata-io/src/test/java/com/linkedin/metadata/ESSampleDataFixture.java index 847029bc180eb..ef9992db1fb25 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/ESSampleDataFixture.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/ESSampleDataFixture.java @@ -46,6 +46,7 @@ import java.util.Map; import static com.linkedin.metadata.Constants.*; +import static com.linkedin.metadata.ESTestConfiguration.REFRESH_INTERVAL_SECONDS; import static org.mockito.ArgumentMatchers.anySet; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -54,6 +55,13 @@ @TestConfiguration @Import(ESTestConfiguration.class) public class ESSampleDataFixture { + /** + * Interested in adding more fixtures? Here's what you will need to update? + * 1. Create a new indexPrefix and FixtureName. Both are needed or else all fixtures will load on top of each other, + * overwriting each other + * 2. Create a new IndexConvention, IndexBuilder, and EntityClient. 
These are needed + * to index a different set of entities. + */ @Autowired private ESBulkProcessor _bulkProcessor; @@ -61,6 +69,9 @@ public class ESSampleDataFixture { @Autowired private RestHighLevelClient _searchClient; + @Autowired + private RestHighLevelClient _longTailSearchClient; + @Autowired private SearchConfiguration _searchConfiguration; @@ -68,24 +79,54 @@ public class ESSampleDataFixture { private CustomSearchConfiguration _customSearchConfiguration; @Bean(name = "sampleDataPrefix") - protected String indexPrefix() { + protected String sampleDataPrefix() { return "smpldat"; } + @Bean(name = "longTailPrefix") + protected String longTailIndexPrefix() { + return "lngtl"; + } + @Bean(name = "sampleDataIndexConvention") protected IndexConvention indexConvention(@Qualifier("sampleDataPrefix") String prefix) { return new IndexConventionImpl(prefix); } + @Bean(name = "longTailIndexConvention") + protected IndexConvention longTailIndexConvention(@Qualifier("longTailPrefix") String prefix) { + return new IndexConventionImpl(prefix); + } + @Bean(name = "sampleDataFixtureName") - protected String fixtureName() { + protected String sampleDataFixtureName() { return "sample_data"; } + @Bean(name = "longTailFixtureName") + protected String longTailFixtureName() { + return "long_tail"; + } + @Bean(name = "sampleDataEntityIndexBuilders") protected EntityIndexBuilders entityIndexBuilders( @Qualifier("entityRegistry") EntityRegistry entityRegistry, @Qualifier("sampleDataIndexConvention") IndexConvention indexConvention + ) { + return entityIndexBuildersHelper(entityRegistry, indexConvention); + } + + @Bean(name = "longTailEntityIndexBuilders") + protected EntityIndexBuilders longTailEntityIndexBuilders( + @Qualifier("longTailEntityRegistry") EntityRegistry longTailEntityRegistry, + @Qualifier("longTailIndexConvention") IndexConvention indexConvention + ) { + return entityIndexBuildersHelper(longTailEntityRegistry, indexConvention); + } + + protected EntityIndexBuilders entityIndexBuildersHelper( + EntityRegistry entityRegistry, + IndexConvention indexConvention ) { GitVersion gitVersion = new GitVersion("0.0.0-test", "123456", Optional.empty()); ESIndexBuilder indexBuilder = new ESIndexBuilder(_searchClient, 1, 0, 1, @@ -100,6 +141,23 @@ protected ElasticSearchService entitySearchService( @Qualifier("entityRegistry") EntityRegistry entityRegistry, @Qualifier("sampleDataEntityIndexBuilders") EntityIndexBuilders indexBuilders, @Qualifier("sampleDataIndexConvention") IndexConvention indexConvention + ) throws IOException { + return entitySearchServiceHelper(entityRegistry, indexBuilders, indexConvention); + } + + @Bean(name = "longTailEntitySearchService") + protected ElasticSearchService longTailEntitySearchService( + @Qualifier("longTailEntityRegistry") EntityRegistry longTailEntityRegistry, + @Qualifier("longTailEntityIndexBuilders") EntityIndexBuilders longTailEndexBuilders, + @Qualifier("longTailIndexConvention") IndexConvention longTailIndexConvention + ) throws IOException { + return entitySearchServiceHelper(longTailEntityRegistry, longTailEndexBuilders, longTailIndexConvention); + } + + protected ElasticSearchService entitySearchServiceHelper( + EntityRegistry entityRegistry, + EntityIndexBuilders indexBuilders, + IndexConvention indexConvention ) throws IOException { CustomConfiguration customConfiguration = new CustomConfiguration(); customConfiguration.setEnabled(true); @@ -107,7 +165,7 @@ protected ElasticSearchService entitySearchService( CustomSearchConfiguration 
customSearchConfiguration = customConfiguration.resolve(new YAMLMapper()); ESSearchDAO searchDAO = new ESSearchDAO(entityRegistry, _searchClient, indexConvention, false, - ELASTICSEARCH_IMPLEMENTATION_ELASTICSEARCH, _searchConfiguration, customSearchConfiguration); + ELASTICSEARCH_IMPLEMENTATION_ELASTICSEARCH, _searchConfiguration, customSearchConfiguration); ESBrowseDAO browseDAO = new ESBrowseDAO(entityRegistry, _searchClient, indexConvention, _searchConfiguration, _customSearchConfiguration); ESWriteDAO writeDAO = new ESWriteDAO(entityRegistry, _searchClient, indexConvention, _bulkProcessor, 1); return new ElasticSearchService(indexBuilders, searchDAO, browseDAO, writeDAO); @@ -120,9 +178,30 @@ protected SearchService searchService( @Qualifier("sampleDataEntitySearchService") ElasticSearchService entitySearchService, @Qualifier("sampleDataEntityIndexBuilders") EntityIndexBuilders indexBuilders, @Qualifier("sampleDataPrefix") String prefix, - @Qualifier("sampleDataFixtureName") String fixtureName + @Qualifier("sampleDataFixtureName") String sampleDataFixtureName ) throws IOException { + return searchServiceHelper(entityRegistry, entitySearchService, indexBuilders, prefix, sampleDataFixtureName); + } + @Bean(name = "longTailSearchService") + @Nonnull + protected SearchService longTailSearchService( + @Qualifier("longTailEntityRegistry") EntityRegistry longTailEntityRegistry, + @Qualifier("longTailEntitySearchService") ElasticSearchService longTailEntitySearchService, + @Qualifier("longTailEntityIndexBuilders") EntityIndexBuilders longTailIndexBuilders, + @Qualifier("longTailPrefix") String longTailPrefix, + @Qualifier("longTailFixtureName") String longTailFixtureName + ) throws IOException { + return searchServiceHelper(longTailEntityRegistry, longTailEntitySearchService, longTailIndexBuilders, longTailPrefix, longTailFixtureName); + } + + public SearchService searchServiceHelper( + EntityRegistry entityRegistry, + ElasticSearchService entitySearchService, + EntityIndexBuilders indexBuilders, + String prefix, + String fixtureName + ) throws IOException { int batchSize = 100; SearchRanker ranker = new SimpleRanker(); CacheManager cacheManager = new ConcurrentMapCacheManager(); @@ -147,6 +226,7 @@ protected SearchService searchService( .bulkProcessor(_bulkProcessor) .fixtureName(fixtureName) .targetIndexPrefix(prefix) + .refreshIntervalSeconds(REFRESH_INTERVAL_SECONDS) .build() .read(); @@ -159,6 +239,24 @@ protected EntityClient entityClient( @Qualifier("sampleDataSearchService") SearchService searchService, @Qualifier("sampleDataEntitySearchService") ElasticSearchService entitySearchService, @Qualifier("entityRegistry") EntityRegistry entityRegistry + ) { + return entityClientHelper(searchService, entitySearchService, entityRegistry); + } + + @Bean(name = "longTailEntityClient") + @Nonnull + protected EntityClient longTailEntityClient( + @Qualifier("sampleDataSearchService") SearchService searchService, + @Qualifier("sampleDataEntitySearchService") ElasticSearchService entitySearchService, + @Qualifier("longTailEntityRegistry") EntityRegistry longTailEntityRegistry + ) { + return entityClientHelper(searchService, entitySearchService, longTailEntityRegistry); + } + + private EntityClient entityClientHelper( + SearchService searchService, + ElasticSearchService entitySearchService, + EntityRegistry entityRegistry ) { CachingEntitySearchService cachingEntitySearchService = new CachingEntitySearchService( new ConcurrentMapCacheManager(), @@ -173,7 +271,7 @@ protected EntityClient 
entityClient( preProcessHooks.setUiEnabled(true); return new JavaEntityClient( new EntityServiceImpl(mockAspectDao, null, entityRegistry, true, null, - preProcessHooks), + preProcessHooks), null, entitySearchService, cachingEntitySearchService, diff --git a/metadata-io/src/test/java/com/linkedin/metadata/ESSearchLineageFixture.java b/metadata-io/src/test/java/com/linkedin/metadata/ESSearchLineageFixture.java index ee3be08d82a1f..ade7435bf6652 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/ESSearchLineageFixture.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/ESSearchLineageFixture.java @@ -48,6 +48,7 @@ import java.util.Map; import static com.linkedin.metadata.Constants.*; +import static com.linkedin.metadata.ESTestConfiguration.REFRESH_INTERVAL_SECONDS; @TestConfiguration @@ -154,6 +155,7 @@ protected LineageSearchService lineageSearchService( .bulkProcessor(_bulkProcessor) .fixtureName(fixtureName) .targetIndexPrefix(prefix) + .refreshIntervalSeconds(REFRESH_INTERVAL_SECONDS) .build() .read(); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java b/metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java index 0d7ac506599af..327447341badf 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java @@ -6,6 +6,7 @@ import com.linkedin.metadata.config.search.ExactMatchConfiguration; import com.linkedin.metadata.config.search.PartialConfiguration; import com.linkedin.metadata.config.search.SearchConfiguration; +import com.linkedin.metadata.config.search.WordGramConfiguration; import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration; import com.linkedin.metadata.models.registry.ConfigEntityRegistry; import com.linkedin.metadata.models.registry.EntityRegistry; @@ -35,7 +36,7 @@ @TestConfiguration public class ESTestConfiguration { private static final int HTTP_PORT = 9200; - private static final int REFRESH_INTERVAL_SECONDS = 5; + public static final int REFRESH_INTERVAL_SECONDS = 5; public static void syncAfterWrite(ESBulkProcessor bulkProcessor) throws InterruptedException { bulkProcessor.flush(); @@ -55,11 +56,17 @@ public SearchConfiguration searchConfiguration() { exactMatchConfiguration.setCaseSensitivityFactor(0.7f); exactMatchConfiguration.setEnableStructured(true); + WordGramConfiguration wordGramConfiguration = new WordGramConfiguration(); + wordGramConfiguration.setTwoGramFactor(1.2f); + wordGramConfiguration.setThreeGramFactor(1.5f); + wordGramConfiguration.setFourGramFactor(1.8f); + PartialConfiguration partialConfiguration = new PartialConfiguration(); partialConfiguration.setFactor(0.4f); partialConfiguration.setUrnFactor(0.5f); searchConfiguration.setExactMatch(exactMatchConfiguration); + searchConfiguration.setWordGram(wordGramConfiguration); searchConfiguration.setPartial(partialConfiguration); return searchConfiguration; } @@ -137,4 +144,10 @@ public EntityRegistry entityRegistry() throws EntityRegistryException { return new ConfigEntityRegistry( ESTestConfiguration.class.getClassLoader().getResourceAsStream("entity-registry.yml")); } + + @Bean(name = "longTailEntityRegistry") + public EntityRegistry longTailEntityRegistry() throws EntityRegistryException { + return new ConfigEntityRegistry( + ESTestConfiguration.class.getClassLoader().getResourceAsStream("entity-registry.yml")); + } } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/ESTestFixtureUtils.java 
b/metadata-io/src/test/java/com/linkedin/metadata/ESTestFixtureUtils.java index 1f0b7b24397ca..914c5be9f5b09 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/ESTestFixtureUtils.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/ESTestFixtureUtils.java @@ -15,6 +15,7 @@ import java.io.IOException; import java.util.Set; +import static com.linkedin.metadata.ESTestConfiguration.REFRESH_INTERVAL_SECONDS; import static com.linkedin.metadata.ESTestUtils.environmentRestClientBuilder; @TestConfiguration @@ -111,6 +112,7 @@ private void reindexTestFixtureData() throws IOException { FixtureReader reader = FixtureReader.builder() .bulkProcessor(bulkProcessor) .fixtureName("long_tail") + .refreshIntervalSeconds(REFRESH_INTERVAL_SECONDS) .build(); reader.read(); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/ESTestUtils.java b/metadata-io/src/test/java/com/linkedin/metadata/ESTestUtils.java index 79496888650e1..45c4c16864b07 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/ESTestUtils.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/ESTestUtils.java @@ -77,6 +77,11 @@ public static SearchResult searchAcrossEntities(SearchService searchService, Str 100, new SearchFlags().setFulltext(true).setSkipCache(true), facets); } + public static SearchResult searchAcrossCustomEntities(SearchService searchService, String query, List searchableEntities) { + return searchService.searchAcrossEntities(searchableEntities, query, null, null, 0, + 100, new SearchFlags().setFulltext(true).setSkipCache(true)); + } + public static SearchResult search(SearchService searchService, String query) { return search(searchService, SEARCHABLE_ENTITIES, query); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/LineageSearchServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/LineageSearchServiceTest.java index 7b2978b747011..faff9f780e31c 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/LineageSearchServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/LineageSearchServiceTest.java @@ -317,6 +317,83 @@ public void testSearchService() throws Exception { } + @Test + public void testScrollAcrossLineage() throws Exception { + when(_graphService.getLineage(eq(TEST_URN), eq(LineageDirection.DOWNSTREAM), anyInt(), anyInt(), + anyInt(), eq(null), eq(null))).thenReturn(mockResult(Collections.emptyList())); + LineageScrollResult scrollResult = scrollAcrossLineage(null, TEST1); + assertEquals(scrollResult.getNumEntities().intValue(), 0); + assertNull(scrollResult.getScrollId()); + scrollResult = scrollAcrossLineage(null, TEST1); + assertEquals(scrollResult.getNumEntities().intValue(), 0); + assertNull(scrollResult.getScrollId()); + clearCache(false); + + when(_graphService.getLineage(eq(TEST_URN), eq(LineageDirection.DOWNSTREAM), anyInt(), anyInt(), + anyInt(), eq(null), eq(null))).thenReturn( + mockResult(ImmutableList.of(new LineageRelationship().setEntity(TEST_URN).setType("test").setDegree(1)))); + // just testing null input does not throw any exception + scrollAcrossLineage(null, null); + + scrollResult = scrollAcrossLineage(null, TEST); + assertEquals(scrollResult.getNumEntities().intValue(), 0); + assertNull(scrollResult.getScrollId()); + scrollResult = scrollAcrossLineage(null, TEST1); + assertEquals(scrollResult.getNumEntities().intValue(), 0); + assertNull(scrollResult.getScrollId()); + clearCache(false); + + Urn urn = new TestEntityUrn("test1", "urn1", "VALUE_1"); + ObjectNode document = 
JsonNodeFactory.instance.objectNode(); + document.set("urn", JsonNodeFactory.instance.textNode(urn.toString())); + document.set("keyPart1", JsonNodeFactory.instance.textNode("test")); + document.set("textFieldOverride", JsonNodeFactory.instance.textNode("textFieldOverride")); + document.set("browsePaths", JsonNodeFactory.instance.textNode("/a/b/c")); + _elasticSearchService.upsertDocument(ENTITY_NAME, document.toString(), urn.toString()); + syncAfterWrite(_bulkProcessor); + + when(_graphService.getLineage(eq(TEST_URN), eq(LineageDirection.DOWNSTREAM), anyInt(), anyInt(), + anyInt(), eq(null), eq(null))).thenReturn(mockResult(Collections.emptyList())); + scrollResult = scrollAcrossLineage(null, TEST1); + assertEquals(scrollResult.getNumEntities().intValue(), 0); + assertEquals(scrollResult.getEntities().size(), 0); + assertNull(scrollResult.getScrollId()); + clearCache(false); + + when(_graphService.getLineage(eq(TEST_URN), eq(LineageDirection.DOWNSTREAM), anyInt(), anyInt(), + anyInt(), eq(null), eq(null))).thenReturn( + mockResult(ImmutableList.of(new LineageRelationship().setEntity(urn).setType("test").setDegree(1)))); + scrollResult = scrollAcrossLineage(null, TEST1); + assertEquals(scrollResult.getNumEntities().intValue(), 1); + assertEquals(scrollResult.getEntities().get(0).getEntity(), urn); + assertEquals(scrollResult.getEntities().get(0).getDegree().intValue(), 1); + assertNull(scrollResult.getScrollId()); + + scrollResult = scrollAcrossLineage(QueryUtils.newFilter("degree.keyword", "1"), TEST1); + assertEquals(scrollResult.getNumEntities().intValue(), 1); + assertEquals(scrollResult.getEntities().get(0).getEntity(), urn); + assertEquals(scrollResult.getEntities().get(0).getDegree().intValue(), 1); + assertNull(scrollResult.getScrollId()); + + scrollResult = scrollAcrossLineage(QueryUtils.newFilter("degree.keyword", "2"), TEST1); + assertEquals(scrollResult.getNumEntities().intValue(), 0); + assertEquals(scrollResult.getEntities().size(), 0); + assertNull(scrollResult.getScrollId()); + clearCache(false); + + // Cleanup + _elasticSearchService.deleteDocument(ENTITY_NAME, urn.toString()); + syncAfterWrite(_bulkProcessor); + + when(_graphService.getLineage(eq(TEST_URN), eq(LineageDirection.DOWNSTREAM), anyInt(), anyInt(), + anyInt())).thenReturn( + mockResult(ImmutableList.of(new LineageRelationship().setEntity(urn).setType("test1").setDegree(1)))); + scrollResult = scrollAcrossLineage(null, TEST1); + + assertEquals(scrollResult.getNumEntities().intValue(), 0); + assertNull(scrollResult.getScrollId()); + } + @Test public void testLightningSearchService() throws Exception { // Mostly this test ensures the code path is exercised @@ -731,6 +808,16 @@ private LineageSearchResult searchAcrossLineage(@Nullable Filter filter, @Nullab new SearchFlags().setSkipCache(true)); } + private LineageScrollResult scrollAcrossLineage(@Nullable Filter filter, @Nullable String input, String scrollId, int size) { + return _lineageSearchService.scrollAcrossLineage(TEST_URN, LineageDirection.DOWNSTREAM, ImmutableList.of(), input, + null, filter, null, scrollId, "5m", size, null, null, + new SearchFlags().setSkipCache(true)); + } + + private LineageScrollResult scrollAcrossLineage(@Nullable Filter filter, @Nullable String input) { + return scrollAcrossLineage(filter, input, null, 10); + } + @Test public void testCanDoLightning() throws Exception { Map platformCounts = new HashMap<>(); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java 
b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java new file mode 100644 index 0000000000000..d720c95fef84d --- /dev/null +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java @@ -0,0 +1,167 @@ +package com.linkedin.metadata.search.elasticsearch.fixtures; + +import com.linkedin.common.urn.Urn; +import com.linkedin.datahub.graphql.generated.EntityType; +import com.linkedin.datahub.graphql.resolvers.EntityTypeMapper; +import com.linkedin.entity.client.EntityClient; +import com.linkedin.metadata.ESSampleDataFixture; +import com.linkedin.metadata.models.registry.EntityRegistry; +import com.linkedin.metadata.search.MatchedFieldArray; +import com.linkedin.metadata.search.SearchEntityArray; +import com.linkedin.metadata.search.SearchResult; +import com.linkedin.metadata.search.SearchService; +import org.elasticsearch.client.RestHighLevelClient; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Qualifier; +import org.springframework.context.annotation.Import; +import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; +import org.testng.annotations.Test; + +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static com.linkedin.metadata.ESTestUtils.*; +import static org.testng.Assert.assertTrue; +import static org.testng.AssertJUnit.*; + +@Import(ESSampleDataFixture.class) +public class ElasticSearchGoldenTest extends AbstractTestNGSpringContextTests { + + private static final List SEARCHABLE_LONGTAIL_ENTITIES = Stream.of(EntityType.CHART, EntityType.CONTAINER, + EntityType.DASHBOARD, EntityType.DATASET, EntityType.DOMAIN, EntityType.TAG + ).map(EntityTypeMapper::getName) + .collect(Collectors.toList()); + @Autowired + private RestHighLevelClient _searchClient; + + @Autowired + @Qualifier("longTailSearchService") + protected SearchService searchService; + + @Autowired + @Qualifier("longTailEntityClient") + protected EntityClient entityClient; + + @Autowired + @Qualifier("longTailEntityRegistry") + private EntityRegistry entityRegistry; + + @Test + public void testNameMatchPetProfiles() { + /* + Searching for "pet profiles" should return "pet_profiles" as the first 2 search results + */ + assertNotNull(searchService); + assertNotNull(entityRegistry); + SearchResult searchResult = searchAcrossCustomEntities(searchService, "pet profiles", SEARCHABLE_LONGTAIL_ENTITIES); + assertTrue(searchResult.getEntities().size() >= 2); + Urn firstResultUrn = searchResult.getEntities().get(0).getEntity(); + Urn secondResultUrn = searchResult.getEntities().get(1).getEntity(); + + assertTrue(firstResultUrn.toString().contains("pet_profiles")); + assertTrue(secondResultUrn.toString().contains("pet_profiles")); + } + + @Test + public void testNameMatchPetProfile() { + /* + Searching for "pet profile" should return "pet_profiles" as the first 2 search results + */ + assertNotNull(searchService); + SearchResult searchResult = searchAcrossEntities(searchService, "pet profile", SEARCHABLE_LONGTAIL_ENTITIES); + assertTrue(searchResult.getEntities().size() >= 2); + Urn firstResultUrn = searchResult.getEntities().get(0).getEntity(); + Urn secondResultUrn = searchResult.getEntities().get(1).getEntity(); + + assertTrue(firstResultUrn.toString().contains("pet_profiles")); + assertTrue(secondResultUrn.toString().contains("pet_profiles")); + } + + @Test + public void 
testGlossaryTerms() { + /* + Searching for "ReturnRate" should return all tables that have the glossary term applied before + anything else + */ + assertNotNull(searchService); + SearchResult searchResult = searchAcrossEntities(searchService, "ReturnRate", SEARCHABLE_LONGTAIL_ENTITIES); + SearchEntityArray entities = searchResult.getEntities(); + assertTrue(searchResult.getEntities().size() >= 4); + MatchedFieldArray firstResultMatchedFields = entities.get(0).getMatchedFields(); + MatchedFieldArray secondResultMatchedFields = entities.get(1).getMatchedFields(); + MatchedFieldArray thirdResultMatchedFields = entities.get(2).getMatchedFields(); + MatchedFieldArray fourthResultMatchedFields = entities.get(3).getMatchedFields(); + + assertTrue(firstResultMatchedFields.toString().contains("ReturnRate")); + assertTrue(secondResultMatchedFields.toString().contains("ReturnRate")); + assertTrue(thirdResultMatchedFields.toString().contains("ReturnRate")); + assertTrue(fourthResultMatchedFields.toString().contains("ReturnRate")); + } + + @Test + public void testNameMatchPartiallyQualified() { + /* + Searching for "analytics.pet_details" (partially qualified) should return the fully qualified table + name as the first search results before any others + */ + assertNotNull(searchService); + SearchResult searchResult = searchAcrossEntities(searchService, "analytics.pet_details", SEARCHABLE_LONGTAIL_ENTITIES); + assertTrue(searchResult.getEntities().size() >= 2); + Urn firstResultUrn = searchResult.getEntities().get(0).getEntity(); + Urn secondResultUrn = searchResult.getEntities().get(1).getEntity(); + + assertTrue(firstResultUrn.toString().contains("snowflake,long_tail_companions.analytics.pet_details")); + assertTrue(secondResultUrn.toString().contains("dbt,long_tail_companions.analytics.pet_details")); + } + + @Test + public void testNameMatchCollaborativeActionitems() { + /* + Searching for "collaborative actionitems" should return "collaborative_actionitems" as the first search + result, followed by "collaborative_actionitems_old" + */ + assertNotNull(searchService); + SearchResult searchResult = searchAcrossEntities(searchService, "collaborative actionitems", SEARCHABLE_LONGTAIL_ENTITIES); + assertTrue(searchResult.getEntities().size() >= 2); + Urn firstResultUrn = searchResult.getEntities().get(0).getEntity(); + Urn secondResultUrn = searchResult.getEntities().get(1).getEntity(); + + // Checks that the table name is not suffixed with anything + assertTrue(firstResultUrn.toString().contains("collaborative_actionitems,")); + assertTrue(secondResultUrn.toString().contains("collaborative_actionitems_old")); + + Double firstResultScore = searchResult.getEntities().get(0).getScore(); + Double secondResultScore = searchResult.getEntities().get(1).getScore(); + + // Checks that the scores aren't tied so that we are matching on table name more than column name + assertTrue(firstResultScore > secondResultScore); + } + + @Test + public void testNameMatchCustomerOrders() { + /* + Searching for "customer orders" should return "customer_orders" as the first search + result, not suffixed by anything + */ + assertNotNull(searchService); + SearchResult searchResult = searchAcrossEntities(searchService, "customer orders", SEARCHABLE_LONGTAIL_ENTITIES); + assertTrue(searchResult.getEntities().size() >= 2); + Urn firstResultUrn = searchResult.getEntities().get(0).getEntity(); + + // Checks that the table name is not suffixed with anything + assertTrue(firstResultUrn.toString().contains("customer_orders,")); + + 
Double firstResultScore = searchResult.getEntities().get(0).getScore(); + Double secondResultScore = searchResult.getEntities().get(1).getScore(); + + // Checks that the scores aren't tied so that we are matching on table name more than column name + assertTrue(firstResultScore > secondResultScore); + } + + /* + Tests that should pass but do not yet can be added below here, with the following annotation: + @Test(enabled = false) + */ + +} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java index dada13bd6f479..450378b247cea 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java @@ -82,6 +82,7 @@ public class SampleDataFixtureTests extends AbstractTestNGSpringContextTests { protected EntityClient entityClient; @Autowired + @Qualifier("entityRegistry") private EntityRegistry entityRegistry; @Test @@ -160,6 +161,7 @@ public void testSearchFieldConfig() throws IOException { // this is a subfield therefore cannot have a subfield assertFalse(test.hasKeywordSubfield()); assertFalse(test.hasDelimitedSubfield()); + assertFalse(test.hasWordGramSubfields()); String[] fieldAndSubfield = test.fieldName().split("[.]", 2); @@ -357,6 +359,84 @@ public void testDelimitedSynonym() throws IOException { }).collect(Collectors.toList()); } + @Test + public void testNegateAnalysis() throws IOException { + String queryWithMinus = "logging_events -bckp"; + AnalyzeRequest request = AnalyzeRequest.withIndexAnalyzer( + "smpldat_datasetindex_v2", + "query_word_delimited", queryWithMinus + ); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), + List.of("logging_events -bckp", "logging_ev", "-bckp", "log", "event", "bckp")); + + request = AnalyzeRequest.withIndexAnalyzer( + "smpldat_datasetindex_v2", + "word_gram_3", queryWithMinus + ); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("logging events -bckp")); + + request = AnalyzeRequest.withIndexAnalyzer( + "smpldat_datasetindex_v2", + "word_gram_4", queryWithMinus + ); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of()); + + } + + @Test + public void testWordGram() throws IOException { + String text = "hello.cat_cool_customer"; + AnalyzeRequest request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", text); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hello cat", "cat cool", "cool customer")); + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_3", text); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hello cat cool", "cat cool customer")); + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_4", text); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hello cat cool customer")); + + String testMoreSeparators = "quick.brown:fox jumped-LAZY_Dog"; + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", 
testMoreSeparators); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), + List.of("quick brown", "brown fox", "fox jumped", "jumped lazy", "lazy dog")); + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_3", testMoreSeparators); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), + List.of("quick brown fox", "brown fox jumped", "fox jumped lazy", "jumped lazy dog")); + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_4", testMoreSeparators); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), + List.of("quick brown fox jumped", "brown fox jumped lazy", "fox jumped lazy dog")); + + String textWithQuotesAndDuplicateWord = "\"my_db.my_exact_table\""; + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", textWithQuotesAndDuplicateWord); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("my db", "db my", "my exact", "exact table")); + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_3", textWithQuotesAndDuplicateWord); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("my db my", "db my exact", "my exact table")); + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_4", textWithQuotesAndDuplicateWord); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("my db my exact", "db my exact table")); + + String textWithParens = "(hi) there"; + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", textWithParens); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hi there")); + + String oneWordText = "hello"; + for (String analyzer : List.of("word_gram_2", "word_gram_3", "word_gram_4")) { + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", analyzer, oneWordText); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of()); + } + } + @Test public void testUrnSynonym() throws IOException { List expectedTokens = List.of("bigquery"); @@ -1266,6 +1346,53 @@ public void testParens() { String.format("%s - Expected search results to include matched fields", query)); assertEquals(result.getEntities().size(), 2); } + @Test + public void testGram() { + String query = "jaffle shop customers"; + SearchResult result = searchAcrossEntities(searchService, query); + assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), + String.format("%s - Expected search results", query)); + + assertEquals(result.getEntities().get(0).getEntity().toString(), + "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.customers,PROD)", + "Expected exact match in 1st position"); + + query = "shop customers source"; + result = searchAcrossEntities(searchService, query); + assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), + String.format("%s - Expected search results", query)); + + assertEquals(result.getEntities().get(0).getEntity().toString(), + "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.customers_source,PROD)", + "Expected ngram match in 1st position"); + + query = 
"jaffle shop stg customers"; + result = searchAcrossEntities(searchService, query); + assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), + String.format("%s - Expected search results", query)); + + assertEquals(result.getEntities().get(0).getEntity().toString(), + "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.stg_customers,PROD)", + "Expected ngram match in 1st position"); + + query = "jaffle shop transformers customers"; + result = searchAcrossEntities(searchService, query); + assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), + String.format("%s - Expected search results", query)); + + assertEquals(result.getEntities().get(0).getEntity().toString(), + "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.transformers_customers,PROD)", + "Expected ngram match in 1st position"); + + query = "shop raw customers"; + result = searchAcrossEntities(searchService, query); + assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), + String.format("%s - Expected search results", query)); + + assertEquals(result.getEntities().get(0).getEntity().toString(), + "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_customers,PROD)", + "Expected ngram match in 1st position"); + } @Test public void testPrefixVsExact() { diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java index ed72b46e98c46..0b33185549299 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java @@ -16,7 +16,7 @@ public void testMappingsBuilder() { Map result = MappingsBuilder.getMappings(TestEntitySpecBuilder.getSpec()); assertEquals(result.size(), 1); Map properties = (Map) result.get("properties"); - assertEquals(properties.size(), 17); + assertEquals(properties.size(), 19); assertEquals(properties.get("urn"), ImmutableMap.of("type", "keyword", "fields", ImmutableMap.of("delimited", @@ -66,6 +66,11 @@ public void testMappingsBuilder() { assertTrue(textFieldSubfields.containsKey("delimited")); assertTrue(textFieldSubfields.containsKey("keyword")); + // TEXT with addToFilters aliased under "_entityName" + Map textFieldAlias = (Map) properties.get("_entityName"); + assertEquals(textFieldAlias.get("type"), "alias"); + assertEquals(textFieldAlias.get("path"), "textFieldOverride"); + // TEXT_PARTIAL Map textArrayField = (Map) properties.get("textArrayField"); assertEquals(textArrayField.get("type"), "keyword"); @@ -76,6 +81,19 @@ public void testMappingsBuilder() { assertTrue(textArrayFieldSubfields.containsKey("ngram")); assertTrue(textArrayFieldSubfields.containsKey("keyword")); + // WORD_GRAM + Map wordGramField = (Map) properties.get("wordGramField"); + assertEquals(wordGramField.get("type"), "keyword"); + assertEquals(wordGramField.get("normalizer"), "keyword_normalizer"); + Map wordGramFieldSubfields = (Map) wordGramField.get("fields"); + assertEquals(wordGramFieldSubfields.size(), 6); + assertTrue(wordGramFieldSubfields.containsKey("delimited")); + assertTrue(wordGramFieldSubfields.containsKey("ngram")); + assertTrue(wordGramFieldSubfields.containsKey("keyword")); + assertTrue(wordGramFieldSubfields.containsKey("wordGrams2")); + 
assertTrue(wordGramFieldSubfields.containsKey("wordGrams3")); + assertTrue(wordGramFieldSubfields.containsKey("wordGrams4")); + // URN Map foreignKey = (Map) properties.get("foreignKey"); assertEquals(foreignKey.get("type"), "text"); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/AggregationQueryBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/AggregationQueryBuilderTest.java index 10b4ee42b1a71..36c8bb8f9a676 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/AggregationQueryBuilderTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/AggregationQueryBuilderTest.java @@ -31,7 +31,8 @@ public void testGetDefaultAggregationsHasFields() { 1.0, Optional.of("hasTest"), Optional.empty(), - Collections.emptyMap() + Collections.emptyMap(), + Collections.emptyList() ); SearchConfiguration config = new SearchConfiguration(); @@ -60,7 +61,8 @@ public void testGetDefaultAggregationsFields() { 1.0, Optional.empty(), Optional.empty(), - Collections.emptyMap() + Collections.emptyMap(), + Collections.emptyList() ); SearchConfiguration config = new SearchConfiguration(); @@ -89,7 +91,8 @@ public void testGetSpecificAggregationsHasFields() { 1.0, Optional.of("hasTest1"), Optional.empty(), - Collections.emptyMap() + Collections.emptyMap(), + Collections.emptyList() ); SearchableAnnotation annotation2 = new SearchableAnnotation( @@ -104,7 +107,8 @@ public void testGetSpecificAggregationsHasFields() { 1.0, Optional.empty(), Optional.empty(), - Collections.emptyMap() + Collections.emptyMap(), + Collections.emptyList() ); SearchConfiguration config = new SearchConfiguration(); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java index a2ec396c34b2d..282b1d8bb6778 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java @@ -4,6 +4,7 @@ import com.linkedin.metadata.config.search.ExactMatchConfiguration; import com.linkedin.metadata.config.search.PartialConfiguration; import com.linkedin.metadata.config.search.SearchConfiguration; +import com.linkedin.metadata.config.search.WordGramConfiguration; import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration; import com.fasterxml.jackson.dataformat.yaml.YAMLMapper; import com.google.common.collect.ImmutableList; @@ -18,6 +19,7 @@ import org.elasticsearch.index.query.BoolQueryBuilder; import org.elasticsearch.index.query.MatchAllQueryBuilder; import org.elasticsearch.index.query.MatchPhrasePrefixQueryBuilder; +import org.elasticsearch.index.query.MatchPhraseQueryBuilder; import org.elasticsearch.index.query.QueryBuilder; import org.elasticsearch.index.query.QueryStringQueryBuilder; import org.elasticsearch.index.query.SimpleQueryStringBuilder; @@ -46,11 +48,17 @@ public class SearchQueryBuilderTest { exactMatchConfiguration.setCaseSensitivityFactor(0.7f); exactMatchConfiguration.setEnableStructured(true); + WordGramConfiguration wordGramConfiguration = new WordGramConfiguration(); + wordGramConfiguration.setTwoGramFactor(1.2f); + wordGramConfiguration.setThreeGramFactor(1.5f); + 
wordGramConfiguration.setFourGramFactor(1.8f); + PartialConfiguration partialConfiguration = new PartialConfiguration(); partialConfiguration.setFactor(0.4f); partialConfiguration.setUrnFactor(0.7f); testQueryConfig.setExactMatch(exactMatchConfiguration); + testQueryConfig.setWordGram(wordGramConfiguration); testQueryConfig.setPartial(partialConfiguration); } public static final SearchQueryBuilder TEST_BUILDER = new SearchQueryBuilder(testQueryConfig, null); @@ -70,16 +78,17 @@ public void testQueryBuilderFulltext() { assertEquals(keywordQuery.value(), "testQuery"); assertEquals(keywordQuery.analyzer(), "keyword"); Map keywordFields = keywordQuery.fields(); - assertEquals(keywordFields.size(), 8); + assertEquals(keywordFields.size(), 9); assertEquals(keywordFields, Map.of( - "urn", 10.f, - "textArrayField", 1.0f, - "customProperties", 1.0f, - "nestedArrayArrayField", 1.0f, - "textFieldOverride", 1.0f, - "nestedArrayStringField", 1.0f, - "keyPart1", 10.0f, - "esObjectField", 1.0f + "urn", 10.f, + "textArrayField", 1.0f, + "customProperties", 1.0f, + "wordGramField", 1.0f, + "nestedArrayArrayField", 1.0f, + "textFieldOverride", 1.0f, + "nestedArrayStringField", 1.0f, + "keyPart1", 10.0f, + "esObjectField", 1.0f )); SimpleQueryStringBuilder urnComponentQuery = (SimpleQueryStringBuilder) analyzerGroupQuery.should().get(1); @@ -99,7 +108,8 @@ public void testQueryBuilderFulltext() { "nestedArrayArrayField.delimited", 0.4f, "urn.delimited", 7.0f, "textArrayField.delimited", 0.4f, - "nestedArrayStringField.delimited", 0.4f + "nestedArrayStringField.delimited", 0.4f, + "wordGramField.delimited", 0.4f )); BoolQueryBuilder boolPrefixQuery = (BoolQueryBuilder) shouldQueries.get(1); @@ -109,21 +119,30 @@ public void testQueryBuilderFulltext() { if (prefixQuery instanceof MatchPhrasePrefixQueryBuilder) { MatchPhrasePrefixQueryBuilder builder = (MatchPhrasePrefixQueryBuilder) prefixQuery; return Pair.of(builder.fieldName(), builder.boost()); - } else { + } else if (prefixQuery instanceof TermQueryBuilder) { // exact TermQueryBuilder builder = (TermQueryBuilder) prefixQuery; return Pair.of(builder.fieldName(), builder.boost()); + } else { // if (prefixQuery instanceof MatchPhraseQueryBuilder) { + // ngram + MatchPhraseQueryBuilder builder = (MatchPhraseQueryBuilder) prefixQuery; + return Pair.of(builder.fieldName(), builder.boost()); } }).collect(Collectors.toList()); - assertEquals(prefixFieldWeights.size(), 22); + assertEquals(prefixFieldWeights.size(), 28); List.of( Pair.of("urn", 100.0f), Pair.of("urn", 70.0f), Pair.of("keyPart1.delimited", 16.8f), Pair.of("keyPart1.keyword", 100.0f), - Pair.of("keyPart1.keyword", 70.0f) + Pair.of("keyPart1.keyword", 70.0f), + Pair.of("wordGramField.wordGrams2", 1.44f), + Pair.of("wordGramField.wordGrams3", 2.25f), + Pair.of("wordGramField.wordGrams4", 3.2399998f), + Pair.of("wordGramField.keyword", 10.0f), + Pair.of("wordGramField.keyword", 7.0f) ).forEach(p -> assertTrue(prefixFieldWeights.contains(p), "Missing: " + p)); // Validate scorer @@ -144,7 +163,7 @@ public void testQueryBuilderStructured() { assertEquals(keywordQuery.queryString(), "testQuery"); assertNull(keywordQuery.analyzer()); Map keywordFields = keywordQuery.fields(); - assertEquals(keywordFields.size(), 16); + assertEquals(keywordFields.size(), 21); assertEquals(keywordFields.get("keyPart1").floatValue(), 10.0f); assertFalse(keywordFields.containsKey("keyPart3")); assertEquals(keywordFields.get("textFieldOverride").floatValue(), 1.0f); @@ -196,10 +215,14 @@ public void testCustomExactMatch() { List 
queries = boolPrefixQuery.should().stream().map(prefixQuery -> { if (prefixQuery instanceof MatchPhrasePrefixQueryBuilder) { + // prefix return (MatchPhrasePrefixQueryBuilder) prefixQuery; - } else { + } else if (prefixQuery instanceof TermQueryBuilder) { // exact return (TermQueryBuilder) prefixQuery; + } else { // if (prefixQuery instanceof MatchPhraseQueryBuilder) { + // ngram + return (MatchPhraseQueryBuilder) prefixQuery; } }).collect(Collectors.toList()); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java index d66d6a0ab0e76..db56e2d34881b 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java @@ -7,6 +7,7 @@ import com.linkedin.data.template.StringArray; import com.linkedin.metadata.ESTestConfiguration; import com.linkedin.metadata.TestEntitySpecBuilder; +import com.linkedin.metadata.config.search.WordGramConfiguration; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; @@ -65,11 +66,17 @@ public class SearchRequestHandlerTest extends AbstractTestNGSpringContextTests { exactMatchConfiguration.setCaseSensitivityFactor(0.7f); exactMatchConfiguration.setEnableStructured(true); + WordGramConfiguration wordGramConfiguration = new WordGramConfiguration(); + wordGramConfiguration.setTwoGramFactor(1.2f); + wordGramConfiguration.setThreeGramFactor(1.5f); + wordGramConfiguration.setFourGramFactor(1.8f); + PartialConfiguration partialConfiguration = new PartialConfiguration(); partialConfiguration.setFactor(0.4f); partialConfiguration.setUrnFactor(0.7f); testQueryConfig.setExactMatch(exactMatchConfiguration); + testQueryConfig.setWordGram(wordGramConfiguration); testQueryConfig.setPartial(partialConfiguration); } @@ -113,10 +120,10 @@ public void testSearchRequestHandler() { HighlightBuilder highlightBuilder = sourceBuilder.highlighter(); List fields = highlightBuilder.fields().stream().map(HighlightBuilder.Field::name).collect(Collectors.toList()); - assertEquals(fields.size(), 20); + assertEquals(fields.size(), 22); List highlightableFields = ImmutableList.of("keyPart1", "textArrayField", "textFieldOverride", "foreignKey", "nestedForeignKey", - "nestedArrayStringField", "nestedArrayArrayField", "customProperties", "esObjectField"); + "nestedArrayStringField", "nestedArrayArrayField", "customProperties", "esObjectField", "wordGramField"); highlightableFields.forEach(field -> { assertTrue(fields.contains(field), "Missing: " + field); assertTrue(fields.contains(field + ".*"), "Missing: " + field + ".*"); diff --git a/metadata-io/src/test/java/io/datahub/test/fixtures/elasticsearch/FixtureReader.java b/metadata-io/src/test/java/io/datahub/test/fixtures/elasticsearch/FixtureReader.java index 2b37d86f058db..a0c551b28b507 100644 --- a/metadata-io/src/test/java/io/datahub/test/fixtures/elasticsearch/FixtureReader.java +++ b/metadata-io/src/test/java/io/datahub/test/fixtures/elasticsearch/FixtureReader.java @@ -36,6 +36,8 @@ public class FixtureReader { @Builder.Default private String targetIndexPrefix = ""; + private long refreshIntervalSeconds; + public Set read() throws IOException { try (Stream files = Files.list(Paths.get(String.format("%s/%s", inputBase, fixtureName)))) { return 
files.map(file -> { @@ -64,7 +66,7 @@ public Set read() throws IOException { } finally { bulkProcessor.flush(); try { - Thread.sleep(1000); + Thread.sleep(1000 * refreshIntervalSeconds); } catch (InterruptedException ignored) { } } diff --git a/metadata-jobs/mae-consumer-job/build.gradle b/metadata-jobs/mae-consumer-job/build.gradle index e7941a04224e3..ca099eea5a8a3 100644 --- a/metadata-jobs/mae-consumer-job/build.gradle +++ b/metadata-jobs/mae-consumer-job/build.gradle @@ -11,22 +11,27 @@ ext { } dependencies { + implementation project(':metadata-service:factories') implementation project(':metadata-jobs:mae-consumer') // TODO: Extract PE consumer into separate pod. implementation project(':metadata-jobs:pe-consumer') + implementation(externalDependency.springBootStarterWeb) { exclude module: "spring-boot-starter-tomcat" } implementation externalDependency.springBootStarterJetty implementation externalDependency.springKafka + implementation externalDependency.springBootAutoconfigure + implementation externalDependency.springActuator implementation externalDependency.slf4jApi implementation externalDependency.log4j2Api compileOnly externalDependency.lombok implementation externalDependency.logbackClassic + testImplementation project(':metadata-dao-impl:kafka-producer') testImplementation externalDependency.springBootTest - testCompile externalDependency.mockito - testCompile externalDependency.testng + testImplementation externalDependency.mockito + testImplementation externalDependency.testng } bootJar { @@ -43,6 +48,8 @@ docker { include 'docker/monitoring/*' include "docker/${docker_repo}/*" include 'metadata-models/src/main/resources/*' + }.exclude { + i -> i.file.isHidden() || i.file == buildDir } tag("Debug", "${docker_registry}/${docker_repo}:debug") @@ -55,7 +62,7 @@ tasks.getByName("docker").dependsOn([bootJar]) task cleanLocalDockerImages { doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}".toString()) + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } dockerClean.finalizedBy(cleanLocalDockerImages) \ No newline at end of file diff --git a/metadata-jobs/mae-consumer/build.gradle b/metadata-jobs/mae-consumer/build.gradle index 26b3d82b8570a..69fe2255a6916 100644 --- a/metadata-jobs/mae-consumer/build.gradle +++ b/metadata-jobs/mae-consumer/build.gradle @@ -11,40 +11,41 @@ configurations { dependencies { avro project(path: ':metadata-models', configuration: 'avroSchema') - compile project(':li-utils') - compile (project(':metadata-service:factories')) { + implementation project(':li-utils') + implementation(project(':metadata-service:factories')) { exclude group: 'org.neo4j.test' } - compile project(':metadata-service:auth-config') - compile project(':metadata-service:restli-client') - compile project(':metadata-io') - compile project(':ingestion-scheduler') - compile project(':metadata-utils') - compile project(":entity-registry") - compile project(':metadata-events:mxe-avro-1.7') - compile project(':metadata-events:mxe-registration') - compile project(':metadata-events:mxe-utils-avro-1.7') + implementation project(':metadata-service:auth-config') + implementation project(':metadata-service:restli-client') + implementation project(':metadata-io') + implementation project(':ingestion-scheduler') + implementation project(':metadata-utils') + implementation project(":entity-registry") + implementation project(':metadata-events:mxe-avro-1.7') + implementation project(':metadata-events:mxe-registration') 
+ implementation project(':metadata-events:mxe-utils-avro-1.7') + implementation project(':datahub-graphql-core') - compile externalDependency.elasticSearchRest - compile externalDependency.kafkaAvroSerde + implementation externalDependency.elasticSearchRest + implementation externalDependency.kafkaAvroSerde implementation externalDependency.protobuf - compile externalDependency.neo4jJavaDriver + implementation externalDependency.neo4jJavaDriver - compile externalDependency.springKafka - compile externalDependency.springActuator + implementation externalDependency.springKafka + implementation externalDependency.springActuator implementation externalDependency.slf4jApi compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok - runtime externalDependency.logbackClassic + runtimeOnly externalDependency.logbackClassic - testCompile externalDependency.mockito + testImplementation externalDependency.mockito implementation externalDependency.awsMskIamAuth testImplementation externalDependency.springBootTest - testRuntime externalDependency.logbackClassic + testRuntimeOnly externalDependency.logbackClassic } task avroSchemaSources(type: Copy) { diff --git a/metadata-jobs/mce-consumer-job/build.gradle b/metadata-jobs/mce-consumer-job/build.gradle index 5981284e9da3f..b72d4baff23d6 100644 --- a/metadata-jobs/mce-consumer-job/build.gradle +++ b/metadata-jobs/mce-consumer-job/build.gradle @@ -21,6 +21,8 @@ dependencies { } implementation externalDependency.springBootStarterJetty implementation externalDependency.springKafka + implementation externalDependency.springBootAutoconfigure + implementation externalDependency.springActuator implementation spec.product.pegasus.restliDocgen implementation spec.product.pegasus.restliSpringBridge implementation externalDependency.slf4jApi @@ -28,15 +30,16 @@ dependencies { compileOnly externalDependency.lombok implementation externalDependency.logbackClassic - runtime externalDependency.mariadbConnector - runtime externalDependency.mysqlConnector - runtime externalDependency.postgresql + runtimeOnly externalDependency.mariadbConnector + runtimeOnly externalDependency.mysqlConnector + runtimeOnly externalDependency.postgresql annotationProcessor externalDependency.lombok + testImplementation project(':metadata-dao-impl:kafka-producer') testImplementation externalDependency.springBootTest - testCompile externalDependency.mockito - testCompile externalDependency.testng + testImplementation externalDependency.mockito + testImplementation externalDependency.testng } bootJar { @@ -56,6 +59,8 @@ docker { include 'docker/monitoring/*' include "docker/${docker_repo}/*" include 'metadata-models/src/main/resources/*' + }.exclude { + i -> i.file.isHidden() || i.file == buildDir } tag("Debug", "${docker_registry}/${docker_repo}:debug") @@ -68,7 +73,7 @@ tasks.getByName("docker").dependsOn([bootJar]) task cleanLocalDockerImages { doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}".toString()) + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } dockerClean.finalizedBy(cleanLocalDockerImages) \ No newline at end of file diff --git a/metadata-jobs/mce-consumer/build.gradle b/metadata-jobs/mce-consumer/build.gradle index 467d1dbdd3717..0bca55e0e5f92 100644 --- a/metadata-jobs/mce-consumer/build.gradle +++ b/metadata-jobs/mce-consumer/build.gradle @@ -11,24 +11,24 @@ configurations { dependencies { avro project(path: ':metadata-models', configuration: 'avroSchema') - compile 
project(':li-utils') - compile (project(':metadata-service:factories')) { + implementation project(':li-utils') + implementation(project(':metadata-service:factories')) { exclude group: 'org.neo4j.test' } - compile project(':metadata-utils') - compile project(':metadata-events:mxe-schemas') - compile project(':metadata-events:mxe-avro-1.7') - compile project(':metadata-events:mxe-registration') - compile project(':metadata-events:mxe-utils-avro-1.7') - compile project(':metadata-io') - compile project(':metadata-service:restli-client') - compile spec.product.pegasus.restliClient - compile spec.product.pegasus.restliCommon - compile externalDependency.elasticSearchRest + implementation project(':metadata-utils') + implementation project(':metadata-events:mxe-schemas') + implementation project(':metadata-events:mxe-avro-1.7') + implementation project(':metadata-events:mxe-registration') + implementation project(':metadata-events:mxe-utils-avro-1.7') + implementation project(':metadata-io') + implementation project(':metadata-service:restli-client') + implementation spec.product.pegasus.restliClient + implementation spec.product.pegasus.restliCommon + implementation externalDependency.elasticSearchRest implementation externalDependency.protobuf - compile externalDependency.springKafka - compile externalDependency.springActuator + implementation externalDependency.springKafka + implementation externalDependency.springActuator implementation externalDependency.slf4jApi compileOnly externalDependency.lombok diff --git a/metadata-jobs/pe-consumer/build.gradle b/metadata-jobs/pe-consumer/build.gradle index 517b021353f9d..1899a4de15635 100644 --- a/metadata-jobs/pe-consumer/build.gradle +++ b/metadata-jobs/pe-consumer/build.gradle @@ -9,21 +9,21 @@ configurations { dependencies { avro project(path: ':metadata-models', configuration: 'avroSchema') - compile project(':li-utils') - compile project(':metadata-events:mxe-avro-1.7') - compile project(':metadata-events:mxe-registration') - compile project(':metadata-events:mxe-utils-avro-1.7') - compile (project(':metadata-service:factories')) { + implementation project(':li-utils') + implementation project(':metadata-events:mxe-avro-1.7') + implementation project(':metadata-events:mxe-registration') + implementation project(':metadata-events:mxe-utils-avro-1.7') + implementation(project(':metadata-service:factories')) { exclude group: 'org.neo4j.test' } - compile externalDependency.springKafka - compile externalDependency.springActuator + implementation externalDependency.springKafka + implementation externalDependency.springActuator implementation externalDependency.slf4jApi compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok - runtime externalDependency.logbackClassic - testCompile externalDependency.mockito - testRuntime externalDependency.logbackClassic + runtimeOnly externalDependency.logbackClassic + testImplementation externalDependency.mockito + testRuntimeOnly externalDependency.logbackClassic } task avroSchemaSources(type: Copy) { diff --git a/metadata-models-custom/build.gradle b/metadata-models-custom/build.gradle index 4af866502f5dc..95a00766039a8 100644 --- a/metadata-models-custom/build.gradle +++ b/metadata-models-custom/build.gradle @@ -11,10 +11,10 @@ buildscript { plugins { id 'base' + id 'maven-publish' } apply plugin: 'pegasus' - if (project.hasProperty('projVersion')) { project.version = project.projVersion } else { @@ -23,11 +23,11 @@ if (project.hasProperty('projVersion')) { dependencies { - compile 
spec.product.pegasus.data + implementation spec.product.pegasus.data // Uncomment these if you want to depend on models defined in core datahub - //compile project(':li-utils') + //implementation project(':li-utils') //dataModel project(':li-utils') - //compile project(':metadata-models') + //implementation project(':metadata-models') //dataModel project(':metadata-models') } @@ -69,6 +69,6 @@ task modelDeploy(type: Copy) { modelDeploy.dependsOn modelArtifact -install.dependsOn modelDeploy +publish.dependsOn modelDeploy diff --git a/metadata-models-validator/build.gradle b/metadata-models-validator/build.gradle index bd1ec9449fb19..c8d1d2e6651d6 100644 --- a/metadata-models-validator/build.gradle +++ b/metadata-models-validator/build.gradle @@ -1,13 +1,13 @@ apply plugin: 'java' dependencies { - compile project(":entity-registry") - compile spec.product.pegasus.data - compile spec.product.pegasus.generator + implementation project(":entity-registry") + implementation spec.product.pegasus.data + implementation spec.product.pegasus.generator - compile externalDependency.commonsIo - compile externalDependency.findbugsAnnotations - compile externalDependency.guava + implementation externalDependency.commonsIo + implementation externalDependency.findbugsAnnotations + implementation externalDependency.guava implementation externalDependency.slf4jApi runtimeOnly externalDependency.logbackClassic diff --git a/metadata-models/build.gradle b/metadata-models/build.gradle index 432823852a263..2e8efae9b7bce 100644 --- a/metadata-models/build.gradle +++ b/metadata-models/build.gradle @@ -1,6 +1,6 @@ import io.datahubproject.GenerateJsonSchemaTask - +apply plugin: 'java-library' apply plugin: 'pegasus' tasks.withType(JavaCompile).configureEach { @@ -15,16 +15,16 @@ tasks.withType(Test).configureEach { } dependencies { - compile spec.product.pegasus.data + api spec.product.pegasus.data constraints { implementation('org.apache.commons:commons-text:1.10.0') { because 'Vulnerability Issue' } } - compile project(':li-utils') + api project(':li-utils') dataModel project(':li-utils') - testCompile externalDependency.guava + testImplementation externalDependency.guava } mainAvroSchemaJar.dependsOn generateAvroSchema diff --git a/metadata-models/src/main/pegasus/com/linkedin/chart/ChartInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/chart/ChartInfo.pdl index 4339a186f1304..9fea71003ae6e 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/chart/ChartInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/chart/ChartInfo.pdl @@ -20,8 +20,9 @@ record ChartInfo includes CustomProperties, ExternalReference { * Title of the chart */ @Searchable = { - "fieldType": "TEXT_PARTIAL", - "enableAutocomplete": true + "fieldType": "WORD_GRAM", + "enableAutocomplete": true, + "fieldNameAliases": [ "_entityName" ] } title: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/container/ContainerProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/container/ContainerProperties.pdl index 26745fe46caaa..526878cbe60d3 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/container/ContainerProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/container/ContainerProperties.pdl @@ -15,9 +15,10 @@ record ContainerProperties includes CustomProperties, ExternalReference { * Display name of the Asset Container */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ 
"_entityName" ] } name: string @@ -25,7 +26,7 @@ record ContainerProperties includes CustomProperties, ExternalReference { * Fully-qualified name of the Container */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -61,4 +62,4 @@ record ContainerProperties includes CustomProperties, ExternalReference { } } lastModified: optional TimeStamp -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl index 5cb306039506e..c436011eb58db 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl @@ -22,9 +22,10 @@ record DashboardInfo includes CustomProperties, ExternalReference { * Title of the dashboard */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } title: string @@ -126,4 +127,4 @@ record DashboardInfo includes CustomProperties, ExternalReference { * The time when this dashboard last refreshed */ lastRefreshed: optional Time -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/datajob/DataFlowInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/datajob/DataFlowInfo.pdl index 481240740876a..2ff3e8cd930af 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/datajob/DataFlowInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/datajob/DataFlowInfo.pdl @@ -17,9 +17,10 @@ record DataFlowInfo includes CustomProperties, ExternalReference { * Flow name */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl index 8737dd4d9ef52..250fb76003777 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl @@ -18,9 +18,10 @@ record DataJobInfo includes CustomProperties, ExternalReference { * Job name */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataplatform/DataPlatformInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataplatform/DataPlatformInfo.pdl index acc40e9f693ec..5dd35c7f49520 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataplatform/DataPlatformInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataplatform/DataPlatformInfo.pdl @@ -15,9 +15,10 @@ record DataPlatformInfo { */ @validate.strlen.max = 15 @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": false, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: string @@ -25,7 +26,7 @@ record DataPlatformInfo { * The name that will be used for displaying a platform type. 
*/ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataplatforminstance/DataPlatformInstanceProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataplatforminstance/DataPlatformInstanceProperties.pdl index d7ce5565103ee..b24e220ac3bcf 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataplatforminstance/DataPlatformInstanceProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataplatforminstance/DataPlatformInstanceProperties.pdl @@ -16,9 +16,10 @@ record DataPlatformInstanceProperties includes CustomProperties, ExternalReferen * Display name of the Data Platform Instance */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: optional string diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl index 72eefd5e294e4..c63cb1a97c017 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl @@ -19,7 +19,7 @@ record DataProcessInstanceProperties includes CustomProperties, ExternalReferenc * Process name */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -31,6 +31,7 @@ record DataProcessInstanceProperties includes CustomProperties, ExternalReferenc @Searchable = { "fieldType": "KEYWORD", "addToFilters": true, + "fieldName": "processType", "filterNameOverride": "Process Type" } type: optional enum DataProcessType { diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataproduct/DataProductProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataproduct/DataProductProperties.pdl index 3861b7def7669..b2d26094fd0b7 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataproduct/DataProductProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataproduct/DataProductProperties.pdl @@ -13,9 +13,10 @@ record DataProductProperties includes CustomProperties, ExternalReference { * Display name of the Data Product */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: optional string diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl index 57b1fe7693129..ad8705a29d4ed 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl @@ -17,9 +17,10 @@ record DatasetProperties includes CustomProperties, ExternalReference { * Display name of the Dataset */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: optional string @@ -27,7 +28,7 @@ record DatasetProperties includes CustomProperties, ExternalReference { * Fully-qualified name of the Dataset */ @Searchable = { - "fieldType": "TEXT", + "fieldType": 
"WORD_GRAM", "addToFilters": false, "enableAutocomplete": true, "boostScore": 10.0 @@ -77,4 +78,4 @@ record DatasetProperties includes CustomProperties, ExternalReference { */ @deprecated = "Use GlobalTags aspect instead." tags: array[string] = [ ] -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl index 5a0b8657ecb47..5c8c8a4912e4c 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl @@ -14,9 +14,10 @@ record DomainProperties { * Display name of the Domain */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryNodeInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryNodeInfo.pdl index 1e840e5a1df7e..c3388d4f462d4 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryNodeInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryNodeInfo.pdl @@ -35,9 +35,10 @@ record GlossaryNodeInfo { */ @Searchable = { "fieldName": "displayName", - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: optional string @@ -49,4 +50,4 @@ record GlossaryNodeInfo { } id: optional string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryTermInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryTermInfo.pdl index aa2a8b31e3dde..e987a71be7131 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryTermInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryTermInfo.pdl @@ -23,9 +23,10 @@ record GlossaryTermInfo includes CustomProperties { * Display name of the term */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: optional string @@ -75,4 +76,4 @@ record GlossaryTermInfo includes CustomProperties { */ @deprecated rawSchema: optional string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpGroupInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpGroupInfo.pdl index 8d764604237da..28b87476c61bd 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpGroupInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpGroupInfo.pdl @@ -21,7 +21,8 @@ record CorpGroupInfo { "fieldType": "TEXT_PARTIAL" "queryByDefault": true, "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } displayName: optional string diff --git a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl index 6b050f484fedd..48ee53377e582 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl @@ -45,7 +45,7 @@ record CorpUserEditableInfo { * 
DataHub-native display name */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "queryByDefault": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserInfo.pdl index 1cb705d426cc0..382b120fa942a 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserInfo.pdl @@ -26,10 +26,11 @@ record CorpUserInfo includes CustomProperties { * displayName of this user , e.g. Hang Zhang(DataHQ) */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "queryByDefault": true, "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } displayName: optional string @@ -89,7 +90,7 @@ record CorpUserInfo includes CustomProperties { * Common name of this user, format is firstName + lastName (split by a whitespace) */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "queryByDefault": true, "enableAutocomplete": true, "boostScore": 10.0 diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpGroupKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpGroupKey.pdl index 075cc14ddc83b..9e65b8f6e9929 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpGroupKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpGroupKey.pdl @@ -11,10 +11,10 @@ record CorpGroupKey { * The URL-encoded name of the AD/LDAP group. Serves as a globally unique identifier within DataHub. */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "queryByDefault": true, "enableAutocomplete": true, "boostScore": 10.0 } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpUserKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpUserKey.pdl index d1a8a4bb5bb23..476a0ad9704b3 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpUserKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpUserKey.pdl @@ -12,7 +12,7 @@ record CorpUserKey { */ @Searchable = { "fieldName": "ldap", - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "boostScore": 2.0, "enableAutocomplete": true } diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataFlowKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataFlowKey.pdl index bcdb92f75d055..d8342630248b6 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataFlowKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataFlowKey.pdl @@ -19,7 +19,7 @@ record DataFlowKey { * Unique Identifier of the data flow */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true } flowId: string @@ -31,4 +31,4 @@ record DataFlowKey { "fieldType": "TEXT_PARTIAL" } cluster: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataJobKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataJobKey.pdl index d0ac7dbca0f99..60ec51b464dcc 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataJobKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataJobKey.pdl @@ -27,7 +27,7 @@ record DataJobKey { * Unique Identifier of the 
data job */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true } jobId: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataProcessKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataProcessKey.pdl index a5c05029352c2..4df1364a04ebe 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataProcessKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataProcessKey.pdl @@ -13,7 +13,7 @@ record DataProcessKey { * Process name i.e. an ETL job name */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 4.0 } @@ -37,4 +37,4 @@ record DataProcessKey { "queryByDefault": false } origin: FabricType -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DatasetKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DatasetKey.pdl index ea1f9510ed438..70c5d174171af 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DatasetKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DatasetKey.pdl @@ -25,7 +25,7 @@ record DatasetKey { //This is no longer to be used for Dataset native name. Use name, qualifiedName from DatasetProperties instead. @Searchable = { "fieldName": "id" - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryNodeKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryNodeKey.pdl index 88697fe3ff364..51a3bc00f4e9e 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryNodeKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryNodeKey.pdl @@ -12,9 +12,9 @@ import com.linkedin.common.FabricType record GlossaryNodeKey { @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryTermKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryTermKey.pdl index a9f35146da18e..61bcd60cbc754 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryTermKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryTermKey.pdl @@ -13,10 +13,10 @@ record GlossaryTermKey { * The term name, which serves as a unique id */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "fieldName": "id" } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl index 579f1966977a9..050b954c89fb8 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl @@ -20,9 +20,10 @@ record MLFeatureKey { * Name of the feature */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 8.0 + "boostScore": 8.0, + "fieldNameAliases": [ "_entityName" ] } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureTableKey.pdl 
b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureTableKey.pdl index 1f786ad417be7..175a7b0d31b00 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureTableKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureTableKey.pdl @@ -22,9 +22,10 @@ record MLFeatureTableKey { * Name of the feature table */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 8.0 + "boostScore": 8.0, + "fieldNameAliases": [ "_entityName" ] } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelDeploymentKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelDeploymentKey.pdl index 7c36f410fede3..daa1deceb5fc3 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelDeploymentKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelDeploymentKey.pdl @@ -19,9 +19,10 @@ record MLModelDeploymentKey { * Name of the MLModelDeployment */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: string @@ -35,4 +36,4 @@ record MLModelDeploymentKey { "queryByDefault": false } origin: FabricType -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelGroupKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelGroupKey.pdl index 17c401c0b8c48..582a899633c2a 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelGroupKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelGroupKey.pdl @@ -19,9 +19,10 @@ record MLModelGroupKey { * Name of the MLModelGroup */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: string @@ -33,4 +34,4 @@ record MLModelGroupKey { "queryByDefault": false } origin: FabricType -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl index 55fd2bc370846..f097bbda738a2 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl @@ -19,9 +19,10 @@ record MLModelKey { * Name of the MLModel */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: string @@ -35,4 +36,4 @@ record MLModelKey { "queryByDefault": false } origin: FabricType -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLPrimaryKeyKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLPrimaryKeyKey.pdl index 9eb67eaf5f651..ef812df206b46 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLPrimaryKeyKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLPrimaryKeyKey.pdl @@ -21,9 +21,10 @@ record MLPrimaryKeyKey { * Name of the primary key */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 8.0 + "boostScore": 
8.0, + "fieldNameAliases": [ "_entityName" ] } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/TagKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/TagKey.pdl index 47f1a631b4a2c..4622e32dce67b 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/TagKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/TagKey.pdl @@ -11,10 +11,10 @@ record TagKey { * The tag name, which serves as a unique id */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0, "fieldName": "id" } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/query/SearchFlags.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/query/SearchFlags.pdl index 05a94b8fabc4b..be1a30c7f082c 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/query/SearchFlags.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/query/SearchFlags.pdl @@ -28,4 +28,9 @@ record SearchFlags { * Whether to skip aggregates/facets */ skipAggregates:optional boolean = false + + /** + * Whether to request for search suggestions on the _entityName virtualized field + */ + getSuggestions:optional boolean = false } diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/SearchResultMetadata.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/SearchResultMetadata.pdl index 718d80ba4cb36..60f1b568f586a 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/SearchResultMetadata.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/SearchResultMetadata.pdl @@ -12,4 +12,9 @@ record SearchResultMetadata { */ aggregations: array[AggregationMetadata] = [] + /** + * A list of search query suggestions based on the given query + */ + suggestions: array[SearchSuggestion] = [] + } \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/SearchSuggestion.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/SearchSuggestion.pdl new file mode 100644 index 0000000000000..7776ec54fe03e --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/SearchSuggestion.pdl @@ -0,0 +1,24 @@ +namespace com.linkedin.metadata.search + +/** + * The model for the search result + */ +record SearchSuggestion { + + /** + * The suggestion text for this search query + */ + text: string + + /** + * The score for how close this suggestion is to the original search query. + * The closer to 1 means it is closer to the original query and 0 is further away. + */ + score: float + + /** + * How many matches there are with the suggested text for the given field + */ + frequency: long + +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/mxe/SystemMetadata.pdl b/metadata-models/src/main/pegasus/com/linkedin/mxe/SystemMetadata.pdl index b9cf7d58d434e..e0f355229c912 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/mxe/SystemMetadata.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/mxe/SystemMetadata.pdl @@ -14,6 +14,11 @@ record SystemMetadata { */ runId: optional string = "no-run-id-provided" + /** + * The ingestion pipeline id that produced the metadata. Populated in case of batch ingestion. 
+ */ + pipelineName: optional string + /** * The model registry name that was used to process this event */ diff --git a/metadata-models/src/main/pegasus/com/linkedin/notebook/NotebookInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/notebook/NotebookInfo.pdl index 1f4dcf975f48c..8ec5f262890f3 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/notebook/NotebookInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/notebook/NotebookInfo.pdl @@ -18,9 +18,10 @@ record NotebookInfo includes CustomProperties, ExternalReference { * Title of the Notebook */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } title: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/ownership/OwnershipTypeInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/ownership/OwnershipTypeInfo.pdl index 004df6e399be4..3e7b53beff531 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/ownership/OwnershipTypeInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/ownership/OwnershipTypeInfo.pdl @@ -14,7 +14,7 @@ record OwnershipTypeInfo { * Display name of the Ownership Type */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -54,4 +54,4 @@ record OwnershipTypeInfo { } } lastModified: AuditStamp -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl index bb7e22900e168..3ba19d348913b 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl @@ -29,7 +29,7 @@ record QueryProperties { * Optional display name to identify the query. 
*/ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -69,4 +69,4 @@ record QueryProperties { } } lastModified: AuditStamp -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/role/RoleProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/role/RoleProperties.pdl index acebdf5558c59..8422d3c49046c 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/role/RoleProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/role/RoleProperties.pdl @@ -14,9 +14,10 @@ record RoleProperties { * Display name of the IAM Role in the external system */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/tag/TagProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/tag/TagProperties.pdl index 41c500c6fff2f..9df47fac3928a 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/tag/TagProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/tag/TagProperties.pdl @@ -11,9 +11,10 @@ record TagProperties { * Display name of the tag */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: string diff --git a/metadata-service/auth-config/build.gradle b/metadata-service/auth-config/build.gradle index 2e9210804bed9..c7a1128897dd5 100644 --- a/metadata-service/auth-config/build.gradle +++ b/metadata-service/auth-config/build.gradle @@ -1,9 +1,9 @@ apply plugin: 'java' dependencies { - compile project(path: ':metadata-models') - compile project(path: ':metadata-auth:auth-api') - compile externalDependency.guava + implementation project(path: ':metadata-models') + implementation project(path: ':metadata-auth:auth-api') + implementation externalDependency.guava implementation externalDependency.slf4jApi compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok diff --git a/metadata-service/auth-filter/build.gradle b/metadata-service/auth-filter/build.gradle index 2c77850209205..2dd07ef10274c 100644 --- a/metadata-service/auth-filter/build.gradle +++ b/metadata-service/auth-filter/build.gradle @@ -1,15 +1,17 @@ apply plugin: 'java' dependencies { - compile project(':metadata-auth:auth-api'); - compile project(path: ':metadata-service:auth-config') - compile project(path: ':metadata-service:factories') + implementation project(':metadata-auth:auth-api') + implementation project(':metadata-service:auth-impl') + implementation project(path: ':metadata-service:auth-config') + implementation project(path: ':metadata-service:factories') - compile externalDependency.servletApi + implementation externalDependency.servletApi implementation externalDependency.slf4jApi compileOnly externalDependency.lombok - compile externalDependency.springWeb + implementation externalDependency.springWeb + implementation externalDependency.guice annotationProcessor externalDependency.lombok - testCompile externalDependency.mockito + testImplementation externalDependency.mockito } \ No newline at end of file diff --git a/metadata-service/auth-impl/build.gradle b/metadata-service/auth-impl/build.gradle index aefbf81577a9b..1ffeb99e7ad4a 100644 --- a/metadata-service/auth-impl/build.gradle +++ 
b/metadata-service/auth-impl/build.gradle @@ -6,11 +6,14 @@ compileJava { } dependencies { - compile project(path: ':metadata-models') - compile project(path: ':metadata-auth:auth-api') - compile project(path: ':metadata-service:auth-config') - compile project(path: ':metadata-io') - + implementation project(path: ':metadata-models') + implementation project(path: ':metadata-auth:auth-api') + implementation project(path: ':metadata-service:auth-config') + implementation project(path: ':metadata-io') + + implementation(externalDependency.mixpanel) { + exclude group: 'org.json', module: 'json' + } implementation 'io.jsonwebtoken:jjwt-api:0.11.2' runtimeOnly 'io.jsonwebtoken:jjwt-impl:0.11.2', 'io.jsonwebtoken:jjwt-jackson:0.11.2' @@ -20,6 +23,5 @@ dependencies { annotationProcessor externalDependency.lombok - testCompile externalDependency.mockito - + testImplementation externalDependency.mockito } \ No newline at end of file diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java index 690528059b555..f653ccf72cf54 100644 --- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java @@ -250,11 +250,11 @@ private void addPoliciesToCache(final Map> cache private void addPolicyToCache(final Map> cache, final DataHubPolicyInfo policy) { final List privileges = policy.getPrivileges(); for (String privilege : privileges) { - List existingPolicies = cache.getOrDefault(privilege, new ArrayList<>()); + List existingPolicies = cache.containsKey(privilege) ? new ArrayList<>(cache.get(privilege)) : new ArrayList<>(); existingPolicies.add(policy); cache.put(privilege, existingPolicies); } - List existingPolicies = cache.getOrDefault(ALL, new ArrayList<>()); + List existingPolicies = cache.containsKey(ALL) ? 
new ArrayList<>(cache.get(ALL)) : new ArrayList<>(); existingPolicies.add(policy); cache.put(ALL, existingPolicies); } diff --git a/metadata-service/auth-servlet-impl/build.gradle b/metadata-service/auth-servlet-impl/build.gradle index 3338f3a5c6b94..7945b3b4e9a06 100644 --- a/metadata-service/auth-servlet-impl/build.gradle +++ b/metadata-service/auth-servlet-impl/build.gradle @@ -1,15 +1,17 @@ apply plugin: 'java' dependencies { - compile project(':metadata-auth:auth-api') - compile project(':metadata-service:factories') + implementation project(':metadata-auth:auth-api') + implementation project(':metadata-service:auth-impl') + implementation project(':metadata-service:factories') - compile externalDependency.springCore - compile externalDependency.springWeb - compile externalDependency.springWebMVC - compile externalDependency.graphqlJava - compile externalDependency.springBeans - compile externalDependency.springContext + implementation externalDependency.springCore + implementation externalDependency.springWeb + implementation externalDependency.springWebMVC + implementation externalDependency.graphqlJava + implementation externalDependency.springBeans + implementation externalDependency.springContext + implementation externalDependency.guice implementation externalDependency.slf4jApi compileOnly externalDependency.lombok diff --git a/metadata-service/configuration/build.gradle b/metadata-service/configuration/build.gradle index 8623e53d2554a..30fa3079d29a4 100644 --- a/metadata-service/configuration/build.gradle +++ b/metadata-service/configuration/build.gradle @@ -3,7 +3,7 @@ plugins { } dependencies { - compile externalDependency.jacksonDataBind + implementation externalDependency.jacksonDataBind implementation externalDependency.slf4jApi implementation externalDependency.springCore diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/SearchResultVisualConfig.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/SearchResultVisualConfig.java new file mode 100644 index 0000000000000..7094bbd710f75 --- /dev/null +++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/SearchResultVisualConfig.java @@ -0,0 +1,11 @@ +package com.linkedin.metadata.config; + +import lombok.Data; + +@Data +public class SearchResultVisualConfig { + /** + * Whether to enable visual highlighting on search result names and descriptions.
+ */ + public Boolean enableNameHighlight; +} diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/VisualConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/VisualConfiguration.java index d1c357186e1ae..14ac2406c2256 100644 --- a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/VisualConfiguration.java +++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/VisualConfiguration.java @@ -22,4 +22,9 @@ public class VisualConfiguration { * Queries tab related configurations */ public EntityProfileConfig entityProfile; + + /** + * Search result related configurations + */ + public SearchResultVisualConfig searchResult; } diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/SearchConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/SearchConfiguration.java index 1a56db1bd68b0..b2b5260dc5e70 100644 --- a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/SearchConfiguration.java +++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/SearchConfiguration.java @@ -11,4 +11,5 @@ public class SearchConfiguration { private PartialConfiguration partial; private CustomConfiguration custom; private GraphQueryConfiguration graph; + private WordGramConfiguration wordGram; } diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/WordGramConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/WordGramConfiguration.java new file mode 100644 index 0000000000000..624d2a4c63c4c --- /dev/null +++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/WordGramConfiguration.java @@ -0,0 +1,11 @@ +package com.linkedin.metadata.config.search; + +import lombok.Data; + + +@Data +public class WordGramConfiguration { + private float twoGramFactor; + private float threeGramFactor; + private float fourGramFactor; +} diff --git a/metadata-service/configuration/src/main/resources/application.yml b/metadata-service/configuration/src/main/resources/application.yml index 9f7bf92039fdc..f49498bfa2325 100644 --- a/metadata-service/configuration/src/main/resources/application.yml +++ b/metadata-service/configuration/src/main/resources/application.yml @@ -111,6 +111,8 @@ visualConfig: entityProfile: # we only support default tab for domains right now. In order to implement for other entities, update React code domainDefaultTab: ${DOMAIN_DEFAULT_TAB:} # set to DOCUMENTATION_TAB to show documentation tab first + searchResult: + enableNameHighlight: ${SEARCH_RESULT_NAME_HIGHLIGHT_ENABLED:true} # Enables visual highlighting on search result names/descriptions. 
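For reference, a minimal sketch, not part of the patch itself, of how the new visual-config pieces above nest together. The class and field names (VisualConfiguration.searchResult, SearchResultVisualConfig.enableNameHighlight) are taken from the diff; the example class name is made up for illustration, and the sketch assumes the metadata-service configuration classes are on the classpath.

    // Illustrative sketch only: wiring the new search-result visual config by hand,
    // mirroring visualConfig.searchResult.enableNameHighlight in application.yml.
    import com.linkedin.metadata.config.SearchResultVisualConfig;
    import com.linkedin.metadata.config.VisualConfiguration;

    public class VisualConfigExample {
      public static void main(String[] args) {
        SearchResultVisualConfig searchResult = new SearchResultVisualConfig();
        searchResult.enableNameHighlight = true; // yaml default: SEARCH_RESULT_NAME_HIGHLIGHT_ENABLED:true

        VisualConfiguration visualConfig = new VisualConfiguration();
        visualConfig.searchResult = searchResult;

        // Callers can check the flag before rendering name/description highlights.
        if (Boolean.TRUE.equals(visualConfig.searchResult.enableNameHighlight)) {
          System.out.println("Search result name highlighting is enabled");
        }
      }
    }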
# Storage Layer @@ -198,6 +200,10 @@ elasticsearch: prefixFactor: ${ELASTICSEARCH_QUERY_EXACT_MATCH_PREFIX_FACTOR:1.6} # boost multiplier when exact prefix caseSensitivityFactor: ${ELASTICSEARCH_QUERY_EXACT_MATCH_CASE_FACTOR:0.7} # stacked boost multiplier when case mismatch enableStructured: ${ELASTICSEARCH_QUERY_EXACT_MATCH_ENABLE_STRUCTURED:true} # enable exact match on structured search + wordGram: + twoGramFactor: ${ELASTICSEARCH_QUERY_TWO_GRAM_FACTOR:1.2} # boost multiplier when match on 2-gram tokens + threeGramFactor: ${ELASTICSEARCH_QUERY_THREE_GRAM_FACTOR:1.5} # boost multiplier when match on 3-gram tokens + fourGramFactor: ${ELASTICSEARCH_QUERY_FOUR_GRAM_FACTOR:1.8} # boost multiplier when match on 4-gram tokens # Field weight annotations are typically calibrated for exact match, if partial match is possible on the field use these adjustments partial: urnFactor: ${ELASTICSEARCH_QUERY_PARTIAL_URN_FACTOR:0.5} # multiplier on Urn token match, a partial match on Urn > non-Urn is assumed @@ -288,8 +294,8 @@ featureFlags: alwaysEmitChangeLog: ${ALWAYS_EMIT_CHANGE_LOG:false} # Enables always emitting a MCL even when no changes are detected. Used for Time Based Lineage when no changes occur. searchServiceDiffModeEnabled: ${SEARCH_SERVICE_DIFF_MODE_ENABLED:true} # Enables diff mode for search document writes, reduces amount of writes to ElasticSearch documents for no-ops readOnlyModeEnabled: ${READ_ONLY_MODE_ENABLED:false} # Enables read only mode for an instance. Right now this only affects ability to edit user profile image URL but can be extended - showSearchFiltersV2: ${SHOW_SEARCH_FILTERS_V2:false} # Enables showing the search filters V2 experience. - showBrowseV2: ${SHOW_BROWSE_V2:false} # Enables showing the browse v2 sidebar experience. + showSearchFiltersV2: ${SHOW_SEARCH_FILTERS_V2:true} # Enables showing the search filters V2 experience. + showBrowseV2: ${SHOW_BROWSE_V2:true} # Enables showing the browse v2 sidebar experience. preProcessHooks: uiEnabled: ${PRE_PROCESS_HOOKS_UI_ENABLED:true} # Circumvents Kafka for processing index updates for UI changes sourced from GraphQL to avoid processing delays showAcrylInfo: ${SHOW_ACRYL_INFO:false} # Show different CTAs within DataHub around moving to Managed DataHub. Set to true for the demo site. 
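Similarly, a small sketch, again not part of the patch, of how the word-gram boost factors above map onto the new WordGramConfiguration class. The getter/setter names are the ones Lombok's @Data generates for the fields shown in the diff, the values are the application.yml defaults, and the example class name is hypothetical.

    // Illustrative sketch only: the defaults that ELASTICSEARCH_QUERY_{TWO,THREE,FOUR}_GRAM_FACTOR
    // fall back to, applied to WordGramConfiguration.
    import com.linkedin.metadata.config.search.WordGramConfiguration;

    public class WordGramConfigExample {
      public static void main(String[] args) {
        WordGramConfiguration wordGram = new WordGramConfiguration();
        wordGram.setTwoGramFactor(1.2f);   // boost multiplier for 2-gram matches
        wordGram.setThreeGramFactor(1.5f); // boost multiplier for 3-gram matches
        wordGram.setFourGramFactor(1.8f);  // boost multiplier for 4-gram matches

        System.out.printf("2-gram=%.1f, 3-gram=%.1f, 4-gram=%.1f%n",
            wordGram.getTwoGramFactor(), wordGram.getThreeGramFactor(), wordGram.getFourGramFactor());
      }
    }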
@@ -318,4 +324,4 @@ cache: search: lineage: ttlSeconds: ${CACHE_SEARCH_LINEAGE_TTL_SECONDS:86400} # 1 day - lightningThreshold: ${CACHE_SEARCH_LINEAGE_LIGHTNING_THRESHOLD:300} \ No newline at end of file + lightningThreshold: ${CACHE_SEARCH_LINEAGE_LIGHTNING_THRESHOLD:300} diff --git a/metadata-service/factories/build.gradle b/metadata-service/factories/build.gradle index 796b6ee436b78..f848a5e339781 100644 --- a/metadata-service/factories/build.gradle +++ b/metadata-service/factories/build.gradle @@ -1,54 +1,64 @@ -apply plugin: 'java' +apply plugin: 'java-library' apply from: "../../gradle/versioning/versioning.gradle" dependencies { - compile project(':metadata-io') - compile project(':metadata-utils') - compile project(':metadata-service:auth-impl') - compile project(':metadata-service:auth-config') - compile project(':metadata-service:plugin') - compile project(':metadata-service:configuration') - compile project(':datahub-graphql-core') - compile project(':metadata-service:restli-servlet-impl') - compile project(':metadata-dao-impl:kafka-producer') - compile project(':ingestion-scheduler') - - compile (externalDependency.awsGlueSchemaRegistrySerde) { + api project(':metadata-io') + api project(':metadata-utils') + implementation project(':metadata-service:auth-impl') + api project(':metadata-service:auth-config') + api project(':metadata-service:plugin') + api project(':metadata-service:configuration') + implementation project(':datahub-graphql-core') + implementation project(':metadata-service:restli-servlet-impl') + implementation project(':metadata-dao-impl:kafka-producer') + implementation project(':ingestion-scheduler') + + implementation (externalDependency.awsGlueSchemaRegistrySerde) { exclude group: 'org.json', module: 'json' } - compile externalDependency.elasticSearchRest - compile externalDependency.httpClient - compile externalDependency.gson + implementation externalDependency.elasticSearchRest + implementation externalDependency.httpClient + implementation externalDependency.gson implementation (externalDependency.hazelcast) { exclude group: 'org.json', module: 'json' } - compile externalDependency.hazelcastSpring - compile externalDependency.kafkaClients - compile externalDependency.kafkaAvroSerde + implementation externalDependency.hazelcastSpring + implementation externalDependency.kafkaClients + implementation externalDependency.kafkaAvroSerde compileOnly externalDependency.lombok - compile externalDependency.servletApi - compile externalDependency.springBeans - compile externalDependency.springBootAutoconfigure - compile externalDependency.springBootStarterCache - compile externalDependency.springContext - compile externalDependency.springCore - compile externalDependency.springKafka - compile externalDependency.springWeb + implementation externalDependency.servletApi + api externalDependency.springBeans + implementation externalDependency.springBootAutoconfigure + implementation externalDependency.springBootStarterCache + api externalDependency.springContext + api externalDependency.springCore + api externalDependency.springKafka + api externalDependency.springWeb implementation externalDependency.awsPostgresIamAuth implementation externalDependency.awsRds + implementation(externalDependency.mixpanel) { + exclude group: 'org.json', module: 'json' + } annotationProcessor externalDependency.lombok - compile spec.product.pegasus.restliSpringBridge + implementation spec.product.pegasus.restliSpringBridge implementation spec.product.pegasus.restliDocgen + 
implementation externalDependency.jline + implementation externalDependency.common testImplementation externalDependency.springBootTest + testImplementation externalDependency.mockito + testImplementation externalDependency.testng + testImplementation externalDependency.hazelcastTest + testImplementation externalDependency.javatuples - testCompile externalDependency.mockito - testCompile externalDependency.testng - testCompile externalDependency.hazelcastTest - implementation externalDependency.jline - implementation externalDependency.common + + constraints { + implementation(externalDependency.snappy) { + because("previous versions are vulnerable to CVE-2023-34453 through CVE-2023-34455") + } + } } configurations.all{ diff --git a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/OnBootApplicationListener.java b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/OnBootApplicationListener.java index 0f52bc3816c2d..980cafaceae27 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/OnBootApplicationListener.java +++ b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/OnBootApplicationListener.java @@ -1,6 +1,7 @@ package com.linkedin.metadata.boot; import com.linkedin.gms.factory.config.ConfigurationProvider; +import com.linkedin.gms.factory.kafka.schemaregistry.InternalSchemaRegistryFactory; import java.io.IOException; import java.util.Set; import java.util.concurrent.ExecutorService; @@ -48,12 +49,17 @@ public class OnBootApplicationListener { public void onApplicationEvent(@Nonnull ContextRefreshedEvent event) { log.warn("OnBootApplicationListener context refreshed! {} event: {}", ROOT_WEB_APPLICATION_CONTEXT_ID.equals(event.getApplicationContext().getId()), event); + String schemaRegistryType = provider.getKafka().getSchemaRegistry().getType(); if (ROOT_WEB_APPLICATION_CONTEXT_ID.equals(event.getApplicationContext().getId())) { - executorService.submit(isSchemaRegistryAPIServeletReady()); + if (InternalSchemaRegistryFactory.TYPE.equals(schemaRegistryType)) { + executorService.submit(isSchemaRegistryAPIServletReady()); + } else { + _bootstrapManager.start(); + } } } - public Runnable isSchemaRegistryAPIServeletReady() { + public Runnable isSchemaRegistryAPIServletReady() { return () -> { final HttpGet request = new HttpGet(provider.getKafka().getSchemaRegistry().getUrl()); int timeouts = 30; diff --git a/metadata-service/graphql-servlet-impl/build.gradle b/metadata-service/graphql-servlet-impl/build.gradle index ff64f9a8a8233..52fd20ef32389 100644 --- a/metadata-service/graphql-servlet-impl/build.gradle +++ b/metadata-service/graphql-servlet-impl/build.gradle @@ -1,16 +1,19 @@ apply plugin: 'java' dependencies { - compile project(':datahub-graphql-core') - compile project(':metadata-auth:auth-api') - compile project(':metadata-service:factories') + implementation project(':datahub-graphql-core') + implementation project(':metadata-auth:auth-api') + implementation project(':metadata-service:auth-impl') + implementation project(':metadata-service:factories') - compile externalDependency.springCore - compile externalDependency.springWeb - compile externalDependency.springWebMVC - compile externalDependency.graphqlJava - compile externalDependency.springBeans - compile externalDependency.springContext + implementation externalDependency.servletApi + implementation externalDependency.springCore + implementation externalDependency.springWeb + implementation externalDependency.springWebMVC + implementation 
externalDependency.graphqlJava + implementation externalDependency.springBeans + implementation externalDependency.springContext + implementation externalDependency.guice implementation externalDependency.slf4jApi compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok diff --git a/metadata-service/health-servlet/build.gradle b/metadata-service/health-servlet/build.gradle index 3237c56779ada..6095f724b3cd4 100644 --- a/metadata-service/health-servlet/build.gradle +++ b/metadata-service/health-servlet/build.gradle @@ -2,16 +2,17 @@ apply plugin: 'java' dependencies { - compile project(':metadata-service:factories') + implementation project(':metadata-service:factories') - compile externalDependency.reflections - compile externalDependency.springBoot - compile externalDependency.springCore - compile externalDependency.springDocUI - compile externalDependency.springWeb - compile externalDependency.springWebMVC - compile externalDependency.springBeans - compile externalDependency.springContext + implementation externalDependency.guava + implementation externalDependency.reflections + implementation externalDependency.springBoot + implementation externalDependency.springCore + implementation externalDependency.springDocUI + implementation externalDependency.springWeb + implementation externalDependency.springWebMVC + implementation externalDependency.springBeans + implementation externalDependency.springContext implementation externalDependency.slf4jApi compileOnly externalDependency.lombok implementation externalDependency.antlr4Runtime diff --git a/metadata-service/health-servlet/src/main/java/com/datahub/health/controller/HealthCheckController.java b/metadata-service/health-servlet/src/main/java/com/datahub/health/controller/HealthCheckController.java index 45edcb2a6a5d9..02ca5182cd2be 100644 --- a/metadata-service/health-servlet/src/main/java/com/datahub/health/controller/HealthCheckController.java +++ b/metadata-service/health-servlet/src/main/java/com/datahub/health/controller/HealthCheckController.java @@ -10,6 +10,7 @@ import java.util.Map; import java.util.concurrent.TimeUnit; import java.util.function.Supplier; + import org.elasticsearch.action.admin.cluster.health.ClusterHealthRequest; import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse; import org.elasticsearch.client.RequestOptions; diff --git a/metadata-service/openapi-servlet/build.gradle b/metadata-service/openapi-servlet/build.gradle index 7cd022f97247c..1909b4862d294 100644 --- a/metadata-service/openapi-servlet/build.gradle +++ b/metadata-service/openapi-servlet/build.gradle @@ -2,36 +2,38 @@ apply plugin: 'java' dependencies { - compile project(':metadata-auth:auth-api') - compile project(':metadata-service:factories') - compile project(':metadata-service:schema-registry-api') + implementation project(':metadata-auth:auth-api') + implementation project(':metadata-service:auth-impl') + implementation project(':metadata-service:factories') + implementation project(':metadata-service:schema-registry-api') - compile externalDependency.reflections - compile externalDependency.springBoot - compile externalDependency.springCore - compile(externalDependency.springDocUI) { + implementation externalDependency.reflections + implementation externalDependency.springBoot + implementation externalDependency.springCore + implementation(externalDependency.springDocUI) { exclude group: 'org.springframework.boot' } - compile externalDependency.springWeb - compile 
externalDependency.springWebMVC - compile externalDependency.springBeans - compile externalDependency.springContext + implementation externalDependency.springWeb + implementation externalDependency.springWebMVC + implementation externalDependency.springBeans + implementation externalDependency.springContext implementation externalDependency.slf4jApi compileOnly externalDependency.lombok implementation externalDependency.antlr4Runtime implementation externalDependency.antlr4 + implementation externalDependency.swaggerAnnotations annotationProcessor externalDependency.lombok testImplementation externalDependency.springBootTest testImplementation project(':mock-entity-registry') - testCompile externalDependency.springBoot - testCompile externalDependency.testContainers - testCompile externalDependency.springKafka - testCompile externalDependency.testng - testCompile externalDependency.mockito - testCompile externalDependency.logbackClassic - testCompile externalDependency.jacksonCore - testCompile externalDependency.jacksonDataBind - testCompile externalDependency.springBootStarterWeb + testImplementation externalDependency.springBoot + testImplementation externalDependency.testContainers + testImplementation externalDependency.springKafka + testImplementation externalDependency.testng + testImplementation externalDependency.mockito + testImplementation externalDependency.logbackClassic + testImplementation externalDependency.jacksonCore + testImplementation externalDependency.jacksonDataBind + testImplementation externalDependency.springBootStarterWeb } \ No newline at end of file diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/relationships/RelationshipsController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/relationships/RelationshipsController.java index 796a7774da303..1e37170f37b3b 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/relationships/RelationshipsController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/relationships/RelationshipsController.java @@ -18,8 +18,11 @@ import com.linkedin.metadata.search.utils.QueryUtils; import com.linkedin.metadata.utils.metrics.MetricUtils; import io.datahubproject.openapi.exception.UnauthorizedException; -import io.swagger.annotations.ApiOperation; +import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.Parameter; +import io.swagger.v3.oas.annotations.media.Content; +import io.swagger.v3.oas.annotations.media.Schema; +import io.swagger.v3.oas.annotations.responses.ApiResponse; import io.swagger.v3.oas.annotations.tags.Tag; import java.net.URLDecoder; import java.nio.charset.Charset; @@ -94,7 +97,8 @@ private RelatedEntitiesResult getRelatedEntities(String rawUrn, List rel } @GetMapping(value = "/", produces = MediaType.APPLICATION_JSON_VALUE) - @ApiOperation(code = 0, response = RelatedEntitiesResult.class, value = "") + @Operation(responses = { @ApiResponse(responseCode = "0", description = "", + content = @Content(schema = @Schema(implementation = RelatedEntitiesResult.class)))}) public ResponseEntity getRelationships( @Parameter(name = "urn", required = true, description = "The urn for the entity whose relationships are being queried") diff --git a/metadata-service/plugin/src/test/sample-test-plugins/build.gradle b/metadata-service/plugin/src/test/sample-test-plugins/build.gradle index 7d4b43402a586..f299a35db0f64 100644 --- 
a/metadata-service/plugin/src/test/sample-test-plugins/build.gradle +++ b/metadata-service/plugin/src/test/sample-test-plugins/build.gradle @@ -7,6 +7,7 @@ dependencies { implementation project(path: ':metadata-auth:auth-api') implementation externalDependency.lombok implementation externalDependency.logbackClassic; + implementation 'com.google.code.findbugs:jsr305:3.0.2' testImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.1' testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine:5.8.1' diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json index 7aeca546af3c9..ee6318026e27d 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json @@ -72,6 +72,11 @@ "doc" : "The run id that produced the metadata. Populated in case of batch-ingestion.", "default" : "no-run-id-provided", "optional" : true + }, { + "name" : "pipelineName", + "type" : "string", + "doc" : "The ingestion pipeline id that produced the metadata. Populated in case of batch ingestion.", + "optional" : true }, { "name" : "registryName", "type" : "string", @@ -341,7 +346,8 @@ "doc" : "Title of the chart", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1279,7 +1285,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1405,7 +1412,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1464,7 +1472,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1865,7 +1874,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "qualifiedName", @@ -1876,7 +1886,7 @@ "addToFilters" : false, "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -2061,7 +2071,8 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldName" : "displayName", - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "id", @@ -2097,7 +2108,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "definition", @@ -2161,6 +2173,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL", "queryByDefault" : true } @@ -2288,7 +2301,7 @@ "optional" : true, "Searchable" : { "boostScore" : 10.0, - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } }, { @@ -2340,7 +2353,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL", + "fieldNameAliases" : [ 
"_entityName" ], + "fieldType" : "WORD_GRAM", "queryByDefault" : true } }, { @@ -2403,7 +2417,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } }, { @@ -2496,7 +2510,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } } ], @@ -2516,7 +2530,7 @@ "boostScore" : 2.0, "enableAutocomplete" : true, "fieldName" : "ldap", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -2562,7 +2576,7 @@ "doc" : "Unique Identifier of the data flow", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "cluster", @@ -2599,7 +2613,7 @@ "doc" : "Unique Identifier of the data job", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -3174,7 +3188,7 @@ "type" : "string", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -3192,7 +3206,7 @@ "Searchable" : { "enableAutocomplete" : true, "fieldName" : "id", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -3217,7 +3231,8 @@ "Searchable" : { "boostScore" : 8.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -3282,7 +3297,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "origin", @@ -3849,7 +3865,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldName" : "id", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -3867,7 +3883,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "description", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json index 83ecaf41022c4..d63a938bbce9d 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json @@ -94,7 +94,8 @@ "doc" : "Title of the chart", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1326,7 +1327,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1471,7 +1473,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1530,7 +1533,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1922,7 +1926,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : 
false, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" }, "validate" : { "strlen" : { @@ -1937,7 +1942,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "type", @@ -2111,7 +2116,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "qualifiedName", @@ -2122,7 +2128,7 @@ "addToFilters" : false, "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -2417,7 +2423,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } } ], @@ -2437,6 +2443,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL", "queryByDefault" : true } @@ -2555,7 +2562,7 @@ "boostScore" : 2.0, "enableAutocomplete" : true, "fieldName" : "ldap", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -2585,7 +2592,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL", + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM", "queryByDefault" : true } }, { @@ -2648,7 +2656,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } }, { @@ -2709,7 +2717,7 @@ "optional" : true, "Searchable" : { "boostScore" : 10.0, - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } }, { @@ -2870,7 +2878,7 @@ "doc" : "Unique Identifier of the data flow", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "cluster", @@ -2933,7 +2941,7 @@ "doc" : "Unique Identifier of the data job", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -2986,7 +2994,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldName" : "id", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "origin", @@ -3599,7 +3607,7 @@ "Searchable" : { "boostScore" : 4.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "orchestrator", @@ -3704,7 +3712,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "origin", @@ -4302,7 +4311,8 @@ "Searchable" : { "boostScore" : 8.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -4390,7 +4400,8 @@ "Searchable" : { "boostScore" : 8.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -4484,7 +4495,8 @@ "Searchable" : { "boostScore" : 8.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -4590,7 +4602,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + 
"fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "origin", @@ -4696,7 +4709,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "origin", @@ -4778,7 +4792,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldName" : "id", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -4796,7 +4810,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -4851,7 +4866,7 @@ "Searchable" : { "enableAutocomplete" : true, "fieldName" : "id", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -4879,7 +4894,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "definition", @@ -5057,7 +5073,7 @@ "type" : "string", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -5096,7 +5112,8 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldName" : "displayName", - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "id", @@ -5710,6 +5727,12 @@ "doc" : "Whether to skip aggregates/facets", "default" : false, "optional" : true + }, { + "name" : "getSuggestions", + "type" : "boolean", + "doc" : "Whether to request for search suggestions on the _entityName virtualized field", + "default" : false, + "optional" : true } ] }, { "type" : "enum", @@ -6081,6 +6104,31 @@ }, "doc" : "A list of search result metadata such as aggregations", "default" : [ ] + }, { + "name" : "suggestions", + "type" : { + "type" : "array", + "items" : { + "type" : "record", + "name" : "SearchSuggestion", + "doc" : "The model for the search result", + "fields" : [ { + "name" : "text", + "type" : "string", + "doc" : "The suggestion text for this search query" + }, { + "name" : "score", + "type" : "float", + "doc" : "The score for how close this suggestion is to the original search query.\nThe closer to 1 means it is closer to the original query and 0 is further away." 
+ }, { + "name" : "frequency", + "type" : "long", + "doc" : "How many matches there are with the suggested text for the given field" + } ] + } + }, + "doc" : "A list of search query suggestions based on the given query", + "default" : [ ] } ] }, "doc" : "Metadata specific to the browse result of the queried path" @@ -6187,7 +6235,7 @@ "type" : "int", "doc" : "The total number of entities directly under searched path" } ] - }, "com.linkedin.metadata.search.SearchResultMetadata", "com.linkedin.metadata.snapshot.ChartSnapshot", "com.linkedin.metadata.snapshot.CorpGroupSnapshot", "com.linkedin.metadata.snapshot.CorpUserSnapshot", "com.linkedin.metadata.snapshot.DashboardSnapshot", "com.linkedin.metadata.snapshot.DataFlowSnapshot", "com.linkedin.metadata.snapshot.DataHubPolicySnapshot", "com.linkedin.metadata.snapshot.DataHubRetentionSnapshot", "com.linkedin.metadata.snapshot.DataJobSnapshot", "com.linkedin.metadata.snapshot.DataPlatformSnapshot", "com.linkedin.metadata.snapshot.DataProcessSnapshot", "com.linkedin.metadata.snapshot.DatasetSnapshot", "com.linkedin.metadata.snapshot.GlossaryNodeSnapshot", "com.linkedin.metadata.snapshot.GlossaryTermSnapshot", "com.linkedin.metadata.snapshot.MLFeatureSnapshot", "com.linkedin.metadata.snapshot.MLFeatureTableSnapshot", "com.linkedin.metadata.snapshot.MLModelDeploymentSnapshot", "com.linkedin.metadata.snapshot.MLModelGroupSnapshot", "com.linkedin.metadata.snapshot.MLModelSnapshot", "com.linkedin.metadata.snapshot.MLPrimaryKeySnapshot", "com.linkedin.metadata.snapshot.SchemaFieldSnapshot", "com.linkedin.metadata.snapshot.Snapshot", "com.linkedin.metadata.snapshot.TagSnapshot", "com.linkedin.ml.metadata.BaseData", "com.linkedin.ml.metadata.CaveatDetails", "com.linkedin.ml.metadata.CaveatsAndRecommendations", "com.linkedin.ml.metadata.DeploymentStatus", "com.linkedin.ml.metadata.EthicalConsiderations", "com.linkedin.ml.metadata.EvaluationData", "com.linkedin.ml.metadata.HyperParameterValueType", "com.linkedin.ml.metadata.IntendedUse", "com.linkedin.ml.metadata.IntendedUserType", "com.linkedin.ml.metadata.MLFeatureProperties", "com.linkedin.ml.metadata.MLFeatureTableProperties", "com.linkedin.ml.metadata.MLHyperParam", "com.linkedin.ml.metadata.MLMetric", "com.linkedin.ml.metadata.MLModelDeploymentProperties", "com.linkedin.ml.metadata.MLModelFactorPrompts", "com.linkedin.ml.metadata.MLModelFactors", "com.linkedin.ml.metadata.MLModelGroupProperties", "com.linkedin.ml.metadata.MLModelProperties", "com.linkedin.ml.metadata.MLPrimaryKeyProperties", "com.linkedin.ml.metadata.Metrics", "com.linkedin.ml.metadata.QuantitativeAnalyses", "com.linkedin.ml.metadata.ResultsType", "com.linkedin.ml.metadata.SourceCode", "com.linkedin.ml.metadata.SourceCodeUrl", "com.linkedin.ml.metadata.SourceCodeUrlType", "com.linkedin.ml.metadata.TrainingData", { + }, "com.linkedin.metadata.search.SearchResultMetadata", "com.linkedin.metadata.search.SearchSuggestion", "com.linkedin.metadata.snapshot.ChartSnapshot", "com.linkedin.metadata.snapshot.CorpGroupSnapshot", "com.linkedin.metadata.snapshot.CorpUserSnapshot", "com.linkedin.metadata.snapshot.DashboardSnapshot", "com.linkedin.metadata.snapshot.DataFlowSnapshot", "com.linkedin.metadata.snapshot.DataHubPolicySnapshot", "com.linkedin.metadata.snapshot.DataHubRetentionSnapshot", "com.linkedin.metadata.snapshot.DataJobSnapshot", "com.linkedin.metadata.snapshot.DataPlatformSnapshot", "com.linkedin.metadata.snapshot.DataProcessSnapshot", "com.linkedin.metadata.snapshot.DatasetSnapshot", 
"com.linkedin.metadata.snapshot.GlossaryNodeSnapshot", "com.linkedin.metadata.snapshot.GlossaryTermSnapshot", "com.linkedin.metadata.snapshot.MLFeatureSnapshot", "com.linkedin.metadata.snapshot.MLFeatureTableSnapshot", "com.linkedin.metadata.snapshot.MLModelDeploymentSnapshot", "com.linkedin.metadata.snapshot.MLModelGroupSnapshot", "com.linkedin.metadata.snapshot.MLModelSnapshot", "com.linkedin.metadata.snapshot.MLPrimaryKeySnapshot", "com.linkedin.metadata.snapshot.SchemaFieldSnapshot", "com.linkedin.metadata.snapshot.Snapshot", "com.linkedin.metadata.snapshot.TagSnapshot", "com.linkedin.ml.metadata.BaseData", "com.linkedin.ml.metadata.CaveatDetails", "com.linkedin.ml.metadata.CaveatsAndRecommendations", "com.linkedin.ml.metadata.DeploymentStatus", "com.linkedin.ml.metadata.EthicalConsiderations", "com.linkedin.ml.metadata.EvaluationData", "com.linkedin.ml.metadata.HyperParameterValueType", "com.linkedin.ml.metadata.IntendedUse", "com.linkedin.ml.metadata.IntendedUserType", "com.linkedin.ml.metadata.MLFeatureProperties", "com.linkedin.ml.metadata.MLFeatureTableProperties", "com.linkedin.ml.metadata.MLHyperParam", "com.linkedin.ml.metadata.MLMetric", "com.linkedin.ml.metadata.MLModelDeploymentProperties", "com.linkedin.ml.metadata.MLModelFactorPrompts", "com.linkedin.ml.metadata.MLModelFactors", "com.linkedin.ml.metadata.MLModelGroupProperties", "com.linkedin.ml.metadata.MLModelProperties", "com.linkedin.ml.metadata.MLPrimaryKeyProperties", "com.linkedin.ml.metadata.Metrics", "com.linkedin.ml.metadata.QuantitativeAnalyses", "com.linkedin.ml.metadata.ResultsType", "com.linkedin.ml.metadata.SourceCode", "com.linkedin.ml.metadata.SourceCodeUrl", "com.linkedin.ml.metadata.SourceCodeUrlType", "com.linkedin.ml.metadata.TrainingData", { "type" : "record", "name" : "SystemMetadata", "namespace" : "com.linkedin.mxe", @@ -6204,6 +6252,11 @@ "doc" : "The run id that produced the metadata. Populated in case of batch-ingestion.", "default" : "no-run-id-provided", "optional" : true + }, { + "name" : "pipelineName", + "type" : "string", + "doc" : "The ingestion pipeline id that produced the metadata. Populated in case of batch ingestion.", + "optional" : true }, { "name" : "registryName", "type" : "string", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesV2.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesV2.snapshot.json index de65aa841876f..0b31bf9683d0c 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesV2.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesV2.snapshot.json @@ -117,6 +117,11 @@ "doc" : "The run id that produced the metadata. Populated in case of batch-ingestion.", "default" : "no-run-id-provided", "optional" : true + }, { + "name" : "pipelineName", + "type" : "string", + "doc" : "The ingestion pipeline id that produced the metadata. 
Populated in case of batch ingestion.", + "optional" : true }, { "name" : "registryName", "type" : "string", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesVersionedV2.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesVersionedV2.snapshot.json index b7bcd8db99691..24a4ec2cc6802 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesVersionedV2.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesVersionedV2.snapshot.json @@ -126,6 +126,11 @@ "doc" : "The run id that produced the metadata. Populated in case of batch-ingestion.", "default" : "no-run-id-provided", "optional" : true + }, { + "name" : "pipelineName", + "type" : "string", + "doc" : "The ingestion pipeline id that produced the metadata. Populated in case of batch ingestion.", + "optional" : true }, { "name" : "registryName", "type" : "string", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json index b1489df3db55e..b20953749ac35 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json @@ -94,7 +94,8 @@ "doc" : "Title of the chart", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1032,7 +1033,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1158,7 +1160,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1217,7 +1220,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1618,7 +1622,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "qualifiedName", @@ -1629,7 +1634,7 @@ "addToFilters" : false, "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1806,7 +1811,8 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldName" : "displayName", - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "id", @@ -1842,7 +1848,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "definition", @@ -1906,6 +1913,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL", "queryByDefault" : true } @@ -2033,7 +2041,7 @@ "optional" : true, "Searchable" : { "boostScore" : 10.0, - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } }, { @@ -2085,7 +2093,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : 
true, - "fieldType" : "TEXT_PARTIAL", + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM", "queryByDefault" : true } }, { @@ -2148,7 +2157,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } }, { @@ -2241,7 +2250,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } } ], @@ -2261,7 +2270,7 @@ "boostScore" : 2.0, "enableAutocomplete" : true, "fieldName" : "ldap", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -2307,7 +2316,7 @@ "doc" : "Unique Identifier of the data flow", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "cluster", @@ -2344,7 +2353,7 @@ "doc" : "Unique Identifier of the data job", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -2919,7 +2928,7 @@ "type" : "string", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -2937,7 +2946,7 @@ "Searchable" : { "enableAutocomplete" : true, "fieldName" : "id", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -2962,7 +2971,8 @@ "Searchable" : { "boostScore" : 8.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -3027,7 +3037,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "origin", @@ -3594,7 +3605,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldName" : "id", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -3612,7 +3623,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "description", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json index f4c2d16f84747..e29dd6809b968 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json @@ -94,7 +94,8 @@ "doc" : "Title of the chart", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1032,7 +1033,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1158,7 +1160,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1217,7 +1220,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "description", 
@@ -1618,7 +1622,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "qualifiedName", @@ -1629,7 +1634,7 @@ "addToFilters" : false, "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1800,7 +1805,8 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldName" : "displayName", - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "id", @@ -1836,7 +1842,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "definition", @@ -1900,6 +1907,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL", "queryByDefault" : true } @@ -2027,7 +2035,7 @@ "optional" : true, "Searchable" : { "boostScore" : 10.0, - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } }, { @@ -2079,7 +2087,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL", + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM", "queryByDefault" : true } }, { @@ -2142,7 +2151,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } }, { @@ -2235,7 +2244,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } } ], @@ -2255,7 +2264,7 @@ "boostScore" : 2.0, "enableAutocomplete" : true, "fieldName" : "ldap", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -2301,7 +2310,7 @@ "doc" : "Unique Identifier of the data flow", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "cluster", @@ -2338,7 +2347,7 @@ "doc" : "Unique Identifier of the data job", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -2913,7 +2922,7 @@ "type" : "string", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -2931,7 +2940,7 @@ "Searchable" : { "enableAutocomplete" : true, "fieldName" : "id", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -2956,7 +2965,8 @@ "Searchable" : { "boostScore" : 8.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -3021,7 +3031,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "origin", @@ -3588,7 +3599,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldName" : "id", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -3606,7 +3617,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "description", diff --git 
a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json index 2676c2687bd72..8391af60f8ece 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json @@ -94,7 +94,8 @@ "doc" : "Title of the chart", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1326,7 +1327,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1471,7 +1473,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1530,7 +1533,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1922,7 +1926,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : false, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" }, "validate" : { "strlen" : { @@ -1937,7 +1942,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "type", @@ -2111,7 +2116,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "qualifiedName", @@ -2122,7 +2128,7 @@ "addToFilters" : false, "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -2411,7 +2417,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } } ], @@ -2431,6 +2437,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL", "queryByDefault" : true } @@ -2549,7 +2556,7 @@ "boostScore" : 2.0, "enableAutocomplete" : true, "fieldName" : "ldap", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -2579,7 +2586,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL", + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM", "queryByDefault" : true } }, { @@ -2642,7 +2650,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } }, { @@ -2703,7 +2711,7 @@ "optional" : true, "Searchable" : { "boostScore" : 10.0, - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } }, { @@ -2864,7 +2872,7 @@ "doc" : "Unique Identifier of the data flow", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "cluster", @@ -2927,7 +2935,7 @@ "doc" : "Unique Identifier of the data job", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : 
"TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -2980,7 +2988,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldName" : "id", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "origin", @@ -3593,7 +3601,7 @@ "Searchable" : { "boostScore" : 4.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "orchestrator", @@ -3698,7 +3706,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "origin", @@ -4296,7 +4305,8 @@ "Searchable" : { "boostScore" : 8.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -4384,7 +4394,8 @@ "Searchable" : { "boostScore" : 8.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -4478,7 +4489,8 @@ "Searchable" : { "boostScore" : 8.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -4584,7 +4596,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "origin", @@ -4690,7 +4703,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "origin", @@ -4772,7 +4786,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldName" : "id", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -4790,7 +4804,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -4845,7 +4860,7 @@ "Searchable" : { "enableAutocomplete" : true, "fieldName" : "id", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -4873,7 +4888,8 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "definition", @@ -5051,7 +5067,7 @@ "type" : "string", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -5090,7 +5106,8 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldName" : "displayName", - "fieldType" : "TEXT_PARTIAL" + "fieldNameAliases" : [ "_entityName" ], + "fieldType" : "WORD_GRAM" } }, { "name" : "id", diff --git a/metadata-service/restli-client/build.gradle b/metadata-service/restli-client/build.gradle index 263d4b49197f4..45cf008d3ca7d 100644 --- a/metadata-service/restli-client/build.gradle +++ b/metadata-service/restli-client/build.gradle @@ -1,18 +1,19 @@ apply plugin: 'pegasus' +apply plugin: 'java-library' dependencies { - compile project(':metadata-service:restli-api') - compile project(':metadata-auth:auth-api') - compile project(path: ':metadata-service:restli-api', configuration: 'restClient') - compile project(':metadata-events:mxe-schemas') - compile project(':metadata-utils') + api project(':metadata-service:restli-api') + api project(':metadata-auth:auth-api') + api 
project(path: ':metadata-service:restli-api', configuration: 'restClient') + api project(':metadata-events:mxe-schemas') + api project(':metadata-utils') implementation externalDependency.slf4jApi compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok - compile spec.product.pegasus.restliClient + implementation spec.product.pegasus.restliClient - testCompile externalDependency.mockito - testCompile externalDependency.testng + testImplementation externalDependency.mockito + testImplementation externalDependency.testng } diff --git a/metadata-service/restli-servlet-impl/build.gradle b/metadata-service/restli-servlet-impl/build.gradle index 1028f7c3dcce4..cb307863748c3 100644 --- a/metadata-service/restli-servlet-impl/build.gradle +++ b/metadata-service/restli-servlet-impl/build.gradle @@ -11,7 +11,7 @@ sourceSets { idea { module { testSourceDirs += file('src/integTest/java') - scopes.TEST.plus += [ configurations.integTestCompile ] + scopes.TEST.plus += [ configurations.integTestCompileOnly ] } } @@ -19,6 +19,10 @@ idea { configurations { integTestImplementation.extendsFrom implementation integTestRuntimeOnly.extendsFrom runtimeOnly + integTestCompileOnly { + extendsFrom compileOnly + canBeResolved = true + } modelValidation } @@ -32,34 +36,37 @@ dependencies { } } - compile project(':metadata-service:restli-api') - compile project(':metadata-auth:auth-api') - compile project(path: ':metadata-service:restli-api', configuration: 'dataTemplate') - compile project(':li-utils') - compile project(':metadata-models') - compile project(':metadata-utils') - compile project(':metadata-io') - compile spec.product.pegasus.restliServer + implementation project(':metadata-service:restli-api') + implementation project(':metadata-auth:auth-api') + implementation project(path: ':metadata-service:restli-api', configuration: 'dataTemplate') + implementation project(':li-utils') + implementation project(':metadata-models') + implementation project(':metadata-utils') + implementation project(':metadata-io') + implementation spec.product.pegasus.restliServer implementation externalDependency.slf4jApi - // This is compile and not compileOnly because of restli - compile externalDependency.lombok - compile externalDependency.neo4jJavaDriver - compile externalDependency.opentelemetryAnnotations + implementation externalDependency.dropwizardMetricsCore + implementation externalDependency.dropwizardMetricsJmx + + compileOnly externalDependency.lombok + implementation externalDependency.neo4jJavaDriver + implementation externalDependency.opentelemetryAnnotations runtimeOnly externalDependency.logbackClassic annotationProcessor externalDependency.lombok - testCompile project(':test-models') + testImplementation project(':test-models') + testImplementation project(path: ':test-models', configuration: 'testDataTemplate') testImplementation project(':mock-entity-registry') - testCompile externalDependency.mockito - testCompile externalDependency.testng + testImplementation externalDependency.mockito + testImplementation externalDependency.testng integTestImplementation externalDependency.junitJupiterApi integTestRuntimeOnly externalDependency.junitJupiterEngine - integTestCompile externalDependency.junitJupiterApi - integTestCompile externalDependency.junitJupiterParams + integTestCompileOnly externalDependency.junitJupiterApi + integTestCompileOnly externalDependency.junitJupiterParams modelValidation project(path: ':metadata-models-validator') dataModel project(path: ':metadata-models', 
configuration: 'dataTemplate') diff --git a/metadata-service/schema-registry-api/build.gradle b/metadata-service/schema-registry-api/build.gradle index e60ca7d348b5c..7bf1e558c8906 100644 --- a/metadata-service/schema-registry-api/build.gradle +++ b/metadata-service/schema-registry-api/build.gradle @@ -3,26 +3,26 @@ apply plugin: 'org.hidetake.swagger.generator' dependencies { // Dependencies for open api - compile externalDependency.reflections - compile externalDependency.springBoot - compile externalDependency.springCore - compile externalDependency.springWeb - compile externalDependency.springWebMVC - compile externalDependency.springBeans - compile externalDependency.springContext + implementation externalDependency.reflections + implementation externalDependency.springBoot + implementation externalDependency.springCore + implementation externalDependency.springWeb + implementation externalDependency.springWebMVC + implementation externalDependency.springBeans + implementation externalDependency.springContext implementation externalDependency.antlr4Runtime implementation externalDependency.antlr4 - compile externalDependency.javaxValidation - compile externalDependency.servletApi - compile group: 'javax.annotation', name: 'javax.annotation-api', version: '1.3.2' - compile externalDependency.jacksonDataBind - compile externalDependency.slf4jApi + implementation externalDependency.javaxValidation + implementation externalDependency.servletApi + implementation group: 'javax.annotation', name: 'javax.annotation-api', version: '1.3.2' + implementation externalDependency.jacksonDataBind + implementation externalDependency.slf4jApi // End of dependencies - compile externalDependency.swaggerAnnotations - swaggerCodegen 'io.swagger.codegen.v3:swagger-codegen-cli:3.0.33' + implementation externalDependency.swaggerAnnotations + swaggerCodegen 'io.swagger.codegen.v3:swagger-codegen-cli:3.0.46' - testCompile externalDependency.assertJ + testImplementation externalDependency.assertJ } tasks.register('generateOpenApiPojos', GenerateSwaggerCode) { diff --git a/metadata-service/schema-registry-servlet/build.gradle b/metadata-service/schema-registry-servlet/build.gradle index ec62203ddf0c5..554ac696c94fd 100644 --- a/metadata-service/schema-registry-servlet/build.gradle +++ b/metadata-service/schema-registry-servlet/build.gradle @@ -1,19 +1,20 @@ apply plugin: 'java' dependencies { - compile project(':metadata-service:factories') - compile project(':metadata-service:schema-registry-api') + implementation project(':metadata-service:factories') + implementation project(':metadata-service:schema-registry-api') - compile externalDependency.reflections - compile externalDependency.springBoot - compile externalDependency.springCore - compile(externalDependency.springDocUI) { + implementation externalDependency.reflections + implementation externalDependency.springBoot + implementation externalDependency.springCore + implementation(externalDependency.springDocUI) { exclude group: 'org.springframework.boot' } - compile externalDependency.springWeb - compile externalDependency.springWebMVC - compile externalDependency.springBeans - compile externalDependency.springContext + implementation externalDependency.springWeb + implementation externalDependency.springWebMVC + implementation externalDependency.springBeans + implementation externalDependency.springContext + implementation externalDependency.springBootAutoconfigure implementation externalDependency.slf4jApi compileOnly externalDependency.lombok 
implementation externalDependency.antlr4Runtime @@ -23,14 +24,14 @@ dependencies { testImplementation externalDependency.springBootTest testImplementation project(':mock-entity-registry') - testCompile externalDependency.springBoot - testCompile externalDependency.testContainers - testCompile externalDependency.testContainersKafka - testCompile externalDependency.springKafka - testCompile externalDependency.testng - testCompile externalDependency.mockito - testCompile externalDependency.logbackClassic - testCompile externalDependency.jacksonCore - testCompile externalDependency.jacksonDataBind - testCompile externalDependency.springBootStarterWeb + testImplementation externalDependency.springBoot + testImplementation externalDependency.testContainers + testImplementation externalDependency.testContainersKafka + testImplementation externalDependency.springKafka + testImplementation externalDependency.testng + testImplementation externalDependency.mockito + testImplementation externalDependency.logbackClassic + testImplementation externalDependency.jacksonCore + testImplementation externalDependency.jacksonDataBind + testImplementation externalDependency.springBootStarterWeb } \ No newline at end of file diff --git a/metadata-service/services/build.gradle b/metadata-service/services/build.gradle index adc7b7bf09d99..99345d6f6bc3f 100644 --- a/metadata-service/services/build.gradle +++ b/metadata-service/services/build.gradle @@ -7,32 +7,33 @@ configurations { dependencies { implementation externalDependency.jsonPatch - compile project(':entity-registry') - compile project(':metadata-utils') - compile project(':metadata-events:mxe-avro-1.7') - compile project(':metadata-events:mxe-registration') - compile project(':metadata-events:mxe-utils-avro-1.7') - compile project(':metadata-models') - compile project(':metadata-service:restli-client') - compile project(':metadata-service:configuration') + implementation project(':entity-registry') + implementation project(':metadata-utils') + implementation project(':metadata-events:mxe-avro-1.7') + implementation project(':metadata-events:mxe-registration') + implementation project(':metadata-events:mxe-utils-avro-1.7') + implementation project(':metadata-models') + implementation project(':metadata-service:restli-client') + implementation project(':metadata-service:configuration') implementation externalDependency.slf4jApi implementation externalDependency.swaggerAnnotations - runtime externalDependency.logbackClassic + runtimeOnly externalDependency.logbackClassic compileOnly externalDependency.lombok implementation externalDependency.commonsCollections - compile externalDependency.javatuples - compile externalDependency.javaxValidation - compile externalDependency.opentelemetryAnnotations + implementation externalDependency.javatuples + implementation externalDependency.javaxValidation + implementation externalDependency.opentelemetryAnnotations annotationProcessor externalDependency.lombok - testCompile externalDependency.testng - testCompile externalDependency.junit - testCompile externalDependency.mockito - testCompile externalDependency.mockitoInline + testImplementation externalDependency.testng + testImplementation externalDependency.junit + testImplementation externalDependency.mockito + testImplementation externalDependency.mockitoInline testCompileOnly externalDependency.lombok - testCompile project(':test-models') + testImplementation project(':test-models') + testImplementation project(path: ':test-models', configuration: 
'testDataTemplate') testImplementation project(':datahub-graphql-core') // logback >=1.3 required due to `testcontainers` only testImplementation 'ch.qos.logback:logback-classic:1.4.7' diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/service/TagService.java b/metadata-service/services/src/main/java/com/linkedin/metadata/service/TagService.java index b52d68e2e75ee..9e12fc80a3cdb 100644 --- a/metadata-service/services/src/main/java/com/linkedin/metadata/service/TagService.java +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/service/TagService.java @@ -20,7 +20,7 @@ import java.util.Map; import java.util.Optional; import java.util.stream.Collectors; -import com.linkedin.entity.client.EntityClient; +import com.linkedin.entity.client.EntityClient; import com.datahub.authentication.Authentication; import javax.annotation.Nonnull; import lombok.extern.slf4j.Slf4j; diff --git a/metadata-service/servlet/build.gradle b/metadata-service/servlet/build.gradle index 9242d21201886..eb2cd9c2d3de7 100644 --- a/metadata-service/servlet/build.gradle +++ b/metadata-service/servlet/build.gradle @@ -1,13 +1,16 @@ apply plugin: 'java' dependencies { - compile project(':metadata-io') - compile externalDependency.httpClient - compile externalDependency.servletApi - compile externalDependency.gson - compile externalDependency.jacksonDataBind - compile externalDependency.springWebMVC + implementation project(':metadata-io') + implementation project(':datahub-graphql-core') + implementation project(':entity-registry') + implementation project(':metadata-service:factories') + + implementation externalDependency.httpClient + implementation externalDependency.servletApi + implementation externalDependency.gson + implementation externalDependency.jacksonDataBind + implementation externalDependency.springWebMVC + compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok - compile project(':entity-registry') - compile project(':metadata-service:factories') } diff --git a/metadata-service/war/build.gradle b/metadata-service/war/build.gradle index 7e9aa90664611..ae207e0260e60 100644 --- a/metadata-service/war/build.gradle +++ b/metadata-service/war/build.gradle @@ -12,33 +12,33 @@ ext { ext.apiProject = project(':metadata-service:restli-api') dependencies { - runtime project(':metadata-service:factories') - runtime project(':metadata-service:auth-filter') - runtime project(':metadata-service:servlet') - runtime project(':metadata-service:auth-servlet-impl') - runtime project(':metadata-service:graphql-servlet-impl') - runtime project(':metadata-service:health-servlet') - runtime project(':metadata-service:openapi-servlet') - runtime project(':metadata-service:schema-registry-servlet') - runtime project(':metadata-jobs:mce-consumer') - runtime project(':metadata-jobs:mae-consumer') - runtime project(':metadata-jobs:pe-consumer') + runtimeOnly project(':metadata-service:factories') + runtimeOnly project(':metadata-service:auth-filter') + runtimeOnly project(':metadata-service:servlet') + runtimeOnly project(':metadata-service:auth-servlet-impl') + runtimeOnly project(':metadata-service:graphql-servlet-impl') + runtimeOnly project(':metadata-service:health-servlet') + runtimeOnly project(':metadata-service:openapi-servlet') + runtimeOnly project(':metadata-service:schema-registry-servlet') + runtimeOnly project(':metadata-jobs:mce-consumer') + runtimeOnly project(':metadata-jobs:mae-consumer') + runtimeOnly project(':metadata-jobs:pe-consumer') - runtime 
externalDependency.awsSecretsManagerJdbc - runtime externalDependency.h2 - runtime externalDependency.mariadbConnector - runtime externalDependency.mysqlConnector - runtime externalDependency.postgresql - runtime externalDependency.springWebMVC + runtimeOnly externalDependency.awsSecretsManagerJdbc + runtimeOnly externalDependency.h2 + runtimeOnly externalDependency.mariadbConnector + runtimeOnly externalDependency.mysqlConnector + runtimeOnly externalDependency.postgresql + runtimeOnly externalDependency.springWebMVC - runtime spec.product.pegasus.restliDocgen - runtime spec.product.pegasus.restliSpringBridge + runtimeOnly spec.product.pegasus.restliDocgen + runtimeOnly spec.product.pegasus.restliSpringBridge - runtime externalDependency.log4jCore - runtime externalDependency.log4j2Api - runtime externalDependency.logbackClassic + runtimeOnly externalDependency.log4jCore + runtimeOnly externalDependency.log4j2Api + runtimeOnly externalDependency.logbackClassic implementation externalDependency.awsMskIamAuth - testRuntime externalDependency.logbackClassic + testRuntimeOnly externalDependency.logbackClassic implementation externalDependency.charle } configurations.all{ @@ -72,6 +72,8 @@ docker { include 'docker/monitoring/*' include "docker/${docker_repo}/*" include 'metadata-models/src/main/resources/*' + }.exclude { + i -> i.file.isHidden() || i.file == buildDir } tag("Debug", "${docker_registry}/${docker_repo}:debug") @@ -84,7 +86,7 @@ tasks.getByName("docker").dependsOn([build, war]) task cleanLocalDockerImages { doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}") + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } dockerClean.finalizedBy(cleanLocalDockerImages) diff --git a/metadata-service/war/src/main/resources/boot/policies.json b/metadata-service/war/src/main/resources/boot/policies.json index 3fddf3456ecd7..3cda0269b79f1 100644 --- a/metadata-service/war/src/main/resources/boot/policies.json +++ b/metadata-service/war/src/main/resources/boot/policies.json @@ -19,6 +19,7 @@ "GENERATE_PERSONAL_ACCESS_TOKENS", "MANAGE_ACCESS_TOKENS", "MANAGE_DOMAINS", + "MANAGE_GLOBAL_ANNOUNCEMENTS", "MANAGE_TESTS", "MANAGE_GLOSSARIES", "MANAGE_USER_CREDENTIALS", @@ -102,6 +103,7 @@ "VIEW_ANALYTICS", "GENERATE_PERSONAL_ACCESS_TOKENS", "MANAGE_DOMAINS", + "MANAGE_GLOBAL_ANNOUNCEMENTS", "MANAGE_TESTS", "MANAGE_GLOSSARIES", "MANAGE_TAGS", @@ -190,6 +192,7 @@ "GENERATE_PERSONAL_ACCESS_TOKENS", "MANAGE_ACCESS_TOKENS", "MANAGE_DOMAINS", + "MANAGE_GLOBAL_ANNOUNCEMENTS", "MANAGE_TESTS", "MANAGE_GLOSSARIES", "MANAGE_USER_CREDENTIALS", @@ -283,6 +286,7 @@ "privileges":[ "GENERATE_PERSONAL_ACCESS_TOKENS", "MANAGE_DOMAINS", + "MANAGE_GLOBAL_ANNOUNCEMENTS", "MANAGE_GLOSSARIES", "MANAGE_TAGS" ], diff --git a/metadata-utils/build.gradle b/metadata-utils/build.gradle index 3b04a5dc53d75..9f8ef70a0e728 100644 --- a/metadata-utils/build.gradle +++ b/metadata-utils/build.gradle @@ -1,30 +1,31 @@ -apply plugin: 'java' +apply plugin: 'java-library' dependencies { - compile externalDependency.avro_1_7 - compile externalDependency.commonsLang - compile externalDependency.dropwizardMetricsCore - compile externalDependency.dropwizardMetricsJmx - compile externalDependency.elasticSearchRest - compile externalDependency.httpClient - compile externalDependency.neo4jJavaDriver - compile externalDependency.json - - compile spec.product.pegasus.restliClient - compile spec.product.pegasus.restliCommon - compile spec.product.pegasus.restliServer - - 
compile project(':li-utils') - compile project(':entity-registry') - compile project(':metadata-events:mxe-avro-1.7') - compile project(':metadata-events:mxe-utils-avro-1.7') + api externalDependency.avro_1_7 + implementation externalDependency.commonsLang + api externalDependency.dropwizardMetricsCore + implementation externalDependency.dropwizardMetricsJmx + api externalDependency.elasticSearchRest + implementation externalDependency.httpClient + api externalDependency.neo4jJavaDriver + api externalDependency.json + + implementation spec.product.pegasus.restliClient + implementation spec.product.pegasus.restliCommon + implementation spec.product.pegasus.restliServer + + api project(':li-utils') + api project(':entity-registry') + api project(':metadata-events:mxe-avro-1.7') + api project(':metadata-events:mxe-utils-avro-1.7') implementation externalDependency.slf4jApi compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok - testCompile project(':test-models') + testImplementation project(':test-models') + testImplementation project(path: ':test-models', configuration: 'testDataTemplate') constraints { implementation(externalDependency.log4jCore) { diff --git a/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java b/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java index c46d02a6eadf0..0b0d462f079bf 100644 --- a/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java +++ b/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java @@ -64,6 +64,11 @@ public class PoliciesConfig { "Manage Domains", "Create and remove Asset Domains."); + public static final Privilege MANAGE_GLOBAL_ANNOUNCEMENTS_PRIVILEGE = Privilege.of( + "MANAGE_GLOBAL_ANNOUNCEMENTS", + "Manage Home Page Posts", + "Create and delete home page posts"); + public static final Privilege MANAGE_TESTS_PRIVILEGE = Privilege.of( "MANAGE_TESTS", "Manage Tests", @@ -113,6 +118,7 @@ public class PoliciesConfig { MANAGE_USERS_AND_GROUPS_PRIVILEGE, VIEW_ANALYTICS_PRIVILEGE, MANAGE_DOMAINS_PRIVILEGE, + MANAGE_GLOBAL_ANNOUNCEMENTS_PRIVILEGE, MANAGE_INGESTION_PRIVILEGE, MANAGE_SECRETS_PRIVILEGE, GENERATE_PERSONAL_ACCESS_TOKENS_PRIVILEGE, @@ -192,8 +198,8 @@ public class PoliciesConfig { public static final Privilege EDIT_ENTITY_PRIVILEGE = Privilege.of( "EDIT_ENTITY", - "Edit All", - "The ability to edit any information about an entity. Super user privileges."); + "Edit Entity", + "The ability to edit any information about an entity. Super user privileges for the entity."); public static final Privilege DELETE_ENTITY_PRIVILEGE = Privilege.of( "DELETE_ENTITY", diff --git a/perf-test/README.md b/perf-test/README.md index 24fb064d3e28a..191833361eae9 100644 --- a/perf-test/README.md +++ b/perf-test/README.md @@ -58,7 +58,9 @@ locust -f perf-test/locustfiles/ingest.py This will set up the web interface in http://localhost:8089 (unless the port is already taken). Once you click into it, you should see the following -![Locust Example](../docs/imgs/locust-example.png) +

+ [Locust Example screenshot]

Input the number of users you would like to spawn and the spawn rate. Point the host to the deployed DataHub GMS ( locally, it should be http://localhost:8080). Click on the "Start swarming" button to start the load test. diff --git a/smoke-test/run-quickstart.sh b/smoke-test/run-quickstart.sh index 050b5d2db95c9..d40e4a5e7a4aa 100755 --- a/smoke-test/run-quickstart.sh +++ b/smoke-test/run-quickstart.sh @@ -15,4 +15,4 @@ echo "test_user:test_pass" >> ~/.datahub/plugins/frontend/auth/user.props echo "DATAHUB_VERSION = $DATAHUB_VERSION" DATAHUB_TELEMETRY_ENABLED=false \ DOCKER_COMPOSE_BASE="file://$( dirname "$DIR" )" \ -datahub docker quickstart --version ${DATAHUB_VERSION} --standalone_consumers --dump-logs-on-failure --kafka-setup +datahub docker quickstart --version ${DATAHUB_VERSION} --standalone_consumers --dump-logs-on-failure --kafka-setup \ No newline at end of file diff --git a/smoke-test/tests/cypress/cypress/e2e/lineage/lineage_column_level.js b/smoke-test/tests/cypress/cypress/e2e/lineage/lineage_column_level.js new file mode 100644 index 0000000000000..2a8fe045f154e --- /dev/null +++ b/smoke-test/tests/cypress/cypress/e2e/lineage/lineage_column_level.js @@ -0,0 +1,51 @@ +const DATASET_ENTITY_TYPE = 'dataset'; +const DATASET_URN = 'urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)'; + +describe("column-level lineage graph test", () => { + + it("navigate to lineage graph view and verify that column-level lineage is showing correctly", () => { + cy.login(); + cy.goToEntityLineageGraph(DATASET_ENTITY_TYPE, DATASET_URN); + //verify columns not shown by default + cy.waitTextVisible("SampleCypressHdfs"); + cy.waitTextVisible("SampleCypressHive"); + cy.waitTextVisible("cypress_logging"); + cy.ensureTextNotPresent("shipment_info"); + cy.ensureTextNotPresent("field_foo"); + cy.ensureTextNotPresent("field_baz"); + cy.ensureTextNotPresent("event_name"); + cy.ensureTextNotPresent("event_data"); + cy.ensureTextNotPresent("timestamp"); + cy.ensureTextNotPresent("browser"); + cy.clickOptionWithTestId("column-toggle") + //verify columns appear and belong co correct dataset + cy.waitTextVisible("shipment_info"); + cy.waitTextVisible("shipment_info.date"); + cy.waitTextVisible("shipment_info.target"); + cy.waitTextVisible("shipment_info.destination"); + cy.waitTextVisible("shipment_info.geo_info"); + cy.waitTextVisible("field_foo"); + cy.waitTextVisible("field_baz"); + cy.waitTextVisible("event_name"); + cy.waitTextVisible("event_data"); + cy.waitTextVisible("timestamp"); + cy.waitTextVisible("browser"); + //verify columns can be hidden and shown again + cy.contains("Hide").click({ force:true }); + cy.ensureTextNotPresent("field_foo"); + cy.ensureTextNotPresent("field_baz"); + cy.get("[aria-label='down']").eq(1).click({ force:true }); + cy.waitTextVisible("field_foo"); + cy.waitTextVisible("field_baz"); + //verify columns can be disabled successfully + cy.clickOptionWithTestId("column-toggle") + cy.ensureTextNotPresent("shipment_info"); + cy.ensureTextNotPresent("field_foo"); + cy.ensureTextNotPresent("field_baz"); + cy.ensureTextNotPresent("event_name"); + cy.ensureTextNotPresent("event_data"); + cy.ensureTextNotPresent("timestamp"); + cy.ensureTextNotPresent("browser"); + }); + +}); \ No newline at end of file diff --git a/smoke-test/tests/cypress/cypress/e2e/mutations/deprecations.js b/smoke-test/tests/cypress/cypress/e2e/mutations/deprecations.js index 1d41d155440e8..2fa11654a3c3e 100644 --- a/smoke-test/tests/cypress/cypress/e2e/mutations/deprecations.js +++ 
b/smoke-test/tests/cypress/cypress/e2e/mutations/deprecations.js @@ -1,19 +1,29 @@ -describe("deprecation", () => { +describe("dataset deprecation", () => { it("go to dataset and check deprecation works", () => { const urn = "urn:li:dataset:(urn:li:dataPlatform:hive,cypress_logging_events,PROD)"; const datasetName = "cypress_logging_events"; cy.login(); - cy.goToDataset(urn, datasetName); cy.openThreeDotDropdown(); cy.clickOptionWithText("Mark as deprecated"); cy.addViaFormModal("test deprecation", "Add Deprecation Details"); - - cy.goToDataset(urn, datasetName); - cy.contains("DEPRECATED"); - + cy.waitTextVisible("Deprecation Updated"); + cy.waitTextVisible("DEPRECATED") cy.openThreeDotDropdown(); cy.clickOptionWithText("Mark as un-deprecated"); + cy.waitTextVisible("Deprecation Updated"); + cy.ensureTextNotPresent("DEPRECATED"); + cy.openThreeDotDropdown(); + cy.clickOptionWithText("Mark as deprecated"); + cy.addViaFormModal("test deprecation", "Add Deprecation Details"); + cy.waitTextVisible("Deprecation Updated"); + cy.waitTextVisible("DEPRECATED"); + cy.contains("DEPRECATED").trigger("mouseover", { force: true }); + cy.waitTextVisible("Deprecation note"); + cy.get("[role='tooltip']").contains("Mark as un-deprecated").click(); + cy.waitTextVisible("Confirm Mark as un-deprecated"); + cy.get("button").contains("Yes").click(); + cy.waitTextVisible("Marked assets as un-deprecated!"); cy.ensureTextNotPresent("DEPRECATED"); - }); + }); }); diff --git a/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js b/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js new file mode 100644 index 0000000000000..e4e5a39ce1100 --- /dev/null +++ b/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js @@ -0,0 +1,97 @@ +const test_id = Math.floor(Math.random() * 100000); +const documentation_edited = `This is test${test_id} documentation EDITED`; +const wrong_url = "https://www.linkedincom"; +const correct_url = "https://www.linkedin.com"; + +describe("edit documentation and link to dataset", () => { + it("open test dataset page, edit documentation", () => { + //edit documentation and verify changes saved + cy.loginWithCredentials(); + cy.visit( + "/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)/Schema" + ); + cy.get("[role='tab']").contains("Documentation").click(); + cy.waitTextVisible("my hive dataset"); + cy.waitTextVisible("Sample doc"); + cy.clickOptionWithText("Edit"); + cy.focused().clear(); + cy.focused().type(documentation_edited); + cy.get("button").contains("Save").click(); + cy.waitTextVisible("Description Updated"); + cy.waitTextVisible(documentation_edited); + //return documentation to original state + cy.clickOptionWithText("Edit"); + cy.focused().clear().wait(1000); + cy.focused().type("my hive dataset"); + cy.get("button").contains("Save").click(); + cy.waitTextVisible("Description Updated"); + cy.waitTextVisible("my hive dataset"); + }); + + it("open test dataset page, remove and add dataset link", () => { + cy.loginWithCredentials(); + cy.visit( + "/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)/Schema" + ); + cy.get("[role='tab']").contains("Documentation").click(); + cy.contains("Sample doc").trigger("mouseover", { force: true }); + cy.get('[data-icon="delete"]').click(); + cy.waitTextVisible("Link Removed"); + cy.get("button").contains("Add Link").click(); + cy.get("#addLinkForm_url").type(wrong_url); + cy.waitTextVisible("This field must be a valid url."); + 
cy.focused().clear(); + cy.waitTextVisible("A URL is required."); + cy.focused().type(correct_url); + cy.ensureTextNotPresent("This field must be a valid url."); + cy.get("#addLinkForm_label").type("Sample doc"); + cy.get('[role="dialog"] button').contains("Add").click(); + cy.waitTextVisible("Link Added"); + cy.get("[role='tab']").contains("Documentation").click(); + cy.get(`[href='${correct_url}']`).should("be.visible"); + }); + + it("open test domain page, remove and add dataset link", () => { + cy.loginWithCredentials(); + cy.visit("/domain/urn:li:domain:marketing/Entities"); + cy.get("[role='tab']").contains("Documentation").click(); + cy.get("button").contains("Add Link").click(); + cy.get("#addLinkForm_url").type(wrong_url); + cy.waitTextVisible("This field must be a valid url."); + cy.focused().clear(); + cy.waitTextVisible("A URL is required."); + cy.focused().type(correct_url); + cy.ensureTextNotPresent("This field must be a valid url."); + cy.get("#addLinkForm_label").type("Sample doc"); + cy.get('[role="dialog"] button').contains("Add").click(); + cy.waitTextVisible("Link Added"); + cy.get("[role='tab']").contains("Documentation").click(); + cy.get(`[href='${correct_url}']`).should("be.visible"); + cy.contains("Sample doc").trigger("mouseover", { force: true }); + cy.get('[data-icon="delete"]').click(); + cy.waitTextVisible("Link Removed"); + }); + + it("edit field documentation", () => { + cy.loginWithCredentials(); + cy.visit( + "/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)/Schema" + ); + cy.get("tbody [data-icon='edit']").first().click({ force: true }); + cy.waitTextVisible("Update description"); + cy.waitTextVisible("Foo field description has changed"); + cy.focused().clear().wait(1000); + cy.focused().type(documentation_edited); + cy.get("button").contains("Update").click(); + cy.waitTextVisible("Updated!"); + cy.waitTextVisible(documentation_edited); + cy.waitTextVisible("(edited)"); + cy.get("tbody [data-icon='edit']").first().click({ force: true }); + cy.focused().clear().wait(1000); + cy.focused().type("Foo field description has changed"); + cy.get("button").contains("Update").click(); + cy.waitTextVisible("Updated!"); + cy.waitTextVisible("Foo field description has changed"); + cy.waitTextVisible("(edited)"); + }); +}); diff --git a/smoke-test/tests/cypress/cypress/e2e/mutations/managed_ingestion.js b/smoke-test/tests/cypress/cypress/e2e/mutations/managed_ingestion.js index ddda8626fba2f..24a24cc21138d 100644 --- a/smoke-test/tests/cypress/cypress/e2e/mutations/managed_ingestion.js +++ b/smoke-test/tests/cypress/cypress/e2e/mutations/managed_ingestion.js @@ -31,8 +31,7 @@ describe("run managed ingestion", () => { cy.waitTextVisible(testName) cy.contains(testName).parent().within(() => { - // TODO: Skipping until disk size resolved - // cy.contains("Succeeded", {timeout: 30000}) + cy.contains("Succeeded", {timeout: 180000}) cy.clickOptionWithTestId("delete-button"); }) cy.clickOptionWithText("Yes") diff --git a/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js b/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js index 7686acfe50de0..9559435ff01c8 100644 --- a/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js +++ b/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js @@ -64,6 +64,7 @@ describe("create and manage group", () => { }); it("update group info", () => { + var expected_name = Cypress.env('ADMIN_USERNAME'); cy.loginWithCredentials(); 
cy.visit("/settings/identities/groups"); cy.clickOptionWithText(group_name); @@ -77,13 +78,13 @@ describe("create and manage group", () => { cy.contains("Test group description EDITED").should("be.visible"); cy.clickOptionWithText("Add Owners"); cy.contains("Search for users or groups...").click({ force: true }); - cy.focused().type(Cypress.env('ADMIN_USERNAME')); - cy.get(".ant-select-item-option").contains(Cypress.env('ADMIN_USERNAME'), { matchCase: false }).click(); + cy.focused().type(expected_name); + cy.get(".ant-select-item-option").contains(expected_name, { matchCase: false }).click(); cy.focused().blur(); - cy.contains(Cypress.env('ADMIN_USERNAME')).should("have.length", 1); + cy.contains(expected_name).should("have.length", 1); cy.get('[role="dialog"] button').contains("Done").click(); cy.waitTextVisible("Owners Added"); - cy.contains(Cypress.env('ADMIN_USERNAME'), { matchCase: false }).should("be.visible"); + cy.contains(expected_name, { matchCase: false }).should("be.visible"); cy.clickOptionWithText("Edit Group"); cy.waitTextVisible("Edit Profile"); cy.get("#email").type(`${test_id}@testemail.com`); diff --git a/smoke-test/tests/cypress/data.json b/smoke-test/tests/cypress/data.json index c6606519e8d73..3b2ee1afaba58 100644 --- a/smoke-test/tests/cypress/data.json +++ b/smoke-test/tests/cypress/data.json @@ -2012,4 +2012,4 @@ }, "systemMetadata": null } -] +] \ No newline at end of file diff --git a/test-models/build.gradle b/test-models/build.gradle index 4cfbcc1399e7d..c74f7249fa1d9 100644 --- a/test-models/build.gradle +++ b/test-models/build.gradle @@ -1,5 +1,5 @@ apply plugin: 'pegasus' -apply plugin: 'java' +apply plugin: 'java-library' tasks.withType(JavaCompile).configureEach { javaCompiler = javaToolchains.compilerFor { @@ -13,8 +13,8 @@ tasks.withType(Test).configureEach { } dependencies { - compile spec.product.pegasus.data - compile externalDependency.commonsIo + implementation spec.product.pegasus.data + implementation externalDependency.commonsIo dataModel project(':metadata-models') dataModel project(':li-utils') } diff --git a/test-models/src/main/pegasus/com/datahub/test/TestEntityInfo.pdl b/test-models/src/main/pegasus/com/datahub/test/TestEntityInfo.pdl index ed30244c31b17..6dff14133ee60 100644 --- a/test-models/src/main/pegasus/com/datahub/test/TestEntityInfo.pdl +++ b/test-models/src/main/pegasus/com/datahub/test/TestEntityInfo.pdl @@ -14,7 +14,8 @@ record TestEntityInfo includes CustomProperties { @Searchable = { "fieldName": "textFieldOverride", "fieldType": "TEXT", - "addToFilters": true + "addToFilters": true, + "fieldNameAliases": [ "_entityName" ] } textField: optional string @@ -25,6 +26,11 @@ record TestEntityInfo includes CustomProperties { } textArrayField: optional array[string] + @Searchable = { + "fieldType": "WORD_GRAM" + } + wordGramField: optional string + @Relationship = { "name": "foreignKey", "entityTypes": []