diff --git a/.github/actions/docker-custom-build-and-push/action.yml b/.github/actions/docker-custom-build-and-push/action.yml index 96d4d759dbb84..bd6bb842b1fb8 100644 --- a/.github/actions/docker-custom-build-and-push/action.yml +++ b/.github/actions/docker-custom-build-and-push/action.yml @@ -30,6 +30,9 @@ inputs: # e.g. latest,head,sha12345 description: "List of tags to use for the Docker image" required: true + target: + description: "Sets the target stage to build" + required: false outputs: image_tag: description: "Docker image tags" @@ -62,6 +65,7 @@ runs: platforms: linux/amd64 build-args: ${{ inputs.build-args }} tags: ${{ steps.docker_meta.outputs.tags }} + target: ${{ inputs.target }} load: true push: false cache-from: type=registry,ref=${{ steps.docker_meta.outputs.tags }} @@ -94,6 +98,7 @@ runs: platforms: ${{ inputs.platforms }} build-args: ${{ inputs.build-args }} tags: ${{ steps.docker_meta.outputs.tags }} + target: ${{ inputs.target }} push: true cache-from: type=registry,ref=${{ steps.docker_meta.outputs.tags }} cache-to: type=inline diff --git a/.github/workflows/docker-ingestion-base.yml b/.github/workflows/docker-ingestion-base.yml index 0d29f79aa5f6c..e69de29bb2d1d 100644 --- a/.github/workflows/docker-ingestion-base.yml +++ b/.github/workflows/docker-ingestion-base.yml @@ -1,45 +0,0 @@ -name: ingestion base -on: - release: - types: [published] - push: - branches: - - master - paths: - - ".github/workflows/docker-ingestion-base.yml" - - "docker/datahub-ingestion-base/**" - - "gradle*" - pull_request: - branches: - - master - paths: - - ".github/workflows/docker-ingestion-base.yml" - - "docker/datahub-ingestion-base/**" - - "gradle*" - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - build-base: - name: Build and Push Docker Image to Docker Hub - runs-on: ubuntu-latest - steps: - - name: Check out the repo - uses: actions/checkout@v3 - 
with: - fetch-depth: 800 - - name: Build and Push image - uses: ./.github/actions/docker-custom-build-and-push - with: - images: | - acryldata/datahub-ingestion-base - tags: latest - username: ${{ secrets.ACRYL_DOCKER_USERNAME }} - password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} - publish: ${{ github.ref == 'refs/heads/master' }} - context: . - file: ./docker/datahub-ingestion-base/Dockerfile - platforms: linux/amd64,linux/arm64/v8 diff --git a/.github/workflows/docker-ingestion.yml b/.github/workflows/docker-ingestion.yml deleted file mode 100644 index f3768cfde5002..0000000000000 --- a/.github/workflows/docker-ingestion.yml +++ /dev/null @@ -1,118 +0,0 @@ -name: datahub-ingestion docker -on: - push: - branches: - - master - paths-ignore: - - "docs/**" - - "**.md" - pull_request: - branches: - - master - paths: - - "metadata-ingestion/**" - - "metadata-models/**" - - "docker/datahub-ingestion/**" - - "docker/datahub-ingestion-slim/**" - - ".github/workflows/docker-ingestion.yml" - release: - types: [published] - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - setup: - runs-on: ubuntu-latest - outputs: - tag: ${{ steps.tag.outputs.tag }} - publish: ${{ steps.publish.outputs.publish }} - python_release_version: ${{ steps.python_release_version.outputs.release_version }} - steps: - - name: Checkout - uses: actions/checkout@v3 - - name: Compute Tag - id: tag - run: | - source .github/scripts/docker_helpers.sh - echo "tag=$(get_tag)" >> $GITHUB_OUTPUT - - name: Compute Python Release Version - id: python_release_version - run: | - source .github/scripts/docker_helpers.sh - echo "release_version=$(get_python_docker_release_v)" >> $GITHUB_OUTPUT - - name: Check whether publishing enabled - id: publish - env: - ENABLE_PUBLISH: ${{ secrets.DOCKER_PASSWORD }} - run: | - echo "Enable publish: ${{ env.ENABLE_PUBLISH != '' }}" - echo "publish=${{ env.ENABLE_PUBLISH != '' }}" >> 
$GITHUB_OUTPUT - push_to_registries: - name: Build and Push Docker Image to Docker Hub - runs-on: ubuntu-latest - needs: setup - steps: - - name: Check out the repo - uses: actions/checkout@v3 - with: - fetch-depth: 800 - - name: Build and push - uses: ./.github/actions/docker-custom-build-and-push - with: - images: | - linkedin/datahub-ingestion - tags: ${{ needs.setup.outputs.tag }} - username: ${{ secrets.DOCKER_USERNAME }} - password: ${{ secrets.DOCKER_PASSWORD }} - publish: ${{ needs.setup.outputs.publish == 'true' }} - context: . - file: ./docker/datahub-ingestion/Dockerfile - platforms: linux/amd64,linux/arm64/v8 - build-args: | - RELEASE_VERSION=${{ needs.setup.outputs.python_release_version }} - - name: Build and Push image (slim) - uses: ./.github/actions/docker-custom-build-and-push - with: - images: | - acryldata/datahub-ingestion-slim - tags: ${{ needs.setup.outputs.tag }} - username: ${{ secrets.ACRYL_DOCKER_USERNAME }} - password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} - publish: ${{ needs.setup.outputs.publish == 'true' }} - context: . 
- file: ./docker/datahub-ingestion-slim/Dockerfile - platforms: linux/amd64,linux/arm64/v8 - ingestion-slim_scan: - permissions: - contents: read # for actions/checkout to fetch code - security-events: write # for github/codeql-action/upload-sarif to upload SARIF results - actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status - name: "[Monitoring] Scan datahub-ingestion-slim images for vulnerabilities" - if: ${{ github.ref == 'refs/heads/master' }} - runs-on: ubuntu-latest - needs: [push_to_registries] - steps: - - name: Checkout # adding checkout step just to make trivy upload happy - uses: actions/checkout@v3 - - name: Download image - uses: ishworkh/docker-image-artifact-download@v1 - with: - image: acryldata/datahub-ingestion-slim:latest - - name: Run Trivy vulnerability scanner - uses: aquasecurity/trivy-action@0.8.0 - env: - TRIVY_OFFLINE_SCAN: true - with: - image-ref: acryldata/datahub-ingestion-slim:latest - format: "template" - template: "@/contrib/sarif.tpl" - output: "trivy-results.sarif" - severity: "CRITICAL,HIGH" - ignore-unfixed: true - vuln-type: "os,library" - - name: Upload Trivy scan results to GitHub Security tab - uses: github/codeql-action/upload-sarif@v2 - with: - sarif_file: "trivy-results.sarif" diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index 1eb2a393600d2..532669c44722c 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -31,13 +31,19 @@ env: DATAHUB_ELASTIC_SETUP_IMAGE: "linkedin/datahub-elasticsearch-setup" DATAHUB_MYSQL_SETUP_IMAGE: "acryldata/datahub-mysql-setup" DATAHUB_UPGRADE_IMAGE: "acryldata/datahub-upgrade" + DATAHUB_INGESTION_BASE_IMAGE: "acryldata/datahub-ingestion-base" + DATAHUB_INGESTION_IMAGE: "acryldata/datahub-ingestion" jobs: setup: runs-on: ubuntu-latest outputs: tag: ${{ steps.tag.outputs.tag }} + slim_tag: ${{ steps.tag.outputs.slim_tag }} + full_tag: ${{ 
steps.tag.outputs.full_tag }} unique_tag: ${{ steps.tag.outputs.unique_tag }} + unique_slim_tag: ${{ steps.tag.outputs.unique_slim_tag }} + unique_full_tag: ${{ steps.tag.outputs.unique_full_tag }} publish: ${{ steps.publish.outputs.publish }} steps: - name: Checkout @@ -47,14 +53,18 @@ jobs: run: | source .github/scripts/docker_helpers.sh echo "tag=$(get_tag)" >> $GITHUB_OUTPUT + echo "slim_tag=$(get_tag)-slim" >> $GITHUB_OUTPUT + echo "full_tag=$(get_tag)-full" >> $GITHUB_OUTPUT echo "unique_tag=$(get_unique_tag)" >> $GITHUB_OUTPUT + echo "unique_slim_tag=$(get_unique_tag)-slim" >> $GITHUB_OUTPUT + echo "unique_full_tag=$(get_unique_tag)-full" >> $GITHUB_OUTPUT - name: Check whether publishing enabled id: publish env: - ENABLE_PUBLISH: ${{ secrets.DOCKER_PASSWORD }} + ENABLE_PUBLISH: ${{ secrets.DOCKER_PASSWORD != '' && secrets.ACRYL_DOCKER_PASSWORD != '' }} run: | - echo "Enable publish: ${{ env.ENABLE_PUBLISH != '' }}" - echo "publish=${{ env.ENABLE_PUBLISH != '' }}" >> $GITHUB_OUTPUT + echo "Enable publish: ${{ env.ENABLE_PUBLISH }}" + echo "publish=${{ env.ENABLE_PUBLISH }}" >> $GITHUB_OUTPUT gms_build: name: Build and Push DataHub GMS Docker Image @@ -414,6 +424,289 @@ jobs: file: ./docker/elasticsearch-setup/Dockerfile platforms: linux/amd64,linux/arm64/v8 + datahub_ingestion_base_build: + name: Build and Push DataHub Ingestion (Base) Docker Image + runs-on: ubuntu-latest + outputs: + tag: ${{ steps.tag.outputs.tag }} + needs: setup + steps: + - name: Check out the repo + uses: actions/checkout@v3 + with: + fetch-depth: 800 + - uses: dorny/paths-filter@v2 + id: filter + with: + filters: | + datahub-ingestion-base: + - 'docker/datahub-ingestion-base/**' + - name: Build and push Base Image + if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' }} + uses: ./.github/actions/docker-custom-build-and-push + with: + target: base + images: | + ${{ env.DATAHUB_INGESTION_BASE_IMAGE }} + tags: ${{ needs.setup.outputs.tag }} + username: ${{ 
secrets.ACRYL_DOCKER_USERNAME }} + password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} + publish: ${{ needs.setup.outputs.publish }} + context: . + file: ./docker/datahub-ingestion-base/Dockerfile + platforms: linux/amd64,linux/arm64/v8 + - name: Compute DataHub Ingestion (Base) Tag + id: tag + run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.tag || 'head' }}" >> $GITHUB_OUTPUT + datahub_ingestion_base_slim_build: + name: Build and Push DataHub Ingestion (Base-Slim) Docker Image + runs-on: ubuntu-latest + outputs: + tag: ${{ steps.tag.outputs.tag }} + needs: [setup, datahub_ingestion_base_build] + steps: + - name: Check out the repo + uses: actions/checkout@v3 + with: + fetch-depth: 800 + - uses: dorny/paths-filter@v2 + id: filter + with: + filters: | + datahub-ingestion-base: + - 'docker/datahub-ingestion-base/**' + - name: Download Base Image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.setup.outputs.publish != 'true' && steps.filter.outputs.datahub-ingestion-base == 'true' }} + with: + image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_tag || 'head' }} + - name: Build and push Base-Slim Image + if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' }} + uses: ./.github/actions/docker-custom-build-and-push + with: + target: slim-install + images: | + ${{ env.DATAHUB_INGESTION_BASE_IMAGE }} + tags: ${{ needs.setup.outputs.slim_tag }} + username: ${{ secrets.ACRYL_DOCKER_USERNAME }} + password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} + build-args: | + APP_ENV=slim + BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_tag || 'head' }} + publish: ${{ needs.setup.outputs.publish }} + context: . 
+ file: ./docker/datahub-ingestion-base/Dockerfile + platforms: linux/amd64,linux/arm64/v8 + - name: Compute DataHub Ingestion (Base-Slim) Tag + id: tag + run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_slim_tag || 'head' }}" >> $GITHUB_OUTPUT + datahub_ingestion_base_full_build: + name: Build and Push DataHub Ingestion (Base-Full) Docker Image + runs-on: ubuntu-latest + outputs: + tag: ${{ steps.tag.outputs.tag }} + needs: [setup, datahub_ingestion_base_build] + steps: + - name: Check out the repo + uses: actions/checkout@v3 + with: + fetch-depth: 800 + - uses: dorny/paths-filter@v2 + id: filter + with: + filters: | + datahub-ingestion-base: + - 'docker/datahub-ingestion-base/**' + - name: Download Base Image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.setup.outputs.publish != 'true' && steps.filter.outputs.datahub-ingestion-base == 'true' }} + with: + image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_tag || 'head' }} + - name: Build and push Base-Full Image + if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' }} + uses: ./.github/actions/docker-custom-build-and-push + with: + target: full-install + images: | + ${{ env.DATAHUB_INGESTION_BASE_IMAGE }} + tags: ${{ needs.setup.outputs.unique_full_tag }} + username: ${{ secrets.ACRYL_DOCKER_USERNAME }} + password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} + build-args: | + APP_ENV=full + BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_tag || 'head' }} + publish: ${{ needs.setup.outputs.publish }} + context: . 
+ file: ./docker/datahub-ingestion-base/Dockerfile + platforms: linux/amd64,linux/arm64/v8 + - name: Compute DataHub Ingestion (Base-Full) Tag + id: tag + run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }}" >> $GITHUB_OUTPUT + + + datahub_ingestion_slim_build: + name: Build and Push DataHub Ingestion Docker Images + runs-on: ubuntu-latest + outputs: + tag: ${{ steps.tag.outputs.tag }} + needs_artifact_download: ${{ (steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true') && needs.setup.outputs.publish != 'true' }} + needs: [setup, datahub_ingestion_base_slim_build] + steps: + - name: Check out the repo + uses: actions/checkout@v3 + with: + fetch-depth: 800 + - uses: dorny/paths-filter@v2 + id: filter + with: + filters: | + datahub-ingestion-base: + - 'docker/datahub-ingestion-base/**' + datahub-ingestion: + - 'docker/datahub-ingestion/**' + - name: Build codegen + if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true' }} + run: ./gradlew :metadata-ingestion:codegen + - name: Download Base Image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.setup.outputs.publish != 'true' && steps.filter.outputs.datahub-ingestion-base == 'true' }} + with: + image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_slim_tag || 'head' }} + - name: Build and push Slim Image + if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true' }} + uses: ./.github/actions/docker-custom-build-and-push + with: + target: final + images: | + ${{ env.DATAHUB_INGESTION_IMAGE }} + build-args: | + BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }} + DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_slim_tag || 'head' }} 
+ APP_ENV=slim + tags: ${{ needs.setup.outputs.slim_tag }} + username: ${{ secrets.ACRYL_DOCKER_USERNAME }} + password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} + publish: ${{ needs.setup.outputs.publish }} + context: . + file: ./docker/datahub-ingestion/Dockerfile + platforms: linux/amd64,linux/arm64/v8 + - name: Compute Tag + id: tag + run: echo "tag=${{ (steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true') && needs.setup.outputs.unique_slim_tag || 'head' }}" >> $GITHUB_OUTPUT + datahub_ingestion_slim_scan: + permissions: + contents: read # for actions/checkout to fetch code + security-events: write # for github/codeql-action/upload-sarif to upload SARIF results + actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status + name: "[Monitoring] Scan Datahub Ingestion Slim images for vulnerabilities" + runs-on: ubuntu-latest + needs: [setup, datahub_ingestion_slim_build] + steps: + - name: Checkout # adding checkout step just to make trivy upload happy + uses: actions/checkout@v3 + - name: Download image Slim Image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.datahub_ingestion_slim_build.outputs.needs_artifact_download == 'true' }} + with: + image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }} + - name: Run Trivy vulnerability scanner Slim Image + uses: aquasecurity/trivy-action@0.8.0 + env: + TRIVY_OFFLINE_SCAN: true + with: + image-ref: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }} + format: "template" + template: "@/contrib/sarif.tpl" + output: "trivy-results.sarif" + severity: "CRITICAL,HIGH" + ignore-unfixed: true + vuln-type: "os,library" + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@v2 + with: + sarif_file: "trivy-results.sarif" + + datahub_ingestion_full_build: + name: Build and Push 
DataHub Ingestion (Full) Docker Images + runs-on: ubuntu-latest + outputs: + tag: ${{ steps.tag.outputs.tag }} + needs_artifact_download: ${{ (steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true') && needs.setup.outputs.publish != 'true' }} + needs: [setup, datahub_ingestion_base_full_build] + steps: + - name: Check out the repo + uses: actions/checkout@v3 + with: + fetch-depth: 800 + - uses: dorny/paths-filter@v2 + id: filter + with: + filters: | + datahub-ingestion-base: + - 'docker/datahub-ingestion-base/**' + datahub-ingestion: + - 'docker/datahub-ingestion/**' + - name: Build codegen + if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true' }} + run: ./gradlew :metadata-ingestion:codegen + - name: Download Base Image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.setup.outputs.publish != 'true' && steps.filter.outputs.datahub-ingestion-base == 'true' }} + with: + image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }} + - name: Build and push Full Image + if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true' }} + uses: ./.github/actions/docker-custom-build-and-push + with: + target: final + images: | + ${{ env.DATAHUB_INGESTION_IMAGE }} + build-args: | + BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }} + DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }} + tags: ${{ needs.setup.outputs.unique_full_tag }} + username: ${{ secrets.ACRYL_DOCKER_USERNAME }} + password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} + publish: ${{ needs.setup.outputs.publish }} + context: . 
+ file: ./docker/datahub-ingestion/Dockerfile + platforms: linux/amd64,linux/arm64/v8 + - name: Compute Tag (Full) + id: tag + run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }}" >> $GITHUB_OUTPUT + datahub_ingestion_full_scan: + permissions: + contents: read # for actions/checkout to fetch code + security-events: write # for github/codeql-action/upload-sarif to upload SARIF results + actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status + name: "[Monitoring] Scan Datahub Ingestion images for vulnerabilities" + runs-on: ubuntu-latest + needs: [setup, datahub_ingestion_full_build] + steps: + - name: Checkout # adding checkout step just to make trivy upload happy + uses: actions/checkout@v3 + - name: Download image Full Image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.datahub_ingestion_full_build.outputs.needs_artifact_download == 'true' }} + with: + image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_full_build.outputs.tag }} + - name: Run Trivy vulnerability scanner Full Image + uses: aquasecurity/trivy-action@0.8.0 + env: + TRIVY_OFFLINE_SCAN: true + with: + image-ref: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_full_build.outputs.tag }} + format: "template" + template: "@/contrib/sarif.tpl" + output: "trivy-results.sarif" + severity: "CRITICAL,HIGH" + ignore-unfixed: true + vuln-type: "os,library" + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@v2 + with: + sarif_file: "trivy-results.sarif" + smoke_test: name: Run Smoke Tests runs-on: ubuntu-latest @@ -432,8 +725,11 @@ jobs: mae_consumer_build, mce_consumer_build, datahub_upgrade_build, + datahub_ingestion_slim_build, ] steps: + - name: Disk Check + run: df -h . 
&& docker images - name: Check out the repo uses: actions/checkout@v3 - name: Set up JDK 11 @@ -450,6 +746,12 @@ jobs: - name: Build datahub cli run: | ./gradlew :metadata-ingestion:install + - name: Disk Check + run: df -h . && docker images + - name: Remove images + run: docker image prune -a -f || true + - name: Disk Check + run: df -h . && docker images - name: Download GMS image uses: ishworkh/docker-image-artifact-download@v1 if: ${{ needs.setup.outputs.publish != 'true' }} @@ -490,13 +792,21 @@ jobs: if: ${{ needs.setup.outputs.publish != 'true' }} with: image: ${{ env.DATAHUB_UPGRADE_IMAGE }}:${{ needs.setup.outputs.unique_tag }} - - name: Disable datahub-actions - run: | - yq -i 'del(.services.datahub-actions)' docker/quickstart/docker-compose-without-neo4j.quickstart.yml + - name: Download datahub-ingestion-slim image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.datahub_ingestion_slim_build.outputs.needs_artifact_download == 'true' }} + with: + image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }} + - name: Disk Check + run: df -h . && docker images - name: run quickstart env: DATAHUB_TELEMETRY_ENABLED: false DATAHUB_VERSION: ${{ needs.setup.outputs.unique_tag }} + DATAHUB_ACTIONS_IMAGE: ${{ env.DATAHUB_INGESTION_IMAGE }} + ACTIONS_VERSION: ${{ needs.datahub_ingestion_slim_build.outputs.tag }} + ACTIONS_EXTRA_PACKAGES: 'acryl-datahub-actions[executor] acryl-datahub-actions' + ACTIONS_CONFIG: 'https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml' run: | ./smoke-test/run-quickstart.sh - name: sleep 60s @@ -504,6 +814,8 @@ jobs: # we are doing this because gms takes time to get ready # and we don't have a better readiness check when bootstrap is done sleep 60s + - name: Disk Check + run: df -h . 
&& docker images - name: Disable ES Disk Threshold run: | curl -XPUT "http://localhost:9200/_cluster/settings" \ @@ -518,6 +830,8 @@ jobs: }' - name: Remove Source Code run: find ./*/* ! -path "./metadata-ingestion*" ! -path "./smoke-test*" ! -path "./gradle*" -delete + - name: Disk Check + run: df -h . && docker images - name: Smoke test env: RUN_QUICKSTART: false @@ -528,11 +842,14 @@ jobs: run: | echo "$DATAHUB_VERSION" ./smoke-test/smoke.sh + - name: Disk Check + run: df -h . && docker images - name: store logs if: failure() run: | docker ps -a docker logs datahub-gms >& gms-${{ matrix.test_strategy }}.log + docker logs datahub-actions >& actions-${{ matrix.test_strategy }}.log - name: Upload logs uses: actions/upload-artifact@v3 if: failure() diff --git a/build.gradle b/build.gradle index 605b4fcc050e7..ae54de07cb81c 100644 --- a/build.gradle +++ b/build.gradle @@ -3,8 +3,8 @@ buildscript { // Releases: https://github.com/linkedin/rest.li/blob/master/CHANGELOG.md ext.pegasusVersion = '29.22.16' ext.mavenVersion = '3.6.3' - ext.springVersion = '5.3.27' - ext.springBootVersion = '2.7.11' + ext.springVersion = '5.3.29' + ext.springBootVersion = '2.7.14' ext.openTelemetryVersion = '1.18.0' ext.neo4jVersion = '4.4.9' ext.testContainersVersion = '1.17.4' @@ -18,6 +18,7 @@ buildscript { ext.logbackClassic = '1.2.12' ext.hadoop3Version = '3.3.5' ext.kafkaVersion = '2.3.0' + ext.hazelcastVersion = '5.3.1' ext.docker_registry = 'linkedin' @@ -38,7 +39,7 @@ buildscript { plugins { id 'com.gorylenko.gradle-git-properties' version '2.4.0-rc2' id 'com.github.johnrengelman.shadow' version '6.1.0' - id "com.palantir.docker" version "0.34.0" + id "com.palantir.docker" version "0.35.0" // https://blog.ltgt.net/javax-jakarta-mess-and-gradle-solution/ // TODO id "org.gradlex.java-ecosystem-capabilities" version "1.0" } @@ -101,9 +102,9 @@ project.ext.externalDependency = [ 'hadoopMapreduceClient':'org.apache.hadoop:hadoop-mapreduce-client-core:2.7.2', "hadoopClient": 
"org.apache.hadoop:hadoop-client:$hadoop3Version", "hadoopCommon3":"org.apache.hadoop:hadoop-common:$hadoop3Version", - 'hazelcast':'com.hazelcast:hazelcast:5.2.3', - 'hazelcastSpring':'com.hazelcast:hazelcast-spring:5.2.1', - 'hazelcastTest':'com.hazelcast:hazelcast:5.2.1:tests', + 'hazelcast':"com.hazelcast:hazelcast:$hazelcastVersion", + 'hazelcastSpring':"com.hazelcast:hazelcast-spring:$hazelcastVersion", + 'hazelcastTest':"com.hazelcast:hazelcast:$hazelcastVersion:tests", 'hibernateCore': 'org.hibernate:hibernate-core:5.2.16.Final', 'httpClient': 'org.apache.httpcomponents:httpclient:4.5.9', 'httpAsyncClient': 'org.apache.httpcomponents:httpasyncclient:4.1.5', @@ -137,6 +138,7 @@ project.ext.externalDependency = [ 'kafkaAvroSerde': 'io.confluent:kafka-streams-avro-serde:5.5.1', 'kafkaAvroSerializer': 'io.confluent:kafka-avro-serializer:5.1.4', 'kafkaClients': "org.apache.kafka:kafka-clients:$kafkaVersion", + 'snappy': 'org.xerial.snappy:snappy-java:1.1.10.3', 'logbackClassic': "ch.qos.logback:logback-classic:$logbackClassic", 'slf4jApi': "org.slf4j:slf4j-api:$slf4jVersion", 'log4jCore': "org.apache.logging.log4j:log4j-core:$log4jVersion", diff --git a/datahub-frontend/build.gradle b/datahub-frontend/build.gradle index f21d10d8f3842..fda33e4a9a3c6 100644 --- a/datahub-frontend/build.gradle +++ b/datahub-frontend/build.gradle @@ -79,6 +79,8 @@ docker { files fileTree(rootProject.projectDir) { include 'docker/monitoring/*' include "docker/${docker_dir}/*" + }.exclude { + i -> i.file.isHidden() || i.file == buildDir } tag("Debug", "${docker_registry}/${docker_repo}:debug") @@ -98,7 +100,7 @@ tasks.getByName("docker").dependsOn(unversionZip) task cleanLocalDockerImages { doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}".toString()) + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } dockerClean.finalizedBy(cleanLocalDockerImages) \ No newline at end of file diff --git 
a/datahub-frontend/play.gradle b/datahub-frontend/play.gradle index 57f64960033aa..e7121d277926d 100644 --- a/datahub-frontend/play.gradle +++ b/datahub-frontend/play.gradle @@ -28,6 +28,9 @@ dependencies { implementation(externalDependency.commonsText) { because("previous versions are vulnerable to CVE-2022-42889") } + implementation(externalDependency.snappy) { + because("previous versions are vulnerable to CVE-2023-34453 through CVE-2023-34455") + } } compile project(":metadata-service:restli-client") diff --git a/datahub-graphql-core/src/main/resources/search.graphql b/datahub-graphql-core/src/main/resources/search.graphql index f15535bfb4eb8..fbea66f738955 100644 --- a/datahub-graphql-core/src/main/resources/search.graphql +++ b/datahub-graphql-core/src/main/resources/search.graphql @@ -448,6 +448,11 @@ enum FilterOperator { * Represent the relation: String field is one of the array values to, e.g. name in ["Profile", "Event"] """ IN + + """ + Represents the relation: The field exists. If the field is an array, the field is present and non-empty.
+ """ + EXISTS } """ diff --git a/datahub-upgrade/build.gradle b/datahub-upgrade/build.gradle index ad2bf02bfdcc7..78d9f6a09948d 100644 --- a/datahub-upgrade/build.gradle +++ b/datahub-upgrade/build.gradle @@ -89,6 +89,8 @@ docker { files fileTree(rootProject.projectDir) { include "docker/${docker_repo}/*" include 'metadata-models/src/main/resources/*' + }.exclude { + i -> i.file.isHidden() || i.file == buildDir } tag("Debug", "${docker_registry}/${docker_repo}:debug") @@ -101,7 +103,7 @@ tasks.getByName("docker").dependsOn([bootJar]) task cleanLocalDockerImages { doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}".toString()) + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } dockerClean.finalizedBy(cleanLocalDockerImages) diff --git a/datahub-web-react/src/app/entity/chart/ChartEntity.tsx b/datahub-web-react/src/app/entity/chart/ChartEntity.tsx index 90032285cd35b..b5ebcbef80379 100644 --- a/datahub-web-react/src/app/entity/chart/ChartEntity.tsx +++ b/datahub-web-react/src/app/entity/chart/ChartEntity.tsx @@ -204,6 +204,8 @@ export class ChartEntity implements Entity { createdMs={data.properties?.created?.time} externalUrl={data.properties?.externalUrl} snippet={} + degree={(result as any).degree} + paths={(result as any).paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/chart/preview/ChartPreview.tsx b/datahub-web-react/src/app/entity/chart/preview/ChartPreview.tsx index 9b69d250e315a..7d0fc143043e2 100644 --- a/datahub-web-react/src/app/entity/chart/preview/ChartPreview.tsx +++ b/datahub-web-react/src/app/entity/chart/preview/ChartPreview.tsx @@ -12,6 +12,7 @@ import { Deprecation, ChartStatsSummary, DataProduct, + EntityPath, } from '../../../../types.generated'; import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; import { useEntityRegistry } from '../../../useEntityRegistry'; @@ -40,6 +41,8 @@ export const ChartPreview = ({ externalUrl, 
parentContainers, snippet, + degree, + paths, }: { urn: string; platform?: string; @@ -62,6 +65,8 @@ export const ChartPreview = ({ externalUrl?: string | null; parentContainers?: ParentContainersResult | null; snippet?: React.ReactNode | null; + degree?: number; + paths?: EntityPath[]; }): JSX.Element => { const entityRegistry = useEntityRegistry(); @@ -96,6 +101,8 @@ export const ChartPreview = ({ createdMs={createdMs} /> } + degree={degree} + paths={paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/container/ContainerEntity.tsx b/datahub-web-react/src/app/entity/container/ContainerEntity.tsx index 201dcb9e4487a..9aecf6900f634 100644 --- a/datahub-web-react/src/app/entity/container/ContainerEntity.tsx +++ b/datahub-web-react/src/app/entity/container/ContainerEntity.tsx @@ -154,6 +154,8 @@ export class ContainerEntity implements Entity { externalUrl={data.properties?.externalUrl} tags={data.tags} glossaryTerms={data.glossaryTerms} + degree={(result as any).degree} + paths={(result as any).paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/container/preview/Preview.tsx b/datahub-web-react/src/app/entity/container/preview/Preview.tsx index 0bcf59683c3f7..fb1bd8f567420 100644 --- a/datahub-web-react/src/app/entity/container/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/container/preview/Preview.tsx @@ -13,6 +13,7 @@ import { Deprecation, GlossaryTerms, DataProduct, + EntityPath, } from '../../../../types.generated'; import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; import { capitalizeFirstLetterOnly } from '../../../shared/textUtil'; @@ -44,6 +45,8 @@ export const Preview = ({ parentContainers, externalUrl, deprecation, + degree, + paths, }: { urn: string; name: string; @@ -64,6 +67,8 @@ export const Preview = ({ deprecation?: Deprecation | null; parentContainers?: ParentContainersResult | null; externalUrl?: string | null; + degree?: number; + paths?: EntityPath[]; }): JSX.Element => { const entityRegistry = 
useEntityRegistry(); const typeName = capitalizeFirstLetterOnly(subTypes?.typeNames?.[0]) || 'Container'; @@ -97,6 +102,8 @@ export const Preview = ({ ]) || undefined } + degree={degree} + paths={paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/dashboard/DashboardEntity.tsx b/datahub-web-react/src/app/entity/dashboard/DashboardEntity.tsx index d948b21a46262..a64e437265262 100644 --- a/datahub-web-react/src/app/entity/dashboard/DashboardEntity.tsx +++ b/datahub-web-react/src/app/entity/dashboard/DashboardEntity.tsx @@ -234,6 +234,8 @@ export class DashboardEntity implements Entity { /> } subtype={data.subTypes?.typeNames?.[0]} + degree={(result as any).degree} + paths={(result as any).paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/dashboard/preview/DashboardPreview.tsx b/datahub-web-react/src/app/entity/dashboard/preview/DashboardPreview.tsx index a5536be9cca7c..d822fd1f613b3 100644 --- a/datahub-web-react/src/app/entity/dashboard/preview/DashboardPreview.tsx +++ b/datahub-web-react/src/app/entity/dashboard/preview/DashboardPreview.tsx @@ -12,6 +12,7 @@ import { Deprecation, DashboardStatsSummary, DataProduct, + EntityPath, } from '../../../../types.generated'; import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; import { useEntityRegistry } from '../../../useEntityRegistry'; @@ -43,6 +44,8 @@ export const DashboardPreview = ({ parentContainers, deprecation, snippet, + degree, + paths, }: { urn: string; platform?: string; @@ -67,6 +70,8 @@ export const DashboardPreview = ({ externalUrl?: string | null; parentContainers?: ParentContainersResult | null; snippet?: React.ReactNode | null; + degree?: number; + paths?: EntityPath[]; }): JSX.Element => { const entityRegistry = useEntityRegistry(); @@ -103,6 +108,8 @@ export const DashboardPreview = ({ createdMs={createdMs} /> } + degree={degree} + paths={paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/dataFlow/DataFlowEntity.tsx 
b/datahub-web-react/src/app/entity/dataFlow/DataFlowEntity.tsx index c6f7c8b6a6cf7..3bf24ac276c8e 100644 --- a/datahub-web-react/src/app/entity/dataFlow/DataFlowEntity.tsx +++ b/datahub-web-react/src/app/entity/dataFlow/DataFlowEntity.tsx @@ -158,6 +158,8 @@ export class DataFlowEntity implements Entity { externalUrl={data.properties?.externalUrl} jobCount={(data as any).childJobs?.total} deprecation={data.deprecation} + degree={(result as any).degree} + paths={(result as any).paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/dataFlow/preview/Preview.tsx b/datahub-web-react/src/app/entity/dataFlow/preview/Preview.tsx index 103e3bc0b83e4..c313171d2f241 100644 --- a/datahub-web-react/src/app/entity/dataFlow/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/dataFlow/preview/Preview.tsx @@ -5,6 +5,7 @@ import { DataProduct, Deprecation, Domain, + EntityPath, EntityType, GlobalTags, Owner, @@ -35,6 +36,8 @@ export const Preview = ({ insights, jobCount, deprecation, + degree, + paths, }: { urn: string; name: string; @@ -51,6 +54,8 @@ export const Preview = ({ snippet?: React.ReactNode | null; insights?: Array | null; jobCount?: number | null; + degree?: number; + paths?: EntityPath[]; }): JSX.Element => { const entityRegistry = useEntityRegistry(); return ( @@ -80,6 +85,8 @@ export const Preview = ({ ]) || undefined } + degree={degree} + paths={paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/dataJob/DataJobEntity.tsx b/datahub-web-react/src/app/entity/dataJob/DataJobEntity.tsx index a2a369ec53ecf..29741119ac52b 100644 --- a/datahub-web-react/src/app/entity/dataJob/DataJobEntity.tsx +++ b/datahub-web-react/src/app/entity/dataJob/DataJobEntity.tsx @@ -180,6 +180,8 @@ export class DataJobEntity implements Entity { lastRunTimeMs={ ((data as any).lastRun?.runs?.length && (data as any).lastRun?.runs[0]?.created?.time) || undefined } + degree={(result as any).degree} + paths={(result as any).paths} /> ); }; diff --git 
a/datahub-web-react/src/app/entity/dataJob/preview/Preview.tsx b/datahub-web-react/src/app/entity/dataJob/preview/Preview.tsx index 00166964c8152..61963ff2dce6b 100644 --- a/datahub-web-react/src/app/entity/dataJob/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/dataJob/preview/Preview.tsx @@ -7,6 +7,7 @@ import { DataProduct, Deprecation, Domain, + EntityPath, EntityType, GlobalTags, Owner, @@ -38,6 +39,8 @@ export const Preview = ({ insights, lastRunTimeMs, externalUrl, + degree, + paths, }: { urn: string; name: string; @@ -54,6 +57,8 @@ export const Preview = ({ insights?: Array | null; lastRunTimeMs?: number | null; externalUrl?: string | null; + degree?: number; + paths?: EntityPath[]; }): JSX.Element => { const entityRegistry = useEntityRegistry(); return ( @@ -85,6 +90,8 @@ export const Preview = ({ ]) || undefined } + degree={degree} + paths={paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/dataProduct/DataProductEntity.tsx b/datahub-web-react/src/app/entity/dataProduct/DataProductEntity.tsx index faa254cce73a6..c3f1273681c19 100644 --- a/datahub-web-react/src/app/entity/dataProduct/DataProductEntity.tsx +++ b/datahub-web-react/src/app/entity/dataProduct/DataProductEntity.tsx @@ -151,6 +151,8 @@ export class DataProductEntity implements Entity { domain={data.domain?.domain} entityCount={data?.entities?.total || undefined} externalUrl={data.properties?.externalUrl} + degree={(result as any).degree} + paths={(result as any).paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/dataProduct/preview/Preview.tsx b/datahub-web-react/src/app/entity/dataProduct/preview/Preview.tsx index c938d6534c479..7f3b6d7042e8e 100644 --- a/datahub-web-react/src/app/entity/dataProduct/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/dataProduct/preview/Preview.tsx @@ -1,5 +1,5 @@ import React from 'react'; -import { EntityType, Owner, GlobalTags, GlossaryTerms, Domain } from '../../../../types.generated'; +import { EntityType, Owner, 
GlobalTags, GlossaryTerms, Domain, EntityPath } from '../../../../types.generated'; import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; import { useEntityRegistry } from '../../../useEntityRegistry'; import { IconStyleType } from '../../Entity'; @@ -14,6 +14,8 @@ interface Props { glossaryTerms?: GlossaryTerms | null; entityCount?: number; externalUrl?: string | null; + degree?: number; + paths?: EntityPath[]; } export const Preview = ({ @@ -26,6 +28,8 @@ export const Preview = ({ glossaryTerms, entityCount, externalUrl, + degree, + paths, }: Props): JSX.Element => { const entityRegistry = useEntityRegistry(); @@ -45,6 +49,8 @@ export const Preview = ({ entityCount={entityCount} externalUrl={externalUrl} displayAssetCount + degree={degree} + paths={paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx b/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx index b55b4c54951ef..cb4239872045f 100644 --- a/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx +++ b/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx @@ -301,6 +301,8 @@ export class DatasetEntity implements Entity { (data as any).lastOperation?.length && (data as any).lastOperation[0].lastUpdatedTimestamp } health={data.health} + degree={(result as any).degree} + paths={(result as any).paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/dataset/preview/Preview.tsx b/datahub-web-react/src/app/entity/dataset/preview/Preview.tsx index fd2583e4f5982..15c54f86038c2 100644 --- a/datahub-web-react/src/app/entity/dataset/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/dataset/preview/Preview.tsx @@ -14,6 +14,7 @@ import { DatasetStatsSummary, DataProduct, Health, + EntityPath, } from '../../../../types.generated'; import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; import { useEntityRegistry } from '../../../useEntityRegistry'; @@ -49,6 +50,8 @@ export const Preview = ({ statsSummary, lastUpdatedMs, 
health, + degree, + paths, }: { urn: string; name: string; @@ -77,6 +80,8 @@ export const Preview = ({ statsSummary?: DatasetStatsSummary | null; lastUpdatedMs?: number | null; health?: Health[] | null; + degree?: number; + paths?: EntityPath[]; }): JSX.Element => { const entityRegistry = useEntityRegistry(); return ( @@ -114,6 +119,8 @@ export const Preview = ({ /> } health={health || undefined} + degree={degree} + paths={paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/mlFeature/MLFeatureEntity.tsx b/datahub-web-react/src/app/entity/mlFeature/MLFeatureEntity.tsx index 8fddae7c15186..a7f586c9108ee 100644 --- a/datahub-web-react/src/app/entity/mlFeature/MLFeatureEntity.tsx +++ b/datahub-web-react/src/app/entity/mlFeature/MLFeatureEntity.tsx @@ -145,6 +145,8 @@ export class MLFeatureEntity implements Entity { dataProduct={getDataProduct(genericProperties?.dataProduct)} platform={platform} platformInstanceId={data.dataPlatformInstance?.instanceId} + degree={(result as any).degree} + paths={(result as any).paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/mlFeature/preview/Preview.tsx b/datahub-web-react/src/app/entity/mlFeature/preview/Preview.tsx index 7572bdb08f702..57a8b375bd17b 100644 --- a/datahub-web-react/src/app/entity/mlFeature/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/mlFeature/preview/Preview.tsx @@ -1,5 +1,5 @@ import React from 'react'; -import { DataPlatform, DataProduct, EntityType, Owner } from '../../../../types.generated'; +import { DataPlatform, DataProduct, EntityPath, EntityType, Owner } from '../../../../types.generated'; import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; import { capitalizeFirstLetterOnly } from '../../../shared/textUtil'; import { useEntityRegistry } from '../../../useEntityRegistry'; @@ -14,6 +14,8 @@ export const Preview = ({ dataProduct, owners, platform, + degree, + paths, }: { urn: string; name: string; @@ -23,6 +25,8 @@ export const Preview = ({ dataProduct?: 
DataProduct | null; owners?: Array | null; platform?: DataPlatform | null | undefined; + degree?: number; + paths?: EntityPath[]; }): JSX.Element => { const entityRegistry = useEntityRegistry(); return ( @@ -40,6 +44,8 @@ export const Preview = ({ typeIcon={entityRegistry.getIcon(EntityType.Mlfeature, 14, IconStyleType.ACCENT)} owners={owners} dataProduct={dataProduct} + degree={degree} + paths={paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/mlFeatureTable/MLFeatureTableEntity.tsx b/datahub-web-react/src/app/entity/mlFeatureTable/MLFeatureTableEntity.tsx index 3bb54b739e749..b3e509decd29d 100644 --- a/datahub-web-react/src/app/entity/mlFeatureTable/MLFeatureTableEntity.tsx +++ b/datahub-web-react/src/app/entity/mlFeatureTable/MLFeatureTableEntity.tsx @@ -144,6 +144,8 @@ export class MLFeatureTableEntity implements Entity { platformName={data.platform?.properties?.displayName || capitalizeFirstLetterOnly(data.platform?.name)} platformInstanceId={data.dataPlatformInstance?.instanceId} dataProduct={getDataProduct(genericProperties?.dataProduct)} + degree={(result as any).degree} + paths={(result as any).paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/mlFeatureTable/preview/Preview.tsx b/datahub-web-react/src/app/entity/mlFeatureTable/preview/Preview.tsx index cf6d7bf5d19f7..97065d9f6dfe0 100644 --- a/datahub-web-react/src/app/entity/mlFeatureTable/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/mlFeatureTable/preview/Preview.tsx @@ -1,5 +1,5 @@ import React from 'react'; -import { DataProduct, EntityType, Owner } from '../../../../types.generated'; +import { DataProduct, EntityPath, EntityType, Owner } from '../../../../types.generated'; import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; import { useEntityRegistry } from '../../../useEntityRegistry'; import { IconStyleType } from '../../Entity'; @@ -13,6 +13,8 @@ export const Preview = ({ platformName, dataProduct, platformInstanceId, + degree, + paths, }: { 
urn: string; name: string; @@ -22,6 +24,8 @@ export const Preview = ({ platformName?: string | null; dataProduct?: DataProduct | null; platformInstanceId?: string; + degree?: number; + paths?: EntityPath[]; }): JSX.Element => { const entityRegistry = useEntityRegistry(); return ( @@ -38,6 +42,8 @@ export const Preview = ({ platformInstanceId={platformInstanceId} dataProduct={dataProduct} logoComponent={entityRegistry.getIcon(EntityType.MlfeatureTable, 20, IconStyleType.HIGHLIGHT)} + degree={degree} + paths={paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/mlModel/MLModelEntity.tsx b/datahub-web-react/src/app/entity/mlModel/MLModelEntity.tsx index 3e800f4f733d2..62690d611dcdd 100644 --- a/datahub-web-react/src/app/entity/mlModel/MLModelEntity.tsx +++ b/datahub-web-react/src/app/entity/mlModel/MLModelEntity.tsx @@ -127,7 +127,7 @@ export class MLModelEntity implements Entity { renderSearch = (result: SearchResult) => { const data = result.entity as MlModel; - return ; + return ; }; getLineageVizConfig = (entity: MlModel) => { diff --git a/datahub-web-react/src/app/entity/mlModel/preview/Preview.tsx b/datahub-web-react/src/app/entity/mlModel/preview/Preview.tsx index 6b01ad9ac2845..4b57976dfe1a2 100644 --- a/datahub-web-react/src/app/entity/mlModel/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/mlModel/preview/Preview.tsx @@ -1,12 +1,20 @@ import React from 'react'; -import { EntityType, MlModel } from '../../../../types.generated'; +import { EntityPath, EntityType, MlModel } from '../../../../types.generated'; import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; import { capitalizeFirstLetterOnly } from '../../../shared/textUtil'; import { useEntityRegistry } from '../../../useEntityRegistry'; import { IconStyleType } from '../../Entity'; import { getDataProduct } from '../../shared/utils'; -export const Preview = ({ model }: { model: MlModel }): JSX.Element => { +export const Preview = ({ + model, + degree, + paths, +}: { + 
model: MlModel; + degree?: number; + paths?: EntityPath[]; +}): JSX.Element => { const entityRegistry = useEntityRegistry(); const genericProperties = entityRegistry.getGenericEntityProperties(EntityType.Mlmodel, model); @@ -24,6 +32,8 @@ export const Preview = ({ model }: { model: MlModel }): JSX.Element => { tags={model.globalTags || undefined} owners={model?.ownership?.owners} dataProduct={getDataProduct(genericProperties?.dataProduct)} + degree={degree} + paths={paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/mlModelGroup/MLModelGroupEntity.tsx b/datahub-web-react/src/app/entity/mlModelGroup/MLModelGroupEntity.tsx index 1282eab47cefc..7adc7a6ee7e63 100644 --- a/datahub-web-react/src/app/entity/mlModelGroup/MLModelGroupEntity.tsx +++ b/datahub-web-react/src/app/entity/mlModelGroup/MLModelGroupEntity.tsx @@ -110,7 +110,7 @@ export class MLModelGroupEntity implements Entity { renderSearch = (result: SearchResult) => { const data = result.entity as MlModelGroup; - return ; + return ; }; getLineageVizConfig = (entity: MlModelGroup) => { diff --git a/datahub-web-react/src/app/entity/mlModelGroup/preview/Preview.tsx b/datahub-web-react/src/app/entity/mlModelGroup/preview/Preview.tsx index f1c25d1acadac..910397af899f5 100644 --- a/datahub-web-react/src/app/entity/mlModelGroup/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/mlModelGroup/preview/Preview.tsx @@ -1,11 +1,19 @@ import React from 'react'; -import { EntityType, MlModelGroup } from '../../../../types.generated'; +import { EntityPath, EntityType, MlModelGroup } from '../../../../types.generated'; import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; import { capitalizeFirstLetterOnly } from '../../../shared/textUtil'; import { useEntityRegistry } from '../../../useEntityRegistry'; import { getDataProduct } from '../../shared/utils'; -export const Preview = ({ group }: { group: MlModelGroup }): JSX.Element => { +export const Preview = ({ + group, + degree, + paths, +}: { 
+ group: MlModelGroup; + degree?: number; + paths?: EntityPath[]; +}): JSX.Element => { const entityRegistry = useEntityRegistry(); const genericProperties = entityRegistry.getGenericEntityProperties(EntityType.MlmodelGroup, group); return ( @@ -21,6 +29,8 @@ export const Preview = ({ group }: { group: MlModelGroup }): JSX.Element => { qualifier={group?.origin} owners={group?.ownership?.owners} dataProduct={getDataProduct(genericProperties?.dataProduct)} + degree={degree} + paths={paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/mlPrimaryKey/MLPrimaryKeyEntity.tsx b/datahub-web-react/src/app/entity/mlPrimaryKey/MLPrimaryKeyEntity.tsx index c6b4bba46f331..2549f4f6a0047 100644 --- a/datahub-web-react/src/app/entity/mlPrimaryKey/MLPrimaryKeyEntity.tsx +++ b/datahub-web-react/src/app/entity/mlPrimaryKey/MLPrimaryKeyEntity.tsx @@ -143,6 +143,8 @@ export class MLPrimaryKeyEntity implements Entity { platform={platform} platformInstanceId={data.dataPlatformInstance?.instanceId} dataProduct={getDataProduct(genericProperties?.dataProduct)} + degree={(result as any).degree} + paths={(result as any).paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/mlPrimaryKey/preview/Preview.tsx b/datahub-web-react/src/app/entity/mlPrimaryKey/preview/Preview.tsx index e1207e8f98f09..e72062ea2ae03 100644 --- a/datahub-web-react/src/app/entity/mlPrimaryKey/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/mlPrimaryKey/preview/Preview.tsx @@ -1,5 +1,5 @@ import React from 'react'; -import { DataPlatform, DataProduct, EntityType, Owner } from '../../../../types.generated'; +import { DataPlatform, DataProduct, EntityPath, EntityType, Owner } from '../../../../types.generated'; import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; import { capitalizeFirstLetterOnly } from '../../../shared/textUtil'; import { useEntityRegistry } from '../../../useEntityRegistry'; @@ -14,6 +14,8 @@ export const Preview = ({ platform, dataProduct, platformInstanceId, 
+ degree, + paths, }: { urn: string; name: string; @@ -23,6 +25,8 @@ export const Preview = ({ platform?: DataPlatform | null | undefined; dataProduct?: DataProduct | null; platformInstanceId?: string; + degree?: number; + paths?: EntityPath[]; }): JSX.Element => { const entityRegistry = useEntityRegistry(); return ( @@ -40,6 +44,8 @@ export const Preview = ({ owners={owners} dataProduct={dataProduct} platformInstanceId={platformInstanceId} + degree={degree} + paths={paths} /> ); }; diff --git a/datahub-web-react/src/app/entity/shared/__tests__/siblingsUtils.test.ts b/datahub-web-react/src/app/entity/shared/__tests__/siblingsUtils.test.ts index 6e23d5400ab77..00e89e5943c17 100644 --- a/datahub-web-react/src/app/entity/shared/__tests__/siblingsUtils.test.ts +++ b/datahub-web-react/src/app/entity/shared/__tests__/siblingsUtils.test.ts @@ -1,10 +1,6 @@ import { dataset3WithLineage, dataset3WithSchema, dataset4WithLineage } from '../../../../Mocks'; import { EntityType, SchemaFieldDataType } from '../../../../types.generated'; -import { - combineEntityDataWithSiblings, - combineSiblingsInSearchResults, - shouldEntityBeTreatedAsPrimary, -} from '../siblingUtils'; +import { combineEntityDataWithSiblings, shouldEntityBeTreatedAsPrimary } from '../siblingUtils'; const usageStats = { buckets: [ @@ -191,494 +187,6 @@ const datasetUnprimaryWithNoPrimarySiblings = { }, }; -const searchResultWithSiblings = [ - { - entity: { - urn: 'urn:li:dataset:(urn:li:dataPlatform:bigquery,cypress_project.jaffle_shop.raw_orders,PROD)', - exists: true, - type: 'DATASET', - name: 'cypress_project.jaffle_shop.raw_orders', - origin: 'PROD', - uri: null, - platform: { - urn: 'urn:li:dataPlatform:bigquery', - type: 'DATA_PLATFORM', - name: 'bigquery', - properties: { - type: 'RELATIONAL_DB', - displayName: 'BigQuery', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/bigquerylogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - __typename: 
'DataPlatform', - }, - dataPlatformInstance: null, - editableProperties: null, - platformNativeType: null, - properties: { - name: 'raw_orders', - description: null, - qualifiedName: null, - customProperties: [], - __typename: 'DatasetProperties', - }, - ownership: null, - globalTags: null, - glossaryTerms: null, - subTypes: { - typeNames: ['table'], - __typename: 'SubTypes', - }, - domain: null, - container: { - urn: 'urn:li:container:348c96555971d3f5c1ffd7dd2e7446cb', - platform: { - urn: 'urn:li:dataPlatform:bigquery', - type: 'DATA_PLATFORM', - name: 'bigquery', - properties: { - type: 'RELATIONAL_DB', - displayName: 'BigQuery', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/bigquerylogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - __typename: 'DataPlatform', - }, - properties: { - name: 'jaffle_shop', - __typename: 'ContainerProperties', - }, - subTypes: { - typeNames: ['Dataset'], - __typename: 'SubTypes', - }, - deprecation: null, - __typename: 'Container', - }, - parentContainers: { - count: 2, - containers: [ - { - urn: 'urn:li:container:348c96555971d3f5c1ffd7dd2e7446cb', - platform: { - urn: 'urn:li:dataPlatform:bigquery', - type: 'DATA_PLATFORM', - name: 'bigquery', - properties: { - type: 'RELATIONAL_DB', - displayName: 'BigQuery', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/bigquerylogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - __typename: 'DataPlatform', - }, - properties: { - name: 'jaffle_shop', - __typename: 'ContainerProperties', - }, - subTypes: { - typeNames: ['Dataset'], - __typename: 'SubTypes', - }, - deprecation: null, - __typename: 'Container', - }, - { - urn: 'urn:li:container:b5e95fce839e7d78151ed7e0a7420d84', - platform: { - urn: 'urn:li:dataPlatform:bigquery', - type: 'DATA_PLATFORM', - name: 'bigquery', - properties: { - type: 'RELATIONAL_DB', - displayName: 'BigQuery', - datasetNameDelimiter: '.', - logoUrl: 
'/assets/platforms/bigquerylogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - __typename: 'DataPlatform', - }, - properties: { - name: 'cypress_project', - __typename: 'ContainerProperties', - }, - subTypes: { - typeNames: ['Project'], - __typename: 'SubTypes', - }, - deprecation: null, - __typename: 'Container', - }, - ], - __typename: 'ParentContainersResult', - }, - deprecation: null, - siblings: { - isPrimary: false, - siblings: [ - { - urn: 'urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_orders,PROD)', - exists: true, - type: 'DATASET', - platform: { - urn: 'urn:li:dataPlatform:dbt', - type: 'DATA_PLATFORM', - name: 'dbt', - properties: { - type: 'OTHERS', - displayName: 'dbt', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/dbtlogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - __typename: 'DataPlatform', - }, - name: 'cypress_project.jaffle_shop.raw_orders', - properties: { - name: 'raw_orders', - description: '', - qualifiedName: null, - __typename: 'DatasetProperties', - }, - __typename: 'Dataset', - }, - ], - __typename: 'SiblingProperties', - }, - __typename: 'Dataset', - }, - matchedFields: [ - { - name: 'name', - value: 'raw_orders', - __typename: 'MatchedField', - }, - { - name: 'id', - value: 'cypress_project.jaffle_shop.raw_orders', - __typename: 'MatchedField', - }, - ], - insights: [], - __typename: 'SearchResult', - }, - { - entity: { - urn: 'urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_orders,PROD)', - exists: true, - type: 'DATASET', - name: 'cypress_project.jaffle_shop.raw_orders', - origin: 'PROD', - uri: null, - platform: { - urn: 'urn:li:dataPlatform:dbt', - type: 'DATA_PLATFORM', - name: 'dbt', - properties: { - type: 'OTHERS', - displayName: 'dbt', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/dbtlogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: 
null, - __typename: 'DataPlatform', - }, - dataPlatformInstance: null, - editableProperties: null, - platformNativeType: null, - properties: { - name: 'raw_orders', - description: '', - qualifiedName: null, - customProperties: [ - { - key: 'catalog_version', - value: '1.0.4', - __typename: 'StringMapEntry', - }, - { - key: 'node_type', - value: 'seed', - __typename: 'StringMapEntry', - }, - { - key: 'materialization', - value: 'seed', - __typename: 'StringMapEntry', - }, - { - key: 'dbt_file_path', - value: 'data/raw_orders.csv', - __typename: 'StringMapEntry', - }, - { - key: 'catalog_schema', - value: 'https://schemas.getdbt.com/dbt/catalog/v1.json', - __typename: 'StringMapEntry', - }, - { - key: 'catalog_type', - value: 'table', - __typename: 'StringMapEntry', - }, - { - key: 'manifest_version', - value: '1.0.4', - __typename: 'StringMapEntry', - }, - { - key: 'manifest_schema', - value: 'https://schemas.getdbt.com/dbt/manifest/v4.json', - __typename: 'StringMapEntry', - }, - ], - __typename: 'DatasetProperties', - }, - ownership: null, - globalTags: null, - glossaryTerms: null, - subTypes: { - typeNames: ['seed'], - __typename: 'SubTypes', - }, - domain: null, - container: null, - parentContainers: { - count: 0, - containers: [], - __typename: 'ParentContainersResult', - }, - deprecation: null, - siblings: { - isPrimary: true, - siblings: [ - { - urn: 'urn:li:dataset:(urn:li:dataPlatform:bigquery,cypress_project.jaffle_shop.raw_orders,PROD)', - type: 'DATASET', - platform: { - urn: 'urn:li:dataPlatform:bigquery', - type: 'DATA_PLATFORM', - name: 'bigquery', - properties: { - type: 'RELATIONAL_DB', - displayName: 'BigQuery', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/bigquerylogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - __typename: 'DataPlatform', - }, - name: 'cypress_project.jaffle_shop.raw_orders', - properties: { - name: 'raw_orders', - description: null, - qualifiedName: null, - __typename: 
'DatasetProperties', - }, - __typename: 'Dataset', - }, - ], - __typename: 'SiblingProperties', - }, - __typename: 'Dataset', - }, - matchedFields: [ - { - name: 'name', - value: 'raw_orders', - __typename: 'MatchedField', - }, - { - name: 'id', - value: 'cypress_project.jaffle_shop.raw_orders', - __typename: 'MatchedField', - }, - ], - insights: [], - __typename: 'SearchResult', - }, -]; - -const searchResultWithGhostSiblings = [ - { - entity: { - urn: 'urn:li:dataset:(urn:li:dataPlatform:bigquery,cypress_project.jaffle_shop.raw_orders,PROD)', - exists: true, - type: 'DATASET', - name: 'cypress_project.jaffle_shop.raw_orders', - origin: 'PROD', - uri: null, - platform: { - urn: 'urn:li:dataPlatform:bigquery', - type: 'DATA_PLATFORM', - name: 'bigquery', - properties: { - type: 'RELATIONAL_DB', - displayName: 'BigQuery', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/bigquerylogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - __typename: 'DataPlatform', - }, - dataPlatformInstance: null, - editableProperties: null, - platformNativeType: null, - properties: { - name: 'raw_orders', - description: null, - qualifiedName: null, - customProperties: [], - __typename: 'DatasetProperties', - }, - ownership: null, - globalTags: null, - glossaryTerms: null, - subTypes: { - typeNames: ['table'], - __typename: 'SubTypes', - }, - domain: null, - container: { - urn: 'urn:li:container:348c96555971d3f5c1ffd7dd2e7446cb', - platform: { - urn: 'urn:li:dataPlatform:bigquery', - type: 'DATA_PLATFORM', - name: 'bigquery', - properties: { - type: 'RELATIONAL_DB', - displayName: 'BigQuery', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/bigquerylogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - __typename: 'DataPlatform', - }, - properties: { - name: 'jaffle_shop', - __typename: 'ContainerProperties', - }, - subTypes: { - typeNames: ['Dataset'], - __typename: 'SubTypes', - }, - 
deprecation: null, - __typename: 'Container', - }, - parentContainers: { - count: 2, - containers: [ - { - urn: 'urn:li:container:348c96555971d3f5c1ffd7dd2e7446cb', - platform: { - urn: 'urn:li:dataPlatform:bigquery', - type: 'DATA_PLATFORM', - name: 'bigquery', - properties: { - type: 'RELATIONAL_DB', - displayName: 'BigQuery', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/bigquerylogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - __typename: 'DataPlatform', - }, - properties: { - name: 'jaffle_shop', - __typename: 'ContainerProperties', - }, - subTypes: { - typeNames: ['Dataset'], - __typename: 'SubTypes', - }, - deprecation: null, - __typename: 'Container', - }, - { - urn: 'urn:li:container:b5e95fce839e7d78151ed7e0a7420d84', - platform: { - urn: 'urn:li:dataPlatform:bigquery', - type: 'DATA_PLATFORM', - name: 'bigquery', - properties: { - type: 'RELATIONAL_DB', - displayName: 'BigQuery', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/bigquerylogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - __typename: 'DataPlatform', - }, - properties: { - name: 'cypress_project', - __typename: 'ContainerProperties', - }, - subTypes: { - typeNames: ['Project'], - __typename: 'SubTypes', - }, - deprecation: null, - __typename: 'Container', - }, - ], - __typename: 'ParentContainersResult', - }, - deprecation: null, - siblings: { - isPrimary: false, - siblings: [ - { - urn: 'urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_orders,PROD)', - exists: false, - type: 'DATASET', - }, - ], - __typename: 'SiblingProperties', - }, - __typename: 'Dataset', - }, - matchedFields: [ - { - name: 'name', - value: 'raw_orders', - __typename: 'MatchedField', - }, - { - name: 'id', - value: 'cypress_project.jaffle_shop.raw_orders', - __typename: 'MatchedField', - }, - ], - insights: [], - __typename: 'SearchResult', - }, -]; - describe('siblingUtils', () => { 
describe('combineEntityDataWithSiblings', () => { it('combines my metadata with my siblings as primary', () => { @@ -719,32 +227,6 @@ describe('siblingUtils', () => { }); }); - describe('combineSiblingsInSearchResults', () => { - it('combines search results to deduplicate siblings', () => { - const result = combineSiblingsInSearchResults(searchResultWithSiblings as any); - - expect(result).toHaveLength(1); - expect(result?.[0]?.matchedEntities?.[0]?.urn).toEqual( - 'urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_orders,PROD)', - ); - expect(result?.[0]?.matchedEntities?.[1]?.urn).toEqual( - 'urn:li:dataset:(urn:li:dataPlatform:bigquery,cypress_project.jaffle_shop.raw_orders,PROD)', - ); - - expect(result?.[0]?.matchedEntities).toHaveLength(2); - }); - - it('will not combine an entity with a ghost node', () => { - const result = combineSiblingsInSearchResults(searchResultWithGhostSiblings as any); - - expect(result).toHaveLength(1); - expect(result?.[0]?.matchedEntities?.[0]?.urn).toEqual( - 'urn:li:dataset:(urn:li:dataPlatform:bigquery,cypress_project.jaffle_shop.raw_orders,PROD)', - ); - expect(result?.[0]?.matchedEntities).toHaveLength(1); - }); - }); - describe('shouldEntityBeTreatedAsPrimary', () => { it('will say a primary entity is primary', () => { expect(shouldEntityBeTreatedAsPrimary(datasetPrimaryWithSiblings)).toBeTruthy(); diff --git a/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearch.tsx b/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearch.tsx index 649645532d2f5..4119a341c5f1b 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearch.tsx +++ b/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearch.tsx @@ -16,7 +16,6 @@ import { FilterSet, GetSearchResultsParams, SearchResultsInterface } from './typ import { isListSubset } from '../../../utils'; import { EntityAndType } from '../../../types'; 
import { Message } from '../../../../../shared/Message'; -import { EntityActionProps } from '../../../../../recommendations/renderer/component/EntityNameList'; import { generateOrFilters } from '../../../../../search/utils/generateOrFilters'; import { mergeFilterSets } from '../../../../../search/utils/filterUtils'; import { useDownloadScrollAcrossEntitiesSearchResults } from '../../../../../search/utils/useDownloadScrollAcrossEntitiesSearchResults'; @@ -26,6 +25,7 @@ import { DownloadSearchResults, } from '../../../../../search/utils/types'; import { useEntityContext } from '../../../EntityContext'; +import { EntityActionProps } from './EntitySearchResults'; import { useUserContext } from '../../../../../context/useUserContext'; const Container = styled.div` @@ -251,7 +251,7 @@ export const EmbeddedListSearch = ({ }, [isSelectMode]); useEffect(() => { - if (defaultFilters) { + if (defaultFilters && filters.length === 0) { onChangeFilters(defaultFilters); } // only want to run once on page load diff --git a/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearchModal.tsx b/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearchModal.tsx index d80ada885330f..f88972bbda6a6 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearchModal.tsx +++ b/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearchModal.tsx @@ -3,9 +3,9 @@ import { Button, Modal } from 'antd'; import styled from 'styled-components'; import { FacetFilterInput } from '../../../../../../types.generated'; import { EmbeddedListSearch } from './EmbeddedListSearch'; -import { EntityActionProps } from '../../../../../recommendations/renderer/component/EntityNameList'; import { UnionType } from '../../../../../search/utils/constants'; import { FilterSet } from './types'; +import { EntityActionProps } from './EntitySearchResults'; const SearchContainer = styled.div` height: 500px; diff 
--git a/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearchResults.tsx b/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearchResults.tsx index bad7f32db5361..e4d43f34dcba7 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearchResults.tsx +++ b/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearchResults.tsx @@ -3,11 +3,11 @@ import { Pagination, Typography } from 'antd'; import styled from 'styled-components'; import { FacetFilterInput, FacetMetadata, SearchResults as SearchResultType } from '../../../../../../types.generated'; import { SearchCfg } from '../../../../../../conf'; -import { EntityNameList, EntityActionProps } from '../../../../../recommendations/renderer/component/EntityNameList'; import { ReactComponent as LoadingSvg } from '../../../../../../images/datahub-logo-color-loading_pendulum.svg'; import { EntityAndType } from '../../../types'; import { UnionType } from '../../../../../search/utils/constants'; import { SearchFiltersSection } from '../../../../../search/SearchFiltersSection'; +import { EntitySearchResults, EntityActionProps } from './EntitySearchResults'; import MatchingViewsLabel from './MatchingViewsLabel'; const SearchBody = styled.div` @@ -125,8 +125,8 @@ export const EmbeddedListSearchResults = ({ )} {!loading && ( - searchResult.entity) || []} + ({ // when we add impact analysis, we will want to pipe the path to each element to the result this diff --git a/datahub-web-react/src/app/entity/shared/components/styled/search/EntitySearchResults.tsx b/datahub-web-react/src/app/entity/shared/components/styled/search/EntitySearchResults.tsx new file mode 100644 index 0000000000000..05bbf01f40cf6 --- /dev/null +++ b/datahub-web-react/src/app/entity/shared/components/styled/search/EntitySearchResults.tsx @@ -0,0 +1,98 @@ +import React from 'react'; +import { Checkbox } from 'antd'; +import styled from 
'styled-components'; +import { EntityPath, EntityType, SearchResult } from '../../../../../../types.generated'; +import { EntityAndType } from '../../../types'; +import { useEntityRegistry } from '../../../../../useEntityRegistry'; +import { ListItem, StyledList, ThinDivider } from '../../../../../recommendations/renderer/component/EntityNameList'; + +const StyledCheckbox = styled(Checkbox)` + margin-right: 12px; +`; + +export type EntityActionProps = { + urn: string; + type: EntityType; +}; + +type AdditionalProperties = { + degree?: number; + paths?: EntityPath[]; +}; + +type Props = { + // additional data about the search result that is not part of the entity used to enrich the + // presentation of the entity. For example, metadata about how the entity is related for the case + // of impact analysis + additionalPropertiesList?: Array; + searchResults: Array; + isSelectMode?: boolean; + selectedEntities?: EntityAndType[]; + setSelectedEntities?: (entities: EntityAndType[]) => any; + bordered?: boolean; + entityAction?: React.FC; +}; + +export const EntitySearchResults = ({ + additionalPropertiesList, + searchResults, + isSelectMode, + selectedEntities = [], + setSelectedEntities, + bordered = true, + entityAction, +}: Props) => { + const entityRegistry = useEntityRegistry(); + const selectedEntityUrns = selectedEntities?.map((entity) => entity.urn) || []; + + if ( + additionalPropertiesList?.length !== undefined && + additionalPropertiesList.length > 0 && + additionalPropertiesList?.length !== searchResults.length + ) { + console.warn( + 'Warning: additionalPropertiesList length provided to EntityNameList does not match entity array length', + { additionalPropertiesList, searchResults }, + ); + } + + /** + * Invoked when a new entity is selected. Simply updates the state of the list of selected entities. 
+ */ + const onSelectEntity = (selectedEntity: EntityAndType, selected: boolean) => { + if (selected) { + setSelectedEntities?.([...selectedEntities, selectedEntity]); + } else { + setSelectedEntities?.(selectedEntities?.filter((entity) => entity.urn !== selectedEntity.urn) || []); + } + }; + + const EntityAction = entityAction as React.FC; + + return ( + { + const { entity } = searchResult; + return ( + <> + + {isSelectMode && ( + = 0} + onChange={(e) => + onSelectEntity({ urn: entity.urn, type: entity.type }, e.target.checked) + } + /> + )} + {entityRegistry.renderSearchResult(entity.type, searchResult)} + {entityAction && } + + + + ); + }} + /> + ); +}; diff --git a/datahub-web-react/src/app/entity/shared/constants.ts b/datahub-web-react/src/app/entity/shared/constants.ts index e14affc95b6f9..447780fb0d641 100644 --- a/datahub-web-react/src/app/entity/shared/constants.ts +++ b/datahub-web-react/src/app/entity/shared/constants.ts @@ -23,6 +23,7 @@ export const ANTD_GRAY = { export const ANTD_GRAY_V2 = { 2: '#F3F5F6', 5: '#DDE0E4', + 6: '#B2B8BD', 8: '#5E666E', 10: '#1B1E22', }; diff --git a/datahub-web-react/src/app/entity/shared/siblingUtils.ts b/datahub-web-react/src/app/entity/shared/siblingUtils.ts index 2cad28d754a80..66481051055ec 100644 --- a/datahub-web-react/src/app/entity/shared/siblingUtils.ts +++ b/datahub-web-react/src/app/entity/shared/siblingUtils.ts @@ -2,7 +2,8 @@ import merge from 'deepmerge'; import { unionBy, keyBy, values } from 'lodash'; import { useLocation } from 'react-router-dom'; import * as QueryString from 'query-string'; -import { Dataset, Entity, MatchedField, Maybe, SiblingProperties } from '../../../types.generated'; +import { Dataset, Entity, Maybe, SiblingProperties } from '../../../types.generated'; +import { GenericEntityProperties } from './types'; export function stripSiblingsFromEntity(entity: any) { return { @@ -169,23 +170,17 @@ export const shouldEntityBeTreatedAsPrimary = (extractedBaseEntity: { siblings?: return 
isPrimary; }; -export const combineEntityDataWithSiblings = (baseEntity: T): T => { - if (!baseEntity) { - return baseEntity; - } - const baseEntityKey = Object.keys(baseEntity)[0]; - const extractedBaseEntity = baseEntity[baseEntityKey]; - +const combineEntityWithSiblings = (entity: GenericEntityProperties) => { // eslint-disable-next-line @typescript-eslint/dot-notation - const siblingAspect = extractedBaseEntity.siblings; + const siblingAspect = entity.siblings; if ((siblingAspect?.siblings || []).length === 0) { - return baseEntity; + return entity; } // eslint-disable-next-line @typescript-eslint/dot-notation - const siblings: T[] = siblingAspect?.siblings || []; + const siblings = siblingAspect?.siblings || []; - const isPrimary = shouldEntityBeTreatedAsPrimary(extractedBaseEntity); + const isPrimary = shouldEntityBeTreatedAsPrimary(entity); const combinedBaseEntity: any = siblings.reduce( (prev, current) => @@ -193,62 +188,75 @@ export const combineEntityDataWithSiblings = (baseEntity: T): T => { arrayMerge: combineMerge, customMerge: customMerge.bind({}, isPrimary), }), - extractedBaseEntity, - ) as T; + entity, + ); // Force the urn of the combined entity to the current entity urn. 
- combinedBaseEntity.urn = extractedBaseEntity.urn; + combinedBaseEntity.urn = entity.urn; + + return combinedBaseEntity; +}; + +export const combineEntityDataWithSiblings = (baseEntity: T): T => { + if (!baseEntity) { + return baseEntity; + } + const baseEntityKey = Object.keys(baseEntity)[0]; + const extractedBaseEntity = baseEntity[baseEntityKey]; + + // eslint-disable-next-line @typescript-eslint/dot-notation + const siblingAspect = extractedBaseEntity.siblings; + if ((siblingAspect?.siblings || []).length === 0) { + return baseEntity; + } + + const combinedBaseEntity = combineEntityWithSiblings(extractedBaseEntity); return { [baseEntityKey]: combinedBaseEntity } as unknown as T; }; -export type CombinedSearchResult = { +export type CombinedEntity = { entity: Entity; - matchedFields: MatchedField[]; - matchedEntities?: Entity[]; + matchedEntities?: Array; }; -export function combineSiblingsInSearchResults( - results: - | { - entity: Entity; - matchedFields: MatchedField[]; - }[] - | undefined, -) { - const combinedResults: CombinedSearchResult[] | undefined = []; - const siblingsToPair: Record = {}; - - // set sibling associations - results?.forEach((result) => { - if (result.entity.urn in siblingsToPair) { - // filter from repeating - // const siblingsCombinedResult = siblingsToPair[result.entity.urn]; - // siblingsCombinedResult.matchedEntities?.push(result.entity); - return; - } - - const combinedResult: CombinedSearchResult = result; - const { entity }: { entity: any } = result; - const siblingUrns = entity?.siblings?.siblings?.map((sibling) => sibling.urn) || []; - if (siblingUrns.length > 0) { - combinedResult.matchedEntities = entity.siblings.isPrimary - ? 
[stripSiblingsFromEntity(entity), ...entity.siblings.siblings] - : [...entity.siblings.siblings, stripSiblingsFromEntity(entity)]; - - combinedResult.matchedEntities = combinedResult.matchedEntities.filter( - (resultToFilter) => (resultToFilter as Dataset).exists, - ); +type CombinedEntityResult = + | { + skipped: true; + } + | { + skipped: false; + combinedEntity: CombinedEntity; + }; + +export function combineSiblingsForEntity(entity: Entity, visitedSiblingUrns: Set): CombinedEntityResult { + if (visitedSiblingUrns.has(entity.urn)) return { skipped: true }; + + const combinedEntity: CombinedEntity = { entity: combineEntityWithSiblings({ ...entity }) }; + const siblings = (combinedEntity.entity as GenericEntityProperties).siblings?.siblings ?? []; + const isPrimary = (combinedEntity.entity as GenericEntityProperties).siblings?.isPrimary; + const siblingUrns = siblings.map((sibling) => sibling?.urn); + + if (siblingUrns.length > 0) { + combinedEntity.matchedEntities = isPrimary + ? [stripSiblingsFromEntity(combinedEntity.entity), ...siblings] + : [...siblings, stripSiblingsFromEntity(combinedEntity.entity)]; + + combinedEntity.matchedEntities = combinedEntity.matchedEntities.filter( + (resultToFilter) => (resultToFilter as Dataset).exists, + ); + + siblingUrns.forEach((urn) => urn && visitedSiblingUrns.add(urn)); + } - siblingUrns.forEach((urn) => { - siblingsToPair[urn] = combinedResult; - }); - } - combinedResults.push(combinedResult); - }); + return { combinedEntity, skipped: false }; +} - return combinedResults; +export function createSiblingEntityCombiner() { + const visitedSiblingUrns: Set = new Set(); + return (entity: Entity) => combineSiblingsForEntity(entity, visitedSiblingUrns); } + // used to determine whether sibling entities should be shown merged or not export const SEPARATE_SIBLINGS_URL_PARAM = 'separate_siblings'; diff --git a/datahub-web-react/src/app/entity/view/select/ViewSelect.tsx b/datahub-web-react/src/app/entity/view/select/ViewSelect.tsx 
index 03689460eb02b..eda9b7d7fe2a4 100644 --- a/datahub-web-react/src/app/entity/view/select/ViewSelect.tsx +++ b/datahub-web-react/src/app/entity/view/select/ViewSelect.tsx @@ -1,4 +1,4 @@ -import React, { useEffect, useRef, useState } from 'react'; +import React, { CSSProperties, useEffect, useRef, useState } from 'react'; import { useHistory } from 'react-router'; import { Select } from 'antd'; import styled from 'styled-components'; @@ -55,11 +55,21 @@ const ViewSelectContainer = styled.div` .ant-select-selection-item { font-weight: 700; font-size: 14px; + text-align: left; } } } `; +const SelectStyled = styled(Select)` + min-width: 90px; + max-width: 200px; +`; + +type Props = { + dropdownStyle?: CSSProperties; +}; + /** * The View Select component allows you to select a View to apply to query on the current page. For example, * search, recommendations, and browse. @@ -69,7 +79,7 @@ const ViewSelectContainer = styled.div` * * In the event that a user refreshes their browser, the state of the view should be saved as well. 
*/ -export const ViewSelect = () => { +export const ViewSelect = ({ dropdownStyle = {} }: Props) => { const history = useHistory(); const userContext = useUserContext(); const [isOpen, setIsOpen] = useState(false); @@ -188,12 +198,11 @@ export const ViewSelect = () => { return ( - + {viewBuilderDisplayState.visible && ( { ref={clearButtonRef} onClick={onHandleClickClear} > - All Entities + View all ); diff --git a/datahub-web-react/src/app/home/HomePageHeader.tsx b/datahub-web-react/src/app/home/HomePageHeader.tsx index def413e13213f..5919d2dbf5b7e 100644 --- a/datahub-web-react/src/app/home/HomePageHeader.tsx +++ b/datahub-web-react/src/app/home/HomePageHeader.tsx @@ -273,6 +273,7 @@ export const HomePageHeader = () => { autoCompleteStyle={styles.searchBox} entityRegistry={entityRegistry} viewsEnabled={viewsEnabled} + combineSiblings showQuickFilters /> {searchResultsToShow && searchResultsToShow.length > 0 && ( diff --git a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx index 0a7d16ade0ac0..36713cfb7ffcf 100644 --- a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx +++ b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx @@ -34,6 +34,7 @@ import ExternalUrlButton from '../entity/shared/ExternalUrlButton'; import EntityPaths from './EntityPaths/EntityPaths'; import { DataProductLink } from '../shared/tags/DataProductLink'; import { EntityHealth } from '../entity/shared/containers/profile/header/EntityHealth'; +import { getUniqueOwners } from './utils'; const PreviewContainer = styled.div` display: flex; @@ -260,6 +261,7 @@ export default function DefaultPreviewCard({ }; const shouldShowRightColumn = (topUsers && topUsers.length > 0) || (owners && owners.length > 0); + const uniqueOwners = getUniqueOwners(owners); return ( @@ -294,7 +296,7 @@ export default function DefaultPreviewCard({ {deprecation?.deprecated && ( )} - {health && health.length > 0 && } + {health && health.length > 
0 ? : null} {externalUrl && ( )} - {degree !== undefined && degree !== null && ( )} - {(topUsers?.length || 0) > 0 && (owners?.length || 0) > 0 && } - {owners && owners?.length > 0 && ( + {(topUsers?.length || 0) > 0 && (uniqueOwners?.length || 0) > 0 && ( + + )} + {uniqueOwners && uniqueOwners?.length > 0 && ( Owners
- owner.owner)} max={2} /> + owner.owner)} max={2} />
)} diff --git a/datahub-web-react/src/app/preview/utils.ts b/datahub-web-react/src/app/preview/utils.ts new file mode 100644 index 0000000000000..f5a562dc2ffe7 --- /dev/null +++ b/datahub-web-react/src/app/preview/utils.ts @@ -0,0 +1,6 @@ +import { Owner } from '../../types.generated'; + +export function getUniqueOwners(owners?: Owner[] | null) { + const uniqueOwnerUrns = new Set(); + return owners?.filter((owner) => !uniqueOwnerUrns.has(owner.owner.urn) && uniqueOwnerUrns.add(owner.owner.urn)); +} diff --git a/datahub-web-react/src/app/recommendations/renderer/component/EntityNameList.tsx b/datahub-web-react/src/app/recommendations/renderer/component/EntityNameList.tsx index a0fea45c9ed2d..4ff78e64625b1 100644 --- a/datahub-web-react/src/app/recommendations/renderer/component/EntityNameList.tsx +++ b/datahub-web-react/src/app/recommendations/renderer/component/EntityNameList.tsx @@ -1,19 +1,14 @@ import React from 'react'; -import { Divider, List, Checkbox } from 'antd'; +import { Divider, List } from 'antd'; import styled from 'styled-components'; -import { Entity, EntityType, EntityPath } from '../../../../types.generated'; +import { Entity } from '../../../../types.generated'; import { useEntityRegistry } from '../../../useEntityRegistry'; import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; import { IconStyleType } from '../../../entity/Entity'; -import { EntityAndType } from '../../../entity/shared/types'; import { getPlatformName } from '../../../entity/shared/utils'; import { capitalizeFirstLetterOnly } from '../../../shared/textUtil'; -const StyledCheckbox = styled(Checkbox)` - margin-right: 12px; -`; - -const StyledList = styled(List)` +export const StyledList = styled(List)` overflow-y: auto; height: 100%; margin-top: -1px; @@ -45,7 +40,7 @@ const StyledList = styled(List)` } ` as typeof List; -const ListItem = styled.div<{ isSelectMode: boolean }>` +export const ListItem = styled.div<{ isSelectMode: boolean }>` padding-right: 40px; 
padding-left: ${(props) => (props.isSelectMode ? '20px' : '40px')}; padding-top: 16px; @@ -54,78 +49,23 @@ const ListItem = styled.div<{ isSelectMode: boolean }>` align-items: center; `; -const ThinDivider = styled(Divider)` +export const ThinDivider = styled(Divider)` padding: 0px; margin: 0px; `; -export type EntityActionProps = { - urn: string; - type: EntityType; -}; - -type AdditionalProperties = { - degree?: number; - paths?: EntityPath[]; -}; - type Props = { - // additional data about the search result that is not part of the entity used to enrich the - // presentation of the entity. For example, metadata about how the entity is related for the case - // of impact analysis - additionalPropertiesList?: Array; entities: Array; onClick?: (index: number) => void; - isSelectMode?: boolean; - selectedEntities?: EntityAndType[]; - setSelectedEntities?: (entities: EntityAndType[]) => any; - bordered?: boolean; - entityAction?: React.FC; }; -export const EntityNameList = ({ - additionalPropertiesList, - entities, - onClick, - isSelectMode, - selectedEntities = [], - setSelectedEntities, - bordered = true, - entityAction, -}: Props) => { +export const EntityNameList = ({ entities, onClick }: Props) => { const entityRegistry = useEntityRegistry(); - const selectedEntityUrns = selectedEntities?.map((entity) => entity.urn) || []; - - if ( - additionalPropertiesList?.length !== undefined && - additionalPropertiesList.length > 0 && - additionalPropertiesList?.length !== entities.length - ) { - console.warn( - 'Warning: additionalPropertiesList length provided to EntityNameList does not match entity array length', - { additionalPropertiesList, entities }, - ); - } - - /** - * Invoked when a new entity is selected. Simply updates the state of the list of selected entities. 
- */ - const onSelectEntity = (selectedEntity: EntityAndType, selected: boolean) => { - if (selected) { - setSelectedEntities?.([...selectedEntities, selectedEntity]); - } else { - setSelectedEntities?.(selectedEntities?.filter((entity) => entity.urn !== selectedEntity.urn) || []); - } - }; - - const EntityAction = entityAction as React.FC; return ( { - const additionalProperties = additionalPropertiesList?.[index]; const genericProps = entityRegistry.getGenericEntityProperties(entity.type, entity); const platformLogoUrl = genericProps?.platform?.properties?.logoUrl; const platformName = getPlatformName(genericProps); @@ -140,15 +80,7 @@ export const EntityNameList = ({ return ( <> - - {isSelectMode && ( - = 0} - onChange={(e) => - onSelectEntity({ urn: entity.urn, type: entity.type }, e.target.checked) - } - /> - )} + onClick?.(index)} entityCount={entityCount} - degree={additionalProperties?.degree} deprecation={deprecation} - paths={additionalProperties?.paths} health={health || undefined} /> - {entityAction && } diff --git a/datahub-web-react/src/app/search/AdvancedFilterSelectValueModal.tsx b/datahub-web-react/src/app/search/AdvancedFilterSelectValueModal.tsx index e5f58a8662acc..c562fc6e8349a 100644 --- a/datahub-web-react/src/app/search/AdvancedFilterSelectValueModal.tsx +++ b/datahub-web-react/src/app/search/AdvancedFilterSelectValueModal.tsx @@ -23,9 +23,7 @@ import { REMOVED_FILTER_NAME, TAGS_FILTER_NAME, TYPE_NAMES_FILTER_NAME, - DATA_PRODUCTS_FILTER_NAME, } from './utils/constants'; -import SetDataProductModal from '../entity/shared/containers/profile/sidebar/DataProduct/SetDataProductModal'; type Props = { facet?: FacetMetadata | null; @@ -80,23 +78,6 @@ export const AdvancedFilterSelectValueModal = ({ ); } - if (filterField === DATA_PRODUCTS_FILTER_NAME) { - return ( - initialValues?.includes(agg?.entity?.urn || ''))?.entity || null - } - onModalClose={onCloseModal} - onOkOverride={(dataProductUrn) => { - onSelect([dataProductUrn]); - onCloseModal(); 
- }} - /> - ); - } - if (filterField === CONTAINER_FILTER_NAME) { return ( { - return { - value: entity.urn, - label: , - type: entity.type, - style: { padding: '12px 12px 12px 16px' }, - }; -}; - const renderRecommendedQuery = (query: string) => { return { value: query, @@ -123,6 +115,7 @@ interface Props { hideRecommendations?: boolean; showQuickFilters?: boolean; viewsEnabled?: boolean; + combineSiblings?: boolean; setIsSearchBarFocused?: (isSearchBarFocused: boolean) => void; onFocus?: () => void; onBlur?: () => void; @@ -149,6 +142,7 @@ export const SearchBar = ({ hideRecommendations, showQuickFilters, viewsEnabled = false, + combineSiblings = false, setIsSearchBarFocused, onFocus, onBlur, @@ -227,14 +221,26 @@ export const SearchBar = ({ ]; }, [showQuickFilters, suggestions.length, effectiveQuery, selectedQuickFilter, entityRegistry]); - const autoCompleteEntityOptions = useMemo( - () => - suggestions.map((entity: AutoCompleteResultForEntity) => ({ - label: , - options: [...entity.entities.map((e: Entity) => renderItem(effectiveQuery, e))], - })), - [effectiveQuery, suggestions], - ); + const autoCompleteEntityOptions = useMemo(() => { + return suggestions.map((suggestion: AutoCompleteResultForEntity) => { + const combinedSuggestion = combineSiblingsInAutoComplete(suggestion, { combineSiblings }); + return { + label: , + options: combinedSuggestion.combinedEntities.map((combinedEntity) => ({ + value: combinedEntity.entity.urn, + label: ( + + ), + type: combinedEntity.entity.type, + style: { padding: '12px 12px 12px 16px' }, + })), + }; + }); + }, [combineSiblings, effectiveQuery, suggestions]); const previousSelectedQuickFilterValue = usePrevious(selectedQuickFilter?.value); useEffect(() => { @@ -371,7 +377,15 @@ export const SearchBar = ({ onKeyUp={handleStopPropagation} onKeyDown={handleStopPropagation} > - +
)} diff --git a/datahub-web-react/src/app/search/SearchResultList.tsx b/datahub-web-react/src/app/search/SearchResultList.tsx index b860e7b670c33..6e2d5c923c6e2 100644 --- a/datahub-web-react/src/app/search/SearchResultList.tsx +++ b/datahub-web-react/src/app/search/SearchResultList.tsx @@ -5,13 +5,14 @@ import { useHistory } from 'react-router'; import { RocketOutlined } from '@ant-design/icons'; import { navigateToSearchUrl } from './utils/navigateToSearchUrl'; import { ANTD_GRAY } from '../entity/shared/constants'; -import { CombinedSearchResult, SEPARATE_SIBLINGS_URL_PARAM } from '../entity/shared/siblingUtils'; +import { SEPARATE_SIBLINGS_URL_PARAM } from '../entity/shared/siblingUtils'; import { CompactEntityNameList } from '../recommendations/renderer/component/CompactEntityNameList'; import { useEntityRegistry } from '../useEntityRegistry'; import { SearchResult } from '../../types.generated'; import analytics, { EventType } from '../analytics'; import { EntityAndType } from '../entity/shared/types'; import { useIsSearchV2 } from './useSearchAndBrowseVersion'; +import { CombinedSearchResult } from './utils/combineSiblingsInSearchResults'; const ResultList = styled(List)` &&& { @@ -131,7 +132,7 @@ export const SearchResultList = ({ ), }} renderItem={(item, index) => ( - + onClickResult(item, index)} diff --git a/datahub-web-react/src/app/search/SearchResults.tsx b/datahub-web-react/src/app/search/SearchResults.tsx index 4885715fe200f..19f762c1c6cf2 100644 --- a/datahub-web-react/src/app/search/SearchResults.tsx +++ b/datahub-web-react/src/app/search/SearchResults.tsx @@ -6,7 +6,6 @@ import { Entity, FacetFilterInput, FacetMetadata, MatchedField } from '../../typ import { SearchCfg } from '../../conf'; import { SearchResultsRecommendations } from './SearchResultsRecommendations'; import SearchExtendedMenu from '../entity/shared/components/styled/search/SearchExtendedMenu'; -import { combineSiblingsInSearchResults } from '../entity/shared/siblingUtils'; import 
{ SearchSelectBar } from '../entity/shared/components/styled/search/SearchSelectBar'; import { SearchResultList } from './SearchResultList'; import { isListSubset } from '../entity/shared/utils'; @@ -26,6 +25,7 @@ import { BrowseProvider } from './sidebar/BrowseContext'; import { useIsBrowseV2, useIsSearchV2 } from './useSearchAndBrowseVersion'; import useToggleSidebar from './useToggleSidebar'; import SearchSortSelect from './sorting/SearchSortSelect'; +import { combineSiblingsInSearchResults } from './utils/combineSiblingsInSearchResults'; const SearchResultsWrapper = styled.div<{ v2Styles: boolean }>` display: flex; diff --git a/datahub-web-react/src/app/search/autoComplete/AutoCompleteEntity.tsx b/datahub-web-react/src/app/search/autoComplete/AutoCompleteEntity.tsx index 8a87407b7176b..60bb21713ba58 100644 --- a/datahub-web-react/src/app/search/autoComplete/AutoCompleteEntity.tsx +++ b/datahub-web-react/src/app/search/autoComplete/AutoCompleteEntity.tsx @@ -1,14 +1,15 @@ -import { Image, Typography } from 'antd'; +import { Typography } from 'antd'; import React from 'react'; import styled from 'styled-components/macro'; -import { Entity } from '../../../types.generated'; +import { Entity, EntityType } from '../../../types.generated'; import { useEntityRegistry } from '../../useEntityRegistry'; -import { getPlatformName } from '../../entity/shared/utils'; -import { IconStyleType } from '../../entity/Entity'; import { getAutoCompleteEntityText } from './utils'; -import { SuggestionText } from './AutoCompleteUser'; import ParentContainers from './ParentContainers'; -import { ANTD_GRAY } from '../../entity/shared/constants'; +import { ANTD_GRAY_V2 } from '../../entity/shared/constants'; +import AutoCompleteEntityIcon from './AutoCompleteEntityIcon'; +import { SuggestionText } from './styledComponents'; +import AutoCompletePlatformNames from './AutoCompletePlatformNames'; +import { getPlatformName } from '../../entity/shared/utils'; const AutoCompleteEntityWrapper = 
styled.div` display: flex; @@ -17,12 +18,8 @@ const AutoCompleteEntityWrapper = styled.div` align-items: center; `; -const PreviewImage = styled(Image)` - height: 22px; - width: 22px; - width: auto; - object-fit: contain; - background-color: transparent; +const IconsContainer = styled.div` + display: flex; `; const ContentWrapper = styled.div` @@ -32,8 +29,8 @@ const ContentWrapper = styled.div` `; const Subtype = styled.span` - color: ${ANTD_GRAY[9]}; - border: 1px solid ${ANTD_GRAY[9]}; + color: ${ANTD_GRAY_V2[8]}; + border: 1px solid ${ANTD_GRAY_V2[6]}; border-radius: 16px; padding: 4px 8px; line-height: 12px; @@ -41,33 +38,65 @@ const Subtype = styled.span` margin-right: 8px; `; +const ItemHeader = styled.div` + display: flex; + align-items: center; + margin-bottom: 3px; + gap: 8px; +`; + +const Divider = styled.div` + border-right: 1px solid ${ANTD_GRAY_V2[6]}; + height: 12px; +`; + interface Props { query: string; entity: Entity; + siblings?: Array; hasParentTooltip: boolean; } -export default function AutoCompleteEntity({ query, entity, hasParentTooltip }: Props) { +export default function AutoCompleteEntity({ query, entity, siblings, hasParentTooltip }: Props) { const entityRegistry = useEntityRegistry(); const genericEntityProps = entityRegistry.getGenericEntityProperties(entity.type, entity); - const platformName = getPlatformName(genericEntityProps); - const platformLogoUrl = genericEntityProps?.platform?.properties?.logoUrl; const displayName = entityRegistry.getDisplayName(entity.type, entity); - const icon = - (platformLogoUrl && ) || - entityRegistry.getIcon(entity.type, 12, IconStyleType.ACCENT); const { matchedText, unmatchedText } = getAutoCompleteEntityText(displayName, query); + const entities = siblings?.length ? siblings : [entity]; + const platforms = + genericEntityProps?.siblingPlatforms + ?.map( + (platform) => + getPlatformName(entityRegistry.getGenericEntityProperties(EntityType.DataPlatform, platform)) || '', + ) + .filter(Boolean) ?? 
[]; + const parentContainers = genericEntityProps?.parentContainers?.containers || []; // Need to reverse parentContainers since it returns direct parent first. const orderedParentContainers = [...parentContainers].reverse(); const subtype = genericEntityProps?.subTypes?.typeNames?.[0]; + const showPlatforms = !!platforms.length; + const showPlatformDivider = !!platforms.length && !!parentContainers.length; + const showParentContainers = !!parentContainers.length; + const showHeader = showPlatforms || showParentContainers; + return ( - {icon} - + {showHeader && ( + + + {entities.map((ent) => ( + + ))} + + {showPlatforms && } + {showPlatformDivider && } + {showParentContainers && } + + )} { + const entityRegistry = useEntityRegistry(); + + const genericEntityProps = entityRegistry.getGenericEntityProperties(entity.type, entity); + const platformLogoUrl = genericEntityProps?.platform?.properties?.logoUrl; + const platformName = getPlatformName(genericEntityProps); + return ( + (platformLogoUrl && ) || + entityRegistry.getIcon(entity.type, 12, IconStyleType.ACCENT) + ); +}; + +export default AutoCompleteEntityIcon; diff --git a/datahub-web-react/src/app/search/autoComplete/AutoCompleteItem.tsx b/datahub-web-react/src/app/search/autoComplete/AutoCompleteItem.tsx index c97d171b4c931..b8f5a2c7e4081 100644 --- a/datahub-web-react/src/app/search/autoComplete/AutoCompleteItem.tsx +++ b/datahub-web-react/src/app/search/autoComplete/AutoCompleteItem.tsx @@ -18,9 +18,10 @@ export const SuggestionContainer = styled.div` interface Props { query: string; entity: Entity; + siblings?: Array; } -export default function AutoCompleteItem({ query, entity }: Props) { +export default function AutoCompleteItem({ query, entity, siblings }: Props) { const entityRegistry = useEntityRegistry(); const displayTooltip = getShouldDisplayTooltip(entity, entityRegistry); let componentToRender: React.ReactNode = null; @@ -33,7 +34,14 @@ export default function AutoCompleteItem({ query, entity }: 
Props) { componentToRender = ; break; default: - componentToRender = ; + componentToRender = ( + + ); break; } diff --git a/datahub-web-react/src/app/search/autoComplete/AutoCompletePlatformNames.tsx b/datahub-web-react/src/app/search/autoComplete/AutoCompletePlatformNames.tsx new file mode 100644 index 0000000000000..61fe6bcae71d0 --- /dev/null +++ b/datahub-web-react/src/app/search/autoComplete/AutoCompletePlatformNames.tsx @@ -0,0 +1,22 @@ +import { Typography } from 'antd'; +import React from 'react'; +import styled from 'styled-components'; +import { ANTD_GRAY_V2 } from '../../entity/shared/constants'; + +const PlatformText = styled(Typography.Text)` + font-size: 12px; + line-height: 20px; + font-weight: 500; + color: ${ANTD_GRAY_V2[8]}; + white-space: nowrap; +`; + +type Props = { + platforms: Array; +}; + +const AutoCompletePlatformNames = ({ platforms }: Props) => { + return {platforms.join(' & ')}; +}; + +export default AutoCompletePlatformNames; diff --git a/datahub-web-react/src/app/search/autoComplete/AutoCompleteUser.tsx b/datahub-web-react/src/app/search/autoComplete/AutoCompleteUser.tsx index 1f88b94bb0cc7..53b4d53ef46d4 100644 --- a/datahub-web-react/src/app/search/autoComplete/AutoCompleteUser.tsx +++ b/datahub-web-react/src/app/search/autoComplete/AutoCompleteUser.tsx @@ -1,20 +1,10 @@ import { Typography } from 'antd'; import React from 'react'; -import styled from 'styled-components'; import { CorpUser, EntityType } from '../../../types.generated'; -import { ANTD_GRAY } from '../../entity/shared/constants'; import { CustomAvatar } from '../../shared/avatar'; import { useEntityRegistry } from '../../useEntityRegistry'; import { getAutoCompleteEntityText } from './utils'; - -export const SuggestionText = styled.div` - margin-left: 12px; - margin-top: 2px; - margin-bottom: 2px; - color: ${ANTD_GRAY[9]}; - font-size: 16px; - overflow: hidden; -`; +import { SuggestionText } from './styledComponents'; interface Props { query: string; diff --git 
a/datahub-web-react/src/app/search/autoComplete/ParentContainers.tsx b/datahub-web-react/src/app/search/autoComplete/ParentContainers.tsx index 77ccde06172c9..98a4f5aa214bb 100644 --- a/datahub-web-react/src/app/search/autoComplete/ParentContainers.tsx +++ b/datahub-web-react/src/app/search/autoComplete/ParentContainers.tsx @@ -4,20 +4,21 @@ import React, { Fragment } from 'react'; import styled from 'styled-components/macro'; import { Container, EntityType } from '../../../types.generated'; import { useEntityRegistry } from '../../useEntityRegistry'; -import { ANTD_GRAY } from '../../entity/shared/constants'; +import { ANTD_GRAY_V2 } from '../../entity/shared/constants'; const NUM_VISIBLE_CONTAINERS = 2; const ParentContainersWrapper = styled.div` font-size: 12px; - color: ${ANTD_GRAY[9]}; + color: ${ANTD_GRAY_V2[8]}; display: flex; align-items: center; - margin-bottom: 3px; `; const ParentContainer = styled(Typography.Text)` + color: ${ANTD_GRAY_V2[8]}; margin-left: 4px; + font-weight: 500; `; export const ArrowWrapper = styled.span` diff --git a/datahub-web-react/src/app/search/autoComplete/RecommendedOption.tsx b/datahub-web-react/src/app/search/autoComplete/RecommendedOption.tsx index 79743858b06d9..f4c31b18c99b2 100644 --- a/datahub-web-react/src/app/search/autoComplete/RecommendedOption.tsx +++ b/datahub-web-react/src/app/search/autoComplete/RecommendedOption.tsx @@ -1,7 +1,7 @@ import { SearchOutlined } from '@ant-design/icons'; import React from 'react'; import styled from 'styled-components/macro'; -import { SuggestionText } from './AutoCompleteUser'; +import { SuggestionText } from './styledComponents'; const TextWrapper = styled.span``; diff --git a/datahub-web-react/src/app/search/autoComplete/styledComponents.tsx b/datahub-web-react/src/app/search/autoComplete/styledComponents.tsx new file mode 100644 index 0000000000000..9e4b084ab3889 --- /dev/null +++ b/datahub-web-react/src/app/search/autoComplete/styledComponents.tsx @@ -0,0 +1,11 @@ +import 
styled from 'styled-components'; +import { ANTD_GRAY } from '../../entity/shared/constants'; + +export const SuggestionText = styled.div` + margin-left: 12px; + margin-top: 2px; + margin-bottom: 2px; + color: ${ANTD_GRAY[9]}; + font-size: 16px; + overflow: hidden; +`; diff --git a/datahub-web-react/src/app/search/utils/combineSiblingsInAutoComplete.ts b/datahub-web-react/src/app/search/utils/combineSiblingsInAutoComplete.ts new file mode 100644 index 0000000000000..e8e64559e67a0 --- /dev/null +++ b/datahub-web-react/src/app/search/utils/combineSiblingsInAutoComplete.ts @@ -0,0 +1,31 @@ +import { AutoCompleteResultForEntity, EntityType } from '../../../types.generated'; +import { CombinedEntity, createSiblingEntityCombiner } from '../../entity/shared/siblingUtils'; + +export type CombinedSuggestion = { + type: EntityType; + combinedEntities: Array; + suggestions?: AutoCompleteResultForEntity['suggestions']; +}; + +export function combineSiblingsInAutoComplete( + autoCompleteResultForEntity: AutoCompleteResultForEntity, + { combineSiblings = false } = {}, +): CombinedSuggestion { + const combine = createSiblingEntityCombiner(); + const combinedEntities: Array = []; + + autoCompleteResultForEntity.entities.forEach((entity) => { + if (!combineSiblings) { + combinedEntities.push({ entity }); + return; + } + const combinedResult = combine(entity); + if (!combinedResult.skipped) combinedEntities.push(combinedResult.combinedEntity); + }); + + return { + type: autoCompleteResultForEntity.type, + suggestions: autoCompleteResultForEntity.suggestions, + combinedEntities, + }; +} diff --git a/datahub-web-react/src/app/search/utils/combineSiblingsInSearchResults.test.ts b/datahub-web-react/src/app/search/utils/combineSiblingsInSearchResults.test.ts new file mode 100644 index 0000000000000..4cf61c715b0e9 --- /dev/null +++ b/datahub-web-react/src/app/search/utils/combineSiblingsInSearchResults.test.ts @@ -0,0 +1,521 @@ +import { combineSiblingsInSearchResults } from 
'./combineSiblingsInSearchResults'; + +const searchResultWithSiblings = [ + { + entity: { + urn: 'urn:li:dataset:(urn:li:dataPlatform:bigquery,cypress_project.jaffle_shop.raw_orders,PROD)', + exists: true, + type: 'DATASET', + name: 'cypress_project.jaffle_shop.raw_orders', + origin: 'PROD', + uri: null, + platform: { + urn: 'urn:li:dataPlatform:bigquery', + type: 'DATA_PLATFORM', + name: 'bigquery', + properties: { + type: 'RELATIONAL_DB', + displayName: 'BigQuery', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/bigquerylogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + dataPlatformInstance: null, + editableProperties: null, + platformNativeType: null, + properties: { + name: 'raw_orders', + description: null, + qualifiedName: null, + customProperties: [], + __typename: 'DatasetProperties', + }, + ownership: null, + globalTags: null, + glossaryTerms: null, + subTypes: { + typeNames: ['table'], + __typename: 'SubTypes', + }, + domain: null, + container: { + urn: 'urn:li:container:348c96555971d3f5c1ffd7dd2e7446cb', + platform: { + urn: 'urn:li:dataPlatform:bigquery', + type: 'DATA_PLATFORM', + name: 'bigquery', + properties: { + type: 'RELATIONAL_DB', + displayName: 'BigQuery', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/bigquerylogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + properties: { + name: 'jaffle_shop', + __typename: 'ContainerProperties', + }, + subTypes: { + typeNames: ['Dataset'], + __typename: 'SubTypes', + }, + deprecation: null, + __typename: 'Container', + }, + parentContainers: { + count: 2, + containers: [ + { + urn: 'urn:li:container:348c96555971d3f5c1ffd7dd2e7446cb', + platform: { + urn: 'urn:li:dataPlatform:bigquery', + type: 'DATA_PLATFORM', + name: 'bigquery', + properties: { + type: 'RELATIONAL_DB', + displayName: 'BigQuery', + datasetNameDelimiter: '.', + 
logoUrl: '/assets/platforms/bigquerylogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + properties: { + name: 'jaffle_shop', + __typename: 'ContainerProperties', + }, + subTypes: { + typeNames: ['Dataset'], + __typename: 'SubTypes', + }, + deprecation: null, + __typename: 'Container', + }, + { + urn: 'urn:li:container:b5e95fce839e7d78151ed7e0a7420d84', + platform: { + urn: 'urn:li:dataPlatform:bigquery', + type: 'DATA_PLATFORM', + name: 'bigquery', + properties: { + type: 'RELATIONAL_DB', + displayName: 'BigQuery', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/bigquerylogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + properties: { + name: 'cypress_project', + __typename: 'ContainerProperties', + }, + subTypes: { + typeNames: ['Project'], + __typename: 'SubTypes', + }, + deprecation: null, + __typename: 'Container', + }, + ], + __typename: 'ParentContainersResult', + }, + deprecation: null, + siblings: { + isPrimary: false, + siblings: [ + { + urn: 'urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_orders,PROD)', + exists: true, + type: 'DATASET', + platform: { + urn: 'urn:li:dataPlatform:dbt', + type: 'DATA_PLATFORM', + name: 'dbt', + properties: { + type: 'OTHERS', + displayName: 'dbt', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/dbtlogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + name: 'cypress_project.jaffle_shop.raw_orders', + properties: { + name: 'raw_orders', + description: '', + qualifiedName: null, + __typename: 'DatasetProperties', + }, + __typename: 'Dataset', + }, + ], + __typename: 'SiblingProperties', + }, + __typename: 'Dataset', + }, + matchedFields: [ + { + name: 'name', + value: 'raw_orders', + __typename: 'MatchedField', + }, + { + name: 'id', + value: 
'cypress_project.jaffle_shop.raw_orders', + __typename: 'MatchedField', + }, + ], + insights: [], + __typename: 'SearchResult', + }, + { + entity: { + urn: 'urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_orders,PROD)', + exists: true, + type: 'DATASET', + name: 'cypress_project.jaffle_shop.raw_orders', + origin: 'PROD', + uri: null, + platform: { + urn: 'urn:li:dataPlatform:dbt', + type: 'DATA_PLATFORM', + name: 'dbt', + properties: { + type: 'OTHERS', + displayName: 'dbt', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/dbtlogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + dataPlatformInstance: null, + editableProperties: null, + platformNativeType: null, + properties: { + name: 'raw_orders', + description: '', + qualifiedName: null, + customProperties: [ + { + key: 'catalog_version', + value: '1.0.4', + __typename: 'StringMapEntry', + }, + { + key: 'node_type', + value: 'seed', + __typename: 'StringMapEntry', + }, + { + key: 'materialization', + value: 'seed', + __typename: 'StringMapEntry', + }, + { + key: 'dbt_file_path', + value: 'data/raw_orders.csv', + __typename: 'StringMapEntry', + }, + { + key: 'catalog_schema', + value: 'https://schemas.getdbt.com/dbt/catalog/v1.json', + __typename: 'StringMapEntry', + }, + { + key: 'catalog_type', + value: 'table', + __typename: 'StringMapEntry', + }, + { + key: 'manifest_version', + value: '1.0.4', + __typename: 'StringMapEntry', + }, + { + key: 'manifest_schema', + value: 'https://schemas.getdbt.com/dbt/manifest/v4.json', + __typename: 'StringMapEntry', + }, + ], + __typename: 'DatasetProperties', + }, + ownership: null, + globalTags: null, + glossaryTerms: null, + subTypes: { + typeNames: ['seed'], + __typename: 'SubTypes', + }, + domain: null, + container: null, + parentContainers: { + count: 0, + containers: [], + __typename: 'ParentContainersResult', + }, + deprecation: null, + siblings: { + isPrimary: 
true, + siblings: [ + { + urn: 'urn:li:dataset:(urn:li:dataPlatform:bigquery,cypress_project.jaffle_shop.raw_orders,PROD)', + type: 'DATASET', + platform: { + urn: 'urn:li:dataPlatform:bigquery', + type: 'DATA_PLATFORM', + name: 'bigquery', + properties: { + type: 'RELATIONAL_DB', + displayName: 'BigQuery', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/bigquerylogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + name: 'cypress_project.jaffle_shop.raw_orders', + properties: { + name: 'raw_orders', + description: null, + qualifiedName: null, + __typename: 'DatasetProperties', + }, + __typename: 'Dataset', + }, + ], + __typename: 'SiblingProperties', + }, + __typename: 'Dataset', + }, + matchedFields: [ + { + name: 'name', + value: 'raw_orders', + __typename: 'MatchedField', + }, + { + name: 'id', + value: 'cypress_project.jaffle_shop.raw_orders', + __typename: 'MatchedField', + }, + ], + insights: [], + __typename: 'SearchResult', + }, +]; + +const searchResultWithGhostSiblings = [ + { + entity: { + urn: 'urn:li:dataset:(urn:li:dataPlatform:bigquery,cypress_project.jaffle_shop.raw_orders,PROD)', + exists: true, + type: 'DATASET', + name: 'cypress_project.jaffle_shop.raw_orders', + origin: 'PROD', + uri: null, + platform: { + urn: 'urn:li:dataPlatform:bigquery', + type: 'DATA_PLATFORM', + name: 'bigquery', + properties: { + type: 'RELATIONAL_DB', + displayName: 'BigQuery', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/bigquerylogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + dataPlatformInstance: null, + editableProperties: null, + platformNativeType: null, + properties: { + name: 'raw_orders', + description: null, + qualifiedName: null, + customProperties: [], + __typename: 'DatasetProperties', + }, + ownership: null, + globalTags: null, + glossaryTerms: null, + subTypes: { + typeNames: 
['table'], + __typename: 'SubTypes', + }, + domain: null, + container: { + urn: 'urn:li:container:348c96555971d3f5c1ffd7dd2e7446cb', + platform: { + urn: 'urn:li:dataPlatform:bigquery', + type: 'DATA_PLATFORM', + name: 'bigquery', + properties: { + type: 'RELATIONAL_DB', + displayName: 'BigQuery', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/bigquerylogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + properties: { + name: 'jaffle_shop', + __typename: 'ContainerProperties', + }, + subTypes: { + typeNames: ['Dataset'], + __typename: 'SubTypes', + }, + deprecation: null, + __typename: 'Container', + }, + parentContainers: { + count: 2, + containers: [ + { + urn: 'urn:li:container:348c96555971d3f5c1ffd7dd2e7446cb', + platform: { + urn: 'urn:li:dataPlatform:bigquery', + type: 'DATA_PLATFORM', + name: 'bigquery', + properties: { + type: 'RELATIONAL_DB', + displayName: 'BigQuery', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/bigquerylogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + properties: { + name: 'jaffle_shop', + __typename: 'ContainerProperties', + }, + subTypes: { + typeNames: ['Dataset'], + __typename: 'SubTypes', + }, + deprecation: null, + __typename: 'Container', + }, + { + urn: 'urn:li:container:b5e95fce839e7d78151ed7e0a7420d84', + platform: { + urn: 'urn:li:dataPlatform:bigquery', + type: 'DATA_PLATFORM', + name: 'bigquery', + properties: { + type: 'RELATIONAL_DB', + displayName: 'BigQuery', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/bigquerylogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + properties: { + name: 'cypress_project', + __typename: 'ContainerProperties', + }, + subTypes: { + typeNames: ['Project'], + __typename: 'SubTypes', + }, + deprecation: null, + __typename: 
'Container', + }, + ], + __typename: 'ParentContainersResult', + }, + deprecation: null, + siblings: { + isPrimary: false, + siblings: [ + { + urn: 'urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_orders,PROD)', + exists: false, + type: 'DATASET', + }, + ], + __typename: 'SiblingProperties', + }, + __typename: 'Dataset', + }, + matchedFields: [ + { + name: 'name', + value: 'raw_orders', + __typename: 'MatchedField', + }, + { + name: 'id', + value: 'cypress_project.jaffle_shop.raw_orders', + __typename: 'MatchedField', + }, + ], + insights: [], + __typename: 'SearchResult', + }, +]; + +describe('siblingUtils', () => { + describe('combineSiblingsInSearchResults', () => { + it('combines search results to deduplicate siblings', () => { + const result = combineSiblingsInSearchResults(searchResultWithSiblings as any); + + expect(result).toHaveLength(1); + expect(result?.[0]?.matchedEntities?.[0]?.urn).toEqual( + 'urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_orders,PROD)', + ); + expect(result?.[0]?.matchedEntities?.[1]?.urn).toEqual( + 'urn:li:dataset:(urn:li:dataPlatform:bigquery,cypress_project.jaffle_shop.raw_orders,PROD)', + ); + + expect(result?.[0]?.matchedEntities).toHaveLength(2); + + expect(result?.[0]?.matchedFields).toHaveLength(2); + }); + + it('will not combine an entity with a ghost node', () => { + const result = combineSiblingsInSearchResults(searchResultWithGhostSiblings as any); + + expect(result).toHaveLength(1); + expect(result?.[0]?.matchedEntities?.[0]?.urn).toEqual( + 'urn:li:dataset:(urn:li:dataPlatform:bigquery,cypress_project.jaffle_shop.raw_orders,PROD)', + ); + expect(result?.[0]?.matchedEntities).toHaveLength(1); + + expect(result?.[0]?.matchedFields).toHaveLength(2); + }); + }); +}); diff --git a/datahub-web-react/src/app/search/utils/combineSiblingsInSearchResults.ts b/datahub-web-react/src/app/search/utils/combineSiblingsInSearchResults.ts new file mode 100644 index 
0000000000000..4a5c8da6381b8 --- /dev/null +++ b/datahub-web-react/src/app/search/utils/combineSiblingsInSearchResults.ts @@ -0,0 +1,28 @@ +import { Entity, MatchedField } from '../../../types.generated'; +import { CombinedEntity, createSiblingEntityCombiner } from '../../entity/shared/siblingUtils'; + +type UncombinedSeaerchResults = { + entity: Entity; + matchedFields: Array; +}; + +export type CombinedSearchResult = CombinedEntity & Pick; + +export function combineSiblingsInSearchResults( + searchResults: Array | undefined = [], +): Array { + const combine = createSiblingEntityCombiner(); + const combinedSearchResults: Array = []; + + searchResults.forEach((searchResult) => { + const combinedResult = combine(searchResult.entity); + if (!combinedResult.skipped) { + combinedSearchResults.push({ + ...searchResult, + ...combinedResult.combinedEntity, + }); + } + }); + + return combinedSearchResults; +} diff --git a/datahub-web-react/src/app/search/utils/constants.ts b/datahub-web-react/src/app/search/utils/constants.ts index eecd18441e7a5..af45129022cc1 100644 --- a/datahub-web-react/src/app/search/utils/constants.ts +++ b/datahub-web-react/src/app/search/utils/constants.ts @@ -10,7 +10,6 @@ export const TAGS_FILTER_NAME = 'tags'; export const GLOSSARY_TERMS_FILTER_NAME = 'glossaryTerms'; export const CONTAINER_FILTER_NAME = 'container'; export const DOMAINS_FILTER_NAME = 'domains'; -export const DATA_PRODUCTS_FILTER_NAME = 'dataProducts'; export const OWNERS_FILTER_NAME = 'owners'; export const TYPE_NAMES_FILTER_NAME = 'typeNames'; export const PLATFORM_FILTER_NAME = 'platform'; @@ -57,7 +56,6 @@ export const ORDERED_FIELDS = [ TAGS_FILTER_NAME, GLOSSARY_TERMS_FILTER_NAME, DOMAINS_FILTER_NAME, - DATA_PRODUCTS_FILTER_NAME, FIELD_TAGS_FILTER_NAME, FIELD_GLOSSARY_TERMS_FILTER_NAME, FIELD_PATHS_FILTER_NAME, @@ -74,7 +72,6 @@ export const FIELD_TO_LABEL = { owners: 'Owner', tags: 'Tag', domains: 'Domain', - [DATA_PRODUCTS_FILTER_NAME]: 'Data Product', platform: 
'Platform', fieldTags: 'Column Tag', glossaryTerms: 'Glossary Term', diff --git a/datahub-web-react/src/graphql/search.graphql b/datahub-web-react/src/graphql/search.graphql index f18b071705393..172a6d957e287 100644 --- a/datahub-web-react/src/graphql/search.graphql +++ b/datahub-web-react/src/graphql/search.graphql @@ -2,6 +2,7 @@ fragment autoCompleteFields on Entity { urn type ... on Dataset { + exists name platform { ...platformFields @@ -19,6 +20,29 @@ fragment autoCompleteFields on Entity { subTypes { typeNames } + siblings { + isPrimary + siblings { + urn + type + ... on Dataset { + exists + platform { + ...platformFields + } + parentContainers { + ...parentContainersFields + } + name + properties { + name + description + qualifiedName + externalUrl + } + } + } + } ...datasetStatsFields } ... on CorpUser { @@ -250,83 +274,77 @@ fragment datasetStatsFields on Dataset { } } +fragment nonSiblingsDatasetSearchFields on Dataset { + exists + name + origin + uri + platform { + ...platformFields + } + dataPlatformInstance { + ...dataPlatformInstanceFields + } + editableProperties { + description + } + platformNativeType + properties { + name + description + qualifiedName + customProperties { + key + value + } + externalUrl + } + ownership { + ...ownershipFields + } + globalTags { + ...globalTagsFields + } + glossaryTerms { + ...glossaryTerms + } + subTypes { + typeNames + } + domain { + ...entityDomain + } + ...entityDataProduct + parentContainers { + ...parentContainersFields + } + deprecation { + ...deprecationFields + } + health { + type + status + message + causes + } + ...datasetStatsFields +} + fragment searchResultFields on Entity { urn type ... 
on Dataset { - exists - name - origin - uri - platform { - ...platformFields - } - dataPlatformInstance { - ...dataPlatformInstanceFields - } - editableProperties { - description - } - platformNativeType - properties { - name - description - qualifiedName - customProperties { - key - value - } - externalUrl - } - ownership { - ...ownershipFields - } - globalTags { - ...globalTagsFields - } - glossaryTerms { - ...glossaryTerms - } - subTypes { - typeNames - } - domain { - ...entityDomain - } - ...entityDataProduct - parentContainers { - ...parentContainersFields - } - deprecation { - ...deprecationFields - } - health { - type - status - message - causes - } + ...nonSiblingsDatasetSearchFields siblings { isPrimary siblings { urn type ... on Dataset { - exists - platform { - ...platformFields - } - name - properties { - name - description - qualifiedName - externalUrl - } + ...nonSiblingsDatasetSearchFields } } } - ...datasetStatsFields } ... on CorpUser { username diff --git a/docker/build.gradle b/docker/build.gradle index f33e06f383240..ae101fe1defc5 100644 --- a/docker/build.gradle +++ b/docker/build.gradle @@ -35,8 +35,31 @@ task quickstart(type: Exec, dependsOn: ':metadata-ingestion:install') { environment "DATAHUB_TELEMETRY_ENABLED", "false" environment "DOCKER_COMPOSE_BASE", "file://${rootProject.projectDir}" - environment "ACTIONS_VERSION", 'alpine3.17-slim' - environment "DATAHUB_ACTIONS_IMAGE", 'nginx' + // environment "ACTIONS_VERSION", 'alpine3.17-slim' + // environment "DATAHUB_ACTIONS_IMAGE", 'nginx' + + def cmd = [ + 'source ../metadata-ingestion/venv/bin/activate && ', + 'datahub docker quickstart', + '--no-pull-images', + '--standalone_consumers', + '--version', "v${version}", + '--dump-logs-on-failure' + ] + + commandLine 'bash', '-c', cmd.join(" ") +} + +task quickstartSlim(type: Exec, dependsOn: ':metadata-ingestion:install') { + dependsOn(([':docker:datahub-ingestion'] + quickstart_modules).collect { it + ':dockerTag' }) + shouldRunAfter 
':metadata-ingestion:clean', 'quickstartNuke' + + environment "DATAHUB_TELEMETRY_ENABLED", "false" + environment "DOCKER_COMPOSE_BASE", "file://${rootProject.projectDir}" + environment "DATAHUB_ACTIONS_IMAGE", "acryldata/datahub-ingestion" + environment "ACTIONS_VERSION", "v${version}-slim" + environment "ACTIONS_EXTRA_PACKAGES", 'acryl-datahub-actions[executor] acryl-datahub-actions' + environment "ACTIONS_CONFIG", 'https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml' def cmd = [ 'source ../metadata-ingestion/venv/bin/activate && ', @@ -64,6 +87,7 @@ task quickstartDebug(type: Exec, dependsOn: ':metadata-ingestion:install') { dependsOn(debug_modules.collect { it + ':dockerTagDebug' }) shouldRunAfter ':metadata-ingestion:clean', 'quickstartNuke' + environment "DATAHUB_PRECREATE_TOPICS", "true" environment "DATAHUB_TELEMETRY_ENABLED", "false" environment "DOCKER_COMPOSE_BASE", "file://${rootProject.projectDir}" diff --git a/docker/datahub-ingestion-base/Dockerfile b/docker/datahub-ingestion-base/Dockerfile index 9893d44caf460..3d47f79617370 100644 --- a/docker/datahub-ingestion-base/Dockerfile +++ b/docker/datahub-ingestion-base/Dockerfile @@ -1,3 +1,6 @@ +ARG APP_ENV=full +ARG BASE_IMAGE=base + FROM golang:1-alpine3.17 AS binary ENV DOCKERIZE_VERSION v0.6.1 @@ -16,9 +19,7 @@ ENV CONFLUENT_KAFKA_VERSION=1.6.1 ENV DEBIAN_FRONTEND noninteractive -RUN apt-get update && apt-get install -y \ - && apt-get install -y -qq \ - # gcc \ +RUN apt-get update && apt-get install -y -qq \ make \ python3-ldap \ libldap2-dev \ @@ -31,15 +32,34 @@ RUN apt-get update && apt-get install -y \ zip \ unzip \ ldap-utils \ - openjdk-11-jre-headless \ - && python -m pip install --upgrade pip wheel setuptools==57.5.0 \ - && curl -Lk -o /root/librdkafka-${LIBRDKAFKA_VERSION}.tar.gz https://github.com/edenhill/librdkafka/archive/v${LIBRDKAFKA_VERSION}.tar.gz \ - && tar -xzf /root/librdkafka-${LIBRDKAFKA_VERSION}.tar.gz -C /root \ - && cd 
/root/librdkafka-${LIBRDKAFKA_VERSION} \ - && ./configure --prefix /usr && make && make install && make clean && ./configure --clean \ - && apt-get remove -y make + && python -m pip install --no-cache --upgrade pip wheel setuptools \ + && wget -q https://github.com/edenhill/librdkafka/archive/v${LIBRDKAFKA_VERSION}.tar.gz -O - | \ + tar -xz -C /root \ + && cd /root/librdkafka-${LIBRDKAFKA_VERSION} \ + && ./configure --prefix /usr && make && make install && cd .. && rm -rf /root/librdkafka-${LIBRDKAFKA_VERSION} \ + && apt-get remove -y make \ + && rm -rf /var/lib/apt/lists/* /var/cache/apk/* + +# compiled against newer golang for security fixes COPY --from=binary /go/bin/dockerize /usr/local/bin +COPY ./docker/datahub-ingestion-base/base-requirements.txt requirements.txt +COPY ./docker/datahub-ingestion-base/entrypoint.sh /entrypoint.sh + +RUN pip install --no-cache -r requirements.txt && \ + pip uninstall -y acryl-datahub && \ + chmod +x /entrypoint.sh && \ + addgroup --gid 1000 datahub && \ + adduser --disabled-password --uid 1000 --gid 1000 --home /datahub-ingestion datahub + +ENTRYPOINT [ "/entrypoint.sh" ] + +FROM ${BASE_IMAGE} as full-install + +RUN apt-get update && apt-get install -y -qq \ + default-jre-headless \ + && rm -rf /var/lib/apt/lists/* /var/cache/apk/* + RUN if [ $(arch) = "x86_64" ]; then \ mkdir /opt/oracle && \ cd /opt/oracle && \ @@ -58,7 +78,10 @@ RUN if [ $(arch) = "x86_64" ]; then \ ldconfig; \ fi; -COPY ./docker/datahub-ingestion-base/base-requirements.txt requirements.txt +FROM ${BASE_IMAGE} as slim-install +# Do nothing else on top of base + +FROM ${APP_ENV}-install -RUN pip install -r requirements.txt && \ - pip uninstall -y acryl-datahub +USER datahub +ENV PATH="/datahub-ingestion/.local/bin:$PATH" \ No newline at end of file diff --git a/docker/datahub-ingestion-base/base-requirements.txt b/docker/datahub-ingestion-base/base-requirements.txt index 3d9e0777e5ce0..82d9a93a9a2c3 100644 --- 
a/docker/datahub-ingestion-base/base-requirements.txt +++ b/docker/datahub-ingestion-base/base-requirements.txt @@ -1,3 +1,7 @@ +# Excluded for slim +# pyspark==3.0.3 +# pydeequ==1.0.1 + acryl-datahub-classify==0.0.6 acryl-iceberg-legacy==0.0.4 acryl-PyHive==0.6.13 @@ -253,7 +257,6 @@ pycryptodome==3.18.0 pycryptodomex==3.18.0 pydantic==1.10.8 pydash==7.0.3 -pydeequ==1.0.1 pydruid==0.6.5 Pygments==2.15.1 pymongo==4.3.3 @@ -261,7 +264,6 @@ PyMySQL==1.0.3 pyOpenSSL==22.0.0 pyparsing==3.0.9 pyrsistent==0.19.3 -pyspark==3.0.3 pyspnego==0.9.0 python-daemon==3.0.1 python-dateutil==2.8.2 diff --git a/docker/datahub-ingestion-base/build.gradle b/docker/datahub-ingestion-base/build.gradle index fe3c12a59886f..10cd2ee71cce3 100644 --- a/docker/datahub-ingestion-base/build.gradle +++ b/docker/datahub-ingestion-base/build.gradle @@ -12,14 +12,17 @@ ext { } docker { - name "${docker_registry}/${docker_repo}:v${version}" - version "v${version}" + name "${docker_registry}/${docker_repo}:v${version}-slim" + version "v${version}-slim" dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { include "docker/${docker_dir}/*" + }.exclude { + i -> i.file.isHidden() || i.file == buildDir } + buildArgs([APP_ENV: 'slim']) } -tasks.getByPath('docker').dependsOn('build') +tasks.getByName('docker').dependsOn('build') task mkdirBuildDocker { doFirst { @@ -27,10 +30,11 @@ task mkdirBuildDocker { } } dockerClean.finalizedBy(mkdirBuildDocker) +dockerClean.dependsOn([':docker:datahub-ingestion:dockerClean']) task cleanLocalDockerImages { doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}".toString()) + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } dockerClean.finalizedBy(cleanLocalDockerImages) \ No newline at end of file diff --git a/docker/datahub-ingestion-base/entrypoint.sh b/docker/datahub-ingestion-base/entrypoint.sh new file mode 100644 index 
0000000000000..518bb21561467 --- /dev/null +++ b/docker/datahub-ingestion-base/entrypoint.sh @@ -0,0 +1,14 @@ +#!/usr/bin/bash + +if [ ! -z "$ACTIONS_EXTRA_PACKAGES" ]; then + pip install --user $ACTIONS_EXTRA_PACKAGES +fi + +if [[ ! -z "$ACTIONS_CONFIG" && ! -z "$ACTIONS_EXTRA_PACKAGES" ]]; then + mkdir -p /tmp/datahub/logs + curl -q "$ACTIONS_CONFIG" -o config.yaml + exec dockerize -wait ${DATAHUB_GMS_PROTOCOL:-http}://$DATAHUB_GMS_HOST:$DATAHUB_GMS_PORT/health -timeout 240s \ + datahub actions --config config.yaml +else + exec datahub $@ +fi diff --git a/docker/datahub-ingestion-slim/Dockerfile b/docker/datahub-ingestion-slim/Dockerfile deleted file mode 100644 index 580dcc4277124..0000000000000 --- a/docker/datahub-ingestion-slim/Dockerfile +++ /dev/null @@ -1,9 +0,0 @@ -# Defining environment -ARG APP_ENV=prod -ARG DOCKER_VERSION=latest - -FROM acryldata/datahub-ingestion:$DOCKER_VERSION as base - -USER 0 -RUN pip uninstall -y pyspark -USER datahub diff --git a/docker/datahub-ingestion-slim/build.gradle b/docker/datahub-ingestion-slim/build.gradle deleted file mode 100644 index f21b66b576a0c..0000000000000 --- a/docker/datahub-ingestion-slim/build.gradle +++ /dev/null @@ -1,39 +0,0 @@ -plugins { - id 'com.palantir.docker' - id 'java' // required for versioning -} - -apply from: "../../gradle/versioning/versioning.gradle" - -ext { - docker_registry = rootProject.ext.docker_registry == 'linkedin' ? 
'acryldata' : docker_registry - docker_repo = 'datahub-ingestion-slim' - docker_dir = 'datahub-ingestion-slim' -} - -docker { - name "${docker_registry}/${docker_repo}:v${version}" - version "v${version}" - dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") - files fileTree(rootProject.projectDir) { - include "docker/${docker_dir}/*" - } - buildArgs([DOCKER_VERSION: version]) - - buildx(false) -} -tasks.getByPath('docker').dependsOn(['build', ':docker:datahub-ingestion:docker']) - -task mkdirBuildDocker { - doFirst { - mkdir "${project.buildDir}/docker" - } -} -dockerClean.finalizedBy(mkdirBuildDocker) - -task cleanLocalDockerImages { - doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}".toString()) - } -} -dockerClean.finalizedBy(cleanLocalDockerImages) \ No newline at end of file diff --git a/docker/datahub-ingestion/Dockerfile b/docker/datahub-ingestion/Dockerfile index 45a98efb7f6fb..0ecc30d02ac3f 100644 --- a/docker/datahub-ingestion/Dockerfile +++ b/docker/datahub-ingestion/Dockerfile @@ -1,42 +1,27 @@ # Defining environment -ARG APP_ENV=prod +ARG APP_ENV=full +ARG BASE_IMAGE=acryldata/datahub-ingestion-base ARG DOCKER_VERSION=latest -FROM acryldata/datahub-ingestion-base:$DOCKER_VERSION as base - -FROM eclipse-temurin:11 as prod-build -COPY . /datahub-src -WORKDIR /datahub-src -# We noticed that the gradle wrapper download failed frequently on in CI on arm64 machines. -# I suspect this was due because of the QEMU emulation slowdown, combined with the arm64 -# build being starved for CPU by the x86_64 build's codegen step. -# -# The middle step will attempt to download gradle wrapper 5 times with exponential backoff. -# The ./gradlew --version will force the download of the gradle wrapper but is otherwise a no-op. -# Note that the retry logic will always return success, so we should always attempt to run codegen. 
-# Inspired by https://github.com/gradle/gradle/issues/18124#issuecomment-958182335. -# and https://unix.stackexchange.com/a/82610/378179. -# This is a workaround for https://github.com/gradle/gradle/issues/18124. -RUN (for attempt in 1 2 3 4 5; do ./gradlew --version && break ; echo "Failed to download gradle wrapper (attempt $attempt)" && sleep $((2<<$attempt)) ; done ) && \ - ./gradlew :metadata-events:mxe-schemas:build - -FROM base as prod-codegen -COPY --from=prod-build /datahub-src /datahub-src -RUN cd /datahub-src/metadata-ingestion && \ - pip install -e ".[base]" && \ - ./scripts/codegen.sh - -FROM base as prod-install -COPY --from=prod-codegen /datahub-src/metadata-ingestion /datahub-ingestion -COPY --from=prod-codegen /root/.cache/pip /root/.cache/pip +FROM $BASE_IMAGE:$DOCKER_VERSION as base +USER 0 + +COPY ./metadata-ingestion /datahub-ingestion + ARG RELEASE_VERSION -RUN cd /datahub-ingestion && \ - sed -i.bak "s/__version__ = \"0.0.0.dev0\"/__version__ = \"$RELEASE_VERSION\"/" src/datahub/__init__.py && \ +WORKDIR /datahub-ingestion +RUN sed -i.bak "s/__version__ = \"0.0.0.dev0\"/__version__ = \"$RELEASE_VERSION\"/" src/datahub/__init__.py && \ cat src/datahub/__init__.py && \ - pip install ".[all]" && \ - pip freeze && \ - # This is required to fix security vulnerability in htrace-core4 - rm -f /usr/local/lib/python3.10/site-packages/pyspark/jars/htrace-core4-4.1.0-incubating.jar + chown -R datahub /datahub-ingestion + +USER datahub +ENV PATH="/datahub-ingestion/.local/bin:$PATH" + +FROM base as slim-install +RUN pip install --no-cache --user ".[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary]" + +FROM base as full-install +RUN pip install --no-cache --user ".[all]" FROM base as dev-install # Dummy stage for development. Assumes code is built on your machine and mounted to this image. 
@@ -44,7 +29,5 @@ FROM base as dev-install FROM ${APP_ENV}-install as final -RUN addgroup --system datahub && adduser --system datahub --ingroup datahub USER datahub - -ENTRYPOINT [ "datahub" ] +ENV PATH="/datahub-ingestion/.local/bin:$PATH" diff --git a/docker/datahub-ingestion/build.gradle b/docker/datahub-ingestion/build.gradle index 7a24d87794c0e..22531c0c4fd0e 100644 --- a/docker/datahub-ingestion/build.gradle +++ b/docker/datahub-ingestion/build.gradle @@ -11,24 +11,30 @@ ext { docker_dir = 'datahub-ingestion' } +dependencies { + project(':docker:datahub-ingestion-base') + project(':metadata-ingestion') +} + docker { - name "${docker_registry}/${docker_repo}:v${version}" - version "v${version}" + name "${docker_registry}/${docker_repo}:v${version}-slim" + version "v${version}-slim" dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { include "docker/${docker_dir}/*" include "metadata-ingestion/**" - include "metadata-events/**" - include "metadata-models/**" - include "li-utils/**" - include "docs/**" - include "gradle/**" - include "buildSrc/**" - include "*" + }.exclude { + i -> i.file.isHidden() || + i.file == buildDir || + i.file == project(':metadata-ingestion').buildDir } - buildArgs([DOCKER_VERSION: version]) + buildArgs([DOCKER_VERSION: version, + RELEASE_VERSION: version.replace('-SNAPSHOT', '').replace('v', '').replace('-slim', ''), + APP_ENV: 'slim']) } -tasks.getByPath('docker').dependsOn(['build', ':docker:datahub-ingestion-base:docker']) +tasks.getByName('docker').dependsOn(['build', + ':docker:datahub-ingestion-base:docker', + ':metadata-ingestion:codegen']) task mkdirBuildDocker { doFirst { @@ -39,7 +45,7 @@ dockerClean.finalizedBy(mkdirBuildDocker) task cleanLocalDockerImages { doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}".toString()) + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } 
dockerClean.finalizedBy(cleanLocalDockerImages) \ No newline at end of file diff --git a/docker/docker-compose-with-cassandra.yml b/docker/docker-compose-with-cassandra.yml index 5ea364dd31ca7..08f8cc1ec9c45 100644 --- a/docker/docker-compose-with-cassandra.yml +++ b/docker/docker-compose-with-cassandra.yml @@ -26,6 +26,9 @@ services: hostname: actions image: ${DATAHUB_ACTIONS_IMAGE:-acryldata/datahub-actions}:${ACTIONS_VERSION:-head} env_file: datahub-actions/env/docker.env + environment: + - ACTIONS_EXTRA_PACKAGES=${ACTIONS_EXTRA_PACKAGES:-} + - ACTIONS_CONFIG=${ACTIONS_CONFIG:-} depends_on: datahub-gms: condition: service_healthy diff --git a/docker/docker-compose-without-neo4j.yml b/docker/docker-compose-without-neo4j.yml index 10b3f3c0eca5e..a755eda21cbf5 100644 --- a/docker/docker-compose-without-neo4j.yml +++ b/docker/docker-compose-without-neo4j.yml @@ -27,6 +27,9 @@ services: hostname: actions image: ${DATAHUB_ACTIONS_IMAGE:-acryldata/datahub-actions}:${ACTIONS_VERSION:-head} env_file: datahub-actions/env/docker.env + environment: + - ACTIONS_EXTRA_PACKAGES=${ACTIONS_EXTRA_PACKAGES:-} + - ACTIONS_CONFIG=${ACTIONS_CONFIG:-} depends_on: datahub-gms: condition: service_healthy diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 9228c11446ddf..d07ea5fa88f8b 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -26,6 +26,9 @@ services: hostname: actions image: ${DATAHUB_ACTIONS_IMAGE:-acryldata/datahub-actions}:${ACTIONS_VERSION:-head} env_file: datahub-actions/env/docker.env + environment: + - ACTIONS_EXTRA_PACKAGES=${ACTIONS_EXTRA_PACKAGES:-} + - ACTIONS_CONFIG=${ACTIONS_CONFIG:-} depends_on: datahub-gms: condition: service_healthy diff --git a/docker/elasticsearch-setup/build.gradle b/docker/elasticsearch-setup/build.gradle index cc2fe1ec5c4db..ffee3b9c65cf4 100644 --- a/docker/elasticsearch-setup/build.gradle +++ b/docker/elasticsearch-setup/build.gradle @@ -17,6 +17,8 @@ docker { files 
fileTree(rootProject.projectDir) { include "docker/${docker_dir}/*" include "metadata-service/restli-servlet-impl/src/main/resources/index/**" + }.exclude { + i -> i.file.isHidden() || i.file == buildDir } tag("Debug", "${docker_registry}/${docker_repo}:debug") @@ -25,7 +27,7 @@ docker { load(true) push(false) } -tasks.getByPath('docker').dependsOn('build') +tasks.getByName('docker').dependsOn('build') task mkdirBuildDocker { doFirst { @@ -36,7 +38,7 @@ dockerClean.finalizedBy(mkdirBuildDocker) task cleanLocalDockerImages { doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}".toString()) + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } dockerClean.finalizedBy(cleanLocalDockerImages) \ No newline at end of file diff --git a/docker/kafka-setup/Dockerfile b/docker/kafka-setup/Dockerfile index 8cf9d0869dc9b..5707234b85f57 100644 --- a/docker/kafka-setup/Dockerfile +++ b/docker/kafka-setup/Dockerfile @@ -1,5 +1,7 @@ +ARG KAFKA_DOCKER_VERSION=7.4.1 + # Using as a base image because to get the needed jars for confluent utils -FROM confluentinc/cp-base-new@sha256:ac4e0f9bcaecdab728740529f37452231fa40760fcf561759fc3b219f46d2cc9 as confluent_base +FROM confluentinc/cp-base-new:$KAFKA_DOCKER_VERSION as confluent_base ARG MAVEN_REPO="https://repo1.maven.org/maven2" ARG SNAKEYAML_VERSION="2.0" @@ -16,12 +18,6 @@ ENV SCALA_VERSION 2.13 # Set the classpath for JARs required by `cub` ENV CUB_CLASSPATH='"/usr/share/java/cp-base-new/*"' -# Confluent Docker Utils Version (Namely the tag or branch to grab from git to install) -ARG PYTHON_CONFLUENT_DOCKER_UTILS_VERSION="v0.0.60" - -# This can be overriden for an offline/air-gapped builds -ARG PYTHON_CONFLUENT_DOCKER_UTILS_INSTALL_SPEC="git+https://github.com/confluentinc/confluent-docker-utils@${PYTHON_CONFLUENT_DOCKER_UTILS_VERSION}" - LABEL name="kafka" version=${KAFKA_VERSION} RUN apk add --no-cache bash coreutils @@ -39,7 +35,6 @@ RUN mkdir -p /opt \ 
&& pip install --no-cache-dir --upgrade pip wheel setuptools \ && pip install jinja2 requests \ && pip install "Cython<3.0" "PyYAML<6" --no-build-isolation \ - && pip install --prefer-binary --prefix=/usr/local --upgrade "${PYTHON_CONFLUENT_DOCKER_UTILS_INSTALL_SPEC}" \ && rm -rf /tmp/* \ && apk del --purge .build-deps @@ -69,7 +64,8 @@ ENV USE_CONFLUENT_SCHEMA_REGISTRY="TRUE" COPY docker/kafka-setup/kafka-setup.sh ./kafka-setup.sh COPY docker/kafka-setup/kafka-config.sh ./kafka-config.sh COPY docker/kafka-setup/kafka-topic-workers.sh ./kafka-topic-workers.sh +COPY docker/kafka-setup/kafka-ready.sh ./kafka-ready.sh -RUN chmod +x ./kafka-setup.sh && chmod +x ./kafka-topic-workers.sh +RUN chmod +x ./kafka-setup.sh ./kafka-topic-workers.sh ./kafka-ready.sh CMD ./kafka-setup.sh diff --git a/docker/kafka-setup/build.gradle b/docker/kafka-setup/build.gradle index a5d33457e45f7..573ef21c88bf9 100644 --- a/docker/kafka-setup/build.gradle +++ b/docker/kafka-setup/build.gradle @@ -16,6 +16,8 @@ docker { dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { include "docker/${docker_dir}/*" + }.exclude { + i -> i.file.isHidden() || i.file == buildDir } tag("Debug", "${docker_registry}/${docker_repo}:debug") @@ -24,7 +26,7 @@ docker { load(true) push(false) } -tasks.getByPath('docker').dependsOn('build') +tasks.getByName('docker').dependsOn('build') task mkdirBuildDocker { doFirst { @@ -35,7 +37,7 @@ dockerClean.finalizedBy(mkdirBuildDocker) task cleanLocalDockerImages { doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}".toString()) + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } dockerClean.finalizedBy(cleanLocalDockerImages) diff --git a/docker/kafka-setup/kafka-ready.sh b/docker/kafka-setup/kafka-ready.sh new file mode 100755 index 0000000000000..ba87bde047ef5 --- /dev/null +++ b/docker/kafka-setup/kafka-ready.sh @@ -0,0 +1,14 @@ 
+#!/bin/bash + +for i in {1..60} +do + kafka-broker-api-versions.sh --command-config $CONNECTION_PROPERTIES_PATH --bootstrap-server $KAFKA_BOOTSTRAP_SERVER + if [ $? -eq 0 ]; then + break + fi + if [ $i -eq 60 ]; then + echo "Kafka bootstrap server $KAFKA_BOOTSTRAP_SERVER not ready." + exit 1 + fi + sleep 5s +done diff --git a/docker/kafka-setup/kafka-setup.sh b/docker/kafka-setup/kafka-setup.sh old mode 100644 new mode 100755 index 7b015421b7963..629e9bc9484ee --- a/docker/kafka-setup/kafka-setup.sh +++ b/docker/kafka-setup/kafka-setup.sh @@ -49,8 +49,8 @@ if [[ -n "$KAFKA_PROPERTIES_SASL_CLIENT_CALLBACK_HANDLER_CLASS" ]]; then echo "sasl.client.callback.handler.class=$KAFKA_PROPERTIES_SASL_CLIENT_CALLBACK_HANDLER_CLASS" >> $CONNECTION_PROPERTIES_PATH fi -cub kafka-ready -c $CONNECTION_PROPERTIES_PATH -b $KAFKA_BOOTSTRAP_SERVER 1 180 - +# cub kafka-ready -c $CONNECTION_PROPERTIES_PATH -b $KAFKA_BOOTSTRAP_SERVER 1 180 +. kafka-ready.sh ############################################################ # Start Topic Creation Logic diff --git a/docker/mysql-setup/build.gradle b/docker/mysql-setup/build.gradle index 48a28f15a581d..0d8941cce4833 100644 --- a/docker/mysql-setup/build.gradle +++ b/docker/mysql-setup/build.gradle @@ -17,6 +17,8 @@ docker { dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { include "docker/${docker_dir}/*" + }.exclude { + i -> i.file.isHidden() || i.file == buildDir } tag("Debug", "${docker_registry}/${docker_repo}:debug") @@ -25,7 +27,7 @@ docker { load(true) push(false) } -tasks.getByPath('docker').dependsOn('build') +tasks.getByName('docker').dependsOn('build') task mkdirBuildDocker { doFirst { @@ -36,7 +38,7 @@ dockerClean.finalizedBy(mkdirBuildDocker) task cleanLocalDockerImages { doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}") + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } 
dockerClean.finalizedBy(cleanLocalDockerImages) diff --git a/docker/postgres-setup/build.gradle b/docker/postgres-setup/build.gradle index a5b0413ec4be8..8a026be09d2b4 100644 --- a/docker/postgres-setup/build.gradle +++ b/docker/postgres-setup/build.gradle @@ -17,6 +17,8 @@ docker { dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { include "docker/${docker_dir}/*" + }.exclude { + i -> i.file.isHidden() || i.file == buildDir } tag("Debug", "${docker_registry}/${docker_repo}:debug") @@ -25,7 +27,7 @@ docker { load(true) push(false) } -tasks.getByPath('docker').dependsOn('build') +tasks.getByName('docker').dependsOn('build') task mkdirBuildDocker { doFirst { @@ -36,7 +38,7 @@ dockerClean.finalizedBy(mkdirBuildDocker) task cleanLocalDockerImages { doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}") + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } dockerClean.finalizedBy(cleanLocalDockerImages) diff --git a/docker/quickstart/docker-compose-m1.quickstart.yml b/docker/quickstart/docker-compose-m1.quickstart.yml index 5a8edd6eacf19..38418bc8c41b9 100644 --- a/docker/quickstart/docker-compose-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-m1.quickstart.yml @@ -34,6 +34,8 @@ services: datahub-gms: condition: service_healthy environment: + - ACTIONS_CONFIG=${ACTIONS_CONFIG:-} + - ACTIONS_EXTRA_PACKAGES=${ACTIONS_EXTRA_PACKAGES:-} - DATAHUB_GMS_HOST=datahub-gms - DATAHUB_GMS_PORT=8080 - DATAHUB_GMS_PROTOCOL=http diff --git a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml index 6d51f2efcfcf2..cf879faa6a3f0 100644 --- a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml @@ -34,6 +34,8 @@ services: datahub-gms: condition: service_healthy environment: + - 
ACTIONS_CONFIG=${ACTIONS_CONFIG:-} + - ACTIONS_EXTRA_PACKAGES=${ACTIONS_EXTRA_PACKAGES:-} - DATAHUB_GMS_HOST=datahub-gms - DATAHUB_GMS_PORT=8080 - DATAHUB_GMS_PROTOCOL=http diff --git a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml index 48f2d797bd8a4..007830078d2b4 100644 --- a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml @@ -34,6 +34,8 @@ services: datahub-gms: condition: service_healthy environment: + - ACTIONS_CONFIG=${ACTIONS_CONFIG:-} + - ACTIONS_EXTRA_PACKAGES=${ACTIONS_EXTRA_PACKAGES:-} - DATAHUB_GMS_HOST=datahub-gms - DATAHUB_GMS_PORT=8080 - DATAHUB_GMS_PROTOCOL=http diff --git a/docker/quickstart/docker-compose.quickstart.yml b/docker/quickstart/docker-compose.quickstart.yml index bd30c359a2a76..390543b92123f 100644 --- a/docker/quickstart/docker-compose.quickstart.yml +++ b/docker/quickstart/docker-compose.quickstart.yml @@ -34,6 +34,8 @@ services: datahub-gms: condition: service_healthy environment: + - ACTIONS_CONFIG=${ACTIONS_CONFIG:-} + - ACTIONS_EXTRA_PACKAGES=${ACTIONS_EXTRA_PACKAGES:-} - DATAHUB_GMS_HOST=datahub-gms - DATAHUB_GMS_PORT=8080 - DATAHUB_GMS_PROTOCOL=http diff --git a/docs/advanced/no-code-modeling.md b/docs/advanced/no-code-modeling.md index e1fadee6d371a..9c8f6761a62bc 100644 --- a/docs/advanced/no-code-modeling.md +++ b/docs/advanced/no-code-modeling.md @@ -211,7 +211,7 @@ record ServiceKey { * Name of the service */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true } name: string diff --git a/docs/lineage/airflow.md b/docs/lineage/airflow.md index ef4071f89c585..21d59b777dd7c 100644 --- a/docs/lineage/airflow.md +++ b/docs/lineage/airflow.md @@ -62,6 +62,7 @@ lazy_load_plugins = False | datahub.cluster | prod | name of the airflow cluster | | datahub.capture_ownership_info | true | If true, the owners field of the DAG 
will be capture as a DataHub corpuser. | | datahub.capture_tags_info | true | If true, the tags field of the DAG will be captured as DataHub tags. | + | datahub.capture_executions | true | If true, we'll capture task runs in DataHub in addition to DAG definitions. | | datahub.graceful_exceptions | true | If set to true, most runtime errors in the lineage backend will be suppressed and will not cause the overall task to fail. Note that configuration issues will still throw exceptions. | 5. Configure `inlets` and `outlets` for your Airflow operators. For reference, look at the sample DAG in [`lineage_backend_demo.py`](../../metadata-ingestion/src/datahub_provider/example_dags/lineage_backend_demo.py), or reference [`lineage_backend_taskflow_demo.py`](../../metadata-ingestion/src/datahub_provider/example_dags/lineage_backend_taskflow_demo.py) if you're using the [TaskFlow API](https://airflow.apache.org/docs/apache-airflow/stable/concepts/taskflow.html). @@ -80,9 +81,7 @@ Emitting DataHub ... If you have created a custom Airflow operator [docs](https://airflow.apache.org/docs/apache-airflow/stable/howto/custom-operator.html) that inherits from the BaseOperator class, when overriding the `execute` function, set inlets and outlets via `context['ti'].task.inlets` and `context['ti'].task.outlets`. -The DataHub Airflow plugin will then pick up those inlets and outlets after the task runs. - - +The DataHub Airflow plugin will then pick up those inlets and outlets after the task runs. ```python class DbtOperator(BaseOperator): @@ -97,8 +96,8 @@ class DbtOperator(BaseOperator): def _get_lineage(self): # Do some processing to get inlets/outlets - - return inlets, outlets + + return inlets, outlets ``` If you override the `pre_execute` and `post_execute` function, ensure they include the `@prepare_lineage` and `@apply_lineage` decorators respectively. 
[source](https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/lineage.html#lineage) @@ -172,7 +171,6 @@ Take a look at this sample DAG: In order to use this example, you must first configure the Datahub hook. Like in ingestion, we support a Datahub REST hook and a Kafka-based hook. See step 1 above for details. - ## Debugging ### Incorrect URLs diff --git a/docs/modeling/extending-the-metadata-model.md b/docs/modeling/extending-the-metadata-model.md index 32951ab2e41eb..f47630f44e772 100644 --- a/docs/modeling/extending-the-metadata-model.md +++ b/docs/modeling/extending-the-metadata-model.md @@ -323,7 +323,7 @@ It takes the following parameters: annotations. To customize the set of analyzers used to index a certain field, you must add a new field type and define the set of mappings to be applied in the MappingsBuilder. - Thus far, we have implemented 10 fieldTypes: + Thus far, we have implemented 11 fieldTypes: 1. *KEYWORD* - Short text fields that only support exact matches, often used only for filtering @@ -332,20 +332,25 @@ It takes the following parameters: 3. *TEXT_PARTIAL* - Text fields delimited by spaces/slashes/periods with partial matching support. Note, partial matching is expensive, so this field type should not be applied to fields with long values (like description) - 4. *BROWSE_PATH* - Field type for browse paths. Applies specific mappings for slash delimited paths. + 4. *WORD_GRAM* - Text fields delimited by spaces, slashes, periods, dashes, or underscores with partial matching AND + word gram support. That is, the text will be split by the delimiters and can be matched with delimited queries + matching two, three, or four length tokens in addition to single tokens. As with partial match, this type is + expensive, so should not be applied to fields with long values such as description. - 5. *URN* - Urn fields where each sub-component inside the urn is indexed. For instance, for a data platform urn like + 5. 
*BROWSE_PATH* - Field type for browse paths. Applies specific mappings for slash delimited paths. + + 6. *URN* - Urn fields where each sub-component inside the urn is indexed. For instance, for a data platform urn like "urn:li:dataplatform:kafka", it will index the platform name "kafka" and ignore the common components - 6. *URN_PARTIAL* - Urn fields where each sub-component inside the urn is indexed with partial matching support. + 7. *URN_PARTIAL* - Urn fields where each sub-component inside the urn is indexed with partial matching support. - 7. *BOOLEAN* - Boolean fields used for filtering. + 8. *BOOLEAN* - Boolean fields used for filtering. - 8. *COUNT* - Count fields used for filtering. + 9. *COUNT* - Count fields used for filtering. - 9. *DATETIME* - Datetime fields used to represent timestamps. + 10. *DATETIME* - Datetime fields used to represent timestamps. - 10. *OBJECT* - Each property in an object will become an extra column in Elasticsearch and can be referenced as + 11. *OBJECT* - Each property in an object will become an extra column in Elasticsearch and can be referenced as `field.property` in queries. You should be careful to not use it on objects with many properties as it can cause a mapping explosion in Elasticsearch. diff --git a/docs/quickstart.md b/docs/quickstart.md index b93713c4efa5c..cd91dc8d1ac84 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -145,6 +145,27 @@ Please refer to [Change the default user datahub in quickstart](authentication/c We recommend deploying DataHub to production using Kubernetes. We provide helpful [Helm Charts](https://artifacthub.io/packages/helm/datahub/datahub) to help you quickly get up and running. Check out [Deploying DataHub to Kubernetes](./deploy/kubernetes.md) for a step-by-step walkthrough. +The `quickstart` method of running DataHub is intended for local development and a quick way to experience the features that DataHub has to offer. It is not +intended for a production environment. 
This recommendation is based on the following points. + +#### Default Credentials + +`quickstart` uses docker-compose configuration which includes default credentials for both DataHub, and its underlying +prerequisite data stores, such as MySQL. Additionally, other components are unauthenticated out of the box. This is a +design choice to make development easier and is not best practice for a production environment. + +#### Exposed Ports + +DataHub's services, and its backend data stores use the docker default behavior of binding to all interface addresses. +This makes it useful for development but is not recommended in a production environment. + +#### Performance & Management + +* `quickstart` is limited by the resources available on a single host, there is no ability to scale horizontally. +* Rollout of new versions requires downtime. +* The configuration is largely pre-determined and not easily managed. +* `quickstart`, by default, follows the most recent builds forcing updates to the latest released and unreleased builds. + ## Other Common Operations ### Stopping DataHub diff --git a/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java b/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java index f2e65c771c6eb..3d3fbcf3ccaa6 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java @@ -21,7 +21,7 @@ public class SearchableAnnotation { public static final String ANNOTATION_NAME = "Searchable"; private static final Set DEFAULT_QUERY_FIELD_TYPES = - ImmutableSet.of(FieldType.TEXT, FieldType.TEXT_PARTIAL, FieldType.URN, FieldType.URN_PARTIAL); + ImmutableSet.of(FieldType.TEXT, FieldType.TEXT_PARTIAL, FieldType.WORD_GRAM, FieldType.URN, FieldType.URN_PARTIAL); // Name of the field in the search index.
Defaults to the field name in the schema String fieldName; @@ -59,7 +59,8 @@ public enum FieldType { COUNT, DATETIME, OBJECT, - BROWSE_PATH_V2 + BROWSE_PATH_V2, + WORD_GRAM } @Nonnull diff --git a/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java b/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java index 1ab5ff640ce32..3618108970afa 100644 --- a/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java +++ b/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java @@ -142,7 +142,7 @@ private void validateTestEntityInfo(final AspectSpec testEntityInfo) { assertEquals(new TestEntityInfo().schema().getFullName(), testEntityInfo.getPegasusSchema().getFullName()); // Assert on Searchable Fields - assertEquals(9, testEntityInfo.getSearchableFieldSpecs().size()); + assertEquals(testEntityInfo.getSearchableFieldSpecs().size(), 10); assertEquals("customProperties", testEntityInfo.getSearchableFieldSpecMap().get( new PathSpec("customProperties").toString()).getSearchableAnnotation().getFieldName()); assertEquals(SearchableAnnotation.FieldType.KEYWORD, testEntityInfo.getSearchableFieldSpecMap().get( @@ -158,6 +158,11 @@ private void validateTestEntityInfo(final AspectSpec testEntityInfo) { assertEquals(SearchableAnnotation.FieldType.TEXT_PARTIAL, testEntityInfo.getSearchableFieldSpecMap().get( new PathSpec("textArrayField", "*").toString()) .getSearchableAnnotation().getFieldType()); + assertEquals("wordGramField", testEntityInfo.getSearchableFieldSpecMap().get( + new PathSpec("wordGramField").toString()).getSearchableAnnotation().getFieldName()); + assertEquals(SearchableAnnotation.FieldType.WORD_GRAM, testEntityInfo.getSearchableFieldSpecMap().get( + new PathSpec("wordGramField").toString()) + .getSearchableAnnotation().getFieldType()); assertEquals("nestedIntegerField", testEntityInfo.getSearchableFieldSpecMap().get( new 
PathSpec("nestedRecordField", "nestedIntegerField").toString()).getSearchableAnnotation().getFieldName()); assertEquals(SearchableAnnotation.FieldType.COUNT, testEntityInfo.getSearchableFieldSpecMap().get( diff --git a/gradle/docker/docker.gradle b/gradle/docker/docker.gradle index f0bb4a5500b33..db2979a8ff6dc 100644 --- a/gradle/docker/docker.gradle +++ b/gradle/docker/docker.gradle @@ -21,6 +21,7 @@ ext.getDockerContainers = { ext.cleanLocalDockerImages = { String docker_registry, String docker_repo, String docker_tag -> + println("Docker image string: ${docker_registry}/${docker_repo}:${docker_tag}") def containers = getDockerContainers(docker_registry, docker_repo, docker_tag) if(!containers.isEmpty()) { println "Stopping containers: $containers" @@ -35,6 +36,7 @@ ext.cleanLocalDockerImages = { if(!images.isEmpty()) { println "Removing images: $images" exec { + ignoreExitValue true // may not work if used by downstream image commandLine = ["docker", "rmi", "-f"] + images } } diff --git a/metadata-dao-impl/kafka-producer/build.gradle b/metadata-dao-impl/kafka-producer/build.gradle index 5b40eb5f32232..6b08ac50a4c17 100644 --- a/metadata-dao-impl/kafka-producer/build.gradle +++ b/metadata-dao-impl/kafka-producer/build.gradle @@ -23,5 +23,8 @@ dependencies { implementation(externalDependency.log4jApi) { because("previous versions are vulnerable to CVE-2021-45105") } + implementation(externalDependency.snappy) { + because("previous versions are vulnerable to CVE-2023-34453 through CVE-2023-34455") + } } } \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/kafka-connect/kafka-connect.md b/metadata-ingestion/docs/sources/kafka-connect/kafka-connect.md index 9d400460407c8..03bcef70e1860 100644 --- a/metadata-ingestion/docs/sources/kafka-connect/kafka-connect.md +++ b/metadata-ingestion/docs/sources/kafka-connect/kafka-connect.md @@ -1,5 +1,60 @@ ## Advanced Configurations +### Working with Platform Instances +If you've multiple instances of kafka 
OR source/sink systems that are referred in your `kafka-connect` setup, you'd need to configure platform instance for these systems in `kafka-connect` recipe to generate correct lineage edges. You must have already set `platform_instance` in recipes of original source/sink systems. Refer to the document [Working with Platform Instances](https://datahubproject.io/docs/platform-instances) to understand more about this. + +There are two options available to declare source/sink system's `platform_instance` in `kafka-connect` recipe. If single instance of platform is used across all `kafka-connect` connectors, you can use `platform_instance_map` to specify platform_instance to use for a platform when constructing URNs for lineage. + +Example: +```yml + # Map of platform name to platform instance + platform_instance_map: + snowflake: snowflake_platform_instance + mysql: mysql_platform_instance + +``` +If multiple instances of platform are used across `kafka-connect` connectors, you'd need to specify platform_instance to use for platform for every connector. + +#### Example - Multiple MySQL Source Connectors each reading from different mysql instance +```yml + # Map of platform name to platform instance per connector + connect_to_platform_map: + mysql_connector1: + mysql: mysql_instance1 + + mysql_connector2: + mysql: mysql_instance2 +``` +Here mysql_connector1 and mysql_connector2 are names of MySQL source connectors as defined in `kafka-connect` connector config. + +#### Example - Multiple MySQL Source Connectors each reading from different mysql instance and writing to different kafka cluster +```yml + connect_to_platform_map: + mysql_connector1: + mysql: mysql_instance1 + kafka: kafka_instance1 + + mysql_connector2: + mysql: mysql_instance2 + kafka: kafka_instance2 +``` +You can also use a combination of `platform_instance_map` and `connect_to_platform_map` in your recipe.
Note that, the platform_instance specified for the connector in `connect_to_platform_map` will always take higher precedence even if platform_instance for same platform is set in `platform_instance_map`. + +If you do not use `platform_instance` in original source/sink recipes, you do not need to specify them in above configurations. + +Note that, you do not need to specify platform_instance for BigQuery. + +#### Example - Multiple BigQuery Sink Connectors each writing to different kafka cluster +```yml + connect_to_platform_map: + bigquery_connector1: + kafka: kafka_instance1 + + bigquery_connector2: + kafka: kafka_instance2 +``` + +### Provided Configurations from External Sources Kafka Connect supports pluggable configuration providers which can load configuration data from external sources at runtime. These values are not available to DataHub ingestion source through Kafka Connect APIs. If you are using such provided configurations to specify connection url (database, etc) in Kafka Connect connector configuration then you will need also add these in `provided_configs` section in recipe for DataHub to generate correct lineage. ```yml diff --git a/metadata-ingestion/docs/sources/kafka-connect/kafka-connect_recipe.yml b/metadata-ingestion/docs/sources/kafka-connect/kafka-connect_recipe.yml index f5e33e661622d..cacbda5ca078a 100644 --- a/metadata-ingestion/docs/sources/kafka-connect/kafka-connect_recipe.yml +++ b/metadata-ingestion/docs/sources/kafka-connect/kafka-connect_recipe.yml @@ -3,14 +3,16 @@ source: config: # Coordinates connect_uri: "http://localhost:8083" - + # Credentials username: admin password: password # Optional - platform_instance_map: - bigquery: bigquery_platform_instance_id - + # Platform instance mapping to use when constructing URNs. + # Use if single instance of platform is referred across connectors.
+ platform_instance_map: + mysql: mysql_platform_instance + sink: - # sink configs \ No newline at end of file + # sink configs diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 84c520e689fbd..00600ee0055c7 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -269,6 +269,8 @@ def get_long_description(): "requests", } +mysql = sql_common | {"pymysql>=1.0.2"} + # Note: for all of these, framework_common will be added. plugins: Dict[str, Set[str]] = { # Sink plugins. @@ -288,6 +290,7 @@ def get_long_description(): "gql>=3.3.0", "gql[requests]>=3.3.0", }, + "datahub": mysql | kafka_common, "great-expectations": sql_common | sqllineage_lib, # Misc plugins. "sql-parser": sqlglot_lib, @@ -342,7 +345,7 @@ def get_long_description(): }, "iceberg": iceberg_common, "json-schema": set(), - "kafka": {*kafka_common, *kafka_protobuf}, + "kafka": kafka_common | kafka_protobuf, "kafka-connect": sql_common | {"requests", "JPype1"}, "ldap": {"python-ldap>=2.4"}, "looker": looker_common, @@ -352,10 +355,10 @@ def get_long_description(): "mongodb": {"pymongo[srv]>=3.11", "packaging"}, "mssql": sql_common | {"sqlalchemy-pytds>=0.3"}, "mssql-odbc": sql_common | {"pyodbc"}, - "mysql": sql_common | {"pymysql>=1.0.2"}, + "mysql": mysql, # mariadb should have same dependency as mysql "mariadb": sql_common | {"pymysql>=1.0.2"}, - "okta": {"okta~=1.7.0"}, + "okta": {"okta~=1.7.0", "nest-asyncio"}, "oracle": sql_common | {"cx_Oracle"}, "postgres": sql_common | {"psycopg2-binary", "GeoAlchemy2"}, "presto": sql_common | pyhive_common | trino, @@ -452,7 +455,7 @@ def get_long_description(): "mypy==1.0.0", # pydantic 1.8.2 is incompatible with mypy 0.910. # See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910. 
- "pydantic>=1.9.0", + "pydantic>=1.10.0", *test_api_requirements, pytest_dep, "pytest-asyncio>=0.16.0", @@ -549,6 +552,7 @@ def get_long_description(): "datahub.ingestion.source.plugins": [ "csv-enricher = datahub.ingestion.source.csv_enricher:CSVEnricherSource", "file = datahub.ingestion.source.file:GenericFileSource", + "datahub = datahub.ingestion.source.datahub.datahub_source:DataHubSource", "sqlalchemy = datahub.ingestion.source.sql.sql_generic:SQLAlchemyGenericSource", "athena = datahub.ingestion.source.sql.athena:AthenaSource", "azure-ad = datahub.ingestion.source.identity.azure_ad:AzureADSource", diff --git a/metadata-ingestion/src/datahub/cli/docker_cli.py b/metadata-ingestion/src/datahub/cli/docker_cli.py index 918f610ce4635..9fde47c82873c 100644 --- a/metadata-ingestion/src/datahub/cli/docker_cli.py +++ b/metadata-ingestion/src/datahub/cli/docker_cli.py @@ -893,6 +893,7 @@ def download_compose_files( tmp_file.write(quickstart_download_response.content) logger.debug(f"Copied to {path}") if kafka_setup: + base_url = get_docker_compose_base_url(compose_git_ref) kafka_setup_github_file = f"{base_url}/{KAFKA_SETUP_QUICKSTART_COMPOSE_FILE}" default_kafka_compose_file = ( diff --git a/metadata-ingestion/src/datahub/configuration/datetimes.py b/metadata-ingestion/src/datahub/configuration/datetimes.py index 55f5c6fbd6155..41af7565593d9 100644 --- a/metadata-ingestion/src/datahub/configuration/datetimes.py +++ b/metadata-ingestion/src/datahub/configuration/datetimes.py @@ -43,24 +43,28 @@ def parse_user_datetime(input: str) -> datetime: # Then try parsing as a relative time. with contextlib.suppress(humanfriendly.InvalidTimespan): - delta = _parse_relative_timespan(input) + delta = parse_relative_timespan(input) return datetime.now(tz=timezone.utc) + delta # Finally, try parsing as an absolute time. 
with contextlib.suppress(dateutil.parser.ParserError): - dt = dateutil.parser.parse(input) - if dt.tzinfo is None: - # Assume that the user meant to specify a time in UTC. - dt = dt.replace(tzinfo=timezone.utc) - else: - # Convert to UTC. - dt = dt.astimezone(timezone.utc) - return dt + return parse_absolute_time(input) raise ValueError(f"Could not parse {input} as a datetime or relative time.") -def _parse_relative_timespan(input: str) -> timedelta: +def parse_absolute_time(input: str) -> datetime: + dt = dateutil.parser.parse(input) + if dt.tzinfo is None: + # Assume that the user meant to specify a time in UTC. + dt = dt.replace(tzinfo=timezone.utc) + else: + # Convert to UTC. + dt = dt.astimezone(timezone.utc) + return dt + + +def parse_relative_timespan(input: str) -> timedelta: neg = False input = input.strip() diff --git a/metadata-ingestion/src/datahub/configuration/time_window_config.py b/metadata-ingestion/src/datahub/configuration/time_window_config.py index a4b451f0cdfbd..1bf992952759b 100644 --- a/metadata-ingestion/src/datahub/configuration/time_window_config.py +++ b/metadata-ingestion/src/datahub/configuration/time_window_config.py @@ -2,10 +2,12 @@ from datetime import datetime, timedelta, timezone from typing import Any, Dict, List +import humanfriendly import pydantic from pydantic.fields import Field from datahub.configuration.common import ConfigModel +from datahub.configuration.datetimes import parse_absolute_time, parse_relative_timespan from datahub.metadata.schema_classes import CalendarIntervalClass @@ -42,25 +44,42 @@ class BaseTimeWindowConfig(ConfigModel): # if those fields are not set by the user. end_time: datetime = Field( default_factory=lambda: datetime.now(tz=timezone.utc), - description="Latest date of usage to consider. Default: Current time in UTC", + description="Latest date of lineage/usage to consider. 
Default: Current time in UTC", ) - start_time: datetime = Field(default=None, description="Earliest date of usage to consider. Default: Last full day in UTC (or hour, depending on `bucket_duration`)") # type: ignore + start_time: datetime = Field(default=None, description="Earliest date of lineage/usage to consider. Default: Last full day in UTC (or hour, depending on `bucket_duration`). You can also specify relative time with respect to end_time such as '-7 days' Or '-7d'.") # type: ignore @pydantic.validator("start_time", pre=True, always=True) def default_start_time( - cls, v: Any, *, values: Dict[str, Any], **kwargs: Any + cls, v: Any, values: Dict[str, Any], **kwargs: Any ) -> datetime: - return v or get_time_bucket( - values["end_time"] - get_bucket_duration_delta(values["bucket_duration"]), - values["bucket_duration"], - ) + if v is None: + return get_time_bucket( + values["end_time"] + - get_bucket_duration_delta(values["bucket_duration"]), + values["bucket_duration"], + ) + elif isinstance(v, str): + # This is where start_time str is resolved to datetime + try: + delta = parse_relative_timespan(v) + assert delta < timedelta( + 0 + ), "Relative start time should start with minus sign (-) e.g. '-2 days'." + assert abs(delta) > get_bucket_duration_delta( + values["bucket_duration"] + ), "Relative start time should be in terms of configured bucket duration. e.g '-2 days' or '-2 hours'." + return values["end_time"] + delta + except humanfriendly.InvalidTimespan: + return parse_absolute_time(v) + + return v @pydantic.validator("start_time", "end_time") def ensure_timestamps_in_utc(cls, v: datetime) -> datetime: - if v.tzinfo != timezone.utc: - raise ValueError( - 'timezone is not UTC; try adding a "Z" to the value e.g. "2021-07-20T00:00:00Z"' - ) + assert ( + v.tzinfo == timezone.utc + ), 'timezone is not UTC; try adding a "Z" to the value e.g. 
"2021-07-20T00:00:00Z"' + return v def buckets(self) -> List[datetime]: diff --git a/metadata-ingestion/src/datahub/emitter/aspect.py b/metadata-ingestion/src/datahub/emitter/aspect.py index 9118967a07273..0be2b3336980c 100644 --- a/metadata-ingestion/src/datahub/emitter/aspect.py +++ b/metadata-ingestion/src/datahub/emitter/aspect.py @@ -1,10 +1,12 @@ -from datahub.metadata.schema_classes import ASPECT_CLASSES +from typing import Dict, Type -ASPECT_MAP = { +from datahub.metadata.schema_classes import ASPECT_CLASSES, _Aspect + +ASPECT_MAP: Dict[str, Type[_Aspect]] = { AspectClass.get_aspect_name(): AspectClass for AspectClass in ASPECT_CLASSES } -TIMESERIES_ASPECT_MAP = { +TIMESERIES_ASPECT_MAP: Dict[str, Type[_Aspect]] = { name: klass for name, klass in ASPECT_MAP.items() if klass.get_aspect_type() == "timeseries" diff --git a/metadata-ingestion/src/datahub/emitter/mcp.py b/metadata-ingestion/src/datahub/emitter/mcp.py index 6f9a22bffd085..9085ac152ea0b 100644 --- a/metadata-ingestion/src/datahub/emitter/mcp.py +++ b/metadata-ingestion/src/datahub/emitter/mcp.py @@ -9,6 +9,7 @@ DictWrapper, GenericAspectClass, KafkaAuditHeaderClass, + MetadataChangeLogClass, MetadataChangeProposalClass, SystemMetadataClass, _Aspect, @@ -214,6 +215,22 @@ def try_from_mcpc( else: return None + @classmethod + def try_from_mcl( + cls, mcl: MetadataChangeLogClass + ) -> Union["MetadataChangeProposalWrapper", MetadataChangeProposalClass]: + mcpc = MetadataChangeProposalClass( + entityUrn=mcl.entityUrn, + entityType=mcl.entityType, + entityKeyAspect=mcl.entityKeyAspect, + aspect=mcl.aspect, + aspectName=mcl.aspectName, + changeType=mcl.changeType, + auditHeader=mcl.auditHeader, + systemMetadata=mcl.systemMetadata, + ) + return cls.try_from_mcpc(mcpc) or mcpc + @classmethod def from_obj_require_wrapper( cls, obj: dict, tuples: bool = False diff --git a/metadata-ingestion/src/datahub/ingestion/api/registry.py b/metadata-ingestion/src/datahub/ingestion/api/registry.py index 
ec4884e7e805f..56ea716948199 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/registry.py +++ b/metadata-ingestion/src/datahub/ingestion/api/registry.py @@ -127,7 +127,7 @@ def _ensure_not_lazy(self, key: str) -> Union[Type[T], Exception]: plugin_class = import_path(path) self.register(key, plugin_class, override=True) return plugin_class - except (AssertionError, ImportError) as e: + except Exception as e: self.register_disabled(key, e, override=True) return e diff --git a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py index 0eabd22e77334..7fc15cf829678 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py +++ b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py @@ -314,7 +314,7 @@ def auto_empty_dataset_usage_statistics( logger.warning( f"Usage statistics with unexpected timestamps, bucket_duration={config.bucket_duration}:\n" ", ".join( - str(datetime.fromtimestamp(ts, tz=timezone.utc)) + str(datetime.fromtimestamp(ts / 1000, tz=timezone.utc)) for ts in invalid_timestamps ) ) diff --git a/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py b/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py index 8e313e92cbf84..c943b83a887ed 100644 --- a/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py +++ b/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py @@ -435,6 +435,7 @@ def _field_from_complex_type( field_path._set_parent_type_if_not_exists( DataHubType(type=MapTypeClass, nested_type=value_type) ) + # FIXME: description not set. This is present in schema["description"]. 
yield from JsonSchemaTranslator.get_fields( JsonSchemaTranslator._get_type_from_schema( schema["additionalProperties"] diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index d1f39a3ba1ba6..7725d63ce0e1e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -129,6 +129,7 @@ # Handle table snapshots # See https://cloud.google.com/bigquery/docs/table-snapshots-intro. SNAPSHOT_TABLE_REGEX = re.compile(r"^(.+)@(\d{13})$") +CLUSTERING_COLUMN_TAG = "CLUSTERING_COLUMN" # We can't use close as it is not called if the ingestion is not successful @@ -1151,6 +1152,21 @@ def gen_schema_fields(self, columns: List[BigqueryColumn]) -> List[SchemaField]: field.description = col.comment schema_fields[idx] = field else: + tags = [] + if col.is_partition_column: + tags.append( + TagAssociationClass(make_tag_urn(Constants.TAG_PARTITION_KEY)) + ) + + if col.cluster_column_position is not None: + tags.append( + TagAssociationClass( + make_tag_urn( + f"{CLUSTERING_COLUMN_TAG}_{col.cluster_column_position}" + ) + ) + ) + field = SchemaField( fieldPath=col.name, type=SchemaFieldDataType( @@ -1160,15 +1176,7 @@ def gen_schema_fields(self, columns: List[BigqueryColumn]) -> List[SchemaField]: nativeDataType=col.data_type, description=col.comment, nullable=col.is_nullable, - globalTags=GlobalTagsClass( - tags=[ - TagAssociationClass( - make_tag_urn(Constants.TAG_PARTITION_KEY) - ) - ] - ) - if col.is_partition_column - else GlobalTagsClass(tags=[]), + globalTags=GlobalTagsClass(tags=tags), ) schema_fields.append(field) last_id = col.ordinal_position diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index 7287dc1b67d73..e5730ee87daf4 100644 --- 
a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -8,7 +8,7 @@ from datahub.configuration.common import AllowDenyPattern from datahub.configuration.validate_field_removal import pydantic_removed_field -from datahub.ingestion.source.sql.sql_config import SQLAlchemyConfig +from datahub.ingestion.source.sql.sql_config import SQLCommonConfig from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulLineageConfigMixin, StatefulProfilingConfigMixin, @@ -37,7 +37,7 @@ class BigQueryUsageConfig(BaseUsageConfig): class BigQueryV2Config( BigQueryBaseConfig, - SQLAlchemyConfig, + SQLCommonConfig, StatefulUsageConfigMixin, StatefulLineageConfigMixin, StatefulProfilingConfigMixin, diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py index 2450dbd0e2391..f8256f8e6fed6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py @@ -33,6 +33,7 @@ class BigqueryTableType: class BigqueryColumn(BaseColumn): field_path: str is_partition_column: bool + cluster_column_position: Optional[int] RANGE_PARTITION_NAME: str = "RANGE" @@ -285,7 +286,8 @@ class BigqueryQuery: CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type, description as comment, c.is_hidden as is_hidden, - c.is_partitioning_column as is_partitioning_column + c.is_partitioning_column as is_partitioning_column, + c.clustering_ordinal_position as clustering_ordinal_position, from `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMNS c join `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS as cfp on cfp.table_name = c.table_name @@ -307,6 +309,7 @@ class BigqueryQuery: description as comment, 
c.is_hidden as is_hidden, c.is_partitioning_column as is_partitioning_column, + c.clustering_ordinal_position as clustering_ordinal_position, -- We count the columns to be able limit it later row_number() over (partition by c.table_catalog, c.table_schema, c.table_name order by c.ordinal_position asc, c.data_type DESC) as column_num, -- Getting the maximum shard for each table @@ -333,6 +336,7 @@ class BigqueryQuery: CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type, c.is_hidden as is_hidden, c.is_partitioning_column as is_partitioning_column, + c.clustering_ordinal_position as clustering_ordinal_position, description as comment from `{table_identifier.project_id}`.`{table_identifier.dataset}`.INFORMATION_SCHEMA.COLUMNS as c @@ -583,6 +587,7 @@ def get_columns_for_dataset( data_type=column.data_type, comment=column.comment, is_partition_column=column.is_partitioning_column == "YES", + cluster_column_position=column.clustering_ordinal_position, ) ) @@ -621,6 +626,7 @@ def get_columns_for_table( data_type=column.data_type, comment=column.comment, is_partition_column=column.is_partitioning_column == "YES", + cluster_column_position=column.clustering_ordinal_position, ) ) last_seen_table = column.table_name diff --git a/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py b/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py index dcaec4e45737f..0bdcb115b377c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py +++ b/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py @@ -113,7 +113,7 @@ def get_schema_str_replace_confluent_ref_avro( schema_seen = set() schema_str = self._compact_schema(schema.schema_str) for schema_ref in schema.references: - ref_subject = schema_ref["subject"] + ref_subject = schema_ref.subject if ref_subject in schema_seen: continue @@ -132,7 +132,7 @@ def get_schema_str_replace_confluent_ref_avro( # 
Replace only external type references with the reference schema recursively. # NOTE: The type pattern is dependent on _compact_schema. avro_type_kwd = '"type"' - ref_name = schema_ref["name"] + ref_name = schema_ref.name # Try by name first pattern_to_replace = f'{avro_type_kwd}:"{ref_name}"' if pattern_to_replace not in schema_str: @@ -164,7 +164,7 @@ def get_schemas_from_confluent_ref_protobuf( schema_ref: SchemaReference for schema_ref in schema.references: - ref_subject: str = schema_ref["subject"] + ref_subject: str = schema_ref.subject if ref_subject in schema_seen: continue reference_schema: RegisteredSchema = ( @@ -173,7 +173,7 @@ def get_schemas_from_confluent_ref_protobuf( schema_seen.add(ref_subject) all_schemas.append( ProtobufSchema( - name=schema_ref["name"], content=reference_schema.schema.schema_str + name=schema_ref.name, content=reference_schema.schema.schema_str ) ) return all_schemas @@ -192,19 +192,19 @@ def get_schemas_from_confluent_ref_json( schema_ref: SchemaReference for schema_ref in schema.references: - ref_subject: str = schema_ref["subject"] + ref_subject: str = schema_ref.subject if ref_subject in schema_seen: continue reference_schema: RegisteredSchema = ( self.schema_registry_client.get_version( - subject_name=ref_subject, version=schema_ref["version"] + subject_name=ref_subject, version=schema_ref.version ) ) schema_seen.add(ref_subject) all_schemas.extend( self.get_schemas_from_confluent_ref_json( reference_schema.schema, - name=schema_ref["name"], + name=schema_ref.name, subject=ref_subject, schema_seen=schema_seen, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py b/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py index f057862a343b1..7cb487a86d931 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py +++ b/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py @@ -106,9 +106,9 @@ class CSVEnricherSource(Source): ``` 
resource,subresource,glossary_terms,tags,owners,ownership_type,description,domain - "urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub.growth.users,PROD",,[urn:li:glossaryTerm:Users],[urn:li:tag:HighQuality],[urn:li:corpuser:lfoe;urn:li:corpuser:jdoe],TECHNICAL_OWNER,"description for users table",urn:li:domain:Engineering - "urn:li:dataset:(urn:li:dataPlatform:hive,datahub.growth.users,PROD",first_name,[urn:li:glossaryTerm:FirstName],,,,"first_name description" - "urn:li:dataset:(urn:li:dataPlatform:hive,datahub.growth.users,PROD",last_name,[urn:li:glossaryTerm:LastName],,,,"last_name description" + "urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub.growth.users,PROD)",,[urn:li:glossaryTerm:Users],[urn:li:tag:HighQuality],[urn:li:corpuser:lfoe|urn:li:corpuser:jdoe],TECHNICAL_OWNER,"description for users table",urn:li:domain:Engineering + "urn:li:dataset:(urn:li:dataPlatform:hive,datahub.growth.users,PROD)",first_name,[urn:li:glossaryTerm:FirstName],,,,"first_name description" + "urn:li:dataset:(urn:li:dataPlatform:hive,datahub.growth.users,PROD)",last_name,[urn:li:glossaryTerm:LastName],,,,"last_name description" ``` Note that the first row does not have a subresource populated. 
That means any glossary terms, tags, and owners will diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py new file mode 100644 index 0000000000000..a054067d92334 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py @@ -0,0 +1,68 @@ +from typing import Optional + +from pydantic import Field + +from datahub.configuration.kafka import KafkaConsumerConnectionConfig +from datahub.ingestion.source.sql.mysql import MySQLConnectionConfig +from datahub.ingestion.source.state.stateful_ingestion_base import ( + StatefulIngestionConfig, + StatefulIngestionConfigBase, +) + +DEFAULT_MYSQL_TABLE_NAME = "metadata_aspect_v2" +DEFAULT_KAFKA_TOPIC_NAME = "MetadataChangeLog_Timeseries_v1" +DEFAULT_MYSQL_BATCH_SIZE = 10_000 + + +class DataHubSourceConfig(StatefulIngestionConfigBase): + mysql_connection: MySQLConnectionConfig = Field( + default=MySQLConnectionConfig(), + description="MySQL connection config", + ) + + kafka_connection: KafkaConsumerConnectionConfig = Field( + default=KafkaConsumerConnectionConfig(), + description="Kafka connection config", + ) + + include_all_versions: bool = Field( + default=False, + description=( + "If enabled, include all versions of each aspect. " + "Otherwise, only include the latest version of each aspect." 
+ ), + ) + + mysql_batch_size: int = Field( + default=DEFAULT_MYSQL_BATCH_SIZE, + description="Number of records to fetch from MySQL at a time", + ) + + mysql_table_name: str = Field( + default=DEFAULT_MYSQL_TABLE_NAME, + description="Name of MySQL table containing all versioned aspects", + ) + + kafka_topic_name: str = Field( + default=DEFAULT_KAFKA_TOPIC_NAME, + description="Name of kafka topic containing timeseries MCLs", + ) + + # Override from base class to make this enabled by default + stateful_ingestion: StatefulIngestionConfig = Field( + default=StatefulIngestionConfig(enabled=True), + description="Stateful Ingestion Config", + ) + + commit_state_interval: Optional[int] = Field( + default=1000, + description="Number of records to process before committing state", + ) + + commit_with_parse_errors: bool = Field( + default=False, + description=( + "Whether to update createdon timestamp and kafka offset despite parse errors. " + "Enable if you want to ignore the errors." + ), + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_kafka_reader.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_kafka_reader.py new file mode 100644 index 0000000000000..b165d70dd53b0 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_kafka_reader.py @@ -0,0 +1,99 @@ +import logging +from datetime import datetime +from typing import Dict, Iterable, List, Tuple + +from confluent_kafka import ( + OFFSET_BEGINNING, + Consumer, + DeserializingConsumer, + TopicPartition, +) +from confluent_kafka.schema_registry import SchemaRegistryClient +from confluent_kafka.schema_registry.avro import AvroDeserializer + +from datahub.ingestion.api.closeable import Closeable +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.source.datahub.config import DataHubSourceConfig +from datahub.ingestion.source.datahub.report import DataHubSourceReport +from datahub.ingestion.source.datahub.state import 
PartitionOffset +from datahub.metadata.schema_classes import MetadataChangeLogClass + +logger = logging.getLogger(__name__) + +KAFKA_GROUP_PREFIX = "datahub_source" + + +class DataHubKafkaReader(Closeable): + def __init__( + self, + config: DataHubSourceConfig, + report: DataHubSourceReport, + ctx: PipelineContext, + ): + self.config = config + self.report = report + self.group_id = f"{KAFKA_GROUP_PREFIX}-{ctx.pipeline_name}" + + def __enter__(self) -> "DataHubKafkaReader": + self.consumer = DeserializingConsumer( + { + "group.id": self.group_id, + "bootstrap.servers": self.config.kafka_connection.bootstrap, + **self.config.kafka_connection.consumer_config, + "auto.offset.reset": "earliest", + "enable.auto.commit": False, + "value.deserializer": AvroDeserializer( + schema_registry_client=SchemaRegistryClient( + {"url": self.config.kafka_connection.schema_registry_url} + ), + return_record_name=True, + ), + } + ) + return self + + def get_mcls( + self, from_offsets: Dict[int, int], stop_time: datetime + ) -> Iterable[Tuple[MetadataChangeLogClass, PartitionOffset]]: + # Based on https://github.com/confluentinc/confluent-kafka-python/issues/145#issuecomment-284843254 + def on_assign(consumer: Consumer, partitions: List[TopicPartition]) -> None: + for p in partitions: + p.offset = from_offsets.get(p.partition, OFFSET_BEGINNING) + logger.debug(f"Set partition {p.partition} offset to {p.offset}") + consumer.assign(partitions) + + self.consumer.subscribe([self.config.kafka_topic_name], on_assign=on_assign) + try: + yield from self._poll_partition(stop_time) + finally: + self.consumer.unsubscribe() + + def _poll_partition( + self, stop_time: datetime + ) -> Iterable[Tuple[MetadataChangeLogClass, PartitionOffset]]: + while True: + msg = self.consumer.poll(10) + if msg is None: + break + + try: + mcl = MetadataChangeLogClass.from_obj(msg.value(), True) + except Exception as e: + logger.warning(f"Error deserializing MCL: {e}") + self.report.num_kafka_parse_errors += 1 + 
self.report.kafka_parse_errors.setdefault(str(e), 0) + self.report.kafka_parse_errors[str(e)] += 1 + continue + + if mcl.created and mcl.created.time > stop_time.timestamp() * 1000: + logger.info( + f"Stopped reading from kafka, reached MCL " + f"with audit stamp {datetime.fromtimestamp(mcl.created.time / 1000)}" + ) + break + + # TODO: Consider storing state in kafka instead, via consumer.commit() + yield mcl, PartitionOffset(partition=msg.partition(), offset=msg.offset()) + + def close(self) -> None: + self.consumer.close() diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_mysql_reader.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_mysql_reader.py new file mode 100644 index 0000000000000..adf4c1db57395 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_mysql_reader.py @@ -0,0 +1,98 @@ +import json +import logging +from datetime import datetime +from typing import Dict, Iterable, Optional, Tuple + +from sqlalchemy import create_engine + +from datahub.emitter.aspect import ASPECT_MAP +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.emitter.serialization_helper import post_json_transform +from datahub.ingestion.source.datahub.config import DataHubSourceConfig +from datahub.ingestion.source.datahub.report import DataHubSourceReport +from datahub.metadata.schema_classes import ChangeTypeClass, SystemMetadataClass +from datahub.utilities.lossy_collections import LossyDict, LossyList + +logger = logging.getLogger(__name__) + +MYSQL_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S.%f" + + +class DataHubMySQLReader: + def __init__(self, config: DataHubSourceConfig, report: DataHubSourceReport): + self.config = config + self.report = report + self.engine = create_engine( + url=config.mysql_connection.get_sql_alchemy_url(), + **config.mysql_connection.options, + ) + + @property + def query(self) -> str: + # May repeat rows for the same date + # Offset is generally 0, unless we 
repeat the same date twice + return f""" + SELECT urn, aspect, metadata, systemmetadata, createdon + FROM `{self.config.mysql_table_name}` + WHERE createdon >= %(since_createdon)s + {"" if self.config.include_all_versions else "AND version = 0"} + ORDER BY createdon, urn, aspect, version # Ensures stable ordering + LIMIT %(limit)s + OFFSET %(offset)s + """ + + def get_aspects( + self, from_createdon: datetime, stop_time: datetime + ) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]: + with self.engine.connect() as conn: + ts = from_createdon + offset = 0 + while ts.timestamp() <= stop_time.timestamp(): + logger.debug(f"Polling MySQL aspects from {ts}") + rows = conn.execute( + self.query, + since_createdon=ts.strftime(MYSQL_DATETIME_FORMAT), + limit=self.config.mysql_batch_size, + offset=offset, + ) + if not rows.rowcount: + return + + for i, row in enumerate(rows): + # TODO: Replace with namedtuple usage once we drop sqlalchemy 1.3 + if hasattr(row, "_asdict"): + row_dict = row._asdict() + else: + row_dict = dict(row) + mcp = self._parse_mysql_row(row_dict) + if mcp: + yield mcp, row_dict["createdon"] + + if ts == row_dict["createdon"]: + offset += i + else: + ts = row_dict["createdon"] + print(ts) + offset = 0 + + def _parse_mysql_row(self, d: Dict) -> Optional[MetadataChangeProposalWrapper]: + try: + json_aspect = post_json_transform(json.loads(d["metadata"])) + json_metadata = post_json_transform(json.loads(d["systemmetadata"] or "{}")) + system_metadata = SystemMetadataClass.from_obj(json_metadata) + system_metadata.lastObserved = int(d["createdon"].timestamp() * 1000) + return MetadataChangeProposalWrapper( + entityUrn=d["urn"], + aspect=ASPECT_MAP[d["aspect"]].from_obj(json_aspect), + systemMetadata=system_metadata, + changeType=ChangeTypeClass.UPSERT, + ) + except Exception as e: + logger.warning( + f"Failed to parse metadata for {d['urn']}: {e}", exc_info=True + ) + self.report.num_mysql_parse_errors += 1 + 
self.report.mysql_parse_errors.setdefault(str(e), LossyDict()).setdefault( + d["aspect"], LossyList() + ).append(d["urn"]) + return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py new file mode 100644 index 0000000000000..636e65a244dad --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py @@ -0,0 +1,125 @@ +import logging +from datetime import datetime, timezone +from typing import Dict, Iterable, List, Optional + +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SupportStatus, + config_class, + platform_name, + support_status, +) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.datahub.config import DataHubSourceConfig +from datahub.ingestion.source.datahub.datahub_kafka_reader import DataHubKafkaReader +from datahub.ingestion.source.datahub.datahub_mysql_reader import DataHubMySQLReader +from datahub.ingestion.source.datahub.report import DataHubSourceReport +from datahub.ingestion.source.datahub.state import StatefulDataHubIngestionHandler +from datahub.ingestion.source.state.stateful_ingestion_base import ( + StatefulIngestionSourceBase, +) +from datahub.metadata.schema_classes import ChangeTypeClass + +logger = logging.getLogger(__name__) + + +@platform_name("DataHub") +@config_class(DataHubSourceConfig) +@support_status(SupportStatus.TESTING) +class DataHubSource(StatefulIngestionSourceBase): + platform: str = "datahub" + + def __init__(self, config: DataHubSourceConfig, ctx: PipelineContext): + super().__init__(config, ctx) + self.config = config + self.report: DataHubSourceReport = DataHubSourceReport() + self.stateful_ingestion_handler = 
StatefulDataHubIngestionHandler(self) + + @classmethod + def create(cls, config_dict: Dict, ctx: PipelineContext) -> "DataHubSource": + config: DataHubSourceConfig = DataHubSourceConfig.parse_obj(config_dict) + return cls(config, ctx) + + def get_report(self) -> SourceReport: + return self.report + + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: + return [] # Exactly replicate data from DataHub source + + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + stop_time = datetime.now(tz=timezone.utc) + logger.info(f"Ingesting DataHub metadata up until roughly {stop_time}") + state = self.stateful_ingestion_handler.get_last_run_state() + yield from self._get_mysql_workunits(state.mysql_createdon_datetime, stop_time) + self._commit_progress() + yield from self._get_kafka_workunits(state.kafka_offsets, stop_time) + self._commit_progress() + + def _get_mysql_workunits( + self, from_createdon: datetime, stop_time: datetime + ) -> Iterable[MetadataWorkUnit]: + logger.info(f"Fetching MySQL aspects from {from_createdon}") + reader = DataHubMySQLReader(self.config, self.report) + mcps = reader.get_aspects(from_createdon, stop_time) + for i, (mcp, createdon) in enumerate(mcps): + yield mcp.as_workunit() + self.report.num_mysql_aspects_ingested += 1 + + if ( + self.config.commit_with_parse_errors + or not self.report.num_mysql_parse_errors + ): + self.stateful_ingestion_handler.update_checkpoint( + last_createdon=createdon + ) + self._commit_progress(i) + + def _get_kafka_workunits( + self, from_offsets: Dict[int, int], stop_time: datetime + ) -> Iterable[MetadataWorkUnit]: + logger.info(f"Fetching timeseries aspects from kafka until {stop_time}") + + with DataHubKafkaReader(self.config, self.report, self.ctx) as reader: + mcls = reader.get_mcls(from_offsets=from_offsets, stop_time=stop_time) + for i, (mcl, offset) in enumerate(mcls): + mcp = MetadataChangeProposalWrapper.try_from_mcl(mcl) + if mcp.changeType == 
ChangeTypeClass.DELETE: + self.report.num_timeseries_deletions_dropped += 1 + logger.debug( + f"Dropping timeseries deletion of {mcp.aspectName} on {mcp.entityUrn}" + ) + continue + + if isinstance(mcp, MetadataChangeProposalWrapper): + yield mcp.as_workunit() + else: + yield MetadataWorkUnit( + id=f"{mcp.entityUrn}-{mcp.aspectName}-{i}", mcp_raw=mcp + ) + self.report.num_kafka_aspects_ingested += 1 + + if ( + self.config.commit_with_parse_errors + or not self.report.num_kafka_parse_errors + ): + self.stateful_ingestion_handler.update_checkpoint( + last_offset=offset + ) + self._commit_progress(i) + + def _commit_progress(self, i: Optional[int] = None) -> None: + """Commit progress to stateful storage, if there have been no errors. + + If an index `i` is provided, only commit if we are at the appropriate interval + as per `config.commit_state_interval`. + """ + on_interval = ( + i + and self.config.commit_state_interval + and i % self.config.commit_state_interval == 0 + ) + + if i is None or on_interval: + self.stateful_ingestion_handler.commit_checkpoint() diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/report.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/report.py new file mode 100644 index 0000000000000..3aa93d6a4577b --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/report.py @@ -0,0 +1,22 @@ +from dataclasses import dataclass, field + +from datahub.ingestion.source.state.stateful_ingestion_base import ( + StatefulIngestionReport, +) +from datahub.utilities.lossy_collections import LossyDict, LossyList + + +@dataclass +class DataHubSourceReport(StatefulIngestionReport): + num_mysql_aspects_ingested: int = 0 + num_mysql_parse_errors: int = 0 + # error -> aspect -> [urn] + mysql_parse_errors: LossyDict[str, LossyDict[str, LossyList[str]]] = field( + default_factory=LossyDict + ) + + num_kafka_aspects_ingested: int = 0 + num_kafka_parse_errors: int = 0 + kafka_parse_errors: LossyDict[str, int] = 
field(default_factory=LossyDict) + + num_timeseries_deletions_dropped: int = 0 diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/state.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/state.py new file mode 100644 index 0000000000000..deea9772fae20 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/state.py @@ -0,0 +1,95 @@ +from datetime import datetime, timezone +from functools import lru_cache +from typing import TYPE_CHECKING, Dict, NamedTuple, Optional, cast + +from pydantic import Field +from pydantic.types import NonNegativeInt + +from datahub.ingestion.api.ingestion_job_checkpointing_provider_base import JobId +from datahub.ingestion.source.state.checkpoint import Checkpoint, CheckpointStateBase +from datahub.ingestion.source.state.use_case_handler import ( + StatefulIngestionUsecaseHandlerBase, +) + +if TYPE_CHECKING: + from datahub.ingestion.source.datahub.datahub_source import DataHubSource + + +class DataHubIngestionState(CheckpointStateBase): + mysql_createdon_ts: NonNegativeInt = 0 + + # Maps partition -> offset + kafka_offsets: Dict[int, NonNegativeInt] = Field(default_factory=dict) + + @property + def mysql_createdon_datetime(self) -> datetime: + return datetime.fromtimestamp(self.mysql_createdon_ts / 1000, tz=timezone.utc) + + +class PartitionOffset(NamedTuple): + partition: int + offset: int + + +class StatefulDataHubIngestionHandler( + StatefulIngestionUsecaseHandlerBase[DataHubIngestionState] +): + def __init__(self, source: "DataHubSource"): + self.state_provider = source.state_provider + self.config = source.config.stateful_ingestion + self.run_id = source.ctx.run_id + self.pipeline_name = source.ctx.pipeline_name + self.state_provider.register_stateful_ingestion_usecase_handler(self) + + @lru_cache(maxsize=1) + def is_checkpointing_enabled(self) -> bool: + return self.state_provider.is_stateful_ingestion_configured() + + def get_last_run_state(self) -> DataHubIngestionState: + if 
self.is_checkpointing_enabled() and not self.config.ignore_old_state: + last_checkpoint = self.state_provider.get_last_checkpoint( + self.job_id, DataHubIngestionState + ) + if last_checkpoint and last_checkpoint.state: + return last_checkpoint.state + + return DataHubIngestionState() + + def create_checkpoint(self) -> Optional[Checkpoint[DataHubIngestionState]]: + if not self.is_checkpointing_enabled() or self.config.ignore_new_state: + return None + + if self.pipeline_name is None: + raise ValueError( + "Pipeline name must be set to use stateful datahub ingestion" + ) + + return Checkpoint( + job_name=self.job_id, + pipeline_name=self.pipeline_name, + run_id=self.run_id, + state=self.get_last_run_state(), + ) + + def update_checkpoint( + self, + *, + last_createdon: Optional[datetime] = None, + last_offset: Optional[PartitionOffset] = None, + ) -> None: + cur_checkpoint = self.state_provider.get_current_checkpoint(self.job_id) + if cur_checkpoint: + cur_state = cast(DataHubIngestionState, cur_checkpoint.state) + if last_createdon: + cur_state.mysql_createdon_ts = int(last_createdon.timestamp() * 1000) + if last_offset: + cur_state.kafka_offsets[last_offset.partition] = last_offset.offset + 1 + + def commit_checkpoint(self) -> None: + if self.state_provider.ingestion_checkpointing_state_provider: + self.state_provider.prepare_for_commit() + self.state_provider.ingestion_checkpointing_state_provider.commit() + + @property + def job_id(self) -> JobId: + return JobId("datahub_ingestion") diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py index 1cd5ed8164854..af9769bc9d94c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py @@ -162,9 +162,11 @@ class DBTCloudConfig(DBTCommonConfig): } _DBT_GRAPHQL_QUERY = """ -query DatahubMetadataQuery_{type}($jobId: Int!, $runId: Int) {{ - 
{type}(jobId: $jobId, runId: $runId) {{ +query DatahubMetadataQuery_{type}($jobId: BigInt!, $runId: BigInt) {{ + job(id: $jobId, runId: $runId) {{ + {type} {{ {fields} + }} }} }} """ @@ -218,7 +220,7 @@ def load_nodes(self) -> Tuple[List[DBTNode], Dict[str, Optional[str]]]: }, ) - raw_nodes.extend(data[node_type]) + raw_nodes.extend(data["job"][node_type]) nodes = [self._parse_into_dbt_node(node) for node in raw_nodes] diff --git a/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py b/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py index a299023b88e64..5805790fe8bb7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py +++ b/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py @@ -7,6 +7,7 @@ from time import sleep from typing import Dict, Iterable, List, Optional, Union +import nest_asyncio from okta.client import Client as OktaClient from okta.exceptions import OktaAPIException from okta.models import Group, GroupProfile, User, UserProfile, UserStatus @@ -51,6 +52,7 @@ ) logger = logging.getLogger(__name__) +nest_asyncio.apply() class OktaConfig(StatefulIngestionConfigBase, ConfigModel): diff --git a/metadata-ingestion/src/datahub/ingestion/source/ldap.py b/metadata-ingestion/src/datahub/ingestion/source/ldap.py index 085878245c60d..497b49acb6505 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ldap.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ldap.py @@ -146,6 +146,11 @@ class LDAPSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin): description="Use pagination while do search query (enabled by default).", ) + use_email_as_username: bool = Field( + default=False, + description="Use email for users' usernames instead of username (disabled by default). 
\ + If enabled, the user and group urn would be having email as the id part of the urn.", + ) # default mapping for attrs user_attrs_map: Dict[str, Any] = {} group_attrs_map: Dict[str, Any] = {} @@ -306,6 +311,7 @@ def handle_user(self, dn: str, attrs: Dict[str, Any]) -> Iterable[MetadataWorkUn work unit based on the information. """ manager_ldap = None + make_manager_urn = None if self.config.user_attrs_map["managerUrn"] in attrs: try: m_cn = attrs[self.config.user_attrs_map["managerUrn"]][0].decode() @@ -322,10 +328,19 @@ def handle_user(self, dn: str, attrs: Dict[str, Any]) -> Iterable[MetadataWorkUn result = self.ldap_client.result3(manager_msgid) if result[1]: _m_dn, m_attrs = result[1][0] + manager_ldap = guess_person_ldap(m_attrs, self.config, self.report) + + m_email = get_attr_or_none( + m_attrs, self.config.user_attrs_map["email"], manager_ldap + ) + make_manager_urn = ( + m_email if self.config.use_email_as_username else manager_ldap + ) + except ldap.LDAPError as e: self.report.report_warning(dn, f"manager LDAP search failed: {e}") - mce = self.build_corp_user_mce(dn, attrs, manager_ldap) + mce = self.build_corp_user_mce(dn, attrs, make_manager_urn) if mce: yield MetadataWorkUnit(dn, mce) else: @@ -387,8 +402,10 @@ def build_corp_user_mce( manager_urn = f"urn:li:corpuser:{manager_ldap}" if manager_ldap else None + make_user_urn = email if self.config.use_email_as_username else ldap_user + user_snapshot = CorpUserSnapshotClass( - urn=f"urn:li:corpuser:{ldap_user}", + urn=f"urn:li:corpuser:{make_user_urn}", aspects=[ CorpUserInfoClass( active=True, @@ -429,8 +446,10 @@ def build_corp_group_mce(self, attrs: dict) -> Optional[MetadataChangeEvent]: attrs, self.config.group_attrs_map["displayName"] ) + make_group_urn = email if self.config.use_email_as_username else full_name + group_snapshot = CorpGroupSnapshotClass( - urn=f"urn:li:corpGroup:{full_name}", + urn=f"urn:li:corpGroup:{make_group_urn}", aspects=[ CorpGroupInfoClass( email=email, diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index 31d067f984d2d..ffa685fb25826 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -121,6 +121,12 @@ class DataPlatformPair: powerbi_data_platform_name: str +@dataclass +class PowerBIPlatformDetail: + data_platform_pair: DataPlatformPair + data_platform_server: str + + class SupportedDataPlatform(Enum): POSTGRES_SQL = DataPlatformPair( powerbi_data_platform_name="PostgreSQL", datahub_data_platform_name="postgres" @@ -382,6 +388,15 @@ class PowerBiDashboardSourceConfig( description="The instance of the platform that all assets produced by this recipe belong to", ) + # Enable advance sql construct + enable_advance_lineage_sql_construct: bool = pydantic.Field( + default=False, + description="Whether to enable advance native sql construct for parsing like join, sub-queries. " + "along this flag , the native_query_parsing should be enabled. 
" + "By default convert_lineage_urns_to_lowercase is enabled, in-case if you have disabled it in previous ingestion execution then it may break lineage " + "as this option generates the upstream datasets URN in lowercase.", + ) + @validator("dataset_type_mapping") @classmethod def map_data_platform(cls, value): diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py index 396da2d79e3b7..baaa8d5b85ae1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py @@ -5,8 +5,8 @@ from datahub.ingestion.source.powerbi.config import ( PlatformDetail, PowerBiDashboardSourceConfig, + PowerBIPlatformDetail, ) -from datahub.ingestion.source.powerbi.m_query.resolver import DataPlatformTable logger = logging.getLogger(__name__) @@ -14,7 +14,7 @@ class AbstractDataPlatformInstanceResolver(ABC): @abstractmethod def get_platform_instance( - self, dataplatform_table: DataPlatformTable + self, data_platform_detail: PowerBIPlatformDetail ) -> PlatformDetail: pass @@ -32,10 +32,10 @@ class ResolvePlatformInstanceFromDatasetTypeMapping( BaseAbstractDataPlatformInstanceResolver ): def get_platform_instance( - self, dataplatform_table: DataPlatformTable + self, data_platform_detail: PowerBIPlatformDetail ) -> PlatformDetail: platform: Union[str, PlatformDetail] = self.config.dataset_type_mapping[ - dataplatform_table.data_platform_pair.powerbi_data_platform_name + data_platform_detail.data_platform_pair.powerbi_data_platform_name ] if isinstance(platform, PlatformDetail): @@ -48,13 +48,13 @@ class ResolvePlatformInstanceFromServerToPlatformInstance( BaseAbstractDataPlatformInstanceResolver ): def get_platform_instance( - self, dataplatform_table: DataPlatformTable + self, data_platform_detail: PowerBIPlatformDetail 
) -> PlatformDetail: return ( self.config.server_to_platform_instance[ - dataplatform_table.datasource_server + data_platform_detail.data_platform_server ] - if dataplatform_table.datasource_server + if data_platform_detail.data_platform_server in self.config.server_to_platform_instance else PlatformDetail.parse_obj({}) ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py index 640bc4bd60d80..021c429c3c633 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py @@ -1,8 +1,12 @@ import logging -from typing import List +from typing import List, Optional import sqlparse +import datahub.utilities.sqlglot_lineage as sqlglot_l +from datahub.ingestion.api.common import PipelineContext +from datahub.utilities.sqlglot_lineage import SqlParsingResult + SPECIAL_CHARACTERS = ["#(lf)", "(lf)"] logger = logging.getLogger() @@ -45,3 +49,30 @@ def get_tables(native_query: str) -> List[str]: from_index = from_index + 1 return tables + + +def parse_custom_sql( + ctx: PipelineContext, + query: str, + schema: Optional[str], + database: Optional[str], + platform: str, + env: str, + platform_instance: Optional[str], +) -> Optional["SqlParsingResult"]: + + logger.debug("Using sqlglot_lineage to parse custom sql") + + sql_query = remove_special_characters(query) + + logger.debug(f"Parsing sql={sql_query}") + + return sqlglot_l.create_lineage_sql_parsed_result( + query=sql_query, + schema=schema, + database=database, + platform=platform, + platform_instance=platform_instance, + env=env, + graph=ctx.graph, + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py index 83106c04529d1..8cc38c366c42a 100644 --- 
a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py @@ -6,7 +6,14 @@ import lark from lark import Lark, Tree -from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.source.powerbi.config import ( + PowerBiDashboardSourceConfig, + PowerBiDashboardSourceReport, +) +from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import ( + AbstractDataPlatformInstanceResolver, +) from datahub.ingestion.source.powerbi.m_query import resolver, validator from datahub.ingestion.source.powerbi.m_query.data_classes import ( TRACE_POWERBI_MQUERY_PARSER, @@ -45,7 +52,9 @@ def _parse_expression(expression: str) -> Tree: def get_upstream_tables( table: Table, reporter: PowerBiDashboardSourceReport, - native_query_enabled: bool = True, + platform_instance_resolver: AbstractDataPlatformInstanceResolver, + ctx: PipelineContext, + config: PowerBiDashboardSourceConfig, parameters: Dict[str, str] = {}, ) -> List[resolver.DataPlatformTable]: if table.expression is None: @@ -58,7 +67,7 @@ def get_upstream_tables( parse_tree: Tree = _parse_expression(table.expression) valid, message = validator.validate_parse_tree( - parse_tree, native_query_enabled=native_query_enabled + parse_tree, native_query_enabled=config.native_query_parsing ) if valid is False: assert message is not None @@ -84,7 +93,11 @@ def get_upstream_tables( parse_tree=parse_tree, reporter=reporter, parameters=parameters, - ).resolve_to_data_platform_table_list() + ).resolve_to_data_platform_table_list( + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, + ) except BaseException as e: reporter.report_warning(table.full_name, "Failed to process m-query expression") diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py 
b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py index e2b448124c89d..479f1decff903 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py @@ -6,11 +6,19 @@ from lark import Tree +import datahub.emitter.mce_builder as builder +from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.powerbi.config import ( DataPlatformPair, + PlatformDetail, + PowerBiDashboardSourceConfig, PowerBiDashboardSourceReport, + PowerBIPlatformDetail, SupportedDataPlatform, ) +from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import ( + AbstractDataPlatformInstanceResolver, +) from datahub.ingestion.source.powerbi.m_query import native_sql_parser, tree_function from datahub.ingestion.source.powerbi.m_query.data_classes import ( TRACE_POWERBI_MQUERY_PARSER, @@ -19,19 +27,98 @@ IdentifierAccessor, ) from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table +from datahub.utilities.sqlglot_lineage import SqlParsingResult logger = logging.getLogger(__name__) @dataclass class DataPlatformTable: - name: str - full_name: str - datasource_server: str data_platform_pair: DataPlatformPair + urn: str + + +def urn_to_lowercase(value: str, flag: bool) -> str: + if flag is True: + return value.lower() + + return value + + +def urn_creator( + config: PowerBiDashboardSourceConfig, + platform_instance_resolver: AbstractDataPlatformInstanceResolver, + data_platform_pair: DataPlatformPair, + server: str, + qualified_table_name: str, +) -> str: + + platform_detail: PlatformDetail = platform_instance_resolver.get_platform_instance( + PowerBIPlatformDetail( + data_platform_pair=data_platform_pair, + data_platform_server=server, + ) + ) + + return builder.make_dataset_urn_with_platform_instance( + platform=data_platform_pair.datahub_data_platform_name, + 
platform_instance=platform_detail.platform_instance, + env=platform_detail.env, + name=urn_to_lowercase( + qualified_table_name, config.convert_lineage_urns_to_lowercase + ), + ) class AbstractDataPlatformTableCreator(ABC): + """ + Base class to share common functionalities among different dataplatform for M-Query parsing. + + To create qualified table name we need to parse M-Query data-access-functions(https://learn.microsoft.com/en-us/powerquery-m/accessing-data-functions) and + the data-access-functions has some define pattern to access database-name, schema-name and table-name, for example see below M-Query. + + let + Source = Sql.Database("localhost", "library"), + dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data] + in + dbo_book_issue + + It is MSSQL M-Query and Sql.Database is the data-access-function to access MSSQL. If this function is available in M-Query then database name is available in second argument + of first statement and schema-name and table-name is available in second statement. second statement can be repeated to access different tables from MSSQL. + + DefaultTwoStepDataAccessSources extends the AbstractDataPlatformTableCreator and provides the common functionalities for data-platform which has above type of M-Query pattern + + data-access-function varies as per data-platform for example for MySQL.Database for MySQL, PostgreSQL.Database for Postgres and Oracle.Database for Oracle and number of statement to + find out database-name , schema-name and table-name also varies as per dataplatform. + + Value.NativeQuery is one of the function which is used to execute native query inside M-Query, for example see below M-Query + + let + Source = Value.NativeQuery(AmazonRedshift.Database("redshift-url","dev"), "select * from dev.public.category", null, [EnableFolding=true]) + in + Source + + In this M-Query database-name is available in first argument and rest of the detail i.e database & schema is available in native query. 
+ + NativeQueryDataPlatformTableCreator extends AbstractDataPlatformTableCreator to support Redshift and Snowflake native query parsing. + + """ + + ctx: PipelineContext + config: PowerBiDashboardSourceConfig + platform_instance_resolver: AbstractDataPlatformInstanceResolver + + def __init__( + self, + ctx: PipelineContext, + config: PowerBiDashboardSourceConfig, + platform_instance_resolver: AbstractDataPlatformInstanceResolver, + ) -> None: + super().__init__() + self.ctx = ctx + self.config = config + self.platform_instance_resolver = platform_instance_resolver + @abstractmethod def create_dataplatform_tables( self, data_access_func_detail: DataAccessFunctionDetail @@ -58,6 +145,49 @@ def get_db_detail_from_argument( return arguments[0], arguments[1] + def parse_custom_sql( + self, query: str, server: str, database: Optional[str], schema: Optional[str] + ) -> List[DataPlatformTable]: + + dataplatform_tables: List[DataPlatformTable] = [] + + platform_detail: PlatformDetail = ( + self.platform_instance_resolver.get_platform_instance( + PowerBIPlatformDetail( + data_platform_pair=self.get_platform_pair(), + data_platform_server=server, + ) + ) + ) + + parsed_result: Optional[ + "SqlParsingResult" + ] = native_sql_parser.parse_custom_sql( + ctx=self.ctx, + query=query, + platform=self.get_platform_pair().datahub_data_platform_name, + platform_instance=platform_detail.platform_instance, + env=platform_detail.env, + database=database, + schema=schema, + ) + + if parsed_result is None: + logger.debug("Failed to parse query") + return dataplatform_tables + + for urn in parsed_result.in_tables: + dataplatform_tables.append( + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ) + + logger.debug(f"Generated dataplatform_tables={dataplatform_tables}") + + return dataplatform_tables + class AbstractDataAccessMQueryResolver(ABC): table: Table @@ -80,11 +210,29 @@ def __init__( self.data_access_functions = SupportedResolver.get_function_names() 
@abstractmethod - def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: + def resolve_to_data_platform_table_list( + self, + ctx: PipelineContext, + config: PowerBiDashboardSourceConfig, + platform_instance_resolver: AbstractDataPlatformInstanceResolver, + ) -> List[DataPlatformTable]: pass class MQueryResolver(AbstractDataAccessMQueryResolver, ABC): + """ + This class parses the M-Query recursively to generate DataAccessFunctionDetail (see method create_data_access_functional_detail). + + This class has generic code to process M-Query tokens and create instance of DataAccessFunctionDetail. + + Once DataAccessFunctionDetail instance is initialized thereafter MQueryResolver generates the DataPlatformTable with the help of AbstractDataPlatformTableCreator + (see method resolve_to_data_platform_table_list). + + Classes which extended from AbstractDataPlatformTableCreator knows how to convert generated DataAccessFunctionDetail instance + to respective DataPlatformTable instance as per dataplatform. 
+ + """ + def get_item_selector_tokens( self, expression_tree: Tree, @@ -318,9 +466,15 @@ def internal( return table_links - def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: + def resolve_to_data_platform_table_list( + self, + ctx: PipelineContext, + config: PowerBiDashboardSourceConfig, + platform_instance_resolver: AbstractDataPlatformInstanceResolver, + ) -> List[DataPlatformTable]: data_platform_tables: List[DataPlatformTable] = [] + # Find out output variable as we are doing backtracking in M-Query output_variable: Optional[str] = tree_function.get_output_variable( self.parse_tree ) @@ -332,12 +486,14 @@ def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: ) return data_platform_tables + # Parse M-Query and use output_variable as root of tree and create instance of DataAccessFunctionDetail table_links: List[ DataAccessFunctionDetail ] = self.create_data_access_functional_detail(output_variable) # Each item is data-access function for f_detail in table_links: + # Get & Check if we support data-access-function available in M-Query supported_resolver = SupportedResolver.get_resolver( f_detail.data_access_function_name ) @@ -351,8 +507,14 @@ def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: ) continue + # From supported_resolver enum get respective resolver like AmazonRedshift or Snowflake or Oracle or NativeQuery and create instance of it + # & also pass additional information that will be need to generate urn table_full_name_creator: AbstractDataPlatformTableCreator = ( - supported_resolver.get_table_full_name_creator()() + supported_resolver.get_table_full_name_creator()( + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, + ) ) data_platform_tables.extend( @@ -393,18 +555,24 @@ def two_level_access_pattern( IdentifierAccessor, data_access_func_detail.identifier_accessor ).items["Item"] - full_table_name: str = f"{db_name}.{schema_name}.{table_name}" + 
qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" logger.debug( - f"Platform({self.get_platform_pair().datahub_data_platform_name}) full_table_name= {full_table_name}" + f"Platform({self.get_platform_pair().datahub_data_platform_name}) qualified_table_name= {qualified_table_name}" + ) + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, ) return [ DataPlatformTable( - name=table_name, - full_name=full_table_name, - datasource_server=server, data_platform_pair=self.get_platform_pair(), + urn=urn, ) ] @@ -420,9 +588,48 @@ def get_platform_pair(self) -> DataPlatformPair: class MSSqlDataPlatformTableCreator(DefaultTwoStepDataAccessSources): + # https://learn.microsoft.com/en-us/sql/relational-databases/security/authentication-access/ownership-and-user-schema-separation?view=sql-server-ver16 + DEFAULT_SCHEMA = "dbo" # Default schema name in MS-SQL is dbo + def get_platform_pair(self) -> DataPlatformPair: return SupportedDataPlatform.MS_SQL.value + def create_urn_using_old_parser( + self, query: str, db_name: str, server: str + ) -> List[DataPlatformTable]: + dataplatform_tables: List[DataPlatformTable] = [] + + tables: List[str] = native_sql_parser.get_tables(query) + + for table in tables: + schema_and_table: List[str] = table.split(".") + if len(schema_and_table) == 1: + # schema name is not present. 
set default schema + schema_and_table.insert(0, MSSqlDataPlatformTableCreator.DEFAULT_SCHEMA) + + qualified_table_name = ( + f"{db_name}.{schema_and_table[0]}.{schema_and_table[1]}" + ) + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) + + dataplatform_tables.append( + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ) + + logger.debug(f"Generated upstream tables = {dataplatform_tables}") + + return dataplatform_tables + def create_dataplatform_tables( self, data_access_func_detail: DataAccessFunctionDetail ) -> List[DataPlatformTable]: @@ -442,28 +649,20 @@ def create_dataplatform_tables( logger.debug("Unsupported case is found. Second index is not the Query") return dataplatform_tables - db_name: str = arguments[1] - - tables: List[str] = native_sql_parser.get_tables(arguments[3]) - for table in tables: - schema_and_table: List[str] = table.split(".") - if len(schema_and_table) == 1: - # schema name is not present. 
Default schema name in MS-SQL is dbo - # https://learn.microsoft.com/en-us/sql/relational-databases/security/authentication-access/ownership-and-user-schema-separation?view=sql-server-ver16 - schema_and_table.insert(0, "dbo") - - dataplatform_tables.append( - DataPlatformTable( - name=schema_and_table[1], - full_name=f"{db_name}.{schema_and_table[0]}.{schema_and_table[1]}", - datasource_server=arguments[0], - data_platform_pair=self.get_platform_pair(), - ) + if self.config.enable_advance_lineage_sql_construct is False: + # Use previous parser to generate URN to keep backward compatibility + return self.create_urn_using_old_parser( + query=arguments[3], + db_name=arguments[1], + server=arguments[0], ) - logger.debug("MS-SQL full-table-names %s", dataplatform_tables) - - return dataplatform_tables + return self.parse_custom_sql( + query=arguments[3], + database=arguments[1], + server=arguments[0], + schema=MSSqlDataPlatformTableCreator.DEFAULT_SCHEMA, + ) class OracleDataPlatformTableCreator(AbstractDataPlatformTableCreator): @@ -510,12 +709,20 @@ def create_dataplatform_tables( cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next, ).items["Name"] + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) + return [ DataPlatformTable( - name=table_name, - full_name=f"{db_name}.{schema_name}.{table_name}", - datasource_server=server, data_platform_pair=self.get_platform_pair(), + urn=urn, ) ] @@ -547,14 +754,28 @@ def create_dataplatform_tables( db_name: str = value_dict["Database"] schema_name: str = value_dict["Schema"] table_name: str = value_dict["Table"] + + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" + server, _ = self.get_db_detail_from_argument(data_access_func_detail.arg_list) + if 
server is None: + logger.info( + f"server information is not available for {qualified_table_name}. Skipping upstream table" + ) + return [] + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) return [ DataPlatformTable( - name=table_name, - full_name=f"{db_name}.{schema_name}.{table_name}", - datasource_server=server if server else "", data_platform_pair=self.get_platform_pair(), + urn=urn, ) ] @@ -589,20 +810,26 @@ def create_dataplatform_tables( IdentifierAccessor, data_access_func_detail.identifier_accessor.next.next # type: ignore ).items["Name"] - full_table_name: str = f"{db_name}.{schema_name}.{table_name}" + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" logger.debug( - f"{self.get_platform_pair().datahub_data_platform_name} full-table-name {full_table_name}" + f"{self.get_platform_pair().datahub_data_platform_name} qualified_table_name {qualified_table_name}" + ) + + server: str = self.get_datasource_server(arguments, data_access_func_detail) + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, ) return [ DataPlatformTable( - name=table_name, - full_name=full_table_name, - datasource_server=self.get_datasource_server( - arguments, data_access_func_detail - ), data_platform_pair=self.get_platform_pair(), + urn=urn, ) ] @@ -654,12 +881,20 @@ def create_dataplatform_tables( cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next, ).items["Name"] + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + 
qualified_table_name=qualified_table_name, + ) + return [ DataPlatformTable( - name=table_name, - full_name=f"{db_name}.{schema_name}.{table_name}", - datasource_server=server, data_platform_pair=self.get_platform_pair(), + urn=urn, ) ] @@ -681,6 +916,39 @@ def is_native_parsing_supported(data_access_function_name: str) -> bool: in NativeQueryDataPlatformTableCreator.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM ) + def create_urn_using_old_parser( + self, query: str, server: str + ) -> List[DataPlatformTable]: + dataplatform_tables: List[DataPlatformTable] = [] + + tables: List[str] = native_sql_parser.get_tables(query) + + for qualified_table_name in tables: + if len(qualified_table_name.split(".")) != 3: + logger.debug( + f"Skipping table {qualified_table_name} as it is not as per qualified_table_name format" + ) + continue + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) + + dataplatform_tables.append( + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ) + + logger.debug(f"Generated dataplatform_tables {dataplatform_tables}") + + return dataplatform_tables + def create_dataplatform_tables( self, data_access_func_detail: DataAccessFunctionDetail ) -> List[DataPlatformTable]: @@ -727,25 +995,21 @@ def create_dataplatform_tables( 0 ] # Remove any whitespaces and double quotes character - for table in native_sql_parser.get_tables(sql_query): - if len(table.split(".")) != 3: - logger.debug( - f"Skipping table {table} as it is not as per full_table_name format" - ) - continue + server = tree_function.strip_char_from_list([data_access_tokens[2]])[0] - dataplatform_tables.append( - DataPlatformTable( - name=table.split(".")[2], - full_name=table, - datasource_server=tree_function.strip_char_from_list( - [data_access_tokens[2]] - )[0], - 
data_platform_pair=self.get_platform_pair(), - ) + if self.config.enable_advance_lineage_sql_construct is False: + # Use previous parser to generate URN to keep backward compatibility + return self.create_urn_using_old_parser( + query=sql_query, + server=server, ) - return dataplatform_tables + return self.parse_custom_sql( + query=sql_query, + server=server, + database=None, # database and schema is available inside custom sql as per PowerBI Behavior + schema=None, + ) class FunctionName(Enum): diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index 919cb83e4d832..5d477ee090e7e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -28,7 +28,6 @@ ) from datahub.ingestion.source.powerbi.config import ( Constant, - PlatformDetail, PowerBiDashboardSourceConfig, PowerBiDashboardSourceReport, ) @@ -96,10 +95,12 @@ def __hash__(self): def __init__( self, + ctx: PipelineContext, config: PowerBiDashboardSourceConfig, reporter: PowerBiDashboardSourceReport, dataplatform_instance_resolver: AbstractDataPlatformInstanceResolver, ): + self.__ctx = ctx self.__config = config self.__reporter = reporter self.__dataplatform_instance_resolver = dataplatform_instance_resolver @@ -172,43 +173,40 @@ def extract_lineage( # table.dataset should always be set, but we check it just in case. 
parameters = table.dataset.parameters if table.dataset else {} - upstreams: List[UpstreamClass] = [] - upstream_tables: List[resolver.DataPlatformTable] = parser.get_upstream_tables( - table, self.__reporter, parameters=parameters + upstream: List[UpstreamClass] = [] + + upstream_dpts: List[resolver.DataPlatformTable] = parser.get_upstream_tables( + table=table, + reporter=self.__reporter, + platform_instance_resolver=self.__dataplatform_instance_resolver, + ctx=self.__ctx, + config=self.__config, + parameters=parameters, ) + logger.debug( - f"PowerBI virtual table {table.full_name} and it's upstream dataplatform tables = {upstream_tables}" + f"PowerBI virtual table {table.full_name} and it's upstream dataplatform tables = {upstream_dpts}" ) - for upstream_table in upstream_tables: + + for upstream_dpt in upstream_dpts: if ( - upstream_table.data_platform_pair.powerbi_data_platform_name + upstream_dpt.data_platform_pair.powerbi_data_platform_name not in self.__config.dataset_type_mapping.keys() ): logger.debug( - f"Skipping upstream table for {ds_urn}. The platform {upstream_table.data_platform_pair.powerbi_data_platform_name} is not part of dataset_type_mapping", + f"Skipping upstream table for {ds_urn}. 
The platform {upstream_dpt.data_platform_pair.powerbi_data_platform_name} is not part of dataset_type_mapping", ) continue - platform_detail: PlatformDetail = ( - self.__dataplatform_instance_resolver.get_platform_instance( - upstream_table - ) - ) - upstream_urn = builder.make_dataset_urn_with_platform_instance( - platform=upstream_table.data_platform_pair.datahub_data_platform_name, - platform_instance=platform_detail.platform_instance, - env=platform_detail.env, - name=self.lineage_urn_to_lowercase(upstream_table.full_name), - ) - upstream_table_class = UpstreamClass( - upstream_urn, + upstream_dpt.urn, DatasetLineageTypeClass.TRANSFORMED, ) - upstreams.append(upstream_table_class) - if len(upstreams) > 0: - upstream_lineage = UpstreamLineageClass(upstreams=upstreams) + upstream.append(upstream_table_class) + + if len(upstream) > 0: + upstream_lineage = UpstreamLineageClass(upstreams=upstream) logger.debug(f"Dataset urn = {ds_urn} and its lineage = {upstream_lineage}") mcp = MetadataChangeProposalWrapper( entityType=Constant.DATASET, @@ -1107,7 +1105,9 @@ def __init__(self, config: PowerBiDashboardSourceConfig, ctx: PipelineContext): ) # Exit pipeline as we are not able to connect to PowerBI API Service. This exit will avoid raising # unwanted stacktrace on console - self.mapper = Mapper(config, self.reporter, self.dataplatform_instance_resolver) + self.mapper = Mapper( + ctx, config, self.reporter, self.dataplatform_instance_resolver + ) # Create and register the stateful ingestion use-case handler. 
self.stale_entity_removal_handler = StaleEntityRemovalHandler.create( diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py index 2d2d9f527788f..0d41ab00c66f5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py @@ -1,3 +1,4 @@ +import dataclasses from dataclasses import dataclass from enum import Enum from typing import Any, Dict, List, Optional, Union @@ -105,7 +106,7 @@ class Measure: dataType: str = "measure" datahubDataType: Union[ BooleanTypeClass, DateTypeClass, NullTypeClass, NumberTypeClass, StringTypeClass - ] = NullTypeClass() + ] = dataclasses.field(default_factory=NullTypeClass) description: Optional[str] = None diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py index cf4e3a5b0135a..268de5832559a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py @@ -343,7 +343,9 @@ def populate_lineage( if self.config.table_lineage_mode == LineageMode.STL_SCAN_BASED: # Populate table level lineage by getting upstream tables from stl_scan redshift table query = RedshiftQuery.stl_scan_based_lineage_query( - self.config.database, self.config.start_time, self.config.end_time + self.config.database, + self.config.start_time, + self.config.end_time, ) populate_calls.append((query, LineageCollectorType.QUERY_SCAN)) elif self.config.table_lineage_mode == LineageMode.SQL_BASED: diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/config.py b/metadata-ingestion/src/datahub/ingestion/source/s3/config.py index a989dc2f2fcbe..f1dd622efb746 100644 --- 
a/metadata-ingestion/src/datahub/ingestion/source/s3/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/config.py @@ -66,6 +66,11 @@ class DataLakeSourceConfig( default="4g", description="Max amount of memory to grant Spark." ) + spark_config: Dict[str, Any] = Field( + description='Spark configuration properties to set on the SparkSession. Put config property names into quotes. For example: \'"spark.executor.memory": "2g"\'', + default={}, + ) + max_rows: int = Field( default=100, description="Maximum number of rows to use when inferring schemas for TSV and CSV files.", diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py index 61f9f88c3fb05..4247ee9330cfb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py @@ -37,6 +37,7 @@ from datahub.emitter.mce_builder import ( make_data_platform_urn, + make_dataplatform_instance_urn, make_dataset_urn_with_platform_instance, ) from datahub.emitter.mcp import MetadataChangeProposalWrapper @@ -81,6 +82,7 @@ TimeTypeClass, ) from datahub.metadata.schema_classes import ( + DataPlatformInstanceClass, DatasetPropertiesClass, MapTypeClass, OperationClass, @@ -329,6 +331,9 @@ def init_spark(self): conf.set("spark.jars.excludes", pydeequ.f2j_maven_coord) conf.set("spark.driver.memory", self.source_config.spark_driver_memory) + if self.source_config.spark_config: + for key, value in self.source_config.spark_config.items(): + conf.set(key, value) self.spark = SparkSession.builder.config(conf=conf).getOrCreate() @classmethod @@ -559,6 +564,15 @@ def ingest_table( self.source_config.env, ) + if self.source_config.platform_instance: + data_platform_instance = DataPlatformInstanceClass( + platform=data_platform_urn, + instance=make_dataplatform_instance_urn( + self.source_config.platform, self.source_config.platform_instance + ), + ) + 
aspects.append(data_platform_instance) + customProperties = {"schema_inferred_from": str(table_data.full_path)} if not path_spec.sample_files: diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py index 039eac1e93819..587c71a98be67 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py @@ -1,5 +1,6 @@ from typing import List, Optional +from datahub.configuration.time_window_config import BucketDuration from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain from datahub.ingestion.source.snowflake.snowflake_config import DEFAULT_TABLES_DENY_LIST @@ -575,14 +576,17 @@ def get_access_history_date_range() -> str: def usage_per_object_per_time_bucket_for_time_window( start_time_millis: int, end_time_millis: int, - time_bucket_size: str, + time_bucket_size: BucketDuration, use_base_objects: bool, top_n_queries: int, include_top_n_queries: bool, ) -> str: if not include_top_n_queries: top_n_queries = 0 - assert time_bucket_size == "DAY" or time_bucket_size == "HOUR" + assert ( + time_bucket_size == BucketDuration.DAY + or time_bucket_size == BucketDuration.HOUR + ) objects_column = ( "BASE_OBJECTS_ACCESSED" if use_base_objects else "DIRECT_OBJECTS_ACCESSED" ) @@ -629,7 +633,7 @@ def usage_per_object_per_time_bucket_for_time_window( SELECT object_name, ANY_VALUE(object_domain) AS object_domain, - DATE_TRUNC('{time_bucket_size}', CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time, + DATE_TRUNC('{time_bucket_size.value}', CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time, count(distinct(query_id)) AS total_queries, count( distinct(user_name) ) AS total_users FROM @@ -644,7 +648,7 @@ def usage_per_object_per_time_bucket_for_time_window( SELECT object_name, column_name, - 
DATE_TRUNC('{time_bucket_size}', CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time, + DATE_TRUNC('{time_bucket_size.value}', CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time, count(distinct(query_id)) AS total_queries FROM field_access_history @@ -658,7 +662,7 @@ def usage_per_object_per_time_bucket_for_time_window( ( SELECT object_name, - DATE_TRUNC('{time_bucket_size}', CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time, + DATE_TRUNC('{time_bucket_size.value}', CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time, count(distinct(query_id)) AS total_queries, user_name, ANY_VALUE(users.email) AS user_email @@ -677,7 +681,7 @@ def usage_per_object_per_time_bucket_for_time_window( ( SELECT object_name, - DATE_TRUNC('{time_bucket_size}', CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time, + DATE_TRUNC('{time_bucket_size.value}', CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time, query_history.query_text AS query_text, count(distinct(access_history.query_id)) AS total_queries FROM diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py index 3605205b6055c..f8dfa612952d8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py @@ -356,7 +356,6 @@ def _check_usage_date_ranges(self) -> Any: def _get_operation_aspect_work_unit( self, event: SnowflakeJoinedAccessEvent, discovered_datasets: List[str] ) -> Iterable[MetadataWorkUnit]: - if event.query_start_time and event.query_type: start_time = event.query_start_time query_type = event.query_type diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py index 8b2eed36ac6b3..c95e20252e421 100644 --- 
a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py @@ -22,10 +22,7 @@ from datahub.ingestion.source.aws.s3_util import make_s3_urn from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes from datahub.ingestion.source.sql.sql_common import SQLAlchemySource -from datahub.ingestion.source.sql.sql_config import ( - SQLAlchemyConfig, - make_sqlalchemy_uri, -) +from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri from datahub.ingestion.source.sql.sql_utils import ( add_table_to_schema_container, gen_database_container, @@ -33,7 +30,7 @@ ) -class AthenaConfig(SQLAlchemyConfig): +class AthenaConfig(SQLCommonConfig): scheme: str = "awsathena+rest" username: Optional[str] = pydantic.Field( default=None, diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mysql.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mysql.py index 3bec07f6a13d5..e4969ce946f78 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mysql.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mysql.py @@ -19,6 +19,7 @@ make_sqlalchemy_type, register_custom_type, ) +from datahub.ingestion.source.sql.sql_config import SQLAlchemyConnectionConfig from datahub.ingestion.source.sql.two_tier_sql_source import ( TwoTierSQLAlchemyConfig, TwoTierSQLAlchemySource, @@ -45,11 +46,13 @@ base.ischema_names["decimal128"] = DECIMAL128 -class MySQLConfig(TwoTierSQLAlchemyConfig): +class MySQLConnectionConfig(SQLAlchemyConnectionConfig): # defaults host_port = Field(default="localhost:3306", description="MySQL host URL.") scheme = "mysql+pymysql" + +class MySQLConfig(MySQLConnectionConfig, TwoTierSQLAlchemyConfig): def get_identifier(self, *, schema: str, table: str) -> str: regular = f"{schema}.{table}" if self.database_alias: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/presto_on_hive.py 
b/metadata-ingestion/src/datahub/ingestion/source/sql/presto_on_hive.py index 1f3092888054e..ceb9ecacb25d2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/presto_on_hive.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/presto_on_hive.py @@ -38,7 +38,7 @@ ) from datahub.ingestion.source.sql.sql_config import ( BasicSQLAlchemyConfig, - SQLAlchemyConfig, + SQLCommonConfig, make_sqlalchemy_uri, ) from datahub.ingestion.source.sql.sql_utils import ( @@ -453,7 +453,7 @@ def loop_tables( self, inspector: Inspector, schema: str, - sql_config: SQLAlchemyConfig, + sql_config: SQLCommonConfig, ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: # In mysql we get tables for all databases and we should filter out the non metastore one if ( @@ -718,7 +718,7 @@ def loop_views( self, inspector: Inspector, schema: str, - sql_config: SQLAlchemyConfig, + sql_config: SQLCommonConfig, ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: assert isinstance(sql_config, PrestoOnHiveConfig) @@ -904,7 +904,7 @@ def _set_partition_key(self, columns, schema_fields): class SQLAlchemyClient: - def __init__(self, config: SQLAlchemyConfig): + def __init__(self, config: SQLCommonConfig): self.config = config self.connection = self._get_connection() diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py index 64dca217e694d..280f4f47adcdf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py @@ -38,7 +38,7 @@ DatasetContainerSubTypes, DatasetSubTypes, ) -from datahub.ingestion.source.sql.sql_config import SQLAlchemyConfig +from datahub.ingestion.source.sql.sql_config import SQLCommonConfig from datahub.ingestion.source.sql.sql_utils import ( add_table_to_schema_container, downgrade_schema_from_v2, @@ -331,7 +331,7 @@ class ProfileMetadata: class 
SQLAlchemySource(StatefulIngestionSourceBase): """A Base class for all SQL Sources that use SQLAlchemy to extend""" - def __init__(self, config: SQLAlchemyConfig, ctx: PipelineContext, platform: str): + def __init__(self, config: SQLCommonConfig, ctx: PipelineContext, platform: str): super(SQLAlchemySource, self).__init__(config, ctx) self.config = config self.platform = platform @@ -599,7 +599,7 @@ def loop_tables( # noqa: C901 self, inspector: Inspector, schema: str, - sql_config: SQLAlchemyConfig, + sql_config: SQLCommonConfig, ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: tables_seen: Set[str] = set() try: @@ -647,7 +647,7 @@ def _process_table( inspector: Inspector, schema: str, table: str, - sql_config: SQLAlchemyConfig, + sql_config: SQLCommonConfig, ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: columns = self._get_columns(dataset_name, inspector, schema, table) dataset_urn = make_dataset_urn_with_platform_instance( @@ -867,7 +867,7 @@ def loop_views( self, inspector: Inspector, schema: str, - sql_config: SQLAlchemyConfig, + sql_config: SQLCommonConfig, ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: try: for view in inspector.get_view_names(schema): @@ -904,7 +904,7 @@ def _process_view( inspector: Inspector, schema: str, view: str, - sql_config: SQLAlchemyConfig, + sql_config: SQLCommonConfig, ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: try: columns = inspector.get_columns(view, schema) @@ -1026,7 +1026,7 @@ def generate_profile_candidates( def is_dataset_eligible_for_profiling( self, dataset_name: str, - sql_config: SQLAlchemyConfig, + sql_config: SQLCommonConfig, inspector: Inspector, profile_candidates: Optional[List[str]], ) -> bool: @@ -1042,7 +1042,7 @@ def loop_profiler_requests( self, inspector: Inspector, schema: str, - sql_config: SQLAlchemyConfig, + sql_config: SQLCommonConfig, ) -> Iterable["GEProfilerRequest"]: from datahub.ingestion.source.ge_data_profiler import GEProfilerRequest diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py index 76d1dbd14a7db..8f1e04b915f3b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py @@ -6,7 +6,7 @@ import pydantic from pydantic import Field -from datahub.configuration.common import AllowDenyPattern +from datahub.configuration.common import AllowDenyPattern, ConfigModel from datahub.configuration.pydantic_field_deprecation import pydantic_field_deprecated from datahub.configuration.source_common import DatasetSourceConfigMixin from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig @@ -21,7 +21,7 @@ logger: logging.Logger = logging.getLogger(__name__) -class SQLAlchemyConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin): +class SQLCommonConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin): options: dict = pydantic.Field( default_factory=dict, description="Any options specified here will be passed to [SQLAlchemy.create_engine](https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine) as kwargs.", @@ -97,7 +97,7 @@ def get_sql_alchemy_url(self): pass -class BasicSQLAlchemyConfig(SQLAlchemyConfig): +class SQLAlchemyConnectionConfig(ConfigModel): username: Optional[str] = Field(default=None, description="username") password: Optional[pydantic.SecretStr] = Field( default=None, exclude=True, description="password" @@ -115,6 +115,12 @@ class BasicSQLAlchemyConfig(SQLAlchemyConfig): description="URI of database to connect to. See https://docs.sqlalchemy.org/en/14/core/engines.html#database-urls. 
Takes precedence over other connection parameters.", ) + # Duplicate of SQLCommonConfig.options + options: dict = pydantic.Field( + default_factory=dict, + description="Any options specified here will be passed to [SQLAlchemy.create_engine](https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine) as kwargs.", + ) + _database_alias_deprecation = pydantic_field_deprecated( "database_alias", message="database_alias is deprecated. Use platform_instance instead.", @@ -136,6 +142,10 @@ def get_sql_alchemy_url( ) +class BasicSQLAlchemyConfig(SQLAlchemyConnectionConfig, SQLCommonConfig): + pass + + def make_sqlalchemy_uri( scheme: str, username: Optional[str], diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic.py index a31a5ac64e5fb..aa0493a18ab58 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic.py @@ -14,7 +14,7 @@ support_status, ) from datahub.ingestion.source.sql.sql_common import SQLAlchemySource -from datahub.ingestion.source.sql.sql_config import SQLAlchemyConfig +from datahub.ingestion.source.sql.sql_config import SQLCommonConfig @dataclass @@ -50,7 +50,7 @@ class BaseView: column_count: Optional[int] = None -class SQLAlchemyGenericConfig(SQLAlchemyConfig): +class SQLAlchemyGenericConfig(SQLCommonConfig): platform: str = Field( description="Name of platform being ingested, used in constructing URNs." 
) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py index 63403c265598b..344c114d464a9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py @@ -12,7 +12,7 @@ GEProfilerRequest, ) from datahub.ingestion.source.sql.sql_common import SQLSourceReport -from datahub.ingestion.source.sql.sql_config import SQLAlchemyConfig +from datahub.ingestion.source.sql.sql_config import SQLCommonConfig from datahub.ingestion.source.sql.sql_generic import BaseTable, BaseView from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProfile @@ -53,7 +53,7 @@ class TableProfilerRequest(GEProfilerRequest): class GenericProfiler: def __init__( self, - config: SQLAlchemyConfig, + config: SQLCommonConfig, report: ProfilingSqlReport, platform: str, state_handler: Optional[ProfilingHandler] = None, diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py b/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py index 764f26c256893..a417cae2b1ab0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py @@ -32,7 +32,7 @@ ) from datahub.ingestion.source.sql.sql_config import ( BasicSQLAlchemyConfig, - SQLAlchemyConfig, + SQLCommonConfig, ) from datahub.ingestion.source.sql.sql_utils import get_domain_wu from datahub.metadata.com.linkedin.pegasus2avro.common import StatusClass @@ -220,7 +220,7 @@ def _process_table( inspector: VerticaInspector, schema: str, table: str, - sql_config: SQLAlchemyConfig, + sql_config: SQLCommonConfig, ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: dataset_urn = make_dataset_urn_with_platform_instance( self.platform, @@ -242,7 
+242,7 @@ def loop_views( self, inspector: VerticaInspector, schema: str, - sql_config: SQLAlchemyConfig, + sql_config: SQLCommonConfig, ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: try: for view in inspector.get_view_names(schema): @@ -314,7 +314,7 @@ def _process_view( inspector: VerticaInspector, schema: str, view: str, - sql_config: SQLAlchemyConfig, + sql_config: SQLCommonConfig, ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: """ This function is used for performing operation and gets data for every view inside a schema @@ -324,7 +324,7 @@ def _process_view( inspector (Inspector) schema (str): schema name view (str): name of the view to inspect - sql_config (SQLAlchemyConfig) + sql_config (SQLCommonConfig) table_tags (Dict[str, str], optional) Defaults to dict(). Returns: @@ -356,7 +356,7 @@ def loop_projections( self, inspector: VerticaInspector, schema: str, - sql_config: SQLAlchemyConfig, + sql_config: SQLCommonConfig, ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: """ this function loop through all the projection in the given schema. @@ -366,7 +366,7 @@ def loop_projections( Args: inspector (Inspector): inspector obj from reflection schema (str): schema name - sql_config (SQLAlchemyConfig): config + sql_config (SQLCommonConfig): config Returns: Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: [description] @@ -438,7 +438,7 @@ def _process_projections( inspector: VerticaInspector, schema: str, projection: str, - sql_config: SQLAlchemyConfig, + sql_config: SQLCommonConfig, ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: columns = inspector.get_projection_columns(projection, schema) dataset_urn = make_dataset_urn_with_platform_instance( @@ -512,7 +512,7 @@ def loop_profiler_requests( self, inspector: VerticaInspector, schema: str, - sql_config: SQLAlchemyConfig, + sql_config: SQLCommonConfig, ) -> Iterable["GEProfilerRequest"]: """Function is used for collecting profiling related information for every projections inside an schema. 
@@ -590,7 +590,7 @@ def loop_models( self, inspector: VerticaInspector, schema: str, - sql_config: SQLAlchemyConfig, + sql_config: SQLCommonConfig, ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: """ This function is for iterating over the ml models in vertica db @@ -598,7 +598,7 @@ def loop_models( Args: inspector (Inspector) : inspector obj from reflection engine schema (str): schema name - sql_config (SQLAlchemyConfig): config + sql_config (SQLCommonConfig): config Returns: Iterable[Union[SqlWorkUnit, MetadataWorkUnit]] @@ -646,7 +646,7 @@ def _process_models( inspector: VerticaInspector, schema: str, table: str, - sql_config: SQLAlchemyConfig, + sql_config: SQLCommonConfig, ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: """ To fetch ml models related information of ml_model from vertica db @@ -655,7 +655,7 @@ def _process_models( inspector (Inspector): inspector obj from reflection schema (str): schema name entity table (str): name of ml model - sql_config (SQLAlchemyConfig) + sql_config (SQLCommonConfig) Returns: Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: [description] diff --git a/metadata-ingestion/src/datahub/ingestion/source/state/checkpoint.py b/metadata-ingestion/src/datahub/ingestion/source/state/checkpoint.py index eace13368897e..5bfd48eb754d5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/state/checkpoint.py +++ b/metadata-ingestion/src/datahub/ingestion/source/state/checkpoint.py @@ -6,7 +6,7 @@ import logging import pickle from dataclasses import dataclass -from datetime import datetime +from datetime import datetime, timezone from typing import Callable, Generic, Optional, Type, TypeVar import pydantic @@ -144,7 +144,7 @@ def create_from_checkpoint_aspect( ) logger.info( f"Successfully constructed last checkpoint state for job {job_name} " - f"with timestamp {datetime.utcfromtimestamp(checkpoint_aspect.timestampMillis/1000)}" + f"with timestamp {datetime.fromtimestamp(checkpoint_aspect.timestampMillis/1000, 
tz=timezone.utc)}" ) return checkpoint return None @@ -213,7 +213,7 @@ def to_checkpoint_aspect( ), ) checkpoint_aspect = DatahubIngestionCheckpointClass( - timestampMillis=int(datetime.utcnow().timestamp() * 1000), + timestampMillis=int(datetime.now(tz=timezone.utc).timestamp() * 1000), pipelineName=self.pipeline_name, platformInstanceId="", runId=self.run_id, diff --git a/metadata-ingestion/src/datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py b/metadata-ingestion/src/datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py index 874ee08cc78f9..d7ebcba2c6695 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +++ b/metadata-ingestion/src/datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py @@ -104,7 +104,7 @@ def commit(self) -> None: for job_name, checkpoint in self.state_to_commit.items(): # Emit the ingestion state for each job - logger.info( + logger.debug( f"Committing ingestion checkpoint for pipeline:'{checkpoint.pipelineName}', " f"job:'{job_name}'" ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau.py index 6752bdf519830..ec0af37089b1d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau.py @@ -31,6 +31,7 @@ from tableauserverclient.server.endpoint.exceptions import NonXMLResponseError import datahub.emitter.mce_builder as builder +import datahub.utilities.sqlglot_lineage as sqlglot_l from datahub.configuration.common import ( AllowDenyPattern, ConfigModel, @@ -136,12 +137,7 @@ ViewPropertiesClass, ) from datahub.utilities import config_clean -from datahub.utilities.sqlglot_lineage import ( - ColumnLineageInfo, - SchemaResolver, - SqlParsingResult, - sqlglot_lineage, -) +from datahub.utilities.sqlglot_lineage import ColumnLineageInfo, 
SqlParsingResult logger: logging.Logger = logging.getLogger(__name__) @@ -1585,42 +1581,14 @@ def parse_custom_sql( f"Overridden info upstream_db={upstream_db}, platform_instance={platform_instance}, platform={platform}" ) - parsed_result: Optional["SqlParsingResult"] = None - try: - schema_resolver = ( - self.ctx.graph._make_schema_resolver( - platform=platform, - platform_instance=platform_instance, - env=env, - ) - if self.ctx.graph is not None - else SchemaResolver( - platform=platform, - platform_instance=platform_instance, - env=env, - graph=None, - ) - ) - - if schema_resolver.graph is None: - logger.warning( - "Column Level Lineage extraction would not work as DataHub graph client is None." - ) - - parsed_result = sqlglot_lineage( - query, - schema_resolver=schema_resolver, - default_db=upstream_db, - ) - except Exception as e: - self.report.report_warning( - key="csql-lineage", - reason=f"Unable to retrieve lineage from query. " - f"Query: {query} " - f"Reason: {str(e)} ", - ) - - return parsed_result + return sqlglot_l.create_lineage_sql_parsed_result( + query=query, + database=upstream_db, + platform=platform, + platform_instance=platform_instance, + env=env, + graph=self.ctx.graph, + ) def _create_lineage_from_unsupported_csql( self, csql_urn: str, csql: dict diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py b/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py index d5da93c7be35e..49f56b46fb012 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py @@ -176,10 +176,8 @@ def _parse_query_via_lineage_runner(self, query: str) -> Optional[StringTableInf for table in runner.target_tables ], ) - except Exception: - logger.info( - f"Could not parse query via lineage runner, {query}", exc_info=True - ) + except Exception as e: + logger.info(f"Could not parse query via lineage runner, {query}: {e!r}") return None @staticmethod @@ -202,8 
+200,8 @@ def _parse_query_via_spark_sql_plan(self, query: str) -> Optional[StringTableInf return GenericTableInfo( source_tables=[t for t in tables if t], target_tables=[] ) - except Exception: - logger.info(f"Could not parse query via spark plan, {query}", exc_info=True) + except Exception as e: + logger.info(f"Could not parse query via spark plan, {query}: {e!r}") return None @staticmethod diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/usage_common.py b/metadata-ingestion/src/datahub/ingestion/source/usage/usage_common.py index 92f8223f34d14..8d4ac37f49213 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/usage/usage_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/usage/usage_common.py @@ -213,6 +213,19 @@ def ensure_top_n_queries_is_not_too_big(cls, v: int) -> int: ) return v + @pydantic.validator("start_time") + def ensure_start_time_aligns_with_bucket_start_time( + cls, v: datetime, values: dict + ) -> datetime: + if get_time_bucket(v, values["bucket_duration"]) != v: + new_start_time = get_time_bucket(v, values["bucket_duration"]) + logger.warning( + f"`start_time` will be changed to {new_start_time}, although the input `start_time` is {v}." + "This is necessary to record correct usage for the configured bucket duration." 
+ ) + return new_start_time + return v + class UsageAggregator(Generic[ResourceType]): # TODO: Move over other connectors to use this class diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py index 27ac2472bef93..0d72fc52da0ca 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py @@ -21,10 +21,7 @@ CLIENT_SESSION_KEEP_ALIVE, ) from datahub.ingestion.source.sql.oauth_generator import OAuthTokenGenerator -from datahub.ingestion.source.sql.sql_config import ( - SQLAlchemyConfig, - make_sqlalchemy_uri, -) +from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri from datahub.utilities.config_clean import ( remove_protocol, remove_suffix, @@ -261,7 +258,7 @@ def get_connect_args(self) -> dict: return connect_args -class SnowflakeConfig(BaseSnowflakeConfig, SQLAlchemyConfig): +class SnowflakeConfig(BaseSnowflakeConfig, SQLCommonConfig): database_pattern: AllowDenyPattern = AllowDenyPattern( deny=[r"^UTIL_DB$", r"^SNOWFLAKE$", r"^SNOWFLAKE_SAMPLE_DATA$"] ) diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py index fd56eb604071c..534cac5cef2aa 100644 --- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py @@ -848,3 +848,43 @@ def sqlglot_lineage( table_error=e, ), ) + + +def create_lineage_sql_parsed_result( + query: str, + database: Optional[str], + platform: str, + platform_instance: Optional[str], + env: str, + schema: Optional[str] = None, + graph: Optional[DataHubGraph] = None, +) -> Optional["SqlParsingResult"]: + + parsed_result: Optional["SqlParsingResult"] = None + try: + schema_resolver = ( + graph._make_schema_resolver( + platform=platform, + 
platform_instance=platform_instance, + env=env, + ) + if graph is not None + else SchemaResolver( + platform=platform, + platform_instance=platform_instance, + env=env, + graph=None, + ) + ) + + parsed_result = sqlglot_lineage( + query, + schema_resolver=schema_resolver, + default_db=database, + default_schema=schema, + ) + except Exception as e: + logger.debug(f"Fail to prase query {query}", exc_info=e) + logger.warning("Fail to parse custom SQL") + + return parsed_result diff --git a/metadata-ingestion/src/datahub_provider/_plugin.py b/metadata-ingestion/src/datahub_provider/_plugin.py index 6f6c7c9ab71b7..ed2e4e1c93d80 100644 --- a/metadata-ingestion/src/datahub_provider/_plugin.py +++ b/metadata-ingestion/src/datahub_provider/_plugin.py @@ -107,7 +107,7 @@ def get_inlets_from_task(task: BaseOperator, context: Any) -> Iterable[Any]: ] for inlet in task_inlets: - if isinstance(inlet, str): + if not isinstance(inlet, str): inlets.append(inlet) return inlets diff --git a/metadata-ingestion/tests/integration/ldap/test_ldap.py b/metadata-ingestion/tests/integration/ldap/test_ldap.py index 148a3a6128013..3e76f13fc823d 100644 --- a/metadata-ingestion/tests/integration/ldap/test_ldap.py +++ b/metadata-ingestion/tests/integration/ldap/test_ldap.py @@ -100,3 +100,54 @@ def test_ldap_memberof_ingest(docker_compose_runner, pytestconfig, tmp_path, moc output_path=tmp_path / "ldap_memberof_mces.json", golden_path=test_resources_dir / "ldap_memberof_mces_golden.json", ) + + +@pytest.mark.integration +def test_ldap_ingest_with_email_as_username( + docker_compose_runner, pytestconfig, tmp_path, mock_time +): + test_resources_dir = pytestconfig.rootpath / "tests/integration/ldap" + + with docker_compose_runner( + test_resources_dir / "docker-compose.yml", "ldap" + ) as docker_services: + # The openldap container loads the sample data after exposing the port publicly. As such, + # we must wait a little bit extra to ensure that the sample data is loaded. 
+ wait_for_port(docker_services, "openldap", 389) + time.sleep(5) + + pipeline = Pipeline.create( + { + "run_id": "ldap-test", + "source": { + "type": "ldap", + "config": { + "ldap_server": "ldap://localhost", + "ldap_user": "cn=admin,dc=example,dc=org", + "ldap_password": "admin", + "base_dn": "dc=example,dc=org", + "user_attrs_map": {"email": "mail"}, + "group_attrs_map": { + "members": "memberUid", + "email": "mail", + }, + "use_email_as_username": True, + "custom_props_list": ["givenName"], + }, + }, + "sink": { + "type": "file", + "config": { + "filename": f"{tmp_path}/ldap_mces.json", + }, + }, + } + ) + pipeline.run() + pipeline.raise_from_status() + + mce_helpers.check_golden_file( + pytestconfig, + output_path=tmp_path / "ldap_mces.json", + golden_path=test_resources_dir / "ldap_mces_golden.json", + ) diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index 5c9553402a8c4..e77a12aa4088e 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -1,17 +1,22 @@ import logging import sys -from typing import List +from typing import List, Tuple import pytest from lark import Tree import datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes as powerbi_data_classes -from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport -from datahub.ingestion.source.powerbi.m_query import parser, tree_function -from datahub.ingestion.source.powerbi.m_query.resolver import ( - DataPlatformTable, - SupportedDataPlatform, +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.source.powerbi.config import ( + PowerBiDashboardSourceConfig, + PowerBiDashboardSourceReport, +) +from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import ( + AbstractDataPlatformInstanceResolver, + create_dataplatform_instance_resolver, ) +from 
datahub.ingestion.source.powerbi.m_query import parser, tree_function +from datahub.ingestion.source.powerbi.m_query.resolver import DataPlatformTable M_QUERIES = [ 'let\n Source = Snowflake.Databases("bu10758.ap-unknown-2.fakecomputing.com","PBI_TEST_WAREHOUSE_PROD",[Role="PBI_TEST_MEMBER"]),\n PBI_TEST_Database = Source{[Name="PBI_TEST",Kind="Database"]}[Data],\n TEST_Schema = PBI_TEST_Database{[Name="TEST",Kind="Schema"]}[Data],\n TESTTABLE_Table = TEST_Schema{[Name="TESTTABLE",Kind="Table"]}[Data]\nin\n TESTTABLE_Table', @@ -38,9 +43,31 @@ 'let\n Source = AmazonRedshift.Database("redshift-url","dev"),\n public = Source{[Name="public"]}[Data],\n category1 = public{[Name="category"]}[Data]\nin\n category1', 'let\n Source = Value.NativeQuery(AmazonRedshift.Database("redshift-url","dev"), "select * from dev.public.category", null, [EnableFolding=true]) \n in Source', 'let\n Source = Databricks.Catalogs("adb-123.azuredatabricks.net", "/sql/1.0/endpoints/12345dc91aa25844", [Catalog=null, Database=null]),\n hive_metastore_Database = Source{[Name="hive_metastore",Kind="Database"]}[Data],\n sandbox_revenue_Schema = hive_metastore_Database{[Name="sandbox_revenue",Kind="Schema"]}[Data],\n public_consumer_price_index_Table = sandbox_revenue_Schema{[Name="public_consumer_price_index",Kind="Table"]}[Data],\n #"Renamed Columns" = Table.RenameColumns(public_consumer_price_index_Table,{{"Country", "country"}, {"Metric", "metric"}}),\n #"Inserted Year" = Table.AddColumn(#"Renamed Columns", "ID", each Date.Year([date_id]) + Date.Month([date_id]), Text.Type),\n #"Added Custom" = Table.AddColumn(#"Inserted Year", "Custom", each Text.Combine({Number.ToText(Date.Year([date_id])), Number.ToText(Date.Month([date_id])), [country]})),\n #"Removed Columns" = Table.RemoveColumns(#"Added Custom",{"ID"}),\n #"Renamed Columns1" = Table.RenameColumns(#"Removed Columns",{{"Custom", "ID"}}),\n #"Filtered Rows" = Table.SelectRows(#"Renamed Columns1", each ([metric] = "Consumer Price Index") and 
(not Number.IsNaN([value])))\nin\n #"Filtered Rows"', + "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu10758.ap-unknown-2.fakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select #(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS CLIENT_DIRECTOR,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS inner join OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT #(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Accounting'#(lf)and TARGET_TEAM = 'Enterprise'#(lf)AND TIER = 'Client Director'\", null, [EnableFolding=true])\nin\n Source", ] +def get_default_instances( + override_config: dict = {}, +) -> Tuple[ + PipelineContext, PowerBiDashboardSourceConfig, AbstractDataPlatformInstanceResolver +]: + config: PowerBiDashboardSourceConfig = PowerBiDashboardSourceConfig.parse_obj( + { + "tenant_id": "fake", + "client_id": "foo", + "client_secret": "bar", + **override_config, + } + ) + + platform_instance_resolver: AbstractDataPlatformInstanceResolver = ( + create_dataplatform_instance_resolver(config) + ) + + return PipelineContext(run_id="fake"), config, platform_instance_resolver + + @pytest.mark.integration def test_parse_m_query1(): expression: str = M_QUERIES[0] @@ -145,20 +172,20 @@ def test_snowflake_regular_case(): reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert 
len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "TESTTABLE" - assert data_platform_tables[0].full_name == "PBI_TEST.TEST.TESTTABLE" assert ( - data_platform_tables[0].datasource_server - == "bu10758.ap-unknown-2.fakecomputing.com" - ) - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,pbi_test.test.testtable,PROD)" ) @@ -174,17 +201,21 @@ def test_postgres_regular_case(): ) reporter = PowerBiDashboardSourceReport() + + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "order_date" - assert data_platform_tables[0].full_name == "mics.public.order_date" - assert data_platform_tables[0].datasource_server == "localhost" assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.POSTGRES_SQL.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:postgres,mics.public.order_date,PROD)" ) @@ -200,19 +231,21 @@ def test_databricks_regular_case(): ) reporter = PowerBiDashboardSourceReport() + + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "public_consumer_price_index" assert ( - data_platform_tables[0].full_name - == "hive_metastore.sandbox_revenue.public_consumer_price_index" - ) - 
assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.DATABRICK_SQL.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:databricks,hive_metastore.sandbox_revenue.public_consumer_price_index,PROD)" ) @@ -228,17 +261,21 @@ def test_oracle_regular_case(): ) reporter = PowerBiDashboardSourceReport() + + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "EMPLOYEES" - assert data_platform_tables[0].full_name == "salesdb.HR.EMPLOYEES" - assert data_platform_tables[0].datasource_server == "localhost:1521" assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.ORACLE.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:oracle,salesdb.hr.employees,PROD)" ) @@ -255,17 +292,20 @@ def test_mssql_regular_case(): reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "book_issue" - assert data_platform_tables[0].full_name == "library.dbo.book_issue" - assert data_platform_tables[0].datasource_server == "localhost" assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.MS_SQL.value.powerbi_data_platform_name + data_platform_tables[0].urn + == 
"urn:li:dataset:(urn:li:dataPlatform:mssql,library.dbo.book_issue,PROD)" ) @@ -280,14 +320,16 @@ def test_mssql_with_query(): M_QUERIES[11], ] expected_tables = [ - "COMMOPSDB.dbo.V_OIP_ENT_2022", - "COMMOPSDB.dbo.V_INVOICE_BOOKING_2022", - "COMMOPSDB.dbo.V_ARR_ADDS", - "COMMOPSDB.dbo.V_PS_CD_RETENTION", - "COMMOPSDB.dbo.V_TPV_LEADERBOARD", - "COMMOPSDB.dbo.V_ENTERPRISE_INVOICED_REVENUE", + "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_oip_ent_2022,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_invoice_booking_2022,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_arr_adds,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_ps_cd_retention,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_tpv_leaderboard,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_enterprise_invoiced_revenue,PROD)", ] + ctx, config, platform_instance_resolver = get_default_instances() + for index, query in enumerate(mssql_queries): table: powerbi_data_classes.Table = powerbi_data_classes.Table( columns=[], @@ -299,17 +341,15 @@ def test_mssql_with_query(): reporter = PowerBiDashboardSourceReport() data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter, native_query_enabled=False + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == expected_tables[index].split(".")[2] - assert data_platform_tables[0].full_name == expected_tables[index] - assert data_platform_tables[0].datasource_server == "AUPRDWHDB" - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.MS_SQL.value.powerbi_data_platform_name - ) + assert data_platform_tables[0].urn == expected_tables[index] @pytest.mark.integration @@ -322,12 +362,14 @@ def test_snowflake_native_query(): ] expected_tables = [ - 
"OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4", - "OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS", - "OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS", - "OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_aps_sme_units_v4,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_sme_unit_targets,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_sme_unit_targets,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_sme_unit_targets,PROD)", ] + ctx, config, platform_instance_resolver = get_default_instances() + for index, query in enumerate(snowflake_queries): table: powerbi_data_classes.Table = powerbi_data_classes.Table( columns=[], @@ -339,20 +381,15 @@ def test_snowflake_native_query(): reporter = PowerBiDashboardSourceReport() data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == expected_tables[index].split(".")[2] - assert data_platform_tables[0].full_name == expected_tables[index] - assert ( - data_platform_tables[0].datasource_server - == "bu10758.ap-unknown-2.fakecomputing.com" - ) - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name - ) + assert data_platform_tables[0].urn == expected_tables[index] def test_google_bigquery_1(): @@ -363,16 +400,20 @@ def test_google_bigquery_1(): ) reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, 
reporter, native_query_enabled=False + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) + assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == table.full_name.split(".")[2] - assert data_platform_tables[0].full_name == table.full_name - assert data_platform_tables[0].datasource_server == "seraphic-music-344307" assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.GOOGLE_BIGQUERY.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:bigquery,seraphic-music-344307.school_dataset.first,PROD)" ) @@ -387,23 +428,24 @@ def test_google_bigquery_2(): ) reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( table, reporter, - native_query_enabled=False, parameters={ "Parameter - Source": "my-test-project", "My bq project": "gcp_billing", }, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == table.full_name.split(".")[2] - assert data_platform_tables[0].full_name == table.full_name - assert data_platform_tables[0].datasource_server == "my-test-project" assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.GOOGLE_BIGQUERY.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:bigquery,my-test-project.gcp_billing.gcp_table,PROD)" ) @@ -416,23 +458,24 @@ def test_for_each_expression_1(): reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( table, reporter, - native_query_enabled=False, parameters={ "Parameter - 
Source": "my-test-project", "My bq project": "gcp_billing", }, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == table.full_name.split(".")[2] - assert data_platform_tables[0].datasource_server == "my-test-project" - assert data_platform_tables[0].full_name == table.full_name assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.GOOGLE_BIGQUERY.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:bigquery,my-test-project.universal.d_wh_date,PROD)" ) @@ -445,22 +488,23 @@ def test_for_each_expression_2(): reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( table, reporter, - native_query_enabled=False, parameters={ "dwh-prod": "originally-not-a-variable-ref-and-not-resolved", }, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == table.full_name.split(".")[2] - assert data_platform_tables[0].full_name == table.full_name - assert data_platform_tables[0].datasource_server == "dwh-prod" assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.GOOGLE_BIGQUERY.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:bigquery,dwh-prod.gcp_billing.d_gcp_custom_label,PROD)" ) @@ -476,8 +520,14 @@ def test_native_query_disabled(): reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + config.native_query_parsing = False data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter, native_query_enabled=False + table, + 
reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 0 @@ -493,26 +543,25 @@ def test_multi_source_table(): ) reporter = PowerBiDashboardSourceReport() + + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter, native_query_enabled=False + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 2 - assert data_platform_tables[0].full_name == "mics.public.order_date" - assert data_platform_tables[0].datasource_server == "localhost" - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.POSTGRES_SQL.value.powerbi_data_platform_name - ) - - assert data_platform_tables[1].full_name == "GSL_TEST_DB.PUBLIC.SALES_ANALYST_VIEW" assert ( - data_platform_tables[1].datasource_server - == "ghh48144.snowflakefakecomputing.com" + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:postgres,mics.public.order_date,PROD)" ) assert ( - data_platform_tables[1].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name + data_platform_tables[1].urn + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,gsl_test_db.public.sales_analyst_view,PROD)" ) @@ -521,36 +570,33 @@ def test_table_combine(): table: powerbi_data_classes.Table = powerbi_data_classes.Table( columns=[], measures=[], - expression=M_QUERIES[16], # 1st index has the native query + expression=M_QUERIES[16], name="virtual_order_table", full_name="OrderDataSet.virtual_order_table", ) reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + 
config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 2 - assert data_platform_tables[0].full_name == "GSL_TEST_DB.PUBLIC.SALES_FORECAST" - assert ( - data_platform_tables[0].datasource_server - == "ghh48144.snowflakefakecomputing.com" - ) - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name - ) - assert data_platform_tables[1].full_name == "GSL_TEST_DB.PUBLIC.SALES_ANALYST" assert ( - data_platform_tables[1].datasource_server - == "ghh48144.snowflakefakecomputing.com" + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,gsl_test_db.public.sales_forecast,PROD)" ) + assert ( - data_platform_tables[1].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name + data_platform_tables[1].urn + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,gsl_test_db.public.sales_analyst,PROD)" ) @@ -574,8 +620,14 @@ def test_expression_is_none(): reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 0 @@ -589,15 +641,20 @@ def test_redshift_regular_case(): ) reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter, native_query_enabled=False + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) + assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == table.full_name.split(".")[2] - assert data_platform_tables[0].full_name == 
table.full_name assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.AMAZON_REDSHIFT.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.category,PROD)" ) @@ -609,13 +666,60 @@ def test_redshift_native_query(): ) reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + + config.native_query_parsing = True + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter, native_query_enabled=True + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) + assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == table.full_name.split(".")[2] - assert data_platform_tables[0].full_name == table.full_name assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.AMAZON_REDSHIFT.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.category,PROD)" + ) + + +def test_sqlglot_parser(): + table: powerbi_data_classes.Table = powerbi_data_classes.Table( + expression=M_QUERIES[24], + name="SALES_TARGET", + full_name="dev.public.sales", + ) + reporter = PowerBiDashboardSourceReport() + + ctx, config, platform_instance_resolver = get_default_instances( + override_config={ + "server_to_platform_instance": { + "bu10758.ap-unknown-2.fakecomputing.com": { + "platform_instance": "sales_deployment", + "env": "PROD", + } + }, + "native_query_parsing": True, + "enable_advance_lineage_sql_construct": True, + } + ) + + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, + ) + + assert len(data_platform_tables) == 2 + assert ( + data_platform_tables[0].urn + == 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,sales_deployment.operations_analytics.transformed_prod.v_sme_unit,PROD)" + ) + assert ( + data_platform_tables[1].urn + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,sales_deployment.operations_analytics.transformed_prod.v_sme_unit_targets,PROD)" ) diff --git a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_file_without_extension.json b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_file_without_extension.json index d042c3fbb158b..63efc79941d82 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_file_without_extension.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_file_without_extension.json @@ -1,4 +1,20 @@ [ +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/no_extension/small,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:file", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:file,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "file_without_extension.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/no_extension/small,DEV)", diff --git a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_files.json b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_files.json index 8e4fcb80ff855..ceec764bfbc86 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_files.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_files.json @@ -1,4 +1,20 @@ [ +{ + 
"entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/NPS.7.1.package_data_NPS.6.1_ARCN_Lakes_ChemistryData_v1_csv.csv,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:file", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:file,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/NPS.7.1.package_data_NPS.6.1_ARCN_Lakes_ChemistryData_v1_csv.csv,DEV)", @@ -2740,6 +2756,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:file", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:file,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro,DEV)", @@ -3277,6 +3309,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_csv.csv,DEV)", + "changeType": "UPSERT", + "aspectName": 
"dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:file", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:file,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_csv.csv,DEV)", @@ -3852,6 +3900,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/countries_json.json,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:file", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:file,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/countries_json.json,DEV)", @@ -4178,6 +4242,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/food_parquet.parquet,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:file", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:file,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/food_parquet.parquet,DEV)", @@ -4571,6 +4651,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/small.csv,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:file", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:file,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/small.csv,DEV)", @@ -7590,6 +7686,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/wa_fn_usec_hr_employee_attrition_csv.csv,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:file", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:file,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/NPS.7.1.package_data_NPS.6.1_ARCN_Lakes_ChemistryData_v1_csv.csv,DEV)", diff --git a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_file_without_extension.json 
b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_file_without_extension.json index 58b81065c190f..d50f00efacaa0 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_file_without_extension.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_file_without_extension.json @@ -1,4 +1,20 @@ [ +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/no_extension/small,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:s3", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:s3,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "file_without_extension.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/no_extension/small,DEV)", diff --git a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_files.json b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_files.json index 0c1d92ed58e3d..36d3ba1b3510d 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_files.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_files.json @@ -1,4 +1,20 @@ [ +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/NPS.7.1.package_data_NPS.6.1_ARCN_Lakes_ChemistryData_v1_csv.csv,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:s3", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:s3,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 
1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/NPS.7.1.package_data_NPS.6.1_ARCN_Lakes_ChemistryData_v1_csv.csv,DEV)", @@ -945,6 +961,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:s3", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:s3,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro,DEV)", @@ -1110,6 +1142,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/chord_progressions_csv.csv,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:s3", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:s3,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/chord_progressions_csv.csv,DEV)", @@ -1319,6 +1367,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/countries_json.json,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:s3", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:s3,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/countries_json.json,DEV)", @@ -1482,6 +1546,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/food_parquet.parquet,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:s3", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:s3,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/food_parquet.parquet,DEV)", @@ -1647,6 +1727,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/small.csv,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:s3", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:s3,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/small.csv,DEV)", @@ -2282,6 +2378,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/wa_fn_usec_hr_employee_attrition_csv.csv,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:s3", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:s3,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/wa_fn_usec_hr_employee_attrition_csv.csv,DEV)", diff --git a/metadata-ingestion/tests/integration/snowflake/common.py b/metadata-ingestion/tests/integration/snowflake/common.py index b08a14d0805c6..43f5e04fbc89f 100644 --- a/metadata-ingestion/tests/integration/snowflake/common.py +++ b/metadata-ingestion/tests/integration/snowflake/common.py @@ -144,7 +144,7 @@ def default_query_results( # noqa: C901 } ] elif query == snowflake_query.SnowflakeQuery.operational_data_for_time_window( - 1654499820000, + 1654473600000, 1654586220000, ): return [ @@ -257,7 +257,7 @@ def default_query_results( # noqa: C901 elif ( query == snowflake_query.SnowflakeQuery.usage_per_object_per_time_bucket_for_time_window( - 1654499820000, + 1654473600000, 1654586220000, use_base_objects=False, top_n_queries=10, @@ -268,11 +268,11 @@ def default_query_results( # noqa: C901 return [] elif query in ( snowflake_query.SnowflakeQuery.table_to_table_lineage_history( - 1654499820000, + 1654473600000, 1654586220000, ), snowflake_query.SnowflakeQuery.table_to_table_lineage_history( - 1654499820000, 1654586220000, False + 1654473600000, 1654586220000, 
False ), ): return [ @@ -331,7 +331,7 @@ def default_query_results( # noqa: C901 ] elif query in ( snowflake_query.SnowflakeQuery.table_to_table_lineage_history_v2( - start_time_millis=1654499820000, + start_time_millis=1654473600000, end_time_millis=1654586220000, include_view_lineage=True, include_column_lineage=True, @@ -403,7 +403,7 @@ def default_query_results( # noqa: C901 ] elif query in ( snowflake_query.SnowflakeQuery.table_to_table_lineage_history_v2( - start_time_millis=1654499820000, + start_time_millis=1654473600000, end_time_millis=1654586220000, include_view_lineage=False, include_column_lineage=False, @@ -435,7 +435,7 @@ def default_query_results( # noqa: C901 for op_idx in range(1, num_ops + 1) ] elif query == snowflake_query.SnowflakeQuery.external_table_lineage_history( - 1654499820000, + 1654473600000, 1654586220000, ): return [] @@ -470,11 +470,11 @@ def default_query_results( # noqa: C901 ] elif query in [ snowflake_query.SnowflakeQuery.view_lineage_history( - 1654499820000, + 1654473600000, 1654586220000, ), snowflake_query.SnowflakeQuery.view_lineage_history( - 1654499820000, 1654586220000, False + 1654473600000, 1654586220000, False ), ]: return [ @@ -510,7 +510,7 @@ def default_query_results( # noqa: C901 ] elif query in [ snowflake_query.SnowflakeQuery.external_table_lineage_history( - 1654499820000, + 1654473600000, 1654586220000, ), snowflake_query.SnowflakeQuery.view_dependencies_v2(), diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py index ed3bea49f0179..73a261bb3cb6e 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py @@ -228,7 +228,7 @@ def test_snowflake_missing_snowflake_lineage_permission_causes_pipeline_failure( default_query_results, [ snowflake_query.SnowflakeQuery.table_to_table_lineage_history_v2( - 
start_time_millis=1654499820000, + start_time_millis=1654473600000, end_time_millis=1654586220000, include_view_lineage=False, include_column_lineage=True, diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures_legacy_lineage.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures_legacy_lineage.py index 18779bd564f0d..a5993793e574d 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures_legacy_lineage.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures_legacy_lineage.py @@ -228,7 +228,7 @@ def test_snowflake_missing_snowflake_lineage_permission_causes_pipeline_failure( default_query_results, [ snowflake_query.SnowflakeQuery.table_to_table_lineage_history( - 1654499820000, 1654586220000, True + 1654473600000, 1654586220000, True ), ], "Database 'SNOWFLAKE' does not exist or not authorized.", diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index d04c8d905b439..71428a7847953 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -791,11 +791,9 @@ def test_tableau_unsupported_csql(mock_datahub_graph): database_override_map={"production database": "prod"} ) - with mock.patch( - "datahub.ingestion.source.tableau.sqlglot_lineage" - ) as sqlglot_lineage: + with mock.patch("datahub.ingestion.source.tableau.sqlglot_l") as sqlglot_lineage: - sqlglot_lineage.return_value = SqlParsingResult( # type:ignore + sqlglot_lineage.create_lineage_sql_parsed_result.return_value = SqlParsingResult( # type:ignore in_tables=[ "urn:li:dataset:(urn:li:dataPlatform:bigquery,my_bigquery_project.invent_dw.userdetail,PROD)" ], diff --git a/metadata-ingestion/tests/integration/vertica/docker-compose.yml b/metadata-ingestion/tests/integration/vertica/docker-compose.yml index 
ddaf206f236cf..84af5c32a60e3 100644 --- a/metadata-ingestion/tests/integration/vertica/docker-compose.yml +++ b/metadata-ingestion/tests/integration/vertica/docker-compose.yml @@ -1,6 +1,7 @@ version: "3.9" services: vertica: + platform: linux/amd64 environment: APP_DB_USER: "dbadmin" APP_DB_PASSWORD: "abc123" @@ -18,6 +19,3 @@ services: volumes: vertica-data: - - - diff --git a/metadata-ingestion/tests/integration/vertica/test_vertica.py b/metadata-ingestion/tests/integration/vertica/test_vertica.py index db8bfd247313b..fe306d1d0b2b8 100644 --- a/metadata-ingestion/tests/integration/vertica/test_vertica.py +++ b/metadata-ingestion/tests/integration/vertica/test_vertica.py @@ -58,6 +58,7 @@ def vertica_runner(docker_compose_runner, test_resources_dir): # Test needs more work to be done , currently it is working fine. @freeze_time(FROZEN_TIME) +@pytest.mark.skip("Failing in CI, cmd failing with exit code 1") @pytest.mark.integration def test_vertica_ingest_with_db(vertica_runner, pytestconfig, tmp_path): test_resources_dir = pytestconfig.rootpath / "tests/integration/vertica" diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_checkpoint.py b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_checkpoint.py index c691711890aff..51e2b0795819a 100644 --- a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_checkpoint.py +++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_checkpoint.py @@ -27,7 +27,7 @@ def _assert_checkpoint_deserialization( ) -> Checkpoint: # Serialize a checkpoint aspect with the previous state. 
checkpoint_aspect = DatahubIngestionCheckpointClass( - timestampMillis=int(datetime.utcnow().timestamp() * 1000), + timestampMillis=int(datetime.now().timestamp() * 1000), pipelineName=test_pipeline_name, platformInstanceId="this-can-be-anything-and-will-be-ignored", config="this-is-also-ignored", diff --git a/metadata-ingestion/tests/unit/test_base_usage_config.py b/metadata-ingestion/tests/unit/test_base_usage_config.py new file mode 100644 index 0000000000000..008dcf25e38e4 --- /dev/null +++ b/metadata-ingestion/tests/unit/test_base_usage_config.py @@ -0,0 +1,34 @@ +from datetime import datetime, timezone + +from freezegun import freeze_time + +from datahub.ingestion.source.usage.usage_common import BaseUsageConfig + +FROZEN_TIME = "2023-08-03 09:00:00" +FROZEN_TIME2 = "2023-08-03 09:10:00" + + +@freeze_time(FROZEN_TIME) +def test_relative_start_time_aligns_with_bucket_start_time(): + config = BaseUsageConfig.parse_obj( + {"start_time": "-2 days", "end_time": "2023-07-07T09:00:00Z"} + ) + assert config.start_time == datetime(2023, 7, 5, 0, tzinfo=timezone.utc) + assert config.end_time == datetime(2023, 7, 7, 9, tzinfo=timezone.utc) + + config = BaseUsageConfig.parse_obj( + {"start_time": "-2 days", "end_time": "2023-07-07T09:00:00Z"} + ) + assert config.start_time == datetime(2023, 7, 5, 0, tzinfo=timezone.utc) + assert config.end_time == datetime(2023, 7, 7, 9, tzinfo=timezone.utc) + + +@freeze_time(FROZEN_TIME) +def test_absolute_start_time_aligns_with_bucket_start_time(): + config = BaseUsageConfig.parse_obj({"start_time": "2023-07-01T00:00:00Z"}) + assert config.start_time == datetime(2023, 7, 1, 0, tzinfo=timezone.utc) + assert config.end_time == datetime(2023, 8, 3, 9, tzinfo=timezone.utc) + + config = BaseUsageConfig.parse_obj({"start_time": "2023-07-01T09:00:00Z"}) + assert config.start_time == datetime(2023, 7, 1, 0, tzinfo=timezone.utc) + assert config.end_time == datetime(2023, 8, 3, 9, tzinfo=timezone.utc) diff --git 
a/metadata-ingestion/tests/unit/test_bigquery_profiler.py b/metadata-ingestion/tests/unit/test_bigquery_profiler.py index a2aec8df93d09..44ce5f0a02e37 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_profiler.py +++ b/metadata-ingestion/tests/unit/test_bigquery_profiler.py @@ -37,6 +37,7 @@ def test_generate_day_partitioned_partition_profiler_query(): ordinal_position=1, data_type="TIMESTAMP", is_partition_column=True, + cluster_column_position=None, comment=None, is_nullable=False, ) @@ -79,6 +80,7 @@ def test_generate_day_partitioned_partition_profiler_query_with_set_partition_ti ordinal_position=1, data_type="TIMESTAMP", is_partition_column=True, + cluster_column_position=None, comment=None, is_nullable=False, ) @@ -120,6 +122,7 @@ def test_generate_hour_partitioned_partition_profiler_query(): ordinal_position=1, data_type="TIMESTAMP", is_partition_column=True, + cluster_column_position=None, comment=None, is_nullable=False, ) diff --git a/metadata-ingestion/tests/unit/test_confluent_schema_registry.py b/metadata-ingestion/tests/unit/test_confluent_schema_registry.py index a71e07b68d898..b047cd16c52a9 100644 --- a/metadata-ingestion/tests/unit/test_confluent_schema_registry.py +++ b/metadata-ingestion/tests/unit/test_confluent_schema_registry.py @@ -4,6 +4,7 @@ from confluent_kafka.schema_registry.schema_registry_client import ( RegisteredSchema, Schema, + SchemaReference, ) from datahub.ingestion.source.confluent_schema_registry import ConfluentSchemaRegistry @@ -90,7 +91,9 @@ def new_get_latest_version(subject_name: str) -> RegisteredSchema: schema_str=schema_str_orig, schema_type="AVRO", references=[ - dict(name="TestTopic1", subject="schema_subject_1", version=1) + SchemaReference( + name="TestTopic1", subject="schema_subject_1", version=1 + ) ], ) ) @@ -109,7 +112,9 @@ def new_get_latest_version(subject_name: str) -> RegisteredSchema: schema_str=schema_str_orig, schema_type="AVRO", references=[ - dict(name="schema_subject_1", subject="TestTopic1", 
version=1) + SchemaReference( + name="schema_subject_1", subject="TestTopic1", version=1 + ) ], ) ) diff --git a/metadata-ingestion/tests/unit/test_sql_common.py b/metadata-ingestion/tests/unit/test_sql_common.py index db14b8f6de738..95af0e623e991 100644 --- a/metadata-ingestion/tests/unit/test_sql_common.py +++ b/metadata-ingestion/tests/unit/test_sql_common.py @@ -9,10 +9,10 @@ SQLAlchemySource, get_platform_from_sqlalchemy_uri, ) -from datahub.ingestion.source.sql.sql_config import SQLAlchemyConfig +from datahub.ingestion.source.sql.sql_config import SQLCommonConfig -class _TestSQLAlchemyConfig(SQLAlchemyConfig): +class _TestSQLAlchemyConfig(SQLCommonConfig): def get_sql_alchemy_url(self): pass @@ -22,7 +22,7 @@ class _TestSQLAlchemySource(SQLAlchemySource): def test_generate_foreign_key(): - config: SQLAlchemyConfig = _TestSQLAlchemyConfig() + config: SQLCommonConfig = _TestSQLAlchemyConfig() ctx: PipelineContext = PipelineContext(run_id="test_ctx") platform: str = "TEST" inspector: Inspector = Mock() @@ -49,7 +49,7 @@ def test_generate_foreign_key(): def test_use_source_schema_for_foreign_key_if_not_specified(): - config: SQLAlchemyConfig = _TestSQLAlchemyConfig() + config: SQLCommonConfig = _TestSQLAlchemyConfig() ctx: PipelineContext = PipelineContext(run_id="test_ctx") platform: str = "TEST" inspector: Inspector = Mock() diff --git a/metadata-ingestion/tests/unit/test_time_window_config.py b/metadata-ingestion/tests/unit/test_time_window_config.py new file mode 100644 index 0000000000000..127dc179c21e7 --- /dev/null +++ b/metadata-ingestion/tests/unit/test_time_window_config.py @@ -0,0 +1,80 @@ +from datetime import datetime, timezone + +import pytest +from freezegun import freeze_time + +from datahub.configuration.time_window_config import BaseTimeWindowConfig + +FROZEN_TIME = "2023-08-03 09:00:00" +FROZEN_TIME2 = "2023-08-03 09:10:00" + + +@freeze_time(FROZEN_TIME) +def test_default_start_end_time(): + config = BaseTimeWindowConfig.parse_obj({}) + assert 
config.start_time == datetime(2023, 8, 2, 0, tzinfo=timezone.utc) + assert config.end_time == datetime(2023, 8, 3, 9, tzinfo=timezone.utc) + + +@freeze_time(FROZEN_TIME2) +def test_default_start_end_time_hour_bucket_duration(): + config = BaseTimeWindowConfig.parse_obj({"bucket_duration": "HOUR"}) + assert config.start_time == datetime(2023, 8, 3, 8, tzinfo=timezone.utc) + assert config.end_time == datetime(2023, 8, 3, 9, 10, tzinfo=timezone.utc) + + +@freeze_time(FROZEN_TIME) +def test_relative_start_time(): + config = BaseTimeWindowConfig.parse_obj({"start_time": "-2 days"}) + assert config.start_time == datetime(2023, 8, 1, 9, tzinfo=timezone.utc) + assert config.end_time == datetime(2023, 8, 3, 9, tzinfo=timezone.utc) + + config = BaseTimeWindowConfig.parse_obj({"start_time": "-2d"}) + assert config.start_time == datetime(2023, 8, 1, 9, tzinfo=timezone.utc) + assert config.end_time == datetime(2023, 8, 3, 9, tzinfo=timezone.utc) + + config = BaseTimeWindowConfig.parse_obj( + {"start_time": "-2 days", "end_time": "2023-07-07T09:00:00Z"} + ) + assert config.start_time == datetime(2023, 7, 5, 9, tzinfo=timezone.utc) + assert config.end_time == datetime(2023, 7, 7, 9, tzinfo=timezone.utc) + + config = BaseTimeWindowConfig.parse_obj( + {"start_time": "-2 days", "end_time": "2023-07-07T09:00:00Z"} + ) + assert config.start_time == datetime(2023, 7, 5, 9, tzinfo=timezone.utc) + assert config.end_time == datetime(2023, 7, 7, 9, tzinfo=timezone.utc) + + +@freeze_time(FROZEN_TIME) +def test_absolute_start_time(): + config = BaseTimeWindowConfig.parse_obj({"start_time": "2023-07-01T00:00:00Z"}) + assert config.start_time == datetime(2023, 7, 1, 0, tzinfo=timezone.utc) + assert config.end_time == datetime(2023, 8, 3, 9, tzinfo=timezone.utc) + + config = BaseTimeWindowConfig.parse_obj({"start_time": "2023-07-01T09:00:00Z"}) + assert config.start_time == datetime(2023, 7, 1, 9, tzinfo=timezone.utc) + assert config.end_time == datetime(2023, 8, 3, 9, tzinfo=timezone.utc) + + 
+@freeze_time(FROZEN_TIME) +def test_invalid_relative_start_time(): + with pytest.raises(ValueError, match="Unknown string format"): + BaseTimeWindowConfig.parse_obj({"start_time": "-2 das"}) + + with pytest.raises( + ValueError, + match="Relative start time should be in terms of configured bucket duration", + ): + BaseTimeWindowConfig.parse_obj({"start_time": "-2"}) + + with pytest.raises( + ValueError, match="Relative start time should start with minus sign" + ): + BaseTimeWindowConfig.parse_obj({"start_time": "2d"}) + + with pytest.raises( + ValueError, + match="Relative start time should be in terms of configured bucket duration", + ): + BaseTimeWindowConfig.parse_obj({"start_time": "-2m"}) diff --git a/metadata-io/build.gradle b/metadata-io/build.gradle index ae56cd4cb8a96..507351f933cf0 100644 --- a/metadata-io/build.gradle +++ b/metadata-io/build.gradle @@ -88,6 +88,9 @@ dependencies { implementation(externalDependency.jettison) { because("previous versions are vulnerable") } + implementation(externalDependency.snappy) { + because("previous versions are vulnerable to CVE-2023-34453 through CVE-2023-34455") + } } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java index 555acb2ffdd3b..efa4e0c279a76 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java @@ -42,6 +42,9 @@ public static Map getPartialNgramConfigWithOverrides(Map getMappingsForField(@Nonnull final Searchable mappingForField.put(NORMALIZER, KEYWORD_NORMALIZER); // Add keyword subfield without lowercase filter mappingForField.put(FIELDS, ImmutableMap.of(KEYWORD, KEYWORD_TYPE_MAP)); - } else if (fieldType == FieldType.TEXT || fieldType == FieldType.TEXT_PARTIAL) { 
+ } else if (fieldType == FieldType.TEXT || fieldType == FieldType.TEXT_PARTIAL || fieldType == FieldType.WORD_GRAM) { mappingForField.put(TYPE, KEYWORD); mappingForField.put(NORMALIZER, KEYWORD_NORMALIZER); Map subFields = new HashMap<>(); - if (fieldType == FieldType.TEXT_PARTIAL) { + if (fieldType == FieldType.TEXT_PARTIAL || fieldType == FieldType.WORD_GRAM) { subFields.put(NGRAM, getPartialNgramConfigWithOverrides( ImmutableMap.of( ANALYZER, PARTIAL_ANALYZER ) )); + if (fieldType == FieldType.WORD_GRAM) { + for (Map.Entry entry : Map.of( + WORD_GRAMS_LENGTH_2, WORD_GRAM_2_ANALYZER, + WORD_GRAMS_LENGTH_3, WORD_GRAM_3_ANALYZER, + WORD_GRAMS_LENGTH_4, WORD_GRAM_4_ANALYZER).entrySet()) { + String fieldName = entry.getKey(); + String analyzerName = entry.getValue(); + subFields.put(fieldName, ImmutableMap.of( + TYPE, TEXT, + ANALYZER, analyzerName, + SEARCH_ANALYZER, analyzerName + )); + } + } } subFields.put(DELIMITED, ImmutableMap.of( TYPE, TEXT, diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java index 5b3e396837aa7..e180c8296b48d 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java @@ -66,6 +66,9 @@ public class SettingsBuilder { public static final String KEYWORD_ANALYZER = "keyword"; public static final String URN_ANALYZER = "urn_component"; public static final String URN_SEARCH_ANALYZER = "query_urn_component"; + public static final String WORD_GRAM_2_ANALYZER = "word_gram_2"; + public static final String WORD_GRAM_3_ANALYZER = "word_gram_3"; + public static final String WORD_GRAM_4_ANALYZER = "word_gram_4"; // Filters public static final String ALPHANUM_SPACE_ONLY = "alpha_num_space"; @@ -80,6 +83,10 @@ public class 
SettingsBuilder { public static final String MULTIFILTER = "multifilter"; public static final String MULTIFILTER_GRAPH = "multifilter_graph"; public static final String PARTIAL_URN_COMPONENT = "partial_urn_component"; + public static final String SHINGLE = "shingle"; + public static final String WORD_GRAM_2_FILTER = "word_gram_2_filter"; + public static final String WORD_GRAM_3_FILTER = "word_gram_3_filter"; + public static final String WORD_GRAM_4_FILTER = "word_gram_4_filter"; public static final String SNOWBALL = "snowball"; public static final String STEM_OVERRIDE = "stem_override"; public static final String STOP = "stop"; @@ -108,6 +115,7 @@ public class SettingsBuilder { public static final String SLASH_TOKENIZER = "slash_tokenizer"; public static final String UNIT_SEPARATOR_PATH_TOKENIZER = "unit_separator_path_tokenizer"; public static final String UNIT_SEPARATOR_TOKENIZER = "unit_separator_tokenizer"; + public static final String WORD_GRAM_TOKENIZER = "word_gram_tokenizer"; // Do not remove the space, needed for multi-term synonyms public static final List ALPHANUM_SPACE_PATTERNS = ImmutableList.of( "([a-z0-9 _-]{2,})", @@ -161,6 +169,13 @@ public class SettingsBuilder { AUTOCOMPLETE_CUSTOM_DELIMITER, LOWERCASE); + public static final List WORD_GRAM_TOKEN_FILTERS = ImmutableList.of( + ASCII_FOLDING, + LOWERCASE, + TRIM, + REMOVE_QUOTES + ); + public final Map settings; public SettingsBuilder(String mainTokenizer) { @@ -275,6 +290,17 @@ private static Map buildFilters() throws IOException { .collect(Collectors.toList())) .build()); } + + for (Map.Entry entry : Map.of(WORD_GRAM_2_FILTER, 2, WORD_GRAM_3_FILTER, 3, WORD_GRAM_4_FILTER, 4).entrySet()) { + String filterName = entry.getKey(); + Integer gramSize = entry.getValue(); + filters.put(filterName, ImmutableMap.builder() + .put(TYPE, SHINGLE) + .put("min_shingle_size", gramSize) + .put("max_shingle_size", gramSize) + .put("output_unigrams", false) + .build()); + } } return filters.build(); @@ -302,13 
+328,24 @@ private static Map buildTokenizers() { .put(DELIMITER, "␟") .build()); - // Tokenize by whitespace and most special chars + // Tokenize by most special chars + // Do NOT tokenize by whitespace to keep multi-word synonyms in the same token + // The split by whitespace is done later in the token filters phase tokenizers.put(MAIN_TOKENIZER, ImmutableMap.builder() .put(TYPE, PATTERN) .put(PATTERN, "[(),./:]") .build()); + // Tokenize by whitespace and most special chars for wordgrams + // only split on - when not preceded by a whitespace to preserve exclusion functionality + // i.e. "logging-events-bkcp" and "logging-events -bckp" should be handled differently + tokenizers.put(WORD_GRAM_TOKENIZER, + ImmutableMap.builder() + .put(TYPE, PATTERN) + .put(PATTERN, "[(),./:\\s_]|(?<=\\S)(-)") + .build()); + return tokenizers.build(); } @@ -382,6 +419,21 @@ private static Map buildAnalyzers(String mainTokenizer) { .put(FILTER, SEARCH_TOKEN_FILTERS) .build()); + // Support word grams + for (Map.Entry entry : Map.of( + WORD_GRAM_2_ANALYZER, WORD_GRAM_2_FILTER, + WORD_GRAM_3_ANALYZER, WORD_GRAM_3_FILTER, + WORD_GRAM_4_ANALYZER, WORD_GRAM_4_FILTER).entrySet()) { + String analyzerName = entry.getKey(); + String filterName = entry.getValue(); + analyzers.put(analyzerName, ImmutableMap.builder() + .put(TOKENIZER, WORD_GRAM_TOKENIZER) + .put(FILTER, ImmutableList.builder() + .addAll(WORD_GRAM_TOKEN_FILTERS) + .add(filterName).build()) + .build()); + } + // For special analysis, the substitution can be read from the configuration (chinese tokenizer: ik_smart / smartCN) // Analyzer for partial matching (i.e. 
autocomplete) - Prefix matching of each token analyzers.put(PARTIAL_ANALYZER, ImmutableMap.builder() @@ -395,6 +447,7 @@ private static Map buildAnalyzers(String mainTokenizer) { .put(FILTER, PARTIAL_AUTOCOMPLETE_TOKEN_FILTERS) .build()); + return analyzers.build(); } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchFieldConfig.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchFieldConfig.java index fb7e19a5d67bc..a75ed40ffca52 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchFieldConfig.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchFieldConfig.java @@ -11,11 +11,8 @@ import java.util.Set; -import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.BROWSE_PATH_HIERARCHY_ANALYZER; -import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.BROWSE_PATH_V2_HIERARCHY_ANALYZER; -import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.KEYWORD_ANALYZER; -import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.TEXT_SEARCH_ANALYZER; -import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.URN_SEARCH_ANALYZER; +import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.*; + @Builder @Getter @@ -33,7 +30,8 @@ public class SearchFieldConfig { private static final Set TYPES_WITH_DELIMITED_SUBFIELD = Set.of( SearchableAnnotation.FieldType.TEXT, - SearchableAnnotation.FieldType.TEXT_PARTIAL + SearchableAnnotation.FieldType.TEXT_PARTIAL, + SearchableAnnotation.FieldType.WORD_GRAM // NOT URN_PARTIAL (urn field is special) ); // NOT comprehensive @@ -56,6 +54,7 @@ public class SearchFieldConfig { SearchableAnnotation.FieldType.TEXT, SearchableAnnotation.FieldType.TEXT_PARTIAL, 
SearchableAnnotation.FieldType.KEYWORD, + SearchableAnnotation.FieldType.WORD_GRAM, // not analyzed SearchableAnnotation.FieldType.BOOLEAN, SearchableAnnotation.FieldType.COUNT, @@ -69,6 +68,11 @@ public class SearchFieldConfig { SearchableAnnotation.FieldType.URN_PARTIAL ); + public static final Set TYPES_WITH_WORD_GRAM = + Set.of( + SearchableAnnotation.FieldType.WORD_GRAM + ); + @Nonnull private final String fieldName; @Nonnull @@ -78,9 +82,11 @@ public class SearchFieldConfig { private final String analyzer; private boolean hasKeywordSubfield; private boolean hasDelimitedSubfield; + private boolean hasWordGramSubfields; private boolean isQueryByDefault; private boolean isDelimitedSubfield; private boolean isKeywordSubfield; + private boolean isWordGramSubfield; public static SearchFieldConfig detectSubFieldType(@Nonnull SearchableFieldSpec fieldSpec) { final SearchableAnnotation searchableAnnotation = fieldSpec.getSearchableAnnotation(); @@ -106,6 +112,7 @@ public static SearchFieldConfig detectSubFieldType(String fieldName, .analyzer(getAnalyzer(fieldName, fieldType)) .hasKeywordSubfield(hasKeywordSubfield(fieldName, fieldType)) .hasDelimitedSubfield(hasDelimitedSubfield(fieldName, fieldType)) + .hasWordGramSubfields(hasWordGramSubfields(fieldName, fieldType)) .isQueryByDefault(isQueryByDefault) .build(); } @@ -118,6 +125,11 @@ private static boolean hasDelimitedSubfield(String fieldName, SearchableAnnotati return !fieldName.contains(".") && ("urn".equals(fieldName) || TYPES_WITH_DELIMITED_SUBFIELD.contains(fieldType)); } + + private static boolean hasWordGramSubfields(String fieldName, SearchableAnnotation.FieldType fieldType) { + return !fieldName.contains(".") + && (TYPES_WITH_WORD_GRAM.contains(fieldType)); + } private static boolean hasKeywordSubfield(String fieldName, SearchableAnnotation.FieldType fieldType) { return !"urn".equals(fieldName) && !fieldName.contains(".") @@ -155,6 +167,7 @@ public SearchFieldConfigBuilder fieldName(@Nonnull String 
fieldName) { this.fieldName = fieldName; isDelimitedSubfield(fieldName.endsWith(".delimited")); isKeywordSubfield(fieldName.endsWith(".keyword")); + isWordGramSubfield(fieldName.contains("wordGrams")); shortName(fieldName.split("[.]")[0]); return this; } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java index 289c6f1f84e32..49fc882314e0a 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java @@ -3,6 +3,7 @@ import com.linkedin.metadata.config.search.ExactMatchConfiguration; import com.linkedin.metadata.config.search.PartialConfiguration; import com.linkedin.metadata.config.search.SearchConfiguration; +import com.linkedin.metadata.config.search.WordGramConfiguration; import com.linkedin.metadata.config.search.custom.BoolQueryConfiguration; import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration; import com.linkedin.metadata.config.search.custom.QueryConfiguration; @@ -51,6 +52,9 @@ import org.elasticsearch.search.SearchModule; import static com.linkedin.metadata.models.SearchableFieldSpecExtractor.PRIMARY_URN_SEARCH_PROPERTIES; +import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.*; +import static com.linkedin.metadata.search.elasticsearch.query.request.SearchFieldConfig.*; + @Slf4j public class SearchQueryBuilder { @@ -69,6 +73,7 @@ public class SearchQueryBuilder { public static final String STRUCTURED_QUERY_PREFIX = "\\\\/q "; private final ExactMatchConfiguration exactMatchConfiguration; private final PartialConfiguration partialConfiguration; + private final WordGramConfiguration wordGramConfiguration; private final CustomizedQueryHandler 
customizedQueryHandler; @@ -76,6 +81,7 @@ public SearchQueryBuilder(@Nonnull SearchConfiguration searchConfiguration, @Nullable CustomSearchConfiguration customSearchConfiguration) { this.exactMatchConfiguration = searchConfiguration.getExactMatch(); this.partialConfiguration = searchConfiguration.getPartial(); + this.wordGramConfiguration = searchConfiguration.getWordGram(); this.customizedQueryHandler = CustomizedQueryHandler.builder(customSearchConfiguration).build(); } @@ -148,6 +154,36 @@ private Set getStandardFields(@Nonnull EntitySpec entitySpec) fields.add(SearchFieldConfig.detectSubFieldType(searchFieldConfig.fieldName() + ".delimited", searchFieldConfig.boost() * partialConfiguration.getFactor(), searchableAnnotation.getFieldType(), searchableAnnotation.isQueryByDefault())); + + if (SearchFieldConfig.detectSubFieldType(fieldSpec).hasWordGramSubfields()) { + fields.add(SearchFieldConfig.builder() + .fieldName(searchFieldConfig.fieldName() + ".wordGrams2") + .boost(searchFieldConfig.boost() * wordGramConfiguration.getTwoGramFactor()) + .analyzer(WORD_GRAM_2_ANALYZER) + .hasKeywordSubfield(true) + .hasDelimitedSubfield(true) + .hasWordGramSubfields(true) + .isQueryByDefault(true) + .build()); + fields.add(SearchFieldConfig.builder() + .fieldName(searchFieldConfig.fieldName() + ".wordGrams3") + .boost(searchFieldConfig.boost() * wordGramConfiguration.getThreeGramFactor()) + .analyzer(WORD_GRAM_3_ANALYZER) + .hasKeywordSubfield(true) + .hasDelimitedSubfield(true) + .hasWordGramSubfields(true) + .isQueryByDefault(true) + .build()); + fields.add(SearchFieldConfig.builder() + .fieldName(searchFieldConfig.fieldName() + ".wordGrams4") + .boost(searchFieldConfig.boost() * wordGramConfiguration.getFourGramFactor()) + .analyzer(WORD_GRAM_4_ANALYZER) + .hasKeywordSubfield(true) + .hasDelimitedSubfield(true) + .hasWordGramSubfields(true) + .isQueryByDefault(true) + .build()); + } } } @@ -188,7 +224,7 @@ private Optional getSimpleQuery(@Nullable QueryConfiguration custo 
.filter(SearchFieldConfig::isQueryByDefault) .collect(Collectors.groupingBy(SearchFieldConfig::analyzer)); - analyzerGroup.keySet().stream().sorted().forEach(analyzer -> { + analyzerGroup.keySet().stream().sorted().filter(str -> !str.contains("word_gram")).forEach(analyzer -> { List fieldConfigs = analyzerGroup.get(analyzer); SimpleQueryStringBuilder simpleBuilder = QueryBuilders.simpleQueryStringQuery(sanitizedQuery); simpleBuilder.analyzer(analyzer); @@ -253,6 +289,13 @@ private Optional getPrefixAndExactMatchQuery(@Nullable QueryConfig * exactMatchConfiguration.getCaseSensitivityFactor()) .queryName(searchFieldConfig.fieldName())); } + + if (searchFieldConfig.isWordGramSubfield() && isPrefixQuery) { + finalQuery.should(QueryBuilders + .matchPhraseQuery(ESUtils.toKeywordField(searchFieldConfig.fieldName(), false), unquotedQuery) + .boost(searchFieldConfig.boost() * getWordGramFactor(searchFieldConfig.fieldName())) + .queryName(searchFieldConfig.shortName())); + } }); return finalQuery.should().size() > 0 ? 
Optional.of(finalQuery) : Optional.empty(); @@ -377,4 +420,15 @@ private FunctionScoreQueryBuilder toFunctionScoreQueryBuilder(QueryBuilder query throw new RuntimeException(e); } } + + public float getWordGramFactor(String fieldName) { /* picks the configured factor by the Grams2/Grams3/Grams4 suffix of fieldName; any other name throws IllegalArgumentException */ + if (fieldName.endsWith("Grams2")) { + return wordGramConfiguration.getTwoGramFactor(); + } else if (fieldName.endsWith("Grams3")) { + return wordGramConfiguration.getThreeGramFactor(); + } else if (fieldName.endsWith("Grams4")) { + return wordGramConfiguration.getFourGramFactor(); + } + throw new IllegalArgumentException(fieldName + " does not end with Grams[2-4]"); + } } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/ESSampleDataFixture.java b/metadata-io/src/test/java/com/linkedin/metadata/ESSampleDataFixture.java index 847029bc180eb..20501225ef787 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/ESSampleDataFixture.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/ESSampleDataFixture.java @@ -54,6 +54,13 @@ @TestConfiguration @Import(ESTestConfiguration.class) public class ESSampleDataFixture { + /** + * Interested in adding more fixtures? Here's what you will need to update: + * 1. Create a new indexPrefix and FixtureName. Both are needed or else all fixtures will load on top of each other, + * overwriting each other + * 2. Create a new IndexConvention, IndexBuilder, and EntityClient. These are needed + * to index a different set of entities. 
+ */ @Autowired private ESBulkProcessor _bulkProcessor; @@ -61,6 +68,9 @@ public class ESSampleDataFixture { @Autowired private RestHighLevelClient _searchClient; + @Autowired + private RestHighLevelClient _longTailSearchClient; + @Autowired private SearchConfiguration _searchConfiguration; @@ -68,24 +78,54 @@ public class ESSampleDataFixture { private CustomSearchConfiguration _customSearchConfiguration; @Bean(name = "sampleDataPrefix") - protected String indexPrefix() { + protected String sampleDataPrefix() { return "smpldat"; } + @Bean(name = "longTailPrefix") + protected String longTailIndexPrefix() { + return "lngtl"; + } + @Bean(name = "sampleDataIndexConvention") protected IndexConvention indexConvention(@Qualifier("sampleDataPrefix") String prefix) { return new IndexConventionImpl(prefix); } + @Bean(name = "longTailIndexConvention") + protected IndexConvention longTailIndexConvention(@Qualifier("longTailPrefix") String prefix) { + return new IndexConventionImpl(prefix); + } + @Bean(name = "sampleDataFixtureName") - protected String fixtureName() { + protected String sampleDataFixtureName() { return "sample_data"; } + @Bean(name = "longTailFixtureName") + protected String longTailFixtureName() { + return "long_tail"; + } + @Bean(name = "sampleDataEntityIndexBuilders") protected EntityIndexBuilders entityIndexBuilders( @Qualifier("entityRegistry") EntityRegistry entityRegistry, @Qualifier("sampleDataIndexConvention") IndexConvention indexConvention + ) { + return entityIndexBuildersHelper(entityRegistry, indexConvention); + } + + @Bean(name = "longTailEntityIndexBuilders") + protected EntityIndexBuilders longTailEntityIndexBuilders( + @Qualifier("longTailEntityRegistry") EntityRegistry longTailEntityRegistry, + @Qualifier("longTailIndexConvention") IndexConvention indexConvention + ) { + return entityIndexBuildersHelper(longTailEntityRegistry, indexConvention); + } + + protected EntityIndexBuilders entityIndexBuildersHelper( + EntityRegistry entityRegistry, + 
IndexConvention indexConvention ) { GitVersion gitVersion = new GitVersion("0.0.0-test", "123456", Optional.empty()); ESIndexBuilder indexBuilder = new ESIndexBuilder(_searchClient, 1, 0, 1, @@ -100,6 +140,23 @@ protected ElasticSearchService entitySearchService( @Qualifier("entityRegistry") EntityRegistry entityRegistry, @Qualifier("sampleDataEntityIndexBuilders") EntityIndexBuilders indexBuilders, @Qualifier("sampleDataIndexConvention") IndexConvention indexConvention + ) throws IOException { + return entitySearchServiceHelper(entityRegistry, indexBuilders, indexConvention); + } + + @Bean(name = "longTailEntitySearchService") + protected ElasticSearchService longTailEntitySearchService( + @Qualifier("longTailEntityRegistry") EntityRegistry longTailEntityRegistry, + @Qualifier("longTailEntityIndexBuilders") EntityIndexBuilders longTailEndexBuilders, + @Qualifier("longTailIndexConvention") IndexConvention longTailIndexConvention + ) throws IOException { + return entitySearchServiceHelper(longTailEntityRegistry, longTailEndexBuilders, longTailIndexConvention); + } + + protected ElasticSearchService entitySearchServiceHelper( + EntityRegistry entityRegistry, + EntityIndexBuilders indexBuilders, + IndexConvention indexConvention ) throws IOException { CustomConfiguration customConfiguration = new CustomConfiguration(); customConfiguration.setEnabled(true); @@ -107,7 +164,7 @@ protected ElasticSearchService entitySearchService( CustomSearchConfiguration customSearchConfiguration = customConfiguration.resolve(new YAMLMapper()); ESSearchDAO searchDAO = new ESSearchDAO(entityRegistry, _searchClient, indexConvention, false, - ELASTICSEARCH_IMPLEMENTATION_ELASTICSEARCH, _searchConfiguration, customSearchConfiguration); + ELASTICSEARCH_IMPLEMENTATION_ELASTICSEARCH, _searchConfiguration, customSearchConfiguration); ESBrowseDAO browseDAO = new ESBrowseDAO(entityRegistry, _searchClient, indexConvention, _searchConfiguration, _customSearchConfiguration); ESWriteDAO writeDAO = 
new ESWriteDAO(entityRegistry, _searchClient, indexConvention, _bulkProcessor, 1); return new ElasticSearchService(indexBuilders, searchDAO, browseDAO, writeDAO); @@ -120,9 +177,30 @@ protected SearchService searchService( @Qualifier("sampleDataEntitySearchService") ElasticSearchService entitySearchService, @Qualifier("sampleDataEntityIndexBuilders") EntityIndexBuilders indexBuilders, @Qualifier("sampleDataPrefix") String prefix, - @Qualifier("sampleDataFixtureName") String fixtureName + @Qualifier("sampleDataFixtureName") String sampleDataFixtureName ) throws IOException { + return searchServiceHelper(entityRegistry, entitySearchService, indexBuilders, prefix, sampleDataFixtureName); + } + @Bean(name = "longTailSearchService") + @Nonnull + protected SearchService longTailSearchService( + @Qualifier("longTailEntityRegistry") EntityRegistry longTailEntityRegistry, + @Qualifier("longTailEntitySearchService") ElasticSearchService longTailEntitySearchService, + @Qualifier("longTailEntityIndexBuilders") EntityIndexBuilders longTailIndexBuilders, + @Qualifier("longTailPrefix") String longTailPrefix, + @Qualifier("longTailFixtureName") String longTailFixtureName + ) throws IOException { + return searchServiceHelper(longTailEntityRegistry, longTailEntitySearchService, longTailIndexBuilders, longTailPrefix, longTailFixtureName); + } + + public SearchService searchServiceHelper( + EntityRegistry entityRegistry, + ElasticSearchService entitySearchService, + EntityIndexBuilders indexBuilders, + String prefix, + String fixtureName + ) throws IOException { int batchSize = 100; SearchRanker ranker = new SimpleRanker(); CacheManager cacheManager = new ConcurrentMapCacheManager(); @@ -159,6 +237,24 @@ protected EntityClient entityClient( @Qualifier("sampleDataSearchService") SearchService searchService, @Qualifier("sampleDataEntitySearchService") ElasticSearchService entitySearchService, @Qualifier("entityRegistry") EntityRegistry entityRegistry + ) { + return 
entityClientHelper(searchService, entitySearchService, entityRegistry); + } + + @Bean(name = "longTailEntityClient") + @Nonnull + protected EntityClient longTailEntityClient( + @Qualifier("longTailSearchService") SearchService searchService, + @Qualifier("longTailEntitySearchService") ElasticSearchService entitySearchService, + @Qualifier("longTailEntityRegistry") EntityRegistry longTailEntityRegistry + ) { /* built from the long-tail search beans, not the sample-data ones */ + return entityClientHelper(searchService, entitySearchService, longTailEntityRegistry); + } + + private EntityClient entityClientHelper( + SearchService searchService, + ElasticSearchService entitySearchService, + EntityRegistry entityRegistry ) { CachingEntitySearchService cachingEntitySearchService = new CachingEntitySearchService( new ConcurrentMapCacheManager(), @@ -173,7 +269,7 @@ protected EntityClient entityClient( preProcessHooks.setUiEnabled(true); return new JavaEntityClient( new EntityServiceImpl(mockAspectDao, null, entityRegistry, true, null, - preProcessHooks), + preProcessHooks), null, entitySearchService, cachingEntitySearchService, diff --git a/metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java b/metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java index 0d7ac506599af..673474c96cc51 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java @@ -6,6 +6,7 @@ import com.linkedin.metadata.config.search.ExactMatchConfiguration; import com.linkedin.metadata.config.search.PartialConfiguration; import com.linkedin.metadata.config.search.SearchConfiguration; +import com.linkedin.metadata.config.search.WordGramConfiguration; import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration; import com.linkedin.metadata.models.registry.ConfigEntityRegistry; import com.linkedin.metadata.models.registry.EntityRegistry; @@ -55,11 +56,17 @@ public SearchConfiguration searchConfiguration() { 
exactMatchConfiguration.setCaseSensitivityFactor(0.7f); exactMatchConfiguration.setEnableStructured(true); + WordGramConfiguration wordGramConfiguration = new WordGramConfiguration(); + wordGramConfiguration.setTwoGramFactor(1.2f); + wordGramConfiguration.setThreeGramFactor(1.5f); + wordGramConfiguration.setFourGramFactor(1.8f); + PartialConfiguration partialConfiguration = new PartialConfiguration(); partialConfiguration.setFactor(0.4f); partialConfiguration.setUrnFactor(0.5f); searchConfiguration.setExactMatch(exactMatchConfiguration); + searchConfiguration.setWordGram(wordGramConfiguration); searchConfiguration.setPartial(partialConfiguration); return searchConfiguration; } @@ -137,4 +144,10 @@ public EntityRegistry entityRegistry() throws EntityRegistryException { return new ConfigEntityRegistry( ESTestConfiguration.class.getClassLoader().getResourceAsStream("entity-registry.yml")); } + + @Bean(name = "longTailEntityRegistry") + public EntityRegistry longTailEntityRegistry() throws EntityRegistryException { + return new ConfigEntityRegistry( + ESTestConfiguration.class.getClassLoader().getResourceAsStream("entity-registry.yml")); + } } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/ESTestUtils.java b/metadata-io/src/test/java/com/linkedin/metadata/ESTestUtils.java index 79496888650e1..45c4c16864b07 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/ESTestUtils.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/ESTestUtils.java @@ -77,6 +77,11 @@ public static SearchResult searchAcrossEntities(SearchService searchService, Str 100, new SearchFlags().setFulltext(true).setSkipCache(true), facets); } + public static SearchResult searchAcrossCustomEntities(SearchService searchService, String query, List searchableEntities) { + return searchService.searchAcrossEntities(searchableEntities, query, null, null, 0, + 100, new SearchFlags().setFulltext(true).setSkipCache(true)); + } + public static SearchResult search(SearchService 
searchService, String query) { return search(searchService, SEARCHABLE_ENTITIES, query); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java new file mode 100644 index 0000000000000..d720c95fef84d --- /dev/null +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java @@ -0,0 +1,167 @@ +package com.linkedin.metadata.search.elasticsearch.fixtures; + +import com.linkedin.common.urn.Urn; +import com.linkedin.datahub.graphql.generated.EntityType; +import com.linkedin.datahub.graphql.resolvers.EntityTypeMapper; +import com.linkedin.entity.client.EntityClient; +import com.linkedin.metadata.ESSampleDataFixture; +import com.linkedin.metadata.models.registry.EntityRegistry; +import com.linkedin.metadata.search.MatchedFieldArray; +import com.linkedin.metadata.search.SearchEntityArray; +import com.linkedin.metadata.search.SearchResult; +import com.linkedin.metadata.search.SearchService; +import org.elasticsearch.client.RestHighLevelClient; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Qualifier; +import org.springframework.context.annotation.Import; +import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; +import org.testng.annotations.Test; + +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static com.linkedin.metadata.ESTestUtils.*; +import static org.testng.Assert.assertTrue; +import static org.testng.AssertJUnit.*; + +@Import(ESSampleDataFixture.class) +public class ElasticSearchGoldenTest extends AbstractTestNGSpringContextTests { + + private static final List SEARCHABLE_LONGTAIL_ENTITIES = Stream.of(EntityType.CHART, EntityType.CONTAINER, + EntityType.DASHBOARD, EntityType.DATASET, 
EntityType.DOMAIN, EntityType.TAG + ).map(EntityTypeMapper::getName) + .collect(Collectors.toList()); + @Autowired + private RestHighLevelClient _searchClient; + + @Autowired + @Qualifier("longTailSearchService") + protected SearchService searchService; + + @Autowired + @Qualifier("longTailEntityClient") + protected EntityClient entityClient; + + @Autowired + @Qualifier("longTailEntityRegistry") + private EntityRegistry entityRegistry; + + @Test + public void testNameMatchPetProfiles() { + /* + Searching for "pet profiles" should return "pet_profiles" as the first 2 search results + */ + assertNotNull(searchService); + assertNotNull(entityRegistry); + SearchResult searchResult = searchAcrossCustomEntities(searchService, "pet profiles", SEARCHABLE_LONGTAIL_ENTITIES); + assertTrue(searchResult.getEntities().size() >= 2); + Urn firstResultUrn = searchResult.getEntities().get(0).getEntity(); + Urn secondResultUrn = searchResult.getEntities().get(1).getEntity(); + + assertTrue(firstResultUrn.toString().contains("pet_profiles")); + assertTrue(secondResultUrn.toString().contains("pet_profiles")); + } + + @Test + public void testNameMatchPetProfile() { + /* + Searching for "pet profile" should return "pet_profiles" as the first 2 search results + */ + assertNotNull(searchService); + SearchResult searchResult = searchAcrossCustomEntities(searchService, "pet profile", SEARCHABLE_LONGTAIL_ENTITIES); + assertTrue(searchResult.getEntities().size() >= 2); + Urn firstResultUrn = searchResult.getEntities().get(0).getEntity(); + Urn secondResultUrn = searchResult.getEntities().get(1).getEntity(); + + assertTrue(firstResultUrn.toString().contains("pet_profiles")); + assertTrue(secondResultUrn.toString().contains("pet_profiles")); + } + + @Test + public void testGlossaryTerms() { + /* + Searching for "ReturnRate" should return all tables that have the glossary term applied before + anything else + */ + assertNotNull(searchService); + SearchResult searchResult = 
searchAcrossCustomEntities(searchService, "ReturnRate", SEARCHABLE_LONGTAIL_ENTITIES); + SearchEntityArray entities = searchResult.getEntities(); + assertTrue(searchResult.getEntities().size() >= 4); + MatchedFieldArray firstResultMatchedFields = entities.get(0).getMatchedFields(); + MatchedFieldArray secondResultMatchedFields = entities.get(1).getMatchedFields(); + MatchedFieldArray thirdResultMatchedFields = entities.get(2).getMatchedFields(); + MatchedFieldArray fourthResultMatchedFields = entities.get(3).getMatchedFields(); + + assertTrue(firstResultMatchedFields.toString().contains("ReturnRate")); + assertTrue(secondResultMatchedFields.toString().contains("ReturnRate")); + assertTrue(thirdResultMatchedFields.toString().contains("ReturnRate")); + assertTrue(fourthResultMatchedFields.toString().contains("ReturnRate")); + } + + @Test + public void testNameMatchPartiallyQualified() { + /* + Searching for "analytics.pet_details" (partially qualified) should return the fully qualified table + name as the first search results before any others + */ + assertNotNull(searchService); + SearchResult searchResult = searchAcrossCustomEntities(searchService, "analytics.pet_details", SEARCHABLE_LONGTAIL_ENTITIES); + assertTrue(searchResult.getEntities().size() >= 2); + Urn firstResultUrn = searchResult.getEntities().get(0).getEntity(); + Urn secondResultUrn = searchResult.getEntities().get(1).getEntity(); + + assertTrue(firstResultUrn.toString().contains("snowflake,long_tail_companions.analytics.pet_details")); + assertTrue(secondResultUrn.toString().contains("dbt,long_tail_companions.analytics.pet_details")); + } + + @Test + public void testNameMatchCollaborativeActionitems() { + /* + Searching for "collaborative actionitems" should return "collaborative_actionitems" as the first search + result, followed by "collaborative_actionitems_old" + */ + assertNotNull(searchService); + SearchResult searchResult = searchAcrossCustomEntities(searchService, "collaborative actionitems", 
SEARCHABLE_LONGTAIL_ENTITIES); + assertTrue(searchResult.getEntities().size() >= 2); + Urn firstResultUrn = searchResult.getEntities().get(0).getEntity(); + Urn secondResultUrn = searchResult.getEntities().get(1).getEntity(); + + // Checks that the table name is not suffixed with anything + assertTrue(firstResultUrn.toString().contains("collaborative_actionitems,")); + assertTrue(secondResultUrn.toString().contains("collaborative_actionitems_old")); + + Double firstResultScore = searchResult.getEntities().get(0).getScore(); + Double secondResultScore = searchResult.getEntities().get(1).getScore(); + + // Checks that the scores aren't tied so that we are matching on table name more than column name + assertTrue(firstResultScore > secondResultScore); + } + + @Test + public void testNameMatchCustomerOrders() { + /* + Searching for "customer orders" should return "customer_orders" as the first search + result, not suffixed by anything + */ + assertNotNull(searchService); + SearchResult searchResult = searchAcrossCustomEntities(searchService, "customer orders", SEARCHABLE_LONGTAIL_ENTITIES); + assertTrue(searchResult.getEntities().size() >= 2); + Urn firstResultUrn = searchResult.getEntities().get(0).getEntity(); + + // Checks that the table name is not suffixed with anything + assertTrue(firstResultUrn.toString().contains("customer_orders,")); + + Double firstResultScore = searchResult.getEntities().get(0).getScore(); + Double secondResultScore = searchResult.getEntities().get(1).getScore(); + + // Checks that the scores aren't tied so that we are matching on table name more than column name + assertTrue(firstResultScore > secondResultScore); + } + + /* + Tests that should pass but do not yet can be added below here, with the following annotation: + @Test(enabled = false) + */ + +} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java 
b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java index dada13bd6f479..d989d4ef4fa87 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java @@ -82,6 +82,7 @@ public class SampleDataFixtureTests extends AbstractTestNGSpringContextTests { protected EntityClient entityClient; @Autowired + @Qualifier("entityRegistry") private EntityRegistry entityRegistry; @Test @@ -357,6 +358,84 @@ public void testDelimitedSynonym() throws IOException { }).collect(Collectors.toList()); } + @Test + public void testNegateAnalysis() throws IOException { + String queryWithMinus = "logging_events -bckp"; + AnalyzeRequest request = AnalyzeRequest.withIndexAnalyzer( + "smpldat_datasetindex_v2", + "query_word_delimited", queryWithMinus + ); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), + List.of("logging_events -bckp", "logging_ev", "-bckp", "log", "event", "bckp")); + + request = AnalyzeRequest.withIndexAnalyzer( + "smpldat_datasetindex_v2", + "word_gram_3", queryWithMinus + ); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("logging events -bckp")); + + request = AnalyzeRequest.withIndexAnalyzer( + "smpldat_datasetindex_v2", + "word_gram_4", queryWithMinus + ); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of()); + + } + + @Test + public void testWordGram() throws IOException { + String text = "hello.cat_cool_customer"; + AnalyzeRequest request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", text); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hello cat", "cat 
cool", "cool customer")); + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_3", text); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hello cat cool", "cat cool customer")); + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_4", text); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hello cat cool customer")); + + String testMoreSeparators = "quick.brown:fox jumped-LAZY_Dog"; + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", testMoreSeparators); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), + List.of("quick brown", "brown fox", "fox jumped", "jumped lazy", "lazy dog")); + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_3", testMoreSeparators); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), + List.of("quick brown fox", "brown fox jumped", "fox jumped lazy", "jumped lazy dog")); + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_4", testMoreSeparators); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), + List.of("quick brown fox jumped", "brown fox jumped lazy", "fox jumped lazy dog")); + + String textWithQuotesAndDuplicateWord = "\"my_db.my_exact_table\""; + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", textWithQuotesAndDuplicateWord); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("my db", "db my", "my exact", "exact table")); + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_3", textWithQuotesAndDuplicateWord); + 
assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("my db my", "db my exact", "my exact table")); + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_4", textWithQuotesAndDuplicateWord); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("my db my exact", "db my exact table")); + + String textWithParens = "(hi) there"; + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", textWithParens); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hi there")); + + String oneWordText = "hello"; + for (String analyzer : List.of("word_gram_2", "word_gram_3", "word_gram_4")) { + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", analyzer, oneWordText); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of()); + } + } + @Test public void testUrnSynonym() throws IOException { List expectedTokens = List.of("bigquery"); @@ -1266,6 +1345,53 @@ public void testParens() { String.format("%s - Expected search results to include matched fields", query)); assertEquals(result.getEntities().size(), 2); } + @Test + public void testGram() { + String query = "jaffle shop customers"; + SearchResult result = searchAcrossEntities(searchService, query); + assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), + String.format("%s - Expected search results", query)); + + assertEquals(result.getEntities().get(0).getEntity().toString(), + "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.customers,PROD)", + "Expected exact match in 1st position"); + + query = "shop customers source"; + result = searchAcrossEntities(searchService, query); + assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), + String.format("%s - 
Expected search results", query)); + + assertEquals(result.getEntities().get(0).getEntity().toString(), + "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.customers_source,PROD)", + "Expected ngram match in 1st position"); + + query = "jaffle shop stg customers"; + result = searchAcrossEntities(searchService, query); + assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), + String.format("%s - Expected search results", query)); + + assertEquals(result.getEntities().get(0).getEntity().toString(), + "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.stg_customers,PROD)", + "Expected ngram match in 1st position"); + + query = "jaffle shop transformers customers"; + result = searchAcrossEntities(searchService, query); + assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), + String.format("%s - Expected search results", query)); + + assertEquals(result.getEntities().get(0).getEntity().toString(), + "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.transformers_customers,PROD)", + "Expected ngram match in 1st position"); + + query = "shop raw customers"; + result = searchAcrossEntities(searchService, query); + assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), + String.format("%s - Expected search results", query)); + + assertEquals(result.getEntities().get(0).getEntity().toString(), + "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_customers,PROD)", + "Expected ngram match in 1st position"); + } @Test public void testPrefixVsExact() { diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java index ed72b46e98c46..5a8f80f325dbd 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java +++ 
b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java @@ -16,7 +16,7 @@ public void testMappingsBuilder() { Map result = MappingsBuilder.getMappings(TestEntitySpecBuilder.getSpec()); assertEquals(result.size(), 1); Map properties = (Map) result.get("properties"); - assertEquals(properties.size(), 17); + assertEquals(properties.size(), 18); assertEquals(properties.get("urn"), ImmutableMap.of("type", "keyword", "fields", ImmutableMap.of("delimited", @@ -76,6 +76,19 @@ public void testMappingsBuilder() { assertTrue(textArrayFieldSubfields.containsKey("ngram")); assertTrue(textArrayFieldSubfields.containsKey("keyword")); + // WORD_GRAM + Map wordGramField = (Map) properties.get("wordGramField"); + assertEquals(wordGramField.get("type"), "keyword"); + assertEquals(wordGramField.get("normalizer"), "keyword_normalizer"); + Map wordGramFieldSubfields = (Map) wordGramField.get("fields"); + assertEquals(wordGramFieldSubfields.size(), 6); + assertTrue(wordGramFieldSubfields.containsKey("delimited")); + assertTrue(wordGramFieldSubfields.containsKey("ngram")); + assertTrue(wordGramFieldSubfields.containsKey("keyword")); + assertTrue(wordGramFieldSubfields.containsKey("wordGrams2")); + assertTrue(wordGramFieldSubfields.containsKey("wordGrams3")); + assertTrue(wordGramFieldSubfields.containsKey("wordGrams4")); + // URN Map foreignKey = (Map) properties.get("foreignKey"); assertEquals(foreignKey.get("type"), "text"); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java index a2ec396c34b2d..282b1d8bb6778 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java @@ 
-4,6 +4,7 @@ import com.linkedin.metadata.config.search.ExactMatchConfiguration; import com.linkedin.metadata.config.search.PartialConfiguration; import com.linkedin.metadata.config.search.SearchConfiguration; +import com.linkedin.metadata.config.search.WordGramConfiguration; import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration; import com.fasterxml.jackson.dataformat.yaml.YAMLMapper; import com.google.common.collect.ImmutableList; @@ -18,6 +19,7 @@ import org.elasticsearch.index.query.BoolQueryBuilder; import org.elasticsearch.index.query.MatchAllQueryBuilder; import org.elasticsearch.index.query.MatchPhrasePrefixQueryBuilder; +import org.elasticsearch.index.query.MatchPhraseQueryBuilder; import org.elasticsearch.index.query.QueryBuilder; import org.elasticsearch.index.query.QueryStringQueryBuilder; import org.elasticsearch.index.query.SimpleQueryStringBuilder; @@ -46,11 +48,17 @@ public class SearchQueryBuilderTest { exactMatchConfiguration.setCaseSensitivityFactor(0.7f); exactMatchConfiguration.setEnableStructured(true); + WordGramConfiguration wordGramConfiguration = new WordGramConfiguration(); + wordGramConfiguration.setTwoGramFactor(1.2f); + wordGramConfiguration.setThreeGramFactor(1.5f); + wordGramConfiguration.setFourGramFactor(1.8f); + PartialConfiguration partialConfiguration = new PartialConfiguration(); partialConfiguration.setFactor(0.4f); partialConfiguration.setUrnFactor(0.7f); testQueryConfig.setExactMatch(exactMatchConfiguration); + testQueryConfig.setWordGram(wordGramConfiguration); testQueryConfig.setPartial(partialConfiguration); } public static final SearchQueryBuilder TEST_BUILDER = new SearchQueryBuilder(testQueryConfig, null); @@ -70,16 +78,17 @@ public void testQueryBuilderFulltext() { assertEquals(keywordQuery.value(), "testQuery"); assertEquals(keywordQuery.analyzer(), "keyword"); Map keywordFields = keywordQuery.fields(); - assertEquals(keywordFields.size(), 8); + assertEquals(keywordFields.size(), 9); 
assertEquals(keywordFields, Map.of( - "urn", 10.f, - "textArrayField", 1.0f, - "customProperties", 1.0f, - "nestedArrayArrayField", 1.0f, - "textFieldOverride", 1.0f, - "nestedArrayStringField", 1.0f, - "keyPart1", 10.0f, - "esObjectField", 1.0f + "urn", 10.f, + "textArrayField", 1.0f, + "customProperties", 1.0f, + "wordGramField", 1.0f, + "nestedArrayArrayField", 1.0f, + "textFieldOverride", 1.0f, + "nestedArrayStringField", 1.0f, + "keyPart1", 10.0f, + "esObjectField", 1.0f )); SimpleQueryStringBuilder urnComponentQuery = (SimpleQueryStringBuilder) analyzerGroupQuery.should().get(1); @@ -99,7 +108,8 @@ public void testQueryBuilderFulltext() { "nestedArrayArrayField.delimited", 0.4f, "urn.delimited", 7.0f, "textArrayField.delimited", 0.4f, - "nestedArrayStringField.delimited", 0.4f + "nestedArrayStringField.delimited", 0.4f, + "wordGramField.delimited", 0.4f )); BoolQueryBuilder boolPrefixQuery = (BoolQueryBuilder) shouldQueries.get(1); @@ -109,21 +119,30 @@ public void testQueryBuilderFulltext() { if (prefixQuery instanceof MatchPhrasePrefixQueryBuilder) { MatchPhrasePrefixQueryBuilder builder = (MatchPhrasePrefixQueryBuilder) prefixQuery; return Pair.of(builder.fieldName(), builder.boost()); - } else { + } else if (prefixQuery instanceof TermQueryBuilder) { // exact TermQueryBuilder builder = (TermQueryBuilder) prefixQuery; return Pair.of(builder.fieldName(), builder.boost()); + } else { // if (prefixQuery instanceof MatchPhraseQueryBuilder) { + // ngram + MatchPhraseQueryBuilder builder = (MatchPhraseQueryBuilder) prefixQuery; + return Pair.of(builder.fieldName(), builder.boost()); } }).collect(Collectors.toList()); - assertEquals(prefixFieldWeights.size(), 22); + assertEquals(prefixFieldWeights.size(), 28); List.of( Pair.of("urn", 100.0f), Pair.of("urn", 70.0f), Pair.of("keyPart1.delimited", 16.8f), Pair.of("keyPart1.keyword", 100.0f), - Pair.of("keyPart1.keyword", 70.0f) + Pair.of("keyPart1.keyword", 70.0f), + Pair.of("wordGramField.wordGrams2", 1.44f), + 
Pair.of("wordGramField.wordGrams3", 2.25f), + Pair.of("wordGramField.wordGrams4", 3.2399998f), + Pair.of("wordGramField.keyword", 10.0f), + Pair.of("wordGramField.keyword", 7.0f) ).forEach(p -> assertTrue(prefixFieldWeights.contains(p), "Missing: " + p)); // Validate scorer @@ -144,7 +163,7 @@ public void testQueryBuilderStructured() { assertEquals(keywordQuery.queryString(), "testQuery"); assertNull(keywordQuery.analyzer()); Map keywordFields = keywordQuery.fields(); - assertEquals(keywordFields.size(), 16); + assertEquals(keywordFields.size(), 21); assertEquals(keywordFields.get("keyPart1").floatValue(), 10.0f); assertFalse(keywordFields.containsKey("keyPart3")); assertEquals(keywordFields.get("textFieldOverride").floatValue(), 1.0f); @@ -196,10 +215,14 @@ public void testCustomExactMatch() { List queries = boolPrefixQuery.should().stream().map(prefixQuery -> { if (prefixQuery instanceof MatchPhrasePrefixQueryBuilder) { + // prefix return (MatchPhrasePrefixQueryBuilder) prefixQuery; - } else { + } else if (prefixQuery instanceof TermQueryBuilder) { // exact return (TermQueryBuilder) prefixQuery; + } else { // if (prefixQuery instanceof MatchPhraseQueryBuilder) { + // ngram + return (MatchPhraseQueryBuilder) prefixQuery; } }).collect(Collectors.toList()); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java index d66d6a0ab0e76..db56e2d34881b 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java @@ -7,6 +7,7 @@ import com.linkedin.data.template.StringArray; import com.linkedin.metadata.ESTestConfiguration; import com.linkedin.metadata.TestEntitySpecBuilder; +import 
com.linkedin.metadata.config.search.WordGramConfiguration; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; @@ -65,11 +66,17 @@ public class SearchRequestHandlerTest extends AbstractTestNGSpringContextTests { exactMatchConfiguration.setCaseSensitivityFactor(0.7f); exactMatchConfiguration.setEnableStructured(true); + WordGramConfiguration wordGramConfiguration = new WordGramConfiguration(); + wordGramConfiguration.setTwoGramFactor(1.2f); + wordGramConfiguration.setThreeGramFactor(1.5f); + wordGramConfiguration.setFourGramFactor(1.8f); + PartialConfiguration partialConfiguration = new PartialConfiguration(); partialConfiguration.setFactor(0.4f); partialConfiguration.setUrnFactor(0.7f); testQueryConfig.setExactMatch(exactMatchConfiguration); + testQueryConfig.setWordGram(wordGramConfiguration); testQueryConfig.setPartial(partialConfiguration); } @@ -113,10 +120,10 @@ public void testSearchRequestHandler() { HighlightBuilder highlightBuilder = sourceBuilder.highlighter(); List fields = highlightBuilder.fields().stream().map(HighlightBuilder.Field::name).collect(Collectors.toList()); - assertEquals(fields.size(), 20); + assertEquals(fields.size(), 22); List highlightableFields = ImmutableList.of("keyPart1", "textArrayField", "textFieldOverride", "foreignKey", "nestedForeignKey", - "nestedArrayStringField", "nestedArrayArrayField", "customProperties", "esObjectField"); + "nestedArrayStringField", "nestedArrayArrayField", "customProperties", "esObjectField", "wordGramField"); highlightableFields.forEach(field -> { assertTrue(fields.contains(field), "Missing: " + field); assertTrue(fields.contains(field + ".*"), "Missing: " + field + ".*"); diff --git a/metadata-jobs/mae-consumer-job/build.gradle b/metadata-jobs/mae-consumer-job/build.gradle index e7941a04224e3..3811a9537ac24 100644 --- a/metadata-jobs/mae-consumer-job/build.gradle +++ b/metadata-jobs/mae-consumer-job/build.gradle @@ -43,6 +43,8 @@ docker { include 
'docker/monitoring/*' include "docker/${docker_repo}/*" include 'metadata-models/src/main/resources/*' + }.exclude { + i -> i.file.isHidden() || i.file == buildDir } tag("Debug", "${docker_registry}/${docker_repo}:debug") @@ -55,7 +57,7 @@ tasks.getByName("docker").dependsOn([bootJar]) task cleanLocalDockerImages { doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}".toString()) + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } dockerClean.finalizedBy(cleanLocalDockerImages) \ No newline at end of file diff --git a/metadata-jobs/mce-consumer-job/build.gradle b/metadata-jobs/mce-consumer-job/build.gradle index 5981284e9da3f..2229c387f3676 100644 --- a/metadata-jobs/mce-consumer-job/build.gradle +++ b/metadata-jobs/mce-consumer-job/build.gradle @@ -56,6 +56,8 @@ docker { include 'docker/monitoring/*' include "docker/${docker_repo}/*" include 'metadata-models/src/main/resources/*' + }.exclude { + i -> i.file.isHidden() || i.file == buildDir } tag("Debug", "${docker_registry}/${docker_repo}:debug") @@ -68,7 +70,7 @@ tasks.getByName("docker").dependsOn([bootJar]) task cleanLocalDockerImages { doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}".toString()) + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } dockerClean.finalizedBy(cleanLocalDockerImages) \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/chart/ChartInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/chart/ChartInfo.pdl index 4339a186f1304..5047c824e2617 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/chart/ChartInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/chart/ChartInfo.pdl @@ -20,7 +20,7 @@ record ChartInfo includes CustomProperties, ExternalReference { * Title of the chart */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true } 
title: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/container/ContainerProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/container/ContainerProperties.pdl index 26745fe46caaa..0b9c89ea30c90 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/container/ContainerProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/container/ContainerProperties.pdl @@ -15,7 +15,7 @@ record ContainerProperties includes CustomProperties, ExternalReference { * Display name of the Asset Container */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -25,7 +25,7 @@ record ContainerProperties includes CustomProperties, ExternalReference { * Fully-qualified name of the Container */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -61,4 +61,4 @@ record ContainerProperties includes CustomProperties, ExternalReference { } } lastModified: optional TimeStamp -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl index 5cb306039506e..84b3065a08022 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl @@ -22,7 +22,7 @@ record DashboardInfo includes CustomProperties, ExternalReference { * Title of the dashboard */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -126,4 +126,4 @@ record DashboardInfo includes CustomProperties, ExternalReference { * The time when this dashboard last refreshed */ lastRefreshed: optional Time -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/datajob/DataFlowInfo.pdl 
b/metadata-models/src/main/pegasus/com/linkedin/datajob/DataFlowInfo.pdl index 481240740876a..1303bfbc863ea 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/datajob/DataFlowInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/datajob/DataFlowInfo.pdl @@ -17,7 +17,7 @@ record DataFlowInfo includes CustomProperties, ExternalReference { * Flow name */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl index 8737dd4d9ef52..1e305816f96a2 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl @@ -18,7 +18,7 @@ record DataJobInfo includes CustomProperties, ExternalReference { * Job name */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataplatform/DataPlatformInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataplatform/DataPlatformInfo.pdl index acc40e9f693ec..0be58d73dc79f 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataplatform/DataPlatformInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataplatform/DataPlatformInfo.pdl @@ -15,7 +15,7 @@ record DataPlatformInfo { */ @validate.strlen.max = 15 @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": false, "boostScore": 10.0 } @@ -25,7 +25,7 @@ record DataPlatformInfo { * The name that will be used for displaying a platform type. 
*/ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataplatforminstance/DataPlatformInstanceProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataplatforminstance/DataPlatformInstanceProperties.pdl index d7ce5565103ee..1220741ee5726 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataplatforminstance/DataPlatformInstanceProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataplatforminstance/DataPlatformInstanceProperties.pdl @@ -16,7 +16,7 @@ record DataPlatformInstanceProperties includes CustomProperties, ExternalReferen * Display name of the Data Platform Instance */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl index 72eefd5e294e4..c63cb1a97c017 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl @@ -19,7 +19,7 @@ record DataProcessInstanceProperties includes CustomProperties, ExternalReferenc * Process name */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -31,6 +31,7 @@ record DataProcessInstanceProperties includes CustomProperties, ExternalReferenc @Searchable = { "fieldType": "KEYWORD", "addToFilters": true, + "fieldName": "processType", "filterNameOverride": "Process Type" } type: optional enum DataProcessType { diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataproduct/DataProductProperties.pdl 
b/metadata-models/src/main/pegasus/com/linkedin/dataproduct/DataProductProperties.pdl index 3861b7def7669..c0a50a5e0e688 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataproduct/DataProductProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataproduct/DataProductProperties.pdl @@ -13,7 +13,7 @@ record DataProductProperties includes CustomProperties, ExternalReference { * Display name of the Data Product */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl index 57b1fe7693129..49d0dcd58ee27 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl @@ -17,7 +17,7 @@ record DatasetProperties includes CustomProperties, ExternalReference { * Display name of the Dataset */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -27,7 +27,7 @@ record DatasetProperties includes CustomProperties, ExternalReference { * Fully-qualified name of the Dataset */ @Searchable = { - "fieldType": "TEXT", + "fieldType": "WORD_GRAM", "addToFilters": false, "enableAutocomplete": true, "boostScore": 10.0 @@ -77,4 +77,4 @@ record DatasetProperties includes CustomProperties, ExternalReference { */ @deprecated = "Use GlobalTags aspect instead." 
tags: array[string] = [ ] -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl index 5a0b8657ecb47..a362d412a32b9 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl @@ -14,7 +14,7 @@ record DomainProperties { * Display name of the Domain */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryNodeInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryNodeInfo.pdl index 1e840e5a1df7e..557b5e2a0f419 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryNodeInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryNodeInfo.pdl @@ -35,7 +35,7 @@ record GlossaryNodeInfo { */ @Searchable = { "fieldName": "displayName", - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -49,4 +49,4 @@ record GlossaryNodeInfo { } id: optional string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryTermInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryTermInfo.pdl index aa2a8b31e3dde..13e7af311fba1 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryTermInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryTermInfo.pdl @@ -23,7 +23,7 @@ record GlossaryTermInfo includes CustomProperties { * Display name of the term */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -75,4 +75,4 @@ record GlossaryTermInfo includes CustomProperties { */ @deprecated rawSchema: optional 
string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl index 6b050f484fedd..48ee53377e582 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl @@ -45,7 +45,7 @@ record CorpUserEditableInfo { * DataHub-native display name */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "queryByDefault": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserInfo.pdl index 1cb705d426cc0..6cb0e8fd6aa6d 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserInfo.pdl @@ -26,7 +26,7 @@ record CorpUserInfo includes CustomProperties { * displayName of this user , e.g. 
Hang Zhang(DataHQ) */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "queryByDefault": true, "enableAutocomplete": true, "boostScore": 10.0 @@ -89,7 +89,7 @@ record CorpUserInfo includes CustomProperties { * Common name of this user, format is firstName + lastName (split by a whitespace) */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "queryByDefault": true, "enableAutocomplete": true, "boostScore": 10.0 diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpGroupKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpGroupKey.pdl index 075cc14ddc83b..9e65b8f6e9929 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpGroupKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpGroupKey.pdl @@ -11,10 +11,10 @@ record CorpGroupKey { * The URL-encoded name of the AD/LDAP group. Serves as a globally unique identifier within DataHub. */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "queryByDefault": true, "enableAutocomplete": true, "boostScore": 10.0 } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpUserKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpUserKey.pdl index d1a8a4bb5bb23..476a0ad9704b3 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpUserKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpUserKey.pdl @@ -12,7 +12,7 @@ record CorpUserKey { */ @Searchable = { "fieldName": "ldap", - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "boostScore": 2.0, "enableAutocomplete": true } diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataFlowKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataFlowKey.pdl index bcdb92f75d055..d8342630248b6 100644 --- 
a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataFlowKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataFlowKey.pdl @@ -19,7 +19,7 @@ record DataFlowKey { * Unique Identifier of the data flow */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true } flowId: string @@ -31,4 +31,4 @@ record DataFlowKey { "fieldType": "TEXT_PARTIAL" } cluster: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataJobKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataJobKey.pdl index d0ac7dbca0f99..60ec51b464dcc 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataJobKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataJobKey.pdl @@ -27,7 +27,7 @@ record DataJobKey { * Unique Identifier of the data job */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true } jobId: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataProcessKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataProcessKey.pdl index a5c05029352c2..4df1364a04ebe 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataProcessKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataProcessKey.pdl @@ -13,7 +13,7 @@ record DataProcessKey { * Process name i.e. 
an ETL job name */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 4.0 } @@ -37,4 +37,4 @@ record DataProcessKey { "queryByDefault": false } origin: FabricType -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DatasetKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DatasetKey.pdl index ea1f9510ed438..70c5d174171af 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DatasetKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DatasetKey.pdl @@ -25,7 +25,7 @@ record DatasetKey { //This is no longer to be used for Dataset native name. Use name, qualifiedName from DatasetProperties instead. @Searchable = { "fieldName": "id" - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryNodeKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryNodeKey.pdl index 88697fe3ff364..51a3bc00f4e9e 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryNodeKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryNodeKey.pdl @@ -12,9 +12,9 @@ import com.linkedin.common.FabricType record GlossaryNodeKey { @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryTermKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryTermKey.pdl index a9f35146da18e..61bcd60cbc754 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryTermKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryTermKey.pdl @@ -13,10 +13,10 @@ record GlossaryTermKey { * The term name, which serves as 
a unique id */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "fieldName": "id" } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl index 579f1966977a9..0dcb194bccce0 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl @@ -20,9 +20,9 @@ record MLFeatureKey { * Name of the feature */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 8.0 } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureTableKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureTableKey.pdl index 1f786ad417be7..880daa4423573 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureTableKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureTableKey.pdl @@ -22,9 +22,9 @@ record MLFeatureTableKey { * Name of the feature table */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 8.0 } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelDeploymentKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelDeploymentKey.pdl index 7c36f410fede3..83ba35e0af601 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelDeploymentKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelDeploymentKey.pdl @@ -19,7 +19,7 @@ record MLModelDeploymentKey { * Name of the MLModelDeployment */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", 
"enableAutocomplete": true, "boostScore": 10.0 } @@ -35,4 +35,4 @@ record MLModelDeploymentKey { "queryByDefault": false } origin: FabricType -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelGroupKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelGroupKey.pdl index 17c401c0b8c48..b1e2b7b7ede70 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelGroupKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelGroupKey.pdl @@ -19,7 +19,7 @@ record MLModelGroupKey { * Name of the MLModelGroup */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -33,4 +33,4 @@ record MLModelGroupKey { "queryByDefault": false } origin: FabricType -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl index 55fd2bc370846..24fe89dcce654 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl @@ -19,7 +19,7 @@ record MLModelKey { * Name of the MLModel */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -35,4 +35,4 @@ record MLModelKey { "queryByDefault": false } origin: FabricType -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLPrimaryKeyKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLPrimaryKeyKey.pdl index 9eb67eaf5f651..7987f3a3345b7 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLPrimaryKeyKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLPrimaryKeyKey.pdl @@ -21,9 +21,9 @@ record MLPrimaryKeyKey { * Name of the primary 
key */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 8.0 } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/TagKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/TagKey.pdl index 47f1a631b4a2c..4622e32dce67b 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/TagKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/TagKey.pdl @@ -11,10 +11,10 @@ record TagKey { * The tag name, which serves as a unique id */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0, "fieldName": "id" } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/notebook/NotebookInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/notebook/NotebookInfo.pdl index 1f4dcf975f48c..5df4daacffa49 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/notebook/NotebookInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/notebook/NotebookInfo.pdl @@ -18,7 +18,7 @@ record NotebookInfo includes CustomProperties, ExternalReference { * Title of the Notebook */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/ownership/OwnershipTypeInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/ownership/OwnershipTypeInfo.pdl index 004df6e399be4..3e7b53beff531 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/ownership/OwnershipTypeInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/ownership/OwnershipTypeInfo.pdl @@ -14,7 +14,7 @@ record OwnershipTypeInfo { * Display name of the Ownership Type */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -54,4 +54,4 @@ 
record OwnershipTypeInfo { } } lastModified: AuditStamp -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl index bb7e22900e168..3ba19d348913b 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl @@ -29,7 +29,7 @@ record QueryProperties { * Optional display name to identify the query. */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -69,4 +69,4 @@ record QueryProperties { } } lastModified: AuditStamp -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/role/RoleProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/role/RoleProperties.pdl index acebdf5558c59..84d8ecc379ec2 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/role/RoleProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/role/RoleProperties.pdl @@ -14,7 +14,7 @@ record RoleProperties { * Display name of the IAM Role in the external system */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/tag/TagProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/tag/TagProperties.pdl index 41c500c6fff2f..e808aef491749 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/tag/TagProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/tag/TagProperties.pdl @@ -11,7 +11,7 @@ record TagProperties { * Display name of the tag */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/SearchConfiguration.java 
b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/SearchConfiguration.java index 1a56db1bd68b0..b2b5260dc5e70 100644 --- a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/SearchConfiguration.java +++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/SearchConfiguration.java @@ -11,4 +11,5 @@ public class SearchConfiguration { private PartialConfiguration partial; private CustomConfiguration custom; private GraphQueryConfiguration graph; + private WordGramConfiguration wordGram; } diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/WordGramConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/WordGramConfiguration.java new file mode 100644 index 0000000000000..624d2a4c63c4c --- /dev/null +++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/WordGramConfiguration.java @@ -0,0 +1,11 @@ +package com.linkedin.metadata.config.search; + +import lombok.Data; + + +@Data +public class WordGramConfiguration { + private float twoGramFactor; + private float threeGramFactor; + private float fourGramFactor; +} diff --git a/metadata-service/configuration/src/main/resources/application.yml b/metadata-service/configuration/src/main/resources/application.yml index 9f7bf92039fdc..82cf9e8fdc8a7 100644 --- a/metadata-service/configuration/src/main/resources/application.yml +++ b/metadata-service/configuration/src/main/resources/application.yml @@ -198,6 +198,10 @@ elasticsearch: prefixFactor: ${ELASTICSEARCH_QUERY_EXACT_MATCH_PREFIX_FACTOR:1.6} # boost multiplier when exact prefix caseSensitivityFactor: ${ELASTICSEARCH_QUERY_EXACT_MATCH_CASE_FACTOR:0.7} # stacked boost multiplier when case mismatch enableStructured: ${ELASTICSEARCH_QUERY_EXACT_MATCH_ENABLE_STRUCTURED:true} # enable exact match on structured search + wordGram: + twoGramFactor: 
${ELASTICSEARCH_QUERY_TWO_GRAM_FACTOR:1.2} # boost multiplier when match on 2-gram tokens + threeGramFactor: ${ELASTICSEARCH_QUERY_THREE_GRAM_FACTOR:1.5} # boost multiplier when match on 3-gram tokens + fourGramFactor: ${ELASTICSEARCH_QUERY_FOUR_GRAM_FACTOR:1.8} # boost multiplier when match on 4-gram tokens # Field weight annotations are typically calibrated for exact match, if partial match is possible on the field use these adjustments partial: urnFactor: ${ELASTICSEARCH_QUERY_PARTIAL_URN_FACTOR:0.5} # multiplier on Urn token match, a partial match on Urn > non-Urn is assumed @@ -318,4 +322,4 @@ cache: search: lineage: ttlSeconds: ${CACHE_SEARCH_LINEAGE_TTL_SECONDS:86400} # 1 day - lightningThreshold: ${CACHE_SEARCH_LINEAGE_LIGHTNING_THRESHOLD:300} \ No newline at end of file + lightningThreshold: ${CACHE_SEARCH_LINEAGE_LIGHTNING_THRESHOLD:300} diff --git a/metadata-service/factories/build.gradle b/metadata-service/factories/build.gradle index 796b6ee436b78..8e9b859e3b136 100644 --- a/metadata-service/factories/build.gradle +++ b/metadata-service/factories/build.gradle @@ -49,6 +49,12 @@ dependencies { testCompile externalDependency.hazelcastTest implementation externalDependency.jline implementation externalDependency.common + + constraints { + implementation(externalDependency.snappy) { + because("previous versions are vulnerable to CVE-2023-34453 through CVE-2023-34455") + } + } } configurations.all{ diff --git a/metadata-service/war/build.gradle b/metadata-service/war/build.gradle index 7e9aa90664611..eaf14f7fd6c18 100644 --- a/metadata-service/war/build.gradle +++ b/metadata-service/war/build.gradle @@ -72,6 +72,8 @@ docker { include 'docker/monitoring/*' include "docker/${docker_repo}/*" include 'metadata-models/src/main/resources/*' + }.exclude { + i -> i.file.isHidden() || i.file == buildDir } tag("Debug", "${docker_registry}/${docker_repo}:debug") @@ -84,7 +86,7 @@ tasks.getByName("docker").dependsOn([build, war]) task cleanLocalDockerImages { doLast { 
- rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}") + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } dockerClean.finalizedBy(cleanLocalDockerImages) diff --git a/smoke-test/run-quickstart.sh b/smoke-test/run-quickstart.sh index 050b5d2db95c9..d40e4a5e7a4aa 100755 --- a/smoke-test/run-quickstart.sh +++ b/smoke-test/run-quickstart.sh @@ -15,4 +15,4 @@ echo "test_user:test_pass" >> ~/.datahub/plugins/frontend/auth/user.props echo "DATAHUB_VERSION = $DATAHUB_VERSION" DATAHUB_TELEMETRY_ENABLED=false \ DOCKER_COMPOSE_BASE="file://$( dirname "$DIR" )" \ -datahub docker quickstart --version ${DATAHUB_VERSION} --standalone_consumers --dump-logs-on-failure --kafka-setup +datahub docker quickstart --version ${DATAHUB_VERSION} --standalone_consumers --dump-logs-on-failure --kafka-setup \ No newline at end of file diff --git a/smoke-test/tests/cypress/cypress/e2e/lineage/lineage_column_level.js b/smoke-test/tests/cypress/cypress/e2e/lineage/lineage_column_level.js new file mode 100644 index 0000000000000..2a8fe045f154e --- /dev/null +++ b/smoke-test/tests/cypress/cypress/e2e/lineage/lineage_column_level.js @@ -0,0 +1,51 @@ +const DATASET_ENTITY_TYPE = 'dataset'; +const DATASET_URN = 'urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)'; + +describe("column-level lineage graph test", () => { + + it("navigate to lineage graph view and verify that column-level lineage is showing correctly", () => { + cy.login(); + cy.goToEntityLineageGraph(DATASET_ENTITY_TYPE, DATASET_URN); + //verify columns not shown by default + cy.waitTextVisible("SampleCypressHdfs"); + cy.waitTextVisible("SampleCypressHive"); + cy.waitTextVisible("cypress_logging"); + cy.ensureTextNotPresent("shipment_info"); + cy.ensureTextNotPresent("field_foo"); + cy.ensureTextNotPresent("field_baz"); + cy.ensureTextNotPresent("event_name"); + cy.ensureTextNotPresent("event_data"); + cy.ensureTextNotPresent("timestamp"); + 
cy.ensureTextNotPresent("browser"); + cy.clickOptionWithTestId("column-toggle") + //verify columns appear and belong to correct dataset + cy.waitTextVisible("shipment_info"); + cy.waitTextVisible("shipment_info.date"); + cy.waitTextVisible("shipment_info.target"); + cy.waitTextVisible("shipment_info.destination"); + cy.waitTextVisible("shipment_info.geo_info"); + cy.waitTextVisible("field_foo"); + cy.waitTextVisible("field_baz"); + cy.waitTextVisible("event_name"); + cy.waitTextVisible("event_data"); + cy.waitTextVisible("timestamp"); + cy.waitTextVisible("browser"); + //verify columns can be hidden and shown again + cy.contains("Hide").click({ force:true }); + cy.ensureTextNotPresent("field_foo"); + cy.ensureTextNotPresent("field_baz"); + cy.get("[aria-label='down']").eq(1).click({ force:true }); + cy.waitTextVisible("field_foo"); + cy.waitTextVisible("field_baz"); + //verify columns can be disabled successfully + cy.clickOptionWithTestId("column-toggle") + cy.ensureTextNotPresent("shipment_info"); + cy.ensureTextNotPresent("field_foo"); + cy.ensureTextNotPresent("field_baz"); + cy.ensureTextNotPresent("event_name"); + cy.ensureTextNotPresent("event_data"); + cy.ensureTextNotPresent("timestamp"); + cy.ensureTextNotPresent("browser"); + }); + +}); \ No newline at end of file diff --git a/smoke-test/tests/cypress/cypress/e2e/login/login.js b/smoke-test/tests/cypress/cypress/e2e/login/login.js index f86741b5afe01..74d04aa56d0d0 100644 --- a/smoke-test/tests/cypress/cypress/e2e/login/login.js +++ b/smoke-test/tests/cypress/cypress/e2e/login/login.js @@ -4,6 +4,6 @@ describe('login', () => { cy.get('input[data-testid=username]').type(Cypress.env('ADMIN_USERNAME')); cy.get('input[data-testid=password]').type(Cypress.env('ADMIN_PASSWORD')); cy.contains('Sign In').click(); - cy.contains('Welcome back, DataHub'); + cy.contains('Welcome back, Data Hub'); }); }) diff --git a/smoke-test/tests/cypress/cypress/e2e/mutations/deprecations.js
b/smoke-test/tests/cypress/cypress/e2e/mutations/deprecations.js index 1d41d155440e8..2fa11654a3c3e 100644 --- a/smoke-test/tests/cypress/cypress/e2e/mutations/deprecations.js +++ b/smoke-test/tests/cypress/cypress/e2e/mutations/deprecations.js @@ -1,19 +1,29 @@ -describe("deprecation", () => { +describe("dataset deprecation", () => { it("go to dataset and check deprecation works", () => { const urn = "urn:li:dataset:(urn:li:dataPlatform:hive,cypress_logging_events,PROD)"; const datasetName = "cypress_logging_events"; cy.login(); - cy.goToDataset(urn, datasetName); cy.openThreeDotDropdown(); cy.clickOptionWithText("Mark as deprecated"); cy.addViaFormModal("test deprecation", "Add Deprecation Details"); - - cy.goToDataset(urn, datasetName); - cy.contains("DEPRECATED"); - + cy.waitTextVisible("Deprecation Updated"); + cy.waitTextVisible("DEPRECATED") cy.openThreeDotDropdown(); cy.clickOptionWithText("Mark as un-deprecated"); + cy.waitTextVisible("Deprecation Updated"); + cy.ensureTextNotPresent("DEPRECATED"); + cy.openThreeDotDropdown(); + cy.clickOptionWithText("Mark as deprecated"); + cy.addViaFormModal("test deprecation", "Add Deprecation Details"); + cy.waitTextVisible("Deprecation Updated"); + cy.waitTextVisible("DEPRECATED"); + cy.contains("DEPRECATED").trigger("mouseover", { force: true }); + cy.waitTextVisible("Deprecation note"); + cy.get("[role='tooltip']").contains("Mark as un-deprecated").click(); + cy.waitTextVisible("Confirm Mark as un-deprecated"); + cy.get("button").contains("Yes").click(); + cy.waitTextVisible("Marked assets as un-deprecated!"); cy.ensureTextNotPresent("DEPRECATED"); - }); + }); }); diff --git a/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js b/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js new file mode 100644 index 0000000000000..1f40cdf602062 --- /dev/null +++ b/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js @@ -0,0 +1,71 @@ +const test_id = Math.floor(Math.random() 
* 100000); +const documentation_edited = `This is test${test_id} documentation EDITED`; +const wrong_url = "https://www.linkedincom"; +const correct_url = "https://www.linkedin.com"; + +describe("edit documentation and link to dataset", () => { + + it("open test dataset page, edit documentation", () => { + //edit documentation and verify changes saved + cy.loginWithCredentials(); + cy.visit("/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)/Schema"); + cy.get("[role='tab']").contains("Documentation").click(); + cy.waitTextVisible("my hive dataset"); + cy.waitTextVisible("Sample doc"); + cy.clickOptionWithText("Edit"); + cy.focused().clear(); + cy.focused().type(documentation_edited); + cy.get("button").contains("Save").click(); + cy.waitTextVisible("Description Updated"); + cy.waitTextVisible(documentation_edited); + //return documentation to original state + cy.clickOptionWithText("Edit"); + cy.focused().clear().wait(1000); + cy.focused().type("my hive dataset"); + cy.get("button").contains("Save").click(); + cy.waitTextVisible("Description Updated"); + cy.waitTextVisible("my hive dataset"); + }); + + it("open test dataset page, remove and add dataset link", () => { + cy.loginWithCredentials(); + cy.visit("/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)/Schema"); + cy.get("[role='tab']").contains("Documentation").click(); + cy.contains("Sample doc").trigger("mouseover", { force: true }); + cy.get('[data-icon="delete"]').click(); + cy.waitTextVisible("Link Removed"); + cy.get("button").contains("Add Link").click(); + cy.get("#addLinkForm_url").type(wrong_url); + cy.waitTextVisible("This field must be a valid url."); + cy.focused().clear(); + cy.waitTextVisible("A URL is required."); + cy.focused().type(correct_url); + cy.ensureTextNotPresent("This field must be a valid url."); + cy.get("#addLinkForm_label").type("Sample doc"); + cy.get('[role="dialog"] button').contains("Add").click(); + 
cy.waitTextVisible("Link Added"); + cy.get("[role='tab']").contains("Documentation").click(); + cy.get(`[href='${correct_url}']`).should("be.visible"); + }); + + it("edit field documentation", () => { + cy.loginWithCredentials(); + cy.visit("/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)/Schema"); + cy.get("tbody [data-icon='edit']").first().click({ force: true }); + cy.waitTextVisible("Update description"); + cy.waitTextVisible("Foo field description has changed"); + cy.focused().clear().wait(1000); + cy.focused().type(documentation_edited); + cy.get("button").contains("Update").click(); + cy.waitTextVisible("Updated!"); + cy.waitTextVisible(documentation_edited); + cy.waitTextVisible("(edited)"); + cy.get("tbody [data-icon='edit']").first().click({ force: true }); + cy.focused().clear().wait(1000); + cy.focused().type("Foo field description has changed"); + cy.get("button").contains("Update").click(); + cy.waitTextVisible("Updated!"); + cy.waitTextVisible("Foo field description has changed"); + cy.waitTextVisible("(edited)"); + }); +}); \ No newline at end of file diff --git a/smoke-test/tests/cypress/cypress/e2e/mutations/managed_ingestion.js b/smoke-test/tests/cypress/cypress/e2e/mutations/managed_ingestion.js index ddda8626fba2f..24a24cc21138d 100644 --- a/smoke-test/tests/cypress/cypress/e2e/mutations/managed_ingestion.js +++ b/smoke-test/tests/cypress/cypress/e2e/mutations/managed_ingestion.js @@ -31,8 +31,7 @@ describe("run managed ingestion", () => { cy.waitTextVisible(testName) cy.contains(testName).parent().within(() => { - // TODO: Skipping until disk size resolved - // cy.contains("Succeeded", {timeout: 30000}) + cy.contains("Succeeded", {timeout: 180000}) cy.clickOptionWithTestId("delete-button"); }) cy.clickOptionWithText("Yes") diff --git a/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js b/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js index 7686acfe50de0..353570c0d955b 100644 --- 
a/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js +++ b/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js @@ -64,6 +64,8 @@ describe("create and manage group", () => { }); it("update group info", () => { + var expected_name = Cypress.env('ADMIN_USERNAME') == "datahub" ? "Data Hub" : Cypress.env('ADMIN_USERNAME'); + cy.loginWithCredentials(); cy.visit("/settings/identities/groups"); cy.clickOptionWithText(group_name); @@ -77,13 +79,13 @@ describe("create and manage group", () => { cy.contains("Test group description EDITED").should("be.visible"); cy.clickOptionWithText("Add Owners"); cy.contains("Search for users or groups...").click({ force: true }); - cy.focused().type(Cypress.env('ADMIN_USERNAME')); - cy.get(".ant-select-item-option").contains(Cypress.env('ADMIN_USERNAME'), { matchCase: false }).click(); + cy.focused().type(expected_name); + cy.get(".ant-select-item-option").contains(expected_name, { matchCase: false }).click(); cy.focused().blur(); - cy.contains(Cypress.env('ADMIN_USERNAME')).should("have.length", 1); + cy.contains(expected_name).should("have.length", 1); cy.get('[role="dialog"] button').contains("Done").click(); cy.waitTextVisible("Owners Added"); - cy.contains(Cypress.env('ADMIN_USERNAME'), { matchCase: false }).should("be.visible"); + cy.contains(expected_name, { matchCase: false }).should("be.visible"); cy.clickOptionWithText("Edit Group"); cy.waitTextVisible("Edit Profile"); cy.get("#email").type(`${test_id}@testemail.com`); diff --git a/smoke-test/tests/cypress/data.json b/smoke-test/tests/cypress/data.json index c6606519e8d73..3b2ee1afaba58 100644 --- a/smoke-test/tests/cypress/data.json +++ b/smoke-test/tests/cypress/data.json @@ -2012,4 +2012,4 @@ }, "systemMetadata": null } -] +] \ No newline at end of file diff --git a/test-models/src/main/pegasus/com/datahub/test/TestEntityInfo.pdl b/test-models/src/main/pegasus/com/datahub/test/TestEntityInfo.pdl index ed30244c31b17..cc579ba488174 100644 --- 
a/test-models/src/main/pegasus/com/datahub/test/TestEntityInfo.pdl +++ b/test-models/src/main/pegasus/com/datahub/test/TestEntityInfo.pdl @@ -25,6 +25,11 @@ record TestEntityInfo includes CustomProperties { } textArrayField: optional array[string] + @Searchable = { + "fieldType": "WORD_GRAM" + } + wordGramField: optional string + @Relationship = { "name": "foreignKey", "entityTypes": []