diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml
index c268a66938945..532669c44722c 100644
--- a/.github/workflows/docker-unified.yml
+++ b/.github/workflows/docker-unified.yml
@@ -63,8 +63,8 @@ jobs:
env:
ENABLE_PUBLISH: ${{ secrets.DOCKER_PASSWORD != '' && secrets.ACRYL_DOCKER_PASSWORD != '' }}
run: |
- echo "Enable publish: ${{ env.ENABLE_PUBLISH != '' }}"
- echo "publish=${{ env.ENABLE_PUBLISH != '' }}" >> $GITHUB_OUTPUT
+ echo "Enable publish: ${{ env.ENABLE_PUBLISH }}"
+ echo "publish=${{ env.ENABLE_PUBLISH }}" >> $GITHUB_OUTPUT
gms_build:
name: Build and Push DataHub GMS Docker Image
@@ -451,8 +451,6 @@ jobs:
tags: ${{ needs.setup.outputs.tag }}
username: ${{ secrets.ACRYL_DOCKER_USERNAME }}
password: ${{ secrets.ACRYL_DOCKER_PASSWORD }}
- build-args: |
- DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.tag || 'head' }}
publish: ${{ needs.setup.outputs.publish }}
context: .
file: ./docker/datahub-ingestion-base/Dockerfile
@@ -481,7 +479,7 @@ jobs:
uses: ishworkh/docker-image-artifact-download@v1
if: ${{ needs.setup.outputs.publish != 'true' && steps.filter.outputs.datahub-ingestion-base == 'true' }}
with:
- image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.tag || 'head' }}
+ image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_tag || 'head' }}
- name: Build and push Base-Slim Image
if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' }}
uses: ./.github/actions/docker-custom-build-and-push
@@ -493,16 +491,15 @@ jobs:
username: ${{ secrets.ACRYL_DOCKER_USERNAME }}
password: ${{ secrets.ACRYL_DOCKER_PASSWORD }}
build-args: |
- DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.tag || 'head' }}
APP_ENV=slim
- BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.tag || 'head' }}
+ BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_tag || 'head' }}
publish: ${{ needs.setup.outputs.publish }}
context: .
file: ./docker/datahub-ingestion-base/Dockerfile
platforms: linux/amd64,linux/arm64/v8
- name: Compute DataHub Ingestion (Base-Slim) Tag
id: tag
- run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.slim_tag || 'head' }}" >> $GITHUB_OUTPUT
+ run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_slim_tag || 'head' }}" >> $GITHUB_OUTPUT
datahub_ingestion_base_full_build:
name: Build and Push DataHub Ingestion (Base-Full) Docker Image
runs-on: ubuntu-latest
@@ -524,7 +521,7 @@ jobs:
uses: ishworkh/docker-image-artifact-download@v1
if: ${{ needs.setup.outputs.publish != 'true' && steps.filter.outputs.datahub-ingestion-base == 'true' }}
with:
- image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.tag || 'head' }}
+ image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_tag || 'head' }}
- name: Build and push Base-Full Image
if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' }}
uses: ./.github/actions/docker-custom-build-and-push
@@ -532,20 +529,19 @@ jobs:
target: full-install
images: |
${{ env.DATAHUB_INGESTION_BASE_IMAGE }}
- tags: ${{ needs.setup.outputs.full_tag }}
+ tags: ${{ needs.setup.outputs.unique_full_tag }}
username: ${{ secrets.ACRYL_DOCKER_USERNAME }}
password: ${{ secrets.ACRYL_DOCKER_PASSWORD }}
build-args: |
- DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.tag || 'head' }}
APP_ENV=full
- BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.tag || 'head' }}
+ BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_tag || 'head' }}
publish: ${{ needs.setup.outputs.publish }}
context: .
file: ./docker/datahub-ingestion-base/Dockerfile
platforms: linux/amd64,linux/arm64/v8
- name: Compute DataHub Ingestion (Base-Full) Tag
id: tag
- run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.full_tag || 'head' }}" >> $GITHUB_OUTPUT
+ run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }}" >> $GITHUB_OUTPUT
datahub_ingestion_slim_build:
@@ -553,6 +549,7 @@ jobs:
runs-on: ubuntu-latest
outputs:
tag: ${{ steps.tag.outputs.tag }}
+ needs_artifact_download: ${{ (steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true') && needs.setup.outputs.publish != 'true' }}
needs: [setup, datahub_ingestion_base_slim_build]
steps:
- name: Check out the repo
@@ -572,9 +569,9 @@ jobs:
run: ./gradlew :metadata-ingestion:codegen
- name: Download Base Image
uses: ishworkh/docker-image-artifact-download@v1
- if: ${{ needs.setup.outputs.publish != 'true' }}
+ if: ${{ needs.setup.outputs.publish != 'true' && steps.filter.outputs.datahub-ingestion-base == 'true' }}
with:
- image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.slim_tag || 'head' }}
+ image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_slim_tag || 'head' }}
- name: Build and push Slim Image
if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true' }}
uses: ./.github/actions/docker-custom-build-and-push
@@ -584,7 +581,7 @@ jobs:
${{ env.DATAHUB_INGESTION_IMAGE }}
build-args: |
BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }}
- DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.slim_tag || 'head' }}
+ DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_slim_tag || 'head' }}
APP_ENV=slim
tags: ${{ needs.setup.outputs.slim_tag }}
username: ${{ secrets.ACRYL_DOCKER_USERNAME }}
@@ -595,7 +592,7 @@ jobs:
platforms: linux/amd64,linux/arm64/v8
- name: Compute Tag
id: tag
- run: echo "tag=${{ (steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true') && needs.setup.outputs.slim_tag || 'head' }}" >> $GITHUB_OUTPUT
+ run: echo "tag=${{ (steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true') && needs.setup.outputs.unique_slim_tag || 'head' }}" >> $GITHUB_OUTPUT
datahub_ingestion_slim_scan:
permissions:
contents: read # for actions/checkout to fetch code
@@ -609,15 +606,15 @@ jobs:
uses: actions/checkout@v3
- name: Download image Slim Image
uses: ishworkh/docker-image-artifact-download@v1
- if: ${{ needs.setup.outputs.publish != 'true' }}
+ if: ${{ needs.datahub_ingestion_slim_build.outputs.needs_artifact_download == 'true' }}
with:
- image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.slim_tag }}
+ image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }}
- name: Run Trivy vulnerability scanner Slim Image
uses: aquasecurity/trivy-action@0.8.0
env:
TRIVY_OFFLINE_SCAN: true
with:
- image-ref: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.slim_tag }}
+ image-ref: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }}
format: "template"
template: "@/contrib/sarif.tpl"
output: "trivy-results.sarif"
@@ -634,6 +631,7 @@ jobs:
runs-on: ubuntu-latest
outputs:
tag: ${{ steps.tag.outputs.tag }}
+ needs_artifact_download: ${{ (steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true') && needs.setup.outputs.publish != 'true' }}
needs: [setup, datahub_ingestion_base_full_build]
steps:
- name: Check out the repo
@@ -653,9 +651,9 @@ jobs:
run: ./gradlew :metadata-ingestion:codegen
- name: Download Base Image
uses: ishworkh/docker-image-artifact-download@v1
- if: ${{ needs.setup.outputs.publish != 'true' }}
+ if: ${{ needs.setup.outputs.publish != 'true' && steps.filter.outputs.datahub-ingestion-base == 'true' }}
with:
- image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.full_tag || 'head' }}
+ image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }}
- name: Build and push Full Image
if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true' }}
uses: ./.github/actions/docker-custom-build-and-push
@@ -665,8 +663,8 @@ jobs:
${{ env.DATAHUB_INGESTION_IMAGE }}
build-args: |
BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }}
- DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.full_tag || 'head' }}
- tags: ${{ needs.setup.outputs.full_tag }}
+ DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }}
+ tags: ${{ needs.setup.outputs.unique_full_tag }}
username: ${{ secrets.ACRYL_DOCKER_USERNAME }}
password: ${{ secrets.ACRYL_DOCKER_PASSWORD }}
publish: ${{ needs.setup.outputs.publish }}
@@ -675,7 +673,7 @@ jobs:
platforms: linux/amd64,linux/arm64/v8
- name: Compute Tag (Full)
id: tag
- run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.full_tag || 'head' }}" >> $GITHUB_OUTPUT
+ run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }}" >> $GITHUB_OUTPUT
datahub_ingestion_full_scan:
permissions:
contents: read # for actions/checkout to fetch code
@@ -689,15 +687,15 @@ jobs:
uses: actions/checkout@v3
- name: Download image Full Image
uses: ishworkh/docker-image-artifact-download@v1
- if: ${{ needs.setup.outputs.publish != 'true' }}
+ if: ${{ needs.datahub_ingestion_full_build.outputs.needs_artifact_download == 'true' }}
with:
- image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_full_build.outputs.full_tag }}
+ image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_full_build.outputs.tag }}
- name: Run Trivy vulnerability scanner Full Image
uses: aquasecurity/trivy-action@0.8.0
env:
TRIVY_OFFLINE_SCAN: true
with:
- image-ref: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_full_build.outputs.full_tag }}
+ image-ref: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_full_build.outputs.tag }}
format: "template"
template: "@/contrib/sarif.tpl"
output: "trivy-results.sarif"
@@ -750,6 +748,10 @@ jobs:
./gradlew :metadata-ingestion:install
- name: Disk Check
run: df -h . && docker images
+ - name: Remove images
+ run: docker image prune -a -f || true
+ - name: Disk Check
+ run: df -h . && docker images
- name: Download GMS image
uses: ishworkh/docker-image-artifact-download@v1
if: ${{ needs.setup.outputs.publish != 'true' }}
@@ -792,9 +794,9 @@ jobs:
image: ${{ env.DATAHUB_UPGRADE_IMAGE }}:${{ needs.setup.outputs.unique_tag }}
- name: Download datahub-ingestion-slim image
uses: ishworkh/docker-image-artifact-download@v1
- if: ${{ needs.setup.outputs.publish != 'true' }}
+ if: ${{ needs.datahub_ingestion_slim_build.outputs.needs_artifact_download == 'true' }}
with:
- image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.setup.outputs.unique_tag }}
+ image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }}
- name: Disk Check
run: df -h . && docker images
- name: run quickstart
@@ -812,6 +814,8 @@ jobs:
# we are doing this because gms takes time to get ready
# and we don't have a better readiness check when bootstrap is done
sleep 60s
+ - name: Disk Check
+ run: df -h . && docker images
- name: Disable ES Disk Threshold
run: |
curl -XPUT "http://localhost:9200/_cluster/settings" \
diff --git a/datahub-graphql-core/src/main/resources/search.graphql b/datahub-graphql-core/src/main/resources/search.graphql
index f15535bfb4eb8..fbea66f738955 100644
--- a/datahub-graphql-core/src/main/resources/search.graphql
+++ b/datahub-graphql-core/src/main/resources/search.graphql
@@ -448,6 +448,11 @@ enum FilterOperator {
* Represent the relation: String field is one of the array values to, e.g. name in ["Profile", "Event"]
"""
IN
+
+ """
+ Represents the relation: The field exists. If the field is an array, the field is either not present or empty.
+ """
+ EXISTS
}
"""
diff --git a/datahub-web-react/src/app/entity/view/select/ViewSelect.tsx b/datahub-web-react/src/app/entity/view/select/ViewSelect.tsx
index 03689460eb02b..eda9b7d7fe2a4 100644
--- a/datahub-web-react/src/app/entity/view/select/ViewSelect.tsx
+++ b/datahub-web-react/src/app/entity/view/select/ViewSelect.tsx
@@ -1,4 +1,4 @@
-import React, { useEffect, useRef, useState } from 'react';
+import React, { CSSProperties, useEffect, useRef, useState } from 'react';
import { useHistory } from 'react-router';
import { Select } from 'antd';
import styled from 'styled-components';
@@ -55,11 +55,21 @@ const ViewSelectContainer = styled.div`
.ant-select-selection-item {
font-weight: 700;
font-size: 14px;
+ text-align: left;
}
}
}
`;
+const SelectStyled = styled(Select)`
+ min-width: 90px;
+ max-width: 200px;
+`;
+
+type Props = {
+ dropdownStyle?: CSSProperties;
+};
+
/**
* The View Select component allows you to select a View to apply to query on the current page. For example,
* search, recommendations, and browse.
@@ -69,7 +79,7 @@ const ViewSelectContainer = styled.div`
*
* In the event that a user refreshes their browser, the state of the view should be saved as well.
*/
-export const ViewSelect = () => {
+export const ViewSelect = ({ dropdownStyle = {} }: Props) => {
const history = useHistory();
const userContext = useUserContext();
const [isOpen, setIsOpen] = useState(false);
@@ -188,12 +198,11 @@ export const ViewSelect = () => {
return (
-
+
{viewBuilderDisplayState.visible && (
{
ref={clearButtonRef}
onClick={onHandleClickClear}
>
- All Entities
+ View all
);
diff --git a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx
index f776082e3f905..36713cfb7ffcf 100644
--- a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx
+++ b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx
@@ -296,7 +296,7 @@ export default function DefaultPreviewCard({
{deprecation?.deprecated && (
)}
- {health && health.length > 0 && }
+ {health && health.length > 0 ? : null}
{externalUrl && (
initialValues?.includes(agg?.entity?.urn || ''))?.entity || null
- }
- onModalClose={onCloseModal}
- onOkOverride={(dataProductUrn) => {
- onSelect([dataProductUrn]);
- onCloseModal();
- }}
- />
- );
- }
-
if (filterField === CONTAINER_FILTER_NAME) {
return (
-
+
)}
> $CONNECTION_PROPERTIES_PATH
fi
-cub kafka-ready -c $CONNECTION_PROPERTIES_PATH -b $KAFKA_BOOTSTRAP_SERVER 1 180
-
+# cub kafka-ready -c $CONNECTION_PROPERTIES_PATH -b $KAFKA_BOOTSTRAP_SERVER 1 180
+. kafka-ready.sh
############################################################
# Start Topic Creation Logic
diff --git a/docs/advanced/no-code-modeling.md b/docs/advanced/no-code-modeling.md
index e1fadee6d371a..9c8f6761a62bc 100644
--- a/docs/advanced/no-code-modeling.md
+++ b/docs/advanced/no-code-modeling.md
@@ -211,7 +211,7 @@ record ServiceKey {
* Name of the service
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true
}
name: string
diff --git a/docs/lineage/airflow.md b/docs/lineage/airflow.md
index ef4071f89c585..21d59b777dd7c 100644
--- a/docs/lineage/airflow.md
+++ b/docs/lineage/airflow.md
@@ -62,6 +62,7 @@ lazy_load_plugins = False
| datahub.cluster | prod | name of the airflow cluster |
| datahub.capture_ownership_info | true | If true, the owners field of the DAG will be capture as a DataHub corpuser. |
| datahub.capture_tags_info | true | If true, the tags field of the DAG will be captured as DataHub tags. |
+ | datahub.capture_executions | true | If true, we'll capture task runs in DataHub in addition to DAG definitions. |
| datahub.graceful_exceptions | true | If set to true, most runtime errors in the lineage backend will be suppressed and will not cause the overall task to fail. Note that configuration issues will still throw exceptions. |
5. Configure `inlets` and `outlets` for your Airflow operators. For reference, look at the sample DAG in [`lineage_backend_demo.py`](../../metadata-ingestion/src/datahub_provider/example_dags/lineage_backend_demo.py), or reference [`lineage_backend_taskflow_demo.py`](../../metadata-ingestion/src/datahub_provider/example_dags/lineage_backend_taskflow_demo.py) if you're using the [TaskFlow API](https://airflow.apache.org/docs/apache-airflow/stable/concepts/taskflow.html).
@@ -80,9 +81,7 @@ Emitting DataHub ...
If you have created a custom Airflow operator [docs](https://airflow.apache.org/docs/apache-airflow/stable/howto/custom-operator.html) that inherits from the BaseOperator class,
when overriding the `execute` function, set inlets and outlets via `context['ti'].task.inlets` and `context['ti'].task.outlets`.
-The DataHub Airflow plugin will then pick up those inlets and outlets after the task runs.
-
-
+The DataHub Airflow plugin will then pick up those inlets and outlets after the task runs.
```python
class DbtOperator(BaseOperator):
@@ -97,8 +96,8 @@ class DbtOperator(BaseOperator):
def _get_lineage(self):
# Do some processing to get inlets/outlets
-
- return inlets, outlets
+
+ return inlets, outlets
```
If you override the `pre_execute` and `post_execute` function, ensure they include the `@prepare_lineage` and `@apply_lineage` decorators respectively. [source](https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/lineage.html#lineage)
@@ -172,7 +171,6 @@ Take a look at this sample DAG:
In order to use this example, you must first configure the Datahub hook. Like in ingestion, we support a Datahub REST hook and a Kafka-based hook. See step 1 above for details.
-
## Debugging
### Incorrect URLs
diff --git a/docs/modeling/extending-the-metadata-model.md b/docs/modeling/extending-the-metadata-model.md
index 32951ab2e41eb..f47630f44e772 100644
--- a/docs/modeling/extending-the-metadata-model.md
+++ b/docs/modeling/extending-the-metadata-model.md
@@ -323,7 +323,7 @@ It takes the following parameters:
annotations. To customize the set of analyzers used to index a certain field, you must add a new field type and define
the set of mappings to be applied in the MappingsBuilder.
- Thus far, we have implemented 10 fieldTypes:
+ Thus far, we have implemented 11 fieldTypes:
1. *KEYWORD* - Short text fields that only support exact matches, often used only for filtering
@@ -332,20 +332,25 @@ It takes the following parameters:
3. *TEXT_PARTIAL* - Text fields delimited by spaces/slashes/periods with partial matching support. Note, partial
matching is expensive, so this field type should not be applied to fields with long values (like description)
- 4. *BROWSE_PATH* - Field type for browse paths. Applies specific mappings for slash delimited paths.
+ 4. *WORD_GRAM* - Text fields delimited by spaces, slashes, periods, dashes, or underscores with partial matching AND
+ word gram support. That is, the text will be split by the delimiters and can be matched with delimited queries
+ matching two, three, or four length tokens in addition to single tokens. As with partial match, this type is
+ expensive, so should not be applied to fields with long values such as description.
- 5. *URN* - Urn fields where each sub-component inside the urn is indexed. For instance, for a data platform urn like
+ 5. *BROWSE_PATH* - Field type for browse paths. Applies specific mappings for slash delimited paths.
+
+ 6. *URN* - Urn fields where each sub-component inside the urn is indexed. For instance, for a data platform urn like
"urn:li:dataplatform:kafka", it will index the platform name "kafka" and ignore the common components
- 6. *URN_PARTIAL* - Urn fields where each sub-component inside the urn is indexed with partial matching support.
+ 7. *URN_PARTIAL* - Urn fields where each sub-component inside the urn is indexed with partial matching support.
- 7. *BOOLEAN* - Boolean fields used for filtering.
+ 8. *BOOLEAN* - Boolean fields used for filtering.
- 8. *COUNT* - Count fields used for filtering.
+ 9. *COUNT* - Count fields used for filtering.
- 9. *DATETIME* - Datetime fields used to represent timestamps.
+ 10. *DATETIME* - Datetime fields used to represent timestamps.
- 10. *OBJECT* - Each property in an object will become an extra column in Elasticsearch and can be referenced as
+ 11. *OBJECT* - Each property in an object will become an extra column in Elasticsearch and can be referenced as
`field.property` in queries. You should be careful to not use it on objects with many properties as it can cause a
mapping explosion in Elasticsearch.
diff --git a/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java b/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java
index f2e65c771c6eb..3d3fbcf3ccaa6 100644
--- a/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java
+++ b/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java
@@ -21,7 +21,7 @@ public class SearchableAnnotation {
public static final String ANNOTATION_NAME = "Searchable";
private static final Set DEFAULT_QUERY_FIELD_TYPES =
- ImmutableSet.of(FieldType.TEXT, FieldType.TEXT_PARTIAL, FieldType.URN, FieldType.URN_PARTIAL);
+ ImmutableSet.of(FieldType.TEXT, FieldType.TEXT_PARTIAL, FieldType.WORD_GRAM, FieldType.URN, FieldType.URN_PARTIAL);
// Name of the field in the search index. Defaults to the field name in the schema
String fieldName;
@@ -59,7 +59,8 @@ public enum FieldType {
COUNT,
DATETIME,
OBJECT,
- BROWSE_PATH_V2
+ BROWSE_PATH_V2,
+ WORD_GRAM
}
@Nonnull
diff --git a/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java b/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java
index 1ab5ff640ce32..3618108970afa 100644
--- a/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java
+++ b/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java
@@ -142,7 +142,7 @@ private void validateTestEntityInfo(final AspectSpec testEntityInfo) {
assertEquals(new TestEntityInfo().schema().getFullName(), testEntityInfo.getPegasusSchema().getFullName());
// Assert on Searchable Fields
- assertEquals(9, testEntityInfo.getSearchableFieldSpecs().size());
+ assertEquals(testEntityInfo.getSearchableFieldSpecs().size(), 10);
assertEquals("customProperties", testEntityInfo.getSearchableFieldSpecMap().get(
new PathSpec("customProperties").toString()).getSearchableAnnotation().getFieldName());
assertEquals(SearchableAnnotation.FieldType.KEYWORD, testEntityInfo.getSearchableFieldSpecMap().get(
@@ -158,6 +158,11 @@ private void validateTestEntityInfo(final AspectSpec testEntityInfo) {
assertEquals(SearchableAnnotation.FieldType.TEXT_PARTIAL, testEntityInfo.getSearchableFieldSpecMap().get(
new PathSpec("textArrayField", "*").toString())
.getSearchableAnnotation().getFieldType());
+ assertEquals("wordGramField", testEntityInfo.getSearchableFieldSpecMap().get(
+ new PathSpec("wordGramField").toString()).getSearchableAnnotation().getFieldName());
+ assertEquals(SearchableAnnotation.FieldType.WORD_GRAM, testEntityInfo.getSearchableFieldSpecMap().get(
+ new PathSpec("wordGramField").toString())
+ .getSearchableAnnotation().getFieldType());
assertEquals("nestedIntegerField", testEntityInfo.getSearchableFieldSpecMap().get(
new PathSpec("nestedRecordField", "nestedIntegerField").toString()).getSearchableAnnotation().getFieldName());
assertEquals(SearchableAnnotation.FieldType.COUNT, testEntityInfo.getSearchableFieldSpecMap().get(
diff --git a/metadata-ingestion/docs/sources/kafka-connect/kafka-connect.md b/metadata-ingestion/docs/sources/kafka-connect/kafka-connect.md
index 9d400460407c8..03bcef70e1860 100644
--- a/metadata-ingestion/docs/sources/kafka-connect/kafka-connect.md
+++ b/metadata-ingestion/docs/sources/kafka-connect/kafka-connect.md
@@ -1,5 +1,60 @@
## Advanced Configurations
+### Working with Platform Instances
+If you have multiple instances of kafka OR source/sink systems that are referred to in your `kafka-connect` setup, you need to configure platform instances for these systems in the `kafka-connect` recipe to generate correct lineage edges. You must have already set `platform_instance` in the recipes of the original source/sink systems. Refer to the document [Working with Platform Instances](https://datahubproject.io/docs/platform-instances) to understand more about this.
+
+There are two options available to declare source/sink system's `platform_instance` in `kafka-connect` recipe. If single instance of platform is used across all `kafka-connect` connectors, you can use `platform_instance_map` to specify platform_instance to use for a platform when constructing URNs for lineage.
+
+Example:
+```yml
+ # Map of platform name to platform instance
+ platform_instance_map:
+ snowflake: snowflake_platform_instance
+ mysql: mysql_platform_instance
+
+```
+If multiple instances of a platform are used across `kafka-connect` connectors, you need to specify the platform_instance to use for the platform for every connector.
+
+#### Example - Multiple MySQL Source Connectors each reading from a different mysql instance
+```yml
+ # Map of platform name to platform instance per connector
+ connect_to_platform_map:
+ mysql_connector1:
+ mysql: mysql_instance1
+
+ mysql_connector2:
+ mysql: mysql_instance2
+```
+Here mysql_connector1 and mysql_connector2 are names of MySQL source connectors as defined in `kafka-connect` connector config.
+
+#### Example - Multiple MySQL Source Connectors each reading from a different mysql instance and writing to a different kafka cluster
+```yml
+ connect_to_platform_map:
+ mysql_connector1:
+ mysql: mysql_instance1
+ kafka: kafka_instance1
+
+ mysql_connector2:
+ mysql: mysql_instance2
+ kafka: kafka_instance2
+```
+You can also use a combination of `platform_instance_map` and `connect_to_platform_map` in your recipe. Note that the platform_instance specified for the connector in `connect_to_platform_map` will always take higher precedence even if a platform_instance for the same platform is set in `platform_instance_map`.
+
+If you do not use `platform_instance` in original source/sink recipes, you do not need to specify them in above configurations.
+
+Note that you do not need to specify a platform_instance for BigQuery.
+
+#### Example - Multiple BigQuery Sink Connectors each writing to different kafka cluster
+```yml
+ connect_to_platform_map:
+ bigquery_connector1:
+ kafka: kafka_instance1
+
+ bigquery_connector2:
+ kafka: kafka_instance2
+```
+
+### Provided Configurations from External Sources
Kafka Connect supports pluggable configuration providers which can load configuration data from external sources at runtime. These values are not available to DataHub ingestion source through Kafka Connect APIs. If you are using such provided configurations to specify connection url (database, etc) in Kafka Connect connector configuration then you will need also add these in `provided_configs` section in recipe for DataHub to generate correct lineage.
```yml
diff --git a/metadata-ingestion/docs/sources/kafka-connect/kafka-connect_recipe.yml b/metadata-ingestion/docs/sources/kafka-connect/kafka-connect_recipe.yml
index f5e33e661622d..cacbda5ca078a 100644
--- a/metadata-ingestion/docs/sources/kafka-connect/kafka-connect_recipe.yml
+++ b/metadata-ingestion/docs/sources/kafka-connect/kafka-connect_recipe.yml
@@ -3,14 +3,16 @@ source:
config:
# Coordinates
connect_uri: "http://localhost:8083"
-
+
# Credentials
username: admin
password: password
# Optional
- platform_instance_map:
- bigquery: bigquery_platform_instance_id
-
+ # Platform instance mapping to use when constructing URNs.
+ # Use if single instance of platform is referred across connectors.
+ platform_instance_map:
+ mysql: mysql_platform_instance
+
sink:
- # sink configs
\ No newline at end of file
+ # sink configs
diff --git a/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py b/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py
index 8e313e92cbf84..c943b83a887ed 100644
--- a/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py
+++ b/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py
@@ -435,6 +435,7 @@ def _field_from_complex_type(
field_path._set_parent_type_if_not_exists(
DataHubType(type=MapTypeClass, nested_type=value_type)
)
+ # FIXME: description not set. This is present in schema["description"].
yield from JsonSchemaTranslator.get_fields(
JsonSchemaTranslator._get_type_from_schema(
schema["additionalProperties"]
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
index d1f39a3ba1ba6..7725d63ce0e1e 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
@@ -129,6 +129,7 @@
# Handle table snapshots
# See https://cloud.google.com/bigquery/docs/table-snapshots-intro.
SNAPSHOT_TABLE_REGEX = re.compile(r"^(.+)@(\d{13})$")
+CLUSTERING_COLUMN_TAG = "CLUSTERING_COLUMN"
# We can't use close as it is not called if the ingestion is not successful
@@ -1151,6 +1152,21 @@ def gen_schema_fields(self, columns: List[BigqueryColumn]) -> List[SchemaField]:
field.description = col.comment
schema_fields[idx] = field
else:
+ tags = []
+ if col.is_partition_column:
+ tags.append(
+ TagAssociationClass(make_tag_urn(Constants.TAG_PARTITION_KEY))
+ )
+
+ if col.cluster_column_position is not None:
+ tags.append(
+ TagAssociationClass(
+ make_tag_urn(
+ f"{CLUSTERING_COLUMN_TAG}_{col.cluster_column_position}"
+ )
+ )
+ )
+
field = SchemaField(
fieldPath=col.name,
type=SchemaFieldDataType(
@@ -1160,15 +1176,7 @@ def gen_schema_fields(self, columns: List[BigqueryColumn]) -> List[SchemaField]:
nativeDataType=col.data_type,
description=col.comment,
nullable=col.is_nullable,
- globalTags=GlobalTagsClass(
- tags=[
- TagAssociationClass(
- make_tag_urn(Constants.TAG_PARTITION_KEY)
- )
- ]
- )
- if col.is_partition_column
- else GlobalTagsClass(tags=[]),
+ globalTags=GlobalTagsClass(tags=tags),
)
schema_fields.append(field)
last_id = col.ordinal_position
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py
index 2450dbd0e2391..f8256f8e6fed6 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py
@@ -33,6 +33,7 @@ class BigqueryTableType:
class BigqueryColumn(BaseColumn):
field_path: str
is_partition_column: bool
+ cluster_column_position: Optional[int]
RANGE_PARTITION_NAME: str = "RANGE"
@@ -285,7 +286,8 @@ class BigqueryQuery:
CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type,
description as comment,
c.is_hidden as is_hidden,
- c.is_partitioning_column as is_partitioning_column
+ c.is_partitioning_column as is_partitioning_column,
+ c.clustering_ordinal_position as clustering_ordinal_position,
from
`{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMNS c
join `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS as cfp on cfp.table_name = c.table_name
@@ -307,6 +309,7 @@ class BigqueryQuery:
description as comment,
c.is_hidden as is_hidden,
c.is_partitioning_column as is_partitioning_column,
+ c.clustering_ordinal_position as clustering_ordinal_position,
-- We count the columns to be able limit it later
row_number() over (partition by c.table_catalog, c.table_schema, c.table_name order by c.ordinal_position asc, c.data_type DESC) as column_num,
-- Getting the maximum shard for each table
@@ -333,6 +336,7 @@ class BigqueryQuery:
CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type,
c.is_hidden as is_hidden,
c.is_partitioning_column as is_partitioning_column,
+ c.clustering_ordinal_position as clustering_ordinal_position,
description as comment
from
`{table_identifier.project_id}`.`{table_identifier.dataset}`.INFORMATION_SCHEMA.COLUMNS as c
@@ -583,6 +587,7 @@ def get_columns_for_dataset(
data_type=column.data_type,
comment=column.comment,
is_partition_column=column.is_partitioning_column == "YES",
+ cluster_column_position=column.clustering_ordinal_position,
)
)
@@ -621,6 +626,7 @@ def get_columns_for_table(
data_type=column.data_type,
comment=column.comment,
is_partition_column=column.is_partitioning_column == "YES",
+ cluster_column_position=column.clustering_ordinal_position,
)
)
last_seen_table = column.table_name
diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py
index 1cd5ed8164854..af9769bc9d94c 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py
@@ -162,9 +162,11 @@ class DBTCloudConfig(DBTCommonConfig):
}
_DBT_GRAPHQL_QUERY = """
-query DatahubMetadataQuery_{type}($jobId: Int!, $runId: Int) {{
- {type}(jobId: $jobId, runId: $runId) {{
+query DatahubMetadataQuery_{type}($jobId: BigInt!, $runId: BigInt) {{
+ job(id: $jobId, runId: $runId) {{
+ {type} {{
{fields}
+ }}
}}
}}
"""
@@ -218,7 +220,7 @@ def load_nodes(self) -> Tuple[List[DBTNode], Dict[str, Optional[str]]]:
},
)
- raw_nodes.extend(data[node_type])
+ raw_nodes.extend(data["job"][node_type])
nodes = [self._parse_into_dbt_node(node) for node in raw_nodes]
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py
index 039eac1e93819..587c71a98be67 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py
@@ -1,5 +1,6 @@
from typing import List, Optional
+from datahub.configuration.time_window_config import BucketDuration
from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain
from datahub.ingestion.source.snowflake.snowflake_config import DEFAULT_TABLES_DENY_LIST
@@ -575,14 +576,17 @@ def get_access_history_date_range() -> str:
def usage_per_object_per_time_bucket_for_time_window(
start_time_millis: int,
end_time_millis: int,
- time_bucket_size: str,
+ time_bucket_size: BucketDuration,
use_base_objects: bool,
top_n_queries: int,
include_top_n_queries: bool,
) -> str:
if not include_top_n_queries:
top_n_queries = 0
- assert time_bucket_size == "DAY" or time_bucket_size == "HOUR"
+ assert (
+ time_bucket_size == BucketDuration.DAY
+ or time_bucket_size == BucketDuration.HOUR
+ )
objects_column = (
"BASE_OBJECTS_ACCESSED" if use_base_objects else "DIRECT_OBJECTS_ACCESSED"
)
@@ -629,7 +633,7 @@ def usage_per_object_per_time_bucket_for_time_window(
SELECT
object_name,
ANY_VALUE(object_domain) AS object_domain,
- DATE_TRUNC('{time_bucket_size}', CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time,
+ DATE_TRUNC('{time_bucket_size.value}', CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time,
count(distinct(query_id)) AS total_queries,
count( distinct(user_name) ) AS total_users
FROM
@@ -644,7 +648,7 @@ def usage_per_object_per_time_bucket_for_time_window(
SELECT
object_name,
column_name,
- DATE_TRUNC('{time_bucket_size}', CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time,
+ DATE_TRUNC('{time_bucket_size.value}', CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time,
count(distinct(query_id)) AS total_queries
FROM
field_access_history
@@ -658,7 +662,7 @@ def usage_per_object_per_time_bucket_for_time_window(
(
SELECT
object_name,
- DATE_TRUNC('{time_bucket_size}', CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time,
+ DATE_TRUNC('{time_bucket_size.value}', CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time,
count(distinct(query_id)) AS total_queries,
user_name,
ANY_VALUE(users.email) AS user_email
@@ -677,7 +681,7 @@ def usage_per_object_per_time_bucket_for_time_window(
(
SELECT
object_name,
- DATE_TRUNC('{time_bucket_size}', CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time,
+ DATE_TRUNC('{time_bucket_size.value}', CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time,
query_history.query_text AS query_text,
count(distinct(access_history.query_id)) AS total_queries
FROM
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py
index 3605205b6055c..f8dfa612952d8 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py
@@ -356,7 +356,6 @@ def _check_usage_date_ranges(self) -> Any:
def _get_operation_aspect_work_unit(
self, event: SnowflakeJoinedAccessEvent, discovered_datasets: List[str]
) -> Iterable[MetadataWorkUnit]:
-
if event.query_start_time and event.query_type:
start_time = event.query_start_time
query_type = event.query_type
diff --git a/metadata-ingestion/tests/integration/vertica/docker-compose.yml b/metadata-ingestion/tests/integration/vertica/docker-compose.yml
index ddaf206f236cf..84af5c32a60e3 100644
--- a/metadata-ingestion/tests/integration/vertica/docker-compose.yml
+++ b/metadata-ingestion/tests/integration/vertica/docker-compose.yml
@@ -1,6 +1,7 @@
version: "3.9"
services:
vertica:
+ platform: linux/amd64
environment:
APP_DB_USER: "dbadmin"
APP_DB_PASSWORD: "abc123"
@@ -18,6 +19,3 @@ services:
volumes:
vertica-data:
-
-
-
diff --git a/metadata-ingestion/tests/integration/vertica/test_vertica.py b/metadata-ingestion/tests/integration/vertica/test_vertica.py
index db8bfd247313b..fe306d1d0b2b8 100644
--- a/metadata-ingestion/tests/integration/vertica/test_vertica.py
+++ b/metadata-ingestion/tests/integration/vertica/test_vertica.py
@@ -58,6 +58,7 @@ def vertica_runner(docker_compose_runner, test_resources_dir):
# Test needs more work to be done , currently it is working fine.
@freeze_time(FROZEN_TIME)
+@pytest.mark.skip("Failing in CI, cmd failing with exit code 1")
@pytest.mark.integration
def test_vertica_ingest_with_db(vertica_runner, pytestconfig, tmp_path):
test_resources_dir = pytestconfig.rootpath / "tests/integration/vertica"
diff --git a/metadata-ingestion/tests/unit/test_bigquery_profiler.py b/metadata-ingestion/tests/unit/test_bigquery_profiler.py
index a2aec8df93d09..44ce5f0a02e37 100644
--- a/metadata-ingestion/tests/unit/test_bigquery_profiler.py
+++ b/metadata-ingestion/tests/unit/test_bigquery_profiler.py
@@ -37,6 +37,7 @@ def test_generate_day_partitioned_partition_profiler_query():
ordinal_position=1,
data_type="TIMESTAMP",
is_partition_column=True,
+ cluster_column_position=None,
comment=None,
is_nullable=False,
)
@@ -79,6 +80,7 @@ def test_generate_day_partitioned_partition_profiler_query_with_set_partition_ti
ordinal_position=1,
data_type="TIMESTAMP",
is_partition_column=True,
+ cluster_column_position=None,
comment=None,
is_nullable=False,
)
@@ -120,6 +122,7 @@ def test_generate_hour_partitioned_partition_profiler_query():
ordinal_position=1,
data_type="TIMESTAMP",
is_partition_column=True,
+ cluster_column_position=None,
comment=None,
is_nullable=False,
)
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java
index 555acb2ffdd3b..efa4e0c279a76 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java
@@ -42,6 +42,9 @@ public static Map getPartialNgramConfigWithOverrides(Map getMappingsForField(@Nonnull final Searchable
mappingForField.put(NORMALIZER, KEYWORD_NORMALIZER);
// Add keyword subfield without lowercase filter
mappingForField.put(FIELDS, ImmutableMap.of(KEYWORD, KEYWORD_TYPE_MAP));
- } else if (fieldType == FieldType.TEXT || fieldType == FieldType.TEXT_PARTIAL) {
+ } else if (fieldType == FieldType.TEXT || fieldType == FieldType.TEXT_PARTIAL || fieldType == FieldType.WORD_GRAM) {
mappingForField.put(TYPE, KEYWORD);
mappingForField.put(NORMALIZER, KEYWORD_NORMALIZER);
Map subFields = new HashMap<>();
- if (fieldType == FieldType.TEXT_PARTIAL) {
+ if (fieldType == FieldType.TEXT_PARTIAL || fieldType == FieldType.WORD_GRAM) {
subFields.put(NGRAM, getPartialNgramConfigWithOverrides(
ImmutableMap.of(
ANALYZER, PARTIAL_ANALYZER
)
));
+ if (fieldType == FieldType.WORD_GRAM) {
+ for (Map.Entry entry : Map.of(
+ WORD_GRAMS_LENGTH_2, WORD_GRAM_2_ANALYZER,
+ WORD_GRAMS_LENGTH_3, WORD_GRAM_3_ANALYZER,
+ WORD_GRAMS_LENGTH_4, WORD_GRAM_4_ANALYZER).entrySet()) {
+ String fieldName = entry.getKey();
+ String analyzerName = entry.getValue();
+ subFields.put(fieldName, ImmutableMap.of(
+ TYPE, TEXT,
+ ANALYZER, analyzerName,
+ SEARCH_ANALYZER, analyzerName
+ ));
+ }
+ }
}
subFields.put(DELIMITED, ImmutableMap.of(
TYPE, TEXT,
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java
index 5b3e396837aa7..e180c8296b48d 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java
@@ -66,6 +66,9 @@ public class SettingsBuilder {
public static final String KEYWORD_ANALYZER = "keyword";
public static final String URN_ANALYZER = "urn_component";
public static final String URN_SEARCH_ANALYZER = "query_urn_component";
+ public static final String WORD_GRAM_2_ANALYZER = "word_gram_2";
+ public static final String WORD_GRAM_3_ANALYZER = "word_gram_3";
+ public static final String WORD_GRAM_4_ANALYZER = "word_gram_4";
// Filters
public static final String ALPHANUM_SPACE_ONLY = "alpha_num_space";
@@ -80,6 +83,10 @@ public class SettingsBuilder {
public static final String MULTIFILTER = "multifilter";
public static final String MULTIFILTER_GRAPH = "multifilter_graph";
public static final String PARTIAL_URN_COMPONENT = "partial_urn_component";
+ public static final String SHINGLE = "shingle";
+ public static final String WORD_GRAM_2_FILTER = "word_gram_2_filter";
+ public static final String WORD_GRAM_3_FILTER = "word_gram_3_filter";
+ public static final String WORD_GRAM_4_FILTER = "word_gram_4_filter";
public static final String SNOWBALL = "snowball";
public static final String STEM_OVERRIDE = "stem_override";
public static final String STOP = "stop";
@@ -108,6 +115,7 @@ public class SettingsBuilder {
public static final String SLASH_TOKENIZER = "slash_tokenizer";
public static final String UNIT_SEPARATOR_PATH_TOKENIZER = "unit_separator_path_tokenizer";
public static final String UNIT_SEPARATOR_TOKENIZER = "unit_separator_tokenizer";
+ public static final String WORD_GRAM_TOKENIZER = "word_gram_tokenizer";
// Do not remove the space, needed for multi-term synonyms
public static final List ALPHANUM_SPACE_PATTERNS = ImmutableList.of(
"([a-z0-9 _-]{2,})",
@@ -161,6 +169,13 @@ public class SettingsBuilder {
AUTOCOMPLETE_CUSTOM_DELIMITER,
LOWERCASE);
+ public static final List WORD_GRAM_TOKEN_FILTERS = ImmutableList.of(
+ ASCII_FOLDING,
+ LOWERCASE,
+ TRIM,
+ REMOVE_QUOTES
+ );
+
public final Map settings;
public SettingsBuilder(String mainTokenizer) {
@@ -275,6 +290,17 @@ private static Map buildFilters() throws IOException {
.collect(Collectors.toList()))
.build());
}
+
+ for (Map.Entry entry : Map.of(WORD_GRAM_2_FILTER, 2, WORD_GRAM_3_FILTER, 3, WORD_GRAM_4_FILTER, 4).entrySet()) {
+ String filterName = entry.getKey();
+ Integer gramSize = entry.getValue();
+ filters.put(filterName, ImmutableMap.builder()
+ .put(TYPE, SHINGLE)
+ .put("min_shingle_size", gramSize)
+ .put("max_shingle_size", gramSize)
+ .put("output_unigrams", false)
+ .build());
+ }
}
return filters.build();
@@ -302,13 +328,24 @@ private static Map buildTokenizers() {
.put(DELIMITER, "␟")
.build());
- // Tokenize by whitespace and most special chars
+ // Tokenize by most special chars
+ // Do NOT tokenize by whitespace to keep multi-word synonyms in the same token
+ // The split by whitespace is done later in the token filters phase
tokenizers.put(MAIN_TOKENIZER,
ImmutableMap.builder()
.put(TYPE, PATTERN)
.put(PATTERN, "[(),./:]")
.build());
+ // Tokenize by whitespace and most special chars for wordgrams
+ // only split on - when not preceded by a whitespace to preserve exclusion functionality
+ // i.e. "logging-events-bkcp" and "logging-events -bckp" should be handled differently
+ tokenizers.put(WORD_GRAM_TOKENIZER,
+ ImmutableMap.builder()
+ .put(TYPE, PATTERN)
+ .put(PATTERN, "[(),./:\\s_]|(?<=\\S)(-)")
+ .build());
+
return tokenizers.build();
}
@@ -382,6 +419,21 @@ private static Map buildAnalyzers(String mainTokenizer) {
.put(FILTER, SEARCH_TOKEN_FILTERS)
.build());
+ // Support word grams
+ for (Map.Entry entry : Map.of(
+ WORD_GRAM_2_ANALYZER, WORD_GRAM_2_FILTER,
+ WORD_GRAM_3_ANALYZER, WORD_GRAM_3_FILTER,
+ WORD_GRAM_4_ANALYZER, WORD_GRAM_4_FILTER).entrySet()) {
+ String analyzerName = entry.getKey();
+ String filterName = entry.getValue();
+ analyzers.put(analyzerName, ImmutableMap.builder()
+ .put(TOKENIZER, WORD_GRAM_TOKENIZER)
+ .put(FILTER, ImmutableList.