From a17ed80cf742b0e3f5a04632eac6dc2eec045da6 Mon Sep 17 00:00:00 2001 From: Joshua Eilers Date: Mon, 21 Aug 2023 09:47:05 -0700 Subject: [PATCH 01/11] Fix a few view select issues (#8670) --- .../src/app/entity/view/select/ViewSelect.tsx | 23 +++++++++++++------ .../entity/view/select/ViewSelectHeader.tsx | 2 +- .../src/app/search/SearchBar.tsx | 10 +++++++- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/datahub-web-react/src/app/entity/view/select/ViewSelect.tsx b/datahub-web-react/src/app/entity/view/select/ViewSelect.tsx index 03689460eb02b..eda9b7d7fe2a4 100644 --- a/datahub-web-react/src/app/entity/view/select/ViewSelect.tsx +++ b/datahub-web-react/src/app/entity/view/select/ViewSelect.tsx @@ -1,4 +1,4 @@ -import React, { useEffect, useRef, useState } from 'react'; +import React, { CSSProperties, useEffect, useRef, useState } from 'react'; import { useHistory } from 'react-router'; import { Select } from 'antd'; import styled from 'styled-components'; @@ -55,11 +55,21 @@ const ViewSelectContainer = styled.div` .ant-select-selection-item { font-weight: 700; font-size: 14px; + text-align: left; } } } `; +const SelectStyled = styled(Select)` + min-width: 90px; + max-width: 200px; +`; + +type Props = { + dropdownStyle?: CSSProperties; +}; + /** * The View Select component allows you to select a View to apply to query on the current page. For example, * search, recommendations, and browse. @@ -69,7 +79,7 @@ const ViewSelectContainer = styled.div` * * In the event that a user refreshes their browser, the state of the view should be saved as well. */ -export const ViewSelect = () => { +export const ViewSelect = ({ dropdownStyle = {} }: Props) => { const history = useHistory(); const userContext = useUserContext(); const [isOpen, setIsOpen] = useState(false); @@ -188,12 +198,11 @@ export const ViewSelect = () => { return ( - + {viewBuilderDisplayState.visible && ( { ref={clearButtonRef} onClick={onHandleClickClear} > - All Entities + View all ); diff --git a/datahub-web-react/src/app/search/SearchBar.tsx b/datahub-web-react/src/app/search/SearchBar.tsx index 7dbf3c55d021d..fb10e1ca0026e 100644 --- a/datahub-web-react/src/app/search/SearchBar.tsx +++ b/datahub-web-react/src/app/search/SearchBar.tsx @@ -377,7 +377,15 @@ export const SearchBar = ({ onKeyUp={handleStopPropagation} onKeyDown={handleStopPropagation} > - + )} Date: Mon, 21 Aug 2023 15:33:10 -0300 Subject: [PATCH 02/11] feat(search): Add word gram analyzer for name fields (#8611) Co-authored-by: Indy Prentice --- docs/advanced/no-code-modeling.md | 2 +- docs/modeling/extending-the-metadata-model.md | 21 +-- .../annotation/SearchableAnnotation.java | 5 +- .../models/EntitySpecBuilderTest.java | 7 +- .../indexbuilder/MappingsBuilder.java | 21 ++- .../indexbuilder/SettingsBuilder.java | 55 +++++++- .../query/request/SearchFieldConfig.java | 25 +++- .../query/request/SearchQueryBuilder.java | 56 +++++++- .../metadata/ESTestConfiguration.java | 7 + .../fixtures/ElasticSearchGoldenTest.java | 15 +-- .../fixtures/SampleDataFixtureTests.java | 125 ++++++++++++++++++ .../indexbuilder/MappingsBuilderTest.java | 15 ++- .../query/request/SearchQueryBuilderTest.java | 53 +++++--- .../request/SearchRequestHandlerTest.java | 11 +- .../pegasus/com/linkedin/chart/ChartInfo.pdl | 2 +- .../container/ContainerProperties.pdl | 6 +- .../com/linkedin/dashboard/DashboardInfo.pdl | 4 +- .../com/linkedin/datajob/DataFlowInfo.pdl | 2 +- .../com/linkedin/datajob/DataJobInfo.pdl | 2 +- .../dataplatform/DataPlatformInfo.pdl | 4 +- 
.../DataPlatformInstanceProperties.pdl | 2 +- .../DataProcessInstanceProperties.pdl | 2 +- .../dataproduct/DataProductProperties.pdl | 2 +- .../linkedin/dataset/DatasetProperties.pdl | 6 +- .../com/linkedin/domain/DomainProperties.pdl | 2 +- .../linkedin/glossary/GlossaryNodeInfo.pdl | 4 +- .../linkedin/glossary/GlossaryTermInfo.pdl | 4 +- .../identity/CorpUserEditableInfo.pdl | 2 +- .../com/linkedin/identity/CorpUserInfo.pdl | 4 +- .../linkedin/metadata/key/CorpGroupKey.pdl | 4 +- .../com/linkedin/metadata/key/CorpUserKey.pdl | 2 +- .../com/linkedin/metadata/key/DataFlowKey.pdl | 4 +- .../com/linkedin/metadata/key/DataJobKey.pdl | 2 +- .../linkedin/metadata/key/DataProcessKey.pdl | 4 +- .../com/linkedin/metadata/key/DatasetKey.pdl | 2 +- .../linkedin/metadata/key/GlossaryNodeKey.pdl | 4 +- .../linkedin/metadata/key/GlossaryTermKey.pdl | 4 +- .../linkedin/metadata/key/MLFeatureKey.pdl | 4 +- .../metadata/key/MLFeatureTableKey.pdl | 4 +- .../metadata/key/MLModelDeploymentKey.pdl | 4 +- .../linkedin/metadata/key/MLModelGroupKey.pdl | 4 +- .../com/linkedin/metadata/key/MLModelKey.pdl | 4 +- .../linkedin/metadata/key/MLPrimaryKeyKey.pdl | 4 +- .../com/linkedin/metadata/key/TagKey.pdl | 4 +- .../com/linkedin/notebook/NotebookInfo.pdl | 2 +- .../linkedin/ownership/OwnershipTypeInfo.pdl | 4 +- .../com/linkedin/query/QueryProperties.pdl | 4 +- .../com/linkedin/role/RoleProperties.pdl | 2 +- .../com/linkedin/tag/TagProperties.pdl | 2 +- .../config/search/SearchConfiguration.java | 1 + .../config/search/WordGramConfiguration.java | 11 ++ .../src/main/resources/application.yml | 6 +- .../com/datahub/test/TestEntityInfo.pdl | 5 + 53 files changed, 449 insertions(+), 108 deletions(-) create mode 100644 metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/WordGramConfiguration.java diff --git a/docs/advanced/no-code-modeling.md b/docs/advanced/no-code-modeling.md index e1fadee6d371a..9c8f6761a62bc 100644 --- a/docs/advanced/no-code-modeling.md +++ b/docs/advanced/no-code-modeling.md @@ -211,7 +211,7 @@ record ServiceKey { * Name of the service */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true } name: string diff --git a/docs/modeling/extending-the-metadata-model.md b/docs/modeling/extending-the-metadata-model.md index 32951ab2e41eb..f47630f44e772 100644 --- a/docs/modeling/extending-the-metadata-model.md +++ b/docs/modeling/extending-the-metadata-model.md @@ -323,7 +323,7 @@ It takes the following parameters: annotations. To customize the set of analyzers used to index a certain field, you must add a new field type and define the set of mappings to be applied in the MappingsBuilder. - Thus far, we have implemented 10 fieldTypes: + Thus far, we have implemented 11 fieldTypes: 1. *KEYWORD* - Short text fields that only support exact matches, often used only for filtering @@ -332,20 +332,25 @@ It takes the following parameters: 3. *TEXT_PARTIAL* - Text fields delimited by spaces/slashes/periods with partial matching support. Note, partial matching is expensive, so this field type should not be applied to fields with long values (like description) - 4. *BROWSE_PATH* - Field type for browse paths. Applies specific mappings for slash delimited paths. + 4. *WORD_GRAM* - Text fields delimited by spaces, slashes, periods, dashes, or underscores with partial matching AND + word gram support. 
That is, the text will be split by the delimiters and can be matched with delimited queries + matching two, three, or four length tokens in addition to single tokens. As with partial match, this type is + expensive, so should not be applied to fields with long values such as description. - 5. *URN* - Urn fields where each sub-component inside the urn is indexed. For instance, for a data platform urn like + 5. *BROWSE_PATH* - Field type for browse paths. Applies specific mappings for slash delimited paths. + + 6. *URN* - Urn fields where each sub-component inside the urn is indexed. For instance, for a data platform urn like "urn:li:dataplatform:kafka", it will index the platform name "kafka" and ignore the common components - 6. *URN_PARTIAL* - Urn fields where each sub-component inside the urn is indexed with partial matching support. + 7. *URN_PARTIAL* - Urn fields where each sub-component inside the urn is indexed with partial matching support. - 7. *BOOLEAN* - Boolean fields used for filtering. + 8. *BOOLEAN* - Boolean fields used for filtering. - 8. *COUNT* - Count fields used for filtering. + 9. *COUNT* - Count fields used for filtering. - 9. *DATETIME* - Datetime fields used to represent timestamps. + 10. *DATETIME* - Datetime fields used to represent timestamps. - 10. *OBJECT* - Each property in an object will become an extra column in Elasticsearch and can be referenced as + 11. *OBJECT* - Each property in an object will become an extra column in Elasticsearch and can be referenced as `field.property` in queries. You should be careful to not use it on objects with many properties as it can cause a mapping explosion in Elasticsearch. diff --git a/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java b/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java index f2e65c771c6eb..3d3fbcf3ccaa6 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java @@ -21,7 +21,7 @@ public class SearchableAnnotation { public static final String ANNOTATION_NAME = "Searchable"; private static final Set DEFAULT_QUERY_FIELD_TYPES = - ImmutableSet.of(FieldType.TEXT, FieldType.TEXT_PARTIAL, FieldType.URN, FieldType.URN_PARTIAL); + ImmutableSet.of(FieldType.TEXT, FieldType.TEXT_PARTIAL, FieldType.WORD_GRAM, FieldType.URN, FieldType.URN_PARTIAL); // Name of the field in the search index. 
Defaults to the field name in the schema String fieldName; @@ -59,7 +59,8 @@ public enum FieldType { COUNT, DATETIME, OBJECT, - BROWSE_PATH_V2 + BROWSE_PATH_V2, + WORD_GRAM } @Nonnull diff --git a/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java b/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java index 1ab5ff640ce32..3618108970afa 100644 --- a/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java +++ b/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java @@ -142,7 +142,7 @@ private void validateTestEntityInfo(final AspectSpec testEntityInfo) { assertEquals(new TestEntityInfo().schema().getFullName(), testEntityInfo.getPegasusSchema().getFullName()); // Assert on Searchable Fields - assertEquals(9, testEntityInfo.getSearchableFieldSpecs().size()); + assertEquals(testEntityInfo.getSearchableFieldSpecs().size(), 10); assertEquals("customProperties", testEntityInfo.getSearchableFieldSpecMap().get( new PathSpec("customProperties").toString()).getSearchableAnnotation().getFieldName()); assertEquals(SearchableAnnotation.FieldType.KEYWORD, testEntityInfo.getSearchableFieldSpecMap().get( @@ -158,6 +158,11 @@ private void validateTestEntityInfo(final AspectSpec testEntityInfo) { assertEquals(SearchableAnnotation.FieldType.TEXT_PARTIAL, testEntityInfo.getSearchableFieldSpecMap().get( new PathSpec("textArrayField", "*").toString()) .getSearchableAnnotation().getFieldType()); + assertEquals("wordGramField", testEntityInfo.getSearchableFieldSpecMap().get( + new PathSpec("wordGramField").toString()).getSearchableAnnotation().getFieldName()); + assertEquals(SearchableAnnotation.FieldType.WORD_GRAM, testEntityInfo.getSearchableFieldSpecMap().get( + new PathSpec("wordGramField").toString()) + .getSearchableAnnotation().getFieldType()); assertEquals("nestedIntegerField", testEntityInfo.getSearchableFieldSpecMap().get( new PathSpec("nestedRecordField", "nestedIntegerField").toString()).getSearchableAnnotation().getFieldName()); assertEquals(SearchableAnnotation.FieldType.COUNT, testEntityInfo.getSearchableFieldSpecMap().get( diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java index 555acb2ffdd3b..efa4e0c279a76 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java @@ -42,6 +42,9 @@ public static Map getPartialNgramConfigWithOverrides(Map getMappingsForField(@Nonnull final Searchable mappingForField.put(NORMALIZER, KEYWORD_NORMALIZER); // Add keyword subfield without lowercase filter mappingForField.put(FIELDS, ImmutableMap.of(KEYWORD, KEYWORD_TYPE_MAP)); - } else if (fieldType == FieldType.TEXT || fieldType == FieldType.TEXT_PARTIAL) { + } else if (fieldType == FieldType.TEXT || fieldType == FieldType.TEXT_PARTIAL || fieldType == FieldType.WORD_GRAM) { mappingForField.put(TYPE, KEYWORD); mappingForField.put(NORMALIZER, KEYWORD_NORMALIZER); Map subFields = new HashMap<>(); - if (fieldType == FieldType.TEXT_PARTIAL) { + if (fieldType == FieldType.TEXT_PARTIAL || fieldType == FieldType.WORD_GRAM) { subFields.put(NGRAM, getPartialNgramConfigWithOverrides( ImmutableMap.of( ANALYZER, PARTIAL_ANALYZER ) )); + if (fieldType == FieldType.WORD_GRAM) { 
+ for (Map.Entry entry : Map.of( + WORD_GRAMS_LENGTH_2, WORD_GRAM_2_ANALYZER, + WORD_GRAMS_LENGTH_3, WORD_GRAM_3_ANALYZER, + WORD_GRAMS_LENGTH_4, WORD_GRAM_4_ANALYZER).entrySet()) { + String fieldName = entry.getKey(); + String analyzerName = entry.getValue(); + subFields.put(fieldName, ImmutableMap.of( + TYPE, TEXT, + ANALYZER, analyzerName, + SEARCH_ANALYZER, analyzerName + )); + } + } } subFields.put(DELIMITED, ImmutableMap.of( TYPE, TEXT, diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java index 5b3e396837aa7..e180c8296b48d 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java @@ -66,6 +66,9 @@ public class SettingsBuilder { public static final String KEYWORD_ANALYZER = "keyword"; public static final String URN_ANALYZER = "urn_component"; public static final String URN_SEARCH_ANALYZER = "query_urn_component"; + public static final String WORD_GRAM_2_ANALYZER = "word_gram_2"; + public static final String WORD_GRAM_3_ANALYZER = "word_gram_3"; + public static final String WORD_GRAM_4_ANALYZER = "word_gram_4"; // Filters public static final String ALPHANUM_SPACE_ONLY = "alpha_num_space"; @@ -80,6 +83,10 @@ public class SettingsBuilder { public static final String MULTIFILTER = "multifilter"; public static final String MULTIFILTER_GRAPH = "multifilter_graph"; public static final String PARTIAL_URN_COMPONENT = "partial_urn_component"; + public static final String SHINGLE = "shingle"; + public static final String WORD_GRAM_2_FILTER = "word_gram_2_filter"; + public static final String WORD_GRAM_3_FILTER = "word_gram_3_filter"; + public static final String WORD_GRAM_4_FILTER = "word_gram_4_filter"; public static final String SNOWBALL = "snowball"; public static final String STEM_OVERRIDE = "stem_override"; public static final String STOP = "stop"; @@ -108,6 +115,7 @@ public class SettingsBuilder { public static final String SLASH_TOKENIZER = "slash_tokenizer"; public static final String UNIT_SEPARATOR_PATH_TOKENIZER = "unit_separator_path_tokenizer"; public static final String UNIT_SEPARATOR_TOKENIZER = "unit_separator_tokenizer"; + public static final String WORD_GRAM_TOKENIZER = "word_gram_tokenizer"; // Do not remove the space, needed for multi-term synonyms public static final List ALPHANUM_SPACE_PATTERNS = ImmutableList.of( "([a-z0-9 _-]{2,})", @@ -161,6 +169,13 @@ public class SettingsBuilder { AUTOCOMPLETE_CUSTOM_DELIMITER, LOWERCASE); + public static final List WORD_GRAM_TOKEN_FILTERS = ImmutableList.of( + ASCII_FOLDING, + LOWERCASE, + TRIM, + REMOVE_QUOTES + ); + public final Map settings; public SettingsBuilder(String mainTokenizer) { @@ -275,6 +290,17 @@ private static Map buildFilters() throws IOException { .collect(Collectors.toList())) .build()); } + + for (Map.Entry entry : Map.of(WORD_GRAM_2_FILTER, 2, WORD_GRAM_3_FILTER, 3, WORD_GRAM_4_FILTER, 4).entrySet()) { + String filterName = entry.getKey(); + Integer gramSize = entry.getValue(); + filters.put(filterName, ImmutableMap.builder() + .put(TYPE, SHINGLE) + .put("min_shingle_size", gramSize) + .put("max_shingle_size", gramSize) + .put("output_unigrams", false) + .build()); + } } return filters.build(); @@ -302,13 +328,24 @@ private static Map buildTokenizers() { .put(DELIMITER, "␟") .build()); 
- // Tokenize by whitespace and most special chars + // Tokenize by most special chars + // Do NOT tokenize by whitespace to keep multi-word synonyms in the same token + // The split by whitespace is done later in the token filters phase tokenizers.put(MAIN_TOKENIZER, ImmutableMap.builder() .put(TYPE, PATTERN) .put(PATTERN, "[(),./:]") .build()); + // Tokenize by whitespace and most special chars for wordgrams + // only split on - when not preceded by a whitespace to preserve exclusion functionality + // i.e. "logging-events-bkcp" and "logging-events -bckp" should be handled differently + tokenizers.put(WORD_GRAM_TOKENIZER, + ImmutableMap.builder() + .put(TYPE, PATTERN) + .put(PATTERN, "[(),./:\\s_]|(?<=\\S)(-)") + .build()); + return tokenizers.build(); } @@ -382,6 +419,21 @@ private static Map buildAnalyzers(String mainTokenizer) { .put(FILTER, SEARCH_TOKEN_FILTERS) .build()); + // Support word grams + for (Map.Entry entry : Map.of( + WORD_GRAM_2_ANALYZER, WORD_GRAM_2_FILTER, + WORD_GRAM_3_ANALYZER, WORD_GRAM_3_FILTER, + WORD_GRAM_4_ANALYZER, WORD_GRAM_4_FILTER).entrySet()) { + String analyzerName = entry.getKey(); + String filterName = entry.getValue(); + analyzers.put(analyzerName, ImmutableMap.builder() + .put(TOKENIZER, WORD_GRAM_TOKENIZER) + .put(FILTER, ImmutableList.builder() + .addAll(WORD_GRAM_TOKEN_FILTERS) + .add(filterName).build()) + .build()); + } + // For special analysis, the substitution can be read from the configuration (chinese tokenizer: ik_smart / smartCN) // Analyzer for partial matching (i.e. autocomplete) - Prefix matching of each token analyzers.put(PARTIAL_ANALYZER, ImmutableMap.builder() @@ -395,6 +447,7 @@ private static Map buildAnalyzers(String mainTokenizer) { .put(FILTER, PARTIAL_AUTOCOMPLETE_TOKEN_FILTERS) .build()); + return analyzers.build(); } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchFieldConfig.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchFieldConfig.java index fb7e19a5d67bc..a75ed40ffca52 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchFieldConfig.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchFieldConfig.java @@ -11,11 +11,8 @@ import java.util.Set; -import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.BROWSE_PATH_HIERARCHY_ANALYZER; -import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.BROWSE_PATH_V2_HIERARCHY_ANALYZER; -import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.KEYWORD_ANALYZER; -import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.TEXT_SEARCH_ANALYZER; -import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.URN_SEARCH_ANALYZER; +import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.*; + @Builder @Getter @@ -33,7 +30,8 @@ public class SearchFieldConfig { private static final Set TYPES_WITH_DELIMITED_SUBFIELD = Set.of( SearchableAnnotation.FieldType.TEXT, - SearchableAnnotation.FieldType.TEXT_PARTIAL + SearchableAnnotation.FieldType.TEXT_PARTIAL, + SearchableAnnotation.FieldType.WORD_GRAM // NOT URN_PARTIAL (urn field is special) ); // NOT comprehensive @@ -56,6 +54,7 @@ public class SearchFieldConfig { SearchableAnnotation.FieldType.TEXT, SearchableAnnotation.FieldType.TEXT_PARTIAL, SearchableAnnotation.FieldType.KEYWORD, + 
SearchableAnnotation.FieldType.WORD_GRAM, // not analyzed SearchableAnnotation.FieldType.BOOLEAN, SearchableAnnotation.FieldType.COUNT, @@ -69,6 +68,11 @@ public class SearchFieldConfig { SearchableAnnotation.FieldType.URN_PARTIAL ); + public static final Set TYPES_WITH_WORD_GRAM = + Set.of( + SearchableAnnotation.FieldType.WORD_GRAM + ); + @Nonnull private final String fieldName; @Nonnull @@ -78,9 +82,11 @@ public class SearchFieldConfig { private final String analyzer; private boolean hasKeywordSubfield; private boolean hasDelimitedSubfield; + private boolean hasWordGramSubfields; private boolean isQueryByDefault; private boolean isDelimitedSubfield; private boolean isKeywordSubfield; + private boolean isWordGramSubfield; public static SearchFieldConfig detectSubFieldType(@Nonnull SearchableFieldSpec fieldSpec) { final SearchableAnnotation searchableAnnotation = fieldSpec.getSearchableAnnotation(); @@ -106,6 +112,7 @@ public static SearchFieldConfig detectSubFieldType(String fieldName, .analyzer(getAnalyzer(fieldName, fieldType)) .hasKeywordSubfield(hasKeywordSubfield(fieldName, fieldType)) .hasDelimitedSubfield(hasDelimitedSubfield(fieldName, fieldType)) + .hasWordGramSubfields(hasWordGramSubfields(fieldName, fieldType)) .isQueryByDefault(isQueryByDefault) .build(); } @@ -118,6 +125,11 @@ private static boolean hasDelimitedSubfield(String fieldName, SearchableAnnotati return !fieldName.contains(".") && ("urn".equals(fieldName) || TYPES_WITH_DELIMITED_SUBFIELD.contains(fieldType)); } + + private static boolean hasWordGramSubfields(String fieldName, SearchableAnnotation.FieldType fieldType) { + return !fieldName.contains(".") + && (TYPES_WITH_WORD_GRAM.contains(fieldType)); + } private static boolean hasKeywordSubfield(String fieldName, SearchableAnnotation.FieldType fieldType) { return !"urn".equals(fieldName) && !fieldName.contains(".") @@ -155,6 +167,7 @@ public SearchFieldConfigBuilder fieldName(@Nonnull String fieldName) { this.fieldName = fieldName; isDelimitedSubfield(fieldName.endsWith(".delimited")); isKeywordSubfield(fieldName.endsWith(".keyword")); + isWordGramSubfield(fieldName.contains("wordGrams")); shortName(fieldName.split("[.]")[0]); return this; } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java index 289c6f1f84e32..49fc882314e0a 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java @@ -3,6 +3,7 @@ import com.linkedin.metadata.config.search.ExactMatchConfiguration; import com.linkedin.metadata.config.search.PartialConfiguration; import com.linkedin.metadata.config.search.SearchConfiguration; +import com.linkedin.metadata.config.search.WordGramConfiguration; import com.linkedin.metadata.config.search.custom.BoolQueryConfiguration; import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration; import com.linkedin.metadata.config.search.custom.QueryConfiguration; @@ -51,6 +52,9 @@ import org.elasticsearch.search.SearchModule; import static com.linkedin.metadata.models.SearchableFieldSpecExtractor.PRIMARY_URN_SEARCH_PROPERTIES; +import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.*; +import static com.linkedin.metadata.search.elasticsearch.query.request.SearchFieldConfig.*; + 
@Slf4j public class SearchQueryBuilder { @@ -69,6 +73,7 @@ public class SearchQueryBuilder { public static final String STRUCTURED_QUERY_PREFIX = "\\\\/q "; private final ExactMatchConfiguration exactMatchConfiguration; private final PartialConfiguration partialConfiguration; + private final WordGramConfiguration wordGramConfiguration; private final CustomizedQueryHandler customizedQueryHandler; @@ -76,6 +81,7 @@ public SearchQueryBuilder(@Nonnull SearchConfiguration searchConfiguration, @Nullable CustomSearchConfiguration customSearchConfiguration) { this.exactMatchConfiguration = searchConfiguration.getExactMatch(); this.partialConfiguration = searchConfiguration.getPartial(); + this.wordGramConfiguration = searchConfiguration.getWordGram(); this.customizedQueryHandler = CustomizedQueryHandler.builder(customSearchConfiguration).build(); } @@ -148,6 +154,36 @@ private Set getStandardFields(@Nonnull EntitySpec entitySpec) fields.add(SearchFieldConfig.detectSubFieldType(searchFieldConfig.fieldName() + ".delimited", searchFieldConfig.boost() * partialConfiguration.getFactor(), searchableAnnotation.getFieldType(), searchableAnnotation.isQueryByDefault())); + + if (SearchFieldConfig.detectSubFieldType(fieldSpec).hasWordGramSubfields()) { + fields.add(SearchFieldConfig.builder() + .fieldName(searchFieldConfig.fieldName() + ".wordGrams2") + .boost(searchFieldConfig.boost() * wordGramConfiguration.getTwoGramFactor()) + .analyzer(WORD_GRAM_2_ANALYZER) + .hasKeywordSubfield(true) + .hasDelimitedSubfield(true) + .hasWordGramSubfields(true) + .isQueryByDefault(true) + .build()); + fields.add(SearchFieldConfig.builder() + .fieldName(searchFieldConfig.fieldName() + ".wordGrams3") + .boost(searchFieldConfig.boost() * wordGramConfiguration.getThreeGramFactor()) + .analyzer(WORD_GRAM_3_ANALYZER) + .hasKeywordSubfield(true) + .hasDelimitedSubfield(true) + .hasWordGramSubfields(true) + .isQueryByDefault(true) + .build()); + fields.add(SearchFieldConfig.builder() + .fieldName(searchFieldConfig.fieldName() + ".wordGrams4") + .boost(searchFieldConfig.boost() * wordGramConfiguration.getFourGramFactor()) + .analyzer(WORD_GRAM_4_ANALYZER) + .hasKeywordSubfield(true) + .hasDelimitedSubfield(true) + .hasWordGramSubfields(true) + .isQueryByDefault(true) + .build()); + } } } @@ -188,7 +224,7 @@ private Optional getSimpleQuery(@Nullable QueryConfiguration custo .filter(SearchFieldConfig::isQueryByDefault) .collect(Collectors.groupingBy(SearchFieldConfig::analyzer)); - analyzerGroup.keySet().stream().sorted().forEach(analyzer -> { + analyzerGroup.keySet().stream().sorted().filter(str -> !str.contains("word_gram")).forEach(analyzer -> { List fieldConfigs = analyzerGroup.get(analyzer); SimpleQueryStringBuilder simpleBuilder = QueryBuilders.simpleQueryStringQuery(sanitizedQuery); simpleBuilder.analyzer(analyzer); @@ -253,6 +289,13 @@ private Optional getPrefixAndExactMatchQuery(@Nullable QueryConfig * exactMatchConfiguration.getCaseSensitivityFactor()) .queryName(searchFieldConfig.fieldName())); } + + if (searchFieldConfig.isWordGramSubfield() && isPrefixQuery) { + finalQuery.should(QueryBuilders + .matchPhraseQuery(ESUtils.toKeywordField(searchFieldConfig.fieldName(), false), unquotedQuery) + .boost(searchFieldConfig.boost() * getWordGramFactor(searchFieldConfig.fieldName())) + .queryName(searchFieldConfig.shortName())); + } }); return finalQuery.should().size() > 0 ? 
Optional.of(finalQuery) : Optional.empty(); @@ -377,4 +420,15 @@ private FunctionScoreQueryBuilder toFunctionScoreQueryBuilder(QueryBuilder query throw new RuntimeException(e); } } + + public float getWordGramFactor(String fieldName) { + if (fieldName.endsWith("Grams2")) { + return wordGramConfiguration.getTwoGramFactor(); + } else if (fieldName.endsWith("Grams3")) { + return wordGramConfiguration.getThreeGramFactor(); + } else if (fieldName.endsWith("Grams4")) { + return wordGramConfiguration.getFourGramFactor(); + } + throw new IllegalArgumentException(fieldName + " does not end with Grams[2-4]"); + } } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java b/metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java index 1e5b860b581fc..673474c96cc51 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java @@ -6,6 +6,7 @@ import com.linkedin.metadata.config.search.ExactMatchConfiguration; import com.linkedin.metadata.config.search.PartialConfiguration; import com.linkedin.metadata.config.search.SearchConfiguration; +import com.linkedin.metadata.config.search.WordGramConfiguration; import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration; import com.linkedin.metadata.models.registry.ConfigEntityRegistry; import com.linkedin.metadata.models.registry.EntityRegistry; @@ -55,11 +56,17 @@ public SearchConfiguration searchConfiguration() { exactMatchConfiguration.setCaseSensitivityFactor(0.7f); exactMatchConfiguration.setEnableStructured(true); + WordGramConfiguration wordGramConfiguration = new WordGramConfiguration(); + wordGramConfiguration.setTwoGramFactor(1.2f); + wordGramConfiguration.setThreeGramFactor(1.5f); + wordGramConfiguration.setFourGramFactor(1.8f); + PartialConfiguration partialConfiguration = new PartialConfiguration(); partialConfiguration.setFactor(0.4f); partialConfiguration.setUrnFactor(0.5f); searchConfiguration.setExactMatch(exactMatchConfiguration); + searchConfiguration.setWordGram(wordGramConfiguration); searchConfiguration.setPartial(partialConfiguration); return searchConfiguration; } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java index cc0d9dca6ae5f..29457f244291f 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java @@ -116,15 +116,7 @@ public void testGlossaryTerms() { assertTrue(fourthResultMatchedFields.toString().contains("ReturnRate")); } - /** - * - * The test below should be added back in as improvements are made to search, - * via the linked tickets. 
- * - **/ - - // TODO: enable once PFP-481 is complete - @Test(enabled = false) + @Test public void testNameMatchPartiallyQualified() { /* Searching for "analytics.pet_details" (partially qualified) should return the fully qualified table @@ -140,4 +132,9 @@ public void testNameMatchPartiallyQualified() { assertTrue(secondResultUrn.toString().contains("dbt,long_tail_companions.analytics.pet_details")); } + /* + * Tests that should pass but do not yet can be added below here, with the following annotation: + * @Test(enabled = false) + **/ + } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java index 2f1e48c18450d..d989d4ef4fa87 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java @@ -358,6 +358,84 @@ public void testDelimitedSynonym() throws IOException { }).collect(Collectors.toList()); } + @Test + public void testNegateAnalysis() throws IOException { + String queryWithMinus = "logging_events -bckp"; + AnalyzeRequest request = AnalyzeRequest.withIndexAnalyzer( + "smpldat_datasetindex_v2", + "query_word_delimited", queryWithMinus + ); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), + List.of("logging_events -bckp", "logging_ev", "-bckp", "log", "event", "bckp")); + + request = AnalyzeRequest.withIndexAnalyzer( + "smpldat_datasetindex_v2", + "word_gram_3", queryWithMinus + ); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("logging events -bckp")); + + request = AnalyzeRequest.withIndexAnalyzer( + "smpldat_datasetindex_v2", + "word_gram_4", queryWithMinus + ); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of()); + + } + + @Test + public void testWordGram() throws IOException { + String text = "hello.cat_cool_customer"; + AnalyzeRequest request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", text); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hello cat", "cat cool", "cool customer")); + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_3", text); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hello cat cool", "cat cool customer")); + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_4", text); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hello cat cool customer")); + + String testMoreSeparators = "quick.brown:fox jumped-LAZY_Dog"; + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", testMoreSeparators); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), + List.of("quick brown", "brown fox", "fox jumped", "jumped lazy", "lazy dog")); + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_3", testMoreSeparators); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), + 
List.of("quick brown fox", "brown fox jumped", "fox jumped lazy", "jumped lazy dog")); + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_4", testMoreSeparators); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), + List.of("quick brown fox jumped", "brown fox jumped lazy", "fox jumped lazy dog")); + + String textWithQuotesAndDuplicateWord = "\"my_db.my_exact_table\""; + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", textWithQuotesAndDuplicateWord); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("my db", "db my", "my exact", "exact table")); + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_3", textWithQuotesAndDuplicateWord); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("my db my", "db my exact", "my exact table")); + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_4", textWithQuotesAndDuplicateWord); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("my db my exact", "db my exact table")); + + String textWithParens = "(hi) there"; + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", textWithParens); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hi there")); + + String oneWordText = "hello"; + for (String analyzer : List.of("word_gram_2", "word_gram_3", "word_gram_4")) { + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", analyzer, oneWordText); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of()); + } + } + @Test public void testUrnSynonym() throws IOException { List expectedTokens = List.of("bigquery"); @@ -1267,6 +1345,53 @@ public void testParens() { String.format("%s - Expected search results to include matched fields", query)); assertEquals(result.getEntities().size(), 2); } + @Test + public void testGram() { + String query = "jaffle shop customers"; + SearchResult result = searchAcrossEntities(searchService, query); + assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), + String.format("%s - Expected search results", query)); + + assertEquals(result.getEntities().get(0).getEntity().toString(), + "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.customers,PROD)", + "Expected exact match in 1st position"); + + query = "shop customers source"; + result = searchAcrossEntities(searchService, query); + assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), + String.format("%s - Expected search results", query)); + + assertEquals(result.getEntities().get(0).getEntity().toString(), + "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.customers_source,PROD)", + "Expected ngram match in 1st position"); + + query = "jaffle shop stg customers"; + result = searchAcrossEntities(searchService, query); + assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), + String.format("%s - Expected search results", query)); + + assertEquals(result.getEntities().get(0).getEntity().toString(), + "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.stg_customers,PROD)", + "Expected ngram match in 1st position"); + + query = 
"jaffle shop transformers customers"; + result = searchAcrossEntities(searchService, query); + assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), + String.format("%s - Expected search results", query)); + + assertEquals(result.getEntities().get(0).getEntity().toString(), + "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.transformers_customers,PROD)", + "Expected ngram match in 1st position"); + + query = "shop raw customers"; + result = searchAcrossEntities(searchService, query); + assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), + String.format("%s - Expected search results", query)); + + assertEquals(result.getEntities().get(0).getEntity().toString(), + "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_customers,PROD)", + "Expected ngram match in 1st position"); + } @Test public void testPrefixVsExact() { diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java index ed72b46e98c46..5a8f80f325dbd 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java @@ -16,7 +16,7 @@ public void testMappingsBuilder() { Map result = MappingsBuilder.getMappings(TestEntitySpecBuilder.getSpec()); assertEquals(result.size(), 1); Map properties = (Map) result.get("properties"); - assertEquals(properties.size(), 17); + assertEquals(properties.size(), 18); assertEquals(properties.get("urn"), ImmutableMap.of("type", "keyword", "fields", ImmutableMap.of("delimited", @@ -76,6 +76,19 @@ public void testMappingsBuilder() { assertTrue(textArrayFieldSubfields.containsKey("ngram")); assertTrue(textArrayFieldSubfields.containsKey("keyword")); + // WORD_GRAM + Map wordGramField = (Map) properties.get("wordGramField"); + assertEquals(wordGramField.get("type"), "keyword"); + assertEquals(wordGramField.get("normalizer"), "keyword_normalizer"); + Map wordGramFieldSubfields = (Map) wordGramField.get("fields"); + assertEquals(wordGramFieldSubfields.size(), 6); + assertTrue(wordGramFieldSubfields.containsKey("delimited")); + assertTrue(wordGramFieldSubfields.containsKey("ngram")); + assertTrue(wordGramFieldSubfields.containsKey("keyword")); + assertTrue(wordGramFieldSubfields.containsKey("wordGrams2")); + assertTrue(wordGramFieldSubfields.containsKey("wordGrams3")); + assertTrue(wordGramFieldSubfields.containsKey("wordGrams4")); + // URN Map foreignKey = (Map) properties.get("foreignKey"); assertEquals(foreignKey.get("type"), "text"); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java index a2ec396c34b2d..282b1d8bb6778 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java @@ -4,6 +4,7 @@ import com.linkedin.metadata.config.search.ExactMatchConfiguration; import com.linkedin.metadata.config.search.PartialConfiguration; import com.linkedin.metadata.config.search.SearchConfiguration; +import 
com.linkedin.metadata.config.search.WordGramConfiguration; import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration; import com.fasterxml.jackson.dataformat.yaml.YAMLMapper; import com.google.common.collect.ImmutableList; @@ -18,6 +19,7 @@ import org.elasticsearch.index.query.BoolQueryBuilder; import org.elasticsearch.index.query.MatchAllQueryBuilder; import org.elasticsearch.index.query.MatchPhrasePrefixQueryBuilder; +import org.elasticsearch.index.query.MatchPhraseQueryBuilder; import org.elasticsearch.index.query.QueryBuilder; import org.elasticsearch.index.query.QueryStringQueryBuilder; import org.elasticsearch.index.query.SimpleQueryStringBuilder; @@ -46,11 +48,17 @@ public class SearchQueryBuilderTest { exactMatchConfiguration.setCaseSensitivityFactor(0.7f); exactMatchConfiguration.setEnableStructured(true); + WordGramConfiguration wordGramConfiguration = new WordGramConfiguration(); + wordGramConfiguration.setTwoGramFactor(1.2f); + wordGramConfiguration.setThreeGramFactor(1.5f); + wordGramConfiguration.setFourGramFactor(1.8f); + PartialConfiguration partialConfiguration = new PartialConfiguration(); partialConfiguration.setFactor(0.4f); partialConfiguration.setUrnFactor(0.7f); testQueryConfig.setExactMatch(exactMatchConfiguration); + testQueryConfig.setWordGram(wordGramConfiguration); testQueryConfig.setPartial(partialConfiguration); } public static final SearchQueryBuilder TEST_BUILDER = new SearchQueryBuilder(testQueryConfig, null); @@ -70,16 +78,17 @@ public void testQueryBuilderFulltext() { assertEquals(keywordQuery.value(), "testQuery"); assertEquals(keywordQuery.analyzer(), "keyword"); Map keywordFields = keywordQuery.fields(); - assertEquals(keywordFields.size(), 8); + assertEquals(keywordFields.size(), 9); assertEquals(keywordFields, Map.of( - "urn", 10.f, - "textArrayField", 1.0f, - "customProperties", 1.0f, - "nestedArrayArrayField", 1.0f, - "textFieldOverride", 1.0f, - "nestedArrayStringField", 1.0f, - "keyPart1", 10.0f, - "esObjectField", 1.0f + "urn", 10.f, + "textArrayField", 1.0f, + "customProperties", 1.0f, + "wordGramField", 1.0f, + "nestedArrayArrayField", 1.0f, + "textFieldOverride", 1.0f, + "nestedArrayStringField", 1.0f, + "keyPart1", 10.0f, + "esObjectField", 1.0f )); SimpleQueryStringBuilder urnComponentQuery = (SimpleQueryStringBuilder) analyzerGroupQuery.should().get(1); @@ -99,7 +108,8 @@ public void testQueryBuilderFulltext() { "nestedArrayArrayField.delimited", 0.4f, "urn.delimited", 7.0f, "textArrayField.delimited", 0.4f, - "nestedArrayStringField.delimited", 0.4f + "nestedArrayStringField.delimited", 0.4f, + "wordGramField.delimited", 0.4f )); BoolQueryBuilder boolPrefixQuery = (BoolQueryBuilder) shouldQueries.get(1); @@ -109,21 +119,30 @@ public void testQueryBuilderFulltext() { if (prefixQuery instanceof MatchPhrasePrefixQueryBuilder) { MatchPhrasePrefixQueryBuilder builder = (MatchPhrasePrefixQueryBuilder) prefixQuery; return Pair.of(builder.fieldName(), builder.boost()); - } else { + } else if (prefixQuery instanceof TermQueryBuilder) { // exact TermQueryBuilder builder = (TermQueryBuilder) prefixQuery; return Pair.of(builder.fieldName(), builder.boost()); + } else { // if (prefixQuery instanceof MatchPhraseQueryBuilder) { + // ngram + MatchPhraseQueryBuilder builder = (MatchPhraseQueryBuilder) prefixQuery; + return Pair.of(builder.fieldName(), builder.boost()); } }).collect(Collectors.toList()); - assertEquals(prefixFieldWeights.size(), 22); + assertEquals(prefixFieldWeights.size(), 28); List.of( Pair.of("urn", 100.0f), 
Pair.of("urn", 70.0f), Pair.of("keyPart1.delimited", 16.8f), Pair.of("keyPart1.keyword", 100.0f), - Pair.of("keyPart1.keyword", 70.0f) + Pair.of("keyPart1.keyword", 70.0f), + Pair.of("wordGramField.wordGrams2", 1.44f), + Pair.of("wordGramField.wordGrams3", 2.25f), + Pair.of("wordGramField.wordGrams4", 3.2399998f), + Pair.of("wordGramField.keyword", 10.0f), + Pair.of("wordGramField.keyword", 7.0f) ).forEach(p -> assertTrue(prefixFieldWeights.contains(p), "Missing: " + p)); // Validate scorer @@ -144,7 +163,7 @@ public void testQueryBuilderStructured() { assertEquals(keywordQuery.queryString(), "testQuery"); assertNull(keywordQuery.analyzer()); Map keywordFields = keywordQuery.fields(); - assertEquals(keywordFields.size(), 16); + assertEquals(keywordFields.size(), 21); assertEquals(keywordFields.get("keyPart1").floatValue(), 10.0f); assertFalse(keywordFields.containsKey("keyPart3")); assertEquals(keywordFields.get("textFieldOverride").floatValue(), 1.0f); @@ -196,10 +215,14 @@ public void testCustomExactMatch() { List queries = boolPrefixQuery.should().stream().map(prefixQuery -> { if (prefixQuery instanceof MatchPhrasePrefixQueryBuilder) { + // prefix return (MatchPhrasePrefixQueryBuilder) prefixQuery; - } else { + } else if (prefixQuery instanceof TermQueryBuilder) { // exact return (TermQueryBuilder) prefixQuery; + } else { // if (prefixQuery instanceof MatchPhraseQueryBuilder) { + // ngram + return (MatchPhraseQueryBuilder) prefixQuery; } }).collect(Collectors.toList()); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java index d66d6a0ab0e76..db56e2d34881b 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java @@ -7,6 +7,7 @@ import com.linkedin.data.template.StringArray; import com.linkedin.metadata.ESTestConfiguration; import com.linkedin.metadata.TestEntitySpecBuilder; +import com.linkedin.metadata.config.search.WordGramConfiguration; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; @@ -65,11 +66,17 @@ public class SearchRequestHandlerTest extends AbstractTestNGSpringContextTests { exactMatchConfiguration.setCaseSensitivityFactor(0.7f); exactMatchConfiguration.setEnableStructured(true); + WordGramConfiguration wordGramConfiguration = new WordGramConfiguration(); + wordGramConfiguration.setTwoGramFactor(1.2f); + wordGramConfiguration.setThreeGramFactor(1.5f); + wordGramConfiguration.setFourGramFactor(1.8f); + PartialConfiguration partialConfiguration = new PartialConfiguration(); partialConfiguration.setFactor(0.4f); partialConfiguration.setUrnFactor(0.7f); testQueryConfig.setExactMatch(exactMatchConfiguration); + testQueryConfig.setWordGram(wordGramConfiguration); testQueryConfig.setPartial(partialConfiguration); } @@ -113,10 +120,10 @@ public void testSearchRequestHandler() { HighlightBuilder highlightBuilder = sourceBuilder.highlighter(); List fields = highlightBuilder.fields().stream().map(HighlightBuilder.Field::name).collect(Collectors.toList()); - assertEquals(fields.size(), 20); + assertEquals(fields.size(), 22); List highlightableFields = ImmutableList.of("keyPart1", "textArrayField", "textFieldOverride", "foreignKey", "nestedForeignKey", - "nestedArrayStringField", 
"nestedArrayArrayField", "customProperties", "esObjectField"); + "nestedArrayStringField", "nestedArrayArrayField", "customProperties", "esObjectField", "wordGramField"); highlightableFields.forEach(field -> { assertTrue(fields.contains(field), "Missing: " + field); assertTrue(fields.contains(field + ".*"), "Missing: " + field + ".*"); diff --git a/metadata-models/src/main/pegasus/com/linkedin/chart/ChartInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/chart/ChartInfo.pdl index 4339a186f1304..5047c824e2617 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/chart/ChartInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/chart/ChartInfo.pdl @@ -20,7 +20,7 @@ record ChartInfo includes CustomProperties, ExternalReference { * Title of the chart */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true } title: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/container/ContainerProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/container/ContainerProperties.pdl index 26745fe46caaa..0b9c89ea30c90 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/container/ContainerProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/container/ContainerProperties.pdl @@ -15,7 +15,7 @@ record ContainerProperties includes CustomProperties, ExternalReference { * Display name of the Asset Container */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -25,7 +25,7 @@ record ContainerProperties includes CustomProperties, ExternalReference { * Fully-qualified name of the Container */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -61,4 +61,4 @@ record ContainerProperties includes CustomProperties, ExternalReference { } } lastModified: optional TimeStamp -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl index 5cb306039506e..84b3065a08022 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl @@ -22,7 +22,7 @@ record DashboardInfo includes CustomProperties, ExternalReference { * Title of the dashboard */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -126,4 +126,4 @@ record DashboardInfo includes CustomProperties, ExternalReference { * The time when this dashboard last refreshed */ lastRefreshed: optional Time -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/datajob/DataFlowInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/datajob/DataFlowInfo.pdl index 481240740876a..1303bfbc863ea 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/datajob/DataFlowInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/datajob/DataFlowInfo.pdl @@ -17,7 +17,7 @@ record DataFlowInfo includes CustomProperties, ExternalReference { * Flow name */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl index 8737dd4d9ef52..1e305816f96a2 100644 --- 
a/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl @@ -18,7 +18,7 @@ record DataJobInfo includes CustomProperties, ExternalReference { * Job name */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataplatform/DataPlatformInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataplatform/DataPlatformInfo.pdl index acc40e9f693ec..0be58d73dc79f 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataplatform/DataPlatformInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataplatform/DataPlatformInfo.pdl @@ -15,7 +15,7 @@ record DataPlatformInfo { */ @validate.strlen.max = 15 @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": false, "boostScore": 10.0 } @@ -25,7 +25,7 @@ record DataPlatformInfo { * The name that will be used for displaying a platform type. */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataplatforminstance/DataPlatformInstanceProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataplatforminstance/DataPlatformInstanceProperties.pdl index d7ce5565103ee..1220741ee5726 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataplatforminstance/DataPlatformInstanceProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataplatforminstance/DataPlatformInstanceProperties.pdl @@ -16,7 +16,7 @@ record DataPlatformInstanceProperties includes CustomProperties, ExternalReferen * Display name of the Data Platform Instance */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl index 72eefd5e294e4..46a490dbb2925 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl @@ -19,7 +19,7 @@ record DataProcessInstanceProperties includes CustomProperties, ExternalReferenc * Process name */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataproduct/DataProductProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataproduct/DataProductProperties.pdl index 3861b7def7669..c0a50a5e0e688 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataproduct/DataProductProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataproduct/DataProductProperties.pdl @@ -13,7 +13,7 @@ record DataProductProperties includes CustomProperties, ExternalReference { * Display name of the Data Product */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl index 57b1fe7693129..49d0dcd58ee27 100644 --- 
a/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl @@ -17,7 +17,7 @@ record DatasetProperties includes CustomProperties, ExternalReference { * Display name of the Dataset */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -27,7 +27,7 @@ record DatasetProperties includes CustomProperties, ExternalReference { * Fully-qualified name of the Dataset */ @Searchable = { - "fieldType": "TEXT", + "fieldType": "WORD_GRAM", "addToFilters": false, "enableAutocomplete": true, "boostScore": 10.0 @@ -77,4 +77,4 @@ record DatasetProperties includes CustomProperties, ExternalReference { */ @deprecated = "Use GlobalTags aspect instead." tags: array[string] = [ ] -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl index 5a0b8657ecb47..a362d412a32b9 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl @@ -14,7 +14,7 @@ record DomainProperties { * Display name of the Domain */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryNodeInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryNodeInfo.pdl index 1e840e5a1df7e..557b5e2a0f419 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryNodeInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryNodeInfo.pdl @@ -35,7 +35,7 @@ record GlossaryNodeInfo { */ @Searchable = { "fieldName": "displayName", - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -49,4 +49,4 @@ record GlossaryNodeInfo { } id: optional string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryTermInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryTermInfo.pdl index aa2a8b31e3dde..13e7af311fba1 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryTermInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryTermInfo.pdl @@ -23,7 +23,7 @@ record GlossaryTermInfo includes CustomProperties { * Display name of the term */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -75,4 +75,4 @@ record GlossaryTermInfo includes CustomProperties { */ @deprecated rawSchema: optional string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl index 6b050f484fedd..48ee53377e582 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl @@ -45,7 +45,7 @@ record CorpUserEditableInfo { * DataHub-native display name */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "queryByDefault": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserInfo.pdl 
b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserInfo.pdl index 1cb705d426cc0..6cb0e8fd6aa6d 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserInfo.pdl @@ -26,7 +26,7 @@ record CorpUserInfo includes CustomProperties { * displayName of this user , e.g. Hang Zhang(DataHQ) */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "queryByDefault": true, "enableAutocomplete": true, "boostScore": 10.0 @@ -89,7 +89,7 @@ record CorpUserInfo includes CustomProperties { * Common name of this user, format is firstName + lastName (split by a whitespace) */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "queryByDefault": true, "enableAutocomplete": true, "boostScore": 10.0 diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpGroupKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpGroupKey.pdl index 075cc14ddc83b..9e65b8f6e9929 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpGroupKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpGroupKey.pdl @@ -11,10 +11,10 @@ record CorpGroupKey { * The URL-encoded name of the AD/LDAP group. Serves as a globally unique identifier within DataHub. */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "queryByDefault": true, "enableAutocomplete": true, "boostScore": 10.0 } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpUserKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpUserKey.pdl index d1a8a4bb5bb23..476a0ad9704b3 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpUserKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpUserKey.pdl @@ -12,7 +12,7 @@ record CorpUserKey { */ @Searchable = { "fieldName": "ldap", - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "boostScore": 2.0, "enableAutocomplete": true } diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataFlowKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataFlowKey.pdl index bcdb92f75d055..d8342630248b6 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataFlowKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataFlowKey.pdl @@ -19,7 +19,7 @@ record DataFlowKey { * Unique Identifier of the data flow */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true } flowId: string @@ -31,4 +31,4 @@ record DataFlowKey { "fieldType": "TEXT_PARTIAL" } cluster: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataJobKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataJobKey.pdl index d0ac7dbca0f99..60ec51b464dcc 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataJobKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataJobKey.pdl @@ -27,7 +27,7 @@ record DataJobKey { * Unique Identifier of the data job */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true } jobId: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataProcessKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataProcessKey.pdl index 
a5c05029352c2..4df1364a04ebe 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataProcessKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataProcessKey.pdl @@ -13,7 +13,7 @@ record DataProcessKey { * Process name i.e. an ETL job name */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 4.0 } @@ -37,4 +37,4 @@ record DataProcessKey { "queryByDefault": false } origin: FabricType -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DatasetKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DatasetKey.pdl index ea1f9510ed438..70c5d174171af 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DatasetKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DatasetKey.pdl @@ -25,7 +25,7 @@ record DatasetKey { //This is no longer to be used for Dataset native name. Use name, qualifiedName from DatasetProperties instead. @Searchable = { "fieldName": "id" - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryNodeKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryNodeKey.pdl index 88697fe3ff364..51a3bc00f4e9e 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryNodeKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryNodeKey.pdl @@ -12,9 +12,9 @@ import com.linkedin.common.FabricType record GlossaryNodeKey { @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryTermKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryTermKey.pdl index a9f35146da18e..61bcd60cbc754 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryTermKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryTermKey.pdl @@ -13,10 +13,10 @@ record GlossaryTermKey { * The term name, which serves as a unique id */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "fieldName": "id" } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl index 579f1966977a9..0dcb194bccce0 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl @@ -20,9 +20,9 @@ record MLFeatureKey { * Name of the feature */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 8.0 } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureTableKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureTableKey.pdl index 1f786ad417be7..880daa4423573 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureTableKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureTableKey.pdl @@ -22,9 +22,9 @@ record MLFeatureTableKey { * Name of the feature table */ @Searchable = { - 
"fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 8.0 } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelDeploymentKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelDeploymentKey.pdl index 7c36f410fede3..83ba35e0af601 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelDeploymentKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelDeploymentKey.pdl @@ -19,7 +19,7 @@ record MLModelDeploymentKey { * Name of the MLModelDeployment */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -35,4 +35,4 @@ record MLModelDeploymentKey { "queryByDefault": false } origin: FabricType -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelGroupKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelGroupKey.pdl index 17c401c0b8c48..b1e2b7b7ede70 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelGroupKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelGroupKey.pdl @@ -19,7 +19,7 @@ record MLModelGroupKey { * Name of the MLModelGroup */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -33,4 +33,4 @@ record MLModelGroupKey { "queryByDefault": false } origin: FabricType -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl index 55fd2bc370846..24fe89dcce654 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl @@ -19,7 +19,7 @@ record MLModelKey { * Name of the MLModel */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -35,4 +35,4 @@ record MLModelKey { "queryByDefault": false } origin: FabricType -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLPrimaryKeyKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLPrimaryKeyKey.pdl index 9eb67eaf5f651..7987f3a3345b7 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLPrimaryKeyKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLPrimaryKeyKey.pdl @@ -21,9 +21,9 @@ record MLPrimaryKeyKey { * Name of the primary key */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 8.0 } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/TagKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/TagKey.pdl index 47f1a631b4a2c..4622e32dce67b 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/TagKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/TagKey.pdl @@ -11,10 +11,10 @@ record TagKey { * The tag name, which serves as a unique id */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0, "fieldName": "id" } name: string -} \ No newline at end of file +} diff --git 
a/metadata-models/src/main/pegasus/com/linkedin/notebook/NotebookInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/notebook/NotebookInfo.pdl index 1f4dcf975f48c..5df4daacffa49 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/notebook/NotebookInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/notebook/NotebookInfo.pdl @@ -18,7 +18,7 @@ record NotebookInfo includes CustomProperties, ExternalReference { * Title of the Notebook */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/ownership/OwnershipTypeInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/ownership/OwnershipTypeInfo.pdl index 004df6e399be4..3e7b53beff531 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/ownership/OwnershipTypeInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/ownership/OwnershipTypeInfo.pdl @@ -14,7 +14,7 @@ record OwnershipTypeInfo { * Display name of the Ownership Type */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -54,4 +54,4 @@ record OwnershipTypeInfo { } } lastModified: AuditStamp -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl index bb7e22900e168..3ba19d348913b 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl @@ -29,7 +29,7 @@ record QueryProperties { * Optional display name to identify the query. */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -69,4 +69,4 @@ record QueryProperties { } } lastModified: AuditStamp -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/role/RoleProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/role/RoleProperties.pdl index acebdf5558c59..84d8ecc379ec2 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/role/RoleProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/role/RoleProperties.pdl @@ -14,7 +14,7 @@ record RoleProperties { * Display name of the IAM Role in the external system */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/tag/TagProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/tag/TagProperties.pdl index 41c500c6fff2f..e808aef491749 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/tag/TagProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/tag/TagProperties.pdl @@ -11,7 +11,7 @@ record TagProperties { * Display name of the tag */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/SearchConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/SearchConfiguration.java index 1a56db1bd68b0..b2b5260dc5e70 100644 --- a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/SearchConfiguration.java +++ 
b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/SearchConfiguration.java @@ -11,4 +11,5 @@ public class SearchConfiguration { private PartialConfiguration partial; private CustomConfiguration custom; private GraphQueryConfiguration graph; + private WordGramConfiguration wordGram; } diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/WordGramConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/WordGramConfiguration.java new file mode 100644 index 0000000000000..624d2a4c63c4c --- /dev/null +++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/WordGramConfiguration.java @@ -0,0 +1,11 @@ +package com.linkedin.metadata.config.search; + +import lombok.Data; + + +@Data +public class WordGramConfiguration { + private float twoGramFactor; + private float threeGramFactor; + private float fourGramFactor; +} diff --git a/metadata-service/configuration/src/main/resources/application.yml b/metadata-service/configuration/src/main/resources/application.yml index 9f7bf92039fdc..82cf9e8fdc8a7 100644 --- a/metadata-service/configuration/src/main/resources/application.yml +++ b/metadata-service/configuration/src/main/resources/application.yml @@ -198,6 +198,10 @@ elasticsearch: prefixFactor: ${ELASTICSEARCH_QUERY_EXACT_MATCH_PREFIX_FACTOR:1.6} # boost multiplier when exact prefix caseSensitivityFactor: ${ELASTICSEARCH_QUERY_EXACT_MATCH_CASE_FACTOR:0.7} # stacked boost multiplier when case mismatch enableStructured: ${ELASTICSEARCH_QUERY_EXACT_MATCH_ENABLE_STRUCTURED:true} # enable exact match on structured search + wordGram: + twoGramFactor: ${ELASTICSEARCH_QUERY_TWO_GRAM_FACTOR:1.2} # boost multiplier when match on 2-gram tokens + threeGramFactor: ${ELASTICSEARCH_QUERY_THREE_GRAM_FACTOR:1.5} # boost multiplier when match on 3-gram tokens + fourGramFactor: ${ELASTICSEARCH_QUERY_FOUR_GRAM_FACTOR:1.8} # boost multiplier when match on 4-gram tokens # Field weight annotations are typically calibrated for exact match, if partial match is possible on the field use these adjustments partial: urnFactor: ${ELASTICSEARCH_QUERY_PARTIAL_URN_FACTOR:0.5} # multiplier on Urn token match, a partial match on Urn > non-Urn is assumed @@ -318,4 +322,4 @@ cache: search: lineage: ttlSeconds: ${CACHE_SEARCH_LINEAGE_TTL_SECONDS:86400} # 1 day - lightningThreshold: ${CACHE_SEARCH_LINEAGE_LIGHTNING_THRESHOLD:300} \ No newline at end of file + lightningThreshold: ${CACHE_SEARCH_LINEAGE_LIGHTNING_THRESHOLD:300} diff --git a/test-models/src/main/pegasus/com/datahub/test/TestEntityInfo.pdl b/test-models/src/main/pegasus/com/datahub/test/TestEntityInfo.pdl index ed30244c31b17..cc579ba488174 100644 --- a/test-models/src/main/pegasus/com/datahub/test/TestEntityInfo.pdl +++ b/test-models/src/main/pegasus/com/datahub/test/TestEntityInfo.pdl @@ -25,6 +25,11 @@ record TestEntityInfo includes CustomProperties { } textArrayField: optional array[string] + @Searchable = { + "fieldType": "WORD_GRAM" + } + wordGramField: optional string + @Relationship = { "name": "foreignKey", "entityTypes": [] From 655914841bc6c840839ca0cdce751e4e11b6f06f Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Tue, 22 Aug 2023 01:08:08 -0400 Subject: [PATCH 03/11] fix(docker): misc docker fixes (#8677) --- .github/workflows/docker-unified.yml | 56 ++++++++++--------- docker/build.gradle | 1 + docker/datahub-ingestion-base/Dockerfile | 2 +- 
docker/datahub-ingestion/Dockerfile | 2 +- docker/kafka-setup/Dockerfile | 14 ++--- docker/kafka-setup/kafka-ready.sh | 14 +++++ docker/kafka-setup/kafka-setup.sh | 4 +- .../fixtures/ElasticSearchGoldenTest.java | 2 + .../DataProcessInstanceProperties.pdl | 1 + smoke-test/run-quickstart.sh | 2 +- .../tests/cypress/cypress/e2e/login/login.js | 2 +- .../cypress/e2e/settings/managing_groups.js | 10 ++-- 12 files changed, 64 insertions(+), 46 deletions(-) create mode 100755 docker/kafka-setup/kafka-ready.sh mode change 100644 => 100755 docker/kafka-setup/kafka-setup.sh diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index c268a66938945..e8e12ac6def94 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -63,8 +63,8 @@ jobs: env: ENABLE_PUBLISH: ${{ secrets.DOCKER_PASSWORD != '' && secrets.ACRYL_DOCKER_PASSWORD != '' }} run: | - echo "Enable publish: ${{ env.ENABLE_PUBLISH != '' }}" - echo "publish=${{ env.ENABLE_PUBLISH != '' }}" >> $GITHUB_OUTPUT + echo "Enable publish: ${{ env.ENABLE_PUBLISH }}" + echo "publish=${{ env.ENABLE_PUBLISH }}" >> $GITHUB_OUTPUT gms_build: name: Build and Push DataHub GMS Docker Image @@ -451,8 +451,6 @@ jobs: tags: ${{ needs.setup.outputs.tag }} username: ${{ secrets.ACRYL_DOCKER_USERNAME }} password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} - build-args: | - DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.tag || 'head' }} publish: ${{ needs.setup.outputs.publish }} context: . file: ./docker/datahub-ingestion-base/Dockerfile @@ -481,7 +479,7 @@ jobs: uses: ishworkh/docker-image-artifact-download@v1 if: ${{ needs.setup.outputs.publish != 'true' && steps.filter.outputs.datahub-ingestion-base == 'true' }} with: - image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.tag || 'head' }} + image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_tag || 'head' }} - name: Build and push Base-Slim Image if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' }} uses: ./.github/actions/docker-custom-build-and-push @@ -493,16 +491,15 @@ jobs: username: ${{ secrets.ACRYL_DOCKER_USERNAME }} password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} build-args: | - DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.tag || 'head' }} APP_ENV=slim - BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.tag || 'head' }} + BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_tag || 'head' }} publish: ${{ needs.setup.outputs.publish }} context: . 
file: ./docker/datahub-ingestion-base/Dockerfile platforms: linux/amd64,linux/arm64/v8 - name: Compute DataHub Ingestion (Base-Slim) Tag id: tag - run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.slim_tag || 'head' }}" >> $GITHUB_OUTPUT + run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_slim_tag || 'head' }}" >> $GITHUB_OUTPUT datahub_ingestion_base_full_build: name: Build and Push DataHub Ingestion (Base-Full) Docker Image runs-on: ubuntu-latest @@ -524,7 +521,7 @@ jobs: uses: ishworkh/docker-image-artifact-download@v1 if: ${{ needs.setup.outputs.publish != 'true' && steps.filter.outputs.datahub-ingestion-base == 'true' }} with: - image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.tag || 'head' }} + image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_tag || 'head' }} - name: Build and push Base-Full Image if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' }} uses: ./.github/actions/docker-custom-build-and-push @@ -532,20 +529,19 @@ jobs: target: full-install images: | ${{ env.DATAHUB_INGESTION_BASE_IMAGE }} - tags: ${{ needs.setup.outputs.full_tag }} + tags: ${{ needs.setup.outputs.unique_full_tag }} username: ${{ secrets.ACRYL_DOCKER_USERNAME }} password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} build-args: | - DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.tag || 'head' }} APP_ENV=full - BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.tag || 'head' }} + BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_tag || 'head' }} publish: ${{ needs.setup.outputs.publish }} context: . 
file: ./docker/datahub-ingestion-base/Dockerfile platforms: linux/amd64,linux/arm64/v8 - name: Compute DataHub Ingestion (Base-Full) Tag id: tag - run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.full_tag || 'head' }}" >> $GITHUB_OUTPUT + run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }}" >> $GITHUB_OUTPUT datahub_ingestion_slim_build: @@ -572,9 +568,9 @@ jobs: run: ./gradlew :metadata-ingestion:codegen - name: Download Base Image uses: ishworkh/docker-image-artifact-download@v1 - if: ${{ needs.setup.outputs.publish != 'true' }} + if: ${{ needs.setup.outputs.publish != 'true' && steps.filter.outputs.datahub-ingestion-base == 'true' }} with: - image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.slim_tag || 'head' }} + image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_slim_tag || 'head' }} - name: Build and push Slim Image if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true' }} uses: ./.github/actions/docker-custom-build-and-push @@ -584,7 +580,7 @@ jobs: ${{ env.DATAHUB_INGESTION_IMAGE }} build-args: | BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }} - DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.slim_tag || 'head' }} + DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_slim_tag || 'head' }} APP_ENV=slim tags: ${{ needs.setup.outputs.slim_tag }} username: ${{ secrets.ACRYL_DOCKER_USERNAME }} @@ -595,7 +591,7 @@ jobs: platforms: linux/amd64,linux/arm64/v8 - name: Compute Tag id: tag - run: echo "tag=${{ (steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true') && needs.setup.outputs.slim_tag || 'head' }}" >> $GITHUB_OUTPUT + run: echo "tag=${{ (steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true') && needs.setup.outputs.unique_slim_tag || 'head' }}" >> $GITHUB_OUTPUT datahub_ingestion_slim_scan: permissions: contents: read # for actions/checkout to fetch code @@ -611,13 +607,13 @@ jobs: uses: ishworkh/docker-image-artifact-download@v1 if: ${{ needs.setup.outputs.publish != 'true' }} with: - image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.slim_tag }} + image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }} - name: Run Trivy vulnerability scanner Slim Image uses: aquasecurity/trivy-action@0.8.0 env: TRIVY_OFFLINE_SCAN: true with: - image-ref: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.slim_tag }} + image-ref: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }} format: "template" template: "@/contrib/sarif.tpl" output: "trivy-results.sarif" @@ -653,9 +649,9 @@ jobs: run: ./gradlew :metadata-ingestion:codegen - name: Download Base Image uses: ishworkh/docker-image-artifact-download@v1 - if: ${{ needs.setup.outputs.publish != 'true' }} + if: ${{ needs.setup.outputs.publish != 'true' && steps.filter.outputs.datahub-ingestion-base == 'true' }} with: - image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.full_tag || 'head' }} + 
image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }} - name: Build and push Full Image if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true' }} uses: ./.github/actions/docker-custom-build-and-push @@ -665,8 +661,8 @@ jobs: ${{ env.DATAHUB_INGESTION_IMAGE }} build-args: | BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }} - DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.full_tag || 'head' }} - tags: ${{ needs.setup.outputs.full_tag }} + DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }} + tags: ${{ needs.setup.outputs.unique_full_tag }} username: ${{ secrets.ACRYL_DOCKER_USERNAME }} password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} publish: ${{ needs.setup.outputs.publish }} @@ -675,7 +671,7 @@ jobs: platforms: linux/amd64,linux/arm64/v8 - name: Compute Tag (Full) id: tag - run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.full_tag || 'head' }}" >> $GITHUB_OUTPUT + run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }}" >> $GITHUB_OUTPUT datahub_ingestion_full_scan: permissions: contents: read # for actions/checkout to fetch code @@ -691,13 +687,13 @@ jobs: uses: ishworkh/docker-image-artifact-download@v1 if: ${{ needs.setup.outputs.publish != 'true' }} with: - image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_full_build.outputs.full_tag }} + image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_full_build.outputs.tag }} - name: Run Trivy vulnerability scanner Full Image uses: aquasecurity/trivy-action@0.8.0 env: TRIVY_OFFLINE_SCAN: true with: - image-ref: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_full_build.outputs.full_tag }} + image-ref: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_full_build.outputs.tag }} format: "template" template: "@/contrib/sarif.tpl" output: "trivy-results.sarif" @@ -750,6 +746,10 @@ jobs: ./gradlew :metadata-ingestion:install - name: Disk Check run: df -h . && docker images + - name: Remove images + run: docker image prune -a -f || true + - name: Disk Check + run: df -h . && docker images - name: Download GMS image uses: ishworkh/docker-image-artifact-download@v1 if: ${{ needs.setup.outputs.publish != 'true' }} @@ -794,7 +794,7 @@ jobs: uses: ishworkh/docker-image-artifact-download@v1 if: ${{ needs.setup.outputs.publish != 'true' }} with: - image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.setup.outputs.unique_tag }} + image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }} - name: Disk Check run: df -h . && docker images - name: run quickstart @@ -812,6 +812,8 @@ jobs: # we are doing this because gms takes time to get ready # and we don't have a better readiness check when bootstrap is done sleep 60s + - name: Disk Check + run: df -h . 
&& docker images - name: Disable ES Disk Threshold run: | curl -XPUT "http://localhost:9200/_cluster/settings" \ diff --git a/docker/build.gradle b/docker/build.gradle index 829bc344411f3..ae101fe1defc5 100644 --- a/docker/build.gradle +++ b/docker/build.gradle @@ -87,6 +87,7 @@ task quickstartDebug(type: Exec, dependsOn: ':metadata-ingestion:install') { dependsOn(debug_modules.collect { it + ':dockerTagDebug' }) shouldRunAfter ':metadata-ingestion:clean', 'quickstartNuke' + environment "DATAHUB_PRECREATE_TOPICS", "true" environment "DATAHUB_TELEMETRY_ENABLED", "false" environment "DOCKER_COMPOSE_BASE", "file://${rootProject.projectDir}" diff --git a/docker/datahub-ingestion-base/Dockerfile b/docker/datahub-ingestion-base/Dockerfile index bb4b0bc42e167..3d47f79617370 100644 --- a/docker/datahub-ingestion-base/Dockerfile +++ b/docker/datahub-ingestion-base/Dockerfile @@ -84,4 +84,4 @@ FROM ${BASE_IMAGE} as slim-install FROM ${APP_ENV}-install USER datahub -ENV PATH="/datahub-ingestion/.local/bin:$PATH" +ENV PATH="/datahub-ingestion/.local/bin:$PATH" \ No newline at end of file diff --git a/docker/datahub-ingestion/Dockerfile b/docker/datahub-ingestion/Dockerfile index d16caea2fcecd..0ecc30d02ac3f 100644 --- a/docker/datahub-ingestion/Dockerfile +++ b/docker/datahub-ingestion/Dockerfile @@ -30,4 +30,4 @@ FROM base as dev-install FROM ${APP_ENV}-install as final USER datahub -ENV PATH="/datahub-ingestion/.local/bin:$PATH" \ No newline at end of file +ENV PATH="/datahub-ingestion/.local/bin:$PATH" diff --git a/docker/kafka-setup/Dockerfile b/docker/kafka-setup/Dockerfile index 8cf9d0869dc9b..5707234b85f57 100644 --- a/docker/kafka-setup/Dockerfile +++ b/docker/kafka-setup/Dockerfile @@ -1,5 +1,7 @@ +ARG KAFKA_DOCKER_VERSION=7.4.1 + # Using as a base image because to get the needed jars for confluent utils -FROM confluentinc/cp-base-new@sha256:ac4e0f9bcaecdab728740529f37452231fa40760fcf561759fc3b219f46d2cc9 as confluent_base +FROM confluentinc/cp-base-new:$KAFKA_DOCKER_VERSION as confluent_base ARG MAVEN_REPO="https://repo1.maven.org/maven2" ARG SNAKEYAML_VERSION="2.0" @@ -16,12 +18,6 @@ ENV SCALA_VERSION 2.13 # Set the classpath for JARs required by `cub` ENV CUB_CLASSPATH='"/usr/share/java/cp-base-new/*"' -# Confluent Docker Utils Version (Namely the tag or branch to grab from git to install) -ARG PYTHON_CONFLUENT_DOCKER_UTILS_VERSION="v0.0.60" - -# This can be overriden for an offline/air-gapped builds -ARG PYTHON_CONFLUENT_DOCKER_UTILS_INSTALL_SPEC="git+https://github.com/confluentinc/confluent-docker-utils@${PYTHON_CONFLUENT_DOCKER_UTILS_VERSION}" - LABEL name="kafka" version=${KAFKA_VERSION} RUN apk add --no-cache bash coreutils @@ -39,7 +35,6 @@ RUN mkdir -p /opt \ && pip install --no-cache-dir --upgrade pip wheel setuptools \ && pip install jinja2 requests \ && pip install "Cython<3.0" "PyYAML<6" --no-build-isolation \ - && pip install --prefer-binary --prefix=/usr/local --upgrade "${PYTHON_CONFLUENT_DOCKER_UTILS_INSTALL_SPEC}" \ && rm -rf /tmp/* \ && apk del --purge .build-deps @@ -69,7 +64,8 @@ ENV USE_CONFLUENT_SCHEMA_REGISTRY="TRUE" COPY docker/kafka-setup/kafka-setup.sh ./kafka-setup.sh COPY docker/kafka-setup/kafka-config.sh ./kafka-config.sh COPY docker/kafka-setup/kafka-topic-workers.sh ./kafka-topic-workers.sh +COPY docker/kafka-setup/kafka-ready.sh ./kafka-ready.sh -RUN chmod +x ./kafka-setup.sh && chmod +x ./kafka-topic-workers.sh +RUN chmod +x ./kafka-setup.sh ./kafka-topic-workers.sh ./kafka-ready.sh CMD ./kafka-setup.sh diff --git a/docker/kafka-setup/kafka-ready.sh 
b/docker/kafka-setup/kafka-ready.sh new file mode 100755 index 0000000000000..ba87bde047ef5 --- /dev/null +++ b/docker/kafka-setup/kafka-ready.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +for i in {1..60} +do + kafka-broker-api-versions.sh --command-config $CONNECTION_PROPERTIES_PATH --bootstrap-server $KAFKA_BOOTSTRAP_SERVER + if [ $? -eq 0 ]; then + break + fi + if [ $i -eq 60 ]; then + echo "Kafka bootstrap server $KAFKA_BOOTSTRAP_SERVER not ready." + exit 1 + fi + sleep 5s +done diff --git a/docker/kafka-setup/kafka-setup.sh b/docker/kafka-setup/kafka-setup.sh old mode 100644 new mode 100755 index 7b015421b7963..629e9bc9484ee --- a/docker/kafka-setup/kafka-setup.sh +++ b/docker/kafka-setup/kafka-setup.sh @@ -49,8 +49,8 @@ if [[ -n "$KAFKA_PROPERTIES_SASL_CLIENT_CALLBACK_HANDLER_CLASS" ]]; then echo "sasl.client.callback.handler.class=$KAFKA_PROPERTIES_SASL_CLIENT_CALLBACK_HANDLER_CLASS" >> $CONNECTION_PROPERTIES_PATH fi -cub kafka-ready -c $CONNECTION_PROPERTIES_PATH -b $KAFKA_BOOTSTRAP_SERVER 1 180 - +# cub kafka-ready -c $CONNECTION_PROPERTIES_PATH -b $KAFKA_BOOTSTRAP_SERVER 1 180 +. kafka-ready.sh ############################################################ # Start Topic Creation Logic diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java index 29457f244291f..8e8c20bd292e5 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java @@ -15,6 +15,7 @@ import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.context.annotation.Import; import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; +import org.testng.annotations.Ignore; import org.testng.annotations.Test; import java.util.List; @@ -96,6 +97,7 @@ public void testNameMatchMemberInWorkspace() { } @Test + @Ignore("unstable") public void testGlossaryTerms() { /* Searching for "ReturnRate" should return all tables that have the glossary term applied before diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl index 46a490dbb2925..c63cb1a97c017 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl @@ -31,6 +31,7 @@ record DataProcessInstanceProperties includes CustomProperties, ExternalReferenc @Searchable = { "fieldType": "KEYWORD", "addToFilters": true, + "fieldName": "processType", "filterNameOverride": "Process Type" } type: optional enum DataProcessType { diff --git a/smoke-test/run-quickstart.sh b/smoke-test/run-quickstart.sh index 050b5d2db95c9..d40e4a5e7a4aa 100755 --- a/smoke-test/run-quickstart.sh +++ b/smoke-test/run-quickstart.sh @@ -15,4 +15,4 @@ echo "test_user:test_pass" >> ~/.datahub/plugins/frontend/auth/user.props echo "DATAHUB_VERSION = $DATAHUB_VERSION" DATAHUB_TELEMETRY_ENABLED=false \ DOCKER_COMPOSE_BASE="file://$( dirname "$DIR" )" \ -datahub docker quickstart --version ${DATAHUB_VERSION} --standalone_consumers --dump-logs-on-failure --kafka-setup +datahub docker quickstart --version ${DATAHUB_VERSION} --standalone_consumers 
--dump-logs-on-failure --kafka-setup \ No newline at end of file diff --git a/smoke-test/tests/cypress/cypress/e2e/login/login.js b/smoke-test/tests/cypress/cypress/e2e/login/login.js index f86741b5afe01..74d04aa56d0d0 100644 --- a/smoke-test/tests/cypress/cypress/e2e/login/login.js +++ b/smoke-test/tests/cypress/cypress/e2e/login/login.js @@ -4,6 +4,6 @@ describe('login', () => { cy.get('input[data-testid=username]').type(Cypress.env('ADMIN_USERNAME')); cy.get('input[data-testid=password]').type(Cypress.env('ADMIN_PASSWORD')); cy.contains('Sign In').click(); - cy.contains('Welcome back, DataHub'); + cy.contains('Welcome back, Data Hub'); }); }) diff --git a/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js b/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js index 7686acfe50de0..353570c0d955b 100644 --- a/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js +++ b/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js @@ -64,6 +64,8 @@ describe("create and manage group", () => { }); it("update group info", () => { + var expected_name = Cypress.env('ADMIN_USERNAME') == "datahub" ? "Data Hub" : Cypress.env('ADMIN_USERNAME'); + cy.loginWithCredentials(); cy.visit("/settings/identities/groups"); cy.clickOptionWithText(group_name); @@ -77,13 +79,13 @@ describe("create and manage group", () => { cy.contains("Test group description EDITED").should("be.visible"); cy.clickOptionWithText("Add Owners"); cy.contains("Search for users or groups...").click({ force: true }); - cy.focused().type(Cypress.env('ADMIN_USERNAME')); - cy.get(".ant-select-item-option").contains(Cypress.env('ADMIN_USERNAME'), { matchCase: false }).click(); + cy.focused().type(expected_name); + cy.get(".ant-select-item-option").contains(expected_name, { matchCase: false }).click(); cy.focused().blur(); - cy.contains(Cypress.env('ADMIN_USERNAME')).should("have.length", 1); + cy.contains(expected_name).should("have.length", 1); cy.get('[role="dialog"] button').contains("Done").click(); cy.waitTextVisible("Owners Added"); - cy.contains(Cypress.env('ADMIN_USERNAME'), { matchCase: false }).should("be.visible"); + cy.contains(expected_name, { matchCase: false }).should("be.visible"); cy.clickOptionWithText("Edit Group"); cy.waitTextVisible("Edit Profile"); cy.get("#email").type(`${test_id}@testemail.com`); From b0cb990bad0522ea77fabab6f4746f1fd6d4ba23 Mon Sep 17 00:00:00 2001 From: Ellie O'Neil <110510035+eboneil@users.noreply.github.com> Date: Tue, 22 Aug 2023 11:35:58 -0700 Subject: [PATCH 04/11] tests(search): more golden tests (#8683) --- .../fixtures/ElasticSearchGoldenTest.java | 69 +++++++++++++------ 1 file changed, 47 insertions(+), 22 deletions(-) diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java index 8e8c20bd292e5..d720c95fef84d 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java @@ -15,7 +15,6 @@ import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.context.annotation.Import; import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; -import org.testng.annotations.Ignore; import org.testng.annotations.Test; import java.util.List; @@ -80,24 +79,6 @@ public void 
testNameMatchPetProfile() { } @Test - public void testNameMatchMemberInWorkspace() { - /* - Searching for "collaborative actionitems" should return "collaborative_actionitems" as the first search - result, followed by "collaborative_actionitems_old" - */ - assertNotNull(searchService); - SearchResult searchResult = searchAcrossEntities(searchService, "collaborative actionitems", SEARCHABLE_LONGTAIL_ENTITIES); - assertTrue(searchResult.getEntities().size() >= 2); - Urn firstResultUrn = searchResult.getEntities().get(0).getEntity(); - Urn secondResultUrn = searchResult.getEntities().get(1).getEntity(); - - // Checks that the table name is not suffixed with anything - assertTrue(firstResultUrn.toString().contains("collaborative_actionitems,")); - assertTrue(secondResultUrn.toString().contains("collaborative_actionitems_old")); - } - - @Test - @Ignore("unstable") public void testGlossaryTerms() { /* Searching for "ReturnRate" should return all tables that have the glossary term applied before @@ -134,9 +115,53 @@ public void testNameMatchPartiallyQualified() { assertTrue(secondResultUrn.toString().contains("dbt,long_tail_companions.analytics.pet_details")); } + @Test + public void testNameMatchCollaborativeActionitems() { + /* + Searching for "collaborative actionitems" should return "collaborative_actionitems" as the first search + result, followed by "collaborative_actionitems_old" + */ + assertNotNull(searchService); + SearchResult searchResult = searchAcrossEntities(searchService, "collaborative actionitems", SEARCHABLE_LONGTAIL_ENTITIES); + assertTrue(searchResult.getEntities().size() >= 2); + Urn firstResultUrn = searchResult.getEntities().get(0).getEntity(); + Urn secondResultUrn = searchResult.getEntities().get(1).getEntity(); + + // Checks that the table name is not suffixed with anything + assertTrue(firstResultUrn.toString().contains("collaborative_actionitems,")); + assertTrue(secondResultUrn.toString().contains("collaborative_actionitems_old")); + + Double firstResultScore = searchResult.getEntities().get(0).getScore(); + Double secondResultScore = searchResult.getEntities().get(1).getScore(); + + // Checks that the scores aren't tied so that we are matching on table name more than column name + assertTrue(firstResultScore > secondResultScore); + } + + @Test + public void testNameMatchCustomerOrders() { + /* + Searching for "customer orders" should return "customer_orders" as the first search + result, not suffixed by anything + */ + assertNotNull(searchService); + SearchResult searchResult = searchAcrossEntities(searchService, "customer orders", SEARCHABLE_LONGTAIL_ENTITIES); + assertTrue(searchResult.getEntities().size() >= 2); + Urn firstResultUrn = searchResult.getEntities().get(0).getEntity(); + + // Checks that the table name is not suffixed with anything + assertTrue(firstResultUrn.toString().contains("customer_orders,")); + + Double firstResultScore = searchResult.getEntities().get(0).getScore(); + Double secondResultScore = searchResult.getEntities().get(1).getScore(); + + // Checks that the scores aren't tied so that we are matching on table name more than column name + assertTrue(firstResultScore > secondResultScore); + } + /* - * Tests that should pass but do not yet can be added below here, with the following annotation: - * @Test(enabled = false) - **/ + Tests that should pass but do not yet can be added below here, with the following annotation: + @Test(enabled = false) + */ } From 439cf4d7dcde7003de3a3fbe02339cbf72c7246a Mon Sep 17 00:00:00 2001 From: Andrew 
Sikowitz Date: Tue, 22 Aug 2023 16:27:46 -0400 Subject: [PATCH 05/11] test(ingest/vertica): Skip integration test failing CI; support arm Macs (#8694) --- .../tests/integration/vertica/docker-compose.yml | 4 +--- metadata-ingestion/tests/integration/vertica/test_vertica.py | 1 + 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/tests/integration/vertica/docker-compose.yml b/metadata-ingestion/tests/integration/vertica/docker-compose.yml index ddaf206f236cf..84af5c32a60e3 100644 --- a/metadata-ingestion/tests/integration/vertica/docker-compose.yml +++ b/metadata-ingestion/tests/integration/vertica/docker-compose.yml @@ -1,6 +1,7 @@ version: "3.9" services: vertica: + platform: linux/amd64 environment: APP_DB_USER: "dbadmin" APP_DB_PASSWORD: "abc123" @@ -18,6 +19,3 @@ services: volumes: vertica-data: - - - diff --git a/metadata-ingestion/tests/integration/vertica/test_vertica.py b/metadata-ingestion/tests/integration/vertica/test_vertica.py index db8bfd247313b..fe306d1d0b2b8 100644 --- a/metadata-ingestion/tests/integration/vertica/test_vertica.py +++ b/metadata-ingestion/tests/integration/vertica/test_vertica.py @@ -58,6 +58,7 @@ def vertica_runner(docker_compose_runner, test_resources_dir): # Test needs more work to be done , currently it is working fine. @freeze_time(FROZEN_TIME) +@pytest.mark.skip("Failing in CI, cmd failing with exit code 1") @pytest.mark.integration def test_vertica_ingest_with_db(vertica_runner, pytestconfig, tmp_path): test_resources_dir = pytestconfig.rootpath / "tests/integration/vertica" From d6e36f16de0f9b776767a898e7f64eb972ed8987 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 22 Aug 2023 21:27:02 -0700 Subject: [PATCH 06/11] ci: add `needs_artifact_download` output for ingestion image (#8695) --- .github/workflows/docker-unified.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index e8e12ac6def94..532669c44722c 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -549,6 +549,7 @@ jobs: runs-on: ubuntu-latest outputs: tag: ${{ steps.tag.outputs.tag }} + needs_artifact_download: ${{ (steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true') && needs.setup.outputs.publish != 'true' }} needs: [setup, datahub_ingestion_base_slim_build] steps: - name: Check out the repo @@ -605,7 +606,7 @@ jobs: uses: actions/checkout@v3 - name: Download image Slim Image uses: ishworkh/docker-image-artifact-download@v1 - if: ${{ needs.setup.outputs.publish != 'true' }} + if: ${{ needs.datahub_ingestion_slim_build.outputs.needs_artifact_download == 'true' }} with: image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }} - name: Run Trivy vulnerability scanner Slim Image @@ -630,6 +631,7 @@ jobs: runs-on: ubuntu-latest outputs: tag: ${{ steps.tag.outputs.tag }} + needs_artifact_download: ${{ (steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true') && needs.setup.outputs.publish != 'true' }} needs: [setup, datahub_ingestion_base_full_build] steps: - name: Check out the repo @@ -685,7 +687,7 @@ jobs: uses: actions/checkout@v3 - name: Download image Full Image uses: ishworkh/docker-image-artifact-download@v1 - if: ${{ needs.setup.outputs.publish != 'true' }} + if: ${{ needs.datahub_ingestion_full_build.outputs.needs_artifact_download == 'true' }} with: image: ${{ 
env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_full_build.outputs.tag }} - name: Run Trivy vulnerability scanner Full Image @@ -792,7 +794,7 @@ jobs: image: ${{ env.DATAHUB_UPGRADE_IMAGE }}:${{ needs.setup.outputs.unique_tag }} - name: Download datahub-ingestion-slim image uses: ishworkh/docker-image-artifact-download@v1 - if: ${{ needs.setup.outputs.publish != 'true' }} + if: ${{ needs.datahub_ingestion_slim_build.outputs.needs_artifact_download == 'true' }} with: image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }} - name: Disk Check From 4116716a1571919224947b793c0388437ebf4b68 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Wed, 23 Aug 2023 05:08:10 -0400 Subject: [PATCH 07/11] logs(ingestion/unity): Hide stack trace on sql parse failure logs (#8657) --- .../src/datahub/ingestion/source/unity/usage.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py b/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py index d5da93c7be35e..49f56b46fb012 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py @@ -176,10 +176,8 @@ def _parse_query_via_lineage_runner(self, query: str) -> Optional[StringTableInf for table in runner.target_tables ], ) - except Exception: - logger.info( - f"Could not parse query via lineage runner, {query}", exc_info=True - ) + except Exception as e: + logger.info(f"Could not parse query via lineage runner, {query}: {e!r}") return None @staticmethod @@ -202,8 +200,8 @@ def _parse_query_via_spark_sql_plan(self, query: str) -> Optional[StringTableInf return GenericTableInfo( source_tables=[t for t in tables if t], target_tables=[] ) - except Exception: - logger.info(f"Could not parse query via spark plan, {query}", exc_info=True) + except Exception as e: + logger.info(f"Could not parse query via spark plan, {query}: {e!r}") return None @staticmethod From 8ee58af0c249f74c93f3f8132ec9896da882a8cc Mon Sep 17 00:00:00 2001 From: siddiquebagwan-gslab Date: Wed, 23 Aug 2023 14:38:58 +0530 Subject: [PATCH 08/11] feat(ingestion/powerbi): support multiple tables as upstream in native SQL parsing (#8592) --- .../ingestion/source/powerbi/config.py | 15 + .../powerbi/dataplatform_instance_resolver.py | 14 +- .../powerbi/m_query/native_sql_parser.py | 33 +- .../source/powerbi/m_query/parser.py | 21 +- .../source/powerbi/m_query/resolver.py | 390 +++++++++++++++--- .../ingestion/source/powerbi/powerbi.py | 50 +-- .../src/datahub/ingestion/source/tableau.py | 52 +-- .../src/datahub/utilities/sqlglot_lineage.py | 40 ++ .../integration/powerbi/test_m_parser.py | 374 +++++++++++------ .../tableau/test_tableau_ingest.py | 6 +- 10 files changed, 714 insertions(+), 281 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index 31d067f984d2d..ffa685fb25826 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -121,6 +121,12 @@ class DataPlatformPair: powerbi_data_platform_name: str +@dataclass +class PowerBIPlatformDetail: + data_platform_pair: DataPlatformPair + data_platform_server: str + + class SupportedDataPlatform(Enum): POSTGRES_SQL = DataPlatformPair( powerbi_data_platform_name="PostgreSQL", datahub_data_platform_name="postgres" @@ -382,6 
+388,15 @@ class PowerBiDashboardSourceConfig( description="The instance of the platform that all assets produced by this recipe belong to", ) + # Enable advance sql construct + enable_advance_lineage_sql_construct: bool = pydantic.Field( + default=False, + description="Whether to enable advance native sql construct for parsing like join, sub-queries. " + "along this flag , the native_query_parsing should be enabled. " + "By default convert_lineage_urns_to_lowercase is enabled, in-case if you have disabled it in previous ingestion execution then it may break lineage " + "as this option generates the upstream datasets URN in lowercase.", + ) + @validator("dataset_type_mapping") @classmethod def map_data_platform(cls, value): diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py index 396da2d79e3b7..baaa8d5b85ae1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py @@ -5,8 +5,8 @@ from datahub.ingestion.source.powerbi.config import ( PlatformDetail, PowerBiDashboardSourceConfig, + PowerBIPlatformDetail, ) -from datahub.ingestion.source.powerbi.m_query.resolver import DataPlatformTable logger = logging.getLogger(__name__) @@ -14,7 +14,7 @@ class AbstractDataPlatformInstanceResolver(ABC): @abstractmethod def get_platform_instance( - self, dataplatform_table: DataPlatformTable + self, data_platform_detail: PowerBIPlatformDetail ) -> PlatformDetail: pass @@ -32,10 +32,10 @@ class ResolvePlatformInstanceFromDatasetTypeMapping( BaseAbstractDataPlatformInstanceResolver ): def get_platform_instance( - self, dataplatform_table: DataPlatformTable + self, data_platform_detail: PowerBIPlatformDetail ) -> PlatformDetail: platform: Union[str, PlatformDetail] = self.config.dataset_type_mapping[ - dataplatform_table.data_platform_pair.powerbi_data_platform_name + data_platform_detail.data_platform_pair.powerbi_data_platform_name ] if isinstance(platform, PlatformDetail): @@ -48,13 +48,13 @@ class ResolvePlatformInstanceFromServerToPlatformInstance( BaseAbstractDataPlatformInstanceResolver ): def get_platform_instance( - self, dataplatform_table: DataPlatformTable + self, data_platform_detail: PowerBIPlatformDetail ) -> PlatformDetail: return ( self.config.server_to_platform_instance[ - dataplatform_table.datasource_server + data_platform_detail.data_platform_server ] - if dataplatform_table.datasource_server + if data_platform_detail.data_platform_server in self.config.server_to_platform_instance else PlatformDetail.parse_obj({}) ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py index 640bc4bd60d80..021c429c3c633 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py @@ -1,8 +1,12 @@ import logging -from typing import List +from typing import List, Optional import sqlparse +import datahub.utilities.sqlglot_lineage as sqlglot_l +from datahub.ingestion.api.common import PipelineContext +from datahub.utilities.sqlglot_lineage import SqlParsingResult + SPECIAL_CHARACTERS = ["#(lf)", "(lf)"] logger = logging.getLogger() @@ -45,3 +49,30 @@ def 
get_tables(native_query: str) -> List[str]: from_index = from_index + 1 return tables + + +def parse_custom_sql( + ctx: PipelineContext, + query: str, + schema: Optional[str], + database: Optional[str], + platform: str, + env: str, + platform_instance: Optional[str], +) -> Optional["SqlParsingResult"]: + + logger.debug("Using sqlglot_lineage to parse custom sql") + + sql_query = remove_special_characters(query) + + logger.debug(f"Parsing sql={sql_query}") + + return sqlglot_l.create_lineage_sql_parsed_result( + query=sql_query, + schema=schema, + database=database, + platform=platform, + platform_instance=platform_instance, + env=env, + graph=ctx.graph, + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py index 83106c04529d1..8cc38c366c42a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py @@ -6,7 +6,14 @@ import lark from lark import Lark, Tree -from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.source.powerbi.config import ( + PowerBiDashboardSourceConfig, + PowerBiDashboardSourceReport, +) +from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import ( + AbstractDataPlatformInstanceResolver, +) from datahub.ingestion.source.powerbi.m_query import resolver, validator from datahub.ingestion.source.powerbi.m_query.data_classes import ( TRACE_POWERBI_MQUERY_PARSER, @@ -45,7 +52,9 @@ def _parse_expression(expression: str) -> Tree: def get_upstream_tables( table: Table, reporter: PowerBiDashboardSourceReport, - native_query_enabled: bool = True, + platform_instance_resolver: AbstractDataPlatformInstanceResolver, + ctx: PipelineContext, + config: PowerBiDashboardSourceConfig, parameters: Dict[str, str] = {}, ) -> List[resolver.DataPlatformTable]: if table.expression is None: @@ -58,7 +67,7 @@ def get_upstream_tables( parse_tree: Tree = _parse_expression(table.expression) valid, message = validator.validate_parse_tree( - parse_tree, native_query_enabled=native_query_enabled + parse_tree, native_query_enabled=config.native_query_parsing ) if valid is False: assert message is not None @@ -84,7 +93,11 @@ def get_upstream_tables( parse_tree=parse_tree, reporter=reporter, parameters=parameters, - ).resolve_to_data_platform_table_list() + ).resolve_to_data_platform_table_list( + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, + ) except BaseException as e: reporter.report_warning(table.full_name, "Failed to process m-query expression") diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py index e2b448124c89d..479f1decff903 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py @@ -6,11 +6,19 @@ from lark import Tree +import datahub.emitter.mce_builder as builder +from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.powerbi.config import ( DataPlatformPair, + PlatformDetail, + PowerBiDashboardSourceConfig, PowerBiDashboardSourceReport, + PowerBIPlatformDetail, SupportedDataPlatform, ) +from datahub.ingestion.source.powerbi.dataplatform_instance_resolver 
import ( + AbstractDataPlatformInstanceResolver, +) from datahub.ingestion.source.powerbi.m_query import native_sql_parser, tree_function from datahub.ingestion.source.powerbi.m_query.data_classes import ( TRACE_POWERBI_MQUERY_PARSER, @@ -19,19 +27,98 @@ IdentifierAccessor, ) from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table +from datahub.utilities.sqlglot_lineage import SqlParsingResult logger = logging.getLogger(__name__) @dataclass class DataPlatformTable: - name: str - full_name: str - datasource_server: str data_platform_pair: DataPlatformPair + urn: str + + +def urn_to_lowercase(value: str, flag: bool) -> str: + if flag is True: + return value.lower() + + return value + + +def urn_creator( + config: PowerBiDashboardSourceConfig, + platform_instance_resolver: AbstractDataPlatformInstanceResolver, + data_platform_pair: DataPlatformPair, + server: str, + qualified_table_name: str, +) -> str: + + platform_detail: PlatformDetail = platform_instance_resolver.get_platform_instance( + PowerBIPlatformDetail( + data_platform_pair=data_platform_pair, + data_platform_server=server, + ) + ) + + return builder.make_dataset_urn_with_platform_instance( + platform=data_platform_pair.datahub_data_platform_name, + platform_instance=platform_detail.platform_instance, + env=platform_detail.env, + name=urn_to_lowercase( + qualified_table_name, config.convert_lineage_urns_to_lowercase + ), + ) class AbstractDataPlatformTableCreator(ABC): + """ + Base class to share common functionalities among different dataplatform for M-Query parsing. + + To create qualified table name we need to parse M-Query data-access-functions(https://learn.microsoft.com/en-us/powerquery-m/accessing-data-functions) and + the data-access-functions has some define pattern to access database-name, schema-name and table-name, for example see below M-Query. + + let + Source = Sql.Database("localhost", "library"), + dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data] + in + dbo_book_issue + + It is MSSQL M-Query and Sql.Database is the data-access-function to access MSSQL. If this function is available in M-Query then database name is available in second argument + of first statement and schema-name and table-name is available in second statement. second statement can be repeated to access different tables from MSSQL. + + DefaultTwoStepDataAccessSources extends the AbstractDataPlatformTableCreator and provides the common functionalities for data-platform which has above type of M-Query pattern + + data-access-function varies as per data-platform for example for MySQL.Database for MySQL, PostgreSQL.Database for Postgres and Oracle.Database for Oracle and number of statement to + find out database-name , schema-name and table-name also varies as per dataplatform. + + Value.NativeQuery is one of the function which is used to execute native query inside M-Query, for example see below M-Query + + let + Source = Value.NativeQuery(AmazonRedshift.Database("redshift-url","dev"), "select * from dev.public.category", null, [EnableFolding=true]) + in + Source + + In this M-Query database-name is available in first argument and rest of the detail i.e database & schema is available in native query. + + NativeQueryDataPlatformTableCreator extends AbstractDataPlatformTableCreator to support Redshift and Snowflake native query parsing. 
+ + """ + + ctx: PipelineContext + config: PowerBiDashboardSourceConfig + platform_instance_resolver: AbstractDataPlatformInstanceResolver + + def __init__( + self, + ctx: PipelineContext, + config: PowerBiDashboardSourceConfig, + platform_instance_resolver: AbstractDataPlatformInstanceResolver, + ) -> None: + super().__init__() + self.ctx = ctx + self.config = config + self.platform_instance_resolver = platform_instance_resolver + @abstractmethod def create_dataplatform_tables( self, data_access_func_detail: DataAccessFunctionDetail @@ -58,6 +145,49 @@ def get_db_detail_from_argument( return arguments[0], arguments[1] + def parse_custom_sql( + self, query: str, server: str, database: Optional[str], schema: Optional[str] + ) -> List[DataPlatformTable]: + + dataplatform_tables: List[DataPlatformTable] = [] + + platform_detail: PlatformDetail = ( + self.platform_instance_resolver.get_platform_instance( + PowerBIPlatformDetail( + data_platform_pair=self.get_platform_pair(), + data_platform_server=server, + ) + ) + ) + + parsed_result: Optional[ + "SqlParsingResult" + ] = native_sql_parser.parse_custom_sql( + ctx=self.ctx, + query=query, + platform=self.get_platform_pair().datahub_data_platform_name, + platform_instance=platform_detail.platform_instance, + env=platform_detail.env, + database=database, + schema=schema, + ) + + if parsed_result is None: + logger.debug("Failed to parse query") + return dataplatform_tables + + for urn in parsed_result.in_tables: + dataplatform_tables.append( + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ) + + logger.debug(f"Generated dataplatform_tables={dataplatform_tables}") + + return dataplatform_tables + class AbstractDataAccessMQueryResolver(ABC): table: Table @@ -80,11 +210,29 @@ def __init__( self.data_access_functions = SupportedResolver.get_function_names() @abstractmethod - def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: + def resolve_to_data_platform_table_list( + self, + ctx: PipelineContext, + config: PowerBiDashboardSourceConfig, + platform_instance_resolver: AbstractDataPlatformInstanceResolver, + ) -> List[DataPlatformTable]: pass class MQueryResolver(AbstractDataAccessMQueryResolver, ABC): + """ + This class parses the M-Query recursively to generate DataAccessFunctionDetail (see method create_data_access_functional_detail). + + This class has generic code to process M-Query tokens and create instance of DataAccessFunctionDetail. + + Once DataAccessFunctionDetail instance is initialized thereafter MQueryResolver generates the DataPlatformTable with the help of AbstractDataPlatformTableCreator + (see method resolve_to_data_platform_table_list). + + Classes which extended from AbstractDataPlatformTableCreator knows how to convert generated DataAccessFunctionDetail instance + to respective DataPlatformTable instance as per dataplatform. 
+ + """ + def get_item_selector_tokens( self, expression_tree: Tree, @@ -318,9 +466,15 @@ def internal( return table_links - def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: + def resolve_to_data_platform_table_list( + self, + ctx: PipelineContext, + config: PowerBiDashboardSourceConfig, + platform_instance_resolver: AbstractDataPlatformInstanceResolver, + ) -> List[DataPlatformTable]: data_platform_tables: List[DataPlatformTable] = [] + # Find out output variable as we are doing backtracking in M-Query output_variable: Optional[str] = tree_function.get_output_variable( self.parse_tree ) @@ -332,12 +486,14 @@ def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: ) return data_platform_tables + # Parse M-Query and use output_variable as root of tree and create instance of DataAccessFunctionDetail table_links: List[ DataAccessFunctionDetail ] = self.create_data_access_functional_detail(output_variable) # Each item is data-access function for f_detail in table_links: + # Get & Check if we support data-access-function available in M-Query supported_resolver = SupportedResolver.get_resolver( f_detail.data_access_function_name ) @@ -351,8 +507,14 @@ def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: ) continue + # From supported_resolver enum get respective resolver like AmazonRedshift or Snowflake or Oracle or NativeQuery and create instance of it + # & also pass additional information that will be need to generate urn table_full_name_creator: AbstractDataPlatformTableCreator = ( - supported_resolver.get_table_full_name_creator()() + supported_resolver.get_table_full_name_creator()( + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, + ) ) data_platform_tables.extend( @@ -393,18 +555,24 @@ def two_level_access_pattern( IdentifierAccessor, data_access_func_detail.identifier_accessor ).items["Item"] - full_table_name: str = f"{db_name}.{schema_name}.{table_name}" + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" logger.debug( - f"Platform({self.get_platform_pair().datahub_data_platform_name}) full_table_name= {full_table_name}" + f"Platform({self.get_platform_pair().datahub_data_platform_name}) qualified_table_name= {qualified_table_name}" + ) + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, ) return [ DataPlatformTable( - name=table_name, - full_name=full_table_name, - datasource_server=server, data_platform_pair=self.get_platform_pair(), + urn=urn, ) ] @@ -420,9 +588,48 @@ def get_platform_pair(self) -> DataPlatformPair: class MSSqlDataPlatformTableCreator(DefaultTwoStepDataAccessSources): + # https://learn.microsoft.com/en-us/sql/relational-databases/security/authentication-access/ownership-and-user-schema-separation?view=sql-server-ver16 + DEFAULT_SCHEMA = "dbo" # Default schema name in MS-SQL is dbo + def get_platform_pair(self) -> DataPlatformPair: return SupportedDataPlatform.MS_SQL.value + def create_urn_using_old_parser( + self, query: str, db_name: str, server: str + ) -> List[DataPlatformTable]: + dataplatform_tables: List[DataPlatformTable] = [] + + tables: List[str] = native_sql_parser.get_tables(query) + + for table in tables: + schema_and_table: List[str] = table.split(".") + if len(schema_and_table) == 1: + # schema name is not present. 
set default schema + schema_and_table.insert(0, MSSqlDataPlatformTableCreator.DEFAULT_SCHEMA) + + qualified_table_name = ( + f"{db_name}.{schema_and_table[0]}.{schema_and_table[1]}" + ) + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) + + dataplatform_tables.append( + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ) + + logger.debug(f"Generated upstream tables = {dataplatform_tables}") + + return dataplatform_tables + def create_dataplatform_tables( self, data_access_func_detail: DataAccessFunctionDetail ) -> List[DataPlatformTable]: @@ -442,28 +649,20 @@ def create_dataplatform_tables( logger.debug("Unsupported case is found. Second index is not the Query") return dataplatform_tables - db_name: str = arguments[1] - - tables: List[str] = native_sql_parser.get_tables(arguments[3]) - for table in tables: - schema_and_table: List[str] = table.split(".") - if len(schema_and_table) == 1: - # schema name is not present. Default schema name in MS-SQL is dbo - # https://learn.microsoft.com/en-us/sql/relational-databases/security/authentication-access/ownership-and-user-schema-separation?view=sql-server-ver16 - schema_and_table.insert(0, "dbo") - - dataplatform_tables.append( - DataPlatformTable( - name=schema_and_table[1], - full_name=f"{db_name}.{schema_and_table[0]}.{schema_and_table[1]}", - datasource_server=arguments[0], - data_platform_pair=self.get_platform_pair(), - ) + if self.config.enable_advance_lineage_sql_construct is False: + # Use previous parser to generate URN to keep backward compatibility + return self.create_urn_using_old_parser( + query=arguments[3], + db_name=arguments[1], + server=arguments[0], ) - logger.debug("MS-SQL full-table-names %s", dataplatform_tables) - - return dataplatform_tables + return self.parse_custom_sql( + query=arguments[3], + database=arguments[1], + server=arguments[0], + schema=MSSqlDataPlatformTableCreator.DEFAULT_SCHEMA, + ) class OracleDataPlatformTableCreator(AbstractDataPlatformTableCreator): @@ -510,12 +709,20 @@ def create_dataplatform_tables( cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next, ).items["Name"] + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) + return [ DataPlatformTable( - name=table_name, - full_name=f"{db_name}.{schema_name}.{table_name}", - datasource_server=server, data_platform_pair=self.get_platform_pair(), + urn=urn, ) ] @@ -547,14 +754,28 @@ def create_dataplatform_tables( db_name: str = value_dict["Database"] schema_name: str = value_dict["Schema"] table_name: str = value_dict["Table"] + + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" + server, _ = self.get_db_detail_from_argument(data_access_func_detail.arg_list) + if server is None: + logger.info( + f"server information is not available for {qualified_table_name}. 
Skipping upstream table" + ) + return [] + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) return [ DataPlatformTable( - name=table_name, - full_name=f"{db_name}.{schema_name}.{table_name}", - datasource_server=server if server else "", data_platform_pair=self.get_platform_pair(), + urn=urn, ) ] @@ -589,20 +810,26 @@ def create_dataplatform_tables( IdentifierAccessor, data_access_func_detail.identifier_accessor.next.next # type: ignore ).items["Name"] - full_table_name: str = f"{db_name}.{schema_name}.{table_name}" + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" logger.debug( - f"{self.get_platform_pair().datahub_data_platform_name} full-table-name {full_table_name}" + f"{self.get_platform_pair().datahub_data_platform_name} qualified_table_name {qualified_table_name}" + ) + + server: str = self.get_datasource_server(arguments, data_access_func_detail) + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, ) return [ DataPlatformTable( - name=table_name, - full_name=full_table_name, - datasource_server=self.get_datasource_server( - arguments, data_access_func_detail - ), data_platform_pair=self.get_platform_pair(), + urn=urn, ) ] @@ -654,12 +881,20 @@ def create_dataplatform_tables( cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next, ).items["Name"] + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) + return [ DataPlatformTable( - name=table_name, - full_name=f"{db_name}.{schema_name}.{table_name}", - datasource_server=server, data_platform_pair=self.get_platform_pair(), + urn=urn, ) ] @@ -681,6 +916,39 @@ def is_native_parsing_supported(data_access_function_name: str) -> bool: in NativeQueryDataPlatformTableCreator.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM ) + def create_urn_using_old_parser( + self, query: str, server: str + ) -> List[DataPlatformTable]: + dataplatform_tables: List[DataPlatformTable] = [] + + tables: List[str] = native_sql_parser.get_tables(query) + + for qualified_table_name in tables: + if len(qualified_table_name.split(".")) != 3: + logger.debug( + f"Skipping table {qualified_table_name} as it is not as per qualified_table_name format" + ) + continue + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) + + dataplatform_tables.append( + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ) + + logger.debug(f"Generated dataplatform_tables {dataplatform_tables}") + + return dataplatform_tables + def create_dataplatform_tables( self, data_access_func_detail: DataAccessFunctionDetail ) -> List[DataPlatformTable]: @@ -727,25 +995,21 @@ def create_dataplatform_tables( 0 ] # Remove any whitespaces and double quotes character - for table in native_sql_parser.get_tables(sql_query): - if len(table.split(".")) != 3: - logger.debug( - f"Skipping table {table} as it is not as per 
full_table_name format" - ) - continue + server = tree_function.strip_char_from_list([data_access_tokens[2]])[0] - dataplatform_tables.append( - DataPlatformTable( - name=table.split(".")[2], - full_name=table, - datasource_server=tree_function.strip_char_from_list( - [data_access_tokens[2]] - )[0], - data_platform_pair=self.get_platform_pair(), - ) + if self.config.enable_advance_lineage_sql_construct is False: + # Use previous parser to generate URN to keep backward compatibility + return self.create_urn_using_old_parser( + query=sql_query, + server=server, ) - return dataplatform_tables + return self.parse_custom_sql( + query=sql_query, + server=server, + database=None, # database and schema is available inside custom sql as per PowerBI Behavior + schema=None, + ) class FunctionName(Enum): diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index 919cb83e4d832..5d477ee090e7e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -28,7 +28,6 @@ ) from datahub.ingestion.source.powerbi.config import ( Constant, - PlatformDetail, PowerBiDashboardSourceConfig, PowerBiDashboardSourceReport, ) @@ -96,10 +95,12 @@ def __hash__(self): def __init__( self, + ctx: PipelineContext, config: PowerBiDashboardSourceConfig, reporter: PowerBiDashboardSourceReport, dataplatform_instance_resolver: AbstractDataPlatformInstanceResolver, ): + self.__ctx = ctx self.__config = config self.__reporter = reporter self.__dataplatform_instance_resolver = dataplatform_instance_resolver @@ -172,43 +173,40 @@ def extract_lineage( # table.dataset should always be set, but we check it just in case. parameters = table.dataset.parameters if table.dataset else {} - upstreams: List[UpstreamClass] = [] - upstream_tables: List[resolver.DataPlatformTable] = parser.get_upstream_tables( - table, self.__reporter, parameters=parameters + upstream: List[UpstreamClass] = [] + + upstream_dpts: List[resolver.DataPlatformTable] = parser.get_upstream_tables( + table=table, + reporter=self.__reporter, + platform_instance_resolver=self.__dataplatform_instance_resolver, + ctx=self.__ctx, + config=self.__config, + parameters=parameters, ) + logger.debug( - f"PowerBI virtual table {table.full_name} and it's upstream dataplatform tables = {upstream_tables}" + f"PowerBI virtual table {table.full_name} and it's upstream dataplatform tables = {upstream_dpts}" ) - for upstream_table in upstream_tables: + + for upstream_dpt in upstream_dpts: if ( - upstream_table.data_platform_pair.powerbi_data_platform_name + upstream_dpt.data_platform_pair.powerbi_data_platform_name not in self.__config.dataset_type_mapping.keys() ): logger.debug( - f"Skipping upstream table for {ds_urn}. The platform {upstream_table.data_platform_pair.powerbi_data_platform_name} is not part of dataset_type_mapping", + f"Skipping upstream table for {ds_urn}. 
The platform {upstream_dpt.data_platform_pair.powerbi_data_platform_name} is not part of dataset_type_mapping", ) continue - platform_detail: PlatformDetail = ( - self.__dataplatform_instance_resolver.get_platform_instance( - upstream_table - ) - ) - upstream_urn = builder.make_dataset_urn_with_platform_instance( - platform=upstream_table.data_platform_pair.datahub_data_platform_name, - platform_instance=platform_detail.platform_instance, - env=platform_detail.env, - name=self.lineage_urn_to_lowercase(upstream_table.full_name), - ) - upstream_table_class = UpstreamClass( - upstream_urn, + upstream_dpt.urn, DatasetLineageTypeClass.TRANSFORMED, ) - upstreams.append(upstream_table_class) - if len(upstreams) > 0: - upstream_lineage = UpstreamLineageClass(upstreams=upstreams) + upstream.append(upstream_table_class) + + if len(upstream) > 0: + upstream_lineage = UpstreamLineageClass(upstreams=upstream) logger.debug(f"Dataset urn = {ds_urn} and its lineage = {upstream_lineage}") mcp = MetadataChangeProposalWrapper( entityType=Constant.DATASET, @@ -1107,7 +1105,9 @@ def __init__(self, config: PowerBiDashboardSourceConfig, ctx: PipelineContext): ) # Exit pipeline as we are not able to connect to PowerBI API Service. This exit will avoid raising # unwanted stacktrace on console - self.mapper = Mapper(config, self.reporter, self.dataplatform_instance_resolver) + self.mapper = Mapper( + ctx, config, self.reporter, self.dataplatform_instance_resolver + ) # Create and register the stateful ingestion use-case handler. self.stale_entity_removal_handler = StaleEntityRemovalHandler.create( diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau.py index 6752bdf519830..ec0af37089b1d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau.py @@ -31,6 +31,7 @@ from tableauserverclient.server.endpoint.exceptions import NonXMLResponseError import datahub.emitter.mce_builder as builder +import datahub.utilities.sqlglot_lineage as sqlglot_l from datahub.configuration.common import ( AllowDenyPattern, ConfigModel, @@ -136,12 +137,7 @@ ViewPropertiesClass, ) from datahub.utilities import config_clean -from datahub.utilities.sqlglot_lineage import ( - ColumnLineageInfo, - SchemaResolver, - SqlParsingResult, - sqlglot_lineage, -) +from datahub.utilities.sqlglot_lineage import ColumnLineageInfo, SqlParsingResult logger: logging.Logger = logging.getLogger(__name__) @@ -1585,42 +1581,14 @@ def parse_custom_sql( f"Overridden info upstream_db={upstream_db}, platform_instance={platform_instance}, platform={platform}" ) - parsed_result: Optional["SqlParsingResult"] = None - try: - schema_resolver = ( - self.ctx.graph._make_schema_resolver( - platform=platform, - platform_instance=platform_instance, - env=env, - ) - if self.ctx.graph is not None - else SchemaResolver( - platform=platform, - platform_instance=platform_instance, - env=env, - graph=None, - ) - ) - - if schema_resolver.graph is None: - logger.warning( - "Column Level Lineage extraction would not work as DataHub graph client is None." - ) - - parsed_result = sqlglot_lineage( - query, - schema_resolver=schema_resolver, - default_db=upstream_db, - ) - except Exception as e: - self.report.report_warning( - key="csql-lineage", - reason=f"Unable to retrieve lineage from query. 
" - f"Query: {query} " - f"Reason: {str(e)} ", - ) - - return parsed_result + return sqlglot_l.create_lineage_sql_parsed_result( + query=query, + database=upstream_db, + platform=platform, + platform_instance=platform_instance, + env=env, + graph=self.ctx.graph, + ) def _create_lineage_from_unsupported_csql( self, csql_urn: str, csql: dict diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py index e5a9954802019..6d028c4ac1b9e 100644 --- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py @@ -825,3 +825,43 @@ def sqlglot_lineage( table_error=e, ), ) + + +def create_lineage_sql_parsed_result( + query: str, + database: Optional[str], + platform: str, + platform_instance: Optional[str], + env: str, + schema: Optional[str] = None, + graph: Optional[DataHubGraph] = None, +) -> Optional["SqlParsingResult"]: + + parsed_result: Optional["SqlParsingResult"] = None + try: + schema_resolver = ( + graph._make_schema_resolver( + platform=platform, + platform_instance=platform_instance, + env=env, + ) + if graph is not None + else SchemaResolver( + platform=platform, + platform_instance=platform_instance, + env=env, + graph=None, + ) + ) + + parsed_result = sqlglot_lineage( + query, + schema_resolver=schema_resolver, + default_db=database, + default_schema=schema, + ) + except Exception as e: + logger.debug(f"Fail to prase query {query}", exc_info=e) + logger.warning("Fail to parse custom SQL") + + return parsed_result diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index 5c9553402a8c4..e77a12aa4088e 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -1,17 +1,22 @@ import logging import sys -from typing import List +from typing import List, Tuple import pytest from lark import Tree import datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes as powerbi_data_classes -from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport -from datahub.ingestion.source.powerbi.m_query import parser, tree_function -from datahub.ingestion.source.powerbi.m_query.resolver import ( - DataPlatformTable, - SupportedDataPlatform, +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.source.powerbi.config import ( + PowerBiDashboardSourceConfig, + PowerBiDashboardSourceReport, +) +from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import ( + AbstractDataPlatformInstanceResolver, + create_dataplatform_instance_resolver, ) +from datahub.ingestion.source.powerbi.m_query import parser, tree_function +from datahub.ingestion.source.powerbi.m_query.resolver import DataPlatformTable M_QUERIES = [ 'let\n Source = Snowflake.Databases("bu10758.ap-unknown-2.fakecomputing.com","PBI_TEST_WAREHOUSE_PROD",[Role="PBI_TEST_MEMBER"]),\n PBI_TEST_Database = Source{[Name="PBI_TEST",Kind="Database"]}[Data],\n TEST_Schema = PBI_TEST_Database{[Name="TEST",Kind="Schema"]}[Data],\n TESTTABLE_Table = TEST_Schema{[Name="TESTTABLE",Kind="Table"]}[Data]\nin\n TESTTABLE_Table', @@ -38,9 +43,31 @@ 'let\n Source = AmazonRedshift.Database("redshift-url","dev"),\n public = Source{[Name="public"]}[Data],\n category1 = public{[Name="category"]}[Data]\nin\n category1', 'let\n Source = Value.NativeQuery(AmazonRedshift.Database("redshift-url","dev"), 
"select * from dev.public.category", null, [EnableFolding=true]) \n in Source', 'let\n Source = Databricks.Catalogs("adb-123.azuredatabricks.net", "/sql/1.0/endpoints/12345dc91aa25844", [Catalog=null, Database=null]),\n hive_metastore_Database = Source{[Name="hive_metastore",Kind="Database"]}[Data],\n sandbox_revenue_Schema = hive_metastore_Database{[Name="sandbox_revenue",Kind="Schema"]}[Data],\n public_consumer_price_index_Table = sandbox_revenue_Schema{[Name="public_consumer_price_index",Kind="Table"]}[Data],\n #"Renamed Columns" = Table.RenameColumns(public_consumer_price_index_Table,{{"Country", "country"}, {"Metric", "metric"}}),\n #"Inserted Year" = Table.AddColumn(#"Renamed Columns", "ID", each Date.Year([date_id]) + Date.Month([date_id]), Text.Type),\n #"Added Custom" = Table.AddColumn(#"Inserted Year", "Custom", each Text.Combine({Number.ToText(Date.Year([date_id])), Number.ToText(Date.Month([date_id])), [country]})),\n #"Removed Columns" = Table.RemoveColumns(#"Added Custom",{"ID"}),\n #"Renamed Columns1" = Table.RenameColumns(#"Removed Columns",{{"Custom", "ID"}}),\n #"Filtered Rows" = Table.SelectRows(#"Renamed Columns1", each ([metric] = "Consumer Price Index") and (not Number.IsNaN([value])))\nin\n #"Filtered Rows"', + "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu10758.ap-unknown-2.fakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select #(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS CLIENT_DIRECTOR,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS inner join OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT #(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Accounting'#(lf)and TARGET_TEAM = 'Enterprise'#(lf)AND TIER = 'Client Director'\", null, [EnableFolding=true])\nin\n Source", ] +def get_default_instances( + override_config: dict = {}, +) -> Tuple[ + PipelineContext, PowerBiDashboardSourceConfig, AbstractDataPlatformInstanceResolver +]: + config: PowerBiDashboardSourceConfig = PowerBiDashboardSourceConfig.parse_obj( + { + "tenant_id": "fake", + "client_id": "foo", + "client_secret": "bar", + **override_config, + } + ) + + platform_instance_resolver: AbstractDataPlatformInstanceResolver = ( + create_dataplatform_instance_resolver(config) + ) + + return PipelineContext(run_id="fake"), config, platform_instance_resolver + + @pytest.mark.integration def test_parse_m_query1(): expression: str = M_QUERIES[0] @@ -145,20 +172,20 @@ def test_snowflake_regular_case(): reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "TESTTABLE" - assert data_platform_tables[0].full_name == "PBI_TEST.TEST.TESTTABLE" assert ( - data_platform_tables[0].datasource_server - == "bu10758.ap-unknown-2.fakecomputing.com" - ) - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == 
SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,pbi_test.test.testtable,PROD)" ) @@ -174,17 +201,21 @@ def test_postgres_regular_case(): ) reporter = PowerBiDashboardSourceReport() + + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "order_date" - assert data_platform_tables[0].full_name == "mics.public.order_date" - assert data_platform_tables[0].datasource_server == "localhost" assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.POSTGRES_SQL.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:postgres,mics.public.order_date,PROD)" ) @@ -200,19 +231,21 @@ def test_databricks_regular_case(): ) reporter = PowerBiDashboardSourceReport() + + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "public_consumer_price_index" assert ( - data_platform_tables[0].full_name - == "hive_metastore.sandbox_revenue.public_consumer_price_index" - ) - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.DATABRICK_SQL.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:databricks,hive_metastore.sandbox_revenue.public_consumer_price_index,PROD)" ) @@ -228,17 +261,21 @@ def test_oracle_regular_case(): ) reporter = PowerBiDashboardSourceReport() + + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "EMPLOYEES" - assert data_platform_tables[0].full_name == "salesdb.HR.EMPLOYEES" - assert data_platform_tables[0].datasource_server == "localhost:1521" assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.ORACLE.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:oracle,salesdb.hr.employees,PROD)" ) @@ -255,17 +292,20 @@ def test_mssql_regular_case(): reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "book_issue" - assert data_platform_tables[0].full_name == "library.dbo.book_issue" - assert data_platform_tables[0].datasource_server == "localhost" assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == 
SupportedDataPlatform.MS_SQL.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:mssql,library.dbo.book_issue,PROD)" ) @@ -280,14 +320,16 @@ def test_mssql_with_query(): M_QUERIES[11], ] expected_tables = [ - "COMMOPSDB.dbo.V_OIP_ENT_2022", - "COMMOPSDB.dbo.V_INVOICE_BOOKING_2022", - "COMMOPSDB.dbo.V_ARR_ADDS", - "COMMOPSDB.dbo.V_PS_CD_RETENTION", - "COMMOPSDB.dbo.V_TPV_LEADERBOARD", - "COMMOPSDB.dbo.V_ENTERPRISE_INVOICED_REVENUE", + "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_oip_ent_2022,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_invoice_booking_2022,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_arr_adds,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_ps_cd_retention,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_tpv_leaderboard,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_enterprise_invoiced_revenue,PROD)", ] + ctx, config, platform_instance_resolver = get_default_instances() + for index, query in enumerate(mssql_queries): table: powerbi_data_classes.Table = powerbi_data_classes.Table( columns=[], @@ -299,17 +341,15 @@ def test_mssql_with_query(): reporter = PowerBiDashboardSourceReport() data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter, native_query_enabled=False + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == expected_tables[index].split(".")[2] - assert data_platform_tables[0].full_name == expected_tables[index] - assert data_platform_tables[0].datasource_server == "AUPRDWHDB" - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.MS_SQL.value.powerbi_data_platform_name - ) + assert data_platform_tables[0].urn == expected_tables[index] @pytest.mark.integration @@ -322,12 +362,14 @@ def test_snowflake_native_query(): ] expected_tables = [ - "OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4", - "OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS", - "OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS", - "OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_aps_sme_units_v4,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_sme_unit_targets,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_sme_unit_targets,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_sme_unit_targets,PROD)", ] + ctx, config, platform_instance_resolver = get_default_instances() + for index, query in enumerate(snowflake_queries): table: powerbi_data_classes.Table = powerbi_data_classes.Table( columns=[], @@ -339,20 +381,15 @@ def test_snowflake_native_query(): reporter = PowerBiDashboardSourceReport() data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == expected_tables[index].split(".")[2] - assert data_platform_tables[0].full_name == expected_tables[index] - assert ( - data_platform_tables[0].datasource_server - == 
"bu10758.ap-unknown-2.fakecomputing.com" - ) - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name - ) + assert data_platform_tables[0].urn == expected_tables[index] def test_google_bigquery_1(): @@ -363,16 +400,20 @@ def test_google_bigquery_1(): ) reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter, native_query_enabled=False + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) + assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == table.full_name.split(".")[2] - assert data_platform_tables[0].full_name == table.full_name - assert data_platform_tables[0].datasource_server == "seraphic-music-344307" assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.GOOGLE_BIGQUERY.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:bigquery,seraphic-music-344307.school_dataset.first,PROD)" ) @@ -387,23 +428,24 @@ def test_google_bigquery_2(): ) reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( table, reporter, - native_query_enabled=False, parameters={ "Parameter - Source": "my-test-project", "My bq project": "gcp_billing", }, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == table.full_name.split(".")[2] - assert data_platform_tables[0].full_name == table.full_name - assert data_platform_tables[0].datasource_server == "my-test-project" assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.GOOGLE_BIGQUERY.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:bigquery,my-test-project.gcp_billing.gcp_table,PROD)" ) @@ -416,23 +458,24 @@ def test_for_each_expression_1(): reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( table, reporter, - native_query_enabled=False, parameters={ "Parameter - Source": "my-test-project", "My bq project": "gcp_billing", }, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == table.full_name.split(".")[2] - assert data_platform_tables[0].datasource_server == "my-test-project" - assert data_platform_tables[0].full_name == table.full_name assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.GOOGLE_BIGQUERY.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:bigquery,my-test-project.universal.d_wh_date,PROD)" ) @@ -445,22 +488,23 @@ def test_for_each_expression_2(): reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( table, reporter, - native_query_enabled=False, parameters={ "dwh-prod": 
"originally-not-a-variable-ref-and-not-resolved", }, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == table.full_name.split(".")[2] - assert data_platform_tables[0].full_name == table.full_name - assert data_platform_tables[0].datasource_server == "dwh-prod" assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.GOOGLE_BIGQUERY.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:bigquery,dwh-prod.gcp_billing.d_gcp_custom_label,PROD)" ) @@ -476,8 +520,14 @@ def test_native_query_disabled(): reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + config.native_query_parsing = False data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter, native_query_enabled=False + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 0 @@ -493,26 +543,25 @@ def test_multi_source_table(): ) reporter = PowerBiDashboardSourceReport() + + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter, native_query_enabled=False + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 2 - assert data_platform_tables[0].full_name == "mics.public.order_date" - assert data_platform_tables[0].datasource_server == "localhost" - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.POSTGRES_SQL.value.powerbi_data_platform_name - ) - - assert data_platform_tables[1].full_name == "GSL_TEST_DB.PUBLIC.SALES_ANALYST_VIEW" assert ( - data_platform_tables[1].datasource_server - == "ghh48144.snowflakefakecomputing.com" + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:postgres,mics.public.order_date,PROD)" ) assert ( - data_platform_tables[1].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name + data_platform_tables[1].urn + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,gsl_test_db.public.sales_analyst_view,PROD)" ) @@ -521,36 +570,33 @@ def test_table_combine(): table: powerbi_data_classes.Table = powerbi_data_classes.Table( columns=[], measures=[], - expression=M_QUERIES[16], # 1st index has the native query + expression=M_QUERIES[16], name="virtual_order_table", full_name="OrderDataSet.virtual_order_table", ) reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 2 - assert data_platform_tables[0].full_name == "GSL_TEST_DB.PUBLIC.SALES_FORECAST" - assert ( - data_platform_tables[0].datasource_server - == "ghh48144.snowflakefakecomputing.com" - ) - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name - ) - assert data_platform_tables[1].full_name == "GSL_TEST_DB.PUBLIC.SALES_ANALYST" assert ( - 
data_platform_tables[1].datasource_server - == "ghh48144.snowflakefakecomputing.com" + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,gsl_test_db.public.sales_forecast,PROD)" ) + assert ( - data_platform_tables[1].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name + data_platform_tables[1].urn + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,gsl_test_db.public.sales_analyst,PROD)" ) @@ -574,8 +620,14 @@ def test_expression_is_none(): reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 0 @@ -589,15 +641,20 @@ def test_redshift_regular_case(): ) reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter, native_query_enabled=False + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) + assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == table.full_name.split(".")[2] - assert data_platform_tables[0].full_name == table.full_name assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.AMAZON_REDSHIFT.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.category,PROD)" ) @@ -609,13 +666,60 @@ def test_redshift_native_query(): ) reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + + config.native_query_parsing = True + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter, native_query_enabled=True + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) + assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == table.full_name.split(".")[2] - assert data_platform_tables[0].full_name == table.full_name assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.AMAZON_REDSHIFT.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.category,PROD)" + ) + + +def test_sqlglot_parser(): + table: powerbi_data_classes.Table = powerbi_data_classes.Table( + expression=M_QUERIES[24], + name="SALES_TARGET", + full_name="dev.public.sales", + ) + reporter = PowerBiDashboardSourceReport() + + ctx, config, platform_instance_resolver = get_default_instances( + override_config={ + "server_to_platform_instance": { + "bu10758.ap-unknown-2.fakecomputing.com": { + "platform_instance": "sales_deployment", + "env": "PROD", + } + }, + "native_query_parsing": True, + "enable_advance_lineage_sql_construct": True, + } + ) + + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, + ) + + assert len(data_platform_tables) == 2 + assert ( + data_platform_tables[0].urn + == 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,sales_deployment.operations_analytics.transformed_prod.v_sme_unit,PROD)" + ) + assert ( + data_platform_tables[1].urn + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,sales_deployment.operations_analytics.transformed_prod.v_sme_unit_targets,PROD)" ) diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index d04c8d905b439..71428a7847953 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -791,11 +791,9 @@ def test_tableau_unsupported_csql(mock_datahub_graph): database_override_map={"production database": "prod"} ) - with mock.patch( - "datahub.ingestion.source.tableau.sqlglot_lineage" - ) as sqlglot_lineage: + with mock.patch("datahub.ingestion.source.tableau.sqlglot_l") as sqlglot_lineage: - sqlglot_lineage.return_value = SqlParsingResult( # type:ignore + sqlglot_lineage.create_lineage_sql_parsed_result.return_value = SqlParsingResult( # type:ignore in_tables=[ "urn:li:dataset:(urn:li:dataPlatform:bigquery,my_bigquery_project.invent_dw.userdetail,PROD)" ], From 68abf9c6a1f0ccb9ad144247805781587c40ceeb Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Wed, 23 Aug 2023 07:25:51 -0400 Subject: [PATCH 09/11] build(ingest): Bump pydantic pin (#8660) --- metadata-ingestion/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 4ff1d06bb8c22..62cb4f1abb8cf 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -454,7 +454,7 @@ def get_long_description(): "mypy==1.0.0", # pydantic 1.8.2 is incompatible with mypy 0.910. # See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910. 
- "pydantic>=1.9.0", + "pydantic>=1.10.0", *test_api_requirements, pytest_dep, "pytest-asyncio>=0.16.0", From 8141e2d64920f0511c531c493a3b61b5dc2ca026 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Wed, 23 Aug 2023 15:57:46 -0400 Subject: [PATCH 10/11] remove(ingest/snowflake): Remove legacy snowflake lineage (#8653) Co-authored-by: Tamas Nemeth Co-authored-by: Aseem Bansal --- .../source/snowflake/snowflake_config.py | 11 +- .../snowflake/snowflake_lineage_legacy.py | 664 ------------------ .../source/snowflake/snowflake_query.py | 29 - .../source/snowflake/snowflake_v2.py | 18 +- .../tests/integration/snowflake/common.py | 9 - .../integration/snowflake/test_snowflake.py | 2 - .../snowflake/test_snowflake_failures.py | 1 - .../test_snowflake_failures_legacy_lineage.py | 291 -------- .../test_snowflake_legacy_lineage.py | 207 ------ 9 files changed, 6 insertions(+), 1226 deletions(-) delete mode 100644 metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_legacy.py delete mode 100644 metadata-ingestion/tests/integration/snowflake/test_snowflake_failures_legacy_lineage.py delete mode 100644 metadata-ingestion/tests/integration/snowflake/test_snowflake_legacy_lineage.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index e8e80e172a9ce..7699d89ce9ac2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -91,13 +91,8 @@ class SnowflakeV2Config( description="Whether `schema_pattern` is matched against fully qualified schema name `.`.", ) - use_legacy_lineage_method: bool = Field( - default=False, - description=( - "Whether to use the legacy lineage computation method. " - "By default, uses new optimised lineage extraction method that requires less ingestion process memory. " - "Table-to-view and view-to-view column-level lineage are not supported with the legacy method." - ), + _use_legacy_lineage_method_removed = pydantic_removed_field( + "use_legacy_lineage_method" ) validate_upstreams_against_patterns: bool = Field( @@ -113,7 +108,7 @@ class SnowflakeV2Config( # This is required since access_history table does not capture whether the table was temporary table. temporary_tables_pattern: List[str] = Field( default=DEFAULT_TABLES_DENY_LIST, - description="[Advanced] Regex patterns for temporary tables to filter in lineage ingestion. Specify regex to match the entire table name in database.schema.table format. Defaults are to set in such a way to ignore the temporary staging tables created by known ETL tools. Not used if `use_legacy_lineage_method=True`", + description="[Advanced] Regex patterns for temporary tables to filter in lineage ingestion. Specify regex to match the entire table name in database.schema.table format. 
Defaults are to set in such a way to ignore the temporary staging tables created by known ETL tools.", ) rename_upstreams_deny_pattern_to_temporary_table_pattern = pydantic_renamed_field( diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_legacy.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_legacy.py deleted file mode 100644 index 832a072c619f8..0000000000000 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_legacy.py +++ /dev/null @@ -1,664 +0,0 @@ -import json -import logging -from collections import defaultdict -from dataclasses import dataclass, field -from typing import Any, Callable, Dict, FrozenSet, Iterable, List, Optional, Set - -from pydantic import Field -from pydantic.error_wrappers import ValidationError -from snowflake.connector import SnowflakeConnection - -import datahub.emitter.mce_builder as builder -from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.aws.s3_util import make_s3_urn -from datahub.ingestion.source.snowflake.constants import ( - LINEAGE_PERMISSION_ERROR, - SnowflakeEdition, - SnowflakeObjectDomain, -) -from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config -from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery -from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report -from datahub.ingestion.source.snowflake.snowflake_usage_v2 import ( - SnowflakeColumnReference, -) -from datahub.ingestion.source.snowflake.snowflake_utils import ( - SnowflakeCommonMixin, - SnowflakeConnectionMixin, - SnowflakePermissionError, - SnowflakeQueryMixin, -) -from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( - FineGrainedLineage, - FineGrainedLineageDownstreamType, - FineGrainedLineageUpstreamType, - UpstreamLineage, -) -from datahub.metadata.schema_classes import DatasetLineageTypeClass, UpstreamClass -from datahub.utilities.perf_timer import PerfTimer - -logger: logging.Logger = logging.getLogger(__name__) - - -class SnowflakeColumnWithLineage(SnowflakeColumnReference): - class Config: - # This is for backward compatibility and can be removed later - allow_population_by_field_name = True - - directSourceColumns: Optional[List[SnowflakeColumnReference]] = Field( - default=None, alias="directSources" - ) - - -@dataclass(frozen=True) -class SnowflakeColumnId: - columnName: str - objectName: str - objectDomain: Optional[str] = None - - -@dataclass(frozen=True) -class SnowflakeColumnFineGrainedLineage: - """ - Fie grained upstream of column, - which represents a transformation applied on input columns""" - - inputColumns: FrozenSet[SnowflakeColumnId] - # Transform function, query etc can be added here - - -@dataclass -class SnowflakeColumnUpstreams: - """All upstreams of a column""" - - upstreams: Set[SnowflakeColumnFineGrainedLineage] = field( - default_factory=set, init=False - ) - - def update_column_lineage( - self, directSourceColumns: List[SnowflakeColumnReference] - ) -> None: - input_columns = frozenset( - [ - SnowflakeColumnId( - upstream_col.columnName, - upstream_col.objectName, - upstream_col.objectDomain, - ) - for upstream_col in directSourceColumns - if upstream_col.objectName - ] - ) - if not input_columns: - return - upstream = SnowflakeColumnFineGrainedLineage(inputColumns=input_columns) - if upstream not in self.upstreams: - self.upstreams.add(upstream) - - -@dataclass 
-class SnowflakeUpstreamTable: - upstreamDataset: str - upstreamColumns: List[SnowflakeColumnReference] - downstreamColumns: List[SnowflakeColumnWithLineage] - - @classmethod - def from_dict( - cls, - dataset: str, - upstreams_columns_json: Optional[str], - downstream_columns_json: Optional[str], - ) -> "SnowflakeUpstreamTable": - try: - upstreams_columns_list = [] - downstream_columns_list = [] - if upstreams_columns_json is not None: - upstreams_columns_list = json.loads(upstreams_columns_json) - if downstream_columns_json is not None: - downstream_columns_list = json.loads(downstream_columns_json) - - table_with_upstreams = cls( - dataset, - [ - SnowflakeColumnReference.parse_obj(col) - for col in upstreams_columns_list - ], - [ - SnowflakeColumnWithLineage.parse_obj(col) - for col in downstream_columns_list - ], - ) - except ValidationError: - # Earlier versions of column lineage did not include columnName, only columnId - table_with_upstreams = cls(dataset, [], []) - return table_with_upstreams - - -@dataclass -class SnowflakeTableLineage: - # key: upstream table name - upstreamTables: Dict[str, SnowflakeUpstreamTable] = field( - default_factory=dict, init=False - ) - - # key: downstream column name - columnLineages: Dict[str, SnowflakeColumnUpstreams] = field( - default_factory=lambda: defaultdict(SnowflakeColumnUpstreams), init=False - ) - - def update_lineage( - self, table: SnowflakeUpstreamTable, include_column_lineage: bool = True - ) -> None: - if table.upstreamDataset not in self.upstreamTables.keys(): - self.upstreamTables[table.upstreamDataset] = table - - if include_column_lineage and table.downstreamColumns: - for col in table.downstreamColumns: - if col.directSourceColumns: - self.columnLineages[col.columnName].update_column_lineage( - col.directSourceColumns - ) - - -class SnowflakeLineageExtractor( - SnowflakeQueryMixin, SnowflakeConnectionMixin, SnowflakeCommonMixin -): - """ - Extracts Lineage from Snowflake. - Following lineage edges are considered. - - 1. "Table to View" lineage via `snowflake.account_usage.object_dependencies` view - 2. "S3 to Table" lineage via `show external tables` query. - 3. "View to Table" lineage via `snowflake.account_usage.access_history` view (requires Snowflake Enterprise Edition or above) - 4. "Table to Table" lineage via `snowflake.account_usage.access_history` view (requires Snowflake Enterprise Edition or above) - 5. "S3 to Table" lineage via `snowflake.account_usage.access_history` view (requires Snowflake Enterprise Edition or above) - - Edition Note - Snowflake Standard Edition does not have Access History Feature. So it does not support lineage extraction for edges 3, 4, 5 mentioned above. 
- """ - - def __init__( - self, - config: SnowflakeV2Config, - report: SnowflakeV2Report, - dataset_urn_builder: Callable[[str], str], - ) -> None: - self._lineage_map: Dict[str, SnowflakeTableLineage] = defaultdict( - SnowflakeTableLineage - ) - self._external_lineage_map: Dict[str, Set[str]] = defaultdict(set) - self.config = config - self.report = report - self.logger = logger - self.dataset_urn_builder = dataset_urn_builder - self.connection: Optional[SnowflakeConnection] = None - - # Kwargs used by new snowflake lineage extractor need to be ignored here - def get_workunits( - self, discovered_tables: List[str], discovered_views: List[str], **_kwargs: Any - ) -> Iterable[MetadataWorkUnit]: - self.connection = self.create_connection() - if self.connection is None: - return - - self._populate_table_lineage() - - if self.config.include_view_lineage: - if len(discovered_views) > 0: - self._populate_view_lineage() - else: - logger.info("No views found. Skipping View Lineage Extraction.") - - self._populate_external_lineage() - - if ( - len(self._lineage_map.keys()) == 0 - and len(self._external_lineage_map.keys()) == 0 - ): - logger.debug("No lineage found.") - return - - yield from self.get_table_upstream_workunits(discovered_tables) - yield from self.get_view_upstream_workunits(discovered_views) - - def _populate_table_lineage(self): - if self.report.edition == SnowflakeEdition.STANDARD: - logger.info( - "Snowflake Account is Standard Edition. Table to Table Lineage Feature is not supported." - ) # See Edition Note above for why - else: - with PerfTimer() as timer: - self._populate_lineage() - self.report.table_lineage_query_secs = timer.elapsed_seconds() - - def get_table_upstream_workunits(self, discovered_tables): - if self.config.include_table_lineage: - for dataset_name in discovered_tables: - upstream_lineage = self._get_upstream_lineage_info(dataset_name) - if upstream_lineage is not None: - yield MetadataChangeProposalWrapper( - entityUrn=self.dataset_urn_builder(dataset_name), - aspect=upstream_lineage, - ).as_workunit() - - def get_view_upstream_workunits(self, discovered_views): - if self.config.include_view_lineage: - for view_name in discovered_views: - upstream_lineage = self._get_upstream_lineage_info(view_name) - if upstream_lineage is not None: - yield MetadataChangeProposalWrapper( - entityUrn=self.dataset_urn_builder(view_name), - aspect=upstream_lineage, - ).as_workunit() - - def _get_upstream_lineage_info( - self, dataset_name: str - ) -> Optional[UpstreamLineage]: - lineage = self._lineage_map[dataset_name] - external_lineage = self._external_lineage_map[dataset_name] - if not (lineage.upstreamTables or lineage.columnLineages or external_lineage): - logger.debug(f"No lineage found for {dataset_name}") - return None - - upstream_tables: List[UpstreamClass] = [] - finegrained_lineages: List[FineGrainedLineage] = [] - - # Populate the table-lineage in aspect - self.update_upstream_tables_lineage(upstream_tables, lineage) - - # Populate the column-lineage in aspect - self.update_upstream_columns_lineage( - self.dataset_urn_builder(dataset_name), finegrained_lineages, lineage - ) - - # Populate the external-table-lineage(s3->snowflake) in aspect - self.update_external_tables_lineage(upstream_tables, external_lineage) - - if len(upstream_tables) > 0: - logger.debug( - f"Upstream lineage of '{dataset_name}': {[u.dataset for u in upstream_tables]}" - ) - if self.config.upstream_lineage_in_report: - self.report.upstream_lineage[dataset_name] = [ - u.dataset for u in 
upstream_tables - ] - return UpstreamLineage( - upstreams=upstream_tables, - fineGrainedLineages=sorted( - finegrained_lineages, key=lambda x: (x.downstreams, x.upstreams) - ) - or None, - ) - else: - return None - - def _populate_view_lineage(self) -> None: - with PerfTimer() as timer: - self._populate_view_upstream_lineage() - self.report.view_upstream_lineage_query_secs = timer.elapsed_seconds() - - if self.report.edition == SnowflakeEdition.STANDARD: - logger.info( - "Snowflake Account is Standard Edition. View to Table Lineage Feature is not supported." - ) # See Edition Note above for why - else: - with PerfTimer() as timer: - self._populate_view_downstream_lineage() - self.report.view_downstream_lineage_query_secs = timer.elapsed_seconds() - - def _populate_external_lineage(self) -> None: - with PerfTimer() as timer: - self.report.num_external_table_edges_scanned = 0 - - if self.report.edition == SnowflakeEdition.STANDARD: - logger.info( - "Snowflake Account is Standard Edition. External Lineage Feature via Access History is not supported." - ) # See Edition Note above for why - else: - self._populate_external_lineage_from_access_history() - - self._populate_external_lineage_from_show_query() - - logger.info( - f"Found {self.report.num_external_table_edges_scanned} external lineage edges." - ) - - self.report.external_lineage_queries_secs = timer.elapsed_seconds() - - # Handles the case for explicitly created external tables. - # NOTE: Snowflake does not log this information to the access_history table. - def _populate_external_lineage_from_show_query(self): - external_tables_query: str = SnowflakeQuery.show_external_tables() - try: - for db_row in self.query(external_tables_query): - key = self.get_dataset_identifier( - db_row["name"], db_row["schema_name"], db_row["database_name"] - ) - - if not self._is_dataset_pattern_allowed( - key, SnowflakeObjectDomain.TABLE - ): - continue - self._external_lineage_map[key].add(db_row["location"]) - logger.debug( - f"ExternalLineage[Table(Down)={key}]:External(Up)={self._external_lineage_map[key]} via show external tables" - ) - self.report.num_external_table_edges_scanned += 1 - except Exception as e: - logger.debug(e, exc_info=e) - self.report_warning( - "external_lineage", - f"Populating external table lineage from Snowflake failed due to error {e}.", - ) - - # Handles the case where a table is populated from an external location via copy. - # Eg: copy into category_english from 's3://acryl-snow-demo-olist/olist_raw_data/category_english'credentials=(aws_key_id='...' aws_secret_key='...') pattern='.*.csv'; - def _populate_external_lineage_from_access_history(self): - query: str = SnowflakeQuery.external_table_lineage_history( - start_time_millis=int(self.config.start_time.timestamp() * 1000) - if not self.config.ignore_start_time_lineage - else 0, - end_time_millis=int(self.config.end_time.timestamp() * 1000), - ) - - try: - for db_row in self.query(query): - self._process_external_lineage_result_row(db_row) - except Exception as e: - if isinstance(e, SnowflakePermissionError): - error_msg = "Failed to get external lineage. Please grant imported privileges on SNOWFLAKE database. 
" - self.warn_if_stateful_else_error(LINEAGE_PERMISSION_ERROR, error_msg) - else: - logger.debug(e, exc_info=e) - self.report_warning( - "external_lineage", - f"Populating table external lineage from Snowflake failed due to error {e}.", - ) - - def _process_external_lineage_result_row(self, db_row): - # key is the down-stream table name - key: str = self.get_dataset_identifier_from_qualified_name( - db_row["DOWNSTREAM_TABLE_NAME"] - ) - if not self._is_dataset_pattern_allowed(key, SnowflakeObjectDomain.TABLE): - return - - if db_row["UPSTREAM_LOCATIONS"] is not None: - external_locations = json.loads(db_row["UPSTREAM_LOCATIONS"]) - - for loc in external_locations: - if loc not in self._external_lineage_map[key]: - self._external_lineage_map[key].add(loc) - self.report.num_external_table_edges_scanned += 1 - - logger.debug( - f"ExternalLineage[Table(Down)={key}]:External(Up)={self._external_lineage_map[key]} via access_history" - ) - - def _populate_lineage(self) -> None: - query: str = SnowflakeQuery.table_to_table_lineage_history( - start_time_millis=int(self.config.start_time.timestamp() * 1000) - if not self.config.ignore_start_time_lineage - else 0, - end_time_millis=int(self.config.end_time.timestamp() * 1000), - include_column_lineage=self.config.include_column_lineage, - ) - self.report.num_table_to_table_edges_scanned = 0 - try: - for db_row in self.query(query): - self._process_table_lineage_row(db_row) - except Exception as e: - if isinstance(e, SnowflakePermissionError): - error_msg = "Failed to get table to table lineage. Please grant imported privileges on SNOWFLAKE database. " - self.warn_if_stateful_else_error(LINEAGE_PERMISSION_ERROR, error_msg) - else: - logger.debug(e, exc_info=e) - self.report_warning( - "table-lineage", - f"Extracting lineage from Snowflake failed due to error {e}.", - ) - logger.info( - f"A total of {self.report.num_table_to_table_edges_scanned} Table->Table edges found" - f" for {len(self._lineage_map)} downstream tables.", - ) - - def _process_table_lineage_row(self, db_row): - # key is the down-stream table name - key: str = self.get_dataset_identifier_from_qualified_name( - db_row["DOWNSTREAM_TABLE_NAME"] - ) - upstream_table_name = self.get_dataset_identifier_from_qualified_name( - db_row["UPSTREAM_TABLE_NAME"] - ) - if not self._is_dataset_pattern_allowed( - key, SnowflakeObjectDomain.TABLE - ) or not ( - self._is_dataset_pattern_allowed( - upstream_table_name, SnowflakeObjectDomain.TABLE, is_upstream=True - ) - ): - return - self._lineage_map[key].update_lineage( - # (, , ) - SnowflakeUpstreamTable.from_dict( - upstream_table_name, - db_row["UPSTREAM_TABLE_COLUMNS"], - db_row["DOWNSTREAM_TABLE_COLUMNS"], - ), - self.config.include_column_lineage, - ) - self.report.num_table_to_table_edges_scanned += 1 - logger.debug(f"Lineage[Table(Down)={key}]:Table(Up)={self._lineage_map[key]}") - - def _populate_view_upstream_lineage(self) -> None: - # NOTE: This query captures only the upstream lineage of a view (with no column lineage). - # For more details see: https://docs.snowflake.com/en/user-guide/object-dependencies.html#object-dependencies - # and also https://docs.snowflake.com/en/sql-reference/account-usage/access_history.html#usage-notes for current limitations on capturing the lineage for views. 
- view_upstream_lineage_query: str = SnowflakeQuery.view_dependencies() - - self.report.num_table_to_view_edges_scanned = 0 - - try: - for db_row in self.query(view_upstream_lineage_query): - self._process_view_upstream_lineage_row(db_row) - except Exception as e: - if isinstance(e, SnowflakePermissionError): - error_msg = "Failed to get table to view lineage. Please grant imported privileges on SNOWFLAKE database." - self.warn_if_stateful_else_error(LINEAGE_PERMISSION_ERROR, error_msg) - else: - logger.debug(e, exc_info=e) - self.report_warning( - "view-upstream-lineage", - f"Extracting the upstream view lineage from Snowflake failed due to error {e}.", - ) - logger.info( - f"A total of {self.report.num_table_to_view_edges_scanned} View upstream edges found." - ) - - def _process_view_upstream_lineage_row(self, db_row): - # Process UpstreamTable/View/ExternalTable/Materialized View->View edge. - view_upstream: str = self.get_dataset_identifier_from_qualified_name( - db_row["VIEW_UPSTREAM"] - ) - view_name: str = self.get_dataset_identifier_from_qualified_name( - db_row["DOWNSTREAM_VIEW"] - ) - - if not self._is_dataset_pattern_allowed( - dataset_name=view_name, - dataset_type=db_row["REFERENCING_OBJECT_DOMAIN"], - ) or not self._is_dataset_pattern_allowed( - view_upstream, db_row["REFERENCED_OBJECT_DOMAIN"], is_upstream=True - ): - return - # key is the downstream view name - self._lineage_map[view_name].update_lineage( - # (, , ) - SnowflakeUpstreamTable.from_dict(view_upstream, None, None), - self.config.include_column_lineage, - ) - self.report.num_table_to_view_edges_scanned += 1 - logger.debug( - f"Upstream->View: Lineage[View(Down)={view_name}]:Upstream={view_upstream}" - ) - - def _populate_view_downstream_lineage(self) -> None: - # This query captures the downstream table lineage for views. - # See https://docs.snowflake.com/en/sql-reference/account-usage/access_history.html#usage-notes for current limitations on capturing the lineage for views. - # Eg: For viewA->viewB->ViewC->TableD, snowflake does not yet log intermediate view logs, resulting in only the viewA->TableD edge. - view_lineage_query: str = SnowflakeQuery.view_lineage_history( - start_time_millis=int(self.config.start_time.timestamp() * 1000) - if not self.config.ignore_start_time_lineage - else 0, - end_time_millis=int(self.config.end_time.timestamp() * 1000), - include_column_lineage=self.config.include_column_lineage, - ) - - self.report.num_view_to_table_edges_scanned = 0 - - try: - for db_row in self.query(view_lineage_query): - self._process_view_downstream_lineage_row(db_row) - except Exception as e: - if isinstance(e, SnowflakePermissionError): - error_msg = "Failed to get view to table lineage. Please grant imported privileges on SNOWFLAKE database. " - self.warn_if_stateful_else_error(LINEAGE_PERMISSION_ERROR, error_msg) - else: - logger.debug(e, exc_info=e) - self.report_warning( - "view-downstream-lineage", - f"Extracting the view lineage from Snowflake failed due to error {e}.", - ) - - logger.info( - f"Found {self.report.num_view_to_table_edges_scanned} View->Table edges." 
- ) - - def _process_view_downstream_lineage_row(self, db_row): - view_name: str = self.get_dataset_identifier_from_qualified_name( - db_row["VIEW_NAME"] - ) - downstream_table: str = self.get_dataset_identifier_from_qualified_name( - db_row["DOWNSTREAM_TABLE_NAME"] - ) - if not self._is_dataset_pattern_allowed( - view_name, db_row["VIEW_DOMAIN"], is_upstream=True - ) or not self._is_dataset_pattern_allowed( - downstream_table, db_row["DOWNSTREAM_TABLE_DOMAIN"] - ): - return - - # Capture view->downstream table lineage. - self._lineage_map[downstream_table].update_lineage( - # (, , ) - SnowflakeUpstreamTable.from_dict( - view_name, - db_row["VIEW_COLUMNS"], - db_row["DOWNSTREAM_TABLE_COLUMNS"], - ), - self.config.include_column_lineage, - ) - self.report.num_view_to_table_edges_scanned += 1 - - logger.debug( - f"View->Table: Lineage[Table(Down)={downstream_table}]:View(Up)={self._lineage_map[downstream_table]}" - ) - - def update_upstream_tables_lineage( - self, upstream_tables: List[UpstreamClass], lineage: SnowflakeTableLineage - ) -> None: - for lineage_entry in sorted( - lineage.upstreamTables.values(), key=lambda x: x.upstreamDataset - ): - upstream_table_name = lineage_entry.upstreamDataset - upstream_table = UpstreamClass( - dataset=self.dataset_urn_builder(upstream_table_name), - type=DatasetLineageTypeClass.TRANSFORMED, - ) - upstream_tables.append(upstream_table) - - def update_upstream_columns_lineage( - self, - dataset_urn: str, - finegrained_lineages: List[FineGrainedLineage], - lineage: SnowflakeTableLineage, - ) -> None: - # For every column for which upstream lineage is available - for col, col_upstreams in lineage.columnLineages.items(): - # For every upstream of column - self.update_upstream_columns_lineage_of_column( - dataset_urn, col, finegrained_lineages, col_upstreams - ) - - def update_upstream_columns_lineage_of_column( - self, - dataset_urn: str, - col: str, - finegrained_lineages: List[FineGrainedLineage], - col_upstreams: SnowflakeColumnUpstreams, - ) -> None: - for fine_upstream in col_upstreams.upstreams: - finegrained_lineage_entry = self.build_finegrained_lineage( - dataset_urn, col, fine_upstream - ) - if finegrained_lineage_entry.upstreams: - finegrained_lineages.append(finegrained_lineage_entry) - - def build_finegrained_lineage( - self, - dataset_urn: str, - col: str, - fine_upstream: SnowflakeColumnFineGrainedLineage, - ) -> FineGrainedLineage: - fieldPath = col - - column_upstreams = self.build_finegrained_lineage_upstreams(fine_upstream) - finegrained_lineage_entry = FineGrainedLineage( - upstreamType=FineGrainedLineageUpstreamType.FIELD_SET, - # Sorting the list of upstream lineage events in order to avoid creating multiple aspects in backend - # even if the lineage is same but the order is different. 
- upstreams=sorted(column_upstreams), - downstreamType=FineGrainedLineageDownstreamType.FIELD, - downstreams=[ - builder.make_schema_field_urn( - dataset_urn, self.snowflake_identifier(fieldPath) - ) - ], - ) - - return finegrained_lineage_entry - - def build_finegrained_lineage_upstreams( - self, fine_upstream: SnowflakeColumnFineGrainedLineage - ) -> List[str]: - column_upstreams = [] - for upstream_col in fine_upstream.inputColumns: - if ( - upstream_col.objectName - and upstream_col.columnName - and self._is_dataset_pattern_allowed( - upstream_col.objectName, upstream_col.objectDomain, is_upstream=True - ) - ): - upstream_dataset_name = self.get_dataset_identifier_from_qualified_name( - upstream_col.objectName - ) - column_upstreams.append( - builder.make_schema_field_urn( - self.dataset_urn_builder(upstream_dataset_name), - self.snowflake_identifier(upstream_col.columnName), - ) - ) - return column_upstreams - - def update_external_tables_lineage( - self, upstream_tables: List[UpstreamClass], external_lineage: Set[str] - ) -> None: - for external_lineage_entry in sorted(external_lineage): - # For now, populate only for S3 - if external_lineage_entry.startswith("s3://"): - external_upstream_table = UpstreamClass( - dataset=make_s3_urn(external_lineage_entry, self.config.env), - type=DatasetLineageTypeClass.COPY, - ) - upstream_tables.append(external_upstream_table) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py index 587c71a98be67..0f89324f5efc6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py @@ -506,35 +506,6 @@ def view_dependencies_v2() -> str: def show_external_tables() -> str: return "show external tables in account" - # Note - This method should be removed once legacy lineage is removed - @staticmethod - def external_table_lineage_history( - start_time_millis: int, end_time_millis: int - ) -> str: - return f""" - WITH external_table_lineage_history AS ( - SELECT - r.value:"locations" AS upstream_locations, - w.value:"objectName"::varchar AS downstream_table_name, - w.value:"objectDomain"::varchar AS downstream_table_domain, - w.value:"columns" AS downstream_table_columns, - t.query_start_time AS query_start_time - FROM - (SELECT * from snowflake.account_usage.access_history) t, - lateral flatten(input => t.BASE_OBJECTS_ACCESSED) r, - lateral flatten(input => t.OBJECTS_MODIFIED) w - WHERE r.value:"locations" IS NOT NULL - AND w.value:"objectId" IS NOT NULL - AND t.query_start_time >= to_timestamp_ltz({start_time_millis}, 3) - AND t.query_start_time < to_timestamp_ltz({end_time_millis}, 3)) - SELECT - upstream_locations AS "UPSTREAM_LOCATIONS", - downstream_table_name AS "DOWNSTREAM_TABLE_NAME", - downstream_table_columns AS "DOWNSTREAM_TABLE_COLUMNS" - FROM external_table_lineage_history - WHERE downstream_table_domain = '{SnowflakeObjectDomain.TABLE.capitalize()}' - QUALIFY ROW_NUMBER() OVER (PARTITION BY downstream_table_name ORDER BY query_start_time DESC) = 1""" - @staticmethod def copy_lineage_history( start_time_millis: int, diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 7dd51d5b20e8e..40c4d32525a51 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ 
b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -51,9 +51,6 @@ SnowflakeV2Config, TagOption, ) -from datahub.ingestion.source.snowflake.snowflake_lineage_legacy import ( - SnowflakeLineageExtractor as SnowflakeLineageLegacyExtractor, -) from datahub.ingestion.source.snowflake.snowflake_lineage_v2 import ( SnowflakeLineageExtractor, ) @@ -240,19 +237,10 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): # For database, schema, tables, views, etc self.data_dictionary = SnowflakeDataDictionary() - self.lineage_extractor: Union[ - SnowflakeLineageExtractor, SnowflakeLineageLegacyExtractor - ] if config.include_table_lineage: - # For lineage - if self.config.use_legacy_lineage_method: - self.lineage_extractor = SnowflakeLineageLegacyExtractor( - config, self.report, dataset_urn_builder=self.gen_dataset_urn - ) - else: - self.lineage_extractor = SnowflakeLineageExtractor( - config, self.report, dataset_urn_builder=self.gen_dataset_urn - ) + self.lineage_extractor = SnowflakeLineageExtractor( + config, self.report, dataset_urn_builder=self.gen_dataset_urn + ) if config.include_usage_stats or config.include_operational_stats: self.usage_extractor = SnowflakeUsageExtractor( diff --git a/metadata-ingestion/tests/integration/snowflake/common.py b/metadata-ingestion/tests/integration/snowflake/common.py index 43f5e04fbc89f..81e307a78ae9e 100644 --- a/metadata-ingestion/tests/integration/snowflake/common.py +++ b/metadata-ingestion/tests/integration/snowflake/common.py @@ -434,11 +434,6 @@ def default_query_results( # noqa: C901 } for op_idx in range(1, num_ops + 1) ] - elif query == snowflake_query.SnowflakeQuery.external_table_lineage_history( - 1654473600000, - 1654586220000, - ): - return [] elif query in [ snowflake_query.SnowflakeQuery.view_dependencies(), ]: @@ -509,10 +504,6 @@ def default_query_results( # noqa: C901 } ] elif query in [ - snowflake_query.SnowflakeQuery.external_table_lineage_history( - 1654473600000, - 1654586220000, - ), snowflake_query.SnowflakeQuery.view_dependencies_v2(), snowflake_query.SnowflakeQuery.view_dependencies(), snowflake_query.SnowflakeQuery.show_external_tables(), diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py index 53b2bcb236cd9..6135b0b3b3274 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py @@ -121,7 +121,6 @@ def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph): include_table_lineage=True, include_view_lineage=True, include_usage_stats=True, - use_legacy_lineage_method=False, validate_upstreams_against_patterns=False, include_operational_stats=True, email_as_user_identifier=True, @@ -213,7 +212,6 @@ def test_snowflake_private_link(pytestconfig, tmp_path, mock_time, mock_datahub_ include_column_lineage=False, include_views=False, include_view_lineage=False, - use_legacy_lineage_method=False, include_usage_stats=False, include_operational_stats=False, start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace( diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py index 73a261bb3cb6e..4963e71ae4d96 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py @@ -55,7 +55,6 @@ def 
snowflake_pipeline_config(tmp_path): schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), include_view_lineage=False, include_usage_stats=False, - use_legacy_lineage_method=False, start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace( tzinfo=timezone.utc ), diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures_legacy_lineage.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures_legacy_lineage.py deleted file mode 100644 index a5993793e574d..0000000000000 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures_legacy_lineage.py +++ /dev/null @@ -1,291 +0,0 @@ -from datetime import datetime, timezone -from typing import cast -from unittest import mock - -from freezegun import freeze_time -from pytest import fixture - -from datahub.configuration.common import AllowDenyPattern, DynamicTypedConfig -from datahub.ingestion.run.pipeline import Pipeline -from datahub.ingestion.run.pipeline_config import PipelineConfig, SourceConfig -from datahub.ingestion.source.snowflake import snowflake_query -from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config -from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery -from tests.integration.snowflake.common import ( - FROZEN_TIME, - NUM_TABLES, - default_query_results, -) - - -def query_permission_error_override(fn, override_for_query, error_msg): - def my_function(query): - if query in override_for_query: - raise Exception(error_msg) - else: - return fn(query) - - return my_function - - -def query_permission_response_override(fn, override_for_query, response): - def my_function(query): - if query in override_for_query: - return response - else: - return fn(query) - - return my_function - - -@fixture(scope="function") -def snowflake_pipeline_legacy_lineage_config(tmp_path): - output_file = tmp_path / "snowflake_test_events_permission_error.json" - config = PipelineConfig( - source=SourceConfig( - type="snowflake", - config=SnowflakeV2Config( - account_id="ABC12345.ap-south-1.aws", - username="TST_USR", - password="TST_PWD", - role="TEST_ROLE", - warehouse="TEST_WAREHOUSE", - include_technical_schema=True, - match_fully_qualified_names=True, - schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), - include_view_lineage=False, - include_usage_stats=False, - use_legacy_lineage_method=True, - start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace( - tzinfo=timezone.utc - ), - end_time=datetime(2022, 6, 7, 7, 17, 0, 0).replace(tzinfo=timezone.utc), - ), - ), - sink=DynamicTypedConfig(type="file", config={"filename": str(output_file)}), - ) - return config - - -@freeze_time(FROZEN_TIME) -def test_snowflake_missing_role_access_causes_pipeline_failure( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - # Snowflake connection fails role not granted error - mock_connect.side_effect = Exception( - "250001 (08001): Failed to connect to DB: abc12345.ap-south-1.snowflakecomputing.com:443. Role 'TEST_ROLE' specified in the connect string is not granted to this user. Contact your local system administrator, or attempt to login with another role, e.g. 
PUBLIC" - ) - - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - assert "permission-error" in pipeline.source.get_report().failures.keys() - - -@freeze_time(FROZEN_TIME) -def test_snowflake_missing_warehouse_access_causes_pipeline_failure( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Current warehouse query leads to blank result - sf_cursor.execute.side_effect = query_permission_response_override( - default_query_results, - [SnowflakeQuery.current_warehouse()], - [(None,)], - ) - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - assert "permission-error" in pipeline.source.get_report().failures.keys() - - -@freeze_time(FROZEN_TIME) -def test_snowflake_no_databases_with_access_causes_pipeline_failure( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in listing databases - sf_cursor.execute.side_effect = query_permission_error_override( - default_query_results, - [SnowflakeQuery.get_databases("TEST_DB")], - "Database 'TEST_DB' does not exist or not authorized.", - ) - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - assert "permission-error" in pipeline.source.get_report().failures.keys() - - -@freeze_time(FROZEN_TIME) -def test_snowflake_no_tables_causes_pipeline_failure( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in listing databases - no_tables_fn = query_permission_response_override( - default_query_results, - [SnowflakeQuery.tables_for_schema("TEST_SCHEMA", "TEST_DB")], - [], - ) - sf_cursor.execute.side_effect = query_permission_response_override( - no_tables_fn, - [SnowflakeQuery.show_views_for_schema("TEST_SCHEMA", "TEST_DB")], - [], - ) - - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - assert "permission-error" in pipeline.source.get_report().failures.keys() - - -@freeze_time(FROZEN_TIME) -def test_snowflake_list_columns_error_causes_pipeline_warning( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in listing columns - sf_cursor.execute.side_effect = query_permission_error_override( - default_query_results, - [ - SnowflakeQuery.columns_for_table( - "TABLE_{}".format(tbl_idx), "TEST_SCHEMA", "TEST_DB" - ) - for tbl_idx in range(1, NUM_TABLES + 1) - ], - "Database 'TEST_DB' does not exist or not authorized.", - ) - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - pipeline.raise_from_status() # pipeline should not fail - assert ( - "Failed to get columns for table" - in pipeline.source.get_report().warnings.keys() - ) - - 
-@freeze_time(FROZEN_TIME) -def test_snowflake_list_primary_keys_error_causes_pipeline_warning( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in listing keys leads to warning - sf_cursor.execute.side_effect = query_permission_error_override( - default_query_results, - [SnowflakeQuery.show_primary_keys_for_schema("TEST_SCHEMA", "TEST_DB")], - "Insufficient privileges to operate on TEST_DB", - ) - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - pipeline.raise_from_status() # pipeline should not fail - assert ( - "Failed to get primary key for table" - in pipeline.source.get_report().warnings.keys() - ) - - -@freeze_time(FROZEN_TIME) -def test_snowflake_missing_snowflake_lineage_permission_causes_pipeline_failure( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in getting lineage - sf_cursor.execute.side_effect = query_permission_error_override( - default_query_results, - [ - snowflake_query.SnowflakeQuery.table_to_table_lineage_history( - 1654473600000, 1654586220000, True - ), - ], - "Database 'SNOWFLAKE' does not exist or not authorized.", - ) - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - assert ( - "lineage-permission-error" in pipeline.source.get_report().failures.keys() - ) - - -@freeze_time(FROZEN_TIME) -def test_snowflake_missing_snowflake_operations_permission_causes_pipeline_failure( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in getting access history date range - sf_cursor.execute.side_effect = query_permission_error_override( - default_query_results, - [snowflake_query.SnowflakeQuery.get_access_history_date_range()], - "Database 'SNOWFLAKE' does not exist or not authorized.", - ) - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - assert "usage-permission-error" in pipeline.source.get_report().failures.keys() - - -@freeze_time(FROZEN_TIME) -def test_snowflake_unexpected_snowflake_view_lineage_error_causes_pipeline_warning( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in getting view lineage - sf_cursor.execute.side_effect = query_permission_error_override( - default_query_results, - [snowflake_query.SnowflakeQuery.view_dependencies()], - "Unexpected Error", - ) - - snowflake_pipeline_config1 = snowflake_pipeline_legacy_lineage_config.copy() - cast( - SnowflakeV2Config, - cast(PipelineConfig, snowflake_pipeline_config1).source.config, - ).include_view_lineage = True - pipeline = Pipeline(snowflake_pipeline_config1) - pipeline.run() - pipeline.raise_from_status() # pipeline should not 
fail - assert "view-upstream-lineage" in pipeline.source.get_report().warnings.keys() diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_legacy_lineage.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_legacy_lineage.py deleted file mode 100644 index 59da7ddf695d8..0000000000000 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_legacy_lineage.py +++ /dev/null @@ -1,207 +0,0 @@ -import random -from datetime import datetime, timezone -from unittest import mock - -import pandas as pd -import pytest -from freezegun import freeze_time - -from datahub.configuration.common import AllowDenyPattern, DynamicTypedConfig -from datahub.ingestion.glossary.classifier import ( - ClassificationConfig, - DynamicTypedClassifierConfig, -) -from datahub.ingestion.glossary.datahub_classifier import ( - DataHubClassifierConfig, - InfoTypeConfig, - PredictionFactorsAndWeights, - ValuesFactorConfig, -) -from datahub.ingestion.run.pipeline import Pipeline -from datahub.ingestion.run.pipeline_config import PipelineConfig, SourceConfig -from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig -from datahub.ingestion.source.snowflake.snowflake_config import ( - SnowflakeV2Config, - TagOption, -) -from tests.integration.snowflake.common import FROZEN_TIME, default_query_results -from tests.integration.snowflake.test_snowflake import random_cloud_region, random_email -from tests.test_helpers import mce_helpers - - -@pytest.mark.integration -def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph): - test_resources_dir = pytestconfig.rootpath / "tests/integration/snowflake" - - # Run the metadata ingestion pipeline. - output_file = tmp_path / "snowflake_test_events.json" - golden_file = test_resources_dir / "snowflake_golden.json" - - with mock.patch("snowflake.connector.connect") as mock_connect, mock.patch( - "datahub.ingestion.source.snowflake.snowflake_v2.SnowflakeV2Source.get_sample_values_for_table" - ) as mock_sample_values: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - sf_cursor.execute.side_effect = default_query_results - - mock_sample_values.return_value = pd.DataFrame( - data={ - "col_1": [random.randint(1, 80) for i in range(20)], - "col_2": [random_email() for i in range(20)], - "col_3": [random_cloud_region() for i in range(20)], - } - ) - - datahub_classifier_config = DataHubClassifierConfig( - minimum_values_threshold=10, - confidence_level_threshold=0.58, - info_types_config={ - "Age": InfoTypeConfig( - Prediction_Factors_and_Weights=PredictionFactorsAndWeights( - Name=0, Values=1, Description=0, Datatype=0 - ) - ), - "CloudRegion": InfoTypeConfig( - Prediction_Factors_and_Weights=PredictionFactorsAndWeights( - Name=0, - Description=0, - Datatype=0, - Values=1, - ), - Values=ValuesFactorConfig( - prediction_type="regex", - regex=[ - r"(af|ap|ca|eu|me|sa|us)-(central|north|(north(?:east|west))|south|south(?:east|west)|east|west)-\d+" - ], - ), - ), - }, - ) - - pipeline = Pipeline( - config=PipelineConfig( - source=SourceConfig( - type="snowflake", - config=SnowflakeV2Config( - account_id="ABC12345.ap-south-1.aws", - username="TST_USR", - password="TST_PWD", - match_fully_qualified_names=True, - schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), - include_technical_schema=True, - include_table_lineage=True, - include_view_lineage=True, - include_usage_stats=True, - 
use_legacy_lineage_method=True, - validate_upstreams_against_patterns=False, - include_operational_stats=True, - start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace( - tzinfo=timezone.utc - ), - end_time=datetime(2022, 6, 7, 7, 17, 0, 0).replace( - tzinfo=timezone.utc - ), - classification=ClassificationConfig( - enabled=True, - classifiers=[ - DynamicTypedClassifierConfig( - type="datahub", config=datahub_classifier_config - ) - ], - ), - profiling=GEProfilingConfig( - enabled=True, - profile_if_updated_since_days=None, - profile_table_row_limit=None, - profile_table_size_limit=None, - profile_table_level_only=True, - ), - extract_tags=TagOption.without_lineage, - ), - ), - sink=DynamicTypedConfig( - type="file", config={"filename": str(output_file)} - ), - ) - ) - pipeline.run() - pipeline.pretty_print_summary() - pipeline.raise_from_status() - - # Verify the output. - - mce_helpers.check_golden_file( - pytestconfig, - output_path=output_file, - golden_path=golden_file, - ignore_paths=[ - r"root\[\d+\]\['aspect'\]\['json'\]\['timestampMillis'\]", - r"root\[\d+\]\['aspect'\]\['json'\]\['created'\]", - r"root\[\d+\]\['aspect'\]\['json'\]\['lastModified'\]", - r"root\[\d+\]\['aspect'\]\['json'\]\['fields'\]\[\d+\]\['glossaryTerms'\]\['auditStamp'\]\['time'\]", - r"root\[\d+\]\['systemMetadata'\]", - ], - ) - - -@freeze_time(FROZEN_TIME) -@pytest.mark.integration -def test_snowflake_private_link(pytestconfig, tmp_path, mock_time, mock_datahub_graph): - test_resources_dir = pytestconfig.rootpath / "tests/integration/snowflake" - - # Run the metadata ingestion pipeline. - output_file = tmp_path / "snowflake_privatelink_test_events.json" - golden_file = test_resources_dir / "snowflake_privatelink_golden.json" - - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - sf_cursor.execute.side_effect = default_query_results - - pipeline = Pipeline( - config=PipelineConfig( - source=SourceConfig( - type="snowflake", - config=SnowflakeV2Config( - account_id="ABC12345.ap-south-1.privatelink", - username="TST_USR", - password="TST_PWD", - schema_pattern=AllowDenyPattern(allow=["test_schema"]), - include_technical_schema=True, - include_table_lineage=True, - include_column_lineage=False, - include_views=False, - include_view_lineage=False, - use_legacy_lineage_method=True, - include_usage_stats=False, - include_operational_stats=False, - start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace( - tzinfo=timezone.utc - ), - end_time=datetime(2022, 6, 7, 7, 17, 0, 0).replace( - tzinfo=timezone.utc - ), - ), - ), - sink=DynamicTypedConfig( - type="file", config={"filename": str(output_file)} - ), - ) - ) - pipeline.run() - pipeline.pretty_print_summary() - pipeline.raise_from_status() - - # Verify the output. 
-
-    mce_helpers.check_golden_file(
-        pytestconfig,
-        output_path=output_file,
-        golden_path=golden_file,
-        ignore_paths=[],
-    )

From 01ae5d96da45a259122a547504265025624c0e11 Mon Sep 17 00:00:00 2001
From: Andrew Sikowitz
Date: Wed, 23 Aug 2023 15:58:34 -0400
Subject: [PATCH 11/11] fix(ingest/ldap): Handle case when 'objectClass' not in attrs (#8658)

---
 metadata-ingestion/src/datahub/ingestion/source/ldap.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/metadata-ingestion/src/datahub/ingestion/source/ldap.py b/metadata-ingestion/src/datahub/ingestion/source/ldap.py
index 497b49acb6505..e1d035a96d42f 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/ldap.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/ldap.py
@@ -271,10 +271,11 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
                 if dn is None:
                     continue
 
-                if not attrs:
+                if not attrs or "objectClass" not in attrs:
                     self.report.report_warning(
                         "",
-                        f"skipping {dn} because attrs is empty; check your permissions if this is unexpected",
+                        f"skipping {dn} because attrs ({attrs}) does not contain expected data; "
+                        f"check your permissions if this is unexpected",
                     )
                     continue
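The objectClass guard in the hunk above can be exercised in isolation. The snippet below is a minimal illustrative sketch, not part of the patch or of DataHub's codebase: the should_skip_ldap_entry helper and the sample entries are assumptions made for the example, and it only mirrors the skip-and-warn decision, not the full LDAP ingestion loop.

# Minimal, self-contained sketch of the guard introduced in PATCH 11/11.
# The helper name and the sample entries below are illustrative assumptions.
from typing import Any, Dict, Optional


def should_skip_ldap_entry(attrs: Optional[Dict[str, Any]]) -> bool:
    # An entry is skipped when it has no attributes at all, or when the
    # attributes dict lacks the "objectClass" key the source relies on.
    return not attrs or "objectClass" not in attrs


if __name__ == "__main__":
    sample_entries = [
        ("uid=jdoe,ou=people,dc=example,dc=org", {"objectClass": [b"inetOrgPerson"]}),
        ("uid=ghost,ou=people,dc=example,dc=org", {}),
        ("uid=svc,ou=people,dc=example,dc=org", {"cn": [b"service account"]}),
    ]
    for dn, attrs in sample_entries:
        if should_skip_ldap_entry(attrs):
            # Mirrors the warning emitted by the patched source.
            print(f"skipping {dn} because attrs ({attrs}) does not contain expected data")
        else:
            print(f"processing {dn}")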