From cf166843b562ae1df303e4b250b95a9bfe736539 Mon Sep 17 00:00:00 2001 From: Indy Prentice Date: Thu, 7 Sep 2023 17:29:55 -0300 Subject: [PATCH] feat(search): Also de-duplicate the field queries based on field names (#8788) Co-authored-by: Indy Prentice Co-authored-by: David Leifker --- .../query/request/SearchQueryBuilder.java | 139 ++++++++----- .../query/request/SearchQueryBuilderTest.java | 185 ++++++++++++++++-- 2 files changed, 258 insertions(+), 66 deletions(-) diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java index a00882cfde240..4eebf02d70e9e 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java @@ -1,5 +1,6 @@ package com.linkedin.metadata.search.elasticsearch.query.request; +import com.google.common.annotations.VisibleForTesting; import com.linkedin.metadata.config.search.ExactMatchConfiguration; import com.linkedin.metadata.config.search.PartialConfiguration; import com.linkedin.metadata.config.search.SearchConfiguration; @@ -19,6 +20,7 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; @@ -116,11 +118,8 @@ private QueryBuilder buildInternalQuery(@Nullable QueryConfiguration customQuery QueryStringQueryBuilder queryBuilder = QueryBuilders.queryStringQuery(withoutQueryPrefix); queryBuilder.defaultOperator(Operator.AND); - entitySpecs.stream() - .map(this::getStandardFields) - .flatMap(Set::stream) - .distinct() - .forEach(cfg -> queryBuilder.field(cfg.fieldName(), cfg.boost())); + getStandardFields(entitySpecs).forEach(entitySpec -> + queryBuilder.field(entitySpec.fieldName(), entitySpec.boost())); finalQuery.should(queryBuilder); if (exactMatchConfiguration.isEnableStructured()) { getPrefixAndExactMatchQuery(null, entitySpecs, withoutQueryPrefix).ifPresent(finalQuery::should); @@ -130,16 +129,47 @@ private QueryBuilder buildInternalQuery(@Nullable QueryConfiguration customQuery return finalQuery; } - private Set getStandardFields(@Nonnull EntitySpec entitySpec) { + /** + * Gets searchable fields from all entities in the input collection. De-duplicates fields across entities. + * @param entitySpecs: Entity specs to extract searchable fields from + * @return A set of SearchFieldConfigs containing the searchable fields from the input entities. + */ + @VisibleForTesting + Set getStandardFields(@Nonnull Collection entitySpecs) { Set fields = new HashSet<>(); - // Always present final float urnBoost = Float.parseFloat((String) PRIMARY_URN_SEARCH_PROPERTIES.get("boostScore")); fields.add(SearchFieldConfig.detectSubFieldType("urn", urnBoost, SearchableAnnotation.FieldType.URN, true)); fields.add(SearchFieldConfig.detectSubFieldType("urn.delimited", urnBoost * partialConfiguration.getUrnFactor(), - SearchableAnnotation.FieldType.URN, true)); + SearchableAnnotation.FieldType.URN, true)); + + entitySpecs.stream() + .map(this::getFieldsFromEntitySpec) + .flatMap(Set::stream) + .collect(Collectors.groupingBy(SearchFieldConfig::fieldName)).forEach((key, value) -> + fields.add( + new SearchFieldConfig( + key, + value.get(0).shortName(), + (float) value.stream().mapToDouble(SearchFieldConfig::boost).average().getAsDouble(), + value.get(0).analyzer(), + value.stream().anyMatch(SearchFieldConfig::hasKeywordSubfield), + value.stream().anyMatch(SearchFieldConfig::hasDelimitedSubfield), + value.stream().anyMatch(SearchFieldConfig::hasWordGramSubfields), + true, + value.stream().anyMatch(SearchFieldConfig::isDelimitedSubfield), + value.stream().anyMatch(SearchFieldConfig::isKeywordSubfield), + value.stream().anyMatch(SearchFieldConfig::isWordGramSubfield) + )) + ); + return fields; + } + + @VisibleForTesting + Set getFieldsFromEntitySpec(EntitySpec entitySpec) { + Set fields = new HashSet<>(); List searchableFieldSpecs = entitySpec.getSearchableFieldSpecs(); for (SearchableFieldSpec fieldSpec : searchableFieldSpecs) { if (!fieldSpec.getSearchableAnnotation().isQueryByDefault()) { @@ -153,8 +183,8 @@ private Set getStandardFields(@Nonnull EntitySpec entitySpec) final SearchableAnnotation searchableAnnotation = fieldSpec.getSearchableAnnotation(); fields.add(SearchFieldConfig.detectSubFieldType(searchFieldConfig.fieldName() + ".delimited", - searchFieldConfig.boost() * partialConfiguration.getFactor(), - searchableAnnotation.getFieldType(), searchableAnnotation.isQueryByDefault())); + searchFieldConfig.boost() * partialConfiguration.getFactor(), + searchableAnnotation.getFieldType(), searchableAnnotation.isQueryByDefault())); if (SearchFieldConfig.detectSubFieldType(fieldSpec).hasWordGramSubfields()) { fields.add(SearchFieldConfig.builder() @@ -187,6 +217,20 @@ private Set getStandardFields(@Nonnull EntitySpec entitySpec) } } } + return fields; + } + + private Set getStandardFields(@Nonnull EntitySpec entitySpec) { + Set fields = new HashSet<>(); + + // Always present + final float urnBoost = Float.parseFloat((String) PRIMARY_URN_SEARCH_PROPERTIES.get("boostScore")); + + fields.add(SearchFieldConfig.detectSubFieldType("urn", urnBoost, SearchableAnnotation.FieldType.URN, true)); + fields.add(SearchFieldConfig.detectSubFieldType("urn.delimited", urnBoost * partialConfiguration.getUrnFactor(), + SearchableAnnotation.FieldType.URN, true)); + + fields.addAll(getFieldsFromEntitySpec(entitySpec)); return fields; } @@ -255,49 +299,42 @@ private Optional getPrefixAndExactMatchQuery(@Nullable QueryConfig BoolQueryBuilder finalQuery = QueryBuilders.boolQuery(); String unquotedQuery = unquote(query); - entitySpecs.stream() - .map(this::getStandardFields) - .flatMap(Set::stream) - .filter(SearchFieldConfig::isQueryByDefault) - .forEach(searchFieldConfig -> { - - if (searchFieldConfig.isDelimitedSubfield() && isPrefixQuery) { - finalQuery.should(QueryBuilders.matchPhrasePrefixQuery(searchFieldConfig.fieldName(), query) - .boost(searchFieldConfig.boost() - * exactMatchConfiguration.getPrefixFactor() - * exactMatchConfiguration.getCaseSensitivityFactor()) - .queryName(searchFieldConfig.shortName())); // less than exact - } - - if (searchFieldConfig.isKeyword() && isExactQuery) { - // It is important to use the subfield .keyword (it uses a different normalizer) - // The non-.keyword field removes case information - - // Exact match case-sensitive - finalQuery.should(QueryBuilders - .termQuery(ESUtils.toKeywordField(searchFieldConfig.fieldName(), false), unquotedQuery) - .caseInsensitive(false) - .boost(searchFieldConfig.boost() - * exactMatchConfiguration.getExactFactor()) - .queryName(searchFieldConfig.shortName())); - - // Exact match case-insensitive - finalQuery.should(QueryBuilders - .termQuery(ESUtils.toKeywordField(searchFieldConfig.fieldName(), false), unquotedQuery) - .caseInsensitive(true) - .boost(searchFieldConfig.boost() - * exactMatchConfiguration.getExactFactor() - * exactMatchConfiguration.getCaseSensitivityFactor()) - .queryName(searchFieldConfig.fieldName())); - } - - if (searchFieldConfig.isWordGramSubfield() && isPrefixQuery) { - finalQuery.should(QueryBuilders - .matchPhraseQuery(ESUtils.toKeywordField(searchFieldConfig.fieldName(), false), unquotedQuery) + getStandardFields(entitySpecs).forEach(searchFieldConfig -> { + if (searchFieldConfig.isDelimitedSubfield() && isPrefixQuery) { + finalQuery.should(QueryBuilders.matchPhrasePrefixQuery(searchFieldConfig.fieldName(), query) + .boost(searchFieldConfig.boost() * exactMatchConfiguration.getPrefixFactor() + * exactMatchConfiguration.getCaseSensitivityFactor()) + .queryName(searchFieldConfig.shortName())); // less than exact + } + + if (searchFieldConfig.isKeyword() && isExactQuery) { + // It is important to use the subfield .keyword (it uses a different normalizer) + // The non-.keyword field removes case information + + // Exact match case-sensitive + finalQuery.should( + QueryBuilders.termQuery(ESUtils.toKeywordField(searchFieldConfig.fieldName(), false), unquotedQuery) + .caseInsensitive(false) + .boost(searchFieldConfig.boost() * exactMatchConfiguration.getExactFactor()) + .queryName(searchFieldConfig.shortName())); + + // Exact match case-insensitive + finalQuery.should( + QueryBuilders.termQuery(ESUtils.toKeywordField(searchFieldConfig.fieldName(), false), unquotedQuery) + .caseInsensitive(true) + .boost(searchFieldConfig.boost() * exactMatchConfiguration.getExactFactor() + * exactMatchConfiguration.getCaseSensitivityFactor()) + .queryName(searchFieldConfig.fieldName())); + } + + if (searchFieldConfig.isWordGramSubfield() && isPrefixQuery) { + finalQuery.should( + QueryBuilders.matchPhraseQuery(ESUtils.toKeywordField(searchFieldConfig.fieldName(), false), + unquotedQuery) .boost(searchFieldConfig.boost() * getWordGramFactor(searchFieldConfig.fieldName())) .queryName(searchFieldConfig.shortName())); - } - }); + } + }); return finalQuery.should().size() > 0 ? Optional.of(finalQuery) : Optional.empty(); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java index 282b1d8bb6778..8e73b0ceeae8d 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java @@ -1,5 +1,8 @@ package com.linkedin.metadata.search.elasticsearch.query.request; +import com.linkedin.data.schema.DataSchema; +import com.linkedin.data.schema.PathSpec; +import com.linkedin.metadata.ESTestConfiguration; import com.linkedin.metadata.config.search.CustomConfiguration; import com.linkedin.metadata.config.search.ExactMatchConfiguration; import com.linkedin.metadata.config.search.PartialConfiguration; @@ -10,11 +13,18 @@ import com.google.common.collect.ImmutableList; import com.linkedin.metadata.TestEntitySpecBuilder; +import com.linkedin.metadata.models.EntitySpec; +import com.linkedin.metadata.models.SearchableFieldSpec; +import com.linkedin.metadata.models.annotation.SearchableAnnotation; import java.io.IOException; import java.util.List; import java.util.Map; +import java.util.Optional; +import java.util.Set; import java.util.stream.Collectors; +import java.util.stream.Stream; +import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.util.Pair; import org.elasticsearch.index.query.BoolQueryBuilder; import org.elasticsearch.index.query.MatchAllQueryBuilder; @@ -25,8 +35,14 @@ import org.elasticsearch.index.query.SimpleQueryStringBuilder; import org.elasticsearch.index.query.TermQueryBuilder; import org.elasticsearch.index.query.functionscore.FunctionScoreQueryBuilder; +import org.mockito.Mockito; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.context.annotation.Import; +import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; import org.testng.annotations.Test; +import static com.linkedin.datahub.graphql.resolvers.search.SearchUtils.AUTO_COMPLETE_ENTITY_TYPES; +import static com.linkedin.datahub.graphql.resolvers.search.SearchUtils.SEARCHABLE_ENTITY_TYPES; import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.TEXT_SEARCH_ANALYZER; import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.URN_SEARCH_ANALYZER; import static org.testng.Assert.assertEquals; @@ -34,7 +50,12 @@ import static org.testng.Assert.assertNull; import static org.testng.Assert.assertTrue; -public class SearchQueryBuilderTest { +@Import(ESTestConfiguration.class) +public class SearchQueryBuilderTest extends AbstractTestNGSpringContextTests { + + @Autowired + private EntityRegistry entityRegistry; + public static SearchConfiguration testQueryConfig; static { testQueryConfig = new SearchConfiguration(); @@ -66,8 +87,8 @@ public class SearchQueryBuilderTest { @Test public void testQueryBuilderFulltext() { FunctionScoreQueryBuilder result = - (FunctionScoreQueryBuilder) TEST_BUILDER.buildQuery(ImmutableList.of(TestEntitySpecBuilder.getSpec()), "testQuery", - true); + (FunctionScoreQueryBuilder) TEST_BUILDER.buildQuery(ImmutableList.of(TestEntitySpecBuilder.getSpec()), "testQuery", + true); BoolQueryBuilder mainQuery = (BoolQueryBuilder) result.query(); List shouldQueries = mainQuery.should(); assertEquals(shouldQueries.size(), 2); @@ -80,15 +101,15 @@ public void testQueryBuilderFulltext() { Map keywordFields = keywordQuery.fields(); assertEquals(keywordFields.size(), 9); assertEquals(keywordFields, Map.of( - "urn", 10.f, - "textArrayField", 1.0f, - "customProperties", 1.0f, - "wordGramField", 1.0f, - "nestedArrayArrayField", 1.0f, - "textFieldOverride", 1.0f, - "nestedArrayStringField", 1.0f, - "keyPart1", 10.0f, - "esObjectField", 1.0f + "urn", 10.f, + "textArrayField", 1.0f, + "customProperties", 1.0f, + "wordGramField", 1.0f, + "nestedArrayArrayField", 1.0f, + "textFieldOverride", 1.0f, + "nestedArrayStringField", 1.0f, + "keyPart1", 10.0f, + "esObjectField", 1.0f )); SimpleQueryStringBuilder urnComponentQuery = (SimpleQueryStringBuilder) analyzerGroupQuery.should().get(1); @@ -153,8 +174,8 @@ public void testQueryBuilderFulltext() { @Test public void testQueryBuilderStructured() { FunctionScoreQueryBuilder result = - (FunctionScoreQueryBuilder) TEST_BUILDER.buildQuery(ImmutableList.of(TestEntitySpecBuilder.getSpec()), - "testQuery", false); + (FunctionScoreQueryBuilder) TEST_BUILDER.buildQuery(ImmutableList.of(TestEntitySpecBuilder.getSpec()), + "testQuery", false); BoolQueryBuilder mainQuery = (BoolQueryBuilder) result.query(); List shouldQueries = mainQuery.should(); assertEquals(shouldQueries.size(), 2); @@ -263,4 +284,138 @@ public void testCustomDefault() { assertEquals(termQueryBuilder.value().toString(), triggerQuery); } } -} + + /** + * Tests to make sure that the fields are correctly combined across search-able entities + */ + @Test + public void testGetStandardFieldsEntitySpec() { + List entitySpecs = Stream.concat(SEARCHABLE_ENTITY_TYPES.stream(), AUTO_COMPLETE_ENTITY_TYPES.stream()) + .map(entityType -> entityType.toString().toLowerCase().replaceAll("_", "")) + .map(entityRegistry::getEntitySpec) + .collect(Collectors.toList()); + assertTrue(entitySpecs.size() > 30, "Expected at least 30 searchable entities in the registry"); + + // Count of the distinct field names + Set expectedFieldNames = Stream.concat( + // Standard urn fields plus entitySpec sourced fields + Stream.of("urn", "urn.delimited"), + entitySpecs.stream() + .flatMap(spec -> TEST_CUSTOM_BUILDER.getFieldsFromEntitySpec(spec).stream()) + .map(SearchFieldConfig::fieldName)) + .collect(Collectors.toSet()); + + Set actualFieldNames = TEST_CUSTOM_BUILDER.getStandardFields(entitySpecs).stream() + .map(SearchFieldConfig::fieldName) + .collect(Collectors.toSet()); + + assertEquals(actualFieldNames, expectedFieldNames, + String.format("Missing: %s Extra: %s", + expectedFieldNames.stream().filter(f -> !actualFieldNames.contains(f)).collect(Collectors.toSet()), + actualFieldNames.stream().filter(f -> !expectedFieldNames.contains(f)).collect(Collectors.toSet()))); + } + + @Test + public void testGetStandardFields() { + Set fieldConfigs = TEST_CUSTOM_BUILDER.getStandardFields(ImmutableList.of(TestEntitySpecBuilder.getSpec())); + assertEquals(fieldConfigs.size(), 21); + assertEquals(fieldConfigs.stream().map(SearchFieldConfig::fieldName).collect(Collectors.toSet()), Set.of( + "nestedArrayArrayField", + "esObjectField", + "foreignKey", + "keyPart1", + "nestedForeignKey", + "textArrayField.delimited", + "nestedArrayArrayField.delimited", + "wordGramField.delimited", + "wordGramField.wordGrams4", + "textFieldOverride", + "nestedArrayStringField.delimited", + "urn.delimited", + "textArrayField", + "keyPart1.delimited", + "nestedArrayStringField", + "wordGramField", + "customProperties", + "wordGramField.wordGrams3", + "textFieldOverride.delimited", + "urn", + "wordGramField.wordGrams2")); + + assertEquals(fieldConfigs.stream().filter(field -> field.fieldName().equals("keyPart1")).findFirst().map(SearchFieldConfig::boost), Optional.of( + 10.0F)); + assertEquals(fieldConfigs.stream().filter(field -> field.fieldName().equals("nestedForeignKey")).findFirst().map(SearchFieldConfig::boost), Optional.of( + 1.0F)); + assertEquals(fieldConfigs.stream().filter(field -> field.fieldName().equals("textFieldOverride")).findFirst().map(SearchFieldConfig::boost), Optional.of( + 1.0F)); + + EntitySpec mockEntitySpec = Mockito.mock(EntitySpec.class); + Mockito.when(mockEntitySpec.getSearchableFieldSpecs()).thenReturn(List.of( + new SearchableFieldSpec( + Mockito.mock(PathSpec.class), + new SearchableAnnotation("fieldDoesntExistInOriginal", + SearchableAnnotation.FieldType.TEXT, + true, true, false, false, + Optional.empty(), Optional.empty(), 13.0, + Optional.empty(), Optional.empty(), Map.of(), List.of()), + Mockito.mock(DataSchema.class)), + new SearchableFieldSpec( + Mockito.mock(PathSpec.class), + new SearchableAnnotation("keyPart1", + SearchableAnnotation.FieldType.KEYWORD, + true, true, false, false, + Optional.empty(), Optional.empty(), 20.0, + Optional.empty(), Optional.empty(), Map.of(), List.of()), + Mockito.mock(DataSchema.class)), + new SearchableFieldSpec( + Mockito.mock(PathSpec.class), + new SearchableAnnotation("textFieldOverride", + SearchableAnnotation.FieldType.WORD_GRAM, + true, true, false, false, + Optional.empty(), Optional.empty(), 3.0, + Optional.empty(), Optional.empty(), Map.of(), List.of()), + Mockito.mock(DataSchema.class))) + ); + + fieldConfigs = TEST_CUSTOM_BUILDER.getStandardFields(ImmutableList.of(TestEntitySpecBuilder.getSpec(), mockEntitySpec)); + // Same 21 from the original entity + newFieldNotInOriginal + 3 word gram fields from the textFieldOverride + assertEquals(fieldConfigs.size(), 26); + assertEquals(fieldConfigs.stream().map(SearchFieldConfig::fieldName).collect(Collectors.toSet()), Set.of( + "nestedArrayArrayField", + "esObjectField", + "foreignKey", + "keyPart1", + "nestedForeignKey", + "textArrayField.delimited", + "nestedArrayArrayField.delimited", + "wordGramField.delimited", + "wordGramField.wordGrams4", + "textFieldOverride", + "nestedArrayStringField.delimited", + "urn.delimited", + "textArrayField", + "keyPart1.delimited", + "nestedArrayStringField", + "wordGramField", + "customProperties", + "wordGramField.wordGrams3", + "textFieldOverride.delimited", + "urn", + "wordGramField.wordGrams2", + "fieldDoesntExistInOriginal", + "fieldDoesntExistInOriginal.delimited", + "textFieldOverride.wordGrams2", + "textFieldOverride.wordGrams3", + "textFieldOverride.wordGrams4")); + + // Field which only exists in first one: Should be the same + assertEquals(fieldConfigs.stream().filter(field -> field.fieldName().equals("nestedForeignKey")).findFirst().map(SearchFieldConfig::boost), Optional.of( + 1.0F)); + // Average boost value: 10 vs. 20 -> 15 + assertEquals(fieldConfigs.stream().filter(field -> field.fieldName().equals("keyPart1")).findFirst().map(SearchFieldConfig::boost), Optional.of( + 15.0F)); + // Field which added word gram fields: Original boost should be boost value averaged + assertEquals(fieldConfigs.stream().filter(field -> field.fieldName().equals("textFieldOverride")).findFirst().map(SearchFieldConfig::boost), Optional.of( + 2.0F)); + } +} \ No newline at end of file