Skip to content

Commit

Permalink
feat(search): Also de-duplicate the field queries based on field names (#8788)
Browse files Browse the repository at this point in the history

Co-authored-by: Indy Prentice <[email protected]>
Co-authored-by: David Leifker <[email protected]>
  • Loading branch information
3 people authored Sep 7, 2023
1 parent eb4107a commit cf16684
Show file tree
Hide file tree
Showing 2 changed files with 258 additions and 66 deletions.
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package com.linkedin.metadata.search.elasticsearch.query.request;

import com.google.common.annotations.VisibleForTesting;
import com.linkedin.metadata.config.search.ExactMatchConfiguration;
import com.linkedin.metadata.config.search.PartialConfiguration;
import com.linkedin.metadata.config.search.SearchConfiguration;
Expand All @@ -19,6 +20,7 @@

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
Expand Down Expand Up @@ -116,11 +118,8 @@ private QueryBuilder buildInternalQuery(@Nullable QueryConfiguration customQuery

QueryStringQueryBuilder queryBuilder = QueryBuilders.queryStringQuery(withoutQueryPrefix);
queryBuilder.defaultOperator(Operator.AND);
entitySpecs.stream()
.map(this::getStandardFields)
.flatMap(Set::stream)
.distinct()
.forEach(cfg -> queryBuilder.field(cfg.fieldName(), cfg.boost()));
getStandardFields(entitySpecs).forEach(entitySpec ->
queryBuilder.field(entitySpec.fieldName(), entitySpec.boost()));
finalQuery.should(queryBuilder);
if (exactMatchConfiguration.isEnableStructured()) {
getPrefixAndExactMatchQuery(null, entitySpecs, withoutQueryPrefix).ifPresent(finalQuery::should);
Expand All @@ -130,16 +129,47 @@ private QueryBuilder buildInternalQuery(@Nullable QueryConfiguration customQuery
return finalQuery;
}

private Set<SearchFieldConfig> getStandardFields(@Nonnull EntitySpec entitySpec) {
/**
* Gets searchable fields from all entities in the input collection. De-duplicates fields across entities.
* @param entitySpecs: Entity specs to extract searchable fields from
* @return A set of SearchFieldConfigs containing the searchable fields from the input entities.
*/
@VisibleForTesting
Set<SearchFieldConfig> getStandardFields(@Nonnull Collection<EntitySpec> entitySpecs) {
Set<SearchFieldConfig> fields = new HashSet<>();

// Always present
final float urnBoost = Float.parseFloat((String) PRIMARY_URN_SEARCH_PROPERTIES.get("boostScore"));

fields.add(SearchFieldConfig.detectSubFieldType("urn", urnBoost, SearchableAnnotation.FieldType.URN, true));
fields.add(SearchFieldConfig.detectSubFieldType("urn.delimited", urnBoost * partialConfiguration.getUrnFactor(),
SearchableAnnotation.FieldType.URN, true));
SearchableAnnotation.FieldType.URN, true));

entitySpecs.stream()
.map(this::getFieldsFromEntitySpec)
.flatMap(Set::stream)
.collect(Collectors.groupingBy(SearchFieldConfig::fieldName)).forEach((key, value) ->
fields.add(
new SearchFieldConfig(
key,
value.get(0).shortName(),
(float) value.stream().mapToDouble(SearchFieldConfig::boost).average().getAsDouble(),
value.get(0).analyzer(),
value.stream().anyMatch(SearchFieldConfig::hasKeywordSubfield),
value.stream().anyMatch(SearchFieldConfig::hasDelimitedSubfield),
value.stream().anyMatch(SearchFieldConfig::hasWordGramSubfields),
true,
value.stream().anyMatch(SearchFieldConfig::isDelimitedSubfield),
value.stream().anyMatch(SearchFieldConfig::isKeywordSubfield),
value.stream().anyMatch(SearchFieldConfig::isWordGramSubfield)
))
);

return fields;
}

@VisibleForTesting
Set<SearchFieldConfig> getFieldsFromEntitySpec(EntitySpec entitySpec) {
Set<SearchFieldConfig> fields = new HashSet<>();
List<SearchableFieldSpec> searchableFieldSpecs = entitySpec.getSearchableFieldSpecs();
for (SearchableFieldSpec fieldSpec : searchableFieldSpecs) {
if (!fieldSpec.getSearchableAnnotation().isQueryByDefault()) {
Expand All @@ -153,8 +183,8 @@ private Set<SearchFieldConfig> getStandardFields(@Nonnull EntitySpec entitySpec)
final SearchableAnnotation searchableAnnotation = fieldSpec.getSearchableAnnotation();

fields.add(SearchFieldConfig.detectSubFieldType(searchFieldConfig.fieldName() + ".delimited",
searchFieldConfig.boost() * partialConfiguration.getFactor(),
searchableAnnotation.getFieldType(), searchableAnnotation.isQueryByDefault()));
searchFieldConfig.boost() * partialConfiguration.getFactor(),
searchableAnnotation.getFieldType(), searchableAnnotation.isQueryByDefault()));

if (SearchFieldConfig.detectSubFieldType(fieldSpec).hasWordGramSubfields()) {
fields.add(SearchFieldConfig.builder()
Expand Down Expand Up @@ -187,6 +217,20 @@ private Set<SearchFieldConfig> getStandardFields(@Nonnull EntitySpec entitySpec)
}
}
}
return fields;
}

/**
 * Collects the searchable field configs for a single entity spec.
 *
 * <p>The result always contains the {@code urn} and {@code urn.delimited} configs, followed by
 * everything {@link #getFieldsFromEntitySpec(EntitySpec)} returns for the given spec.
 *
 * @param entitySpec entity spec to extract searchable fields from
 * @return a mutable set holding the urn configs plus the entity's own field configs
 */
private Set<SearchFieldConfig> getStandardFields(@Nonnull EntitySpec entitySpec) {
  // The urn boost is stored as a String property, so it has to be parsed before use.
  final String rawUrnBoost = (String) PRIMARY_URN_SEARCH_PROPERTIES.get("boostScore");
  final float baseUrnBoost = Float.parseFloat(rawUrnBoost);

  final Set<SearchFieldConfig> result = new HashSet<>();

  // urn fields are added unconditionally, independent of the entity spec contents.
  result.add(
      SearchFieldConfig.detectSubFieldType("urn", baseUrnBoost, SearchableAnnotation.FieldType.URN, true));
  result.add(
      SearchFieldConfig.detectSubFieldType(
          "urn.delimited",
          baseUrnBoost * partialConfiguration.getUrnFactor(),
          SearchableAnnotation.FieldType.URN,
          true));

  // Entity-specific searchable fields.
  result.addAll(getFieldsFromEntitySpec(entitySpec));
  return result;
}
Expand Down Expand Up @@ -255,49 +299,42 @@ private Optional<QueryBuilder> getPrefixAndExactMatchQuery(@Nullable QueryConfig
BoolQueryBuilder finalQuery = QueryBuilders.boolQuery();
String unquotedQuery = unquote(query);

entitySpecs.stream()
.map(this::getStandardFields)
.flatMap(Set::stream)
.filter(SearchFieldConfig::isQueryByDefault)
.forEach(searchFieldConfig -> {

if (searchFieldConfig.isDelimitedSubfield() && isPrefixQuery) {
finalQuery.should(QueryBuilders.matchPhrasePrefixQuery(searchFieldConfig.fieldName(), query)
.boost(searchFieldConfig.boost()
* exactMatchConfiguration.getPrefixFactor()
* exactMatchConfiguration.getCaseSensitivityFactor())
.queryName(searchFieldConfig.shortName())); // less than exact
}

if (searchFieldConfig.isKeyword() && isExactQuery) {
// It is important to use the subfield .keyword (it uses a different normalizer)
// The non-.keyword field removes case information

// Exact match case-sensitive
finalQuery.should(QueryBuilders
.termQuery(ESUtils.toKeywordField(searchFieldConfig.fieldName(), false), unquotedQuery)
.caseInsensitive(false)
.boost(searchFieldConfig.boost()
* exactMatchConfiguration.getExactFactor())
.queryName(searchFieldConfig.shortName()));

// Exact match case-insensitive
finalQuery.should(QueryBuilders
.termQuery(ESUtils.toKeywordField(searchFieldConfig.fieldName(), false), unquotedQuery)
.caseInsensitive(true)
.boost(searchFieldConfig.boost()
* exactMatchConfiguration.getExactFactor()
* exactMatchConfiguration.getCaseSensitivityFactor())
.queryName(searchFieldConfig.fieldName()));
}

if (searchFieldConfig.isWordGramSubfield() && isPrefixQuery) {
finalQuery.should(QueryBuilders
.matchPhraseQuery(ESUtils.toKeywordField(searchFieldConfig.fieldName(), false), unquotedQuery)
getStandardFields(entitySpecs).forEach(searchFieldConfig -> {
if (searchFieldConfig.isDelimitedSubfield() && isPrefixQuery) {
finalQuery.should(QueryBuilders.matchPhrasePrefixQuery(searchFieldConfig.fieldName(), query)
.boost(searchFieldConfig.boost() * exactMatchConfiguration.getPrefixFactor()
* exactMatchConfiguration.getCaseSensitivityFactor())
.queryName(searchFieldConfig.shortName())); // less than exact
}

if (searchFieldConfig.isKeyword() && isExactQuery) {
// It is important to use the subfield .keyword (it uses a different normalizer)
// The non-.keyword field removes case information

// Exact match case-sensitive
finalQuery.should(
QueryBuilders.termQuery(ESUtils.toKeywordField(searchFieldConfig.fieldName(), false), unquotedQuery)
.caseInsensitive(false)
.boost(searchFieldConfig.boost() * exactMatchConfiguration.getExactFactor())
.queryName(searchFieldConfig.shortName()));

// Exact match case-insensitive
finalQuery.should(
QueryBuilders.termQuery(ESUtils.toKeywordField(searchFieldConfig.fieldName(), false), unquotedQuery)
.caseInsensitive(true)
.boost(searchFieldConfig.boost() * exactMatchConfiguration.getExactFactor()
* exactMatchConfiguration.getCaseSensitivityFactor())
.queryName(searchFieldConfig.fieldName()));
}

if (searchFieldConfig.isWordGramSubfield() && isPrefixQuery) {
finalQuery.should(
QueryBuilders.matchPhraseQuery(ESUtils.toKeywordField(searchFieldConfig.fieldName(), false),
unquotedQuery)
.boost(searchFieldConfig.boost() * getWordGramFactor(searchFieldConfig.fieldName()))
.queryName(searchFieldConfig.shortName()));
}
});
}
});

return finalQuery.should().size() > 0 ? Optional.of(finalQuery) : Optional.empty();
}
Expand Down
Loading

0 comments on commit cf16684

Please sign in to comment.