Skip to content

Commit

Permalink
feat(search): Add word gram analyzer for name fields (#8611)
Browse files Browse the repository at this point in the history
Co-authored-by: Indy Prentice <[email protected]>
  • Loading branch information
2 people authored and yoonhyejin committed Aug 24, 2023
1 parent 20e179c commit 349b88c
Show file tree
Hide file tree
Showing 53 changed files with 453 additions and 111 deletions.
2 changes: 1 addition & 1 deletion docs/advanced/no-code-modeling.md
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ record ServiceKey {
* Name of the service
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true
}
name: string
Expand Down
28 changes: 17 additions & 11 deletions docs/modeling/extending-the-metadata-model.md
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,7 @@ It takes the following parameters:
annotations. To customize the set of analyzers used to index a certain field, you must add a new field type and define
the set of mappings to be applied in the MappingsBuilder.

Thus far, we have implemented 10 fieldTypes:
Thus far, we have implemented 11 fieldTypes:

1. _KEYWORD_ - Short text fields that only support exact matches, often used only for filtering

Expand All @@ -336,21 +336,27 @@ It takes the following parameters:
3. _TEXT_PARTIAL_ - Text fields delimited by spaces/slashes/periods with partial matching support. Note, partial
matching is expensive, so this field type should not be applied to fields with long values (like description)

4. _BROWSE_PATH_ - Field type for browse paths. Applies specific mappings for slash delimited paths.
4. *WORD_GRAM* - Text fields delimited by spaces, slashes, periods, dashes, or underscores with partial matching AND
word gram support. That is, the text will be split by the delimiters and can be matched with delimited queries
matching two, three, or four length tokens in addition to single tokens. As with partial match, this type is
expensive, so should not be applied to fields with long values such as description.

5. _URN_ - Urn fields where each sub-component inside the urn is indexed. For instance, for a data platform urn like
"urn:li:dataplatform:kafka", it will index the platform name "kafka" and ignore the common components
5. *BROWSE_PATH* - Field type for browse paths. Applies specific mappings for slash delimited paths.

6. _URN_PARTIAL_ - Urn fields where each sub-component inside the urn is indexed with partial matching support.
6. *URN* - Urn fields where each sub-component inside the urn is indexed. For instance, for a data platform urn like
"urn:li:dataplatform:kafka", it will index the platform name "kafka" and ignore the common components

7. _BOOLEAN_ - Boolean fields used for filtering.
7. *URN_PARTIAL* - Urn fields where each sub-component inside the urn is indexed with partial matching support.

8. _COUNT_ - Count fields used for filtering.
9. _DATETIME_ - Datetime fields used to represent timestamps.
8. *BOOLEAN* - Boolean fields used for filtering.

10. _OBJECT_ - Each property in an object will become an extra column in Elasticsearch and can be referenced as
`field.property` in queries. You should be careful to not use it on objects with many properties as it can cause a
mapping explosion in Elasticsearch.
9. *COUNT* - Count fields used for filtering.

10. *DATETIME* - Datetime fields used to represent timestamps.

11. *OBJECT* - Each property in an object will become an extra column in Elasticsearch and can be referenced as
`field.property` in queries. You should be careful to not use it on objects with many properties as it can cause a
mapping explosion in Elasticsearch.

- **fieldName**: string (optional) - The name of the field in search index document. Defaults to the field name where
the annotation resides.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ public class SearchableAnnotation {

public static final String ANNOTATION_NAME = "Searchable";
private static final Set<FieldType> DEFAULT_QUERY_FIELD_TYPES =
ImmutableSet.of(FieldType.TEXT, FieldType.TEXT_PARTIAL, FieldType.URN, FieldType.URN_PARTIAL);
ImmutableSet.of(FieldType.TEXT, FieldType.TEXT_PARTIAL, FieldType.WORD_GRAM, FieldType.URN, FieldType.URN_PARTIAL);

// Name of the field in the search index. Defaults to the field name in the schema
String fieldName;
Expand Down Expand Up @@ -59,7 +59,8 @@ public enum FieldType {
COUNT,
DATETIME,
OBJECT,
BROWSE_PATH_V2
BROWSE_PATH_V2,
WORD_GRAM
}

@Nonnull
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ private void validateTestEntityInfo(final AspectSpec testEntityInfo) {
assertEquals(new TestEntityInfo().schema().getFullName(), testEntityInfo.getPegasusSchema().getFullName());

// Assert on Searchable Fields
assertEquals(9, testEntityInfo.getSearchableFieldSpecs().size());
assertEquals(testEntityInfo.getSearchableFieldSpecs().size(), 10);
assertEquals("customProperties", testEntityInfo.getSearchableFieldSpecMap().get(
new PathSpec("customProperties").toString()).getSearchableAnnotation().getFieldName());
assertEquals(SearchableAnnotation.FieldType.KEYWORD, testEntityInfo.getSearchableFieldSpecMap().get(
Expand All @@ -158,6 +158,11 @@ private void validateTestEntityInfo(final AspectSpec testEntityInfo) {
assertEquals(SearchableAnnotation.FieldType.TEXT_PARTIAL, testEntityInfo.getSearchableFieldSpecMap().get(
new PathSpec("textArrayField", "*").toString())
.getSearchableAnnotation().getFieldType());
assertEquals("wordGramField", testEntityInfo.getSearchableFieldSpecMap().get(
new PathSpec("wordGramField").toString()).getSearchableAnnotation().getFieldName());
assertEquals(SearchableAnnotation.FieldType.WORD_GRAM, testEntityInfo.getSearchableFieldSpecMap().get(
new PathSpec("wordGramField").toString())
.getSearchableAnnotation().getFieldType());
assertEquals("nestedIntegerField", testEntityInfo.getSearchableFieldSpecMap().get(
new PathSpec("nestedRecordField", "nestedIntegerField").toString()).getSearchableAnnotation().getFieldName());
assertEquals(SearchableAnnotation.FieldType.COUNT, testEntityInfo.getSearchableFieldSpecMap().get(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ public static Map<String, String> getPartialNgramConfigWithOverrides(Map<String,
// Subfields
public static final String DELIMITED = "delimited";
public static final String LENGTH = "length";
public static final String WORD_GRAMS_LENGTH_2 = "wordGrams2";
public static final String WORD_GRAMS_LENGTH_3 = "wordGrams3";
public static final String WORD_GRAMS_LENGTH_4 = "wordGrams4";

// Private constructor: MappingsBuilder is a static utility class (all members are
// static constants/methods) and must never be instantiated.
private MappingsBuilder() {
}
Expand Down Expand Up @@ -94,16 +97,30 @@ private static Map<String, Object> getMappingsForField(@Nonnull final Searchable
mappingForField.put(NORMALIZER, KEYWORD_NORMALIZER);
// Add keyword subfield without lowercase filter
mappingForField.put(FIELDS, ImmutableMap.of(KEYWORD, KEYWORD_TYPE_MAP));
} else if (fieldType == FieldType.TEXT || fieldType == FieldType.TEXT_PARTIAL) {
} else if (fieldType == FieldType.TEXT || fieldType == FieldType.TEXT_PARTIAL || fieldType == FieldType.WORD_GRAM) {
mappingForField.put(TYPE, KEYWORD);
mappingForField.put(NORMALIZER, KEYWORD_NORMALIZER);
Map<String, Object> subFields = new HashMap<>();
if (fieldType == FieldType.TEXT_PARTIAL) {
if (fieldType == FieldType.TEXT_PARTIAL || fieldType == FieldType.WORD_GRAM) {
subFields.put(NGRAM, getPartialNgramConfigWithOverrides(
ImmutableMap.of(
ANALYZER, PARTIAL_ANALYZER
)
));
if (fieldType == FieldType.WORD_GRAM) {
for (Map.Entry<String, String> entry : Map.of(
WORD_GRAMS_LENGTH_2, WORD_GRAM_2_ANALYZER,
WORD_GRAMS_LENGTH_3, WORD_GRAM_3_ANALYZER,
WORD_GRAMS_LENGTH_4, WORD_GRAM_4_ANALYZER).entrySet()) {
String fieldName = entry.getKey();
String analyzerName = entry.getValue();
subFields.put(fieldName, ImmutableMap.of(
TYPE, TEXT,
ANALYZER, analyzerName,
SEARCH_ANALYZER, analyzerName
));
}
}
}
subFields.put(DELIMITED, ImmutableMap.of(
TYPE, TEXT,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ public class SettingsBuilder {
public static final String KEYWORD_ANALYZER = "keyword";
public static final String URN_ANALYZER = "urn_component";
public static final String URN_SEARCH_ANALYZER = "query_urn_component";
public static final String WORD_GRAM_2_ANALYZER = "word_gram_2";
public static final String WORD_GRAM_3_ANALYZER = "word_gram_3";
public static final String WORD_GRAM_4_ANALYZER = "word_gram_4";

// Filters
public static final String ALPHANUM_SPACE_ONLY = "alpha_num_space";
Expand All @@ -80,6 +83,10 @@ public class SettingsBuilder {
public static final String MULTIFILTER = "multifilter";
public static final String MULTIFILTER_GRAPH = "multifilter_graph";
public static final String PARTIAL_URN_COMPONENT = "partial_urn_component";
public static final String SHINGLE = "shingle";
public static final String WORD_GRAM_2_FILTER = "word_gram_2_filter";
public static final String WORD_GRAM_3_FILTER = "word_gram_3_filter";
public static final String WORD_GRAM_4_FILTER = "word_gram_4_filter";
public static final String SNOWBALL = "snowball";
public static final String STEM_OVERRIDE = "stem_override";
public static final String STOP = "stop";
Expand Down Expand Up @@ -108,6 +115,7 @@ public class SettingsBuilder {
public static final String SLASH_TOKENIZER = "slash_tokenizer";
public static final String UNIT_SEPARATOR_PATH_TOKENIZER = "unit_separator_path_tokenizer";
public static final String UNIT_SEPARATOR_TOKENIZER = "unit_separator_tokenizer";
public static final String WORD_GRAM_TOKENIZER = "word_gram_tokenizer";
// Do not remove the space, needed for multi-term synonyms
public static final List<String> ALPHANUM_SPACE_PATTERNS = ImmutableList.of(
"([a-z0-9 _-]{2,})",
Expand Down Expand Up @@ -161,6 +169,13 @@ public class SettingsBuilder {
AUTOCOMPLETE_CUSTOM_DELIMITER,
LOWERCASE);

public static final List<String> WORD_GRAM_TOKEN_FILTERS = ImmutableList.of(
ASCII_FOLDING,
LOWERCASE,
TRIM,
REMOVE_QUOTES
);

public final Map<String, Object> settings;

public SettingsBuilder(String mainTokenizer) {
Expand Down Expand Up @@ -275,6 +290,17 @@ private static Map<String, Object> buildFilters() throws IOException {
.collect(Collectors.toList()))
.build());
}

for (Map.Entry<String, Integer> entry : Map.of(WORD_GRAM_2_FILTER, 2, WORD_GRAM_3_FILTER, 3, WORD_GRAM_4_FILTER, 4).entrySet()) {
String filterName = entry.getKey();
Integer gramSize = entry.getValue();
filters.put(filterName, ImmutableMap.<String, Object>builder()
.put(TYPE, SHINGLE)
.put("min_shingle_size", gramSize)
.put("max_shingle_size", gramSize)
.put("output_unigrams", false)
.build());
}
}

return filters.build();
Expand Down Expand Up @@ -302,13 +328,24 @@ private static Map<String, Object> buildTokenizers() {
.put(DELIMITER, "␟")
.build());

// Tokenize by whitespace and most special chars
// Tokenize by most special chars
// Do NOT tokenize by whitespace to keep multi-word synonyms in the same token
// The split by whitespace is done later in the token filters phase
tokenizers.put(MAIN_TOKENIZER,
ImmutableMap.<String, Object>builder()
.put(TYPE, PATTERN)
.put(PATTERN, "[(),./:]")
.build());

// Tokenize by whitespace and most special chars for wordgrams
// only split on - when not preceded by a whitespace to preserve exclusion functionality
// i.e. "logging-events-bkcp" and "logging-events -bkcp" should be handled differently
tokenizers.put(WORD_GRAM_TOKENIZER,
ImmutableMap.<String, Object>builder()
.put(TYPE, PATTERN)
.put(PATTERN, "[(),./:\\s_]|(?<=\\S)(-)")
.build());

return tokenizers.build();
}

Expand Down Expand Up @@ -382,6 +419,21 @@ private static Map<String, Object> buildAnalyzers(String mainTokenizer) {
.put(FILTER, SEARCH_TOKEN_FILTERS)
.build());

// Support word grams
for (Map.Entry<String, String> entry : Map.of(
WORD_GRAM_2_ANALYZER, WORD_GRAM_2_FILTER,
WORD_GRAM_3_ANALYZER, WORD_GRAM_3_FILTER,
WORD_GRAM_4_ANALYZER, WORD_GRAM_4_FILTER).entrySet()) {
String analyzerName = entry.getKey();
String filterName = entry.getValue();
analyzers.put(analyzerName, ImmutableMap.<String, Object>builder()
.put(TOKENIZER, WORD_GRAM_TOKENIZER)
.put(FILTER, ImmutableList.<Object>builder()
.addAll(WORD_GRAM_TOKEN_FILTERS)
.add(filterName).build())
.build());
}

// For special analysis, the substitution can be read from the configuration (chinese tokenizer: ik_smart / smartCN)
// Analyzer for partial matching (i.e. autocomplete) - Prefix matching of each token
analyzers.put(PARTIAL_ANALYZER, ImmutableMap.<String, Object>builder()
Expand All @@ -395,6 +447,7 @@ private static Map<String, Object> buildAnalyzers(String mainTokenizer) {
.put(FILTER, PARTIAL_AUTOCOMPLETE_TOKEN_FILTERS)
.build());


return analyzers.build();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,8 @@

import java.util.Set;

import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.BROWSE_PATH_HIERARCHY_ANALYZER;
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.BROWSE_PATH_V2_HIERARCHY_ANALYZER;
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.KEYWORD_ANALYZER;
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.TEXT_SEARCH_ANALYZER;
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.URN_SEARCH_ANALYZER;
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.*;


@Builder
@Getter
Expand All @@ -33,7 +30,8 @@ public class SearchFieldConfig {
private static final Set<SearchableAnnotation.FieldType> TYPES_WITH_DELIMITED_SUBFIELD =
Set.of(
SearchableAnnotation.FieldType.TEXT,
SearchableAnnotation.FieldType.TEXT_PARTIAL
SearchableAnnotation.FieldType.TEXT_PARTIAL,
SearchableAnnotation.FieldType.WORD_GRAM
// NOT URN_PARTIAL (urn field is special)
);
// NOT comprehensive
Expand All @@ -56,6 +54,7 @@ public class SearchFieldConfig {
SearchableAnnotation.FieldType.TEXT,
SearchableAnnotation.FieldType.TEXT_PARTIAL,
SearchableAnnotation.FieldType.KEYWORD,
SearchableAnnotation.FieldType.WORD_GRAM,
// not analyzed
SearchableAnnotation.FieldType.BOOLEAN,
SearchableAnnotation.FieldType.COUNT,
Expand All @@ -69,6 +68,11 @@ public class SearchFieldConfig {
SearchableAnnotation.FieldType.URN_PARTIAL
);

public static final Set<SearchableAnnotation.FieldType> TYPES_WITH_WORD_GRAM =
Set.of(
SearchableAnnotation.FieldType.WORD_GRAM
);

@Nonnull
private final String fieldName;
@Nonnull
Expand All @@ -78,9 +82,11 @@ public class SearchFieldConfig {
private final String analyzer;
private boolean hasKeywordSubfield;
private boolean hasDelimitedSubfield;
private boolean hasWordGramSubfields;
private boolean isQueryByDefault;
private boolean isDelimitedSubfield;
private boolean isKeywordSubfield;
private boolean isWordGramSubfield;

public static SearchFieldConfig detectSubFieldType(@Nonnull SearchableFieldSpec fieldSpec) {
final SearchableAnnotation searchableAnnotation = fieldSpec.getSearchableAnnotation();
Expand All @@ -106,6 +112,7 @@ public static SearchFieldConfig detectSubFieldType(String fieldName,
.analyzer(getAnalyzer(fieldName, fieldType))
.hasKeywordSubfield(hasKeywordSubfield(fieldName, fieldType))
.hasDelimitedSubfield(hasDelimitedSubfield(fieldName, fieldType))
.hasWordGramSubfields(hasWordGramSubfields(fieldName, fieldType))
.isQueryByDefault(isQueryByDefault)
.build();
}
Expand All @@ -118,6 +125,11 @@ private static boolean hasDelimitedSubfield(String fieldName, SearchableAnnotati
return !fieldName.contains(".")
&& ("urn".equals(fieldName) || TYPES_WITH_DELIMITED_SUBFIELD.contains(fieldType));
}

/**
 * Returns whether the given field should be indexed with word-gram subfields
 * (wordGrams2/3/4). Only top-level fields (no dotted sub-path) whose type is
 * listed in {@code TYPES_WITH_WORD_GRAM} qualify.
 */
private static boolean hasWordGramSubfields(String fieldName, SearchableAnnotation.FieldType fieldType) {
  // Subfields (names containing '.') never get their own word-gram subfields.
  if (fieldName.contains(".")) {
    return false;
  }
  return TYPES_WITH_WORD_GRAM.contains(fieldType);
}
private static boolean hasKeywordSubfield(String fieldName, SearchableAnnotation.FieldType fieldType) {
return !"urn".equals(fieldName)
&& !fieldName.contains(".")
Expand Down Expand Up @@ -155,6 +167,7 @@ public SearchFieldConfigBuilder fieldName(@Nonnull String fieldName) {
this.fieldName = fieldName;
isDelimitedSubfield(fieldName.endsWith(".delimited"));
isKeywordSubfield(fieldName.endsWith(".keyword"));
isWordGramSubfield(fieldName.contains("wordGrams"));
shortName(fieldName.split("[.]")[0]);
return this;
}
Expand Down
Loading

0 comments on commit 349b88c

Please sign in to comment.