Skip to content

Commit

Permalink
feat(search): Add word gram analyzer for name fields (#8611)
Browse files Browse the repository at this point in the history
Co-authored-by: Indy Prentice <[email protected]>
  • Loading branch information
2 people authored and yoonhyejin committed Aug 24, 2023
1 parent 20e179c commit 349b88c
Show file tree
Hide file tree
Showing 53 changed files with 453 additions and 111 deletions.
2 changes: 1 addition & 1 deletion docs/advanced/no-code-modeling.md
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ record ServiceKey {
* Name of the service
*/
@Searchable = {
"fieldType": "TEXT_PARTIAL",
"fieldType": "WORD_GRAM",
"enableAutocomplete": true
}
name: string
Expand Down
28 changes: 17 additions & 11 deletions docs/modeling/extending-the-metadata-model.md
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,7 @@ It takes the following parameters:
annotations. To customize the set of analyzers used to index a certain field, you must add a new field type and define
the set of mappings to be applied in the MappingsBuilder.

Thus far, we have implemented 10 fieldTypes:
Thus far, we have implemented 11 fieldTypes:

1. _KEYWORD_ - Short text fields that only support exact matches, often used only for filtering

Expand All @@ -336,21 +336,27 @@ It takes the following parameters:
3. _TEXT_PARTIAL_ - Text fields delimited by spaces/slashes/periods with partial matching support. Note, partial
matching is expensive, so this field type should not be applied to fields with long values (like description)

4. _BROWSE_PATH_ - Field type for browse paths. Applies specific mappings for slash delimited paths.
4. *WORD_GRAM* - Text fields delimited by spaces, slashes, periods, dashes, or underscores with partial matching AND
word gram support. That is, the text will be split by the delimiters and can be matched with delimited queries
matching two, three, or four length tokens in addition to single tokens. As with partial match, this type is
expensive, so should not be applied to fields with long values such as description.

5. _URN_ - Urn fields where each sub-component inside the urn is indexed. For instance, for a data platform urn like
"urn:li:dataplatform:kafka", it will index the platform name "kafka" and ignore the common components
5. *BROWSE_PATH* - Field type for browse paths. Applies specific mappings for slash delimited paths.

6. _URN_PARTIAL_ - Urn fields where each sub-component inside the urn is indexed with partial matching support.
6. *URN* - Urn fields where each sub-component inside the urn is indexed. For instance, for a data platform urn like
"urn:li:dataplatform:kafka", it will index the platform name "kafka" and ignore the common components

7. _BOOLEAN_ - Boolean fields used for filtering.
7. *URN_PARTIAL* - Urn fields where each sub-component inside the urn is indexed with partial matching support.

8. _COUNT_ - Count fields used for filtering.
9. _DATETIME_ - Datetime fields used to represent timestamps.
8. *BOOLEAN* - Boolean fields used for filtering.

10. _OBJECT_ - Each property in an object will become an extra column in Elasticsearch and can be referenced as
`field.property` in queries. You should be careful to not use it on objects with many properties as it can cause a
mapping explosion in Elasticsearch.
9. *COUNT* - Count fields used for filtering.

10. *DATETIME* - Datetime fields used to represent timestamps.

11. *OBJECT* - Each property in an object will become an extra column in Elasticsearch and can be referenced as
`field.property` in queries. You should be careful to not use it on objects with many properties as it can cause a
mapping explosion in Elasticsearch.

- **fieldName**: string (optional) - The name of the field in search index document. Defaults to the field name where
the annotation resides.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ public class SearchableAnnotation {

public static final String ANNOTATION_NAME = "Searchable";
private static final Set<FieldType> DEFAULT_QUERY_FIELD_TYPES =
ImmutableSet.of(FieldType.TEXT, FieldType.TEXT_PARTIAL, FieldType.URN, FieldType.URN_PARTIAL);
ImmutableSet.of(FieldType.TEXT, FieldType.TEXT_PARTIAL, FieldType.WORD_GRAM, FieldType.URN, FieldType.URN_PARTIAL);

// Name of the field in the search index. Defaults to the field name in the schema
String fieldName;
Expand Down Expand Up @@ -59,7 +59,8 @@ public enum FieldType {
COUNT,
DATETIME,
OBJECT,
BROWSE_PATH_V2
BROWSE_PATH_V2,
WORD_GRAM
}

@Nonnull
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ private void validateTestEntityInfo(final AspectSpec testEntityInfo) {
assertEquals(new TestEntityInfo().schema().getFullName(), testEntityInfo.getPegasusSchema().getFullName());

// Assert on Searchable Fields
assertEquals(9, testEntityInfo.getSearchableFieldSpecs().size());
assertEquals(testEntityInfo.getSearchableFieldSpecs().size(), 10);
assertEquals("customProperties", testEntityInfo.getSearchableFieldSpecMap().get(
new PathSpec("customProperties").toString()).getSearchableAnnotation().getFieldName());
assertEquals(SearchableAnnotation.FieldType.KEYWORD, testEntityInfo.getSearchableFieldSpecMap().get(
Expand All @@ -158,6 +158,11 @@ private void validateTestEntityInfo(final AspectSpec testEntityInfo) {
assertEquals(SearchableAnnotation.FieldType.TEXT_PARTIAL, testEntityInfo.getSearchableFieldSpecMap().get(
new PathSpec("textArrayField", "*").toString())
.getSearchableAnnotation().getFieldType());
assertEquals("wordGramField", testEntityInfo.getSearchableFieldSpecMap().get(
new PathSpec("wordGramField").toString()).getSearchableAnnotation().getFieldName());
assertEquals(SearchableAnnotation.FieldType.WORD_GRAM, testEntityInfo.getSearchableFieldSpecMap().get(
new PathSpec("wordGramField").toString())
.getSearchableAnnotation().getFieldType());
assertEquals("nestedIntegerField", testEntityInfo.getSearchableFieldSpecMap().get(
new PathSpec("nestedRecordField", "nestedIntegerField").toString()).getSearchableAnnotation().getFieldName());
assertEquals(SearchableAnnotation.FieldType.COUNT, testEntityInfo.getSearchableFieldSpecMap().get(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ public static Map<String, String> getPartialNgramConfigWithOverrides(Map<String,
// Subfields
public static final String DELIMITED = "delimited";
public static final String LENGTH = "length";
public static final String WORD_GRAMS_LENGTH_2 = "wordGrams2";
public static final String WORD_GRAMS_LENGTH_3 = "wordGrams3";
public static final String WORD_GRAMS_LENGTH_4 = "wordGrams4";

// Private constructor: MappingsBuilder is a static utility class (all members are
// static constants/methods) and must never be instantiated.
private MappingsBuilder() {
}
Expand Down Expand Up @@ -94,16 +97,30 @@ private static Map<String, Object> getMappingsForField(@Nonnull final Searchable
mappingForField.put(NORMALIZER, KEYWORD_NORMALIZER);
// Add keyword subfield without lowercase filter
mappingForField.put(FIELDS, ImmutableMap.of(KEYWORD, KEYWORD_TYPE_MAP));
} else if (fieldType == FieldType.TEXT || fieldType == FieldType.TEXT_PARTIAL) {
} else if (fieldType == FieldType.TEXT || fieldType == FieldType.TEXT_PARTIAL || fieldType == FieldType.WORD_GRAM) {
mappingForField.put(TYPE, KEYWORD);
mappingForField.put(NORMALIZER, KEYWORD_NORMALIZER);
Map<String, Object> subFields = new HashMap<>();
if (fieldType == FieldType.TEXT_PARTIAL) {
if (fieldType == FieldType.TEXT_PARTIAL || fieldType == FieldType.WORD_GRAM) {
subFields.put(NGRAM, getPartialNgramConfigWithOverrides(
ImmutableMap.of(
ANALYZER, PARTIAL_ANALYZER
)
));
if (fieldType == FieldType.WORD_GRAM) {
for (Map.Entry<String, String> entry : Map.of(
WORD_GRAMS_LENGTH_2, WORD_GRAM_2_ANALYZER,
WORD_GRAMS_LENGTH_3, WORD_GRAM_3_ANALYZER,
WORD_GRAMS_LENGTH_4, WORD_GRAM_4_ANALYZER).entrySet()) {
String fieldName = entry.getKey();
String analyzerName = entry.getValue();
subFields.put(fieldName, ImmutableMap.of(
TYPE, TEXT,
ANALYZER, analyzerName,
SEARCH_ANALYZER, analyzerName
));
}
}
}
subFields.put(DELIMITED, ImmutableMap.of(
TYPE, TEXT,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ public class SettingsBuilder {
public static final String KEYWORD_ANALYZER = "keyword";
public static final String URN_ANALYZER = "urn_component";
public static final String URN_SEARCH_ANALYZER = "query_urn_component";
public static final String WORD_GRAM_2_ANALYZER = "word_gram_2";
public static final String WORD_GRAM_3_ANALYZER = "word_gram_3";
public static final String WORD_GRAM_4_ANALYZER = "word_gram_4";

// Filters
public static final String ALPHANUM_SPACE_ONLY = "alpha_num_space";
Expand All @@ -80,6 +83,10 @@ public class SettingsBuilder {
public static final String MULTIFILTER = "multifilter";
public static final String MULTIFILTER_GRAPH = "multifilter_graph";
public static final String PARTIAL_URN_COMPONENT = "partial_urn_component";
public static final String SHINGLE = "shingle";
public static final String WORD_GRAM_2_FILTER = "word_gram_2_filter";
public static final String WORD_GRAM_3_FILTER = "word_gram_3_filter";
public static final String WORD_GRAM_4_FILTER = "word_gram_4_filter";
public static final String SNOWBALL = "snowball";
public static final String STEM_OVERRIDE = "stem_override";
public static final String STOP = "stop";
Expand Down Expand Up @@ -108,6 +115,7 @@ public class SettingsBuilder {
public static final String SLASH_TOKENIZER = "slash_tokenizer";
public static final String UNIT_SEPARATOR_PATH_TOKENIZER = "unit_separator_path_tokenizer";
public static final String UNIT_SEPARATOR_TOKENIZER = "unit_separator_tokenizer";
public static final String WORD_GRAM_TOKENIZER = "word_gram_tokenizer";
// Do not remove the space, needed for multi-term synonyms
public static final List<String> ALPHANUM_SPACE_PATTERNS = ImmutableList.of(
"([a-z0-9 _-]{2,})",
Expand Down Expand Up @@ -161,6 +169,13 @@ public class SettingsBuilder {
AUTOCOMPLETE_CUSTOM_DELIMITER,
LOWERCASE);

public static final List<String> WORD_GRAM_TOKEN_FILTERS = ImmutableList.of(
ASCII_FOLDING,
LOWERCASE,
TRIM,
REMOVE_QUOTES
);

public final Map<String, Object> settings;

public SettingsBuilder(String mainTokenizer) {
Expand Down Expand Up @@ -275,6 +290,17 @@ private static Map<String, Object> buildFilters() throws IOException {
.collect(Collectors.toList()))
.build());
}

for (Map.Entry<String, Integer> entry : Map.of(WORD_GRAM_2_FILTER, 2, WORD_GRAM_3_FILTER, 3, WORD_GRAM_4_FILTER, 4).entrySet()) {
String filterName = entry.getKey();
Integer gramSize = entry.getValue();
filters.put(filterName, ImmutableMap.<String, Object>builder()
.put(TYPE, SHINGLE)
.put("min_shingle_size", gramSize)
.put("max_shingle_size", gramSize)
.put("output_unigrams", false)
.build());
}
}

return filters.build();
Expand Down Expand Up @@ -302,13 +328,24 @@ private static Map<String, Object> buildTokenizers() {
.put(DELIMITER, "␟")
.build());

// Tokenize by whitespace and most special chars
// Tokenize by most special chars
// Do NOT tokenize by whitespace to keep multi-word synonyms in the same token
// The split by whitespace is done later in the token filters phase
tokenizers.put(MAIN_TOKENIZER,
ImmutableMap.<String, Object>builder()
.put(TYPE, PATTERN)
.put(PATTERN, "[(),./:]")
.build());

// Tokenize by whitespace and most special chars for wordgrams
// only split on - when not preceded by a whitespace to preserve exclusion functionality
// i.e. "logging-events-bkcp" and "logging-events -bkcp" should be handled differently
tokenizers.put(WORD_GRAM_TOKENIZER,
ImmutableMap.<String, Object>builder()
.put(TYPE, PATTERN)
.put(PATTERN, "[(),./:\\s_]|(?<=\\S)(-)")
.build());

return tokenizers.build();
}

Expand Down Expand Up @@ -382,6 +419,21 @@ private static Map<String, Object> buildAnalyzers(String mainTokenizer) {
.put(FILTER, SEARCH_TOKEN_FILTERS)
.build());

// Support word grams
for (Map.Entry<String, String> entry : Map.of(
WORD_GRAM_2_ANALYZER, WORD_GRAM_2_FILTER,
WORD_GRAM_3_ANALYZER, WORD_GRAM_3_FILTER,
WORD_GRAM_4_ANALYZER, WORD_GRAM_4_FILTER).entrySet()) {
String analyzerName = entry.getKey();
String filterName = entry.getValue();
analyzers.put(analyzerName, ImmutableMap.<String, Object>builder()
.put(TOKENIZER, WORD_GRAM_TOKENIZER)
.put(FILTER, ImmutableList.<Object>builder()
.addAll(WORD_GRAM_TOKEN_FILTERS)
.add(filterName).build())
.build());
}

// For special analysis, the substitution can be read from the configuration (chinese tokenizer: ik_smart / smartCN)
// Analyzer for partial matching (i.e. autocomplete) - Prefix matching of each token
analyzers.put(PARTIAL_ANALYZER, ImmutableMap.<String, Object>builder()
Expand All @@ -395,6 +447,7 @@ private static Map<String, Object> buildAnalyzers(String mainTokenizer) {
.put(FILTER, PARTIAL_AUTOCOMPLETE_TOKEN_FILTERS)
.build());


return analyzers.build();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,8 @@

import java.util.Set;

import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.BROWSE_PATH_HIERARCHY_ANALYZER;
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.BROWSE_PATH_V2_HIERARCHY_ANALYZER;
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.KEYWORD_ANALYZER;
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.TEXT_SEARCH_ANALYZER;
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.URN_SEARCH_ANALYZER;
import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.*;


@Builder
@Getter
Expand All @@ -33,7 +30,8 @@ public class SearchFieldConfig {
private static final Set<SearchableAnnotation.FieldType> TYPES_WITH_DELIMITED_SUBFIELD =
Set.of(
SearchableAnnotation.FieldType.TEXT,
SearchableAnnotation.FieldType.TEXT_PARTIAL
SearchableAnnotation.FieldType.TEXT_PARTIAL,
SearchableAnnotation.FieldType.WORD_GRAM
// NOT URN_PARTIAL (urn field is special)
);
// NOT comprehensive
Expand All @@ -56,6 +54,7 @@ public class SearchFieldConfig {
SearchableAnnotation.FieldType.TEXT,
SearchableAnnotation.FieldType.TEXT_PARTIAL,
SearchableAnnotation.FieldType.KEYWORD,
SearchableAnnotation.FieldType.WORD_GRAM,
// not analyzed
SearchableAnnotation.FieldType.BOOLEAN,
SearchableAnnotation.FieldType.COUNT,
Expand All @@ -69,6 +68,11 @@ public class SearchFieldConfig {
SearchableAnnotation.FieldType.URN_PARTIAL
);

public static final Set<SearchableAnnotation.FieldType> TYPES_WITH_WORD_GRAM =
Set.of(
SearchableAnnotation.FieldType.WORD_GRAM
);

@Nonnull
private final String fieldName;
@Nonnull
Expand All @@ -78,9 +82,11 @@ public class SearchFieldConfig {
private final String analyzer;
private boolean hasKeywordSubfield;
private boolean hasDelimitedSubfield;
private boolean hasWordGramSubfields;
private boolean isQueryByDefault;
private boolean isDelimitedSubfield;
private boolean isKeywordSubfield;
private boolean isWordGramSubfield;

public static SearchFieldConfig detectSubFieldType(@Nonnull SearchableFieldSpec fieldSpec) {
final SearchableAnnotation searchableAnnotation = fieldSpec.getSearchableAnnotation();
Expand All @@ -106,6 +112,7 @@ public static SearchFieldConfig detectSubFieldType(String fieldName,
.analyzer(getAnalyzer(fieldName, fieldType))
.hasKeywordSubfield(hasKeywordSubfield(fieldName, fieldType))
.hasDelimitedSubfield(hasDelimitedSubfield(fieldName, fieldType))
.hasWordGramSubfields(hasWordGramSubfields(fieldName, fieldType))
.isQueryByDefault(isQueryByDefault)
.build();
}
Expand All @@ -118,6 +125,11 @@ private static boolean hasDelimitedSubfield(String fieldName, SearchableAnnotati
return !fieldName.contains(".")
&& ("urn".equals(fieldName) || TYPES_WITH_DELIMITED_SUBFIELD.contains(fieldType));
}

/**
 * Returns whether the given field should be indexed with word-gram subfields
 * (wordGrams2/3/4). Only top-level fields (no dotted sub-path) whose type is
 * listed in {@code TYPES_WITH_WORD_GRAM} qualify.
 */
private static boolean hasWordGramSubfields(String fieldName, SearchableAnnotation.FieldType fieldType) {
  // Subfields (names containing '.') never get their own word-gram subfields.
  if (fieldName.contains(".")) {
    return false;
  }
  return TYPES_WITH_WORD_GRAM.contains(fieldType);
}
private static boolean hasKeywordSubfield(String fieldName, SearchableAnnotation.FieldType fieldType) {
return !"urn".equals(fieldName)
&& !fieldName.contains(".")
Expand Down Expand Up @@ -155,6 +167,7 @@ public SearchFieldConfigBuilder fieldName(@Nonnull String fieldName) {
this.fieldName = fieldName;
isDelimitedSubfield(fieldName.endsWith(".delimited"));
isKeywordSubfield(fieldName.endsWith(".keyword"));
isWordGramSubfield(fieldName.contains("wordGrams"));
shortName(fieldName.split("[.]")[0]);
return this;
}
Expand Down
Loading

0 comments on commit 349b88c

Please sign in to comment.