[Backport 2.x] Add a new configuration setting synonym_analyzer for synonym_graph and synonym #16625

Merged 1 commit on Nov 13, 2024
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -12,6 +12,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Add dynamic setting allowing size > 0 requests to be cached in the request cache ([#16483](https://github.com/opensearch-project/OpenSearch/pull/16483))
- Make IndexStoreListener a pluggable interface ([#16583](https://github.com/opensearch-project/OpenSearch/pull/16583))
- Add vertical scaling and SoftReference for snapshot repository data cache ([#16489](https://github.com/opensearch-project/OpenSearch/pull/16489))
- Add new configuration setting `synonym_analyzer` to the `synonym` and `synonym_graph` filters, enabling the specification of a custom analyzer for reading the synonym file ([#16488](https://github.com/opensearch-project/OpenSearch/pull/16488))

### Dependencies
- Bump `com.google.apis:google-api-services-compute` from v1-rev20240407-2.0.0 to v1-rev20241021-2.0.0 ([#16502](https://github.com/opensearch-project/OpenSearch/pull/16502), [#16548](https://github.com/opensearch-project/OpenSearch/pull/16548))
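For reviewers who want to try the feature, here is a minimal sketch of how the new setting is wired into index settings (the names `my_synonyms` and `my_analyzer` are illustrative; the shape mirrors the `testSynonymAnalyzerWithWordDelimiter` test added below):

import org.opensearch.common.settings.Settings;

public class SynonymAnalyzerSettingExample {
    public static Settings exampleSettings() {
        // A synonym_graph filter that parses its synonym rules with the
        // pre-built "standard" analyzer instead of the surrounding
        // whitespace/lowercase chain it is attached to.
        return Settings.builder()
            .put("index.analysis.filter.my_synonyms.type", "synonym_graph")
            .putList("index.analysis.filter.my_synonyms.synonyms", "laptop => notebook")
            .put("index.analysis.filter.my_synonyms.synonym_analyzer", "standard")
            .put("index.analysis.analyzer.my_analyzer.type", "custom")
            .put("index.analysis.analyzer.my_analyzer.tokenizer", "whitespace")
            .putList("index.analysis.analyzer.my_analyzer.filter", "lowercase", "my_synonyms")
            .build();
    }
}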
modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisPlugin.java
@@ -146,6 +146,7 @@
import org.opensearch.index.analysis.PreConfiguredTokenizer;
import org.opensearch.index.analysis.TokenFilterFactory;
import org.opensearch.index.analysis.TokenizerFactory;
import org.opensearch.indices.analysis.AnalysisModule;
import org.opensearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.opensearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
import org.opensearch.plugins.AnalysisPlugin;
@@ -247,7 +248,7 @@ public Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAn
}

@Override
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters(AnalysisModule analysisModule) {
Map<String, AnalysisProvider<TokenFilterFactory>> filters = new TreeMap<>();
filters.put("apostrophe", ApostropheFilterFactory::new);
filters.put("arabic_normalization", ArabicNormalizationFilterFactory::new);
@@ -325,14 +326,36 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
filters.put("sorani_normalization", SoraniNormalizationFilterFactory::new);
filters.put("stemmer_override", requiresAnalysisSettings(StemmerOverrideTokenFilterFactory::new));
filters.put("stemmer", StemmerTokenFilterFactory::new);
filters.put("synonym", requiresAnalysisSettings(SynonymTokenFilterFactory::new));
filters.put("synonym_graph", requiresAnalysisSettings(SynonymGraphTokenFilterFactory::new));
filters.put("trim", TrimTokenFilterFactory::new);
filters.put("truncate", requiresAnalysisSettings(TruncateTokenFilterFactory::new));
filters.put("unique", UniqueTokenFilterFactory::new);
filters.put("uppercase", UpperCaseTokenFilterFactory::new);
filters.put("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new);
filters.put("word_delimiter", WordDelimiterTokenFilterFactory::new);
filters.put(
"synonym",
requiresAnalysisSettings(
(indexSettings, environment, name, settings) -> new SynonymTokenFilterFactory(
indexSettings,
environment,
name,
settings,
analysisModule.getAnalysisRegistry()
)
)
);
filters.put(
"synonym_graph",
requiresAnalysisSettings(
(indexSettings, environment, name, settings) -> new SynonymGraphTokenFilterFactory(
indexSettings,
environment,
name,
settings,
analysisModule.getAnalysisRegistry()
)
)
);
return filters;
}

modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymGraphTokenFilterFactory.java
@@ -40,6 +40,7 @@
import org.opensearch.env.Environment;
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AnalysisMode;
import org.opensearch.index.analysis.AnalysisRegistry;
import org.opensearch.index.analysis.CharFilterFactory;
import org.opensearch.index.analysis.TokenFilterFactory;
import org.opensearch.index.analysis.TokenizerFactory;
@@ -49,8 +50,14 @@

public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory {

SynonymGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, env, name, settings);
SynonymGraphTokenFilterFactory(
IndexSettings indexSettings,
Environment env,
String name,
Settings settings,
AnalysisRegistry analysisRegistry
) {
super(indexSettings, env, name, settings, analysisRegistry);
}

@Override
modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymTokenFilterFactory.java
@@ -44,11 +44,13 @@
import org.opensearch.index.analysis.AbstractTokenFilterFactory;
import org.opensearch.index.analysis.Analysis;
import org.opensearch.index.analysis.AnalysisMode;
import org.opensearch.index.analysis.AnalysisRegistry;
import org.opensearch.index.analysis.CharFilterFactory;
import org.opensearch.index.analysis.CustomAnalyzer;
import org.opensearch.index.analysis.TokenFilterFactory;
import org.opensearch.index.analysis.TokenizerFactory;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.List;
@@ -64,8 +66,16 @@
protected final Settings settings;
protected final Environment environment;
protected final AnalysisMode analysisMode;

SynonymTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
private final String synonymAnalyzerName;
private final AnalysisRegistry analysisRegistry;

SynonymTokenFilterFactory(
IndexSettings indexSettings,
Environment env,
String name,
Settings settings,
AnalysisRegistry analysisRegistry
) {
super(indexSettings, name, settings);
this.settings = settings;

@@ -83,6 +93,8 @@
boolean updateable = settings.getAsBoolean("updateable", false);
this.analysisMode = updateable ? AnalysisMode.SEARCH_TIME : AnalysisMode.ALL;
this.environment = env;
this.synonymAnalyzerName = settings.get("synonym_analyzer", null);
this.analysisRegistry = analysisRegistry;
}

@Override
@@ -137,6 +149,17 @@
List<TokenFilterFactory> tokenFilters,
Function<String, TokenFilterFactory> allFilters
) {
if (synonymAnalyzerName != null) {
Analyzer customSynonymAnalyzer;
try {
customSynonymAnalyzer = analysisRegistry.getAnalyzer(synonymAnalyzerName);
} catch (IOException e) {
throw new RuntimeException(e);
}
if (customSynonymAnalyzer != null) {
return customSynonymAnalyzer;
}
}
return new CustomAnalyzer(
tokenizer,
charFilters.toArray(new CharFilterFactory[0]),
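A note on the lookup semantics introduced above, distilled as a sketch (not the merged code; `resolve` is an invented name): a configured `synonym_analyzer` is looked up in the `AnalysisRegistry` first, an `IOException` during that lookup is rethrown as a `RuntimeException`, and an unknown analyzer name (a `null` result) silently falls back to the pre-existing `CustomAnalyzer` path.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.opensearch.index.analysis.AnalysisRegistry;

class SynonymAnalyzerResolution {
    // Sketch of buildSynonymAnalyzer's new early-return branch.
    static Analyzer resolve(AnalysisRegistry registry, String synonymAnalyzerName) {
        if (synonymAnalyzerName == null) {
            return null; // setting absent: caller builds the CustomAnalyzer as before
        }
        try {
            // May return null for an unknown name, which also falls back.
            return registry.getAnalyzer(synonymAnalyzerName);
        } catch (IOException e) {
            throw new RuntimeException(e); // lookup failure is fatal
        }
    }
}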
modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java
@@ -39,12 +39,16 @@
import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
import org.apache.lucene.analysis.te.TeluguNormalizationFilterFactory;
import org.apache.lucene.analysis.te.TeluguStemFilterFactory;
import org.opensearch.index.analysis.TokenFilterFactory;
import org.opensearch.indices.analysis.AnalysisFactoryTestCase;
import org.opensearch.indices.analysis.AnalysisModule;

import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import org.mockito.Mock;

import static java.util.Collections.emptyList;
import static java.util.stream.Collectors.toList;

@@ -53,6 +57,9 @@ public CommonAnalysisFactoryTests() {
super(new CommonAnalysisPlugin());
}

@Mock
private AnalysisModule analysisModule;

@Override
protected Map<String, Class<?>> getTokenizers() {
Map<String, Class<?>> tokenizers = new TreeMap<>(super.getTokenizers());
@@ -302,4 +309,19 @@ private void markedTestCase(String name, Map<String, Class<?>> map) {
unmarked
);
}

/**
* Tests the getTokenFilters(AnalysisModule) method to verify:
* 1. All token filters are properly loaded
* 2. Basic filters remain available
* 3. Synonym filters remain available when AnalysisModule is provided
*/
public void testGetTokenFiltersWithAnalysisModule() {
CommonAnalysisPlugin plugin = (CommonAnalysisPlugin) getAnalysisPlugin();
Map<String, AnalysisModule.AnalysisProvider<TokenFilterFactory>> filters = plugin.getTokenFilters(analysisModule);
assertNotNull("Token filters should not be null", filters);
assertTrue("Should contain basic filters", filters.containsKey("lowercase"));
assertTrue("Should contain synonym filter", filters.containsKey("synonym"));
assertTrue("Should contain synonym_graph filter", filters.containsKey("synonym_graph"));
}
}
modules/analysis-common/src/test/java/org/opensearch/analysis/common/SynonymsAnalysisTests.java
@@ -42,11 +42,14 @@
import org.opensearch.cluster.metadata.IndexMetadata;
import org.opensearch.common.settings.Settings;
import org.opensearch.env.Environment;
import org.opensearch.env.TestEnvironment;
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AnalysisRegistry;
import org.opensearch.index.analysis.IndexAnalyzers;
import org.opensearch.index.analysis.PreConfiguredTokenFilter;
import org.opensearch.index.analysis.TokenFilterFactory;
import org.opensearch.index.analysis.TokenizerFactory;
import org.opensearch.indices.analysis.AnalysisModule;
import org.opensearch.test.IndexSettingsModule;
import org.opensearch.test.OpenSearchTestCase;
import org.opensearch.test.VersionUtils;
@@ -64,6 +67,7 @@
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.instanceOf;
import static org.hamcrest.Matchers.startsWith;
import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

public class SynonymsAnalysisTests extends OpenSearchTestCase {
private IndexAnalyzers indexAnalyzers;
@@ -259,14 +263,17 @@ public void testTokenFiltersBypassSynonymAnalysis() throws IOException {
.put("hyphenation_patterns_path", "foo")
.build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
Environment environment = TestEnvironment.newEnvironment(settings);
AnalysisModule analysisModule = new AnalysisModule(environment, Collections.singletonList(new CommonAnalysisPlugin()));
AnalysisRegistry analysisRegistry = analysisModule.getAnalysisRegistry();

String[] bypassingFactories = new String[] { "dictionary_decompounder" };

CommonAnalysisPlugin plugin = new CommonAnalysisPlugin();
for (String factory : bypassingFactories) {
TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, null, factory, settings);
TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings);
SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings);
TokenFilterFactory tff = plugin.getTokenFilters(analysisModule).get(factory).get(idxSettings, environment, factory, settings);
TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, environment, "keyword", settings);
SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, environment, "synonym", settings, analysisRegistry);
Analyzer analyzer = stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null);

try (TokenStream ts = analyzer.tokenStream("field", "text")) {
@@ -329,7 +336,10 @@ public void testDisallowedTokenFilters() throws IOException {
.putList("common_words", "a", "b")
.put("output_unigrams", "true")
.build();
Environment environment = TestEnvironment.newEnvironment(settings);
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
AnalysisModule analysisModule = new AnalysisModule(environment, Collections.singletonList(new CommonAnalysisPlugin()));
AnalysisRegistry analysisRegistry = analysisModule.getAnalysisRegistry();
CommonAnalysisPlugin plugin = new CommonAnalysisPlugin();

String[] disallowedFactories = new String[] {
@@ -343,9 +353,9 @@
"fingerprint" };

for (String factory : disallowedFactories) {
TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, null, factory, settings);
TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings);
SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings);
TokenFilterFactory tff = plugin.getTokenFilters(analysisModule).get(factory).get(idxSettings, environment, factory, settings);
TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, environment, "keyword", settings);
SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, environment, "synonym", settings, analysisRegistry);

IllegalArgumentException e = expectThrows(
IllegalArgumentException.class,
@@ -372,4 +382,76 @@ private void match(String analyzerName, String source, String target) throws IOE
MatcherAssert.assertThat(target, equalTo(sb.toString().trim()));
}

/**
* Tests the integration of word delimiter and synonym graph filters with synonym_analyzer based on issue #16263.
* This test verifies the correct handling of:
* 1. Hyphenated words with word delimiter (e.g., "note-book" → ["notebook", "note", "book"])
* 2. Multi-word synonyms (e.g., "mobile phone" → ["smartphone"])
* 3. Single word synonyms (e.g., "laptop" → ["notebook"])
*
* @see <a href="https://github.com/opensearch-project/OpenSearch/issues/16263">Issue #16263</a>
*/
public void testSynonymAnalyzerWithWordDelimiter() throws IOException {
Settings settings = Settings.builder()
.put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
.put("path.home", createTempDir().toString())
.put("index.analysis.filter.custom_word_delimiter.type", "word_delimiter_graph")
.put("index.analysis.filter.custom_word_delimiter.generate_word_parts", true)
.put("index.analysis.filter.custom_word_delimiter.catenate_all", true)
.put("index.analysis.filter.custom_word_delimiter.split_on_numerics", false)
.put("index.analysis.filter.custom_word_delimiter.split_on_case_change", false)
.put("index.analysis.filter.custom_pattern_replace_filter.type", "pattern_replace")
.put("index.analysis.filter.custom_pattern_replace_filter.pattern", "(-)")
.put("index.analysis.filter.custom_pattern_replace_filter.replacement", " ")
.put("index.analysis.filter.custom_pattern_replace_filter.all", true)
.put("index.analysis.filter.custom_synonym_graph_filter.type", "synonym_graph")
.putList(
"index.analysis.filter.custom_synonym_graph_filter.synonyms",
"laptop => notebook",
"smartphone, mobile phone, cell phone => smartphone",
"tv, television => television"
)
.put("index.analysis.filter.custom_synonym_graph_filter.synonym_analyzer", "standard")
.put("index.analysis.analyzer.text_en_index.type", "custom")
.put("index.analysis.analyzer.text_en_index.tokenizer", "whitespace")
.putList(
"index.analysis.analyzer.text_en_index.filter",
"lowercase",
"custom_word_delimiter",
"custom_synonym_graph_filter",
"custom_pattern_replace_filter",
"flatten_graph"
)
.build();
Environment environment = TestEnvironment.newEnvironment(settings);
IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("test", settings);
AnalysisModule module = new AnalysisModule(environment, Collections.singletonList(new CommonAnalysisPlugin()));
IndexAnalyzers analyzers = module.getAnalysisRegistry().build(indexSettings);
try (TokenStream ts = analyzers.get("text_en_index").tokenStream("", "note-book")) {
assertTokenStreamContents(
ts,
new String[] { "notebook", "note", "book" },
new int[] { 0, 0, 5 },
new int[] { 9, 4, 9 },
new String[] { "word", "word", "word" },
new int[] { 1, 0, 1 },
new int[] { 2, 1, 1 }
);
}
try (TokenStream ts = analyzers.get("text_en_index").tokenStream("", "mobile phone")) {
assertTokenStreamContents(
ts,
new String[] { "smartphone" },
new int[] { 0 },
new int[] { 12 },
new String[] { "SYNONYM" },
new int[] { 1 },
new int[] { 1 }
);
}
try (TokenStream ts = analyzers.get("text_en_index").tokenStream("", "laptop")) {
assertTokenStreamContents(ts, new String[] { "notebook" }, new int[] { 0 }, new int[] { 6 });
}
}

}
server/src/main/java/org/opensearch/indices/analysis/AnalysisModule.java
@@ -181,7 +181,12 @@ public boolean requiresAnalysisSettings() {
)
);

tokenFilters.extractAndRegister(plugins, AnalysisPlugin::getTokenFilters);
for (AnalysisPlugin plugin : plugins) {
Map<String, AnalysisProvider<TokenFilterFactory>> filters = plugin.getTokenFilters(this);
for (Map.Entry<String, AnalysisProvider<TokenFilterFactory>> entry : filters.entrySet()) {
tokenFilters.register(entry.getKey(), entry.getValue());
}
}
return tokenFilters;
}

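Why the one-line `extractAndRegister` call is replaced (a sketch of the reasoning; `extractAndRegister` is the pre-existing `NamedRegistry` helper): the helper maps each plugin through a plain method reference and has no way to forward `this`, so the registration is inlined. Assuming `Map#forEach`, a functionally equivalent compact form of the new loop would be:

// Equivalent compact form (sketch): each (name, provider) pair is fed into
// the registry, with the AnalysisModule passed to the new overload.
for (AnalysisPlugin plugin : plugins) {
    plugin.getTokenFilters(this).forEach(tokenFilters::register);
}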
server/src/main/java/org/opensearch/plugins/AnalysisPlugin.java
@@ -47,6 +47,7 @@
import org.opensearch.index.analysis.PreConfiguredTokenizer;
import org.opensearch.index.analysis.TokenFilterFactory;
import org.opensearch.index.analysis.TokenizerFactory;
import org.opensearch.indices.analysis.AnalysisModule;
import org.opensearch.indices.analysis.AnalysisModule.AnalysisProvider;

import java.io.IOException;
@@ -92,6 +93,14 @@ default Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
return emptyMap();
}

/**
* Override to add additional {@link TokenFilter}s that need access to the AnalysisModule.
* The default implementation for plugins that don't need AnalysisModule calls the existing getTokenFilters() method.
*/
default Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters(AnalysisModule analysisModule) {
return getTokenFilters();
}

/**
* Override to add additional {@link Tokenizer}s. See {@link #requiresAnalysisSettings(AnalysisProvider)}
* for how to get the configuration from the index.
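To show how the new hook is meant to be consumed, a hypothetical third-party plugin (the class `MyAnalysisPlugin`, the filter name `my_registry_aware_filter`, and `MyTokenFilterFactory` are all invented for illustration; the pattern mirrors the `CommonAnalysisPlugin` change above):

import java.util.Map;
import java.util.TreeMap;

import org.opensearch.index.analysis.TokenFilterFactory;
import org.opensearch.indices.analysis.AnalysisModule;
import org.opensearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.opensearch.plugins.AnalysisPlugin;
import org.opensearch.plugins.Plugin;

public class MyAnalysisPlugin extends Plugin implements AnalysisPlugin {
    @Override
    public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters(AnalysisModule analysisModule) {
        Map<String, AnalysisProvider<TokenFilterFactory>> filters = new TreeMap<>();
        // MyTokenFilterFactory is hypothetical; it would accept the
        // AnalysisRegistry the same way SynonymTokenFilterFactory now does.
        filters.put(
            "my_registry_aware_filter",
            AnalysisPlugin.requiresAnalysisSettings(
                (indexSettings, env, name, settings) -> new MyTokenFilterFactory(
                    indexSettings, env, name, settings, analysisModule.getAnalysisRegistry()
                )
            )
        );
        return filters;
    }
}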