From 05bd65ab262bbee90b63bb23b6f021b7a8c9d820 Mon Sep 17 00:00:00 2001 From: Mike Alhayek Date: Thu, 14 Nov 2024 12:19:21 -0800 Subject: [PATCH] Correctly create custom analyzer in Elasticsearch (#17013) --- .../ElasticsearchOptionsExtensions.cs | 6 +- .../Startup.cs | 8 +- .../ElasticsearchOptions.cs | 2 +- .../Services/ElasticIndexManager.cs | 548 ++++-------------- .../reference/modules/Elasticsearch/README.md | 28 + src/docs/releases/2.1.0.md | 30 + 6 files changed, 184 insertions(+), 438 deletions(-) diff --git a/src/OrchardCore.Modules/OrchardCore.Search.Elasticsearch/Extensions/ElasticsearchOptionsExtensions.cs b/src/OrchardCore.Modules/OrchardCore.Search.Elasticsearch/Extensions/ElasticsearchOptionsExtensions.cs index 577b81ea6ad..890e7903ce6 100644 --- a/src/OrchardCore.Modules/OrchardCore.Search.Elasticsearch/Extensions/ElasticsearchOptionsExtensions.cs +++ b/src/OrchardCore.Modules/OrchardCore.Search.Elasticsearch/Extensions/ElasticsearchOptionsExtensions.cs @@ -45,9 +45,9 @@ internal static ElasticsearchOptions AddAnalyzers(this ElasticsearchOptions opti return options; } - internal static ElasticsearchOptions AddFilter(this ElasticsearchOptions options, IConfigurationSection configuration) + internal static ElasticsearchOptions AddTokenFilters(this ElasticsearchOptions options, IConfigurationSection configuration) { - var jsonNode = configuration.GetSection(nameof(options.Filter)).AsJsonNode(); + var jsonNode = configuration.GetSection(nameof(options.TokenFilters)).AsJsonNode(); var jsonElement = JsonSerializer.Deserialize(jsonNode); var filterObject = JsonObject.Create(jsonElement, new JsonNodeOptions() @@ -66,7 +66,7 @@ internal static ElasticsearchOptions AddFilter(this ElasticsearchOptions options continue; } - options.Filter.Add(filter.Key, jFilter); + options.TokenFilters.Add(filter.Key, jFilter); } } } diff --git a/src/OrchardCore.Modules/OrchardCore.Search.Elasticsearch/Startup.cs b/src/OrchardCore.Modules/OrchardCore.Search.Elasticsearch/Startup.cs index 5fc514ea532..339a1b62264 100644 --- a/src/OrchardCore.Modules/OrchardCore.Search.Elasticsearch/Startup.cs +++ b/src/OrchardCore.Modules/OrchardCore.Search.Elasticsearch/Startup.cs @@ -47,13 +47,13 @@ public override void ConfigureServices(IServiceCollection services) return new ElasticClient(options.GetConnectionSettings() ?? new ConnectionSettings()); }); - services.Configure(o => + services.Configure(options => { var configuration = _shellConfiguration.GetSection(ElasticConnectionOptionsConfigurations.ConfigSectionName); - o.AddIndexPrefix(configuration); - o.AddFilter(configuration); - o.AddAnalyzers(configuration); + options.AddIndexPrefix(configuration); + options.AddTokenFilters(configuration); + options.AddAnalyzers(configuration); }); services.AddElasticServices(); diff --git a/src/OrchardCore/OrchardCore.Search.Elasticsearch.Abstractions/ElasticsearchOptions.cs b/src/OrchardCore/OrchardCore.Search.Elasticsearch.Abstractions/ElasticsearchOptions.cs index 53a6466ff40..276cebd2154 100644 --- a/src/OrchardCore/OrchardCore.Search.Elasticsearch.Abstractions/ElasticsearchOptions.cs +++ b/src/OrchardCore/OrchardCore.Search.Elasticsearch.Abstractions/ElasticsearchOptions.cs @@ -8,5 +8,5 @@ public class ElasticsearchOptions public Dictionary Analyzers { get; } = []; - public Dictionary Filter { get; } = []; + public Dictionary TokenFilters { get; } = []; } diff --git a/src/OrchardCore/OrchardCore.Search.Elasticsearch.Core/Services/ElasticIndexManager.cs b/src/OrchardCore/OrchardCore.Search.Elasticsearch.Core/Services/ElasticIndexManager.cs index 75c2a9877be..1ef92433a38 100644 --- a/src/OrchardCore/OrchardCore.Search.Elasticsearch.Core/Services/ElasticIndexManager.cs +++ b/src/OrchardCore/OrchardCore.Search.Elasticsearch.Core/Services/ElasticIndexManager.cs @@ -3,6 +3,7 @@ using System.Text.Json; using System.Text.Json.Nodes; using Elasticsearch.Net; +using Json.Path; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using Nest; @@ -30,7 +31,8 @@ public sealed class ElasticIndexManager private readonly ElasticsearchOptions _elasticSearchOptions; private readonly ConcurrentDictionary _timestamps = new(StringComparer.OrdinalIgnoreCase); private readonly string _lastTaskId = "last_task_id"; - private readonly Dictionary> _analyzerGetter = new(StringComparer.OrdinalIgnoreCase) + + private static readonly Dictionary> _analyzerGetter = new(StringComparer.OrdinalIgnoreCase) { { ElasticsearchConstants.DefaultAnalyzer, () => new StandardAnalyzer() }, { ElasticsearchConstants.SimpleAnalyzer, () => new SimpleAnalyzer() }, @@ -43,342 +45,56 @@ public sealed class ElasticIndexManager { ElasticsearchConstants.StopAnalyzer, () => new StopAnalyzer() }, }; - private readonly List _tokenFilterNames = new List() - { - "asciifolding", - "common_grams", - "condition", - "delimited_payload", - "dictionary_decompounder", - "edge_ngram", - "elision", - "fingerprint", - "hunspell", - "hyphenation_decompounder", - "icu_collation", - "icu_folding", - "icu_normalizer", - "icu_transform", - "keep_types", - "keep", - "keyword_marker", - "kstem", - "kuromoji_part_of_speech", - "kuromoji_readingform", - "kuromoji_stemmer", - "length", - "limit", - "lowercase", - "multiplexer", - "ngram", - "nori_part_of_speech", - "pattern_capture", - "pattern_replace", - "phonetic", - "porter_stem", - "predicate_token_filter", - "remove_duplicates", - "reverse", - "shingle", - "snowball", - "stemmer_override", - "stemmer", - "stop", - "synonym_graph", - "synonym", - "trim", - "truncate", - "unique", - "uppercase", - "word_delimiter_graph", - "word_delimiter" - }; - private sealed record TokenFilterBuildingInfo(ITokenFilter TokenFilter, Func AddTokenFilter); - private readonly Dictionary _tokenFilterBuildingInfoGetter = new(StringComparer.OrdinalIgnoreCase) + private static readonly Dictionary> _tokenFilterGetter = new(StringComparer.OrdinalIgnoreCase) { - { - "asciifolding", - new TokenFilterBuildingInfo( new AsciiFoldingTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.AsciiFolding(name, f => (AsciiFoldingTokenFilter)tokenFilter) ) - }, - { - "common_grams", - new TokenFilterBuildingInfo( new CommonGramsTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.CommonGrams(name, f => (CommonGramsTokenFilter)tokenFilter) ) - }, - { - "condition", - new TokenFilterBuildingInfo( new ConditionTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.Condition(name, f => (ConditionTokenFilter)tokenFilter) ) - }, - { - "delimited_payload", - new TokenFilterBuildingInfo( new DelimitedPayloadTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.DelimitedPayload(name, f => (DelimitedPayloadTokenFilter)tokenFilter) ) - }, - { - "dictionary_decompounder", - new TokenFilterBuildingInfo( new DictionaryDecompounderTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.DictionaryDecompounder(name, f => (DictionaryDecompounderTokenFilter)tokenFilter) ) - }, - { - "edge_ngram", - new TokenFilterBuildingInfo( new EdgeNGramTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.EdgeNGram(name, f => (EdgeNGramTokenFilter)tokenFilter) ) - }, - { - "elision", - new TokenFilterBuildingInfo( new ElisionTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.Elision(name, f => (ElisionTokenFilter)tokenFilter) ) - }, - { - "fingerprint", - new TokenFilterBuildingInfo( new FingerprintTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.Fingerprint(name, f => (FingerprintTokenFilter)tokenFilter) ) - }, - { - "hunspell", - new TokenFilterBuildingInfo( new HunspellTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.Hunspell(name, f => (HunspellTokenFilter)tokenFilter) ) - }, - { - "hyphenation_decompounder", - new TokenFilterBuildingInfo( new HyphenationDecompounderTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.HyphenationDecompounder(name, f => (HyphenationDecompounderTokenFilter)tokenFilter) ) - }, - { - "icu_collation", - new TokenFilterBuildingInfo( new IcuCollationTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.IcuCollation(name, f => (IcuCollationTokenFilter)tokenFilter) ) - }, - { - "icu_folding", - new TokenFilterBuildingInfo( new IcuFoldingTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.IcuFolding(name, f => (IcuFoldingTokenFilter)tokenFilter) ) - }, - { - "icu_normalizer", - new TokenFilterBuildingInfo( new IcuNormalizationTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.IcuNormalization(name, f => (IcuNormalizationTokenFilter)tokenFilter) ) - }, - { - "icu_transform", - new TokenFilterBuildingInfo( new IcuTransformTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.IcuTransform(name, f => (IcuTransformTokenFilter)tokenFilter) ) - }, - { - "keep_types", - new TokenFilterBuildingInfo( new KeepTypesTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.KeepTypes(name, f => (KeepTypesTokenFilter)tokenFilter) ) - }, - { - "keep", - new TokenFilterBuildingInfo( new KeepWordsTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.KeepWords(name, f => (KeepWordsTokenFilter)tokenFilter) ) - }, - { - "keyword_marker", - new TokenFilterBuildingInfo( new KeywordMarkerTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.KeywordMarker(name, f => (KeywordMarkerTokenFilter)tokenFilter) ) - }, - { - "kstem", - new TokenFilterBuildingInfo( new KStemTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.KStem(name, f => (KStemTokenFilter)tokenFilter) ) - }, - { - "kuromoji_part_of_speech", - new TokenFilterBuildingInfo( new KuromojiPartOfSpeechTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.KuromojiPartOfSpeech(name, f => (KuromojiPartOfSpeechTokenFilter)tokenFilter) ) - }, - { - "kuromoji_readingform", - new TokenFilterBuildingInfo( new KuromojiReadingFormTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.KuromojiReadingForm(name, f => (KuromojiReadingFormTokenFilter)tokenFilter) ) - }, - { - "kuromoji_stemmer", - new TokenFilterBuildingInfo( new KuromojiStemmerTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.KuromojiStemmer(name, f => (KuromojiStemmerTokenFilter)tokenFilter) ) - }, - { - "length", - new TokenFilterBuildingInfo( new LengthTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.Length(name, f => (LengthTokenFilter)tokenFilter) ) - }, - { - "limit", - new TokenFilterBuildingInfo( new LimitTokenCountTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.LimitTokenCount(name, f => (LimitTokenCountTokenFilter)tokenFilter) ) - }, - { - "lowercase", - new TokenFilterBuildingInfo( new LowercaseTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.Lowercase(name, f => (LowercaseTokenFilter)tokenFilter) ) - }, - { - "multiplexer", - new TokenFilterBuildingInfo( new MultiplexerTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.Multiplexer(name, f => (MultiplexerTokenFilter)tokenFilter) ) - }, - { - "ngram", - new TokenFilterBuildingInfo( new NGramTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.NGram(name, f => (NGramTokenFilter)tokenFilter) ) - }, - { - "nori_part_of_speech", - new TokenFilterBuildingInfo( new NoriPartOfSpeechTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.NoriPartOfSpeech(name, f => (NoriPartOfSpeechTokenFilter)tokenFilter) ) - }, - { - "pattern_capture", - new TokenFilterBuildingInfo( new PatternCaptureTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.PatternCapture(name, f => (PatternCaptureTokenFilter)tokenFilter) ) - }, - { - "pattern_replace", - new TokenFilterBuildingInfo( new PatternReplaceTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.PatternReplace(name, f => (PatternReplaceTokenFilter)tokenFilter) ) - }, - { - "phonetic", - new TokenFilterBuildingInfo( new PhoneticTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.Phonetic(name, f => (PhoneticTokenFilter)tokenFilter) ) - }, - { - "porter_stem", - new TokenFilterBuildingInfo( new PorterStemTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.PorterStem(name, f => (PorterStemTokenFilter)tokenFilter) ) - }, - { - "predicate_token_filter", - new TokenFilterBuildingInfo( new PredicateTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.Predicate(name, f => (PredicateTokenFilter)tokenFilter) ) - }, - { - "remove_duplicates", - new TokenFilterBuildingInfo( new RemoveDuplicatesTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.RemoveDuplicates(name, f => (RemoveDuplicatesTokenFilter)tokenFilter) ) - }, - { - "reverse", - new TokenFilterBuildingInfo( new ReverseTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.Reverse(name, f => (ReverseTokenFilter)tokenFilter) ) - }, - { - "shingle", - new TokenFilterBuildingInfo( new ShingleTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.Shingle(name, f => (ShingleTokenFilter)tokenFilter) ) - }, - { - "snowball", - new TokenFilterBuildingInfo( new SnowballTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.Snowball(name, f => (SnowballTokenFilter)tokenFilter) ) - }, - { - "stemmer_override", - new TokenFilterBuildingInfo( new StemmerOverrideTokenFilterDescriptor(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.StemmerOverride(name, f => (StemmerOverrideTokenFilterDescriptor)tokenFilter) ) - }, - { - "stemmer", - new TokenFilterBuildingInfo( new StemmerTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.Stemmer(name, f => (StemmerTokenFilter)tokenFilter) ) - }, - { - "stop", - new TokenFilterBuildingInfo( new StopTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.Stop(name, f => (StopTokenFilter)tokenFilter) ) - }, - { - "synonym_graph", - new TokenFilterBuildingInfo( new SynonymGraphTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.SynonymGraph(name, f => (SynonymGraphTokenFilter)tokenFilter) ) - }, - { - "synonym", - new TokenFilterBuildingInfo( new SynonymTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.Synonym(name, f => (SynonymTokenFilter)tokenFilter) ) - }, - { - "trim", - new TokenFilterBuildingInfo( new TrimTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.Trim(name, f => (TrimTokenFilter)tokenFilter) ) - }, - { - "truncate", - new TokenFilterBuildingInfo( new TruncateTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.Truncate(name, f => (TruncateTokenFilter)tokenFilter) ) - }, - { - "unique", - new TokenFilterBuildingInfo( new UniqueTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.Unique(name, f => (UniqueTokenFilter)tokenFilter) ) - }, - { - "uppercase", - new TokenFilterBuildingInfo( new UppercaseTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.Uppercase(name, f => (UppercaseTokenFilter)tokenFilter) ) - }, - { - "word_delimiter_graph", - new TokenFilterBuildingInfo( new WordDelimiterGraphTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.WordDelimiterGraph(name, f => (WordDelimiterGraphTokenFilter)tokenFilter) ) - }, - { - "word_delimiter", - new TokenFilterBuildingInfo( new WordDelimiterTokenFilter(), - (TokenFiltersDescriptor d, ITokenFilter tokenFilter, string name) => - d.WordDelimiter(name, f => (WordDelimiterTokenFilter)tokenFilter) ) - } + { "asciifolding", () => new AsciiFoldingTokenFilter() }, + { "common_grams", () => new CommonGramsTokenFilter() }, + { "condition", () => new ConditionTokenFilter() }, + { "delimited_payload", () => new DelimitedPayloadTokenFilter() }, + { "dictionary_decompounder", () => new DictionaryDecompounderTokenFilter() }, + { "edge_ngram", () => new EdgeNGramTokenFilter() }, + { "elision", () => new ElisionTokenFilter() }, + { "fingerprint", () => new FingerprintTokenFilter() }, + { "hunspell", () => new HunspellTokenFilter() }, + { "hyphenation_decompounder", () => new HyphenationDecompounderTokenFilter() }, + { "icu_collation", () => new IcuCollationTokenFilter() }, + { "icu_folding", () => new IcuFoldingTokenFilter() }, + { "icu_normalizer", () => new IcuNormalizationTokenFilter() }, + { "icu_transform", () => new IcuTransformTokenFilter() }, + { "keep_types", () => new KeepTypesTokenFilter() }, + { "keep", () => new KeepWordsTokenFilter() }, + { "keyword_marker", () => new KeywordMarkerTokenFilter() }, + { "kstem", () => new KStemTokenFilter() }, + { "kuromoji_part_of_speech", () => new KuromojiPartOfSpeechTokenFilter() }, + { "kuromoji_readingform", () => new KuromojiReadingFormTokenFilter() }, + { "kuromoji_stemmer", () => new KuromojiStemmerTokenFilter() }, + { "length", () => new LengthTokenFilter() }, + { "limit", () => new LimitTokenCountTokenFilter() }, + { "lowercase", () => new LowercaseTokenFilter() }, + { "multiplexer", () => new MultiplexerTokenFilter() }, + { "ngram", () => new NGramTokenFilter() }, + { "nori_part_of_speech", () => new NoriPartOfSpeechTokenFilter() }, + { "pattern_capture", () => new PatternCaptureTokenFilter() }, + { "pattern_replace", () => new PatternReplaceTokenFilter() }, + { "phonetic", () => new PhoneticTokenFilter() }, + { "porter_stem", () => new PorterStemTokenFilter() }, + { "remove_duplicates", () => new RemoveDuplicatesTokenFilter() }, + { "reverse", () => new ReverseTokenFilter() }, + { "shingle", () => new ShingleTokenFilter() }, + { "snowball", () => new SnowballTokenFilter() }, + { "stemmer_override", () => new StemmerOverrideTokenFilterDescriptor() }, + { "stemmer", () => new StemmerTokenFilter() }, + { "stop", () => new StopTokenFilter() }, + { "synonym_graph", () => new SynonymGraphTokenFilter() }, + { "synonym", () => new SynonymTokenFilter() }, + { "trim", () => new TrimTokenFilter() }, + { "truncate", () => new TruncateTokenFilter() }, + { "unique", () => new UniqueTokenFilter() }, + { "uppercase", () => new UppercaseTokenFilter() }, + { "word_delimiter_graph", () => new WordDelimiterGraphTokenFilter() }, + { "word_delimiter", () => new WordDelimiterTokenFilter() } }; + private static readonly List _charsToRemove = [ '\\', @@ -431,7 +147,6 @@ public async Task CreateIndexAsync(ElasticIndexSettings elasticIndexSettin } var analysisDescriptor = new AnalysisDescriptor(); - var analyzersDescriptor = new AnalyzersDescriptor(); var indexSettingsDescriptor = new IndexSettingsDescriptor(); // The name "standardanalyzer" is a legacy used prior OC 1.6 release. It can be removed in future releases. @@ -439,35 +154,27 @@ public async Task CreateIndexAsync(ElasticIndexSettings elasticIndexSettin ? null : elasticIndexSettings.AnalyzerName) ?? ElasticsearchConstants.DefaultAnalyzer; - if (_elasticSearchOptions.Analyzers.TryGetValue(analyzerName, out var analyzerProperties)) + if (_elasticSearchOptions.Analyzers is not null && _elasticSearchOptions.Analyzers.TryGetValue(analyzerName, out var analyzerProperties)) { - if (_elasticSearchOptions.Filter is not null) - { - var tokenFiltersDescriptor = GetTokenFilterDescriptor(_elasticSearchOptions.Filter); - analysisDescriptor.TokenFilters(f => tokenFiltersDescriptor); - } + var analyzer = GetAnalyzer(analyzerProperties); - if (analyzerProperties.TryGetPropertyValue("filter", out var filterResult)) - { - var filterCollection = JsonSerializer.Deserialize>(filterResult); + analysisDescriptor.Analyzers(a => a.UserDefined(analyzerName, analyzer)); + } - var existingFilters = filterCollection.Where(f => _tokenFilterNames.Contains(f)); - analysisDescriptor.Analyzers(a => a.Custom("default", c => c.Tokenizer("standard").Filters(existingFilters))); - } - else - { - var analyzer = CreateAnalyzer(analyzerProperties); - analysisDescriptor.Analyzers(a => a.UserDefined(analyzerName, analyzer)); - } + if (_elasticSearchOptions.TokenFilters is not null && _elasticSearchOptions.TokenFilters.Count > 0) + { + var tokenFiltersDescriptor = GetTokenFiltersDescriptor(_elasticSearchOptions.TokenFilters); - indexSettingsDescriptor = new IndexSettingsDescriptor(); - indexSettingsDescriptor.Analysis(an => analysisDescriptor); + analysisDescriptor.TokenFilters(d => tokenFiltersDescriptor); } + indexSettingsDescriptor.Analysis(a => analysisDescriptor); + // Custom metadata to store the last indexing task id. - var IndexingState = new FluentDictionary() { - { _lastTaskId, 0 } - }; + var IndexingState = new FluentDictionary() + { + { _lastTaskId, 0 }, + }; var fullIndexName = GetFullIndexName(elasticIndexSettings.IndexName); var createIndexDescriptor = new CreateIndexDescriptor(fullIndexName) .Settings(s => indexSettingsDescriptor) @@ -562,33 +269,31 @@ await _elasticClient.MapAsync(p => p return response.Acknowledged; } - private TokenFiltersDescriptor GetTokenFilterDescriptor(Dictionary filter) + private TokenFiltersDescriptor GetTokenFiltersDescriptor(Dictionary filters) { var descriptor = new TokenFiltersDescriptor(); - foreach (var filterName in filter.Keys) + foreach (var filter in filters) { - var filterProps = filter[filterName]; - - if (filterProps.TryGetPropertyValue("type", out var typeObject) is false - || _tokenFilterBuildingInfoGetter.TryGetValue(typeObject.ToString(), out var tokenFilterBuildingInfo) is false) + if (!filter.Value.TryGetPropertyValue("type", out var typeObject) || + !_tokenFilterGetter.TryGetValue(typeObject.ToString(), out var tokenFilterBuildingInfo)) { continue; } - var properties = tokenFilterBuildingInfo.TokenFilter.GetType().GetProperties(BindingFlags.Public | BindingFlags.Instance); + var tokenFilter = tokenFilterBuildingInfo.Invoke(); - foreach (var filterProperty in filterProps) + var properties = tokenFilter.GetType().GetProperties(BindingFlags.Public | BindingFlags.Instance); + + foreach (var filterProperty in filter.Value) { if (filterProperty.Value == null || string.Equals(filterProperty.Key, "type", StringComparison.OrdinalIgnoreCase)) { continue; } - var key = filterProperty.Key.Replace("_", string.Empty); - + var key = filterProperty.Key.Replace(_separator, string.Empty); var property = properties.FirstOrDefault(p => p.Name.Equals(key, StringComparison.OrdinalIgnoreCase)); - var propertyType = property.PropertyType; if (property == null) { @@ -597,51 +302,21 @@ private TokenFiltersDescriptor GetTokenFilterDescriptor(Dictionary>(filterProperty.Value); - property.SetValue(tokenFilterBuildingInfo.TokenFilter, new StopWords(propertyValue)); - } - else - { - var propertyValue = JsonSerializer.Deserialize(filterProperty.Value); - property.SetValue(tokenFilterBuildingInfo.TokenFilter, new StopWords(propertyValue)); - } - - tokenFilterBuildingInfo.AddTokenFilter(descriptor, tokenFilterBuildingInfo.TokenFilter, filterName); - _tokenFilterNames.Add(filterName); - - continue; - } - - if (filterProperty.Value is JsonArray jsonArray) - { - var propertyValue = JsonSerializer.Deserialize(filterProperty.Value, propertyType); - property.SetValue(tokenFilterBuildingInfo.TokenFilter, propertyValue); - } - else - { - var propertyValue = JsonSerializer.Deserialize(filterProperty.Value, propertyType); - property.SetValue(tokenFilterBuildingInfo.TokenFilter, propertyValue); - } + PopulateValue(tokenFilter, property, filterProperty.Value); - tokenFilterBuildingInfo.AddTokenFilter(descriptor, tokenFilterBuildingInfo.TokenFilter, filterName); - _tokenFilterNames.Add(filterName); + descriptor.UserDefined(filter.Key, tokenFilter); } catch (Exception e) { - Console.WriteLine(e.Message); + _logger.LogError(e, "Unable to parse token filter for Elasticsearch (Filter: {Key}).", filter.Key); } } - } return descriptor; } - private IAnalyzer CreateAnalyzer(JsonObject analyzerProperties) + private IAnalyzer GetAnalyzer(JsonObject analyzerProperties) { IAnalyzer analyzer = null; @@ -670,32 +345,12 @@ private IAnalyzer CreateAnalyzer(JsonObject analyzerProperties) try { - if (property.PropertyType == typeof(StopWords)) - { - if (analyzerProperty.Value is JsonArray) - { - var values = analyzerProperty.Value.Values().ToArray(); - - property.SetValue(analyzer, new StopWords(values)); - } - - continue; - } - - if (analyzerProperty.Value is JsonArray jsonArray) - { - var values = jsonArray.Values().ToArray(); - - property.SetValue(analyzer, values); - } - else - { - var value = JNode.ToObject(analyzerProperty.Value, property.PropertyType); - - property.SetValue(analyzer, value); - } + PopulateValue(analyzer, property, analyzerProperty.Value); + } + catch (Exception e) + { + _logger.LogError(e, "Unable to parse an analyzer for Elasticsearch."); } - catch { } } } @@ -757,8 +412,9 @@ public async Task GetIndexInfo(string indexName) /// public async Task SetLastTaskId(string indexName, long lastTaskId) { - var IndexingState = new FluentDictionary() { - { _lastTaskId, lastTaskId } + var IndexingState = new FluentDictionary() + { + { _lastTaskId, lastTaskId }, }; var putMappingRequest = new PutMappingRequest(GetFullIndexName(indexName)) @@ -1065,6 +721,38 @@ public string GetFullIndexName(string indexName) return GetIndexPrefix() + _separator + indexName; } + private static void PopulateValue(object destination, PropertyInfo property, JsonNode value) + { + if (property.PropertyType == typeof(StopWords)) + { + if (value is JsonArray) + { + var values = value.Values().ToArray(); + + property.SetValue(destination, new StopWords(values)); + } + else if (value.TryGetValue(out var saveValue)) + { + property.SetValue(destination, new StopWords(saveValue)); + } + + return; + } + + if (value is JsonArray jsonArray) + { + var values = jsonArray.Values().ToArray(); + + property.SetValue(destination, values); + } + else + { + var safeValue = JNode.ToObject(value, property.PropertyType); + + property.SetValue(destination, safeValue); + } + } + private static void AddValue(Dictionary entries, string key, object value) { if (entries.TryAdd(key, value)) diff --git a/src/docs/reference/modules/Elasticsearch/README.md b/src/docs/reference/modules/Elasticsearch/README.md index 63affe55328..4d775a33978 100644 --- a/src/docs/reference/modules/Elasticsearch/README.md +++ b/src/docs/reference/modules/Elasticsearch/README.md @@ -291,6 +291,34 @@ At the same time, you may define custom analyzers using the [appsettings.json](. } ``` +## Elasticsearch Token-Filters + +As of version 2.1, you can define custom [token filters](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-tokenfilters.html) in your Elasticsearch configuration. To add new custom token filters, update your Elasticsearch settings accordingly. + +For instance, to create a token filter named `english_stop`, you can include the following configuration in your `appsettings.json` file: + +```json +"OrchardCore_Elasticsearch": { + "TokenFilters": { + "english_stop": { + "type": "stop", + "stopwords": "_english_" + } + }, + "Analyzers": { + "my_new_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "english_stop" + ] + } + } +} +``` + +In this example, the `english_stop` token filter removes English stop words, and the `my_new_analyzer` uses the standard tokenizer along with the `english_stop` filter to process text. + ## Elasticsearch vs Lucene Both modules are complementary and can be enabled at the same time. diff --git a/src/docs/releases/2.1.0.md b/src/docs/releases/2.1.0.md index b9e06dd13d8..e33e2be7630 100644 --- a/src/docs/releases/2.1.0.md +++ b/src/docs/releases/2.1.0.md @@ -192,6 +192,36 @@ Thanks to the newly introduced System-Type capability, it is no longer necessary !!! note The `DashboardPart` will no longer be available as an attachable part by default. +### Elasticsearch + +#### Add Custom Token-Filters Support + +You can now define custom [token filters](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-tokenfilters.html) in your Elasticsearch configuration. To add new custom token filters, update your Elasticsearch settings accordingly. + +For instance, to create a token filter named `english_stop`, you can include the following configuration in your `appsettings.json` file: + +```json +"OrchardCore_Elasticsearch": { + "TokenFilters": { + "english_stop": { + "type": "stop", + "stopwords": "_english_" + } + }, + "Analyzers": { + "my_new_analyzer": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "english_stop" + ] + } + } +} +``` + +In this example, the `english_stop` token filter removes English stop words, and the `my_new_analyzer` uses the standard tokenizer along with the `english_stop` filter to process text. + ## Added Features ### Azure Communication Services SMS Feature