From bf39cb2bb19fb1796f58c2c9e7ec56ca5b90ea22 Mon Sep 17 00:00:00 2001 From: "yingjie.cai" Date: Tue, 7 Mar 2017 17:58:23 +0800 Subject: [PATCH 01/12] make DynamicSynonymTokenFilterFactory singleton --- .../plugin/DynamicSynonymPlugin.java | 2 +- .../DynamicSynonymTokenFilterFactory.java | 22 ++++++++++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/DynamicSynonymPlugin.java b/src/main/java/com/bellszhu/elasticsearch/plugin/DynamicSynonymPlugin.java index c4ab821..f916c58 100644 --- a/src/main/java/com/bellszhu/elasticsearch/plugin/DynamicSynonymPlugin.java +++ b/src/main/java/com/bellszhu/elasticsearch/plugin/DynamicSynonymPlugin.java @@ -65,7 +65,7 @@ public Map> getToken @Override public TokenFilterFactory get(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException { - return new DynamicSynonymTokenFilterFactory(indexSettings, environment, name, settings, pluginComponent.getAnalysisRegistry()); + return DynamicSynonymTokenFilterFactory.getInstance(indexSettings, environment, name, settings, pluginComponent.getAnalysisRegistry()); } @Override diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java index f7c1578..46f903f 100644 --- a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java +++ b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java @@ -7,7 +7,6 @@ import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.synonym.SynonymMap; -import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.logging.ESLoggerFactory; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; @@ -47,6 +46,27 @@ public class DynamicSynonymTokenFilterFactory extends private SynonymMap synonymMap; private Map dynamicSynonymFilters = new WeakHashMap(); + private static DynamicSynonymTokenFilterFactory instance = null; + + public synchronized static DynamicSynonymTokenFilterFactory getInstance( + IndexSettings indexSettings, + Environment env, + String name, + Settings settings, + AnalysisRegistry analysisRegistry + ) throws IOException { + if( instance != null ){ + return instance; + } + instance = new DynamicSynonymTokenFilterFactory( + indexSettings, + env, + name, + settings, + analysisRegistry + ); + return instance; + } public DynamicSynonymTokenFilterFactory( IndexSettings indexSettings, From e778df7ed2fd205b30e7ca8718743b44e579726e Mon Sep 17 00:00:00 2001 From: "yingjie.cai" Date: Thu, 26 Oct 2017 14:53:34 +0800 Subject: [PATCH 02/12] make same synonyms_path share one DynamicSynonymTokenFilterFactory instance --- .../DynamicSynonymTokenFilterFactory.java | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java index 46f903f..6682d66 100644 --- a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java +++ b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java @@ -19,10 +19,7 @@ import java.io.IOException; import java.util.Map; import java.util.WeakHashMap; -import java.util.concurrent.Executors; -import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.ScheduledFuture; -import java.util.concurrent.TimeUnit; +import java.util.concurrent.*; /** * @@ -46,7 +43,7 @@ public class DynamicSynonymTokenFilterFactory extends private SynonymMap synonymMap; private Map dynamicSynonymFilters = new WeakHashMap(); - private static DynamicSynonymTokenFilterFactory instance = null; + private static ConcurrentMap instanceMap = new ConcurrentHashMap<>(); public synchronized static DynamicSynonymTokenFilterFactory getInstance( IndexSettings indexSettings, @@ -55,6 +52,14 @@ public synchronized static DynamicSynonymTokenFilterFactory getInstance( Settings settings, AnalysisRegistry analysisRegistry ) throws IOException { + + String location = settings.get("synonyms_path"); + if (location == null) { + throw new IllegalArgumentException( + "dynamic synonym requires `synonyms_path` to be configured"); + } + + DynamicSynonymTokenFilterFactory instance = instanceMap.get(location); if( instance != null ){ return instance; } @@ -65,6 +70,7 @@ public synchronized static DynamicSynonymTokenFilterFactory getInstance( settings, analysisRegistry ); + instanceMap.put(location,instance); return instance; } From 598491a52edb091f6b25ed2d1e8dfb26dd9a1aca Mon Sep 17 00:00:00 2001 From: "yingjie.cai" Date: Thu, 26 Oct 2017 15:04:32 +0800 Subject: [PATCH 03/12] update for 5.6.3 --- pom.xml | 4 ++-- .../bellszhu/elasticsearch/plugin/DynamicSynonymPlugin.java | 4 ++-- .../synonym/analysis/DynamicSynonymTokenFilterFactory.java | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pom.xml b/pom.xml index 429a5b8..4185322 100644 --- a/pom.xml +++ b/pom.xml @@ -4,14 +4,14 @@ com.bellszhu.elasticsearch elasticsearch-analysis-dynamic-synonym - 5.1.1 + 5.6.3 jar elasticsearch-dynamic-synonym Analysis-plugin for synonym UTF-8 - 5.1.1 + 5.6.3 1.8 analysis-dynamic-synonym ${project.basedir}/src/main/assemblies/plugin.xml diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/DynamicSynonymPlugin.java b/src/main/java/com/bellszhu/elasticsearch/plugin/DynamicSynonymPlugin.java index f916c58..813464c 100644 --- a/src/main/java/com/bellszhu/elasticsearch/plugin/DynamicSynonymPlugin.java +++ b/src/main/java/com/bellszhu/elasticsearch/plugin/DynamicSynonymPlugin.java @@ -8,6 +8,7 @@ import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.common.component.LifecycleComponent; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.xcontent.NamedXContentRegistry; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AnalysisRegistry; @@ -18,7 +19,6 @@ import com.bellszhu.elasticsearch.plugin.synonym.analysis.DynamicSynonymTokenFilterFactory; import org.elasticsearch.script.ScriptService; -import org.elasticsearch.search.SearchRequestParsers; import org.elasticsearch.threadpool.ThreadPool; import org.elasticsearch.watcher.ResourceWatcherService; @@ -44,7 +44,7 @@ public Collection createComponents( ThreadPool threadPool, ResourceWatcherService resourceWatcherService, ScriptService scriptService, - SearchRequestParsers searchRequestParsers + NamedXContentRegistry xContentRegistry ) { Collection components = new ArrayList<>(); components.add(pluginComponent); diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java index 6682d66..94ff3e6 100644 --- a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java +++ b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java @@ -123,7 +123,7 @@ protected TokenStreamComponents createComponents(String fieldName) { }; SynonymFile synonymFile; - if (location.startsWith("http://")) { + if (location.startsWith("http://") || location.startsWith("https://")) { synonymFile = new RemoteSynonymFile(env, analyzer, expand, format, location); } else { From 9fdf27e802a5c4ce763e27e81b731982e5aa6bec Mon Sep 17 00:00:00 2001 From: "yingjie.cai" Date: Thu, 3 Jan 2019 21:26:04 +0800 Subject: [PATCH 04/12] change assemble --- pom.xml | 7 ++----- src/main/assemblies/plugin.xml | 6 +++--- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 4185322..4ace988 100644 --- a/pom.xml +++ b/pom.xml @@ -101,16 +101,13 @@ maven-assembly-plugin + 2.3 + false ${project.build.directory}/releases/ ${basedir}/src/main/assemblies/plugin.xml - - - fully.qualified.MainClass - - diff --git a/src/main/assemblies/plugin.xml b/src/main/assemblies/plugin.xml index 7e6df66..a3da411 100644 --- a/src/main/assemblies/plugin.xml +++ b/src/main/assemblies/plugin.xml @@ -8,13 +8,13 @@ ${project.basedir}/src/main/resources/plugin-descriptor.properties - + elasticsearch true - / + elasticsearch true true @@ -22,7 +22,7 @@ - / + elasticsearch true true From 674a797d1ba8f30c1fbb0494788bb47242bea6de Mon Sep 17 00:00:00 2001 From: "yingjie.cai" Date: Mon, 14 Jan 2019 16:43:32 +0800 Subject: [PATCH 05/12] update to 6.5.4 --- pom.xml | 4 +-- .../analysis/DynamicSynonymFilter.java | 4 +-- .../DynamicSynonymTokenFilterFactory.java | 6 ++--- .../synonym/analysis/LocalSynonymFile.java | 27 +++++++------------ .../synonym/analysis/RemoteSynonymFile.java | 4 +-- 5 files changed, 17 insertions(+), 28 deletions(-) diff --git a/pom.xml b/pom.xml index 4ace988..241777a 100644 --- a/pom.xml +++ b/pom.xml @@ -4,14 +4,14 @@ com.bellszhu.elasticsearch elasticsearch-analysis-dynamic-synonym - 5.6.3 + 6.5.4 jar elasticsearch-dynamic-synonym Analysis-plugin for synonym UTF-8 - 5.6.3 + 6.5.4 1.8 analysis-dynamic-synonym ${project.basedir}/src/main/assemblies/plugin.xml diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymFilter.java b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymFilter.java index 6edca5b..b538fcd 100644 --- a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymFilter.java +++ b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymFilter.java @@ -25,7 +25,7 @@ import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.util.*; import org.apache.lucene.util.fst.FST; -import org.elasticsearch.common.logging.ESLoggerFactory; +import org.elasticsearch.common.logging.Loggers; import java.io.IOException; import java.util.Arrays; @@ -102,7 +102,7 @@ public final class DynamicSynonymFilter extends TokenFilter { - public static Logger logger = ESLoggerFactory.getLogger("dynamic-synonym"); + public static Logger logger = Loggers.getLogger("dynamic-synonym"); public static final String TYPE_SYNONYM = "SYNONYM"; diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java index 5c32ace..fd0bd14 100644 --- a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java +++ b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java @@ -19,7 +19,7 @@ import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.synonym.SynonymMap; -import org.elasticsearch.common.logging.ESLoggerFactory; +import org.elasticsearch.common.logging.Loggers; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; @@ -29,8 +29,6 @@ import org.elasticsearch.indices.analysis.AnalysisModule; import java.io.IOException; -import java.util.Map; -import java.util.WeakHashMap; import java.util.concurrent.*; /** @@ -41,7 +39,7 @@ public class DynamicSynonymTokenFilterFactory extends AbstractTokenFilterFactory { - public static Logger logger = ESLoggerFactory.getLogger("dynamic-synonym"); + public static Logger logger = Loggers.getLogger("dynamic-synonym"); /** * 静态的id生成器 diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/LocalSynonymFile.java b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/LocalSynonymFile.java index eb98846..f46b3c9 100644 --- a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/LocalSynonymFile.java +++ b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/LocalSynonymFile.java @@ -3,22 +3,18 @@ */ package com.bellszhu.elasticsearch.plugin.synonym.analysis; -import java.io.BufferedReader; -import java.io.File; -import java.io.IOException; -import java.io.Reader; -import java.nio.file.Path; - import org.apache.commons.codec.Charsets; import org.apache.logging.log4j.Logger; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.synonym.SolrSynonymParser; import org.apache.lucene.analysis.synonym.SynonymMap; import org.apache.lucene.analysis.synonym.WordnetSynonymParser; -import org.elasticsearch.common.io.FileSystemUtils; -import org.elasticsearch.common.logging.ESLoggerFactory; +import org.elasticsearch.common.logging.Loggers; import org.elasticsearch.env.Environment; +import java.io.*; +import java.nio.file.Path; + /** * @author bellszhu @@ -26,7 +22,7 @@ */ public class LocalSynonymFile implements SynonymFile { - public static Logger logger = ESLoggerFactory.getLogger("dynamic-synonym"); + public static Logger logger = Loggers.getLogger("dynamic-synonym"); private String format; @@ -82,19 +78,14 @@ public Reader getReader() { Reader reader = null; BufferedReader br = null; try { - reader = FileSystemUtils.newBufferedReader( - synonymFilePath.toUri().toURL(), Charsets.UTF_8); - /* - br = new BufferedReader(new InputStreamReader( - synonymFileURL.openStream(), Charsets.UTF_8)); - StringBuffer sb = new StringBuffer(""); - String line = null; + br = new BufferedReader(new InputStreamReader(synonymFilePath.toUri().toURL().openStream(), Charsets.UTF_8)); + StringBuffer sb = new StringBuffer(); + String line; while ((line = br.readLine()) != null) { logger.info("reload local synonym: {}", line); sb.append(line).append(System.getProperty("line.separator")); } - reader = new FastStringReader(sb.toString()); - */ + reader = new StringReader(sb.toString()); } catch (IOException e) { logger.error("get local synonym reader {} error!", e, location); throw new IllegalArgumentException( diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/RemoteSynonymFile.java b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/RemoteSynonymFile.java index 4cb402a..516dd4b 100644 --- a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/RemoteSynonymFile.java +++ b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/RemoteSynonymFile.java @@ -14,7 +14,7 @@ import org.apache.lucene.analysis.synonym.SolrSynonymParser; import org.apache.lucene.analysis.synonym.SynonymMap; import org.apache.lucene.analysis.synonym.WordnetSynonymParser; -import org.elasticsearch.common.logging.ESLoggerFactory; +import org.elasticsearch.common.logging.Loggers; import org.elasticsearch.env.Environment; import java.io.BufferedReader; @@ -28,7 +28,7 @@ */ public class RemoteSynonymFile implements SynonymFile { - public static Logger logger = ESLoggerFactory.getLogger("dynamic-synonym"); + public static Logger logger = Loggers.getLogger("dynamic-synonym"); private CloseableHttpClient httpclient = HttpClients.createDefault(); From 9f998dd26d947d1684fea3b54190eb87d22caf2d Mon Sep 17 00:00:00 2001 From: "yingjie.cai" Date: Mon, 14 Jan 2019 16:51:33 +0800 Subject: [PATCH 06/12] commit --- .../plugin/DynamicSynonymPlugin.java | 30 ++++++++++++++----- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/DynamicSynonymPlugin.java b/src/main/java/com/bellszhu/elasticsearch/plugin/DynamicSynonymPlugin.java index ee1eb76..f4dd1ca 100644 --- a/src/main/java/com/bellszhu/elasticsearch/plugin/DynamicSynonymPlugin.java +++ b/src/main/java/com/bellszhu/elasticsearch/plugin/DynamicSynonymPlugin.java @@ -7,10 +7,12 @@ import org.elasticsearch.client.Client; import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.common.component.LifecycleComponent; +import org.elasticsearch.common.io.stream.NamedWriteableRegistry; import org.elasticsearch.common.xcontent.NamedXContentRegistry; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.xcontent.NamedXContentRegistry; import org.elasticsearch.env.Environment; +import org.elasticsearch.env.NodeEnvironment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AnalysisRegistry; import org.elasticsearch.index.analysis.TokenFilterFactory; @@ -39,19 +41,31 @@ public class DynamicSynonymPlugin extends Plugin implements AnalysisPlugin { private PluginComponent pluginComponent = new PluginComponent(); @Override - public Collection createComponents( - Client client, - ClusterService clusterService, - ThreadPool threadPool, - ResourceWatcherService resourceWatcherService, - ScriptService scriptService, - NamedXContentRegistry xContentRegistry - ) { + public Collection createComponents(Client client, ClusterService clusterService, ThreadPool threadPool, + ResourceWatcherService resourceWatcherService, ScriptService scriptService, + NamedXContentRegistry xContentRegistry, Environment environment, + NodeEnvironment nodeEnvironment, NamedWriteableRegistry namedWriteableRegistry) { + Collection components = new ArrayList<>(); components.add(pluginComponent); return components; } + +// @Override +// public Collection createComponents( +// Client client, +// ClusterService clusterService, +// ThreadPool threadPool, +// ResourceWatcherService resourceWatcherService, +// ScriptService scriptService, +// NamedXContentRegistry xContentRegistry +// ) { +// Collection components = new ArrayList<>(); +// components.add(pluginComponent); +// return components; +// } + @Override public Collection> getGuiceServiceClasses() { return singletonList(DynamicSynonymAnalysisService.class); From 96c461623607500cbae229d1b17e22b7e79a66da Mon Sep 17 00:00:00 2001 From: "yingjie.cai" Date: Tue, 15 Jan 2019 11:02:16 +0800 Subject: [PATCH 07/12] package --- src/main/assemblies/plugin.xml | 3 --- src/main/resources/plugin-descriptor.properties | 6 +++--- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/main/assemblies/plugin.xml b/src/main/assemblies/plugin.xml index a3da411..328d2bc 100644 --- a/src/main/assemblies/plugin.xml +++ b/src/main/assemblies/plugin.xml @@ -8,13 +8,11 @@ ${project.basedir}/src/main/resources/plugin-descriptor.properties - elasticsearch true - elasticsearch true true @@ -22,7 +20,6 @@ - elasticsearch true true diff --git a/src/main/resources/plugin-descriptor.properties b/src/main/resources/plugin-descriptor.properties index 29b3c64..58226c3 100644 --- a/src/main/resources/plugin-descriptor.properties +++ b/src/main/resources/plugin-descriptor.properties @@ -43,7 +43,7 @@ name=${elasticsearch.plugin.name} # # 'site': set to true to indicate contents of the _site/ # directory in the root of the plugin should be served. -site=${elasticsearch.plugin.site} +#site=${elasticsearch.plugin.site} # ### mandatory elements for jvm plugins : # @@ -52,7 +52,7 @@ site=${elasticsearch.plugin.site} # Note that only jar files in the root directory are # added to the classpath for the plugin! If you need # other resources, package them into a resources jar. -jvm=${elasticsearch.plugin.jvm} +#jvm=${elasticsearch.plugin.jvm} # # 'classname': the name of the class to load, fully-qualified. classname=${elasticsearch.plugin.classname} @@ -76,5 +76,5 @@ elasticsearch.version=${elasticsearch.version} # passing false is deprecated, and only intended to support plugins # that have hard dependencies against each other. If this is # not specified, then the plugin is isolated by default. -isolated=${elasticsearch.plugin.isolated} +#isolated=${elasticsearch.plugin.isolated} # \ No newline at end of file From 4a3a1648e50cf6287a9743e2b02d394e8681a711 Mon Sep 17 00:00:00 2001 From: "yingjie.cai" Date: Tue, 15 Jan 2019 14:56:42 +0800 Subject: [PATCH 08/12] secure check --- src/main/assemblies/plugin.xml | 8 +++++ .../synonym/analysis/RemoteSynonymFile.java | 29 +++++++++++++++++-- src/main/resources/plugin-security.policy | 4 +++ 3 files changed, 39 insertions(+), 2 deletions(-) create mode 100644 src/main/resources/plugin-security.policy diff --git a/src/main/assemblies/plugin.xml b/src/main/assemblies/plugin.xml index 328d2bc..088846a 100644 --- a/src/main/assemblies/plugin.xml +++ b/src/main/assemblies/plugin.xml @@ -8,11 +8,18 @@ ${project.basedir}/src/main/resources/plugin-descriptor.properties + + true + + + ${project.basedir}/src/main/resources/plugin-security.policy + true + true true @@ -20,6 +27,7 @@ + true true diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/RemoteSynonymFile.java b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/RemoteSynonymFile.java index 516dd4b..16e1913 100644 --- a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/RemoteSynonymFile.java +++ b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/RemoteSynonymFile.java @@ -14,6 +14,7 @@ import org.apache.lucene.analysis.synonym.SolrSynonymParser; import org.apache.lucene.analysis.synonym.SynonymMap; import org.apache.lucene.analysis.synonym.WordnetSynonymParser; +import org.elasticsearch.SpecialPermission; import org.elasticsearch.common.logging.Loggers; import org.elasticsearch.env.Environment; @@ -21,6 +22,8 @@ import java.io.IOException; import java.io.InputStreamReader; import java.io.Reader; +import java.security.AccessController; +import java.security.PrivilegedAction; /** * @author bellszhu @@ -61,7 +64,14 @@ public RemoteSynonymFile(Environment env, Analyzer analyzer, } @Override - public SynonymMap reloadSynonymMap() { + public SynonymMap reloadSynonymMap(){ + SpecialPermission.check(); + return AccessController.doPrivileged((PrivilegedAction) () -> { + return reloadSynonymMapUnprivileged(); + }); + } + + public SynonymMap reloadSynonymMapUnprivileged() { Reader rulesReader = null; try { logger.info("start reload remote synonym from {}.", location); @@ -93,10 +103,18 @@ public SynonymMap reloadSynonymMap() { } } + @Override + public Reader getReader() { + SpecialPermission.check(); + return AccessController.doPrivileged((PrivilegedAction) () -> { + return getReaderUnprivileged(); + }); + } + /** * 从远程服务器上下载自定义词条 */ - public Reader getReader() { + public Reader getReaderUnprivileged() { Reader reader = null; RequestConfig rc = RequestConfig.custom() .setConnectionRequestTimeout(10 * 1000) @@ -155,6 +173,13 @@ public Reader getReader() { @Override public boolean isNeedReloadSynonymMap() { + SpecialPermission.check(); + return AccessController.doPrivileged((PrivilegedAction) () -> { + return isNeedReloadSynonymMapUnprivileged(); + }); + } + + public boolean isNeedReloadSynonymMapUnprivileged() { RequestConfig rc = RequestConfig.custom() .setConnectionRequestTimeout(10 * 1000) .setConnectTimeout(10 * 1000).setSocketTimeout(15 * 1000) diff --git a/src/main/resources/plugin-security.policy b/src/main/resources/plugin-security.policy new file mode 100644 index 0000000..55d759a --- /dev/null +++ b/src/main/resources/plugin-security.policy @@ -0,0 +1,4 @@ +grant { + // needed because of the hot reload functionality + permission java.net.SocketPermission "*", "connect,resolve"; +}; \ No newline at end of file From 7999a8426a543a91480197a08fd786518d5f9ccd Mon Sep 17 00:00:00 2001 From: "yingjie.cai" Date: Fri, 18 Jan 2019 15:17:13 +0800 Subject: [PATCH 09/12] ignore offset option --- .../synonym/analysis/DynamicSynonymFilter.java | 14 +++++++++++--- .../analysis/DynamicSynonymTokenFilterFactory.java | 4 +++- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymFilter.java b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymFilter.java index b538fcd..715bc4f 100644 --- a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymFilter.java +++ b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymFilter.java @@ -109,6 +109,7 @@ public final class DynamicSynonymFilter extends TokenFilter { private SynonymMap synonyms; private final boolean ignoreCase; + private final boolean ignoreOffset; private int rollBufferSize; private int captureCount; @@ -263,9 +264,12 @@ public void add(char[] output, int offset, int len, int endOffset, * when you create the {@link SynonymMap} */ public DynamicSynonymFilter(TokenStream input, SynonymMap synonyms, - boolean ignoreCase) { + boolean ignoreCase + ,boolean ignoreOffset + ) { super(input); this.ignoreCase = ignoreCase; + this.ignoreOffset = ignoreOffset; update(synonyms); } @@ -589,7 +593,9 @@ public boolean incrementToken() throws IOException { if (endOffset == -1) { endOffset = input.endOffset; } - offsetAtt.setOffset(input.startOffset, endOffset); + if(!ignoreOffset){ + offsetAtt.setOffset(input.startOffset, endOffset); + } posIncrAtt.setPositionIncrement(posIncr); posLenAtt.setPositionLength(outputs.getLastPosLength()); if (outputs.count == 0) { @@ -623,7 +629,9 @@ public boolean incrementToken() throws IOException { } clearAttributes(); // Keep offset from last input token: - offsetAtt.setOffset(lastStartOffset, lastEndOffset); + if(!this.ignoreOffset){ + offsetAtt.setOffset(lastStartOffset, lastEndOffset); + } termAtt.copyBuffer(output.chars, output.offset, output.length); typeAtt.setType(TYPE_SYNONYM); diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java index fd0bd14..e5cd451 100644 --- a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java +++ b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java @@ -62,6 +62,7 @@ public Thread newThread(Runnable r) { private final boolean expand; private final String format; private final int interval; + private final boolean ignoreOffset; private SynonymMap synonymMap; private Map dynamicSynonymFilters = new WeakHashMap(); @@ -117,6 +118,7 @@ public DynamicSynonymTokenFilterFactory( this.ignoreCase = settings.getAsBoolean("ignore_case", false); this.expand = settings.getAsBoolean("expand", true); this.format = settings.get("format", ""); + this.ignoreOffset = settings.getAsBoolean("ignore_offset", true); String tokenizerName = settings.get("tokenizer", "whitespace"); @@ -162,7 +164,7 @@ protected TokenStreamComponents createComponents(String fieldName) { @Override public TokenStream create(TokenStream tokenStream) { DynamicSynonymFilter dynamicSynonymFilter = new DynamicSynonymFilter( - tokenStream, synonymMap, ignoreCase); + tokenStream, synonymMap, ignoreCase,ignoreOffset); dynamicSynonymFilters.put(dynamicSynonymFilter, 1); // fst is null means no synonyms From aa0e66bdd795e4f8b4c6fce4a938cf16ef8d4b23 Mon Sep 17 00:00:00 2001 From: "yingjie.cai" Date: Thu, 14 Feb 2019 09:45:56 +0800 Subject: [PATCH 10/12] synonym filter graph --- .../plugin/DynamicSynonymPlugin.java | 3 +- .../analysis/DynamicSynonymFilter.java | 4 + .../analysis/DynamicSynonymGraphFilter.java | 629 ++++++++++++++++++ ...DynamicSynonymGraphTokenFilterFactory.java | 185 ++++++ 4 files changed, 820 insertions(+), 1 deletion(-) create mode 100644 src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymGraphFilter.java create mode 100644 src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymGraphTokenFilterFactory.java diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/DynamicSynonymPlugin.java b/src/main/java/com/bellszhu/elasticsearch/plugin/DynamicSynonymPlugin.java index f4dd1ca..953e48b 100644 --- a/src/main/java/com/bellszhu/elasticsearch/plugin/DynamicSynonymPlugin.java +++ b/src/main/java/com/bellszhu/elasticsearch/plugin/DynamicSynonymPlugin.java @@ -3,6 +3,7 @@ */ package com.bellszhu.elasticsearch.plugin; +import com.bellszhu.elasticsearch.plugin.synonym.analysis.DynamicSynonymGraphTokenFilterFactory; import com.bellszhu.elasticsearch.plugin.synonym.service.DynamicSynonymAnalysisService; import org.elasticsearch.client.Client; import org.elasticsearch.cluster.service.ClusterService; @@ -80,7 +81,7 @@ public Map> getToken @Override public TokenFilterFactory get(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException { - return DynamicSynonymTokenFilterFactory.getInstance(indexSettings, environment, name, settings, pluginComponent.getAnalysisRegistry()); + return DynamicSynonymGraphTokenFilterFactory.getInstance(indexSettings, environment, name, settings, pluginComponent.getAnalysisRegistry()); } @Override diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymFilter.java b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymFilter.java index 715bc4f..67ed610 100644 --- a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymFilter.java +++ b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymFilter.java @@ -595,6 +595,8 @@ public boolean incrementToken() throws IOException { } if(!ignoreOffset){ offsetAtt.setOffset(input.startOffset, endOffset); + }else{ + logger.info("dynamic synonym offset ignored"); } posIncrAtt.setPositionIncrement(posIncr); posLenAtt.setPositionLength(outputs.getLastPosLength()); @@ -631,6 +633,8 @@ public boolean incrementToken() throws IOException { // Keep offset from last input token: if(!this.ignoreOffset){ offsetAtt.setOffset(lastStartOffset, lastEndOffset); + }else{ + logger.info("set offset ignored"); } termAtt.copyBuffer(output.chars, output.offset, output.length); diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymGraphFilter.java b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymGraphFilter.java new file mode 100644 index 0000000..7d3c253 --- /dev/null +++ b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymGraphFilter.java @@ -0,0 +1,629 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.bellszhu.elasticsearch.plugin.synonym.analysis; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; + +import org.apache.logging.log4j.Logger; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.FlattenGraphFilter; +import org.apache.lucene.analysis.synonym.SynonymFilter; +import org.apache.lucene.analysis.synonym.SynonymMap; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRefBuilder; +import org.apache.lucene.util.RollingBuffer; +import org.apache.lucene.util.fst.FST; +import org.elasticsearch.common.logging.Loggers; + +// TODO: maybe we should resolve token -> wordID then run +// FST on wordIDs, for better perf? + +// TODO: a more efficient approach would be Aho/Corasick's +// algorithm +// http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm +// It improves over the current approach here +// because it does not fully re-start matching at every +// token. For example if one pattern is "a b c x" +// and another is "b c d" and the input is "a b c d", on +// trying to parse "a b c x" but failing when you got to x, +// rather than starting over again your really should +// immediately recognize that "b c d" matches at the next +// input. I suspect this won't matter that much in +// practice, but it's possible on some set of synonyms it +// will. We'd have to modify Aho/Corasick to enforce our +// conflict resolving (eg greedy matching) because that algo +// finds all matches. This really amounts to adding a .* +// closure to the FST and then determinizing it. +// +// Another possible solution is described at http://www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps + +/** Applies single- or multi-token synonyms from a {@link SynonymMap} + * to an incoming {@link TokenStream}, producing a fully correct graph + * output. This is a replacement for {@link SynonymFilter}, which produces + * incorrect graphs for multi-token synonyms. + * + *

However, if you use this during indexing, you must follow it with + * {@link FlattenGraphFilter} to squash tokens on top of one another + * like {@link SynonymFilter}, because the indexer can't directly + * consume a graph. To get fully correct positional queries when your + * synonym replacements are multiple tokens, you should instead apply + * synonyms using this {@code TokenFilter} at query time and translate + * the resulting graph to a {@code TermAutomatonQuery} e.g. using + * {@code TokenStreamToTermAutomatonQuery}. + * + *

NOTE: this cannot consume an incoming graph; results will + * be undefined. + * + * @lucene.experimental */ + +public final class DynamicSynonymGraphFilter extends TokenFilter { + + public static final String TYPE_SYNONYM = "SYNONYM"; + public static Logger logger = Loggers.getLogger("dynamic-synonym"); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class); + private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + + private SynonymMap synonyms; + private final boolean ignoreCase; + + private FST fst; + + private FST.BytesReader fstReader; + private FST.Arc scratchArc; + private final ByteArrayDataInput bytesReader = new ByteArrayDataInput(); + private final BytesRef scratchBytes = new BytesRef(); + private final CharsRefBuilder scratchChars = new CharsRefBuilder(); + private final LinkedList outputBuffer = new LinkedList<>(); + + private int nextNodeOut; + private int lastNodeOut; + private int maxLookaheadUsed; + + // For testing: + private int captureCount; + + private boolean liveToken; + + // Start/end offset of the current match: + private int matchStartOffset; + private int matchEndOffset; + + // True once the input TokenStream is exhausted: + private boolean finished; + + private int lookaheadNextRead; + private int lookaheadNextWrite; + + private RollingBuffer lookahead = new RollingBuffer() { + @Override + protected BufferedInputToken newInstance() { + return new BufferedInputToken(); + } + }; + + static class BufferedInputToken implements RollingBuffer.Resettable { + final CharsRefBuilder term = new CharsRefBuilder(); + State state; + int startOffset = -1; + int endOffset = -1; + + @Override + public void reset() { + state = null; + term.clear(); + + // Intentionally invalid to ferret out bugs: + startOffset = -1; + endOffset = -1; + } + } + + static class BufferedOutputToken { + final String term; + + // Non-null if this was an incoming token: + final State state; + + final int startNode; + final int endNode; + + public BufferedOutputToken(State state, String term, int startNode, int endNode) { + this.state = state; + this.term = term; + this.startNode = startNode; + this.endNode = endNode; + } + } + + /** + * Apply previously built synonyms to incoming tokens. + * @param input input tokenstream + * @param synonyms synonym map + * @param ignoreCase case-folds input for matching with {@link Character#toLowerCase(int)}. + * Note, if you set this to true, it's your responsibility to lowercase + * the input entries when you create the {@link SynonymMap} + */ + public DynamicSynonymGraphFilter(TokenStream input, SynonymMap synonyms, boolean ignoreCase) { + super(input); + this.synonyms = synonyms; + this.fst = synonyms.fst; + if (fst == null) { + throw new IllegalArgumentException("fst must be non-null"); + } + this.fstReader = fst.getBytesReader(); + scratchArc = new FST.Arc<>(); + this.ignoreCase = ignoreCase; + } + + @Override + public boolean incrementToken() throws IOException { + //System.out.println("\nS: incrToken lastNodeOut=" + lastNodeOut + " nextNodeOut=" + nextNodeOut); + + assert lastNodeOut <= nextNodeOut; + + if (outputBuffer.isEmpty() == false) { + // We still have pending outputs from a prior synonym match: + releaseBufferedToken(); + //System.out.println(" syn: ret buffered=" + this); + assert liveToken == false; + return true; + } + + // Try to parse a new synonym match at the current token: + + if (parse()) { + // A new match was found: + releaseBufferedToken(); + //System.out.println(" syn: after parse, ret buffered=" + this); + assert liveToken == false; + return true; + } + + if (lookaheadNextRead == lookaheadNextWrite) { + + // Fast path: parse pulled one token, but it didn't match + // the start for any synonym, so we now return it "live" w/o having + // cloned all of its atts: + if (finished) { + //System.out.println(" syn: ret END"); + return false; + } + + assert liveToken; + liveToken = false; + + // NOTE: no need to change posInc since it's relative, i.e. whatever + // node our output is upto will just increase by the incoming posInc. + // We also don't need to change posLen, but only because we cannot + // consume a graph, so the incoming token can never span a future + // synonym match. + + } else { + // We still have buffered lookahead tokens from a previous + // parse attempt that required lookahead; just replay them now: + //System.out.println(" restore buffer"); + assert lookaheadNextRead < lookaheadNextWrite: "read=" + lookaheadNextRead + " write=" + lookaheadNextWrite; + BufferedInputToken token = lookahead.get(lookaheadNextRead); + lookaheadNextRead++; + + restoreState(token.state); + + lookahead.freeBefore(lookaheadNextRead); + + //System.out.println(" after restore offset=" + offsetAtt.startOffset() + "-" + offsetAtt.endOffset()); + assert liveToken == false; + } + + lastNodeOut += posIncrAtt.getPositionIncrement(); + nextNodeOut = lastNodeOut + posLenAtt.getPositionLength(); + + //System.out.println(" syn: ret lookahead=" + this); + + return true; + } + + private void releaseBufferedToken() throws IOException { + //System.out.println(" releaseBufferedToken"); + + BufferedOutputToken token = outputBuffer.pollFirst(); + + if (token.state != null) { + // This is an original input token (keepOrig=true case): + //System.out.println(" hasState"); + restoreState(token.state); + //System.out.println(" startOffset=" + offsetAtt.startOffset() + " endOffset=" + offsetAtt.endOffset()); +// offsetAtt.setOffset(0,0); + } else { + clearAttributes(); + //System.out.println(" no state"); + termAtt.append(token.term); + + // We better have a match already: + assert matchStartOffset != -1; + +// if(!this.ignoreOffset){ + offsetAtt.setOffset(matchStartOffset, matchEndOffset); +// }else { +// offsetAtt.setOffset(0,0); +// } + //System.out.println(" startOffset=" + matchStartOffset + " endOffset=" + matchEndOffset); + typeAtt.setType(TYPE_SYNONYM); + } + + //System.out.println(" lastNodeOut=" + lastNodeOut); + //System.out.println(" term=" + termAtt); + + posIncrAtt.setPositionIncrement(token.startNode - lastNodeOut); + lastNodeOut = token.startNode; + posLenAtt.setPositionLength(token.endNode - token.startNode); + } + + /** Scans the next input token(s) to see if a synonym matches. Returns true + * if a match was found. */ + private boolean parse() throws IOException { + // System.out.println(Thread.currentThread().getName() + ": S: parse: " + System.identityHashCode(this)); + + // Holds the longest match we've seen so far: + BytesRef matchOutput = null; + int matchInputLength = 0; + + BytesRef pendingOutput = fst.outputs.getNoOutput(); + fst.getFirstArc(scratchArc); + + assert scratchArc.output == fst.outputs.getNoOutput(); + + // How many tokens in the current match + int matchLength = 0; + boolean doFinalCapture = false; + + int lookaheadUpto = lookaheadNextRead; + matchStartOffset = -1; + + byToken: + while (true) { + //System.out.println(" cycle lookaheadUpto=" + lookaheadUpto + " maxPos=" + lookahead.getMaxPos()); + + // Pull next token's chars: + final char[] buffer; + final int bufferLen; + final int inputEndOffset; + + if (lookaheadUpto <= lookahead.getMaxPos()) { + // Still in our lookahead buffer + BufferedInputToken token = lookahead.get(lookaheadUpto); + lookaheadUpto++; + buffer = token.term.chars(); + bufferLen = token.term.length(); + inputEndOffset = token.endOffset; + //System.out.println(" use buffer now max=" + lookahead.getMaxPos()); + if (matchStartOffset == -1) { + matchStartOffset = token.startOffset; + } + } else { + + // We used up our lookahead buffer of input tokens + // -- pull next real input token: + + assert finished || liveToken == false; + + if (finished) { + //System.out.println(" break: finished"); + break; + } else if (input.incrementToken()) { + //System.out.println(" input.incrToken"); + liveToken = true; + buffer = termAtt.buffer(); + bufferLen = termAtt.length(); + if (matchStartOffset == -1) { + matchStartOffset = offsetAtt.startOffset(); + } + inputEndOffset = offsetAtt.endOffset(); + + lookaheadUpto++; + } else { + // No more input tokens + finished = true; + //System.out.println(" break: now set finished"); + break; + } + } + + matchLength++; + //System.out.println(" cycle term=" + new String(buffer, 0, bufferLen)); + + // Run each char in this token through the FST: + int bufUpto = 0; + while (bufUpto < bufferLen) { + final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen); + if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null) { + break byToken; + } + + // Accum the output + pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output); + bufUpto += Character.charCount(codePoint); + } + + assert bufUpto == bufferLen; + + // OK, entire token matched; now see if this is a final + // state in the FST (a match): + if (scratchArc.isFinal()) { + matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput); + matchInputLength = matchLength; + matchEndOffset = inputEndOffset; + //System.out.println(" ** match"); + } + + // See if the FST can continue matching (ie, needs to + // see the next input token): + if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null) { + // No further rules can match here; we're done + // searching for matching rules starting at the + // current input position. + break; + } else { + // More matching is possible -- accum the output (if + // any) of the WORD_SEP arc: + pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output); + doFinalCapture = true; + if (liveToken) { + capture(); + } + } + } + + if (doFinalCapture && liveToken && finished == false) { + // Must capture the final token if we captured any prior tokens: + capture(); + } + + if (matchOutput != null) { + + if (liveToken) { + // Single input token synonym; we must buffer it now: + capture(); + } + + // There is a match! + bufferOutputTokens(matchOutput, matchInputLength); + lookaheadNextRead += matchInputLength; + //System.out.println(" precmatch; set lookaheadNextRead=" + lookaheadNextRead + " now max=" + lookahead.getMaxPos()); + lookahead.freeBefore(lookaheadNextRead); + //System.out.println(" match; set lookaheadNextRead=" + lookaheadNextRead + " now max=" + lookahead.getMaxPos()); + return true; + } else { + //System.out.println(" no match; lookaheadNextRead=" + lookaheadNextRead); + return false; + } + + //System.out.println(" parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite); + } + + /** Expands the output graph into the necessary tokens, adding + * synonyms as side paths parallel to the input tokens, and + * buffers them in the output token buffer. */ + private void bufferOutputTokens(BytesRef bytes, int matchInputLength) { + bytesReader.reset(bytes.bytes, bytes.offset, bytes.length); + + final int code = bytesReader.readVInt(); + final boolean keepOrig = (code & 0x1) == 0; + //System.out.println(" buffer: keepOrig=" + keepOrig + " matchInputLength=" + matchInputLength); + + // How many nodes along all paths; we need this to assign the + // node ID for the final end node where all paths merge back: + int totalPathNodes; + if (keepOrig) { + assert matchInputLength > 0; + totalPathNodes = matchInputLength - 1; + } else { + totalPathNodes = 0; + } + + // How many synonyms we will insert over this match: + final int count = code >>> 1; + + // TODO: we could encode this instead into the FST: + + // 1st pass: count how many new nodes we need + List> paths = new ArrayList<>(); + for(int outputIDX=0;outputIDX path = new ArrayList<>(); + paths.add(path); + int chEnd = scratchChars.length(); + for(int chUpto=0; chUpto<=chEnd; chUpto++) { + if (chUpto == chEnd || scratchChars.charAt(chUpto) == SynonymMap.WORD_SEPARATOR) { + path.add(new String(scratchChars.chars(), lastStart, chUpto - lastStart)); + lastStart = 1 + chUpto; + } + } + + assert path.size() > 0; + totalPathNodes += path.size() - 1; + } + //System.out.println(" totalPathNodes=" + totalPathNodes); + + // 2nd pass: buffer tokens for the graph fragment + + // NOTE: totalPathNodes will be 0 in the case where the matched + // input is a single token and all outputs are also a single token + + // We "spawn" a side-path for each of the outputs for this matched + // synonym, all ending back at this end node: + + int startNode = nextNodeOut; + + int endNode = startNode + totalPathNodes + 1; + //System.out.println(" " + paths.size() + " new side-paths"); + + // First, fanout all tokens departing start node for these new side paths: + int newNodeCount = 0; + for(List path : paths) { + int pathEndNode; + //System.out.println(" path size=" + path.size()); + if (path.size() == 1) { + // Single token output, so there are no intermediate nodes: + pathEndNode = endNode; + } else { + pathEndNode = nextNodeOut + newNodeCount + 1; + newNodeCount += path.size() - 1; + } + outputBuffer.add(new BufferedOutputToken(null, path.get(0), startNode, pathEndNode)); + } + + // We must do the original tokens last, else the offsets "go backwards": + if (keepOrig) { + BufferedInputToken token = lookahead.get(lookaheadNextRead); + int inputEndNode; + if (matchInputLength == 1) { + // Single token matched input, so there are no intermediate nodes: + inputEndNode = endNode; + } else { + inputEndNode = nextNodeOut + newNodeCount + 1; + } + + //System.out.println(" keepOrig first token: " + token.term); + + outputBuffer.add(new BufferedOutputToken(token.state, token.term.toString(), startNode, inputEndNode)); + } + + nextNodeOut = endNode; + + // Do full side-path for each syn output: + for(int pathID=0;pathID path = paths.get(pathID); + if (path.size() > 1) { + int lastNode = outputBuffer.get(pathID).endNode; + for(int i=1;i 1) { + // Do full "side path" with the original tokens: + int lastNode = outputBuffer.get(paths.size()).endNode; + for(int i=1;i scheduledFuture; + + private final String location; + private final boolean ignoreCase; + private final boolean expand; + private final String format; + private final int interval; + + private SynonymMap synonymMap; + private Map dynamicSynonymFilters = new WeakHashMap(); + private static ConcurrentMap instanceMap = new ConcurrentHashMap<>(); + + public synchronized static DynamicSynonymGraphTokenFilterFactory getInstance( + IndexSettings indexSettings, + Environment env, + String name, + Settings settings, + AnalysisRegistry analysisRegistry + ) throws IOException { + + String location = settings.get("synonyms_path"); + if (location == null) { + throw new IllegalArgumentException( + "dynamic synonym requires `synonyms_path` to be configured"); + } + + DynamicSynonymGraphTokenFilterFactory instance = instanceMap.get(location); + if( instance != null ){ + return instance; + } + instance = new DynamicSynonymGraphTokenFilterFactory( + indexSettings, + env, + name, + settings, + analysisRegistry + ); + instanceMap.put(location,instance); + return instance; + } + + public DynamicSynonymGraphTokenFilterFactory( + IndexSettings indexSettings, + Environment env, + String name, + Settings settings, + AnalysisRegistry analysisRegistry + ) throws IOException { + + super(indexSettings, name, settings); + + + this.location = settings.get("synonyms_path"); + if (this.location == null) { + throw new IllegalArgumentException( + "dynamic synonym requires `synonyms_path` to be configured"); + } + + this.interval = settings.getAsInt("interval", 60); + this.ignoreCase = settings.getAsBoolean("ignore_case", false); + this.expand = settings.getAsBoolean("expand", true); + this.format = settings.get("format", ""); + + String tokenizerName = settings.get("tokenizer", "whitespace"); + + + AnalysisModule.AnalysisProvider tokenizerFactoryFactory = + analysisRegistry.getTokenizerProvider(tokenizerName, indexSettings); + if (tokenizerFactoryFactory == null) { + throw new IllegalArgumentException("failed to find tokenizer [" + tokenizerName + "] for synonym token filter"); + } + final TokenizerFactory tokenizerFactory = tokenizerFactoryFactory.get(indexSettings, env, tokenizerName, + + AnalysisRegistry.getSettingsFromIndexSettings(indexSettings, AnalysisRegistry.INDEX_ANALYSIS_TOKENIZER + "." + tokenizerName)); + + + + + Analyzer analyzer = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = tokenizerFactory == null ? new WhitespaceTokenizer() : tokenizerFactory.create(); + TokenStream stream = ignoreCase ? new LowerCaseFilter(tokenizer) : tokenizer; + return new TokenStreamComponents(tokenizer, stream); + } + }; + + SynonymFile synonymFile; + if (location.startsWith("http://") || location.startsWith("https://")) { + synonymFile = new RemoteSynonymFile(env, analyzer, expand, format, + location); + } else { + synonymFile = new LocalSynonymFile(env, analyzer, expand, format, + location); + } + synonymMap = synonymFile.reloadSynonymMap(); + + scheduledFuture = pool.scheduleAtFixedRate(new Monitor(synonymFile), + interval, interval, TimeUnit.SECONDS); + + } + + + + @Override + public TokenStream create(TokenStream tokenStream) { + DynamicSynonymGraphFilter dynamicSynonymFilter = new DynamicSynonymGraphFilter( + tokenStream, synonymMap, ignoreCase); + dynamicSynonymFilters.put(dynamicSynonymFilter, 1); + + // fst is null means no synonyms + return synonymMap.fst == null ? tokenStream : dynamicSynonymFilter; + } + + public class Monitor implements Runnable { + + private SynonymFile synonymFile; + + public Monitor(SynonymFile synonymFile) { + this.synonymFile = synonymFile; + } + + @Override + public void run() { + if (synonymFile.isNeedReloadSynonymMap()) { + synonymMap = synonymFile.reloadSynonymMap(); + for (DynamicSynonymGraphFilter dynamicSynonymFilter : dynamicSynonymFilters + .keySet()) { + dynamicSynonymFilter.update(synonymMap); + logger.info("success reload synonym"); + } + } + } + } + +} \ No newline at end of file From a5f5970b398d230432ae3e0904851d97d8aec7f2 Mon Sep 17 00:00:00 2001 From: "yingjie.cai" Date: Mon, 25 Feb 2019 16:26:22 +0800 Subject: [PATCH 11/12] comimt --- pom.xml | 4 ++-- .../analysis/DynamicSynonymFilter.java | 4 ++-- .../analysis/DynamicSynonymGraphFilter.java | 21 +++++++------------ ...DynamicSynonymGraphTokenFilterFactory.java | 4 ++-- .../DynamicSynonymTokenFilterFactory.java | 19 +++++------------ .../synonym/analysis/LocalSynonymFile.java | 4 ++-- .../synonym/analysis/RemoteSynonymFile.java | 4 ++-- 7 files changed, 22 insertions(+), 38 deletions(-) diff --git a/pom.xml b/pom.xml index 241777a..62d7472 100644 --- a/pom.xml +++ b/pom.xml @@ -4,14 +4,14 @@ com.bellszhu.elasticsearch elasticsearch-analysis-dynamic-synonym - 6.5.4 + 6.6.1 jar elasticsearch-dynamic-synonym Analysis-plugin for synonym UTF-8 - 6.5.4 + 6.6.1 1.8 analysis-dynamic-synonym ${project.basedir}/src/main/assemblies/plugin.xml diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymFilter.java b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymFilter.java index 67ed610..d780de5 100644 --- a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymFilter.java +++ b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymFilter.java @@ -17,6 +17,7 @@ * limitations under the License. */ +import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; @@ -25,7 +26,6 @@ import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.util.*; import org.apache.lucene.util.fst.FST; -import org.elasticsearch.common.logging.Loggers; import java.io.IOException; import java.util.Arrays; @@ -102,7 +102,7 @@ public final class DynamicSynonymFilter extends TokenFilter { - public static Logger logger = Loggers.getLogger("dynamic-synonym"); + private final Logger logger = LogManager.getLogger(getClass()); public static final String TYPE_SYNONYM = "SYNONYM"; diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymGraphFilter.java b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymGraphFilter.java index 7d3c253..6a0fba2 100644 --- a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymGraphFilter.java +++ b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymGraphFilter.java @@ -17,29 +17,22 @@ package com.bellszhu.elasticsearch.plugin.synonym.analysis; -import java.io.IOException; -import java.util.ArrayList; -import java.util.LinkedList; -import java.util.List; - -import org.apache.logging.log4j.Logger; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.FlattenGraphFilter; import org.apache.lucene.analysis.synonym.SynonymFilter; import org.apache.lucene.analysis.synonym.SynonymMap; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; -import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.analysis.tokenattributes.*; import org.apache.lucene.store.ByteArrayDataInput; -import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRefBuilder; import org.apache.lucene.util.RollingBuffer; import org.apache.lucene.util.fst.FST; -import org.elasticsearch.common.logging.Loggers; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; // TODO: maybe we should resolve token -> wordID then run // FST on wordIDs, for better perf? @@ -85,7 +78,7 @@ public final class DynamicSynonymGraphFilter extends TokenFilter { public static final String TYPE_SYNONYM = "SYNONYM"; - public static Logger logger = Loggers.getLogger("dynamic-synonym"); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class); diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymGraphTokenFilterFactory.java b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymGraphTokenFilterFactory.java index 18ecf11..1ac920b 100644 --- a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymGraphTokenFilterFactory.java +++ b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymGraphTokenFilterFactory.java @@ -1,6 +1,7 @@ package com.bellszhu.elasticsearch.plugin.synonym.analysis; +import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; @@ -8,7 +9,6 @@ import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.synonym.SynonymMap; -import org.elasticsearch.common.logging.Loggers; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; @@ -31,7 +31,7 @@ public class DynamicSynonymGraphTokenFilterFactory extends AbstractTokenFilterFactory { - public static Logger logger = Loggers.getLogger("dynamic-synonym"); + private final Logger logger = LogManager.getLogger(getClass()); /** * 静态的id生成器 diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java index e5cd451..c774eda 100644 --- a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java +++ b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java @@ -1,25 +1,14 @@ package com.bellszhu.elasticsearch.plugin.synonym.analysis; -import java.util.Map; -import java.util.WeakHashMap; -import java.util.concurrent.Executors; -import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.ScheduledFuture; -import java.util.concurrent.ThreadFactory; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicInteger; - - +import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; - import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.synonym.SynonymMap; -import org.elasticsearch.common.logging.Loggers; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; @@ -29,7 +18,10 @@ import org.elasticsearch.indices.analysis.AnalysisModule; import java.io.IOException; +import java.util.Map; +import java.util.WeakHashMap; import java.util.concurrent.*; +import java.util.concurrent.atomic.AtomicInteger; /** * @@ -39,8 +31,7 @@ public class DynamicSynonymTokenFilterFactory extends AbstractTokenFilterFactory { - public static Logger logger = Loggers.getLogger("dynamic-synonym"); - + private final Logger logger = LogManager.getLogger(getClass()); /** * 静态的id生成器 */ diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/LocalSynonymFile.java b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/LocalSynonymFile.java index f46b3c9..8a8195a 100644 --- a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/LocalSynonymFile.java +++ b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/LocalSynonymFile.java @@ -4,12 +4,12 @@ package com.bellszhu.elasticsearch.plugin.synonym.analysis; import org.apache.commons.codec.Charsets; +import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.synonym.SolrSynonymParser; import org.apache.lucene.analysis.synonym.SynonymMap; import org.apache.lucene.analysis.synonym.WordnetSynonymParser; -import org.elasticsearch.common.logging.Loggers; import org.elasticsearch.env.Environment; import java.io.*; @@ -22,7 +22,7 @@ */ public class LocalSynonymFile implements SynonymFile { - public static Logger logger = Loggers.getLogger("dynamic-synonym"); + private final Logger logger = LogManager.getLogger(getClass()); private String format; diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/RemoteSynonymFile.java b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/RemoteSynonymFile.java index 16e1913..69894a0 100644 --- a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/RemoteSynonymFile.java +++ b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/RemoteSynonymFile.java @@ -9,13 +9,13 @@ import org.apache.http.client.methods.HttpHead; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; +import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.synonym.SolrSynonymParser; import org.apache.lucene.analysis.synonym.SynonymMap; import org.apache.lucene.analysis.synonym.WordnetSynonymParser; import org.elasticsearch.SpecialPermission; -import org.elasticsearch.common.logging.Loggers; import org.elasticsearch.env.Environment; import java.io.BufferedReader; @@ -31,7 +31,7 @@ */ public class RemoteSynonymFile implements SynonymFile { - public static Logger logger = Loggers.getLogger("dynamic-synonym"); + private final Logger logger = LogManager.getLogger(getClass()); private CloseableHttpClient httpclient = HttpClients.createDefault(); From 0d7187897c454c43a34f4766c4853f2b4c6a1d0e Mon Sep 17 00:00:00 2001 From: "yingjie.cai" Date: Mon, 25 Feb 2019 17:51:44 +0800 Subject: [PATCH 12/12] getLogger --- .../plugin/synonym/analysis/DynamicSynonymFilter.java | 2 +- .../synonym/analysis/DynamicSynonymGraphTokenFilterFactory.java | 2 +- .../synonym/analysis/DynamicSynonymTokenFilterFactory.java | 2 +- .../elasticsearch/plugin/synonym/analysis/LocalSynonymFile.java | 2 +- .../plugin/synonym/analysis/RemoteSynonymFile.java | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymFilter.java b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymFilter.java index d780de5..ce77f13 100644 --- a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymFilter.java +++ b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymFilter.java @@ -102,7 +102,7 @@ public final class DynamicSynonymFilter extends TokenFilter { - private final Logger logger = LogManager.getLogger(getClass()); + private final Logger logger = LogManager.getLogger("dynamic-synonym"); public static final String TYPE_SYNONYM = "SYNONYM"; diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymGraphTokenFilterFactory.java b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymGraphTokenFilterFactory.java index 1ac920b..7912acc 100644 --- a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymGraphTokenFilterFactory.java +++ b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymGraphTokenFilterFactory.java @@ -31,7 +31,7 @@ public class DynamicSynonymGraphTokenFilterFactory extends AbstractTokenFilterFactory { - private final Logger logger = LogManager.getLogger(getClass()); + private final Logger logger = LogManager.getLogger("dynamic-synonym"); /** * 静态的id生成器 diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java index c774eda..8a7b376 100644 --- a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java +++ b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java @@ -31,7 +31,7 @@ public class DynamicSynonymTokenFilterFactory extends AbstractTokenFilterFactory { - private final Logger logger = LogManager.getLogger(getClass()); + private final Logger logger = LogManager.getLogger("dynamic-synonym"); /** * 静态的id生成器 */ diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/LocalSynonymFile.java b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/LocalSynonymFile.java index 8a8195a..c659043 100644 --- a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/LocalSynonymFile.java +++ b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/LocalSynonymFile.java @@ -22,7 +22,7 @@ */ public class LocalSynonymFile implements SynonymFile { - private final Logger logger = LogManager.getLogger(getClass()); + private final Logger logger = LogManager.getLogger("dynamic-synonym"); private String format; diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/RemoteSynonymFile.java b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/RemoteSynonymFile.java index 69894a0..05cc6dd 100644 --- a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/RemoteSynonymFile.java +++ b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/RemoteSynonymFile.java @@ -31,7 +31,7 @@ */ public class RemoteSynonymFile implements SynonymFile { - private final Logger logger = LogManager.getLogger(getClass()); + private final Logger logger = LogManager.getLogger("dynamic-synonym"); private CloseableHttpClient httpclient = HttpClients.createDefault();