From 76ae8222ea26215431302fb1c0fa183c0a859572 Mon Sep 17 00:00:00 2001 From: robert-bor Date: Tue, 22 Sep 2015 22:10:19 +0200 Subject: [PATCH] issue #12 adopted the suggestion by yim1990 with a small change, so that the keyword emit is lowercased as well --- src/main/java/org/ahocorasick/trie/Trie.java | 5 ++++- .../java/org/ahocorasick/trie/TrieTest.java | 19 ++++++++++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java index d31d52f..8c8b58f 100644 --- a/src/main/java/org/ahocorasick/trie/Trie.java +++ b/src/main/java/org/ahocorasick/trie/Trie.java @@ -33,9 +33,12 @@ private void addKeyword(String keyword) { } State currentState = this.rootState; for (Character character : keyword.toCharArray()) { + if (trieConfig.isCaseInsensitive()) { + character = Character.toLowerCase(character); + } currentState = currentState.addState(character); } - currentState.addEmit(keyword); + currentState.addEmit(trieConfig.isCaseInsensitive() ? keyword.toLowerCase() : keyword); } public Collection tokenize(String text) { diff --git a/src/test/java/org/ahocorasick/trie/TrieTest.java b/src/test/java/org/ahocorasick/trie/TrieTest.java index 063f177..f4d3a6c 100644 --- a/src/test/java/org/ahocorasick/trie/TrieTest.java +++ b/src/test/java/org/ahocorasick/trie/TrieTest.java @@ -106,7 +106,24 @@ public void ushersTest() { checkEmit(iterator.next(), 2, 5, "hers"); } - @Test + @Test + public void ushersTestWithCapitalKeywords() { + Trie trie = Trie.builder() + .caseInsensitive() + .addKeyword("HERS") + .addKeyword("HIS") + .addKeyword("SHE") + .addKeyword("HE") + .build(); + Collection emits = trie.parseText("ushers"); + assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5 + Iterator iterator = emits.iterator(); + checkEmit(iterator.next(), 2, 3, "he"); + checkEmit(iterator.next(), 1, 3, "she"); + checkEmit(iterator.next(), 2, 5, "hers"); + } + + @Test public void ushersTestFirstMatch() { Trie trie = Trie.builder() .addKeyword("hers")