Skip to content

Commit

Permalink
pull #14 implemented pull request by rripken for containsMatch and firstMatch
Browse files Browse the repository at this point in the history
  • Loading branch information
robert-bor committed Sep 22, 2015
1 parent 4633b1b commit e2c5334
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 66 deletions.
36 changes: 9 additions & 27 deletions src/main/java/org/ahocorasick/trie/Trie.java
Original file line number Diff line number Diff line change
Expand Up @@ -85,11 +85,11 @@ public Collection<Emit> parseText(CharSequence text) {
return collectedEmits;
}

public boolean matches(String text)
{
/**
 * Reports whether the given text contains at least one match.
 *
 * @param text the text to search
 * @return {@code true} when {@code firstMatch(text)} yields a non-null emit,
 *         {@code false} otherwise
 */
public boolean containsMatch(CharSequence text) {
    return firstMatch(text) != null;
}

public void parseText(CharSequence text, EmitHandler emitHandler) {
State currentState = this.rootState;
for (int position = 0; position < text.length(); position++) {
Expand All @@ -105,33 +105,18 @@ public void parseText(CharSequence text, EmitHandler emitHandler) {

}

private void removePartialMatches(CharSequence searchText, List<Emit> collectedEmits) {
long size = searchText.length();
List<Emit> removeEmits = new ArrayList<Emit>();
for (Emit emit : collectedEmits) {
if ((emit.getStart() == 0 ||
!Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) &&
(emit.getEnd() + 1 == size ||
!Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1)))) {
continue;
}
removeEmits.add(emit);
}

public Emit firstMatch(String text)
{
public Emit firstMatch(CharSequence text) {
if (!trieConfig.isAllowOverlaps()) {
// Slow path. Needs to find all the matches to detect overlaps.
Collection<Emit> parseText = parseText(text);
if (parseText != null && !parseText.isEmpty()) {
return parseText.iterator().next();
}
} else {
// Fast path. Returs first match found.
checkForConstructedFailureStates();
int position = 0;
// Fast path. Returns first match found.
State currentState = this.rootState;
for (Character character : text.toCharArray()) {
for (int position = 0; position < text.length(); position++) {
Character character = text.charAt(position);
if (trieConfig.isCaseInsensitive()) {
character = Character.toLowerCase(character);
}
Expand All @@ -149,23 +134,20 @@ public Emit firstMatch(String text)
}
}
}
position++;
}
}
return null;
}

private boolean isPartialMatch(String searchText, Emit emit)
{
/**
 * Tells whether an emit is directly adjacent to an alphabetic character in the
 * searched text, on either side.
 *
 * @param searchText the text the emit was found in
 * @param emit       the emit whose start and end indices are inspected
 * @return {@code true} if the character just before the emit's start or just
 *         after the emit's end exists and is alphabetic; {@code false} otherwise
 */
private boolean isPartialMatch(CharSequence searchText, Emit emit) {
    // Left neighbour: only exists when the emit does not begin at index 0.
    final int startIndex = emit.getStart();
    if (startIndex != 0 && Character.isAlphabetic(searchText.charAt(startIndex - 1))) {
        return true;
    }

    // Right neighbour: only exists when the emit does not end on the last character.
    final int indexAfterEnd = emit.getEnd() + 1;
    return indexAfterEnd != searchText.length()
            && Character.isAlphabetic(searchText.charAt(indexAfterEnd));
}

private void removePartialMatches(String searchText, List<Emit> collectedEmits)
{
List<Emit> removeEmits = new ArrayList<Emit>();
private void removePartialMatches(CharSequence searchText, List<Emit> collectedEmits) {
List<Emit> removeEmits = new ArrayList<>();
for (Emit emit : collectedEmits) {
if (isPartialMatch(searchText, emit)) {
removeEmits.add(emit);
Expand Down
104 changes: 65 additions & 39 deletions src/test/java/org/ahocorasick/trie/TrieTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import java.util.List;

import static junit.framework.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

public class TrieTest {

Expand All @@ -24,8 +25,9 @@ public void keywordAndTextAreTheSame() {

@Test
public void keywordAndTextAreTheSameFirstMatch() {
Trie trie = new Trie();
trie.addKeyword("abc");
Trie trie = Trie.builder()
.addKeyword("abc")
.build();
Emit firstMatch = trie.firstMatch("abc");
checkEmit(firstMatch, 0, 2, "abc");
}
Expand All @@ -42,8 +44,9 @@ public void textIsLongerThanKeyword() {

@Test
public void textIsLongerThanKeywordFirstMatch() {
Trie trie = new Trie();
trie.addKeyword("abc");
Trie trie = Trie.builder()
.addKeyword("abc")
.build();
Emit firstMatch = trie.firstMatch(" abc");
checkEmit(firstMatch, 1, 3, "abc");
}
Expand All @@ -62,10 +65,11 @@ public void variousKeywordsOneMatch() {

@Test
public void variousKeywordsFirstMatch() {
Trie trie = new Trie();
trie.addKeyword("abc");
trie.addKeyword("bcd");
trie.addKeyword("cde");
Trie trie = Trie.builder()
.addKeyword("abc")
.addKeyword("bcd")
.addKeyword("cde")
.build();
Emit firstMatch = trie.firstMatch("bcd");
checkEmit(firstMatch, 0, 2, "bcd");
}
Expand Down Expand Up @@ -104,11 +108,12 @@ public void ushersTest() {

@Test
public void ushersTestFirstMatch() {
Trie trie = new Trie();
trie.addKeyword("hers");
trie.addKeyword("his");
trie.addKeyword("she");
trie.addKeyword("he");
Trie trie = Trie.builder()
.addKeyword("hers")
.addKeyword("his")
.addKeyword("she")
.addKeyword("he")
.build();
Emit firstMatch = trie.firstMatch("ushers");
checkEmit(firstMatch, 2, 3, "he");
}
Expand Down Expand Up @@ -150,8 +155,9 @@ public void misleadingTest() {

@Test
public void misleadingTestFirstMatch() {
Trie trie = new Trie();
trie.addKeyword("hers");
Trie trie = Trie.builder()
.addKeyword("hers")
.build();
Emit firstMatch = trie.firstMatch("h he her hers");
checkEmit(firstMatch, 9, 12, "hers");
}
Expand All @@ -174,21 +180,23 @@ public void recipes() {

@Test
public void recipesFirstMatch() {
Trie trie = new Trie();
trie.addKeyword("veal");
trie.addKeyword("cauliflower");
trie.addKeyword("broccoli");
trie.addKeyword("tomatoes");
Trie trie = Trie.builder()
.addKeyword("veal")
.addKeyword("cauliflower")
.addKeyword("broccoli")
.addKeyword("tomatoes")
.build();
Emit firstMatch = trie.firstMatch("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli");

checkEmit(firstMatch, 2, 12, "cauliflower");
}

@Test
public void longAndShortOverlappingMatch() {
Trie trie = new Trie();
trie.addKeyword("he");
trie.addKeyword("hehehehe");
Trie trie = Trie.builder()
.addKeyword("he")
.addKeyword("hehehehe")
.build();
Collection<Emit> emits = trie.parseText("hehehehehe");
Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 0, 1, "he");
Expand All @@ -215,17 +223,28 @@ public void nonOverlapping() {
checkEmit(iterator.next(), 6, 7, "ab");
}

@Test
@Test
public void nonOverlappingFirstMatch() {
Trie trie = new Trie().removeOverlaps();
trie.addKeyword("ab");
trie.addKeyword("cba");
trie.addKeyword("ababc");
Trie trie = Trie.builder().removeOverlaps()
.addKeyword("ab")
.addKeyword("cba")
.addKeyword("ababc")
.build();
Emit firstMatch = trie.firstMatch("ababcbab");

checkEmit(firstMatch, 0, 4, "ababc");
}

@Test
public void containsMatch() {
    // The input contains the keywords "ab", "cba" and "ababc", so a match must be reported.
    final String text = "ababcbab";
    Trie trie = Trie.builder()
            .removeOverlaps()
            .addKeyword("ab")
            .addKeyword("cba")
            .addKeyword("ababc")
            .build();
    boolean matchFound = trie.containsMatch(text);
    assertTrue(matchFound);
}

@Test
public void startOfChurchillSpeech() {
Trie trie = Trie.builder().removeOverlaps()
Expand All @@ -246,7 +265,8 @@ public void startOfChurchillSpeech() {

@Test
public void partialMatch() {
Trie trie = Trie.builder().onlyWholeWords()
Trie trie = Trie.builder()
.onlyWholeWords()
.addKeyword("sugar")
.build();
Collection<Emit> emits = trie.parseText("sugarcane sugarcane sugar canesugar"); // left, middle, right test
Expand All @@ -256,8 +276,10 @@ public void partialMatch() {

@Test
public void partialMatchFirstMatch() {
Trie trie = new Trie().onlyWholeWords();
trie.addKeyword("sugar");
Trie trie = Trie.builder()
.onlyWholeWords()
.addKeyword("sugar")
.build();
Emit firstMatch = trie.firstMatch("sugarcane sugarcane sugar canesugar"); // left, middle, right test

checkEmit(firstMatch, 20, 24, "sugar");
Expand Down Expand Up @@ -318,11 +340,12 @@ public void caseInsensitive() {

@Test
public void caseInsensitiveFirstMatch() {
Trie trie = new Trie().caseInsensitive();
trie.addKeyword("turning");
trie.addKeyword("once");
trie.addKeyword("again");
trie.addKeyword("börkü");
Trie trie = Trie.builder().caseInsensitive()
.addKeyword("turning")
.addKeyword("once")
.addKeyword("again")
.addKeyword("börkü")
.build();
Emit firstMatch = trie.firstMatch("TurninG OnCe AgAiN BÖRKÜ");

checkEmit(firstMatch, 0, 6, "turning");
Expand Down Expand Up @@ -365,9 +388,12 @@ public void unicodeIssueBug8ReportedByDwyerk() {
@Test
public void unicodeIssueBug8ReportedByDwyerkFirstMatch() {
String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char
Trie trie = new Trie().caseInsensitive().onlyWholeWords();
assertEquals("THIS", target.substring(5,9)); // Java does it the right way
trie.addKeyword("this");
Trie trie = Trie.builder()
.caseInsensitive()
.onlyWholeWords()
.addKeyword("this")
.build();
assertEquals("THIS", target.substring(5, 9)); // Java does it the right way
Emit firstMatch = trie.firstMatch(target);
checkEmit(firstMatch, 5, 8, "this");
}
Expand Down

0 comments on commit e2c5334

Please sign in to comment.