#176 Updating tests after replacing the Lucene filter.
jzonthemtn committed Dec 16, 2024
1 parent f898301 commit 000ad53
Showing 6 changed files with 55 additions and 24 deletions.
Changed file 1 of 6

@@ -68,9 +68,9 @@ public FilterResult filter(Policy policy, String context, String documentId, int
         final List<Span> spans = new LinkedList<>();
 
         // TODO: Get ngrams from max to size 1.
-        final Map<String, Position> ngrams = getNgrams(input, 1);
+        final Map<Position, String> ngrams = getNgrams(input, 1);
 
-        for(final String candidate : ngrams.keySet()) {
+        for(final String candidate : ngrams.values()) {
 
             if (endsWithPostNominal(candidate) || startsWithPreNominal(candidate)) {
 
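Why the key flip matters (an illustrative sketch, not part of the commit): a HashMap keyed by the n-gram text can hold only one Position per distinct string, so a candidate that appears twice in the input silently keeps only one of its offsets; keying by Position (and reading the candidate text via values() or get()) preserves every occurrence. The Position record below is a stand-in for Phileas's Position class and assumes the real class implements equals() and hashCode().

import java.util.HashMap;
import java.util.Map;

public class DuplicateNgramDemo {

    // Stand-in for Phileas's Position class; the real class would need
    // equals() and hashCode() for Position-keyed maps to behave this way.
    record Position(int start, int end) { }

    public static void main(String[] args) {

        // Two occurrences of the same candidate at different offsets,
        // matching the offsets asserted in the new test below.
        final Position first = new Position(14, 26);
        final Position second = new Position(31, 43);

        // Keyed by the n-gram text: the second put() overwrites the first,
        // so one occurrence is lost.
        final Map<String, Position> byText = new HashMap<>();
        byText.put("george jones", first);
        byText.put("george jones", second);
        System.out.println(byText.size()); // 1

        // Keyed by Position: both occurrences survive with their own offsets.
        final Map<Position, String> byPosition = new HashMap<>();
        byPosition.put(first, "george jones");
        byPosition.put(second, "george jones");
        System.out.println(byPosition.size()); // 2
    }
}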
Changed file 2 of 6

@@ -19,6 +19,7 @@
 import ai.philterd.phileas.model.filter.FilterConfiguration;
 import ai.philterd.phileas.model.filter.rules.dictionary.BloomFilterDictionaryFilter;
 import ai.philterd.phileas.model.objects.FilterResult;
+import ai.philterd.phileas.model.objects.Span;
 import ai.philterd.phileas.model.policy.filters.strategies.custom.CustomDictionaryFilterStrategy;
 import ai.philterd.phileas.model.services.AlertService;
 import ai.philterd.phileas.services.anonymization.AlphanumericAnonymizationService;
@@ -157,4 +158,31 @@ public void filterDictionaryPhraseMatch2() throws Exception {
 
     }
 
+    @Test
+    public void filterDictionaryPhraseMatchMultipleMatches() throws Exception {
+
+        final FilterConfiguration filterConfiguration = new FilterConfiguration.FilterConfigurationBuilder()
+                .withStrategies(List.of(new CustomDictionaryFilterStrategy()))
+                .withAlertService(alertService)
+                .withAnonymizationService(new AlphanumericAnonymizationService(new LocalAnonymizationCacheService()))
+                .withWindowSize(windowSize)
+                .build();
+
+        final Set<String> names = new HashSet<>(Arrays.asList("george jones", "ted", "bill", "john"));
+        final BloomFilterDictionaryFilter filter = new BloomFilterDictionaryFilter(FilterType.CUSTOM_DICTIONARY, filterConfiguration, names, "none");
+
+        final FilterResult filterResult = filter.filter(getPolicy(), "context", "documentid", PIECE,"He lived with george jones and george jones in California.", attributes);
+
+        showSpans(filterResult.getSpans());
+
+        Assertions.assertEquals(2, filterResult.getSpans().size());
+
+        for(final Span span : filterResult.getSpans()) {
+            Assertions.assertEquals("george jones", span.getText());
+            Assertions.assertTrue(span.getCharacterStart() == 31 || span.getCharacterStart() == 14);
+            Assertions.assertTrue(span.getCharacterEnd() == 43 || span.getCharacterEnd() == 26);
+        }
+
+    }
+
 }
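The offsets asserted in the new test can be checked by hand with plain JDK string arithmetic; the following standalone sketch (independent of the Phileas API) derives the expected start/end pairs for both occurrences of "george jones" in the test sentence.

public class OffsetCheck {

    public static void main(String[] args) {

        final String text = "He lived with george jones and george jones in California.";
        final String ngram = "george jones";

        // First occurrence.
        final int firstStart = text.indexOf(ngram);                  // 14
        final int firstEnd = firstStart + ngram.length();            // 26

        // Second occurrence: search again past the first match.
        final int secondStart = text.indexOf(ngram, firstStart + 1); // 31
        final int secondEnd = secondStart + ngram.length();          // 43

        System.out.println(firstStart + ".." + firstEnd);   // 14..26
        System.out.println(secondStart + ".." + secondEnd); // 31..43
    }
}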
Changed file 3 of 6

@@ -38,7 +38,6 @@
 import java.io.File;
 import java.io.IOException;
 import java.nio.charset.Charset;
-import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.LinkedHashSet;
 import java.util.LinkedList;
@@ -342,15 +341,15 @@ public static List<? extends AbstractFilterStrategy> getIdentifierFilterStrategi
 
     }
 
-    public Map<String, Position> splitWithIndexes(String text, String delimiter) {
+    public Map<Position, String> splitWithIndexes(String text, String delimiter) {
 
-        final Map<String, Position> splitsWithIndexes = new HashMap<>();
-        List<String> result = new ArrayList<>();
+        final Map<Position, String> splitsWithIndexes = new HashMap<>();
+
         String[] tokens = text.split(delimiter);
 
         int index = 0;
         for (String token : tokens) {
-            splitsWithIndexes.put(token, new Position(index, index + token.length()));
+            splitsWithIndexes.put(new Position(index, index + token.length()), token);
             index += token.length() + delimiter.length();
         }
 
Changed file 4 of 6

@@ -18,7 +18,6 @@
 import ai.philterd.phileas.model.enums.FilterType;
 import ai.philterd.phileas.model.filter.Filter;
 import ai.philterd.phileas.model.filter.FilterConfiguration;
-import ai.philterd.phileas.model.filter.rules.regex.RegexFilter;
 import ai.philterd.phileas.model.objects.Analyzer;
 import ai.philterd.phileas.model.objects.ConfidenceModifier;
 import ai.philterd.phileas.model.objects.FilterPattern;
@@ -29,8 +28,6 @@
 import org.apache.commons.collections4.CollectionUtils;
 import org.apache.commons.lang3.StringUtils;
 
-import java.util.ArrayList;
-import java.util.Collections;
 import java.util.HashMap;
 import java.util.LinkedList;
 import java.util.List;
@@ -225,12 +222,13 @@ public int getOccurrences(final Policy policy, final String input, final Map<Str
 
     }
 
-    public Map<String, Position> getNgrams(String text, int n) {
+    public Map<Position, String> getNgrams(String text, int n) {
 
         final String delimiter = " ";
 
-        final Map<String, Position> ngramsWithIndexes = new HashMap<>();
+        final Map<Position, String> ngramsWithIndexes = new HashMap<>();
         final String[] words = text.split(delimiter);
+        int lastLocation = 0;
 
         for (int i = 0; i <= words.length - n; i++) {
 
@@ -246,9 +244,12 @@ public Map<String, Position> getNgrams(String text, int n) {
 
            }
 
-            int location = text.indexOf(ngram.toString());
+            int newLocation = text.indexOf(ngram.toString(), lastLocation);
+            lastLocation = newLocation;
 
-            ngramsWithIndexes.put(ngram.toString(), new Position(location, location + ngram.toString().length()));
+            final Position position = new Position(newLocation, newLocation + ngram.toString().length());
+
+            ngramsWithIndexes.put(position, ngram.toString());
 
        }
 
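The getNgrams() change above searches from lastLocation so that a repeated n-gram resolves to its own occurrence rather than indexOf() always returning the first match. A simplified, self-contained sketch of that idea for single-word n-grams follows; the Position record is again a stand-in for Phileas's Position class, not the library type.

import java.util.LinkedHashMap;
import java.util.Map;

public class PositionTrackingDemo {

    // Stand-in for Phileas's Position class.
    record Position(int start, int end) { }

    public static void main(String[] args) {

        final String text = "ted and ted";
        final Map<Position, String> ngrams = new LinkedHashMap<>();

        int lastLocation = 0;
        for (final String word : text.split(" ")) {
            // Searching from lastLocation keeps the repeated "ted" from
            // resolving to the first occurrence's offset again.
            final int location = text.indexOf(word, lastLocation);
            lastLocation = location;
            ngrams.put(new Position(location, location + word.length()), word);
        }

        System.out.println(ngrams);
        // {Position[start=0, end=3]=ted, Position[start=4, end=7]=and, Position[start=8, end=11]=ted}
    }
}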
Changed file 5 of 6

@@ -84,14 +84,16 @@ public FilterResult filter(Policy policy, String context, String documentId, int
 
         final List<Span> spans = new LinkedList<>();
 
-        final Map<String, Position> ngrams = new HashMap<>();
+        final Map<Position, String> ngrams = new HashMap<>();
 
         // Get ngrams from max to size 1.
-        for(int i = 0; i <= maxNgramSize; i++) {
+        for(int i = 1; i <= maxNgramSize; i++) {
             ngrams.putAll(getNgrams(text, i));
         }
 
-        for(final String ngram : ngrams.keySet()) {
+        for(final Position position : ngrams.keySet()) {
+
+            final String ngram = ngrams.get(position);
 
             if (bloomFilter.mightContain(ngram.toLowerCase())) {
 
@@ -100,8 +102,8 @@ public FilterResult filter(Policy policy, String context, String documentId, int
                 // Set the meta values for the span.
                 final boolean isIgnored = ignored.contains(ngram);
 
-                final int characterStart = ngrams.get(ngram).getStart();
-                final int characterEnd = ngrams.get(ngram).getEnd();
+                final int characterStart = position.getStart();
+                final int characterEnd = position.getEnd();
                 final double confidence = 1.0;
                 final String[] window = getWindow(text, characterStart, characterEnd);
 
Changed file 6 of 6

@@ -16,7 +16,6 @@
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Serializable;
-import java.util.Arrays;
 import java.util.HashMap;
 import java.util.LinkedList;
 import java.util.List;
@@ -46,7 +45,7 @@ public FilterResult filter(Policy policy, String context, String documentId, int
         if(policy.getIdentifiers().hasFilter(filterType)) {
 
             // Build ngrams from the input text.
-            final Map<Integer, Map<String, Position>> ngrams = new HashMap<>();
+            final Map<Integer, Map<Position, String>> ngrams = new HashMap<>();
             ngrams.put(0, splitWithIndexes(input, " "));
 
             final int maxNgrams;
@@ -74,15 +73,17 @@ public FilterResult filter(Policy policy, String context, String documentId, int
             // Fuzzy matches.
             final int spacesInEntry = StringUtils.countMatches(entry, " ");
 
+            for(final Position position : ngrams.get(spacesInEntry).keySet()) {
+
                 // Compare string distance between word and ngrams.
-            for (final String ngram : ngrams.get(spacesInEntry).keySet()) {
+                final String ngram = ngrams.get(spacesInEntry).get(position);
 
                 if(ngram.length() > 2) {
 
                     if (requireCapitalization && Character.isUpperCase(ngram.charAt(0))) {
 
-                        final int start = ngrams.get(spacesInEntry).get(ngram).getStart();
-                        final int end = ngrams.get(spacesInEntry).get(ngram).getEnd();
+                        final int start = position.getStart();
+                        final int end = position.getEnd();
 
                         final LevenshteinDistance levenshteinDistance = LevenshteinDistance.getDefaultInstance();
                         final int distance = levenshteinDistance.apply(entry, ngram);
