Skip to content

Commit

Permalink
#176 Working on removing lucene filters.
Browse files Browse the repository at this point in the history
  • Loading branch information
jzonthemtn committed Dec 16, 2024
1 parent f001b10 commit 96238d3
Show file tree
Hide file tree
Showing 13 changed files with 79 additions and 62 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -192,12 +192,12 @@ public static Policy getPdfPolicy(String policyName) throws IOException {
customDictionaryFilterStrategy.setStrategy("REDACT");

CustomDictionary customDictionary = new CustomDictionary();
customDictionary.setCustomDictionaryFilterStrategies(Arrays.asList(customDictionaryFilterStrategy));
customDictionary.setTerms(Arrays.asList("Wendy"));
customDictionary.setCustomDictionaryFilterStrategies(List.of(customDictionaryFilterStrategy));
customDictionary.setTerms(List.of("Wendy"));

Identifiers identifiers = new Identifiers();

identifiers.setCustomDictionaries(Arrays.asList(customDictionary));
identifiers.setCustomDictionaries(List.of(customDictionary));
identifiers.setZipCode(zipCode);

Policy policy = new Policy();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ public void pdf1() throws Exception {
final PhileasConfiguration phileasConfiguration = new PhileasConfiguration(properties);

PhileasFilterService service = new PhileasFilterService(phileasConfiguration);
final BinaryDocumentFilterResponse response = service.filter(Arrays.asList("pdf"), "context", "documentid", document, MimeType.APPLICATION_PDF, MimeType.APPLICATION_PDF);
final BinaryDocumentFilterResponse response = service.filter(List.of("pdf"), "context", "documentid", document, MimeType.APPLICATION_PDF, MimeType.APPLICATION_PDF);

// Write the byte array to a file.
final File outputFile = File.createTempFile("redact", ".pdf");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ public void filterCitiesLow() throws Exception {

showSpans(filterResult.getSpans());

Assertions.assertEquals(0, filterResult.getSpans().size());
Assertions.assertEquals(1, filterResult.getSpans().size());

}

Expand Down Expand Up @@ -139,8 +139,7 @@ public void filterCitiesHigh() throws Exception {

showSpans(filterResult.getSpans());

Assertions.assertEquals(1, filterResult.getSpans().size());
Assertions.assertTrue(checkSpan(filterResult.getSpans().get(0), 9, 17, FilterType.LOCATION_CITY));
Assertions.assertEquals(0, filterResult.getSpans().size());

}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ public void filterCountiesLow() throws Exception {

showSpans(filterResult.getSpans());

Assertions.assertEquals(0, filterResult.getSpans().size());
Assertions.assertEquals(3, filterResult.getSpans().size());

}

Expand All @@ -70,12 +70,13 @@ public void filterCountiesMedium() throws Exception {

final FuzzyDictionaryFilter filter = new FuzzyDictionaryFilter(FilterType.LOCATION_COUNTY, filterConfiguration, SensitivityLevel.MEDIUM);

FilterResult filterResult = filter.filter(getPolicy(), "context", "documentid", PIECE, "Lived in Fyette", attributes);
FilterResult filterResult = filter.filter(getPolicy(), "context", "documentid", PIECE, "He lived in Fyette", attributes);

showSpans(filterResult.getSpans());

Assertions.assertEquals(1, filterResult.getSpans().size());
Assertions.assertEquals("fyette", filterResult.getSpans().get(0).getText());
Assertions.assertEquals(2, filterResult.getSpans().size());
Assertions.assertEquals("Payette", filterResult.getSpans().get(0).getText());
Assertions.assertEquals("Fayette", filterResult.getSpans().get(1).getText());

}

Expand All @@ -95,7 +96,7 @@ public void filterCountiesHigh() throws Exception {

showSpans(filterResult.getSpans());

Assertions.assertEquals(3, filterResult.getSpans().size());
Assertions.assertEquals(0, filterResult.getSpans().size());

}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ public void filterDictionaryExactMatch() throws Exception {

Assertions.assertEquals(1, filterResult.getSpans().size());
Assertions.assertTrue(checkSpan(filterResult.getSpans().get(0), 14, 18, FilterType.CUSTOM_DICTIONARY));
Assertions.assertEquals("bill", filterResult.getSpans().get(0).getText());
Assertions.assertEquals("Bill", filterResult.getSpans().get(0).getText());

}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ public void filterMedium1() throws Exception {

final FilterResult filterResult = filter.filter(getPolicy(), "context", "documentid", PIECE, "Michel had eye cancer", attributes);
showSpans(filterResult.getSpans());
Assertions.assertEquals(26, filterResult.getSpans().size());
Assertions.assertEquals(20, filterResult.getSpans().size());

}

Expand All @@ -88,7 +88,7 @@ public void filterMedium2() throws Exception {

final FilterResult filterResult = filter.filter(getPolicy(), "context", "documentid", PIECE, "Jennifer had eye cancer", attributes);
showSpans(filterResult.getSpans());
Assertions.assertEquals(593, filterResult.getSpans().size());
Assertions.assertEquals(4, filterResult.getSpans().size());

}

Expand Down Expand Up @@ -125,7 +125,7 @@ public void filter1() throws Exception {
final FilterResult filterResult = filter.filter(getPolicy(), "context", "documentid", PIECE, "Melissa", attributes);

showSpans(filterResult.getSpans());
Assertions.assertEquals(3, filterResult.getSpans().size());
Assertions.assertEquals(33, filterResult.getSpans().size());

}

Expand All @@ -143,7 +143,7 @@ public void filter2() throws Exception {

final FilterResult filterResult = filter.filter(getPolicy(), "context", "documentid", PIECE,"thomas", attributes);
showSpans(filterResult.getSpans());
Assertions.assertEquals(9, filterResult.getSpans().size());
Assertions.assertEquals(1, filterResult.getSpans().size());

}

Expand All @@ -161,7 +161,7 @@ public void filter3() throws Exception {

final FilterResult filterResult = filter.filter(getPolicy(), "context", "documentid", PIECE,"dat", attributes);
showSpans(filterResult.getSpans());
Assertions.assertEquals(221, filterResult.getSpans().size());
Assertions.assertEquals(1, filterResult.getSpans().size());

}

Expand All @@ -179,7 +179,7 @@ public void filter4() throws Exception {

final FilterResult filterResult = filter.filter(getPolicy(), "context", "documentid", PIECE,"joie", attributes);
showSpans(filterResult.getSpans());
Assertions.assertEquals(243, filterResult.getSpans().size());
Assertions.assertEquals(1, filterResult.getSpans().size());

}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ public void filterStatesLow() throws Exception {

FilterResult filterResult = filter.filter(getPolicy(), "context", "documentid", PIECE,"Lived in Washington", attributes);
Assertions.assertEquals(1, filterResult.getSpans().size());
Assertions.assertEquals("washington", filterResult.getSpans().get(0).getText());
Assertions.assertEquals("Washington", filterResult.getSpans().get(0).getText());

}

Expand Down Expand Up @@ -86,7 +86,7 @@ public void filterStatesHigh() throws Exception {
final FuzzyDictionaryFilter filter = new FuzzyDictionaryFilter(FilterType.LOCATION_STATE, filterConfiguration, SensitivityLevel.HIGH);

FilterResult filterResult = filter.filter(getPolicy(), "context", "documentid", PIECE, "Lived in Wasinton", attributes);
Assertions.assertEquals(1, filterResult.getSpans().size());
Assertions.assertEquals(0, filterResult.getSpans().size());

}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ public void filter1() throws Exception {

final FilterResult filterResult = filter.filter(getPolicy(), "context", "documentid", PIECE, "Lived in Wshington", attributes);
showSpans(filterResult.getSpans());
Assertions.assertEquals(0, filterResult.getSpans().size());
Assertions.assertEquals(44, filterResult.getSpans().size());

}

Expand All @@ -70,7 +70,7 @@ public void filter2() throws Exception {

final FilterResult filterResult = filter.filter(getPolicy(), "context", "documentid", PIECE, "Lived in Wshington", attributes);
showSpans(filterResult.getSpans());
Assertions.assertEquals(2, filterResult.getSpans().size());
Assertions.assertEquals(44, filterResult.getSpans().size());

}

Expand Down Expand Up @@ -106,7 +106,7 @@ public void filter4() throws Exception {

final FilterResult filterResult = filter.filter(getPolicy(), "context", "documentid", PIECE, "date", attributes);
showSpans(filterResult.getSpans());
Assertions.assertEquals(0, filterResult.getSpans().size());
Assertions.assertEquals(1, filterResult.getSpans().size());

}

Expand All @@ -124,7 +124,7 @@ public void filter5() throws Exception {

final FilterResult filterResult = filter.filter(getPolicy(), "context", "documentid", PIECE, "Jones", attributes);
showSpans(filterResult.getSpans());
Assertions.assertEquals(1, filterResult.getSpans().size());
Assertions.assertEquals(349, filterResult.getSpans().size());

}

Expand All @@ -142,7 +142,7 @@ public void filter6() throws Exception {

final FilterResult filterResult = filter.filter(getPolicy(), "context", "documentid", PIECE, "from", attributes);
showSpans(filterResult.getSpans());
Assertions.assertEquals(0, filterResult.getSpans().size());
Assertions.assertEquals(1, filterResult.getSpans().size());

}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -228,37 +228,31 @@ public int getOccurrences(final Policy policy, final String input, final Map<Str
public Map<String, Position> getNgrams(String text, int n) {

final String delimiter = " ";
final Map<String, Position> ngrams = new HashMap<>();

final Map<String, Position> ngramsWithIndexes = new HashMap<>();
final String[] words = text.split(delimiter);

final Map<String, Integer> splitsWithIndexes = new HashMap<>();
int index = 0;
for (String token : words) {
splitsWithIndexes.put(token, index);
index += token.length() + delimiter.length();
}

final String[] keys = splitsWithIndexes.keySet().toArray(String[]::new);

for (int i = 0; i < splitsWithIndexes.size() - n; i++) {
for (int i = 0; i <= words.length - n; i++) {

final StringBuilder ngram = new StringBuilder();

for (int j = 0; j < n; j++) {
ngram.append(keys[i + j]);

ngram.append(words[i + j]);

if (j < n - 1) {
ngram.append(delimiter);
ngram.append(" ");
}

}

final int characterStart = splitsWithIndexes.get(keys[i]);
final int characterEnd = splitsWithIndexes.get(keys[i]) + ngram.length();
ngrams.put(ngram.toString(), new Position(characterStart, characterEnd));
int location = text.indexOf(ngram.toString());

ngramsWithIndexes.put(ngram.toString(), new Position(location, location + ngram.toString().length()));

}

return ngrams;
return ngramsWithIndexes;

}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,8 @@ public FilterResult filter(Policy policy, String context, String documentId, int
// Set the meta values for the span.
final boolean isIgnored = ignored.contains(ngram);

// TODO: Get the offsets.
final int characterStart = 0;
final int characterEnd = 0;
final int characterStart = ngrams.get(ngram).getStart();
final int characterEnd = ngrams.get(ngram).getEnd();
final double confidence = 1.0;
final String[] window = getWindow(text, characterStart, characterEnd);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ public class FuzzyDictionaryFilter extends DictionaryFilter implements Serializa

private final SensitivityLevel sensitivityLevel;
private final Map<String, Pattern> dictionary;
private final boolean requireCapitalization = true;

public FuzzyDictionaryFilter(final FilterType filterType, final FilterConfiguration filterConfiguration,
final SensitivityLevel sensitivityLevel) throws IOException {
Expand All @@ -47,10 +48,10 @@ public FilterResult filter(Policy policy, String context, String documentId, int
// Build ngrams from the input text.
final Map<Integer, Map<String, Position>> ngrams = new HashMap<>();
ngrams.put(0, splitWithIndexes(input, " "));
ngrams.put(1, getNgrams(input, 1));
ngrams.put(2, getNgrams(input, 2));
ngrams.put(3, getNgrams(input, 3));
ngrams.put(4, getNgrams(input, 4));

for(int x = 1; x < 10; x++) {
ngrams.put(x, getNgrams(input, x));
}

for(final String entry : dictionary.keySet()) {

Expand All @@ -68,15 +69,27 @@ public FilterResult filter(Policy policy, String context, String documentId, int
// Compare string distance between word and ngrams.
for (final String ngram : ngrams.get(spacesInEntry).keySet()) {

final LevenshteinDistance levenshteinDistance = LevenshteinDistance.getDefaultInstance();
final int distance = levenshteinDistance.apply(entry, ngram);
//LOGGER.info("{}, {}, {}", entry, ngram, distance);
if (sensitivityLevel == SensitivityLevel.HIGH && distance <= 1) {
spans.add(createSpan(input, 0, input.length(), 1.0, context, documentId, entry, policy, attributes));
} else if (sensitivityLevel == SensitivityLevel.MEDIUM && distance <= 2) {
spans.add(createSpan(input, 0, input.length(), 1.0, context, documentId, entry, policy, attributes));
} else if (sensitivityLevel == SensitivityLevel.LOW && distance <= 3) {
spans.add(createSpan(input, 0, input.length(), 1.0, context, documentId, entry, policy, attributes));
if(ngram.length() > 2) {

if (requireCapitalization && Character.isUpperCase(ngram.charAt(0))) {

final int start = ngrams.get(spacesInEntry).get(ngram).getStart();
final int end = ngrams.get(spacesInEntry).get(ngram).getEnd();

final LevenshteinDistance levenshteinDistance = LevenshteinDistance.getDefaultInstance();
final int distance = levenshteinDistance.apply(entry, ngram);

if (sensitivityLevel == SensitivityLevel.HIGH && distance < 1) {
spans.add(createSpan(input, start, end, 1.0, context, documentId, entry, policy, attributes));
} else if (sensitivityLevel == SensitivityLevel.MEDIUM && distance <= 2) {
spans.add(createSpan(input, start, end, 1.0, context, documentId, entry, policy, attributes));
//LOGGER.info("{}, {}, {}", entry, ngram, distance);
} else if (sensitivityLevel == SensitivityLevel.LOW && distance < 3) {
spans.add(createSpan(input, start, end, 1.0, context, documentId, entry, policy, attributes));
}

}

}

}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ public static Span make(int characterStart, int characterEnd, FilterType filterT

// This is made here and not passed into the constructor because that would be redundant
// given the characterStart and characterEnd parameters in the constructor.
span.range = Range.between(characterStart, characterEnd);
span.range = Range.of(characterStart, characterEnd);

return span;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,19 @@
import javax.imageio.ImageWriteParam;
import javax.imageio.ImageWriter;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.*;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

Expand Down

0 comments on commit 96238d3

Please sign in to comment.