#176 Working on removing lucene filters.
jzonthemtn committed Dec 16, 2024
1 parent 7319c2a commit f001b10
Showing 7 changed files with 118 additions and 97 deletions.
@@ -21,6 +21,7 @@
import ai.philterd.phileas.model.objects.Analyzer;
import ai.philterd.phileas.model.objects.FilterPattern;
import ai.philterd.phileas.model.objects.FilterResult;
import ai.philterd.phileas.model.objects.Position;
import ai.philterd.phileas.model.objects.Span;
import ai.philterd.phileas.model.policy.Policy;

@@ -67,9 +68,9 @@ public FilterResult filter(Policy policy, String context, String documentId, int
final List<Span> spans = new LinkedList<>();

// TODO: Get ngrams from max to size 1.
final List<String> ngrams = getNgrams(input, 1);
final Map<String, Position> ngrams = getNgrams(input, 1);

for(final String candidate : ngrams) {
for(final String candidate : ngrams.keySet()) {

if (endsWithPostNominal(candidate) || startsWithPreNominal(candidate)) {

@@ -22,7 +22,6 @@
import ai.philterd.phileas.model.objects.FilterResult;
import ai.philterd.phileas.model.policy.filters.strategies.dynamic.CountyFilterStrategy;
import ai.philterd.phileas.model.services.AlertService;
import ai.philterd.phileas.model.services.AnonymizationService;
import ai.philterd.phileas.services.anonymization.CountyAnonymizationService;
import ai.philterd.phileas.services.anonymization.cache.LocalAnonymizationCacheService;
import org.apache.logging.log4j.LogManager;
@@ -83,8 +82,6 @@ public void filterCountiesMedium() throws Exception {
@Test
public void filterCountiesHigh() throws Exception {

AnonymizationService anonymizationService = new CountyAnonymizationService(new LocalAnonymizationCacheService());

final FilterConfiguration filterConfiguration = new FilterConfiguration.FilterConfigurationBuilder()
.withStrategies(List.of(new CountyFilterStrategy()))
.withAlertService(alertService)
@@ -16,11 +16,15 @@
package ai.philterd.phileas.model.filter;

import ai.philterd.phileas.model.enums.FilterType;
import ai.philterd.phileas.model.objects.*;
import ai.philterd.phileas.model.objects.FilterPattern;
import ai.philterd.phileas.model.objects.FilterResult;
import ai.philterd.phileas.model.objects.Position;
import ai.philterd.phileas.model.objects.Replacement;
import ai.philterd.phileas.model.objects.Span;
import ai.philterd.phileas.model.policy.Crypto;
import ai.philterd.phileas.model.policy.FPE;
import ai.philterd.phileas.model.policy.Policy;
import ai.philterd.phileas.model.policy.IgnoredPattern;
import ai.philterd.phileas.model.policy.Policy;
import ai.philterd.phileas.model.policy.filters.Identifier;
import ai.philterd.phileas.model.policy.filters.strategies.AbstractFilterStrategy;
import ai.philterd.phileas.model.services.AlertService;
@@ -34,7 +38,13 @@
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

public abstract class Filter {
@@ -332,72 +342,19 @@ public static List<? extends AbstractFilterStrategy> getIdentifierFilterStrategi

}

public static List<? extends AbstractFilterStrategy> getFilterStrategies(final Policy policy,
final FilterType filterType,
final int index) {

LOGGER.debug("Getting filter strategies for filter type {}", filterType.getType());

if(filterType == FilterType.AGE) {
return policy.getIdentifiers().getAge().getAgeFilterStrategies();
} else if(filterType == FilterType.BITCOIN_ADDRESS) {
return policy.getIdentifiers().getBitcoinAddress().getBitcoinFilterStrategies();
} else if(filterType == FilterType.CREDIT_CARD) {
return policy.getIdentifiers().getCreditCard().getCreditCardFilterStrategies();
} else if(filterType == FilterType.CUSTOM_DICTIONARY) {
// There can be multiple custom dictionaries in the policy.
// The index is used to determine which one is the appropriate one.
return policy.getIdentifiers().getCustomDictionaries().get(index).getCustomDictionaryFilterStrategies();
} else if(filterType == FilterType.DATE) {
return policy.getIdentifiers().getDate().getDateFilterStrategies();
} else if(filterType == FilterType.DRIVERS_LICENSE_NUMBER) {
return policy.getIdentifiers().getDriversLicense().getDriversLicenseFilterStrategies();
} else if(filterType == FilterType.EMAIL_ADDRESS) {
return policy.getIdentifiers().getEmailAddress().getEmailAddressFilterStrategies();
} else if(filterType == FilterType.IBAN_CODE) {
return policy.getIdentifiers().getIbanCode().getIbanCodeFilterStrategies();
} else if(filterType == FilterType.IP_ADDRESS) {
return policy.getIdentifiers().getIpAddress().getIpAddressFilterStrategies();
} else if(filterType == FilterType.PERSON) {
return policy.getIdentifiers().getPhEye().getPhEyeFilterStrategies();
} else if(filterType == FilterType.PASSPORT_NUMBER) {
return policy.getIdentifiers().getPassportNumber().getPassportNumberFilterStrategies();
} else if(filterType == FilterType.PHONE_NUMBER) {
return policy.getIdentifiers().getPhoneNumber().getPhoneNumberFilterStrategies();
} else if(filterType == FilterType.PHONE_NUMBER_EXTENSION) {
return policy.getIdentifiers().getPhoneNumberExtension().getPhoneNumberExtensionFilterStrategies();
} else if(filterType == FilterType.PHYSICIAN_NAME) {
return policy.getIdentifiers().getPhysicianName().getPhysicianNameFilterStrategies();
} else if(filterType == FilterType.SSN) {
return policy.getIdentifiers().getSsn().getSsnFilterStrategies();
} else if(filterType == FilterType.STATE_ABBREVIATION) {
return policy.getIdentifiers().getStateAbbreviation().getStateAbbreviationsFilterStrategies();
} else if(filterType == FilterType.STREET_ADDRESS) {
return policy.getIdentifiers().getStreetAddress().getStreetAddressFilterStrategies();
} else if(filterType == FilterType.URL) {
return policy.getIdentifiers().getUrl().getUrlFilterStrategies();
} else if(filterType == FilterType.VIN) {
return policy.getIdentifiers().getVin().getVinFilterStrategies();
} else if(filterType == FilterType.ZIP_CODE) {
return policy.getIdentifiers().getZipCode().getZipCodeFilterStrategies();
} else if(filterType == FilterType.LOCATION_CITY) {
return policy.getIdentifiers().getCity().getCityFilterStrategies();
} else if(filterType == FilterType.LOCATION_COUNTY) {
return policy.getIdentifiers().getCounty().getCountyFilterStrategies();
} else if(filterType == FilterType.FIRST_NAME) {
return policy.getIdentifiers().getFirstName().getFirstNameFilterStrategies();
} else if(filterType == FilterType.HOSPITAL_ABBREVIATION) {
return policy.getIdentifiers().getHospitalAbbreviation().getHospitalAbbreviationFilterStrategies();
} else if(filterType == FilterType.HOSPITAL) {
return policy.getIdentifiers().getHospital().getHospitalFilterStrategies();
} else if(filterType == FilterType.LOCATION_STATE) {
return policy.getIdentifiers().getState().getStateFilterStrategies();
} else if(filterType == FilterType.SURNAME) {
return policy.getIdentifiers().getSurname().getSurnameFilterStrategies();
public Map<String, Position> splitWithIndexes(String text, String delimiter) {

final Map<String, Position> splitsWithIndexes = new HashMap<>();
List<String> result = new ArrayList<>();
String[] tokens = text.split(delimiter);

int index = 0;
for (String token : tokens) {
splitsWithIndexes.put(token, new Position(index, index + token.length()));
index += token.length() + delimiter.length();
}

// Should never happen.
return null;
return splitsWithIndexes;

}

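For reference, a standalone sketch (not part of this commit) of how the new splitWithIndexes helper behaves. The demo class name, the sample sentence, and the record stand-in for Position are illustrative assumptions; the logic mirrors the method added above.

import java.util.HashMap;
import java.util.Map;

public class SplitWithIndexesSketch {

    // Stand-in for ai.philterd.phileas.model.objects.Position.
    record Position(int start, int end) { }

    // Mirrors the new splitWithIndexes logic: each token is mapped to its
    // character start/end offsets in the original text.
    static Map<String, Position> splitWithIndexes(String text, String delimiter) {
        final Map<String, Position> splits = new HashMap<>();
        int index = 0;
        for (String token : text.split(delimiter)) {
            splits.put(token, new Position(index, index + token.length()));
            index += token.length() + delimiter.length();
        }
        return splits;
    }

    public static void main(String[] args) {
        // John -> [0, 4], lives -> [5, 10], in -> [11, 13], Ohio -> [14, 18]
        // Note: the map is keyed by token, so repeated tokens share one entry.
        System.out.println(splitWithIndexes("John lives in Ohio", " "));
    }
}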
@@ -22,6 +22,7 @@
import ai.philterd.phileas.model.objects.Analyzer;
import ai.philterd.phileas.model.objects.ConfidenceModifier;
import ai.philterd.phileas.model.objects.FilterPattern;
import ai.philterd.phileas.model.objects.Position;
import ai.philterd.phileas.model.objects.Replacement;
import ai.philterd.phileas.model.objects.Span;
import ai.philterd.phileas.model.policy.Policy;
@@ -30,6 +31,7 @@

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
@@ -223,20 +225,37 @@ public int getOccurrences(final Policy policy, final String input, final Map<Str

}

public List<String> getNgrams(String text, int n) {
public Map<String, Position> getNgrams(String text, int n) {

final List<String> ngrams = new ArrayList<>();
final String[] words = text.split(" ");
final String delimiter = " ";
final Map<String, Position> ngrams = new HashMap<>();

final String[] words = text.split(delimiter);

final Map<String, Integer> splitsWithIndexes = new HashMap<>();
int index = 0;
for (String token : words) {
splitsWithIndexes.put(token, index);
index += token.length() + delimiter.length();
}

final String[] keys = splitsWithIndexes.keySet().toArray(String[]::new);

for (int i = 0; i < splitsWithIndexes.size() - n; i++) {

for (int i = 0; i <= words.length - n; i++) {
final StringBuilder ngram = new StringBuilder();

for (int j = 0; j < n; j++) {
ngram.append(words[i + j]);
ngram.append(keys[i + j]);
if (j < n - 1) {
ngram.append(" ");
ngram.append(delimiter);
}
}
ngrams.add(ngram.toString());

final int characterStart = splitsWithIndexes.get(keys[i]);
final int characterEnd = splitsWithIndexes.get(keys[i]) + ngram.length();
ngrams.put(ngram.toString(), new Position(characterStart, characterEnd));

}

return ngrams;
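A standalone sketch (not part of this commit) of extracting ngrams keyed by their character positions, the idea behind the new getNgrams signature. Unlike the committed version, it records token offsets in a plain array and walks the tokens in their original order; the demo class name and sample sentence are illustrative assumptions.

import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.Map;

public class NgramPositionsSketch {

    // Stand-in for ai.philterd.phileas.model.objects.Position.
    record Position(int start, int end) { }

    static Map<String, Position> getNgrams(String text, int n) {
        final Map<String, Position> ngrams = new LinkedHashMap<>();
        final String[] words = text.split(" ");

        // Character offset at which each token starts in the original text.
        final int[] starts = new int[words.length];
        int index = 0;
        for (int i = 0; i < words.length; i++) {
            starts[i] = index;
            index += words[i].length() + 1;
        }

        // Slide a window of n tokens across the text.
        for (int i = 0; i <= words.length - n; i++) {
            final String ngram = String.join(" ", Arrays.copyOfRange(words, i, i + n));
            ngrams.put(ngram, new Position(starts[i], starts[i] + ngram.length()));
        }

        return ngrams;
    }

    public static void main(String[] args) {
        // John Smith -> [0, 10], Smith lives -> [5, 16], lives in -> [11, 19], in Ohio -> [17, 24]
        System.out.println(getNgrams("John Smith lives in Ohio", 2));
    }
}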
@@ -16,16 +16,17 @@
package ai.philterd.phileas.model.filter.rules.dictionary;

import ai.philterd.phileas.model.enums.FilterType;
import ai.philterd.phileas.model.enums.SensitivityLevel;
import ai.philterd.phileas.model.filter.FilterConfiguration;
import ai.philterd.phileas.model.objects.FilterResult;
import ai.philterd.phileas.model.objects.Position;
import ai.philterd.phileas.model.objects.Replacement;
import ai.philterd.phileas.model.objects.Span;
import ai.philterd.phileas.model.policy.Policy;
import ai.philterd.phileas.model.utils.BloomFilter;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
@@ -68,12 +69,12 @@ public BloomFilterDictionaryFilter(FilterType filterType,
maxNgramSize = split.length;
}
}
LOGGER.info("Max ngram size is {}", maxNgramSize);

// Lowercase the terms and add each to the bloom filter.
LOGGER.info("Creating bloom filter from {} terms.", terms.size());
terms.forEach(t -> lowerCaseTerms.add(t.toLowerCase()));
lowerCaseTerms.forEach(t -> bloomFilter.put(t.toLowerCase()));
for(final String term : terms) {
lowerCaseTerms.add(term.toLowerCase());
bloomFilter.put(term.toLowerCase());
}

}

@@ -83,14 +84,18 @@ public FilterResult filter(Policy policy, String context, String documentId, int

final List<Span> spans = new LinkedList<>();

// TODO: Get ngrams from max to size 1.
final List<String> ngrams = getNgrams(text, maxNgramSize);
final Map<String, Position> ngrams = new HashMap<>();

// Get ngrams from max to size 1.
for(int i = 0; i <= maxNgramSize; i++) {
ngrams.putAll(getNgrams(text, i));
}

for(final String ngram : ngrams) {
for(final String ngram : ngrams.keySet()) {

if (bloomFilter.mightContain(ngram)) {
if (bloomFilter.mightContain(ngram.toLowerCase())) {

if (lowerCaseTerms.contains(ngram)) {
if (lowerCaseTerms.contains(ngram.toLowerCase())) {

// Set the meta values for the span.
final boolean isIgnored = ignored.contains(ngram);
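A standalone sketch (not part of this commit) of the two-stage dictionary lookup this filter performs: a cheap probabilistic mightContain() pass followed by an exact check against the lower-cased term set. Guava's BloomFilter is used here as a substitute for the phileas BloomFilter utility, and the demo class name and sample terms are assumptions.

import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Set;

public class TwoStageLookupSketch {

    public static void main(String[] args) {
        final Set<String> lowerCaseTerms = Set.of("franklin county", "cuyahoga county");

        // Bloom filter holding the lower-cased dictionary terms.
        final BloomFilter<String> bloomFilter =
                BloomFilter.create(Funnels.stringFunnel(StandardCharsets.UTF_8), 1_000);
        lowerCaseTerms.forEach(bloomFilter::put);

        for (final String ngram : List.of("Franklin County", "Main Street")) {
            // Probabilistic check first; false positives are possible, false negatives are not.
            if (bloomFilter.mightContain(ngram.toLowerCase())
                    // Exact confirmation against the term set.
                    && lowerCaseTerms.contains(ngram.toLowerCase())) {
                System.out.println("Dictionary hit: " + ngram);
            }
        }
    }
}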
@@ -4,6 +4,7 @@
import ai.philterd.phileas.model.enums.SensitivityLevel;
import ai.philterd.phileas.model.filter.FilterConfiguration;
import ai.philterd.phileas.model.objects.FilterResult;
import ai.philterd.phileas.model.objects.Position;
import ai.philterd.phileas.model.objects.Replacement;
import ai.philterd.phileas.model.objects.Span;
import ai.philterd.phileas.model.policy.Policy;
@@ -44,12 +45,12 @@ public FilterResult filter(Policy policy, String context, String documentId, int
if(policy.getIdentifiers().hasFilter(filterType)) {

// Build ngrams from the input text.
final Map<Integer, List<String>> ngrams = new HashMap<>();
final Map<Integer, Map<String, Position>> ngrams = new HashMap<>();
ngrams.put(0, splitWithIndexes(input, " "));
ngrams.put(1, getNgrams(input, 1));
ngrams.put(2, getNgrams(input, 2));
ngrams.put(3, getNgrams(input, 3));

ngrams.put(0, Arrays.stream(input.split(" ")).toList());
ngrams.put(4, getNgrams(input, 4));

for(final String entry : dictionary.keySet()) {

Expand All @@ -65,16 +66,16 @@ public FilterResult filter(Policy policy, String context, String documentId, int
final int spacesInEntry = StringUtils.countMatches(entry, " ");

// Compare string distance between word and ngrams.
for (final String ngram : ngrams.get(spacesInEntry)) {
for (final String ngram : ngrams.get(spacesInEntry).keySet()) {

final LevenshteinDistance levenshteinDistance = LevenshteinDistance.getDefaultInstance();
final int distance = levenshteinDistance.apply(entry, ngram);

if (sensitivityLevel == SensitivityLevel.HIGH && distance < 1) {
//LOGGER.info("{}, {}, {}", entry, ngram, distance);
if (sensitivityLevel == SensitivityLevel.HIGH && distance <= 1) {
spans.add(createSpan(input, 0, input.length(), 1.0, context, documentId, entry, policy, attributes));
} else if (sensitivityLevel == SensitivityLevel.MEDIUM && distance < 2) {
} else if (sensitivityLevel == SensitivityLevel.MEDIUM && distance <= 2) {
spans.add(createSpan(input, 0, input.length(), 1.0, context, documentId, entry, policy, attributes));
} else if (sensitivityLevel == SensitivityLevel.LOW && distance < 3) {
} else if (sensitivityLevel == SensitivityLevel.LOW && distance <= 3) {
spans.add(createSpan(input, 0, input.length(), 1.0, context, documentId, entry, policy, attributes));
}

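A small illustration (not part of this commit) of the adjusted sensitivity thresholds, assuming the Apache Commons Text LevenshteinDistance used above; the sample terms are made up. With the change from strict less-than to less-than-or-equal, HIGH now tolerates distance 1, MEDIUM distance 2, and LOW distance 3.

import org.apache.commons.text.similarity.LevenshteinDistance;

public class SensitivityThresholdSketch {

    public static void main(String[] args) {
        final LevenshteinDistance levenshteinDistance = LevenshteinDistance.getDefaultInstance();

        // 0 edits: matches at every sensitivity level.
        System.out.println(levenshteinDistance.apply("franklin county", "franklin county")); // 0

        // 1 edit (missing 'i'): now matches at HIGH (distance <= 1); previously only MEDIUM and LOW.
        System.out.println(levenshteinDistance.apply("franklin county", "frankln county"));  // 1

        // 2 edits (missing 'a' and 'i'): matches at MEDIUM (distance <= 2) and LOW.
        System.out.println(levenshteinDistance.apply("franklin county", "frnkln county"));   // 2
    }
}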
@@ -0,0 +1,41 @@
/*
* Copyright 2024 Philterd, LLC @ https://www.philterd.ai
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ai.philterd.phileas.model.objects;

public class Position {

private final int start;
private final int end;

public Position(int start, int end) {
this.start = start;
this.end = end;
}

public int getStart() {
return start;
}

public int getEnd() {
return end;
}

@Override
public String toString() {
return "Position [start=" + start + ", end=" + end + "]";
}

}
