Review notes (text ahead of the first "diff --git" header is ignored by git apply and patch):
- Line structure and generic type arguments were reconstructed from a flattened copy of this
  patch; indentation and blank lines are inferred from the hunk line counts.
- RulesFilter.getNgrams: n-grams are built from the words in text order with per-word offsets,
  the last n-gram is no longer dropped, and n < 1 yields an empty map. The leading context of
  that hunk was unrecoverable, so the hunk now starts on the method signature.
- BloomFilterDictionaryFilter.filter: n-gram sizes start at 1 instead of 0.
- Filter.splitWithIndexes: unused local removed; the delimiter is matched literally.
- FuzzyDictionaryFilter.filter: commented-out debug logging removed.
- The post-image blob ids on the "index" lines of the four files above predate these changes.

diff --git a/phileas-core/src/main/java/ai/philterd/phileas/services/filters/regex/PhysicianNameFilter.java b/phileas-core/src/main/java/ai/philterd/phileas/services/filters/regex/PhysicianNameFilter.java
index 293ba00b4..3bc09d20f 100644
--- a/phileas-core/src/main/java/ai/philterd/phileas/services/filters/regex/PhysicianNameFilter.java
+++ b/phileas-core/src/main/java/ai/philterd/phileas/services/filters/regex/PhysicianNameFilter.java
@@ -21,6 +21,7 @@
 import ai.philterd.phileas.model.objects.Analyzer;
 import ai.philterd.phileas.model.objects.FilterPattern;
 import ai.philterd.phileas.model.objects.FilterResult;
+import ai.philterd.phileas.model.objects.Position;
 import ai.philterd.phileas.model.objects.Span;
 import ai.philterd.phileas.model.policy.Policy;
 
@@ -67,9 +68,9 @@ public FilterResult filter(Policy policy, String context, String documentId, int
         final List<Span> spans = new LinkedList<>();
 
         // TODO: Get ngrams from max to size 1.
-        final List<String> ngrams = getNgrams(input, 1);
+        final Map<String, Position> ngrams = getNgrams(input, 1);
 
-        for(final String candidate : ngrams) {
+        for(final String candidate : ngrams.keySet()) {
 
             if (endsWithPostNominal(candidate) || startsWithPreNominal(candidate)) {
 
diff --git a/phileas-core/src/test/java/ai/philterd/test/phileas/services/filters/CountyFilterTest.java b/phileas-core/src/test/java/ai/philterd/test/phileas/services/filters/CountyFilterTest.java
index 1d1460a09..0da9dd55c 100644
--- a/phileas-core/src/test/java/ai/philterd/test/phileas/services/filters/CountyFilterTest.java
+++ b/phileas-core/src/test/java/ai/philterd/test/phileas/services/filters/CountyFilterTest.java
@@ -22,7 +22,6 @@
 import ai.philterd.phileas.model.objects.FilterResult;
 import ai.philterd.phileas.model.policy.filters.strategies.dynamic.CountyFilterStrategy;
 import ai.philterd.phileas.model.services.AlertService;
-import ai.philterd.phileas.model.services.AnonymizationService;
 import ai.philterd.phileas.services.anonymization.CountyAnonymizationService;
 import ai.philterd.phileas.services.anonymization.cache.LocalAnonymizationCacheService;
 import org.apache.logging.log4j.LogManager;
@@ -83,8 +82,6 @@ public void filterCountiesMedium() throws Exception {
     @Test
     public void filterCountiesHigh() throws Exception {
 
-        AnonymizationService anonymizationService = new CountyAnonymizationService(new LocalAnonymizationCacheService());
-
         final FilterConfiguration filterConfiguration = new FilterConfiguration.FilterConfigurationBuilder()
                 .withStrategies(List.of(new CountyFilterStrategy()))
                 .withAlertService(alertService)
diff --git a/phileas-model/src/main/java/ai/philterd/phileas/model/filter/Filter.java b/phileas-model/src/main/java/ai/philterd/phileas/model/filter/Filter.java
index b62427118..657b6609f 100644
--- a/phileas-model/src/main/java/ai/philterd/phileas/model/filter/Filter.java
+++ b/phileas-model/src/main/java/ai/philterd/phileas/model/filter/Filter.java
@@ -16,11 +16,15 @@
 package ai.philterd.phileas.model.filter;
 
 import ai.philterd.phileas.model.enums.FilterType;
-import ai.philterd.phileas.model.objects.*;
+import ai.philterd.phileas.model.objects.FilterPattern;
+import ai.philterd.phileas.model.objects.FilterResult;
+import ai.philterd.phileas.model.objects.Position;
+import ai.philterd.phileas.model.objects.Replacement;
+import ai.philterd.phileas.model.objects.Span;
 import ai.philterd.phileas.model.policy.Crypto;
 import ai.philterd.phileas.model.policy.FPE;
-import ai.philterd.phileas.model.policy.Policy;
 import ai.philterd.phileas.model.policy.IgnoredPattern;
+import ai.philterd.phileas.model.policy.Policy;
 import ai.philterd.phileas.model.policy.filters.Identifier;
 import ai.philterd.phileas.model.policy.filters.strategies.AbstractFilterStrategy;
 import ai.philterd.phileas.model.services.AlertService;
@@ -34,7 +38,13 @@
 import java.io.File;
 import java.io.IOException;
 import java.nio.charset.Charset;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
 import java.util.stream.Collectors;
 
 public abstract class Filter {
@@ -332,72 +342,27 @@ public static List<? extends AbstractFilterStrategy> getIdentifierFilterStrategi
 
     }
 
-    public static List<? extends AbstractFilterStrategy> getFilterStrategies(final Policy policy,
-                                                                             final FilterType filterType,
-                                                                             final int index) {
-
-        LOGGER.debug("Getting filter strategies for filter type {}", filterType.getType());
-
-        if(filterType == FilterType.AGE) {
-            return policy.getIdentifiers().getAge().getAgeFilterStrategies();
-        } else if(filterType == FilterType.BITCOIN_ADDRESS) {
-            return policy.getIdentifiers().getBitcoinAddress().getBitcoinFilterStrategies();
-        } else if(filterType == FilterType.CREDIT_CARD) {
-            return policy.getIdentifiers().getCreditCard().getCreditCardFilterStrategies();
-        } else if(filterType == FilterType.CUSTOM_DICTIONARY) {
-            // There can be multiple custom dictionaries in the policy.
-            // The index is used to determine which one is the appropriate one.
-            return policy.getIdentifiers().getCustomDictionaries().get(index).getCustomDictionaryFilterStrategies();
-        } else if(filterType == FilterType.DATE) {
-            return policy.getIdentifiers().getDate().getDateFilterStrategies();
-        } else if(filterType == FilterType.DRIVERS_LICENSE_NUMBER) {
-            return policy.getIdentifiers().getDriversLicense().getDriversLicenseFilterStrategies();
-        } else if(filterType == FilterType.EMAIL_ADDRESS) {
-            return policy.getIdentifiers().getEmailAddress().getEmailAddressFilterStrategies();
-        } else if(filterType == FilterType.IBAN_CODE) {
-            return policy.getIdentifiers().getIbanCode().getIbanCodeFilterStrategies();
-        } else if(filterType == FilterType.IP_ADDRESS) {
-            return policy.getIdentifiers().getIpAddress().getIpAddressFilterStrategies();
-        } else if(filterType == FilterType.PERSON) {
-            return policy.getIdentifiers().getPhEye().getPhEyeFilterStrategies();
-        } else if(filterType == FilterType.PASSPORT_NUMBER) {
-            return policy.getIdentifiers().getPassportNumber().getPassportNumberFilterStrategies();
-        } else if(filterType == FilterType.PHONE_NUMBER) {
-            return policy.getIdentifiers().getPhoneNumber().getPhoneNumberFilterStrategies();
-        } else if(filterType == FilterType.PHONE_NUMBER_EXTENSION) {
-            return policy.getIdentifiers().getPhoneNumberExtension().getPhoneNumberExtensionFilterStrategies();
-        } else if(filterType == FilterType.PHYSICIAN_NAME) {
-            return policy.getIdentifiers().getPhysicianName().getPhysicianNameFilterStrategies();
-        } else if(filterType == FilterType.SSN) {
-            return policy.getIdentifiers().getSsn().getSsnFilterStrategies();
-        } else if(filterType == FilterType.STATE_ABBREVIATION) {
-            return policy.getIdentifiers().getStateAbbreviation().getStateAbbreviationsFilterStrategies();
-        } else if(filterType == FilterType.STREET_ADDRESS) {
-            return policy.getIdentifiers().getStreetAddress().getStreetAddressFilterStrategies();
-        } else if(filterType == FilterType.URL) {
-            return policy.getIdentifiers().getUrl().getUrlFilterStrategies();
-        } else if(filterType == FilterType.VIN) {
-            return policy.getIdentifiers().getVin().getVinFilterStrategies();
-        } else if(filterType == FilterType.ZIP_CODE) {
-            return policy.getIdentifiers().getZipCode().getZipCodeFilterStrategies();
-        } else if(filterType == FilterType.LOCATION_CITY) {
-            return policy.getIdentifiers().getCity().getCityFilterStrategies();
-        } else if(filterType == FilterType.LOCATION_COUNTY) {
-            return policy.getIdentifiers().getCounty().getCountyFilterStrategies();
-        } else if(filterType == FilterType.FIRST_NAME) {
-            return policy.getIdentifiers().getFirstName().getFirstNameFilterStrategies();
-        } else if(filterType == FilterType.HOSPITAL_ABBREVIATION) {
-            return policy.getIdentifiers().getHospitalAbbreviation().getHospitalAbbreviationFilterStrategies();
-        } else if(filterType == FilterType.HOSPITAL) {
-            return policy.getIdentifiers().getHospital().getHospitalFilterStrategies();
-        } else if(filterType == FilterType.LOCATION_STATE) {
-            return policy.getIdentifiers().getState().getStateFilterStrategies();
-        } else if(filterType == FilterType.SURNAME) {
-            return policy.getIdentifiers().getSurname().getSurnameFilterStrategies();
+    /**
+     * Splits text on a literal delimiter and records the character range of each token.
+     * A token that occurs more than once keeps only the range of its last occurrence.
+     *
+     * @param text The text to split.
+     * @param delimiter The literal (not regular expression) delimiter between tokens.
+     * @return A map of each token to its start (inclusive) and end (exclusive) character offsets.
+     */
+    public Map<String, Position> splitWithIndexes(String text, String delimiter) {
+
+        final Map<String, Position> splitsWithIndexes = new HashMap<>();
+
+        // Quote the delimiter so split() matches it literally; the offset math below assumes that.
+        final String[] tokens = text.split(java.util.regex.Pattern.quote(delimiter));
+
+        int index = 0;
+        for (final String token : tokens) {
+            splitsWithIndexes.put(token, new Position(index, index + token.length()));
+            index += token.length() + delimiter.length();
         }
 
-        // Should never happen.
-        return null;
+        return splitsWithIndexes;
     }
 
 
diff --git a/phileas-model/src/main/java/ai/philterd/phileas/model/filter/rules/RulesFilter.java b/phileas-model/src/main/java/ai/philterd/phileas/model/filter/rules/RulesFilter.java
index 18dbfaa2f..e62fd9171 100644
--- a/phileas-model/src/main/java/ai/philterd/phileas/model/filter/rules/RulesFilter.java
+++ b/phileas-model/src/main/java/ai/philterd/phileas/model/filter/rules/RulesFilter.java
@@ -22,6 +22,7 @@
 import ai.philterd.phileas.model.objects.Analyzer;
 import ai.philterd.phileas.model.objects.ConfidenceModifier;
 import ai.philterd.phileas.model.objects.FilterPattern;
+import ai.philterd.phileas.model.objects.Position;
 import ai.philterd.phileas.model.objects.Replacement;
 import ai.philterd.phileas.model.objects.Span;
 import ai.philterd.phileas.model.policy.Policy;
@@ -30,6 +31,7 @@
 
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.LinkedHashMap;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
@@ -226,17 +228,41 @@
-    public List<String> getNgrams(String text, int n) {
+    public Map<String, Position> getNgrams(String text, int n) {
 
-        final List<String> ngrams = new ArrayList<>();
-        final String[] words = text.split(" ");
+        // Maps each n-gram of n space-separated words to its character range in the text.
+        // Insertion order follows the text; a repeated n-gram keeps its first occurrence.
+        final String delimiter = " ";
+        final Map<String, Position> ngrams = new LinkedHashMap<>();
+
+        // There is no such thing as an n-gram of fewer than one word.
+        if (n < 1) {
+            return ngrams;
+        }
+
+        final String[] words = text.split(delimiter);
+
+        // Character offset of every word, kept by word position so that repeated
+        // words and the original word order both survive.
+        final int[] offsets = new int[words.length];
+        int index = 0;
+        for (int w = 0; w < words.length; w++) {
+            offsets[w] = index;
+            index += words[w].length() + delimiter.length();
+        }
 
         for (int i = 0; i <= words.length - n; i++) {
+
             final StringBuilder ngram = new StringBuilder();
+
             for (int j = 0; j < n; j++) {
                 ngram.append(words[i + j]);
                 if (j < n - 1) {
-                    ngram.append(" ");
+                    ngram.append(delimiter);
                 }
             }
-            ngrams.add(ngram.toString());
+
+            final int characterStart = offsets[i];
+            final int characterEnd = characterStart + ngram.length();
+            ngrams.putIfAbsent(ngram.toString(), new Position(characterStart, characterEnd));
+
         }
 
         return ngrams;
diff --git a/phileas-model/src/main/java/ai/philterd/phileas/model/filter/rules/dictionary/BloomFilterDictionaryFilter.java b/phileas-model/src/main/java/ai/philterd/phileas/model/filter/rules/dictionary/BloomFilterDictionaryFilter.java
index 28c0af520..2b432a497 100644
--- a/phileas-model/src/main/java/ai/philterd/phileas/model/filter/rules/dictionary/BloomFilterDictionaryFilter.java
+++ b/phileas-model/src/main/java/ai/philterd/phileas/model/filter/rules/dictionary/BloomFilterDictionaryFilter.java
@@ -16,9 +16,9 @@
 package ai.philterd.phileas.model.filter.rules.dictionary;
 
 import ai.philterd.phileas.model.enums.FilterType;
-import ai.philterd.phileas.model.enums.SensitivityLevel;
 import ai.philterd.phileas.model.filter.FilterConfiguration;
 import ai.philterd.phileas.model.objects.FilterResult;
+import ai.philterd.phileas.model.objects.Position;
 import ai.philterd.phileas.model.objects.Replacement;
 import ai.philterd.phileas.model.objects.Span;
 import ai.philterd.phileas.model.policy.Policy;
@@ -26,6 +26,7 @@
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.LinkedList;
 import java.util.List;
@@ -68,12 +69,12 @@ public BloomFilterDictionaryFilter(FilterType filterType,
                 maxNgramSize = split.length;
             }
         }
-        LOGGER.info("Max ngram size is {}", maxNgramSize);
 
         // Lowercase the terms and add each to the bloom filter.
-        LOGGER.info("Creating bloom filter from {} terms.", terms.size());
-        terms.forEach(t -> lowerCaseTerms.add(t.toLowerCase()));
-        lowerCaseTerms.forEach(t -> bloomFilter.put(t.toLowerCase()));
+        for(final String term : terms) {
+            lowerCaseTerms.add(term.toLowerCase());
+            bloomFilter.put(term.toLowerCase());
+        }
 
     }
 
@@ -83,14 +84,18 @@ public FilterResult filter(Policy policy, String context, String documentId, int
 
         final List<Span> spans = new LinkedList<>();
 
-        // TODO: Get ngrams from max to size 1.
-        final List<String> ngrams = getNgrams(text, maxNgramSize);
+        final Map<String, Position> ngrams = new HashMap<>();
+
+        // Collect the ngrams of every size from 1 up to the max.
+        for(int i = 1; i <= maxNgramSize; i++) {
+            ngrams.putAll(getNgrams(text, i));
+        }
 
-        for(final String ngram : ngrams) {
+        for(final String ngram : ngrams.keySet()) {
 
-            if (bloomFilter.mightContain(ngram)) {
+            if (bloomFilter.mightContain(ngram.toLowerCase())) {
 
-                if (lowerCaseTerms.contains(ngram)) {
+                if (lowerCaseTerms.contains(ngram.toLowerCase())) {
 
                     // Set the meta values for the span.
                     final boolean isIgnored = ignored.contains(ngram);
diff --git a/phileas-model/src/main/java/ai/philterd/phileas/model/filter/rules/dictionary/FuzzyDictionaryFilter.java b/phileas-model/src/main/java/ai/philterd/phileas/model/filter/rules/dictionary/FuzzyDictionaryFilter.java
index 0027f1915..5171e86ba 100644
--- a/phileas-model/src/main/java/ai/philterd/phileas/model/filter/rules/dictionary/FuzzyDictionaryFilter.java
+++ b/phileas-model/src/main/java/ai/philterd/phileas/model/filter/rules/dictionary/FuzzyDictionaryFilter.java
@@ -4,6 +4,7 @@
 import ai.philterd.phileas.model.enums.SensitivityLevel;
 import ai.philterd.phileas.model.filter.FilterConfiguration;
 import ai.philterd.phileas.model.objects.FilterResult;
+import ai.philterd.phileas.model.objects.Position;
 import ai.philterd.phileas.model.objects.Replacement;
 import ai.philterd.phileas.model.objects.Span;
 import ai.philterd.phileas.model.policy.Policy;
@@ -44,12 +45,12 @@ public FilterResult filter(Policy policy, String context, String documentId, int
         if(policy.getIdentifiers().hasFilter(filterType)) {
 
             // Build ngrams from the input text.
-            final Map<Integer, List<String>> ngrams = new HashMap<>();
+            final Map<Integer, Map<String, Position>> ngrams = new HashMap<>();
+            ngrams.put(0, splitWithIndexes(input, " "));
             ngrams.put(1, getNgrams(input, 1));
             ngrams.put(2, getNgrams(input, 2));
             ngrams.put(3, getNgrams(input, 3));
-
-            ngrams.put(0, Arrays.stream(input.split(" ")).toList());
+            ngrams.put(4, getNgrams(input, 4));
 
             for(final String entry : dictionary.keySet()) {
 
@@ -65,16 +66,16 @@ public FilterResult filter(Policy policy, String context, String documentId, int
                 final int spacesInEntry = StringUtils.countMatches(entry, " ");
 
                 // Compare string distance between word and ngrams.
-                for (final String ngram : ngrams.get(spacesInEntry)) {
+                for (final String ngram : ngrams.get(spacesInEntry).keySet()) {
 
                     final LevenshteinDistance levenshteinDistance = LevenshteinDistance.getDefaultInstance();
                     final int distance = levenshteinDistance.apply(entry, ngram);
 
-                    if (sensitivityLevel == SensitivityLevel.HIGH && distance < 1) {
+                    if (sensitivityLevel == SensitivityLevel.HIGH && distance <= 1) {
                         spans.add(createSpan(input, 0, input.length(), 1.0, context, documentId, entry, policy, attributes));
-                    } else if (sensitivityLevel == SensitivityLevel.MEDIUM && distance < 2) {
+                    } else if (sensitivityLevel == SensitivityLevel.MEDIUM && distance <= 2) {
                         spans.add(createSpan(input, 0, input.length(), 1.0, context, documentId, entry, policy, attributes));
-                    } else if (sensitivityLevel == SensitivityLevel.LOW && distance < 3) {
+                    } else if (sensitivityLevel == SensitivityLevel.LOW && distance <= 3) {
                         spans.add(createSpan(input, 0, input.length(), 1.0, context, documentId, entry, policy, attributes));
                     }
 
diff --git a/phileas-model/src/main/java/ai/philterd/phileas/model/objects/Position.java b/phileas-model/src/main/java/ai/philterd/phileas/model/objects/Position.java
new file mode 100644
index 000000000..e5919d412
--- /dev/null
+++ b/phileas-model/src/main/java/ai/philterd/phileas/model/objects/Position.java
@@ -0,0 +1,41 @@
+/*
+ *     Copyright 2024 Philterd, LLC @ https://www.philterd.ai
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package ai.philterd.phileas.model.objects;
+
+public class Position {
+
+    private final int start;
+    private final int end;
+
+    public Position(int start, int end) {
+        this.start = start;
+        this.end = end;
+    }
+
+    public int getStart() {
+        return start;
+    }
+
+    public int getEnd() {
+        return end;
+    }
+
+    @Override
+    public String toString() {
+        return "Position [start=" + start + ", end=" + end + "]";
+    }
+
+}