From ef743c99682c3ece4b4d1b26b71604d9d3e0bd56 Mon Sep 17 00:00:00 2001 From: Vasyl Khrystiuk Date: Thu, 19 Dec 2024 03:14:55 +0200 Subject: [PATCH] [WIP] --- .../liqp/filters/date/fuzzy/LookupResult.java | 7 - .../filters/date/fuzzy/PartExtractor.java | 18 ++ .../extractors/AnyYMDPatternExtractor.java | 159 ++++++++++++++++++ .../EnglishDMYPatternExtractor.java | 49 +----- .../ISO8601YMDPatternExtractor.java | 2 +- .../extractors/RegularTimeExtractor.java | 12 +- .../date/fuzzy/extractors/YearWithEra.java | 9 +- .../date/fuzzy/FuzzyDateParserTest.java | 2 +- .../EnglishDMYPatternExtractorTest.java | 21 +++ 9 files changed, 214 insertions(+), 65 deletions(-) create mode 100644 src/main/java/liqp/filters/date/fuzzy/extractors/AnyYMDPatternExtractor.java create mode 100644 src/test/java/liqp/filters/date/fuzzy/extractors/EnglishDMYPatternExtractorTest.java diff --git a/src/main/java/liqp/filters/date/fuzzy/LookupResult.java b/src/main/java/liqp/filters/date/fuzzy/LookupResult.java index a15d1d64..f138f86b 100644 --- a/src/main/java/liqp/filters/date/fuzzy/LookupResult.java +++ b/src/main/java/liqp/filters/date/fuzzy/LookupResult.java @@ -6,17 +6,10 @@ class LookupResult { final List parts; final boolean found; - final DatePatternRecognizingContext ctx; LookupResult(List parts, boolean found) { this.parts = parts; this.found = found; - this.ctx = null; } - LookupResult(List parts, boolean found, DatePatternRecognizingContext ctx) { - this.parts = parts; - this.found = found; - this.ctx = ctx; - } } diff --git a/src/main/java/liqp/filters/date/fuzzy/PartExtractor.java b/src/main/java/liqp/filters/date/fuzzy/PartExtractor.java index 76d534c4..b872f508 100644 --- a/src/main/java/liqp/filters/date/fuzzy/PartExtractor.java +++ b/src/main/java/liqp/filters/date/fuzzy/PartExtractor.java @@ -1,8 +1,26 @@ package liqp.filters.date.fuzzy; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.function.Supplier; +import java.util.stream.Collectors; import liqp.filters.date.fuzzy.extractors.PartExtractorResult; public interface PartExtractor { PartExtractorResult extract(String source); + + default List newList(String... el) { + return Arrays.asList(el); + } + + default List appendToExisting(List start, Supplier> supplier) { + if (start.isEmpty()) { + return supplier.get(); + } + return start.stream() + .flatMap(prefix -> supplier.get().stream().map(suffix -> prefix + suffix)) + .collect(Collectors.toList()); + } } diff --git a/src/main/java/liqp/filters/date/fuzzy/extractors/AnyYMDPatternExtractor.java b/src/main/java/liqp/filters/date/fuzzy/extractors/AnyYMDPatternExtractor.java new file mode 100644 index 00000000..ac864d4b --- /dev/null +++ b/src/main/java/liqp/filters/date/fuzzy/extractors/AnyYMDPatternExtractor.java @@ -0,0 +1,159 @@ +package liqp.filters.date.fuzzy.extractors; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Optional; +import java.util.regex.Matcher; + +abstract class AnyYMDPatternExtractor extends RegexPartExtractor { + + public enum RuleType { + Y, M, D, PUNCTUATION; + } + public static class RulePart { + private final RuleType type; + private final Integer[] length; + private final String content; + private RulePart(RuleType type, String content) { + this.type = type; + this.content = content; + this.length = new Integer[0]; + } + + private RulePart(RuleType type, Integer[] length) { + this.type = type; + this.length = length; + this.content = null; + } + } + + static RulePart pp(String content) { + return new RulePart(RuleType.PUNCTUATION, content); + } + static RulePart pY(Integer length) { + return new RulePart(RuleType.Y, new Integer[]{length}); + } + static RulePart pY(Integer length1, Integer length2) { + return new RulePart(RuleType.Y, new Integer[]{length1, length2}); + } + static RulePart pM() { + return new RulePart(RuleType.M, (Integer[])null); + } + static RulePart pD() { + return new RulePart(RuleType.D, (Integer[])null); + } + private final RulePart[] partsInOrder; + protected AnyYMDPatternExtractor(RulePart... partsInOrder) { + super(reconstructPattern(partsInOrder), null); + this.partsInOrder = partsInOrder; + } + + private static String reconstructPattern(RulePart[] partsInOrder) { + StringBuilder sb = new StringBuilder("(?:^|.*?\\D)"); + for (RulePart part : partsInOrder) { + if (part.type == RuleType.PUNCTUATION) { + sb.append(part.content); + } else { + if (part.type == RuleType.Y) { + if (part.length == null || part.length.length == 0) { + throw new IllegalArgumentException("Year part must have length"); + } + if (part.length.length == 1) { + sb.append("(?\\d{").append(part.length[0]).append("})"); + } else { + sb.append("(?\\d{").append(part.length[0]).append("}|\\d{") + .append(part.length[1]).append("})"); + } + } else if (part.type == RuleType.M) { + sb.append("(?0?[1-9]|1[0-2])"); + } else if (part.type == RuleType.D) { + sb.append("(?0?[1-9]|[12][0-9]|3[01])"); + } + } + } + sb.append("(?:$|\\D.*?)"); + return sb.toString(); + } + + @Override + public PartExtractorResult extract(String source) { + Matcher matcher = pattern.matcher(source); + if (matcher.find()) { + PartExtractorResult result = new PartExtractorResult(); + result.found = true; + result.start = matcher.start(findFirstGroupName()); + result.end = matcher.end(findLastGroupName()); + result.formatterPatterns = getPatterns(matcher); + return result; + } + return new PartExtractorResult(); + } + + private String findLastGroupName() { + List list = new ArrayList<>(); + Collections.addAll(list, partsInOrder); + Collections.reverse(list); + Optional first = list + .stream() + .filter(p -> p.type != RuleType.PUNCTUATION) + .findFirst(); + return getNoGroupNameFound(first); + } + + private String findFirstGroupName() { + Optional first = Arrays.stream(partsInOrder) + .filter(p -> p.type != RuleType.PUNCTUATION) + .findFirst(); + return getNoGroupNameFound(first); + } + + @SuppressWarnings("OptionalUsedAsFieldOrParameterType") + private static String getNoGroupNameFound(Optional first) { + return first.map(e -> { + switch (e.type) { + case Y: + return "year"; + case M: + return "month"; + case D: + default: + return "day"; + } + }).map(String::toLowerCase) + .orElseThrow(() -> new IllegalArgumentException("No group name found")); + } + + protected List getPatterns(Matcher matcher) { + List start = new ArrayList<>(); + for (RulePart part : partsInOrder) { + start = appendToExisting(start, () -> { + if (part.type == RuleType.Y) { + if (matcher.group("year").length() == 2) { + return newList("yy"); + } else { + return newList("yyyy"); + } + } else if (part.type == RuleType.M) { + if (matcher.group("month").length() == 1) { + return newList("M", "MM"); + } else { + return newList("MM", "M"); + } + } else if (part.type == RuleType.D) { + if (matcher.group("day").length() == 1) { + return newList("d", "dd"); + } else { + return newList("dd", "d"); + } + } else if (part.type == RuleType.PUNCTUATION) { + return Collections.singletonList(part.content); + } + return Collections.singletonList(""); + }); + } + return start; + } + +} diff --git a/src/main/java/liqp/filters/date/fuzzy/extractors/EnglishDMYPatternExtractor.java b/src/main/java/liqp/filters/date/fuzzy/extractors/EnglishDMYPatternExtractor.java index 128d6a15..48781819 100644 --- a/src/main/java/liqp/filters/date/fuzzy/extractors/EnglishDMYPatternExtractor.java +++ b/src/main/java/liqp/filters/date/fuzzy/extractors/EnglishDMYPatternExtractor.java @@ -1,52 +1,7 @@ package liqp.filters.date.fuzzy.extractors; -import java.util.regex.Matcher; - -class EnglishDMYPatternExtractor extends RegexPartExtractor { +class EnglishDMYPatternExtractor extends AnyYMDPatternExtractor { public EnglishDMYPatternExtractor() { - super("(?:^|.*?\\D)" - + "(?0?[1-9]|[12][0-9]|3[01])" - + "/" - + "(?0?[1-9]|1[0-2])" - + "/" - + "(?\\d{2}|\\d{4})" - + "(?:$|\\D.*?)", null); - } - - - @Override - public PartExtractorResult extract(String source) { - Matcher matcher = pattern.matcher(source); - if (matcher.find()) { - PartExtractorResult result = new PartExtractorResult(); - result.found = true; - result.start = matcher.start("day"); - result.end = matcher.end("year"); - result.formatterPattern = getPattern(matcher); - return result; - } - return new PartExtractorResult(); - } - - private String getPattern(Matcher matcher) { - StringBuilder sbfp = new StringBuilder(); - if (matcher.group("day").length() == 1) { - sbfp.append("d"); - } else { - sbfp.append("dd"); - } - sbfp.append("/"); - if (matcher.group("month").length() == 1) { - sbfp.append("M"); - } else { - sbfp.append("MM"); - } - sbfp.append("/"); - if (matcher.group("year").length() == 2) { - sbfp.append("yy"); - } else { - sbfp.append("yyyy"); - } - return sbfp.toString(); + super(pD(), pp("/"), pM(), pp("/"), pY(2, 4)); } } diff --git a/src/main/java/liqp/filters/date/fuzzy/extractors/ISO8601YMDPatternExtractor.java b/src/main/java/liqp/filters/date/fuzzy/extractors/ISO8601YMDPatternExtractor.java index c3535402..e8952afa 100644 --- a/src/main/java/liqp/filters/date/fuzzy/extractors/ISO8601YMDPatternExtractor.java +++ b/src/main/java/liqp/filters/date/fuzzy/extractors/ISO8601YMDPatternExtractor.java @@ -18,7 +18,7 @@ public PartExtractorResult extract(String source) { result.found = true; result.start = matcher.start("year"); result.end = matcher.end("date"); - result.formatterPattern = getPattern(matcher); + result.formatterPatterns = newList(getPattern(matcher)); return result; } return new PartExtractorResult(); diff --git a/src/main/java/liqp/filters/date/fuzzy/extractors/RegularTimeExtractor.java b/src/main/java/liqp/filters/date/fuzzy/extractors/RegularTimeExtractor.java index 7219b7cb..b0a50272 100644 --- a/src/main/java/liqp/filters/date/fuzzy/extractors/RegularTimeExtractor.java +++ b/src/main/java/liqp/filters/date/fuzzy/extractors/RegularTimeExtractor.java @@ -47,25 +47,27 @@ public PartExtractorResult extract(String source) { } r.start = m.start("hours"); + String resPattern; if (m.group("milliseconds") != null) { int millisecondsLength = m.group("milliseconds").length(); r.end = m.end("milliseconds"); - r.formatterPattern = + resPattern = hourPart + ":mm:ss." + repeat("S", millisecondsLength); } else if (m.group("seconds") != null) { r.end = m.end("seconds"); - r.formatterPattern = hourPart + ":mm:ss"; + resPattern = hourPart + ":mm:ss"; } else if (m.group("minutes") != null) { r.end = m.end("minutes"); - r.formatterPattern = hourPart + ":mm"; + resPattern = hourPart + ":mm"; } else { r.end = m.end("hours"); - r.formatterPattern = hourPart; + resPattern = hourPart; } if (hasAmPm) { - r.formatterPattern += ampmPart; + resPattern += ampmPart; r.end = m.end("ampm"); } + r.formatterPatterns = newList(resPattern); return r; } return new PartExtractorResult(); diff --git a/src/main/java/liqp/filters/date/fuzzy/extractors/YearWithEra.java b/src/main/java/liqp/filters/date/fuzzy/extractors/YearWithEra.java index 86dad939..bfe65332 100644 --- a/src/main/java/liqp/filters/date/fuzzy/extractors/YearWithEra.java +++ b/src/main/java/liqp/filters/date/fuzzy/extractors/YearWithEra.java @@ -19,22 +19,23 @@ public PartExtractorResult extract(String source) { PartExtractorResult result = new PartExtractorResult(); result.found = true; result.start = matcher.start("year"); - result.formatterPattern = repeat("y", matcher.group("year").length()); + String resPattern = repeat("y", matcher.group("year").length()); String era = matcher.group("era"); if (!isBlank(era)) { String eraSeparator = matcher.group("eraSeparator"); if (eraSeparator != null) { - result.formatterPattern += eraSeparator; + resPattern += eraSeparator; } result.end = matcher.end("era"); if (era.length() == 2) { - result.formatterPattern += "GG"; + resPattern += "GG"; } else { - result.formatterPattern += "GGGG"; + resPattern += "GGGG"; } } else { result.end = matcher.end("year"); } + result.formatterPatterns = newList(resPattern); return result; } return new PartExtractorResult(); diff --git a/src/test/java/liqp/filters/date/fuzzy/FuzzyDateParserTest.java b/src/test/java/liqp/filters/date/fuzzy/FuzzyDateParserTest.java index f1b04a5d..8da62066 100644 --- a/src/test/java/liqp/filters/date/fuzzy/FuzzyDateParserTest.java +++ b/src/test/java/liqp/filters/date/fuzzy/FuzzyDateParserTest.java @@ -18,7 +18,7 @@ public void testTimeRegexp() { assertTrue(result.found); assertEquals( 1, result.start); assertEquals( 6, result.end); - assertEquals("HH:mm", result.formatterPattern); + assertEquals("HH:mm", result.formatterPatterns.get(0)); } @Test diff --git a/src/test/java/liqp/filters/date/fuzzy/extractors/EnglishDMYPatternExtractorTest.java b/src/test/java/liqp/filters/date/fuzzy/extractors/EnglishDMYPatternExtractorTest.java new file mode 100644 index 00000000..1da48d9c --- /dev/null +++ b/src/test/java/liqp/filters/date/fuzzy/extractors/EnglishDMYPatternExtractorTest.java @@ -0,0 +1,21 @@ +package liqp.filters.date.fuzzy.extractors; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.util.Arrays; +import org.junit.Test; + +public class EnglishDMYPatternExtractorTest{ + @Test + public void test() { + EnglishDMYPatternExtractor extractor = new EnglishDMYPatternExtractor(); + PartExtractorResult extract = extractor.extract(" 1/1/11 "); + assertTrue(extract.found); + assertEquals(2, extract.start); + assertEquals(8, extract.end); + assertEquals(4, extract.formatterPatterns.size()); + assertEquals(Arrays.asList("d/M/yy", "d/MM/yy", "dd/M/yy", "dd/MM/yy"), + extract.formatterPatterns); + } +}