Support matching binary utf8 string

Summary: 1. Introduce MatcherInput to represent both UTF-16 and UTF-8 input. 2. Reuse the existing tests, via ApiTestUtils, so that they exercise both MatcherInput encodings.
google · Mar 8, 2021 · c4e8120 · herbyderby · Oct 14, 2023 · c4e8120
1 parent 2986a5d
commit c4e8120
Show file tree

Hide file tree

Showing 7 changed files with 238 additions and 41 deletions.
diff --git a/java/com/google/re2j/Matcher.java b/java/com/google/re2j/Matcher.java
@@ -6,6 +6,7 @@
  */
 package com.google.re2j;
 
+import com.google.re2j.MatcherInput.Encoding;
 import java.util.Map;
 
 /**
@@ -48,7 +49,7 @@ public final class Matcher {
   // The number of submatches (groups) in the pattern.
   private final int groupCount;
 
-  private CharSequence inputSequence;
+  private MatcherInput matcherInput;
 
-  // The input length in UTF16 codes.
+  // The input length: UTF-16 code units for CharSequence input, bytes for UTF-8 input.
   private int inputLength;
@@ -83,6 +84,11 @@ private Matcher(Pattern pattern) {
     reset(input);
   }
 
+  Matcher(Pattern pattern, MatcherInput input) {
+    this(pattern);
+    reset(input);
+  }
+
   /** Returns the {@code Pattern} associated with this {@code Matcher}. */
   public Pattern pattern() {
     return pattern;
@@ -94,7 +100,7 @@ public Pattern pattern() {
    * @return the {@code Matcher} itself, for chained method calls
    */
   public Matcher reset() {
-    inputLength = inputSequence.length();
+    inputLength = matcherInput.length();
     appendPos = 0;
     hasMatch = false;
     hasGroups = false;
@@ -108,10 +114,24 @@ public Matcher reset() {
    * @return the {@code Matcher} itself, for chained method calls
    */
   public Matcher reset(CharSequence input) {
+    return reset(MatcherInput.utf16(input));
+  }
+
+  /**
+   * Resets the {@code Matcher} and changes the input to the given UTF-8 bytes.
+   *
+   * @param bytes the UTF-8 encoded bytes of the new input; the array is wrapped, not copied
+   * @return the {@code Matcher} itself, for chained method calls
+   */
+  public Matcher reset(byte[] bytes) {
+    return reset(MatcherInput.utf8(bytes));
+  }
+
+  private Matcher reset(MatcherInput input) {
     if (input == null) {
       throw new NullPointerException("input is null");
     }
-    inputSequence = input;
+    matcherInput = input;
     reset();
     return this;
   }
@@ -261,7 +281,7 @@ private void loadGroup(int group) {
     }
 
     boolean ok =
-        pattern.re2().match(inputSequence, groups[0], end, anchorFlag, groups, 1 + groupCount);
+        pattern.re2().match(matcherInput, groups[0], end, anchorFlag, groups, 1 + groupCount);
     // Must match - hasMatch says that the last call with these
     // parameters worked just fine.
     if (!ok) {
@@ -328,7 +348,7 @@ public boolean find(int start) {
   private boolean genMatch(int startByte, int anchor) {
     // TODO(rsc): Is matches/lookingAt supposed to reset the append or input positions?
     // From the JDK docs, looks like no.
-    boolean ok = pattern.re2().match(inputSequence, startByte, inputLength, anchor, groups, 1);
+    boolean ok = pattern.re2().match(matcherInput, startByte, inputLength, anchor, groups, 1);
     if (!ok) {
       return false;
     }
@@ -341,8 +361,13 @@ private boolean genMatch(int startByte, int anchor) {
 
   /** Helper: return substring for [start, end). */
   String substring(int start, int end) {
+    // UTF_8 input is matched as raw bytes, so start/end are byte offsets: slice the bytes and
+    // decode them explicitly as UTF-8 (the charset-less constructor uses the platform default).
+    if (matcherInput.getEncoding() == Encoding.UTF_8) {
+      return new String(
+          matcherInput.asBytes(), start, end - start, java.nio.charset.Charset.forName("UTF-8"));
+    }
     // This is fast for both StringBuilder and String.
-    return inputSequence.subSequence(start, end).toString();
+    return matcherInput.asCharSequence().subSequence(start, end).toString();
   }
 
   /** Helper for Pattern: return input length. */
@@ -492,7 +517,7 @@ private void appendReplacementInternal(StringBuilder sb, String replacement) {
       }
     }
     if (last < m) {
-      sb.append(replacement.substring(last, m));
+      sb.append(replacement, last, m);
     }
   }
 

diff --git a/java/com/google/re2j/MatcherInput.java b/java/com/google/re2j/MatcherInput.java
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2021 The Go Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file.
+ */
+package com.google.re2j;
+
+import java.nio.charset.Charset;
+
+/**
+ * Abstracts the representation of the input text supplied to a {@code Matcher}: either a
+ * {@code CharSequence} (UTF-16) or a {@code byte[]} holding UTF-8 data.
+ */
+abstract class MatcherInput {
+
+  enum Encoding {
+    UTF_16,
+    UTF_8,
+  }
+
+  // Looked up once instead of on every conversion.
+  private static final Charset UTF8_CHARSET = Charset.forName("UTF-8");
+
+  /** Returns a UTF_16 MatcherInput backed by {@code charSequence}. */
+  static MatcherInput utf16(CharSequence charSequence) {
+    return new Utf16MatcherInput(charSequence);
+  }
+
+  /** Returns a UTF_8 MatcherInput backed by {@code bytes}; the array is not copied. */
+  static MatcherInput utf8(byte[] bytes) {
+    return new Utf8MatcherInput(bytes);
+  }
+
+  /** Returns a UTF_8 MatcherInput holding the UTF-8 encoding of {@code input}. */
+  static MatcherInput utf8(String input) {
+    return new Utf8MatcherInput(input.getBytes(UTF8_CHARSET));
+  }
+
+  /** Returns the encoding this input was supplied in. */
+  abstract Encoding getEncoding();
+
+  /** Returns the input as text; UTF_8 input is decoded into a new String on each call. */
+  abstract CharSequence asCharSequence();
+
+  /** Returns the input as bytes; UTF_8 input returns its backing array without copying. */
+  abstract byte[] asBytes();
+
+  /** Returns the input length: bytes for UTF_8 input, UTF-16 code units for UTF_16 input. */
+  abstract int length();
+
+  static class Utf8MatcherInput extends MatcherInput {
+    final byte[] bytes;
+
+    public Utf8MatcherInput(byte[] bytes) {
+      // Fail fast with the message Matcher.reset reports for a null input.
+      if (bytes == null) {
+        throw new NullPointerException("input is null");
+      }
+      this.bytes = bytes;
+    }
+
+    @Override
+    public Encoding getEncoding() {
+      return Encoding.UTF_8;
+    }
+
+    @Override
+    public CharSequence asCharSequence() {
+      // Decodes a new String on every call.
+      return new String(bytes, UTF8_CHARSET);
+    }
+
+    @Override
+    public byte[] asBytes() {
+      // Returns the backing array itself, not a copy.
+      return bytes;
+    }
+
+    @Override
+    public int length() {
+      return bytes.length;
+    }
+  }
+
+  static class Utf16MatcherInput extends MatcherInput {
+    final CharSequence charSequence;
+
+    public Utf16MatcherInput(CharSequence charSequence) {
+      // Fail fast with the message Matcher.reset reports for a null input.
+      if (charSequence == null) {
+        throw new NullPointerException("input is null");
+      }
+      this.charSequence = charSequence;
+    }
+
+    @Override
+    public Encoding getEncoding() {
+      return Encoding.UTF_16;
+    }
+
+    @Override
+    public CharSequence asCharSequence() {
+      return charSequence;
+    }
+
+    @Override
+    public byte[] asBytes() {
+      // NOTE(review): Java's "UTF-16" charset encodes big-endian and prepends a byte-order
+      // mark, so the result is not the raw UTF-16 code units — confirm callers expect that.
+      return charSequence.toString().getBytes(Charset.forName("UTF-16"));
+    }
+
+    @Override
+    public int length() {
+      return charSequence.length();
+    }
+  }
+}
diff --git a/java/com/google/re2j/Pattern.java b/java/com/google/re2j/Pattern.java
@@ -161,10 +161,20 @@ public static boolean matches(String regex, CharSequence input) {
     return compile(regex).matcher(input).matches();
   }
 
+  /** Like {@link #matches(String, CharSequence)}, but {@code input} holds UTF-8 encoded bytes. */
+  public static boolean matches(String regex, byte[] input) {
+    return compile(regex).matcher(input).matches();
+  }
+
   public boolean matches(String input) {
     return this.matcher(input).matches();
   }
 
+  /** Like {@link #matches(String)}, but {@code input} holds UTF-8 encoded bytes. */
+  public boolean matches(byte[] input) {
+    return this.matcher(input).matches();
+  }
+
   /**
    * Creates a new {@code Matcher} matching the pattern against the input.
    *
@@ -174,6 +182,16 @@ public Matcher matcher(CharSequence input) {
     return new Matcher(this, input);
   }
 
+  /** Creates a new {@code Matcher} matching the pattern against UTF-8 encoded {@code input}. */
+  public Matcher matcher(byte[] input) {
+    return new Matcher(this, MatcherInput.utf8(input));
+  }
+
+  // Package-private so tests can supply either MatcherInput encoding directly.
+  Matcher matcher(MatcherInput input) {
+    return new Matcher(this, input);
+  }
+
   /**
    * Splits input around instances of the regular expression. It returns an array giving the strings
    * that occur before, between, and after instances of the regular expression. Empty strings that

diff --git a/java/com/google/re2j/RE2.java b/java/com/google/re2j/RE2.java
@@ -20,6 +20,7 @@
 
 package com.google.re2j;
 
+import com.google.re2j.MatcherInput.Encoding;
 import java.io.UnsupportedEncodingException;
 import java.util.ArrayDeque;
 import java.util.ArrayList;
@@ -257,6 +258,10 @@ boolean match(CharSequence s) {
     return doExecute(MachineInput.fromUTF16(s), 0, UNANCHORED, 0) != null;
   }
 
+  boolean match(CharSequence input, int start, int end, int anchor, int[] group, int ngroup) {
+    return match(MatcherInput.utf16(input), start, end, anchor, group, ngroup);
+  }
+
   /**
    * Matches the regular expression against input starting at position start and ending at position
    * end, with the given anchoring. Records the submatch boundaries in group, which is [start, end)
@@ -271,7 +276,7 @@ boolean match(CharSequence s) {
    * @param ngroup the number of array pairs to fill in
    * @return true if a match was found
    */
-  boolean match(CharSequence input, int start, int end, int anchor, int[] group, int ngroup) {
+  boolean match(MatcherInput input, int start, int end, int anchor, int[] group, int ngroup) {
     if (start > end) {
       return false;
     }
@@ -282,7 +287,11 @@ boolean match(CharSequence input, int start, int end, int anchor, int[] group, i
     // In Russ' own words:
     // That is, I believe doExecute needs to know the bounds of the whole input
     // as well as the bounds of the subpiece that is being searched.
-    int[] groupMatch = doExecute(MachineInput.fromUTF16(input, 0, end), start, anchor, 2 * ngroup);
+    MachineInput machineInput =
+        input.getEncoding() == Encoding.UTF_16
+            ? MachineInput.fromUTF16(input.asCharSequence(), 0, end)
+            : MachineInput.fromUTF8(input.asBytes(), 0, end);
+    int[] groupMatch = doExecute(machineInput, start, anchor, 2 * ngroup);
 
     if (groupMatch == null) {
       return false;