Skip to content

Commit

Permalink
Support matching binary utf8 string
Browse files Browse the repository at this point in the history
Summary:
1. Introduce MatcherInput to represent both utf16 and utf8.
2. Reuse existing tests via ApiTestUtils so that they run against both MatcherInput encodings.
  • Loading branch information
jc4x4 authored and sjamesr committed Mar 8, 2021
1 parent 2986a5d commit c4e8120
Show file tree
Hide file tree
Showing 7 changed files with 238 additions and 41 deletions.
39 changes: 32 additions & 7 deletions java/com/google/re2j/Matcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
*/
package com.google.re2j;

import com.google.re2j.MatcherInput.Encoding;
import java.util.Map;

/**
Expand Down Expand Up @@ -48,7 +49,7 @@ public final class Matcher {
// The number of submatches (groups) in the pattern.
private final int groupCount;

private CharSequence inputSequence;
private MatcherInput matcherInput;

// The input length: UTF-16 code units for CharSequence input, bytes for UTF-8 input.
private int inputLength;
Expand Down Expand Up @@ -83,6 +84,11 @@ private Matcher(Pattern pattern) {
reset(input);
}

// Package-private constructor for an already-wrapped MatcherInput: runs the
// Pattern-only constructor, then installs the input through reset(input).
Matcher(Pattern pattern, MatcherInput input) {
this(pattern);
reset(input);
}

/** Returns the {@code Pattern} associated with this {@code Matcher}. */
public Pattern pattern() {
return pattern;
Expand All @@ -94,7 +100,7 @@ public Pattern pattern() {
* @return the {@code Matcher} itself, for chained method calls
*/
public Matcher reset() {
inputLength = inputSequence.length();
inputLength = matcherInput.length();
appendPos = 0;
hasMatch = false;
hasGroups = false;
Expand All @@ -108,10 +114,24 @@ public Matcher reset() {
* @return the {@code Matcher} itself, for chained method calls
*/
public Matcher reset(CharSequence input) {
// Wrap the sequence as UTF-16 input and delegate to reset(MatcherInput).
return reset(MatcherInput.utf16(input));
}

/**
* Resets the {@code Matcher} and changes the input to a UTF-8 encoded byte array.
*
* <p>The array is wrapped with {@code MatcherInput.utf8(bytes)} (without copying) and handed to
* {@code reset(MatcherInput)}.
*
* @param bytes UTF-8 encoded bytes of the input string
* @return the {@code Matcher} itself, for chained method calls
*/
public Matcher reset(byte[] bytes) {
return reset(MatcherInput.utf8(bytes));
}

// Shared implementation behind the public reset overloads: rejects a null
// input, stores the new input, then calls reset() to refresh the cached input
// length and clear the append position and match flags.
private Matcher reset(MatcherInput input) {
if (input == null) {
throw new NullPointerException("input is null");
}
inputSequence = input;
matcherInput = input;
reset();
return this;
}
Expand Down Expand Up @@ -261,7 +281,7 @@ private void loadGroup(int group) {
}

boolean ok =
pattern.re2().match(inputSequence, groups[0], end, anchorFlag, groups, 1 + groupCount);
pattern.re2().match(matcherInput, groups[0], end, anchorFlag, groups, 1 + groupCount);
// Must match - hasMatch says that the last call with these
// parameters worked just fine.
if (!ok) {
Expand Down Expand Up @@ -328,7 +348,7 @@ public boolean find(int start) {
private boolean genMatch(int startByte, int anchor) {
// TODO(rsc): Is matches/lookingAt supposed to reset the append or input positions?
// From the JDK docs, looks like no.
boolean ok = pattern.re2().match(inputSequence, startByte, inputLength, anchor, groups, 1);
boolean ok = pattern.re2().match(matcherInput, startByte, inputLength, anchor, groups, 1);
if (!ok) {
return false;
}
Expand All @@ -341,8 +361,13 @@ private boolean genMatch(int startByte, int anchor) {

/** Helper: return substring for [start, end). */
String substring(int start, int end) {
// UTF_8 is matched in binary mode. So slice the bytes.
if (matcherInput.getEncoding() == Encoding.UTF_8) {
return new String(matcherInput.asBytes(), start, end - start);

This comment has been minimized.

Copy link
@herbyderby

herbyderby Oct 14, 2023

Collaborator

This needs an explicit charset (e.g. add a "UTF-8" argument to end of method call). The platform default charset may not be UTF-8.

}

// This is fast for both StringBuilder and String.
return inputSequence.subSequence(start, end).toString();
return matcherInput.asCharSequence().subSequence(start, end).toString();
}

/** Helper for Pattern: return input length. */
Expand Down Expand Up @@ -492,7 +517,7 @@ private void appendReplacementInternal(StringBuilder sb, String replacement) {
}
}
if (last < m) {
sb.append(replacement.substring(last, m));
sb.append(replacement, last, m);
}
}

Expand Down
105 changes: 105 additions & 0 deletions java/com/google/re2j/MatcherInput.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
/*
* Copyright (c) 2021 The Go Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style
* license that can be found in the LICENSE file.
*/
package com.google.re2j;

import java.nio.charset.Charset;

/**
 * Abstracts the representations of input text supplied to Matcher.
 *
 * <p>An input is either a {@link CharSequence} (tagged {@link Encoding#UTF_16}) or a byte array
 * (tagged {@link Encoding#UTF_8}). {@link #length()} is measured in the units of the underlying
 * representation: chars for UTF-16 input, bytes for UTF-8 input.
 */
abstract class MatcherInput {

  /** Which representation a {@link MatcherInput} wraps. */
  enum Encoding {
    UTF_16,
    UTF_8,
  }

  // Looked up once rather than on every conversion.
  private static final Charset UTF_8_CHARSET = Charset.forName("UTF-8");
  private static final Charset UTF_16_CHARSET = Charset.forName("UTF-16");

  /**
   * Return the MatcherInput for UTF_16 encoding.
   *
   * @param charSequence the text to wrap; it is stored as-is, not copied
   * @throws NullPointerException if {@code charSequence} is null
   */
  static MatcherInput utf16(CharSequence charSequence) {
    // Fail here with a clear message instead of later inside length().
    if (charSequence == null) {
      throw new NullPointerException("input is null");
    }
    return new Utf16MatcherInput(charSequence);
  }

  /**
   * Return the MatcherInput for UTF_8 encoding.
   *
   * @param bytes the UTF-8 bytes to wrap; the array is stored as-is, not copied
   * @throws NullPointerException if {@code bytes} is null
   */
  static MatcherInput utf8(byte[] bytes) {
    // Fail here with a clear message instead of later inside length().
    if (bytes == null) {
      throw new NullPointerException("input is null");
    }
    return new Utf8MatcherInput(bytes);
  }

  /**
   * Return the MatcherInput for UTF_8 encoding, encoding {@code input} to UTF-8 bytes first.
   *
   * @param input the text to encode and wrap
   * @throws NullPointerException if {@code input} is null
   */
  static MatcherInput utf8(String input) {
    return new Utf8MatcherInput(input.getBytes(UTF_8_CHARSET));
  }

  /** Returns the encoding tag of the wrapped representation. */
  abstract Encoding getEncoding();

  /** Returns the input as a {@link CharSequence}. */
  abstract CharSequence asCharSequence();

  /** Returns the input as a byte array. */
  abstract byte[] asBytes();

  /** Returns the input length in units of the wrapped representation. */
  abstract int length();

  /** Input backed by a byte array that is treated as UTF-8. */
  static class Utf8MatcherInput extends MatcherInput {
    final byte[] bytes;

    public Utf8MatcherInput(byte[] bytes) {
      this.bytes = bytes;
    }

    @Override
    public Encoding getEncoding() {
      return Encoding.UTF_8;
    }

    /** Decodes the whole array as UTF-8 into a new String on every call. */
    @Override
    public CharSequence asCharSequence() {
      return new String(bytes, UTF_8_CHARSET);
    }

    /** Returns the wrapped array itself (no defensive copy). */
    @Override
    public byte[] asBytes() {
      return bytes;
    }

    /** Returns the number of bytes. */
    @Override
    public int length() {
      return bytes.length;
    }
  }

  /** Input backed by a {@link CharSequence}. */
  static class Utf16MatcherInput extends MatcherInput {
    final CharSequence charSequence;

    public Utf16MatcherInput(CharSequence charSequence) {
      this.charSequence = charSequence;
    }

    @Override
    public Encoding getEncoding() {
      return Encoding.UTF_16;
    }

    /** Returns the wrapped sequence itself. */
    @Override
    public CharSequence asCharSequence() {
      return charSequence;
    }

    /**
     * Encodes the sequence with the JDK "UTF-16" charset on every call.
     *
     * <p>NOTE(review): per the JDK charset documentation this encoder writes a byte-order mark
     * followed by big-endian code units; confirm that callers expect the leading BOM.
     */
    @Override
    public byte[] asBytes() {
      return charSequence.toString().getBytes(UTF_16_CHARSET);
    }

    /** Returns the number of chars in the sequence. */
    @Override
    public int length() {
      return charSequence.length();
    }
  }
}
17 changes: 17 additions & 0 deletions java/com/google/re2j/Pattern.java
Original file line number Diff line number Diff line change
Expand Up @@ -161,10 +161,18 @@ public static boolean matches(String regex, CharSequence input) {
return compile(regex).matcher(input).matches();
}

/**
* Compiles {@code regex} and matches it against {@code input} through {@code matcher(byte[])}.
*
* @param regex the regular expression to compile
* @param input the input bytes; {@code matcher(byte[])} wraps them as UTF-8 input
* @return the result of {@code Matcher.matches()} for this input
*/
public static boolean matches(String regex, byte[] input) {
return compile(regex).matcher(input).matches();
}

/**
* Matches this pattern against {@code input} by creating a {@code Matcher} for it.
*
* @param input the input string
* @return the result of {@code Matcher.matches()} for this input
*/
public boolean matches(String input) {
return this.matcher(input).matches();
}

/**
* Matches this pattern against {@code input} by creating a {@code Matcher} for it.
*
* @param input the input bytes; {@code matcher(byte[])} wraps them as UTF-8 input
* @return the result of {@code Matcher.matches()} for this input
*/
public boolean matches(byte[] input) {
return this.matcher(input).matches();
}

/**
* Creates a new {@code Matcher} matching the pattern against the input.
*
Expand All @@ -174,6 +182,15 @@ public Matcher matcher(CharSequence input) {
return new Matcher(this, input);
}

/**
* Creates a new {@code Matcher} matching the pattern against the given bytes.
*
* @param input the input bytes, wrapped with {@code MatcherInput.utf8}
* @return a new {@code Matcher} for this pattern and input
*/
public Matcher matcher(byte[] input) {
return new Matcher(this, MatcherInput.utf8(input));
}

// This is visible for testing.
// Creates a Matcher directly from an already-wrapped MatcherInput, so the
// caller chooses the input representation.
Matcher matcher(MatcherInput input) {
return new Matcher(this, input);
}

/**
* Splits input around instances of the regular expression. It returns an array giving the strings
* that occur before, between, and after instances of the regular expression. Empty strings that
Expand Down
13 changes: 11 additions & 2 deletions java/com/google/re2j/RE2.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

package com.google.re2j;

import com.google.re2j.MatcherInput.Encoding;
import java.io.UnsupportedEncodingException;
import java.util.ArrayDeque;
import java.util.ArrayList;
Expand Down Expand Up @@ -257,6 +258,10 @@ boolean match(CharSequence s) {
return doExecute(MachineInput.fromUTF16(s), 0, UNANCHORED, 0) != null;
}

// Convenience overload for CharSequence callers: wraps the input as UTF-16
// MatcherInput and forwards every other argument unchanged to
// match(MatcherInput, ...).
boolean match(CharSequence input, int start, int end, int anchor, int[] group, int ngroup) {
return match(MatcherInput.utf16(input), start, end, anchor, group, ngroup);
}

/**
* Matches the regular expression against input starting at position start and ending at position
* end, with the given anchoring. Records the submatch boundaries in group, which is [start, end)
Expand All @@ -271,7 +276,7 @@ boolean match(CharSequence s) {
* @param ngroup the number of array pairs to fill in
* @return true if a match was found
*/
boolean match(CharSequence input, int start, int end, int anchor, int[] group, int ngroup) {
boolean match(MatcherInput input, int start, int end, int anchor, int[] group, int ngroup) {
if (start > end) {
return false;
}
Expand All @@ -282,7 +287,11 @@ boolean match(CharSequence input, int start, int end, int anchor, int[] group, i
// In Russ' own words:
// That is, I believe doExecute needs to know the bounds of the whole input
// as well as the bounds of the subpiece that is being searched.
int[] groupMatch = doExecute(MachineInput.fromUTF16(input, 0, end), start, anchor, 2 * ngroup);
MachineInput machineInput =
input.getEncoding() == Encoding.UTF_16
? MachineInput.fromUTF16(input.asCharSequence(), 0, end)
: MachineInput.fromUTF8(input.asBytes(), 0, end);
int[] groupMatch = doExecute(machineInput, start, anchor, 2 * ngroup);

if (groupMatch == null) {
return false;
Expand Down
Loading

0 comments on commit c4e8120

Please sign in to comment.