Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HIVE-27370: support 4 bytes characters #5624

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
173 changes: 140 additions & 33 deletions ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
Original file line number Diff line number Diff line change
Expand Up @@ -96,53 +96,131 @@ private Text evaluateInternal(Text t, int pos, int len) {
return r;
}

String s = t.toString();
int[] index = makeIndex(pos, len, s.length());
if (index == null) {
byte[] utf8String = t.toString().getBytes();
populateSubstrOffsets(utf8String, pos, len);
if (index[0] == -1) {
return r;
}

r.set(s.substring(index[0], index[1]));
r.set(new String(utf8String, index[0], index[1]));
return r;
}

private int[] makeIndex(int pos, int len, int inputLen) {
if ((Math.abs(pos) > inputLen)) {
return null;
/**
 * Writes into the result object {@code r} the part of {@code t} that starts at character
 * position {@code pos} and runs to the end of the string.
 *
 * <p>The position is resolved by {@code getSubstrStartOffset}: positive values are 1-based,
 * {@code 0} is treated like {@code 1}, and negative values count back from the end.
 *
 * @param t   the input text
 * @param pos the requested start position
 * @return {@code r}; it is left empty when {@code getSubstrStartOffset} returns {@code -1}
 */
private Text evaluateInternal(Text t, int pos) {
  r.clear();

  // getSubstrStartOffset finds character boundaries by looking at UTF-8 lead/continuation
  // bytes, so the bytes must be UTF-8 regardless of the JVM's default charset.
  byte[] utf8String = t.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8);
  int offset = getSubstrStartOffset(utf8String, pos);
  if (offset == -1) {
    return r;
  }

  // Decode with the same charset that produced the bytes.
  r.set(new String(utf8String, offset, utf8String.length - offset,
      java.nio.charset.StandardCharsets.UTF_8));
  return r;
}

private void populateSubstrOffsets(byte[] utf8String, int start, int len) {
int curIdx = -1;
index[0] = -1;
index[1] = -1;
int end = utf8String.length;

if (start > 0) {
start = start - 1;
} else if (start < 0) {
int length = 0;
for (int i = 0; i != end; ++i) {
if ((utf8String[i] & 0xc0) != 0x80) {
++length;
}
}

if (-start > length) {
return;
}

start = length + start;
}

if (len == 0) {
return;
} else if (len > end) {
len = end;
}

if (pos > 0) {
start = pos - 1;
} else if (pos < 0) {
start = inputLen + pos;
} else {
start = 0;
int endIdx = start + len - 1;
for (int i = 0; i != end; ++i) {
if ((utf8String[i] & 0xc0) != 0x80) {
++curIdx;
if (curIdx == start) {
index[0] = i;
} else if (curIdx - 1 == endIdx) {
index[1] = i - index[0];
}
}
}

if ((inputLen - start) < len) {
end = inputLen;
} else {
end = start + len;
if (index[1] == -1) {
index[1] = end - index[0];
}
index[0] = start;
index[1] = end;
return index;
}

private final IntWritable maxValue = new IntWritable(Integer.MAX_VALUE);
private int getSubstrStartOffset(byte[] utf8String, int start) {
int end = utf8String.length;

if (start >= 1) {
start = start - 1;
}
if (start < 0) {
int length = 0;
for (int i = 0; i != end; ++i) {
if ((utf8String[i] & 0xc0) != 0x80) {
++length;
}
}

if (-start > length) {
return -1;
}

// Even though we are using longs, substr can only deal with ints, so we use
// the maximum int value as the maxValue
private final LongWritable maxLongValue = new LongWritable(Integer.MAX_VALUE);
start = length + start;
}

int curIdx = -1;
for (int i = 0; i != end; ++i) {
if ((utf8String[i] & 0xc0) != 0x80) {
++curIdx;
if (curIdx == start) {
return i;
}
}
}

return -1;
}

public Text evaluate(Text s, IntWritable pos) {
return evaluate(s, pos, maxValue);
if ((s == null) || (pos == null)) {
return null;
}

return evaluateInternal(s, pos.get());
}

public Text evaluate(Text s, LongWritable pos) {
return evaluate(s, pos, maxLongValue);
if ((s == null) || (pos == null)) {
return null;
}

long longPos = pos.get();
// If an unsupported value is seen, we don't want to return a string
// that doesn't match what the user expects, so we return NULL (still
// unexpected, of course, but probably better than a bad string).
if (longPos > Integer.MAX_VALUE || longPos < Integer.MIN_VALUE) {
return null;
}

return evaluateInternal(s, (int) pos.get());
}

public BytesWritable evaluate(BytesWritable bw, LongWritable pos, LongWritable len) {
Expand Down Expand Up @@ -172,32 +250,61 @@ public BytesWritable evaluate(BytesWritable bw, IntWritable pos, IntWritable len
}

private BytesWritable evaluateInternal(BytesWritable bw, int pos, int len) {

if (len <= 0) {
return new BytesWritable();
}

int[] index = makeIndex(pos, len, bw.getLength());
if (index == null) {
byte[] b = Arrays.copyOf(bw.getBytes(), bw.getLength());
populateSubstrOffsets(b, pos, len);
if (index[0] == -1) {
return new BytesWritable();
}

return new BytesWritable(Arrays.copyOfRange(bw.getBytes(), index[0], index[1]));
return new BytesWritable(arrayCopy(b, index[0], index[1]));
}

/**
 * Builds a new {@code BytesWritable} holding the tail of {@code bw}'s valid bytes, beginning
 * at the byte offset that {@code getSubstrStartOffset} resolves for {@code pos}.
 *
 * @param bw  the input bytes; only the first {@code bw.getLength()} bytes are considered
 * @param pos the requested start position
 * @return the tail of the input, or an empty {@code BytesWritable} when
 *         {@code getSubstrStartOffset} returns {@code -1}
 */
private BytesWritable evaluateInternal(BytesWritable bw, int pos) {
  final int validLength = bw.getLength();
  // Trim the backing array to its valid length before scanning it.
  final byte[] payload = Arrays.copyOf(bw.getBytes(), validLength);

  final int startOffset = getSubstrStartOffset(payload, pos);
  if (startOffset != -1) {
    final int remaining = validLength - startOffset;
    return new BytesWritable(arrayCopy(payload, startOffset, remaining));
  }

  return new BytesWritable();
}

public BytesWritable evaluate(BytesWritable bw, IntWritable pos){
return evaluate(bw, pos, maxValue);
if ((bw == null) || (pos == null)) {
return null;
}
return evaluateInternal(bw, pos.get());
}

/**
 * Returns the part of {@code bw} that starts at position {@code pos} and runs to the end,
 * delegating to {@code evaluateInternal(BytesWritable, int)}.
 *
 * @param bw  the input bytes; may be {@code null}
 * @param pos the requested start position; may be {@code null}
 * @return {@code null} when either argument is {@code null} or when {@code pos} does not fit
 *         in an {@code int}; otherwise the result of the {@code int}-based implementation
 */
public BytesWritable evaluate(BytesWritable bw, LongWritable pos){
  if ((bw == null) || (pos == null)) {
    return null;
  }

  long longPos = pos.get();
  // A position outside the int range cannot be honoured; narrowing it would silently wrap
  // to an unrelated position, so return NULL, as the Text/LongWritable overload does.
  if (longPos > Integer.MAX_VALUE || longPos < Integer.MIN_VALUE) {
    return null;
  }

  return evaluateInternal(bw, (int) longPos);
}

/**
 * Supplies the statistics estimator for this class.
 *
 * @return a newly created {@code SubStrStatEstimator} on every call
 */
@Override
public StatEstimator getStatEstimator() {
return new SubStrStatEstimator();
}

/**
 * Copies {@code len} bytes of {@code src}, starting at index {@code pos}, into a new array.
 *
 * @param src the source bytes
 * @param pos index in {@code src} of the first byte to copy
 * @param len number of bytes to copy
 * @return a newly allocated array of length {@code len} holding the copied bytes
 */
private byte[] arrayCopy(byte[] src, int pos, int len) {
  byte[] b = new byte[len];
  // Bulk copy via the standard library instead of a byte-by-byte loop.
  System.arraycopy(src, pos, b, 0, len);
  return b;
}

private static class SubStrStatEstimator implements StatEstimator {

@Override
Expand Down
8 changes: 8 additions & 0 deletions ql/src/test/queries/clientpositive/udf_substr.q
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,11 @@ FROM src tablesample (1 rows);
SELECT
substr('ABC', cast(2147483649 as bigint))
FROM src tablesample (1 rows);

--test 4-byte character
set hive.vectorized.execution.enabled=false;
SELECT
substr('あa🤎いiうu', 1, 3) as b1,
substr('あa🤎いiうu', 3) as b2,
substr('あa🤎いiうu', -5) as b3
FROM src tablesample (1 rows);
17 changes: 17 additions & 0 deletions ql/src/test/results/clientpositive/llap/udf_substr.q.out
Original file line number Diff line number Diff line change
Expand Up @@ -240,3 +240,20 @@ POSTHOOK: type: QUERY
POSTHOOK: Input: default@src
#### A masked pattern was here ####
NULL
PREHOOK: query: SELECT
substr('あa🤎いiうu', 1, 3) as b1,
substr('あa🤎いiうu', 3) as b2,
substr('あa🤎いiうu', -5) as b3
FROM src tablesample (1 rows)
PREHOOK: type: QUERY
PREHOOK: Input: default@src
#### A masked pattern was here ####
POSTHOOK: query: SELECT
substr('あa🤎いiうu', 1, 3) as b1,
substr('あa🤎いiうu', 3) as b2,
substr('あa🤎いiうu', -5) as b3
FROM src tablesample (1 rows)
POSTHOOK: type: QUERY
POSTHOOK: Input: default@src
#### A masked pattern was here ####
あa🤎 🤎いiうu 🤎いiうu
Loading