From d9c1671dba2f9258740a603d737b345c88f03f72 Mon Sep 17 00:00:00 2001 From: David Date: Fri, 4 Oct 2024 10:07:08 -0400 Subject: [PATCH 1/2] AVRO-4074: Optimization for Serializing ASCII Strings --- .../org/apache/avro/io/BinaryEncoder.java | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/lang/java/avro/src/main/java/org/apache/avro/io/BinaryEncoder.java b/lang/java/avro/src/main/java/org/apache/avro/io/BinaryEncoder.java index aacb83b88f4..179e3bc54c5 100644 --- a/lang/java/avro/src/main/java/org/apache/avro/io/BinaryEncoder.java +++ b/lang/java/avro/src/main/java/org/apache/avro/io/BinaryEncoder.java @@ -37,6 +37,9 @@ */ public abstract class BinaryEncoder extends Encoder { + // Buffer used for writing ASCII strings + private final byte[] stringBuffer = new byte[128]; + @Override public void writeNull() throws IOException { } @@ -48,10 +51,47 @@ public void writeString(Utf8 utf8) throws IOException { @Override public void writeString(String string) throws IOException { + /* empty string short-circuit */ if (string.isEmpty()) { writeZero(); return; } + + /* + * Assume the String is ASCII. If the ASCII String fits into the existing + * buffer, copy the characters into the buffer and write it to the underlying + * Encoder. If the String is too long, or ends up not being ASCII, then + * fall back to the default JDK mechanism for handling String to byte array. + */ + final int stringLength = string.length(); + if (stringLength <= stringBuffer.length) { + boolean onlyAscii = true; + for (int i = 0; onlyAscii && (i < stringLength); i++) { + /* + * The char data type is a single 16-bit Unicode character (UTF-16). ASCII, is a + * 7-bit character encoding. Therefore, if the 8-bit is set than it cannot be + * ASCII. If it is ASCII, it is safe to trim to byte. 
+ */ + final char c = string.charAt(i); + if (c >= 0x80) { + onlyAscii = false; + } else { + stringBuffer[i] = (byte) c; + } + } + if (onlyAscii) { + writeInt(stringLength); + writeFixed(stringBuffer, 0, stringLength); + return; + } + } + + /* + * The standard JDK way of turning Strings into byte arrays. Handles UTF-16 + * case. However, for ASCII this has the overhead of instantiating a new byte + * array (which pollutes the heap), and then copying the underlying bytes into + * the array. + */ byte[] bytes = string.getBytes(StandardCharsets.UTF_8); writeInt(bytes.length); writeFixed(bytes, 0, bytes.length); From 1671a345837ace5de7bff27f4bd220de799da977 Mon Sep 17 00:00:00 2001 From: David Mollitor Date: Sun, 6 Oct 2024 13:35:55 -0400 Subject: [PATCH 2/2] Update comment regarding ASCII --- .../avro/src/main/java/org/apache/avro/io/BinaryEncoder.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lang/java/avro/src/main/java/org/apache/avro/io/BinaryEncoder.java b/lang/java/avro/src/main/java/org/apache/avro/io/BinaryEncoder.java index 179e3bc54c5..f8f9802edb7 100644 --- a/lang/java/avro/src/main/java/org/apache/avro/io/BinaryEncoder.java +++ b/lang/java/avro/src/main/java/org/apache/avro/io/BinaryEncoder.java @@ -69,8 +69,8 @@ public void writeString(String string) throws IOException { for (int i = 0; onlyAscii && (i < stringLength); i++) { /* * The char data type is a single 16-bit Unicode character (UTF-16). ASCII, is a - * 7-bit character encoding. Therefore, if the 8-bit is set than it cannot be - * ASCII. If it is ASCII, it is safe to trim to byte. + * 7-bit character encoding. Therefore, if the value is larger than 127, it + * cannot be ASCII. If it is ASCII, it is safe to trim to byte. */ final char c = string.charAt(i); if (c >= 0x80) {