From d9c1671dba2f9258740a603d737b345c88f03f72 Mon Sep 17 00:00:00 2001
From: David <dmollitor@apache.org>
Date: Fri, 4 Oct 2024 10:07:08 -0400
Subject: [PATCH 1/2] AVRO-4074: Optimization for Serializing ASCII Strings

---
 .../org/apache/avro/io/BinaryEncoder.java     | 40 +++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/lang/java/avro/src/main/java/org/apache/avro/io/BinaryEncoder.java b/lang/java/avro/src/main/java/org/apache/avro/io/BinaryEncoder.java
index aacb83b88f4..179e3bc54c5 100644
--- a/lang/java/avro/src/main/java/org/apache/avro/io/BinaryEncoder.java
+++ b/lang/java/avro/src/main/java/org/apache/avro/io/BinaryEncoder.java
@@ -37,6 +37,9 @@
  */
 public abstract class BinaryEncoder extends Encoder {
 
+  // Buffer used for writing ASCII strings
+  private final byte[] stringBuffer = new byte[128];
+
   @Override
   public void writeNull() throws IOException {
   }
@@ -48,10 +51,47 @@ public void writeString(Utf8 utf8) throws IOException {
 
   @Override
   public void writeString(String string) throws IOException {
+    /* empty string short-circuit */
     if (string.isEmpty()) {
       writeZero();
       return;
     }
+
+    /*
+     * Assume the String is ASCII. If the ASCII String fits into the existing
+     * buffer, copy the characters into the buffer and write it to the underlying
+     * Encoder. If the String is too long, or ends up not being ASCII, then
+     * fall back to the default JDK mechanism for handling String to byte array.
+     */
+    final int stringLength = string.length();
+    if (stringLength <= stringBuffer.length) {
+      boolean onlyAscii = true;
+      for (int i = 0; onlyAscii && (i < stringLength); i++) {
+        /*
+         * The char data type is a single 16-bit Unicode character (UTF-16). ASCII, is a
+         * 7-bit character encoding. Therefore, if the 8-bit is set than it cannot be
+         * ASCII. If it is ASCII, it is safe to trim to byte.
+         */
+        final char c = string.charAt(i);
+        if (c >= 0x80) {
+          onlyAscii = false;
+        } else {
+          stringBuffer[i] = (byte) c;
+        }
+      }
+      if (onlyAscii) {
+        writeInt(stringLength);
+        writeFixed(stringBuffer, 0, stringLength);
+        return;
+      }
+    }
+
+    /*
+     * The standard JDK way of turning Strings into byte arrays. Handles UTF-16
+     * case. However, for ASCII this has the overhead of instantiating a new byte
+     * array (which pollutes the heap), and then copying the underlying bytes into
+     * the array.
+     */
     byte[] bytes = string.getBytes(StandardCharsets.UTF_8);
     writeInt(bytes.length);
     writeFixed(bytes, 0, bytes.length);

From 1671a345837ace5de7bff27f4bd220de799da977 Mon Sep 17 00:00:00 2001
From: David Mollitor <dmollitor@apache.org>
Date: Sun, 6 Oct 2024 13:35:55 -0400
Subject: [PATCH 2/2] Update comment regarding ASCII

---
 .../avro/src/main/java/org/apache/avro/io/BinaryEncoder.java  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lang/java/avro/src/main/java/org/apache/avro/io/BinaryEncoder.java b/lang/java/avro/src/main/java/org/apache/avro/io/BinaryEncoder.java
index 179e3bc54c5..f8f9802edb7 100644
--- a/lang/java/avro/src/main/java/org/apache/avro/io/BinaryEncoder.java
+++ b/lang/java/avro/src/main/java/org/apache/avro/io/BinaryEncoder.java
@@ -69,8 +69,8 @@ public void writeString(String string) throws IOException {
       for (int i = 0; onlyAscii && (i < stringLength); i++) {
         /*
          * The char data type is a single 16-bit Unicode character (UTF-16). ASCII, is a
-         * 7-bit character encoding. Therefore, if the 8-bit is set than it cannot be
-         * ASCII. If it is ASCII, it is safe to trim to byte.
+         * 7-bit character encoding. Therefore, if the value is larger than 127, it
+         * cannot be ASCII. If it is ASCII, it is safe to trim to byte.
          */
         final char c = string.charAt(i);
         if (c >= 0x80) {