From 7315f6d398d2d18443b26c513cbdcdbffaeebaa3 Mon Sep 17 00:00:00 2001
From: Protobuf Team Bot <protobuf-github-bot@google.com>
Date: Mon, 18 Dec 2023 17:56:29 -0800
Subject: [PATCH] lzcnt enabled varint size calculation for proto

The x86 microbenchmark speedups here are only representative for packed repeated primitive fields, as they benefit from some improved vectorization.

For normal cases like non-repeated fields, the util/coding microbenchmarks showed roughly 18% for arm and 2-3% for 32 bit x86, 10% for 64 bit x86 (including loop overhead)

PiperOrigin-RevId: 592061508
---
 src/google/protobuf/io/coded_stream.h | 67 ++++++++++++++++++++-------
 1 file changed, 49 insertions(+), 18 deletions(-)

diff --git a/src/google/protobuf/io/coded_stream.h b/src/google/protobuf/io/coded_stream.h
index 97d62e49001b..7e4f6a57e81e 100644
--- a/src/google/protobuf/io/coded_stream.h
+++ b/src/google/protobuf/io/coded_stream.h
@@ -1704,38 +1704,68 @@ inline uint8_t* CodedOutputStream::WriteTagToArray(uint32_t value,
   return WriteVarint32ToArray(value, target);
 }
 
+#if (defined(__x86__) || defined(__x86_64__) || defined(_M_IX86) || \
+     defined(_M_X64)) &&                                            \
+    !(defined(__LZCNT__) || defined(__AVX2__))
+// X86 CPUs lacking the lzcnt instruction are faster with the bsr-based
+// implementation. MSVC does not define __LZCNT__, the nearest option that
+// it interprets as lzcnt availability is __AVX2__.
+#define PROTOBUF_CODED_STREAM_H_PREFER_BSR 1
+#else
+#define PROTOBUF_CODED_STREAM_H_PREFER_BSR 0
+#endif
 inline size_t CodedOutputStream::VarintSize32(uint32_t value) {
-  // This computes value == 0 ? 1 : floor(log2(value)) / 7 + 1
-  // Use an explicit multiplication to implement the divide of
-  // a number in the 1..31 range.
-  //
+#if PROTOBUF_CODED_STREAM_H_PREFER_BSR
   // Explicit OR 0x1 to avoid calling absl::countl_zero(0), which
-  // requires a branch to check for on many platforms.
-  uint32_t log2value = 31 - absl::countl_zero(value | 0x1);
-  return static_cast<size_t>((log2value * 9 + 73) / 64);
+  // requires a branch to check for on platforms without a clz instruction.
+  uint32_t log2value = (std::numeric_limits<uint32_t>::digits - 1) -
+                       absl::countl_zero(value | 0x1);
+  return static_cast<size_t>((log2value * 9 + (64 + 9)) / 64);
+#else
+  uint32_t clz = absl::countl_zero(value);
+  return static_cast<size_t>(
+      ((std::numeric_limits<uint32_t>::digits * 9 + 64) - (clz * 9)) / 64);
+#endif
 }
 
 inline size_t CodedOutputStream::VarintSize32PlusOne(uint32_t value) {
   // Same as above, but one more.
-  uint32_t log2value = 31 - absl::countl_zero(value | 0x1);
-  return static_cast<size_t>((log2value * 9 + 73 + 64) / 64);
+#if PROTOBUF_CODED_STREAM_H_PREFER_BSR
+  uint32_t log2value = (std::numeric_limits<uint32_t>::digits - 1) -
+                       absl::countl_zero(value | 0x1);
+  return static_cast<size_t>((log2value * 9 + (64 + 9) + 64) / 64);
+#else
+  uint32_t clz = absl::countl_zero(value);
+  return static_cast<size_t>(
+      ((std::numeric_limits<uint32_t>::digits * 9 + 64 + 64) - (clz * 9)) / 64);
+#endif
 }
 
 inline size_t CodedOutputStream::VarintSize64(uint64_t value) {
-  // This computes value == 0 ? 1 : floor(log2(value)) / 7 + 1
-  // Use an explicit multiplication to implement the divide of
-  // a number in the 1..63 range.
-  //
+#if PROTOBUF_CODED_STREAM_H_PREFER_BSR
   // Explicit OR 0x1 to avoid calling absl::countl_zero(0), which
-  // requires a branch to check for on many platforms.
-  uint32_t log2value = 63 - absl::countl_zero(value | 0x1);
-  return static_cast<size_t>((log2value * 9 + 73) / 64);
+  // requires a branch to check for on platforms without a clz instruction.
+  uint32_t log2value = (std::numeric_limits<uint64_t>::digits - 1) -
+                       absl::countl_zero(value | 0x1);
+  return static_cast<size_t>((log2value * 9 + (64 + 9)) / 64);
+#else
+  uint32_t clz = absl::countl_zero(value);
+  return static_cast<size_t>(
+      ((std::numeric_limits<uint64_t>::digits * 9 + 64) - (clz * 9)) / 64);
+#endif
 }
 
 inline size_t CodedOutputStream::VarintSize64PlusOne(uint64_t value) {
   // Same as above, but one more.
-  uint32_t log2value = 63 - absl::countl_zero(value | 0x1);
-  return static_cast<size_t>((log2value * 9 + 73 + 64) / 64);
+#if PROTOBUF_CODED_STREAM_H_PREFER_BSR
+  uint32_t log2value = (std::numeric_limits<uint64_t>::digits - 1) -
+                       absl::countl_zero(value | 0x1);
+  return static_cast<size_t>((log2value * 9 + (64 + 9) + 64) / 64);
+#else
+  uint32_t clz = absl::countl_zero(value);
+  return static_cast<size_t>(
+      ((std::numeric_limits<uint64_t>::digits * 9 + 64 + 64) - (clz * 9)) / 64);
+#endif
 }
 
 inline size_t CodedOutputStream::VarintSize32SignExtended(int32_t value) {
@@ -1746,6 +1776,7 @@ inline size_t CodedOutputStream::VarintSize32SignExtendedPlusOne(
     int32_t value) {
   return VarintSize64PlusOne(static_cast<uint64_t>(int64_t{value}));
 }
+#undef PROTOBUF_CODED_STREAM_H_PREFER_BSR
 
 inline void CodedOutputStream::WriteString(const std::string& str) {
   WriteRaw(str.data(), static_cast<int>(str.size()));