From 2524ada63fa1098b19b86c84d1eebd74de0df424 Mon Sep 17 00:00:00 2001
From: Sergey B Kirpichev <skirpichev@gmail.com>
Date: Mon, 23 Dec 2024 05:58:27 +0300
Subject: [PATCH] Optimize to_binary() function and to_bytes() method

It seems that using mpn_get_str() is more efficient than the generic
mpz_export().

Some benchmarks are here:
https://github.com/aleaxit/gmpy/issues/404#issuecomment-2526603947

Not sure what else we can do for #404.  In python-gmp I've also added
the `__reduce__` dunder method.  This seems slightly better than
relying on copyreg to support pickling:

| Benchmark      | ref     | patch                 | gmp                   |
|----------------|:-------:|:---------------------:|:---------------------:|
| dumps(1<<7)    | 23.9 us | 23.8 us: 1.01x faster | 22.6 us: 1.06x faster |
| dumps(1<<38)   | 24.0 us | 23.9 us: 1.01x faster | 22.7 us: 1.06x faster |
| dumps(1<<300)  | 24.1 us | 23.8 us: 1.01x faster | 22.9 us: 1.05x faster |
| dumps(1<<3000) | 26.8 us | 25.2 us: 1.07x faster | 23.8 us: 1.13x faster |
| Geometric mean | (ref)   | 1.02x faster          | 1.07x faster          |

Can we add pickling to gmpy2 with even less overhead?  I don't know.

But if we avoid the pickle machinery, you can see a noticeable
performance boost for small numbers too:

| Benchmark      | to_binary-ref | to_binary-patch       |
|----------------|:-------------:|:---------------------:|
| dumps(1<<7)    | 323 ns        | 300 ns: 1.08x faster  |
| dumps(1<<38)   | 352 ns        | 315 ns: 1.12x faster  |
| dumps(1<<300)  | 603 ns        | 436 ns: 1.39x faster  |
| dumps(1<<3000) | 3.17 us       | 1.57 us: 2.02x faster |
| Geometric mean | (ref)         | 1.35x faster          |

The new code seems faster than int.to_bytes() starting from roughly
500-bit numbers on my system.
---
 src/gmpy2_binary.c   |  6 ++++--
 src/gmpy2_macros.h   | 17 +++++++++++++++++
 src/gmpy2_mpz_misc.c | 12 ++++++------
 3 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/src/gmpy2_binary.c b/src/gmpy2_binary.c
index d46e2d22..ce50501d 100644
--- a/src/gmpy2_binary.c
+++ b/src/gmpy2_binary.c
@@ -260,7 +260,7 @@ GMPy_MPZ_To_Binary(MPZ_Object *self)
         goto done;
     }
 
-    size = ((mpz_sizeinbase(self->z, 2) + 7) / 8) + 2;
+    size = mpz_sizeinbase(self->z, 256) + 2;
 
     TEMP_ALLOC(buffer, size);
     buffer[0] = 0x01;
@@ -268,7 +268,9 @@ GMPy_MPZ_To_Binary(MPZ_Object *self)
         buffer[1] = 0x01;
     else
         buffer[1] = 0x02;
-    mpz_export(buffer+2, NULL, -1, sizeof(char), 0, 0, self->z);
+    mpn_get_str((unsigned char *)(buffer + 2), 256,
+                self->z->_mp_d, Py_ABS(self->z->_mp_size));
+    revstr(buffer, 2, size - 1);
 
   done:
     result = PyBytes_FromStringAndSize(buffer, size);
diff --git a/src/gmpy2_macros.h b/src/gmpy2_macros.h
index e2af9149..aaf87b3e 100644
--- a/src/gmpy2_macros.h
+++ b/src/gmpy2_macros.h
@@ -726,3 +726,20 @@ GMPy_Context_##NAME(PyObject *self, PyObject *args) \
     } \
     return GMPy_Number_##NAME(PyTuple_GET_ITEM(args, 0), PyTuple_GET_ITEM(args, 1), context); \
 }
+
+#define SWAP(T, a, b)  /* exchange lvalues a and b, both of type T */ \
+    do {               \
+        T tmp = (a);   \
+        (a) = (b);     \
+        (b) = tmp;     \
+    } while (0) /* no trailing ';': the caller supplies it (if/else safe) */
+
+static inline void
+revstr(char *s, size_t l, size_t r)
+{
+    /* Reverse the bytes s[l..r] in place; both bounds are inclusive and
+       nothing is touched when l >= r. */
+    for (; l < r; l++, r--) {
+        SWAP(char, s[l], s[r]);
+    }
+}
diff --git a/src/gmpy2_mpz_misc.c b/src/gmpy2_mpz_misc.c
index c3597807..87393dcd 100644
--- a/src/gmpy2_mpz_misc.c
+++ b/src/gmpy2_mpz_misc.c
@@ -1986,15 +1986,15 @@ GMPy_MPZ_Method_To_Bytes(PyObject *self, PyObject *const *args,
         return NULL;
     }
     buffer = PyBytes_AS_STRING(bytes);
-    memset(buffer, 0, length);
+    memset(buffer, is_negative ? 0xFF : 0, gap);
 
-    if (is_big) {
-        mpz_export(buffer + gap, NULL, 1, sizeof(char), 0, 0, *px);
+    if ((*px)->_mp_size) {
+        mpn_get_str((unsigned char *)(buffer + gap), 256,
+                    (*px)->_mp_d, (*px)->_mp_size);
     }
-    else {
-        mpz_export(buffer, NULL, -1, sizeof(char), 0, 0, *px);
+    if (!is_big && length) {
+        revstr(buffer, 0, length - 1);
     }
-
     if (is_negative) {
         mpz_clear(tmp);
     }