From 28c30c0f7686d56fcef385c236594ab5f1e56a77 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 27 May 2024 16:21:18 +0200 Subject: [PATCH 01/11] gh-119609: Add PyUnicode_AsNativeFormat() function Add PyUnicode_AsNativeFormat() and PyUnicode_FromNativeFormat() functions to the C API. --- Doc/c-api/unicode.rst | 47 +++++++++++ Doc/data/stable_abi.dat | 2 + Doc/whatsnew/3.14.rst | 6 ++ Include/unicodeobject.h | 22 +++++ Lib/test/test_capi/test_unicode.py | 81 +++++++++++++++++- Lib/test/test_stable_abi_ctypes.py | 2 + ...-05-27-17-46-17.gh-issue-119609.kPIx6S.rst | 3 + Misc/stable_abi.toml | 4 + Modules/_testlimitedcapi/unicode.c | 31 +++++++ Objects/unicodeobject.c | 83 +++++++++++++++++++ PC/python3dll.c | 2 + 11 files changed, 281 insertions(+), 2 deletions(-) create mode 100644 Misc/NEWS.d/next/C API/2024-05-27-17-46-17.gh-issue-119609.kPIx6S.rst diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 7320d035bab513..0f3b6c29200f34 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -341,6 +341,53 @@ APIs: .. versionadded:: 3.3 +.. c:function:: const void* PyUnicode_AsNativeFormat(PyObject *unicode, Py_ssize_t *size, int *native_format) + + Get the contents of a string in its native format. + + * Return the contents, set *\*size* and *\*native_format* on success. + * Set an exception and return ``NULL`` on error. + + The contents is valid as long as *unicode* is valid. + + *unicode*, *size* and *native_format* must not be NULL. + + *\*native_format* is set to one of these native formats: + + .. c:namespace:: NULL + + ======================================== ===== ============================ + Constant Identifier Value Description + ======================================== ===== ============================ + .. c:macro:: PyUnicode_NATIVE_ASCII ``1`` ASCII string (``Py_UCS1*``) + .. c:macro:: PyUnicode_NATIVE_UCS1 ``2`` UCS-1 string (``Py_UCS1*``) + .. c:macro:: PyUnicode_NATIVE_UCS2 ``3`` UCS-2 string (``Py_UCS2*``) + .. 
c:macro:: PyUnicode_NATIVE_UCS4 ``4`` UCS-4 string (``Py_UCS4*``) + .. c:macro:: PyUnicode_NATIVE_UTF8 ``5`` UTF-8 string (``char*``) + ======================================== ===== ============================ + + .. impl-detail:: + In CPython, the :c:macro:`PyUnicode_NATIVE_UTF8` format is not used by + :c:func:`PyUnicode_AsNativeFormat`, but it's accepted by + :c:func:`PyUnicode_FromNativeFormat`. + + .. versionadded:: 3.14 + + +.. c:function:: PyObject* PyUnicode_FromNativeFormat(const void *data, Py_ssize_t size, int native_format) + + Create a string object from a native format string. + + * Return a reference to a new string object on success. + * Set an exception and return ``NULL`` on error. + + *data* must not be NULL. *size* must be positive or zero. + + See :c:func:`PyUnicode_AsNativeFormat` for the available native formats. + + .. versionadded:: 3.14 + + .. c:function:: PyObject* PyUnicode_FromKindAndData(int kind, const void *buffer, \ Py_ssize_t size) diff --git a/Doc/data/stable_abi.dat b/Doc/data/stable_abi.dat index 76a035f194d911..e4aef2ea0385e3 100644 --- a/Doc/data/stable_abi.dat +++ b/Doc/data/stable_abi.dat @@ -734,6 +734,7 @@ function,PyUnicode_AsEncodedString,3.2,, function,PyUnicode_AsEncodedUnicode,3.2,, function,PyUnicode_AsLatin1String,3.2,, function,PyUnicode_AsMBCSString,3.7,on Windows, +function,PyUnicode_AsNativeFormat,3.14,, function,PyUnicode_AsRawUnicodeEscapeString,3.2,, function,PyUnicode_AsUCS4,3.7,, function,PyUnicode_AsUCS4Copy,3.7,, @@ -784,6 +785,7 @@ function,PyUnicode_Format,3.2,, function,PyUnicode_FromEncodedObject,3.2,, function,PyUnicode_FromFormat,3.2,, function,PyUnicode_FromFormatV,3.2,, +function,PyUnicode_FromNativeFormat,3.14,, function,PyUnicode_FromObject,3.2,, function,PyUnicode_FromOrdinal,3.2,, function,PyUnicode_FromString,3.2,, diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index b77ff30a8fbbee..785e8431c0be34 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -265,6 +265,12 
@@ New Features * Add :c:func:`PyLong_GetSign` function to get the sign of :class:`int` objects. (Contributed by Sergey B Kirpichev in :gh:`116560`.) +* Add :c:func:`PyUnicode_AsNativeFormat` and + :c:func:`PyUnicode_FromNativeFormat` functions to import and export strings + in their native format. + (Contributed by Victor Stinner in :gh:`119609`.) + + Porting to Python 3.14 ---------------------- diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index dee00715b3c51d..a106b0aaf03ba8 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -248,6 +248,28 @@ PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( const char *u /* UTF-8 encoded string */ ); +#define PyUnicode_NATIVE_ASCII 1 +#define PyUnicode_NATIVE_UCS1 2 +#define PyUnicode_NATIVE_UCS2 3 +#define PyUnicode_NATIVE_UCS4 4 +#define PyUnicode_NATIVE_UTF8 5 + +// Get the content of a string in its native format. +// - Return the content, set '*size' and '*native_format' on success. +// - Set an exception and return NULL on error. +PyAPI_FUNC(const void*) PyUnicode_AsNativeFormat( + PyObject *unicode, + Py_ssize_t *size, + int *native_format); + +// Create a string object from a native format string. +// - Return a reference to a new string object on success. +// - Set an exception and return NULL on error. 
+PyAPI_FUNC(PyObject*) PyUnicode_FromNativeFormat( + const void *data, + Py_ssize_t size, + int native_format); + /* --- wchar_t support for platforms which support it --------------------- */ #ifdef HAVE_WCHAR_H diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index a69f817c515ba7..dda1dd116f0c04 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -24,6 +24,14 @@ class Str(str): pass +PyUnicode_NATIVE_ASCII = 1 +PyUnicode_NATIVE_UCS1 = 2 +PyUnicode_NATIVE_UCS2 = 3 +PyUnicode_NATIVE_UCS4 = 4 +PyUnicode_NATIVE_UTF8 = 5 +# Invalid native format +PyUnicode_NATIVE_INVALID = 0 + class CAPITest(unittest.TestCase): @support.cpython_only @@ -1675,6 +1683,75 @@ def test_pep393_utf8_caching_bug(self): # Check that the second call returns the same result self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1)) - -if __name__ == "__main__": + def test_unicode_asnativeformat(self): + # Test PyUnicode_AsNativeFormat() + asnativeformat = _testlimitedcapi.unicode_asnativeformat + self.assertEqual(asnativeformat("abc"), + (b'abc', PyUnicode_NATIVE_ASCII)) + self.assertEqual(asnativeformat("latin1:\xe9"), + (b'latin1:\xe9', PyUnicode_NATIVE_UCS1)) + + ucs2_enc = 'utf-16le' if sys.byteorder == 'little' else 'utf-16be' + self.assertEqual(asnativeformat('ucs2:\u20ac'), + ('ucs2:\u20ac'.encode(ucs2_enc), + PyUnicode_NATIVE_UCS2)) + + ucs4_enc = 'utf-32le' if sys.byteorder == 'little' else 'utf-32be' + self.assertEqual(asnativeformat('ucs4:\U0010ffff'), + ('ucs4:\U0010ffff'.encode(ucs4_enc), + PyUnicode_NATIVE_UCS4)) + + def test_unicode_fromnativeformat(self): + # Test PyUnicode_FromNativeFormat() + fromnativeformat = _testlimitedcapi.unicode_fromnativeformat + self.assertEqual(fromnativeformat(b'abc', PyUnicode_NATIVE_ASCII), + "abc") + self.assertEqual(fromnativeformat(b'latin1:\xe9', PyUnicode_NATIVE_UCS1), + "latin1:\xe9") + + ucs2_enc = 'utf-16le' if sys.byteorder == 'little' else 'utf-16be' + 
self.assertEqual(fromnativeformat('ucs2:\u20ac'.encode(ucs2_enc), + PyUnicode_NATIVE_UCS2), + 'ucs2:\u20ac') + + ucs4_enc = 'utf-32le' if sys.byteorder == 'little' else 'utf-32be' + self.assertEqual(fromnativeformat('ucs4:\U0010ffff'.encode(ucs4_enc), + PyUnicode_NATIVE_UCS4), + 'ucs4:\U0010ffff') + + text = "abc\xe9\U0010ffff" + self.assertEqual(fromnativeformat(text.encode('utf8'), + PyUnicode_NATIVE_UTF8), + text) + + # Empty string + for native_format in ( + PyUnicode_NATIVE_ASCII, + PyUnicode_NATIVE_UCS1, + PyUnicode_NATIVE_UCS2, + PyUnicode_NATIVE_UCS4, + PyUnicode_NATIVE_UTF8, + ): + with self.subTest(native_format=native_format): + self.assertEqual(fromnativeformat(b'', native_format), + '') + + # Invalid format + with self.assertRaises(ValueError): + fromnativeformat(b'', PyUnicode_NATIVE_INVALID) + + # Invalid size + ucs2 = 'ucs2:\u20ac'.encode(ucs2_enc) + with self.assertRaises(ValueError): + fromnativeformat(ucs2[:-1], PyUnicode_NATIVE_UCS2) + ucs4 = 'ucs4:\U0010ffff'.encode(ucs4_enc) + with self.assertRaises(ValueError): + fromnativeformat(ucs4[:-1], PyUnicode_NATIVE_UCS4) + with self.assertRaises(ValueError): + fromnativeformat(ucs4[:-2], PyUnicode_NATIVE_UCS4) + with self.assertRaises(ValueError): + fromnativeformat(ucs4[:-3], PyUnicode_NATIVE_UCS4) + + +if __name__ == '__main__': unittest.main() diff --git a/Lib/test/test_stable_abi_ctypes.py b/Lib/test/test_stable_abi_ctypes.py index c06c285c5013a6..99bc693448f122 100644 --- a/Lib/test/test_stable_abi_ctypes.py +++ b/Lib/test/test_stable_abi_ctypes.py @@ -760,6 +760,7 @@ def test_windows_feature_macros(self): "PyUnicode_AsEncodedString", "PyUnicode_AsEncodedUnicode", "PyUnicode_AsLatin1String", + "PyUnicode_AsNativeFormat", "PyUnicode_AsRawUnicodeEscapeString", "PyUnicode_AsUCS4", "PyUnicode_AsUCS4Copy", @@ -806,6 +807,7 @@ def test_windows_feature_macros(self): "PyUnicode_FromEncodedObject", "PyUnicode_FromFormat", "PyUnicode_FromFormatV", + "PyUnicode_FromNativeFormat", "PyUnicode_FromObject", 
"PyUnicode_FromOrdinal", "PyUnicode_FromString", diff --git a/Misc/NEWS.d/next/C API/2024-05-27-17-46-17.gh-issue-119609.kPIx6S.rst b/Misc/NEWS.d/next/C API/2024-05-27-17-46-17.gh-issue-119609.kPIx6S.rst new file mode 100644 index 00000000000000..06f9a061ec8ac0 --- /dev/null +++ b/Misc/NEWS.d/next/C API/2024-05-27-17-46-17.gh-issue-119609.kPIx6S.rst @@ -0,0 +1,3 @@ +Add :c:func:`PyUnicode_AsNativeFormat` and +:c:func:`PyUnicode_FromNativeFormat` functions to import and export strings +in their native format. Patch by Victor Stinner. diff --git a/Misc/stable_abi.toml b/Misc/stable_abi.toml index 77473662aaa76c..5fe199be27f79d 100644 --- a/Misc/stable_abi.toml +++ b/Misc/stable_abi.toml @@ -2507,3 +2507,7 @@ added = '3.13' [function.PyEval_GetFrameLocals] added = '3.13' +[function.PyUnicode_AsNativeFormat] + added = '3.14' +[function.PyUnicode_FromNativeFormat] + added = '3.14' diff --git a/Modules/_testlimitedcapi/unicode.c b/Modules/_testlimitedcapi/unicode.c index 2b70d09108a333..66da5b1d1846b4 100644 --- a/Modules/_testlimitedcapi/unicode.c +++ b/Modules/_testlimitedcapi/unicode.c @@ -1837,6 +1837,35 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored)) #undef CHECK_FORMAT_0 } + +// Test PyUnicode_AsNativeFormat() +static PyObject* +unicode_asnativeformat(PyObject *self, PyObject *obj) +{ + Py_ssize_t size; + int native_format; + const void *data = PyUnicode_AsNativeFormat(obj, &size, &native_format); + if (data == NULL) { + return NULL; + } + return Py_BuildValue("y#i", data, size, native_format); +} + + +// Test PyUnicode_FromNativeFormat() +static PyObject* +unicode_fromnativeformat(PyObject *self, PyObject *args) +{ + const void *data; + Py_ssize_t size; + int native_format; + if (!PyArg_ParseTuple(args, "y#i", &data, &size, &native_format)) { + return NULL; + } + return PyUnicode_FromNativeFormat(data, size, native_format); +} + + static PyMethodDef TestMethods[] = { {"codec_incrementalencoder", codec_incrementalencoder, METH_VARARGS}, 
{"codec_incrementaldecoder", codec_incrementaldecoder, METH_VARARGS}, @@ -1924,6 +1953,8 @@ static PyMethodDef TestMethods[] = { {"unicode_format", unicode_format, METH_VARARGS}, {"unicode_contains", unicode_contains, METH_VARARGS}, {"unicode_isidentifier", unicode_isidentifier, METH_O}, + {"unicode_asnativeformat", unicode_asnativeformat, METH_O}, + {"unicode_fromnativeformat", unicode_fromnativeformat, METH_VARARGS}, {NULL}, }; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 3b0b4173408724..068315fb13aa72 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2097,6 +2097,89 @@ _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) return res; } +const void* +PyUnicode_AsNativeFormat(PyObject *unicode, + Py_ssize_t *size, int *native_format) +{ + if (!PyUnicode_Check(unicode)) { + *size = 0; + *native_format = 0; + PyErr_Format(PyExc_TypeError, "must be str, not %T", unicode); + return NULL; + } + + Py_ssize_t len = PyUnicode_GET_LENGTH(unicode); + + if (PyUnicode_IS_ASCII(unicode)) { + *native_format = PyUnicode_NATIVE_ASCII; + *size = len; + return PyUnicode_1BYTE_DATA(unicode); + } + int kind = PyUnicode_KIND(unicode); + + switch (kind) + { + case PyUnicode_1BYTE_KIND: + *native_format = PyUnicode_NATIVE_UCS1; + *size = len; + return PyUnicode_1BYTE_DATA(unicode); + + case PyUnicode_2BYTE_KIND: + *native_format = PyUnicode_NATIVE_UCS2; + *size = len * 2; + return PyUnicode_2BYTE_DATA(unicode); + + default: + assert(kind == PyUnicode_4BYTE_KIND); + *native_format = PyUnicode_NATIVE_UCS4; + *size = len * 4; + return PyUnicode_4BYTE_DATA(unicode); + } +} + +PyObject* +PyUnicode_FromNativeFormat(const void *data, Py_ssize_t size, + int native_format) +{ + if (size < 0) { + PyErr_SetString(PyExc_ValueError, "Negative size"); + return NULL; + } + + switch (native_format) + { + case PyUnicode_NATIVE_ASCII: + return PyUnicode_DecodeASCII((const char*)data, size, NULL); + + case PyUnicode_NATIVE_UCS1: + return 
_PyUnicode_FromUCS1(data, size); + + case PyUnicode_NATIVE_UCS2: + if (size % 2) { + PyErr_Format(PyExc_ValueError, "size must be a multiple of 2: %zd", + size); + return NULL; + } + return _PyUnicode_FromUCS2(data, size / 2); + + case PyUnicode_NATIVE_UCS4: + if (size % 4) { + PyErr_Format(PyExc_ValueError, "size must be a multiple of 4: %zd", + size); + return NULL; + } + return _PyUnicode_FromUCS4(data, size / 4); + + case PyUnicode_NATIVE_UTF8: + return PyUnicode_DecodeUTF8((const char*)data, size, NULL); + + default: + PyErr_Format(PyExc_ValueError, "unknown native format %i", + native_format); + return NULL; + } +} + PyObject* PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) { diff --git a/PC/python3dll.c b/PC/python3dll.c index 86c888430891c9..ca558c6fcf56fe 100755 --- a/PC/python3dll.c +++ b/PC/python3dll.c @@ -665,6 +665,7 @@ EXPORT_FUNC(PyUnicode_AsEncodedString) EXPORT_FUNC(PyUnicode_AsEncodedUnicode) EXPORT_FUNC(PyUnicode_AsLatin1String) EXPORT_FUNC(PyUnicode_AsMBCSString) +EXPORT_FUNC(PyUnicode_AsNativeFormat) EXPORT_FUNC(PyUnicode_AsRawUnicodeEscapeString) EXPORT_FUNC(PyUnicode_AsUCS4) EXPORT_FUNC(PyUnicode_AsUCS4Copy) @@ -713,6 +714,7 @@ EXPORT_FUNC(PyUnicode_Format) EXPORT_FUNC(PyUnicode_FromEncodedObject) EXPORT_FUNC(PyUnicode_FromFormat) EXPORT_FUNC(PyUnicode_FromFormatV) +EXPORT_FUNC(PyUnicode_FromNativeFormat) EXPORT_FUNC(PyUnicode_FromObject) EXPORT_FUNC(PyUnicode_FromOrdinal) EXPORT_FUNC(PyUnicode_FromString) From 4d771924595075fc2e8d4a2c5a1cbb6662d9dd36 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Sun, 9 Jun 2024 11:39:17 +0200 Subject: [PATCH 02/11] Change the API to PyUnicode_Export() --- Include/unicodeobject.h | 24 +++-- Lib/test/test_capi/test_unicode.py | 137 +++++++++++++++++++---------- Modules/_testlimitedcapi/unicode.c | 29 +++--- Objects/unicodeobject.c | 108 +++++++++++++++++------ 4 files changed, 207 insertions(+), 91 deletions(-) diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 
a106b0aaf03ba8..c23849a0365982 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -248,27 +248,33 @@ PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( const char *u /* UTF-8 encoded string */ ); -#define PyUnicode_NATIVE_ASCII 1 -#define PyUnicode_NATIVE_UCS1 2 -#define PyUnicode_NATIVE_UCS2 3 -#define PyUnicode_NATIVE_UCS4 4 -#define PyUnicode_NATIVE_UTF8 5 +#define PyUnicode_FORMAT_ASCII 0x01 +#define PyUnicode_FORMAT_UCS1 0x02 +#define PyUnicode_FORMAT_UCS2 0x04 +#define PyUnicode_FORMAT_UCS4 0x08 +#define PyUnicode_FORMAT_UTF8 0x10 // Get the content of a string in its native format. // - Return the content, set '*size' and '*native_format' on success. // - Set an exception and return NULL on error. -PyAPI_FUNC(const void*) PyUnicode_AsNativeFormat( +PyAPI_FUNC(const void*) PyUnicode_Export( PyObject *unicode, + unsigned int supported_formats, Py_ssize_t *size, - int *native_format); + unsigned int *format); + +PyAPI_FUNC(void) PyUnicode_FreeExport( + PyObject *unicode, + const void* data, + unsigned int format); // Create a string object from a native format string. // - Return a reference to a new string object on success. // - Set an exception and return NULL on error. 
-PyAPI_FUNC(PyObject*) PyUnicode_FromNativeFormat( +PyAPI_FUNC(PyObject*) PyUnicode_Import( const void *data, Py_ssize_t size, - int native_format); + unsigned int format); /* --- wchar_t support for platforms which support it --------------------- */ diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index dda1dd116f0c04..a7eccb1c973616 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -24,13 +24,13 @@ class Str(str): pass -PyUnicode_NATIVE_ASCII = 1 -PyUnicode_NATIVE_UCS1 = 2 -PyUnicode_NATIVE_UCS2 = 3 -PyUnicode_NATIVE_UCS4 = 4 -PyUnicode_NATIVE_UTF8 = 5 +PyUnicode_FORMAT_ASCII = 0x01 +PyUnicode_FORMAT_UCS1 = 0x02 +PyUnicode_FORMAT_UCS2 = 0x04 +PyUnicode_FORMAT_UCS4 = 0x08 +PyUnicode_FORMAT_UTF8 = 0x10 # Invalid native format -PyUnicode_NATIVE_INVALID = 0 +PyUnicode_FORMAT_INVALID = 0x20 class CAPITest(unittest.TestCase): @@ -1683,74 +1683,119 @@ def test_pep393_utf8_caching_bug(self): # Check that the second call returns the same result self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1)) - def test_unicode_asnativeformat(self): - # Test PyUnicode_AsNativeFormat() - asnativeformat = _testlimitedcapi.unicode_asnativeformat - self.assertEqual(asnativeformat("abc"), - (b'abc', PyUnicode_NATIVE_ASCII)) - self.assertEqual(asnativeformat("latin1:\xe9"), - (b'latin1:\xe9', PyUnicode_NATIVE_UCS1)) - - ucs2_enc = 'utf-16le' if sys.byteorder == 'little' else 'utf-16be' - self.assertEqual(asnativeformat('ucs2:\u20ac'), + def test_unicode_export(self): + # Test PyUnicode_Export() and PyUnicode_FreeExport() + unicode_export = _testlimitedcapi.unicode_export + if sys.byteorder == 'little': + ucs2_enc = 'utf-16le' + ucs4_enc = 'utf-32le' + else: + ucs2_enc = 'utf-16be' + ucs4_enc = 'utf-32be' + + # export to the native format + formats = (PyUnicode_FORMAT_ASCII + | PyUnicode_FORMAT_UCS1 + | PyUnicode_FORMAT_UCS2 + | PyUnicode_FORMAT_UCS4) + self.assertEqual(unicode_export("abc", formats), + 
(b'abc', PyUnicode_FORMAT_ASCII)) + self.assertEqual(unicode_export("latin1:\xe9", formats), + (b'latin1:\xe9', PyUnicode_FORMAT_UCS1)) + self.assertEqual(unicode_export('ucs2:\u20ac', formats), ('ucs2:\u20ac'.encode(ucs2_enc), - PyUnicode_NATIVE_UCS2)) - - ucs4_enc = 'utf-32le' if sys.byteorder == 'little' else 'utf-32be' - self.assertEqual(asnativeformat('ucs4:\U0010ffff'), + PyUnicode_FORMAT_UCS2)) + self.assertEqual(unicode_export('ucs4:\U0010ffff', formats), ('ucs4:\U0010ffff'.encode(ucs4_enc), - PyUnicode_NATIVE_UCS4)) - - def test_unicode_fromnativeformat(self): - # Test PyUnicode_FromNativeFormat() - fromnativeformat = _testlimitedcapi.unicode_fromnativeformat - self.assertEqual(fromnativeformat(b'abc', PyUnicode_NATIVE_ASCII), + PyUnicode_FORMAT_UCS4)) + + # always export to UCS4 + self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UCS4), + ('abc'.encode(ucs4_enc), PyUnicode_FORMAT_UCS4)) + self.assertEqual(unicode_export("latin1:\xe9", PyUnicode_FORMAT_UCS4), + ('latin1:\xe9'.encode(ucs4_enc), PyUnicode_FORMAT_UCS4)) + self.assertEqual(unicode_export('ucs2:\u20ac', PyUnicode_FORMAT_UCS4), + ('ucs2:\u20ac'.encode(ucs4_enc), + PyUnicode_FORMAT_UCS4)) + self.assertEqual(unicode_export('ucs4:\U0010ffff', PyUnicode_FORMAT_UCS4), + ('ucs4:\U0010ffff'.encode(ucs4_enc), + PyUnicode_FORMAT_UCS4)) + + # always export to UTF8 + self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UTF8), + ('abc'.encode('utf8'), PyUnicode_FORMAT_UTF8)) + self.assertEqual(unicode_export("latin1:\xe9", PyUnicode_FORMAT_UTF8), + ('latin1:\xe9'.encode('utf8'), PyUnicode_FORMAT_UTF8)) + self.assertEqual(unicode_export('ucs2:\u20ac', PyUnicode_FORMAT_UTF8), + ('ucs2:\u20ac'.encode('utf8'), + PyUnicode_FORMAT_UTF8)) + self.assertEqual(unicode_export('ucs4:\U0010ffff', PyUnicode_FORMAT_UTF8), + ('ucs4:\U0010ffff'.encode('utf8'), + PyUnicode_FORMAT_UTF8)) + + # No supported format or invalid format + with self.assertRaisesRegex(ValueError, + "unable to find a matching export format"): 
+ unicode_export('abc', 0) + with self.assertRaisesRegex(ValueError, + "unable to find a matching export format"): + unicode_export('abc', PyUnicode_FORMAT_INVALID) + + def test_unicode_import(self): + # Test PyUnicode_Import() + unicode_import = _testlimitedcapi.unicode_import + if sys.byteorder == 'little': + ucs2_enc = 'utf-16le' + ucs4_enc = 'utf-32le' + else: + ucs2_enc = 'utf-16be' + ucs4_enc = 'utf-32be' + + self.assertEqual(unicode_import(b'abc', PyUnicode_FORMAT_ASCII), "abc") - self.assertEqual(fromnativeformat(b'latin1:\xe9', PyUnicode_NATIVE_UCS1), + self.assertEqual(unicode_import(b'latin1:\xe9', PyUnicode_FORMAT_UCS1), "latin1:\xe9") - ucs2_enc = 'utf-16le' if sys.byteorder == 'little' else 'utf-16be' - self.assertEqual(fromnativeformat('ucs2:\u20ac'.encode(ucs2_enc), - PyUnicode_NATIVE_UCS2), + self.assertEqual(unicode_import('ucs2:\u20ac'.encode(ucs2_enc), + PyUnicode_FORMAT_UCS2), 'ucs2:\u20ac') - ucs4_enc = 'utf-32le' if sys.byteorder == 'little' else 'utf-32be' - self.assertEqual(fromnativeformat('ucs4:\U0010ffff'.encode(ucs4_enc), - PyUnicode_NATIVE_UCS4), + self.assertEqual(unicode_import('ucs4:\U0010ffff'.encode(ucs4_enc), + PyUnicode_FORMAT_UCS4), 'ucs4:\U0010ffff') text = "abc\xe9\U0010ffff" - self.assertEqual(fromnativeformat(text.encode('utf8'), - PyUnicode_NATIVE_UTF8), + self.assertEqual(unicode_import(text.encode('utf8'), + PyUnicode_FORMAT_UTF8), text) # Empty string for native_format in ( - PyUnicode_NATIVE_ASCII, - PyUnicode_NATIVE_UCS1, - PyUnicode_NATIVE_UCS2, - PyUnicode_NATIVE_UCS4, - PyUnicode_NATIVE_UTF8, + PyUnicode_FORMAT_ASCII, + PyUnicode_FORMAT_UCS1, + PyUnicode_FORMAT_UCS2, + PyUnicode_FORMAT_UCS4, + PyUnicode_FORMAT_UTF8, ): with self.subTest(native_format=native_format): - self.assertEqual(fromnativeformat(b'', native_format), + self.assertEqual(unicode_import(b'', native_format), '') # Invalid format with self.assertRaises(ValueError): - fromnativeformat(b'', PyUnicode_NATIVE_INVALID) + unicode_import(b'', 
PyUnicode_FORMAT_INVALID) # Invalid size ucs2 = 'ucs2:\u20ac'.encode(ucs2_enc) with self.assertRaises(ValueError): - fromnativeformat(ucs2[:-1], PyUnicode_NATIVE_UCS2) + unicode_import(ucs2[:-1], PyUnicode_FORMAT_UCS2) ucs4 = 'ucs4:\U0010ffff'.encode(ucs4_enc) with self.assertRaises(ValueError): - fromnativeformat(ucs4[:-1], PyUnicode_NATIVE_UCS4) + unicode_import(ucs4[:-1], PyUnicode_FORMAT_UCS4) with self.assertRaises(ValueError): - fromnativeformat(ucs4[:-2], PyUnicode_NATIVE_UCS4) + unicode_import(ucs4[:-2], PyUnicode_FORMAT_UCS4) with self.assertRaises(ValueError): - fromnativeformat(ucs4[:-3], PyUnicode_NATIVE_UCS4) + unicode_import(ucs4[:-3], PyUnicode_FORMAT_UCS4) if __name__ == '__main__': diff --git a/Modules/_testlimitedcapi/unicode.c b/Modules/_testlimitedcapi/unicode.c index 66da5b1d1846b4..360f432fd51a57 100644 --- a/Modules/_testlimitedcapi/unicode.c +++ b/Modules/_testlimitedcapi/unicode.c @@ -1840,29 +1840,38 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored)) // Test PyUnicode_AsNativeFormat() static PyObject* -unicode_asnativeformat(PyObject *self, PyObject *obj) +unicode_export(PyObject *self, PyObject *args) { + PyObject *obj; + unsigned int supported_formats; + if (!PyArg_ParseTuple(args, "OI", &obj, &supported_formats)) { + return NULL; + } + Py_ssize_t size; - int native_format; - const void *data = PyUnicode_AsNativeFormat(obj, &size, &native_format); + unsigned int format; + const void *data = PyUnicode_Export(obj, supported_formats, &size, &format); if (data == NULL) { return NULL; } - return Py_BuildValue("y#i", data, size, native_format); + + PyObject *res = Py_BuildValue("y#i", data, size, format); + PyUnicode_FreeExport(obj, data, format); + return res; } // Test PyUnicode_FromNativeFormat() static PyObject* -unicode_fromnativeformat(PyObject *self, PyObject *args) +unicode_import(PyObject *self, PyObject *args) { const void *data; Py_ssize_t size; - int native_format; - if (!PyArg_ParseTuple(args, "y#i", &data, 
&size, &native_format)) { + unsigned int format; + if (!PyArg_ParseTuple(args, "y#i", &data, &size, &format)) { return NULL; } - return PyUnicode_FromNativeFormat(data, size, native_format); + return PyUnicode_Import(data, size, format); } @@ -1953,8 +1962,8 @@ static PyMethodDef TestMethods[] = { {"unicode_format", unicode_format, METH_VARARGS}, {"unicode_contains", unicode_contains, METH_VARARGS}, {"unicode_isidentifier", unicode_isidentifier, METH_O}, - {"unicode_asnativeformat", unicode_asnativeformat, METH_O}, - {"unicode_fromnativeformat", unicode_fromnativeformat, METH_VARARGS}, + {"unicode_export", unicode_export, METH_VARARGS}, + {"unicode_import", unicode_import, METH_VARARGS}, {NULL}, }; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 068315fb13aa72..020b0b3bacefd9 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2098,63 +2098,119 @@ _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) } const void* -PyUnicode_AsNativeFormat(PyObject *unicode, - Py_ssize_t *size, int *native_format) +PyUnicode_Export(PyObject *unicode, unsigned int supported_formats, + Py_ssize_t *size, unsigned int *format) { if (!PyUnicode_Check(unicode)) { - *size = 0; - *native_format = 0; PyErr_Format(PyExc_TypeError, "must be str, not %T", unicode); - return NULL; + goto error; } Py_ssize_t len = PyUnicode_GET_LENGTH(unicode); - if (PyUnicode_IS_ASCII(unicode)) { - *native_format = PyUnicode_NATIVE_ASCII; + if (PyUnicode_IS_ASCII(unicode) + && (supported_formats & PyUnicode_FORMAT_ASCII)) + { + *format = PyUnicode_FORMAT_ASCII; *size = len; return PyUnicode_1BYTE_DATA(unicode); } - int kind = PyUnicode_KIND(unicode); - switch (kind) + int kind = PyUnicode_KIND(unicode); + if (kind == PyUnicode_1BYTE_KIND + && (supported_formats & PyUnicode_FORMAT_UCS1)) { - case PyUnicode_1BYTE_KIND: - *native_format = PyUnicode_NATIVE_UCS1; + *format = PyUnicode_FORMAT_UCS1; *size = len; return PyUnicode_1BYTE_DATA(unicode); + } - case 
PyUnicode_2BYTE_KIND: - *native_format = PyUnicode_NATIVE_UCS2; + if (kind == PyUnicode_2BYTE_KIND + && (supported_formats & PyUnicode_FORMAT_UCS2)) + { + *format = PyUnicode_FORMAT_UCS2; *size = len * 2; return PyUnicode_2BYTE_DATA(unicode); + } - default: - assert(kind == PyUnicode_4BYTE_KIND); - *native_format = PyUnicode_NATIVE_UCS4; + if (kind == PyUnicode_4BYTE_KIND + && (supported_formats & PyUnicode_FORMAT_UCS4)) + { + *format = PyUnicode_FORMAT_UCS4; *size = len * 4; return PyUnicode_4BYTE_DATA(unicode); } + + if (supported_formats & PyUnicode_FORMAT_UCS4) { + Py_UCS4 *ucs4 = PyUnicode_AsUCS4Copy(unicode); + if (ucs4 == NULL) { + goto error; + } + *format = PyUnicode_FORMAT_UCS4; + *size = len * 4; + return ucs4; + } + + if (supported_formats & PyUnicode_FORMAT_UTF8) { + const char *utf8 = PyUnicode_AsUTF8AndSize(unicode, size); + if (utf8 == NULL) { + goto error; + } + *format = PyUnicode_FORMAT_UTF8; + return utf8; + } + + PyErr_Format(PyExc_ValueError, "unable to find a matching export format"); + + +error: + *size = 0; + *format = 0; + return NULL; +} + +void +PyUnicode_FreeExport(PyObject *unicode, const void* data, unsigned int format) +{ + switch (format) + { + case PyUnicode_FORMAT_ASCII: + break; + case PyUnicode_FORMAT_UCS1: + break; + case PyUnicode_FORMAT_UCS2: + break; + case PyUnicode_FORMAT_UCS4: + if (PyUnicode_KIND(unicode) != PyUnicode_4BYTE_KIND) { + PyMem_Free((void*)data); + } + break; + case PyUnicode_FORMAT_UTF8: + break; + default: + // ignore silently an unknown format + break; + } } PyObject* -PyUnicode_FromNativeFormat(const void *data, Py_ssize_t size, - int native_format) +PyUnicode_Import(const void *data, Py_ssize_t size, + unsigned int format) { if (size < 0) { PyErr_SetString(PyExc_ValueError, "Negative size"); return NULL; } - switch (native_format) + switch (format) { - case PyUnicode_NATIVE_ASCII: + case PyUnicode_FORMAT_ASCII: return PyUnicode_DecodeASCII((const char*)data, size, NULL); - case PyUnicode_NATIVE_UCS1: + 
case PyUnicode_FORMAT_UCS1: return _PyUnicode_FromUCS1(data, size); - case PyUnicode_NATIVE_UCS2: + case PyUnicode_FORMAT_UCS2: if (size % 2) { PyErr_Format(PyExc_ValueError, "size must be a multiple of 2: %zd", size); @@ -2162,7 +2218,7 @@ PyUnicode_FromNativeFormat(const void *data, Py_ssize_t size, } return _PyUnicode_FromUCS2(data, size / 2); - case PyUnicode_NATIVE_UCS4: + case PyUnicode_FORMAT_UCS4: if (size % 4) { PyErr_Format(PyExc_ValueError, "size must be a multiple of 4: %zd", size); @@ -2170,12 +2226,12 @@ PyUnicode_FromNativeFormat(const void *data, Py_ssize_t size, } return _PyUnicode_FromUCS4(data, size / 4); - case PyUnicode_NATIVE_UTF8: + case PyUnicode_FORMAT_UTF8: return PyUnicode_DecodeUTF8((const char*)data, size, NULL); default: - PyErr_Format(PyExc_ValueError, "unknown native format %i", - native_format); + PyErr_Format(PyExc_ValueError, "unknown format: %i", + format); return NULL; } } From 076985d06d87a1a71e7b334dc3bcd22b423ee685 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 11 Jun 2024 14:41:13 +0200 Subject: [PATCH 03/11] Rename PyUnicode_FreeExport() to PyUnicode_ReleaseExport() --- Include/unicodeobject.h | 11 +++++++---- Modules/_testlimitedcapi/unicode.c | 2 +- Objects/unicodeobject.c | 3 ++- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index c23849a0365982..e9ccb480ded2c7 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -254,21 +254,24 @@ PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( #define PyUnicode_FORMAT_UCS4 0x08 #define PyUnicode_FORMAT_UTF8 0x10 -// Get the content of a string in its native format. -// - Return the content, set '*size' and '*native_format' on success. +// Get the content of a string in the requested format: +// - Return the content, set '*size' and '*format' on success. // - Set an exception and return NULL on error. +// +// The export must be released by PyUnicode_ReleaseExport(). 
PyAPI_FUNC(const void*) PyUnicode_Export( PyObject *unicode, unsigned int supported_formats, Py_ssize_t *size, unsigned int *format); -PyAPI_FUNC(void) PyUnicode_FreeExport( +// Release an export created by PyUnicode_Export(). +PyAPI_FUNC(void) PyUnicode_ReleaseExport( PyObject *unicode, const void* data, unsigned int format); -// Create a string object from a native format string. +// Create a string object from a string in the format 'format'. // - Return a reference to a new string object on success. // - Set an exception and return NULL on error. PyAPI_FUNC(PyObject*) PyUnicode_Import( diff --git a/Modules/_testlimitedcapi/unicode.c b/Modules/_testlimitedcapi/unicode.c index 360f432fd51a57..306612e726aaab 100644 --- a/Modules/_testlimitedcapi/unicode.c +++ b/Modules/_testlimitedcapi/unicode.c @@ -1856,7 +1856,7 @@ unicode_export(PyObject *self, PyObject *args) } PyObject *res = Py_BuildValue("y#i", data, size, format); - PyUnicode_FreeExport(obj, data, format); + PyUnicode_ReleaseExport(obj, data, format); return res; } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 020b0b3bacefd9..19ce47b0b8bfa8 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2170,7 +2170,8 @@ PyUnicode_Export(PyObject *unicode, unsigned int supported_formats, } void -PyUnicode_FreeExport(PyObject *unicode, const void* data, unsigned int format) +PyUnicode_ReleaseExport(PyObject *unicode, const void* data, + unsigned int format) { switch (format) { From fa0ff6da57e1bbe99cf91cc167e9fcb18b122947 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 11 Jun 2024 14:49:13 +0200 Subject: [PATCH 04/11] Add test and comments --- Lib/test/test_capi/test_unicode.py | 4 ++++ Objects/unicodeobject.c | 2 ++ 2 files changed, 6 insertions(+) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index a7eccb1c973616..a8bc1a2117687c 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1709,6 
+1709,10 @@ def test_unicode_export(self): ('ucs4:\U0010ffff'.encode(ucs4_enc), PyUnicode_FORMAT_UCS4)) + # export ASCII as UCS1 + self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UCS1), + (b'abc', PyUnicode_FORMAT_UCS1)) + # always export to UCS4 self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UCS4), ('abc'.encode(ucs4_enc), PyUnicode_FORMAT_UCS4)) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 19ce47b0b8bfa8..ef7f882d3248c6 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2142,6 +2142,7 @@ PyUnicode_Export(PyObject *unicode, unsigned int supported_formats, } if (supported_formats & PyUnicode_FORMAT_UCS4) { + // Convert UCS1 or UCS2 to UCS4 Py_UCS4 *ucs4 = PyUnicode_AsUCS4Copy(unicode); if (ucs4 == NULL) { goto error; @@ -2152,6 +2153,7 @@ PyUnicode_Export(PyObject *unicode, unsigned int supported_formats, } if (supported_formats & PyUnicode_FORMAT_UTF8) { + // Encode UCS1, UCS2 or UCS4 to UTF-8 const char *utf8 = PyUnicode_AsUTF8AndSize(unicode, size); if (utf8 == NULL) { goto error; From 11b9f43a17dfafb0447adb3b447556d1acae2cc4 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Fri, 14 Jun 2024 14:34:08 +0200 Subject: [PATCH 05/11] Use uint32_t for the format --- Include/unicodeobject.h | 8 ++++---- Modules/_testlimitedcapi/unicode.c | 6 +++--- Objects/unicodeobject.c | 8 ++++---- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index e9ccb480ded2c7..01e76034a54fbc 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -261,15 +261,15 @@ PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( // The export must be released by PyUnicode_ReleaseExport(). PyAPI_FUNC(const void*) PyUnicode_Export( PyObject *unicode, - unsigned int supported_formats, + uint32_t supported_formats, Py_ssize_t *size, - unsigned int *format); + uint32_t *format); // Release an export created by PyUnicode_Export(). 
PyAPI_FUNC(void) PyUnicode_ReleaseExport( PyObject *unicode, const void* data, - unsigned int format); + uint32_t format); // Create a string object from a string in the format 'format'. // - Return a reference to a new string object on success. @@ -277,7 +277,7 @@ PyAPI_FUNC(void) PyUnicode_ReleaseExport( PyAPI_FUNC(PyObject*) PyUnicode_Import( const void *data, Py_ssize_t size, - unsigned int format); + uint32_t format); /* --- wchar_t support for platforms which support it --------------------- */ diff --git a/Modules/_testlimitedcapi/unicode.c b/Modules/_testlimitedcapi/unicode.c index 306612e726aaab..252714fc82c62e 100644 --- a/Modules/_testlimitedcapi/unicode.c +++ b/Modules/_testlimitedcapi/unicode.c @@ -1849,13 +1849,13 @@ unicode_export(PyObject *self, PyObject *args) } Py_ssize_t size; - unsigned int format; + uint32_t format; const void *data = PyUnicode_Export(obj, supported_formats, &size, &format); if (data == NULL) { return NULL; } - PyObject *res = Py_BuildValue("y#i", data, size, format); + PyObject *res = Py_BuildValue("y#I", data, size, (unsigned int)format); PyUnicode_ReleaseExport(obj, data, format); return res; } @@ -1868,7 +1868,7 @@ unicode_import(PyObject *self, PyObject *args) const void *data; Py_ssize_t size; unsigned int format; - if (!PyArg_ParseTuple(args, "y#i", &data, &size, &format)) { + if (!PyArg_ParseTuple(args, "y#I", &data, &size, &format)) { return NULL; } return PyUnicode_Import(data, size, format); diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index ef7f882d3248c6..522ca31f405964 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2098,8 +2098,8 @@ _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) } const void* -PyUnicode_Export(PyObject *unicode, unsigned int supported_formats, - Py_ssize_t *size, unsigned int *format) +PyUnicode_Export(PyObject *unicode, uint32_t supported_formats, + Py_ssize_t *size, uint32_t *format) { if (!PyUnicode_Check(unicode)) { 
PyErr_Format(PyExc_TypeError, "must be str, not %T", unicode); @@ -2173,7 +2173,7 @@ PyUnicode_Export(PyObject *unicode, unsigned int supported_formats, void PyUnicode_ReleaseExport(PyObject *unicode, const void* data, - unsigned int format) + uint32_t format) { switch (format) { @@ -2198,7 +2198,7 @@ PyUnicode_ReleaseExport(PyObject *unicode, const void* data, PyObject* PyUnicode_Import(const void *data, Py_ssize_t size, - unsigned int format) + uint32_t format) { if (size < 0) { PyErr_SetString(PyExc_ValueError, "Negative size"); From 72ad7ec5546ae805126cf418b6bf653928c64474 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Thu, 20 Jun 2024 16:21:11 +0200 Subject: [PATCH 06/11] Update stable ABI manifest --- Doc/data/stable_abi.dat | 5 +++-- Lib/test/test_stable_abi_ctypes.py | 5 +++-- Misc/stable_abi.toml | 16 ++++++++++++++-- PC/python3dll.c | 5 +++-- 4 files changed, 23 insertions(+), 8 deletions(-) diff --git a/Doc/data/stable_abi.dat b/Doc/data/stable_abi.dat index 0e02420db0951c..80222096f3a0b6 100644 --- a/Doc/data/stable_abi.dat +++ b/Doc/data/stable_abi.dat @@ -734,7 +734,6 @@ function,PyUnicode_AsEncodedString,3.2,, function,PyUnicode_AsEncodedUnicode,3.2,, function,PyUnicode_AsLatin1String,3.2,, function,PyUnicode_AsMBCSString,3.7,on Windows, -function,PyUnicode_AsNativeFormat,3.14,, function,PyUnicode_AsRawUnicodeEscapeString,3.2,, function,PyUnicode_AsUCS4,3.7,, function,PyUnicode_AsUCS4Copy,3.7,, @@ -777,6 +776,7 @@ function,PyUnicode_EncodeFSDefault,3.2,, function,PyUnicode_EncodeLocale,3.7,, function,PyUnicode_EqualToUTF8,3.13,, function,PyUnicode_EqualToUTF8AndSize,3.13,, +function,PyUnicode_Export,3.14,, function,PyUnicode_FSConverter,3.2,, function,PyUnicode_FSDecoder,3.2,, function,PyUnicode_Find,3.2,, @@ -785,7 +785,6 @@ function,PyUnicode_Format,3.2,, function,PyUnicode_FromEncodedObject,3.2,, function,PyUnicode_FromFormat,3.2,, function,PyUnicode_FromFormatV,3.2,, -function,PyUnicode_FromNativeFormat,3.14,, 
function,PyUnicode_FromObject,3.2,, function,PyUnicode_FromOrdinal,3.2,, function,PyUnicode_FromString,3.2,, @@ -793,6 +792,7 @@ function,PyUnicode_FromStringAndSize,3.2,, function,PyUnicode_FromWideChar,3.2,, function,PyUnicode_GetDefaultEncoding,3.2,, function,PyUnicode_GetLength,3.7,, +function,PyUnicode_Import,3.14,, function,PyUnicode_InternFromString,3.2,, function,PyUnicode_InternInPlace,3.2,, function,PyUnicode_IsIdentifier,3.2,, @@ -801,6 +801,7 @@ function,PyUnicode_Partition,3.2,, function,PyUnicode_RPartition,3.2,, function,PyUnicode_RSplit,3.2,, function,PyUnicode_ReadChar,3.7,, +function,PyUnicode_ReleaseExport,3.14,, function,PyUnicode_Replace,3.2,, function,PyUnicode_Resize,3.2,, function,PyUnicode_RichCompare,3.2,, diff --git a/Lib/test/test_stable_abi_ctypes.py b/Lib/test/test_stable_abi_ctypes.py index 16cc37dea3a040..b4e977f4e972e2 100644 --- a/Lib/test/test_stable_abi_ctypes.py +++ b/Lib/test/test_stable_abi_ctypes.py @@ -760,7 +760,6 @@ def test_windows_feature_macros(self): "PyUnicode_AsEncodedString", "PyUnicode_AsEncodedUnicode", "PyUnicode_AsLatin1String", - "PyUnicode_AsNativeFormat", "PyUnicode_AsRawUnicodeEscapeString", "PyUnicode_AsUCS4", "PyUnicode_AsUCS4Copy", @@ -799,6 +798,7 @@ def test_windows_feature_macros(self): "PyUnicode_EncodeLocale", "PyUnicode_EqualToUTF8", "PyUnicode_EqualToUTF8AndSize", + "PyUnicode_Export", "PyUnicode_FSConverter", "PyUnicode_FSDecoder", "PyUnicode_Find", @@ -807,7 +807,6 @@ def test_windows_feature_macros(self): "PyUnicode_FromEncodedObject", "PyUnicode_FromFormat", "PyUnicode_FromFormatV", - "PyUnicode_FromNativeFormat", "PyUnicode_FromObject", "PyUnicode_FromOrdinal", "PyUnicode_FromString", @@ -816,6 +815,7 @@ def test_windows_feature_macros(self): "PyUnicode_GetDefaultEncoding", "PyUnicode_GetLength", "PyUnicode_GetSize", + "PyUnicode_Import", "PyUnicode_InternFromString", "PyUnicode_InternImmortal", "PyUnicode_InternInPlace", @@ -825,6 +825,7 @@ def test_windows_feature_macros(self): 
"PyUnicode_RPartition", "PyUnicode_RSplit", "PyUnicode_ReadChar", + "PyUnicode_ReleaseExport", "PyUnicode_Replace", "PyUnicode_Resize", "PyUnicode_RichCompare", diff --git a/Misc/stable_abi.toml b/Misc/stable_abi.toml index a9d554a6667a3d..c77dd429549509 100644 --- a/Misc/stable_abi.toml +++ b/Misc/stable_abi.toml @@ -2510,7 +2510,19 @@ [function.Py_TYPE] added = '3.14' -[function.PyUnicode_AsNativeFormat] +[function.PyUnicode_Import] added = '3.14' -[function.PyUnicode_FromNativeFormat] +[function.PyUnicode_Export] + added = '3.14' +[function.PyUnicode_ReleaseExport] + added = '3.14' +[const.PyUnicode_FORMAT_ASCII] + added = '3.14' +[const.PyUnicode_FORMAT_UCS1] + added = '3.14' +[const.PyUnicode_FORMAT_UCS2] + added = '3.14' +[const.PyUnicode_FORMAT_UCS4] + added = '3.14' +[const.PyUnicode_FORMAT_UTF8] added = '3.14' diff --git a/PC/python3dll.c b/PC/python3dll.c index c69a584a5b2891..3086a08c0b70f5 100755 --- a/PC/python3dll.c +++ b/PC/python3dll.c @@ -666,7 +666,6 @@ EXPORT_FUNC(PyUnicode_AsEncodedString) EXPORT_FUNC(PyUnicode_AsEncodedUnicode) EXPORT_FUNC(PyUnicode_AsLatin1String) EXPORT_FUNC(PyUnicode_AsMBCSString) -EXPORT_FUNC(PyUnicode_AsNativeFormat) EXPORT_FUNC(PyUnicode_AsRawUnicodeEscapeString) EXPORT_FUNC(PyUnicode_AsUCS4) EXPORT_FUNC(PyUnicode_AsUCS4Copy) @@ -709,13 +708,13 @@ EXPORT_FUNC(PyUnicode_EncodeFSDefault) EXPORT_FUNC(PyUnicode_EncodeLocale) EXPORT_FUNC(PyUnicode_EqualToUTF8) EXPORT_FUNC(PyUnicode_EqualToUTF8AndSize) +EXPORT_FUNC(PyUnicode_Export) EXPORT_FUNC(PyUnicode_Find) EXPORT_FUNC(PyUnicode_FindChar) EXPORT_FUNC(PyUnicode_Format) EXPORT_FUNC(PyUnicode_FromEncodedObject) EXPORT_FUNC(PyUnicode_FromFormat) EXPORT_FUNC(PyUnicode_FromFormatV) -EXPORT_FUNC(PyUnicode_FromNativeFormat) EXPORT_FUNC(PyUnicode_FromObject) EXPORT_FUNC(PyUnicode_FromOrdinal) EXPORT_FUNC(PyUnicode_FromString) @@ -726,6 +725,7 @@ EXPORT_FUNC(PyUnicode_FSDecoder) EXPORT_FUNC(PyUnicode_GetDefaultEncoding) EXPORT_FUNC(PyUnicode_GetLength) EXPORT_FUNC(PyUnicode_GetSize) 
+EXPORT_FUNC(PyUnicode_Import) EXPORT_FUNC(PyUnicode_InternFromString) EXPORT_FUNC(PyUnicode_InternImmortal) EXPORT_FUNC(PyUnicode_InternInPlace) @@ -733,6 +733,7 @@ EXPORT_FUNC(PyUnicode_IsIdentifier) EXPORT_FUNC(PyUnicode_Join) EXPORT_FUNC(PyUnicode_Partition) EXPORT_FUNC(PyUnicode_ReadChar) +EXPORT_FUNC(PyUnicode_ReleaseExport) EXPORT_FUNC(PyUnicode_Replace) EXPORT_FUNC(PyUnicode_Resize) EXPORT_FUNC(PyUnicode_RichCompare) From a2433ee87a460c6f9dc507624eff7fe11193ce4b Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Thu, 20 Jun 2024 16:45:58 +0200 Subject: [PATCH 07/11] In debug mode, break assumptions that the buffer is zero-terminated --- Modules/_testlimitedcapi/unicode.c | 15 +++++++++++++++ Objects/unicodeobject.c | 8 ++++++++ 2 files changed, 23 insertions(+) diff --git a/Modules/_testlimitedcapi/unicode.c b/Modules/_testlimitedcapi/unicode.c index 252714fc82c62e..b367920f7106b4 100644 --- a/Modules/_testlimitedcapi/unicode.c +++ b/Modules/_testlimitedcapi/unicode.c @@ -1855,6 +1855,21 @@ unicode_export(PyObject *self, PyObject *args) return NULL; } +#ifdef Py_DEBUG +#define CHECK_END_BYTE(X) assert((X) == 0 || (X) == 0xAA) +#else +#define CHECK_END_BYTE(X) assert((X) == 0) +#endif + CHECK_END_BYTE(((unsigned char*)data)[size]); + if (format == PyUnicode_FORMAT_UCS2) { + CHECK_END_BYTE(((unsigned char*)data)[size + 1]); + } + if (format == PyUnicode_FORMAT_UCS4) { + CHECK_END_BYTE(((unsigned char*)data)[size + 1]); + CHECK_END_BYTE(((unsigned char*)data)[size + 2]); + CHECK_END_BYTE(((unsigned char*)data)[size + 3]); + } + PyObject *res = Py_BuildValue("y#I", data, size, (unsigned int)format); PyUnicode_ReleaseExport(obj, data, format); return res; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index c3faa174d1c88d..94801e2adc0d94 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2147,6 +2147,14 @@ PyUnicode_Export(PyObject *unicode, uint32_t supported_formats, if (ucs4 == NULL) { goto error; } + + // The buffer is not 
necessarily zero-terminated. + // In debug mode, explicitly set a non-zero byte. + // For production, keep the safe zero. + assert(ucs4[len] == 0); +#ifdef Py_DEBUG + ucs4[len] = 0xAAAAAAAA; +#endif *format = PyUnicode_FORMAT_UCS4; *size = len * 4; return ucs4; From ea4c7f6d68538041d5d563843fae04fb8188e798 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Thu, 20 Jun 2024 16:46:51 +0200 Subject: [PATCH 08/11] Export compatible strings to UCS2 --- Lib/test/test_capi/test_unicode.py | 9 +++++++++ Objects/unicodeobject.c | 27 +++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index a8bc1a2117687c..a811bce6b3330c 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1725,6 +1725,15 @@ def test_unicode_export(self): ('ucs4:\U0010ffff'.encode(ucs4_enc), PyUnicode_FORMAT_UCS4)) + # export to UCS2 unless it's UCS4 + self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UCS2), + ('abc'.encode(ucs2_enc), PyUnicode_FORMAT_UCS2)) + self.assertEqual(unicode_export("latin1:\xe9", PyUnicode_FORMAT_UCS2), + ('latin1:\xe9'.encode(ucs2_enc), PyUnicode_FORMAT_UCS2)) + self.assertEqual(unicode_export('ucs2:\u20ac', PyUnicode_FORMAT_UCS2), + ('ucs2:\u20ac'.encode(ucs2_enc), + PyUnicode_FORMAT_UCS2)) + # always export to UTF8 self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UTF8), ('abc'.encode('utf8'), PyUnicode_FORMAT_UTF8)) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 94801e2adc0d94..60228c16d5c80f 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2160,6 +2160,30 @@ PyUnicode_Export(PyObject *unicode, uint32_t supported_formats, return ucs4; } + if (supported_formats & PyUnicode_FORMAT_UCS2 + && kind == PyUnicode_1BYTE_KIND) + { + // Convert UCS1 to UCS2 + Py_UCS2 *ucs2 = PyMem_Malloc(sizeof(Py_UCS2) * (len + 1)); + if (!ucs2) { + PyErr_NoMemory(); + goto error; + } + 
_PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS2, + PyUnicode_1BYTE_DATA(unicode), + PyUnicode_1BYTE_DATA(unicode) + len, + ucs2); +#ifdef Py_DEBUG + // See AAAAAAAA in PyUnicode_FORMAT_UCS4 + ucs2[len] = 0xAAAA; +#else + ucs2[len] = 0; +#endif + *format = PyUnicode_FORMAT_UCS2; + *size = len * 2; + return ucs2; + } + if (supported_formats & PyUnicode_FORMAT_UTF8) { // Encode UCS1, UCS2 or UCS4 to UTF-8 const char *utf8 = PyUnicode_AsUTF8AndSize(unicode, size); @@ -2190,6 +2214,9 @@ PyUnicode_ReleaseExport(PyObject *unicode, const void* data, case PyUnicode_FORMAT_UCS1: break; case PyUnicode_FORMAT_UCS2: + if (PyUnicode_KIND(unicode) != PyUnicode_2BYTE_KIND) { + PyMem_Free((void*)data); + } break; case PyUnicode_FORMAT_UCS4: if (PyUnicode_KIND(unicode) != PyUnicode_4BYTE_KIND) { From bece7e206768177574e414579687f576dda31059 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Thu, 20 Jun 2024 16:47:33 +0200 Subject: [PATCH 09/11] Add a more compact roundtrip test This makes it easier to see the tested combinations --- Lib/test/test_capi/test_unicode.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index a811bce6b3330c..20cdedfdde1945 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1810,6 +1810,36 @@ def test_unicode_import(self): with self.assertRaises(ValueError): unicode_import(ucs4[:-3], PyUnicode_FORMAT_UCS4) + def test_unicode_import_export_roundtrip(self): + unicode_export = _testlimitedcapi.unicode_export + unicode_import = _testlimitedcapi.unicode_import + A = PyUnicode_FORMAT_ASCII + CS1 = PyUnicode_FORMAT_UCS1 + CS2 = PyUnicode_FORMAT_UCS2 + CS4 = PyUnicode_FORMAT_UCS4 + TF8 = PyUnicode_FORMAT_UTF8 + for string, alowed_encodings in ( + ('', {A, CS1, CS2, CS4, TF8}), + ('ascii', {A, CS1, CS2, CS4, TF8}), + ('latin1:\xe9', {CS1, CS2, CS4, TF8}), + ('ucs2:\u20ac', {CS2, CS4, TF8}), + ('ucs4:\U0001f638', {CS4, TF8}), 
+ ): + for encoding in A, CS1, CS2, CS4, TF8: + with self.subTest(string=string, encoding=encoding): + if encoding not in alowed_encodings: + with self.assertRaises(ValueError): + unicode_export(string, encoding) + else: + buf, buf_enc = unicode_export(string, encoding) + restored = unicode_import(buf, buf_enc) + self.assertEqual(restored, string) + + with self.subTest(string=string, encoding=-1): + buf, buf_enc = unicode_export(string, -1) + restored = unicode_import(buf, buf_enc) + self.assertEqual(restored, string) + if __name__ == '__main__': unittest.main() From 20592018beff14239608a6f7d586e685a4aa81eb Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Thu, 20 Jun 2024 16:47:58 +0200 Subject: [PATCH 10/11] Rename the argument to *nbytes* to make the unit clearer --- Include/unicodeobject.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 01e76034a54fbc..0f983d08f740a7 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -255,14 +255,14 @@ PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( #define PyUnicode_FORMAT_UTF8 0x10 // Get the content of a string in the requested format: -// - Return the content, set '*size' and '*format' on success. +// - Return the content, set '*nbytes' and '*format' on success. // - Set an exception and return NULL on error. // // The export must be released by PyUnicode_ReleaseExport(). PyAPI_FUNC(const void*) PyUnicode_Export( PyObject *unicode, uint32_t supported_formats, - Py_ssize_t *size, + Py_ssize_t *nbytes, uint32_t *format); // Release an export created by PyUnicode_Export(). @@ -276,7 +276,7 @@ PyAPI_FUNC(void) PyUnicode_ReleaseExport( // - Set an exception and return NULL on error. 
PyAPI_FUNC(PyObject*) PyUnicode_Import( const void *data, - Py_ssize_t size, + Py_ssize_t nbytes, uint32_t format); /* --- wchar_t support for platforms which support it --------------------- */ From dee3755a095836411696d6cf5c6d9274b64bf898 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Thu, 20 Jun 2024 16:48:13 +0200 Subject: [PATCH 11/11] Adjust docs --- Doc/c-api/unicode.rst | 74 ++++++++++++------- Doc/whatsnew/3.14.rst | 5 +- ...-05-27-17-46-17.gh-issue-119609.kPIx6S.rst | 5 +- 3 files changed, 52 insertions(+), 32 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index cc48254255ecf9..1ab4924b763635 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -341,49 +341,71 @@ APIs: .. versionadded:: 3.3 -.. c:function:: const void* PyUnicode_AsNativeFormat(PyObject *unicode, Py_ssize_t *size, int *native_format) +.. c:function:: const void* PyUnicode_Export(PyObject *unicode, uint32_t supported_formats, + Py_ssize_t *nbytes, uint32_t *format) - Get the contents of a string in its native format. + Get the contents of a string in an “export format”. - * Return the contents, set *\*size* and *\*native_format* on success. - * Set an exception and return ``NULL`` on error. + Set *supported_formats* to formats from the following list, OR-ed together: + + .. c:namespace:: NULL - The contents is valid as long as *unicode* is valid. + ======================================== ======== ============================ + Constant Identifier Value Description + ======================================== ======== ============================ + .. c:macro:: PyUnicode_FORMAT_ASCII ``0x01`` ASCII string (``Py_UCS1*``) + .. c:macro:: PyUnicode_FORMAT_UCS1 ``0x02`` UCS-1 string (``Py_UCS1*``) + .. c:macro:: PyUnicode_FORMAT_UCS2 ``0x04`` UCS-2 string (``Py_UCS2*``) + .. c:macro:: PyUnicode_FORMAT_UCS4 ``0x08`` UCS-4 string (``Py_UCS4*``) + .. 
c:macro:: PyUnicode_FORMAT_UTF8 ``0x10`` UTF-8 string (``char*``) + ======================================== ======== ============================ - *unicode*, *size* and *native_format* must not be NULL. + Note that future versions of Python may introduce additional formats. - *\*native_format* is set to one of these native formats: + On success: - .. c:namespace:: NULL + * Return a buffer containing the string data. Note that the buffer is not + necessarily zero-terminated. + * Set *\*format* to the buffer's format -- this will be one of the flags + set in *supported_formats*. + * Set *\*nbytes* to the size of the buffer, in bytes. + + On error, set an exception, set *\*format* and *\*nbytes* to zero, and + return ``NULL``. + + The returned buffer must be later released using + :c:func:`PyUnicode_ReleaseExport`. + + The returned buffer must not be modified. + + If possible, the export is a zero-copy operation -- for example, + the string's underlying storage is returned. + + *unicode*, *nbytes* and *format* must not be NULL. + + .. versionadded:: 3.14 + + +.. c:function:: void PyUnicode_ReleaseExport(PyObject *unicode, const void* data, uint32_t format) + + Release a string's export buffer. The buffer is invalid after this call. - ======================================== ===== ============================ - Constant Identifier Value Description - ======================================== ===== ============================ - .. c:macro:: PyUnicode_NATIVE_ASCII ``1`` ASCII string (``Py_UCS1*``) - .. c:macro:: PyUnicode_NATIVE_UCS1 ``2`` UCS-1 string (``Py_UCS1*``) - .. c:macro:: PyUnicode_NATIVE_UCS2 ``3`` UCS-2 string (``Py_UCS2*``) - .. c:macro:: PyUnicode_NATIVE_UCS4 ``4`` UCS-4 string (``Py_UCS4*``) - .. c:macro:: PyUnicode_NATIVE_UTF8 ``5`` UTF-8 string (``char*``) - ======================================== ===== ============================ - - .. 
impl-detail:: - In CPython, the :c:macro:`PyUnicode_NATIVE_UTF8` format is not used by - :c:func:`PyUnicode_AsNativeFormat`, but it's accepted by - :c:func:`PyUnicode_FromNativeFormat`. + Each argument must match the corresponding argument or result of + a single earlier call to :c:func:`PyUnicode_Export`. .. versionadded:: 3.14 -.. c:function:: PyObject* PyUnicode_FromNativeFormat(const void *data, Py_ssize_t size, int native_format) +.. c:function:: PyObject* PyUnicode_Import(const void *data, Py_ssize_t nbytes, uint32_t format) - Create a string object from a native format string. + Create a string object from a buffer in an “export format”. * Return a reference to a new string object on success. * Set an exception and return ``NULL`` on error. - *data* must not be NULL. *size* must be positive or zero. + *data* must not be NULL. *nbytes* must be positive or zero. - See :c:func:`PyUnicode_AsNativeFormat` for the available native formats. + See :c:func:`PyUnicode_Export` for the available formats. .. versionadded:: 3.14 diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index 0fa79805809f7b..1914ad3114cfd1 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -298,9 +298,8 @@ New Features (Contributed by Victor Stinner in :gh:`119182`.) -* Add :c:func:`PyUnicode_AsNativeFormat` and - :c:func:`PyUnicode_FromNativeFormat` functions to import and export strings - in their native format. +* Add :c:func:`PyUnicode_Import` and :c:func:`PyUnicode_Export` functions to + import and export strings from/to buffers in a given format. (Contributed by Victor Stinner in :gh:`119609`.) 
diff --git a/Misc/NEWS.d/next/C API/2024-05-27-17-46-17.gh-issue-119609.kPIx6S.rst b/Misc/NEWS.d/next/C API/2024-05-27-17-46-17.gh-issue-119609.kPIx6S.rst index 06f9a061ec8ac0..266fac739e338b 100644 --- a/Misc/NEWS.d/next/C API/2024-05-27-17-46-17.gh-issue-119609.kPIx6S.rst +++ b/Misc/NEWS.d/next/C API/2024-05-27-17-46-17.gh-issue-119609.kPIx6S.rst @@ -1,3 +1,2 @@ -Add :c:func:`PyUnicode_AsNativeFormat` and -:c:func:`PyUnicode_FromNativeFormat` functions to import and export strings -in their native format. Patch by Victor Stinner. +Add :c:func:`PyUnicode_Export` and :c:func:`PyUnicode_Import` functions to +import and export strings from native buffers. Patch by Victor Stinner.