Skip to content

Commit

Permalink
Change the API to PyUnicode_Export()
Browse files Browse the repository at this point in the history
  • Loading branch information
vstinner committed Jun 11, 2024
1 parent 28c30c0 commit 4d77192
Show file tree
Hide file tree
Showing 4 changed files with 207 additions and 91 deletions.
24 changes: 15 additions & 9 deletions Include/unicodeobject.h
Original file line number Diff line number Diff line change
Expand Up @@ -248,27 +248,33 @@ PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
const char *u /* UTF-8 encoded string */
);

#define PyUnicode_NATIVE_ASCII 1
#define PyUnicode_NATIVE_UCS1 2
#define PyUnicode_NATIVE_UCS2 3
#define PyUnicode_NATIVE_UCS4 4
#define PyUnicode_NATIVE_UTF8 5
#define PyUnicode_FORMAT_ASCII 0x01
#define PyUnicode_FORMAT_UCS1 0x02
#define PyUnicode_FORMAT_UCS2 0x04
#define PyUnicode_FORMAT_UCS4 0x08
#define PyUnicode_FORMAT_UTF8 0x10

// Export the content of a string in one of the 'supported_formats'.
// - Return the content, set '*size' and '*format' on success.
// - Set an exception and return NULL on error.
PyAPI_FUNC(const void*) PyUnicode_AsNativeFormat(
PyAPI_FUNC(const void*) PyUnicode_Export(
PyObject *unicode,
unsigned int supported_formats,
Py_ssize_t *size,
int *native_format);
unsigned int *format);

PyAPI_FUNC(void) PyUnicode_FreeExport(
PyObject *unicode,
const void* data,
unsigned int format);

// Create a string object from 'data' encoded in the given 'format'.
// - Return a reference to a new string object on success.
// - Set an exception and return NULL on error.
PyAPI_FUNC(PyObject*) PyUnicode_FromNativeFormat(
PyAPI_FUNC(PyObject*) PyUnicode_Import(
const void *data,
Py_ssize_t size,
int native_format);
unsigned int format);

/* --- wchar_t support for platforms which support it --------------------- */

Expand Down
137 changes: 91 additions & 46 deletions Lib/test/test_capi/test_unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,13 @@ class Str(str):
pass


PyUnicode_NATIVE_ASCII = 1
PyUnicode_NATIVE_UCS1 = 2
PyUnicode_NATIVE_UCS2 = 3
PyUnicode_NATIVE_UCS4 = 4
PyUnicode_NATIVE_UTF8 = 5
PyUnicode_FORMAT_ASCII = 0x01
PyUnicode_FORMAT_UCS1 = 0x02
PyUnicode_FORMAT_UCS2 = 0x04
PyUnicode_FORMAT_UCS4 = 0x08
PyUnicode_FORMAT_UTF8 = 0x10
# Invalid format (not one of the PyUnicode_FORMAT_* flags above)
PyUnicode_NATIVE_INVALID = 0
PyUnicode_FORMAT_INVALID = 0x20

class CAPITest(unittest.TestCase):

Expand Down Expand Up @@ -1683,74 +1683,119 @@ def test_pep393_utf8_caching_bug(self):
# Check that the second call returns the same result
self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))

def test_unicode_asnativeformat(self):
# Test PyUnicode_AsNativeFormat()
asnativeformat = _testlimitedcapi.unicode_asnativeformat
self.assertEqual(asnativeformat("abc"),
(b'abc', PyUnicode_NATIVE_ASCII))
self.assertEqual(asnativeformat("latin1:\xe9"),
(b'latin1:\xe9', PyUnicode_NATIVE_UCS1))

ucs2_enc = 'utf-16le' if sys.byteorder == 'little' else 'utf-16be'
self.assertEqual(asnativeformat('ucs2:\u20ac'),
def test_unicode_export(self):
# Test PyUnicode_Export() and PyUnicode_FreeExport()
unicode_export = _testlimitedcapi.unicode_export
if sys.byteorder == 'little':
ucs2_enc = 'utf-16le'
ucs4_enc = 'utf-32le'
else:
ucs2_enc = 'utf-16be'
ucs4_enc = 'utf-32be'

# export to the native format
formats = (PyUnicode_FORMAT_ASCII
| PyUnicode_FORMAT_UCS1
| PyUnicode_FORMAT_UCS2
| PyUnicode_FORMAT_UCS4)
self.assertEqual(unicode_export("abc", formats),
(b'abc', PyUnicode_FORMAT_ASCII))
self.assertEqual(unicode_export("latin1:\xe9", formats),
(b'latin1:\xe9', PyUnicode_FORMAT_UCS1))
self.assertEqual(unicode_export('ucs2:\u20ac', formats),
('ucs2:\u20ac'.encode(ucs2_enc),
PyUnicode_NATIVE_UCS2))

ucs4_enc = 'utf-32le' if sys.byteorder == 'little' else 'utf-32be'
self.assertEqual(asnativeformat('ucs4:\U0010ffff'),
PyUnicode_FORMAT_UCS2))
self.assertEqual(unicode_export('ucs4:\U0010ffff', formats),
('ucs4:\U0010ffff'.encode(ucs4_enc),
PyUnicode_NATIVE_UCS4))

def test_unicode_fromnativeformat(self):
# Test PyUnicode_FromNativeFormat()
fromnativeformat = _testlimitedcapi.unicode_fromnativeformat
self.assertEqual(fromnativeformat(b'abc', PyUnicode_NATIVE_ASCII),
PyUnicode_FORMAT_UCS4))

# always export to UCS4
self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UCS4),
('abc'.encode(ucs4_enc), PyUnicode_FORMAT_UCS4))
self.assertEqual(unicode_export("latin1:\xe9", PyUnicode_FORMAT_UCS4),
('latin1:\xe9'.encode(ucs4_enc), PyUnicode_FORMAT_UCS4))
self.assertEqual(unicode_export('ucs2:\u20ac', PyUnicode_FORMAT_UCS4),
('ucs2:\u20ac'.encode(ucs4_enc),
PyUnicode_FORMAT_UCS4))
self.assertEqual(unicode_export('ucs4:\U0010ffff', PyUnicode_FORMAT_UCS4),
('ucs4:\U0010ffff'.encode(ucs4_enc),
PyUnicode_FORMAT_UCS4))

# always export to UTF8
self.assertEqual(unicode_export("abc", PyUnicode_FORMAT_UTF8),
('abc'.encode('utf8'), PyUnicode_FORMAT_UTF8))
self.assertEqual(unicode_export("latin1:\xe9", PyUnicode_FORMAT_UTF8),
('latin1:\xe9'.encode('utf8'), PyUnicode_FORMAT_UTF8))
self.assertEqual(unicode_export('ucs2:\u20ac', PyUnicode_FORMAT_UTF8),
('ucs2:\u20ac'.encode('utf8'),
PyUnicode_FORMAT_UTF8))
self.assertEqual(unicode_export('ucs4:\U0010ffff', PyUnicode_FORMAT_UTF8),
('ucs4:\U0010ffff'.encode('utf8'),
PyUnicode_FORMAT_UTF8))

# No supported format or invalid format
with self.assertRaisesRegex(ValueError,
"unable to find a matching export format"):
unicode_export('abc', 0)
with self.assertRaisesRegex(ValueError,
"unable to find a matching export format"):
unicode_export('abc', PyUnicode_FORMAT_INVALID)

def test_unicode_import(self):
# Test PyUnicode_Import()
unicode_import = _testlimitedcapi.unicode_import
if sys.byteorder == 'little':
ucs2_enc = 'utf-16le'
ucs4_enc = 'utf-32le'
else:
ucs2_enc = 'utf-16be'
ucs4_enc = 'utf-32be'

self.assertEqual(unicode_import(b'abc', PyUnicode_FORMAT_ASCII),
"abc")
self.assertEqual(fromnativeformat(b'latin1:\xe9', PyUnicode_NATIVE_UCS1),
self.assertEqual(unicode_import(b'latin1:\xe9', PyUnicode_FORMAT_UCS1),
"latin1:\xe9")

ucs2_enc = 'utf-16le' if sys.byteorder == 'little' else 'utf-16be'
self.assertEqual(fromnativeformat('ucs2:\u20ac'.encode(ucs2_enc),
PyUnicode_NATIVE_UCS2),
self.assertEqual(unicode_import('ucs2:\u20ac'.encode(ucs2_enc),
PyUnicode_FORMAT_UCS2),
'ucs2:\u20ac')

ucs4_enc = 'utf-32le' if sys.byteorder == 'little' else 'utf-32be'
self.assertEqual(fromnativeformat('ucs4:\U0010ffff'.encode(ucs4_enc),
PyUnicode_NATIVE_UCS4),
self.assertEqual(unicode_import('ucs4:\U0010ffff'.encode(ucs4_enc),
PyUnicode_FORMAT_UCS4),
'ucs4:\U0010ffff')

text = "abc\xe9\U0010ffff"
self.assertEqual(fromnativeformat(text.encode('utf8'),
PyUnicode_NATIVE_UTF8),
self.assertEqual(unicode_import(text.encode('utf8'),
PyUnicode_FORMAT_UTF8),
text)

# Empty string
for native_format in (
PyUnicode_NATIVE_ASCII,
PyUnicode_NATIVE_UCS1,
PyUnicode_NATIVE_UCS2,
PyUnicode_NATIVE_UCS4,
PyUnicode_NATIVE_UTF8,
PyUnicode_FORMAT_ASCII,
PyUnicode_FORMAT_UCS1,
PyUnicode_FORMAT_UCS2,
PyUnicode_FORMAT_UCS4,
PyUnicode_FORMAT_UTF8,
):
with self.subTest(native_format=native_format):
self.assertEqual(fromnativeformat(b'', native_format),
self.assertEqual(unicode_import(b'', native_format),
'')

# Invalid format
with self.assertRaises(ValueError):
fromnativeformat(b'', PyUnicode_NATIVE_INVALID)
unicode_import(b'', PyUnicode_FORMAT_INVALID)

# Invalid size
ucs2 = 'ucs2:\u20ac'.encode(ucs2_enc)
with self.assertRaises(ValueError):
fromnativeformat(ucs2[:-1], PyUnicode_NATIVE_UCS2)
unicode_import(ucs2[:-1], PyUnicode_FORMAT_UCS2)
ucs4 = 'ucs4:\U0010ffff'.encode(ucs4_enc)
with self.assertRaises(ValueError):
fromnativeformat(ucs4[:-1], PyUnicode_NATIVE_UCS4)
unicode_import(ucs4[:-1], PyUnicode_FORMAT_UCS4)
with self.assertRaises(ValueError):
fromnativeformat(ucs4[:-2], PyUnicode_NATIVE_UCS4)
unicode_import(ucs4[:-2], PyUnicode_FORMAT_UCS4)
with self.assertRaises(ValueError):
fromnativeformat(ucs4[:-3], PyUnicode_NATIVE_UCS4)
unicode_import(ucs4[:-3], PyUnicode_FORMAT_UCS4)


if __name__ == '__main__':
Expand Down
29 changes: 19 additions & 10 deletions Modules/_testlimitedcapi/unicode.c
Original file line number Diff line number Diff line change
Expand Up @@ -1840,29 +1840,38 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored))

// Test PyUnicode_Export() and PyUnicode_FreeExport().
// Takes (obj, supported_formats) and returns a (bytes, format) tuple.
static PyObject*
unicode_asnativeformat(PyObject *self, PyObject *obj)
unicode_export(PyObject *self, PyObject *args)
{
PyObject *obj;
unsigned int supported_formats;
if (!PyArg_ParseTuple(args, "OI", &obj, &supported_formats)) {
return NULL;
}

Py_ssize_t size;
int native_format;
const void *data = PyUnicode_AsNativeFormat(obj, &size, &native_format);
unsigned int format;
const void *data = PyUnicode_Export(obj, supported_formats, &size, &format);
if (data == NULL) {
return NULL;
}
return Py_BuildValue("y#i", data, size, native_format);

// Build the result before releasing the export: "y#" reads from 'data'.
// NOTE(review): 'format' is now unsigned int but is still formatted with
// "i" (C int); "I" would match the type and the "OI" parsing above.
PyObject *res = Py_BuildValue("y#i", data, size, format);
PyUnicode_FreeExport(obj, data, format);
return res;
}


// Test PyUnicode_Import().
// Takes (bytes, format) and returns the result of PyUnicode_Import().
static PyObject*
unicode_fromnativeformat(PyObject *self, PyObject *args)
unicode_import(PyObject *self, PyObject *args)
{
const void *data;
Py_ssize_t size;
int native_format;
if (!PyArg_ParseTuple(args, "y#i", &data, &size, &native_format)) {
unsigned int format;
// NOTE(review): "i" stores a C int, but 'format' is unsigned int here;
// unicode_export() parses its unsigned argument with "I" -- confirm
// whether this should be "y#I" for consistency.
if (!PyArg_ParseTuple(args, "y#i", &data, &size, &format)) {
return NULL;
}
return PyUnicode_FromNativeFormat(data, size, native_format);
return PyUnicode_Import(data, size, format);
}


Expand Down Expand Up @@ -1953,8 +1962,8 @@ static PyMethodDef TestMethods[] = {
{"unicode_format", unicode_format, METH_VARARGS},
{"unicode_contains", unicode_contains, METH_VARARGS},
{"unicode_isidentifier", unicode_isidentifier, METH_O},
{"unicode_asnativeformat", unicode_asnativeformat, METH_O},
{"unicode_fromnativeformat", unicode_fromnativeformat, METH_VARARGS},
{"unicode_export", unicode_export, METH_VARARGS},
{"unicode_import", unicode_import, METH_VARARGS},
{NULL},
};

Expand Down
Loading

0 comments on commit 4d77192

Please sign in to comment.