From 13d9d72e03dbfe4444fde4455f48962f0b0d5e01 Mon Sep 17 00:00:00 2001 From: Richard Aas Date: Fri, 6 Apr 2018 14:47:16 +0200 Subject: [PATCH] json/utf8: fix unescaping of unicode code points (#127) --- include/re_fmt.h | 1 + src/fmt/unicode.c | 78 ++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 71 insertions(+), 8 deletions(-) diff --git a/include/re_fmt.h b/include/re_fmt.h index 440bd589..55806bf2 100644 --- a/include/re_fmt.h +++ b/include/re_fmt.h @@ -144,3 +144,4 @@ void fmt_param_apply(const struct pl *pl, fmt_param_h *ph, void *arg); /* unicode */ int utf8_encode(struct re_printf *pf, const char *str); int utf8_decode(struct re_printf *pf, const struct pl *pl); +size_t utf8_byteseq(char u[4], unsigned cp); diff --git a/src/fmt/unicode.c b/src/fmt/unicode.c index e7118359..1dde72e5 100644 --- a/src/fmt/unicode.c +++ b/src/fmt/unicode.c @@ -86,6 +86,7 @@ int utf8_encode(struct re_printf *pf, const char *str) */ int utf8_decode(struct re_printf *pf, const struct pl *pl) { + int uhi = -1; size_t i; if (!pf) @@ -101,7 +102,9 @@ int utf8_decode(struct re_printf *pf, const struct pl *pl) if (ch == '\\') { - uint16_t u = 0; + unsigned u = 0; + char ubuf[4]; + size_t ulen; ++i; @@ -147,18 +150,32 @@ int utf8_decode(struct re_printf *pf, const struct pl *pl) u |= ((uint16_t)ch_hex(pl->p[++i])) << 4; u |= ((uint16_t)ch_hex(pl->p[++i])) << 0; - if (u > 255) { - ch = u>>8; - err = pf->vph(&ch, 1, pf->arg); - if (err) - return err; + /* UTF-16 surrogate pair */ + if (u >= 0xd800 && u <= 0xdbff) { + uhi = (u - 0xd800) * 0x400; + continue; } + else if (u >= 0xdc00 && u <= 0xdfff) { + if (uhi < 0) + continue; - ch = u & 0xff; - break; + u = uhi + u - 0xdc00 + 0x10000; + } + + uhi = -1; + + ulen = utf8_byteseq(ubuf, u); + + err = pf->vph(ubuf, ulen, pf->arg); + if (err) + return err; + + continue; } } + uhi = -1; + err = pf->vph(&ch, 1, pf->arg); if (err) return err; @@ -166,3 +183,48 @@ int utf8_decode(struct re_printf *pf, const struct pl *pl) return 0; } 
/**
 * Encode a Unicode code point as a UTF-8 byte sequence
 *
 * Values that have no well-formed UTF-8 encoding, i.e. the UTF-16
 * surrogate range U+D800..U+DFFF and anything above U+10FFFF, are
 * encoded as the replacement character U+FFFD (EF BF BD).
 *
 * @param u   Output buffer for the UTF-8 bytes (no NUL terminator is added)
 * @param cp  Unicode code point
 *
 * @return Number of bytes written to u (1-4), or 0 if u is NULL
 */
size_t utf8_byteseq(char u[4], unsigned cp)
{
	if (!u)
		return 0;

	/* Surrogates are reserved for UTF-16 and are not allowed in UTF-8,
	 * and nothing above U+10FFFF is a valid code point. Substitute
	 * U+FFFD up front so the encoder below only sees encodable values. */
	if ((cp >= 0xd800 && cp <= 0xdfff) || cp > 0x10ffff)
		cp = 0xfffd;

	/* 1 byte: 0xxxxxxx */
	if (cp <= 0x7f) {
		u[0] = (char)cp;
		return 1;
	}

	/* 2 bytes: 110xxxxx 10xxxxxx */
	if (cp <= 0x7ff) {
		u[0] = (char)(0xc0 | (cp>>6  & 0x1f));
		u[1] = (char)(0x80 | (cp     & 0x3f));
		return 2;
	}

	/* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
	if (cp <= 0xffff) {
		u[0] = (char)(0xe0 | (cp>>12 & 0x0f));
		u[1] = (char)(0x80 | (cp>>6  & 0x3f));
		u[2] = (char)(0x80 | (cp     & 0x3f));
		return 3;
	}

	/* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	u[0] = (char)(0xf0 | (cp>>18 & 0x07));
	u[1] = (char)(0x80 | (cp>>12 & 0x3f));
	u[2] = (char)(0x80 | (cp>>6  & 0x3f));
	u[3] = (char)(0x80 | (cp     & 0x3f));
	return 4;
}