From 13d9d72e03dbfe4444fde4455f48962f0b0d5e01 Mon Sep 17 00:00:00 2001
From: Richard Aas <richaraas@gmail.com>
Date: Fri, 6 Apr 2018 14:47:16 +0200
Subject: [PATCH] json/utf8: fix unescaping of unicode code points (#127)

---
 include/re_fmt.h  |  1 +
 src/fmt/unicode.c | 78 ++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 71 insertions(+), 8 deletions(-)

diff --git a/include/re_fmt.h b/include/re_fmt.h
index 440bd589..55806bf2 100644
--- a/include/re_fmt.h
+++ b/include/re_fmt.h
@@ -144,3 +144,4 @@ void fmt_param_apply(const struct pl *pl, fmt_param_h *ph, void *arg);
 /* unicode */
 int utf8_encode(struct re_printf *pf, const char *str);
 int utf8_decode(struct re_printf *pf, const struct pl *pl);
+size_t utf8_byteseq(char u[4], unsigned cp);
diff --git a/src/fmt/unicode.c b/src/fmt/unicode.c
index e7118359..1dde72e5 100644
--- a/src/fmt/unicode.c
+++ b/src/fmt/unicode.c
@@ -86,6 +86,7 @@ int utf8_encode(struct re_printf *pf, const char *str)
  */
 int utf8_decode(struct re_printf *pf, const struct pl *pl)
 {
+	int uhi = -1;
 	size_t i;
 
 	if (!pf)
@@ -101,7 +102,9 @@ int utf8_decode(struct re_printf *pf, const struct pl *pl)
 
 		if (ch == '\\') {
 
-			uint16_t u = 0;
+			unsigned u = 0;
+			char ubuf[4];
+			size_t ulen;
 
 			++i;
 
@@ -147,18 +150,32 @@ int utf8_decode(struct re_printf *pf, const struct pl *pl)
 				u |= ((uint16_t)ch_hex(pl->p[++i])) << 4;
 				u |= ((uint16_t)ch_hex(pl->p[++i])) << 0;
 
-				if (u > 255) {
-					ch  = u>>8;
-					err = pf->vph(&ch, 1, pf->arg);
-					if (err)
-						return err;
+				/* UTF-16 surrogate pair */
+				if (u >= 0xd800 && u <= 0xdbff) {
+					uhi = (u - 0xd800) * 0x400;
+					continue;
 				}
+				else if (u >= 0xdc00 && u <= 0xdfff) {
+					if (uhi < 0)
+						continue;
 
-				ch = u & 0xff;
-				break;
+					u = uhi + u - 0xdc00 + 0x10000;
+				}
+
+				uhi = -1;
+
+				ulen = utf8_byteseq(ubuf, u);
+
+				err = pf->vph(ubuf, ulen, pf->arg);
+				if (err)
+					return err;
+
+				continue;
 			}
 		}
 
+		uhi = -1;
+
 		err = pf->vph(&ch, 1, pf->arg);
 		if (err)
 			return err;
@@ -166,3 +183,48 @@ int utf8_decode(struct re_printf *pf, const struct pl *pl)
 
 	return 0;
 }
+
+
+/**
+ * Encode a Unicode code point as a binary UTF-8 byte sequence
+ *
+ * @param u  Output buffer of at least 4 bytes (not NUL-terminated)
+ * @param cp Unicode code point; surrogates (U+D800..U+DFFF) and values
+ *           above U+10FFFF are encoded as U+FFFD
+ *
+ * @return length of UTF-8 byte sequence (1-4), or 0 if u is NULL
+ */
+size_t utf8_byteseq(char u[4], unsigned cp)
+{
+	if (!u)
+		return 0;
+
+	/* Surrogates and values above U+10FFFF cannot be encoded as
+	 * well-formed UTF-8; substitute the replacement character */
+	if ((cp >= 0xd800 && cp <= 0xdfff) || cp > 0x10ffff)
+		cp = 0xfffd;
+
+	if (cp <= 0x7f) {
+		u[0] = (char)cp;
+		return 1;
+	}
+	else if (cp <= 0x7ff) {
+		u[0] = (char)(0xc0 | (cp>>6 & 0x1f));
+		u[1] = (char)(0x80 | (cp    & 0x3f));
+		return 2;
+	}
+	else if (cp <= 0xffff) {
+		u[0] = (char)(0xe0 | (cp>>12 & 0x0f));
+		u[1] = (char)(0x80 | (cp>>6  & 0x3f));
+		u[2] = (char)(0x80 | (cp     & 0x3f));
+		return 3;
+	}
+
+	/* remaining range: U+10000..U+10FFFF */
+	u[0] = (char)(0xf0 | (cp>>18 & 0x07));
+	u[1] = (char)(0x80 | (cp>>12 & 0x3f));
+	u[2] = (char)(0x80 | (cp>>6  & 0x3f));
+	u[3] = (char)(0x80 | (cp     & 0x3f));
+
+	return 4;
+}