utf8_decode for printf_atom

erlang · Jul 11, 2024 · 6f11039 · 6f11039
1 parent 1286399
commit 6f11039
Show file tree

Hide file tree

Showing 8 changed files with 72 additions and 102 deletions.
diff --git a/erts/emulator/beam/atom.c b/erts/emulator/beam/atom.c
@@ -115,7 +115,7 @@ atom_hash(Atom* obj)
     return h;
 }
 
-byte *erts_atom_get_name(Atom *atom)
+const byte *erts_atom_get_name(const Atom *atom)
 {
     byte *name;
     Uint size;

diff --git a/erts/emulator/beam/atom.h b/erts/emulator/beam/atom.h
@@ -62,7 +62,7 @@ ERTS_GLB_INLINE Atom* atom_tab(Uint i);
 ERTS_GLB_INLINE int erts_is_atom_utf8_bytes(byte *text, size_t len, Eterm term);
 ERTS_GLB_INLINE int erts_is_atom_str(const char *str, Eterm term, int is_latin1);
 
-byte *erts_atom_get_name(Atom *atom);
+const byte *erts_atom_get_name(const Atom *atom);
 
 #if ERTS_GLB_INLINE_INCL_FUNC_DEF
 ERTS_GLB_INLINE Atom*

diff --git a/erts/emulator/beam/erl_db_util.c b/erts/emulator/beam/erl_db_util.c
@@ -3851,7 +3851,7 @@ bool db_eq_comp(DbTableCommon* tb, Eterm a, DbTerm* b)
 
 int db_is_variable(Eterm obj)
 {
-    byte *b;
+    const byte *b;
     int n;
     int N;
 

diff --git a/erts/emulator/beam/erl_printf_term.c b/erts/emulator/beam/erl_printf_term.c
@@ -188,36 +188,34 @@ static int is_printable_ascii(byte* bytep, Uint bytesize, Uint bitoffs)
  * valid character, the offset is updated to point to the next character. size 
  * is only used for debugging.
  */
-static ERTS_INLINE int utf8_decode(const unsigned char *text, int *offset, int size)
+static ERTS_INLINE int utf8_decode(const byte *text, int *offset, int size)
 {
-    int c1 = text[*offset];
+    int component = text[*offset];
     int codepoint = 0;
     int length = 0;
 
-    if ((c1 & 0x80) == 0) {
-        codepoint = c1;
+    if ((component & 0x80) == 0) {
+        codepoint = component;
         length = 1;
-    } else if ((c1 & 0xE0) == 0xC0) {
-        codepoint = c1 & 0x1F;
+    } else if ((component & 0xE0) == 0xC0) {
+        codepoint = component & 0x1F;
         length = 2;
-    } else if ((c1 & 0xF0) == 0xE0) {
-        codepoint = c1 & 0x0F;
+    } else if ((component & 0xF0) == 0xE0) {
+        codepoint = component & 0x0F;
         length = 3;
-    } else if ((c1 & 0xF8) == 0xF0) {
-        codepoint = c1 & 0x07;
-        length = 4;
     } else {
-        /* Invalid first byte */ 
-        ASSERT(length != 0);
-    }
+        ASSERT((component & 0xF8) == 0xF0);
+        codepoint = component & 0x07;
+        length = 4;
+    } 
 
     /* Assert that there are enough bytes for decoding */
     ASSERT(*offset + length <= size);
 
     for (int i = 1; i < length; i++) {
-        c1 = text[*offset + i];
-        ASSERT((c1 & 0xC0) == 0x80);
-        codepoint = (codepoint << 6) | (c1 & 0x3F);
+        component = text[*offset + i];
+        ASSERT((component & 0xC0) == 0x80);
+        codepoint = (codepoint << 6) | (component & 0x3F);
     }
 
     *offset += length;
@@ -233,31 +231,35 @@ static ERTS_INLINE int utf8_decode(const unsigned char *text, int *offset, int s
  */
 static int print_atom_name(fmtfn_t fn, void* arg, Eterm atom, long *dcount)
 {
-    int n, i;
+    int length, i;
+    const Atom *entry;
     int res;
     int need_quote;
     int pos;
-    byte *s;
-    byte *cpos;
-    int c;
-    int lc;
+    const byte *s;
+    int codepoint;
 
     res = 0;
     i = atom_val(atom);
+    entry = NULL;
 
-    if ((i < 0) || (i >= atom_table_size()) ||  (atom_tab(i) == NULL)) {
-	PRINT_STRING(res, fn, arg, "<bad atom index: ");
-	PRINT_SWORD(res, fn, arg, 'd', 0, 1, (ErlPfSWord) i);
-	PRINT_CHAR(res, fn, arg, '>');
-	return res;
+    if ((i > 0) || (i <= atom_table_size()) ) {
+        entry = atom_tab(i);
+    }
+
+    if (entry == NULL) {
+        PRINT_STRING(res, fn, arg, "<bad atom index: ");
+	    PRINT_SWORD(res, fn, arg, 'd', 0, 1, (ErlPfSWord) i);
+	    PRINT_CHAR(res, fn, arg, '>');
+	    return res;
     }
 
-    s = erts_atom_get_name(atom_tab(i));
-    n = atom_tab(i)->len;
+    s = erts_atom_get_name(entry);
+    length = entry->len;
 
-    *dcount -= atom_tab(i)->len;
+    *dcount -= entry->len;
 
-    if (n == 0) {
+    if (length == 0) {
         /* The empty atom: '' */
 	PRINT_STRING(res, fn, arg, "''");
 	return res;
@@ -274,19 +276,12 @@ static int print_atom_name(fmtfn_t fn, void* arg, Eterm atom, long *dcount)
      *     the Latin-1 code block or the character '_'.
      */
 
-    need_quote = 0;
     pos = 0;
-    lc = utf8_decode(s, &pos, n);
-    if (!IS_LOWER(lc))
-	need_quote++;
-    else {
-	while (pos < n) {
-        lc = utf8_decode(s, &pos, n);
-	    if (!IS_ALNUM(lc) && lc != '_') {
-		need_quote++;
-		break;
-	    }
-	}
+    codepoint = utf8_decode(s, &pos, length);
+    need_quote = !IS_LOWER(codepoint);
+    while (pos < length && !need_quote) {
+        codepoint = utf8_decode(s, &pos, length);
+        need_quote = !IS_ALNUM(codepoint) && codepoint != '_';
     }
 
     /*
@@ -296,58 +291,32 @@ static int print_atom_name(fmtfn_t fn, void* arg, Eterm atom, long *dcount)
      * be specially printed. Therefore, we must do a partial
      * decoding of the utf8 encoding.
      */
-    cpos = s;
-    pos = n;
+    pos = 0;
     if (need_quote)
 	PRINT_CHAR(res, fn, arg, '\'');
-    while(pos--) {
-	c = *cpos++;
-	switch(c) {
-	case '\'': PRINT_STRING(res, fn, arg, "\\'"); break;
-	case '\\': PRINT_STRING(res, fn, arg, "\\\\"); break;
-	case '\n': PRINT_STRING(res, fn, arg, "\\n"); break;
-	case '\f': PRINT_STRING(res, fn, arg, "\\f"); break;
-	case '\t': PRINT_STRING(res, fn, arg, "\\t"); break;
-	case '\r': PRINT_STRING(res, fn, arg, "\\r"); break;
-	case '\b': PRINT_STRING(res, fn, arg, "\\b"); break;
-	case '\v': PRINT_STRING(res, fn, arg, "\\v"); break;
-	default:
-            if (c < ' ') {
-                /* ASCII control character (0-31). */
-		PRINT_CHAR(res, fn, arg, '\\');
-		PRINT_UWORD(res, fn, arg, 'o', 1, 3, (ErlPfUWord) c);
-            } else if (c >= 0x80) {
-                /* A multi-byte utf8-encoded code point. Determine the
-                 * length of the sequence. */
-                int n;
-                if ((c & 0xE0) == 0xC0) {
-                    n = 2;
-                } else if ((c & 0xF0) == 0xE0) {
-                    n = 3;
-                } else {
-                    ASSERT((c & 0xF8) == 0xF0);
-                    n = 4;
-                }
-                ASSERT(pos - n + 1 >= 0);
-
-                if (c == 0xC2 && *cpos < 0xA0) {
-                    /* Extended ASCII control character (128-159). */
-                    ASSERT(pos > 0);
-                    ASSERT(0x80 <= *cpos);
-                    PRINT_CHAR(res, fn, arg, '\\');
-                    PRINT_UWORD(res, fn, arg, 'o', 1, 3, (ErlPfUWord) *cpos);
-                    pos--, cpos++;
-                } else {
-                    PRINT_BUF(res, fn, arg, cpos-1, n);
-                    cpos += n - 1;
-                    pos -= n - 1;
-                }
+
+    while(pos < length) {
+        int cp_start = pos;
+        codepoint = utf8_decode(s, &pos, length);
+        switch(codepoint) {
+        case '\'': PRINT_STRING(res, fn, arg, "\\'"); break;
+        case '\\': PRINT_STRING(res, fn, arg, "\\\\"); break;
+        case '\n': PRINT_STRING(res, fn, arg, "\\n"); break;
+        case '\f': PRINT_STRING(res, fn, arg, "\\f"); break;
+        case '\t': PRINT_STRING(res, fn, arg, "\\t"); break;
+        case '\r': PRINT_STRING(res, fn, arg, "\\r"); break;
+        case '\b': PRINT_STRING(res, fn, arg, "\\b"); break;
+        case '\v': PRINT_STRING(res, fn, arg, "\\v"); break;
+        default:
+            if (codepoint < 32 || (codepoint >= 128 && codepoint <= 159)) {
+                /* ASCII control character (0-31) or extended ASCII control character (128-159)*/
+                PRINT_CHAR(res, fn, arg, '\\');
+                PRINT_UWORD(res, fn, arg, 'o', 1, 3, (ErlPfUWord) codepoint);
             } else {
-                /* Printable ASCII character. */
-		PRINT_CHAR(res, fn, arg, (char) c);
+                PRINT_BUF(res, fn, arg, &s[cp_start], pos - cp_start);
             }
-	    break;
-	}
+            break;
+        }
     }
     if (need_quote)
 	PRINT_CHAR(res, fn, arg, '\'');

diff --git a/erts/emulator/beam/erl_process_dump.c b/erts/emulator/beam/erl_process_dump.c
@@ -323,7 +323,7 @@ dump_element(fmtfn_t to, void *to_arg, Eterm x)
 	erts_print(to, to_arg, "H" PTR_FMT, boxed_val(x));
     } else if (is_immed(x)) {
 	if (is_atom(x)) {
-	    unsigned char* s = erts_atom_get_name(atom_tab(atom_val(x)));
+	    const byte* s = erts_atom_get_name(atom_tab(atom_val(x)));
 	    int len = atom_tab(atom_val(x))->len;
 	    int i;
 

diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c
@@ -1347,7 +1347,7 @@ static Eterm do_utf8_to_list(Process *p, Uint num, const byte *bytes, Uint sz,
                                         num_built, num_eaten,
                                         tail);
 }
-Eterm erts_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, Uint left,
+Eterm erts_utf8_to_list(Process *p, Uint num, const byte *bytes, Uint sz, Uint left,
 			Uint *num_built, Uint *num_eaten, Eterm tail)
 {
     return do_utf8_to_list(p, num, bytes, sz, left, num_built, num_eaten, tail);
@@ -2230,12 +2230,13 @@ Sint erts_native_filename_need(Eterm ioterm, int encoding)
 		need = 2* ap->latin1_chars;
             }
 	    else {
+		const byte * name = erts_atom_get_name(ap);
 		for (i = 0; i < ap->len; ) {
-                    if (erts_atom_get_name(ap)[i] < 0x80) {
+                    if (name[i] < 0x80) {
 			i++;
-                    } else if (erts_atom_get_name(ap)[i] < 0xE0) {
+                    } else if (name[i] < 0xE0) {
 			i += 2;
-                    } else if (erts_atom_get_name(ap)[i] < 0xF0) {
+                    } else if (name[i] < 0xF0) {
 			i += 3;
                     } else {
 			need = -1;
@@ -2253,7 +2254,7 @@ Sint erts_native_filename_need(Eterm ioterm, int encoding)
          * the middle of filenames
          */
         if (need > 0) {
-            byte *name = erts_atom_get_name(ap);
+            const byte *name = erts_atom_get_name(ap);
             int len = ap->len;
             for (i = 0; i < len; i++) {
                 if (name[i] == 0) {

diff --git a/erts/emulator/beam/erl_utils.h b/erts/emulator/beam/erl_utils.h
@@ -192,7 +192,7 @@ ERTS_GLB_INLINE int erts_cmp_atoms(Eterm a, Eterm b) {
     Atom *aa = atom_tab(atom_val(a));
     Atom *bb = atom_tab(atom_val(b));
 
-    byte *name_a, *name_b;
+    const byte *name_a, *name_b;
     int len_a, len_b, diff;
 
     diff = aa->ord0 - bb->ord0;

diff --git a/erts/emulator/beam/global.h b/erts/emulator/beam/global.h
@@ -1491,7 +1491,7 @@ char *erts_convert_filename_to_wchar(const byte* bytes, Uint size,
                                      ErtsAlcType_t alloc_type, Sint* used,
                                      Uint extra_wchars);
 Eterm erts_convert_native_to_filename(Process *p, size_t size, byte *bytes);
-Eterm erts_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, Uint left,
+Eterm erts_utf8_to_list(Process *p, Uint num, const byte *bytes, Uint sz, Uint left,
 			Uint *num_built, Uint *num_eaten, Eterm tail);
 Eterm
 erts_make_list_from_utf8_buf(Eterm **hpp, Uint num,