Skip to content

Commit

Permalink
utf8_decode for printf_atom
Browse files Browse the repository at this point in the history
  • Loading branch information
lucioleKi committed Jul 11, 2024
1 parent 1286399 commit 6f11039
Show file tree
Hide file tree
Showing 8 changed files with 72 additions and 102 deletions.
2 changes: 1 addition & 1 deletion erts/emulator/beam/atom.c
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ atom_hash(Atom* obj)
return h;
}

byte *erts_atom_get_name(Atom *atom)
const byte *erts_atom_get_name(const Atom *atom)
{
byte *name;
Uint size;
Expand Down
2 changes: 1 addition & 1 deletion erts/emulator/beam/atom.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ ERTS_GLB_INLINE Atom* atom_tab(Uint i);
ERTS_GLB_INLINE int erts_is_atom_utf8_bytes(byte *text, size_t len, Eterm term);
ERTS_GLB_INLINE int erts_is_atom_str(const char *str, Eterm term, int is_latin1);

byte *erts_atom_get_name(Atom *atom);
const byte *erts_atom_get_name(const Atom *atom);

#if ERTS_GLB_INLINE_INCL_FUNC_DEF
ERTS_GLB_INLINE Atom*
Expand Down
2 changes: 1 addition & 1 deletion erts/emulator/beam/erl_db_util.c
Original file line number Diff line number Diff line change
Expand Up @@ -3851,7 +3851,7 @@ bool db_eq_comp(DbTableCommon* tb, Eterm a, DbTerm* b)

int db_is_variable(Eterm obj)
{
byte *b;
const byte *b;
int n;
int N;

Expand Down
151 changes: 60 additions & 91 deletions erts/emulator/beam/erl_printf_term.c
Original file line number Diff line number Diff line change
Expand Up @@ -188,36 +188,34 @@ static int is_printable_ascii(byte* bytep, Uint bytesize, Uint bitoffs)
* valid character, the offset is updated to point to the next character. size
* is only used for debugging.
*/
static ERTS_INLINE int utf8_decode(const unsigned char *text, int *offset, int size)
static ERTS_INLINE int utf8_decode(const byte *text, int *offset, int size)
{
int c1 = text[*offset];
int component = text[*offset];
int codepoint = 0;
int length = 0;

if ((c1 & 0x80) == 0) {
codepoint = c1;
if ((component & 0x80) == 0) {
codepoint = component;
length = 1;
} else if ((c1 & 0xE0) == 0xC0) {
codepoint = c1 & 0x1F;
} else if ((component & 0xE0) == 0xC0) {
codepoint = component & 0x1F;
length = 2;
} else if ((c1 & 0xF0) == 0xE0) {
codepoint = c1 & 0x0F;
} else if ((component & 0xF0) == 0xE0) {
codepoint = component & 0x0F;
length = 3;
} else if ((c1 & 0xF8) == 0xF0) {
codepoint = c1 & 0x07;
length = 4;
} else {
/* Invalid first byte */
ASSERT(length != 0);
}
ASSERT((component & 0xF8) == 0xF0);
codepoint = component & 0x07;
length = 4;
}

/* Assert that there are enough bytes for decoding */
ASSERT(*offset + length <= size);

for (int i = 1; i < length; i++) {
c1 = text[*offset + i];
ASSERT((c1 & 0xC0) == 0x80);
codepoint = (codepoint << 6) | (c1 & 0x3F);
component = text[*offset + i];
ASSERT((component & 0xC0) == 0x80);
codepoint = (codepoint << 6) | (component & 0x3F);
}

*offset += length;
Expand All @@ -233,31 +231,35 @@ static ERTS_INLINE int utf8_decode(const unsigned char *text, int *offset, int s
*/
static int print_atom_name(fmtfn_t fn, void* arg, Eterm atom, long *dcount)
{
int n, i;
int length, i;
const Atom *entry;
int res;
int need_quote;
int pos;
byte *s;
byte *cpos;
int c;
int lc;
const byte *s;
int codepoint;

res = 0;
i = atom_val(atom);
entry = NULL;

if ((i < 0) || (i >= atom_table_size()) || (atom_tab(i) == NULL)) {
PRINT_STRING(res, fn, arg, "<bad atom index: ");
PRINT_SWORD(res, fn, arg, 'd', 0, 1, (ErlPfSWord) i);
PRINT_CHAR(res, fn, arg, '>');
return res;
if ((i > 0) || (i <= atom_table_size()) ) {
entry = atom_tab(i);
}

if (entry == NULL) {
PRINT_STRING(res, fn, arg, "<bad atom index: ");
PRINT_SWORD(res, fn, arg, 'd', 0, 1, (ErlPfSWord) i);
PRINT_CHAR(res, fn, arg, '>');
return res;
}

s = erts_atom_get_name(atom_tab(i));
n = atom_tab(i)->len;
s = erts_atom_get_name(entry);
length = entry->len;

*dcount -= atom_tab(i)->len;
*dcount -= entry->len;

if (n == 0) {
if (length == 0) {
/* The empty atom: '' */
PRINT_STRING(res, fn, arg, "''");
return res;
Expand All @@ -274,19 +276,12 @@ static int print_atom_name(fmtfn_t fn, void* arg, Eterm atom, long *dcount)
* the Latin-1 code block or the character '_'.
*/

need_quote = 0;
pos = 0;
lc = utf8_decode(s, &pos, n);
if (!IS_LOWER(lc))
need_quote++;
else {
while (pos < n) {
lc = utf8_decode(s, &pos, n);
if (!IS_ALNUM(lc) && lc != '_') {
need_quote++;
break;
}
}
codepoint = utf8_decode(s, &pos, length);
need_quote = !IS_LOWER(codepoint);
while (pos < length && !need_quote) {
codepoint = utf8_decode(s, &pos, length);
need_quote = !IS_ALNUM(codepoint) && codepoint != '_';
}

/*
Expand All @@ -296,58 +291,32 @@ static int print_atom_name(fmtfn_t fn, void* arg, Eterm atom, long *dcount)
* be specially printed. Therefore, we must do a partial
* decoding of the utf8 encoding.
*/
cpos = s;
pos = n;
pos = 0;
if (need_quote)
PRINT_CHAR(res, fn, arg, '\'');
while(pos--) {
c = *cpos++;
switch(c) {
case '\'': PRINT_STRING(res, fn, arg, "\\'"); break;
case '\\': PRINT_STRING(res, fn, arg, "\\\\"); break;
case '\n': PRINT_STRING(res, fn, arg, "\\n"); break;
case '\f': PRINT_STRING(res, fn, arg, "\\f"); break;
case '\t': PRINT_STRING(res, fn, arg, "\\t"); break;
case '\r': PRINT_STRING(res, fn, arg, "\\r"); break;
case '\b': PRINT_STRING(res, fn, arg, "\\b"); break;
case '\v': PRINT_STRING(res, fn, arg, "\\v"); break;
default:
if (c < ' ') {
/* ASCII control character (0-31). */
PRINT_CHAR(res, fn, arg, '\\');
PRINT_UWORD(res, fn, arg, 'o', 1, 3, (ErlPfUWord) c);
} else if (c >= 0x80) {
/* A multi-byte utf8-encoded code point. Determine the
* length of the sequence. */
int n;
if ((c & 0xE0) == 0xC0) {
n = 2;
} else if ((c & 0xF0) == 0xE0) {
n = 3;
} else {
ASSERT((c & 0xF8) == 0xF0);
n = 4;
}
ASSERT(pos - n + 1 >= 0);

if (c == 0xC2 && *cpos < 0xA0) {
/* Extended ASCII control character (128-159). */
ASSERT(pos > 0);
ASSERT(0x80 <= *cpos);
PRINT_CHAR(res, fn, arg, '\\');
PRINT_UWORD(res, fn, arg, 'o', 1, 3, (ErlPfUWord) *cpos);
pos--, cpos++;
} else {
PRINT_BUF(res, fn, arg, cpos-1, n);
cpos += n - 1;
pos -= n - 1;
}

while(pos < length) {
int cp_start = pos;
codepoint = utf8_decode(s, &pos, length);
switch(codepoint) {
case '\'': PRINT_STRING(res, fn, arg, "\\'"); break;
case '\\': PRINT_STRING(res, fn, arg, "\\\\"); break;
case '\n': PRINT_STRING(res, fn, arg, "\\n"); break;
case '\f': PRINT_STRING(res, fn, arg, "\\f"); break;
case '\t': PRINT_STRING(res, fn, arg, "\\t"); break;
case '\r': PRINT_STRING(res, fn, arg, "\\r"); break;
case '\b': PRINT_STRING(res, fn, arg, "\\b"); break;
case '\v': PRINT_STRING(res, fn, arg, "\\v"); break;
default:
if (codepoint < 32 || (codepoint >= 128 && codepoint <= 159)) {
/* ASCII control character (0-31) or extended ASCII control character (128-159)*/
PRINT_CHAR(res, fn, arg, '\\');
PRINT_UWORD(res, fn, arg, 'o', 1, 3, (ErlPfUWord) codepoint);
} else {
/* Printable ASCII character. */
PRINT_CHAR(res, fn, arg, (char) c);
PRINT_BUF(res, fn, arg, &s[cp_start], pos - cp_start);
}
break;
}
break;
}
}
if (need_quote)
PRINT_CHAR(res, fn, arg, '\'');
Expand Down
2 changes: 1 addition & 1 deletion erts/emulator/beam/erl_process_dump.c
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ dump_element(fmtfn_t to, void *to_arg, Eterm x)
erts_print(to, to_arg, "H" PTR_FMT, boxed_val(x));
} else if (is_immed(x)) {
if (is_atom(x)) {
unsigned char* s = erts_atom_get_name(atom_tab(atom_val(x)));
const byte* s = erts_atom_get_name(atom_tab(atom_val(x)));
int len = atom_tab(atom_val(x))->len;
int i;

Expand Down
11 changes: 6 additions & 5 deletions erts/emulator/beam/erl_unicode.c
Original file line number Diff line number Diff line change
Expand Up @@ -1347,7 +1347,7 @@ static Eterm do_utf8_to_list(Process *p, Uint num, const byte *bytes, Uint sz,
num_built, num_eaten,
tail);
}
Eterm erts_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, Uint left,
Eterm erts_utf8_to_list(Process *p, Uint num, const byte *bytes, Uint sz, Uint left,
Uint *num_built, Uint *num_eaten, Eterm tail)
{
return do_utf8_to_list(p, num, bytes, sz, left, num_built, num_eaten, tail);
Expand Down Expand Up @@ -2230,12 +2230,13 @@ Sint erts_native_filename_need(Eterm ioterm, int encoding)
need = 2* ap->latin1_chars;
}
else {
const byte * name = erts_atom_get_name(ap);
for (i = 0; i < ap->len; ) {
if (erts_atom_get_name(ap)[i] < 0x80) {
if (name[i] < 0x80) {
i++;
} else if (erts_atom_get_name(ap)[i] < 0xE0) {
} else if (name[i] < 0xE0) {
i += 2;
} else if (erts_atom_get_name(ap)[i] < 0xF0) {
} else if (name[i] < 0xF0) {
i += 3;
} else {
need = -1;
Expand All @@ -2253,7 +2254,7 @@ Sint erts_native_filename_need(Eterm ioterm, int encoding)
* the middle of filenames
*/
if (need > 0) {
byte *name = erts_atom_get_name(ap);
const byte *name = erts_atom_get_name(ap);
int len = ap->len;
for (i = 0; i < len; i++) {
if (name[i] == 0) {
Expand Down
2 changes: 1 addition & 1 deletion erts/emulator/beam/erl_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ ERTS_GLB_INLINE int erts_cmp_atoms(Eterm a, Eterm b) {
Atom *aa = atom_tab(atom_val(a));
Atom *bb = atom_tab(atom_val(b));

byte *name_a, *name_b;
const byte *name_a, *name_b;
int len_a, len_b, diff;

diff = aa->ord0 - bb->ord0;
Expand Down
2 changes: 1 addition & 1 deletion erts/emulator/beam/global.h
Original file line number Diff line number Diff line change
Expand Up @@ -1491,7 +1491,7 @@ char *erts_convert_filename_to_wchar(const byte* bytes, Uint size,
ErtsAlcType_t alloc_type, Sint* used,
Uint extra_wchars);
Eterm erts_convert_native_to_filename(Process *p, size_t size, byte *bytes);
Eterm erts_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, Uint left,
Eterm erts_utf8_to_list(Process *p, Uint num, const byte *bytes, Uint sz, Uint left,
Uint *num_built, Uint *num_eaten, Eterm tail);
Eterm
erts_make_list_from_utf8_buf(Eterm **hpp, Uint num,
Expand Down

0 comments on commit 6f11039

Please sign in to comment.