Skip to content

Commit

Permalink
UTF8 encode + grapheme boilerplate
Browse files Browse the repository at this point in the history
  • Loading branch information
LunaTheFoxgirl committed Jul 21, 2024
1 parent 00032d6 commit 58b3a7e
Show file tree
Hide file tree
Showing 2 changed files with 104 additions and 3 deletions.
38 changes: 37 additions & 1 deletion source/numem/unicode/package.d
Original file line number Diff line number Diff line change
@@ -1,6 +1,42 @@
module numem.unicode;
import numem.mem.vector;

/**
    A unicode codepoint
*/
alias codepoint = uint;

/**
    A unicode codepoint sequence (a vector of codepoints)
*/
alias UnicodeSequence = vector!codepoint;

/**
    A slice of unicode codepoints, e.g. a range taken from a UnicodeSequence
*/
alias UnicodeSlice = codepoint[];

/**
A unicode grapheme

Describes one cluster of codepoints together with an offset.
*/
struct Grapheme {
private:
// Internal state value; nothing visible in this file reads or writes it.
// NOTE(review): presumably used by grapheme segmentation - confirm.
size_t state;

public:

/**
Byte offset
NOTE(review): presumably the offset into the source text - confirm.
*/
size_t offset;

/**
Cluster of codepoints, memory belonging to the original UnicodeSequence.
This is a slice, not a copy.
*/
codepoint[] cluster;
}

/**
A sequence of graphemes, stored as a weak_vector of Grapheme
*/
alias GraphemeSequence = weak_vector!Grapheme;
69 changes: 67 additions & 2 deletions source/numem/unicode/utf8.d
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ private {
enum utf8_datamask(uint offset) = 0xFF >> offset;
enum utf8_leadmask(uint offset) = ~utf8_datamask!offset;

// Highest ascii value in UTF8
enum utf8_ascii = 0x7F;

struct utf8_t {
ubyte mask;
ubyte lead;
Expand Down Expand Up @@ -246,8 +249,8 @@ unittest {
Decodes a string to a vector of codepoints.
Invalid codes will be replaced with 0xFFFD
*/
vector!codepoint decode(nstring str) {
vector!codepoint code;
UnicodeSequence decode(nstring str) {
UnicodeSequence code;

size_t i = 0;
while(i < str.size()) {
Expand Down Expand Up @@ -275,4 +278,66 @@ unittest {
assert(decode(nstring("こんにちは世界!"))[0..$] == [0x3053, 0x3093, 0x306b, 0x3061, 0x306f, 0x4e16, 0x754c, 0xff01]);

assert(decode(nstring("\xF0\xA4\xADにちは世界!"))[0..$] == [0x3053, 0xFFFD, 0x306b, 0x3061, 0x306f, 0x4e16, 0x754c, 0xff01]);
}

/**
    Encodes a slice of unicode codepoints to UTF-8.

    Codepoints which have no well-formed UTF-8 representation
    (UTF-16 surrogates in the range 0xD800-0xDFFF and values above 0x10FFFF)
    are replaced with the replacement character 0xFFFD.

    Params:
        sequence = The codepoints to encode.

    Returns:
        A nstring containing the UTF-8 encoded text.
*/
nstring encode(UnicodeSlice sequence) {
    nstring out_;

    foreach(codepoint c; sequence) {

        // Surrogates and out-of-range values can not be encoded;
        // substitute the replacement character instead of emitting garbage.
        if ((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF)
            c = 0xFFFD;

        if (c <= utf8_ascii) {

            // Single-byte ascii
            out_ ~= cast(char)c;
            continue;
        }

        size_t count; // Number of continuation bytes following the lead byte
        uint lead;    // Marker bits of the lead byte
        if (c <= 0x07FF) {

            // 2 byte
            count = 1;
            lead = 0xC0;
        } else if (c <= 0xFFFF) {

            // 3 byte
            count = 2;
            lead = 0xE0;
        } else {

            // 4 byte
            count = 3;
            lead = 0xF0;
        }

        char[4] bytes;

        // Lead byte carries the marker bits plus the topmost payload bits.
        bytes[0] = cast(char)(lead | (c >> (6 * count)));

        // Every continuation byte carries 6 payload bits, most significant first.
        size_t ix = 1;
        while (count > 0) {
            count--;
            bytes[ix++] = cast(char)(0x80 | ((c >> (6 * count)) & 0x3F));
        }

        out_ ~= bytes[0..ix];
    }

    return out_;
}

/**
    Encodes the codepoints held by a UnicodeSequence to UTF-8.

    Forwards the full range of the sequence to the UnicodeSlice overload.
*/
nstring encode(UnicodeSequence sequence) {
    UnicodeSlice codepoints = sequence[0..$];
    return encode(codepoints);
}

@("UTF-8 encode")
unittest {
    // 3-byte codepoints; 0xFF01 is the fullwidth exclamation mark, written
    // as an escape so it can not be confused with ASCII '!' (0x21).
    assert(encode([0x3053, 0x3093, 0x306b, 0x3061, 0x306f, 0x4e16, 0x754c, 0xff01]) == "こんにちは世界\uFF01");

    // The replacement character is encoded like any other codepoint;
    // the expected text keeps the leading 0x3053 (こ) from the input.
    assert(encode([0x3053, 0xFFFD, 0x306b, 0x3061, 0x306f, 0x4e16, 0x754c, 0xff01]) == "こ\uFFFDにちは世界\uFF01");
}

0 comments on commit 58b3a7e

Please sign in to comment.