Skip to content

Commit

Permalink
add code documentation for the tokenizer
Browse files Browse the repository at this point in the history
also clean it up a bit/make usage of things consistent
  • Loading branch information
Pentarctagon committed Sep 17, 2024
1 parent c4610a0 commit 17d915a
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 57 deletions.
28 changes: 14 additions & 14 deletions src/serialization/parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,10 +128,10 @@ void parser::operator()()
tok_.next_token();

switch(tok_.current_token().type) {
case token::LF:
case token::NEWLINE:
continue;

case '[':
case token::OPEN_BRACKET:
parse_element();
break;

Expand Down Expand Up @@ -195,7 +195,7 @@ void parser::parse_element()
case token::STRING: // [element]
elname = tok_.current_token().value;

if(tok_.next_token().type != ']') {
if(tok_.next_token().type != token::CLOSE_BRACKET) {
error(_("Unterminated [element] tag"));
}

Expand All @@ -210,14 +210,14 @@ void parser::parse_element()

break;

case '+': // [+element]
case token::PLUS: // [+element]
if(tok_.next_token().type != token::STRING) {
error(_("Invalid tag name"));
}

elname = tok_.current_token().value;

if(tok_.next_token().type != ']') {
if(tok_.next_token().type != token::CLOSE_BRACKET) {
error(_("Unterminated [+element] tag"));
}

Expand All @@ -240,14 +240,14 @@ void parser::parse_element()
elements.emplace(current_element, elname, tok_.get_start_line(), tok_.get_file());
break;

case '/': // [/element]
case token::SLASH: // [/element]
if(tok_.next_token().type != token::STRING) {
error(_("Invalid closing tag name"));
}

elname = tok_.current_token().value;

if(tok_.next_token().type != ']') {
if(tok_.next_token().type != token::CLOSE_BRACKET) {
error(_("Unterminated closing tag"));
}

Expand Down Expand Up @@ -290,7 +290,7 @@ void parser::parse_variable()
std::vector<std::string> variables;
variables.emplace_back();

while(tok_.current_token().type != '=') {
while(tok_.current_token().type != token::token_type::EQUALS) {
switch(tok_.current_token().type) {
case token::STRING:
if(!variables.back().empty()) {
Expand All @@ -300,7 +300,7 @@ void parser::parse_variable()
variables.back() += tok_.current_token().value;
break;

case ',':
case token::COMMA:
if(variables.back().empty()) {
error(_("Empty variable name"));
} else {
Expand Down Expand Up @@ -332,7 +332,7 @@ void parser::parse_variable()
assert(curvar != variables.end());

switch(tok_.current_token().type) {
case ',':
case token::COMMA:
if((curvar + 1) != variables.end()) {
if(buffer.translatable()) {
cfg[*curvar] = t_string(buffer);
Expand All @@ -352,7 +352,7 @@ void parser::parse_variable()

break;

case '_':
case token::UNDERSCORE:
tok_.next_token();

switch(tok_.current_token().type) {
Expand All @@ -370,14 +370,14 @@ void parser::parse_variable()
break;

case token::END:
case token::LF:
case token::NEWLINE:
buffer += "_";
goto finish;
}

break;

case '+':
case token::PLUS:
ignore_next_newlines = true;
continue;

Expand All @@ -400,7 +400,7 @@ void parser::parse_variable()
error(_("Unterminated quoted string"));
break;

case token::LF:
case token::NEWLINE:
if(ignore_next_newlines) {
continue;
}
Expand Down
44 changes: 25 additions & 19 deletions src/serialization/tokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ tokenizer::tokenizer(std::istream& in) :
token_(),
in_(in)
{
for (int c = 0; c < 128; ++c)
for (int c = 0; c < START_EXTENDED_ASCII; ++c)
{
int t = 0;
token_category t = TOK_NONE;
if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_') {
t = TOK_ALPHA;
} else if (c >= '0' && c <= '9') {
Expand Down Expand Up @@ -61,14 +61,14 @@ const token &tokenizer::next_token()
while (is_space(current_)) {
next_char_fast();
}
if (current_ != 254)
if (current_ != INLINED_PREPROCESS_DIRECTIVE_CHAR)
break;
skip_comment();
// skip the line end
next_char_fast();
}

if (current_ == '#')
if (current_ == token::POUND)
skip_comment();

startlineno_ = lineno_;
Expand All @@ -78,8 +78,8 @@ const token &tokenizer::next_token()
token_.type = token::END;
break;

case '<':
if (peek_char() != '<') {
case token::LESS_THAN:
if (peek_char() != token::LESS_THAN) {
token_.type = token::MISC;
token_.value += current_;
break;
Expand All @@ -92,27 +92,27 @@ const token &tokenizer::next_token()
token_.type = token::UNTERMINATED_QSTRING;
break;
}
if (current_ == '>' && peek_char() == '>') {
if (current_ == token::GREATER_THAN && peek_char() == token::GREATER_THAN) {
next_char_fast();
break;
}
token_.value += current_;
}
break;

case '"':
case token::DOUBLE_QUOTE:
token_.type = token::QSTRING;
for (;;) {
next_char();
if (current_ == EOF) {
token_.type = token::UNTERMINATED_QSTRING;
break;
}
if (current_ == '"') {
if (peek_char() != '"') break;
if (current_ == token::DOUBLE_QUOTE) {
if (peek_char() != token::DOUBLE_QUOTE) break;
next_char_fast();
}
if (current_ == 254) {
if (current_ == INLINED_PREPROCESS_DIRECTIVE_CHAR) {
skip_comment();
--lineno_;
continue;
Expand All @@ -121,12 +121,18 @@ const token &tokenizer::next_token()
}
break;

case '[': case ']': case '/': case '\n': case '=': case ',': case '+':
case token::OPEN_BRACKET:
case token::CLOSE_BRACKET:
case token::SLASH:
case token::NEWLINE:
case token::EQUALS:
case token::COMMA:
case token::PLUS:
token_.type = token::token_type(current_);
token_.value = current_;
break;

case '_':
case token::UNDERSCORE:
if (!is_alnum(peek_char())) {
token_.type = token::token_type(current_);
token_.value = current_;
Expand All @@ -135,16 +141,16 @@ const token &tokenizer::next_token()
[[fallthrough]];

default:
if (is_alnum(current_) || current_ == '$') {
if (is_alnum(current_) || current_ == token::DOLLAR) {
token_.type = token::STRING;
do {
token_.value += current_;
next_char_fast();
while (current_ == 254) {
while (current_ == INLINED_PREPROCESS_DIRECTIVE_CHAR) {
skip_comment();
next_char_fast();
}
} while (is_alnum(current_) || current_ == '$');
} while (is_alnum(current_) || current_ == token::DOLLAR);
} else {
token_.type = token::MISC;
token_.value += current_;
Expand Down Expand Up @@ -174,7 +180,7 @@ bool tokenizer::skip_command(char const *cmd)
void tokenizer::skip_comment()
{
next_char_fast();
if (current_ == '\n' || current_ == EOF) return;
if (current_ == token::NEWLINE || current_ == EOF) return;
std::string *dst = nullptr;

if (current_ == 't')
Expand All @@ -197,14 +203,14 @@ void tokenizer::skip_comment()
else
{
fail:
while (current_ != '\n' && current_ != EOF) {
while (current_ != token::NEWLINE && current_ != EOF) {
next_char_fast();
}
return;
}

dst->clear();
while (current_ != '\n' && current_ != EOF) {
while (current_ != token::NEWLINE && current_ != EOF) {
*dst += current_;
next_char_fast();
}
Expand Down
61 changes: 37 additions & 24 deletions src/serialization/tokenizer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,19 @@
#include <istream>
#include <string>

// The use of an illegal UTF-8 character for this purpose was added in a76be7ef1e921dabacd99f16ef440bf9673b8d98.
// It relates to the output of the preprocessor, whose format is essentially undocumented; as of writing this comment, that format has not been investigated further.
#define INLINED_PREPROCESS_DIRECTIVE_CHAR 254

// Normal ASCII is 0-127.
// Extended ASCII is 128-255, none of which needs any special handling.
#define START_EXTENDED_ASCII 128

/**
 * Contains the current text being parsed as well as the token_type of what's being parsed.
 * Multi-character token types will have a value that's a string with zero or more characters in it.
 * Single-character token types have a value that is a single character with special meaning for a config.
 */
struct token
{
token() :
Expand All @@ -32,27 +45,44 @@ struct token

enum token_type
{
// multi-character
/** unquoted text */
STRING,
/** quoted string, contained within double quotes or by less than/greater than symbols */
QSTRING,
/** reached end of file without finding the closing character for a QSTRING */
UNTERMINATED_QSTRING,
/** any characters that don't have special meaning */
MISC,

LF = '\n',
// single characters
NEWLINE = '\n',
EQUALS = '=',
COMMA = ',',
PLUS = '+',
SLASH = '/',
OPEN_BRACKET = '[',
CLOSE_BRACKET = ']',
UNDERSCORE = '_',
END
POUND = '#',
LESS_THAN = '<',
GREATER_THAN = '>',
DOUBLE_QUOTE = '"',
DOLLAR = '$',

/** set when EOF is returned by the input stream */
END = 256
};

token_type type;
std::string value;
};

/** Abstract baseclass for the tokenizer. */
/**
 * Class responsible for parsing the provided text into tokens and tracking information about the current token.
 * It can also track the previous token when built with the DEBUG_TOKENIZER compiler define.
 * It does not otherwise keep track of the processing history.
 */
class tokenizer
{
public:
Expand Down Expand Up @@ -96,7 +126,7 @@ class tokenizer

void next_char()
{
if (current_ == '\n')
if (current_ == token::token_type::NEWLINE)
++lineno_;
next_char_fast();
}
Expand All @@ -106,31 +136,14 @@ class tokenizer
do {
current_ = in_.get();
} while (current_ == '\r');
#if 0
// TODO: disabled until the campaign server is fixed
if(in_.good()) {
current_ = in_.get();
if (current_ == '\r')
{
// we assume that there is only one '\r'
if(in_.good()) {
current_ = in_.get();
} else {
current_ = EOF;
}
}
} else {
current_ = EOF;
}
#endif
}

int peek_char()
{
return in_.peek();
}

enum
enum token_category
{
TOK_NONE = 0,
TOK_SPACE = 1,
Expand All @@ -140,7 +153,7 @@ class tokenizer

int char_type(unsigned c) const
{
return c < 128 ? char_types_[c] : 0;
return c < START_EXTENDED_ASCII ? char_types_[c] : 0;
}

bool is_space(int c) const
Expand Down Expand Up @@ -173,5 +186,5 @@ class tokenizer
token previous_token_;
#endif
buffered_istream in_;
char char_types_[128];
token_category char_types_[START_EXTENDED_ASCII];
};

0 comments on commit 17d915a

Please sign in to comment.