diff --git a/src/serialization/parser.cpp b/src/serialization/parser.cpp index 389046f29a07..67ce3e2f20ea 100644 --- a/src/serialization/parser.cpp +++ b/src/serialization/parser.cpp @@ -128,10 +128,10 @@ void parser::operator()() tok_.next_token(); switch(tok_.current_token().type) { - case token::LF: + case token::NEWLINE: continue; - case '[': + case token::OPEN_BRACKET: parse_element(); break; @@ -195,7 +195,7 @@ void parser::parse_element() case token::STRING: // [element] elname = tok_.current_token().value; - if(tok_.next_token().type != ']') { + if(tok_.next_token().type != token::CLOSE_BRACKET) { error(_("Unterminated [element] tag")); } @@ -210,14 +210,14 @@ void parser::parse_element() break; - case '+': // [+element] + case token::PLUS: // [+element] if(tok_.next_token().type != token::STRING) { error(_("Invalid tag name")); } elname = tok_.current_token().value; - if(tok_.next_token().type != ']') { + if(tok_.next_token().type != token::CLOSE_BRACKET) { error(_("Unterminated [+element] tag")); } @@ -240,14 +240,14 @@ void parser::parse_element() elements.emplace(current_element, elname, tok_.get_start_line(), tok_.get_file()); break; - case '/': // [/element] + case token::SLASH: // [/element] if(tok_.next_token().type != token::STRING) { error(_("Invalid closing tag name")); } elname = tok_.current_token().value; - if(tok_.next_token().type != ']') { + if(tok_.next_token().type != token::CLOSE_BRACKET) { error(_("Unterminated closing tag")); } @@ -290,7 +290,7 @@ void parser::parse_variable() std::vector variables; variables.emplace_back(); - while(tok_.current_token().type != '=') { + while(tok_.current_token().type != token::token_type::EQUALS) { switch(tok_.current_token().type) { case token::STRING: if(!variables.back().empty()) { @@ -300,7 +300,7 @@ void parser::parse_variable() variables.back() += tok_.current_token().value; break; - case ',': + case token::COMMA: if(variables.back().empty()) { error(_("Empty variable name")); } else { @@ -332,7 
+332,7 @@ void parser::parse_variable() assert(curvar != variables.end()); switch(tok_.current_token().type) { - case ',': + case token::COMMA: if((curvar + 1) != variables.end()) { if(buffer.translatable()) { cfg[*curvar] = t_string(buffer); @@ -352,7 +352,7 @@ void parser::parse_variable() break; - case '_': + case token::UNDERSCORE: tok_.next_token(); switch(tok_.current_token().type) { @@ -370,14 +370,14 @@ void parser::parse_variable() break; case token::END: - case token::LF: + case token::NEWLINE: buffer += "_"; goto finish; } break; - case '+': + case token::PLUS: ignore_next_newlines = true; continue; @@ -400,7 +400,7 @@ void parser::parse_variable() error(_("Unterminated quoted string")); break; - case token::LF: + case token::NEWLINE: if(ignore_next_newlines) { continue; } diff --git a/src/serialization/tokenizer.cpp b/src/serialization/tokenizer.cpp index f1fec317eea3..7253b6bfa5c6 100644 --- a/src/serialization/tokenizer.cpp +++ b/src/serialization/tokenizer.cpp @@ -26,9 +26,9 @@ tokenizer::tokenizer(std::istream& in) : token_(), in_(in) { - for (int c = 0; c < 128; ++c) + for (int c = 0; c < START_EXTENDED_ASCII; ++c) { - int t = 0; + token_category t = TOK_NONE; if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_') { t = TOK_ALPHA; } else if (c >= '0' && c <= '9') { @@ -61,14 +61,14 @@ const token &tokenizer::next_token() while (is_space(current_)) { next_char_fast(); } - if (current_ != 254) + if (current_ != INLINED_PREPROCESS_DIRECTIVE_CHAR) break; skip_comment(); // skip the line end next_char_fast(); } - if (current_ == '#') + if (current_ == token::POUND) skip_comment(); startlineno_ = lineno_; @@ -78,8 +78,8 @@ const token &tokenizer::next_token() token_.type = token::END; break; - case '<': - if (peek_char() != '<') { + case token::LESS_THAN: + if (peek_char() != token::LESS_THAN) { token_.type = token::MISC; token_.value += current_; break; @@ -92,7 +92,7 @@ const token &tokenizer::next_token() token_.type = 
token::UNTERMINATED_QSTRING; break; } - if (current_ == '>' && peek_char() == '>') { + if (current_ == token::GREATER_THAN && peek_char() == token::GREATER_THAN) { next_char_fast(); break; } @@ -100,7 +100,7 @@ const token &tokenizer::next_token() } break; - case '"': + case token::DOUBLE_QUOTE: token_.type = token::QSTRING; for (;;) { next_char(); @@ -108,11 +108,11 @@ const token &tokenizer::next_token() token_.type = token::UNTERMINATED_QSTRING; break; } - if (current_ == '"') { - if (peek_char() != '"') break; + if (current_ == token::DOUBLE_QUOTE) { + if (peek_char() != token::DOUBLE_QUOTE) break; next_char_fast(); } - if (current_ == 254) { + if (current_ == INLINED_PREPROCESS_DIRECTIVE_CHAR) { skip_comment(); --lineno_; continue; @@ -121,12 +121,18 @@ const token &tokenizer::next_token() } break; - case '[': case ']': case '/': case '\n': case '=': case ',': case '+': + case token::OPEN_BRACKET: + case token::CLOSE_BRACKET: + case token::SLASH: + case token::NEWLINE: + case token::EQUALS: + case token::COMMA: + case token::PLUS: token_.type = token::token_type(current_); token_.value = current_; break; - case '_': + case token::UNDERSCORE: if (!is_alnum(peek_char())) { token_.type = token::token_type(current_); token_.value = current_; @@ -135,16 +141,16 @@ const token &tokenizer::next_token() [[fallthrough]]; default: - if (is_alnum(current_) || current_ == '$') { + if (is_alnum(current_) || current_ == token::DOLLAR) { token_.type = token::STRING; do { token_.value += current_; next_char_fast(); - while (current_ == 254) { + while (current_ == INLINED_PREPROCESS_DIRECTIVE_CHAR) { skip_comment(); next_char_fast(); } - } while (is_alnum(current_) || current_ == '$'); + } while (is_alnum(current_) || current_ == token::DOLLAR); } else { token_.type = token::MISC; token_.value += current_; @@ -174,7 +180,7 @@ bool tokenizer::skip_command(char const *cmd) void tokenizer::skip_comment() { next_char_fast(); - if (current_ == '\n' || current_ == EOF) return; + if 
(current_ == token::NEWLINE || current_ == EOF) return; std::string *dst = nullptr; if (current_ == 't') @@ -197,14 +203,14 @@ void tokenizer::skip_comment() else { fail: - while (current_ != '\n' && current_ != EOF) { + while (current_ != token::NEWLINE && current_ != EOF) { next_char_fast(); } return; } dst->clear(); - while (current_ != '\n' && current_ != EOF) { + while (current_ != token::NEWLINE && current_ != EOF) { *dst += current_; next_char_fast(); } diff --git a/src/serialization/tokenizer.hpp b/src/serialization/tokenizer.hpp index 5236806fb00b..d8905ffa6ad3 100644 --- a/src/serialization/tokenizer.hpp +++ b/src/serialization/tokenizer.hpp @@ -23,6 +23,19 @@ #include #include +// Byte value 254 (0xFE), which never occurs in valid UTF-8, is used as the marker for inlined preprocessor directives; this use was added in a76be7ef1e921dabacd99f16ef440bf9673b8d98 +// The marker comes from the output of the preprocessor, whose format is essentially undocumented, so its exact semantics are not described here +#define INLINED_PREPROCESS_DIRECTIVE_CHAR 254 + +// Standard ASCII covers 0-127 +// Extended ASCII covers 128-255; the tokenizer's character-type table does not classify any of these values +#define START_EXTENDED_ASCII 128 + +/** + * Holds the text of the token currently being parsed together with the token_type of that token. + * Multi-character token types have a value that is a string of zero or more characters. 
+ * Single-character token types are one character that has a special meaning in a config. + */ struct token { token() : @@ -32,12 +45,18 @@ struct token enum token_type { + // multi-character + /** unquoted text */ STRING, + /** quoted string, enclosed in double quotes or in doubled less-than/greater-than symbols (<< and >>) */ QSTRING, + /** reached end of file without finding the closing character for a QSTRING */ UNTERMINATED_QSTRING, + /** any characters that don't have special meaning */ MISC, - LF = '\n', + // single characters + NEWLINE = '\n', EQUALS = '=', COMMA = ',', PLUS = '+', @@ -45,14 +64,25 @@ struct token OPEN_BRACKET = '[', CLOSE_BRACKET = ']', UNDERSCORE = '_', - END + POUND = '#', + LESS_THAN = '<', + GREATER_THAN = '>', + DOUBLE_QUOTE = '"', + DOLLAR = '$', + + /** set when EOF is returned by the input stream */ + END = 256 }; token_type type; std::string value; }; -/** Abstract baseclass for the tokenizer. */ +/** + * Class responsible for splitting the provided text into tokens and for tracking information about the current token. + * It can also track the previous token when built with the DEBUG_TOKENIZER compiler define. + * It does not otherwise keep track of the processing history. + */ class tokenizer { public: @@ -96,7 +126,7 @@ class tokenizer void next_char() { - if (current_ == '\n') + if (current_ == token::token_type::NEWLINE) ++lineno_; next_char_fast(); } @@ -106,23 +136,6 @@ class tokenizer do { current_ = in_.get(); } while (current_ == '\r'); -#if 0 - // TODO: disabled until the campaign server is fixed - if(in_.good()) { - current_ = in_.get(); - if (current_ == '\r') - { - // we assume that there is only one '\r' - if(in_.good()) { - current_ = in_.get(); - } else { - current_ = EOF; - } - } - } else { - current_ = EOF; - } -#endif } int peek_char() @@ -130,7 +143,7 @@ class tokenizer return in_.peek(); } - enum + enum token_category { TOK_NONE = 0, TOK_SPACE = 1, @@ -140,7 +153,7 @@ class tokenizer int char_type(unsigned c) const { - return c < 128 ? 
char_types_[c] : 0; + return c < START_EXTENDED_ASCII ? char_types_[c] : 0; } bool is_space(int c) const @@ -173,5 +186,5 @@ class tokenizer token previous_token_; #endif buffered_istream in_; - char char_types_[128]; + token_category char_types_[START_EXTENDED_ASCII]; };