Skip to content

Commit

Permalink
add code documentation for the tokenizer
Browse files Browse the repository at this point in the history
also clean it up a bit/make usage of things consistent
  • Loading branch information
Pentarctagon committed Sep 17, 2024
1 parent c4610a0 commit 17d915a
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 57 deletions.
28 changes: 14 additions & 14 deletions src/serialization/parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,10 +128,10 @@ void parser::operator()()
tok_.next_token();

switch(tok_.current_token().type) {
case token::LF:
case token::NEWLINE:
continue;

case '[':
case token::OPEN_BRACKET:
parse_element();
break;

Expand Down Expand Up @@ -195,7 +195,7 @@ void parser::parse_element()
case token::STRING: // [element]
elname = tok_.current_token().value;

if(tok_.next_token().type != ']') {
if(tok_.next_token().type != token::CLOSE_BRACKET) {
error(_("Unterminated [element] tag"));
}

Expand All @@ -210,14 +210,14 @@ void parser::parse_element()

break;

case '+': // [+element]
case token::PLUS: // [+element]
if(tok_.next_token().type != token::STRING) {
error(_("Invalid tag name"));
}

elname = tok_.current_token().value;

if(tok_.next_token().type != ']') {
if(tok_.next_token().type != token::CLOSE_BRACKET) {
error(_("Unterminated [+element] tag"));
}

Expand All @@ -240,14 +240,14 @@ void parser::parse_element()
elements.emplace(current_element, elname, tok_.get_start_line(), tok_.get_file());
break;

case '/': // [/element]
case token::SLASH: // [/element]
if(tok_.next_token().type != token::STRING) {
error(_("Invalid closing tag name"));
}

elname = tok_.current_token().value;

if(tok_.next_token().type != ']') {
if(tok_.next_token().type != token::CLOSE_BRACKET) {
error(_("Unterminated closing tag"));
}

Expand Down Expand Up @@ -290,7 +290,7 @@ void parser::parse_variable()
std::vector<std::string> variables;
variables.emplace_back();

while(tok_.current_token().type != '=') {
while(tok_.current_token().type != token::token_type::EQUALS) {
switch(tok_.current_token().type) {
case token::STRING:
if(!variables.back().empty()) {
Expand All @@ -300,7 +300,7 @@ void parser::parse_variable()
variables.back() += tok_.current_token().value;
break;

case ',':
case token::COMMA:
if(variables.back().empty()) {
error(_("Empty variable name"));
} else {
Expand Down Expand Up @@ -332,7 +332,7 @@ void parser::parse_variable()
assert(curvar != variables.end());

switch(tok_.current_token().type) {
case ',':
case token::COMMA:
if((curvar + 1) != variables.end()) {
if(buffer.translatable()) {
cfg[*curvar] = t_string(buffer);
Expand All @@ -352,7 +352,7 @@ void parser::parse_variable()

break;

case '_':
case token::UNDERSCORE:
tok_.next_token();

switch(tok_.current_token().type) {
Expand All @@ -370,14 +370,14 @@ void parser::parse_variable()
break;

case token::END:
case token::LF:
case token::NEWLINE:
buffer += "_";
goto finish;
}

break;

case '+':
case token::PLUS:
ignore_next_newlines = true;
continue;

Expand All @@ -400,7 +400,7 @@ void parser::parse_variable()
error(_("Unterminated quoted string"));
break;

case token::LF:
case token::NEWLINE:
if(ignore_next_newlines) {
continue;
}
Expand Down
44 changes: 25 additions & 19 deletions src/serialization/tokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ tokenizer::tokenizer(std::istream& in) :
token_(),
in_(in)
{
for (int c = 0; c < 128; ++c)
for (int c = 0; c < START_EXTENDED_ASCII; ++c)
{
int t = 0;
token_category t = TOK_NONE;
if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_') {
t = TOK_ALPHA;
} else if (c >= '0' && c <= '9') {
Expand Down Expand Up @@ -61,14 +61,14 @@ const token &tokenizer::next_token()
while (is_space(current_)) {
next_char_fast();
}
if (current_ != 254)
if (current_ != INLINED_PREPROCESS_DIRECTIVE_CHAR)
break;
skip_comment();
// skip the line end
next_char_fast();
}

if (current_ == '#')
if (current_ == token::POUND)
skip_comment();

startlineno_ = lineno_;
Expand All @@ -78,8 +78,8 @@ const token &tokenizer::next_token()
token_.type = token::END;
break;

case '<':
if (peek_char() != '<') {
case token::LESS_THAN:
if (peek_char() != token::LESS_THAN) {
token_.type = token::MISC;
token_.value += current_;
break;
Expand All @@ -92,27 +92,27 @@ const token &tokenizer::next_token()
token_.type = token::UNTERMINATED_QSTRING;
break;
}
if (current_ == '>' && peek_char() == '>') {
if (current_ == token::GREATER_THAN && peek_char() == token::GREATER_THAN) {
next_char_fast();
break;
}
token_.value += current_;
}
break;

case '"':
case token::DOUBLE_QUOTE:
token_.type = token::QSTRING;
for (;;) {
next_char();
if (current_ == EOF) {
token_.type = token::UNTERMINATED_QSTRING;
break;
}
if (current_ == '"') {
if (peek_char() != '"') break;
if (current_ == token::DOUBLE_QUOTE) {
if (peek_char() != token::DOUBLE_QUOTE) break;
next_char_fast();
}
if (current_ == 254) {
if (current_ == INLINED_PREPROCESS_DIRECTIVE_CHAR) {
skip_comment();
--lineno_;
continue;
Expand All @@ -121,12 +121,18 @@ const token &tokenizer::next_token()
}
break;

case '[': case ']': case '/': case '\n': case '=': case ',': case '+':
case token::OPEN_BRACKET:
case token::CLOSE_BRACKET:
case token::SLASH:
case token::NEWLINE:
case token::EQUALS:
case token::COMMA:
case token::PLUS:
token_.type = token::token_type(current_);
token_.value = current_;
break;

case '_':
case token::UNDERSCORE:
if (!is_alnum(peek_char())) {
token_.type = token::token_type(current_);
token_.value = current_;
Expand All @@ -135,16 +141,16 @@ const token &tokenizer::next_token()
[[fallthrough]];

default:
if (is_alnum(current_) || current_ == '$') {
if (is_alnum(current_) || current_ == token::DOLLAR) {
token_.type = token::STRING;
do {
token_.value += current_;
next_char_fast();
while (current_ == 254) {
while (current_ == INLINED_PREPROCESS_DIRECTIVE_CHAR) {
skip_comment();
next_char_fast();
}
} while (is_alnum(current_) || current_ == '$');
} while (is_alnum(current_) || current_ == token::DOLLAR);
} else {
token_.type = token::MISC;
token_.value += current_;
Expand Down Expand Up @@ -174,7 +180,7 @@ bool tokenizer::skip_command(char const *cmd)
void tokenizer::skip_comment()
{
next_char_fast();
if (current_ == '\n' || current_ == EOF) return;
if (current_ == token::NEWLINE || current_ == EOF) return;
std::string *dst = nullptr;

if (current_ == 't')
Expand All @@ -197,14 +203,14 @@ void tokenizer::skip_comment()
else
{
fail:
while (current_ != '\n' && current_ != EOF) {
while (current_ != token::NEWLINE && current_ != EOF) {
next_char_fast();
}
return;
}

dst->clear();
while (current_ != '\n' && current_ != EOF) {
while (current_ != token::NEWLINE && current_ != EOF) {
*dst += current_;
next_char_fast();
}
Expand Down
61 changes: 37 additions & 24 deletions src/serialization/tokenizer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,19 @@
#include <istream>
#include <string>

// The use of an illegal UTF-8 character for this purpose was added in a76be7ef1e921dabacd99f16ef440bf9673b8d98.
// It relates to the output of the preprocessor, whose format is essentially undocumented; as of writing this comment, that format has not been investigated further.
#define INLINED_PREPROCESS_DIRECTIVE_CHAR 254

// Normal ASCII is 0-127.
// Extended ASCII is 128-255, none of which needs any special handling.
#define START_EXTENDED_ASCII 128

/**
 * Contains the current text being parsed as well as the token_type of what's being parsed.
 * Multi-character token types will have a value that's a string with zero or more characters in it.
 * Single-character token types have a value that is a single character with special meaning for a config.
 */
struct token
{
token() :
Expand All @@ -32,27 +45,44 @@ struct token

enum token_type
{
// multi-character
/** unquoted text */
STRING,
/** quoted string, contained within double quotes or by less than/greater than symbols */
QSTRING,
/** reached end of file without finding the closing character for a QSTRING */
UNTERMINATED_QSTRING,
/** any characters that don't have special meaning */
MISC,

LF = '\n',
// single characters
NEWLINE = '\n',
EQUALS = '=',
COMMA = ',',
PLUS = '+',
SLASH = '/',
OPEN_BRACKET = '[',
CLOSE_BRACKET = ']',
UNDERSCORE = '_',
END
POUND = '#',
LESS_THAN = '<',
GREATER_THAN = '>',
DOUBLE_QUOTE = '"',
DOLLAR = '$',

/** set when EOF is returned by the input stream */
END = 256
};

token_type type;
std::string value;
};

/** Abstract baseclass for the tokenizer. */
/**
 * Class responsible for parsing the provided text into tokens and tracking information about the current token.
 * It can also track the previous token when built with the DEBUG_TOKENIZER compiler define.
 * It does not otherwise keep track of the processing history.
 */
class tokenizer
{
public:
Expand Down Expand Up @@ -96,7 +126,7 @@ class tokenizer

void next_char()
{
if (current_ == '\n')
if (current_ == token::token_type::NEWLINE)
++lineno_;
next_char_fast();
}
Expand All @@ -106,31 +136,14 @@ class tokenizer
do {
current_ = in_.get();
} while (current_ == '\r');
#if 0
// TODO: disabled until the campaign server is fixed
if(in_.good()) {
current_ = in_.get();
if (current_ == '\r')
{
// we assume that there is only one '\r'
if(in_.good()) {
current_ = in_.get();
} else {
current_ = EOF;
}
}
} else {
current_ = EOF;
}
#endif
}

int peek_char()
{
return in_.peek();
}

enum
enum token_category
{
TOK_NONE = 0,
TOK_SPACE = 1,
Expand All @@ -140,7 +153,7 @@ class tokenizer

int char_type(unsigned c) const
{
return c < 128 ? char_types_[c] : 0;
return c < START_EXTENDED_ASCII ? char_types_[c] : 0;
}

bool is_space(int c) const
Expand Down Expand Up @@ -173,5 +186,5 @@ class tokenizer
token previous_token_;
#endif
buffered_istream in_;
char char_types_[128];
token_category char_types_[START_EXTENDED_ASCII];
};

0 comments on commit 17d915a

Please sign in to comment.