From 22764dcf79cb87b208f77b24c868d2d962809114 Mon Sep 17 00:00:00 2001 From: Demali-876 <90882773+Demali-876@users.noreply.github.com> Date: Mon, 4 Nov 2024 01:54:03 -0500 Subject: [PATCH] fix(lexer): Nested Group and quantifier support --- src/Lexer.mo | 286 +++++++++++++++++++++++++++------------------------ 1 file changed, 151 insertions(+), 135 deletions(-) diff --git a/src/Lexer.mo b/src/Lexer.mo index 107a110..1008071 100644 --- a/src/Lexer.mo +++ b/src/Lexer.mo @@ -12,6 +12,7 @@ module { type Token = Types.Token; public type LexerError = Types.RegexError; type CharacterClass = Types.CharacterClass; + public class Lexer(input: Text) { let cursor = Cursor.Cursor(input); let tokenBuffer = Buffer.Buffer(16); @@ -31,47 +32,69 @@ module { }; private func nextToken(): Result.Result { - switch (cursor.current()) { - case (char) { - let token = switch char { - case '.' { createToken(#Metacharacter(#Dot), ".") }; - case '*' { tokenizeQuantifier(0, null) }; - case '+' { tokenizeQuantifier(1, null) }; - case '?' { tokenizeQuantifier(0, ?1) }; - case '(' { tokenizeGroup() }; - case '[' { tokenizeCharacterClass() }; - case '^' { - if (cursor.getPos() == 0) { - createToken(#Anchor(#StartOfString), "^") - } else { - createToken(#Character(char), Text.fromChar(char)) - } + switch (cursor.current()) { + case (char) { + let token = switch char { + case '.' { + cursor.inc(); + createToken(#Metacharacter(#Dot), ".") + }; + case '^' { + if (cursor.getPos() == 0) { + cursor.inc(); + createToken(#Anchor(#StartOfString), "^") + } else { + cursor.inc(); + createToken(#Character(char), Text.fromChar(char)) + } + }; + case '$' { + cursor.inc(); + createToken(#Anchor(#EndOfString), "$") + }; + case '|' { + cursor.inc(); + createToken(#Alternation, "|") + }; + case _ { + // Default case for characters + if (char != '\\' and char != '(' and char != '[' and char != '*' and char != '+' and char != '?' and char != '{') { + cursor.inc(); + createToken(#Character(char), Text.fromChar(char)) + } else { + // Tokens that require special handling + switch char { + case '*' { tokenizeQuantifier(0, null) }; + case '+' { tokenizeQuantifier(1, null) }; + case '?' { tokenizeQuantifier(0, ?1) }; + case '(' { tokenizeGroup() }; + case '[' { tokenizeCharacterClass() }; + case '\\' { tokenizeEscapedChar() }; + case '{' { tokenizeQuantifierRange() }; + case _ { + #err(#GenericError("Unexpected character '" # Text.fromChar(char) # "' at position " # Nat.toText(cursor.getPos()))) + }; + } + } + }; }; - case '$' { createToken(#Anchor(#EndOfString), "$") }; - case '|' { createToken(#Alternation, "|") }; - case '\\' { tokenizeEscapedChar() }; - case '{' { tokenizeQuantifierRange() }; - case _ { createToken(#Character(char), Text.fromChar(char)) }; - }; - switch (token) { - case (#ok(_)) { cursor.inc() }; - case (#err(_)) { }; + token }; - token - }; - } - }; + } + }; private func createToken(tokenType: Types.TokenType, value: Text): Result.Result { #ok({ tokenType = tokenType; value = value; - position = #Instance(cursor.getPos()); + position = #Instance(cursor.getPos() - 1); }) }; + private func tokenizeQuantifier(min: Nat, max: ?Nat): Result.Result { let start = cursor.getPos(); - cursor.inc(); + cursor.inc(); // Consume the quantifier character + let mode = if (cursor.hasNext()) { switch (cursor.current()) { case '?' { cursor.inc(); #Lazy }; @@ -86,31 +109,29 @@ module { }; private func tokenizeQuantifierRange(): Result.Result { - let start = cursor.getPos(); - cursor.inc(); + let start = cursor.getPos(); + cursor.inc(); // Consume the opening '{' - var rangeContent = ""; - while (cursor.hasNext() and cursor.current() != '}') { + var rangeContent = ""; + while (cursor.hasNext() and cursor.current() != '}') { rangeContent := rangeContent # Text.fromChar(cursor.current()); cursor.inc(); - }; + }; - if (cursor.current() != '}') { + if (not cursor.hasNext() or cursor.current() != '}') { return #err(#InvalidQuantifierRange("Missing closing '}' for quantifier range at position " # Nat.toText(cursor.getPos()))); - }; + }; + cursor.inc(); // Consume the closing '}' - let (min, max) = Extensions.parseQuantifierRange(rangeContent); + let (min, max) = Extensions.parseQuantifierRange(rangeContent); - #ok({ - tokenType = #Quantifier({ min; max; mode = #Greedy }); - value = Extensions.slice(input, start, ?cursor.getPos()); - position = #Instance(start); - }) - }; + createToken(#Quantifier({ min; max; mode = #Greedy }), Extensions.slice(input, start, ?cursor.getPos())) + }; private func tokenizeCharacterClass(): Result.Result { let start = cursor.getPos(); - cursor.inc(); + cursor.inc(); // Consume the opening '[' + var isNegated = false; if (cursor.hasNext() and cursor.current() == '^') { @@ -132,133 +153,126 @@ module { } else { return #err(#UnexpectedEndOfInput); }; - } else if (c == '-' and classTokens.size() > 0 and cursor.hasNext()) { + } else if (c == '-' and classTokens.size() > 0 and cursor.hasNext() and cursor.current() != ']') { let nextChar = cursor.current(); cursor.inc(); - if (nextChar == ']' or nextChar == '-') { - return #err(#GenericError("Invalid character range at position " # Nat.toText(cursor.getPos()) # ": '" # Text.fromChar(c) # "-" # Text.fromChar(nextChar) # "'")); - } else { - switch (Extensions.arrayLast(classTokens)) { - case (?#Single(lastChar)) { - classTokens := Array.append(Extensions.sliceArray(classTokens, 0, Int.abs(classTokens.size() - 1)), [#Range(lastChar, nextChar)]); - }; - case _ { - return #err(#GenericError("Unexpected state in character class at position " # Nat.toText(cursor.getPos()))); - }; + switch (Extensions.arrayLast(classTokens)) { + case (?#Single(lastChar)) { + classTokens := Array.append(Extensions.sliceArray(classTokens, 0, Int.abs(classTokens.size() - 1)), [#Range(lastChar, nextChar)]); + }; + case _ { + return #err(#GenericError("Invalid character range at position " # Nat.toText(cursor.getPos()))); }; }; - } else if (c == '-') { - classTokens := Array.append(classTokens, [#Single(c)]); } else { classTokens := Array.append(classTokens, [#Single(c)]); }; }; - if (not cursor.hasNext() and cursor.current() != ']') { + if (not cursor.hasNext() or cursor.current() != ']') { return #err(#GenericError("Unclosed character class at position " # Nat.toText(cursor.getPos()))); }; - cursor.inc(); + cursor.inc(); // Consume the closing ']' + createToken(#CharacterClass(isNegated, classTokens), Extensions.slice(input, start, ?cursor.getPos())) }; - - private func tokenizeGroup(): Result.Result { - let start = cursor.getPos(); - if (not cursor.hasNext()) { - return #err(#GenericError("Unexpected end of input at position " # Nat.toText(start))); - }; - cursor.inc(); // Consume the opening parenthesis - let groupModifierResult = parseGroupModifier(); - var groupModifier: ?Types.GroupModifierType = null; - switch (groupModifierResult) { - case (#err(error)) { return #err(error) }; - case (#ok(modifier)) { groupModifier := modifier }; - }; + private func tokenizeGroup(): Result.Result { + let start = cursor.getPos(); + cursor.inc(); // Consume the opening '(' - let subExprResult = tokenizeSubExpression(); - var subTokens: [Token] = []; - switch (subExprResult) { - case (#err(error)) { return #err(error) }; - case (#ok(tokens)) { subTokens := Buffer.toArray(tokens) }; - }; + let groupModifierResult = parseGroupModifier(); + var groupModifier: ?Types.GroupModifierType = null; + switch (groupModifierResult) { + case (#err(error)) { return #err(error) }; + case (#ok(modifier)) { groupModifier := modifier }; + }; - if (not cursor.hasNext()) { - return #err(#GenericError("Unexpected end of input while parsing group at position " # Nat.toText(start))); - }; + let subExprResult = tokenizeSubExpression(); + var subTokens: [Token] = []; + switch (subExprResult) { + case (#err(error)) { return #err(error) }; + case (#ok(tokens)) { subTokens := Buffer.toArray(tokens) }; + }; - if (cursor.current() != ')') { - return #err(#GenericError("Expected closing parenthesis at position " # Nat.toText(cursor.getPos()) # ", found '" # Text.fromChar(cursor.current()) # "'")); - }; + if (not cursor.hasNext() or cursor.current() != ')') { + return #err(#GenericError("Expected closing parenthesis at position " # Nat.toText(cursor.getPos()) # ", found '" # Text.fromChar(cursor.current()) # "'")); + }; - cursor.inc(); // Consume the closing parenthesis + cursor.inc(); // Consume the closing ')' - let groupToken: Token = { - tokenType = #Group({ - modifier = groupModifier; - subTokens = subTokens; - quantifier = null; - }); - value = Extensions.slice(input, start, ?cursor.getPos()); - position = #Span(start, cursor.getPos() - 1); + let groupToken: Token = { + tokenType = #Group({ + modifier = groupModifier; + subTokens = subTokens; + quantifier = null; + }); + value = Extensions.slice(input, start, ?cursor.getPos()); + position = #Span(start, cursor.getPos() - 1); + }; + #ok(groupToken) }; - #ok(groupToken) - }; - private func parseGroupModifier(): Result.Result { - if (cursor.hasNext() and cursor.current() == '?') { + if (cursor.hasNext() and cursor.current() == '?') { cursor.inc(); if (cursor.hasNext()) { - switch (cursor.current()) { - case ':' { cursor.inc(); return #ok(?#NonCapturing) }; - case '=' { cursor.inc(); return #ok(?#PositiveLookahead) }; - case '!' { cursor.inc(); return #ok(?#NegativeLookahead) }; - case '<' { - cursor.inc(); - if (cursor.hasNext()) { - switch (cursor.current()) { - case '=' { cursor.inc(); return #ok(?#PositiveLookbehind) }; - case '!' { cursor.inc(); return #ok(?#NegativeLookbehind) }; - case _ { return #err(#GenericError("Invalid lookbehind modifier at position " # Nat.toText(cursor.getPos()))) }; - }; - } else { - return #err(#UnexpectedEndOfInput); - } + switch (cursor.current()) { + case ':' { cursor.inc(); return #ok(?#NonCapturing) }; + case '=' { cursor.inc(); return #ok(?#PositiveLookahead) }; + case '!' { cursor.inc(); return #ok(?#NegativeLookahead) }; + case '<' { + cursor.inc(); + if (cursor.hasNext()) { + switch (cursor.current()) { + case '=' { cursor.inc(); return #ok(?#PositiveLookbehind) }; + case '!' { cursor.inc(); return #ok(?#NegativeLookbehind) }; + case _ { return #err(#GenericError("Invalid lookbehind modifier at position " # Nat.toText(cursor.getPos()))) }; }; - case _ { return #err(#GenericError("Invalid group modifier at position " # Nat.toText(cursor.getPos()))) }; + } else { + return #err(#UnexpectedEndOfInput); + } }; + case _ { return #err(#GenericError("Invalid group modifier at position " # Nat.toText(cursor.getPos()))) }; + }; } else { - return #err(#UnexpectedEndOfInput); + return #err(#UnexpectedEndOfInput); } - }; - // No modifier present - #ok(null) + }; + // No modifier present + #ok(null) }; private func tokenizeSubExpression(): Result.Result, LexerError> { - var subTokens = Buffer.Buffer(16); - - while (cursor.hasNext()) { + var subTokens = Buffer.Buffer(16); + + while (cursor.hasNext()) { if (cursor.current() == ')') { - return #ok(subTokens); + return #ok(subTokens); }; switch (nextToken()) { - case (#ok(token)) { - subTokens.add(token); - }; - case (#err(error)) { - return #err(error); - }; + case (#ok(token)) { + subTokens.add(token); + }; + case (#err(error)) { + return #err(error); + }; }; + }; + + #err(#GenericError("Unclosed group at position " # Nat.toText(cursor.getPos()))) }; - #err(#GenericError("Unclosed group at position " # Nat.toText(cursor.getPos()))) -}; private func tokenizeEscapedChar(): Result.Result { - cursor.inc(); - switch (cursor.current()) { + cursor.inc(); // Move past the backslash + if (not cursor.hasNext()) { + return #err(#UnexpectedEndOfInput); + }; + let escapedChar = cursor.current(); + + let token = switch escapedChar { case 'w' { createToken(#Metacharacter(#WordChar), "\\w") }; case 'W' { createToken(#Metacharacter(#NonWordChar), "\\W") }; case 'd' { createToken(#Metacharacter(#Digit), "\\d") }; @@ -270,8 +284,10 @@ module { case 'A' { createToken(#Anchor(#StartOfStringOnly), "\\A") }; case 'z' { createToken(#Anchor(#EndOfStringOnly), "\\z") }; case 'G' { createToken(#Anchor(#PreviousMatchEnd), "\\G") }; - case _ { createToken(#Character(cursor.current()), "\\" # Text.fromChar(cursor.current())) }; - } + case _ { createToken(#Character(escapedChar), "\\" # Text.fromChar(escapedChar)) }; + }; + cursor.inc(); // Move past the escaped character + token }; private func tokenizeEscapedClass(char: Char): CharacterClass { @@ -286,4 +302,4 @@ module { } }; }; -}; \ No newline at end of file +};