Skip to content

Commit

Permalink
fix(lexer): Nested Group and quantifier support
Browse files Browse the repository at this point in the history
  • Loading branch information
Demali-876 committed Nov 4, 2024
1 parent e5c41fb commit 22764dc
Showing 1 changed file with 151 additions and 135 deletions.
286 changes: 151 additions & 135 deletions src/Lexer.mo
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ module {
type Token = Types.Token;
public type LexerError = Types.RegexError;
type CharacterClass = Types.CharacterClass;

public class Lexer(input: Text) {
let cursor = Cursor.Cursor(input);
let tokenBuffer = Buffer.Buffer<Token>(16);
Expand All @@ -31,47 +32,69 @@ module {
};

// Reads one token starting at the cursor's current character and returns it,
// or a LexerError when the character cannot be tokenized.
//
// NOTE: this span is a rendered diff. Removed and added lines appear back to
// back without +/- markers, so it is not compilable as shown.
//  - Removed revision (first `switch`): every case only builds the token, and
//    the cursor is advanced once afterwards when the result is #ok.
//  - Added revision (second `switch`): the simple cases ('.', '^', '$', '|' and
//    plain characters) call cursor.inc() themselves before building the token;
//    '*', '+', '?', '(', '[', '\' and '{' are forwarded to dedicated tokenizers.
private func nextToken(): Result.Result<Token, LexerError> {
switch (cursor.current()) {
case (char) {
let token = switch char {
case '.' { createToken(#Metacharacter(#Dot), ".") };
// Quantifier shorthands: '*' is (min 0, no max), '+' is (min 1, no max), '?' is (min 0, max 1).
case '*' { tokenizeQuantifier(0, null) };
case '+' { tokenizeQuantifier(1, null) };
case '?' { tokenizeQuantifier(0, ?1) };
case '(' { tokenizeGroup() };
case '[' { tokenizeCharacterClass() };
// '^' is a start-of-string anchor only at position 0; elsewhere it is a literal character.
case '^' {
if (cursor.getPos() == 0) {
createToken(#Anchor(#StartOfString), "^")
} else {
createToken(#Character(char), Text.fromChar(char))
}
switch (cursor.current()) {
case (char) {
let token = switch char {
case '.' {
cursor.inc();
createToken(#Metacharacter(#Dot), ".")
};
// Same anchor-vs-literal rule as the removed revision, but the cursor is advanced before the token is built.
case '^' {
if (cursor.getPos() == 0) {
cursor.inc();
createToken(#Anchor(#StartOfString), "^")
} else {
cursor.inc();
createToken(#Character(char), Text.fromChar(char))
}
};
case '$' {
cursor.inc();
createToken(#Anchor(#EndOfString), "$")
};
case '|' {
cursor.inc();
createToken(#Alternation, "|")
};
case _ {
// Default case for characters
if (char != '\\' and char != '(' and char != '[' and char != '*' and char != '+' and char != '?' and char != '{') {
cursor.inc();
createToken(#Character(char), Text.fromChar(char))
} else {
// Tokens that require special handling
switch char {
case '*' { tokenizeQuantifier(0, null) };
case '+' { tokenizeQuantifier(1, null) };
case '?' { tokenizeQuantifier(0, ?1) };
case '(' { tokenizeGroup() };
case '[' { tokenizeCharacterClass() };
case '\\' { tokenizeEscapedChar() };
case '{' { tokenizeQuantifierRange() };
// NOTE(review): looks unreachable — the enclosing `if` only lets the seven characters handled above reach this switch.
case _ {
#err(#GenericError("Unexpected character '" # Text.fromChar(char) # "' at position " # Nat.toText(cursor.getPos())))
};
}
}
};
};
case '$' { createToken(#Anchor(#EndOfString), "$") };
case '|' { createToken(#Alternation, "|") };
case '\\' { tokenizeEscapedChar() };
case '{' { tokenizeQuantifierRange() };
case _ { createToken(#Character(char), Text.fromChar(char)) };
};
// Removed revision: advance the cursor once after a successful token; leave it untouched on error.
switch (token) {
case (#ok(_)) { cursor.inc() };
case (#err(_)) { };
token
};
token
};
}
};
}
};

// Wraps a token type and its source text in an #ok result, stamping the token
// with an #Instance position derived from the cursor.
//
// NOTE: this span is a rendered diff. The two `position` lines are the removed
// form (cursor.getPos()) and the added form (cursor.getPos() - 1) of one field.
// NOTE(review): the added form assumes the caller has already advanced the
// cursor past the token. For multi-character tokens built through this helper
// (e.g. the quantifier range and character class tokenizers) it yields the
// index of the last consumed character, not the token start — confirm intended.
// NOTE(review): getPos() is passed to Nat.toText elsewhere in this file, so the
// subtraction is on Nat — confirm this is never reached with the cursor at 0.
private func createToken(tokenType: Types.TokenType, value: Text): Result.Result<Token, LexerError> {
#ok({
tokenType = tokenType;
value = value;
position = #Instance(cursor.getPos());
position = #Instance(cursor.getPos() - 1);
})
};

private func tokenizeQuantifier(min: Nat, max: ?Nat): Result.Result<Token, LexerError> {
let start = cursor.getPos();
cursor.inc();
cursor.inc(); // Consume the quantifier character

let mode = if (cursor.hasNext()) {
switch (cursor.current()) {
case '?' { cursor.inc(); #Lazy };
Expand All @@ -86,31 +109,29 @@ module {
};

// Tokenizes a `{...}` quantifier range: consumes '{', collects every character
// up to (not including) '}', and passes the collected text to
// Extensions.parseQuantifierRange to obtain (min, max). The resulting
// #Quantifier token always uses mode #Greedy.
// Returns #InvalidQuantifierRange when the closing '}' is not found.
//
// NOTE: this span is a rendered diff. Removed and added lines appear back to
// back without +/- markers, so it is not compilable as shown.
private func tokenizeQuantifierRange(): Result.Result<Token, LexerError> {
let start = cursor.getPos();
cursor.inc();
let start = cursor.getPos();
cursor.inc(); // Consume the opening '{'

var rangeContent = "";
while (cursor.hasNext() and cursor.current() != '}') {
var rangeContent = "";
while (cursor.hasNext() and cursor.current() != '}') {
rangeContent := rangeContent # Text.fromChar(cursor.current());
cursor.inc();
};
};

// Removed check reads cursor.current() unconditionally; the added check tests hasNext() first.
if (cursor.current() != '}') {
if (not cursor.hasNext() or cursor.current() != '}') {
return #err(#InvalidQuantifierRange("Missing closing '}' for quantifier range at position " # Nat.toText(cursor.getPos())));
};
};
cursor.inc(); // Consume the closing '}'

let (min, max) = Extensions.parseQuantifierRange(rangeContent);
let (min, max) = Extensions.parseQuantifierRange(rangeContent);

// Removed revision built the token record inline with position = #Instance(start);
// the added revision delegates to createToken, which computes the position from the cursor instead.
#ok({
tokenType = #Quantifier({ min; max; mode = #Greedy });
value = Extensions.slice(input, start, ?cursor.getPos());
position = #Instance(start);
})
};
createToken(#Quantifier({ min; max; mode = #Greedy }), Extensions.slice(input, start, ?cursor.getPos()))
};

private func tokenizeCharacterClass(): Result.Result<Token, LexerError> {
let start = cursor.getPos();
cursor.inc();
cursor.inc(); // Consume the opening '['

var isNegated = false;

if (cursor.hasNext() and cursor.current() == '^') {
Expand All @@ -132,133 +153,126 @@ module {
} else {
return #err(#UnexpectedEndOfInput);
};
} else if (c == '-' and classTokens.size() > 0 and cursor.hasNext()) {
} else if (c == '-' and classTokens.size() > 0 and cursor.hasNext() and cursor.current() != ']') {
let nextChar = cursor.current();
cursor.inc();
if (nextChar == ']' or nextChar == '-') {
return #err(#GenericError("Invalid character range at position " # Nat.toText(cursor.getPos()) # ": '" # Text.fromChar(c) # "-" # Text.fromChar(nextChar) # "'"));
} else {
switch (Extensions.arrayLast(classTokens)) {
case (?#Single(lastChar)) {
classTokens := Array.append(Extensions.sliceArray(classTokens, 0, Int.abs(classTokens.size() - 1)), [#Range(lastChar, nextChar)]);
};
case _ {
return #err(#GenericError("Unexpected state in character class at position " # Nat.toText(cursor.getPos())));
};
switch (Extensions.arrayLast(classTokens)) {
case (?#Single(lastChar)) {
classTokens := Array.append(Extensions.sliceArray(classTokens, 0, Int.abs(classTokens.size() - 1)), [#Range(lastChar, nextChar)]);
};
case _ {
return #err(#GenericError("Invalid character range at position " # Nat.toText(cursor.getPos())));
};
};
} else if (c == '-') {
classTokens := Array.append(classTokens, [#Single(c)]);
} else {
classTokens := Array.append(classTokens, [#Single(c)]);
};
};

if (not cursor.hasNext() and cursor.current() != ']') {
if (not cursor.hasNext() or cursor.current() != ']') {
return #err(#GenericError("Unclosed character class at position " # Nat.toText(cursor.getPos())));
};

cursor.inc();
cursor.inc(); // Consume the closing ']'

createToken(#CharacterClass(isNegated, classTokens), Extensions.slice(input, start, ?cursor.getPos()))
};

// Tokenizes a parenthesised group: consumes '(', reads an optional modifier
// via parseGroupModifier, tokenizes the body via tokenizeSubExpression,
// requires a closing ')', and returns a #Group token whose position is
// #Span(start, cursor.getPos() - 1). The token's `quantifier` is always null here.
// Errors from the modifier or the sub-expression are returned unchanged.
//
// NOTE: this span is a rendered diff. The function header appears twice
// (removed revision first, added revision a few lines later) and the bodies
// are interleaved, so it is not compilable as shown.
private func tokenizeGroup(): Result.Result<Token, LexerError> {
let start = cursor.getPos();
// Removed revision only: reject input that ends before the group can be opened.
if (not cursor.hasNext()) {
return #err(#GenericError("Unexpected end of input at position " # Nat.toText(start)));
};
cursor.inc(); // Consume the opening parenthesis

let groupModifierResult = parseGroupModifier();
var groupModifier: ?Types.GroupModifierType = null;
switch (groupModifierResult) {
case (#err(error)) { return #err(error) };
case (#ok(modifier)) { groupModifier := modifier };
};
private func tokenizeGroup(): Result.Result<Token, LexerError> {
let start = cursor.getPos();
cursor.inc(); // Consume the opening '('

let subExprResult = tokenizeSubExpression();
var subTokens: [Token] = [];
switch (subExprResult) {
case (#err(error)) { return #err(error) };
case (#ok(tokens)) { subTokens := Buffer.toArray(tokens) };
};
let groupModifierResult = parseGroupModifier();
var groupModifier: ?Types.GroupModifierType = null;
switch (groupModifierResult) {
case (#err(error)) { return #err(error) };
case (#ok(modifier)) { groupModifier := modifier };
};

if (not cursor.hasNext()) {
return #err(#GenericError("Unexpected end of input while parsing group at position " # Nat.toText(start)));
};
let subExprResult = tokenizeSubExpression();
var subTokens: [Token] = [];
switch (subExprResult) {
case (#err(error)) { return #err(error) };
case (#ok(tokens)) { subTokens := Buffer.toArray(tokens) };
};

if (cursor.current() != ')') {
return #err(#GenericError("Expected closing parenthesis at position " # Nat.toText(cursor.getPos()) # ", found '" # Text.fromChar(cursor.current()) # "'"));
};
// NOTE(review): when hasNext() is false this branch still calls cursor.current() to build the
// message — confirm Cursor.current() is safe at end of input.
if (not cursor.hasNext() or cursor.current() != ')') {
return #err(#GenericError("Expected closing parenthesis at position " # Nat.toText(cursor.getPos()) # ", found '" # Text.fromChar(cursor.current()) # "'"));
};

cursor.inc(); // Consume the closing parenthesis
cursor.inc(); // Consume the closing ')'

// The cursor is now one past ')', so the span end is cursor.getPos() - 1.
let groupToken: Token = {
tokenType = #Group({
modifier = groupModifier;
subTokens = subTokens;
quantifier = null;
});
value = Extensions.slice(input, start, ?cursor.getPos());
position = #Span(start, cursor.getPos() - 1);
let groupToken: Token = {
tokenType = #Group({
modifier = groupModifier;
subTokens = subTokens;
quantifier = null;
});
value = Extensions.slice(input, start, ?cursor.getPos());
position = #Span(start, cursor.getPos() - 1);
};
#ok(groupToken)
};
#ok(groupToken)
};


// Parses an optional group modifier at the cursor (called right after '(' is consumed).
// Returns #ok(null) when the current character is not '?'. After a '?':
//   ':'  -> #NonCapturing        '='  -> #PositiveLookahead   '!'  -> #NegativeLookahead
//   '<=' -> #PositiveLookbehind  '<!' -> #NegativeLookbehind
// Any other character yields a #GenericError; running out of input after '?'
// or after '<' yields #UnexpectedEndOfInput. Recognised modifier characters are consumed.
//
// NOTE: this span is a rendered diff. Removed and added copies of the same
// lines appear back to back; as shown here their text is identical
// (presumably an indentation-only change), so it is not compilable as shown.
private func parseGroupModifier(): Result.Result<?Types.GroupModifierType, LexerError> {
if (cursor.hasNext() and cursor.current() == '?') {
if (cursor.hasNext() and cursor.current() == '?') {
cursor.inc();
if (cursor.hasNext()) {
switch (cursor.current()) {
case ':' { cursor.inc(); return #ok(?#NonCapturing) };
case '=' { cursor.inc(); return #ok(?#PositiveLookahead) };
case '!' { cursor.inc(); return #ok(?#NegativeLookahead) };
// '<' introduces a lookbehind and must be followed by '=' or '!'.
case '<' {
cursor.inc();
if (cursor.hasNext()) {
switch (cursor.current()) {
case '=' { cursor.inc(); return #ok(?#PositiveLookbehind) };
case '!' { cursor.inc(); return #ok(?#NegativeLookbehind) };
case _ { return #err(#GenericError("Invalid lookbehind modifier at position " # Nat.toText(cursor.getPos()))) };
};
} else {
return #err(#UnexpectedEndOfInput);
}
switch (cursor.current()) {
case ':' { cursor.inc(); return #ok(?#NonCapturing) };
case '=' { cursor.inc(); return #ok(?#PositiveLookahead) };
case '!' { cursor.inc(); return #ok(?#NegativeLookahead) };
case '<' {
cursor.inc();
if (cursor.hasNext()) {
switch (cursor.current()) {
case '=' { cursor.inc(); return #ok(?#PositiveLookbehind) };
case '!' { cursor.inc(); return #ok(?#NegativeLookbehind) };
case _ { return #err(#GenericError("Invalid lookbehind modifier at position " # Nat.toText(cursor.getPos()))) };
};
case _ { return #err(#GenericError("Invalid group modifier at position " # Nat.toText(cursor.getPos()))) };
} else {
return #err(#UnexpectedEndOfInput);
}
};
case _ { return #err(#GenericError("Invalid group modifier at position " # Nat.toText(cursor.getPos()))) };
};
} else {
return #err(#UnexpectedEndOfInput);
return #err(#UnexpectedEndOfInput);
}
};
// No modifier present
#ok(null)
};
// No modifier present
#ok(null)
};

// Collects the tokens inside a group by calling nextToken repeatedly until the
// cursor sits on ')'. The ')' itself is NOT consumed here; the buffer is
// returned and the caller is left to consume it.
// The first error from nextToken is returned unchanged. If the input runs out
// before a ')' is seen, returns a #GenericError reporting an unclosed group.
//
// NOTE: this span is a rendered diff. Removed and added copies of the same
// lines appear back to back, so it is not compilable as shown.
private func tokenizeSubExpression(): Result.Result<Buffer.Buffer<Token>, LexerError> {
var subTokens = Buffer.Buffer<Token>(16);
while (cursor.hasNext()) {
var subTokens = Buffer.Buffer<Token>(16);

while (cursor.hasNext()) {
// Stop at the group's closing parenthesis without advancing past it.
if (cursor.current() == ')') {
return #ok(subTokens);
return #ok(subTokens);
};

switch (nextToken()) {
case (#ok(token)) {
subTokens.add(token);
};
case (#err(error)) {
return #err(error);
};
case (#ok(token)) {
subTokens.add(token);
};
case (#err(error)) {
return #err(error);
};
};
};

// Reached only when the loop ends because hasNext() became false.
#err(#GenericError("Unclosed group at position " # Nat.toText(cursor.getPos())))
};

#err(#GenericError("Unclosed group at position " # Nat.toText(cursor.getPos())))
};
private func tokenizeEscapedChar(): Result.Result<Token, LexerError> {
cursor.inc();
switch (cursor.current()) {
cursor.inc(); // Move past the backslash
if (not cursor.hasNext()) {
return #err(#UnexpectedEndOfInput);
};
let escapedChar = cursor.current();

let token = switch escapedChar {
case 'w' { createToken(#Metacharacter(#WordChar), "\\w") };
case 'W' { createToken(#Metacharacter(#NonWordChar), "\\W") };
case 'd' { createToken(#Metacharacter(#Digit), "\\d") };
Expand All @@ -270,8 +284,10 @@ module {
case 'A' { createToken(#Anchor(#StartOfStringOnly), "\\A") };
case 'z' { createToken(#Anchor(#EndOfStringOnly), "\\z") };
case 'G' { createToken(#Anchor(#PreviousMatchEnd), "\\G") };
case _ { createToken(#Character(cursor.current()), "\\" # Text.fromChar(cursor.current())) };
}
case _ { createToken(#Character(escapedChar), "\\" # Text.fromChar(escapedChar)) };
};
cursor.inc(); // Move past the escaped character
token
};

private func tokenizeEscapedClass(char: Char): CharacterClass {
Expand All @@ -286,4 +302,4 @@ module {
}
};
};
};
};

0 comments on commit 22764dc

Please sign in to comment.