Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(regex-engine): implement custom regex-engine for the compiler #16

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 94 additions & 0 deletions regex_engine/ast/ast.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#include "ast.hh"
// RE Class Implementation
// Builds the root regular-expression node around a single child.
//
// child      - sub-tree wrapped by this node; stored in `child` and also
//              appended as the only entry of `children`.
// capturing  - value reported later by is_capturing().
// group_name - label stored verbatim in `group_name`.
//
// `group_id` always starts at -1 here.
RE::RE(std::shared_ptr<ASTNode> child, bool capturing, std::string group_name)
    : __capturing__(capturing),
      group_name(group_name),
      group_id(-1),
      child(child) {
  // The member was just initialised from the parameter, so both hold the same
  // pointer; the member is used here to make that explicit.
  this->children.push_back(this->child);
}

bool RE::is_capturing() const { return __capturing__; }

// LeafNode Class Implementation
LeafNode::LeafNode() : ASTNode() {}

bool LeafNode::is_match(const string &ch, int str_i, int str_len) const {
return false;
}

// Element Class Implementation
// Creates a literal element that matches `match_ch`.
// Both repetition bounds (`min`, `max`) start at the integer 1.
Element::Element(const std::string &match_ch)
    : LeafNode(),
      match(match_ch),
      min(1),
      max(1) {
}

bool Element::is_match(const string &ch, int str_i, int str_len) const {
return match == ch;
}

// WildcardElement Class Implementation
// A wildcard is an Element whose `match` text is the placeholder "anything";
// the actual matching rule lives in WildcardElement::is_match.
WildcardElement::WildcardElement()
    : Element("anything") {
}

bool WildcardElement::is_match(const string &ch, int str_i, int str_len) const {
return ch != "\n";
}

// SpaceElement Class Implementation
// Whitespace element: built on an Element with empty `match` text, which is
// then reset to empty once more in the body.
SpaceElement::SpaceElement()
    : Element("") {
  this->match.clear();
}

bool SpaceElement::is_match(const string &ch, int str_i, int str_len) const {
return ch.length() == 1 && isspace(ch[0]);
}

// RangeElement Class Implementation
// Creates a character-set element.
//
// match_str         - the characters that make up the set.
// is_positive_logic - true: match characters inside the set;
//                     false: match characters outside it.
//
// Both repetition bounds (`min`, `max`) start at the integer 1.
RangeElement::RangeElement(const std::string &match_str,
                           bool is_positive_logic)
    : LeafNode(),
      match(match_str),
      min(1),
      max(1),
      is_positive_logic(is_positive_logic) {
}

bool RangeElement::is_match(const string &ch, int str_i, int str_len) const {
if (ch.empty())
return false;
bool ch_in_match = (match.find(ch[0]) != string::npos);
return is_positive_logic ? ch_in_match : !ch_in_match;
}

// StartElement Class Implementation
// Start-of-input anchor: empty `match` text and both repetition bounds set
// to the integer 1.
StartElement::StartElement()
    : LeafNode(),
      match(""),
      min(1),
      max(1) {
}

bool StartElement::is_match(const string &ch, int str_i, int str_len) const {
return str_i == 0;
}

// EndElement Class Implementation
// End-of-input anchor: empty `match` text and both repetition bounds set to
// the integer 1.
EndElement::EndElement()
    : LeafNode(),
      match(""),
      min(1),
      max(1) {
}

bool EndElement::is_match(const string &ch, int str_i, int str_len) const {
return str_i == str_len;
}

// OrNode Class Implementation
OrNode::OrNode(shared_ptr<ASTNode> left, shared_ptr<ASTNode> right)
: ASTNode() {
this->left = left;
this->right = right;
this->children.push_back(left);
this->children.push_back(right);
this->min = 1;
this->max = 1;
}

// GroupNode Class Implementation
// Builds a group node.
//
// children   - sub-nodes of the group, copied into `children`.
// capturing  - value reported later by is_capturing().
// group_name - label for the group; when empty, a name of the form
//              "Group<group_id>" is generated instead.
// group_id   - numeric id stored in `group_id`.
//
// Both repetition bounds (`min`, `max`) start at the integer 1.
GroupNode::GroupNode(std::deque<std::shared_ptr<ASTNode>> children,
                     bool capturing, std::string group_name, int group_id)
    : __capturing__(capturing),
      group_id(group_id),
      min(1),
      max(1) {
  if (group_name.empty()) {
    // No explicit name supplied: derive one from the id already stored.
    this->group_name = "Group" + std::to_string(this->group_id);
  } else {
    this->group_name = group_name;
  }

  this->children = children;
}

bool GroupNode::is_capturing() const { return __capturing__; }
132 changes: 132 additions & 0 deletions regex_engine/ast/ast.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
#ifndef RE_AST_HH
#define RE_AST_HH

#include <algorithm>
#include <cctype>
#include <deque>
#include <iostream>
#include <memory>
#include <string>
#include <variant>
#include <vector>

using namespace std;

// Base class for all AST nodes
// Common polymorphic base of every AST node. It holds no data; the virtual
// destructor allows derived nodes to be destroyed through an ASTNode pointer.
class ASTNode {
public:
  ASTNode() = default;
  virtual ~ASTNode() = default;
};

// Regular Expression Node
class RE : public ASTNode {
public:
bool __capturing__;
string group_name;
int group_id;
shared_ptr<ASTNode> child;
deque<shared_ptr<ASTNode>> children;

RE(shared_ptr<ASTNode> child, bool capturing = false,
string group_name = "RegEx");
bool is_capturing() const;
};

// Leaf Node Base Class
// Base class of all leaf (terminal) nodes. It declares the matching
// predicate that concrete leaves override.
class LeafNode : public ASTNode {
public:
  LeafNode();

  // Matching predicate; the base implementation always returns false.
  //   ch      - candidate text
  //   str_i   - current position in the input
  //   str_len - length of the input
  // NOTE(review): the defaults here (-1, -1) differ from the (0, 0) used by
  // the overrides. Default arguments are chosen from the static type of the
  // call expression, so a call through LeafNode uses -1.
  virtual bool is_match(const std::string &ch = "", int str_i = -1,
                        int str_len = -1) const;
};

// Element Node
class Element : public LeafNode {
public:
Element(const string &match_ch = "");
bool is_match(const string &ch = "", int str_i = 0,
int str_len = 0) const override;
string match;
variant<int, double> min;
variant<int, double> max;
};

// Wildcard Element Node
class WildcardElement : public Element {
public:
WildcardElement();
bool is_match(const string &ch = "", int str_i = 0,
int str_len = 0) const override;
};

// Space Element Node
class SpaceElement : public Element {
public:
SpaceElement();
bool is_match(const string &ch = "", int str_i = 0,
int str_len = 0) const override;
};

// Range Element Node
class RangeElement : public LeafNode {
public:
RangeElement(const string &match_str, bool is_positive_logic = true);
bool is_match(const string &ch = "", int str_i = 0,
int str_len = 0) const override;
string match;
variant<int, double> min;
variant<int, double> max;
bool is_positive_logic;
};

// Start Element Node
class StartElement : public LeafNode {
public:
StartElement();
bool is_match(const string &ch = "", int str_i = 0,
int str_len = 0) const override;
string match;
variant<int, double> min;
variant<int, double> max;
};

// End Element Node
class EndElement : public LeafNode {
public:
EndElement();
bool is_match(const string &ch = "", int str_i = 0,
int str_len = 0) const override;
string match;
variant<int, double> min;
variant<int, double> max;
};

// OR Node
class OrNode : public ASTNode {
public:
shared_ptr<ASTNode> left;
shared_ptr<ASTNode> right;
vector<shared_ptr<ASTNode>> children;
variant<int, double> min;
variant<int, double> max;

OrNode(shared_ptr<ASTNode> left, shared_ptr<ASTNode> right);
};

// Group Node
class GroupNode : public ASTNode {
public:
bool __capturing__;
string group_name;
int group_id;
deque<shared_ptr<ASTNode>> children;
variant<int, double> min;
variant<int, double> max;

GroupNode(deque<shared_ptr<ASTNode>> children, bool capturing = false,
string group_name = "", int group_id = -1);
bool is_capturing() const;
};

#endif // RE_AST_HH
90 changes: 90 additions & 0 deletions regex_engine/lexer/lexer.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#include "lexer.hh"

// True when `ch` is one of the characters listed in the `digits` member.
bool Lexer::isDigit(char ch) const {
  const auto where = digits.find(ch);
  return where != std::string::npos;
}

vector<Token *> Lexer::scan(const string &re) {
vector<Token *> tokens;

auto append = [&tokens](Token *elem) { tokens.push_back(elem); };

size_t i = 0;
bool escape_found = false;

while (i < re.size()) {
char ch = re[i];

if (escape_found) {
// Handle escape sequences
if (ch == 't') {
append(new ElementToken("\t"));
} else if (ch == 's') {
append(new SpaceToken(ch));
} else {
string str(1, ch);
append(new ElementToken(str));
}
} else if (ch == '\\') {
escape_found = true;
++i;
continue;
} else if (ch == '.') {
append(new Wildcard());
} else if (ch == '(') {
append(new LeftParenthesis());
} else if (ch == ')') {
append(new RightParenthesis());
} else if (ch == '[') {
append(new LeftBracket());
} else if (ch == '-') {
append(new Dash());
} else if (ch == ']') {
append(new RightBracket());
} else if (ch == '{') {
append(new LeftCurlyBrace());
++i;
while (i < re.size()) {
ch = re[i];
if (ch == ',') {
append(new Comma());
} else if (isDigit(ch)) {
append(new ElementToken(string(1, ch)));
} else if (ch == '}') {
append(new RightCurlyBrace());
break;
} else {
throw invalid_argument("Bad token at index " + to_string(i));
}
++i;
}
} else if (ch == '^') {
if (i == 0) {
append(new Start());
} else {
append(new Circumflex());
}
} else if (ch == '$') {
append(new End());
} else if (ch == '?') {
append(new QuestionMark());
} else if (ch == '*') {
append(new Asterisk());
} else if (ch == '+') {
append(new Plus());
} else if (ch == '|') {
append(new VerticalBar());
} else if (ch == '}') {
append(new RightCurlyBrace());
} else {
append(new ElementToken(string(1, ch)));
}

escape_found = false;
++i;
}
// for (const auto& token : tokens) {
// cerr << "Token: " << token->char_ << " (Type: " <<
// typeid(*token).name() << ")" << endl;
// }

return tokens;
}
21 changes: 21 additions & 0 deletions regex_engine/lexer/lexer.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#ifndef LEXER_H
#define LEXER_H

#include "../tokens/tokens.hh"
#include <string>
#include <vector>
using namespace std;

class Lexer {
private:
const string digits = "0123456789";

public:
Lexer() = default;

bool isDigit(char ch) const;

vector<Token *> scan(const std::string &re);
};

#endif
Loading
Loading