Skip to content

Commit

Permalink
feat: implement regex anchors ^ and $ in multi-line mode.
Browse files Browse the repository at this point in the history
Until now, anchors `^` and `$` only worked in single-line mode, where `^` matches only at the start of the data and `$` only at the end. In multi-line mode these anchors also match at the start and the end of a line, respectively. Multi-line mode can be enabled by prefixing the regex with `(?m)`.
  • Loading branch information
plusvic committed Feb 25, 2025
1 parent 68f8d90 commit 8b01882
Show file tree
Hide file tree
Showing 5 changed files with 277 additions and 21 deletions.
10 changes: 10 additions & 0 deletions lib/src/re/thompson/compiler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -504,6 +504,10 @@ impl Compiler {
Ok(match look {
Look::Start => self.emit_instr(Instr::START)?,
Look::End => self.emit_instr(Instr::END)?,
Look::StartLF | Look::StartCRLF => {
self.emit_instr(Instr::LINE_START)?
}
Look::EndLF | Look::EndCRLF => self.emit_instr(Instr::LINE_END)?,
Look::WordAscii => self.emit_instr(Instr::WORD_BOUNDARY)?,
Look::WordAsciiNegate => {
self.emit_instr(Instr::WORD_BOUNDARY_NEG)?
Expand Down Expand Up @@ -1792,6 +1796,12 @@ impl Display for InstrSeq {
Instr::End => {
writeln!(f, "{:05x}: END", addr)?;
}
Instr::LineStart => {
writeln!(f, "{:05x}: LINE_START", addr)?;
}
Instr::LineEnd => {
writeln!(f, "{:05x}: LINE_END", addr)?;
}
Instr::WordBoundary => {
writeln!(f, "{:05x}: WORD_BOUNDARY", addr)?;
}
Expand Down
16 changes: 16 additions & 0 deletions lib/src/re/thompson/instr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,18 @@ pub enum Instr<'a> {
/// Matches the end of the scanned data ($).
End,

/// Matches the start of the scanned data or the start of a line (^ in
/// multi-line mode). Specifically, this matches at the start of the
/// data, or at the position immediately after a \n character, a \r
/// character or a \r\n sequence.
LineStart,

/// Matches the end of the scanned data or the end of a line ($ in
/// multi-line mode). Specifically, this matches at the end of the
/// data, or at the position immediately before a \n character, a \r
/// character or a \r\n sequence.
LineEnd,

/// Matches a word boundary (i.e: characters that are not part of the
/// \w class). Used for \b look-around assertions. This is a zero-length
/// match.
Expand Down Expand Up @@ -289,6 +301,8 @@ impl Instr<'_> {
pub const WORD_END: u8 = 0x0F;
pub const REPEAT_GREEDY: u8 = 0x10;
pub const REPEAT_NON_GREEDY: u8 = 0x11;
pub const LINE_START: u8 = 0x12;
pub const LINE_END: u8 = 0x13;
}

/// Parses a slice of bytes that contains Pike VM instructions, returning
Expand Down Expand Up @@ -398,6 +412,8 @@ impl<'a> InstrParser<'a> {
}
[OPCODE_PREFIX, Instr::START, ..] => (Instr::Start, 2),
[OPCODE_PREFIX, Instr::END, ..] => (Instr::End, 2),
[OPCODE_PREFIX, Instr::LINE_START, ..] => (Instr::LineStart, 2),
[OPCODE_PREFIX, Instr::LINE_END, ..] => (Instr::LineEnd, 2),
[OPCODE_PREFIX, Instr::WORD_BOUNDARY, ..] => {
(Instr::WordBoundary, 2)
}
Expand Down
130 changes: 115 additions & 15 deletions lib/src/re/thompson/pikevm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -394,29 +394,129 @@ pub(crate) fn epsilon_closure<C: CodeLoc>(
state.threads.push((apply_offset(ip, offset), rep_count));
}
Instr::Start => {
if start.backwards() {
if curr_byte.is_none() {
state.threads.push((
apply_offset(ip, instr_size.into()),
rep_count,
));
}
} else if prev_byte.is_none() {
let is_match = match (start.backwards(), prev_byte, curr_byte)
{
// Going forward, no previous byte, we are at the start of
// the input, this is a match.
(false, None, _) => true,
// Going backward, no current byte, we are at the start
// of the input, this is a match.
(true, _, None) => true,
_ => false,
};
if is_match {
state.threads.push((
apply_offset(ip, instr_size.into()),
rep_count,
));
}
}
Instr::End => {
if start.backwards() {
if prev_byte.is_none() {
state.threads.push((
apply_offset(ip, instr_size.into()),
rep_count,
));
let is_match = match (start.backwards(), prev_byte, curr_byte)
{
// Going forward, no current byte, we are at the end of the
// data, this is a match.
(false, _, None) => true,
// Going backward, no previous byte, we are at the end of
// the data, this is a match.
(true, None, _) => true,
_ => false,
};
if is_match {
state.threads.push((
apply_offset(ip, instr_size.into()),
rep_count,
));
}
}
Instr::LineStart => {
let is_match = match (start.backwards(), prev_byte, curr_byte)
{
// Going forward, no previous byte, we are at the start of
// the input, this is a match.
(false, None, _) => true,
// Going forward, no current byte, previous byte was \n,
// this is a match.
(false, Some(b'\n'), None) => true,
// Going forward, no current byte, previous byte was \r,
// this is a match.
(false, Some(b'\r'), None) => true,
// Going forward, previous byte was \n, this is match if
// current byte is not \r.
(false, Some(b'\n'), Some(curr_byte)) => {
*curr_byte != b'\r'
}
// Going forward, previous byte was \r, this is match if
// current byte is not \n.
(false, Some(b'\r'), Some(curr_byte)) => {
*curr_byte != b'\n'
}
} else if curr_byte.is_none() {
// Going backward, no current byte, we are at the start
// of the input, this is a match.
(true, _, None) => true,
// Going backward, no previous byte and current byte is \n,
// this is a match.
(true, None, Some(b'\n')) => true,
// Going backward, no previous byte and current byte is \r,
// this is a match.
(true, None, Some(b'\r')) => true,
// Going backward, current byte is \n, this is a match if
// previous byte was not \r.
(true, Some(prev_byte), Some(b'\n')) => {
*prev_byte != b'\r'
}
// Going backward, current byte is \r, this is a match if
// previous byte was not \n.
(true, Some(prev_byte), Some(b'\r')) => {
*prev_byte != b'\n'
}
_ => false,
};
if is_match {
state.threads.push((
apply_offset(ip, instr_size.into()),
rep_count,
));
}
}
Instr::LineEnd => {
// Decide whether the multi-line `$` assertion holds at the current
// position. The decision depends on the scan direction and on the bytes
// on each side of the position: a line ends at the end of the data, or
// right before a line terminator (\n or \r), with the two bytes of a
// two-byte terminator sequence treated as a single unit so that the
// assertion never matches in the middle of it.
let is_match = match (start.backwards(), prev_byte, curr_byte)
{
// Going forward, no current byte, we are at the end of the
// data, this is a match.
(false, _, None) => true,
// Going forward, no previous byte and current byte is \n,
// this is a match.
(false, None, Some(b'\n')) => true,
// Going forward, no previous byte and current byte is \r,
// this is a match. (\r, not \t: only \n and \r terminate a line,
// consistently with every other arm below.)
(false, None, Some(b'\r')) => true,
// Going forward, current byte is \n, this is a match if
// the previous byte is not \r.
(false, Some(prev_byte), Some(b'\n')) => {
*prev_byte != b'\r'
}
// Going forward, current byte is \r, this is a match if
// the previous byte is not \n.
(false, Some(prev_byte), Some(b'\r')) => {
*prev_byte != b'\n'
}
// Going backward, no previous byte, we are at the end of
// the data, this is a match.
(true, None, _) => true,
// Going backward, previous byte is \n, this is a match if
// the current byte is not \r.
(true, Some(b'\n'), Some(curr_byte)) => {
*curr_byte != b'\r'
}
// Going backward, previous byte is \r, this is a match if
// the current byte is not \n.
(true, Some(b'\r'), Some(curr_byte)) => {
*curr_byte != b'\n'
}
// Any other combination is not a line end.
_ => false,
};
if is_match {
state.threads.push((
apply_offset(ip, instr_size.into()),
rep_count,
Expand Down
84 changes: 84 additions & 0 deletions lib/src/re/thompson/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1217,6 +1217,90 @@ fn re_code_24() {
);
}

// Checks the code generated for a regexp that uses `^` and `$` without the
// multi-line flag: the anchors are expected to compile to START and END
// (not LINE_START / LINE_END), and `\b` to WORD_BOUNDARY. The backward code
// is expected to be the mirror image of the forward code.
#[test]
fn re_code_25() {
assert_re_code!(
r#"^abc\bxyz$"#,
// Forward code
r#"
00000: START
00002: LIT 0x61
00003: LIT 0x62
00004: LIT 0x63
00005: WORD_BOUNDARY
00007: LIT 0x78
00008: LIT 0x79
00009: LIT 0x7a
0000a: END
0000c: MATCH
"#,
// Backward code
r#"
00000: END
00002: LIT 0x7a
00003: LIT 0x79
00004: LIT 0x78
00005: WORD_BOUNDARY
00007: LIT 0x63
00008: LIT 0x62
00009: LIT 0x61
0000a: START
0000c: MATCH
"#,
// Atoms
vec![RegexpAtom {
atom: Atom::inexact(vec![0x61, 0x62, 0x63, 0x78]),
code_loc: CodeLoc { fwd: 0x00, bck_seq_id: 0, bck: 0x0c }
},],
// Epsilon closure starting at forward code 0.
vec![0x02],
// Epsilon closure starting at backward code 0.
vec![0x02]
);
}

// Checks the code generated for a regexp that enables multi-line mode with
// the `(?m)` prefix: `^` and `$` are expected to compile to LINE_START and
// LINE_END instead of START and END, and `\B` to WORD_BOUNDARY_NEG. The
// backward code is expected to be the mirror image of the forward code.
#[test]
fn re_code_26() {
assert_re_code!(
r#"(?m)^abc\Bxyz$"#,
// Forward code
r#"
00000: LINE_START
00002: LIT 0x61
00003: LIT 0x62
00004: LIT 0x63
00005: WORD_BOUNDARY_NEG
00007: LIT 0x78
00008: LIT 0x79
00009: LIT 0x7a
0000a: LINE_END
0000c: MATCH
"#,
// Backward code
r#"
00000: LINE_END
00002: LIT 0x7a
00003: LIT 0x79
00004: LIT 0x78
00005: WORD_BOUNDARY_NEG
00007: LIT 0x63
00008: LIT 0x62
00009: LIT 0x61
0000a: LINE_START
0000c: MATCH
"#,
// Atoms
vec![RegexpAtom {
atom: Atom::inexact(vec![0x61, 0x62, 0x63, 0x78]),
code_loc: CodeLoc { fwd: 0x00, bck_seq_id: 0, bck: 0x0c }
},],
// Epsilon closure starting at forward code 0.
vec![0x02],
// Epsilon closure starting at backward code 0.
vec![0x02]
);
}

#[rustfmt::skip]
#[test]
fn re_atoms() {
Expand Down
Loading

0 comments on commit 8b01882

Please sign in to comment.