feat: implement regex anchors ^ and $ in multi-line mode.

Until now, the anchors `^` and `$` only worked in single-line mode, where `^` matches only at the start of the data and `$` only at the end. In multi-line mode these anchors also match at the start and the end of each line, respectively. Multi-line mode can be enabled by prefixing the regex with `(?m)`.
VirusTotal · Feb 25, 2025 · 8b01882 · 8b01882
1 parent 68f8d90
commit 8b01882
Show file tree

Hide file tree

Showing 5 changed files with 277 additions and 21 deletions.
diff --git a/lib/src/re/thompson/compiler.rs b/lib/src/re/thompson/compiler.rs
@@ -504,6 +504,10 @@ impl Compiler {
         Ok(match look {
             Look::Start => self.emit_instr(Instr::START)?,
             Look::End => self.emit_instr(Instr::END)?,
+            Look::StartLF | Look::StartCRLF => {
+                self.emit_instr(Instr::LINE_START)?
+            }
+            Look::EndLF | Look::EndCRLF => self.emit_instr(Instr::LINE_END)?,
             Look::WordAscii => self.emit_instr(Instr::WORD_BOUNDARY)?,
             Look::WordAsciiNegate => {
                 self.emit_instr(Instr::WORD_BOUNDARY_NEG)?
@@ -1792,6 +1796,12 @@ impl Display for InstrSeq {
                 Instr::End => {
                     writeln!(f, "{:05x}: END", addr)?;
                 }
+                Instr::LineStart => {
+                    writeln!(f, "{:05x}: LINE_START", addr)?;
+                }
+                Instr::LineEnd => {
+                    writeln!(f, "{:05x}: LINE_END", addr)?;
+                }
                 Instr::WordBoundary => {
                     writeln!(f, "{:05x}: WORD_BOUNDARY", addr)?;
                 }

diff --git a/lib/src/re/thompson/instr.rs b/lib/src/re/thompson/instr.rs
@@ -248,6 +248,18 @@ pub enum Instr<'a> {
     /// Matches the end of the scanned data ($).
     End,
 
+    /// Matches the start of the scanned data or the start of a line (^ in
+    /// multi-line mode). Specifically, this matches at the start of the
+    /// data, or at the position immediately after a \n character, a \r
+    /// character or a \r\n sequence.
+    LineStart,
+
+    /// Matches the end of the scanned data or the end of a line ($ in
+    /// multi-line mode). Specifically, this matches at the end of the
+    /// data, or at the position immediately before a \n character, a \r
+    /// character or a \r\n sequence.
+    LineEnd,
+
     /// Matches a word boundary (i.e: characters that are not part of the
     /// \w class). Used for \b look-around assertions. This is a zero-length
     /// match.
@@ -289,6 +301,8 @@ impl Instr<'_> {
     pub const WORD_END: u8 = 0x0F;
     pub const REPEAT_GREEDY: u8 = 0x10;
     pub const REPEAT_NON_GREEDY: u8 = 0x11;
+    pub const LINE_START: u8 = 0x12;
+    pub const LINE_END: u8 = 0x13;
 }
 
 /// Parses a slice of bytes that contains Pike VM instructions, returning
@@ -398,6 +412,8 @@ impl<'a> InstrParser<'a> {
             }
             [OPCODE_PREFIX, Instr::START, ..] => (Instr::Start, 2),
             [OPCODE_PREFIX, Instr::END, ..] => (Instr::End, 2),
+            [OPCODE_PREFIX, Instr::LINE_START, ..] => (Instr::LineStart, 2),
+            [OPCODE_PREFIX, Instr::LINE_END, ..] => (Instr::LineEnd, 2),
             [OPCODE_PREFIX, Instr::WORD_BOUNDARY, ..] => {
                 (Instr::WordBoundary, 2)
             }

diff --git a/lib/src/re/thompson/pikevm.rs b/lib/src/re/thompson/pikevm.rs
@@ -394,29 +394,129 @@ pub(crate) fn epsilon_closure<C: CodeLoc>(
                 state.threads.push((apply_offset(ip, offset), rep_count));
             }
             Instr::Start => {
-                if start.backwards() {
-                    if curr_byte.is_none() {
-                        state.threads.push((
-                            apply_offset(ip, instr_size.into()),
-                            rep_count,
-                        ));
-                    }
-                } else if prev_byte.is_none() {
+                let is_match = match (start.backwards(), prev_byte, curr_byte)
+                {
+                    // Going forward, no previous byte, we are at the start of
+                    // the input, this is a match.
+                    (false, None, _) => true,
+                    // Going backward, no current byte, we are at the start
+                    // of the input, this is a match.
+                    (true, _, None) => true,
+                    _ => false,
+                };
+                if is_match {
                     state.threads.push((
                         apply_offset(ip, instr_size.into()),
                         rep_count,
                     ));
                 }
             }
             Instr::End => {
-                if start.backwards() {
-                    if prev_byte.is_none() {
-                        state.threads.push((
-                            apply_offset(ip, instr_size.into()),
-                            rep_count,
-                        ));
+                let is_match = match (start.backwards(), prev_byte, curr_byte)
+                {
+                    // Going forward, no current byte, we are at the end of the
+                    // data, this is a match.
+                    (false, _, None) => true,
+                    // Going backward, no previous byte, we are at the end of
+                    // the data, this is a match.
+                    (true, None, _) => true,
+                    _ => false,
+                };
+                if is_match {
+                    state.threads.push((
+                        apply_offset(ip, instr_size.into()),
+                        rep_count,
+                    ));
+                }
+            }
+            Instr::LineStart => {
+                let is_match = match (start.backwards(), prev_byte, curr_byte)
+                {
+                    // Going forward, no previous byte, we are at the start of
+                    // the input, this is a match.
+                    (false, None, _) => true,
+                    // Going forward, no current byte, previous byte was \n,
+                    // this is a match.
+                    (false, Some(b'\n'), None) => true,
+                    // Going forward, no current byte, previous byte was \r,
+                    // this is a match.
+                    (false, Some(b'\r'), None) => true,
+                    // Going forward, previous byte was \n, this is match if
+                    // current byte is not \r.
+                    (false, Some(b'\n'), Some(curr_byte)) => {
+                        *curr_byte != b'\r'
+                    }
+                    // Going forward, previous byte was \r, this is match if
+                    // current byte is not \n.
+                    (false, Some(b'\r'), Some(curr_byte)) => {
+                        *curr_byte != b'\n'
                     }
-                } else if curr_byte.is_none() {
+                    // Going backward, no current byte, we are at the start
+                    // of the input, this is a match.
+                    (true, _, None) => true,
+                    // Going backward, no previous byte and current byte is \n,
+                    // this is a match.
+                    (true, None, Some(b'\n')) => true,
+                    // Going backward, no previous byte and current byte is \r,
+                    // this is a match.
+                    (true, None, Some(b'\r')) => true,
+                    // Going backward, current byte is \n, this is a match if
+                    // previous byte was not \r.
+                    (true, Some(prev_byte), Some(b'\n')) => {
+                        *prev_byte != b'\r'
+                    }
+                    // Going backward, current byte is \r, this is a match if
+                    // previous byte was not \n.
+                    (true, Some(prev_byte), Some(b'\r')) => {
+                        *prev_byte != b'\n'
+                    }
+                    _ => false,
+                };
+                if is_match {
+                    state.threads.push((
+                        apply_offset(ip, instr_size.into()),
+                        rep_count,
+                    ));
+                }
+            }
+            Instr::LineEnd => {
+                let is_match = match (start.backwards(), prev_byte, curr_byte)
+                {
+                    // Going forward, no current byte, we are at the end of the
+                    // data, this is a match.
+                    (false, _, None) => true,
+                    // Going forward, no previous byte and current byte is \n,
+                    // this is a match.
+                    (false, None, Some(b'\n')) => true,
+                    // Going forward, no previous byte and current byte is \r,
+                    // this is a match.
+                    (false, None, Some(b'\r')) => true,
+                    // Going forward, current byte is \n, this is a match if
+                    // the previous byte is not \r.
+                    (false, Some(prev_byte), Some(b'\n')) => {
+                        *prev_byte != b'\r'
+                    }
+                    // Going forward, current byte is \r, this is a match if
+                    // the previous byte is not \n.
+                    (false, Some(prev_byte), Some(b'\r')) => {
+                        *prev_byte != b'\n'
+                    }
+                    // Going backward, no previous byte, we are at the end of
+                    // the data, this is a match.
+                    (true, None, _) => true,
+                    // Going backward, previous byte is \n, this is a match if
+                    // the current byte is not \r.
+                    (true, Some(b'\n'), Some(curr_byte)) => {
+                        *curr_byte != b'\r'
+                    }
+                    // Going backward, previous byte is \r, this is a match if
+                    // the current byte is not \n.
+                    (true, Some(b'\r'), Some(curr_byte)) => {
+                        *curr_byte != b'\n'
+                    }
+                    _ => false,
+                };
+                if is_match {
                     state.threads.push((
                         apply_offset(ip, instr_size.into()),
                         rep_count,

diff --git a/lib/src/re/thompson/tests.rs b/lib/src/re/thompson/tests.rs
@@ -1217,6 +1217,90 @@ fn re_code_24() {
     );
 }
 
+#[test]
+fn re_code_25() {
+    assert_re_code!(
+        r#"^abc\bxyz$"#,
+        // Forward code
+        r#"
+00000: START
+00002: LIT 0x61
+00003: LIT 0x62
+00004: LIT 0x63
+00005: WORD_BOUNDARY
+00007: LIT 0x78
+00008: LIT 0x79
+00009: LIT 0x7a
+0000a: END
+0000c: MATCH
+"#,
+        // Backward code
+        r#"
+00000: END
+00002: LIT 0x7a
+00003: LIT 0x79
+00004: LIT 0x78
+00005: WORD_BOUNDARY
+00007: LIT 0x63
+00008: LIT 0x62
+00009: LIT 0x61
+0000a: START
+0000c: MATCH
+"#,
+        // Atoms
+        vec![RegexpAtom {
+            atom: Atom::inexact(vec![0x61, 0x62, 0x63, 0x78]),
+            code_loc: CodeLoc { fwd: 0x00, bck_seq_id: 0, bck: 0x0c }
+        },],
+        // Epsilon closure starting at forward code 0.
+        vec![0x02],
+        // Epsilon closure starting at backward code 0.
+        vec![0x02]
+    );
+}
+
+#[test]
+fn re_code_26() {
+    assert_re_code!(
+        r#"(?m)^abc\Bxyz$"#,
+        // Forward code
+        r#"
+00000: LINE_START
+00002: LIT 0x61
+00003: LIT 0x62
+00004: LIT 0x63
+00005: WORD_BOUNDARY_NEG
+00007: LIT 0x78
+00008: LIT 0x79
+00009: LIT 0x7a
+0000a: LINE_END
+0000c: MATCH
+"#,
+        // Backward code
+        r#"
+00000: LINE_END
+00002: LIT 0x7a
+00003: LIT 0x79
+00004: LIT 0x78
+00005: WORD_BOUNDARY_NEG
+00007: LIT 0x63
+00008: LIT 0x62
+00009: LIT 0x61
+0000a: LINE_START
+0000c: MATCH
+"#,
+        // Atoms
+        vec![RegexpAtom {
+            atom: Atom::inexact(vec![0x61, 0x62, 0x63, 0x78]),
+            code_loc: CodeLoc { fwd: 0x00, bck_seq_id: 0, bck: 0x0c }
+        },],
+        // Epsilon closure starting at forward code 0.
+        vec![0x02],
+        // Epsilon closure starting at backward code 0.
+        vec![0x02]
+    );
+}
+
 #[rustfmt::skip]
 #[test]
 fn re_atoms() {