Skip to content

Commit

Permalink
fixes and improvements for regexpVet checker (#412)
Browse files Browse the repository at this point in the history
*   Improved flag flow checking.
    Now it can handle flag clearing.

*   Added checks for dangling/redundant ^.

*   Fix false positive with altAnchors.

Signed-off-by: Iskander Sharipov <[email protected]>
  • Loading branch information
quasilyte authored Apr 14, 2020
1 parent eff3772 commit d9b6dda
Show file tree
Hide file tree
Showing 3 changed files with 188 additions and 21 deletions.
121 changes: 102 additions & 19 deletions src/linter/regexpvet.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,17 @@ import (
type regexpVet struct {
parser *syntax.Parser

issues []string
exprFlags []string
issues []string
flagStates []regexpFlagState
goodAnchors []syntax.Position
}

type regexpFlagState [utf8.RuneSelf]bool // 128 bytes per group context

func (c *regexpVet) CheckRegexp(pat regexpPattern) ([]string, error) {
c.issues = c.issues[:0]
c.exprFlags = c.exprFlags[:0]
c.flagStates = c.flagStates[:0]
c.goodAnchors = c.goodAnchors[:0]

re, err := c.parser.ParsePCRE(pat.value)
if err != nil {
Expand All @@ -30,11 +34,46 @@ func (c *regexpVet) CheckRegexp(pat regexpPattern) ([]string, error) {
}
return nil, err
}

c.flagStates = append(c.flagStates, regexpFlagState{})
c.updateFlagState(c.currentFlagState(), re.Expr, re.Modifiers)

c.markGoodCarets(re.Expr)
c.walk(re.Expr)

return c.issues, nil
}

func (c *regexpVet) markGoodCarets(e syntax.Expr) {
canSkip := func(e syntax.Expr) bool {
switch e.Op {
case syntax.OpFlagOnlyGroup:
return true
case syntax.OpGroup:
x := e.Args[0]
return x.Op == syntax.OpConcat && len(x.Args) == 0
}
return false
}

if e.Op == syntax.OpConcat && len(e.Args) > 1 {
i := 0
for i < len(e.Args) && canSkip(e.Args[i]) {
i++
}
if i < len(e.Args) {
c.markGoodCarets(e.Args[i])
}
return
}
if e.Op == syntax.OpCaret {
c.addGoodAnchor(e.Pos)
}
for _, a := range e.Args {
c.markGoodCarets(a)
}
}

func (c *regexpVet) walk(e syntax.Expr) {
switch e.Op {
case syntax.OpAlt:
Expand All @@ -54,18 +93,27 @@ func (c *regexpVet) walk(e syntax.Expr) {
c.walk(e.Args[0])

case syntax.OpFlagOnlyGroup:
c.checkFlags(e, e.Args[0].Value)
c.exprFlags = append(c.exprFlags, e.Args[0].Value)
c.updateFlagState(c.currentFlagState(), e, e.Args[0].Value)
case syntax.OpGroupWithFlags:
nflags := len(c.exprFlags)
c.checkFlags(e, e.Args[1].Value)
c.exprFlags = append(c.exprFlags, e.Args[1].Value)
// Creates a new context using the current context copy.
// New flags are evaluated inside a new context.
// After nested expressions are processed, previous context is restored.
nflags := len(c.flagStates)
c.flagStates = append(c.flagStates, *c.currentFlagState())
c.updateFlagState(c.currentFlagState(), e, e.Args[1].Value)
c.walk(e.Args[0])
c.exprFlags = c.exprFlags[:nflags]
c.flagStates = c.flagStates[:nflags]
case syntax.OpGroup, syntax.OpCapture, syntax.OpNamedCapture:
nflags := len(c.exprFlags)
// Like with OpGroupWithFlags, but doesn't evaluate any new flags.
nflags := len(c.flagStates)
c.flagStates = append(c.flagStates, *c.currentFlagState())
c.walk(e.Args[0])
c.exprFlags = c.exprFlags[:nflags]
c.flagStates = c.flagStates[:nflags]

case syntax.OpCaret:
if !c.isGoodAnchor(e) {
c.warn("dangling or redundant ^, maybe \\^ is intended?")
}

default:
for _, a := range e.Args {
Expand All @@ -74,11 +122,29 @@ func (c *regexpVet) walk(e syntax.Expr) {
}
}

func (c *regexpVet) checkFlags(e syntax.Expr, flags string) {
for _, fset := range c.exprFlags {
if i := strings.IndexAny(flags, fset); i != -1 {
c.warn("redundant flag %c in %s", flags[i], e.Value)
func (c *regexpVet) currentFlagState() *regexpFlagState {
return &c.flagStates[len(c.flagStates)-1]
}

func (c *regexpVet) updateFlagState(state *regexpFlagState, e syntax.Expr, flagString string) {
clearing := false
for i := 0; i < len(flagString); i++ {
ch := flagString[i]
if ch == '-' {
clearing = true
continue
}

if clearing {
if !state[ch] {
c.warn("clearing unset flag %c in %s", ch, e.Value)
}
} else {
if state[ch] {
c.warn("redundant flag %c in %s", ch, e.Value)
}
}
state[ch] = !clearing
}
}

Expand Down Expand Up @@ -114,10 +180,10 @@ func (c *regexpVet) checkAltAnchor(alt syntax.Expr) {

// Case 1: an alternation of literals where 1st expr begins with ^ anchor.
first := alt.Args[0]
if first.Op == syntax.OpConcat && len(first.Args) > 0 && first.Args[0].Op == syntax.OpCaret {
if first.Op == syntax.OpConcat && len(first.Args) == 2 && first.Args[0].Op == syntax.OpCaret && c.isCharOrLit(first.Args[1]) {
matched := true
for _, a := range alt.Args[1:] {
if a.Op != syntax.OpLiteral && a.Op != syntax.OpChar {
if !c.isCharOrLit(a) {
matched = false
break
}
Expand All @@ -129,10 +195,10 @@ func (c *regexpVet) checkAltAnchor(alt syntax.Expr) {

// Case 2: an alternation of literals where last expr ends with $ anchor.
last := alt.Args[len(alt.Args)-1]
if last.Op == syntax.OpConcat && len(last.Args) > 0 && last.LastArg().Op == syntax.OpDollar {
if last.Op == syntax.OpConcat && len(last.Args) == 2 && last.Args[1].Op == syntax.OpDollar && c.isCharOrLit(last.Args[0]) {
matched := true
for _, a := range alt.Args[:len(alt.Args)-1] {
if a.Op != syntax.OpLiteral && a.Op != syntax.OpChar {
if !c.isCharOrLit(a) {
matched = false
break
}
Expand Down Expand Up @@ -285,6 +351,10 @@ func (c *regexpVet) charClassBoundRune(e syntax.Expr) rune {
}
}

func (c *regexpVet) isCharOrLit(e syntax.Expr) bool {
return e.Op == syntax.OpChar || e.Op == syntax.OpLiteral
}

func (c *regexpVet) octalToRune(e syntax.Expr) rune {
v, _ := strconv.ParseInt(e.Value[len(`\`):], 8, 32)
return rune(v)
Expand All @@ -307,6 +377,19 @@ func (c *regexpVet) stringToRune(s string) rune {
return ch
}

func (c *regexpVet) addGoodAnchor(pos syntax.Position) {
c.goodAnchors = append(c.goodAnchors, pos)
}

func (c *regexpVet) isGoodAnchor(e syntax.Expr) bool {
for _, pos := range c.goodAnchors {
if e.Pos == pos {
return true
}
}
return false
}

func (c *regexpVet) warn(format string, args ...interface{}) {
c.issues = append(c.issues, fmt.Sprintf(format, args...))
}
Expand Down
2 changes: 1 addition & 1 deletion src/linttest/regexpsimplify_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ function _4($s) {
preg_match('/^\s*http\s*{/', $s);
preg_match('/(?i)windows nt/', $s);
preg_match('/(?i)windows phone/', $s);
preg_match('~^.* ENGINE=.*$/\)~', $s);
preg_match('~^.* ENGINE=.*/\)~', $s);
preg_match('/[a-zA-Z0-9]+@[a-zA-Z-0-9.]+\.[a-zA-Z0-9]+/', $s);
preg_match('/he|ll|o+/', $s);
preg_match('/^(\S*) (\S*) (\d*) (\S*) IP(\d) (\S*)/', $s);
Expand Down
86 changes: 85 additions & 1 deletion src/linttest/regexpvet_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,13 +87,38 @@ function redundantFlags($s) {
preg_match('~(?ims:(?i:foo))(?im:bar)~', $s);
preg_match('~(?i)(?ims:flags1)(?m:flags2)~', $s);
preg_match('~((?m)(?m:a|b(?s:foo))(?i)x)~', $s);
preg_match('~(?i)foo~i');
}
function redundantFlagClear($s) {
preg_match('/(?-i)x/', $s);
preg_match('/(?i:foo)(?-i)bar/', $s);
preg_match('/(?i:(?m:fo(?-i)o))(?-mi)bar/', $s);
preg_match('/(?i-ii)/', $s);
preg_match('/(?:(?i)foo)(?-i)/', $s);
preg_match('/((?i)(?-i))(?-i)/', $s);
preg_match('/(?:(?i)(?-i))(?-i)/', $s);
preg_match('/(?m-s)(?:tags)/(\S+)$/', $s);
}
function suspiciousAltAnchor($s) {
preg_match('~^foo|bar|baz~', $s);
preg_match('~(?:^a|bc)c~', $s);
preg_match('~foo|bar|baz$~', $s);
preg_match('~(?:a|bc$)c~', $s);
preg_match('~c(?:a|bc$)~', $s);
}
function danglingCaret() {
preg_match('~a^~', $s);
preg_match('~a^b~', $s);
preg_match('~^^foo~', $s);
preg_match('~foo?|bar^~', $s);
preg_match('~(?i:a)^foo~', $s);
preg_match('~(?i)^(?:foo|bar|^baz)~', $s);
preg_match('~(?i)^(?m)foobar^baz~', $s);
preg_match('~(?i:foo|((?:f|b|(foo|^bar)^)))~', $s);
preg_match('~(?i)(?m)\n^foo|bar|baz~', $s);
}
`)
test.Expect = []string{
Expand Down Expand Up @@ -135,11 +160,32 @@ function suspiciousAltAnchor($s) {
`redundant flag i in (?i:foo)`,
`redundant flag i in (?ims:flags1)`,
`redundant flag m in (?m:a|b(?s:foo))`,
`redundant flag i in (?i)`,

`clearing unset flag i in (?-i)`,
`clearing unset flag i in (?-i)`,
`clearing unset flag m in (?-mi)`,
`clearing unset flag i in (?-mi)`,
`clearing unset flag i in (?i-ii)`,
`clearing unset flag i in (?-i)`,
`clearing unset flag i in (?-i)`,
`clearing unset flag i in (?-i)`,
`clearing unset flag s in (?m-s)`,

`^ applied only to 'foo' in ^foo|bar|baz`,
`^ applied only to 'a' in ^a|bc`,
`$ applied only to 'baz' in foo|bar|baz$`,
`$ applied only to 'bc' in a|bc$`,

`dangling or redundant ^, maybe \^ is intended?`,
`dangling or redundant ^, maybe \^ is intended?`,
`dangling or redundant ^, maybe \^ is intended?`,
`dangling or redundant ^, maybe \^ is intended?`,
`dangling or redundant ^, maybe \^ is intended?`,
`dangling or redundant ^, maybe \^ is intended?`,
`dangling or redundant ^, maybe \^ is intended?`,
`dangling or redundant ^, maybe \^ is intended?`,
`dangling or redundant ^, maybe \^ is intended?`,
}
runFilterMatch(test, `regexpVet`)
}
Expand Down Expand Up @@ -191,3 +237,41 @@ function f($s) {
}
runFilterMatch(test, `regexpVet`)
}

func TestREVet_4(t *testing.T) {
test := linttest.NewSuite(t)
test.LoadStubs = []string{`stubs/phpstorm-stubs/pcre/pcre.php`}
test.AddFile(`<?php
function goodFlags($s) {
preg_match('~(?:(?i)foo)(?i)bar~', $s);
preg_match('~(?m)(?i)(?-m)(?-i)(?m)(?i)~', $s);
preg_match('~(?ms:(?i:foo))(?im:bar)~', $s);
preg_match('~(?i)(?ms:flags1)(?m:flags2)~', $s);
preg_match('~((?m)(?i:a|b(?s:foo))(?i)x)~', $s);
preg_match('~(?i)yy(?-i)x~', $s);
preg_match('~(?i-i)~', $s);
preg_match('~(?i:foo)(?i)bar~', $s);
preg_match('~(?i:(?m:fo(?-i)o))(?mi)x(?-mi)bar~', $s);
preg_match('~(?i-i)(?i)~', $s);
preg_match('~(?:(?i)foo)(?i)x(?-i)~', $s);
preg_match('~(?-i)foo~i', $s);
}
function goodAnchors($s) {
preg_match('~^~', $s);
preg_match('~^foo~', $s);
preg_match('~^foo?|bar~', $s);
preg_match('~^foo|^bar~', $s);
preg_match('~(^a|^b)~', $s);
preg_match('~(?i)^foo~', $s);
preg_match('~(?i)((?m)a|^foo)b~', $s);
preg_match('~(?i)(?m)\bfoo|bar|^baz~', $s);
preg_match('~(?i)^(?m)foo|bar|baz~', $s);
preg_match('~(?i:foo|((?:f|^b|(foo|^bar))))~', $s);
preg_match('~(?i)^(?m)foo|bar|^baz~', $s);
preg_match('~(?i)(?:)(^| )\S+~', $s);
}
`)
test.RunAndMatch()
}

0 comments on commit d9b6dda

Please sign in to comment.