Skip to content

Commit

Permalink
encoding/jsonschema: check regexp syntax up front
Browse files Browse the repository at this point in the history
Issue #3555 shows that invalid regular expressions are ignored
in some circumstances. To avoid differences between evalv2 and evalv3
(and to enable better behaviour when `strictFeatures` is disabled),
check all regular expressions up front rather than relying on the
evaluator to detect them later.

Also detect invalid character classes specifically because that's
something which is technically a missing feature rather than
an invalid regexp.

Signed-off-by: Roger Peppe <[email protected]>
Change-Id: I616c991bccd82ddcb9c2070404b352f686f58aeb
Reviewed-on: https://review.gerrithub.io/c/cue-lang/cue/+/1203589
TryBot-Result: CUEcueckoo <[email protected]>
Reviewed-by: Daniel Martí <[email protected]>
  • Loading branch information
rogpeppe committed Nov 5, 2024
1 parent 85f5ca8 commit 1476fc7
Show file tree
Hide file tree
Showing 8 changed files with 224 additions and 95 deletions.
11 changes: 10 additions & 1 deletion encoding/jsonschema/constraints_object.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,18 @@ func constraintPatternProperties(key string, n cue.Value, s *state) {
obj := s.object(n)
existing := excludeFields(s.obj.Elts)
s.processMap(n, func(key string, n cue.Value) {
// [!~(properties) & pattern]: schema
if !s.checkRegexp(n, key) {
return
}

// Record the pattern for potential use by
// additionalProperties because patternProperties are
// considered before additionalProperties.
s.patterns = append(s.patterns,
&ast.UnaryExpr{Op: token.NMAT, X: ast.NewString(key)})

// We'll make a pattern constraint of the form:
// [pattern & !~(properties)]: schema
f := internal.EmbedStruct(ast.NewStruct(&ast.Field{
Label: ast.NewList(ast.NewBinExpr(
token.AND,
Expand Down
46 changes: 33 additions & 13 deletions encoding/jsonschema/decode.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,8 +171,9 @@ func (d *decoder) schema(ref []ast.Label, v cue.Value) (a []ast.Decl) {
a = append(a, &ast.EmbedDecl{Expr: expr})
}

state.doc(a[0])

if len(a) > 0 {
state.doc(a[0])
}
for i := inner - 1; i >= 0; i-- {
a = []ast.Decl{&ast.Field{
Label: ref[i],
Expand Down Expand Up @@ -267,24 +268,43 @@ func (d *decoder) regexpValue(n cue.Value) (ast.Expr, bool) {
if !ok {
return nil, false
}
if !d.checkRegexp(n, s) {
return nil, false
}
return d.string(n), true
}

func (d *decoder) checkRegexp(n cue.Value, s string) bool {
_, err := syntax.Parse(s, syntax.Perl)
if err == nil {
return d.string(n), true
return true
}
var regErr *syntax.Error
if errors.As(err, &regErr) && regErr.Code == syntax.ErrInvalidPerlOp {
// It's Perl syntax that we'll never support because the CUE evaluation
// engine uses Go's regexp implementation and because the missing
// features are usually not there for good reason (e.g. exponential
// runtime). In other words, this is a missing feature but not an invalid
// regular expression as such.
if d.cfg.StrictFeatures {
d.errf(n, "unsupported Perl regexp syntax in %q: %v", s, err)
if errors.As(err, &regErr) {
switch regErr.Code {
case syntax.ErrInvalidPerlOp:
// It's Perl syntax that we'll never support because the CUE evaluation
// engine uses Go's regexp implementation and because the missing
// features are usually not there for good reason (e.g. exponential
// runtime). In other words, this is a missing feature but not an invalid
// regular expression as such.
if d.cfg.StrictFeatures {
d.errf(n, "unsupported Perl regexp syntax in %q: %v", s, err)
}
return false
case syntax.ErrInvalidCharRange:
// There are many more character class ranges than Go supports currently
// (see https://go.dev/issue/14509) so treat an unknown character class
// range as a feature error rather than a bad regexp.
// TODO translate names to Go-supported class names when possible.
if d.cfg.StrictFeatures {
d.errf(n, "unsupported regexp character class in %q: %v", s, err)
}
return false
}
return nil, false
}
d.errf(n, "invalid regexp %q: %v", s, err)
return nil, false
return false
}

// const draftCutoff = 5
Expand Down
12 changes: 6 additions & 6 deletions encoding/jsonschema/external_teststats.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@ v3:
Optional tests

v2:
schema extract (pass / total): 230 / 274 = 83.9%
tests (pass / total): 1498 / 2372 = 63.2%
tests on extracted schemas (pass / total): 1498 / 2258 = 66.3%
schema extract (pass / total): 220 / 274 = 80.3%
tests (pass / total): 1483 / 2372 = 62.5%
tests on extracted schemas (pass / total): 1483 / 2223 = 66.7%

v3:
schema extract (pass / total): 230 / 274 = 83.9%
tests (pass / total): 1498 / 2372 = 63.2%
tests on extracted schemas (pass / total): 1498 / 2258 = 66.3%
schema extract (pass / total): 220 / 274 = 80.3%
tests (pass / total): 1483 / 2372 = 62.5%
tests on extracted schemas (pass / total): 1483 / 2223 = 66.7%
Original file line number Diff line number Diff line change
Expand Up @@ -370,8 +370,8 @@
"pattern": "\\p{Letter}cole"
},
"skip": {
"v2": "extract error: invalid regexp \"\\\\p{Letter}cole\": error parsing regexp: invalid character class range: `\\p{Letter}`",
"v3": "extract error: invalid regexp \"\\\\p{Letter}cole\": error parsing regexp: invalid character class range: `\\p{Letter}`"
"v2": "extract error: unsupported regexp character class in \"\\\\p{Letter}cole\": error parsing regexp: invalid character class range: `\\p{Letter}`",
"v3": "extract error: unsupported regexp character class in \"\\\\p{Letter}cole\": error parsing regexp: invalid character class range: `\\p{Letter}`"
},
"tests": [
{
Expand Down Expand Up @@ -496,8 +496,8 @@
"pattern": "^\\p{digit}+$"
},
"skip": {
"v2": "extract error: invalid regexp \"^\\\\p{digit}+$\": error parsing regexp: invalid character class range: `\\p{digit}`",
"v3": "extract error: invalid regexp \"^\\\\p{digit}+$\": error parsing regexp: invalid character class range: `\\p{digit}`"
"v2": "extract error: unsupported regexp character class in \"^\\\\p{digit}+$\": error parsing regexp: invalid character class range: `\\p{digit}`",
"v3": "extract error: unsupported regexp character class in \"^\\\\p{digit}+$\": error parsing regexp: invalid character class range: `\\p{digit}`"
},
"tests": [
{
Expand Down Expand Up @@ -539,6 +539,10 @@
},
"additionalProperties": false
},
"skip": {
"v2": "extract error: unsupported regexp character class in \"\\\\p{Letter}cole\": error parsing regexp: invalid character class range: `\\p{Letter}`",
"v3": "extract error: unsupported regexp character class in \"\\\\p{Letter}cole\": error parsing regexp: invalid character class range: `\\p{Letter}`"
},
"tests": [
{
"description": "ascii character in json string",
Expand All @@ -547,23 +551,31 @@
},
"valid": true,
"skip": {
"v2": "\"l'ecole\": invalid regexp: error parsing regexp: invalid character class range: `\\p{Letter}`\n",
"v3": "invalid regexp: error parsing regexp: invalid character class range: `\\p{Letter}`:\n generated.cue:3:25\n"
"v2": "could not compile schema",
"v3": "could not compile schema"
}
},
{
"description": "literal unicode character in json string",
"data": {
"l'école": "pas de vraie vie"
},
"valid": true
"valid": true,
"skip": {
"v2": "could not compile schema",
"v3": "could not compile schema"
}
},
{
"description": "unicode character in hex format in string",
"data": {
"l'école": "pas de vraie vie"
},
"valid": true
"valid": true,
"skip": {
"v2": "could not compile schema",
"v3": "could not compile schema"
}
},
{
"description": "unicode matching is case-sensitive",
Expand All @@ -572,8 +584,8 @@
},
"valid": false,
"skip": {
"v2": "unexpected success",
"v3": "unexpected success"
"v2": "could not compile schema",
"v3": "could not compile schema"
}
}
]
Expand Down Expand Up @@ -725,6 +737,10 @@
},
"additionalProperties": false
},
"skip": {
"v2": "extract error: unsupported regexp character class in \"^\\\\p{digit}+$\": error parsing regexp: invalid character class range: `\\p{digit}`",
"v3": "extract error: unsupported regexp character class in \"^\\\\p{digit}+$\": error parsing regexp: invalid character class range: `\\p{digit}`"
},
"tests": [
{
"description": "ascii digits",
Expand All @@ -733,8 +749,8 @@
},
"valid": true,
"skip": {
"v2": "\"42\": invalid regexp: error parsing regexp: invalid character class range: `\\p{digit}`\n",
"v3": "invalid regexp: error parsing regexp: invalid character class range: `\\p{digit}`:\n generated.cue:3:23\n"
"v2": "could not compile schema",
"v3": "could not compile schema"
}
},
{
Expand All @@ -744,16 +760,20 @@
},
"valid": false,
"skip": {
"v2": "unexpected success",
"v3": "unexpected success"
"v2": "could not compile schema",
"v3": "could not compile schema"
}
},
{
"description": "non-ascii digits (BENGALI DIGIT FOUR, BENGALI DIGIT TWO)",
"data": {
"৪২": "khajit has wares if you have coin"
},
"valid": true
"valid": true,
"skip": {
"v2": "could not compile schema",
"v3": "could not compile schema"
}
}
]
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -370,8 +370,8 @@
"pattern": "\\p{Letter}cole"
},
"skip": {
"v2": "extract error: invalid regexp \"\\\\p{Letter}cole\": error parsing regexp: invalid character class range: `\\p{Letter}`",
"v3": "extract error: invalid regexp \"\\\\p{Letter}cole\": error parsing regexp: invalid character class range: `\\p{Letter}`"
"v2": "extract error: unsupported regexp character class in \"\\\\p{Letter}cole\": error parsing regexp: invalid character class range: `\\p{Letter}`",
"v3": "extract error: unsupported regexp character class in \"\\\\p{Letter}cole\": error parsing regexp: invalid character class range: `\\p{Letter}`"
},
"tests": [
{
Expand Down Expand Up @@ -514,8 +514,8 @@
"pattern": "^\\p{digit}+$"
},
"skip": {
"v2": "extract error: invalid regexp \"^\\\\p{digit}+$\": error parsing regexp: invalid character class range: `\\p{digit}`",
"v3": "extract error: invalid regexp \"^\\\\p{digit}+$\": error parsing regexp: invalid character class range: `\\p{digit}`"
"v2": "extract error: unsupported regexp character class in \"^\\\\p{digit}+$\": error parsing regexp: invalid character class range: `\\p{digit}`",
"v3": "extract error: unsupported regexp character class in \"^\\\\p{digit}+$\": error parsing regexp: invalid character class range: `\\p{digit}`"
},
"tests": [
{
Expand Down Expand Up @@ -557,6 +557,10 @@
},
"additionalProperties": false
},
"skip": {
"v2": "extract error: unsupported regexp character class in \"\\\\p{Letter}cole\": error parsing regexp: invalid character class range: `\\p{Letter}`",
"v3": "extract error: unsupported regexp character class in \"\\\\p{Letter}cole\": error parsing regexp: invalid character class range: `\\p{Letter}`"
},
"tests": [
{
"description": "ascii character in json string",
Expand All @@ -565,23 +569,31 @@
},
"valid": true,
"skip": {
"v2": "\"l'ecole\": invalid regexp: error parsing regexp: invalid character class range: `\\p{Letter}`\n",
"v3": "invalid regexp: error parsing regexp: invalid character class range: `\\p{Letter}`:\n generated.cue:3:25\n"
"v2": "could not compile schema",
"v3": "could not compile schema"
}
},
{
"description": "literal unicode character in json string",
"data": {
"l'école": "pas de vraie vie"
},
"valid": true
"valid": true,
"skip": {
"v2": "could not compile schema",
"v3": "could not compile schema"
}
},
{
"description": "unicode character in hex format in string",
"data": {
"l'école": "pas de vraie vie"
},
"valid": true
"valid": true,
"skip": {
"v2": "could not compile schema",
"v3": "could not compile schema"
}
},
{
"description": "unicode matching is case-sensitive",
Expand All @@ -590,8 +602,8 @@
},
"valid": false,
"skip": {
"v2": "unexpected success",
"v3": "unexpected success"
"v2": "could not compile schema",
"v3": "could not compile schema"
}
}
]
Expand Down Expand Up @@ -743,6 +755,10 @@
},
"additionalProperties": false
},
"skip": {
"v2": "extract error: unsupported regexp character class in \"^\\\\p{digit}+$\": error parsing regexp: invalid character class range: `\\p{digit}`",
"v3": "extract error: unsupported regexp character class in \"^\\\\p{digit}+$\": error parsing regexp: invalid character class range: `\\p{digit}`"
},
"tests": [
{
"description": "ascii digits",
Expand All @@ -751,8 +767,8 @@
},
"valid": true,
"skip": {
"v2": "\"42\": invalid regexp: error parsing regexp: invalid character class range: `\\p{digit}`\n",
"v3": "invalid regexp: error parsing regexp: invalid character class range: `\\p{digit}`:\n generated.cue:3:23\n"
"v2": "could not compile schema",
"v3": "could not compile schema"
}
},
{
Expand All @@ -762,16 +778,20 @@
},
"valid": false,
"skip": {
"v2": "unexpected success",
"v3": "unexpected success"
"v2": "could not compile schema",
"v3": "could not compile schema"
}
},
{
"description": "non-ascii digits (BENGALI DIGIT FOUR, BENGALI DIGIT TWO)",
"data": {
"৪২": "khajit has wares if you have coin"
},
"valid": true
"valid": true,
"skip": {
"v2": "could not compile schema",
"v3": "could not compile schema"
}
}
]
}
Expand Down
Loading

0 comments on commit 1476fc7

Please sign in to comment.