encoding/jsonschema: check regexp syntax up front

Issue #3555 shows that invalid regular expressions are ignored in some circumstances. To avoid differences between evalv2 and evalv3 (and to enable better behaviour when `strictFeatures` is disabled), check all regular expressions up front rather than relying on the evaluator to detect them later. Also detect invalid character classes specifically because that's something which is technically a missing feature rather than an invalid regexp. Signed-off-by: Roger Peppe <[email protected]> Change-Id: I616c991bccd82ddcb9c2070404b352f686f58aeb Reviewed-on: https://review.gerrithub.io/c/cue-lang/cue/+/1203589 TryBot-Result: CUEcueckoo <[email protected]> Reviewed-by: Daniel Martí <[email protected]>
cue-lang · Nov 5, 2024 · 1476fc7 · 1476fc7
1 parent 85f5ca8
commit 1476fc7
Show file tree

Hide file tree

Showing 8 changed files with 224 additions and 95 deletions.
diff --git a/encoding/jsonschema/constraints_object.go b/encoding/jsonschema/constraints_object.go
@@ -83,9 +83,18 @@ func constraintPatternProperties(key string, n cue.Value, s *state) {
 	obj := s.object(n)
 	existing := excludeFields(s.obj.Elts)
 	s.processMap(n, func(key string, n cue.Value) {
-		// [!~(properties) & pattern]: schema
+		if !s.checkRegexp(n, key) {
+			return
+		}
+
+		// Record the pattern for potential use by
+		// additionalProperties because patternProperties are
+		// considered before additionalProperties.
 		s.patterns = append(s.patterns,
 			&ast.UnaryExpr{Op: token.NMAT, X: ast.NewString(key)})
+
+		// We'll make a pattern constraint of the form:
+		// 	[pattern & !~(properties)]: schema
 		f := internal.EmbedStruct(ast.NewStruct(&ast.Field{
 			Label: ast.NewList(ast.NewBinExpr(
 				token.AND,

diff --git a/encoding/jsonschema/decode.go b/encoding/jsonschema/decode.go
@@ -171,8 +171,9 @@ func (d *decoder) schema(ref []ast.Label, v cue.Value) (a []ast.Decl) {
 		a = append(a, &ast.EmbedDecl{Expr: expr})
 	}
 
-	state.doc(a[0])
-
+	if len(a) > 0 {
+		state.doc(a[0])
+	}
 	for i := inner - 1; i >= 0; i-- {
 		a = []ast.Decl{&ast.Field{
 			Label: ref[i],
@@ -267,24 +268,43 @@ func (d *decoder) regexpValue(n cue.Value) (ast.Expr, bool) {
 	if !ok {
 		return nil, false
 	}
+	if !d.checkRegexp(n, s) {
+		return nil, false
+	}
+	return d.string(n), true
+}
+
+func (d *decoder) checkRegexp(n cue.Value, s string) bool {
 	_, err := syntax.Parse(s, syntax.Perl)
 	if err == nil {
-		return d.string(n), true
+		return true
 	}
 	var regErr *syntax.Error
-	if errors.As(err, &regErr) && regErr.Code == syntax.ErrInvalidPerlOp {
-		// It's Perl syntax that we'll never support because the CUE evaluation
-		// engine uses Go's regexp implementation and because the missing
-		// features are usually not there for good reason (e.g. exponential
-		// runtime). In other words, this is a missing feature but not an invalid
-		// regular expression as such.
-		if d.cfg.StrictFeatures {
-			d.errf(n, "unsupported Perl regexp syntax in %q: %v", s, err)
+	if errors.As(err, &regErr) {
+		switch regErr.Code {
+		case syntax.ErrInvalidPerlOp:
+			// It's Perl syntax that we'll never support because the CUE evaluation
+			// engine uses Go's regexp implementation and because the missing
+			// features are usually not there for good reason (e.g. exponential
+			// runtime). In other words, this is a missing feature but not an invalid
+			// regular expression as such.
+			if d.cfg.StrictFeatures {
+				d.errf(n, "unsupported Perl regexp syntax in %q: %v", s, err)
+			}
+			return false
+		case syntax.ErrInvalidCharRange:
+			// There are many more character class ranges than Go supports currently
+			// (see https://go.dev/issue/14509) so treat an unknown character class
+			// range as a feature error rather than a bad regexp.
+			// TODO translate names to Go-supported class names when possible.
+			if d.cfg.StrictFeatures {
+				d.errf(n, "unsupported regexp character class in %q: %v", s, err)
+			}
+			return false
 		}
-		return nil, false
 	}
 	d.errf(n, "invalid regexp %q: %v", s, err)
-	return nil, false
+	return false
 }
 
 // const draftCutoff = 5

diff --git a/encoding/jsonschema/external_teststats.txt b/encoding/jsonschema/external_teststats.txt
@@ -12,11 +12,11 @@ v3:
 Optional tests
 
 v2:
-	schema extract (pass / total): 230 / 274 = 83.9%
-	tests (pass / total): 1498 / 2372 = 63.2%
-	tests on extracted schemas (pass / total): 1498 / 2258 = 66.3%
+	schema extract (pass / total): 220 / 274 = 80.3%
+	tests (pass / total): 1483 / 2372 = 62.5%
+	tests on extracted schemas (pass / total): 1483 / 2223 = 66.7%
 
 v3:
-	schema extract (pass / total): 230 / 274 = 83.9%
-	tests (pass / total): 1498 / 2372 = 63.2%
-	tests on extracted schemas (pass / total): 1498 / 2258 = 66.3%
+	schema extract (pass / total): 220 / 274 = 80.3%
+	tests (pass / total): 1483 / 2372 = 62.5%
+	tests on extracted schemas (pass / total): 1483 / 2223 = 66.7%
diff --git a/encoding/jsonschema/testdata/external/tests/draft2019-09/optional/ecmascript-regex.json b/encoding/jsonschema/testdata/external/tests/draft2019-09/optional/ecmascript-regex.json
@@ -370,8 +370,8 @@
 			"pattern": "\\p{Letter}cole"
 		},
 		"skip": {
-			"v2": "extract error: invalid regexp \"\\\\p{Letter}cole\": error parsing regexp: invalid character class range: `\\p{Letter}`",
-			"v3": "extract error: invalid regexp \"\\\\p{Letter}cole\": error parsing regexp: invalid character class range: `\\p{Letter}`"
+			"v2": "extract error: unsupported regexp character class in \"\\\\p{Letter}cole\": error parsing regexp: invalid character class range: `\\p{Letter}`",
+			"v3": "extract error: unsupported regexp character class in \"\\\\p{Letter}cole\": error parsing regexp: invalid character class range: `\\p{Letter}`"
 		},
 		"tests": [
 			{
@@ -496,8 +496,8 @@
 			"pattern": "^\\p{digit}+$"
 		},
 		"skip": {
-			"v2": "extract error: invalid regexp \"^\\\\p{digit}+$\": error parsing regexp: invalid character class range: `\\p{digit}`",
-			"v3": "extract error: invalid regexp \"^\\\\p{digit}+$\": error parsing regexp: invalid character class range: `\\p{digit}`"
+			"v2": "extract error: unsupported regexp character class in \"^\\\\p{digit}+$\": error parsing regexp: invalid character class range: `\\p{digit}`",
+			"v3": "extract error: unsupported regexp character class in \"^\\\\p{digit}+$\": error parsing regexp: invalid character class range: `\\p{digit}`"
 		},
 		"tests": [
 			{
@@ -539,6 +539,10 @@
 			},
 			"additionalProperties": false
 		},
+		"skip": {
+			"v2": "extract error: unsupported regexp character class in \"\\\\p{Letter}cole\": error parsing regexp: invalid character class range: `\\p{Letter}`",
+			"v3": "extract error: unsupported regexp character class in \"\\\\p{Letter}cole\": error parsing regexp: invalid character class range: `\\p{Letter}`"
+		},
 		"tests": [
 			{
 				"description": "ascii character in json string",
@@ -547,23 +551,31 @@
 				},
 				"valid": true,
 				"skip": {
-					"v2": "\"l'ecole\": invalid regexp: error parsing regexp: invalid character class range: `\\p{Letter}`\n",
-					"v3": "invalid regexp: error parsing regexp: invalid character class range: `\\p{Letter}`:\n    generated.cue:3:25\n"
+					"v2": "could not compile schema",
+					"v3": "could not compile schema"
 				}
 			},
 			{
 				"description": "literal unicode character in json string",
 				"data": {
 					"l'école": "pas de vraie vie"
 				},
-				"valid": true
+				"valid": true,
+				"skip": {
+					"v2": "could not compile schema",
+					"v3": "could not compile schema"
+				}
 			},
 			{
 				"description": "unicode character in hex format in string",
 				"data": {
 					"l'école": "pas de vraie vie"
 				},
-				"valid": true
+				"valid": true,
+				"skip": {
+					"v2": "could not compile schema",
+					"v3": "could not compile schema"
+				}
 			},
 			{
 				"description": "unicode matching is case-sensitive",
@@ -572,8 +584,8 @@
 				},
 				"valid": false,
 				"skip": {
-					"v2": "unexpected success",
-					"v3": "unexpected success"
+					"v2": "could not compile schema",
+					"v3": "could not compile schema"
 				}
 			}
 		]
@@ -725,6 +737,10 @@
 			},
 			"additionalProperties": false
 		},
+		"skip": {
+			"v2": "extract error: unsupported regexp character class in \"^\\\\p{digit}+$\": error parsing regexp: invalid character class range: `\\p{digit}`",
+			"v3": "extract error: unsupported regexp character class in \"^\\\\p{digit}+$\": error parsing regexp: invalid character class range: `\\p{digit}`"
+		},
 		"tests": [
 			{
 				"description": "ascii digits",
@@ -733,8 +749,8 @@
 				},
 				"valid": true,
 				"skip": {
-					"v2": "\"42\": invalid regexp: error parsing regexp: invalid character class range: `\\p{digit}`\n",
-					"v3": "invalid regexp: error parsing regexp: invalid character class range: `\\p{digit}`:\n    generated.cue:3:23\n"
+					"v2": "could not compile schema",
+					"v3": "could not compile schema"
 				}
 			},
 			{
@@ -744,16 +760,20 @@
 				},
 				"valid": false,
 				"skip": {
-					"v2": "unexpected success",
-					"v3": "unexpected success"
+					"v2": "could not compile schema",
+					"v3": "could not compile schema"
 				}
 			},
 			{
 				"description": "non-ascii digits (BENGALI DIGIT FOUR, BENGALI DIGIT TWO)",
 				"data": {
 					"৪২": "khajit has wares if you have coin"
 				},
-				"valid": true
+				"valid": true,
+				"skip": {
+					"v2": "could not compile schema",
+					"v3": "could not compile schema"
+				}
 			}
 		]
 	}

diff --git a/encoding/jsonschema/testdata/external/tests/draft2020-12/optional/ecmascript-regex.json b/encoding/jsonschema/testdata/external/tests/draft2020-12/optional/ecmascript-regex.json
@@ -370,8 +370,8 @@
 			"pattern": "\\p{Letter}cole"
 		},
 		"skip": {
-			"v2": "extract error: invalid regexp \"\\\\p{Letter}cole\": error parsing regexp: invalid character class range: `\\p{Letter}`",
-			"v3": "extract error: invalid regexp \"\\\\p{Letter}cole\": error parsing regexp: invalid character class range: `\\p{Letter}`"
+			"v2": "extract error: unsupported regexp character class in \"\\\\p{Letter}cole\": error parsing regexp: invalid character class range: `\\p{Letter}`",
+			"v3": "extract error: unsupported regexp character class in \"\\\\p{Letter}cole\": error parsing regexp: invalid character class range: `\\p{Letter}`"
 		},
 		"tests": [
 			{
@@ -514,8 +514,8 @@
 			"pattern": "^\\p{digit}+$"
 		},
 		"skip": {
-			"v2": "extract error: invalid regexp \"^\\\\p{digit}+$\": error parsing regexp: invalid character class range: `\\p{digit}`",
-			"v3": "extract error: invalid regexp \"^\\\\p{digit}+$\": error parsing regexp: invalid character class range: `\\p{digit}`"
+			"v2": "extract error: unsupported regexp character class in \"^\\\\p{digit}+$\": error parsing regexp: invalid character class range: `\\p{digit}`",
+			"v3": "extract error: unsupported regexp character class in \"^\\\\p{digit}+$\": error parsing regexp: invalid character class range: `\\p{digit}`"
 		},
 		"tests": [
 			{
@@ -557,6 +557,10 @@
 			},
 			"additionalProperties": false
 		},
+		"skip": {
+			"v2": "extract error: unsupported regexp character class in \"\\\\p{Letter}cole\": error parsing regexp: invalid character class range: `\\p{Letter}`",
+			"v3": "extract error: unsupported regexp character class in \"\\\\p{Letter}cole\": error parsing regexp: invalid character class range: `\\p{Letter}`"
+		},
 		"tests": [
 			{
 				"description": "ascii character in json string",
@@ -565,23 +569,31 @@
 				},
 				"valid": true,
 				"skip": {
-					"v2": "\"l'ecole\": invalid regexp: error parsing regexp: invalid character class range: `\\p{Letter}`\n",
-					"v3": "invalid regexp: error parsing regexp: invalid character class range: `\\p{Letter}`:\n    generated.cue:3:25\n"
+					"v2": "could not compile schema",
+					"v3": "could not compile schema"
 				}
 			},
 			{
 				"description": "literal unicode character in json string",
 				"data": {
 					"l'école": "pas de vraie vie"
 				},
-				"valid": true
+				"valid": true,
+				"skip": {
+					"v2": "could not compile schema",
+					"v3": "could not compile schema"
+				}
 			},
 			{
 				"description": "unicode character in hex format in string",
 				"data": {
 					"l'école": "pas de vraie vie"
 				},
-				"valid": true
+				"valid": true,
+				"skip": {
+					"v2": "could not compile schema",
+					"v3": "could not compile schema"
+				}
 			},
 			{
 				"description": "unicode matching is case-sensitive",
@@ -590,8 +602,8 @@
 				},
 				"valid": false,
 				"skip": {
-					"v2": "unexpected success",
-					"v3": "unexpected success"
+					"v2": "could not compile schema",
+					"v3": "could not compile schema"
 				}
 			}
 		]
@@ -743,6 +755,10 @@
 			},
 			"additionalProperties": false
 		},
+		"skip": {
+			"v2": "extract error: unsupported regexp character class in \"^\\\\p{digit}+$\": error parsing regexp: invalid character class range: `\\p{digit}`",
+			"v3": "extract error: unsupported regexp character class in \"^\\\\p{digit}+$\": error parsing regexp: invalid character class range: `\\p{digit}`"
+		},
 		"tests": [
 			{
 				"description": "ascii digits",
@@ -751,8 +767,8 @@
 				},
 				"valid": true,
 				"skip": {
-					"v2": "\"42\": invalid regexp: error parsing regexp: invalid character class range: `\\p{digit}`\n",
-					"v3": "invalid regexp: error parsing regexp: invalid character class range: `\\p{digit}`:\n    generated.cue:3:23\n"
+					"v2": "could not compile schema",
+					"v3": "could not compile schema"
 				}
 			},
 			{
@@ -762,16 +778,20 @@
 				},
 				"valid": false,
 				"skip": {
-					"v2": "unexpected success",
-					"v3": "unexpected success"
+					"v2": "could not compile schema",
+					"v3": "could not compile schema"
 				}
 			},
 			{
 				"description": "non-ascii digits (BENGALI DIGIT FOUR, BENGALI DIGIT TWO)",
 				"data": {
 					"৪২": "khajit has wares if you have coin"
 				},
-				"valid": true
+				"valid": true,
+				"skip": {
+					"v2": "could not compile schema",
+					"v3": "could not compile schema"
+				}
 			}
 		]
 	}