diff --git a/Sources/_MatchingEngine/Regex/AST/MatchingOptions.swift b/Sources/_MatchingEngine/Regex/AST/MatchingOptions.swift index dd331ca3e..115a28af1 100644 --- a/Sources/_MatchingEngine/Regex/AST/MatchingOptions.swift +++ b/Sources/_MatchingEngine/Regex/AST/MatchingOptions.swift @@ -36,6 +36,11 @@ extension AST { // be unset, only flipped between) case textSegmentGraphemeMode // y{g} case textSegmentWordMode // y{w} + + // Swift semantic matching level + case graphemeClusterSemantics // X + case unicodeScalarSemantics // u + case byteSemantics // b } public var kind: Kind public var location: SourceLocation @@ -53,6 +58,15 @@ extension AST { return false } } + + public var isSemanticMatchingLevel: Bool { + switch kind { + case .graphemeClusterSemantics, .unicodeScalarSemantics, .byteSemantics: + return true + default: + return false + } + } } /// A sequence of matching options written in source. diff --git a/Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift b/Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift index 6740d6b90..3f80cb7a3 100644 --- a/Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift +++ b/Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift @@ -65,6 +65,7 @@ enum ParseError: Error, Hashable { case identifierCannotStartWithNumber(IdentifierKind) case cannotRemoveTextSegmentOptions + case cannotRemoveSemanticsOptions case expectedCalloutArgument } @@ -145,6 +146,8 @@ extension ParseError: CustomStringConvertible { return "\(i.diagDescription) must not start with number" case .cannotRemoveTextSegmentOptions: return "text segment mode cannot be unset, only changed" + case .cannotRemoveSemanticsOptions: + return "semantic level cannot be unset, only changed" case .expectedCalloutArgument: return "expected argument to callout" } diff --git a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift index 39c7ad346..369b7fd5a 100644 --- 
a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift @@ -574,6 +574,11 @@ extension Source { try src.expect("}") return opt + // Swift semantic level options + case "X": return advanceAndReturn(.graphemeClusterSemantics) + case "u": return advanceAndReturn(.unicodeScalarSemantics) + case "b": return advanceAndReturn(.byteSemantics) + default: return nil } @@ -618,6 +623,10 @@ extension Source { if opt.isTextSegmentMode { throw ParseError.cannotRemoveTextSegmentOptions } + // Matching semantics options can only be added, not removed. + if opt.isSemanticMatchingLevel { + throw ParseError.cannotRemoveSemanticsOptions + } removing.append(opt) } return .init(caretLoc: nil, adding: adding, minusLoc: ateMinus.location, diff --git a/Sources/_StringProcessing/CharacterClass.swift b/Sources/_StringProcessing/CharacterClass.swift index ce9d6242f..b51c2018d 100644 --- a/Sources/_StringProcessing/CharacterClass.swift +++ b/Sources/_StringProcessing/CharacterClass.swift @@ -338,12 +338,19 @@ extension AST.Atom { switch kind { case let .escaped(b): return b.characterClass - case .any: return .any - case .property: // TODO: Would our model type for character classes include // this? Or does grapheme-semantic mode complicate that? return nil + + case .any: + // `.any` is handled in the matching engine by Compiler.emitAny() and in + // the legacy compiler by the `.any` instruction, which can provide lower + // level instructions than the CharacterClass-generated consumer closure + // + // FIXME: We shouldn't be returning `nil` here, but instead fixing the call + // site to check for any before trying to construct a character class. 
+ return nil default: return nil diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 02ff8334e..f13b24e3a 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -18,18 +18,13 @@ struct RegexProgram { class Compiler { let ast: AST - let matchLevel: CharacterClass.MatchLevel - let options: REOptions + private var options = MatchingOptions() private var builder = RegexProgram.Program.Builder() init( - ast: AST, - matchLevel: CharacterClass.MatchLevel = .graphemeCluster, - options: REOptions = [] + ast: AST ) { self.ast = ast - self.matchLevel = matchLevel - self.options = options } __consuming func emit() throws -> RegexProgram { @@ -42,11 +37,9 @@ class Compiler { func emit(_ node: AST) throws { switch node { - // Any: . - // consume 1 - case .atom(let a) where a.kind == .any && matchLevel == .graphemeCluster: - builder.buildAdvance(1) - + case .atom(let a) where a.kind == .any: + try emitAny() + // Single characters we just match case .atom(let a) where a.singleCharacter != nil : builder.buildMatch(a.singleCharacter!) @@ -97,6 +90,9 @@ class Compiler { throw unsupported(node.renderAsCanonical()) case .group(let g): + options.beginScope() + defer { options.endScope() } + if let lookaround = g.lookaroundKind { try emitLookaround(lookaround, g.child) return @@ -113,6 +109,10 @@ class Compiler { try emit(g.child) builder.buildEndCapture(cap) + case .changeMatchingOptions(let optionSequence, _): + options.apply(optionSequence) + try emit(g.child) + default: // FIXME: Other kinds... try emit(g.child) @@ -124,8 +124,8 @@ class Compiler { // For now, we model sets and atoms as consumers. // This lets us rapidly expand support, and we can better // design the actual instruction set with real examples - case _ where try node.generateConsumer(matchLevel) != nil: - try builder.buildConsume(by: node.generateConsumer(matchLevel)!) 
+ case _ where try node.generateConsumer(options) != nil: + try builder.buildConsume(by: node.generateConsumer(options)!) case .quote(let q): // We stick quoted content into read-only constant strings @@ -158,6 +158,31 @@ class Compiler { throw unsupported(node.renderAsCanonical()) } } + + func emitAny() throws { + switch (options.semanticLevel, options.dotMatchesNewline) { + case (.graphemeCluster, true): + builder.buildAdvance(1) + case (.graphemeCluster, false): + builder.buildConsume { input, bounds in + input[bounds.lowerBound].isNewline + ? nil + : input.index(after: bounds.lowerBound) + } + + case (.unicodeScalar, true): + // TODO: builder.buildAdvanceUnicodeScalar(1) + builder.buildConsume { input, bounds in + input.unicodeScalars.index(after: bounds.lowerBound) + } + case (.unicodeScalar, false): + builder.buildConsume { input, bounds in + input[bounds.lowerBound].isNewline + ? nil + : input.unicodeScalars.index(after: bounds.lowerBound) + } + } + } func emitAssertion(_ kind: AST.Atom.AssertionKind) throws { // FIXME: Depends on API model we have... We may want to @@ -458,7 +483,18 @@ class Compiler { func emitQuantification(_ quant: AST.Quantification) throws { let child = quant.child - let kind = quant.kind.value + + // If in reluctant-by-default mode, eager and reluctant need to be switched. + let kind: AST.Quantification.Kind + if options.isReluctantByDefault + && quant.kind.value != .possessive + { + kind = quant.kind.value == .eager + ? 
.reluctant + : .eager + } else { + kind = quant.kind.value + } switch quant.amount.value.bounds { case (_, atMost: 0): diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index a54bd4c33..1b0dd3bec 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -28,12 +28,8 @@ func unsupported( file: StaticString = #file, line: UInt = #line ) -> Unsupported { - // TODO: how do we not have a public init for this? - let fStr = file.withUTF8Buffer { - String(decoding: $0, as: UTF8.self) - } return Unsupported( - message: s, file: fStr, line: Int(line)) + message: s, file: String(describing: file), line: Int(line)) } extension AST { @@ -42,8 +38,7 @@ extension AST { /// A consumer is a Swift closure that matches against /// the front of an input range func generateConsumer( - // TODO: Better option modeling - _ opts: CharacterClass.MatchLevel + _ opts: MatchingOptions ) throws -> Program.ConsumeFunction? { switch self { case .atom(let a): @@ -77,10 +72,10 @@ extension AST.Atom { } func generateConsumer( - _ opts: CharacterClass.MatchLevel + _ opts: MatchingOptions ) throws -> Program.ConsumeFunction? { // TODO: Wean ourselves off of this type... - if let cc = self.characterClass?.withMatchLevel(opts) { + if let cc = self.characterClass?.withMatchLevel(opts.matchLevel) { return { input, bounds in // FIXME: should we worry about out of bounds? cc.matches(in: input, at: bounds.lowerBound) @@ -109,10 +104,16 @@ extension AST.Atom { // TODO: alias? casing? 
$0.name == name || $0.nameAlias == name } + + case .any: + fatalError(".atom(.any) is handled in emitAny") + case .startOfLine, .endOfLine: + // handled in emitAssertion + return nil + case .escaped, .keyboardControl, .keyboardMeta, .keyboardMetaControl, - .any, .startOfLine, .endOfLine, - .backreference, .subpattern, .callout, .backtrackingDirective: + .backreference, .subpattern, .callout, .backtrackingDirective: // FIXME: implement return nil } @@ -121,7 +122,7 @@ extension AST.Atom { extension AST.CustomCharacterClass.Member { func generateConsumer( - _ opts: CharacterClass.MatchLevel + _ opts: MatchingOptions ) throws -> Program.ConsumeFunction { switch self { case .custom(let ccc): @@ -212,7 +213,7 @@ extension AST.CustomCharacterClass.Member { extension AST.CustomCharacterClass { func generateConsumer( - _ opts: CharacterClass.MatchLevel + _ opts: MatchingOptions ) throws -> Program.ConsumeFunction { // NOTE: Easy way to implement, obviously not performant let consumers = try members.map { @@ -265,7 +266,7 @@ private func consumeScalar( extension AST.Atom.CharacterProperty { func generateConsumer( - _ opts: CharacterClass.MatchLevel + _ opts: MatchingOptions ) throws -> Program.ConsumeFunction { // Handle inversion for us, albeit not efficiently func invert( @@ -335,7 +336,7 @@ extension AST.Atom.CharacterProperty { extension Unicode.BinaryProperty { // FIXME: Semantic level, vet for precise defs func generateConsumer( - _ opts: CharacterClass.MatchLevel + _ opts: MatchingOptions ) throws -> Program.ConsumeFunction { switch self { @@ -499,7 +500,7 @@ extension Unicode.BinaryProperty { extension Unicode.POSIXProperty { // FIXME: Semantic level, vet for precise defs func generateConsumer( - _ opts: CharacterClass.MatchLevel + _ opts: MatchingOptions ) -> Program.ConsumeFunction { // FIXME: semantic levels, modes, etc switch self { @@ -545,7 +546,7 @@ extension Unicode.POSIXProperty { extension Unicode.ExtendedGeneralCategory { // FIXME: Semantic level func 
generateConsumer( - _ opts: CharacterClass.MatchLevel + _ opts: MatchingOptions ) throws -> Program.ConsumeFunction { switch self { case .letter: diff --git a/Sources/_StringProcessing/MatchingOptions.swift b/Sources/_StringProcessing/MatchingOptions.swift new file mode 100644 index 000000000..7509d3e2b --- /dev/null +++ b/Sources/_StringProcessing/MatchingOptions.swift @@ -0,0 +1,249 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +import _MatchingEngine + +/// A type that represents the current state of regex matching options, with +/// stack-based scoping. +struct MatchingOptions { + fileprivate var stack: [Representation] + + fileprivate func _invariantCheck() { + assert(!stack.isEmpty, "Unbalanced call to endScope") + + // Must contain exactly one of each mutually exclusive group + assert(stack.last!.intersection(.textSegmentOptions).rawValue.nonzeroBitCount == 1) + assert(stack.last!.intersection(.semanticMatchingLevels).rawValue.nonzeroBitCount == 1) + } +} + +// Compiler API +extension MatchingOptions { + /// Creates an instance with the default options. + init() { + self.stack = [.default] + _invariantCheck() + } + + /// Starts a new scope with the current options. + mutating func beginScope() { + stack.append(stack.last!) + _invariantCheck() + } + + /// Ends the current scope. + mutating func endScope() { + _ = stack.removeLast() + _invariantCheck() + } + + /// Updates the options in the current scope with the changes described by + /// `sequence`. 
+ mutating func apply(_ sequence: AST.MatchingOptionSequence) { + stack[stack.count - 1].apply(sequence) + _invariantCheck() + } + + var isReluctantByDefault: Bool { + stack.last!.contains(.reluctantByDefault) + } + + var dotMatchesNewline: Bool { + stack.last!.contains(.singleLine) + } + + enum SemanticLevel { + case graphemeCluster + case unicodeScalar + // TODO: include? + // case byte + } + + var semanticLevel: SemanticLevel { + stack.last!.contains(.graphemeClusterSemantics) + ? .graphemeCluster + : .unicodeScalar + } +} + +// Deprecated CharacterClass.MatchLevel API +extension MatchingOptions { + @available(*, deprecated) + var matchLevel: CharacterClass.MatchLevel { + switch semanticLevel { + case .graphemeCluster: + return .graphemeCluster + case .unicodeScalar: + return .unicodeScalar + } + } +} + +extension MatchingOptions { + /// An option that changes the behavior of a regular expression. + fileprivate enum Option: Int { + // PCRE options + case caseInsensitive + case allowDuplicateGroupNames + case multiline + case noAutoCapture + case singleLine + case reluctantByDefault + + // ICU options + case unicodeWordBoundaries + + // NSRegularExpression compatibility options + // Not available via regex literal flags + case transparentBounds + case withoutAnchoringBounds + + // Oniguruma options + case asciiOnlyDigit + case asciiOnlyPOSIXProps + case asciiOnlySpace + case asciiOnlyWord + + // Oniguruma text segment options (these are mutually exclusive and cannot + // be unset, only flipped between) + case textSegmentGraphemeMode + case textSegmentWordMode + + // Swift semantic matching level + case graphemeClusterSemantics + case unicodeScalarSemantics + case byteSemantics + + init?(_ astKind: AST.MatchingOption.Kind) { + switch astKind { + case .caseInsensitive: + self = .caseInsensitive + case .allowDuplicateGroupNames: + self = .allowDuplicateGroupNames + case .multiline: + self = .multiline + case .noAutoCapture: + self = .noAutoCapture + case 
.singleLine: + self = .singleLine + case .reluctantByDefault: + self = .reluctantByDefault + case .unicodeWordBoundaries: + self = .unicodeWordBoundaries + case .asciiOnlyDigit: + self = .asciiOnlyDigit + case .asciiOnlyPOSIXProps: + self = .asciiOnlyPOSIXProps + case .asciiOnlySpace: + self = .asciiOnlySpace + case .asciiOnlyWord: + self = .asciiOnlyWord + case .textSegmentGraphemeMode: + self = .textSegmentGraphemeMode + case .textSegmentWordMode: + self = .textSegmentWordMode + case .graphemeClusterSemantics: + self = .graphemeClusterSemantics + case .unicodeScalarSemantics: + self = .unicodeScalarSemantics + case .byteSemantics: + self = .byteSemantics + + // Whitespace options are only relevant during parsing, not compilation. + case .extended, .extraExtended: + return nil + @unknown default: + // Ignore unknown + return nil + } + } + + fileprivate var representation: Representation { + return .init(self) + } + } +} + +extension MatchingOptions { + /// A set of matching options. + fileprivate struct Representation: OptionSet, RawRepresentable { + var rawValue: UInt32 + + /// Returns `true` if the option denoted by `kind` is a member of this set. + func contains(_ kind: Option) -> Bool { + contains(.init(kind)) + } + + /// Applies the changes described by `sequence` to this set of options. + mutating func apply(_ sequence: AST.MatchingOptionSequence) { + // Replace entirely if the sequence includes a caret, e.g. `(?^is)`. + if sequence.caretLoc != nil { + self = .default + } + + for opt in sequence.adding { + guard let opt = Option(opt.kind)?.representation else { + continue + } + + // If opt is in one of the mutually exclusive groups, clear out the + // group before inserting. 
+ if Self.semanticMatchingLevels.contains(opt) { + remove(.semanticMatchingLevels) + } + if Self.textSegmentOptions.contains(opt) { + remove(.textSegmentOptions) + } + + insert(opt) + } + + for opt in sequence.removing { + guard let opt = Option(opt.kind)?.representation else { + continue + } + + remove(opt) + } + } + } +} + +extension MatchingOptions.Representation { + fileprivate init(_ kind: MatchingOptions.Option) { + self.rawValue = 1 << kind.rawValue + } + + // Text segmentation options + static var textSegmentGraphemeMode: Self { .init(.textSegmentGraphemeMode) } + static var textSegmentWordMode: Self { .init(.textSegmentWordMode) } + + /// Options that comprise the mutually exclusive text segmentation group. + static var textSegmentOptions: Self { + [.textSegmentGraphemeMode, .textSegmentWordMode] + } + + // Semantic matching level options + static var graphemeClusterSemantics: Self { .init(.graphemeClusterSemantics) } + static var unicodeScalarSemantics: Self { .init(.unicodeScalarSemantics) } + static var byteSemantics: Self { .init(.byteSemantics) } + + /// Options that comprise the mutually exclusive semantic matching level + /// group. + static var semanticMatchingLevels: Self { + [.graphemeClusterSemantics, .unicodeScalarSemantics, .byteSemantics] + } + + /// The default set of options. + static var `default`: Self { + [.graphemeClusterSemantics, .textSegmentGraphemeMode] + } +} diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index 6d8a45544..63e48fa61 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -18,14 +18,16 @@ import XCTest extension RegexTests { private func testCompilationEquivalence( - _ equivs: [String] + _ equivs: [String], + file: StaticString = #file, + line: UInt = #line ) throws { assert(!equivs.isEmpty) let progs = try equivs.map { try _compileRegex($0).engine.program } let ref = progs.first! 
- for prog in progs.dropFirst() { + for (prog, equiv) in zip(progs, equivs).dropFirst() { guard ref.instructions.elementsEqual( prog.instructions) else { XCTFail(""" @@ -33,7 +35,10 @@ extension RegexTests { \(ref) Current: \(prog) - """) + Compiled from: + \(equiv) + """, + file: file, line: line) continue } } @@ -70,7 +75,14 @@ extension RegexTests { "(*positive_lookahead: assert)"], ["(?! assert)", "(*nla: assert)", - "(*negative_lookahead: assert)"] + "(*negative_lookahead: assert)"], + + ["a+?", + "(?U)a+", + "(?U:a+)"], + ["a+", + "(?U)(?-U)a+", + "(?U)(?^s)a+"], ] for row in equivalents { diff --git a/Tests/RegexTests/LegacyTests.swift b/Tests/RegexTests/LegacyTests.swift index 785d4293e..316d1dac7 100644 --- a/Tests/RegexTests/LegacyTests.swift +++ b/Tests/RegexTests/LegacyTests.swift @@ -154,14 +154,14 @@ private func performTest( extension RegexTests { func testLegacyCompile() { - func performTest(_ input: String, _ expecting: RECode) { + func performTest(_ input: String, _ expecting: RECode, line: UInt = #line) { let recode = try! 
compile(input) guard recode == expecting else { XCTFail(""" Expected: \(expecting) Found: \(recode) - """) + """, line: line) return } } @@ -292,7 +292,7 @@ extension RegexTests { recode( label(0), split(disfavoring: 1), .beginGroup, - label(2), split(disfavoring: 3), .characterClass(.any), goto(label: 2), + label(2), split(disfavoring: 3), .any, goto(label: 2), label(3), .endGroup, goto(label: 0), @@ -302,7 +302,7 @@ extension RegexTests { "a.*?b+?c??", recode("a", label(0), split(disfavoring: 1), goto(label: 2), - label(1), .characterClass(.any), goto(label: 0), + label(1), .any, goto(label: 0), label(2), label(3), "b", split(disfavoring: 3), split(disfavoring: 4), goto(label: 5), @@ -419,34 +419,6 @@ extension RegexTests { // expecting: .init(captures: "aaaa", capturesEqual: ==)) } - func testLegacyMatchLevel() throws { - let tests: Array<(String, chars: [String], unicodes: [String])> = [ - ("..", ["e\u{301}e\u{301}"], ["e\u{301}"]), - ] - - for (regex, characterInputs, scalarInputs) in tests { - let ast = try parse(regex, .traditional) - let program = try Compiler(ast: ast).emit() - let executor = Executor(program: program) - - let scalarProgram = try Compiler( - ast: ast, matchLevel: .unicodeScalar - ).emit() - let scalarExecutor = Executor( - program: scalarProgram, enablesTracing: false) - - for input in characterInputs { - XCTAssertNotNil(executor.execute(input: input)) - XCTAssertNil(scalarExecutor.execute(input: input)) - } - - for input in scalarInputs { - XCTAssertNotNil(scalarExecutor.execute(input: input)) - XCTAssertNil(executor.execute(input: input)) - } - } - } - func testLegacyPartialMatches() { let tests: Array<(String, pass: [(String, matched: String)], fail: [String])> = [ ("a+", diff --git a/Tests/RegexTests/LexTests.swift b/Tests/RegexTests/LexTests.swift index 4f639ee44..e908a11da 100644 --- a/Tests/RegexTests/LexTests.swift +++ b/Tests/RegexTests/LexTests.swift @@ -18,21 +18,23 @@ func diagnose( _ input: String, expecting expected: 
ParseError, _ syntax: SyntaxOptions = .traditional, - _ f: (inout Source) throws -> () + _ f: (inout Source) throws -> (), + file: StaticString = #file, + line: UInt = #line ) { var src = Source(input, syntax) do { try f(&src) XCTFail(""" Passed, but expected error: \(expected) - """) + """, file: file, line: line) } catch let e as Source.LocatedError { guard e.error == expected else { XCTFail(""" Expected: \(expected) Actual: \(e.error) - """) + """, file: file, line: line) return } } catch let e { diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 1717b6a7d..e29907d8c 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -464,6 +464,20 @@ extension RegexTests { // TODO: Nested reluctant reentrant example, xfailed ) + // Reluctant by default - '*/+/?' and '*?/+?/??' are swapped + firstMatchTest("(?U)a*", input: "aaa", match: "") + firstMatchTest("(?U)a*a", input: "aaa", match: "a") + firstMatchTest("(?U)a*?", input: "aaa", match: "aaa") + firstMatchTest("(?U)a*?a", input: "aaa", match: "aaa") + + firstMatchTest("(?U)a+", input: "aaa", match: "a") + firstMatchTest("(?U)a+?", input: "aaa", match: "aaa") + + firstMatchTest("(?U)a?", input: "a", match: "") + firstMatchTest("(?U)a?a", input: "aaa", match: "a") + firstMatchTest("(?U)a??", input: "a", match: "a") + firstMatchTest("(?U)a??a", input: "aaa", match: "aa") + // TODO: After captures, easier to test these } @@ -1098,6 +1112,32 @@ extension RegexTests { ) } + func testSingleLineMode() { + firstMatchTest(#".+"#, input: "a\nb", match: "a") + firstMatchTest(#"(?s:.+)"#, input: "a\nb", match: "a\nb") + } + + func testMatchingOptionsScope() { + // `.` only matches newlines when the 's' option (single-line mode) + // is turned on. Standalone option-setting groups (e.g. `(?s)`) are + // scoped only to the current group. 
+ + firstMatchTest(#"(?s)a.b"#, input: "a\nb", match: "a\nb") + firstMatchTest(#"((?s)a.)b"#, input: "a\nb", match: "a\nb") + firstMatchTest(#"(?-s)((?s)a.)b"#, input: "a\nb", match: "a\nb") + firstMatchTest(#"(?-s)(?s:a.)b"#, input: "a\nb", match: "a\nb") + firstMatchTest(#"((?s)a).b"#, input: "a\nb", match: nil) + firstMatchTest(#"((?s))a.b"#, input: "a\nb", match: nil) + firstMatchTest(#"(?:(?s))a.b"#, input: "a\nb", match: nil) + firstMatchTest(#"((?s)a(?s)).b"#, input: "a\nb", match: nil) + firstMatchTest(#"(?s)a(?-s).b"#, input: "a\nb", match: nil) + firstMatchTest(#"(?s)a(?-s:.b)"#, input: "a\nb", match: nil) + firstMatchTest(#"(?:(?s)a).b"#, input: "a\nb", match: nil) + firstMatchTest(#"(((?s)a)).b"#, input: "a\nb", match: nil) + firstMatchTest(#"(?s)(((?-s)a)).b"#, input: "a\nb", match: "a\nb") + firstMatchTest(#"(?s)((?-s)((?i)a)).b"#, input: "a\nb", match: "a\nb") + } + // MARK: Character Semantics var eComposed: String { "é" } @@ -1256,8 +1296,7 @@ extension RegexTests { // a single Unicode scalar value, leaving any other grapheme scalar // components to be matched. 
- firstMatchTest(#"(?u:.)"#, input: eDecomposed, match: "e", - xfail: true) + firstMatchTest(#"(?u:.)"#, input: eDecomposed, match: "e") matchTest( #".\u{301}"#, @@ -1278,12 +1317,31 @@ extension RegexTests { firstMatchTest(#"e\O"#, input: eComposed, match: nil, xfail: true) - // FIXME: Unicode scalar semantic flag (?U) doesn't change behavior of `.` matchTest( - #"(?U).\u{301}"#, - (eComposed, true), - (eDecomposed, true), - xfail: true) + #"(?u).\u{301}"#, + (eComposed, false), + (eDecomposed, true)) + firstMatchTest(#"(?u).$"#, input: eComposed, match: eComposed) + + // Option permutations for 'u' and 's' + matchTest( + #"...."#, + ("e\u{301}ab", false), + ("e\u{301}abc", true), + ("e\u{301}\nab", false)) + matchTest( + #"(?s)...."#, + ("e\u{301}ab", false), + ("e\u{301}abc", true), + ("e\u{301}\nab", true)) + matchTest( + #"(?u)...."#, + ("e\u{301}ab", true), + ("e\u{301}\na", false)) + matchTest( + #"(?us)...."#, + ("e\u{301}ab", true), + ("e\u{301}\na", true)) } // TODO: Add test for implied grapheme cluster requirement at group boundaries diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 1aa9af18a..89d695aab 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -689,19 +689,20 @@ extension RegexTests { .singleLine, .reluctantByDefault, .extraExtended, .extended, .unicodeWordBoundaries, .asciiOnlyDigit, .asciiOnlyPOSIXProps, .asciiOnlySpace, .asciiOnlyWord, .textSegmentGraphemeMode, - .textSegmentWordMode + .textSegmentWordMode, .graphemeClusterSemantics, .unicodeScalarSemantics, + .byteSemantics ] - parseTest("(?iJmnsUxxxwDPSWy{g}y{w}-iJmnsUxxxwDPSW)", changeMatchingOptions( + parseTest("(?iJmnsUxxxwDPSWy{g}y{w}Xub-iJmnsUxxxwDPSW)", changeMatchingOptions( matchingOptions( adding: allOptions, - removing: allOptions.dropLast(2) + removing: allOptions.dropLast(5) ), isIsolated: true, empty()) ) - parseTest("(?iJmnsUxxxwDPSWy{g}y{w}-iJmnsUxxxwDPSW:)", changeMatchingOptions( + 
parseTest("(?iJmnsUxxxwDPSWy{g}y{w}Xub-iJmnsUxxxwDPSW:)", changeMatchingOptions( matchingOptions( adding: allOptions, - removing: allOptions.dropLast(2) + removing: allOptions.dropLast(5) ), isIsolated: false, empty()) ) @@ -1507,6 +1508,12 @@ extension RegexTests { diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions) diagnosticTest("(?-y{w})", .cannotRemoveTextSegmentOptions) + // MARK: Semantic Level options + + diagnosticTest("(?-X)", .cannotRemoveSemanticsOptions) + diagnosticTest("(?-u)", .cannotRemoveSemanticsOptions) + diagnosticTest("(?-b)", .cannotRemoveSemanticsOptions) + // MARK: Group specifiers diagnosticTest(#"(*"#, .unknownGroupKind("*"))