Skip to content

Commit

Permalink
Add options support to the compiler (#112)
Browse files Browse the repository at this point in the history
MatchingOptions provides an interface that lets the compiler
manage group-scoped matching options, apply matching option
sequences from the AST, and query the current options when
building out matching behavior.

Includes support and tests for the `s` and `u` option flags.
  • Loading branch information
natecook1000 authored Jan 28, 2022
1 parent 074ee9f commit a6132a5
Show file tree
Hide file tree
Showing 12 changed files with 455 additions and 85 deletions.
14 changes: 14 additions & 0 deletions Sources/_MatchingEngine/Regex/AST/MatchingOptions.swift
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@ extension AST {
// be unset, only flipped between)
case textSegmentGraphemeMode // y{g}
case textSegmentWordMode // y{w}

// Swift semantic matching level
case graphemeClusterSemantics // X
case unicodeScalarSemantics // u
case byteSemantics // b
}
public var kind: Kind
public var location: SourceLocation
Expand All @@ -53,6 +58,15 @@ extension AST {
return false
}
}

/// Whether this option is one of the Swift semantic matching level options:
/// grapheme cluster semantics (`X`), Unicode scalar semantics (`u`), or
/// byte semantics (`b`).
public var isSemanticMatchingLevel: Bool {
  // Each semantic-level kind is checked on its own; anything else falls
  // through to `false`.
  if case .graphemeClusterSemantics = kind { return true }
  if case .unicodeScalarSemantics = kind { return true }
  if case .byteSemantics = kind { return true }
  return false
}
}

/// A sequence of matching options written in source.
Expand Down
3 changes: 3 additions & 0 deletions Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ enum ParseError: Error, Hashable {
case identifierCannotStartWithNumber(IdentifierKind)

case cannotRemoveTextSegmentOptions
case cannotRemoveSemanticsOptions
case expectedCalloutArgument
}

Expand Down Expand Up @@ -145,6 +146,8 @@ extension ParseError: CustomStringConvertible {
return "\(i.diagDescription) must not start with number"
case .cannotRemoveTextSegmentOptions:
return "text segment mode cannot be unset, only changed"
case .cannotRemoveSemanticsOptions:
return "semantic level cannot be unset, only changed"
case .expectedCalloutArgument:
return "expected argument to callout"
}
Expand Down
9 changes: 9 additions & 0 deletions Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift
Original file line number Diff line number Diff line change
Expand Up @@ -574,6 +574,11 @@ extension Source {
try src.expect("}")
return opt

// Swift semantic level options
case "X": return advanceAndReturn(.graphemeClusterSemantics)
case "u": return advanceAndReturn(.unicodeScalarSemantics)
case "b": return advanceAndReturn(.byteSemantics)

default:
return nil
}
Expand Down Expand Up @@ -618,6 +623,10 @@ extension Source {
if opt.isTextSegmentMode {
throw ParseError.cannotRemoveTextSegmentOptions
}
// Matching semantics options can only be added, not removed.
if opt.isSemanticMatchingLevel {
throw ParseError.cannotRemoveSemanticsOptions
}
removing.append(opt)
}
return .init(caretLoc: nil, adding: adding, minusLoc: ateMinus.location,
Expand Down
11 changes: 9 additions & 2 deletions Sources/_StringProcessing/CharacterClass.swift
Original file line number Diff line number Diff line change
Expand Up @@ -338,12 +338,19 @@ extension AST.Atom {
switch kind {
case let .escaped(b): return b.characterClass

case .any: return .any

case .property:
// TODO: Would our model type for character classes include
// this? Or does grapheme-semantic mode complicate that?
return nil

case .any:
// `.any` is handled in the matching engine by Compiler.emitAny() and in
// the legacy compiler by the `.any` instruction, which can provide lower
// level instructions than the CharacterClass-generated consumer closure.
//
// FIXME: We shouldn't be returning `nil` here, but instead fixing the call
// site to check for `.any` before trying to construct a character class.
return nil

default: return nil

Expand Down
66 changes: 51 additions & 15 deletions Sources/_StringProcessing/Compiler.swift
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,13 @@ struct RegexProgram {

class Compiler {
let ast: AST
let matchLevel: CharacterClass.MatchLevel
let options: REOptions
private var options = MatchingOptions()
private var builder = RegexProgram.Program.Builder()

init(
ast: AST,
matchLevel: CharacterClass.MatchLevel = .graphemeCluster,
options: REOptions = []
ast: AST
) {
self.ast = ast
self.matchLevel = matchLevel
self.options = options
}

__consuming func emit() throws -> RegexProgram {
Expand All @@ -42,11 +37,9 @@ class Compiler {
func emit(_ node: AST.Node) throws {

switch node {
// Any: .
// consume 1
case .atom(let a) where a.kind == .any && matchLevel == .graphemeCluster:
builder.buildAdvance(1)

case .atom(let a) where a.kind == .any:
try emitAny()

// Single characters we just match
case .atom(let a) where a.singleCharacter != nil :
builder.buildMatch(a.singleCharacter!)
Expand Down Expand Up @@ -97,6 +90,9 @@ class Compiler {
throw unsupported(node.renderAsCanonical())

case .group(let g):
options.beginScope()
defer { options.endScope() }

if let lookaround = g.lookaroundKind {
try emitLookaround(lookaround, g.child)
return
Expand All @@ -113,6 +109,10 @@ class Compiler {
try emit(g.child)
builder.buildEndCapture(cap)

case .changeMatchingOptions(let optionSequence, _):
options.apply(optionSequence)
try emit(g.child)

default:
// FIXME: Other kinds...
try emit(g.child)
Expand All @@ -124,8 +124,8 @@ class Compiler {
// For now, we model sets and atoms as consumers.
// This lets us rapidly expand support, and we can better
// design the actual instruction set with real examples
case _ where try node.generateConsumer(matchLevel) != nil:
try builder.buildConsume(by: node.generateConsumer(matchLevel)!)
case _ where try node.generateConsumer(options) != nil:
try builder.buildConsume(by: node.generateConsumer(options)!)

case .quote(let q):
// We stick quoted content into read-only constant strings
Expand Down Expand Up @@ -158,6 +158,31 @@ class Compiler {
throw unsupported(node.renderAsCanonical())
}
}

/// Emits the matching behavior for the "any" atom, chosen from the current
/// `options.semanticLevel` and `options.dotMatchesNewline`.
///
/// - Grapheme cluster level, dot matches newline: a plain `buildAdvance(1)`.
/// - Grapheme cluster level, dot excludes newline: a consumer that fails
///   (returns `nil`) when the element at the lower bound is a newline, and
///   otherwise returns the next string index.
/// - Unicode scalar level: same two variants, but the returned position is
///   the next index in the `unicodeScalars` view.
func emitAny() throws {
  let level = options.semanticLevel
  let matchesNewline = options.dotMatchesNewline

  switch level {
  case .graphemeCluster:
    if matchesNewline {
      builder.buildAdvance(1)
    } else {
      builder.buildConsume { subject, range in
        let start = range.lowerBound
        // A newline at the current position means "no match".
        guard !subject[start].isNewline else { return nil }
        return subject.index(after: start)
      }
    }

  case .unicodeScalar:
    if matchesNewline {
      // TODO: builder.buildAdvanceUnicodeScalar(1)
      builder.buildConsume { subject, range in
        subject.unicodeScalars.index(after: range.lowerBound)
      }
    } else {
      builder.buildConsume { subject, range in
        let start = range.lowerBound
        // The newline test reads the element at `start` through the string's
        // own subscript; only the advance uses the scalar view.
        guard !subject[start].isNewline else { return nil }
        return subject.unicodeScalars.index(after: start)
      }
    }
  }
}

func emitAssertion(_ kind: AST.Atom.AssertionKind) throws {
// FIXME: Depends on API model we have... We may want to
Expand Down Expand Up @@ -458,7 +483,18 @@ class Compiler {

func emitQuantification(_ quant: AST.Quantification) throws {
let child = quant.child
let kind = quant.kind.value

// If in reluctant-by-default mode, eager and reluctant need to be switched.
let kind: AST.Quantification.Kind
if options.isReluctantByDefault
&& quant.kind.value != .possessive
{
kind = quant.kind.value == .eager
? .reluctant
: .eager
} else {
kind = quant.kind.value
}

switch quant.amount.value.bounds {
case (_, atMost: 0):
Expand Down
35 changes: 18 additions & 17 deletions Sources/_StringProcessing/ConsumerInterface.swift
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,8 @@ func unsupported(
file: StaticString = #file,
line: UInt = #line
) -> Unsupported {
// TODO: how do we not have a public init for this?
let fStr = file.withUTF8Buffer {
String(decoding: $0, as: UTF8.self)
}
return Unsupported(
message: s, file: fStr, line: Int(line))
message: s, file: String(describing: file), line: Int(line))
}

extension AST.Node {
Expand All @@ -42,8 +38,7 @@ extension AST.Node {
/// A consumer is a Swift closure that matches against
/// the front of an input range
func generateConsumer(
// TODO: Better option modeling
_ opts: CharacterClass.MatchLevel
_ opts: MatchingOptions
) throws -> Program<String>.ConsumeFunction? {
switch self {
case .atom(let a):
Expand Down Expand Up @@ -77,10 +72,10 @@ extension AST.Atom {
}

func generateConsumer(
_ opts: CharacterClass.MatchLevel
_ opts: MatchingOptions
) throws -> Program<String>.ConsumeFunction? {
// TODO: Wean ourselves off of this type...
if let cc = self.characterClass?.withMatchLevel(opts) {
if let cc = self.characterClass?.withMatchLevel(opts.matchLevel) {
return { input, bounds in
// FIXME: should we worry about out of bounds?
cc.matches(in: input, at: bounds.lowerBound)
Expand Down Expand Up @@ -109,10 +104,16 @@ extension AST.Atom {
// TODO: alias? casing?
$0.name == name || $0.nameAlias == name
}

case .any:
fatalError(".atom(.any) is handled in emitAny")

case .startOfLine, .endOfLine:
// handled in emitAssertion
return nil

case .escaped, .keyboardControl, .keyboardMeta, .keyboardMetaControl,
.any, .startOfLine, .endOfLine,
.backreference, .subpattern, .callout, .backtrackingDirective:
.backreference, .subpattern, .callout, .backtrackingDirective:
// FIXME: implement
return nil
}
Expand All @@ -121,7 +122,7 @@ extension AST.Atom {

extension AST.CustomCharacterClass.Member {
func generateConsumer(
_ opts: CharacterClass.MatchLevel
_ opts: MatchingOptions
) throws -> Program<String>.ConsumeFunction {
switch self {
case .custom(let ccc):
Expand Down Expand Up @@ -212,7 +213,7 @@ extension AST.CustomCharacterClass.Member {

extension AST.CustomCharacterClass {
func generateConsumer(
_ opts: CharacterClass.MatchLevel
_ opts: MatchingOptions
) throws -> Program<String>.ConsumeFunction {
// NOTE: Easy way to implement, obviously not performant
let consumers = try members.map {
Expand Down Expand Up @@ -265,7 +266,7 @@ private func consumeScalar(

extension AST.Atom.CharacterProperty {
func generateConsumer(
_ opts: CharacterClass.MatchLevel
_ opts: MatchingOptions
) throws -> Program<String>.ConsumeFunction {
// Handle inversion for us, albeit not efficiently
func invert(
Expand Down Expand Up @@ -335,7 +336,7 @@ extension AST.Atom.CharacterProperty {
extension Unicode.BinaryProperty {
// FIXME: Semantic level, vet for precise defs
func generateConsumer(
_ opts: CharacterClass.MatchLevel
_ opts: MatchingOptions
) throws -> Program<String>.ConsumeFunction {
switch self {

Expand Down Expand Up @@ -499,7 +500,7 @@ extension Unicode.BinaryProperty {
extension Unicode.POSIXProperty {
// FIXME: Semantic level, vet for precise defs
func generateConsumer(
_ opts: CharacterClass.MatchLevel
_ opts: MatchingOptions
) -> Program<String>.ConsumeFunction {
// FIXME: semantic levels, modes, etc
switch self {
Expand Down Expand Up @@ -545,7 +546,7 @@ extension Unicode.POSIXProperty {
extension Unicode.ExtendedGeneralCategory {
// FIXME: Semantic level
func generateConsumer(
_ opts: CharacterClass.MatchLevel
_ opts: MatchingOptions
) throws -> Program<String>.ConsumeFunction {
switch self {
case .letter:
Expand Down
Loading

0 comments on commit a6132a5

Please sign in to comment.