Skip to content

Commit

Permalink
Merge pull request #379 from hamishknight/sema
Browse files Browse the repository at this point in the history
  • Loading branch information
hamishknight authored May 9, 2022
2 parents 39c0ed5 + c95e862 commit 7f068dc
Show file tree
Hide file tree
Showing 18 changed files with 928 additions and 285 deletions.
2 changes: 1 addition & 1 deletion Sources/PatternConverter/PatternConverter.swift
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ struct PatternConverter: ParsableCommand {
print("Converting '\(delim)\(regex)\(delim)'")

let ast = try _RegexParser.parse(
regex,
regex, .semantic,
experimentalSyntax ? .experimental : .traditional)

// Show rendered source ranges
Expand Down
19 changes: 19 additions & 0 deletions Sources/_RegexParser/Regex/AST/Atom.swift
Original file line number Diff line number Diff line change
Expand Up @@ -668,6 +668,23 @@ extension AST.Atom.EscapedBuiltin {
return nil
}
}

public var isQuantifiable: Bool {
switch self {
case .alarm, .escape, .formfeed, .newline, .carriageReturn, .tab,
.singleDataUnit, .decimalDigit, .notDecimalDigit, .horizontalWhitespace,
.notHorizontalWhitespace, .notNewline, .newlineSequence, .whitespace,
.notWhitespace, .verticalTab, .notVerticalTab, .wordCharacter,
.notWordCharacter, .backspace, .graphemeCluster, .trueAnychar:
return true

case .wordBoundary, .notWordBoundary, .startOfSubject,
.endOfSubjectBeforeNewline, .endOfSubject,
.firstMatchingPositionInSubject, .resetStartOfMatch, .textSegment,
.notTextSegment:
return false
}
}
}

extension AST.Atom {
Expand Down Expand Up @@ -749,6 +766,8 @@ extension AST.Atom {
case .changeMatchingOptions:
return false
// TODO: Are callouts quantifiable?
case .escaped(let esc):
return esc.isQuantifiable
default:
return true
}
Expand Down
15 changes: 10 additions & 5 deletions Sources/_RegexParser/Regex/Parse/CaptureList.swift
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,18 @@ extension CaptureList {
public var name: String?
public var type: Any.Type?
public var optionalDepth: Int
public var location: SourceLocation

public init(
name: String? = nil,
type: Any.Type? = nil,
optionalDepth: Int
optionalDepth: Int,
_ location: SourceLocation
) {
self.name = name
self.type = type
self.optionalDepth = optionalDepth
self.location = location
}
}
}
Expand All @@ -61,13 +64,14 @@ extension AST.Node {
case let .group(g):
switch g.kind.value {
case .capture:
list.append(.init(optionalDepth: nesting))
list.append(.init(optionalDepth: nesting, g.location))

case .namedCapture(let name):
list.append(.init(name: name.value, optionalDepth: nesting))
list.append(.init(name: name.value, optionalDepth: nesting, g.location))

case .balancedCapture(let b):
list.append(.init(name: b.name?.value, optionalDepth: nesting))
list.append(.init(name: b.name?.value, optionalDepth: nesting,
g.location))

default: break
}
Expand Down Expand Up @@ -124,7 +128,8 @@ extension CaptureList.Capture: Equatable {
public static func == (lhs: Self, rhs: Self) -> Bool {
lhs.name == rhs.name &&
lhs.optionalDepth == rhs.optionalDepth &&
lhs.type == rhs.type
lhs.type == rhs.type &&
lhs.location == rhs.location
}
}
extension CaptureList: Equatable {}
Expand Down
2 changes: 1 addition & 1 deletion Sources/_RegexParser/Regex/Parse/CompilerInterface.swift
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ public func swiftCompilerParseRegexLiteral(
_ input: String, captureBufferOut: UnsafeMutableRawBufferPointer
) throws -> (regexToEmit: String, version: Int) {
do {
let ast = try parseWithDelimiters(input)
let ast = try parseWithDelimiters(input, .semantic)
// Serialize the capture structure for later type inference.
assert(captureBufferOut.count >= input.utf8.count)
ast.captureStructure.encode(to: captureBufferOut)
Expand Down
35 changes: 31 additions & 4 deletions Sources/_RegexParser/Regex/Parse/Diagnostics.swift
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ enum ParseError: Error, Hashable {
// TODO: I wonder if it makes sense to store the string.
// This can make equality weird.

// MARK: Syntactic Errors

case numberOverflow(String)
case expectedNumDigits(String, Int)
case expectedNumber(String, kind: RadixKind)
Expand Down Expand Up @@ -43,7 +45,6 @@ enum ParseError: Error, Hashable {

case cannotReferToWholePattern

case notQuantifiable
case quantifierRequiresOperand(String)

case backtrackingDirectiveMustHaveName(String)
Expand All @@ -55,7 +56,6 @@ enum ParseError: Error, Hashable {
case cannotRemoveMatchingOptionsAfterCaret

case expectedCustomCharacterClassMembers
case invalidCharacterClassRangeOperand

case emptyProperty
case unknownProperty(key: String?, value: String)
Expand All @@ -73,6 +73,17 @@ enum ParseError: Error, Hashable {
case cannotRemoveExtendedSyntaxInMultilineMode

case expectedCalloutArgument

// MARK: Semantic Errors

case unsupported(String)
case deprecatedUnicode(String)
case invalidReference(Int)
case duplicateNamedCapture(String)
case invalidCharacterClassRangeOperand
case invalidQuantifierRange(Int, Int)
case invalidCharacterRange(from: Character, to: Character)
case notQuantifiable
}

extension IdentifierKind {
Expand All @@ -88,6 +99,7 @@ extension IdentifierKind {
extension ParseError: CustomStringConvertible {
var description: String {
switch self {
// MARK: Syntactic Errors
case let .numberOverflow(s):
return "number overflow: \(s)"
case let .expectedNumDigits(s, i):
Expand All @@ -114,8 +126,6 @@ extension ParseError: CustomStringConvertible {
return "invalid escape sequence '\\\(c)'"
case .cannotReferToWholePattern:
return "cannot refer to whole pattern here"
case .notQuantifiable:
return "expression is not quantifiable"
case .quantifierRequiresOperand(let q):
return "quantifier '\(q)' must appear after expression"
case .backtrackingDirectiveMustHaveName(let b):
Expand Down Expand Up @@ -167,6 +177,23 @@ extension ParseError: CustomStringConvertible {
return "extended syntax may not be disabled in multi-line mode"
case .expectedCalloutArgument:
return "expected argument to callout"

// MARK: Semantic Errors

case let .unsupported(kind):
return "\(kind) is not currently supported"
case let .deprecatedUnicode(kind):
return "\(kind) is a deprecated Unicode property, and is not supported"
case let .invalidReference(i):
return "no capture numbered \(i)"
case let .duplicateNamedCapture(str):
return "group named '\(str)' already exists"
case let .invalidQuantifierRange(lhs, rhs):
return "range lower bound '\(lhs)' must be less than or equal to upper bound '\(rhs)'"
case let .invalidCharacterRange(from: lhs, to: rhs):
return "character '\(lhs)' must compare less than or equal to '\(rhs)'"
case .notQuantifiable:
return "expression is not quantifiable"
}
}
}
Expand Down
35 changes: 23 additions & 12 deletions Sources/_RegexParser/Regex/Parse/Parse.swift
Original file line number Diff line number Diff line change
Expand Up @@ -227,9 +227,6 @@ extension Parser {
if let (amt, kind, trivia) =
try source.lexQuantifier(context: context) {
let location = loc(_start)
guard operand.isQuantifiable else {
throw Source.LocatedError(ParseError.notQuantifiable, location)
}
result.append(.quantification(
.init(amt, kind, operand, location, trivia: trivia)))
} else {
Expand Down Expand Up @@ -543,11 +540,6 @@ extension Parser {
// Range between atoms.
if let (dashLoc, rhs) =
try source.lexCustomCharClassRangeEnd(context: context) {
guard atom.isValidCharacterClassRangeBound &&
rhs.isValidCharacterClassRangeBound else {
throw ParseError.invalidCharacterClassRangeOperand
}
// TODO: Validate lower <= upper?
members.append(.range(.init(atom, dashLoc, rhs)))
continue
}
Expand All @@ -558,13 +550,31 @@ extension Parser {
}
}

public enum ASTStage {
/// The regex is parsed, and a syntactically valid AST is returned. Otherwise
/// an error is thrown. This is useful for e.g syntax coloring.
case syntactic

/// The regex is parsed, and a syntactically and semantically valid AST is
/// returned. Otherwise an error is thrown. A semantically valid AST has been
/// checked for e.g unsupported constructs and invalid backreferences.
case semantic
}

public func parse<S: StringProtocol>(
_ regex: S, _ syntax: SyntaxOptions
_ regex: S, _ stage: ASTStage, _ syntax: SyntaxOptions
) throws -> AST where S.SubSequence == Substring
{
let source = Source(String(regex))
var parser = Parser(source, syntax: syntax)
return try parser.parse()
let ast = try parser.parse()
switch stage {
case .syntactic:
break
case .semantic:
try validate(ast)
}
return ast
}

/// Retrieve the default set of syntax options that a delimiter and literal
Expand All @@ -591,11 +601,12 @@ fileprivate func defaultSyntaxOptions(
/// Parses a given regex string with delimiters, inferring the syntax options
/// from the delimiters used.
public func parseWithDelimiters<S: StringProtocol>(
_ regex: S
_ regex: S, _ stage: ASTStage
) throws -> AST where S.SubSequence == Substring {
let (contents, delim) = droppingRegexDelimiters(String(regex))
do {
return try parse(contents, defaultSyntaxOptions(delim, contents: contents))
let syntax = defaultSyntaxOptions(delim, contents: contents)
return try parse(contents, stage, syntax)
} catch let error as LocatedErrorProtocol {
// Convert the range in 'contents' to the range in 'regex'.
let delimCount = delim.opening.count
Expand Down
Loading

0 comments on commit 7f068dc

Please sign in to comment.