Skip to content

Commit

Permalink
Parse raw identifiers.
Browse files Browse the repository at this point in the history
This is the parser implementation for SE-0451.
  • Loading branch information
allevato committed Jan 6, 2025
1 parent eae92fe commit 6c7fc5a
Show file tree
Hide file tree
Showing 7 changed files with 331 additions and 14 deletions.
55 changes: 49 additions & 6 deletions Sources/SwiftParser/Lexer/Cursor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -2025,14 +2025,57 @@ extension Lexer.Cursor {
// Check whether we have an identifier followed by another backtick, in which
// case this is an escaped identifier.
let identifierStart = self
if self.advance(if: { $0.isValidIdentifierStartCodePoint }) {
// Keep continuing the identifier.
self.advance(while: { $0.isValidIdentifierContinuationCodePoint })

// If we have the terminating "`", it's an escaped identifier.
if self.advance(matching: "`") {
return Lexer.Result(.identifier)
// Scan until we see either a closing backtick or the end of the line. Do
// additional validation for raw identifiers along the way; if we see
// characters that aren't allowed (prohibited whitespace or unprintable
// ASCII characters) or if the identifier is an operator, provide a more
// precise diagnostic and location, but otherwise keep trying to tokenize
// it as a raw identifier as long as we see the closing backtick because
// it more likely represents what the user was trying to do.
var hasNonOperatorCharacter = false
var hasNonWhitespaceCharacter = false
var isEmpty = true
var error: LexingDiagnostic? = nil
while true {
let ch = self.peek()
if ch == nil || ch == "`" || ch == "\n" || ch == "\r" {
break
}
let position = self
guard let scalar = self.advanceValidatingUTF8Character() else {
error = LexingDiagnostic(.invalidUtf8, position: position)
continue
}
if error == nil {
if scalar.isForbiddenRawIdentifierWhitespace {
error = LexingDiagnostic(.invalidWhitespaceInRawIdentifier, position: position)
} else if scalar == "\\" {
error = LexingDiagnostic(.invalidBackslashInRawIdentifier, position: position)
} else if scalar.isASCII && !scalar.isPrintableASCII {
error = LexingDiagnostic(.unprintableAsciiCharacter, position: position)
}
}
if !scalar.isPermittedRawIdentifierWhitespace {
hasNonWhitespaceCharacter = true
}
if (isEmpty && !scalar.isOperatorStartCodePoint) || !scalar.isOperatorContinuationCodePoint {
hasNonOperatorCharacter = true
}
isEmpty = false
}

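// If we have the terminating "`", lex the token as an escaped/raw identifier.
// It is still returned as an identifier when it is empty, all whitespace, made
// up only of operator characters, or otherwise invalid; those cases just attach
// a diagnostic to the token.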
if self.advance(matching: "`") {
if isEmpty {
error = LexingDiagnostic(.rawIdentifierCannotBeEmpty, position: quote)
} else if error == nil && !hasNonWhitespaceCharacter {
error = LexingDiagnostic(.rawIdentifierCannotBeAllWhitespace, position: quote)
} else if error == nil && !hasNonOperatorCharacter {
error = LexingDiagnostic(.rawIdentifierCannotBeOperator, position: quote)
}
return Lexer.Result(.identifier, error: error)
}

// Special case; allow '`$`'.
Expand Down
25 changes: 25 additions & 0 deletions Sources/SwiftParser/Lexer/UnicodeScalarExtensions.swift
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,31 @@ extension Unicode.Scalar {
return true
}

/// Whether this scalar is a whitespace code point that is rejected inside a
/// raw identifier.
///
/// The matched code points are the Unicode `White_Space` set with U+0020
/// SPACE removed. U+0020 and the directional marks U+200E/U+200F are not
/// matched here; they are the only whitespace-like code points allowed in a
/// raw identifier.
var isForbiddenRawIdentifierWhitespace: Bool {
  switch self.value {
  case 0x0009...0x000D,  // TAB, LF, VT, FF, CR
    0x0085,  // NEXT LINE
    0x00A0,  // NO-BREAK SPACE
    0x1680,  // OGHAM SPACE MARK
    0x2000...0x200A,  // EN QUAD through HAIR SPACE
    0x2028...0x2029,  // LINE SEPARATOR, PARAGRAPH SEPARATOR
    0x202F,  // NARROW NO-BREAK SPACE
    0x205F,  // MEDIUM MATHEMATICAL SPACE
    0x3000:  // IDEOGRAPHIC SPACE
    return true
  default:
    return false
  }
}

/// Whether this scalar is one of the whitespace-like code points that may
/// appear inside a raw identifier: U+0020 SPACE, U+200E LEFT-TO-RIGHT MARK,
/// or U+200F RIGHT-TO-LEFT MARK.
var isPermittedRawIdentifierWhitespace: Bool {
  switch self.value {
  case 0x0020, 0x200E, 0x200F:
    return true
  default:
    return false
  }
}

/// isOperatorStartCodePoint - Return true if the specified code point is a
/// valid start of an operator.
var isOperatorStartCodePoint: Bool {
Expand Down
11 changes: 11 additions & 0 deletions Sources/SwiftParserDiagnostics/LexerDiagnosticMessages.swift
Original file line number Diff line number Diff line change
Expand Up @@ -71,11 +71,16 @@ public enum StaticTokenError: String, DiagnosticMessage {
case expectedDigitInFloatLiteral = "expected a digit in floating point exponent"
case expectedHexCodeInUnicodeEscape = #"expected hexadecimal code in \u{...} escape sequence"#
case expectedHexDigitInHexLiteral = "expected hexadecimal digit (0-9, A-F) in integer literal"
case invalidBackslashInRawIdentifier = "a raw identifier cannot contain a backslash"
case invalidCharacter = "invalid character in source file"
case invalidEscapeSequenceInStringLiteral = "invalid escape sequence in literal"
case invalidIdentifierStartCharacter = "an identifier cannot begin with this character"
case invalidNumberOfHexDigitsInUnicodeEscape = #"\u{...} escape sequence expects between 1 and 8 hex digits"#
case invalidUtf8 = "invalid UTF-8 found in source file"
case invalidWhitespaceInRawIdentifier = "invalid whitespace found in raw identifier"
case rawIdentifierCannotBeAllWhitespace = "a raw identifier cannot contain only whitespace characters"
case rawIdentifierCannotBeEmpty = "a raw identifier cannot be empty"
case rawIdentifierCannotBeOperator = "a raw identifier cannot contain only operator characters"
case tokenDiagnosticOffsetOverflow =
"the lexer discovered an error in this token but was not able to represent its offset due to overflow; please split the token"
case sourceConflictMarker = "source control conflict marker in source file"
Expand Down Expand Up @@ -211,6 +216,7 @@ extension SwiftSyntax.TokenDiagnostic {
// inside `ParseDiagnosticsGenerator` but fall back to an error message
// here in case the error is not diagnosed.
return InvalidIndentationInMultiLineStringLiteralError(kind: .insufficientIndentation, lines: 1)
case .invalidBackslashInRawIdentifier: return StaticTokenError.invalidBackslashInRawIdentifier
case .invalidBinaryDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .binary(scalarAtErrorOffset))
case .invalidCharacter: return StaticTokenError.invalidCharacter
case .invalidDecimalDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .decimal(scalarAtErrorOffset))
Expand All @@ -223,9 +229,14 @@ extension SwiftSyntax.TokenDiagnostic {
case .invalidNumberOfHexDigitsInUnicodeEscape: return StaticTokenError.invalidNumberOfHexDigitsInUnicodeEscape
case .invalidOctalDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .octal(scalarAtErrorOffset))
case .invalidUtf8: return StaticTokenError.invalidUtf8
case .invalidWhitespaceInRawIdentifier: return StaticTokenError.invalidWhitespaceInRawIdentifier
case .multilineRegexClosingNotOnNewline: return StaticTokenError.multilineRegexClosingNotOnNewline
case .nonBreakingSpace: return StaticTokenWarning.nonBreakingSpace
case .nulCharacter: return StaticTokenWarning.nulCharacter
case .rawIdentifierCannotBeAllWhitespace: return StaticTokenError.rawIdentifierCannotBeAllWhitespace
case .rawIdentifierCannotBeEmpty: return StaticTokenError.rawIdentifierCannotBeEmpty
case .rawIdentifierCannotBeOperator:
return StaticTokenError.rawIdentifierCannotBeOperator
case .sourceConflictMarker: return StaticTokenError.sourceConflictMarker
case .spaceAtEndOfRegexLiteral: return StaticTokenError.spaceAtEndOfRegexLiteral
case .spaceAtStartOfRegexLiteral: return StaticTokenError.spaceAtStartOfRegexLiteral
Expand Down
10 changes: 10 additions & 0 deletions Sources/SwiftSyntax/TokenDiagnostic.swift
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ public struct TokenDiagnostic: Hashable, Sendable {
case extraneousTrailingWhitespaceError
case extraneousTrailingWhitespaceWarning
case insufficientIndentationInMultilineStringLiteral
case invalidBackslashInRawIdentifier
case invalidBinaryDigitInIntegerLiteral
case invalidCharacter
case invalidDecimalDigitInIntegerLiteral
Expand All @@ -46,9 +47,13 @@ public struct TokenDiagnostic: Hashable, Sendable {
case invalidNumberOfHexDigitsInUnicodeEscape
case invalidOctalDigitInIntegerLiteral
case invalidUtf8
case invalidWhitespaceInRawIdentifier
case multilineRegexClosingNotOnNewline
case nonBreakingSpace
case nulCharacter
case rawIdentifierCannotBeAllWhitespace
case rawIdentifierCannotBeEmpty
case rawIdentifierCannotBeOperator
case sourceConflictMarker
case spaceAtEndOfRegexLiteral
case spaceAtStartOfRegexLiteral
Expand All @@ -74,6 +79,7 @@ public struct TokenDiagnostic: Hashable, Sendable {
case .extraneousTrailingWhitespaceError: return .error
case .extraneousTrailingWhitespaceWarning: return .warning
case .insufficientIndentationInMultilineStringLiteral: return .error
case .invalidBackslashInRawIdentifier: return .error
case .invalidBinaryDigitInIntegerLiteral: return .error
case .invalidCharacter: return .error
case .invalidDecimalDigitInIntegerLiteral: return .error
Expand All @@ -85,9 +91,13 @@ public struct TokenDiagnostic: Hashable, Sendable {
case .invalidNumberOfHexDigitsInUnicodeEscape: return .error
case .invalidOctalDigitInIntegerLiteral: return .error
case .invalidUtf8: return .error
case .invalidWhitespaceInRawIdentifier: return .error
case .multilineRegexClosingNotOnNewline: return .error
case .nonBreakingSpace: return .warning
case .nulCharacter: return .warning
case .rawIdentifierCannotBeAllWhitespace: return .error
case .rawIdentifierCannotBeEmpty: return .error
case .rawIdentifierCannotBeOperator: return .error
case .sourceConflictMarker: return .error
case .spaceAtEndOfRegexLiteral: return .error
case .spaceAtStartOfRegexLiteral: return .error
Expand Down
24 changes: 22 additions & 2 deletions Tests/SwiftParserTest/LexerTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,14 @@ class LexerTests: ParserTestCase {

func testEscapedIdentifiers() {
assertLexemes(
"`Hello` `World` `$`",
"`Hello` `World` `$` `with a space` `/not-an*operator+` `123`",
lexemes: [
LexemeSpec(.identifier, text: "`Hello`", trailing: " "),
LexemeSpec(.identifier, text: "`World`", trailing: " "),
LexemeSpec(.identifier, text: "`$`"),
LexemeSpec(.identifier, text: "`$`", trailing: " "),
LexemeSpec(.identifier, text: "`with a space`", trailing: " "),
LexemeSpec(.identifier, text: "`/not-an*operator+`", trailing: " "),
LexemeSpec(.identifier, text: "`123`"),
]
)
}
Expand Down Expand Up @@ -1173,6 +1176,23 @@ class LexerTests: ParserTestCase {
}
}

/// Lexes a backtick-delimited identifier whose body contains an invalid
/// UTF-8 byte and checks that the whole span is still produced as a single
/// identifier token carrying an `invalidUtf8` diagnostic.
func testInvalidUtf8_4() {
  // Layout: "`" (0x60), "A" (0x41), 0xfd (never a valid UTF-8 byte), "`" (0x60).
  let backtick: UInt8 = 0x60
  let letterA: UInt8 = 0x41
  let invalidByte: UInt8 = 0xfd
  let sourceBytes: [UInt8] = [backtick, letterA, invalidByte, backtick]

  lex(sourceBytes) { lexemes in
    // Two lexemes are expected; only the first one is inspected below.
    if lexemes.count != 2 {
      return XCTFail("Expected 2 lexemes, got \(lexemes.count)")
    }
    // The diagnostic offset is 2, the index of the invalid byte in the token text.
    let expectedError = TokenDiagnostic(.invalidUtf8, byteOffset: 2)
    assertRawBytesLexeme(
      lexemes[0],
      kind: .identifier,
      leadingTrivia: [],
      text: [0x60, 0x41, 0xfd, 0x60],
      error: expectedError
    )
  }
}

func testUTF16Surrogates1() {
// U+D800 <= (UTF16 surrogates code point) <= U+DFFF
let sourceBytes: [UInt8] = [0xED, 0xA0, 0x80] // The bytes represent the code point U+D800
Expand Down
8 changes: 2 additions & 6 deletions Tests/SwiftParserTest/translated/DollarIdentifierTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -214,15 +214,11 @@ final class DollarIdentifierTests: ParserTestCase {
assertParse(
"""
func escapedDollarAnd() {
1️⃣`$0` = 1
`$0` = 1
`$$` = 2
`$abc` = 3
}
""",
diagnostics: [
// FIXME: Bad diagnostic
DiagnosticSpec(message: "unexpected code in function")
]
"""
)
}

Expand Down
Loading

0 comments on commit 6c7fc5a

Please sign in to comment.