Skip to content

Commit

Permalink
Parse raw identifiers.
Browse files Browse the repository at this point in the history
  • Loading branch information
allevato committed Oct 23, 2024
1 parent d145cb2 commit 65275bd
Show file tree
Hide file tree
Showing 7 changed files with 285 additions and 24 deletions.
61 changes: 45 additions & 16 deletions Sources/SwiftParser/Lexer/Cursor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -2025,24 +2025,53 @@ extension Lexer.Cursor {
// Check whether we have an identifier followed by another backtick, in which
// case this is an escaped identifier.
let identifierStart = self
if self.advance(if: { $0.isValidIdentifierStartCodePoint }) {
// Keep continuing the identifier.
self.advance(while: { $0.isValidIdentifierContinuationCodePoint })

// If we have the terminating "`", it's an escaped identifier.
if self.advance(matching: "`") {
return Lexer.Result(.identifier)
// Track some information while advancing so that we can more efficiently
// detect invalid identifiers later. Even if we end in a situation that is
// invalid (for example, it contains a backslash), we want to continue
// scanning until we reach a terminating backtick if possible because it
// provides better error recovery and more likely resembles what the user
// was trying to write.
var sawNonWhitespace = false
var sawNonOperator = false
var sawBackslash = false
var isFirstScalar = true
self.advance(while: {
guard $0.isValidWhenLexingRawIdentifier else {
return false
}
}

// Special case; allow '`$`'.
if quote.starts(with: "`$`".utf8) {
self = quote
let firstBacktickConsumed = self.advance(matching: "`")
let dollarConsumed = self.advance(matching: "$")
let secondBacktickConsumed = self.advance(matching: "`")
precondition(firstBacktickConsumed && dollarConsumed && secondBacktickConsumed)
return Lexer.Result(.identifier)
if isFirstScalar {
if !$0.isOperatorStartCodePoint {
sawNonOperator = true
}
isFirstScalar = false
} else if !$0.isOperatorContinuationCodePoint {
sawNonOperator = true
}
if !$0.properties.isWhitespace {
sawNonWhitespace = true
}
if $0 == "\\" {
sawBackslash = true
}
return true
})

// If we have the terminating "`", it's an escaped identifier. It is still
// diagnosed if it is empty, entirely whitespace, contains only operator
// characters, or contains a backslash.
let text = identifierStart.text(upTo: self)
if self.advance(matching: "`") {
var error: LexingDiagnostic? = nil
if text.isEmpty {
error = LexingDiagnostic(.rawIdentifierCannotBeEmpty, position: quote)
} else if !sawNonWhitespace {
error = LexingDiagnostic(.rawIdentifierCannotBeEntirelyWhitespace, position: quote)
} else if !sawNonOperator {
error = LexingDiagnostic(.rawIdentifierCannotBeOperator, position: quote)
} else if sawBackslash {
error = LexingDiagnostic(.rawIdentifierCannotContainBacklash, position: quote)
}
return Lexer.Result(.identifier, error: error)
}

// The backtick is punctuation.
Expand Down
16 changes: 16 additions & 0 deletions Sources/SwiftParser/Lexer/UnicodeScalarExtensions.swift
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,22 @@ extension Unicode.Scalar {
return true
}

/// Whether this code point may be consumed while scanning the contents of a
/// raw identifier.
///
/// Being accepted here does not imply that the character is ultimately
/// _valid_ inside a raw identifier. The scan is deliberately more permissive
/// than the final check so that better diagnostics and recovery are possible
/// in certain failing cases, such as a raw identifier that contains a
/// backslash or consists entirely of operator characters.
var isValidWhenLexingRawIdentifier: Bool {
  // Every non-ASCII scalar is accepted by the scan.
  guard self.value < 0x80 else {
    return true
  }
  // Within ASCII, accept only printable characters other than the backtick.
  return isPrintableASCII && UInt8(self.value) != "`"
}

/// isOperatorStartCodePoint - Return true if the specified code point is a
/// valid start of an operator.
var isOperatorStartCodePoint: Bool {
Expand Down
11 changes: 11 additions & 0 deletions Sources/SwiftParserDiagnostics/LexerDiagnosticMessages.swift
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,10 @@ public enum StaticTokenError: String, DiagnosticMessage {
case invalidIdentifierStartCharacter = "an identifier cannot begin with this character"
case invalidNumberOfHexDigitsInUnicodeEscape = #"\u{...} escape sequence expects between 1 and 8 hex digits"#
case invalidUtf8 = "invalid UTF-8 found in source file"
case rawIdentifierCannotBeEmpty = "a raw identifier cannot be empty"
case rawIdentifierCannotBeEntirelyWhitespace = "a raw identifier cannot be entirely whitespace"
case rawIdentifierCannotBeOperator = "a raw identifier cannot contain only operator characters"
case rawIdentifierCannotContainBacklash = "a raw identifier cannot contain backslashes"
case tokenDiagnosticOffsetOverflow =
"the lexer discovered an error in this token but was not able to represent its offset due to overflow; please split the token"
case sourceConflictMarker = "source control conflict marker in source file"
Expand Down Expand Up @@ -226,6 +230,13 @@ extension SwiftSyntax.TokenDiagnostic {
case .multilineRegexClosingNotOnNewline: return StaticTokenError.multilineRegexClosingNotOnNewline
case .nonBreakingSpace: return StaticTokenWarning.nonBreakingSpace
case .nulCharacter: return StaticTokenWarning.nulCharacter
case .rawIdentifierCannotBeEmpty: return StaticTokenError.rawIdentifierCannotBeEmpty
case .rawIdentifierCannotBeEntirelyWhitespace:
return StaticTokenError.rawIdentifierCannotBeEntirelyWhitespace
case .rawIdentifierCannotBeOperator:
return StaticTokenError.rawIdentifierCannotBeOperator
case .rawIdentifierCannotContainBacklash:
return StaticTokenError.rawIdentifierCannotContainBacklash
case .sourceConflictMarker: return StaticTokenError.sourceConflictMarker
case .spaceAtEndOfRegexLiteral: return StaticTokenError.spaceAtEndOfRegexLiteral
case .spaceAtStartOfRegexLiteral: return StaticTokenError.spaceAtStartOfRegexLiteral
Expand Down
8 changes: 8 additions & 0 deletions Sources/SwiftSyntax/TokenDiagnostic.swift
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ public struct TokenDiagnostic: Hashable, Sendable {
case multilineRegexClosingNotOnNewline
case nonBreakingSpace
case nulCharacter
case rawIdentifierCannotBeEmpty
case rawIdentifierCannotBeEntirelyWhitespace
case rawIdentifierCannotBeOperator
case rawIdentifierCannotContainBacklash
case sourceConflictMarker
case spaceAtEndOfRegexLiteral
case spaceAtStartOfRegexLiteral
Expand Down Expand Up @@ -88,6 +92,10 @@ public struct TokenDiagnostic: Hashable, Sendable {
case .multilineRegexClosingNotOnNewline: return .error
case .nonBreakingSpace: return .warning
case .nulCharacter: return .warning
case .rawIdentifierCannotBeEmpty: return .error
case .rawIdentifierCannotBeEntirelyWhitespace: return .error
case .rawIdentifierCannotBeOperator: return .error
case .rawIdentifierCannotContainBacklash: return .error
case .sourceConflictMarker: return .error
case .spaceAtEndOfRegexLiteral: return .error
case .spaceAtStartOfRegexLiteral: return .error
Expand Down
7 changes: 5 additions & 2 deletions Tests/SwiftParserTest/LexerTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,14 @@ class LexerTests: ParserTestCase {

func testEscapedIdentifiers() {
assertLexemes(
"`Hello` `World` `$`",
"`Hello` `World` `$` `with a space` `/not-an*operator+` `123`",
lexemes: [
LexemeSpec(.identifier, text: "`Hello`", trailing: " "),
LexemeSpec(.identifier, text: "`World`", trailing: " "),
LexemeSpec(.identifier, text: "`$`"),
LexemeSpec(.identifier, text: "`$`", trailing: " "),
LexemeSpec(.identifier, text: "`with a space`", trailing: " "),
LexemeSpec(.identifier, text: "`/not-an*operator+`", trailing: " "),
LexemeSpec(.identifier, text: "`123`"),
]
)
}
Expand Down
8 changes: 2 additions & 6 deletions Tests/SwiftParserTest/translated/DollarIdentifierTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -214,15 +214,11 @@ final class DollarIdentifierTests: ParserTestCase {
assertParse(
"""
func escapedDollarAnd() {
1️⃣`$0` = 1
`$0` = 1
`$$` = 2
`$abc` = 3
}
""",
diagnostics: [
// FIXME: Bad diagnostic
DiagnosticSpec(message: "unexpected code in function")
]
"""
)
}

Expand Down
198 changes: 198 additions & 0 deletions Tests/SwiftParserTest/translated/EscapedIdentifiersTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -99,4 +99,202 @@ final class EscapedIdentifiersTests: ParserTestCase {
)
}

// A raw identifier containing spaces and the punctuation `.:/` used as a
// function name, both at the declaration and at the call site. No
// `diagnostics:` argument is passed to `assertParse`.
func testEscapedIdentifiers11() {
assertParse(
"""
func `method with space and .:/`() {}
`method with space and .:/`()
"""
)
}

// Raw identifiers containing spaces and `.:/` used as a class name, as a
// variable name, and as the base of a `.self` expression. No `diagnostics:`
// argument is passed to `assertParse`.
func testEscapedIdentifiers12() {
assertParse(
"""
class `Class with space and .:/` {}
var `var with space and .:/` = `Class with space and .:/`.self
"""
)
}

// Raw identifiers used as an enum name, as case names, and as the label and
// type of an associated value. No `diagnostics:` argument is passed to
// `assertParse`.
func testEscapedIdentifiers13() {
assertParse(
"""
enum `enum with space and .:/` {
case `space cases`
case `case with payload`(`some label`: `Class with space and .:/`)
}
"""
)
}

// Raw identifiers used as a typealias name and as a function name that begins
// with an operator character (`+`) but also contains non-operator text. No
// `diagnostics:` argument is passed to `assertParse`.
func testEscapedIdentifiers14() {
assertParse(
"""
typealias `Typealias with space and .:/` = Int
func `+ start with operator`() {}
"""
)
}

// Raw identifiers with spaces in several positions at once: a struct name,
// a function name, an argument label and parameter name, a call with a
// labeled argument, and a function reference that names its label
// (`(label:)`). No `diagnostics:` argument is passed to `assertParse`.
func testEscapedIdentifiers15() {
assertParse(
"""
struct `Escaped Type` {}
func `escaped function`(`escaped label` `escaped arg`: `Escaped Type`) {}
`escaped function`(`escaped label`: `Escaped Type`())
let `escaped reference` = `escaped function`(`escaped label`:)
`escaped reference`(`Escaped Type`())
"""
)
}

// Raw identifiers that start with `@` or `#`, used as `let` binding names.
// No `diagnostics:` argument is passed to `assertParse`.
func testEscapedIdentifiers16() {
assertParse(
"""
let `@atSign` = 0
let `#octothorpe` = 0
"""
)
}

// A property wrapper whose raw-identifier name itself starts with `@`, with a
// generic parameter named by a raw identifier containing a space. The wrapper
// is then applied as an attribute, giving `@` followed by a backticked name.
// No `diagnostics:` argument is passed to `assertParse`.
func testEscapedIdentifiers17() {
assertParse(
"""
@propertyWrapper
struct `@PoorlyNamedWrapper`<`The Value`> {
var wrappedValue: `The Value`
}
struct WithWrappedProperty {
@`@PoorlyNamedWrapper` var x: Int
}
"""
)
}

// Backticked names made up only of operator characters (`+`, `^*^`, `.`, `?`),
// in `let` bindings and in a `func` declaration. Each of the five markers
// expects the "cannot contain only operator characters" diagnostic.
func testEscapedIdentifiers18() {
assertParse(
"""
let 1️⃣`+` = 0
let 2️⃣`^*^` = 0
let 3️⃣`.` = 0
let 4️⃣`?` = 0
func 5️⃣`+`(lhs: Int, rhs: Int) -> Int
""",
diagnostics: [
DiagnosticSpec(
locationMarker: "1️⃣",
message: "a raw identifier cannot contain only operator characters"
),
DiagnosticSpec(
locationMarker: "2️⃣",
message: "a raw identifier cannot contain only operator characters"
),
DiagnosticSpec(
locationMarker: "3️⃣",
message: "a raw identifier cannot contain only operator characters"
),
DiagnosticSpec(
locationMarker: "4️⃣",
message: "a raw identifier cannot contain only operator characters"
),
DiagnosticSpec(
locationMarker: "5️⃣",
message: "a raw identifier cannot contain only operator characters"
),
]
)
}

// Backticked text that spans a line break. The only expected diagnostic is
// "extraneous code at top level", located at the opening backtick, rather
// than a raw-identifier-specific message.
func testEscapedIdentifiers19() {
assertParse(
"""
1️⃣`multiline is
not allowed` = 5
""",
diagnostics: [
DiagnosticSpec(
locationMarker: "1️⃣",
message: "extraneous code at top level"
)
]
)
}

// Backticked text containing U+0000 and U+007F. The input is a regular (not
// raw) string literal, so the `\u{...}` sequences are string escapes that
// place those scalars in the parsed source. Only one diagnostic is expected,
// "extraneous code at top level", at the first opening backtick.
func testEscapedIdentifiers20() {
assertParse(
"""
1️⃣`null\u{0000}is not allowed` = 5
`unprintable ascii\u{007f}is not allowed` = 10
""",
diagnostics: [
DiagnosticSpec(
locationMarker: "1️⃣",
message: "extraneous code at top level"
)
]
)
}

// Two adjacent backticks with nothing between them; expects the "cannot be
// empty" diagnostic at the opening backtick.
func testEscapedIdentifiers21() {
assertParse(
"""
1️⃣`` = 5
""",
diagnostics: [
DiagnosticSpec(
locationMarker: "1️⃣",
message: "a raw identifier cannot be empty"
)
]
)
}

// Backticked names that contain only whitespace; each of the three markers
// expects the "cannot be entirely whitespace" diagnostic. The third case
// uses the `\u{2000}` string escape, i.e. the scalar U+2000.
// NOTE(review): the first two cases read identically here — presumably they
// differ in the kind or amount of whitespace; confirm against the original.
func testEscapedIdentifiers22() {
assertParse(
"""
1️⃣` ` = 5
2️⃣` ` = 5
3️⃣`\u{2000}` = 5
""",
diagnostics: [
DiagnosticSpec(
locationMarker: "1️⃣",
message: "a raw identifier cannot be entirely whitespace"
),
DiagnosticSpec(
locationMarker: "2️⃣",
message: "a raw identifier cannot be entirely whitespace"
),
DiagnosticSpec(
locationMarker: "3️⃣",
message: "a raw identifier cannot be entirely whitespace"
),
]
)
}

// Backticked names containing backslashes: in the middle of the name, as the
// only character, and doubled. The input is a raw string literal (`#"""`), so
// the backslashes reach the parser literally instead of acting as string
// escapes. Each marker expects the "cannot contain backslashes" diagnostic.
func testEscapedIdentifiers23() {
assertParse(
#"""
1️⃣`hello\there` = 5
2️⃣`\` = 5
3️⃣`back\\slash` = 5
"""#,
diagnostics: [
DiagnosticSpec(
locationMarker: "1️⃣",
message: "a raw identifier cannot contain backslashes"
),
DiagnosticSpec(
locationMarker: "2️⃣",
message: "a raw identifier cannot contain backslashes"
),
DiagnosticSpec(
locationMarker: "3️⃣",
message: "a raw identifier cannot contain backslashes"
),
]
)
}
}

0 comments on commit 65275bd

Please sign in to comment.