Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parse raw identifiers. #2857

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 49 additions & 6 deletions Sources/SwiftParser/Lexer/Cursor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -2025,14 +2025,57 @@ extension Lexer.Cursor {
// Check whether we have an identifier followed by another backtick, in which
// case this is an escaped identifier.
let identifierStart = self
if self.advance(if: { $0.isValidIdentifierStartCodePoint }) {
// Keep continuing the identifier.
self.advance(while: { $0.isValidIdentifierContinuationCodePoint })

// If we have the terminating "`", it's an escaped identifier.
if self.advance(matching: "`") {
return Lexer.Result(.identifier)
// Scan until we see either a closing backtick or the end of the line. Do
// additional validation for raw identifiers along the way; if we see
// characters that aren't allowed (prohibited whitespace or unprintable
// ASCII characters) or if the identifier is an operator, provide a more
// precise diagnostic and location, but otherwise keep trying to tokenize
// it as a raw identifier as long as we see the closing backtick because
// it more likely represents what the user was trying to do.
var hasNonOperatorCharacter = false
var hasNonWhitespaceCharacter = false
var isEmpty = true
var error: LexingDiagnostic? = nil
while true {
let ch = self.peek()
if ch == nil || ch == "`" || ch == "\n" || ch == "\r" {
break
}
let position = self
guard let scalar = self.advanceValidatingUTF8Character() else {
error = LexingDiagnostic(.invalidUtf8, position: position)
continue
}
if error == nil {
if scalar.isForbiddenRawIdentifierWhitespace {
error = LexingDiagnostic(.invalidWhitespaceInRawIdentifier, position: position)
} else if scalar == "\\" {
error = LexingDiagnostic(.invalidBackslashInRawIdentifier, position: position)
} else if scalar.isASCII && !scalar.isPrintableASCII {
error = LexingDiagnostic(.unprintableAsciiCharacter, position: position)
}
}
if !scalar.isPermittedRawIdentifierWhitespace {
hasNonWhitespaceCharacter = true
}
if (isEmpty && !scalar.isOperatorStartCodePoint) || !scalar.isOperatorContinuationCodePoint {
hasNonOperatorCharacter = true
}
isEmpty = false
}

// If we have the terminating "`", it's an escaped/raw identifier, unless
// it contained only operator characters or had other invalid elements.
if self.advance(matching: "`") {
if isEmpty {
error = LexingDiagnostic(.rawIdentifierCannotBeEmpty, position: quote)
} else if error == nil && !hasNonWhitespaceCharacter {
error = LexingDiagnostic(.rawIdentifierCannotBeAllWhitespace, position: quote)
} else if error == nil && !hasNonOperatorCharacter {
error = LexingDiagnostic(.rawIdentifierCannotBeOperator, position: quote)
}
return Lexer.Result(.identifier, error: error)
}

// Special case; allow '`$`'.
Expand Down
25 changes: 25 additions & 0 deletions Sources/SwiftParser/Lexer/UnicodeScalarExtensions.swift
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,31 @@ extension Unicode.Scalar {
return true
}

var isForbiddenRawIdentifierWhitespace: Bool {
let c = self.value
// This is the set of code points satisfying the `White_Space` property,
// excluding the set satisfying the `Pattern_White_Space` property, and
// excluding any other ASCII non-printables and Unicode separators. In
// other words, the only whitespace code points allowed in a raw
// identifier are U+0020, and U+200E/200F (LTR/RTL marks).
return (c >= 0x0009 && c <= 0x000D) as Bool
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be better to express this as `isWhitespace && !isPermittedRawIdentifierWhitespace`?

Copy link
Member Author

@allevato allevato Jan 6, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The parser explicitly avoids `Unicode.Scalar.Properties` because those properties are not necessarily stable between Unicode versions: they can change depending on the Unicode tables that are built into libswiftCore. We don't want the parser's behavior to vary with things like the version of the operating system the compiler is running on.

It's probably exceedingly rare that new whitespace will be added (or worse, that a non-whitespace code point would become whitespace or vice versa), but hardcoding the code points makes the behavior deterministic, and we can still choose to make breaking changes via new language modes if needed in the future.

|| (c == 0x0085) as Bool
|| (c == 0x00A0) as Bool
|| (c == 0x1680) as Bool
|| (c >= 0x2000 && c <= 0x200A) as Bool
|| (c >= 0x2028 && c <= 0x2029) as Bool
|| (c == 0x202F) as Bool
|| (c == 0x205F) as Bool
|| (c == 0x3000) as Bool
}

/// Whether this scalar is one of the whitespace-like code points that may
/// appear inside a raw identifier: U+0020 (space), U+200E (left-to-right
/// mark), or U+200F (right-to-left mark).
var isPermittedRawIdentifierWhitespace: Bool {
  // The code points are listed explicitly rather than looked up through
  // Unicode property tables.
  switch self.value {
  case 0x0020, 0x200E, 0x200F:
    return true
  default:
    return false
  }
}

/// isOperatorStartCodePoint - Return true if the specified code point is a
/// valid start of an operator.
var isOperatorStartCodePoint: Bool {
Expand Down
11 changes: 11 additions & 0 deletions Sources/SwiftParserDiagnostics/LexerDiagnosticMessages.swift
Original file line number Diff line number Diff line change
Expand Up @@ -71,11 +71,16 @@ public enum StaticTokenError: String, DiagnosticMessage {
case expectedDigitInFloatLiteral = "expected a digit in floating point exponent"
case expectedHexCodeInUnicodeEscape = #"expected hexadecimal code in \u{...} escape sequence"#
case expectedHexDigitInHexLiteral = "expected hexadecimal digit (0-9, A-F) in integer literal"
case invalidBackslashInRawIdentifier = "a raw identifier cannot contain a backslash"
case invalidCharacter = "invalid character in source file"
case invalidEscapeSequenceInStringLiteral = "invalid escape sequence in literal"
case invalidIdentifierStartCharacter = "an identifier cannot begin with this character"
case invalidNumberOfHexDigitsInUnicodeEscape = #"\u{...} escape sequence expects between 1 and 8 hex digits"#
case invalidUtf8 = "invalid UTF-8 found in source file"
case invalidWhitespaceInRawIdentifier = "invalid whitespace found in raw identifier"
case rawIdentifierCannotBeAllWhitespace = "a raw identifier cannot contain only whitespace characters"
case rawIdentifierCannotBeEmpty = "a raw identifier cannot be empty"
case rawIdentifierCannotBeOperator = "a raw identifier cannot contain only operator characters"
case tokenDiagnosticOffsetOverflow =
"the lexer discovered an error in this token but was not able to represent its offset due to overflow; please split the token"
case sourceConflictMarker = "source control conflict marker in source file"
Expand Down Expand Up @@ -211,6 +216,7 @@ extension SwiftSyntax.TokenDiagnostic {
// inside `ParseDiagnosticsGenerator` but fall back to an error message
// here in case the error is not diagnosed.
return InvalidIndentationInMultiLineStringLiteralError(kind: .insufficientIndentation, lines: 1)
case .invalidBackslashInRawIdentifier: return StaticTokenError.invalidBackslashInRawIdentifier
case .invalidBinaryDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .binary(scalarAtErrorOffset))
case .invalidCharacter: return StaticTokenError.invalidCharacter
case .invalidDecimalDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .decimal(scalarAtErrorOffset))
Expand All @@ -223,9 +229,14 @@ extension SwiftSyntax.TokenDiagnostic {
case .invalidNumberOfHexDigitsInUnicodeEscape: return StaticTokenError.invalidNumberOfHexDigitsInUnicodeEscape
case .invalidOctalDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .octal(scalarAtErrorOffset))
case .invalidUtf8: return StaticTokenError.invalidUtf8
case .invalidWhitespaceInRawIdentifier: return StaticTokenError.invalidWhitespaceInRawIdentifier
case .multilineRegexClosingNotOnNewline: return StaticTokenError.multilineRegexClosingNotOnNewline
case .nonBreakingSpace: return StaticTokenWarning.nonBreakingSpace
case .nulCharacter: return StaticTokenWarning.nulCharacter
case .rawIdentifierCannotBeAllWhitespace: return StaticTokenError.rawIdentifierCannotBeAllWhitespace
case .rawIdentifierCannotBeEmpty: return StaticTokenError.rawIdentifierCannotBeEmpty
case .rawIdentifierCannotBeOperator:
return StaticTokenError.rawIdentifierCannotBeOperator
case .sourceConflictMarker: return StaticTokenError.sourceConflictMarker
case .spaceAtEndOfRegexLiteral: return StaticTokenError.spaceAtEndOfRegexLiteral
case .spaceAtStartOfRegexLiteral: return StaticTokenError.spaceAtStartOfRegexLiteral
Expand Down
10 changes: 10 additions & 0 deletions Sources/SwiftSyntax/TokenDiagnostic.swift
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ public struct TokenDiagnostic: Hashable, Sendable {
case extraneousTrailingWhitespaceError
case extraneousTrailingWhitespaceWarning
case insufficientIndentationInMultilineStringLiteral
case invalidBackslashInRawIdentifier
case invalidBinaryDigitInIntegerLiteral
case invalidCharacter
case invalidDecimalDigitInIntegerLiteral
Expand All @@ -46,9 +47,13 @@ public struct TokenDiagnostic: Hashable, Sendable {
case invalidNumberOfHexDigitsInUnicodeEscape
case invalidOctalDigitInIntegerLiteral
case invalidUtf8
case invalidWhitespaceInRawIdentifier
case multilineRegexClosingNotOnNewline
case nonBreakingSpace
case nulCharacter
case rawIdentifierCannotBeAllWhitespace
case rawIdentifierCannotBeEmpty
case rawIdentifierCannotBeOperator
case sourceConflictMarker
case spaceAtEndOfRegexLiteral
case spaceAtStartOfRegexLiteral
Expand All @@ -74,6 +79,7 @@ public struct TokenDiagnostic: Hashable, Sendable {
case .extraneousTrailingWhitespaceError: return .error
case .extraneousTrailingWhitespaceWarning: return .warning
case .insufficientIndentationInMultilineStringLiteral: return .error
case .invalidBackslashInRawIdentifier: return .error
case .invalidBinaryDigitInIntegerLiteral: return .error
case .invalidCharacter: return .error
case .invalidDecimalDigitInIntegerLiteral: return .error
Expand All @@ -85,9 +91,13 @@ public struct TokenDiagnostic: Hashable, Sendable {
case .invalidNumberOfHexDigitsInUnicodeEscape: return .error
case .invalidOctalDigitInIntegerLiteral: return .error
case .invalidUtf8: return .error
case .invalidWhitespaceInRawIdentifier: return .error
case .multilineRegexClosingNotOnNewline: return .error
case .nonBreakingSpace: return .warning
case .nulCharacter: return .warning
case .rawIdentifierCannotBeAllWhitespace: return .error
case .rawIdentifierCannotBeEmpty: return .error
case .rawIdentifierCannotBeOperator: return .error
case .sourceConflictMarker: return .error
case .spaceAtEndOfRegexLiteral: return .error
case .spaceAtStartOfRegexLiteral: return .error
Expand Down
24 changes: 22 additions & 2 deletions Tests/SwiftParserTest/LexerTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,14 @@ class LexerTests: ParserTestCase {

func testEscapedIdentifiers() {
assertLexemes(
"`Hello` `World` `$`",
"`Hello` `World` `$` `with a space` `/not-an*operator+` `123`",
lexemes: [
LexemeSpec(.identifier, text: "`Hello`", trailing: " "),
LexemeSpec(.identifier, text: "`World`", trailing: " "),
LexemeSpec(.identifier, text: "`$`"),
LexemeSpec(.identifier, text: "`$`", trailing: " "),
LexemeSpec(.identifier, text: "`with a space`", trailing: " "),
LexemeSpec(.identifier, text: "`/not-an*operator+`", trailing: " "),
LexemeSpec(.identifier, text: "`123`"),
]
)
}
Expand Down Expand Up @@ -1173,6 +1176,23 @@ class LexerTests: ParserTestCase {
}
}

/// Lexes a backtick-delimited identifier whose body contains an invalid
/// UTF-8 byte and checks that the whole span is still produced as a single
/// identifier token carrying an `invalidUtf8` diagnostic at the bad byte.
func testInvalidUtf8_4() {
  // Bytes: "`" (0x60), "A" (0x41), 0xfd (not valid UTF-8), "`" (0x60).
  let sourceBytes: [UInt8] = [0x60, 0x41, 0xfd, 0x60]

  lex(sourceBytes) { tokens in
    // Two lexemes are expected; only the first one is inspected below.
    if tokens.count != 2 {
      XCTFail("Expected 2 lexemes, got \(tokens.count)")
      return
    }

    // The diagnostic offset (2) is the index of the 0xfd byte in the token.
    assertRawBytesLexeme(
      tokens[0],
      kind: .identifier,
      leadingTrivia: [],
      text: [0x60, 0x41, 0xfd, 0x60],
      error: TokenDiagnostic(.invalidUtf8, byteOffset: 2)
    )
  }
}

func testUTF16Surrogates1() {
// U+D800 <= (UTF16 surrogates code point) <= U+DFFF
let sourceBytes: [UInt8] = [0xED, 0xA0, 0x80] // The bytes represent the code point U+D800
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -214,15 +214,11 @@ final class DollarIdentifierTests: ParserTestCase {
assertParse(
"""
func escapedDollarAnd() {
1️⃣`$0` = 1
`$0` = 1
`$$` = 2
`$abc` = 3
}
""",
diagnostics: [
// FIXME: Bad diagnostic
DiagnosticSpec(message: "unexpected code in function")
]
"""
)
}

Expand Down
Loading