diff --git a/Sources/SwiftParser/Lexer/Cursor.swift b/Sources/SwiftParser/Lexer/Cursor.swift index 28ed57f8947..4a9e595c581 100644 --- a/Sources/SwiftParser/Lexer/Cursor.swift +++ b/Sources/SwiftParser/Lexer/Cursor.swift @@ -2025,14 +2025,61 @@ extension Lexer.Cursor { // Check whether we have an identifier followed by another backtick, in which // case this is an escaped identifier. let identifierStart = self - if self.advance(if: { $0.isValidIdentifierStartCodePoint }) { - // Keep continuing the identifier. - self.advance(while: { $0.isValidIdentifierContinuationCodePoint }) - // If we have the terminating "`", it's an escaped identifier. - if self.advance(matching: "`") { - return Lexer.Result(.identifier) + // Scan until we see either a closing backtick or the end of the line. Do + // additional validation for raw identifiers along the way; if we see + // characters that aren't allowed (prohibited whitespace or unprintable + // ASCII characters) or if the identifier is an operator, provide a more + // precise diagnostic and location, but otherwise keep trying to tokenize + // it as a raw identifier as long as we see the closing backtick because + // it more likely represents what the user was trying to do. + var hasNonOperatorCharacter = false + var hasNonWhitespaceCharacter = false + var isEmpty = true + var error: LexingDiagnostic? 
= nil + while true { + let ch = self.peek() + if ch == nil || ch == "`" || ch == "\n" || ch == "\r" { + break + } + let position = self + guard let scalar = self.advanceValidatingUTF8Character() else { + // The invalid bytes are still part of the token, so the identifier + // is not empty; otherwise `.invalidUtf8` would be replaced by + // `.rawIdentifierCannotBeEmpty` after the closing backtick. + error = LexingDiagnostic(.invalidUtf8, position: position) + isEmpty = false + continue + } + if error == nil { + if scalar.isForbiddenRawIdentifierWhitespace { + error = LexingDiagnostic(.invalidWhitespaceInRawIdentifier, position: position) + } else if scalar == "\\" { + error = LexingDiagnostic(.invalidBackslashInRawIdentifier, position: position) + } else if scalar.isASCII && !scalar.isPrintableASCII { + error = LexingDiagnostic(.unprintableAsciiCharacter, position: position) + } + } + if !scalar.isPermittedRawIdentifierWhitespace { + hasNonWhitespaceCharacter = true + } + if (isEmpty && !scalar.isOperatorStartCodePoint) || !scalar.isOperatorContinuationCodePoint { + hasNonOperatorCharacter = true + } + isEmpty = false + } + + // If we have the terminating "`", it's an escaped/raw identifier, unless + // it contained only operator characters or had other invalid elements. + if self.advance(matching: "`") { + if isEmpty { + error = LexingDiagnostic(.rawIdentifierCannotBeEmpty, position: quote) + } else if error == nil && !hasNonWhitespaceCharacter { + error = LexingDiagnostic(.rawIdentifierCannotBeAllWhitespace, position: quote) + } else if error == nil && !hasNonOperatorCharacter { + error = LexingDiagnostic(.rawIdentifierCannotBeOperator, position: quote) } + return Lexer.Result(.identifier, error: error) } // Special case; allow '`$`'. 
diff --git a/Sources/SwiftParser/Lexer/UnicodeScalarExtensions.swift b/Sources/SwiftParser/Lexer/UnicodeScalarExtensions.swift index d963e6320be..acb2eb7d8f2 100644 --- a/Sources/SwiftParser/Lexer/UnicodeScalarExtensions.swift +++ b/Sources/SwiftParser/Lexer/UnicodeScalarExtensions.swift @@ -93,6 +93,31 @@ extension Unicode.Scalar { return true } + var isForbiddenRawIdentifierWhitespace: Bool { + let c = self.value + // Whether this scalar is whitespace that must not appear in a raw + // identifier: every code point with the Unicode `White_Space` property + // except U+0020 (SPACE). U+200E/U+200F (LTR/RTL marks) are + // `Pattern_White_Space` but not `White_Space`; they are not listed here + // and are reported by `isPermittedRawIdentifierWhitespace` instead. + return (c >= 0x0009 && c <= 0x000D) as Bool + || (c == 0x0085) as Bool + || (c == 0x00A0) as Bool + || (c == 0x1680) as Bool + || (c >= 0x2000 && c <= 0x200A) as Bool + || (c >= 0x2028 && c <= 0x2029) as Bool + || (c == 0x202F) as Bool + || (c == 0x205F) as Bool + || (c == 0x3000) as Bool + } + + var isPermittedRawIdentifierWhitespace: Bool { + let c = self.value + return (c == 0x0020) as Bool + || (c == 0x200E) as Bool + || (c == 0x200F) as Bool + } + /// isOperatorStartCodePoint - Return true if the specified code point is a /// valid start of an operator. 
var isOperatorStartCodePoint: Bool { diff --git a/Sources/SwiftParserDiagnostics/LexerDiagnosticMessages.swift b/Sources/SwiftParserDiagnostics/LexerDiagnosticMessages.swift index b01d7246f1c..73cd7a3c9e7 100644 --- a/Sources/SwiftParserDiagnostics/LexerDiagnosticMessages.swift +++ b/Sources/SwiftParserDiagnostics/LexerDiagnosticMessages.swift @@ -71,11 +71,16 @@ public enum StaticTokenError: String, DiagnosticMessage { case expectedDigitInFloatLiteral = "expected a digit in floating point exponent" case expectedHexCodeInUnicodeEscape = #"expected hexadecimal code in \u{...} escape sequence"# case expectedHexDigitInHexLiteral = "expected hexadecimal digit (0-9, A-F) in integer literal" + case invalidBackslashInRawIdentifier = "a raw identifier cannot contain a backslash" case invalidCharacter = "invalid character in source file" case invalidEscapeSequenceInStringLiteral = "invalid escape sequence in literal" case invalidIdentifierStartCharacter = "an identifier cannot begin with this character" case invalidNumberOfHexDigitsInUnicodeEscape = #"\u{...} escape sequence expects between 1 and 8 hex digits"# case invalidUtf8 = "invalid UTF-8 found in source file" + case invalidWhitespaceInRawIdentifier = "invalid whitespace found in raw identifier" + case rawIdentifierCannotBeAllWhitespace = "a raw identifier cannot contain only whitespace characters" + case rawIdentifierCannotBeEmpty = "a raw identifier cannot be empty" + case rawIdentifierCannotBeOperator = "a raw identifier cannot contain only operator characters" case tokenDiagnosticOffsetOverflow = "the lexer discovered an error in this token but was not able to represent its offset due to overflow; please split the token" case sourceConflictMarker = "source control conflict marker in source file" @@ -211,6 +216,7 @@ extension SwiftSyntax.TokenDiagnostic { // inside `ParseDiagnosticsGenerator` but fall back to an error message // here in case the error is not diagnosed. 
return InvalidIndentationInMultiLineStringLiteralError(kind: .insufficientIndentation, lines: 1) + case .invalidBackslashInRawIdentifier: return StaticTokenError.invalidBackslashInRawIdentifier case .invalidBinaryDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .binary(scalarAtErrorOffset)) case .invalidCharacter: return StaticTokenError.invalidCharacter case .invalidDecimalDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .decimal(scalarAtErrorOffset)) @@ -223,9 +229,14 @@ extension SwiftSyntax.TokenDiagnostic { case .invalidNumberOfHexDigitsInUnicodeEscape: return StaticTokenError.invalidNumberOfHexDigitsInUnicodeEscape case .invalidOctalDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .octal(scalarAtErrorOffset)) case .invalidUtf8: return StaticTokenError.invalidUtf8 + case .invalidWhitespaceInRawIdentifier: return StaticTokenError.invalidWhitespaceInRawIdentifier case .multilineRegexClosingNotOnNewline: return StaticTokenError.multilineRegexClosingNotOnNewline case .nonBreakingSpace: return StaticTokenWarning.nonBreakingSpace case .nulCharacter: return StaticTokenWarning.nulCharacter + case .rawIdentifierCannotBeAllWhitespace: return StaticTokenError.rawIdentifierCannotBeAllWhitespace + case .rawIdentifierCannotBeEmpty: return StaticTokenError.rawIdentifierCannotBeEmpty + case .rawIdentifierCannotBeOperator: + return StaticTokenError.rawIdentifierCannotBeOperator case .sourceConflictMarker: return StaticTokenError.sourceConflictMarker case .spaceAtEndOfRegexLiteral: return StaticTokenError.spaceAtEndOfRegexLiteral case .spaceAtStartOfRegexLiteral: return StaticTokenError.spaceAtStartOfRegexLiteral diff --git a/Sources/SwiftSyntax/TokenDiagnostic.swift b/Sources/SwiftSyntax/TokenDiagnostic.swift index a65f2f2f8df..edeb2832053 100644 --- a/Sources/SwiftSyntax/TokenDiagnostic.swift +++ b/Sources/SwiftSyntax/TokenDiagnostic.swift @@ -35,6 +35,7 @@ public struct TokenDiagnostic: Hashable, Sendable { case 
extraneousTrailingWhitespaceError case extraneousTrailingWhitespaceWarning case insufficientIndentationInMultilineStringLiteral + case invalidBackslashInRawIdentifier case invalidBinaryDigitInIntegerLiteral case invalidCharacter case invalidDecimalDigitInIntegerLiteral @@ -46,9 +47,13 @@ public struct TokenDiagnostic: Hashable, Sendable { case invalidNumberOfHexDigitsInUnicodeEscape case invalidOctalDigitInIntegerLiteral case invalidUtf8 + case invalidWhitespaceInRawIdentifier case multilineRegexClosingNotOnNewline case nonBreakingSpace case nulCharacter + case rawIdentifierCannotBeAllWhitespace + case rawIdentifierCannotBeEmpty + case rawIdentifierCannotBeOperator case sourceConflictMarker case spaceAtEndOfRegexLiteral case spaceAtStartOfRegexLiteral @@ -74,6 +79,7 @@ public struct TokenDiagnostic: Hashable, Sendable { case .extraneousTrailingWhitespaceError: return .error case .extraneousTrailingWhitespaceWarning: return .warning case .insufficientIndentationInMultilineStringLiteral: return .error + case .invalidBackslashInRawIdentifier: return .error case .invalidBinaryDigitInIntegerLiteral: return .error case .invalidCharacter: return .error case .invalidDecimalDigitInIntegerLiteral: return .error @@ -85,9 +91,13 @@ public struct TokenDiagnostic: Hashable, Sendable { case .invalidNumberOfHexDigitsInUnicodeEscape: return .error case .invalidOctalDigitInIntegerLiteral: return .error case .invalidUtf8: return .error + case .invalidWhitespaceInRawIdentifier: return .error case .multilineRegexClosingNotOnNewline: return .error case .nonBreakingSpace: return .warning case .nulCharacter: return .warning + case .rawIdentifierCannotBeAllWhitespace: return .error + case .rawIdentifierCannotBeEmpty: return .error + case .rawIdentifierCannotBeOperator: return .error case .sourceConflictMarker: return .error case .spaceAtEndOfRegexLiteral: return .error case .spaceAtStartOfRegexLiteral: return .error diff --git a/Tests/SwiftParserTest/LexerTests.swift 
b/Tests/SwiftParserTest/LexerTests.swift index dd9e61bcf12..a175bc0e588 100644 --- a/Tests/SwiftParserTest/LexerTests.swift +++ b/Tests/SwiftParserTest/LexerTests.swift @@ -76,11 +76,14 @@ class LexerTests: ParserTestCase { func testEscapedIdentifiers() { assertLexemes( - "`Hello` `World` `$`", + "`Hello` `World` `$` `with a space` `/not-an*operator+` `123`", lexemes: [ LexemeSpec(.identifier, text: "`Hello`", trailing: " "), LexemeSpec(.identifier, text: "`World`", trailing: " "), - LexemeSpec(.identifier, text: "`$`"), + LexemeSpec(.identifier, text: "`$`", trailing: " "), + LexemeSpec(.identifier, text: "`with a space`", trailing: " "), + LexemeSpec(.identifier, text: "`/not-an*operator+`", trailing: " "), + LexemeSpec(.identifier, text: "`123`"), ] ) } @@ -1173,6 +1176,23 @@ class LexerTests: ParserTestCase { } } + func testInvalidUtf8_4() { + let sourceBytes: [UInt8] = [0x60, 0x41, 0xfd, 0x60] // 0x41 == "A", 0x60 == "`" + + lex(sourceBytes) { lexemes in + guard lexemes.count == 2 else { + return XCTFail("Expected 2 lexemes, got \(lexemes.count)") + } + assertRawBytesLexeme( + lexemes[0], + kind: .identifier, + leadingTrivia: [], + text: [0x60, 0x41, 0xfd, 0x60], + error: TokenDiagnostic(.invalidUtf8, byteOffset: 2) + ) + } + } + func testUTF16Surrogates1() { // U+D800 <= (UTF16 surrogates code point) <= U+DFFF let sourceBytes: [UInt8] = [0xED, 0xA0, 0x80] // The bytes represent the code point U+D800 diff --git a/Tests/SwiftParserTest/translated/DollarIdentifierTests.swift b/Tests/SwiftParserTest/translated/DollarIdentifierTests.swift index de333fb6f96..1ca12ee226c 100644 --- a/Tests/SwiftParserTest/translated/DollarIdentifierTests.swift +++ b/Tests/SwiftParserTest/translated/DollarIdentifierTests.swift @@ -214,15 +214,11 @@ final class DollarIdentifierTests: ParserTestCase { assertParse( """ func escapedDollarAnd() { - 1️⃣`$0` = 1 + `$0` = 1 `$$` = 2 `$abc` = 3 } - """, - diagnostics: [ - // FIXME: Bad diagnostic - DiagnosticSpec(message: "unexpected code in 
function") - ] + """ ) } diff --git a/Tests/SwiftParserTest/translated/EscapedIdentifiersTests.swift b/Tests/SwiftParserTest/translated/EscapedIdentifiersTests.swift index d64cdec77b7..939beb8b972 100644 --- a/Tests/SwiftParserTest/translated/EscapedIdentifiersTests.swift +++ b/Tests/SwiftParserTest/translated/EscapedIdentifiersTests.swift @@ -99,4 +99,216 @@ final class EscapedIdentifiersTests: ParserTestCase { ) } + func testEscapedIdentifiers11() { + assertParse( + """ + func `method with space and .:/`() {} + `method with space and .:/`() + + class `Class with space and .:/` {} + var `var with space and .:/` = `Class with space and .:/`.self + + enum `Enum with space and .:/` { + case `space cases` + case `case with payload`(`some label`: `Class with space and .:/`) + } + let `enum value`: `Enum with space and .:/` = + .`case with payload`(`some label`: `var with space and .:/`) + + struct `Escaped Type` {} + func `escaped function`(`escaped label` `escaped arg`: `Escaped Type`) {} + `escaped function`(`escaped label`: `Escaped Type`()) + let `escaped reference` = `escaped function`(`escaped label`:) + `escaped reference`(`Escaped Type`()) + """ + ) + } + + func testEscapedIdentifiers12() { + assertParse( + """ + func `+ start with operator`() {} + func `end with operator +`() {} + func ` + `() {} + """ + ) + } + + func testEscapedIdentifiers13() { + assertParse( + """ + func `// not a comment`() {} + func `/* also not a comment */`() {} + func `func dontDoThis() {}`() {} + """ + ) + } + + func testEscapedIdentifiers14() { + assertParse( + """ + let `@atSign` = 0 + let `#octothorpe` = 0 + """ + ) + } + + func testEscapedIdentifiers15() { + assertParse( + """ + @propertyWrapper + struct `@PoorlyNamedWrapper`<`The Value`> { + var wrappedValue: `The Value` + } + struct WithWrappedProperty { + @`@PoorlyNamedWrapper` var x: Int + } + """ + ) + } + + func testEscapedIdentifiers16() { + assertParse( + """ + let 1️⃣`+` = 0 + let 2️⃣`^*^` = 0 + let 3️⃣`.` = 0 + let 
4️⃣`?` = 0 + func 5️⃣`+`(lhs: Int, rhs: Int) -> Int + """, + diagnostics: [ + DiagnosticSpec( + locationMarker: "1️⃣", + message: "a raw identifier cannot contain only operator characters" + ), + DiagnosticSpec( + locationMarker: "2️⃣", + message: "a raw identifier cannot contain only operator characters" + ), + DiagnosticSpec( + locationMarker: "3️⃣", + message: "a raw identifier cannot contain only operator characters" + ), + DiagnosticSpec( + locationMarker: "4️⃣", + message: "a raw identifier cannot contain only operator characters" + ), + DiagnosticSpec( + locationMarker: "5️⃣", + message: "a raw identifier cannot contain only operator characters" + ), + ] + ) + } + + func testEscapedIdentifiers17() { + assertParse( + """ + 1️⃣`multiline is + not allowed` = 5 + """, + diagnostics: [ + DiagnosticSpec( + locationMarker: "1️⃣", + message: "extraneous code at top level" + ) + ] + ) + } + + func testEscapedIdentifiers18() { + assertParse( + """ + `null1️⃣\u{0000}is not allowed` = 5 + `unprintable ascii2️⃣\u{007f}is not allowed` = 10 + """, + diagnostics: [ + DiagnosticSpec( + locationMarker: "1️⃣", + message: "unprintable ASCII character found in source file" + ), + DiagnosticSpec( + locationMarker: "2️⃣", + message: "unprintable ASCII character found in source file" + ), + ] + ) + } + + func testEscapedIdentifiers19() { + assertParse( + """ + 1️⃣`` = 5 + 2️⃣` ` = 5 + 3️⃣` ` = 5 + 4️⃣`\u{200E} \u{200F}` = 5 + """, + diagnostics: [ + DiagnosticSpec( + locationMarker: "1️⃣", + message: "a raw identifier cannot be empty" + ), + DiagnosticSpec( + locationMarker: "2️⃣", + message: "a raw identifier cannot contain only whitespace characters" + ), + DiagnosticSpec( + locationMarker: "3️⃣", + message: "a raw identifier cannot contain only whitespace characters" + ), + DiagnosticSpec( + locationMarker: "4️⃣", + message: "a raw identifier cannot contain only whitespace characters" + ), + ] + ) + } + + func testEscapedIdentifiers20() { + assertParse( + """ + `this space is ok 
but this1️⃣\u{00a0}one is not` = 5 + `neither is this2️⃣\u{2029}one` = 5 + `whitespace diagnostic3️⃣\u{0009}has precedence over nonprintable ASCII` = 5 + """, + diagnostics: [ + DiagnosticSpec( + locationMarker: "1️⃣", + message: "invalid whitespace found in raw identifier" + ), + DiagnosticSpec( + locationMarker: "2️⃣", + message: "invalid whitespace found in raw identifier" + ), + DiagnosticSpec( + locationMarker: "3️⃣", + message: "invalid whitespace found in raw identifier" + ), + ] + ) + } + + func testEscapedIdentifiers21() { + assertParse( + """ + `1️⃣\\starting` = 5 + `mid2️⃣\\dle` = 5 + `end3️⃣\\` = 5 + """, + diagnostics: [ + DiagnosticSpec( + locationMarker: "1️⃣", + message: "a raw identifier cannot contain a backslash" + ), + DiagnosticSpec( + locationMarker: "2️⃣", + message: "a raw identifier cannot contain a backslash" + ), + DiagnosticSpec( + locationMarker: "3️⃣", + message: "a raw identifier cannot contain a backslash" + ), + ] + ) + } }