swiftlang · allevato · Sep 7, 2024 · grynspan · Jan 6, 2025 · allevato
@@ -2025,14 +2025,57 @@ extension Lexer.Cursor {
     // Check whether we have an identifier followed by another backtick, in which
     // case this is an escaped identifier.
     let identifierStart = self
-    if self.advance(if: { $0.isValidIdentifierStartCodePoint }) {
-      // Keep continuing the identifier.
-      self.advance(while: { $0.isValidIdentifierContinuationCodePoint })
 
-      // If we have the terminating "`", it's an escaped identifier.
-      if self.advance(matching: "`") {
-        return Lexer.Result(.identifier)
+    // Scan until we see either a closing backtick or the end of the line. Do
+    // additional validation for raw identifiers along the way; if we see
+    // characters that aren't allowed (prohibited whitespace or unprintable
+    // ASCII characters) or if the identifier is an operator, provide a more
+    // precise diagnostic and location, but otherwise keep trying to tokenize
+    // it as a raw identifier as long as we see the closing backtick because
+    // it more likely represents what the user was trying to do.
+    var hasNonOperatorCharacter = false
+    var hasNonWhitespaceCharacter = false
+    var isEmpty = true
+    var error: LexingDiagnostic? = nil
+    while true {
+      let ch = self.peek()
+      if ch == nil || ch == "`" || ch == "\n" || ch == "\r" {
+        break
+      }
+      let position = self
+      guard let scalar = self.advanceValidatingUTF8Character() else {
+        error = LexingDiagnostic(.invalidUtf8, position: position)
+        continue
+      }
+      if error == nil {
+        if scalar.isForbiddenRawIdentifierWhitespace {
+          error = LexingDiagnostic(.invalidWhitespaceInRawIdentifier, position: position)
+        } else if scalar == "\\" {
+          error = LexingDiagnostic(.invalidBackslashInRawIdentifier, position: position)
+        } else if scalar.isASCII && !scalar.isPrintableASCII {
+          error = LexingDiagnostic(.unprintableAsciiCharacter, position: position)
+        }
+      }
+      if !scalar.isPermittedRawIdentifierWhitespace {
+        hasNonWhitespaceCharacter = true
+      }
+      if (isEmpty && !scalar.isOperatorStartCodePoint) || !scalar.isOperatorContinuationCodePoint {
+        hasNonOperatorCharacter = true
+      }
+      isEmpty = false
+    }
+
+    // If we have the terminating "`", it's an escaped/raw identifier, unless
+    // it contained only operator characters or had other invalid elements.
+    if self.advance(matching: "`") {
+      if isEmpty {
+        error = LexingDiagnostic(.rawIdentifierCannotBeEmpty, position: quote)
+      } else if error == nil && !hasNonWhitespaceCharacter {
+        error = LexingDiagnostic(.rawIdentifierCannotBeAllWhitespace, position: quote)
+      } else if error == nil && !hasNonOperatorCharacter {
+        error = LexingDiagnostic(.rawIdentifierCannotBeOperator, position: quote)
       }
+      return Lexer.Result(.identifier, error: error)
     }
 
     // Special case; allow '`$`'.

@@ -93,6 +93,31 @@ extension Unicode.Scalar {
     return true
   }
 
+  var isForbiddenRawIdentifierWhitespace: Bool {
+    let c = self.value
+    // These ranges cover every code point with the Unicode `White_Space`
+    // property except U+0020 (SPACE); the lexer reports an error for any of
+    // them inside a raw identifier. U+200E/U+200F (LTR/RTL marks) are
+    // `Pattern_White_Space` but not `White_Space`, so they are absent here;
+    // with U+0020 they are the only whitespace allowed in a raw identifier.
+    return (c >= 0x0009 && c <= 0x000D) as Bool
+      || (c == 0x0085) as Bool
+      || (c == 0x00A0) as Bool
+      || (c == 0x1680) as Bool
+      || (c >= 0x2000 && c <= 0x200A) as Bool
+      || (c >= 0x2028 && c <= 0x2029) as Bool
+      || (c == 0x202F) as Bool
+      || (c == 0x205F) as Bool
+      || (c == 0x3000) as Bool
+  }
+
+  var isPermittedRawIdentifierWhitespace: Bool {
+    let c = self.value  // true only for the three whitespace-like code points below
+    return (c == 0x0020) as Bool  // U+0020 SPACE
+      || (c == 0x200E) as Bool  // U+200E LEFT-TO-RIGHT MARK
+      || (c == 0x200F) as Bool  // U+200F RIGHT-TO-LEFT MARK
+  }
+
   /// isOperatorStartCodePoint - Return true if the specified code point is a
   /// valid start of an operator.
   var isOperatorStartCodePoint: Bool {

@@ -71,11 +71,16 @@ public enum StaticTokenError: String, DiagnosticMessage {
   case expectedDigitInFloatLiteral = "expected a digit in floating point exponent"
   case expectedHexCodeInUnicodeEscape = #"expected hexadecimal code in \u{...} escape sequence"#
   case expectedHexDigitInHexLiteral = "expected hexadecimal digit (0-9, A-F) in integer literal"
+  case invalidBackslashInRawIdentifier = "a raw identifier cannot contain a backslash"
   case invalidCharacter = "invalid character in source file"
   case invalidEscapeSequenceInStringLiteral = "invalid escape sequence in literal"
   case invalidIdentifierStartCharacter = "an identifier cannot begin with this character"
   case invalidNumberOfHexDigitsInUnicodeEscape = #"\u{...} escape sequence expects between 1 and 8 hex digits"#
   case invalidUtf8 = "invalid UTF-8 found in source file"
+  case invalidWhitespaceInRawIdentifier = "invalid whitespace found in raw identifier"
+  case rawIdentifierCannotBeAllWhitespace = "a raw identifier cannot contain only whitespace characters"
+  case rawIdentifierCannotBeEmpty = "a raw identifier cannot be empty"
+  case rawIdentifierCannotBeOperator = "a raw identifier cannot contain only operator characters"
   case tokenDiagnosticOffsetOverflow =
     "the lexer discovered an error in this token but was not able to represent its offset due to overflow; please split the token"
   case sourceConflictMarker = "source control conflict marker in source file"
@@ -211,6 +216,7 @@ extension SwiftSyntax.TokenDiagnostic {
       // inside `ParseDiagnosticsGenerator` but fall back to an error message
       // here in case the error is not diagnosed.
       return InvalidIndentationInMultiLineStringLiteralError(kind: .insufficientIndentation, lines: 1)
+    case .invalidBackslashInRawIdentifier: return StaticTokenError.invalidBackslashInRawIdentifier
     case .invalidBinaryDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .binary(scalarAtErrorOffset))
     case .invalidCharacter: return StaticTokenError.invalidCharacter
     case .invalidDecimalDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .decimal(scalarAtErrorOffset))
@@ -223,9 +229,14 @@ extension SwiftSyntax.TokenDiagnostic {
     case .invalidNumberOfHexDigitsInUnicodeEscape: return StaticTokenError.invalidNumberOfHexDigitsInUnicodeEscape
     case .invalidOctalDigitInIntegerLiteral: return InvalidDigitInIntegerLiteral(kind: .octal(scalarAtErrorOffset))
     case .invalidUtf8: return StaticTokenError.invalidUtf8
+    case .invalidWhitespaceInRawIdentifier: return StaticTokenError.invalidWhitespaceInRawIdentifier
     case .multilineRegexClosingNotOnNewline: return StaticTokenError.multilineRegexClosingNotOnNewline
     case .nonBreakingSpace: return StaticTokenWarning.nonBreakingSpace
     case .nulCharacter: return StaticTokenWarning.nulCharacter
+    case .rawIdentifierCannotBeAllWhitespace: return StaticTokenError.rawIdentifierCannotBeAllWhitespace
+    case .rawIdentifierCannotBeEmpty: return StaticTokenError.rawIdentifierCannotBeEmpty
+    case .rawIdentifierCannotBeOperator:
+      return StaticTokenError.rawIdentifierCannotBeOperator
     case .sourceConflictMarker: return StaticTokenError.sourceConflictMarker
     case .spaceAtEndOfRegexLiteral: return StaticTokenError.spaceAtEndOfRegexLiteral
     case .spaceAtStartOfRegexLiteral: return StaticTokenError.spaceAtStartOfRegexLiteral

@@ -35,6 +35,7 @@ public struct TokenDiagnostic: Hashable, Sendable {
     case extraneousTrailingWhitespaceError
     case extraneousTrailingWhitespaceWarning
     case insufficientIndentationInMultilineStringLiteral
+    case invalidBackslashInRawIdentifier
     case invalidBinaryDigitInIntegerLiteral
     case invalidCharacter
     case invalidDecimalDigitInIntegerLiteral
@@ -46,9 +47,13 @@ public struct TokenDiagnostic: Hashable, Sendable {
     case invalidNumberOfHexDigitsInUnicodeEscape
     case invalidOctalDigitInIntegerLiteral
     case invalidUtf8
+    case invalidWhitespaceInRawIdentifier
     case multilineRegexClosingNotOnNewline
     case nonBreakingSpace
     case nulCharacter
+    case rawIdentifierCannotBeAllWhitespace
+    case rawIdentifierCannotBeEmpty
+    case rawIdentifierCannotBeOperator
     case sourceConflictMarker
     case spaceAtEndOfRegexLiteral
     case spaceAtStartOfRegexLiteral
@@ -74,6 +79,7 @@ public struct TokenDiagnostic: Hashable, Sendable {
       case .extraneousTrailingWhitespaceError: return .error
       case .extraneousTrailingWhitespaceWarning: return .warning
       case .insufficientIndentationInMultilineStringLiteral: return .error
+      case .invalidBackslashInRawIdentifier: return .error
       case .invalidBinaryDigitInIntegerLiteral: return .error
       case .invalidCharacter: return .error
       case .invalidDecimalDigitInIntegerLiteral: return .error
@@ -85,9 +91,13 @@ public struct TokenDiagnostic: Hashable, Sendable {
       case .invalidNumberOfHexDigitsInUnicodeEscape: return .error
       case .invalidOctalDigitInIntegerLiteral: return .error
       case .invalidUtf8: return .error
+      case .invalidWhitespaceInRawIdentifier: return .error
       case .multilineRegexClosingNotOnNewline: return .error
       case .nonBreakingSpace: return .warning
       case .nulCharacter: return .warning
+      case .rawIdentifierCannotBeAllWhitespace: return .error
+      case .rawIdentifierCannotBeEmpty: return .error
+      case .rawIdentifierCannotBeOperator: return .error
       case .sourceConflictMarker: return .error
       case .spaceAtEndOfRegexLiteral: return .error
       case .spaceAtStartOfRegexLiteral: return .error

@@ -76,11 +76,14 @@ class LexerTests: ParserTestCase {
 
   func testEscapedIdentifiers() {
     assertLexemes(
-      "`Hello` `World` `$`",
+      "`Hello` `World` `$` `with a space` `/not-an*operator+` `123`",
       lexemes: [
         LexemeSpec(.identifier, text: "`Hello`", trailing: " "),
         LexemeSpec(.identifier, text: "`World`", trailing: " "),
-        LexemeSpec(.identifier, text: "`$`"),
+        LexemeSpec(.identifier, text: "`$`", trailing: " "),
+        LexemeSpec(.identifier, text: "`with a space`", trailing: " "),  // interior U+0020 stays inside the token
+        LexemeSpec(.identifier, text: "`/not-an*operator+`", trailing: " "),  // operator characters mixed with letters
+        LexemeSpec(.identifier, text: "`123`"),  // begins with a digit
       ]
     )
   }
@@ -1173,6 +1176,23 @@ class LexerTests: ParserTestCase {
     }
   }
 
+  func testInvalidUtf8_4() {
+    let sourceBytes: [UInt8] = [0x60, 0x41, 0xfd, 0x60]  // 0x41 == "A", 0x60 == "`"; 0xfd is the invalid UTF-8 byte under test
+
+    lex(sourceBytes) { lexemes in
+      guard lexemes.count == 2 else {  // presumably the identifier plus an end-of-file lexeme; only lexemes[0] is checked
+        return XCTFail("Expected 2 lexemes, got \(lexemes.count)")
+      }
+      assertRawBytesLexeme(
+        lexemes[0],
+        kind: .identifier,  // the backtick-delimited bytes are still expected to lex as one identifier
+        leadingTrivia: [],
+        text: [0x60, 0x41, 0xfd, 0x60],  // the token spans all four bytes, invalid byte included
+        error: TokenDiagnostic(.invalidUtf8, byteOffset: 2)  // offset 2 is the index of 0xfd in sourceBytes
+      )
+    }
+  }
+
   func testUTF16Surrogates1() {
     // U+D800 <= (UTF16 surrogates code point) <= U+DFFF
     let sourceBytes: [UInt8] = [0xED, 0xA0, 0x80]  // The bytes represent the code point U+D800

@@ -214,15 +214,11 @@ final class DollarIdentifierTests: ParserTestCase {
     assertParse(
       """
       func escapedDollarAnd() {
-        1️⃣`$0` = 1
+        `$0` = 1
         `$$` = 2
         `$abc` = 3
       }
-      """,
-      diagnostics: [
-        // FIXME: Bad diagnostic
-        DiagnosticSpec(message: "unexpected code in function")
-      ]
+      """
     )
   }