Parse raw identifiers.

swiftlang · Oct 23, 2024 · 65275bd · 65275bd
1 parent d145cb2
commit 65275bd
Show file tree

Hide file tree

Showing 7 changed files with 285 additions and 24 deletions.
diff --git a/Sources/SwiftParser/Lexer/Cursor.swift b/Sources/SwiftParser/Lexer/Cursor.swift
@@ -2025,24 +2025,53 @@ extension Lexer.Cursor {
     // Check whether we have an identifier followed by another backtick, in which
     // case this is an escaped identifier.
     let identifierStart = self
-    if self.advance(if: { $0.isValidIdentifierStartCodePoint }) {
-      // Keep continuing the identifier.
-      self.advance(while: { $0.isValidIdentifierContinuationCodePoint })
 
-      // If we have the terminating "`", it's an escaped identifier.
-      if self.advance(matching: "`") {
-        return Lexer.Result(.identifier)
+    // Track some information while advancing so that we can more efficiently
+    // detect invalid identifiers later. Even if we end in a situation that is
+    // invalid (for example, it contains a backslash), we want to continue
+    // scanning until we reach a terminating backtick if possible because it
+    // provides better error recovery and more likely resembles what the user
+    // was trying to write.
+    var sawNonWhitespace = false
+    var sawNonOperator = false
+    var sawBackslash = false
+    var isFirstScalar = true
+    self.advance(while: {
+      guard $0.isValidWhenLexingRawIdentifier else {
+        return false
       }
-    }
-
-    // Special case; allow '`$`'.
-    if quote.starts(with: "`$`".utf8) {
-      self = quote
-      let firstBacktickConsumed = self.advance(matching: "`")
-      let dollarConsumed = self.advance(matching: "$")
-      let secondBacktickConsumed = self.advance(matching: "`")
-      precondition(firstBacktickConsumed && dollarConsumed && secondBacktickConsumed)
-      return Lexer.Result(.identifier)
+      if isFirstScalar {
+        if !$0.isOperatorStartCodePoint {
+          sawNonOperator = true
+        }
+        isFirstScalar = false
+      } else if !$0.isOperatorContinuationCodePoint {
+        sawNonOperator = true
+      }
+      if !$0.properties.isWhitespace {
+        sawNonWhitespace = true
+      }
+      if $0 == "\\" {
+        sawBackslash = true
+      }
+      return true
+    })
+
+    // If we have the terminating "`", it's an escaped identifier; attach an error
+    // if it is empty, all whitespace, all operator characters, or has a backslash.
+    let text = identifierStart.text(upTo: self)
+    if self.advance(matching: "`") {
+      var error: LexingDiagnostic? = nil
+      if text.isEmpty {
+        error = LexingDiagnostic(.rawIdentifierCannotBeEmpty, position: quote)
+      } else if !sawNonWhitespace {
+        error = LexingDiagnostic(.rawIdentifierCannotBeEntirelyWhitespace, position: quote)
+      } else if !sawNonOperator {
+        error = LexingDiagnostic(.rawIdentifierCannotBeOperator, position: quote)
+      } else if sawBackslash {
+        error = LexingDiagnostic(.rawIdentifierCannotContainBacklash, position: quote)
+      }
+      return Lexer.Result(.identifier, error: error)
     }
 
     // The backtick is punctuation.

diff --git a/Sources/SwiftParser/Lexer/UnicodeScalarExtensions.swift b/Sources/SwiftParser/Lexer/UnicodeScalarExtensions.swift
@@ -93,6 +93,22 @@ extension Unicode.Scalar {
     return true
   }
 
+  /// True if this code point is allowed when lexing a raw identifier.
+  ///
+  /// This does not mean that the character is necessarily _valid_ inside a
+  /// raw identifier. We scan more than we eventually accept so that we can
+  /// provide better diagnostics and recovery in certain failing cases, like
+  /// when a raw identifier contains a backslash or is entirely an operator.
+  var isValidWhenLexingRawIdentifier: Bool {
+    if self.value < 0x80 {
+      guard isPrintableASCII else {
+        return false
+      }
+      return UInt8(self.value) != "`"
+    }
+    return true
+  }
+
   /// isOperatorStartCodePoint - Return true if the specified code point is a
   /// valid start of an operator.
   var isOperatorStartCodePoint: Bool {

diff --git a/Sources/SwiftParserDiagnostics/LexerDiagnosticMessages.swift b/Sources/SwiftParserDiagnostics/LexerDiagnosticMessages.swift
@@ -76,6 +76,10 @@ public enum StaticTokenError: String, DiagnosticMessage {
   case invalidIdentifierStartCharacter = "an identifier cannot begin with this character"
   case invalidNumberOfHexDigitsInUnicodeEscape = #"\u{...} escape sequence expects between 1 and 8 hex digits"#
   case invalidUtf8 = "invalid UTF-8 found in source file"
+  case rawIdentifierCannotBeEmpty = "a raw identifier cannot be empty"
+  case rawIdentifierCannotBeEntirelyWhitespace = "a raw identifier cannot be entirely whitespace"
+  case rawIdentifierCannotBeOperator = "a raw identifier cannot contain only operator characters"
+  case rawIdentifierCannotContainBacklash = "a raw identifier cannot contain backslashes"
   case tokenDiagnosticOffsetOverflow =
     "the lexer discovered an error in this token but was not able to represent its offset due to overflow; please split the token"
   case sourceConflictMarker = "source control conflict marker in source file"
@@ -226,6 +230,13 @@ extension SwiftSyntax.TokenDiagnostic {
     case .multilineRegexClosingNotOnNewline: return StaticTokenError.multilineRegexClosingNotOnNewline
     case .nonBreakingSpace: return StaticTokenWarning.nonBreakingSpace
     case .nulCharacter: return StaticTokenWarning.nulCharacter
+    case .rawIdentifierCannotBeEmpty: return StaticTokenError.rawIdentifierCannotBeEmpty
+    case .rawIdentifierCannotBeEntirelyWhitespace:
+      return StaticTokenError.rawIdentifierCannotBeEntirelyWhitespace
+    case .rawIdentifierCannotBeOperator:
+      return StaticTokenError.rawIdentifierCannotBeOperator
+    case .rawIdentifierCannotContainBacklash:
+      return StaticTokenError.rawIdentifierCannotContainBacklash
     case .sourceConflictMarker: return StaticTokenError.sourceConflictMarker
     case .spaceAtEndOfRegexLiteral: return StaticTokenError.spaceAtEndOfRegexLiteral
     case .spaceAtStartOfRegexLiteral: return StaticTokenError.spaceAtStartOfRegexLiteral

diff --git a/Sources/SwiftSyntax/TokenDiagnostic.swift b/Sources/SwiftSyntax/TokenDiagnostic.swift
@@ -49,6 +49,10 @@ public struct TokenDiagnostic: Hashable, Sendable {
     case multilineRegexClosingNotOnNewline
     case nonBreakingSpace
     case nulCharacter
+    case rawIdentifierCannotBeEmpty
+    case rawIdentifierCannotBeEntirelyWhitespace
+    case rawIdentifierCannotBeOperator
+    case rawIdentifierCannotContainBacklash
     case sourceConflictMarker
     case spaceAtEndOfRegexLiteral
     case spaceAtStartOfRegexLiteral
@@ -88,6 +92,10 @@ public struct TokenDiagnostic: Hashable, Sendable {
       case .multilineRegexClosingNotOnNewline: return .error
       case .nonBreakingSpace: return .warning
       case .nulCharacter: return .warning
+      case .rawIdentifierCannotBeEmpty: return .error
+      case .rawIdentifierCannotBeEntirelyWhitespace: return .error
+      case .rawIdentifierCannotBeOperator: return .error
+      case .rawIdentifierCannotContainBacklash: return .error
       case .sourceConflictMarker: return .error
       case .spaceAtEndOfRegexLiteral: return .error
       case .spaceAtStartOfRegexLiteral: return .error

diff --git a/Tests/SwiftParserTest/LexerTests.swift b/Tests/SwiftParserTest/LexerTests.swift
@@ -76,11 +76,14 @@ class LexerTests: ParserTestCase {
 
   func testEscapedIdentifiers() {
     assertLexemes(
-      "`Hello` `World` `$`",
+      "`Hello` `World` `$` `with a space` `/not-an*operator+` `123`",
       lexemes: [
         LexemeSpec(.identifier, text: "`Hello`", trailing: " "),
         LexemeSpec(.identifier, text: "`World`", trailing: " "),
-        LexemeSpec(.identifier, text: "`$`"),
+        LexemeSpec(.identifier, text: "`$`", trailing: " "),
+        LexemeSpec(.identifier, text: "`with a space`", trailing: " "),
+        LexemeSpec(.identifier, text: "`/not-an*operator+`", trailing: " "),
+        LexemeSpec(.identifier, text: "`123`"),
       ]
     )
   }

diff --git a/Tests/SwiftParserTest/translated/DollarIdentifierTests.swift b/Tests/SwiftParserTest/translated/DollarIdentifierTests.swift
@@ -214,15 +214,11 @@ final class DollarIdentifierTests: ParserTestCase {
     assertParse(
       """
       func escapedDollarAnd() {
-        1️⃣`$0` = 1
+        `$0` = 1
         `$$` = 2
         `$abc` = 3
       }
-      """,
-      diagnostics: [
-        // FIXME: Bad diagnostic
-        DiagnosticSpec(message: "unexpected code in function")
-      ]
+      """
     )
   }
 

diff --git a/Tests/SwiftParserTest/translated/EscapedIdentifiersTests.swift b/Tests/SwiftParserTest/translated/EscapedIdentifiersTests.swift
@@ -99,4 +99,202 @@ final class EscapedIdentifiersTests: ParserTestCase {
     )
   }
 
+  func testEscapedIdentifiers11() {
+    assertParse(
+      """
+      func `method with space and .:/`() {}
+      `method with space and .:/`()
+      """
+    )
+  }
+
+  func testEscapedIdentifiers12() {
+    assertParse(
+      """
+      class `Class with space and .:/` {}
+      var `var with space and .:/` = `Class with space and .:/`.self
+      """
+    )
+  }
+
+  func testEscapedIdentifiers13() {
+    assertParse(
+      """
+      enum `enum with space and .:/` {
+        case `space cases`
+        case `case with payload`(`some label`: `Class with space and .:/`)
+      }
+      """
+    )
+  }
+
+  func testEscapedIdentifiers14() {
+    assertParse(
+      """
+      typealias `Typealias with space and .:/` = Int
+      func `+ start with operator`() {}
+      """
+    )
+  }
+
+  func testEscapedIdentifiers15() {
+    assertParse(
+      """
+      struct `Escaped Type` {}
+      func `escaped function`(`escaped label` `escaped arg`: `Escaped Type`) {}
+      `escaped function`(`escaped label`: `Escaped Type`())
+      let `escaped reference` = `escaped function`(`escaped label`:)
+      `escaped reference`(`Escaped Type`())
+      """
+    )
+  }
+
+  func testEscapedIdentifiers16() {
+    assertParse(
+      """
+      let `@atSign` = 0
+      let `#octothorpe` = 0
+      """
+    )
+  }
+
+  func testEscapedIdentifiers17() {
+    assertParse(
+      """
+      @propertyWrapper
+      struct `@PoorlyNamedWrapper`<`The Value`> {
+        var wrappedValue: `The Value`
+      }
+      struct WithWrappedProperty {
+        @`@PoorlyNamedWrapper` var x: Int
+      }
+      """
+    )
+  }
+
+  func testEscapedIdentifiers18() {
+    assertParse(
+      """
+      let 1️⃣`+` = 0
+      let 2️⃣`^*^` = 0
+      let 3️⃣`.` = 0
+      let 4️⃣`?` = 0
+      func 5️⃣`+`(lhs: Int, rhs: Int) -> Int
+      """,
+      diagnostics: [
+        DiagnosticSpec(
+          locationMarker: "1️⃣",
+          message: "a raw identifier cannot contain only operator characters"
+        ),
+        DiagnosticSpec(
+          locationMarker: "2️⃣",
+          message: "a raw identifier cannot contain only operator characters"
+        ),
+        DiagnosticSpec(
+          locationMarker: "3️⃣",
+          message: "a raw identifier cannot contain only operator characters"
+        ),
+        DiagnosticSpec(
+          locationMarker: "4️⃣",
+          message: "a raw identifier cannot contain only operator characters"
+        ),
+        DiagnosticSpec(
+          locationMarker: "5️⃣",
+          message: "a raw identifier cannot contain only operator characters"
+        ),
+      ]
+    )
+  }
+
+  func testEscapedIdentifiers19() {
+    assertParse(
+      """
+      1️⃣`multiline is
+      not allowed` = 5
+      """,
+      diagnostics: [
+        DiagnosticSpec(
+          locationMarker: "1️⃣",
+          message: "extraneous code at top level"
+        )
+      ]
+    )
+  }
+
+  func testEscapedIdentifiers20() {
+    assertParse(
+      """
+      1️⃣`null\u{0000}is not allowed` = 5
+      `unprintable ascii\u{007f}is not allowed` = 10
+      """,
+      diagnostics: [
+        DiagnosticSpec(
+          locationMarker: "1️⃣",
+          message: "extraneous code at top level"
+        )
+      ]
+    )
+  }
+
+  func testEscapedIdentifiers21() {
+    assertParse(
+      """
+      1️⃣`` = 5
+      """,
+      diagnostics: [
+        DiagnosticSpec(
+          locationMarker: "1️⃣",
+          message: "a raw identifier cannot be empty"
+        )
+      ]
+    )
+  }
+
+  func testEscapedIdentifiers22() {
+    assertParse(
+      """
+      1️⃣` ` = 5
+      2️⃣`  ` = 5
+      3️⃣`\u{2000}` = 5
+      """,
+      diagnostics: [
+        DiagnosticSpec(
+          locationMarker: "1️⃣",
+          message: "a raw identifier cannot be entirely whitespace"
+        ),
+        DiagnosticSpec(
+          locationMarker: "2️⃣",
+          message: "a raw identifier cannot be entirely whitespace"
+        ),
+        DiagnosticSpec(
+          locationMarker: "3️⃣",
+          message: "a raw identifier cannot be entirely whitespace"
+        ),
+      ]
+    )
+  }
+
+  func testEscapedIdentifiers23() {
+    assertParse(
+      #"""
+      1️⃣`hello\there` = 5
+      2️⃣`\` = 5
+      3️⃣`back\\slash` = 5
+      """#,
+      diagnostics: [
+        DiagnosticSpec(
+          locationMarker: "1️⃣",
+          message: "a raw identifier cannot contain backslashes"
+        ),
+        DiagnosticSpec(
+          locationMarker: "2️⃣",
+          message: "a raw identifier cannot contain backslashes"
+        ),
+        DiagnosticSpec(
+          locationMarker: "3️⃣",
+          message: "a raw identifier cannot contain backslashes"
+        ),
+      ]
+    )
+  }
 }