diff --git a/Benchmarks/Benchmarks/MyBenchmark/MyBenchmark.swift b/Benchmarks/Benchmarks/MyBenchmark/MyBenchmark.swift index 8e32dc7..c334ea2 100644 --- a/Benchmarks/Benchmarks/MyBenchmark/MyBenchmark.swift +++ b/Benchmarks/Benchmarks/MyBenchmark/MyBenchmark.swift @@ -13,7 +13,7 @@ extension TestSink: TokenSink { private func runBench(_ name: String, configuration conf: Benchmark.Configuration) { // swift-format-ignore: NeverUseForceTry, NeverForceUnwrap - let html = try! String(contentsOf: Bundle.module.url(forResource: name, withExtension: "html")!) + let html = try! String(contentsOf: Bundle.module.url(forResource: name, withExtension: "html")!).unicodeScalars let input = Deque(consume html) Benchmark(name, configuration: conf) { benchmark in for _ in benchmark.scaledIterations { diff --git a/Sources/Tokenizer/CharRefTokenizer.swift b/Sources/Tokenizer/CharRefTokenizer.swift index fc2b77c..6c762fb 100644 --- a/Sources/Tokenizer/CharRefTokenizer.swift +++ b/Sources/Tokenizer/CharRefTokenizer.swift @@ -32,7 +32,7 @@ struct CharRefTokenizer { self.isInAttr = isInAttr } - mutating func tokenize(tokenizer: inout Tokenizer, input: inout Deque) -> [Unicode.Scalar]? { + mutating func tokenize(tokenizer: inout Tokenizer, input: inout Deque) -> [Unicode.Scalar]? { repeat { switch self.step(tokenizer: &tokenizer, input: &input) { case .done(let scalars): return scalars @@ -42,10 +42,10 @@ struct CharRefTokenizer { } while true } - private mutating func step(tokenizer: inout Tokenizer, input: inout Deque) -> CharRefProcessResult { + private mutating func step(tokenizer: inout Tokenizer, input: inout Deque) -> CharRefProcessResult { switch self.state { case .initial: - switch tokenizer.peek(input)?.firstScalar { + switch tokenizer.peek(input) { case ("0"..."9")?, ("A"..."Z")?, ("a"..."z")?: self.state = .named return .progress @@ -58,14 +58,14 @@ struct CharRefTokenizer { case .named: guard let c = tokenizer.peek(input) else { guard let (endIndex, chars) = lastMatch else { - input.prepend(contentsOf: self.nameBuffer) + input.prepend(contentsOf: self.nameBuffer.unicodeScalars) return .doneNone } self.state = .namedEnd(endIndex: endIndex, replaceChars: chars) return .progress } tokenizer.discardChar(&input) - self.nameBuffer.append(c) + self.nameBuffer.append(Character(c)) switch processedNamedChars[self.nameBuffer] { case ("\0", _)?: break case let chars?: lastMatch = (self.nameBuffer.endIndex, chars) @@ -79,7 +79,7 @@ struct CharRefTokenizer { return .progress case .namedEnd(let endIndex, let replaceChars): // swift-format-ignore: NeverForceUnwrap - let lastChar = self.nameBuffer[..: ~Copyable { public var sink: Sink package var state: State - private var reconsumeChar: Optional + private var reconsumeChar: Optional private var tempBuffer: String private var currentComment: String private var currentTagName: String @@ -112,7 +129,7 @@ public struct Tokenizer: ~Copyable { self.charRefTokenizer = nil } - public mutating func tokenize(_ input: inout Deque) { + public mutating func tokenize(_ input: inout Deque) { loop: repeat { switch self.step(&input) { case .continue: break @@ -122,7 +139,7 @@ public struct Tokenizer: ~Copyable { } // swift-format-ignore - private mutating func step(_ input: inout Deque) -> ProcessResult { + private mutating func step(_ input: inout Deque) -> ProcessResult { if var charRefTokenizer { if let scalars = charRefTokenizer.tokenize(tokenizer: &self, input: &input) { self.processCharRef(scalars) @@ -135,41 +152,41 @@ public struct Tokenizer: ~Copyable { switch self.getChar(from: &input) { case "&": #goConsumeCharRef(inAttr: false) case "<": #go(to: .tagOpen) - case "\0": #go(error: .unexpectedNull, emit: .char("\0")) + case "\0": #go(error: .unexpectedNull, emit: "\0") case nil: #go(emit: .eof) - case let c?: #go(emit: .char(c)) + case let c?: #go(emit: c) } } while true case .rcdata: repeat { switch self.getChar(from: &input) { case "&": #goConsumeCharRef(inAttr: false) case "<": #go(to: .rcdataLessThanSign) - case "\0": #go(error: .unexpectedNull, emit: .char("\u{FFFD}")) + case "\0": #go(error: .unexpectedNull, emit: "\u{FFFD}") case nil: #go(emit: .eof) - case let c?: #go(emit: .char(c)) + case let c?: #go(emit: c) } } while true case .rawtext: repeat { switch self.getChar(from: &input) { case "<": #go(to: .rawtextLessThanSign) - case "\0": #go(error: .unexpectedNull, emit: .char("\u{FFFD}")) + case "\0": #go(error: .unexpectedNull, emit: "\u{FFFD}") case nil: #go(emit: .eof) - case let c?: #go(emit: .char(c)) + case let c?: #go(emit: c) } } while true case .scriptData: repeat { switch self.getChar(from: &input) { case "<": #go(to: .scriptDataLessThanSign) - case "\0": #go(error: .unexpectedNull, emit: .char("\u{FFFD}")) + case "\0": #go(error: .unexpectedNull, emit: "\u{FFFD}") case nil: #go(emit: .eof) - case let c?: #go(emit: .char(c)) + case let c?: #go(emit: c) } } while true case .plaintext: repeat { switch self.getChar(from: &input) { - case "\0": #go(error: .unexpectedNull, emit: .char("\u{FFFD}")) + case "\0": #go(error: .unexpectedNull, emit: "\u{FFFD}") case nil: #go(emit: .eof) - case let c?: #go(emit: .char(c)) + case let c?: #go(emit: c) } } while true case .tagOpen: repeat { @@ -177,11 +194,11 @@ public struct Tokenizer: ~Copyable { case "!": #go(to: .markupDeclarationOpen) case "/": #go(to: .endTagOpen) case "?": #go(error: .unexpectedQuestionMark, createComment: "?", to: .bogusComment) - case nil: #go(error: .eofBeforeTagName, emit: .char("<"), .eof) + case nil: #go(error: .eofBeforeTagName, emit: "<", .eof) case let c?: switch lowerASCIIOrNil(c) { case let cl?: #go(createStartTag: cl, to: .tagName) - case nil: #go(error: .invalidFirstChar, emit: .char("<"), reconsume: c, in: .data) + case nil: #go(error: .invalidFirstChar, emit: "<", reconsume: c, in: .data) } } } while true @@ -189,7 +206,7 @@ public struct Tokenizer: ~Copyable { switch self.getChar(from: &input) { case ">": #go(error: .missingEndTagName, to: .data) case "\0": #go(error: .invalidFirstChar, .unexpectedNull, createComment: "\u{FFFD}", to: .bogusComment) - case nil: #go(error: .eofBeforeTagName, emit: .char("<"), .char("/"), .eof) + case nil: #go(error: .eofBeforeTagName, emit: "<", "/", .eof) case let c?: switch lowerASCIIOrNil(c) { case let cl?: #go(createEndTag: cl, to: .tagName) @@ -210,8 +227,8 @@ public struct Tokenizer: ~Copyable { case .rcdataLessThanSign: repeat { switch self.getChar(from: &input) { case "/": #go(clearTemp: .rcdataEndTagOpen) - case nil: #go(emit: .char("<"), .eof) - case let c?: #go(emit: .char("<"), reconsume: c, in: .rcdata) + case nil: #go(emit: "<", .eof) + case let c?: #go(emit: "<", reconsume: c, in: .rcdata) } } while true case .rcdataEndTagOpen: repeat { @@ -219,9 +236,9 @@ public struct Tokenizer: ~Copyable { case let c?: switch lowerASCIIOrNil(c) { case let cl?: #go(createEndTag: cl, appendTemp: c, to: .rcdataEndTagName) - case nil: #go(emit: .char("<"), .char("/"), reconsume: c, in: .rcdata) + case nil: #go(emit: "<", "/", reconsume: c, in: .rcdata) } - case nil: #go(emit: .char("<"), .char("/"), .eof) + case nil: #go(emit: "<", "/", .eof) } } while true case .rcdataEndTagName: repeat { @@ -238,29 +255,29 @@ public struct Tokenizer: ~Copyable { case let c?: switch lowerASCIIOrNil(c) { case let cl?: #go(appendTagName: cl, appendTemp: c) - case nil: #go(emit: .char("<"), .char("/"), emitTempAndReconsume: c, in: .rcdata) + case nil: #go(emit: "<", "/", emitTempAndReconsume: c, in: .rcdata) } - case nil: #go(emit: .char("<"), .char("/"), emitTempAndEmit: .eof) + case nil: #go(emit: "<", "/", emitTempAndEmit: .eof) } } while true case .rawtextLessThanSign: repeat { switch self.getChar(from: &input) { case "/": #go(clearTemp: .rawtextEndTagOpen) - case "<": #go(emit: .char("<"), to: .rawtextLessThanSign) - case "\0": #go(error: .unexpectedNull, emit: .char("<"), .char("\u{FFFD}"), to: .rawtext) - case nil: #go(emit: .char("<"), .eof) - case let c?: #go(emit: .char("<"), .char(c), to: .rawtext) + case "<": #go(emit: "<", to: .rawtextLessThanSign) + case "\0": #go(error: .unexpectedNull, emit: "<", "\u{FFFD}", to: .rawtext) + case nil: #go(emit: "<", .eof) + case let c?: #go(emit: "<", c, to: .rawtext) } } while true case .rawtextEndTagOpen: repeat { switch self.getChar(from: &input) { - case "<": #go(emit: .char("<"), .char("/"), to: .rawtextLessThanSign) - case "\0": #go(error: .unexpectedNull, emit: .char("<"), .char("/"), .char("\u{FFFD}"), to: .rawtext) - case nil: #go(emit: .char("<"), .char("/"), .eof) + case "<": #go(emit: "<", "/", to: .rawtextLessThanSign) + case "\0": #go(error: .unexpectedNull, emit: "<", "/", "\u{FFFD}", to: .rawtext) + case nil: #go(emit: "<", "/", .eof) case let c?: switch lowerASCIIOrNil(c) { case let cl?: #go(createEndTag: cl, appendTemp: c, to: .rawtextEndTagName) - case nil: #go(emit: .char("<"), .char("/"), .char(c), to: .rawtext) + case nil: #go(emit: "<", "/", c, to: .rawtext) } } } while true @@ -278,30 +295,30 @@ public struct Tokenizer: ~Copyable { case let c?: switch lowerASCIIOrNil(c) { case let cl?: #go(appendTagName: cl, appendTemp: c) - case nil: #go(emit: .char("<"), .char("/"), emitTempAndReconsume: c, in: .rawtext) + case nil: #go(emit: "<", "/", emitTempAndReconsume: c, in: .rawtext) } - case nil: #go(emit: .char("<"), .char("/"), emitTempAndEmit: .eof) + case nil: #go(emit: "<", "/", emitTempAndEmit: .eof) } } while true case .scriptDataLessThanSign: repeat { switch self.getChar(from: &input) { case "/": #go(clearTemp: .scriptDataEndTagOpen) - case "!": #go(emit: .char("<"), .char("!"), to: .scriptDataEscapeStart) - case "<": #go(emit: .char("<"), to: .scriptDataLessThanSign) - case "\0": #go(error: .unexpectedNull, emit: .char("<"), .char("\u{FFFD}"), to: .scriptData) - case nil: #go(emit: .char("<"), .eof) - case let c?: #go(emit: .char("<"), .char(c), to: .scriptData) + case "!": #go(emit: "<", "!", to: .scriptDataEscapeStart) + case "<": #go(emit: "<", to: .scriptDataLessThanSign) + case "\0": #go(error: .unexpectedNull, emit: "<", "\u{FFFD}", to: .scriptData) + case nil: #go(emit: "<", .eof) + case let c?: #go(emit: "<", c, to: .scriptData) } } while true case .scriptDataEndTagOpen: repeat { switch self.getChar(from: &input) { - case "<": #go(emit: .char("<"), .char("/"), to: .scriptDataLessThanSign) - case "\0": #go(error: .unexpectedNull, emit: .char("<"), .char(","), .char("\u{FFFD}"), to: .scriptData) - case nil: #go(emit: .char("<"), .char("/"), .eof) + case "<": #go(emit: "<", "/", to: .scriptDataLessThanSign) + case "\0": #go(error: .unexpectedNull, emit: "<", ",", "\u{FFFD}", to: .scriptData) + case nil: #go(emit: "<", "/", .eof) case let c?: switch lowerASCIIOrNil(c) { case let cl?: #go(createEndTag: cl, appendTemp: c, to: .scriptDataEndTagName) - case nil: #go(emit: .char("<"), .char("/"), .char(c), to: .scriptData) + case nil: #go(emit: "<", "/", c, to: .scriptData) } } } while true @@ -319,81 +336,81 @@ public struct Tokenizer: ~Copyable { case let c?: switch lowerASCIIOrNil(c) { case let cl?: #go(appendTagName: cl, appendTemp: c) - case nil: #go(emit: .char("<"), .char("/"), emitTempAndReconsume: c, in: .scriptData) + case nil: #go(emit: "<", "/", emitTempAndReconsume: c, in: .scriptData) } - case nil: #go(emit: .char("<"), .char("/"), emitTempAndEmit: .eof) + case nil: #go(emit: "<", "/", emitTempAndEmit: .eof) } } while true case .scriptDataEscapeStart: repeat { switch self.getChar(from: &input) { - case "-": #go(emit: .char("-"), to: .scriptDataEscapeStartDash) + case "-": #go(emit: "-", to: .scriptDataEscapeStartDash) case "<": #go(to: .scriptDataLessThanSign) - case "\0": #go(error: .unexpectedNull, emit: .char("\u{FFFD}")) + case "\0": #go(error: .unexpectedNull, emit: "\u{FFFD}") case nil: #go(emit: .eof) - case let c?: #go(emit: .char(c)) + case let c?: #go(emit: c) } } while true case .scriptDataEscapeStartDash: repeat { switch self.getChar(from: &input) { - case "-": #go(emit: .char("-"), to: .scriptDataEscapedDashDash) + case "-": #go(emit: "-", to: .scriptDataEscapedDashDash) case "<": #go(to: .scriptDataLessThanSign) - case "\0": #go(error: .unexpectedNull, emit: .char("\u{FFFD}")) + case "\0": #go(error: .unexpectedNull, emit: "\u{FFFD}") case nil: #go(emit: .eof) - case let c?: #go(emit: .char(c)) + case let c?: #go(emit: c) } } while true case .scriptDataEscaped: repeat { switch self.getChar(from: &input) { - case "-": #go(emit: .char("-"), to: .scriptDataEscapedDash) + case "-": #go(emit: "-", to: .scriptDataEscapedDash) case "<": #go(to: .scriptDataEscapedLessThanSign) - case "\0": #go(error: .unexpectedNull, emit: .char("\u{FFFD}")) + case "\0": #go(error: .unexpectedNull, emit: "\u{FFFD}") case nil: #go(error: .eofInScriptComment, emit: .eof) - case let c?: #go(emit: .char(c)) + case let c?: #go(emit: c) } } while true case .scriptDataEscapedDash: repeat { switch self.getChar(from: &input) { - case "-": #go(emit: .char("-"), to: .scriptDataEscapedDashDash) + case "-": #go(emit: "-", to: .scriptDataEscapedDashDash) case "<": #go(to: .scriptDataEscapedLessThanSign) - case "\0": #go(error: .unexpectedNull, emit: .char("\u{FFFD}"), to: .scriptDataEscaped) + case "\0": #go(error: .unexpectedNull, emit: "\u{FFFD}", to: .scriptDataEscaped) case nil: #go(error: .eofInScriptComment, emit: .eof) - case let c?: #go(emit: .char(c), to: .scriptDataEscaped) + case let c?: #go(emit: c, to: .scriptDataEscaped) } } while true case .scriptDataEscapedDashDash: repeat { switch self.getChar(from: &input) { - case "-": #go(emit: .char("-")) + case "-": #go(emit: "-") case "<": #go(to: .scriptDataEscapedLessThanSign) - case ">": #go(emit: .char(">"), to: .scriptData) - case "\0": #go(error: .unexpectedNull, emit: .char("\u{FFFD}"), to: .scriptDataEscaped) + case ">": #go(emit: ">", to: .scriptData) + case "\0": #go(error: .unexpectedNull, emit: "\u{FFFD}", to: .scriptDataEscaped) case nil: #go(error: .eofInScriptComment, emit: .eof) - case let c?: #go(emit: .char(c), to: .scriptDataEscaped) + case let c?: #go(emit: c, to: .scriptDataEscaped) } } while true case .scriptDataEscapedLessThanSign: repeat { switch self.getChar(from: &input) { case "/": #go(clearTemp: .scriptDataEscapedEndTagOpen) - case "-": #go(emit: .char("<"), .char("-"), to: .scriptDataEscapedDash) - case "<": #go(emit: .char("<"), to: .scriptDataEscapedLessThanSign) - case "\0": #go(error: .unexpectedNull, emit: .char("<"), .char("\u{FFFD}"), to: .scriptDataEscaped) - case nil: #go(error: .eofInScriptComment, emit: .char("<"), .eof) + case "-": #go(emit: "<", "-", to: .scriptDataEscapedDash) + case "<": #go(emit: "<", to: .scriptDataEscapedLessThanSign) + case "\0": #go(error: .unexpectedNull, emit: "<", "\u{FFFD}", to: .scriptDataEscaped) + case nil: #go(error: .eofInScriptComment, emit: "<", .eof) case let c?: switch lowerASCIIOrNil(c) { - case let cl?: #go(createTemp: cl, emit: .char("<"), .char(c), to: .scriptDataDoubleEscapeStart) - case nil: #go(emit: .char("<"), .char(c), to: .scriptDataEscaped) + case let cl?: #go(createTemp: cl, emit: "<", c, to: .scriptDataDoubleEscapeStart) + case nil: #go(emit: "<", c, to: .scriptDataEscaped) } } } while true case .scriptDataEscapedEndTagOpen: repeat { switch self.getChar(from: &input) { - case "-": #go(emit: .char("<"), .char("/"), .char("-"), to: .scriptDataEscapedDash) - case "<": #go(emit: .char("<"), .char("/"), to: .scriptDataEscapedLessThanSign) - case "\0": #go(error: .unexpectedNull, emit: .char("<"), .char("/"), .char("\u{FFFD}"), to: .scriptDataEscaped) - case nil: #go(error: .eofInScriptComment, emit: .char("<"), .char("/"), .eof) + case "-": #go(emit: "<", "/", "-", to: .scriptDataEscapedDash) + case "<": #go(emit: "<", "/", to: .scriptDataEscapedLessThanSign) + case "\0": #go(error: .unexpectedNull, emit: "<", "/", "\u{FFFD}", to: .scriptDataEscaped) + case nil: #go(error: .eofInScriptComment, emit: "<", "/", .eof) case let c?: switch lowerASCIIOrNil(c) { case let cl?: #go(createEndTag: cl, appendTemp: c, to: .scriptDataEscapedEndTagName) - case nil: #go(emit: .char("<"), .char("/"), .char(c), to: .scriptDataEscaped) + case nil: #go(emit: "<", "/", c, to: .scriptDataEscaped) } } } while true @@ -411,9 +428,9 @@ public struct Tokenizer: ~Copyable { case let c?: switch lowerASCIIOrNil(c) { case let cl?: #go(appendTagName: cl, appendTemp: c) - case nil: #go(emit: .char("<"), .char("/"), emitTempAndReconsume: c, in: .scriptDataEscaped) + case nil: #go(emit: "<", "/", emitTempAndReconsume: c, in: .scriptDataEscaped) } - case nil: #go(emit: .char("<"), .char("/"), emitTempAndEmit: .eof) + case nil: #go(emit: "<", "/", emitTempAndEmit: .eof) } } while true case .scriptDataDoubleEscapeStart: repeat { @@ -421,56 +438,56 @@ public struct Tokenizer: ~Copyable { switch c { case "\t", "\n", "\u{0C}", " ", "/", ">": if self.tempBuffer == "script" { - #go(emit: .char(c), to: .scriptDataDoubleEscaped) + #go(emit: c, to: .scriptDataDoubleEscaped) } else { - #go(emit: .char(c), to: .scriptDataEscaped) + #go(emit: c, to: .scriptDataEscaped) } - case "-": #go(emit: .char("-"), to: .scriptDataEscapedDash) + case "-": #go(emit: "-", to: .scriptDataEscapedDash) case "<": #go(to: .scriptDataEscapedLessThanSign) - case "\0": #go(error: .unexpectedNull, emit: .char("\u{FFFD}"), to: .scriptDataEscaped) + case "\0": #go(error: .unexpectedNull, emit: "\u{FFFD}", to: .scriptDataEscaped) case let c: switch lowerASCIIOrNil(c) { - case let cl?: #go(appendTemp: cl, emit: .char(c)) - case nil: #go(emit: .char(c), to: .scriptDataEscaped) + case let cl?: #go(appendTemp: cl, emit: c) + case nil: #go(emit: c, to: .scriptDataEscaped) } } } while true case .scriptDataDoubleEscaped: repeat { switch self.getChar(from: &input) { - case "-": #go(emit: .char("-"), to: .scriptDataDoubleEscapedDash) - case "<": #go(emit: .char("<"), to: .scriptDataDoubleEscapedLessThanSign) - case "\0": #go(error: .unexpectedNull, emit: .char("\u{FFFD}")) + case "-": #go(emit: "-", to: .scriptDataDoubleEscapedDash) + case "<": #go(emit: "<", to: .scriptDataDoubleEscapedLessThanSign) + case "\0": #go(error: .unexpectedNull, emit: "\u{FFFD}") case nil: #go(error: .eofInScriptComment, emit: .eof) - case let c?: #go(emit: .char(c)) + case let c?: #go(emit: c) } } while true case .scriptDataDoubleEscapedDash: repeat { switch self.getChar(from: &input) { - case "-": #go(emit: .char("-"), to: .scriptDataDoubleEscapedDashDash) - case "<": #go(emit: .char("<"), to: .scriptDataDoubleEscapedLessThanSign) - case "\0": #go(error: .unexpectedNull, emit: .char("\u{FFFD}"), to: .scriptDataDoubleEscaped) + case "-": #go(emit: "-", to: .scriptDataDoubleEscapedDashDash) + case "<": #go(emit: "<", to: .scriptDataDoubleEscapedLessThanSign) + case "\0": #go(error: .unexpectedNull, emit: "\u{FFFD}", to: .scriptDataDoubleEscaped) case nil: #go(error: .eofInScriptComment, emit: .eof) - case let c?: #go(emit: .char(c), to: .scriptDataDoubleEscaped) + case let c?: #go(emit: c, to: .scriptDataDoubleEscaped) } } while true case .scriptDataDoubleEscapedDashDash: repeat { switch self.getChar(from: &input) { - case "-": #go(emit: .char("-")) - case "<": #go(emit: .char("<"), to: .scriptDataDoubleEscapedLessThanSign) - case ">": #go(emit: .char(">"), to: .scriptData) - case "\0": #go(error: .unexpectedNull, emit: .char("\u{FFFD}"), to: .scriptDataDoubleEscaped) + case "-": #go(emit: "-") + case "<": #go(emit: "<", to: .scriptDataDoubleEscapedLessThanSign) + case ">": #go(emit: ">", to: .scriptData) + case "\0": #go(error: .unexpectedNull, emit: "\u{FFFD}", to: .scriptDataDoubleEscaped) case nil: #go(error: .eofInScriptComment, emit: .eof) - case let c?: #go(emit: .char(c), to: .scriptDataDoubleEscaped) + case let c?: #go(emit: c, to: .scriptDataDoubleEscaped) } } while true case .scriptDataDoubleEscapedLessThanSign: repeat { switch self.getChar(from: &input) { - case "/": #go(emit: .char("/"), clearTemp: .scriptDataDoubleEscapeEnd) - case "-": #go(emit: .char("-"), to: .scriptDataDoubleEscapedDash) - case "<": #go(emit: .char("<"), to: .scriptDataDoubleEscapedLessThanSign) - case "\0": #go(error: .unexpectedNull, emit: .char("\u{FFFD}"), to: .scriptDataDoubleEscaped) + case "/": #go(emit: "/", clearTemp: .scriptDataDoubleEscapeEnd) + case "-": #go(emit: "-", to: .scriptDataDoubleEscapedDash) + case "<": #go(emit: "<", to: .scriptDataDoubleEscapedLessThanSign) + case "\0": #go(error: .unexpectedNull, emit: "\u{FFFD}", to: .scriptDataDoubleEscaped) case nil: #go(error: .eofInScriptComment, emit: .eof) - case let c?: #go(emit: .char(c), to: .scriptDataDoubleEscaped) + case let c?: #go(emit: c, to: .scriptDataDoubleEscaped) } } while true case .scriptDataDoubleEscapeEnd: repeat { @@ -478,17 +495,17 @@ public struct Tokenizer: ~Copyable { switch c { case "\t", "\n", "\u{0C}", " ", "/", ">": if self.tempBuffer == "script" { - #go(emit: .char(c), to: .scriptDataEscaped) + #go(emit: c, to: .scriptDataEscaped) } else { - #go(emit: .char(c), to: .scriptDataDoubleEscaped) + #go(emit: c, to: .scriptDataDoubleEscaped) } - case "-": #go(emit: .char("-"), to: .scriptDataDoubleEscapedDash) - case "<": #go(emit: .char("<"), to: .scriptDataDoubleEscapedLessThanSign) - case "\0": #go(error: .unexpectedNull, emit: .char("\u{FFFD}"), to: .scriptDataDoubleEscaped) + case "-": #go(emit: "-", to: .scriptDataDoubleEscapedDash) + case "<": #go(emit: "<", to: .scriptDataDoubleEscapedLessThanSign) + case "\0": #go(error: .unexpectedNull, emit: "\u{FFFD}", to: .scriptDataDoubleEscaped) case let c: switch lowerASCIIOrNil(c) { - case let cl?: #go(appendTemp: cl, emit: .char(c)) - case nil: #go(emit: .char(c), to: .scriptDataDoubleEscaped) + case let cl?: #go(appendTemp: cl, emit: c) + case nil: #go(emit: c, to: .scriptDataDoubleEscaped) } } } while true @@ -892,22 +909,22 @@ public struct Tokenizer: ~Copyable { switch self.getChar(from: &input) { case "]": #go(to: .cdataSectionBracket) case nil: #go(error: .eofInCDATA, emit: .eof) - case let c?: #go(emit: .char(c)) + case let c?: #go(emit: c) } } while true case .cdataSectionBracket: repeat { switch self.getChar(from: &input) { case "]": #go(to: .cdataSectionEnd) - case nil: #go(error: .eofInCDATA, emit: .char("]"), .eof) - case let c?: #go(emit: .char("]"), .char(c), to: .cdataSection) + case nil: #go(error: .eofInCDATA, emit: "]", .eof) + case let c?: #go(emit: "]", c, to: .cdataSection) } } while true case .cdataSectionEnd: repeat { switch self.getChar(from: &input) { - case "]": #go(emit: .char("]")) + case "]": #go(emit: "]") case ">": #go(to: .data) - case nil: #go(error: .eofInCDATA, emit: .char("]"), .eof) - case let c?: #go(emit: .char("]"), .char(c), to: .cdataSection) + case nil: #go(error: .eofInCDATA, emit: "]", .eof) + case let c?: #go(emit: "]", c, to: .cdataSection) } } while true } @@ -916,28 +933,33 @@ public struct Tokenizer: ~Copyable { @inline(__always) mutating func processCharRef(_ scalars: consuming [Unicode.Scalar]) { switch self.state { - case .data, .rcdata: for scalar in scalars { #go(emit: .char(Character(scalar))) } + case .data, .rcdata: for scalar in scalars { #go(emit: scalar) } case .attributeValueDoubleQuoted, .attributeValueSingleQuoted, .attributeValueUnquoted: - for scalar in scalars { #go(appendAttrValue: Character(scalar)) } + for scalar in scalars { #go(appendAttrValue: scalar) } case _: preconditionFailure("unreachable") } } @inline(__always) - mutating func processCharRef(_ c: consuming Character) { + mutating func processCharRef(_ c: consuming Unicode.Scalar) { switch self.state { - case .data, .rcdata: #go(emit: .char(c)) + case .data, .rcdata: #go(emit: c) case .attributeValueDoubleQuoted, .attributeValueSingleQuoted, .attributeValueUnquoted: #go(appendAttrValue: c) case _: preconditionFailure("unreachable") } } @inline(__always) - private mutating func getChar(from input: inout Deque) -> Character? { + private mutating func getChar(from input: inout Deque) -> Unicode.Scalar? { guard let reconsumeChar else { guard let c = input.popFirst() else { return nil } - guard c != "\r\n", c != "\r" else { return "\n" } - switch c.firstScalar.value { + guard c != "\r" else { + if self.peek(input) == "\n" { + self.discardChar(&input) + } + return "\n" + } + switch c.value { // Swift's String cannot have surrogates // case 0xD800...0xDBFF, 0xDC00...0xDFFF: // self.emitError(.surrogateInInput) @@ -958,12 +980,12 @@ public struct Tokenizer: ~Copyable { } @inline(__always) - func peek(_ input: borrowing Deque) -> Character? { + func peek(_ input: borrowing Deque) -> Unicode.Scalar? { self.reconsumeChar ?? input.first } @inline(__always) - mutating func discardChar(_ input: inout Deque) { + mutating func discardChar(_ input: inout Deque) { switch self.reconsumeChar { case .some: self.reconsumeChar = nil case .none: input.removeFirst() @@ -972,12 +994,12 @@ public struct Tokenizer: ~Copyable { @inline(__always) private mutating func startsExact( - _ input: inout Deque, + _ input: inout Deque, with pattern: consuming some StringProtocol ) -> Bool? { var iter = input.makeIterator() let count = pattern.count - for pc in pattern { + for pc in pattern.unicodeScalars { guard let c = iter.next() else { return nil } guard consume c == consume pc else { return false } } @@ -987,14 +1009,14 @@ public struct Tokenizer: ~Copyable { @inline(__always) private mutating func starts( - _ input: inout Deque, + _ input: inout Deque, with pattern: consuming some StringProtocol ) -> Bool? { var iter = input.makeIterator() let count = pattern.count - for pc in pattern { + for pc in pattern.unicodeScalars { guard let c = iter.next() else { return nil } - guard c.lowercased() == pc.lowercased() else { return false } + guard lowerASCII(consume c) == lowerASCII(consume pc) else { return false } } input.removeFirst(count) return true @@ -1006,17 +1028,16 @@ public struct Tokenizer: ~Copyable { } @inline(__always) - private mutating func go(reconsume c: consuming Character, in state: consuming State) { + private mutating func go(reconsume c: consuming Unicode.Scalar, in state: consuming State) { self.reconsumeChar = c self.state = state } @inline(__always) - private mutating func emit(_ c: consuming Character) { + private mutating func emit(_ c: consuming Unicode.Scalar) { self.sink.process(.char(c)) } - @_disfavoredOverload @inline(__always) private mutating func emit(_ token: consuming Token) { self.sink.process(token) @@ -1033,29 +1054,40 @@ public struct Tokenizer: ~Copyable { } @inline(__always) - private mutating func createTempBuffer(with c: consuming Character) { + private mutating func createTempBuffer(with c: consuming Unicode.Scalar) { self.tempBuffer = String(c) } + @inline(__always) + private mutating func appendTempBuffer(_ c: consuming Unicode.Scalar) { + self.tempBuffer.append(Character(c)) + } + @inline(__always) private mutating func emitTempBuffer() { - for c in self.tempBuffer { + for c in self.tempBuffer.unicodeScalars { self.sink.process(.char(c)) } self.tempBuffer.removeAll() } + @_disfavoredOverload @inline(__always) - private mutating func createComment(with c: consuming Character) { + private mutating func createComment(with c: consuming Unicode.Scalar) { self.currentComment = String(c) } - @_disfavoredOverload @inline(__always) private mutating func createComment(with s: consuming String) { self.currentComment = s } + @_disfavoredOverload + @inline(__always) + private mutating func appendComment(_ c: consuming Unicode.Scalar) { + self.currentComment.append(Character(c)) + } + @inline(__always) private mutating func appendComment(_ c: consuming Character) { self.currentComment.append(c) @@ -1073,25 +1105,65 @@ public struct Tokenizer: ~Copyable { } @inline(__always) - private mutating func createStartTag(with c: consuming Character) { + private mutating func createStartTag(with c: consuming Unicode.Scalar) { self.currentTagName = String(c) self.currentTagKind = .start self.currentAttrs.removeAll() } @inline(__always) - private mutating func createEndTag(with c: consuming Character) { + private mutating func createEndTag(with c: consuming Unicode.Scalar) { self.currentTagName = String(c) self.currentTagKind = .end self.currentAttrs.removeAll() } + @_disfavoredOverload + @inline(__always) + private mutating func appendTagName(_ c: consuming Unicode.Scalar) { + self.currentTagName.append(Character(c)) + } + @inline(__always) - private mutating func createAttr(with c: consuming Character) { + private mutating func appendTagName(_ c: consuming Character) { + self.currentTagName.append(c) + } + + @_disfavoredOverload + @inline(__always) + private mutating func createAttr(with c: consuming Unicode.Scalar) { self.pushAttr() self.currentAttrName = String(c) } + @inline(__always) + private mutating func createAttr(with s: consuming String) { + self.pushAttr() + self.currentAttrName = s + } + + @_disfavoredOverload + @inline(__always) + private mutating func appendAttrName(_ c: consuming Unicode.Scalar) { + self.currentAttrName.append(Character(c)) + } + + @inline(__always) + private mutating func appendAttrName(_ c: consuming Character) { + self.currentAttrName.append(c) + } + + @_disfavoredOverload + @inline(__always) + private mutating func appendAttrValue(_ c: consuming Unicode.Scalar) { + self.currentAttrValue.append(Character(c)) + } + + @inline(__always) + private mutating func appendAttrValue(_ c: consuming Character) { + self.currentAttrValue.append(c) + } + @inline(__always) private mutating func pushAttr() { guard !self.currentAttrName.isEmpty else { return } @@ -1127,11 +1199,26 @@ public struct Tokenizer: ~Copyable { self.currentDOCTYPE = .init() } + @_disfavoredOverload @inline(__always) - private mutating func createDOCTYPE(with c: consuming Character) { + private mutating func createDOCTYPE(with c: consuming Unicode.Scalar) { self.currentDOCTYPE = .init(name: String(c)) } + @inline(__always) + private mutating func createDOCTYPE(with s: consuming String) { + self.currentDOCTYPE = .init(name: s) + } + + @_disfavoredOverload + @inline(__always) + private mutating func appendDOCTYPEName(_ c: consuming Unicode.Scalar) { + switch self.currentDOCTYPE.name { + case .some: self.currentDOCTYPE.name?.append(Character(c)) + case .none: self.currentDOCTYPE.name = String(c) + } + } + @inline(__always) private mutating func appendDOCTYPEName(_ c: consuming Character) { switch self.currentDOCTYPE.name { @@ -1140,6 +1227,15 @@ public struct Tokenizer: ~Copyable { } } + @_disfavoredOverload + @inline(__always) + private mutating func appendPublicID(_ c: consuming Unicode.Scalar) { + switch self.currentDOCTYPE.publicID { + case .some: self.currentDOCTYPE.publicID?.append(Character(c)) + case .none: self.currentDOCTYPE.publicID = String(c) + } + } + @inline(__always) private mutating func appendPublicID(_ c: consuming Character) { switch self.currentDOCTYPE.publicID { @@ -1153,6 +1249,15 @@ public struct Tokenizer: ~Copyable { self.currentDOCTYPE.publicID = "" } + @_disfavoredOverload + @inline(__always) + private mutating func appendSystemID(_ c: consuming Unicode.Scalar) { + switch self.currentDOCTYPE.systemID { + case .some: self.currentDOCTYPE.systemID?.append(Character(c)) + case .none: self.currentDOCTYPE.systemID = String(c) + } + } + @inline(__always) private mutating func appendSystemID(_ c: consuming Character) { switch self.currentDOCTYPE.systemID { diff --git a/Sources/Tokenizer/Utils.swift b/Sources/Tokenizer/Utils.swift index ad80860..1cbbed8 100644 --- a/Sources/Tokenizer/Utils.swift +++ b/Sources/Tokenizer/Utils.swift @@ -1,20 +1,18 @@ @inline(__always) @inlinable -func lowerASCIIOrNil(_ c: consuming Character) -> Character? { - let firstScalar = c.firstScalar - return switch firstScalar { - case "A"..."Z": .init(.init(UInt8(firstScalar.value) &+ 0x20)) - case "a"..."z": c +func lowerASCIIOrNil(_ c: consuming Unicode.Scalar) -> Unicode.Scalar? { + switch c { + case let c where "A"..."Z" ~= c: .init(.init(UInt8(c.value) &+ 0x20)) + case let c where "a"..."z" ~= c: c case _: nil } } @inline(__always) @inlinable -func lowerASCII(_ c: consuming Character) -> Character { - let firstScalar = c.firstScalar - return switch firstScalar { - case "A"..."Z": .init(.init(UInt8(firstScalar.value) &+ 0x20)) - case _: c +func lowerASCII(_ c: consuming Unicode.Scalar) -> Unicode.Scalar { + switch c { + case let c where "A"..."Z" ~= c: .init(.init(UInt8(c.value) &+ 0x20)) + case let c: c } } diff --git a/Sources/TokenizerMacros/Macros.swift b/Sources/TokenizerMacros/Macros.swift index 554b8ea..7afb46b 100644 --- a/Sources/TokenizerMacros/Macros.swift +++ b/Sources/TokenizerMacros/Macros.swift @@ -69,16 +69,16 @@ extension GoMacro: CodeItemMacro { items += ["self.createEndTag(with: \(arg.expression))"] argList = .init(argList.dropFirst()) case "appendTagName": - items += ["self.currentTagName.append(\(arg.expression))"] + items += ["self.appendTagName(\(arg.expression))"] argList = .init(argList.dropFirst()) case "createAttr": items += ["self.createAttr(with: \(arg.expression))"] argList = .init(argList.dropFirst()) case "appendAttrName": - items += ["self.currentAttrName.append(\(arg.expression))"] + items += ["self.appendAttrName(\(arg.expression))"] argList = .init(argList.dropFirst()) case "appendAttrValue": - items += ["self.currentAttrValue.append(\(arg.expression))"] + items += ["self.appendAttrValue(\(arg.expression))"] argList = .init(argList.dropFirst()) case "emitTag": precondition(argList.count == 1) @@ -128,7 +128,7 @@ extension GoMacro: CodeItemMacro { items += ["self.createTempBuffer(with: \(arg.expression))"] argList = .init(argList.dropFirst()) case "appendTemp": - items += ["self.tempBuffer.append(\(arg.expression))"] + items += ["self.appendTempBuffer(\(arg.expression))"] argList = .init(argList.dropFirst()) case "clearTemp": items += ["self.tempBuffer.removeAll()", "self.go(to: \(arg.expression))", "return .continue"] diff --git a/Tests/TokenizerTests/BasicHTMLTests.swift b/Tests/TokenizerTests/BasicHTMLTests.swift index eae9bed..0bcfeae 100644 --- a/Tests/TokenizerTests/BasicHTMLTests.swift +++ b/Tests/TokenizerTests/BasicHTMLTests.swift @@ -27,7 +27,7 @@ extension TestSink: TokenSink { """# var tokenizer = Tokenizer(sink: TestSink()) - var input = Deque(html) + var input = Deque(html.unicodeScalars) tokenizer.tokenize(&input) let tokens: [Token] = [ diff --git a/Tests/TokenizerTests/HTML5LibTests.swift b/Tests/TokenizerTests/HTML5LibTests.swift index ed70899..bdc7249 100644 --- a/Tests/TokenizerTests/HTML5LibTests.swift +++ b/Tests/TokenizerTests/HTML5LibTests.swift @@ -3,30 +3,16 @@ private import Foundation import Testing private import Tokenizer -private struct TestSink { +private struct TestSink: ~Copyable { var tokens = [Token]() var errors = [ParseError]() - var pendingChars = "" - - consuming func finalize() -> ([Token], [ParseError]) { - self.processChars() - return (self.tokens, self.errors) - } - - private mutating func processChars() { - self.tokens.append(contentsOf: self.pendingChars.map(Token.char)) - self.pendingChars.removeAll() - } } extension TestSink: TokenSink { mutating func process(_ token: consuming Token) { switch token { case .error(let error): self.errors.append(error) - case .char(let c): self.pendingChars.append(c) - case let token: - self.processChars() - self.tokens.append(token) + case let token: self.tokens.append(token) } } } @@ -49,13 +35,12 @@ private let testCases = try! [ .flatMap { try parseTestCases(from: Data(contentsOf: $0)) } @Test("html5lib-tests", arguments: testCases) -func html5libTests(_ testCase: TestCase) throws { +func html5libTests(_ testCase: TestCase) { var tokenizer = Tokenizer(sink: TestSink()) tokenizer.state = testCase.initialState - var input = Deque(testCase.input) + var input = Deque(testCase.input.unicodeScalars) tokenizer.tokenize(&input) - let (tokens, errors) = tokenizer.sink.finalize() - #expect(tokens == testCase.tokens) - #expect(errors.count == testCase.errors.count) // TODO: Make it stricter + #expect(tokenizer.sink.tokens == testCase.tokens) + #expect(tokenizer.sink.errors.count == testCase.errors.count) // TODO: Make it stricter } diff --git a/Tests/TokenizerTests/HTML5LibTestsParser.swift b/Tests/TokenizerTests/HTML5LibTestsParser.swift index 73316ce..ca77cb7 100644 --- a/Tests/TokenizerTests/HTML5LibTestsParser.swift +++ b/Tests/TokenizerTests/HTML5LibTestsParser.swift @@ -84,7 +84,7 @@ struct ExpectedToken { return [.comment(data)] case .str("Character"): guard case .str(let data) = fields[1] else { throw TestParseError.invalidTokenFormat(fields) } - return data.map(Token.char) + return data.unicodeScalars.map(Token.char) case let type: throw TestParseError.invalidTokenType(type) } }