Skip to content

Commit

Permalink
perf!: improve performance of reading input
Browse files Browse the repository at this point in the history
  • Loading branch information
kkebo committed May 4, 2024
1 parent 0c882eb commit fcc370b
Show file tree
Hide file tree
Showing 10 changed files with 264 additions and 160 deletions.
5 changes: 2 additions & 3 deletions Benchmarks/Benchmarks/MyBenchmark/MyBenchmark.swift
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
private import Benchmark
private import DequeModule
private import Foundation
private import Tokenizer

Expand All @@ -14,11 +13,11 @@ extension TestSink: TokenSink {
private func runBench(_ name: String, configuration conf: Benchmark.Configuration) {
// swift-format-ignore: NeverUseForceTry, NeverForceUnwrap
let html = try! String(contentsOf: Bundle.module.url(forResource: name, withExtension: "html")!).unicodeScalars
let input = Deque(consume html)
let input = ArraySlice(consume html)
Benchmark(name, configuration: conf) { benchmark in
for _ in benchmark.scaledIterations {
var tokenizer = Tokenizer(sink: TestSink())
var input = input
var input = BufferQueue(input)
tokenizer.tokenize(&input)
}
}
Expand Down
44 changes: 44 additions & 0 deletions Sources/Tokenizer/BufferQueue.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
public import DequeModule

/// A FIFO queue of input buffers that the tokenizer consumes scalar by
/// scalar. Keeping whole `ArraySlice`s (instead of one flat deque of
/// scalars) lets `pop(except:)` hand back contiguous runs cheaply.
///
/// Invariant: no stored buffer is ever left empty — `prepend` rejects
/// empty slices, and every consuming method drops the head buffer as soon
/// as it is drained. (The initial buffer may be empty; it is drained away
/// on the first pop.)
public struct BufferQueue: ~Copyable, Sendable {
    @usableFromInline
    var buffers: Deque<ArraySlice<Unicode.Scalar>>

    /// Creates a queue holding a single input buffer.
    @inlinable
    public init(_ buf: ArraySlice<Unicode.Scalar>) {
        self.buffers = [buf]
    }

    /// Pushes `buf` back onto the front of the queue (e.g. to un-consume
    /// characters). Empty slices are ignored to preserve the invariant.
    mutating func prepend(_ buf: ArraySlice<Unicode.Scalar>) {
        if buf.isEmpty { return }
        self.buffers.prepend(buf)
    }

    /// Returns the next scalar without consuming it, or `nil` when the
    /// queue is exhausted.
    func peek() -> Unicode.Scalar? {
        guard let head = self.buffers.first else { return nil }
        return head.first
    }

    /// Consumes and returns the next scalar, or `nil` when exhausted.
    mutating func popFirst() -> Unicode.Scalar? {
        if self.buffers.isEmpty { return nil }
        let scalar = self.buffers[0].popFirst()
        if self.buffers[0].isEmpty {
            self.buffers.removeFirst()
        }
        return scalar
    }

    /// Consumes input from the head buffer.
    ///
    /// Returns `.others(run)` for the maximal leading run of scalars that
    /// are not in `s` (scalars with `value >= 64` can never be in the set
    /// and always join the run). If the very first scalar is a set member,
    /// consumes just that one and returns `.known(scalar)`. Returns `nil`
    /// when the queue is exhausted.
    mutating func pop(except s: consuming SmallCharSet) -> PopResult? {
        if self.buffers.isEmpty { return nil }
        let run = self.buffers[0].prefix { $0.value >= 64 || !s.contains($0) }
        self.buffers[0].removeFirst(run.count)
        let result: PopResult?
        if run.isEmpty {
            result = self.buffers[0].popFirst().map(PopResult.known)
        } else {
            result = .others(run)
        }
        if self.buffers[0].isEmpty {
            self.buffers.removeFirst()
        }
        return result
    }

    /// Discards the next scalar, if any.
    mutating func removeFirst() {
        if self.buffers.isEmpty { return }
        self.buffers[0].removeFirst()
        if self.buffers[0].isEmpty {
            self.buffers.removeFirst()
        }
    }
}
22 changes: 10 additions & 12 deletions Sources/Tokenizer/CharRefTokenizer.swift
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import DequeModule
private import HTMLEntities

private enum CharRefState {
Expand All @@ -15,7 +14,7 @@ private enum CharRefState {
}

private enum CharRefProcessResult: ~Copyable {
case done([Unicode.Scalar])
case done(ArraySlice<Unicode.Scalar>)
case doneNone
case progress
}
Expand All @@ -32,7 +31,7 @@ struct CharRefTokenizer {
self.isInAttr = isInAttr
}

mutating func tokenize(tokenizer: inout Tokenizer<some ~Copyable & TokenSink>, input: inout Deque<Unicode.Scalar>) -> [Unicode.Scalar]? {
mutating func tokenize(tokenizer: inout Tokenizer<some ~Copyable & TokenSink>, input: inout BufferQueue) -> ArraySlice<Unicode.Scalar>? {
repeat {
switch self.step(tokenizer: &tokenizer, input: &input) {
case .done(let scalars): return scalars
Expand All @@ -42,7 +41,7 @@ struct CharRefTokenizer {
} while true
}

private mutating func step(tokenizer: inout Tokenizer<some ~Copyable & TokenSink>, input: inout Deque<Unicode.Scalar>) -> CharRefProcessResult {
private mutating func step(tokenizer: inout Tokenizer<some ~Copyable & TokenSink>, input: inout BufferQueue) -> CharRefProcessResult {
switch self.state {
case .initial:
switch tokenizer.peek(input) {
Expand All @@ -58,7 +57,7 @@ struct CharRefTokenizer {
case .named:
guard let c = tokenizer.peek(input) else {
guard let (endIndex, chars) = lastMatch else {
input.prepend(contentsOf: self.nameBuffer.unicodeScalars)
input.prepend(ArraySlice(self.nameBuffer.unicodeScalars))
return .doneNone
}
self.state = .namedEnd(endIndex: endIndex, replaceChars: chars)
Expand Down Expand Up @@ -89,18 +88,18 @@ struct CharRefTokenizer {
switch (isInAttr, lastChar, nextChar) {
case (_, ";", _): break
case (true, _, "="?), (true, _, ("0"..."9")?), (true, _, ("A"..."Z")?), (true, _, ("a"..."z")?):
input.prepend(contentsOf: self.nameBuffer.unicodeScalars)
input.prepend(ArraySlice(self.nameBuffer.unicodeScalars))
return .doneNone
case _: tokenizer.emitError(.missingSemicolon)
}
input.prepend(contentsOf: self.nameBuffer[endIndex...].unicodeScalars)
input.prepend(ArraySlice(self.nameBuffer[endIndex...].unicodeScalars))
return switch replaceChars {
case (let c1, "\0"): .done([c1])
case (let c1, let c2): .done([c1, c2])
}
case .ambiguousAmpersand:
guard let c = tokenizer.peek(input) else {
input.prepend(contentsOf: self.nameBuffer.unicodeScalars)
input.prepend(ArraySlice(self.nameBuffer.unicodeScalars))
return .doneNone
}
switch c {
Expand All @@ -111,7 +110,7 @@ struct CharRefTokenizer {
case ";": tokenizer.emitError(.unknownNamedCharRef)
case _: break
}
input.prepend(contentsOf: self.nameBuffer.unicodeScalars)
input.prepend(ArraySlice(self.nameBuffer.unicodeScalars))
return .doneNone
case .numeric:
switch tokenizer.peek(input) {
Expand All @@ -134,8 +133,7 @@ struct CharRefTokenizer {
return .progress
case _:
tokenizer.emitError(.absenceDigits)
input.prepend(uppercase ? "X" : "x")
input.prepend("#")
input.prepend(ArraySlice(uppercase ? "#X".unicodeScalars : "#x".unicodeScalars))
return .doneNone
}
case .decimalStart:
Expand All @@ -145,7 +143,7 @@ struct CharRefTokenizer {
return .progress
case _:
tokenizer.emitError(.absenceDigits)
input.prepend("#")
input.prepend(ArraySlice("#".unicodeScalars))
return .doneNone
}
case .hexadecimal:
Expand Down
4 changes: 4 additions & 0 deletions Sources/Tokenizer/PopResult.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
/// The result of `BufferQueue.pop(except:)`.
enum PopResult: ~Copyable {
    /// A single scalar that belongs to the given `SmallCharSet`
    /// (so its value is < 64).
    case known(Unicode.Scalar)
    /// A maximal leading run of scalars, none of which belong to the set
    /// (scalars with value >= 64 always fall into this case).
    case others(ArraySlice<Unicode.Scalar>)
}
14 changes: 14 additions & 0 deletions Sources/Tokenizer/SmallCharSet.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
/// A bit-set over the first 64 Unicode scalar values, used for fast
/// "is this one of the tokenizer's special characters" membership tests.
struct SmallCharSet {
    /// Bit `i` is set iff the scalar with value `i` is in the set.
    var bits: UInt64

    /// Returns whether `c` is in the set.
    ///
    /// Scalars with `value >= 64` always report `false`: Swift's
    /// non-masking shift (`<<`) yields 0 on overshift, so no mask bit
    /// can match. Callers may rely on this (see `BufferQueue.pop`).
    @inlinable
    func contains(_ c: Unicode.Scalar) -> Bool {
        self.bits & 1 << c.value != 0
    }
}

extension SmallCharSet: ExpressibleByArrayLiteral {
    /// Builds the set from scalar literals, e.g. `["&", "<", "\0"]`.
    init(arrayLiteral elements: Unicode.Scalar...) {
        // The set can only represent scalar values 0..<64; a scalar >= 64
        // would be silently dropped (its mask bit shifts out to zero) and
        // then never match in `contains`. Catch that misuse in debug
        // builds instead of failing silently.
        assert(
            elements.allSatisfy { $0.value < 64 },
            "SmallCharSet only supports scalars with value < 64"
        )
        self.bits = elements.reduce(into: 0) { $0 |= 1 << $1.value }
    }
}
1 change: 1 addition & 0 deletions Sources/Tokenizer/Token.swift
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
public enum Token: Equatable, Sendable {
case char(Unicode.Scalar)
case chars(ArraySlice<Unicode.Scalar>)
case tag(Tag)
case comment(String)
case doctype(DOCTYPE)
Expand Down
Loading

0 comments on commit fcc370b

Please sign in to comment.