Merge pull request #63 from kkebo/unicode-scalar
kkebo authored May 2, 2024
2 parents 1b292dd + 03431b5 commit fc60b98
Showing 9 changed files with 311 additions and 222 deletions.
Benchmarks/Benchmarks/MyBenchmark/MyBenchmark.swift (2 changes: 1 addition & 1 deletion)
@@ -13,7 +13,7 @@ extension TestSink: TokenSink {
 
 private func runBench(_ name: String, configuration conf: Benchmark.Configuration) {
     // swift-format-ignore: NeverUseForceTry, NeverForceUnwrap
-    let html = try! String(contentsOf: Bundle.module.url(forResource: name, withExtension: "html")!)
+    let html = try! String(contentsOf: Bundle.module.url(forResource: name, withExtension: "html")!).unicodeScalars
     let input = Deque(consume html)
     Benchmark(name, configuration: conf) { benchmark in
         for _ in benchmark.scaledIterations {
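
Note on the change above: String.unicodeScalars yields Unicode.Scalar values, so the Deque passed to the tokenizer now buffers scalars instead of grapheme clusters. A minimal sketch of that pipeline, assuming only DequeModule from swift-collections (the literal input below is illustrative, not from this repo):

import DequeModule

// The view enumerates Unicode.Scalar values, so Deque's element type
// becomes Unicode.Scalar rather than Character.
let html = "a&amp;b".unicodeScalars
var input = Deque(html)  // Deque<Unicode.Scalar>
while let scalar = input.popFirst() {
    print(scalar, scalar.value)  // "a" 97, "&" 38, ...
}
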
Sources/Tokenizer/CharRefTokenizer.swift (35 changes: 18 additions & 17 deletions)
@@ -32,7 +32,7 @@ struct CharRefTokenizer {
         self.isInAttr = isInAttr
     }
 
-    mutating func tokenize(tokenizer: inout Tokenizer<some ~Copyable & TokenSink>, input: inout Deque<Character>) -> [Unicode.Scalar]? {
+    mutating func tokenize(tokenizer: inout Tokenizer<some ~Copyable & TokenSink>, input: inout Deque<Unicode.Scalar>) -> [Unicode.Scalar]? {
         repeat {
             switch self.step(tokenizer: &tokenizer, input: &input) {
             case .done(let scalars): return scalars
@@ -42,10 +42,10 @@
         } while true
     }
 
-    private mutating func step(tokenizer: inout Tokenizer<some ~Copyable & TokenSink>, input: inout Deque<Character>) -> CharRefProcessResult {
+    private mutating func step(tokenizer: inout Tokenizer<some ~Copyable & TokenSink>, input: inout Deque<Unicode.Scalar>) -> CharRefProcessResult {
         switch self.state {
         case .initial:
-            switch tokenizer.peek(input)?.firstScalar {
+            switch tokenizer.peek(input) {
             case ("0"..."9")?, ("A"..."Z")?, ("a"..."z")?:
                 self.state = .named
                 return .progress
@@ -58,14 +58,14 @@
         case .named:
             guard let c = tokenizer.peek(input) else {
                 guard let (endIndex, chars) = lastMatch else {
-                    input.prepend(contentsOf: self.nameBuffer)
+                    input.prepend(contentsOf: self.nameBuffer.unicodeScalars)
                     return .doneNone
                 }
                 self.state = .namedEnd(endIndex: endIndex, replaceChars: chars)
                 return .progress
             }
             tokenizer.discardChar(&input)
-            self.nameBuffer.append(c)
+            self.nameBuffer.append(Character(c))
             switch processedNamedChars[self.nameBuffer] {
             case ("\0", _)?: break
             case let chars?: lastMatch = (self.nameBuffer.endIndex, chars)
@@ -79,7 +79,7 @@
             return .progress
         case .namedEnd(let endIndex, let replaceChars):
             // swift-format-ignore: NeverForceUnwrap
-            let lastChar = self.nameBuffer[..<endIndex].last!.firstScalar
+            let lastChar = self.nameBuffer[..<endIndex].last!
             let nextChar: Unicode.Scalar? =
                 if self.nameBuffer.endIndex != endIndex {
                     self.nameBuffer[endIndex].firstScalar
@@ -89,29 +89,29 @@
             switch (isInAttr, lastChar, nextChar) {
             case (_, ";", _): break
             case (true, _, "="?), (true, _, ("0"..."9")?), (true, _, ("A"..."Z")?), (true, _, ("a"..."z")?):
-                input.prepend(contentsOf: self.nameBuffer)
+                input.prepend(contentsOf: self.nameBuffer.unicodeScalars)
                 return .doneNone
             case _: tokenizer.emitError(.missingSemicolon)
             }
-            input.prepend(contentsOf: self.nameBuffer[endIndex...])
+            input.prepend(contentsOf: self.nameBuffer[endIndex...].unicodeScalars)
             return switch replaceChars {
             case (let c1, "\0"): .done([c1])
             case (let c1, let c2): .done([c1, c2])
             }
         case .ambiguousAmpersand:
             guard let c = tokenizer.peek(input) else {
-                input.prepend(contentsOf: self.nameBuffer)
+                input.prepend(contentsOf: self.nameBuffer.unicodeScalars)
                 return .doneNone
             }
-            switch c.firstScalar {
+            switch c {
             case "0"..."9", "A"..."Z", "a"..."z":
                 tokenizer.discardChar(&input)
-                self.nameBuffer.append(c)
+                self.nameBuffer.append(Character(c))
                 return .progress
             case ";": tokenizer.emitError(.unknownNamedCharRef)
             case _: break
             }
-            input.prepend(contentsOf: self.nameBuffer)
+            input.prepend(contentsOf: self.nameBuffer.unicodeScalars)
             return .doneNone
         case .numeric:
             switch tokenizer.peek(input) {
@@ -128,17 +128,18 @@
                 return .progress
             }
         case .hexadecimalStart(let uppercase):
-            switch tokenizer.peek(input)?.firstScalar {
+            switch tokenizer.peek(input) {
             case ("0"..."9")?, ("A"..."F")?, ("a"..."f")?:
                 self.state = .hexadecimal
                 return .progress
             case _:
                 tokenizer.emitError(.absenceDigits)
-                input.prepend(contentsOf: uppercase ? "#X" : "#x")
+                input.prepend(uppercase ? "X" : "x")
+                input.prepend("#")
                 return .doneNone
             }
         case .decimalStart:
-            switch tokenizer.peek(input)?.firstScalar {
+            switch tokenizer.peek(input) {
             case ("0"..."9")?:
                 self.state = .decimal
                 return .progress
@@ -148,7 +148,7 @@
                 return .doneNone
             }
         case .hexadecimal:
-            if let firstScalar = tokenizer.peek(input)?.firstScalar {
+            if let firstScalar = tokenizer.peek(input) {
                 switch firstScalar {
                 case "0"..."9":
                     tokenizer.discardChar(&input)
@@ -185,7 +185,7 @@
                 self.state = .numericEnd
                 return .progress
         case .decimal:
-            if let firstScalar = tokenizer.peek(input)?.firstScalar {
+            if let firstScalar = tokenizer.peek(input) {
                 switch firstScalar {
                 case "0"..."9":
                     tokenizer.discardChar(&input)
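
Two details of the hunks above are easy to miss. First, the pair of single-scalar prepends replaces input.prepend(contentsOf: uppercase ? "#X" : "#x") because a String literal enumerates as Characters, not Unicode.Scalars; since prepend pushes onto the front of the deque, prepending "X" and then "#" leaves "#X" at the head in the right order. Second, range patterns such as "0"..."9" carry over unchanged because Unicode.Scalar is both ExpressibleByUnicodeScalarLiteral and Comparable. A standalone sketch of the same digit test (the helper name is illustrative, not from this repo):

// Hypothetical helper mirroring the hexadecimal-digit check in the
// .hexadecimalStart and .hexadecimal states above.
func isASCIIHexDigit(_ s: Unicode.Scalar) -> Bool {
    switch s {
    case "0"..."9", "A"..."F", "a"..."f": return true
    case _: return false
    }
}

// isASCIIHexDigit("f")  // true
// isASCIIHexDigit("g")  // false
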
Sources/Tokenizer/Token.swift (2 changes: 1 addition & 1 deletion)
@@ -1,5 +1,5 @@
 public enum Token: Equatable, Sendable {
-    case char(Character)
+    case char(Unicode.Scalar)
     case tag(Tag)
     case comment(String)
     case doctype(DOCTYPE)
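
With the payload of .char now a Unicode.Scalar, downstream consumers of Token must convert at the scalar level when accumulating text. A hypothetical consumer sketching the shape of that change (render is not part of this repo):

func render(_ tokens: [Token]) -> String {
    var out = ""
    for case .char(let scalar) in tokens {
        // String exposes a mutable unicodeScalars view, so no
        // intermediate Character conversion is needed.
        out.unicodeScalars.append(scalar)
    }
    return out
}
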