diff --git a/Sources/HTMLEntities/namedChars.swift b/Sources/HTMLEntities/namedChars.swift
index 5f0d314..00878fe 100644
--- a/Sources/HTMLEntities/namedChars.swift
+++ b/Sources/HTMLEntities/namedChars.swift
@@ -2231,3 +2231,17 @@ public let namedChars: [String: (Unicode.Scalar, Unicode.Scalar)] = [
"yen": ("\u{A5}", "\0"),
"yuml": ("\u{FF}", "\0"),
]
+
+// FIXME: This process should be done at compile-time, not runtime.
+public let processedNamedChars: [String: (Unicode.Scalar, Unicode.Scalar)] = {
+ var namedChars = namedChars
+ for key in namedChars.keys {
+        for i in 1..<key.count {
[… diff content lost to angle-bracket stripping: remainder of the processedNamedChars prefix-marking loop and its closing `}()`, the diff header for Sources/Tokenizer/CharRefTokenizer.swift, and the CharRefTokenizer stored properties / init — reconstruct from the original commit …]
     mutating func tokenize(tokenizer: inout Tokenizer<some TokenSink>, input: inout Deque<Character>) -> [Unicode.Scalar]? {
repeat {
switch self.step(tokenizer: &tokenizer, input: &input) {
case .done(let scalars): return scalars
- case .doneNone: return nil
+ case .doneNone: return ["&"]
case .progress: break
}
} while true
@@ -45,28 +53,66 @@ struct CharRefTokenizer {
tokenizer.discardChar(&input)
self.state = .numeric
return .progress
- case _: return .done(["&"])
+ case _: return .doneNone
}
case .named:
- // TODO: If there is a match
- guard false else {
- // TODO: Flush code points consumed as a character reference
- tokenizer.processCharRef("&")
- self.state = .ambiguousAmpersand
+ guard let c = tokenizer.peek(input) else {
+ guard let (endIndex, chars) = lastMatch else {
+ input.prepend(contentsOf: self.nameBuffer)
+ return .doneNone
+ }
+ self.state = .namedEnd(endIndex: endIndex, replaceChars: chars)
return .progress
}
+ tokenizer.discardChar(&input)
+ self.nameBuffer.append(c)
+ switch processedNamedChars[self.nameBuffer] {
+ case ("\0", _)?: break
+ case let chars?: lastMatch = (self.nameBuffer.endIndex, chars)
+ case nil:
+ if let (endIndex, chars) = lastMatch {
+ self.state = .namedEnd(endIndex: endIndex, replaceChars: chars)
+ } else {
+ self.state = .ambiguousAmpersand
+ }
+ }
+ return .progress
+ case .namedEnd(let endIndex, let replaceChars):
+ // swift-format-ignore: NeverForceUnwrap
+        let lastChar = self.nameBuffer[..<endIndex].last!
[… diff content lost to angle-bracket stripping: remainder of the `.namedEnd` / `.ambiguousAmpersand` / numeric states of CharRefTokenizer.step, and the diff header for Sources/Tokenizer/Tokenizer.swift — reconstruct from the original commit …]
 public struct Tokenizer<Sink: TokenSink>: ~Copyable {
public var sink: Sink
@@ -94,7 +94,7 @@ public struct Tokenizer: ~Copyable {
switch self.state {
case .data: repeat {
switch self.getChar(from: &input) {
- case "&": #goConsumeCharRef
+ case "&": #goConsumeCharRef(inAttr: false)
case "<": #go(to: .tagOpen)
case "\0": #go(error: .unexpectedNull, emit: "\0")
case nil: #go(emit: .eof)
@@ -103,7 +103,7 @@ public struct Tokenizer: ~Copyable {
} while true
case .rcdata: repeat {
switch self.getChar(from: &input) {
- case "&": #goConsumeCharRef
+ case "&": #goConsumeCharRef(inAttr: false)
case "<": #go(to: .rcdataLessThanSign)
case "\0": #go(error: .unexpectedNull, emit: "\u{FFFD}")
case nil: #go(emit: .eof)
@@ -508,7 +508,7 @@ public struct Tokenizer: ~Copyable {
case .attributeValueDoubleQuoted: repeat {
switch self.getChar(from: &input) {
case "\"": #go(to: .afterAttributeValueQuoted)
- case "&": #goConsumeCharRef
+ case "&": #goConsumeCharRef(inAttr: true)
case "\0": #go(error: .unexpectedNull, appendAttrValue: "\u{FFFD}")
case nil: #go(error: .eofInTag, emit: .eof)
case let c?: #go(appendAttrValue: c)
@@ -517,7 +517,7 @@ public struct Tokenizer: ~Copyable {
case .attributeValueSingleQuoted: repeat {
switch self.getChar(from: &input) {
case "'": #go(to: .afterAttributeValueQuoted)
- case "&": #goConsumeCharRef
+ case "&": #goConsumeCharRef(inAttr: true)
case "\0": #go(error: .unexpectedNull, appendAttrValue: "\u{FFFD}")
case nil: #go(error: .eofInTag, emit: .eof)
case let c?: #go(appendAttrValue: c)
@@ -526,7 +526,7 @@ public struct Tokenizer: ~Copyable {
case .attributeValueUnquoted: repeat {
switch self.getChar(from: &input) {
case "\t", "\n", "\u{0C}", " ": #go(to: .beforeAttributeName)
- case "&": #goConsumeCharRef
+ case "&": #goConsumeCharRef(inAttr: true)
case ">": #go(emitTag: .data)
case "\0": #go(error: .unexpectedNull, appendAttrValue: "\u{FFFD}")
case nil: #go(error: .eofInTag, emit: .eof)
@@ -1138,8 +1138,8 @@ public struct Tokenizer: ~Copyable {
}
@inline(__always)
- private mutating func consumeCharRef() {
- self.charRefTokenizer = .init()
+ private mutating func consumeCharRef(inAttr isInAttr: Bool) {
+ self.charRefTokenizer = .init(inAttr: isInAttr)
}
}
diff --git a/Sources/TokenizerMacros/Macros.swift b/Sources/TokenizerMacros/Macros.swift
index a3d1639..554b8ea 100644
--- a/Sources/TokenizerMacros/Macros.swift
+++ b/Sources/TokenizerMacros/Macros.swift
@@ -162,7 +162,7 @@ extension GoMacro: CodeItemMacro {
case "goEmitNewForceQuirksDOCTYPEAndEOF":
return ["self.createDOCTYPE()", "self.forceQuirks()", "self.emitDOCTYPE()", "self.emitEOF()", "return .suspend"]
case "goConsumeCharRef":
- return ["self.consumeCharRef()", "return .continue"]
+ return ["self.consumeCharRef(\(node.arguments))", "return .continue"]
case let name:
preconditionFailure("not supported: \(name)")
}
diff --git a/Tests/TokenizerTests/HTML5LibTests.swift b/Tests/TokenizerTests/HTML5LibTests.swift
index 3ebde1f..15ce1c3 100644
--- a/Tests/TokenizerTests/HTML5LibTests.swift
+++ b/Tests/TokenizerTests/HTML5LibTests.swift
@@ -6,13 +6,27 @@ private import Tokenizer
private struct TestSink {
var tokens = [Token]()
var errors = [ParseError]()
+ var pendingChars = ""
+
+ consuming func finalize() -> ([Token], [ParseError]) {
+ self.processChars()
+ return (self.tokens, self.errors)
+ }
+
+ private mutating func processChars() {
+ self.tokens.append(contentsOf: self.pendingChars.map(Token.char))
+ self.pendingChars.removeAll()
+ }
}
extension TestSink: TokenSink {
mutating func process(_ token: consuming Token) {
switch token {
case .error(let error): self.errors.append(error)
- case let token: self.tokens.append(token)
+ case .char(let c): self.pendingChars.append(c)
+ case let token:
+ self.processChars()
+ self.tokens.append(token)
}
}
}
@@ -25,7 +39,7 @@ private let testCases = try! [
Bundle.module.url(forResource: "test4", withExtension: "test")!,
Bundle.module.url(forResource: "unicodeChars", withExtension: "test")!,
Bundle.module.url(forResource: "entities", withExtension: "test")!,
- // Bundle.module.url(forResource: "namedEntities", withExtension: "test")!,
+ Bundle.module.url(forResource: "namedEntities", withExtension: "test")!,
Bundle.module.url(forResource: "numericEntities", withExtension: "test")!,
Bundle.module.url(forResource: "pendingSpecChanges", withExtension: "test")!,
// Bundle.module.url(forResource: "contentModelFlags", withExtension: "test")!,
@@ -36,31 +50,12 @@ private let testCases = try! [
@Test("html5lib-tests", arguments: testCases)
func html5libTests(_ testCase: TestCase) throws {
- // TODO: Do not ignore any test cases
- switch testCase.title {
- // test1.test
- case "Entity with trailing semicolon (1)": return
- case "Entity with trailing semicolon (2)": return
- case "Entity without trailing semicolon (1)": return
- case "Entity without trailing semicolon (2)": return
- case "Entity in attribute without semicolon": return
- // test2.test
- case "Entity + newline": return
- // entities.test
- case "Undefined named entity in a double-quoted attribute value ending in semicolon and whose name starts with a known entity name.": return
- case "Undefined named entity in a single-quoted attribute value ending in semicolon and whose name starts with a known entity name.": return
- case "Undefined named entity in an unquoted attribute value ending in semicolon and whose name starts with a known entity name.": return
- case "Semicolonless named entity 'not' followed by 'i;' in body": return
- case _: break
- }
-
var tokenizer = Tokenizer(sink: TestSink())
tokenizer.state = testCase.initialState
var input = Deque(testCase.input)
tokenizer.tokenize(&input)
- let tokens = tokenizer.sink.tokens
- let errors = tokenizer.sink.errors
+ let (tokens, errors) = tokenizer.sink.finalize()
#expect(tokens == testCase.tokens)
#expect(errors.count == testCase.errors.count) // TODO: Make it stricter
}