diff --git a/Sources/HTMLEntities/namedChars.swift b/Sources/HTMLEntities/namedChars.swift index 5f0d314..00878fe 100644 --- a/Sources/HTMLEntities/namedChars.swift +++ b/Sources/HTMLEntities/namedChars.swift @@ -2231,3 +2231,17 @@ public let namedChars: [String: (Unicode.Scalar, Unicode.Scalar)] = [ "yen": ("\u{A5}", "\0"), "yuml": ("\u{FF}", "\0"), ] + +// FIXME: This process should be done at compile-time, not runtime. +public let processedNamedChars: [String: (Unicode.Scalar, Unicode.Scalar)] = { + var namedChars = namedChars + for key in namedChars.keys { + for i in 1.., input: inout Deque) -> [Unicode.Scalar]? { repeat { switch self.step(tokenizer: &tokenizer, input: &input) { case .done(let scalars): return scalars - case .doneNone: return nil + case .doneNone: return ["&"] case .progress: break } } while true @@ -45,28 +53,66 @@ struct CharRefTokenizer { tokenizer.discardChar(&input) self.state = .numeric return .progress - case _: return .done(["&"]) + case _: return .doneNone } case .named: - // TODO: If there is a match - guard false else { - // TODO: Flush code points consumed as a character reference - tokenizer.processCharRef("&") - self.state = .ambiguousAmpersand + guard let c = tokenizer.peek(input) else { + guard let (endIndex, chars) = lastMatch else { + input.prepend(contentsOf: self.nameBuffer) + return .doneNone + } + self.state = .namedEnd(endIndex: endIndex, replaceChars: chars) return .progress } + tokenizer.discardChar(&input) + self.nameBuffer.append(c) + switch processedNamedChars[self.nameBuffer] { + case ("\0", _)?: break + case let chars?: lastMatch = (self.nameBuffer.endIndex, chars) + case nil: + if let (endIndex, chars) = lastMatch { + self.state = .namedEnd(endIndex: endIndex, replaceChars: chars) + } else { + self.state = .ambiguousAmpersand + } + } + return .progress + case .namedEnd(let endIndex, let replaceChars): + // swift-format-ignore: NeverForceUnwrap + let lastChar = self.nameBuffer[..: ~Copyable { public var sink: Sink @@ -94,7 +94,7 @@ public struct Tokenizer: ~Copyable { switch self.state { case .data: repeat { switch self.getChar(from: &input) { - case "&": #goConsumeCharRef + case "&": #goConsumeCharRef(inAttr: false) case "<": #go(to: .tagOpen) case "\0": #go(error: .unexpectedNull, emit: "\0") case nil: #go(emit: .eof) @@ -103,7 +103,7 @@ public struct Tokenizer: ~Copyable { } while true case .rcdata: repeat { switch self.getChar(from: &input) { - case "&": #goConsumeCharRef + case "&": #goConsumeCharRef(inAttr: false) case "<": #go(to: .rcdataLessThanSign) case "\0": #go(error: .unexpectedNull, emit: "\u{FFFD}") case nil: #go(emit: .eof) @@ -508,7 +508,7 @@ public struct Tokenizer: ~Copyable { case .attributeValueDoubleQuoted: repeat { switch self.getChar(from: &input) { case "\"": #go(to: .afterAttributeValueQuoted) - case "&": #goConsumeCharRef + case "&": #goConsumeCharRef(inAttr: true) case "\0": #go(error: .unexpectedNull, appendAttrValue: "\u{FFFD}") case nil: #go(error: .eofInTag, emit: .eof) case let c?: #go(appendAttrValue: c) @@ -517,7 +517,7 @@ public struct Tokenizer: ~Copyable { case .attributeValueSingleQuoted: repeat { switch self.getChar(from: &input) { case "'": #go(to: .afterAttributeValueQuoted) - case "&": #goConsumeCharRef + case "&": #goConsumeCharRef(inAttr: true) case "\0": #go(error: .unexpectedNull, appendAttrValue: "\u{FFFD}") case nil: #go(error: .eofInTag, emit: .eof) case let c?: #go(appendAttrValue: c) @@ -526,7 +526,7 @@ public struct Tokenizer: ~Copyable { case .attributeValueUnquoted: repeat { switch self.getChar(from: &input) { case "\t", "\n", "\u{0C}", " ": #go(to: .beforeAttributeName) - case "&": #goConsumeCharRef + case "&": #goConsumeCharRef(inAttr: true) case ">": #go(emitTag: .data) case "\0": #go(error: .unexpectedNull, appendAttrValue: "\u{FFFD}") case nil: #go(error: .eofInTag, emit: .eof) @@ -1138,8 +1138,8 @@ public struct Tokenizer: ~Copyable { } @inline(__always) - private mutating func consumeCharRef() { - self.charRefTokenizer = .init() + private mutating func consumeCharRef(inAttr isInAttr: Bool) { + self.charRefTokenizer = .init(inAttr: isInAttr) } } diff --git a/Sources/TokenizerMacros/Macros.swift b/Sources/TokenizerMacros/Macros.swift index a3d1639..554b8ea 100644 --- a/Sources/TokenizerMacros/Macros.swift +++ b/Sources/TokenizerMacros/Macros.swift @@ -162,7 +162,7 @@ extension GoMacro: CodeItemMacro { case "goEmitNewForceQuirksDOCTYPEAndEOF": return ["self.createDOCTYPE()", "self.forceQuirks()", "self.emitDOCTYPE()", "self.emitEOF()", "return .suspend"] case "goConsumeCharRef": - return ["self.consumeCharRef()", "return .continue"] + return ["self.consumeCharRef(\(node.arguments))", "return .continue"] case let name: preconditionFailure("not supported: \(name)") } diff --git a/Tests/TokenizerTests/HTML5LibTests.swift b/Tests/TokenizerTests/HTML5LibTests.swift index 3ebde1f..15ce1c3 100644 --- a/Tests/TokenizerTests/HTML5LibTests.swift +++ b/Tests/TokenizerTests/HTML5LibTests.swift @@ -6,13 +6,27 @@ private import Tokenizer private struct TestSink { var tokens = [Token]() var errors = [ParseError]() + var pendingChars = "" + + consuming func finalize() -> ([Token], [ParseError]) { + self.processChars() + return (self.tokens, self.errors) + } + + private mutating func processChars() { + self.tokens.append(contentsOf: self.pendingChars.map(Token.char)) + self.pendingChars.removeAll() + } } extension TestSink: TokenSink { mutating func process(_ token: consuming Token) { switch token { case .error(let error): self.errors.append(error) - case let token: self.tokens.append(token) + case .char(let c): self.pendingChars.append(c) + case let token: + self.processChars() + self.tokens.append(token) } } } @@ -25,7 +39,7 @@ private let testCases = try! [ Bundle.module.url(forResource: "test4", withExtension: "test")!, Bundle.module.url(forResource: "unicodeChars", withExtension: "test")!, Bundle.module.url(forResource: "entities", withExtension: "test")!, - // Bundle.module.url(forResource: "namedEntities", withExtension: "test")!, + Bundle.module.url(forResource: "namedEntities", withExtension: "test")!, Bundle.module.url(forResource: "numericEntities", withExtension: "test")!, Bundle.module.url(forResource: "pendingSpecChanges", withExtension: "test")!, // Bundle.module.url(forResource: "contentModelFlags", withExtension: "test")!, @@ -36,31 +50,12 @@ private let testCases = try! [ @Test("html5lib-tests", arguments: testCases) func html5libTests(_ testCase: TestCase) throws { - // TODO: Do not ignore any test cases - switch testCase.title { - // test1.test - case "Entity with trailing semicolon (1)": return - case "Entity with trailing semicolon (2)": return - case "Entity without trailing semicolon (1)": return - case "Entity without trailing semicolon (2)": return - case "Entity in attribute without semicolon": return - // test2.test - case "Entity + newline": return - // entities.test - case "Undefined named entity in a double-quoted attribute value ending in semicolon and whose name starts with a known entity name.": return - case "Undefined named entity in a single-quoted attribute value ending in semicolon and whose name starts with a known entity name.": return - case "Undefined named entity in an unquoted attribute value ending in semicolon and whose name starts with a known entity name.": return - case "Semicolonless named entity 'not' followed by 'i;' in body": return - case _: break - } - var tokenizer = Tokenizer(sink: TestSink()) tokenizer.state = testCase.initialState var input = Deque(testCase.input) tokenizer.tokenize(&input) - let tokens = tokenizer.sink.tokens - let errors = tokenizer.sink.errors + let (tokens, errors) = tokenizer.sink.finalize() #expect(tokens == testCase.tokens) #expect(errors.count == testCase.errors.count) // TODO: Make it stricter }