Skip to content

Commit

Permalink
Merge pull request #39 from kkebo/named-char-ref
Browse files Browse the repository at this point in the history
  • Loading branch information
kkebo authored Apr 6, 2024
2 parents e55b299 + 1a0f255 commit f1e72b1
Show file tree
Hide file tree
Showing 5 changed files with 103 additions and 46 deletions.
14 changes: 14 additions & 0 deletions Sources/HTMLEntities/namedChars.swift
Original file line number Diff line number Diff line change
Expand Up @@ -2231,3 +2231,17 @@ public let namedChars: [String: (Unicode.Scalar, Unicode.Scalar)] = [
"yen": ("\u{A5}", "\0"),
"yuml": ("\u{FF}", "\0"),
]

// FIXME: This process should be done at compile-time, not runtime.
public let processedNamedChars: [String: (Unicode.Scalar, Unicode.Scalar)] = {
    // Extend the reference table with every proper prefix of every entity
    // name, marked with the ("\0", "\0") sentinel. A lookup can then
    // distinguish "dead end" (nil) from "keep reading, a longer entity may
    // still match" (sentinel) while consuming one character at a time.
    var table = namedChars
    for name in namedChars.keys {
        var end = name.index(after: name.startIndex)
        while end < name.endIndex {
            let prefix = String(name[..<end])
            if table[prefix] == nil {
                table[prefix] = ("\0", "\0")
            }
            end = name.index(after: end)
        }
    }
    return table
}()
78 changes: 63 additions & 15 deletions Sources/Tokenizer/CharRefTokenizer.swift
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ private import HTMLEntities
private enum CharRefState {
case initial
case named
case namedEnd(endIndex: String.Index, replaceChars: (Unicode.Scalar, Unicode.Scalar))
case ambiguousAmpersand
case numeric
case hexadecimalStart(uppercase: Bool)
Expand All @@ -23,12 +24,19 @@ struct CharRefTokenizer {
private var state: CharRefState = .initial
private var num: Int = 0
private var numTooBig: Bool = false
private var nameBuffer: String = ""
private var lastMatch: (endIndex: String.Index, replaceChars: (Unicode.Scalar, Unicode.Scalar))?
private let isInAttr: Bool

/// Creates a tokenizer for a single character reference.
/// - Parameter isInAttr: `true` when the `&` being consumed appears inside an
///   attribute value. The `.namedEnd` handling consults this flag: a named
///   reference without a trailing semicolon is abandoned (input pushed back)
///   when it occurs in an attribute and is followed by `=` or an
///   alphanumeric character.
init(inAttr isInAttr: Bool) {
self.isInAttr = isInAttr
}

mutating func tokenize(tokenizer: inout Tokenizer<some TokenSink>, input: inout Deque<Character>) -> [Unicode.Scalar]? {
repeat {
switch self.step(tokenizer: &tokenizer, input: &input) {
case .done(let scalars): return scalars
case .doneNone: return nil
case .doneNone: return ["&"]
case .progress: break
}
} while true
Expand All @@ -45,28 +53,66 @@ struct CharRefTokenizer {
tokenizer.discardChar(&input)
self.state = .numeric
return .progress
case _: return .done(["&"])
case _: return .doneNone
}
case .named:
// TODO: If there is a match
guard false else {
// TODO: Flush code points consumed as a character reference
tokenizer.processCharRef("&")
self.state = .ambiguousAmpersand
guard let c = tokenizer.peek(input) else {
guard let (endIndex, chars) = lastMatch else {
input.prepend(contentsOf: self.nameBuffer)
return .doneNone
}
self.state = .namedEnd(endIndex: endIndex, replaceChars: chars)
return .progress
}
tokenizer.discardChar(&input)
self.nameBuffer.append(c)
switch processedNamedChars[self.nameBuffer] {
case ("\0", _)?: break
case let chars?: lastMatch = (self.nameBuffer.endIndex, chars)
case nil:
if let (endIndex, chars) = lastMatch {
self.state = .namedEnd(endIndex: endIndex, replaceChars: chars)
} else {
self.state = .ambiguousAmpersand
}
}
return .progress
case .namedEnd(let endIndex, let replaceChars):
// swift-format-ignore: NeverForceUnwrap
let lastChar = self.nameBuffer[..<endIndex].last!.firstScalar
let nextChar: Unicode.Scalar? =
if self.nameBuffer.endIndex != endIndex {
self.nameBuffer[endIndex].firstScalar
} else {
nil
}
switch (isInAttr, lastChar, nextChar) {
case (_, ";", _): break
case (true, _, "="?), (true, _, ("0"..."9")?), (true, _, ("A"..."Z")?), (true, _, ("a"..."z")?):
input.prepend(contentsOf: self.nameBuffer)
return .doneNone
case _: tokenizer.emitError(.missingSemicolon)
}
input.prepend(contentsOf: self.nameBuffer[endIndex...])
return switch replaceChars {
case (let c1, "\0"): .done([c1])
case (let c1, let c2): .done([c1, c2])
}
case .ambiguousAmpersand:
guard let c = tokenizer.peek(input) else { return .doneNone }
guard let c = tokenizer.peek(input) else {
input.prepend(contentsOf: self.nameBuffer)
return .doneNone
}
switch c.firstScalar {
case "0"..."9", "A"..."Z", "a"..."z":
tokenizer.discardChar(&input)
tokenizer.processCharRef(c)
self.nameBuffer.append(c)
return .progress
case ";":
tokenizer.emitError(.unknownNamedCharRef)
return .doneNone
case _: return .doneNone
case ";": tokenizer.emitError(.unknownNamedCharRef)
case _: break
}
input.prepend(contentsOf: self.nameBuffer)
return .doneNone
case .numeric:
switch tokenizer.peek(input) {
case "X":
Expand All @@ -88,7 +134,8 @@ struct CharRefTokenizer {
return .progress
case _:
tokenizer.emitError(.absenceDigits)
return .done(["&", "#", uppercase ? "X" : "x"])
input.prepend(contentsOf: uppercase ? "#X" : "#x")
return .doneNone
}
case .decimalStart:
switch tokenizer.peek(input)?.firstScalar {
Expand All @@ -97,7 +144,8 @@ struct CharRefTokenizer {
return .progress
case _:
tokenizer.emitError(.absenceDigits)
return .done(["&", "#"])
input.prepend("#")
return .doneNone
}
case .hexadecimal:
if let firstScalar = tokenizer.peek(input)?.firstScalar {
Expand Down
16 changes: 8 additions & 8 deletions Sources/Tokenizer/Tokenizer.swift
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ public import Collections
@freestanding(codeItem) private macro goEmitDOCTYPEAndEOF() = #externalMacro(module: "TokenizerMacros", type: "GoMacro")
@freestanding(codeItem) private macro goEmitForceQuirksDOCTYPEAndEOF() = #externalMacro(module: "TokenizerMacros", type: "GoMacro")
@freestanding(codeItem) private macro goEmitNewForceQuirksDOCTYPEAndEOF() = #externalMacro(module: "TokenizerMacros", type: "GoMacro")
@freestanding(codeItem) private macro goConsumeCharRef() = #externalMacro(module: "TokenizerMacros", type: "GoMacro")
@freestanding(codeItem) private macro goConsumeCharRef(inAttr: Bool) = #externalMacro(module: "TokenizerMacros", type: "GoMacro")

public struct Tokenizer<Sink: TokenSink>: ~Copyable {
public var sink: Sink
Expand Down Expand Up @@ -94,7 +94,7 @@ public struct Tokenizer<Sink: TokenSink>: ~Copyable {
switch self.state {
case .data: repeat {
switch self.getChar(from: &input) {
case "&": #goConsumeCharRef
case "&": #goConsumeCharRef(inAttr: false)
case "<": #go(to: .tagOpen)
case "\0": #go(error: .unexpectedNull, emit: "\0")
case nil: #go(emit: .eof)
Expand All @@ -103,7 +103,7 @@ public struct Tokenizer<Sink: TokenSink>: ~Copyable {
} while true
case .rcdata: repeat {
switch self.getChar(from: &input) {
case "&": #goConsumeCharRef
case "&": #goConsumeCharRef(inAttr: false)
case "<": #go(to: .rcdataLessThanSign)
case "\0": #go(error: .unexpectedNull, emit: "\u{FFFD}")
case nil: #go(emit: .eof)
Expand Down Expand Up @@ -508,7 +508,7 @@ public struct Tokenizer<Sink: TokenSink>: ~Copyable {
case .attributeValueDoubleQuoted: repeat {
switch self.getChar(from: &input) {
case "\"": #go(to: .afterAttributeValueQuoted)
case "&": #goConsumeCharRef
case "&": #goConsumeCharRef(inAttr: true)
case "\0": #go(error: .unexpectedNull, appendAttrValue: "\u{FFFD}")
case nil: #go(error: .eofInTag, emit: .eof)
case let c?: #go(appendAttrValue: c)
Expand All @@ -517,7 +517,7 @@ public struct Tokenizer<Sink: TokenSink>: ~Copyable {
case .attributeValueSingleQuoted: repeat {
switch self.getChar(from: &input) {
case "'": #go(to: .afterAttributeValueQuoted)
case "&": #goConsumeCharRef
case "&": #goConsumeCharRef(inAttr: true)
case "\0": #go(error: .unexpectedNull, appendAttrValue: "\u{FFFD}")
case nil: #go(error: .eofInTag, emit: .eof)
case let c?: #go(appendAttrValue: c)
Expand All @@ -526,7 +526,7 @@ public struct Tokenizer<Sink: TokenSink>: ~Copyable {
case .attributeValueUnquoted: repeat {
switch self.getChar(from: &input) {
case "\t", "\n", "\u{0C}", " ": #go(to: .beforeAttributeName)
case "&": #goConsumeCharRef
case "&": #goConsumeCharRef(inAttr: true)
case ">": #go(emitTag: .data)
case "\0": #go(error: .unexpectedNull, appendAttrValue: "\u{FFFD}")
case nil: #go(error: .eofInTag, emit: .eof)
Expand Down Expand Up @@ -1138,8 +1138,8 @@ public struct Tokenizer<Sink: TokenSink>: ~Copyable {
}

@inline(__always)
private mutating func consumeCharRef() {
self.charRefTokenizer = .init()
private mutating func consumeCharRef(inAttr isInAttr: Bool) {
self.charRefTokenizer = .init(inAttr: isInAttr)
}
}

Expand Down
2 changes: 1 addition & 1 deletion Sources/TokenizerMacros/Macros.swift
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ extension GoMacro: CodeItemMacro {
case "goEmitNewForceQuirksDOCTYPEAndEOF":
return ["self.createDOCTYPE()", "self.forceQuirks()", "self.emitDOCTYPE()", "self.emitEOF()", "return .suspend"]
case "goConsumeCharRef":
return ["self.consumeCharRef()", "return .continue"]
return ["self.consumeCharRef(\(node.arguments))", "return .continue"]
case let name:
preconditionFailure("not supported: \(name)")
}
Expand Down
39 changes: 17 additions & 22 deletions Tests/TokenizerTests/HTML5LibTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,27 @@ private import Tokenizer
/// Records every token and parse error the tokenizer under test emits, so a
/// test can compare the recorded stream against the expected one.
private struct TestSink {
    var tokens = [Token]()
    var errors = [ParseError]()
    // Characters received so far that have not yet been folded into `tokens`;
    // buffering lets consecutive characters be flushed in one pass.
    var pendingChars = ""

    /// Flushes any buffered characters, then hands back the recorded stream.
    consuming func finalize() -> ([Token], [ParseError]) {
        self.processChars()
        return (self.tokens, self.errors)
    }

    /// Turns each buffered character into a `.char` token and empties the buffer.
    private mutating func processChars() {
        for character in self.pendingChars {
            self.tokens.append(.char(character))
        }
        self.pendingChars = ""
    }
}

extension TestSink: TokenSink {
mutating func process(_ token: consuming Token) {
switch token {
case .error(let error): self.errors.append(error)
case let token: self.tokens.append(token)
case .char(let c): self.pendingChars.append(c)
case let token:
self.processChars()
self.tokens.append(token)
}
}
}
Expand All @@ -25,7 +39,7 @@ private let testCases = try! [
Bundle.module.url(forResource: "test4", withExtension: "test")!,
Bundle.module.url(forResource: "unicodeChars", withExtension: "test")!,
Bundle.module.url(forResource: "entities", withExtension: "test")!,
// Bundle.module.url(forResource: "namedEntities", withExtension: "test")!,
Bundle.module.url(forResource: "namedEntities", withExtension: "test")!,
Bundle.module.url(forResource: "numericEntities", withExtension: "test")!,
Bundle.module.url(forResource: "pendingSpecChanges", withExtension: "test")!,
// Bundle.module.url(forResource: "contentModelFlags", withExtension: "test")!,
Expand All @@ -36,31 +50,12 @@ private let testCases = try! [

@Test("html5lib-tests", arguments: testCases)
func html5libTests(_ testCase: TestCase) throws {
// TODO: Do not ignore any test cases
switch testCase.title {
// test1.test
case "Entity with trailing semicolon (1)": return
case "Entity with trailing semicolon (2)": return
case "Entity without trailing semicolon (1)": return
case "Entity without trailing semicolon (2)": return
case "Entity in attribute without semicolon": return
// test2.test
case "Entity + newline": return
// entities.test
case "Undefined named entity in a double-quoted attribute value ending in semicolon and whose name starts with a known entity name.": return
case "Undefined named entity in a single-quoted attribute value ending in semicolon and whose name starts with a known entity name.": return
case "Undefined named entity in an unquoted attribute value ending in semicolon and whose name starts with a known entity name.": return
case "Semicolonless named entity 'not' followed by 'i;' in body": return
case _: break
}

var tokenizer = Tokenizer(sink: TestSink())
tokenizer.state = testCase.initialState
var input = Deque(testCase.input)
tokenizer.tokenize(&input)

let tokens = tokenizer.sink.tokens
let errors = tokenizer.sink.errors
let (tokens, errors) = tokenizer.sink.finalize()
#expect(tokens == testCase.tokens)
#expect(errors.count == testCase.errors.count) // TODO: Make it stricter
}

0 comments on commit f1e72b1

Please sign in to comment.