Update html5lib-tests #460

Merged Aug 11, 2023 (40 commits)

Commits

e7c8609  wip on updating html5lib-tests (untitaker, Dec 10, 2021)
d7fbbeb  fix up parse error parsing (untitaker, Jan 17, 2022)
6fea210  add better debug output (untitaker, Jan 21, 2022)
42cb8c5  wip (untitaker, Jan 21, 2022)
fe69f2a  wip (untitaker, Jan 21, 2022)
92870e5  wip (untitaker, Jan 21, 2022)
be63b7a  wip (untitaker, Jan 21, 2022)
22b810a  adjust all switches to BogusComment (according to html5gum) (untitaker, Jan 21, 2022)
ce33a56  wip (untitaker, Jan 21, 2022)
388a97b  wip (untitaker, Jan 21, 2022)
e02f121  wip (untitaker, Jan 21, 2022)
ff62f59  wip (untitaker, Jan 21, 2022)
8091d12  wip (untitaker, Jan 21, 2022)
4d704fc  wip (untitaker, Jan 21, 2022)
71ed7a6  wip (test3 done) (untitaker, Jan 21, 2022)
eeb17bb  fix test1 (untitaker, Jan 23, 2022)
8444198  wip on entities.test (untitaker, Jan 23, 2022)
4bf9b3e  get rid of addnl_allowed in charref tokenizer (untitaker, Feb 2, 2022)
575b077  remove bogusname??? (untitaker, Mar 19, 2022)
c9f10e1  fix escapeFlag.test: End tag surrounded by bogus comment in RCDATA or… (untitaker, Apr 1, 2022)
38912be  update html5lib tests (untitaker, Jun 21, 2023)
448122d  Merge branch 'master' into html5lib-tests-update (untitaker, Jul 16, 2023)
9365e9c  Revert "remove bogusname???" (untitaker, Apr 3, 2022)
eb28165  wip restore bogusname (untitaker, Apr 3, 2022)
ee5e375  more bugfixes (untitaker, Jul 16, 2023)
5b9ba09  Revert "wip restore bogusname" (untitaker, Jul 16, 2023)
07b4121  fix a bug when peeking characters in BeforeAttributeValue (untitaker, Jul 16, 2023)
a9c3726  make eat() pre-process input characters (untitaker, Jul 17, 2023)
4e23332  update charref states (untitaker, Jul 17, 2023)
10b47e4  add regression tests, skip broken test (untitaker, Jul 17, 2023)
e909dd4  fix hang (untitaker, Jul 17, 2023)
7f92546  fix bug where ignore_lf was not reset during unconsuming (untitaker, Jul 17, 2023)
b78f4c8  fix webkit02.dat-26 test (untitaker, Jul 18, 2023)
1d177fd  fix wbekit02.dat-22 (untitaker, Jul 18, 2023)
dd5cf99  fix ack self-closing (untitaker, Jul 18, 2023)
6c28250  fix tests26.dat-19 (untitaker, Jul 18, 2023)
dcc293c  fix foreign-fragment.dat-65 (untitaker, Jul 18, 2023)
fb3d51d  fix search-element.dat-0 (untitaker, Jul 18, 2023)
968f662  fix search-element.dat-1 (untitaker, Jul 18, 2023)
2a5a626  fix bug in charref tokenizer wrt newline normalization (untitaker, Jul 18, 2023)

63 changes: 30 additions & 33 deletions html5ever/src/tokenizer/char_ref/mod.rs
@@ -47,8 +47,8 @@ enum State {
 
 pub struct CharRefTokenizer {
     state: State,
-    addnl_allowed: Option<char>,
     result: Option<CharRef>,
+    is_consumed_in_attribute: bool,
 
     num: u32,
     num_too_big: bool,
@@ -61,12 +61,10 @@ pub struct CharRefTokenizer {
 }
 
 impl CharRefTokenizer {
-    // NB: We assume that we have an additional allowed character iff we're
-    // tokenizing in an attribute value.
-    pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer {
+    pub fn new(is_consumed_in_attribute: bool) -> CharRefTokenizer {
         CharRefTokenizer {
+            is_consumed_in_attribute,
             state: Begin,
-            addnl_allowed,
             result: None,
             num: 0,
             num_too_big: false,
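
For orientation, the constructor change above drops the "additional allowed character" and keeps only the fact it encoded. A minimal, hypothetical sketch (the helper below is not from this PR or from html5ever): per the removed NB comment, the old parameter was Some(_) exactly when the reference was being consumed inside an attribute value, so the new bool carries the same information.

```rust
// Hypothetical helper, not part of this PR: the old Option<char> argument was
// Some(_) iff we were tokenizing inside an attribute value (per the removed NB
// comment), which is all the new bool needs to record.
fn is_consumed_in_attribute(addnl_allowed: Option<char>) -> bool {
    addnl_allowed.is_some()
}

fn main() {
    assert!(is_consumed_in_attribute(Some('"'))); // e.g. inside a quoted attribute value
    assert!(!is_consumed_in_attribute(None));     // ordinary text content
}
```
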
@@ -140,20 +138,18 @@ impl CharRefTokenizer {
         input: &mut BufferQueue,
     ) -> Status {
         match unwrap_or_return!(tokenizer.peek(input), Stuck) {
-            '\t' | '\n' | '\x0C' | ' ' | '<' | '&' => self.finish_none(),
-            c if Some(c) == self.addnl_allowed => self.finish_none(),
+            'a'..='z' | 'A'..='Z' | '0'..='9' => {
+                self.state = Named;
+                self.name_buf_opt = Some(StrTendril::new());
+                Progress
+            },
 
             '#' => {
                 tokenizer.discard_char(input);
                 self.state = Octothorpe;
                 Progress
             },
-
-            _ => {
-                self.state = Named;
-                self.name_buf_opt = Some(StrTendril::new());
-                Progress
-            },
+            _ => self.finish_none(),
         }
     }
 
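
The rewritten match above (the handler for the Begin state) changes the dispatch after an ampersand: an ASCII alphanumeric starts a named reference, '#' starts a numeric one, and anything else means the '&' was not a character reference at all, replacing the old character blacklist plus additional-allowed-character check. A standalone sketch of that dispatch, using toy types rather than html5ever's:

```rust
// Toy illustration of the Begin-state dispatch (not html5ever's real types).
#[derive(Debug, PartialEq)]
enum Begin {
    Named,       // 'a'..='z' | 'A'..='Z' | '0'..='9'
    Numeric,     // '#'
    NotACharRef, // everything else: the '&' stays literal
}

fn classify(next: char) -> Begin {
    match next {
        'a'..='z' | 'A'..='Z' | '0'..='9' => Begin::Named,
        '#' => Begin::Numeric,
        _ => Begin::NotACharRef,
    }
}

fn main() {
    assert_eq!(classify('a'), Begin::Named);       // "&amp;" path
    assert_eq!(classify('#'), Begin::Numeric);     // "&#38;" path
    assert_eq!(classify(' '), Begin::NotACharRef); // bare "& " stays as-is
    assert_eq!(classify('<'), Begin::NotACharRef);
}
```
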
@@ -277,7 +273,10 @@ impl CharRefTokenizer {
         tokenizer: &mut Tokenizer<Sink>,
         input: &mut BufferQueue,
     ) -> Status {
-        let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
+        // peek + discard skips over newline normalization, therefore making it easier to
+        // un-consume
+        let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
+        tokenizer.discard_char(input);
         self.name_buf_mut().push_char(c);
         match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
             // We have either a full match or a prefix of one.
@@ -356,26 +355,20 @@ impl CharRefTokenizer {
             Some(self.name_buf()[name_len..].chars().next().unwrap())
         };
 
-        // "If the character reference is being consumed as part of an
-        // attribute, and the last character matched is not a U+003B
-        // SEMICOLON character (;), and the next character is either a
-        // U+003D EQUALS SIGN character (=) or an alphanumeric ASCII
-        // character, then, for historical reasons, all the characters
-        // that were matched after the U+0026 AMPERSAND character (&)
-        // must be unconsumed, and nothing is returned. However, if
-        // this next character is in fact a U+003D EQUALS SIGN
-        // character (=), then this is a parse error"
-
-        let unconsume_all = match (self.addnl_allowed, last_matched, next_after) {
+        // If the character reference was consumed as part of an attribute, and the last
+        // character matched is not a U+003B SEMICOLON character (;), and the next input
+        // character is either a U+003D EQUALS SIGN character (=) or an ASCII alphanumeric,
+        // then, for historical reasons, flush code points consumed as a character
+        // reference and switch to the return state.
+
+        let unconsume_all = match (self.is_consumed_in_attribute, last_matched, next_after) {
             (_, ';', _) => false,
-            (Some(_), _, Some('=')) => {
-                tokenizer.emit_error(Borrowed(
-                    "Equals sign after character reference in attribute",
-                ));
-                true
-            },
-            (Some(_), _, Some(c)) if c.is_ascii_alphanumeric() => true,
+            (true, _, Some('=')) => true,
+            (true, _, Some(c)) if c.is_ascii_alphanumeric() => true,
             _ => {
+                // 1. If the last character matched is not a U+003B SEMICOLON character
+                //    (;), then this is a missing-semicolon-after-character-reference parse
+                //    error.
                 tokenizer.emit_error(Borrowed(
                     "Character reference does not end with semicolon",
                 ));
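
The new comment and match arms above implement the spec's "historical reasons" carve-out for named references that are not terminated by a semicolon. A self-contained sketch of the same decision with plain values instead of tokenizer state (the function below is illustrative, not html5ever's API):

```rust
// Sketch of the unconsume decision above: a partially matched named reference
// inside an attribute is left as literal text when it is followed by '=' or an
// ASCII alphanumeric; otherwise it is expanded (with a missing-semicolon parse
// error if it did not end in ';').
fn unconsume_all(in_attribute: bool, last_matched: char, next_after: Option<char>) -> bool {
    match (in_attribute, last_matched, next_after) {
        (_, ';', _) => false,
        (true, _, Some('=')) => true,
        (true, _, Some(c)) if c.is_ascii_alphanumeric() => true,
        _ => false,
    }
}

fn main() {
    // href="?x=1&not=2": "&not" is followed by '=', so it stays literal.
    assert!(unconsume_all(true, 't', Some('=')));
    // href="?x=1&notit": followed by an alphanumeric, also literal.
    assert!(unconsume_all(true, 't', Some('i')));
    // In text content, "&notit" still expands "&not" (plus a parse error).
    assert!(!unconsume_all(false, 't', Some('i')));
    // "&not;" ends with a semicolon and is always expanded.
    assert!(!unconsume_all(true, ';', None));
}
```
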
@@ -388,6 +381,7 @@
             self.finish_none()
         } else {
             input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..]));
+            tokenizer.ignore_lf = false;
             self.result = Some(CharRef {
                 chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
                 num_chars: if c2 == 0 { 1 } else { 2 },
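
The ignore_lf reset added above, together with the peek-plus-discard change earlier in this file, concerns un-consuming text that has already gone through \r\n normalization; the commit messages "fix bug where ignore_lf was not reset during unconsuming" and "fix bug in charref tokenizer wrt newline normalization" appear to point at the same problem. A toy model of why a normalizing read is hard to undo (hypothetical types, only loosely in the spirit of the real Tokenizer and BufferQueue):

```rust
// Toy model: a normalizing read turns '\r' into '\n' and arms an "ignore the
// next LF" flag, so the character it hands back no longer matches the raw
// input. Pushing that character back (un-consuming) without clearing the flag
// leaves the reader in an inconsistent state; a raw peek-then-discard has
// nothing to undo.
struct Input {
    raw: Vec<char>,
    ignore_lf: bool,
}

impl Input {
    // Normalizing read, roughly analogous to the tokenizer's preprocessing.
    fn get_char(&mut self) -> Option<char> {
        let mut c = self.next_raw()?;
        if self.ignore_lf && c == '\n' {
            self.ignore_lf = false;
            c = self.next_raw()?;
        }
        if c == '\r' {
            self.ignore_lf = true;
            c = '\n';
        }
        Some(c)
    }

    // Raw read, as used by a peek + discard pair: returns the input as-is.
    fn next_raw(&mut self) -> Option<char> {
        if self.raw.is_empty() {
            None
        } else {
            Some(self.raw.remove(0))
        }
    }
}

fn main() {
    let mut normalizing = Input { raw: "\r\nx".chars().collect(), ignore_lf: false };
    // The normalizing path returns '\n' for a raw '\r' and arms ignore_lf, so
    // pushing that '\n' back would not recreate the original "\r\n".
    assert_eq!(normalizing.get_char(), Some('\n'));
    assert!(normalizing.ignore_lf);

    let mut raw = Input { raw: "\r\nx".chars().collect(), ignore_lf: false };
    // The raw path returns exactly what is in the buffer.
    assert_eq!(raw.next_raw(), Some('\r'));
}
```
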
@@ -403,7 +397,10 @@
         tokenizer: &mut Tokenizer<Sink>,
         input: &mut BufferQueue,
     ) -> Status {
-        let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
+        // peek + discard skips over newline normalization, therefore making it easier to
+        // un-consume
+        let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
+        tokenizer.discard_char(input);
         self.name_buf_mut().push_char(c);
         match c {
             _ if c.is_ascii_alphanumeric() => return Progress,