Update html5lib-tests #460

Merged Aug 11, 2023 (40 commits)

Commits

e7c8609  wip on updating html5lib-tests (untitaker, Dec 10, 2021)
d7fbbeb  fix up parse error parsing (untitaker, Jan 17, 2022)
6fea210  add better debug output (untitaker, Jan 21, 2022)
42cb8c5  wip (untitaker, Jan 21, 2022)
fe69f2a  wip (untitaker, Jan 21, 2022)
92870e5  wip (untitaker, Jan 21, 2022)
be63b7a  wip (untitaker, Jan 21, 2022)
22b810a  adjust all switches to BogusComment (according to html5gum) (untitaker, Jan 21, 2022)
ce33a56  wip (untitaker, Jan 21, 2022)
388a97b  wip (untitaker, Jan 21, 2022)
e02f121  wip (untitaker, Jan 21, 2022)
ff62f59  wip (untitaker, Jan 21, 2022)
8091d12  wip (untitaker, Jan 21, 2022)
4d704fc  wip (untitaker, Jan 21, 2022)
71ed7a6  wip (test3 done) (untitaker, Jan 21, 2022)
eeb17bb  fix test1 (untitaker, Jan 23, 2022)
8444198  wip on entities.test (untitaker, Jan 23, 2022)
4bf9b3e  get rid of addnl_allowed in charref tokenizer (untitaker, Feb 2, 2022)
575b077  remove bogusname??? (untitaker, Mar 19, 2022)
c9f10e1  fix escapeFlag.test: End tag surrounded by bogus comment in RCDATA or… (untitaker, Apr 1, 2022)
38912be  update html5lib tests (untitaker, Jun 21, 2023)
448122d  Merge branch 'master' into html5lib-tests-update (untitaker, Jul 16, 2023)
9365e9c  Revert "remove bogusname???" (untitaker, Apr 3, 2022)
eb28165  wip restore bogusname (untitaker, Apr 3, 2022)
ee5e375  more bugfixes (untitaker, Jul 16, 2023)
5b9ba09  Revert "wip restore bogusname" (untitaker, Jul 16, 2023)
07b4121  fix a bug when peeking characters in BeforeAttributeValue (untitaker, Jul 16, 2023)
a9c3726  make eat() pre-process input characters (untitaker, Jul 17, 2023)
4e23332  update charref states (untitaker, Jul 17, 2023)
10b47e4  add regression tests, skip broken test (untitaker, Jul 17, 2023)
e909dd4  fix hang (untitaker, Jul 17, 2023)
7f92546  fix bug where ignore_lf was not reset during unconsuming (untitaker, Jul 17, 2023)
b78f4c8  fix webkit02.dat-26 test (untitaker, Jul 18, 2023)
1d177fd  fix wbekit02.dat-22 (untitaker, Jul 18, 2023)
dd5cf99  fix ack self-closing (untitaker, Jul 18, 2023)
6c28250  fix tests26.dat-19 (untitaker, Jul 18, 2023)
dcc293c  fix foreign-fragment.dat-65 (untitaker, Jul 18, 2023)
fb3d51d  fix search-element.dat-0 (untitaker, Jul 18, 2023)
968f662  fix search-element.dat-1 (untitaker, Jul 18, 2023)
2a5a626  fix bug in charref tokenizer wrt newline normalization (untitaker, Jul 18, 2023)

63 changes: 30 additions & 33 deletions html5ever/src/tokenizer/char_ref/mod.rs
@@ -47,8 +47,8 @@ enum State {
 
 pub struct CharRefTokenizer {
     state: State,
-    addnl_allowed: Option<char>,
     result: Option<CharRef>,
+    is_consumed_in_attribute: bool,
 
     num: u32,
     num_too_big: bool,
@@ -61,12 +61,10 @@ pub struct CharRefTokenizer {
 }
 
 impl CharRefTokenizer {
-    // NB: We assume that we have an additional allowed character iff we're
-    // tokenizing in an attribute value.
-    pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer {
+    pub fn new(is_consumed_in_attribute: bool) -> CharRefTokenizer {
         CharRefTokenizer {
+            is_consumed_in_attribute,
             state: Begin,
-            addnl_allowed,
             result: None,
             num: 0,
             num_too_big: false,
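
For orientation, the constructor change above drops the "additional allowed character" and keeps only the fact it encoded. A minimal, hypothetical sketch (the helper below is not from this PR or from html5ever): per the removed NB comment, the old parameter was Some(_) exactly when the reference was being consumed inside an attribute value, so the new bool carries the same information.

```rust
// Hypothetical helper, not part of this PR: the old Option<char> argument was
// Some(_) iff we were tokenizing inside an attribute value (per the removed NB
// comment), which is all the new bool needs to record.
fn is_consumed_in_attribute(addnl_allowed: Option<char>) -> bool {
    addnl_allowed.is_some()
}

fn main() {
    assert!(is_consumed_in_attribute(Some('"'))); // e.g. inside a quoted attribute value
    assert!(!is_consumed_in_attribute(None));     // ordinary text content
}
```
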
@@ -140,20 +138,18 @@ impl CharRefTokenizer {
         input: &mut BufferQueue,
     ) -> Status {
         match unwrap_or_return!(tokenizer.peek(input), Stuck) {
-            '\t' | '\n' | '\x0C' | ' ' | '<' | '&' => self.finish_none(),
-            c if Some(c) == self.addnl_allowed => self.finish_none(),
+            'a'..='z' | 'A'..='Z' | '0'..='9' => {
+                self.state = Named;
+                self.name_buf_opt = Some(StrTendril::new());
+                Progress
+            },
 
             '#' => {
                 tokenizer.discard_char(input);
                 self.state = Octothorpe;
                 Progress
             },
-
-            _ => {
-                self.state = Named;
-                self.name_buf_opt = Some(StrTendril::new());
-                Progress
-            },
+            _ => self.finish_none(),
         }
     }
 
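
The rewritten match above (the handler for the Begin state) changes the dispatch after an ampersand: an ASCII alphanumeric starts a named reference, '#' starts a numeric one, and anything else means the '&' was not a character reference at all, replacing the old character blacklist plus additional-allowed-character check. A standalone sketch of that dispatch, using toy types rather than html5ever's:

```rust
// Toy illustration of the Begin-state dispatch (not html5ever's real types).
#[derive(Debug, PartialEq)]
enum Begin {
    Named,       // 'a'..='z' | 'A'..='Z' | '0'..='9'
    Numeric,     // '#'
    NotACharRef, // everything else: the '&' stays literal
}

fn classify(next: char) -> Begin {
    match next {
        'a'..='z' | 'A'..='Z' | '0'..='9' => Begin::Named,
        '#' => Begin::Numeric,
        _ => Begin::NotACharRef,
    }
}

fn main() {
    assert_eq!(classify('a'), Begin::Named);       // "&amp;" path
    assert_eq!(classify('#'), Begin::Numeric);     // "&#38;" path
    assert_eq!(classify(' '), Begin::NotACharRef); // bare "& " stays as-is
    assert_eq!(classify('<'), Begin::NotACharRef);
}
```
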
@@ -277,7 +273,10 @@ impl CharRefTokenizer {
         tokenizer: &mut Tokenizer<Sink>,
         input: &mut BufferQueue,
     ) -> Status {
-        let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
+        // peek + discard skips over newline normalization, therefore making it easier to
+        // un-consume
+        let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
+        tokenizer.discard_char(input);
         self.name_buf_mut().push_char(c);
         match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
             // We have either a full match or a prefix of one.
@@ -356,26 +355,20 @@ impl CharRefTokenizer {
             Some(self.name_buf()[name_len..].chars().next().unwrap())
         };
 
-        // "If the character reference is being consumed as part of an
-        // attribute, and the last character matched is not a U+003B
-        // SEMICOLON character (;), and the next character is either a
-        // U+003D EQUALS SIGN character (=) or an alphanumeric ASCII
-        // character, then, for historical reasons, all the characters
-        // that were matched after the U+0026 AMPERSAND character (&)
-        // must be unconsumed, and nothing is returned. However, if
-        // this next character is in fact a U+003D EQUALS SIGN
-        // character (=), then this is a parse error"
-
-        let unconsume_all = match (self.addnl_allowed, last_matched, next_after) {
+        // If the character reference was consumed as part of an attribute, and the last
+        // character matched is not a U+003B SEMICOLON character (;), and the next input
+        // character is either a U+003D EQUALS SIGN character (=) or an ASCII alphanumeric,
+        // then, for historical reasons, flush code points consumed as a character
+        // reference and switch to the return state.
+
+        let unconsume_all = match (self.is_consumed_in_attribute, last_matched, next_after) {
             (_, ';', _) => false,
-            (Some(_), _, Some('=')) => {
-                tokenizer.emit_error(Borrowed(
-                    "Equals sign after character reference in attribute",
-                ));
-                true
-            },
-            (Some(_), _, Some(c)) if c.is_ascii_alphanumeric() => true,
+            (true, _, Some('=')) => true,
+            (true, _, Some(c)) if c.is_ascii_alphanumeric() => true,
             _ => {
+                // 1. If the last character matched is not a U+003B SEMICOLON character
+                //    (;), then this is a missing-semicolon-after-character-reference parse
+                //    error.
                 tokenizer.emit_error(Borrowed(
                     "Character reference does not end with semicolon",
                 ));
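
The new comment and match arms above implement the spec's "historical reasons" carve-out for named references that are not terminated by a semicolon. A self-contained sketch of the same decision with plain values instead of tokenizer state (the function below is illustrative, not html5ever's API):

```rust
// Sketch of the unconsume decision above: a partially matched named reference
// inside an attribute is left as literal text when it is followed by '=' or an
// ASCII alphanumeric; otherwise it is expanded (with a missing-semicolon parse
// error if it did not end in ';').
fn unconsume_all(in_attribute: bool, last_matched: char, next_after: Option<char>) -> bool {
    match (in_attribute, last_matched, next_after) {
        (_, ';', _) => false,
        (true, _, Some('=')) => true,
        (true, _, Some(c)) if c.is_ascii_alphanumeric() => true,
        _ => false,
    }
}

fn main() {
    // href="?x=1&not=2": "&not" is followed by '=', so it stays literal.
    assert!(unconsume_all(true, 't', Some('=')));
    // href="?x=1&notit": followed by an alphanumeric, also literal.
    assert!(unconsume_all(true, 't', Some('i')));
    // In text content, "&notit" still expands "&not" (plus a parse error).
    assert!(!unconsume_all(false, 't', Some('i')));
    // "&not;" ends with a semicolon and is always expanded.
    assert!(!unconsume_all(true, ';', None));
}
```
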
@@ -388,6 +381,7 @@
             self.finish_none()
         } else {
             input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..]));
+            tokenizer.ignore_lf = false;
             self.result = Some(CharRef {
                 chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
                 num_chars: if c2 == 0 { 1 } else { 2 },
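
The ignore_lf reset added above, together with the peek-plus-discard change earlier in this file, concerns un-consuming text that has already gone through \r\n normalization; the commit messages "fix bug where ignore_lf was not reset during unconsuming" and "fix bug in charref tokenizer wrt newline normalization" appear to point at the same problem. A toy model of why a normalizing read is hard to undo (hypothetical types, only loosely in the spirit of the real Tokenizer and BufferQueue):

```rust
// Toy model: a normalizing read turns '\r' into '\n' and arms an "ignore the
// next LF" flag, so the character it hands back no longer matches the raw
// input. Pushing that character back (un-consuming) without clearing the flag
// leaves the reader in an inconsistent state; a raw peek-then-discard has
// nothing to undo.
struct Input {
    raw: Vec<char>,
    ignore_lf: bool,
}

impl Input {
    // Normalizing read, roughly analogous to the tokenizer's preprocessing.
    fn get_char(&mut self) -> Option<char> {
        let mut c = self.next_raw()?;
        if self.ignore_lf && c == '\n' {
            self.ignore_lf = false;
            c = self.next_raw()?;
        }
        if c == '\r' {
            self.ignore_lf = true;
            c = '\n';
        }
        Some(c)
    }

    // Raw read, as used by a peek + discard pair: returns the input as-is.
    fn next_raw(&mut self) -> Option<char> {
        if self.raw.is_empty() {
            None
        } else {
            Some(self.raw.remove(0))
        }
    }
}

fn main() {
    let mut normalizing = Input { raw: "\r\nx".chars().collect(), ignore_lf: false };
    // The normalizing path returns '\n' for a raw '\r' and arms ignore_lf, so
    // pushing that '\n' back would not recreate the original "\r\n".
    assert_eq!(normalizing.get_char(), Some('\n'));
    assert!(normalizing.ignore_lf);

    let mut raw = Input { raw: "\r\nx".chars().collect(), ignore_lf: false };
    // The raw path returns exactly what is in the buffer.
    assert_eq!(raw.next_raw(), Some('\r'));
}
```
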
@@ -403,7 +397,10 @@
         tokenizer: &mut Tokenizer<Sink>,
         input: &mut BufferQueue,
     ) -> Status {
-        let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
+        // peek + discard skips over newline normalization, therefore making it easier to
+        // un-consume
+        let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
+        tokenizer.discard_char(input);
         self.name_buf_mut().push_char(c);
         match c {
             _ if c.is_ascii_alphanumeric() => return Progress,