Skip to content

Commit

Permalink
setup tokens so they are usable. Also some reformatting through rustfmt
Browse files Browse the repository at this point in the history
  • Loading branch information
jaytaph committed Aug 16, 2023
1 parent c4204fd commit 5d1abc0
Show file tree
Hide file tree
Showing 11 changed files with 3,198 additions and 3,217 deletions.
811 changes: 406 additions & 405 deletions src/html5_parser/consume_char_refs.rs

Large diffs are not rendered by default.

138 changes: 68 additions & 70 deletions src/html5_parser/emitter.rs
Original file line number Diff line number Diff line change
@@ -1,70 +1,68 @@
use std::fmt::{Display, Formatter};
use crate::html5_parser::tokenizer::Token;

/// Crate-internal sink for tokens.
///
/// Every implementor must also implement `Display` (it is a supertrait here).
pub(crate) trait Emitter: Display {
/// Hands the token `t` to the emitter. The token is taken by value and the
/// emitter is borrowed mutably, so implementations may update their own state.
fn emit(&mut self, t: Token);
}

/// Emitter that will send the output to a string.
struct StrEmitter {
    /// Text held by this emitter; rendered verbatim by `Display`.
    output: String,
}

impl StrEmitter {
    /// Creates an emitter whose output buffer starts out empty.
    pub fn new() -> Self {
        Self {
            output: String::new(),
        }
    }
}

impl Display for StrEmitter {
    /// Writes the buffered output as-is, without applying any width or
    /// padding options from the caller's format spec.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.output)
    }
}

impl Emitter for StrEmitter {
    /// Appends the string form of `t` to the output buffer.
    ///
    /// Previously this was a no-op stub, so the buffer never received any
    /// emitted token.
    fn emit(&mut self, t: Token) {
        self.output.push_str(&t.to_string());
    }
}

/// Default emitter that will emit tokens to the std output.
///
/// The type holds no state. `Default` is derived so it can be built wherever a
/// `Default` bound is required, and `Debug` so it can appear in diagnostics.
#[derive(Debug, Default)]
pub struct IoEmitter {}

impl IoEmitter {
    /// Creates a new emitter; equivalent to `IoEmitter::default()`.
    pub fn new() -> Self {
        Self::default()
    }
}

impl Display for IoEmitter {
    /// Writes nothing: this emitter keeps no buffered output to display.
    fn fmt(&self, _f: &mut Formatter<'_>) -> std::fmt::Result {
        Ok(())
    }
}

// Emitter implementation for the std-output emitter
impl Emitter for IoEmitter {
    /// Prints the string form of `t` to standard output, followed by a newline.
    fn emit(&mut self, t: Token) {
        let rendered = t.to_string();
        println!("{}", rendered);
    }
}

#[cfg(test)]
mod test {

// NOTE(review): the test below is disabled. As written it would not compile:
// `e` is not declared `mut` although `emit` takes `&mut self`, and nothing from
// the parent module (`StrEmitter`, `Token`, ...) is imported into this module.
// #[test]
// fn test_emit() {
// let e = StrEmitter::new();
// e.emit(Token::String(String::from("hello world")));
// assert_eq!(e.output, "hello world");
//
// let e = StrEmitter::new();
// e.emit(Token::StartTag(StartTag::new("tag", true, None, "")));
// assert_eq!(e.output, "<tag/>");
// }
}

use crate::html5_parser::token::Token;
use std::fmt::{Display, Formatter};

/// Crate-internal sink for tokens.
///
/// Every implementor must also implement `Display` (it is a supertrait here).
pub(crate) trait Emitter: Display {
/// Hands the token `t` to the emitter. The token is taken by value and the
/// emitter is borrowed mutably, so implementations may update their own state.
fn emit(&mut self, t: Token);
}

/// Emitter that will send the output to a string.
struct StrEmitter {
    /// Text held by this emitter; rendered verbatim by `Display`.
    output: String,
}

impl StrEmitter {
    /// Creates an emitter whose output buffer starts out empty.
    pub fn new() -> Self {
        Self {
            output: String::new(),
        }
    }
}

impl Display for StrEmitter {
    /// Writes the buffered output as-is, without applying any width or
    /// padding options from the caller's format spec.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.output)
    }
}

impl Emitter for StrEmitter {
    /// Appends the string form of `t` to the output buffer.
    ///
    /// Previously this was a no-op stub, so the buffer never received any
    /// emitted token.
    fn emit(&mut self, t: Token) {
        self.output.push_str(&t.to_string());
    }
}

/// Default emitter that will emit tokens to the std output.
///
/// The type holds no state. `Default` is derived so it can be built wherever a
/// `Default` bound is required, and `Debug` so it can appear in diagnostics.
#[derive(Debug, Default)]
pub struct IoEmitter {}

impl IoEmitter {
    /// Creates a new emitter; equivalent to `IoEmitter::default()`.
    pub fn new() -> Self {
        Self::default()
    }
}

impl Display for IoEmitter {
    /// Writes nothing: this emitter keeps no buffered output to display.
    fn fmt(&self, _f: &mut Formatter<'_>) -> std::fmt::Result {
        Ok(())
    }
}

// Emitter implementation for the std-output emitter
impl Emitter for IoEmitter {
    /// Prints the string form of `t` to standard output, followed by a newline.
    fn emit(&mut self, t: Token) {
        let rendered = t.to_string();
        println!("{}", rendered);
    }
}

#[cfg(test)]
mod test {

// NOTE(review): the test below is disabled. As written it would not compile:
// `e` is not declared `mut` although `emit` takes `&mut self`, and nothing from
// the parent module (`StrEmitter`, `Token`, ...) is imported into this module.
// #[test]
// fn test_emit() {
// let e = StrEmitter::new();
// e.emit(Token::String(String::from("hello world")));
// assert_eq!(e.output, "hello world");
//
// let e = StrEmitter::new();
// e.emit(Token::StartTag(StartTag::new("tag", true, None, "")));
// assert_eq!(e.output, "<tag/>");
// }
}
65 changes: 31 additions & 34 deletions src/html5_parser/input_stream.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,32 +5,31 @@ use std::io::Read;
// Encoding defines the way the buffer stream is read, as what defines a "character".
#[derive(PartialEq)]
pub enum Encoding {
UTF8, // Stream is of UTF8 characters
ASCII, // Stream is of 8bit ASCII
// Iso88591 // Stream is of iso_8859_1
// More
UTF8, // Stream is of UTF8 characters
ASCII, // Stream is of 8bit ASCII
// Iso88591 // Stream is of iso_8859_1
// More
}

// The confidence decides how confident we are that the input stream is of this encoding
#[derive(PartialEq)]
pub enum Confidence {
Tentative, // This encoding might be the one we need
Certain, // We are certain to use this encoding
// Irrelevant // There is no content encoding for this stream
Tentative, // This encoding might be the one we need
Certain, // We are certain to use this encoding
// Irrelevant // There is no content encoding for this stream
}

// HTML(5) input stream structure
pub struct InputStream {
encoding: Encoding, // Current encoding
pub(crate) confidence: Confidence, // How confident are we that this is the correct encoding?
current: usize, // Current offset of the reader
length: usize, // Length (in bytes) of the buffer
buffer: Vec<char>, // Reference to the actual buffer stream in characters
u8_buffer: Vec<u8> // Reference to the actual buffer stream in u8 bytes
// If all things are ok, both buffer and u8_buffer should refer to the same memory location
encoding: Encoding, // Current encoding
pub(crate) confidence: Confidence, // How confident are we that this is the correct encoding?
current: usize, // Current offset of the reader
length: usize, // Length (in bytes) of the buffer
buffer: Vec<char>, // Reference to the actual buffer stream in characters
u8_buffer: Vec<u8>, // Reference to the actual buffer stream in u8 bytes
// If all things are ok, both buffer and u8_buffer should refer to the same memory location
}


impl InputStream {
// Create a new default empty input stream
pub fn new() -> Self {
Expand All @@ -55,20 +54,17 @@ impl InputStream {
}

// Returns true when the stream pointer is at the end of the stream
pub fn eof(&self) -> bool
{
pub fn eof(&self) -> bool {
self.current >= self.length
}

// Reset the stream reader back to the start
pub fn reset(&mut self)
{
pub fn reset(&mut self) {
self.current = 0
}

// Seek explicit offset in the stream (based on chars)
pub fn seek(&mut self, mut off: usize)
{
pub fn seek(&mut self, mut off: usize) {
if off > self.length {
off = self.length
}
Expand All @@ -81,17 +77,15 @@ impl InputStream {
}

// Set the given confidence of the input stream encoding
pub fn set_confidence(&mut self, c: Confidence)
{
pub fn set_confidence(&mut self, c: Confidence) {
self.confidence = c;
}

// Changes the encoding and if necessary, decodes the u8 buffer into the correct encoding
pub fn set_encoding(&mut self, e: Encoding)
{
pub fn set_encoding(&mut self, e: Encoding) {
// Don't convert if the encoding is the same as it already is
if self.encoding == e {
return
return;
}

self.force_set_encoding(e)
Expand All @@ -111,7 +105,11 @@ impl InputStream {
}
Encoding::ASCII => {
// Convert the string into characters so we can use easy indexing. Any non-ascii chars (> 0x7F) are converted to '?'
self.buffer = self.u8_buffer.iter().map(|&byte| if byte.is_ascii() { byte as char } else { '?' }).collect();
self.buffer = self
.u8_buffer
.iter()
.map(|&byte| if byte.is_ascii() { byte as char } else { '?' })
.collect();
self.length = self.buffer.len();
}
}
Expand Down Expand Up @@ -141,16 +139,15 @@ impl InputStream {
}

// Reads a character and increases the current pointer
pub(crate) fn read_char(&mut self) -> Option<char>
{
pub(crate) fn read_char(&mut self) -> Option<char> {
if self.eof() {
return None;
}

let c = self.buffer[self.current];
self.current+=1;
self.current += 1;

return Some(c)
return Some(c);
}

pub(crate) fn unread(&mut self) {
Expand All @@ -167,12 +164,12 @@ impl InputStream {

// Trying to look after the stream
if c + idx > self.length as i32 {
return None
return None;
}

// Trying to look before the stream
if c + idx < 0 {
return None
return None;
}

Some(self.buffer[(c + idx) as usize])
Expand Down Expand Up @@ -271,4 +268,4 @@ mod test {
is.seek(4);
assert_eq!(is.look_ahead(-1).unwrap(), 'c');
}
}
}
81 changes: 41 additions & 40 deletions src/html5_parser/mod.rs
Original file line number Diff line number Diff line change
@@ -1,40 +1,41 @@
pub mod input_stream;

mod node;
mod tokenizer;
mod token_replacements;
mod token_states;
mod consume_char_refs;
mod token_named_characters;
mod emitter;

use tokenizer::Tokenizer;
use input_stream::InputStream;
use node::Node;

/// HTML5 parser that owns a tokenizer reading from a borrowed input stream.
pub struct Html5Parser<'a> {
    /// Tokenizer that `parse` pulls tokens from.
    tokenizer: Tokenizer<'a>,
}

impl<'a> Html5Parser<'a> {
    /// Creates a new parser object with the given input stream.
    ///
    /// The stream is borrowed mutably for the lifetime of the parser and is
    /// handed straight to the tokenizer.
    pub fn new(stream: &'a mut InputStream) -> Self {
        let tokenizer = Tokenizer::new(stream);
        Self { tokenizer }
    }

    /// Parses the input stream into a Node tree.
    ///
    /// Currently this pulls a fixed number of tokens from the tokenizer,
    /// prints each one to standard output, and returns a hard-coded tree: a
    /// "root" node with a single "child" node.
    pub fn parse(&mut self) -> Node {
        // Number of tokens pulled and printed per call.
        const TOKENS_TO_PRINT: usize = 20;

        // Tokenize stuff
        for _ in 0..TOKENS_TO_PRINT {
            let token = self.tokenizer.next_token();
            println!("{}", token);
        }

        let mut root = Node::new("root");
        root.add_child(Node::new("child"));
        root
    }
}
pub mod input_stream;

mod consume_char_refs;
mod emitter;
mod node;
mod token;
mod token_named_characters;
mod token_replacements;
mod token_states;
mod tokenizer;

use input_stream::InputStream;
use node::Node;
use tokenizer::Tokenizer;

/// HTML5 parser that owns a tokenizer reading from a borrowed input stream.
pub struct Html5Parser<'a> {
    /// Tokenizer that `parse` pulls tokens from.
    tokenizer: Tokenizer<'a>,
}

impl<'a> Html5Parser<'a> {
    /// Creates a new parser object with the given input stream.
    ///
    /// The stream is borrowed mutably for the lifetime of the parser and is
    /// handed straight to the tokenizer.
    pub fn new(stream: &'a mut InputStream) -> Self {
        let tokenizer = Tokenizer::new(stream);
        Self { tokenizer }
    }

    /// Parses the input stream into a Node tree.
    ///
    /// Currently this pulls a fixed number of tokens from the tokenizer,
    /// prints the string form of each one to standard output, and returns a
    /// hard-coded tree: a "root" node with a single "child" node.
    pub fn parse(&mut self) -> Node {
        // Number of tokens pulled and printed per call.
        const TOKENS_TO_PRINT: usize = 20;

        // Tokenize stuff
        for _ in 0..TOKENS_TO_PRINT {
            let token = self.tokenizer.next_token();
            println!("{}", token.to_string());
        }

        let mut root = Node::new("root");
        root.add_child(Node::new("child"));
        root
    }
}
Loading

0 comments on commit 5d1abc0

Please sign in to comment.