diff --git a/src/parsers/integer.rs b/src/parsers/integer.rs index 50d61bd..b02356d 100644 --- a/src/parsers/integer.rs +++ b/src/parsers/integer.rs @@ -31,9 +31,13 @@ enum StateExpecting { /// /// Will panic if we reach the end of the input without completing the integer /// (without reaching the end of the integer `e`). -pub fn parse(reader: &mut ByteReader, writer: &mut W) -> Result<(), Error> { +pub fn parse( + reader: &mut ByteReader, + writer: &mut W, +) -> Result, Error> { let mut state = StateExpecting::Start; let mut first_digit_is_zero = false; + let mut value = vec![]; loop { let byte = next_byte(reader, writer)?; @@ -48,10 +52,12 @@ pub fn parse(reader: &mut ByteReader, writer: &mut W) -> StateExpecting::DigitOrSign => { if char == '-' { writer.write_byte(byte)?; + value.push(byte); StateExpecting::DigitAfterSign } else if char.is_ascii_digit() { writer.write_byte(byte)?; + value.push(byte); if char == '0' { first_digit_is_zero = true; @@ -76,6 +82,7 @@ pub fn parse(reader: &mut ByteReader, writer: &mut W) -> StateExpecting::DigitAfterSign => { if char.is_ascii_digit() { writer.write_byte(byte)?; + value.push(byte); if char == '0' { first_digit_is_zero = true; @@ -100,6 +107,7 @@ pub fn parse(reader: &mut ByteReader, writer: &mut W) -> StateExpecting::DigitOrEnd => { if char.is_ascii_digit() { writer.write_byte(byte)?; + value.push(byte); if char == '0' && first_digit_is_zero { return Err(Error::LeadingZerosInIntegersNotAllowed( @@ -118,7 +126,7 @@ pub fn parse(reader: &mut ByteReader, writer: &mut W) -> StateExpecting::DigitOrEnd } else if byte == BENCODE_END_INTEGER { - return Ok(()); + return Ok(value); } else { return Err(Error::UnexpectedByteParsingInteger( ReadContext { @@ -185,12 +193,12 @@ mod tests { let mut output = String::new(); match parse_bencode(input_buffer, &mut output) { - Ok(()) => Ok(output), + Ok(_value) => Ok(output), Err(err) => Err(err), } } - fn parse_bencode(input_buffer: &[u8], output: &mut String) -> Result<(), Error> { + fn parse_bencode(input_buffer: &[u8], output: &mut String) -> Result, Error> { let mut reader = ByteReader::new(input_buffer); let mut writer = StringWriter::new(output); diff --git a/src/parsers/mod.rs b/src/parsers/mod.rs index 44bebb8..5cd0d60 100644 --- a/src/parsers/mod.rs +++ b/src/parsers/mod.rs @@ -8,6 +8,7 @@ pub mod integer; pub mod stack; pub mod string; +use core::str; use std::{ fmt::Write as FmtWrite, io::{self, Read, Write as IoWrite}, @@ -36,6 +37,16 @@ pub enum BencodeType { Dict, } +#[derive(Debug, PartialEq)] +pub enum BencodeToken { + Integer(Vec), + String(Vec), + BeginList, + BeginDict, + EndListOrDict, + LineBreak, +} + pub struct BencodeParser { byte_reader: ByteReader, num_processed_tokens: u64, @@ -104,35 +115,40 @@ impl BencodeParser { /// - It can't read from the input or write to the output. /// - The input is invalid Bencode. fn parse(&mut self, writer: &mut W) -> Result<(), error::Error> { - while let Some(peeked_byte) = Self::peek_byte(&mut self.byte_reader, writer)? { - match peeked_byte { + let capture_output = Vec::new(); + let mut null_writer = ByteWriter::new(capture_output); + + while let Some(peeked_byte) = Self::peek_byte(&mut self.byte_reader, &null_writer)? { + let token: BencodeToken = match peeked_byte { BENCODE_BEGIN_INTEGER => { - self.begin_bencoded_value(BencodeType::Integer, writer)?; - integer::parse(&mut self.byte_reader, writer)?; + let value = integer::parse(&mut self.byte_reader, &mut null_writer)?; + BencodeToken::Integer(value) } b'0'..=b'9' => { - self.begin_bencoded_value(BencodeType::String, writer)?; - string::parse(&mut self.byte_reader, writer)?; + let value = string::parse(&mut self.byte_reader, &mut null_writer)?; + BencodeToken::String(value) } BENCODE_BEGIN_LIST => { - let _byte = Self::read_peeked_byte(peeked_byte, &mut self.byte_reader, writer)?; - self.begin_bencoded_value(BencodeType::List, writer)?; - writer.write_byte(Self::JSON_ARRAY_BEGIN)?; - self.stack.push(State::ExpectingFirstListItemOrEnd); + let _byte = + Self::read_peeked_byte(peeked_byte, &mut self.byte_reader, &null_writer)?; + BencodeToken::BeginList } BENCODE_BEGIN_DICT => { - let _byte = Self::read_peeked_byte(peeked_byte, &mut self.byte_reader, writer)?; - self.begin_bencoded_value(BencodeType::Dict, writer)?; - writer.write_byte(Self::JSON_OBJ_BEGIN)?; - self.stack.push(State::ExpectingFirstDictFieldOrEnd); + let _byte = + Self::read_peeked_byte(peeked_byte, &mut self.byte_reader, &null_writer)?; + BencodeToken::BeginDict } BENCODE_END_LIST_OR_DICT => { - let _byte = Self::read_peeked_byte(peeked_byte, &mut self.byte_reader, writer)?; - self.end_list_or_dict(writer)?; + let _byte = + Self::read_peeked_byte(peeked_byte, &mut self.byte_reader, &null_writer)?; + BencodeToken::EndListOrDict } b'\n' => { + // todo: we should not return any token and continue to the next token. // Ignore line breaks at the beginning, the end, or between values - let _byte = Self::read_peeked_byte(peeked_byte, &mut self.byte_reader, writer)?; + let _byte = + Self::read_peeked_byte(peeked_byte, &mut self.byte_reader, &null_writer)?; + BencodeToken::LineBreak } _ => { return Err(error::Error::UnrecognizedFirstBencodeValueByte( @@ -148,6 +164,60 @@ impl BencodeParser { }, )); } + }; + + /* TODO: + + - Extract tokenizer (without implementing the Iterator trait). + - Remove writer from tokenizer. + - Implement trait Iterator for tokenizer. + - Rename this parser to generator. + + */ + + match token { + BencodeToken::Integer(integer_bytes) => { + self.begin_bencoded_value(BencodeType::Integer, writer)?; + // todo: add `write_bytes` to writer. + for bytes in integer_bytes { + writer.write_byte(bytes)?; + } + } + BencodeToken::String(string_bytes) => { + self.begin_bencoded_value(BencodeType::String, writer)?; + + let html_tag_style_string = match str::from_utf8(&string_bytes) { + Ok(string) => { + // String only contains valid UTF-8 chars -> print it as it's + &format!("{}", string.to_owned()) + } + Err(_) => { + // String contains non valid UTF-8 chars -> print it as hex bytes + &format!("{}", hex::encode(string_bytes)) + } + }; + + writer.write_str( + &serde_json::to_string(&html_tag_style_string) + .expect("Failed to serialize to JSON. This should not happen because non UTF-8 bencoded string are serialized as hex bytes"), + )?; + } + BencodeToken::BeginList => { + self.begin_bencoded_value(BencodeType::List, writer)?; + writer.write_byte(Self::JSON_ARRAY_BEGIN)?; + self.stack.push(State::ExpectingFirstListItemOrEnd); + } + BencodeToken::BeginDict => { + self.begin_bencoded_value(BencodeType::Dict, writer)?; + writer.write_byte(Self::JSON_OBJ_BEGIN)?; + self.stack.push(State::ExpectingFirstDictFieldOrEnd); + } + BencodeToken::EndListOrDict => { + self.end_list_or_dict(writer)?; + } + BencodeToken::LineBreak => { + // Ignore line breaks at the beginning, the end, or between values + } } self.num_processed_tokens += 1; diff --git a/src/parsers/string.rs b/src/parsers/string.rs index 93514bc..6d5966a 100644 --- a/src/parsers/string.rs +++ b/src/parsers/string.rs @@ -25,7 +25,10 @@ use super::error::{Error, ReadContext, WriteContext}; /// # Panics /// /// Will panic if we reach the end of the input without completing the string. -pub fn parse(reader: &mut ByteReader, writer: &mut W) -> Result<(), Error> { +pub fn parse( + reader: &mut ByteReader, + writer: &mut W, +) -> Result, Error> { let mut string_parser = StringParser::default(); string_parser.parse(reader, writer) } @@ -46,20 +49,20 @@ impl StringParser { &mut self, reader: &mut ByteReader, writer: &mut W, - ) -> Result<(), Error> { + ) -> Result, Error> { let mut length = Length::default(); length.parse(reader, writer)?; let mut value = Value::new(length.number); - value.parse(reader, writer)?; + let value_bytes = value.parse(reader, writer)?; self.parsed_value = value.utf8(); writer.write_str(&self.json())?; - Ok(()) + Ok(value_bytes) } /// It returns the final parsed value as string. @@ -202,12 +205,12 @@ impl Value { &mut self, reader: &mut ByteReader, writer: &W, - ) -> Result<(), Error> { + ) -> Result, Error> { for _i in 1..=self.length { self.add_byte(Self::next_byte(reader, writer)?); } - Ok(()) + Ok(self.bytes.clone()) } /// It reads the next byte from the input. @@ -282,12 +285,12 @@ mod tests { let mut output = String::new(); match parse_bencode(input_buffer, &mut output) { - Ok(()) => Ok(output), + Ok(_string_value_bytes) => Ok(output), Err(err) => Err(err), } } - fn parse_bencode(input_buffer: &[u8], output: &mut String) -> Result<(), Error> { + fn parse_bencode(input_buffer: &[u8], output: &mut String) -> Result, Error> { let mut reader = ByteReader::new(input_buffer); let mut writer = StringWriter::new(output);