
refactor: extract bencode tokenizer
Split parser logic into two types:

- Tokenizer: returns bencode tokens (the new BencodeToken enum).
- Generator: iterates over the bencode tokens to generate the JSON output (see the sketch below).
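A rough sketch of the intended split, not part of this commit: the BencodeToken enum below is the one added in src/parsers/mod.rs, while generate_json and the example token vector are illustrative names only. The sketch omits the state stack the real parser uses to insert commas, handle dict keys, and choose between ']' and '}'.

#[derive(Debug, PartialEq)]
pub enum BencodeToken {
    Integer(Vec<u8>),
    String(Vec<u8>),
    BeginList,
    BeginDict,
    EndListOrDict,
    LineBreak,
}

// Generator side: walk a token stream and emit JSON. Simplified: no commas,
// no dict keys, no escaping, and EndListOrDict always closes with ']' (the
// real generator pops its state stack to decide between ']' and '}').
pub fn generate_json(tokens: Vec<BencodeToken>) -> String {
    let mut out = String::new();
    for token in tokens {
        match token {
            BencodeToken::Integer(bytes) => out.push_str(&String::from_utf8_lossy(&bytes)),
            BencodeToken::String(bytes) => {
                out.push('"');
                out.push_str(&String::from_utf8_lossy(&bytes));
                out.push('"');
            }
            BencodeToken::BeginList => out.push('['),
            BencodeToken::BeginDict => out.push('{'),
            BencodeToken::EndListOrDict => out.push(']'),
            BencodeToken::LineBreak => {} // line breaks between values are ignored
        }
    }
    out
}

// Example: the bencoded input "li42ee" tokenizes to
// [BeginList, Integer(b"42"), EndListOrDict] and generates "[42]".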
josecelano committed Dec 3, 2024
1 parent a2eb63c commit 83eeefd
Showing 3 changed files with 110 additions and 29 deletions.
16 changes: 12 additions & 4 deletions src/parsers/integer.rs
@@ -31,9 +31,13 @@ enum StateExpecting {
///
/// Will panic if we reach the end of the input without completing the integer
/// (without reaching the end of the integer `e`).
pub fn parse<R: Read, W: Writer>(reader: &mut ByteReader<R>, writer: &mut W) -> Result<(), Error> {
pub fn parse<R: Read, W: Writer>(
reader: &mut ByteReader<R>,
writer: &mut W,
) -> Result<Vec<u8>, Error> {
let mut state = StateExpecting::Start;
let mut first_digit_is_zero = false;
let mut value = vec![];

loop {
let byte = next_byte(reader, writer)?;
@@ -48,10 +52,12 @@ pub fn parse<R: Read, W: Writer>(reader: &mut ByteReader<R>, writer: &mut W) ->
StateExpecting::DigitOrSign => {
if char == '-' {
writer.write_byte(byte)?;
value.push(byte);

StateExpecting::DigitAfterSign
} else if char.is_ascii_digit() {
writer.write_byte(byte)?;
value.push(byte);

if char == '0' {
first_digit_is_zero = true;
@@ -76,6 +82,7 @@ pub fn parse<R: Read, W: Writer>(reader: &mut ByteReader<R>, writer: &mut W) ->
StateExpecting::DigitAfterSign => {
if char.is_ascii_digit() {
writer.write_byte(byte)?;
value.push(byte);

if char == '0' {
first_digit_is_zero = true;
@@ -100,6 +107,7 @@ pub fn parse<R: Read, W: Writer>(reader: &mut ByteReader<R>, writer: &mut W) ->
StateExpecting::DigitOrEnd => {
if char.is_ascii_digit() {
writer.write_byte(byte)?;
value.push(byte);

if char == '0' && first_digit_is_zero {
return Err(Error::LeadingZerosInIntegersNotAllowed(
@@ -118,7 +126,7 @@ pub fn parse<R: Read, W: Writer>(reader: &mut ByteReader<R>, writer: &mut W) ->

StateExpecting::DigitOrEnd
} else if byte == BENCODE_END_INTEGER {
return Ok(());
return Ok(value);
} else {
return Err(Error::UnexpectedByteParsingInteger(
ReadContext {
@@ -185,12 +193,12 @@ mod tests {
let mut output = String::new();

match parse_bencode(input_buffer, &mut output) {
Ok(()) => Ok(output),
Ok(_value) => Ok(output),
Err(err) => Err(err),
}
}

fn parse_bencode(input_buffer: &[u8], output: &mut String) -> Result<(), Error> {
fn parse_bencode(input_buffer: &[u8], output: &mut String) -> Result<Vec<u8>, Error> {
let mut reader = ByteReader::new(input_buffer);

let mut writer = StringWriter::new(output);
104 changes: 87 additions & 17 deletions src/parsers/mod.rs
@@ -8,6 +8,7 @@ pub mod integer;
pub mod stack;
pub mod string;

use core::str;
use std::{
fmt::Write as FmtWrite,
io::{self, Read, Write as IoWrite},
@@ -36,6 +37,16 @@ pub enum BencodeType {
Dict,
}

#[derive(Debug, PartialEq)]
pub enum BencodeToken {
Integer(Vec<u8>),
String(Vec<u8>),
BeginList,
BeginDict,
EndListOrDict,
LineBreak,
}

pub struct BencodeParser<R: Read> {
byte_reader: ByteReader<R>,
num_processed_tokens: u64,
@@ -104,35 +115,40 @@ impl<R: Read> BencodeParser<R> {
/// - It can't read from the input or write to the output.
/// - The input is invalid Bencode.
fn parse<W: Writer>(&mut self, writer: &mut W) -> Result<(), error::Error> {
while let Some(peeked_byte) = Self::peek_byte(&mut self.byte_reader, writer)? {
match peeked_byte {
let capture_output = Vec::new();
let mut null_writer = ByteWriter::new(capture_output);

while let Some(peeked_byte) = Self::peek_byte(&mut self.byte_reader, &null_writer)? {
let token: BencodeToken = match peeked_byte {
BENCODE_BEGIN_INTEGER => {
self.begin_bencoded_value(BencodeType::Integer, writer)?;
integer::parse(&mut self.byte_reader, writer)?;
let value = integer::parse(&mut self.byte_reader, &mut null_writer)?;
BencodeToken::Integer(value)
}
b'0'..=b'9' => {
self.begin_bencoded_value(BencodeType::String, writer)?;
string::parse(&mut self.byte_reader, writer)?;
let value = string::parse(&mut self.byte_reader, &mut null_writer)?;
BencodeToken::String(value)
}
BENCODE_BEGIN_LIST => {
let _byte = Self::read_peeked_byte(peeked_byte, &mut self.byte_reader, writer)?;
self.begin_bencoded_value(BencodeType::List, writer)?;
writer.write_byte(Self::JSON_ARRAY_BEGIN)?;
self.stack.push(State::ExpectingFirstListItemOrEnd);
let _byte =
Self::read_peeked_byte(peeked_byte, &mut self.byte_reader, &null_writer)?;
BencodeToken::BeginList
}
BENCODE_BEGIN_DICT => {
let _byte = Self::read_peeked_byte(peeked_byte, &mut self.byte_reader, writer)?;
self.begin_bencoded_value(BencodeType::Dict, writer)?;
writer.write_byte(Self::JSON_OBJ_BEGIN)?;
self.stack.push(State::ExpectingFirstDictFieldOrEnd);
let _byte =
Self::read_peeked_byte(peeked_byte, &mut self.byte_reader, &null_writer)?;
BencodeToken::BeginDict
}
BENCODE_END_LIST_OR_DICT => {
let _byte = Self::read_peeked_byte(peeked_byte, &mut self.byte_reader, writer)?;
self.end_list_or_dict(writer)?;
let _byte =
Self::read_peeked_byte(peeked_byte, &mut self.byte_reader, &null_writer)?;
BencodeToken::EndListOrDict
}
b'\n' => {
// todo: don't return a token for line breaks; just skip them and continue with the next token.
// Ignore line breaks at the beginning, the end, or between values
let _byte = Self::read_peeked_byte(peeked_byte, &mut self.byte_reader, writer)?;
let _byte =
Self::read_peeked_byte(peeked_byte, &mut self.byte_reader, &null_writer)?;
BencodeToken::LineBreak
}
_ => {
return Err(error::Error::UnrecognizedFirstBencodeValueByte(
@@ -148,6 +164,60 @@ impl<R: Read> BencodeParser<R> {
},
));
}
};

/* TODO:
- Extract tokenizer (without implementing the Iterator trait).
- Remove writer from tokenizer.
- Implement trait Iterator for tokenizer.
- Rename this parser to generator.
*/

match token {
BencodeToken::Integer(integer_bytes) => {
self.begin_bencoded_value(BencodeType::Integer, writer)?;
// todo: add `write_bytes` to writer.
for bytes in integer_bytes {
writer.write_byte(bytes)?;
}
}
BencodeToken::String(string_bytes) => {
self.begin_bencoded_value(BencodeType::String, writer)?;

let html_tag_style_string = match str::from_utf8(&string_bytes) {
Ok(string) => {
// String contains only valid UTF-8 chars -> print it as is
&format!("<string>{}</string>", string.to_owned())
}
Err(_) => {
// String contains invalid UTF-8 chars -> print it as hex bytes
&format!("<hex>{}</hex>", hex::encode(string_bytes))
}
};

writer.write_str(
&serde_json::to_string(&html_tag_style_string)
.expect("Failed to serialize to JSON. This should not happen because non UTF-8 bencoded string are serialized as hex bytes"),
)?;
}
BencodeToken::BeginList => {
self.begin_bencoded_value(BencodeType::List, writer)?;
writer.write_byte(Self::JSON_ARRAY_BEGIN)?;
self.stack.push(State::ExpectingFirstListItemOrEnd);
}
BencodeToken::BeginDict => {
self.begin_bencoded_value(BencodeType::Dict, writer)?;
writer.write_byte(Self::JSON_OBJ_BEGIN)?;
self.stack.push(State::ExpectingFirstDictFieldOrEnd);
}
BencodeToken::EndListOrDict => {
self.end_list_or_dict(writer)?;
}
BencodeToken::LineBreak => {
// Ignore line breaks at the beginning, the end, or between values
}
}

self.num_processed_tokens += 1;
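
The TODO left in BencodeParser::parse above lists the follow-up steps (standalone tokenizer, drop the writer, implement the Iterator trait, rename this parser to generator). The sketch below is a hypothetical illustration of that direction, not code from this commit: Tokenizer, take_until, and the in-memory byte-slice input are assumed simplifications, all validation and error reporting stay in the real integer/string parsers, and it reuses the BencodeToken enum added above.

// Hypothetical follow-up shape (not in this commit): a standalone tokenizer over
// an in-memory byte slice that yields BencodeToken through Iterator, with no
// writer involved. Validation and error types are omitted for brevity.
pub struct Tokenizer<'a> {
    input: &'a [u8],
    pos: usize,
}

impl<'a> Tokenizer<'a> {
    pub fn new(input: &'a [u8]) -> Self {
        Self { input, pos: 0 }
    }

    // Consume bytes up to (and including) `terminator`, returning the bytes
    // before it. Used for integers: "i42e" -> b"42".
    fn take_until(&mut self, terminator: u8) -> Vec<u8> {
        let mut out = Vec::new();
        while self.pos < self.input.len() && self.input[self.pos] != terminator {
            out.push(self.input[self.pos]);
            self.pos += 1;
        }
        self.pos += 1; // skip the terminator
        out
    }
}

impl<'a> Iterator for Tokenizer<'a> {
    type Item = BencodeToken;

    fn next(&mut self) -> Option<Self::Item> {
        let byte = *self.input.get(self.pos)?;
        match byte {
            b'i' => {
                self.pos += 1; // skip the 'i' prefix
                Some(BencodeToken::Integer(self.take_until(b'e')))
            }
            b'0'..=b'9' => {
                // "<length>:<bytes>" -> String token carrying the raw bytes.
                let length: usize = String::from_utf8(self.take_until(b':')).ok()?.parse().ok()?;
                let value = self.input.get(self.pos..self.pos + length)?.to_vec();
                self.pos += length;
                Some(BencodeToken::String(value))
            }
            b'l' => { self.pos += 1; Some(BencodeToken::BeginList) }
            b'd' => { self.pos += 1; Some(BencodeToken::BeginDict) }
            b'e' => { self.pos += 1; Some(BencodeToken::EndListOrDict) }
            b'\n' => { self.pos += 1; Some(BencodeToken::LineBreak) }
            // The real tokenizer would return UnrecognizedFirstBencodeValueByte here.
            _ => None,
        }
    }
}

// Usage: Tokenizer::new(b"d3:fooi42ee").collect::<Vec<_>>() yields
// [BeginDict, String(b"foo"), Integer(b"42"), EndListOrDict].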
19 changes: 11 additions & 8 deletions src/parsers/string.rs
@@ -25,7 +25,10 @@ use super::error::{Error, ReadContext, WriteContext};
/// # Panics
///
/// Will panic if we reach the end of the input without completing the string.
pub fn parse<R: Read, W: Writer>(reader: &mut ByteReader<R>, writer: &mut W) -> Result<(), Error> {
pub fn parse<R: Read, W: Writer>(
reader: &mut ByteReader<R>,
writer: &mut W,
) -> Result<Vec<u8>, Error> {
let mut string_parser = StringParser::default();
string_parser.parse(reader, writer)
}
@@ -46,20 +49,20 @@ impl StringParser {
&mut self,
reader: &mut ByteReader<R>,
writer: &mut W,
) -> Result<(), Error> {
) -> Result<Vec<u8>, Error> {
let mut length = Length::default();

length.parse(reader, writer)?;

let mut value = Value::new(length.number);

value.parse(reader, writer)?;
let value_bytes = value.parse(reader, writer)?;

self.parsed_value = value.utf8();

writer.write_str(&self.json())?;

Ok(())
Ok(value_bytes)
}

/// It returns the final parsed value as string.
@@ -202,12 +205,12 @@ impl Value {
&mut self,
reader: &mut ByteReader<R>,
writer: &W,
) -> Result<(), Error> {
) -> Result<Vec<u8>, Error> {
for _i in 1..=self.length {
self.add_byte(Self::next_byte(reader, writer)?);
}

Ok(())
Ok(self.bytes.clone())
}

/// It reads the next byte from the input.
@@ -282,12 +285,12 @@ mod tests {
let mut output = String::new();

match parse_bencode(input_buffer, &mut output) {
Ok(()) => Ok(output),
Ok(_string_value_bytes) => Ok(output),
Err(err) => Err(err),
}
}

fn parse_bencode(input_buffer: &[u8], output: &mut String) -> Result<(), Error> {
fn parse_bencode(input_buffer: &[u8], output: &mut String) -> Result<Vec<u8>, Error> {
let mut reader = ByteReader::new(input_buffer);

let mut writer = StringWriter::new(output);
