Skip to content

Commit

Permalink
Check that processors add the number of tokens they say they will
Browse files Browse the repository at this point in the history
  • Loading branch information
boyleconnor committed Aug 3, 2023
1 parent efea6c7 commit d6a4de9
Showing 1 changed file with 4 additions and 1 deletion.
5 changes: 4 additions & 1 deletion tokenizers/src/tokenizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -916,9 +916,10 @@ where
add_special_tokens: bool,
) -> Result<Encoding> {
// 1. First we truncate if needed
+ let is_pair = pair_encoding.is_some();
let (encoding, pair_encoding) = {
if let Some(trunc) = &self.truncation {
- let n_added_tokens = self.get_n_added_tokens(pair_encoding.is_some());
+ let n_added_tokens = self.get_n_added_tokens(is_pair);

if add_special_tokens && n_added_tokens > 0 {
let params = TruncationParams {
Expand All @@ -933,6 +934,7 @@ where
(encoding, pair_encoding)
}
};
+ let original_length = encoding.len();

// 2. Then We post process
let final_encoding = if let Some(processor) = &self.post_processor {
Expand All @@ -950,6 +952,7 @@ where
}
encodings.pop().unwrap()
};
+ assert_eq!(final_encoding.len() - self.get_n_added_tokens(is_pair), original_length, "Processor should add {} tokens but instead added {}!", self.get_n_added_tokens(is_pair), final_encoding.len() - original_length);

// 3. Then we pad if needed
let [final_encoding] = if let Some(params) = &self.padding {
Expand Down

0 comments on commit d6a4de9

Please sign in to comment.