From 25aee8b88c8de3c5a52e2f9cb6281d6df00ad516 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Mon, 6 May 2024 11:49:38 +0200
Subject: [PATCH] [BREAKING CHANGE] Ignore added_tokens (both special and not)
 in the decoder (#1513)

* [BREAKING CHANGE] Ignore added_tokens (both special and not) in the decoder

Causes issues with `ByteLevel` messing up some `AddedTokens` with some
utf-8 range used in the bytelevel mapping.

This commit tests the extent of the damage of ignoring the decoder for
those tokens.

* Format.

* Installing cargo audit.

* Minor fix.

* Fixing "bug" in node/python.

* Autoformat.

* Clippy.

* Only prefix space when there's no decoder.
---
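Illustration (not part of the patch itself): a minimal sketch of the
observable change, assuming a `Tokenizer` configured with a `ByteLevel`
decoder and a hypothetical added token "<eot>". Only the public `encode`,
`get_ids`, and `decode` calls are used. Before this change, added tokens
were fed through the decoder together with model tokens, which could
corrupt any added token whose characters collide with the byte-level
mapping; after it, added tokens bypass the decoder entirely.

    // Sketch only; assumes `tokenizer` has a ByteLevel decoder and
    // "<eot>" registered as an added token. Added-token ids are now
    // spliced into the output verbatim, between independently decoded
    // chunks of model tokens.
    fn roundtrip(tokenizer: &tokenizers::Tokenizer) -> tokenizers::Result<()> {
        let encoding = tokenizer.encode("Hello <eot>", false)?;
        let text = tokenizer.decode(encoding.get_ids(), false)?;
        assert!(text.contains("<eot>")); // no longer mangled by the decoder
        Ok(())
    }
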
 .github/workflows/python.yml                 |  6 +++
 .github/workflows/rust.yml                   |  6 +++
 tokenizers/src/tokenizer/added_vocabulary.rs |  8 ++++
 tokenizers/src/tokenizer/mod.rs              | 47 +++++++++++++-------
 4 files changed, 51 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 529d892d0..92956ffc1 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -63,6 +63,12 @@ jobs:
           toolchain: stable
           components: rustfmt, clippy
 
+      - name: Install audit
+        uses: actions-rs/cargo@v1
+        with:
+          command: install
+          args: cargo-audit
+
       - name: Install Python
         uses: actions/setup-python@v4
         with:
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 1cc3ef7a1..57d2e1fcd 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -36,6 +36,12 @@ jobs:
           command: install
           args: cargo-readme
 
+      - name: Install audit
+        uses: actions-rs/cargo@v1
+        with:
+          command: install
+          args: cargo-audit
+
       - name: Build
         uses: actions-rs/cargo@v1
         with:
diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs
index b7521fde4..301d9bc81 100644
--- a/tokenizers/src/tokenizer/added_vocabulary.rs
+++ b/tokenizers/src/tokenizer/added_vocabulary.rs
@@ -216,6 +216,10 @@ impl AddedVocabulary {
     }
 
     /// Get the token matching the given id if it exists
+    #[deprecated(
+        since = "0.19.0",
+        note = "please use `added_vocabulary.simple_id_to_token(id).or_else(|| model.id_to_token(id))` instead"
+    )]
     pub fn id_to_token(&self, id: u32, model: &impl Model) -> Option<String> {
         self.added_tokens_map_r
             .get(&id)
@@ -222,7 +226,11 @@ impl AddedVocabulary {
             .map(|t| t.content.clone())
             .or_else(|| model.id_to_token(id))
     }
 
+    pub fn simple_id_to_token(&self, id: u32) -> Option<String> {
+        self.added_tokens_map_r.get(&id).map(|t| t.content.clone())
+    }
+
     //
     pub fn set_encode_special_tokens(&mut self, value: bool) {
         self.encode_special_tokens = value;
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index cb396deab..ebc68dfb1 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -699,7 +699,9 @@ where
 
     /// Converts an id to the corresponding token.
     pub fn id_to_token(&self, id: u32) -> Option<String> {
-        self.added_vocabulary.id_to_token(id, &self.model)
+        self.added_vocabulary
+            .simple_id_to_token(id)
+            .or_else(|| self.model.id_to_token(id))
     }
 
     /// set the added vocab's splitting scheme
@@ -845,22 +847,35 @@ where
 
     /// Decode the given ids, back to a String
     pub fn decode(&self, ids: &[u32], skip_special_tokens: bool) -> Result<String> {
-        let tokens = ids
-            .iter()
-            .filter_map(|id| {
-                self.added_vocabulary
-                    .id_to_token(*id, &self.model)
-                    .filter(|token| {
-                        !skip_special_tokens || !self.added_vocabulary.is_special_token(token)
-                    })
-            })
-            .collect::<Vec<_>>();
-
-        if let Some(decoder) = &self.decoder {
-            decoder.decode(tokens)
-        } else {
-            Ok(tokens.join(" "))
+        let mut result = String::with_capacity(ids.len());
+        let mut chunks = Vec::with_capacity(ids.len());
+        for id in ids {
+            if let Some(added_token) = self.added_vocabulary.simple_id_to_token(*id) {
+                if skip_special_tokens && self.added_vocabulary.is_special_token(&added_token) {
+                    continue;
+                }
+                let text_chunk = if let Some(decoder) = &self.decoder {
+                    decoder.decode(chunks.clone())?
+                } else {
+                    chunks.join(" ")
+                };
+                result.push_str(&text_chunk);
+                if !result.is_empty() && self.decoder.is_none() {
+                    result.push(' ');
+                }
+                result.push_str(&added_token);
+                chunks.clear();
+            } else if let Some(token) = self.model.id_to_token(*id) {
+                chunks.push(token);
+            }
         }
+        let text_chunk = if let Some(decoder) = &self.decoder {
+            decoder.decode(chunks.clone())?
+        } else {
+            chunks.join(" ")
+        };
+        result.push_str(&text_chunk);
+        Ok(result)
     }
 }
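
Migration note: a sketch following the `#[deprecated]` hint above;
`added_vocabulary`, `model`, and `id` stand in for values of the
corresponding types and are not defined in this patch.

    // Deprecated as of 0.19.0 (still compiles, with a warning):
    let token = added_vocabulary.id_to_token(id, &model);
    // Replacement: query the added vocabulary first, then fall back to
    // the model's own vocabulary.
    let token = added_vocabulary
        .simple_id_to_token(id)
        .or_else(|| model.id_to_token(id));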