From bb38f390a61883fc2f29d659af696f428d1cda6b Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 25 Jul 2023 11:57:23 +0100 Subject: [PATCH] Single warning for holes. (#1303) * Single warning for holes. * Dummy. --- tokenizers/src/models/mod.rs | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tokenizers/src/models/mod.rs b/tokenizers/src/models/mod.rs index ade7878fa..f7ef3df15 100644 --- a/tokenizers/src/models/mod.rs +++ b/tokenizers/src/models/mod.rs @@ -34,20 +34,26 @@ impl<'a> Serialize for OrderedVocabIter<'a> { S: Serializer, { // There could be holes so max + 1 is more correct than vocab_r.len() - if let Some(max) = self.vocab_r.iter().map(|(key, _)| key).max() { + let mut holes = vec![]; + let result = if let Some(max) = self.vocab_r.iter().map(|(key, _)| key).max() { let iter = (0..*max + 1).filter_map(|i| { if let Some(token) = self.vocab_r.get(&i){ Some((token, i)) }else{ - warn!("The OrderedVocab you are attempting to save contains a hole for index {}, your vocabulary could be corrupted !", i); - println!("The OrderedVocab you are attempting to save contains a hole for index {}, your vocabulary could be corrupted !", i); + holes.push(i); None } - }); + }); serializer.collect_map(iter) } else { serializer.collect_map(std::iter::empty::<(&str, u32)>()) + }; + + if !holes.is_empty(){ + warn!("The OrderedVocab you are attempting to save contains holes for indices {:?}, your vocabulary could be corrupted !", holes); + println!("The OrderedVocab you are attempting to save contains holes for indices {:?}, your vocabulary could be corrupted !", holes); } + result } }