From 7e00a22f8827cbb24b963c13fdf60caa3ba1b4ff Mon Sep 17 00:00:00 2001 From: Dario Cancelliere Date: Thu, 11 May 2023 02:54:21 +0200 Subject: [PATCH 1/4] Added GODEL support --- src/pipelines/conversation.rs | 21 ++++++++++++++++++++- src/t5/t5_model.rs | 15 +++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/src/pipelines/conversation.rs b/src/pipelines/conversation.rs index b8f46e5f..62f9e7ce 100644 --- a/src/pipelines/conversation.rs +++ b/src/pipelines/conversation.rs @@ -12,7 +12,8 @@ // limitations under the License. //! # Multi-turn dialogue -//! Conversation model based on Microsoft's [DialoGPT](https://github.com/microsoft/DialoGPT). +//! Conversation model based on Microsoft's [DialoGPT](https://github.com/microsoft/DialoGPT) or +//! [GODEL](https://github.com/microsoft/GODEL). //! This pipeline allows the generation of single or multi-turn conversations between a human and a model. //! The DialoGPT's page states that //! > The human evaluation results indicate that the response generated from DialoGPT is comparable to human response quality @@ -55,6 +56,7 @@ //! from the 3rd party utilization of the pretrained system. use crate::common::error::RustBertError; use crate::gpt2::GPT2Generator; +use crate::t5::T5Generator; use crate::pipelines::common::{ModelType, TokenizerOption}; use crate::pipelines::generation_utils::private_generation_utils::PrivateLanguageGenerator; use crate::pipelines::generation_utils::{GenerateConfig, LanguageGenerator}; @@ -695,12 +697,14 @@ impl Default for ConversationManager { pub enum ConversationOption { /// Conversation based on GPT2 model GPT2(GPT2Generator), + T5(T5Generator), } impl ConversationOption { pub fn new(config: ConversationConfig) -> Result { match config.model_type { ModelType::GPT2 => Ok(ConversationOption::GPT2(GPT2Generator::new(config.into())?)), + ModelType::T5 => Ok(ConversationOption::T5(T5Generator::new(config.into())?)), _ => Err(RustBertError::InvalidConfigurationError( "GPT2 is currently the only supported model for conversation generation" .to_string(), @@ -717,6 +721,10 @@ impl ConversationOption { config.into(), tokenizer, )?)), + ModelType::T5 => Ok(ConversationOption::T5(T5Generator::new_with_tokenizer( + config.into(), + tokenizer, + )?)), _ => Err(RustBertError::InvalidConfigurationError( "GPT2 is currently the only supported model for conversation generation" .to_string(), @@ -729,6 +737,9 @@ impl ConversationOption { Self::GPT2(model_ref) => { Ok(*model_ref.get_eos_ids().as_ref().unwrap().first().unwrap()) } + Self::T5(model_ref) => { + Ok(*model_ref.get_eos_ids().as_ref().unwrap().first().unwrap()) + } } } @@ -736,6 +747,7 @@ impl ConversationOption { pub fn get_tokenizer(&self) -> &TokenizerOption { match self { Self::GPT2(model_ref) => model_ref._get_tokenizer(), + Self::T5(model_ref) => model_ref._get_tokenizer(), } } @@ -743,6 +755,7 @@ impl ConversationOption { pub fn get_tokenizer_mut(&mut self) -> &TokenizerOption { match self { Self::GPT2(model_ref) => model_ref._get_tokenizer_mut(), + Self::T5(model_ref) => model_ref._get_tokenizer_mut(), } } @@ -750,6 +763,7 @@ impl ConversationOption { pub fn model_type(&self) -> ModelType { match *self { Self::GPT2(_) => ModelType::GPT2, + Self::T5(_) => ModelType::T5, } } @@ -765,6 +779,11 @@ impl ConversationOption { .into_iter() .map(|output| output.indices) .collect(), + Self::T5(ref model) => model + .generate_from_ids_and_past(input_ids, attention_mask, None) + .into_iter() + .map(|output| output.indices) + .collect(), } } } diff --git 
a/src/t5/t5_model.rs b/src/t5/t5_model.rs index 204c4560..59d11807 100644 --- a/src/t5/t5_model.rs +++ b/src/t5/t5_model.rs @@ -61,6 +61,11 @@ impl T5ModelResources { "sentence-t5-base/model", "https://huggingface.co/sentence-transformers/sentence-t5-base/resolve/main/rust_model.ot", ); + /// Shared under MIT license by the Microsoft team at . Modified with conversion to C-array format. + pub const GODEL_V1_1_BASE: (&'static str, &'static str) = ( + "godel-v1-1-base/model", + "https://huggingface.co/microsoft/GODEL-v1_1-base-seq2seq/resolve/main/rust_model.ot", + ); } impl T5ConfigResources { @@ -79,6 +84,11 @@ impl T5ConfigResources { "sentence-t5-base/config", "https://huggingface.co/sentence-transformers/sentence-t5-base/resolve/main/config.json", ); + /// Shared under MIT license by the Microsoft team at . Modified with conversion to C-array format. + pub const GODEL_V1_1_BASE: (&'static str, &'static str) = ( + "godel-v1-1-base/config", + "https://huggingface.co/microsoft/GODEL-v1_1-base-seq2seq/resolve/main/config.json", + ); } impl T5VocabResources { @@ -97,6 +107,11 @@ impl T5VocabResources { "sentence-t5-base/spiece", "https://huggingface.co/sentence-transformers/sentence-t5-base/resolve/main/spiece.model", ); + /// Shared under MIT license by the Microsoft team at . Modified with conversion to C-array format. + pub const GODEL_V1_1_BASE: (&'static str, &'static str) = ( + "godel-v1-1-base/spiece", + "https://huggingface.co/microsoft/GODEL-v1_1-base-seq2seq/resolve/main/spiece.model", + ); } const T5LANGUAGES: [Language; 3] = [Language::English, Language::French, Language::German]; From a3f484e1afa73ce0070196706fa61f828a395445 Mon Sep 17 00:00:00 2001 From: Dario Cancelliere Date: Thu, 11 May 2023 03:47:10 +0200 Subject: [PATCH 2/4] Added other missing resources --- src/pipelines/conversation.rs | 2 ++ src/t5/t5_model.rs | 19 +++++++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/src/pipelines/conversation.rs b/src/pipelines/conversation.rs index 62f9e7ce..ce450160 100644 --- a/src/pipelines/conversation.rs +++ b/src/pipelines/conversation.rs @@ -925,6 +925,8 @@ impl ConversationModel { let mut output = HashMap::with_capacity(active_uuid.len()); + println!("generated: {:#?}, prompt_ids: {:#?}", &generated, &prompt_ids); + for ( ((conversation, (generated_sequence, conversation_promp_ids)), uuid), removed_padding, diff --git a/src/t5/t5_model.rs b/src/t5/t5_model.rs index 59d11807..f93e210f 100644 --- a/src/t5/t5_model.rs +++ b/src/t5/t5_model.rs @@ -66,6 +66,11 @@ impl T5ModelResources { "godel-v1-1-base/model", "https://huggingface.co/microsoft/GODEL-v1_1-base-seq2seq/resolve/main/rust_model.ot", ); + /// Shared under MIT license by the Microsoft team at . Modified with conversion to C-array format. + pub const GODEL_V1_1_LARGE: (&'static str, &'static str) = ( + "godel-v1-1-large/model", + "https://huggingface.co/microsoft/GODEL-v1_1-large-seq2seq/resolve/main/rust_model.ot", + ); } impl T5ConfigResources { @@ -89,6 +94,11 @@ impl T5ConfigResources { "godel-v1-1-base/config", "https://huggingface.co/microsoft/GODEL-v1_1-base-seq2seq/resolve/main/config.json", ); + /// Shared under MIT license by the Microsoft team at . Modified with conversion to C-array format. 
+ pub const GODEL_V1_1_LARGE: (&'static str, &'static str) = ( + "godel-v1-1-large/config", + "https://huggingface.co/microsoft/GODEL-v1_1-large-seq2seq/resolve/main/config.json", + ); } impl T5VocabResources { @@ -107,10 +117,15 @@ impl T5VocabResources { "sentence-t5-base/spiece", "https://huggingface.co/sentence-transformers/sentence-t5-base/resolve/main/spiece.model", ); - /// Shared under MIT license by the Microsoft team at . Modified with conversion to C-array format. + /// Shared under Apache 2.0 license by the Google team at . pub const GODEL_V1_1_BASE: (&'static str, &'static str) = ( "godel-v1-1-base/spiece", - "https://huggingface.co/microsoft/GODEL-v1_1-base-seq2seq/resolve/main/spiece.model", + "https://huggingface.co/t5-base/resolve/main/spiece.model", + ); + /// Shared under Apache 2.0 license by the Google team at . + pub const GODEL_V1_1_LARGE: (&'static str, &'static str) = ( + "godel-v1-1-large/spiece", + "https://huggingface.co/t5-large/resolve/main/spiece.model", ); } From 5f9500c54a4e06cec3c999e20e91a4ab3f93514a Mon Sep 17 00:00:00 2001 From: guillaume-be Date: Thu, 11 May 2023 18:35:35 +0100 Subject: [PATCH 3/4] `tch 0.12.0` Update (#379) * Fix 0.12 breaking changes * Fix Clippy warnings * Updated changelog --- CHANGELOG.md | 2 +- Cargo.toml | 4 +- benches/tensor_operations_benchmark.rs | 4 +- src/albert/albert_model.rs | 2 +- src/albert/attention.rs | 4 +- src/bart/attention.rs | 2 +- src/bart/bart_model.rs | 10 +-- src/bert/bert_model.rs | 4 +- src/common/dropout.rs | 2 +- src/deberta/attention.rs | 20 +++--- src/deberta/embeddings.rs | 4 +- src/deberta_v2/attention.rs | 18 ++--- src/deberta_v2/encoder.rs | 4 +- src/electra/electra_model.rs | 2 +- src/fnet/attention.rs | 4 +- src/gpt2/attention.rs | 6 +- src/gpt_j/attention.rs | 20 +++--- src/gpt_neo/attention.rs | 6 +- src/longformer/attention.rs | 38 +++++----- src/longformer/embeddings.rs | 2 +- src/longformer/encoder.rs | 3 +- src/longformer/longformer_model.rs | 11 ++- src/longt5/attention.rs | 34 +++++---- src/longt5/encoder.rs | 6 +- src/m2m_100/embeddings.rs | 2 +- src/mbart/mbart_model.rs | 2 +- src/mobilebert/embeddings.rs | 2 +- src/mobilebert/mobilebert_model.rs | 2 +- src/pegasus/embeddings.rs | 2 +- src/pipelines/conversation.rs | 2 +- src/pipelines/generation_utils.rs | 69 +++++++++++-------- src/pipelines/keywords_extraction/scorer.rs | 11 +-- src/pipelines/masked_language.rs | 3 +- src/pipelines/sentence_embeddings/pipeline.rs | 10 +-- src/prophetnet/attention.rs | 28 ++++---- src/prophetnet/decoder.rs | 22 +++--- src/prophetnet/embeddings.rs | 3 +- src/prophetnet/encoder.rs | 2 +- src/reformer/attention.rs | 58 +++++++++------- src/reformer/attention_utils.rs | 4 +- src/reformer/embeddings.rs | 9 +-- src/reformer/reformer_model.rs | 8 +-- src/roberta/embeddings.rs | 2 +- src/t5/attention.rs | 6 +- src/t5/encoder.rs | 11 +-- src/xlnet/attention.rs | 53 ++++++++------ src/xlnet/xlnet_model.rs | 36 +++++----- tests/deberta.rs | 6 +- tests/deberta_v2.rs | 6 +- tests/fnet.rs | 5 +- tests/longformer.rs | 5 +- tests/mobilebert.rs | 5 +- tests/xlnet.rs | 8 +-- 53 files changed, 323 insertions(+), 271 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a4ca5bd3..8b637c79 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,7 @@ All notable changes to this project will be documented in this file. The format ## Changed - Bumped the tokenizers dependency from 7.x to 8.x, exposing additional options for special token mapping and adding the NLLBTokenizer. 
- (BREAKING) Simplified the generation traits (removal of LMHeadModel and elimination of unnecessary specification for LanguageGenerator) -- Upgraded to `torch` 2.0 (via `tch` 0.11.0). +- Upgraded to `torch` 2.0 (via `tch` 0.12.0). ## Fixed - MIN/MAX computation for float-like (was set to infinity instead of min/max) diff --git a/Cargo.toml b/Cargo.toml index 0e3cf453..b0e025d7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -70,7 +70,7 @@ features = ["doc-only"] [dependencies] rust_tokenizers = "8.1" -tch = "0.11.0" +tch = "0.12.0" serde_json = "1" serde = { version = "1", features = ["derive"] } ordered-float = "3" @@ -88,6 +88,6 @@ anyhow = "1" csv = "1" criterion = "0.4" tokio = { version = "1.24", features = ["sync", "rt-multi-thread", "macros"] } -torch-sys = "0.11.0" +torch-sys = "0.12.0" tempfile = "3" itertools = "0.10" diff --git a/benches/tensor_operations_benchmark.rs b/benches/tensor_operations_benchmark.rs index 2ef3d140..6c47acb3 100644 --- a/benches/tensor_operations_benchmark.rs +++ b/benches/tensor_operations_benchmark.rs @@ -21,8 +21,8 @@ fn bench_tensor_ops(c: &mut Criterion) { unsafe { torch_sys::dummy_cuda_dependency(); } - let input = Tensor::rand(&[32, 128, 512], (Kind::Float, Device::cuda_if_available())); - let weights = Tensor::rand(&[512, 512], (Kind::Float, Device::cuda_if_available())); + let input = Tensor::rand([32, 128, 512], (Kind::Float, Device::cuda_if_available())); + let weights = Tensor::rand([512, 512], (Kind::Float, Device::cuda_if_available())); let _ = &input.matmul(&weights); c.bench_function("Matrix multiply ", |b| { diff --git a/src/albert/albert_model.rs b/src/albert/albert_model.rs index 1d957c8a..d55bf0cb 100644 --- a/src/albert/albert_model.rs +++ b/src/albert/albert_model.rs @@ -257,7 +257,7 @@ impl AlbertModel { get_shape_and_device_from_ids_embeddings_pair(input_ids, input_embeds)?; let calc_mask = if mask.is_none() { - Some(Tensor::ones(&input_shape, (Kind::Int64, device))) + Some(Tensor::ones(input_shape, (Kind::Int64, device))) } else { None }; diff --git a/src/albert/attention.rs b/src/albert/attention.rs index 97fa5393..1a2574a9 100644 --- a/src/albert/attention.rs +++ b/src/albert/attention.rs @@ -130,8 +130,8 @@ impl AlbertSelfAttention { self.hidden_size, )); - let context: Tensor = - Tensor::einsum("bfnd,ndh->bfh", &[context, w], None) + self.dense.bs.as_ref().unwrap(); + let context: Tensor = Tensor::einsum("bfnd,ndh->bfh", &[context, w], None::) + + self.dense.bs.as_ref().unwrap(); let context = (input_ids + context.apply_t(&self.dropout, train)).apply(&self.layer_norm); if !self.output_attentions { diff --git a/src/bart/attention.rs b/src/bart/attention.rs index 4b19d963..394a8c44 100644 --- a/src/bart/attention.rs +++ b/src/bart/attention.rs @@ -176,7 +176,7 @@ impl BartAttention { .bmm(&value_states) .view([bs, self.num_heads, target_length, self.head_dim]) .transpose(1, 2) - .reshape(&[bs, target_length, embed_dim]) + .reshape([bs, target_length, embed_dim]) .apply(&self.out_proj); (attention_output, saved_attention_weights, new_layer_state) diff --git a/src/bart/bart_model.rs b/src/bart/bart_model.rs index 1f416c29..53ba80dc 100644 --- a/src/bart/bart_model.rs +++ b/src/bart/bart_model.rs @@ -270,7 +270,7 @@ pub(crate) fn _make_causal_mask( let target_length = input_ids_shape[1]; let mut mask = Tensor::full( - &[target_length, target_length], + [target_length, target_length], get_min(dtype).unwrap(), (dtype, device), ); @@ -283,14 +283,14 @@ pub(crate) fn _make_causal_mask( if past_key_values_length > 0 { mask = Tensor::cat( 
&[ - Tensor::zeros(&[target_length, past_key_values_length], (dtype, device)), + Tensor::zeros([target_length, past_key_values_length], (dtype, device)), mask, ], -1, ); } mask.unsqueeze(0).unsqueeze(0).expand( - &[ + [ batch_size, 1, target_length, @@ -306,7 +306,7 @@ pub(crate) fn _expand_mask(mask: &Tensor, target_length: Option, dtype: Kin let expanded_mask = mask .unsqueeze(1) .unsqueeze(1) - .expand(&[batch_size, 1, target_length, source_length], true) + .expand([batch_size, 1, target_length, source_length], true) .totype(dtype); let inverted_mask: Tensor = 1 - expanded_mask; inverted_mask.masked_fill(&inverted_mask.to_kind(Kind::Bool), get_min(dtype).unwrap()) @@ -863,7 +863,7 @@ impl BartForSequenceClassification { let reshape = eos_mask.sum_dim_intlist([1].as_slice(), true, input_ids.kind()); let sentence_representation = base_model_output .decoder_output - .permute(&[2, 0, 1]) + .permute([2, 0, 1]) .masked_select(&eos_mask) .view((-1, reshape.size()[0] * reshape.int64_value(&[0, 0]))) .transpose(0, 1) diff --git a/src/bert/bert_model.rs b/src/bert/bert_model.rs index 84960abe..2b567ccd 100644 --- a/src/bert/bert_model.rs +++ b/src/bert/bert_model.rs @@ -370,7 +370,7 @@ impl BertModel { 2 => { if self.is_decoder { let seq_ids = Tensor::arange(input_shape[1], (Kind::Int8, device)); - let causal_mask = seq_ids.unsqueeze(0).unsqueeze(0).repeat(&[ + let causal_mask = seq_ids.unsqueeze(0).unsqueeze(0).repeat([ input_shape[0], input_shape[1], 1, @@ -407,7 +407,7 @@ impl BertModel { let encoder_mask = match encoder_mask { Some(value) => value.copy(), None => Tensor::ones( - &[ + [ encoder_hidden_states_shape[0], encoder_hidden_states_shape[1], ], diff --git a/src/common/dropout.rs b/src/common/dropout.rs index 8c5adf16..e35ab604 100644 --- a/src/common/dropout.rs +++ b/src/common/dropout.rs @@ -43,7 +43,7 @@ impl XDropout { impl ModuleT for XDropout { fn forward_t(&self, input: &Tensor, train: bool) -> Tensor { if train { - let mask = (Tensor::ones(&[1], (input.kind(), input.device())) + let mask = (Tensor::ones([1], (input.kind(), input.device())) - input .empty_like() .bernoulli_float_(1_f64 - self.dropout_prob)) diff --git a/src/deberta/attention.rs b/src/deberta/attention.rs index 505d057f..948b01fd 100644 --- a/src/deberta/attention.rs +++ b/src/deberta/attention.rs @@ -37,7 +37,7 @@ pub trait DisentangledSelfAttention { pub fn build_relative_position(query_size: i64, key_size: i64, device: Device) -> Tensor { let q_ids = Tensor::arange(query_size, (Kind::Int64, device)); let k_ids = Tensor::arange(key_size, (Kind::Int64, device)); - let rel_pos_ids = q_ids.unsqueeze(-1) - k_ids.view([1, -1]).repeat(&[query_size, 1]); + let rel_pos_ids = q_ids.unsqueeze(-1) - k_ids.view([1, -1]).repeat([query_size, 1]); rel_pos_ids.slice(0, 0, query_size, 1).unsqueeze(0) } @@ -62,7 +62,7 @@ impl DebertaDisentangledSelfAttention { let mut new_shape = x.size(); let _ = new_shape.pop(); new_shape.extend_from_slice(&[self.num_attention_heads, -1]); - x.view(new_shape.as_slice()).permute(&[0, 2, 1, 3]) + x.view(new_shape.as_slice()).permute([0, 2, 1, 3]) } fn linear(&self, weights: &Tensor, bias: Option<&Tensor>, x: &Tensor) -> Tensor { @@ -81,7 +81,7 @@ impl DebertaDisentangledSelfAttention { ) -> Tensor { let query_layer_size = query_layer.size(); c2p_pos.expand( - &[ + [ query_layer_size[0], query_layer_size[1], query_layer_size[2], @@ -101,7 +101,7 @@ impl DebertaDisentangledSelfAttention { let mut key_layer_size = key_layer.size(); key_layer_size.reverse(); c2p_pos.expand( - &[ + [ 
query_layer_size[0], query_layer_size[1], key_layer_size[1], @@ -182,7 +182,7 @@ impl DebertaDisentangledSelfAttention { ) .unsqueeze(0); - let mut score = Tensor::zeros(&[1], (query_layer.kind(), key_layer.device())); + let mut score = Tensor::zeros([1], (query_layer.kind(), key_layer.device())); // content -> position if let Some(pos_proj) = &self.pos_proj { @@ -410,9 +410,9 @@ impl DisentangledSelfAttention for DebertaDisentangledSelfAttention { if let Some(head_logits_proj) = &self.head_logits_proj { attention_scores = attention_scores - .permute(&[0, 2, 3, 1]) + .permute([0, 2, 3, 1]) .apply(head_logits_proj) - .permute(&[0, 3, 1, 2]); + .permute([0, 3, 1, 2]); } let mut attention_probs = @@ -420,14 +420,14 @@ impl DisentangledSelfAttention for DebertaDisentangledSelfAttention { if let Some(head_weights_proj) = &self.head_weights_proj { attention_probs = attention_probs - .permute(&[0, 2, 3, 1]) + .permute([0, 2, 3, 1]) .apply(head_weights_proj) - .permute(&[0, 3, 1, 2]); + .permute([0, 3, 1, 2]); } let context_layer = attention_probs .matmul(&value_layer) - .permute(&[0, 2, 1, 3]) + .permute([0, 2, 1, 3]) .contiguous(); let mut new_context_layer_shape = context_layer.size(); diff --git a/src/deberta/embeddings.rs b/src/deberta/embeddings.rs index e618c568..0bef5e1e 100644 --- a/src/deberta/embeddings.rs +++ b/src/deberta/embeddings.rs @@ -127,7 +127,7 @@ where let calc_position_ids = if position_ids.is_none() { Some( Tensor::arange(seq_length, (Kind::Int64, input_embeddings.device())) - .expand(&[1, -1], true), + .expand([1, -1], true), ) } else { None @@ -135,7 +135,7 @@ where let calc_token_type_ids = if token_type_ids.is_none() { Some(Tensor::zeros( - &input_shape, + input_shape, (Kind::Int64, input_embeddings.device()), )) } else { diff --git a/src/deberta_v2/attention.rs b/src/deberta_v2/attention.rs index 1c243bc6..4cc431f9 100644 --- a/src/deberta_v2/attention.rs +++ b/src/deberta_v2/attention.rs @@ -51,7 +51,7 @@ pub fn build_relative_position( ) -> Tensor { let q_ids = Tensor::arange(query_size, (Kind::Int64, device)); let k_ids = Tensor::arange(key_size, (Kind::Int64, device)); - let mut rel_pos_ids = q_ids.unsqueeze(-1) - k_ids.tile(&[q_ids.size()[0], 1]); + let mut rel_pos_ids = q_ids.unsqueeze(-1) - k_ids.tile([q_ids.size()[0], 1]); if (bucket_size > 0) & (max_position > 0) { rel_pos_ids = make_log_bucket_position(&rel_pos_ids, bucket_size, max_position); } @@ -80,7 +80,7 @@ impl DebertaV2DisentangledSelfAttention { let _ = new_shape.pop(); new_shape.extend_from_slice(&[self.num_attention_heads, -1]); let x = x.view(new_shape.as_slice()); - x.permute(&[0, 2, 1, 3]) + x.permute([0, 2, 1, 3]) .contiguous() .view([-1, x.size()[1], *x.size().last().unwrap()]) } @@ -133,12 +133,12 @@ impl DebertaV2DisentangledSelfAttention { let pos_query_layer = self .transpose_for_scores(&relative_embeddings.apply(query_proj)) - .repeat(&[query_layer.size()[0] / self.num_attention_heads, 1, 1]); + .repeat([query_layer.size()[0] / self.num_attention_heads, 1, 1]); let pos_key_layer = self .transpose_for_scores(&relative_embeddings.apply(key_proj)) - .repeat(&[query_layer.size()[0] / self.num_attention_heads, 1, 1]); + .repeat([query_layer.size()[0] / self.num_attention_heads, 1, 1]); - let mut score = Tensor::zeros(&[1], (query_layer.kind(), query_layer.device())); + let mut score = Tensor::zeros([1], (query_layer.kind(), query_layer.device())); let c2p_pos = if self.pos_att_type.has_type(PositionAttentionType::c2p) | self.pos_att_type.has_type(PositionAttentionType::p2p) @@ -149,7 +149,7 @@ 
impl DebertaV2DisentangledSelfAttention { let c2p_att = c2p_att.gather( -1, &c2p_pos.squeeze_dim(0).expand( - &[ + [ query_layer.size()[0], query_layer.size()[1], *relative_pos.size().last().unwrap(), @@ -186,7 +186,7 @@ impl DebertaV2DisentangledSelfAttention { .gather( -1, &p2c_pos.squeeze_dim(0).expand( - &[query_layer.size()[0], key_layer_size[1], key_layer_size[1]], + [query_layer.size()[0], key_layer_size[1], key_layer_size[1]], true, ), true, @@ -203,7 +203,7 @@ impl DebertaV2DisentangledSelfAttention { let p2p_att = p2p_att.gather( -1, &c2p_pos.unwrap().expand( - &[ + [ query_layer.size()[0], query_layer.size()[1], query_layer.size()[2], @@ -402,7 +402,7 @@ impl DisentangledSelfAttention for DebertaV2DisentangledSelfAttention { reverse_context_layer_size[1], reverse_context_layer_size[0], ]) - .permute(&[0, 2, 1, 3]) + .permute([0, 2, 1, 3]) .contiguous(); let mut new_context_layer_shape = context_layer.size(); diff --git a/src/deberta_v2/encoder.rs b/src/deberta_v2/encoder.rs index fbf77995..e9d86d4c 100644 --- a/src/deberta_v2/encoder.rs +++ b/src/deberta_v2/encoder.rs @@ -78,10 +78,10 @@ impl ConvLayer { train: bool, ) -> Tensor { let out = hidden_states - .permute(&[0, 2, 1]) + .permute([0, 2, 1]) .contiguous() .apply(&self.conv) - .permute(&[0, 2, 1]) + .permute([0, 2, 1]) .contiguous(); let reverse_mask: Tensor = 1 - input_mask; let out = out.masked_fill( diff --git a/src/electra/electra_model.rs b/src/electra/electra_model.rs index 3f9b2f14..429923e0 100644 --- a/src/electra/electra_model.rs +++ b/src/electra/electra_model.rs @@ -266,7 +266,7 @@ impl ElectraModel { get_shape_and_device_from_ids_embeddings_pair(input_ids, input_embeds)?; let calc_mask = if mask.is_none() { - Some(Tensor::ones(&input_shape, (Kind::Int64, device))) + Some(Tensor::ones(input_shape, (Kind::Int64, device))) } else { None }; diff --git a/src/fnet/attention.rs b/src/fnet/attention.rs index af5037d3..fc4a6377 100644 --- a/src/fnet/attention.rs +++ b/src/fnet/attention.rs @@ -42,7 +42,9 @@ impl FNetFourierTransform { } pub fn forward(&self, hidden_states: &Tensor) -> Tensor { - let self_outputs = hidden_states.fft_fft2(None, &[1, 2], "backward").real(); + let self_outputs = hidden_states + .fft_fft2(None::, [1, 2], "backward") + .real(); (self_outputs + hidden_states).apply(&self.layer_norm) } } diff --git a/src/gpt2/attention.rs b/src/gpt2/attention.rs index de07f79d..9602341c 100644 --- a/src/gpt2/attention.rs +++ b/src/gpt2/attention.rs @@ -71,7 +71,7 @@ impl Attention { { let p = p.borrow(); - let bias = Tensor::ones(&[config.n_ctx, config.n_ctx], (Float, p.device())) + let bias = Tensor::ones([config.n_ctx, config.n_ctx], (Float, p.device())) .tril(0) .view((1, 1, config.n_ctx, config.n_ctx)); @@ -111,9 +111,9 @@ impl Attention { fn split_heads(&self, x: &Tensor, k: bool) -> Tensor { let x = x.view((x.size()[0], -1, self.n_head, self.dim_per_head)); if k { - x.permute(&[0, 2, 3, 1]) + x.permute([0, 2, 3, 1]) } else { - x.permute(&[0, 2, 1, 3]) + x.permute([0, 2, 1, 3]) } } diff --git a/src/gpt_j/attention.rs b/src/gpt_j/attention.rs index cbfb87da..4e7bcd91 100644 --- a/src/gpt_j/attention.rs +++ b/src/gpt_j/attention.rs @@ -68,7 +68,7 @@ impl GptJAttention { let p = p.borrow(); let max_positions = config.n_positions; - let bias = Tensor::ones(&[max_positions, max_positions], (Kind::Uint8, p.device())) + let bias = Tensor::ones([max_positions, max_positions], (Kind::Uint8, p.device())) .tril(0) .view([1, 1, max_positions, max_positions]) .requires_grad_(false); @@ -142,9 +142,9 @@ impl 
GptJAttention { if rotary { tensor } else if tensor.size().len() == 5 { - tensor.permute(&[0, 1, 3, 2, 4]) // (batch, blocks, head, block_length, head_features) + tensor.permute([0, 1, 3, 2, 4]) // (batch, blocks, head, block_length, head_features) } else if tensor.size().len() == 4 { - tensor.permute(&[0, 2, 1, 3]) // (batch, head, seq_length, head_features) + tensor.permute([0, 2, 1, 3]) // (batch, head, seq_length, head_features) } else { panic!( "Input tensor should either be a rotary head, or its rank be one of [4, 5] but is: {}", @@ -155,9 +155,9 @@ impl GptJAttention { fn merge_heads(tensor: &Tensor, num_heads: i64, attention_head_size: i64) -> Tensor { let tensor = if tensor.size().len() == 5 { - tensor.permute(&[0, 1, 3, 2, 4]).contiguous() + tensor.permute([0, 1, 3, 2, 4]).contiguous() } else if tensor.size().len() == 4 { - tensor.permute(&[0, 2, 1, 3]).contiguous() + tensor.permute([0, 2, 1, 3]).contiguous() } else { panic!( "Input tensor rank should be one of [4, 5], but is: {}", @@ -197,7 +197,7 @@ impl GptJAttention { let mask_value = get_min(attention_weights.kind()).unwrap(); let mask_value = Tensor::full( - &attention_weights.size(), + attention_weights.size(), mask_value, (attention_weights.kind(), attention_weights.device()), ); @@ -261,8 +261,8 @@ impl GptJAttention { query = apply_rotary_pos_emb(&query, &sincos, offset); } - key = key.permute(&[0, 2, 1, 3]); - query = query.permute(&[0, 2, 1, 3]); + key = key.permute([0, 2, 1, 3]); + query = query.permute([0, 2, 1, 3]); if let Some(layer_past) = layer_past { key = Tensor::cat(&[&layer_past.prev_key, &key], -2); @@ -297,7 +297,7 @@ fn fixed_pos_embedding(x: &Tensor, seq_len: i64) -> (Tensor, Tensor) { let sinusoid_inp = Tensor::einsum( "i , j -> i j", &[Tensor::arange(seq_len, (x.kind(), x.device())), inv_freq], - None, + None::, ); (sinusoid_inp.sin(), sinusoid_inp.cos()) } @@ -312,7 +312,7 @@ fn apply_rotary_pos_emb(x: &Tensor, (sin, cos): &(Tensor, Tensor), offset: i64) fn duplicate_interleave(m: &Tensor) -> Tensor { let dim0 = m.size()[0]; m.view([-1, 1]) // flatten the matrix - .repeat(&[1, 2]) // repeat all elements into the 2nd dimension + .repeat([1, 2]) // repeat all elements into the 2nd dimension .view([dim0, -1]) // reshape into a matrix, interleaving the copy } diff --git a/src/gpt_neo/attention.rs b/src/gpt_neo/attention.rs index cb28ee1c..eaa5e087 100644 --- a/src/gpt_neo/attention.rs +++ b/src/gpt_neo/attention.rs @@ -70,7 +70,7 @@ impl GptNeoSelfAttention { let p = p.borrow(); let max_positions = config.max_position_embeddings; - let mut bias = Tensor::ones(&[max_positions, max_positions], (Kind::Uint8, p.device())) + let mut bias = Tensor::ones([max_positions, max_positions], (Kind::Uint8, p.device())) .tril(0) .view([1, 1, max_positions, max_positions]) .requires_grad_(false); @@ -135,11 +135,11 @@ impl GptNeoSelfAttention { let _ = new_shape.pop(); new_shape.extend_from_slice(&[num_heads, attention_head_size]); let reshaped_tensor = input_tensor.view(new_shape.as_slice()); - reshaped_tensor.permute(&[0, 2, 1, 3]) + reshaped_tensor.permute([0, 2, 1, 3]) } fn merge_heads(input_tensor: &Tensor, num_heads: i64, attention_head_size: i64) -> Tensor { - let output_tensor = input_tensor.permute(&[0, 2, 1, 3]).contiguous(); + let output_tensor = input_tensor.permute([0, 2, 1, 3]).contiguous(); let mut new_shape = output_tensor.size(); new_shape.truncate(new_shape.len() - 2); new_shape.push(num_heads * attention_head_size); diff --git a/src/longformer/attention.rs b/src/longformer/attention.rs index 
adcd2f76..862ac73b 100644 --- a/src/longformer/attention.rs +++ b/src/longformer/attention.rs @@ -14,6 +14,7 @@ use crate::common::dropout::Dropout; use crate::common::kind::get_negative_infinity; use crate::longformer::LongformerConfig; use std::borrow::Borrow; +use std::convert::TryFrom; use tch::{nn, Kind, Tensor}; pub struct LongformerSelfAttention { @@ -119,7 +120,7 @@ impl LongformerSelfAttention { ); chunked_hidden_states - .constant_pad_nd(&[0, window_overlap + 1]) + .constant_pad_nd([0, window_overlap + 1]) .view([total_num_heads, num_chunks, -1]) .slice(2, 0, -window_overlap, 1) .view([ @@ -165,15 +166,15 @@ impl LongformerSelfAttention { ]; let beginning_mask = Tensor::ones( - &[affected_sequence_length, affected_sequence_length + 1], + [affected_sequence_length, affected_sequence_length + 1], (Kind::Int, input_tensor.device()), ) .tril(0) - .flip(&[0]) + .flip([0]) .unsqueeze(0) .unsqueeze(2); - let ending_mask = beginning_mask.flip(&[1, 3]); + let ending_mask = beginning_mask.flip([1, 3]); let beginning_mask = beginning_mask .expand(beginning_input_size.as_slice(), true) @@ -214,21 +215,21 @@ impl LongformerSelfAttention { let query = query .transpose(1, 2) - .reshape(&[batch_size * num_heads, sequence_length, head_dim]); + .reshape([batch_size * num_heads, sequence_length, head_dim]); let key = key .transpose(1, 2) - .reshape(&[batch_size * num_heads, sequence_length, head_dim]); + .reshape([batch_size * num_heads, sequence_length, head_dim]); let query = self.chunk(&query, window_overlap); let key = self.chunk(&key, window_overlap); let diagonal_chunked_attention_scores = self.pad_and_transpose_last_two_dims( - &Tensor::einsum("bcxd,bcyd->bcxy", &[query, key], None), + &Tensor::einsum("bcxd,bcyd->bcxy", &[query, key], None::), &[0, 0, 0, 1], ); let diagonal_attention_scores = Tensor::empty( - &[ + [ batch_size * num_heads, chunks_count + 1, window_overlap, @@ -320,7 +321,7 @@ impl LongformerSelfAttention { let (batch_size, sequence_length, num_heads, head_dim) = value.size4().unwrap(); let chunk_counts = sequence_length / window_overlap - 1; - let chunked_attention_probas = attention_probas.transpose(1, 2).reshape(&[ + let chunked_attention_probas = attention_probas.transpose(1, 2).reshape([ batch_size * num_heads, sequence_length / window_overlap, window_overlap, @@ -330,9 +331,9 @@ impl LongformerSelfAttention { let value = value .transpose(1, 2) - .reshape(&[batch_size * num_heads, sequence_length, head_dim]); + .reshape([batch_size * num_heads, sequence_length, head_dim]); - let padded_value = (value + 1).constant_pad_nd(&[0, 0, window_overlap, window_overlap]) - 1; + let padded_value = (value + 1).constant_pad_nd([0, 0, window_overlap, window_overlap]) - 1; let chunked_value_size = &[ batch_size * num_heads, chunk_counts + 1, @@ -353,7 +354,7 @@ impl LongformerSelfAttention { Tensor::einsum( "bcwd,bcdh->bcwh", &[chunked_attention_probas, chunked_value], - None, + None::, ) .view([batch_size, num_heads, sequence_length, head_dim]) .transpose(1, 2) @@ -365,7 +366,8 @@ impl LongformerSelfAttention { ) -> GlobalAttentionIndices { let num_global_attention_indices = is_index_global_attn.sum_dim_intlist([1].as_slice(), false, Kind::Int64); - let max_num_global_attention_indices = i64::from(num_global_attention_indices.max()); + let max_num_global_attention_indices = + i64::try_from(num_global_attention_indices.max()).unwrap(); let is_index_global_attn_nonzero = is_index_global_attn .nonzero_numpy() .into_iter() @@ -411,7 +413,7 @@ impl LongformerSelfAttention { let batch_size 
= key_vectors.size()[0]; let mut key_vectors_only_global = Tensor::zeros( - &[ + [ batch_size, max_num_global_attention_indices, self.num_heads, @@ -429,7 +431,7 @@ impl LongformerSelfAttention { let attention_probas_from_global_key = Tensor::einsum( "blhd,bshd->blhs", &[query_vectors, &key_vectors_only_global], - None, + None::, ); let _ = attention_probas_from_global_key @@ -463,7 +465,7 @@ impl LongformerSelfAttention { let attention_probas_only_global = attention_probas.narrow(-1, 0, max_num_global_attention_indices); let mut value_vectors_only_global = Tensor::zeros( - &[ + [ batch_size, max_num_global_attention_indices, self.num_heads, @@ -513,7 +515,7 @@ impl LongformerSelfAttention { let (sequence_length, batch_size) = (hidden_states_shape[0], hidden_states_shape[1]); let mut global_attention_hidden_states = Tensor::zeros( - &[max_num_global_attention_indices, batch_size, self.embed_dim], + [max_num_global_attention_indices, batch_size, self.embed_dim], (hidden_states.kind(), hidden_states.device()), ); @@ -718,7 +720,7 @@ impl LongformerSelfAttention { let mut attention_output = attention_output .transpose(0, 1) - .reshape(&[sequence_length, batch_size, embed_dim]); + .reshape([sequence_length, batch_size, embed_dim]); let global_attention_probas = if is_global_attention { let (global_attention_output, global_attention_probas) = self diff --git a/src/longformer/embeddings.rs b/src/longformer/embeddings.rs index 5be27501..9ea17b82 100644 --- a/src/longformer/embeddings.rs +++ b/src/longformer/embeddings.rs @@ -93,7 +93,7 @@ impl LongformerEmbeddings { (Kind::Int64, inputs_embeds.device()), ) .unsqueeze(0) - .expand(&[batch_size, sequence_length], true) + .expand([batch_size, sequence_length], true) } pub fn forward_t( diff --git a/src/longformer/encoder.rs b/src/longformer/encoder.rs index bc26fb48..b0a26201 100644 --- a/src/longformer/encoder.rs +++ b/src/longformer/encoder.rs @@ -15,6 +15,7 @@ use crate::common::dropout::Dropout; use crate::longformer::attention::LongformerSelfAttention; use crate::longformer::LongformerConfig; use std::borrow::{Borrow, BorrowMut}; +use std::convert::TryFrom; use tch::nn::Module; use tch::{nn, Tensor}; @@ -293,7 +294,7 @@ impl LongformerEncoder { ) -> LongformerEncoderOutput { let is_index_masked = attention_mask.lt(0); let is_index_global_attention = attention_mask.gt(0); - let is_global_attention = bool::from(is_index_global_attention.any()); + let is_global_attention = bool::try_from(is_index_global_attention.any()).unwrap(); let mut all_hidden_states: Option> = if self.output_hidden_states { Some(vec![]) diff --git a/src/longformer/longformer_model.rs b/src/longformer/longformer_model.rs index 2cd25f0e..f0a9581a 100644 --- a/src/longformer/longformer_model.rs +++ b/src/longformer/longformer_model.rs @@ -393,7 +393,7 @@ impl LongformerModel { .map(|value| self.pad_with_nonzero_value(value, &[0, padding_length], pad_token_id)); let inputs_embeds = input_embeds.map(|value| { let input_ids_padding = Tensor::full( - &[batch_size, padding_length], + [batch_size, padding_length], pad_token_id, (Kind::Int64, value.device()), ); @@ -407,8 +407,7 @@ impl LongformerModel { let attention_mask = attention_mask.map(|value| self.pad_with_boolean(value, &[0, padding_length], false)); - let token_type_ids = - token_type_ids.map(|value| value.constant_pad_nd(&[0, padding_length])); + let token_type_ids = token_type_ids.map(|value| value.constant_pad_nd([0, padding_length])); Ok(PaddedInput { input_ids, position_ids, @@ -584,7 +583,7 @@ impl LongformerModel { 
let mut causal_mask = sequence_ids .unsqueeze(0) .unsqueeze(0) - .repeat(&[batch_size, sequence_length, 1]) + .repeat([batch_size, sequence_length, 1]) .le_tensor(&sequence_ids.unsqueeze(-1).unsqueeze(0)) .totype(Kind::Int); if causal_mask.size()[1] < padded_attention_mask.size()[1] { @@ -593,7 +592,7 @@ impl LongformerModel { causal_mask = Tensor::cat( &[ Tensor::ones( - &[batch_size, sequence_length, prefix_sequence_length], + [batch_size, sequence_length, prefix_sequence_length], (Kind::Int, device), ), causal_mask, @@ -975,7 +974,7 @@ impl LongformerForSequenceClassification { let (batch_size, sequence_length) = (input_shape[0], input_shape[1]); let global_attention_mask = - Tensor::zeros(&[batch_size, sequence_length], (Kind::Int, device)); + Tensor::zeros([batch_size, sequence_length], (Kind::Int, device)); let _ = global_attention_mask.select(1, 0).fill_(1); Some(global_attention_mask) } else { diff --git a/src/longt5/attention.rs b/src/longt5/attention.rs index efb2e2d2..e3cc116f 100644 --- a/src/longt5/attention.rs +++ b/src/longt5/attention.rs @@ -143,16 +143,16 @@ fn make_global_fixed_block_ids( global_block_ids .max_dim(-1, false) .0 - .repeat(&[num_globals, 1]) + .repeat([num_globals, 1]) .transpose(0, 1) } else { Tensor::zeros( - &[batch_size, 0], + [batch_size, 0], (global_block_ids.kind(), global_block_ids.device()), ) }; let global_segment_ids = Tensor::ones( - &[batch_size, num_globals], + [batch_size, num_globals], (attention_mask.kind(), attention_mask.device()), ) .cumsum(-1, attention_mask.kind()) @@ -190,7 +190,7 @@ fn create_global_aggregates( hidden_states, &one_hot_block_ids.to_kind(hidden_states.kind()), ], - None, + None::, ) } @@ -214,7 +214,7 @@ fn compute_bias( ); rp_bucket .apply(relative_attention_bias) - .permute(&[2, 0, 1]) + .permute([2, 0, 1]) .unsqueeze(0) .unsqueeze(0) } @@ -322,11 +322,15 @@ impl LongT5LocalAttention { let key_states = concatenate_3_blocks(&key_states, 1, 2, None); let value_states = concatenate_3_blocks(&value_states, 1, 2, None); - let mut scores = Tensor::einsum("...qhd,...khd->...hqk", &[query_states, key_states], None); + let mut scores = Tensor::einsum( + "...qhd,...khd->...hqk", + &[query_states, key_states], + None::, + ); let calc_position_bias = if position_bias.is_none() { let mut position_bias = if !self.has_relative_attention_bias { Tensor::zeros( - &[1, 1, self.n_heads, self.block_length, 3 * self.block_length], + [1, 1, self.n_heads, self.block_length, 3 * self.block_length], (scores.kind(), scores.device()), ) } else { @@ -356,7 +360,7 @@ impl LongT5LocalAttention { let attention_output = unshape(&Tensor::einsum( "...hqk,...khd->...qhd", &[&attention_weights, &value_states], - None, + None::, )) .narrow(1, 0, seq_length) .apply(&self.output); @@ -492,7 +496,7 @@ impl LongT5TransientGlobalAttention { ); let side_bias = side_relative_position_bucket .apply(self.global_relative_attention_bias.as_ref().unwrap()) - .permute(&[0, 3, 1, 2]); + .permute([0, 3, 1, 2]); attention_side_bias + side_bias } @@ -551,7 +555,11 @@ impl LongT5TransientGlobalAttention { let key_states = Tensor::cat(&[key_states, side_key_states], 2); let value_states = Tensor::cat(&[value_states, side_value_states], 2); - let mut scores = Tensor::einsum("...qhd,...khd->...hqk", &[query_states, key_states], None); + let mut scores = Tensor::einsum( + "...qhd,...khd->...hqk", + &[query_states, key_states], + None::, + ); let local_attention_mask = mask.map(|mask| { let local_attention_mask = get_local_attention_mask(mask, self.block_length); 
local_attention_mask @@ -562,7 +570,7 @@ impl LongT5TransientGlobalAttention { let calc_position_bias = if position_bias.is_none() { let mut position_bias = if !self.has_relative_attention_bias { Tensor::zeros( - &[1, 1, self.n_heads, self.block_length, 3 * self.block_length], + [1, 1, self.n_heads, self.block_length, 3 * self.block_length], (scores.kind(), scores.device()), ) } else { @@ -579,7 +587,7 @@ impl LongT5TransientGlobalAttention { } let calc_mask = if mask.is_none() { Some(Tensor::ones( - &[batch_size, seq_length], + [batch_size, seq_length], (global_segment_ids.kind(), global_segment_ids.device()), )) } else { @@ -610,7 +618,7 @@ impl LongT5TransientGlobalAttention { let attention_output = unshape(&Tensor::einsum( "...hqk,...khd->...qhd", &[&attention_weights, &value_states], - None, + None::, )) .narrow(1, 0, seq_length) .apply(&self.output); diff --git a/src/longt5/encoder.rs b/src/longt5/encoder.rs index 463b6f92..637e7e0a 100644 --- a/src/longt5/encoder.rs +++ b/src/longt5/encoder.rs @@ -306,7 +306,7 @@ impl LongT5Stack { let calculated_attention_mask = if attention_mask.is_none() { Some(Tensor::ones( - &[batch_size, mask_seq_length], + [batch_size, mask_seq_length], (Kind::Int64, input_embeddings.device()), )) } else { @@ -324,7 +324,7 @@ impl LongT5Stack { sequence_length, (input_embeddings.kind(), input_embeddings.device()), ); - let causal_mask = seq_ids.unsqueeze(0).unsqueeze(0).repeat(&[ + let causal_mask = seq_ids.unsqueeze(0).unsqueeze(0).repeat([ batch_size, sequence_length, 1, @@ -358,7 +358,7 @@ impl LongT5Stack { let new_shape = &encoder_hidden_states.as_ref().unwrap().size()[..2]; let calculated_encoder_attention_mask = if encoder_attention_mask.is_none() { Some(Tensor::ones( - &[batch_size, new_shape[1]], + [batch_size, new_shape[1]], (Kind::Int64, input_embeddings.device()), )) } else { diff --git a/src/m2m_100/embeddings.rs b/src/m2m_100/embeddings.rs index be46de69..23bfb477 100644 --- a/src/m2m_100/embeddings.rs +++ b/src/m2m_100/embeddings.rs @@ -82,7 +82,7 @@ impl SinusoidalPositionalEmbedding { sinusoidal_embedding = Tensor::cat( &[ sinusoidal_embedding, - Tensor::zeros(&[num_embeddings, 1], (Kind::Float, device)), + Tensor::zeros([num_embeddings, 1], (Kind::Float, device)), ], 1, ); diff --git a/src/mbart/mbart_model.rs b/src/mbart/mbart_model.rs index 6193c1c6..f5d51e3c 100644 --- a/src/mbart/mbart_model.rs +++ b/src/mbart/mbart_model.rs @@ -688,7 +688,7 @@ impl MBartForSequenceClassification { let reshape = eos_mask.sum_dim_intlist([1].as_slice(), true, Int64); let sentence_representation = base_model_output .decoder_output - .permute(&[2, 0, 1]) + .permute([2, 0, 1]) .masked_select(&eos_mask) .view((-1, reshape.size()[0] * reshape.int64_value(&[0, 0]))) .transpose(0, 1) diff --git a/src/mobilebert/embeddings.rs b/src/mobilebert/embeddings.rs index 16826ac1..92de72e9 100644 --- a/src/mobilebert/embeddings.rs +++ b/src/mobilebert/embeddings.rs @@ -114,7 +114,7 @@ impl MobileBertEmbeddings { let updated_input_embeddings = if self.trigram_input { let padding_tensor = Tensor::zeros( - &[input_shape[0], 1, self.embedding_size], + [input_shape[0], 1, self.embedding_size], (input_embeddings.kind(), input_embeddings.device()), ); let input_embeddings = Tensor::cat( diff --git a/src/mobilebert/mobilebert_model.rs b/src/mobilebert/mobilebert_model.rs index b34ee5b8..d1cca736 100644 --- a/src/mobilebert/mobilebert_model.rs +++ b/src/mobilebert/mobilebert_model.rs @@ -387,7 +387,7 @@ impl MobileBertModel { }; let position_ids = 
Tensor::arange(config.max_position_embeddings, (Kind::Int64, p.device())) - .expand(&[1, -1], true); + .expand([1, -1], true); MobileBertModel { embeddings, encoder, diff --git a/src/pegasus/embeddings.rs b/src/pegasus/embeddings.rs index b2e53a3f..2a285361 100644 --- a/src/pegasus/embeddings.rs +++ b/src/pegasus/embeddings.rs @@ -73,7 +73,7 @@ impl SinusoidalPositionalEmbedding { let sinusoidal_embeddings = Tensor::stack(&sinusoidal_embedding, 0).to_kind(Kind::Float); let reordered_sinusoidal_embeddings = - Tensor::empty(&[num_embeddings, embedding_dim], (Kind::Float, device)); + Tensor::empty([num_embeddings, embedding_dim], (Kind::Float, device)); reordered_sinusoidal_embeddings .slice(1, 0, sentinel, 1) diff --git a/src/pipelines/conversation.rs b/src/pipelines/conversation.rs index b8f46e5f..bd7e035a 100644 --- a/src/pipelines/conversation.rs +++ b/src/pipelines/conversation.rs @@ -1011,7 +1011,7 @@ impl ConversationModel { .unwrap(); let attention_mask = Tensor::ones( - &[inputs.len() as i64, max_len as i64], + [inputs.len() as i64, max_len as i64], (Kind::Int8, self.device), ); diff --git a/src/pipelines/generation_utils.rs b/src/pipelines/generation_utils.rs index 51c75ec1..936e4a90 100644 --- a/src/pipelines/generation_utils.rs +++ b/src/pipelines/generation_utils.rs @@ -229,6 +229,7 @@ pub enum Cache { pub(crate) mod private_generation_utils { use std::cmp::{max, min}; use std::collections::HashMap; + use std::convert::TryFrom; use std::mem; use rust_tokenizers::tokenizer::{truncate_sequences, TruncationStrategy}; @@ -653,7 +654,7 @@ pub(crate) mod private_generation_utils { bad_words_id_length_1: &[i64], ) -> Tensor { let mut static_bad_words_mask = - Tensor::zeros(&[scores.size()[1]], (Kind::Int8, scores.device())); + Tensor::zeros([scores.size()[1]], (Kind::Int8, scores.device())); let _ = static_bad_words_mask.index_fill_( 0, &Tensor::of_slice(bad_words_id_length_1).to_device(scores.device()), @@ -766,9 +767,9 @@ pub(crate) mod private_generation_utils { output_scores: bool, ) -> GeneratedOutputWithScores { let mut unfinished_sentences = - Tensor::ones(&[batch_size], (Kind::Int64, self.get_var_store().device())); + Tensor::ones([batch_size], (Kind::Int64, self.get_var_store().device())); let mut sentence_lengths: Tensor = - Tensor::ones(&[batch_size], (Kind::Int64, self.get_var_store().device())); + Tensor::ones([batch_size], (Kind::Int64, self.get_var_store().device())); let (bad_word_ids_length_1, bad_word_ids_length_greater_than_1) = self.split_bad_word_ids(gen_opt.bad_word_ids); let mut static_bad_words_mask: Option = None; @@ -902,7 +903,7 @@ pub(crate) mod private_generation_utils { prev_scores.push( next_token_logits .log_softmax(-1, next_token_logits.kind()) - .gather(1, &next_token.reshape(&[-1, 1]), true) + .gather(1, &next_token.reshape([-1, 1]), true) .squeeze() .masked_fill(&finished_mask, 0), ); @@ -931,7 +932,7 @@ pub(crate) mod private_generation_utils { ); unfinished_sentences = -unfinished_sentences * (sentence_with_eos - 1); } - if i64::from(unfinished_sentences.max()) == 0 { + if i64::try_from(unfinished_sentences.max()).unwrap() == 0 { break; } } @@ -940,7 +941,7 @@ pub(crate) mod private_generation_utils { &[ attention_mask.as_ref(), Tensor::ones( - &[*attention_mask.size().first().unwrap(), 1], + [*attention_mask.size().first().unwrap(), 1], (Kind::Int64, attention_mask.device()), ) .as_ref(), @@ -1022,20 +1023,20 @@ pub(crate) mod private_generation_utils { let vocab_size = self.get_vocab_size(); let beam_scores = Tensor::ones( - &[batch_size, 
gen_opt.num_beams], + [batch_size, gen_opt.num_beams], (Kind::Float, self.get_var_store().device()), ) * -1e9; let _ = beam_scores .slice(1, 0, *beam_scores.size().last().unwrap(), num_sub_beams) .fill_(0); - let mut beam_scores = beam_scores.view_(&[-1]); + let mut beam_scores = beam_scores.view_([-1]); let mut beam_tokens = Tensor::zeros( - &[batch_size * gen_opt.num_beams], + [batch_size * gen_opt.num_beams], (Kind::Int64, self.get_var_store().device()), ); let mut beam_indices = Tensor::zeros( - &[batch_size * gen_opt.num_beams], + [batch_size * gen_opt.num_beams], (Kind::Int64, self.get_var_store().device()), ); let mut saved_beam_scores: Option> = @@ -1052,7 +1053,7 @@ pub(crate) mod private_generation_utils { loop { if num_beam_groups > 1 { current_tokens = Tensor::zeros( - &[batch_size * gen_opt.num_beams], + [batch_size * gen_opt.num_beams], (input_ids.kind(), input_ids.device()), ); } @@ -1364,7 +1365,7 @@ pub(crate) mod private_generation_utils { &[ attention_mask.as_ref(), Tensor::ones( - &[*attention_mask.size().first().unwrap(), 1], + [*attention_mask.size().first().unwrap(), 1], (Kind::Int64, attention_mask.device()), ) .as_ref(), @@ -1391,7 +1392,7 @@ pub(crate) mod private_generation_utils { let beam_saved_token_scores = saved_beam_scores.as_mut().map(|saved_tokens| { mem::replace(&mut saved_tokens[effective_beam_id as usize], Tensor::new()) }); - let final_score = f64::from(beam_scores.get(effective_beam_id)); + let final_score = f64::try_from(beam_scores.get(effective_beam_id)).unwrap(); let final_tokens = input_ids.get(effective_beam_id); hypotheses[batch_index as usize].add( final_tokens, @@ -1411,7 +1412,7 @@ pub(crate) mod private_generation_utils { }; let mut sentence_lengths = - Tensor::zeros(&[output_batch_size], (Kind::Int64, input_ids.device())); + Tensor::zeros([output_batch_size], (Kind::Int64, input_ids.device())); let mut best_ids = vec![]; let mut scores_output = if output_scores { @@ -1457,14 +1458,21 @@ pub(crate) mod private_generation_utils { } let sentence_max_length = gen_opt .max_length - .map(|max_length| min(i64::from(sentence_lengths.max()) + 1, max_length)) - .unwrap_or(i64::from(sentence_lengths.max()) + 1); + .map(|max_length| { + min( + i64::try_from(sentence_lengths.max()).unwrap() + 1, + max_length, + ) + }) + .unwrap_or(i64::try_from(sentence_lengths.max()).unwrap() + 1); let mut decoded = input_ids.new_empty( - &[output_batch_size, sentence_max_length], + [output_batch_size, sentence_max_length], (Kind::Int64, input_ids.device()), ); - if i64::from(sentence_lengths.max()) != i64::from(sentence_lengths.min()) { + if i64::try_from(sentence_lengths.max()).unwrap() + != i64::try_from(sentence_lengths.min()).unwrap() + { let _ = decoded.fill_( gen_opt .pad_token_id @@ -1476,15 +1484,16 @@ pub(crate) mod private_generation_utils { 0, &Tensor::arange_start( 0, - i64::from(sentence_lengths.get(hypothesis_index as i64)), + i64::try_from(sentence_lengths.get(hypothesis_index as i64)).unwrap(), (Kind::Int64, input_ids.device()), ), best_id, ); - let sentence_length = i64::from(sentence_lengths.get(hypothesis_index as i64)); + let sentence_length = + i64::try_from(sentence_lengths.get(hypothesis_index as i64)).unwrap(); let sentence_length_max = gen_opt .max_length - .unwrap_or_else(|| i64::from(sentence_lengths.max())); + .unwrap_or_else(|| i64::try_from(sentence_lengths.max()).unwrap()); if sentence_length < sentence_length_max { let _ = decoded.get(hypothesis_index as i64).index_fill_( 0, @@ -1810,7 +1819,7 @@ pub trait LanguageGenerator: 
PrivateLanguageGenerator { } None => match self.get_bos_id() { Some(bos_id) => { - Tensor::ones(&[1, 1], (Int64, self.get_var_store().device())) * bos_id + Tensor::ones([1, 1], (Int64, self.get_var_store().device())) * bos_id } None => panic!( "A model with a BOS token must be used to start generation with an empty input" @@ -1916,13 +1925,13 @@ pub trait LanguageGenerator: PrivateLanguageGenerator { let mut input_ids_len = *input_id_size.last().unwrap(); if input_ids_len == 0 { input_ids = Tensor::ones( - &[*input_id_size.first().unwrap(), 1], + [*input_id_size.first().unwrap(), 1], (Int64, input_ids.device()), ) * self .get_bos_id() .expect("`bos_token_id` has to be defined when no `input_ids` are provided."); attention_mask = Some(Tensor::ones( - &[*input_id_size.first().unwrap(), 1], + [*input_id_size.first().unwrap(), 1], (Int64, input_ids.device()), )); input_ids_len += 1; @@ -1952,7 +1961,7 @@ pub trait LanguageGenerator: PrivateLanguageGenerator { let encoder_outputs = self.encode(&input_ids, Some(&attention_mask)).unwrap(); let expanded_batch_indices = Tensor::arange(batch_size, (Int64, input_ids.device())) .view((-1, 1)) - .repeat(&[1, num_beams * effective_batch_mult]) + .repeat([1, num_beams * effective_batch_mult]) .view(-1); Some(encoder_outputs.index_select(0, &expanded_batch_indices)) } else { @@ -1965,7 +1974,7 @@ pub trait LanguageGenerator: PrivateLanguageGenerator { input_ids .unsqueeze(1) .expand( - &[batch_size, effective_batch_mult * num_beams, cur_len], + [batch_size, effective_batch_mult * num_beams, cur_len], true, ) .contiguous() @@ -1973,7 +1982,7 @@ pub trait LanguageGenerator: PrivateLanguageGenerator { attention_mask .unsqueeze(1) .expand( - &[batch_size, effective_batch_mult * num_beams, cur_len], + [batch_size, effective_batch_mult * num_beams, cur_len], true, ) .contiguous() @@ -1988,7 +1997,7 @@ pub trait LanguageGenerator: PrivateLanguageGenerator { .expect("decoder start id must be specified for encoder decoders") }); let input_ids = Tensor::full( - &[effective_batch_size * num_beams, 1], + [effective_batch_size * num_beams, 1], decoder_start_token_id, (Int64, input_ids.device()), ); @@ -1996,7 +2005,7 @@ pub trait LanguageGenerator: PrivateLanguageGenerator { attention_mask .unsqueeze(1) .expand( - &[batch_size, effective_batch_mult * num_beams, input_ids_len], + [batch_size, effective_batch_mult * num_beams, input_ids_len], true, ) .contiguous() @@ -2228,7 +2237,7 @@ impl BeamHypotheses { 1, 0, Some(Tensor::zeros( - &[1], + [1], (scores_tensor.kind(), scores_tensor.device()), )), None, diff --git a/src/pipelines/keywords_extraction/scorer.rs b/src/pipelines/keywords_extraction/scorer.rs index 822dab08..fbdc0a77 100644 --- a/src/pipelines/keywords_extraction/scorer.rs +++ b/src/pipelines/keywords_extraction/scorer.rs @@ -22,6 +22,7 @@ /// SOFTWARE. 
use crate::pipelines::keywords_extraction::KeywordScorerType; use std::cmp::{max, min}; +use std::convert::TryFrom; use tch::{Kind, Tensor}; impl KeywordScorerType { @@ -96,7 +97,8 @@ fn maximal_margin_relevance_score( cosine_similarity(Some(&document_embedding), &word_embeddings).view([-1]); let word_similarities = cosine_similarity(None, &word_embeddings); - let mut keyword_indices = vec![i64::from(word_document_similarities.argmax(0, false))]; + let mut keyword_indices = + vec![i64::try_from(word_document_similarities.argmax(0, false)).unwrap()]; let mut candidate_indices = (0..word_document_similarities.size()[0]).collect::>(); let _ = candidate_indices.remove(keyword_indices[0] as usize); for _ in 0..min(num_keywords - 1, word_embeddings.size()[0] as usize) { @@ -112,7 +114,7 @@ fn maximal_margin_relevance_score( ) .max_dim(1, false); let mmr = candidate_similarities * (1.0 - diversity) - target_similarities * diversity; - let mmr_index = candidate_indices[i64::from(mmr.argmax(0, false)) as usize]; + let mmr_index = candidate_indices[i64::try_from(mmr.argmax(0, false)).unwrap() as usize]; keyword_indices.push(mmr_index); let candidate_mmr_index = candidate_indices .iter() @@ -149,12 +151,13 @@ fn max_sum_score( let (mut best_score, mut best_combination) = (None, None); for idx in 0..keyword_combinations.size()[0] { let combination = keyword_combinations.get(idx); - let combination_score = f64::from( + let combination_score = f64::try_from( word_similarities .index_select(0, &combination) .index_select(1, &combination) .sum(word_similarities.kind()), - ); + ) + .unwrap(); if let Some(current_best_score) = best_score { if combination_score < current_best_score { best_score = Some(combination_score); diff --git a/src/pipelines/masked_language.rs b/src/pipelines/masked_language.rs index eca8a373..32829ae1 100644 --- a/src/pipelines/masked_language.rs +++ b/src/pipelines/masked_language.rs @@ -61,6 +61,7 @@ use crate::{ use rust_tokenizers::tokenizer::TruncationStrategy; use rust_tokenizers::TokenizedInput; use std::borrow::Borrow; +use std::convert::TryFrom; use tch::nn::VarStore; use tch::{nn, no_grad, Device, Tensor}; @@ -580,7 +581,7 @@ impl MaskedLanguageModel { for input_id in 0..input.as_ref().len() as i64 { let mut sequence_tokens = vec![]; let sequence_mask = mask_token_mask.get(input_id); - if bool::from(sequence_mask.any()) { + if bool::try_from(sequence_mask.any())? { let mask_scores = output .get(input_id) .index_select(0, &sequence_mask.argwhere().squeeze_dim(1)); diff --git a/src/pipelines/sentence_embeddings/pipeline.rs b/src/pipelines/sentence_embeddings/pipeline.rs index 6005825c..b2104152 100644 --- a/src/pipelines/sentence_embeddings/pipeline.rs +++ b/src/pipelines/sentence_embeddings/pipeline.rs @@ -1,5 +1,5 @@ use std::borrow::Borrow; -use std::convert::TryInto; +use std::convert::{TryFrom, TryInto}; use rust_tokenizers::tokenizer::TruncationStrategy; use tch::{nn, Tensor}; @@ -365,7 +365,7 @@ impl SentenceEmbeddingsModel { }; let maybe_normalized = if self.normalize_embeddings { let norm = &maybe_linear - .norm_scalaropt_dim(2, &[1], true) + .norm_scalaropt_dim(2, [1], true) .clamp_min(1e-12) .expand_as(&maybe_linear); maybe_linear / norm @@ -385,7 +385,7 @@ impl SentenceEmbeddingsModel { S: AsRef + Sync, { let SentenceEmbeddingsModelOutput { embeddings, .. } = self.encode_as_tensor(inputs)?; - Ok(Vec::from(embeddings)) + Ok(Vec::try_from(embeddings)?) 
} fn nb_layers(&self) -> usize { @@ -433,7 +433,7 @@ impl SentenceEmbeddingsModel { all_attentions, } = self.encode_as_tensor(inputs)?; - let embeddings = Vec::from(embeddings); + let embeddings = Vec::try_from(embeddings)?; let all_attentions = all_attentions.ok_or_else(|| { RustBertError::InvalidConfigurationError("No attention outputted".into()) })?; @@ -448,7 +448,7 @@ impl SentenceEmbeddingsModel { .slice(0, i, i + 1, 1) .slice(1, head as i64, head as i64 + 1, 1) .squeeze(); - let attention_head = AttentionHead::from(attention_slice); + let attention_head = AttentionHead::try_from(attention_slice).unwrap(); attention_layer.push(attention_head); } attention_output.push(attention_layer); diff --git a/src/prophetnet/attention.rs b/src/prophetnet/attention.rs index 21d67bb7..d381129b 100644 --- a/src/prophetnet/attention.rs +++ b/src/prophetnet/attention.rs @@ -531,7 +531,7 @@ impl ProphetNetNgramAttention { let predict_attention_weights = Tensor::einsum( "nbtc,nbsc->nbts", &[predict_query_states, predict_key_states], - None, + None::, ); let predict_relative_pos_embeddings = self.get_predict_relative_pos_embeddings( @@ -555,7 +555,7 @@ impl ProphetNetNgramAttention { let predict_attention_output = Tensor::einsum( "nbts,nbsc->nbtc", &[&predict_attention_probas, &predict_value_states], - None, + None::, ) .transpose(1, 2) .contiguous() @@ -611,11 +611,11 @@ impl ProphetNetNgramAttention { ) .unsqueeze(0) .unsqueeze(0) - .repeat(&[batch_size, sequence_length, 1]); + .repeat([batch_size, sequence_length, 1]); let relative_positions = relative_positions - position_ids .unsqueeze(0) - .repeat(&[batch_size, sequence_length, 1]); + .repeat([batch_size, sequence_length, 1]); Some(compute_relative_buckets( self.num_buckets, self.relative_max_distance, @@ -637,11 +637,11 @@ impl ProphetNetNgramAttention { self.num_buckets, self.num_attention_heads, ]) - .permute(&[0, 3, 1, 2]) - .reshape(&[-1, self.num_buckets]); + .permute([0, 3, 1, 2]) + .reshape([-1, self.num_buckets]); let main_relative_position_buckets = main_relative_position_buckets - .repeat(&[1, self.num_attention_heads, 1]) + .repeat([1, self.num_attention_heads, 1]) .view([-1, *main_relative_position_buckets.size().last().unwrap()]); let mut new_shape = attention_weights @@ -672,11 +672,11 @@ impl ProphetNetNgramAttention { Tensor::arange(key_sequence_length, (Kind::Int64, hidden_states.device())) .unsqueeze(0) .unsqueeze(0) - .repeat(&[batch_size, sequence_length, 1]); + .repeat([batch_size, sequence_length, 1]); let relative_positions = relative_positions - position_ids .unsqueeze(0) - .repeat(&[batch_size, sequence_length, 1]); + .repeat([batch_size, sequence_length, 1]); Some(compute_relative_buckets( self.num_buckets, self.relative_max_distance, @@ -700,12 +700,12 @@ impl ProphetNetNgramAttention { self.num_buckets, self.num_attention_heads, ]) - .permute(&[0, 1, 4, 2, 3]) - .reshape(&[-1, self.num_buckets]); + .permute([0, 1, 4, 2, 3]) + .reshape([-1, self.num_buckets]); let predict_relative_position_buckets = predict_relative_position_buckets .unsqueeze(0) - .repeat(&[self.ngram, 1, self.num_attention_heads, 1]) + .repeat([self.ngram, 1, self.num_attention_heads, 1]) .view([ -1, *predict_relative_position_buckets.size().last().unwrap(), @@ -770,12 +770,12 @@ pub(crate) fn compute_all_stream_relative_buckets( let main_stream_relative_positions = position_ids .unsqueeze(1) - .repeat(&[1, *position_ids.size().last().unwrap(), 1]) + .repeat([1, *position_ids.size().last().unwrap(), 1]) - position_ids.unsqueeze(-1); let 
predicting_stream_relative_positions = Tensor::cat(&[&(position_ids - 1), position_ids], 1) .unsqueeze(1) - .repeat(&[1, *position_ids.size().last().unwrap(), 1]) + .repeat([1, *position_ids.size().last().unwrap(), 1]) - position_ids.unsqueeze(-1); let main_relative_position_buckets = compute_relative_buckets( diff --git a/src/prophetnet/decoder.rs b/src/prophetnet/decoder.rs index 5b647ebe..2a311ebc 100644 --- a/src/prophetnet/decoder.rs +++ b/src/prophetnet/decoder.rs @@ -25,7 +25,7 @@ use tch::nn::init::DEFAULT_KAIMING_UNIFORM; use tch::{nn, Device, Kind, Tensor}; fn ngram_attention_bias(sequence_length: i64, ngram: i64, device: Device, kind: Kind) -> Tensor { - let left_block = Tensor::ones(&[ngram, sequence_length, sequence_length], (kind, device)) + let left_block = Tensor::ones([ngram, sequence_length, sequence_length], (kind, device)) * get_min(kind).unwrap(); let right_block = left_block.copy(); for stream_idx in 0..ngram { @@ -302,7 +302,7 @@ impl ProphetNetDecoder { ngram_hidden_states.push( (&self.ngram_embeddings.get(ngram - 1) + &predicting_stream_pos_embed) .transpose(0, 1) - .repeat(&[1, batch_size, 1]), + .repeat([1, batch_size, 1]), ); } (ngram_hidden_states, None, None) @@ -328,7 +328,7 @@ impl ProphetNetDecoder { let extended_encoder_attention_mask = encoder_attention_mask.map(|encoder_attention_mask_value| { encoder_attention_mask_value.ones_like() - - encoder_attention_mask_value.unsqueeze(1).repeat(&[ + - encoder_attention_mask_value.unsqueeze(1).repeat([ self.num_attention_heads, 1, 1, @@ -471,7 +471,7 @@ impl ProphetNetDecoder { self.max_target_positions, (Kind::Int64, position_ids.device()), ) - .repeat(&[1, 1]); + .repeat([1, 1]); let (main_relative_buckets, predict_relative_buckets) = compute_all_stream_relative_buckets( self.num_buckets, @@ -482,7 +482,7 @@ impl ProphetNetDecoder { let main_relative_buckets = main_relative_buckets .slice(1, 0, sequence_length, 1) .slice(2, 0, sequence_length, 1) - .repeat(&[batch_size, 1, 1]); + .repeat([batch_size, 1, 1]); let predict_relative_buckets = Tensor::cat( &[ @@ -500,7 +500,7 @@ impl ProphetNetDecoder { ], 2, ) - .repeat(&[batch_size, 1, 1]); + .repeat([batch_size, 1, 1]); (main_relative_buckets, predict_relative_buckets) } @@ -514,7 +514,7 @@ impl ProphetNetDecoder { let (sequence_length, batch_size) = (input_size[0], input_size[1]); let causal_mask = Tensor::full( - &[sequence_length, sequence_length], + [sequence_length, sequence_length], get_min(hidden_states.kind()).unwrap(), (hidden_states.kind(), hidden_states.device()), ) @@ -522,7 +522,7 @@ impl ProphetNetDecoder { let extended_causal_mask = causal_mask .unsqueeze(0) - .expand(&[batch_size, sequence_length, sequence_length], true); + .expand([batch_size, sequence_length, sequence_length], true); let extended_attention_mask = if let Some(attention_mask_value) = attention_mask { let extended_attention_mask = @@ -533,7 +533,7 @@ impl ProphetNetDecoder { extended_causal_mask }; - extended_attention_mask.repeat(&[self.num_attention_heads, 1, 1]) + extended_attention_mask.repeat([self.num_attention_heads, 1, 1]) } fn prepare_predict_attention_mask( @@ -579,7 +579,7 @@ impl ProphetNetDecoder { - attention_mask_value.unsqueeze(0).unsqueeze(2)) * -10000.0; let extended_attention_mask = extended_attention_mask.expand( - &[self.ngram, batch_size, sequence_length, sequence_length], + [self.ngram, batch_size, sequence_length, sequence_length], true, ); let extended_attention_mask = Tensor::cat( @@ -594,7 +594,7 @@ impl ProphetNetDecoder { extended_predict_causal_mask }; 
- extended_attention_mask.repeat(&[1, self.num_attention_heads, 1, 1]) + extended_attention_mask.repeat([1, self.num_attention_heads, 1, 1]) } } diff --git a/src/prophetnet/embeddings.rs b/src/prophetnet/embeddings.rs index 42955431..6414b0fd 100644 --- a/src/prophetnet/embeddings.rs +++ b/src/prophetnet/embeddings.rs @@ -55,8 +55,7 @@ impl ProphetNetPositionalEmbeddings { if let Some(prev_num_input_ids_value) = prev_num_input_ids { let num_input_ids = input_shape[1] + prev_num_input_ids_value; - Tensor::ones(&[1, 1], (Kind::Int64, device)) - * (self.padding_idx + num_input_ids) + Tensor::ones([1, 1], (Kind::Int64, device)) * (self.padding_idx + num_input_ids) } else { let calc_attention_mask = if attention_mask.is_none() { Some(Tensor::ones(input_shape, (Kind::Int64, device))) diff --git a/src/prophetnet/encoder.rs b/src/prophetnet/encoder.rs index d6f0160a..4152e5c8 100644 --- a/src/prophetnet/encoder.rs +++ b/src/prophetnet/encoder.rs @@ -154,7 +154,7 @@ impl ProphetNetEncoder { let input_embeds = input_embeds.unwrap_or_else(|| calc_input_embeddings.as_ref().unwrap()); let extended_attention_mask = attention_mask.map(|mask| { - ((mask.ones_like() - mask.unsqueeze(1).repeat(&[self.num_attention_heads, 1, 1])) + ((mask.ones_like() - mask.unsqueeze(1).repeat([self.num_attention_heads, 1, 1])) * -10000.0) .to_kind(input_embeds.kind()) }); diff --git a/src/reformer/attention.rs b/src/reformer/attention.rs index b3ca9df4..e943d287 100644 --- a/src/reformer/attention.rs +++ b/src/reformer/attention.rs @@ -247,7 +247,7 @@ impl LSHSelfAttention { let per_head_query_key = self .query_key .ws - .reshape(&[ + .reshape([ self.num_attention_heads, self.attention_head_size, self.hidden_size, @@ -256,7 +256,7 @@ impl LSHSelfAttention { Tensor::einsum( "balh,ahr->balr", &[hidden_states, &per_head_query_key], - None, + None::, ) } @@ -264,13 +264,17 @@ impl LSHSelfAttention { let per_head_value = self .value .ws - .reshape(&[ + .reshape([ self.num_attention_heads, self.attention_head_size, self.hidden_size, ]) .transpose(-2, -1); - Tensor::einsum("balh,ahr->balr", &[hidden_states, &per_head_value], None) + Tensor::einsum( + "balh,ahr->balr", + &[hidden_states, &per_head_value], + None::, + ) } fn hash_vectors( @@ -307,9 +311,12 @@ impl LSHSelfAttention { num_hashes, rotation_size / 2, ]; - let random_rotations = Tensor::randn(&rotations_shape, (vectors.kind(), vectors.device())); - let rotated_vectors = - Tensor::einsum("bmtd,mdhr->bmhtr", &[vectors, random_rotations], None); + let random_rotations = Tensor::randn(rotations_shape, (vectors.kind(), vectors.device())); + let rotated_vectors = Tensor::einsum( + "bmtd,mdhr->bmhtr", + &[vectors, random_rotations], + None::, + ); let mut buckets = match &self.num_buckets { NumBuckets::Integer(_) => { @@ -320,7 +327,7 @@ impl LSHSelfAttention { } NumBuckets::Array(buckets_array) => { let (mut buckets, mut cur_sum, mut cur_product) = ( - Tensor::zeros(&[1], (rotated_vectors.kind(), rotated_vectors.device())), + Tensor::zeros([1], (rotated_vectors.kind(), rotated_vectors.device())), 0, 1, ); @@ -341,14 +348,14 @@ impl LSHSelfAttention { }; if let Some(attention_mask_value) = attention_mask { - if i64::from(attention_mask_value.sum(Kind::Int)) + if i64::try_from(attention_mask_value.sum(Kind::Int)).unwrap() < batch_size * *attention_mask_value.size().last().unwrap() { num_buckets += 1; let buckets_mask = attention_mask_value .unsqueeze(1) .unsqueeze(1) - .expand(&buckets.size(), true) + .expand(buckets.size(), true) .to_kind(Kind::Bool); buckets = 
buckets.where_self( &buckets_mask, @@ -432,7 +439,7 @@ impl LSHSelfAttention { query_shape[sorted_bucket_indices_per_hash.dim() - 1] = 1; let query_bucket_idx = sorted_bucket_indices_per_hash.new_full( query_shape.as_slice(), - i64::from(sorted_bucket_indices_per_hash.max()), + i64::try_from(sorted_bucket_indices_per_hash.max()).unwrap(), (Kind::Int64, sorted_bucket_indices_per_hash.device()), ); (query_bucket_idx, sorted_bucket_indices_per_hash) @@ -486,7 +493,7 @@ impl LSHSelfAttention { )?; } - let mut logits = query_key_dots.logsumexp(&[-1], true); + let mut logits = query_key_dots.logsumexp([-1], true); let attention_probs = (query_key_dots - &logits) .exp() .apply_t(&self.dropout, train); @@ -555,7 +562,8 @@ impl LSHSelfAttention { let hidden_states_shape = hidden_states.size(); let (batch_size, sequence_length) = (hidden_states_shape[0], hidden_states_shape[1]); let max_bucket = self.num_buckets.max_bucket(); - let increase_num_buckets = i64::from(past_buckets.max()) > num_hashes * max_bucket - 1; + let increase_num_buckets = + i64::try_from(past_buckets.max()).unwrap() > num_hashes * max_bucket - 1; let query_buckets = self.hash_vectors( query_vectors, @@ -597,14 +605,14 @@ impl LSHSelfAttention { &relevant_bucket_indices_chunk + bucket_indices_batch_offset; let relevant_hidden_states = hidden_states - .reshape(&[-1, self.hidden_size]) + .reshape([-1, self.hidden_size]) .index_select( 0, &relevant_bucket_indices_chunk_all_batch.to_kind(Kind::Int64), ) - .reshape(&[batch_size, self.num_attention_heads, -1, self.hidden_size]); + .reshape([batch_size, self.num_attention_heads, -1, self.hidden_size]); - let relevant_bucket_indices_chunk = relevant_bucket_indices_chunk.reshape(&[ + let relevant_bucket_indices_chunk = relevant_bucket_indices_chunk.reshape([ batch_size, self.num_attention_heads, num_hashes, @@ -633,18 +641,18 @@ impl LSHSelfAttention { let expanded_start_indices = start_indices_chunk .unsqueeze(-1) - .expand(&[indices.size()[0], total_chunk_size], true); + .expand([indices.size()[0], total_chunk_size], true); let chunk_sequence_indices = expanded_start_indices + Tensor::arange(total_chunk_size, (Kind::Int64, indices.device())) .unsqueeze(0) - .expand(&[indices.size()[0], total_chunk_size], true); + .expand([indices.size()[0], total_chunk_size], true); let chunk_sequence_indices = chunk_sequence_indices .flatten(0, 1) .remainder(sequence_length); let indices = indices .unsqueeze(1) - .expand(&[indices.size()[0], total_chunk_size, -1], true) + .expand([indices.size()[0], total_chunk_size, -1], true) .flatten(0, 1); indices.select(1, -1).copy_(&chunk_sequence_indices); @@ -668,9 +676,9 @@ impl LSHSelfAttention { fn gather_by_expansion(&self, vectors: &Tensor, indices: &Tensor, num_hashes: i64) -> Tensor { let expanded_indices = indices .unsqueeze(-1) - .expand(&[-1, -1, -1, self.attention_head_size], true); + .expand([-1, -1, -1, self.attention_head_size], true); vectors - .repeat(&[1, 1, num_hashes, 1]) + .repeat([1, 1, num_hashes, 1]) .gather(2, &expanded_indices, false) } @@ -742,7 +750,7 @@ impl LSHSelfAttention { Some(self.attention_head_size), )?; - query_vectors = query_vectors.unsqueeze(2).repeat(&[1, 1, num_hashes, 1, 1]); + query_vectors = query_vectors.unsqueeze(2).repeat([1, 1, num_hashes, 1, 1]); ( key_value_hidden_states, query_key_vectors, @@ -859,7 +867,7 @@ impl LSHSelfAttention { } else { ( Tensor::arange(sequence_length, (Kind::Int64, query_key_vectors.device())) - .repeat(&[batch_size, self.num_attention_heads, 1]), + .repeat([batch_size, 
self.num_attention_heads, 1]), None, ) }; @@ -1094,7 +1102,7 @@ impl LocalSelfAttention { .sqrt(); let indices = Tensor::arange(sequence_length, (Kind::Int64, query_vectors.device())) - .repeat(&[batch_size, self.num_attention_heads, 1]); + .repeat([batch_size, self.num_attention_heads, 1]); let do_standard_attention = sequence_length <= self.chunk_length; @@ -1158,7 +1166,7 @@ impl LocalSelfAttention { )?; } - let logits = query_key_dots.logsumexp(&[-1], true); + let logits = query_key_dots.logsumexp([-1], true); let attention_probs = (query_key_dots - logits) .exp() .apply_t(&self.dropout, train); diff --git a/src/reformer/attention_utils.rs b/src/reformer/attention_utils.rs index 836d3500..f5101b39 100644 --- a/src/reformer/attention_utils.rs +++ b/src/reformer/attention_utils.rs @@ -21,7 +21,7 @@ pub fn stable_argsort(input_tensor: &Tensor, dim: i64) -> Tensor { let scaling_dim = input_tensor.size()[dim as usize]; let scaled_offset = Tensor::arange(scaling_dim, (Kind::Int, input_tensor.device())) .view([1, 1, -1]) - .expand(&input_tensor.size(), true); + .expand(input_tensor.size(), true); let scaled_tensor = scaling_dim * input_tensor + (scaled_offset / scaling_dim); scaled_tensor.argsort(dim, false) } @@ -138,7 +138,7 @@ pub fn merge_hidden_size_dim( -1, num_attention_heads * attention_head_size, ]; - input.permute(&[0, 2, 1, 3]).reshape(&new_shape) + input.permute([0, 2, 1, 3]).reshape(new_shape) } pub fn split_seq_length_dim_to( diff --git a/src/reformer/embeddings.rs b/src/reformer/embeddings.rs index f8d165aa..0e42aa6e 100644 --- a/src/reformer/embeddings.rs +++ b/src/reformer/embeddings.rs @@ -16,6 +16,7 @@ use crate::common::embeddings::process_ids_embeddings_pair; use crate::reformer::ReformerConfig; use crate::RustBertError; use std::borrow::Borrow; +use std::convert::TryFrom; use tch::nn::Init; use tch::{nn, Kind, Tensor}; @@ -81,18 +82,18 @@ impl AxialPositionEmbeddings { .transpose(2, 1) .feature_dropout(self.dropout_prob, train) .transpose(2, 1) - .reshape(&[batch_size, sequence_length, -1]) + .reshape([batch_size, sequence_length, -1]) } else { Tensor::cat( &broadcasted_weights .iter() - .map(|tensor| tensor.reshape(&[batch_size, sequence_length, -1])) + .map(|tensor| tensor.reshape([batch_size, sequence_length, -1])) .collect::>(), -1, ) } } else { - let max_position_id = i64::from(position_ids.max()); + let max_position_id = i64::try_from(position_ids.max()).unwrap(); let required_pos_encodings_columns = (max_position_id + 1) / self.axial_pos_shape[1] + 1; let position_encodings = Tensor::cat( @@ -102,7 +103,7 @@ impl AxialPositionEmbeddings { .collect::>(), -1, ); - let position_encodings = position_encodings.reshape(&[ + let position_encodings = position_encodings.reshape([ batch_size, -1, *position_encodings.size().last().unwrap(), diff --git a/src/reformer/reformer_model.rs b/src/reformer/reformer_model.rs index 51e755af..130e28b2 100644 --- a/src/reformer/reformer_model.rs +++ b/src/reformer/reformer_model.rs @@ -428,14 +428,14 @@ impl ReformerModel { device: Device, ) -> Result { let input_ids_padding = Tensor::full( - &[input_shape[0], padding_length], + [input_shape[0], padding_length], self.pad_token_id, (Kind::Int64, device), ); let attention_mask = Some(if let Some(attention_mask) = attention_mask { let attention_mask_padding = Tensor::zeros( - &[input_shape[0], padding_length], + [input_shape[0], padding_length], (attention_mask.kind(), device), ); Tensor::cat(&[attention_mask, &attention_mask_padding], -1) @@ -443,7 +443,7 @@ impl ReformerModel { 
Tensor::cat( &[ Tensor::ones(input_shape, (Kind::Int8, device)), - Tensor::zeros(&[input_shape[0], padding_length], (Kind::Int8, device)), + Tensor::zeros([input_shape[0], padding_length], (Kind::Int8, device)), ], -1, ) @@ -461,7 +461,7 @@ impl ReformerModel { (Kind::Int64, device), ) .unsqueeze(0) - .expand(&[input_shape[0], padding_length], true); + .expand([input_shape[0], padding_length], true); Some(Tensor::cat(&[position_ids, &position_ids_padding], -1)) } else { None diff --git a/src/roberta/embeddings.rs b/src/roberta/embeddings.rs index eadb5d4e..08fe1657 100644 --- a/src/roberta/embeddings.rs +++ b/src/roberta/embeddings.rs @@ -194,7 +194,7 @@ impl BertEmbedding for RobertaEmbeddings { let calc_token_type_ids = if token_type_ids.is_none() { Some(Tensor::zeros( - &input_shape, + input_shape, (Kind::Int64, input_embeddings.device()), )) } else { diff --git a/src/t5/attention.rs b/src/t5/attention.rs index 1a1dea8c..2abae806 100644 --- a/src/t5/attention.rs +++ b/src/t5/attention.rs @@ -223,14 +223,14 @@ impl T5Attention { None }; - let mut scores = Tensor::einsum("bnqd,bnkd->bnqk", &[q, k], None); + let mut scores = Tensor::einsum("bnqd,bnkd->bnqk", &[q, k], None::); let calculated_position_bias = if position_bias.is_none() { let mut temp_value = if self.has_relative_attention_bias { self.compute_bias(real_seq_length, key_length, hidden_states.device()) } else { Tensor::zeros( - &[1, self.n_heads, real_seq_length, key_length], + [1, self.n_heads, real_seq_length, key_length], (scores.kind(), scores.device()), ) }; @@ -289,7 +289,7 @@ impl T5Attention { ); rp_bucket .apply(self.relative_attention_bias.as_ref().unwrap()) - .permute(&[2, 0, 1]) + .permute([2, 0, 1]) .unsqueeze(0) } } diff --git a/src/t5/encoder.rs b/src/t5/encoder.rs index c1287b26..b2e00ce1 100644 --- a/src/t5/encoder.rs +++ b/src/t5/encoder.rs @@ -20,6 +20,7 @@ use crate::t5::T5Config; use crate::Activation::{gelu_new, relu}; use crate::RustBertError; use std::borrow::{Borrow, BorrowMut}; +use std::convert::TryFrom; use tch::nn::LinearConfig; use tch::{nn, Kind, Scalar, Tensor}; @@ -227,7 +228,9 @@ impl T5Block { } pub(crate) fn clamp_hidden_states(hidden_states: Tensor) -> Tensor { - if (hidden_states.kind() != Kind::Float) & bool::from(hidden_states.isinf().any()) { + if (hidden_states.kind() != Kind::Float) + & bool::try_from(hidden_states.isinf().any()).unwrap() + { let clamp_value = match hidden_states.kind() { Kind::Half => half::f16::MAX.to_f64() - 1000., Kind::BFloat16 => half::bf16::MAX.to_f64() - 1000., @@ -398,7 +401,7 @@ impl T5Stack { let calculated_attention_mask = if attention_mask.is_none() { Some(Tensor::ones( - &[batch_size, mask_seq_length], + [batch_size, mask_seq_length], (Kind::Int64, input_embeddings.device()), )) } else { @@ -416,7 +419,7 @@ impl T5Stack { input_shape[1], (input_embeddings.kind(), input_embeddings.device()), ); - let causal_mask = seq_ids.unsqueeze(0).unsqueeze(0).repeat(&[ + let causal_mask = seq_ids.unsqueeze(0).unsqueeze(0).repeat([ input_shape[0], input_shape[1], 1, @@ -445,7 +448,7 @@ impl T5Stack { let encoder_mask = match encoder_attention_mask { Some(value) => value.copy(), None => Tensor::ones( - &[ + [ encoder_hidden_states_shape[0], encoder_hidden_states_shape[1], ], diff --git a/src/xlnet/attention.rs b/src/xlnet/attention.rs index 4492a58f..64dcdde1 100644 --- a/src/xlnet/attention.rs +++ b/src/xlnet/attention.rs @@ -150,9 +150,9 @@ impl XLNetRelativeAttention { fn rel_shift_bnij(&self, x: &Tensor, klen: i64) -> Tensor { let shape = x.size(); - 
x.reshape(&[shape[0], shape[1], shape[3], shape[2]]) + x.reshape([shape[0], shape[1], shape[3], shape[2]]) .narrow(2, 1, shape[3] - 1) - .reshape(&[shape[0], shape[1], shape[2], shape[3] - 1]) + .reshape([shape[0], shape[1], shape[2], shape[3] - 1]) .index_select(3, &Tensor::arange(klen, (Kind::Int64, x.device()))) } @@ -169,13 +169,13 @@ impl XLNetRelativeAttention { let ac = Tensor::einsum( "ibnd,jbnd->bnij", &[&(q_head + &self.r_w_bias), k_head_h], - None, + None::, ); let bd = self.rel_shift_bnij( &Tensor::einsum( "ibnd,jbnd->bnij", &[&(q_head + &self.r_r_bias), k_head_r], - None, + None::, ), ac.size()[3], ); @@ -185,30 +185,33 @@ impl XLNetRelativeAttention { let ef = Tensor::einsum( "ibnd,snd->ibns", &[&(q_head + &self.r_s_bias), &self.seg_embed], - None, + None::, ); - Tensor::einsum("ijbs,ibns->bnij", &[seg_mat, &ef], None) + Tensor::einsum("ijbs,ibns->bnij", &[seg_mat, &ef], None::) } - None => Tensor::zeros(&[1], (ac.kind(), ac.device())), + None => Tensor::zeros([1], (ac.kind(), ac.device())), }; let mut attention_score = (ac + bd + ef) * self.scale; if let Some(value) = attention_mask { let target_kind = attention_score.kind(); attention_score = - (attention_score - value.permute(&[2, 3, 0, 1]) * 1e30).to_kind(target_kind); + (attention_score - value.permute([2, 3, 0, 1]) * 1e30).to_kind(target_kind); }; let attention_probas = attention_score .softmax(3, attention_score.kind()) .apply_t(&self.dropout, train); - let attention_vector = - Tensor::einsum("bnij,jbnd->ibnd", &[&attention_probas, v_head_h], None); + let attention_vector = Tensor::einsum( + "bnij,jbnd->ibnd", + &[&attention_probas, v_head_h], + None::, + ); if self.output_attentions { ( attention_vector, - Some(attention_probas.permute(&[2, 3, 0, 1])), + Some(attention_probas.permute([2, 3, 0, 1])), ) } else { (attention_vector, None) @@ -222,9 +225,12 @@ impl XLNetRelativeAttention { residual: bool, train: bool, ) -> Tensor { - let mut attention_out = - Tensor::einsum("ibnd,hnd->ibh", &[attention_vector, &self.output], None) - .apply_t(&self.dropout, train); + let mut attention_out = Tensor::einsum( + "ibnd,hnd->ibh", + &[attention_vector, &self.output], + None::, + ) + .apply_t(&self.dropout, train); if residual { attention_out = attention_out + h; }; @@ -256,10 +262,10 @@ impl XLNetRelativeAttention { Some(value) => value, None => h, }; - let q_head_h = Tensor::einsum("ibh,hnd->ibnd", &[h, &self.query], None); - let k_head_h = Tensor::einsum("ibh,hnd->ibnd", &[cat, &self.key], None); - let v_head_h = Tensor::einsum("ibh,hnd->ibnd", &[cat, &self.value], None); - let k_head_r = Tensor::einsum("ibh,hnd->ibnd", &[r, &self.pos], None); + let q_head_h = Tensor::einsum("ibh,hnd->ibnd", &[h, &self.query], None::); + let k_head_h = Tensor::einsum("ibh,hnd->ibnd", &[cat, &self.key], None::); + let v_head_h = Tensor::einsum("ibh,hnd->ibnd", &[cat, &self.value], None::); + let k_head_r = Tensor::einsum("ibh,hnd->ibnd", &[r, &self.pos], None::); let (attention_vec_h, attention_probas_h) = self.rel_attention_core( &q_head_h, @@ -273,12 +279,12 @@ impl XLNetRelativeAttention { let output_h = self.post_attention(h, &attention_vec_h, true, train); let (output_g, attention_probas_g) = if let Some(g) = g { - let q_head_g = Tensor::einsum("ibh,hnd->ibnd", &[g, &self.query], None); + let q_head_g = Tensor::einsum("ibh,hnd->ibnd", &[g, &self.query], None::); let (attention_vec_g, attention_probas_g) = match target_mapping { Some(target_mapping) => { let q_head_g = - Tensor::einsum("mbnd,mlb->lbnd", &[&q_head_g, target_mapping], None); + 
Tensor::einsum("mbnd,mlb->lbnd", &[&q_head_g, target_mapping], None::); let (attention_vec_g, attention_probas_g) = self.rel_attention_core( &q_head_g, &k_head_h, @@ -288,8 +294,11 @@ impl XLNetRelativeAttention { attn_mask_g, train, ); - let attention_vec_g = - Tensor::einsum("lbnd,mlb->mbnd", &[&attention_vec_g, target_mapping], None); + let attention_vec_g = Tensor::einsum( + "lbnd,mlb->mbnd", + &[&attention_vec_g, target_mapping], + None::, + ); (attention_vec_g, attention_probas_g) } None => self.rel_attention_core( diff --git a/src/xlnet/xlnet_model.rs b/src/xlnet/xlnet_model.rs index b8b64b1f..28cff18f 100644 --- a/src/xlnet/xlnet_model.rs +++ b/src/xlnet/xlnet_model.rs @@ -251,8 +251,8 @@ impl XLNetModel { } fn create_mask(&self, q_len: i64, m_len: i64, device: Device) -> Tensor { - let attention_mask = Tensor::ones(&[q_len, q_len], (Kind::Int64, device)); - let attention_mask_pad = Tensor::zeros(&[q_len, m_len], (Kind::Int64, device)); + let attention_mask = Tensor::ones([q_len, q_len], (Kind::Int64, device)); + let attention_mask_pad = Tensor::zeros([q_len, m_len], (Kind::Int64, device)); let mask_up = attention_mask.triu(1); let mut output = Tensor::cat(&[&attention_mask_pad, &mask_up], 1); if self.same_length { @@ -307,12 +307,16 @@ impl XLNetModel { inverse_frequency: &Tensor, batch_size: Option, ) -> Tensor { - let sinusoid = Tensor::einsum("i,d->id", &[position_sequence, inverse_frequency], None); + let sinusoid = Tensor::einsum( + "i,d->id", + &[position_sequence, inverse_frequency], + None::, + ); let mut positional_embeddings = Tensor::cat(&[sinusoid.sin(), sinusoid.cos()], -1).unsqueeze(1); if let Some(bsz) = batch_size { - positional_embeddings = positional_embeddings.expand(&[-1, bsz, -1], true) + positional_embeddings = positional_embeddings.expand([-1, bsz, -1], true) }; positional_embeddings } @@ -466,13 +470,13 @@ impl XLNetModel { let perm_mask = perm_mask.map(|perm_mask| { perm_mask .to_kind(word_emb_k.kind()) - .permute(&[1, 2, 0]) + .permute([1, 2, 0]) .contiguous() }); let target_mapping = target_mapping.map(|target_mapping| { target_mapping .to_kind(word_emb_k.kind()) - .permute(&[1, 2, 0]) + .permute([1, 2, 0]) .contiguous() }); @@ -511,7 +515,7 @@ impl XLNetModel { if let Some(data_mask_value) = &data_mask { if m_len > 0 { let mems_mask = Tensor::zeros( - &[data_mask_value.size()[0], m_len, batch_size], + [data_mask_value.size()[0], m_len, batch_size], (Kind::Bool, data_mask_value.device()), ); data_mask = Some(Tensor::cat(&[&mems_mask, data_mask_value], 1)) @@ -528,7 +532,7 @@ impl XLNetModel { if m_len > 0 { non_tgt_mask = Tensor::cat( &[ - Tensor::zeros(&[q_len, m_len], (Kind::Int64, attn_mask_value.device())), + Tensor::zeros([q_len, m_len], (Kind::Int64, attn_mask_value.device())), non_tgt_mask, ], -1, @@ -542,14 +546,14 @@ impl XLNetModel { let mut output_h = word_emb_k.apply_t(&self.dropout, train); let mut output_g = target_mapping.as_ref().map(|target_mapping_value| { self.mask_emb - .expand(&[target_mapping_value.size()[0], batch_size, -1], true) + .expand([target_mapping_value.size()[0], batch_size, -1], true) .apply_t(&self.dropout, train) }); let seg_mat = if let Some(token_type_ids_value) = token_type_ids { let cat_ids = if m_len > 0 { let mem_pad = Tensor::zeros( - &[m_len, batch_size], + [m_len, batch_size], (Kind::Int64, token_type_ids_value.device()), ); Tensor::cat(&[mem_pad, token_type_ids_value.copy()], 0) @@ -636,7 +640,7 @@ impl XLNetModel { output_h } .apply_t(&self.dropout, train) - .permute(&[1, 0, 2]) + .permute([1, 0, 2]) 
.contiguous(); Ok(XLNetModelOutput { @@ -1673,10 +1677,8 @@ impl PrivateLanguageGenerator for XLNetGenerator { ) -> PreparedInput<'a> { let effective_batch_size = input_ids.size()[0]; let sequence_length = input_ids.size()[1]; - let dummy_token = Tensor::zeros( - &[effective_batch_size, 1], - (Kind::Int64, input_ids.device()), - ); + let dummy_token = + Tensor::zeros([effective_batch_size, 1], (Kind::Int64, input_ids.device())); let offset = 2i64; let input_ids = match &past { Cache::XLNetCache(past) => { @@ -1696,13 +1698,13 @@ impl PrivateLanguageGenerator for XLNetGenerator { }; let sequence_length = input_ids.size()[1]; let perm_mask = Tensor::zeros( - &[effective_batch_size, sequence_length, sequence_length], + [effective_batch_size, sequence_length, sequence_length], (Kind::Float, input_ids.device()), ); let _ = perm_mask.narrow(2, sequence_length - 1, 1).fill_(1.0); let target_mapping = Tensor::zeros( - &[effective_batch_size, 1, sequence_length], + [effective_batch_size, 1, sequence_length], (Kind::Float, input_ids.device()), ); let _ = target_mapping.narrow(2, sequence_length - 1, 1).fill_(1.0); diff --git a/tests/deberta.rs b/tests/deberta.rs index 1d4643e2..789fd661 100644 --- a/tests/deberta.rs +++ b/tests/deberta.rs @@ -99,10 +99,10 @@ fn deberta_masked_lm() -> anyhow::Result<()> { let deberta_model = DebertaForMaskedLM::new(vs.root(), &config); // Generate random input - let input_tensor = Tensor::randint(42, &[32, 128], (Kind::Int64, device)); - let attention_mask = Tensor::ones(&[32, 128], (Kind::Int64, device)); + let input_tensor = Tensor::randint(42, [32, 128], (Kind::Int64, device)); + let attention_mask = Tensor::ones([32, 128], (Kind::Int64, device)); let position_ids = Tensor::arange(128, (Kind::Int64, device)).unsqueeze(0); - let token_type_ids = Tensor::zeros(&[32, 128], (Kind::Int64, device)); + let token_type_ids = Tensor::zeros([32, 128], (Kind::Int64, device)); // Forward pass let model_output = no_grad(|| { diff --git a/tests/deberta_v2.rs b/tests/deberta_v2.rs index 09ea45ae..31a599e0 100644 --- a/tests/deberta_v2.rs +++ b/tests/deberta_v2.rs @@ -25,10 +25,10 @@ fn deberta_v2_masked_lm() -> anyhow::Result<()> { let deberta_model = DebertaV2ForMaskedLM::new(vs.root(), &config); // Generate random input - let input_tensor = Tensor::randint(42, &[32, 128], (Kind::Int64, device)); - let attention_mask = Tensor::ones(&[32, 128], (Kind::Int64, device)); + let input_tensor = Tensor::randint(42, [32, 128], (Kind::Int64, device)); + let attention_mask = Tensor::ones([32, 128], (Kind::Int64, device)); let position_ids = Tensor::arange(128, (Kind::Int64, device)).unsqueeze(0); - let token_type_ids = Tensor::zeros(&[32, 128], (Kind::Int64, device)); + let token_type_ids = Tensor::zeros([32, 128], (Kind::Int64, device)); // Forward pass let model_output = no_grad(|| { diff --git a/tests/fnet.rs b/tests/fnet.rs index 6dfe05c0..e05015b8 100644 --- a/tests/fnet.rs +++ b/tests/fnet.rs @@ -12,6 +12,7 @@ use rust_bert::Config; use rust_tokenizers::tokenizer::{FNetTokenizer, MultiThreadedTokenizer, TruncationStrategy}; use rust_tokenizers::vocab::Vocab; use std::collections::HashMap; +use std::convert::TryFrom; use tch::{nn, no_grad, Device, Tensor}; #[test] @@ -75,7 +76,9 @@ fn fnet_masked_lm() -> anyhow::Result<()> { assert_eq!("▁one", word_1); assert_eq!("▁the", word_2); - let value = (f64::from(model_output.prediction_scores.get(0).get(4).max()) - 13.1721).abs(); + let value = (f64::try_from(model_output.prediction_scores.get(0).get(4).max()).unwrap() + - 13.1721) + .abs(); 
dbg!(value); assert!(value < 1e-3); Ok(()) diff --git a/tests/longformer.rs b/tests/longformer.rs index 508aabad..cc458b3e 100644 --- a/tests/longformer.rs +++ b/tests/longformer.rs @@ -16,6 +16,7 @@ use rust_bert::Config; use rust_tokenizers::tokenizer::{MultiThreadedTokenizer, RobertaTokenizer, TruncationStrategy}; use rust_tokenizers::vocab::Vocab; use std::collections::HashMap; +use std::convert::TryFrom; use tch::{nn, no_grad, Device, Tensor}; #[test] @@ -116,12 +117,12 @@ fn longformer_masked_lm() -> anyhow::Result<()> { .prediction_scores .get(0) .get(4) - .double_value(&[i64::from(&index_1)]); + .double_value(&[i64::try_from(&index_1).unwrap()]); let score_2 = model_output .prediction_scores .get(1) .get(7) - .double_value(&[i64::from(&index_2)]); + .double_value(&[i64::try_from(&index_2).unwrap()]); assert_eq!("Ġeye", word_1); // Outputs "person" : "Looks like one [eye] is missing" assert_eq!("Ġsunny", word_2); // Outputs "pear" : "It was a nice and [sunny] day" diff --git a/tests/mobilebert.rs b/tests/mobilebert.rs index 1976ffdc..f0ac097a 100644 --- a/tests/mobilebert.rs +++ b/tests/mobilebert.rs @@ -10,6 +10,7 @@ use rust_bert::Config; use rust_tokenizers::tokenizer::{BertTokenizer, MultiThreadedTokenizer, TruncationStrategy}; use rust_tokenizers::vocab::Vocab; use std::collections::HashMap; +use std::convert::TryFrom; use tch::{nn, no_grad, Device, Tensor}; #[test] @@ -73,12 +74,12 @@ fn mobilebert_masked_model() -> anyhow::Result<()> { .logits .get(0) .get(4) - .double_value(&[i64::from(&index_1)]); + .double_value(&[i64::try_from(&index_1).unwrap()]); let score_2 = model_output .logits .get(1) .get(7) - .double_value(&[i64::from(&index_2)]); + .double_value(&[i64::try_from(&index_2).unwrap()]); assert_eq!("thing", word_1); // Outputs "person" : "Looks like one [person] is missing" assert_eq!("sunny", word_2); // Outputs "sunny" : "It was a very nice and [sunny] day" diff --git a/tests/xlnet.rs b/tests/xlnet.rs index ba0d6b5e..1ceddebf 100644 --- a/tests/xlnet.rs +++ b/tests/xlnet.rs @@ -59,10 +59,10 @@ fn xlnet_base_model() -> anyhow::Result<()> { let input_tensor = Tensor::stack(tokenized_input.as_slice(), 0).to(device); // Forward pass - let perm_mask = Tensor::zeros(&[1, 4, 4], (Kind::Float, device)); + let perm_mask = Tensor::zeros([1, 4, 4], (Kind::Float, device)); let _ = perm_mask.narrow(2, 3, 1).fill_(1.0); - let target_mapping = Tensor::zeros(&[1, 1, 4], (Kind::Float, device)); + let target_mapping = Tensor::zeros([1, 1, 4], (Kind::Float, device)); let _ = target_mapping.narrow(2, 3, 1).fill_(1.0); let model_output = no_grad(|| { xlnet_model @@ -164,10 +164,10 @@ fn xlnet_lm_model() -> anyhow::Result<()> { let input_tensor = Tensor::stack(tokenized_input.as_slice(), 0).to(device); // Forward pass - let perm_mask = Tensor::zeros(&[1, 4, 4], (Kind::Float, device)); + let perm_mask = Tensor::zeros([1, 4, 4], (Kind::Float, device)); let _ = perm_mask.narrow(2, 3, 1).fill_(1.0); - let target_mapping = Tensor::zeros(&[1, 1, 4], (Kind::Float, device)); + let target_mapping = Tensor::zeros([1, 1, 4], (Kind::Float, device)); let _ = target_mapping.narrow(2, 3, 1).fill_(1.0); let model_output = no_grad(|| { xlnet_model From 564ae85df0e16158cc3d0fdac9fc1a40bf3a69ed Mon Sep 17 00:00:00 2001 From: Guillaume Becquin Date: Sun, 14 May 2023 09:05:34 +0100 Subject: [PATCH 4/4] - Remove debugging print statement - Skip truncation of prompt for encoder-decoder models - Add right padding logic for encoder-decoder models --- src/pipelines/conversation.rs | 33 
+++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/src/pipelines/conversation.rs b/src/pipelines/conversation.rs index 9c943c66..6e026cc2 100644 --- a/src/pipelines/conversation.rs +++ b/src/pipelines/conversation.rs @@ -56,11 +56,11 @@ //! from the 3rd party utilization of the pretrained system. use crate::common::error::RustBertError; use crate::gpt2::GPT2Generator; -use crate::t5::T5Generator; use crate::pipelines::common::{ModelType, TokenizerOption}; use crate::pipelines::generation_utils::private_generation_utils::PrivateLanguageGenerator; use crate::pipelines::generation_utils::{GenerateConfig, LanguageGenerator}; use crate::resources::ResourceProvider; +use crate::t5::T5Generator; use std::collections::HashMap; use tch::{Device, Kind, Tensor}; use uuid::Uuid; @@ -737,9 +737,7 @@ impl ConversationOption { Self::GPT2(model_ref) => { Ok(*model_ref.get_eos_ids().as_ref().unwrap().first().unwrap()) } - Self::T5(model_ref) => { - Ok(*model_ref.get_eos_ids().as_ref().unwrap().first().unwrap()) - } + Self::T5(model_ref) => Ok(*model_ref.get_eos_ids().as_ref().unwrap().first().unwrap()), } } @@ -786,6 +784,14 @@ impl ConversationOption { .collect(), } } + + /// Interface method to get the model family (encoder-decoder or decoder) + fn is_encoder_decoder(&self) -> bool { + match *self { + Self::GPT2(ref generator) => generator.is_encoder_decoder(), + Self::T5(ref generator) => generator.is_encoder_decoder(), + } + } } /// # Conversation model @@ -925,8 +931,6 @@ impl ConversationModel { let mut output = HashMap::with_capacity(active_uuid.len()); - println!("generated: {:#?}, prompt_ids: {:#?}", &generated, &prompt_ids); - for ( ((conversation, (generated_sequence, conversation_promp_ids)), uuid), removed_padding, @@ -936,7 +940,11 @@ impl ConversationModel { .zip(active_uuid.into_iter()) .zip(removed_padding_quantities.into_iter()) { - let generated_response = &generated_sequence[input_length - removed_padding.0..]; + let generated_response = if self.model.is_encoder_decoder() { + generated_sequence.as_slice() + } else { + &generated_sequence[input_length - removed_padding.0..] + }; conversation .generated_responses .push( @@ -1044,9 +1052,14 @@ impl ConversationModel { .get(input_idx as i64) .slice(0, 0, (max_len - input.len()) as i64, 1) .fill_(0); - let mut padded_input = vec![pad_token; max_len - input.len()]; - padded_input.extend(input); - padded_input + let padding = vec![pad_token; max_len - input.len()]; + if self.model.is_encoder_decoder() { + // right padding assumed for encoder-decoders + [input, &padding].concat() + } else { + // left padding assumed for decoders + [&padding, input].concat() + } }) .map(|tokens| Tensor::of_slice(&tokens).to(self.device)) .collect::>();
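The padding and response-slicing changes in [PATCH 4/4] can be summarised with a small standalone sketch (illustration only, not part of the patch; the helper names are hypothetical and the removed-padding offset used by the real pipeline is omitted for brevity): decoder-only conversation models (the GPT2 variant) are left-padded and echo the prompt in their output, while encoder-decoder models (the T5 variant) are right-padded and emit only the response.

    // Standalone illustration of the padding and slicing behaviour above.
    // `pad_prompt` and `extract_response` are hypothetical helper names, not pipeline API.
    fn pad_prompt(input: &[i64], pad_token: i64, max_len: usize, is_encoder_decoder: bool) -> Vec<i64> {
        let padding = vec![pad_token; max_len - input.len()];
        if is_encoder_decoder {
            // Right padding: the encoder reads the prompt, padding trails it.
            [input, padding.as_slice()].concat()
        } else {
            // Left padding: the decoder continues the prompt, padding precedes it.
            [padding.as_slice(), input].concat()
        }
    }

    fn extract_response(generated: &[i64], prompt_len: usize, is_encoder_decoder: bool) -> Vec<i64> {
        if is_encoder_decoder {
            // The decoder output already contains only the response tokens.
            generated.to_vec()
        } else {
            // The decoder output repeats the prompt; drop it before storing the response.
            generated[prompt_len..].to_vec()
        }
    }

    fn main() {
        let prompt = [5i64, 6, 7];
        assert_eq!(pad_prompt(&prompt, 0, 5, false), vec![0, 0, 5, 6, 7]); // decoder-only
        assert_eq!(pad_prompt(&prompt, 0, 5, true), vec![5, 6, 7, 0, 0]); // encoder-decoder
        assert_eq!(extract_response(&[5, 6, 7, 8, 9], prompt.len(), false), vec![8, 9]);
        assert_eq!(extract_response(&[8, 9], prompt.len(), true), vec![8, 9]);
    }

In the actual pipeline the decoder-only slice additionally subtracts the amount of padding removed from each prompt, as shown in the hunk above; the sketch only captures the direction of padding and which part of the generated sequence is kept.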