From 7e00a22f8827cbb24b963c13fdf60caa3ba1b4ff Mon Sep 17 00:00:00 2001 From: Dario Cancelliere Date: Thu, 11 May 2023 02:54:21 +0200 Subject: [PATCH 1/4] Added GODEL support --- src/pipelines/conversation.rs | 21 ++++++++++++++++++++- src/t5/t5_model.rs | 15 +++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/src/pipelines/conversation.rs b/src/pipelines/conversation.rs index b8f46e5f..62f9e7ce 100644 --- a/src/pipelines/conversation.rs +++ b/src/pipelines/conversation.rs @@ -12,7 +12,8 @@ // limitations under the License. //! # Multi-turn dialogue -//! Conversation model based on Microsoft's [DialoGPT](https://github.com/microsoft/DialoGPT). +//! Conversation model based on Microsoft's [DialoGPT](https://github.com/microsoft/DialoGPT) or +//! [GODEL](https://github.com/microsoft/GODEL). //! This pipeline allows the generation of single or multi-turn conversations between a human and a model. //! The DialoGPT's page states that //! > The human evaluation results indicate that the response generated from DialoGPT is comparable to human response quality @@ -55,6 +56,7 @@ //! from the 3rd party utilization of the pretrained system. use crate::common::error::RustBertError; use crate::gpt2::GPT2Generator; +use crate::t5::T5Generator; use crate::pipelines::common::{ModelType, TokenizerOption}; use crate::pipelines::generation_utils::private_generation_utils::PrivateLanguageGenerator; use crate::pipelines::generation_utils::{GenerateConfig, LanguageGenerator}; @@ -695,12 +697,14 @@ impl Default for ConversationManager { pub enum ConversationOption { /// Conversation based on GPT2 model GPT2(GPT2Generator), + T5(T5Generator), } impl ConversationOption { pub fn new(config: ConversationConfig) -> Result { match config.model_type { ModelType::GPT2 => Ok(ConversationOption::GPT2(GPT2Generator::new(config.into())?)), + ModelType::T5 => Ok(ConversationOption::T5(T5Generator::new(config.into())?)), _ => Err(RustBertError::InvalidConfigurationError( "GPT2 is currently the only supported model for conversation generation" .to_string(), @@ -717,6 +721,10 @@ impl ConversationOption { config.into(), tokenizer, )?)), + ModelType::T5 => Ok(ConversationOption::T5(T5Generator::new_with_tokenizer( + config.into(), + tokenizer, + )?)), _ => Err(RustBertError::InvalidConfigurationError( "GPT2 is currently the only supported model for conversation generation" .to_string(), @@ -729,6 +737,9 @@ impl ConversationOption { Self::GPT2(model_ref) => { Ok(*model_ref.get_eos_ids().as_ref().unwrap().first().unwrap()) } + Self::T5(model_ref) => { + Ok(*model_ref.get_eos_ids().as_ref().unwrap().first().unwrap()) + } } } @@ -736,6 +747,7 @@ impl ConversationOption { pub fn get_tokenizer(&self) -> &TokenizerOption { match self { Self::GPT2(model_ref) => model_ref._get_tokenizer(), + Self::T5(model_ref) => model_ref._get_tokenizer(), } } @@ -743,6 +755,7 @@ impl ConversationOption { pub fn get_tokenizer_mut(&mut self) -> &TokenizerOption { match self { Self::GPT2(model_ref) => model_ref._get_tokenizer_mut(), + Self::T5(model_ref) => model_ref._get_tokenizer_mut(), } } @@ -750,6 +763,7 @@ impl ConversationOption { pub fn model_type(&self) -> ModelType { match *self { Self::GPT2(_) => ModelType::GPT2, + Self::T5(_) => ModelType::T5, } } @@ -765,6 +779,11 @@ impl ConversationOption { .into_iter() .map(|output| output.indices) .collect(), + Self::T5(ref model) => model + .generate_from_ids_and_past(input_ids, attention_mask, None) + .into_iter() + .map(|output| output.indices) + .collect(), } } } diff --git 
a/src/t5/t5_model.rs b/src/t5/t5_model.rs index 204c4560..59d11807 100644 --- a/src/t5/t5_model.rs +++ b/src/t5/t5_model.rs @@ -61,6 +61,11 @@ impl T5ModelResources { "sentence-t5-base/model", "https://huggingface.co/sentence-transformers/sentence-t5-base/resolve/main/rust_model.ot", ); + /// Shared under MIT license by the Microsoft team at . Modified with conversion to C-array format. + pub const GODEL_V1_1_BASE: (&'static str, &'static str) = ( + "godel-v1-1-base/model", + "https://huggingface.co/microsoft/GODEL-v1_1-base-seq2seq/resolve/main/rust_model.ot", + ); } impl T5ConfigResources { @@ -79,6 +84,11 @@ impl T5ConfigResources { "sentence-t5-base/config", "https://huggingface.co/sentence-transformers/sentence-t5-base/resolve/main/config.json", ); + /// Shared under MIT license by the Microsoft team at . Modified with conversion to C-array format. + pub const GODEL_V1_1_BASE: (&'static str, &'static str) = ( + "godel-v1-1-base/config", + "https://huggingface.co/microsoft/GODEL-v1_1-base-seq2seq/resolve/main/config.json", + ); } impl T5VocabResources { @@ -97,6 +107,11 @@ impl T5VocabResources { "sentence-t5-base/spiece", "https://huggingface.co/sentence-transformers/sentence-t5-base/resolve/main/spiece.model", ); + /// Shared under MIT license by the Microsoft team at . Modified with conversion to C-array format. + pub const GODEL_V1_1_BASE: (&'static str, &'static str) = ( + "godel-v1-1-base/spiece", + "https://huggingface.co/microsoft/GODEL-v1_1-base-seq2seq/resolve/main/spiece.model", + ); } const T5LANGUAGES: [Language; 3] = [Language::English, Language::French, Language::German]; From a3f484e1afa73ce0070196706fa61f828a395445 Mon Sep 17 00:00:00 2001 From: Dario Cancelliere Date: Thu, 11 May 2023 03:47:10 +0200 Subject: [PATCH 2/4] Added other missing resources --- src/pipelines/conversation.rs | 2 ++ src/t5/t5_model.rs | 19 +++++++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/src/pipelines/conversation.rs b/src/pipelines/conversation.rs index 62f9e7ce..ce450160 100644 --- a/src/pipelines/conversation.rs +++ b/src/pipelines/conversation.rs @@ -925,6 +925,8 @@ impl ConversationModel { let mut output = HashMap::with_capacity(active_uuid.len()); + println!("generated: {:#?}, prompt_ids: {:#?}", &generated, &prompt_ids); + for ( ((conversation, (generated_sequence, conversation_promp_ids)), uuid), removed_padding, diff --git a/src/t5/t5_model.rs b/src/t5/t5_model.rs index 59d11807..f93e210f 100644 --- a/src/t5/t5_model.rs +++ b/src/t5/t5_model.rs @@ -66,6 +66,11 @@ impl T5ModelResources { "godel-v1-1-base/model", "https://huggingface.co/microsoft/GODEL-v1_1-base-seq2seq/resolve/main/rust_model.ot", ); + /// Shared under MIT license by the Microsoft team at . Modified with conversion to C-array format. + pub const GODEL_V1_1_LARGE: (&'static str, &'static str) = ( + "godel-v1-1-large/model", + "https://huggingface.co/microsoft/GODEL-v1_1-large-seq2seq/resolve/main/rust_model.ot", + ); } impl T5ConfigResources { @@ -89,6 +94,11 @@ impl T5ConfigResources { "godel-v1-1-base/config", "https://huggingface.co/microsoft/GODEL-v1_1-base-seq2seq/resolve/main/config.json", ); + /// Shared under MIT license by the Microsoft team at . Modified with conversion to C-array format. 
+ pub const GODEL_V1_1_LARGE: (&'static str, &'static str) = ( + "godel-v1-1-large/config", + "https://huggingface.co/microsoft/GODEL-v1_1-large-seq2seq/resolve/main/config.json", + ); } impl T5VocabResources { @@ -107,10 +117,15 @@ impl T5VocabResources { "sentence-t5-base/spiece", "https://huggingface.co/sentence-transformers/sentence-t5-base/resolve/main/spiece.model", ); - /// Shared under MIT license by the Microsoft team at . Modified with conversion to C-array format. + /// Shared under Apache 2.0 license by the Google team at . pub const GODEL_V1_1_BASE: (&'static str, &'static str) = ( "godel-v1-1-base/spiece", - "https://huggingface.co/microsoft/GODEL-v1_1-base-seq2seq/resolve/main/spiece.model", + "https://huggingface.co/t5-base/resolve/main/spiece.model", + ); + /// Shared under Apache 2.0 license by the Google team at . + pub const GODEL_V1_1_LARGE: (&'static str, &'static str) = ( + "godel-v1-1-large/spiece", + "https://huggingface.co/t5-large/resolve/main/spiece.model", ); } From 5f9500c54a4e06cec3c999e20e91a4ab3f93514a Mon Sep 17 00:00:00 2001 From: guillaume-be Date: Thu, 11 May 2023 18:35:35 +0100 Subject: [PATCH 3/4] `tch 0.12.0` Update (#379) * Fix 0.12 breaking changes * Fix Clippy warnings * Updated changelog --- CHANGELOG.md | 2 +- Cargo.toml | 4 +- benches/tensor_operations_benchmark.rs | 4 +- src/albert/albert_model.rs | 2 +- src/albert/attention.rs | 4 +- src/bart/attention.rs | 2 +- src/bart/bart_model.rs | 10 +-- src/bert/bert_model.rs | 4 +- src/common/dropout.rs | 2 +- src/deberta/attention.rs | 20 +++--- src/deberta/embeddings.rs | 4 +- src/deberta_v2/attention.rs | 18 ++--- src/deberta_v2/encoder.rs | 4 +- src/electra/electra_model.rs | 2 +- src/fnet/attention.rs | 4 +- src/gpt2/attention.rs | 6 +- src/gpt_j/attention.rs | 20 +++--- src/gpt_neo/attention.rs | 6 +- src/longformer/attention.rs | 38 +++++----- src/longformer/embeddings.rs | 2 +- src/longformer/encoder.rs | 3 +- src/longformer/longformer_model.rs | 11 ++- src/longt5/attention.rs | 34 +++++---- src/longt5/encoder.rs | 6 +- src/m2m_100/embeddings.rs | 2 +- src/mbart/mbart_model.rs | 2 +- src/mobilebert/embeddings.rs | 2 +- src/mobilebert/mobilebert_model.rs | 2 +- src/pegasus/embeddings.rs | 2 +- src/pipelines/conversation.rs | 2 +- src/pipelines/generation_utils.rs | 69 +++++++++++-------- src/pipelines/keywords_extraction/scorer.rs | 11 +-- src/pipelines/masked_language.rs | 3 +- src/pipelines/sentence_embeddings/pipeline.rs | 10 +-- src/prophetnet/attention.rs | 28 ++++---- src/prophetnet/decoder.rs | 22 +++--- src/prophetnet/embeddings.rs | 3 +- src/prophetnet/encoder.rs | 2 +- src/reformer/attention.rs | 58 +++++++++------- src/reformer/attention_utils.rs | 4 +- src/reformer/embeddings.rs | 9 +-- src/reformer/reformer_model.rs | 8 +-- src/roberta/embeddings.rs | 2 +- src/t5/attention.rs | 6 +- src/t5/encoder.rs | 11 +-- src/xlnet/attention.rs | 53 ++++++++------ src/xlnet/xlnet_model.rs | 36 +++++----- tests/deberta.rs | 6 +- tests/deberta_v2.rs | 6 +- tests/fnet.rs | 5 +- tests/longformer.rs | 5 +- tests/mobilebert.rs | 5 +- tests/xlnet.rs | 8 +-- 53 files changed, 323 insertions(+), 271 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a4ca5bd3..8b637c79 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,7 @@ All notable changes to this project will be documented in this file. The format ## Changed - Bumped the tokenizers dependency from 7.x to 8.x, exposing additional options for special token mapping and adding the NLLBTokenizer. 
- (BREAKING) Simplified the generation traits (removal of LMHeadModel and elimination of unnecessary specification for LanguageGenerator) -- Upgraded to `torch` 2.0 (via `tch` 0.11.0). +- Upgraded to `torch` 2.0 (via `tch` 0.12.0). ## Fixed - MIN/MAX computation for float-like (was set to infinity instead of min/max) diff --git a/Cargo.toml b/Cargo.toml index 0e3cf453..b0e025d7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -70,7 +70,7 @@ features = ["doc-only"] [dependencies] rust_tokenizers = "8.1" -tch = "0.11.0" +tch = "0.12.0" serde_json = "1" serde = { version = "1", features = ["derive"] } ordered-float = "3" @@ -88,6 +88,6 @@ anyhow = "1" csv = "1" criterion = "0.4" tokio = { version = "1.24", features = ["sync", "rt-multi-thread", "macros"] } -torch-sys = "0.11.0" +torch-sys = "0.12.0" tempfile = "3" itertools = "0.10" diff --git a/benches/tensor_operations_benchmark.rs b/benches/tensor_operations_benchmark.rs index 2ef3d140..6c47acb3 100644 --- a/benches/tensor_operations_benchmark.rs +++ b/benches/tensor_operations_benchmark.rs @@ -21,8 +21,8 @@ fn bench_tensor_ops(c: &mut Criterion) { unsafe { torch_sys::dummy_cuda_dependency(); } - let input = Tensor::rand(&[32, 128, 512], (Kind::Float, Device::cuda_if_available())); - let weights = Tensor::rand(&[512, 512], (Kind::Float, Device::cuda_if_available())); + let input = Tensor::rand([32, 128, 512], (Kind::Float, Device::cuda_if_available())); + let weights = Tensor::rand([512, 512], (Kind::Float, Device::cuda_if_available())); let _ = &input.matmul(&weights); c.bench_function("Matrix multiply ", |b| { diff --git a/src/albert/albert_model.rs b/src/albert/albert_model.rs index 1d957c8a..d55bf0cb 100644 --- a/src/albert/albert_model.rs +++ b/src/albert/albert_model.rs @@ -257,7 +257,7 @@ impl AlbertModel { get_shape_and_device_from_ids_embeddings_pair(input_ids, input_embeds)?; let calc_mask = if mask.is_none() { - Some(Tensor::ones(&input_shape, (Kind::Int64, device))) + Some(Tensor::ones(input_shape, (Kind::Int64, device))) } else { None }; diff --git a/src/albert/attention.rs b/src/albert/attention.rs index 97fa5393..1a2574a9 100644 --- a/src/albert/attention.rs +++ b/src/albert/attention.rs @@ -130,8 +130,8 @@ impl AlbertSelfAttention { self.hidden_size, )); - let context: Tensor = - Tensor::einsum("bfnd,ndh->bfh", &[context, w], None) + self.dense.bs.as_ref().unwrap(); + let context: Tensor = Tensor::einsum("bfnd,ndh->bfh", &[context, w], None::) + + self.dense.bs.as_ref().unwrap(); let context = (input_ids + context.apply_t(&self.dropout, train)).apply(&self.layer_norm); if !self.output_attentions { diff --git a/src/bart/attention.rs b/src/bart/attention.rs index 4b19d963..394a8c44 100644 --- a/src/bart/attention.rs +++ b/src/bart/attention.rs @@ -176,7 +176,7 @@ impl BartAttention { .bmm(&value_states) .view([bs, self.num_heads, target_length, self.head_dim]) .transpose(1, 2) - .reshape(&[bs, target_length, embed_dim]) + .reshape([bs, target_length, embed_dim]) .apply(&self.out_proj); (attention_output, saved_attention_weights, new_layer_state) diff --git a/src/bart/bart_model.rs b/src/bart/bart_model.rs index 1f416c29..53ba80dc 100644 --- a/src/bart/bart_model.rs +++ b/src/bart/bart_model.rs @@ -270,7 +270,7 @@ pub(crate) fn _make_causal_mask( let target_length = input_ids_shape[1]; let mut mask = Tensor::full( - &[target_length, target_length], + [target_length, target_length], get_min(dtype).unwrap(), (dtype, device), ); @@ -283,14 +283,14 @@ pub(crate) fn _make_causal_mask( if past_key_values_length > 0 { mask = Tensor::cat( 
&[ - Tensor::zeros(&[target_length, past_key_values_length], (dtype, device)), + Tensor::zeros([target_length, past_key_values_length], (dtype, device)), mask, ], -1, ); } mask.unsqueeze(0).unsqueeze(0).expand( - &[ + [ batch_size, 1, target_length, @@ -306,7 +306,7 @@ pub(crate) fn _expand_mask(mask: &Tensor, target_length: Option, dtype: Kin let expanded_mask = mask .unsqueeze(1) .unsqueeze(1) - .expand(&[batch_size, 1, target_length, source_length], true) + .expand([batch_size, 1, target_length, source_length], true) .totype(dtype); let inverted_mask: Tensor = 1 - expanded_mask; inverted_mask.masked_fill(&inverted_mask.to_kind(Kind::Bool), get_min(dtype).unwrap()) @@ -863,7 +863,7 @@ impl BartForSequenceClassification { let reshape = eos_mask.sum_dim_intlist([1].as_slice(), true, input_ids.kind()); let sentence_representation = base_model_output .decoder_output - .permute(&[2, 0, 1]) + .permute([2, 0, 1]) .masked_select(&eos_mask) .view((-1, reshape.size()[0] * reshape.int64_value(&[0, 0]))) .transpose(0, 1) diff --git a/src/bert/bert_model.rs b/src/bert/bert_model.rs index 84960abe..2b567ccd 100644 --- a/src/bert/bert_model.rs +++ b/src/bert/bert_model.rs @@ -370,7 +370,7 @@ impl BertModel { 2 => { if self.is_decoder { let seq_ids = Tensor::arange(input_shape[1], (Kind::Int8, device)); - let causal_mask = seq_ids.unsqueeze(0).unsqueeze(0).repeat(&[ + let causal_mask = seq_ids.unsqueeze(0).unsqueeze(0).repeat([ input_shape[0], input_shape[1], 1, @@ -407,7 +407,7 @@ impl BertModel { let encoder_mask = match encoder_mask { Some(value) => value.copy(), None => Tensor::ones( - &[ + [ encoder_hidden_states_shape[0], encoder_hidden_states_shape[1], ], diff --git a/src/common/dropout.rs b/src/common/dropout.rs index 8c5adf16..e35ab604 100644 --- a/src/common/dropout.rs +++ b/src/common/dropout.rs @@ -43,7 +43,7 @@ impl XDropout { impl ModuleT for XDropout { fn forward_t(&self, input: &Tensor, train: bool) -> Tensor { if train { - let mask = (Tensor::ones(&[1], (input.kind(), input.device())) + let mask = (Tensor::ones([1], (input.kind(), input.device())) - input .empty_like() .bernoulli_float_(1_f64 - self.dropout_prob)) diff --git a/src/deberta/attention.rs b/src/deberta/attention.rs index 505d057f..948b01fd 100644 --- a/src/deberta/attention.rs +++ b/src/deberta/attention.rs @@ -37,7 +37,7 @@ pub trait DisentangledSelfAttention { pub fn build_relative_position(query_size: i64, key_size: i64, device: Device) -> Tensor { let q_ids = Tensor::arange(query_size, (Kind::Int64, device)); let k_ids = Tensor::arange(key_size, (Kind::Int64, device)); - let rel_pos_ids = q_ids.unsqueeze(-1) - k_ids.view([1, -1]).repeat(&[query_size, 1]); + let rel_pos_ids = q_ids.unsqueeze(-1) - k_ids.view([1, -1]).repeat([query_size, 1]); rel_pos_ids.slice(0, 0, query_size, 1).unsqueeze(0) } @@ -62,7 +62,7 @@ impl DebertaDisentangledSelfAttention { let mut new_shape = x.size(); let _ = new_shape.pop(); new_shape.extend_from_slice(&[self.num_attention_heads, -1]); - x.view(new_shape.as_slice()).permute(&[0, 2, 1, 3]) + x.view(new_shape.as_slice()).permute([0, 2, 1, 3]) } fn linear(&self, weights: &Tensor, bias: Option<&Tensor>, x: &Tensor) -> Tensor { @@ -81,7 +81,7 @@ impl DebertaDisentangledSelfAttention { ) -> Tensor { let query_layer_size = query_layer.size(); c2p_pos.expand( - &[ + [ query_layer_size[0], query_layer_size[1], query_layer_size[2], @@ -101,7 +101,7 @@ impl DebertaDisentangledSelfAttention { let mut key_layer_size = key_layer.size(); key_layer_size.reverse(); c2p_pos.expand( - &[ + [ 
query_layer_size[0], query_layer_size[1], key_layer_size[1], @@ -182,7 +182,7 @@ impl DebertaDisentangledSelfAttention { ) .unsqueeze(0); - let mut score = Tensor::zeros(&[1], (query_layer.kind(), key_layer.device())); + let mut score = Tensor::zeros([1], (query_layer.kind(), key_layer.device())); // content -> position if let Some(pos_proj) = &self.pos_proj { @@ -410,9 +410,9 @@ impl DisentangledSelfAttention for DebertaDisentangledSelfAttention { if let Some(head_logits_proj) = &self.head_logits_proj { attention_scores = attention_scores - .permute(&[0, 2, 3, 1]) + .permute([0, 2, 3, 1]) .apply(head_logits_proj) - .permute(&[0, 3, 1, 2]); + .permute([0, 3, 1, 2]); } let mut attention_probs = @@ -420,14 +420,14 @@ impl DisentangledSelfAttention for DebertaDisentangledSelfAttention { if let Some(head_weights_proj) = &self.head_weights_proj { attention_probs = attention_probs - .permute(&[0, 2, 3, 1]) + .permute([0, 2, 3, 1]) .apply(head_weights_proj) - .permute(&[0, 3, 1, 2]); + .permute([0, 3, 1, 2]); } let context_layer = attention_probs .matmul(&value_layer) - .permute(&[0, 2, 1, 3]) + .permute([0, 2, 1, 3]) .contiguous(); let mut new_context_layer_shape = context_layer.size(); diff --git a/src/deberta/embeddings.rs b/src/deberta/embeddings.rs index e618c568..0bef5e1e 100644 --- a/src/deberta/embeddings.rs +++ b/src/deberta/embeddings.rs @@ -127,7 +127,7 @@ where let calc_position_ids = if position_ids.is_none() { Some( Tensor::arange(seq_length, (Kind::Int64, input_embeddings.device())) - .expand(&[1, -1], true), + .expand([1, -1], true), ) } else { None @@ -135,7 +135,7 @@ where let calc_token_type_ids = if token_type_ids.is_none() { Some(Tensor::zeros( - &input_shape, + input_shape, (Kind::Int64, input_embeddings.device()), )) } else { diff --git a/src/deberta_v2/attention.rs b/src/deberta_v2/attention.rs index 1c243bc6..4cc431f9 100644 --- a/src/deberta_v2/attention.rs +++ b/src/deberta_v2/attention.rs @@ -51,7 +51,7 @@ pub fn build_relative_position( ) -> Tensor { let q_ids = Tensor::arange(query_size, (Kind::Int64, device)); let k_ids = Tensor::arange(key_size, (Kind::Int64, device)); - let mut rel_pos_ids = q_ids.unsqueeze(-1) - k_ids.tile(&[q_ids.size()[0], 1]); + let mut rel_pos_ids = q_ids.unsqueeze(-1) - k_ids.tile([q_ids.size()[0], 1]); if (bucket_size > 0) & (max_position > 0) { rel_pos_ids = make_log_bucket_position(&rel_pos_ids, bucket_size, max_position); } @@ -80,7 +80,7 @@ impl DebertaV2DisentangledSelfAttention { let _ = new_shape.pop(); new_shape.extend_from_slice(&[self.num_attention_heads, -1]); let x = x.view(new_shape.as_slice()); - x.permute(&[0, 2, 1, 3]) + x.permute([0, 2, 1, 3]) .contiguous() .view([-1, x.size()[1], *x.size().last().unwrap()]) } @@ -133,12 +133,12 @@ impl DebertaV2DisentangledSelfAttention { let pos_query_layer = self .transpose_for_scores(&relative_embeddings.apply(query_proj)) - .repeat(&[query_layer.size()[0] / self.num_attention_heads, 1, 1]); + .repeat([query_layer.size()[0] / self.num_attention_heads, 1, 1]); let pos_key_layer = self .transpose_for_scores(&relative_embeddings.apply(key_proj)) - .repeat(&[query_layer.size()[0] / self.num_attention_heads, 1, 1]); + .repeat([query_layer.size()[0] / self.num_attention_heads, 1, 1]); - let mut score = Tensor::zeros(&[1], (query_layer.kind(), query_layer.device())); + let mut score = Tensor::zeros([1], (query_layer.kind(), query_layer.device())); let c2p_pos = if self.pos_att_type.has_type(PositionAttentionType::c2p) | self.pos_att_type.has_type(PositionAttentionType::p2p) @@ -149,7 +149,7 @@ 
impl DebertaV2DisentangledSelfAttention { let c2p_att = c2p_att.gather( -1, &c2p_pos.squeeze_dim(0).expand( - &[ + [ query_layer.size()[0], query_layer.size()[1], *relative_pos.size().last().unwrap(), @@ -186,7 +186,7 @@ impl DebertaV2DisentangledSelfAttention { .gather( -1, &p2c_pos.squeeze_dim(0).expand( - &[query_layer.size()[0], key_layer_size[1], key_layer_size[1]], + [query_layer.size()[0], key_layer_size[1], key_layer_size[1]], true, ), true, @@ -203,7 +203,7 @@ impl DebertaV2DisentangledSelfAttention { let p2p_att = p2p_att.gather( -1, &c2p_pos.unwrap().expand( - &[ + [ query_layer.size()[0], query_layer.size()[1], query_layer.size()[2], @@ -402,7 +402,7 @@ impl DisentangledSelfAttention for DebertaV2DisentangledSelfAttention { reverse_context_layer_size[1], reverse_context_layer_size[0], ]) - .permute(&[0, 2, 1, 3]) + .permute([0, 2, 1, 3]) .contiguous(); let mut new_context_layer_shape = context_layer.size(); diff --git a/src/deberta_v2/encoder.rs b/src/deberta_v2/encoder.rs index fbf77995..e9d86d4c 100644 --- a/src/deberta_v2/encoder.rs +++ b/src/deberta_v2/encoder.rs @@ -78,10 +78,10 @@ impl ConvLayer { train: bool, ) -> Tensor { let out = hidden_states - .permute(&[0, 2, 1]) + .permute([0, 2, 1]) .contiguous() .apply(&self.conv) - .permute(&[0, 2, 1]) + .permute([0, 2, 1]) .contiguous(); let reverse_mask: Tensor = 1 - input_mask; let out = out.masked_fill( diff --git a/src/electra/electra_model.rs b/src/electra/electra_model.rs index 3f9b2f14..429923e0 100644 --- a/src/electra/electra_model.rs +++ b/src/electra/electra_model.rs @@ -266,7 +266,7 @@ impl ElectraModel { get_shape_and_device_from_ids_embeddings_pair(input_ids, input_embeds)?; let calc_mask = if mask.is_none() { - Some(Tensor::ones(&input_shape, (Kind::Int64, device))) + Some(Tensor::ones(input_shape, (Kind::Int64, device))) } else { None }; diff --git a/src/fnet/attention.rs b/src/fnet/attention.rs index af5037d3..fc4a6377 100644 --- a/src/fnet/attention.rs +++ b/src/fnet/attention.rs @@ -42,7 +42,9 @@ impl FNetFourierTransform { } pub fn forward(&self, hidden_states: &Tensor) -> Tensor { - let self_outputs = hidden_states.fft_fft2(None, &[1, 2], "backward").real(); + let self_outputs = hidden_states + .fft_fft2(None::, [1, 2], "backward") + .real(); (self_outputs + hidden_states).apply(&self.layer_norm) } } diff --git a/src/gpt2/attention.rs b/src/gpt2/attention.rs index de07f79d..9602341c 100644 --- a/src/gpt2/attention.rs +++ b/src/gpt2/attention.rs @@ -71,7 +71,7 @@ impl Attention { { let p = p.borrow(); - let bias = Tensor::ones(&[config.n_ctx, config.n_ctx], (Float, p.device())) + let bias = Tensor::ones([config.n_ctx, config.n_ctx], (Float, p.device())) .tril(0) .view((1, 1, config.n_ctx, config.n_ctx)); @@ -111,9 +111,9 @@ impl Attention { fn split_heads(&self, x: &Tensor, k: bool) -> Tensor { let x = x.view((x.size()[0], -1, self.n_head, self.dim_per_head)); if k { - x.permute(&[0, 2, 3, 1]) + x.permute([0, 2, 3, 1]) } else { - x.permute(&[0, 2, 1, 3]) + x.permute([0, 2, 1, 3]) } } diff --git a/src/gpt_j/attention.rs b/src/gpt_j/attention.rs index cbfb87da..4e7bcd91 100644 --- a/src/gpt_j/attention.rs +++ b/src/gpt_j/attention.rs @@ -68,7 +68,7 @@ impl GptJAttention { let p = p.borrow(); let max_positions = config.n_positions; - let bias = Tensor::ones(&[max_positions, max_positions], (Kind::Uint8, p.device())) + let bias = Tensor::ones([max_positions, max_positions], (Kind::Uint8, p.device())) .tril(0) .view([1, 1, max_positions, max_positions]) .requires_grad_(false); @@ -142,9 +142,9 @@ impl 
GptJAttention { if rotary { tensor } else if tensor.size().len() == 5 { - tensor.permute(&[0, 1, 3, 2, 4]) // (batch, blocks, head, block_length, head_features) + tensor.permute([0, 1, 3, 2, 4]) // (batch, blocks, head, block_length, head_features) } else if tensor.size().len() == 4 { - tensor.permute(&[0, 2, 1, 3]) // (batch, head, seq_length, head_features) + tensor.permute([0, 2, 1, 3]) // (batch, head, seq_length, head_features) } else { panic!( "Input tensor should either be a rotary head, or its rank be one of [4, 5] but is: {}", @@ -155,9 +155,9 @@ impl GptJAttention { fn merge_heads(tensor: &Tensor, num_heads: i64, attention_head_size: i64) -> Tensor { let tensor = if tensor.size().len() == 5 { - tensor.permute(&[0, 1, 3, 2, 4]).contiguous() + tensor.permute([0, 1, 3, 2, 4]).contiguous() } else if tensor.size().len() == 4 { - tensor.permute(&[0, 2, 1, 3]).contiguous() + tensor.permute([0, 2, 1, 3]).contiguous() } else { panic!( "Input tensor rank should be one of [4, 5], but is: {}", @@ -197,7 +197,7 @@ impl GptJAttention { let mask_value = get_min(attention_weights.kind()).unwrap(); let mask_value = Tensor::full( - &attention_weights.size(), + attention_weights.size(), mask_value, (attention_weights.kind(), attention_weights.device()), ); @@ -261,8 +261,8 @@ impl GptJAttention { query = apply_rotary_pos_emb(&query, &sincos, offset); } - key = key.permute(&[0, 2, 1, 3]); - query = query.permute(&[0, 2, 1, 3]); + key = key.permute([0, 2, 1, 3]); + query = query.permute([0, 2, 1, 3]); if let Some(layer_past) = layer_past { key = Tensor::cat(&[&layer_past.prev_key, &key], -2); @@ -297,7 +297,7 @@ fn fixed_pos_embedding(x: &Tensor, seq_len: i64) -> (Tensor, Tensor) { let sinusoid_inp = Tensor::einsum( "i , j -> i j", &[Tensor::arange(seq_len, (x.kind(), x.device())), inv_freq], - None, + None::, ); (sinusoid_inp.sin(), sinusoid_inp.cos()) } @@ -312,7 +312,7 @@ fn apply_rotary_pos_emb(x: &Tensor, (sin, cos): &(Tensor, Tensor), offset: i64) fn duplicate_interleave(m: &Tensor) -> Tensor { let dim0 = m.size()[0]; m.view([-1, 1]) // flatten the matrix - .repeat(&[1, 2]) // repeat all elements into the 2nd dimension + .repeat([1, 2]) // repeat all elements into the 2nd dimension .view([dim0, -1]) // reshape into a matrix, interleaving the copy } diff --git a/src/gpt_neo/attention.rs b/src/gpt_neo/attention.rs index cb28ee1c..eaa5e087 100644 --- a/src/gpt_neo/attention.rs +++ b/src/gpt_neo/attention.rs @@ -70,7 +70,7 @@ impl GptNeoSelfAttention { let p = p.borrow(); let max_positions = config.max_position_embeddings; - let mut bias = Tensor::ones(&[max_positions, max_positions], (Kind::Uint8, p.device())) + let mut bias = Tensor::ones([max_positions, max_positions], (Kind::Uint8, p.device())) .tril(0) .view([1, 1, max_positions, max_positions]) .requires_grad_(false); @@ -135,11 +135,11 @@ impl GptNeoSelfAttention { let _ = new_shape.pop(); new_shape.extend_from_slice(&[num_heads, attention_head_size]); let reshaped_tensor = input_tensor.view(new_shape.as_slice()); - reshaped_tensor.permute(&[0, 2, 1, 3]) + reshaped_tensor.permute([0, 2, 1, 3]) } fn merge_heads(input_tensor: &Tensor, num_heads: i64, attention_head_size: i64) -> Tensor { - let output_tensor = input_tensor.permute(&[0, 2, 1, 3]).contiguous(); + let output_tensor = input_tensor.permute([0, 2, 1, 3]).contiguous(); let mut new_shape = output_tensor.size(); new_shape.truncate(new_shape.len() - 2); new_shape.push(num_heads * attention_head_size); diff --git a/src/longformer/attention.rs b/src/longformer/attention.rs index 
adcd2f76..862ac73b 100644 --- a/src/longformer/attention.rs +++ b/src/longformer/attention.rs @@ -14,6 +14,7 @@ use crate::common::dropout::Dropout; use crate::common::kind::get_negative_infinity; use crate::longformer::LongformerConfig; use std::borrow::Borrow; +use std::convert::TryFrom; use tch::{nn, Kind, Tensor}; pub struct LongformerSelfAttention { @@ -119,7 +120,7 @@ impl LongformerSelfAttention { ); chunked_hidden_states - .constant_pad_nd(&[0, window_overlap + 1]) + .constant_pad_nd([0, window_overlap + 1]) .view([total_num_heads, num_chunks, -1]) .slice(2, 0, -window_overlap, 1) .view([ @@ -165,15 +166,15 @@ impl LongformerSelfAttention { ]; let beginning_mask = Tensor::ones( - &[affected_sequence_length, affected_sequence_length + 1], + [affected_sequence_length, affected_sequence_length + 1], (Kind::Int, input_tensor.device()), ) .tril(0) - .flip(&[0]) + .flip([0]) .unsqueeze(0) .unsqueeze(2); - let ending_mask = beginning_mask.flip(&[1, 3]); + let ending_mask = beginning_mask.flip([1, 3]); let beginning_mask = beginning_mask .expand(beginning_input_size.as_slice(), true) @@ -214,21 +215,21 @@ impl LongformerSelfAttention { let query = query .transpose(1, 2) - .reshape(&[batch_size * num_heads, sequence_length, head_dim]); + .reshape([batch_size * num_heads, sequence_length, head_dim]); let key = key .transpose(1, 2) - .reshape(&[batch_size * num_heads, sequence_length, head_dim]); + .reshape([batch_size * num_heads, sequence_length, head_dim]); let query = self.chunk(&query, window_overlap); let key = self.chunk(&key, window_overlap); let diagonal_chunked_attention_scores = self.pad_and_transpose_last_two_dims( - &Tensor::einsum("bcxd,bcyd->bcxy", &[query, key], None), + &Tensor::einsum("bcxd,bcyd->bcxy", &[query, key], None::), &[0, 0, 0, 1], ); let diagonal_attention_scores = Tensor::empty( - &[ + [ batch_size * num_heads, chunks_count + 1, window_overlap, @@ -320,7 +321,7 @@ impl LongformerSelfAttention { let (batch_size, sequence_length, num_heads, head_dim) = value.size4().unwrap(); let chunk_counts = sequence_length / window_overlap - 1; - let chunked_attention_probas = attention_probas.transpose(1, 2).reshape(&[ + let chunked_attention_probas = attention_probas.transpose(1, 2).reshape([ batch_size * num_heads, sequence_length / window_overlap, window_overlap, @@ -330,9 +331,9 @@ impl LongformerSelfAttention { let value = value .transpose(1, 2) - .reshape(&[batch_size * num_heads, sequence_length, head_dim]); + .reshape([batch_size * num_heads, sequence_length, head_dim]); - let padded_value = (value + 1).constant_pad_nd(&[0, 0, window_overlap, window_overlap]) - 1; + let padded_value = (value + 1).constant_pad_nd([0, 0, window_overlap, window_overlap]) - 1; let chunked_value_size = &[ batch_size * num_heads, chunk_counts + 1, @@ -353,7 +354,7 @@ impl LongformerSelfAttention { Tensor::einsum( "bcwd,bcdh->bcwh", &[chunked_attention_probas, chunked_value], - None, + None::, ) .view([batch_size, num_heads, sequence_length, head_dim]) .transpose(1, 2) @@ -365,7 +366,8 @@ impl LongformerSelfAttention { ) -> GlobalAttentionIndices { let num_global_attention_indices = is_index_global_attn.sum_dim_intlist([1].as_slice(), false, Kind::Int64); - let max_num_global_attention_indices = i64::from(num_global_attention_indices.max()); + let max_num_global_attention_indices = + i64::try_from(num_global_attention_indices.max()).unwrap(); let is_index_global_attn_nonzero = is_index_global_attn .nonzero_numpy() .into_iter() @@ -411,7 +413,7 @@ impl LongformerSelfAttention { let batch_size 
= key_vectors.size()[0]; let mut key_vectors_only_global = Tensor::zeros( - &[ + [ batch_size, max_num_global_attention_indices, self.num_heads, @@ -429,7 +431,7 @@ impl LongformerSelfAttention { let attention_probas_from_global_key = Tensor::einsum( "blhd,bshd->blhs", &[query_vectors, &key_vectors_only_global], - None, + None::, ); let _ = attention_probas_from_global_key @@ -463,7 +465,7 @@ impl LongformerSelfAttention { let attention_probas_only_global = attention_probas.narrow(-1, 0, max_num_global_attention_indices); let mut value_vectors_only_global = Tensor::zeros( - &[ + [ batch_size, max_num_global_attention_indices, self.num_heads, @@ -513,7 +515,7 @@ impl LongformerSelfAttention { let (sequence_length, batch_size) = (hidden_states_shape[0], hidden_states_shape[1]); let mut global_attention_hidden_states = Tensor::zeros( - &[max_num_global_attention_indices, batch_size, self.embed_dim], + [max_num_global_attention_indices, batch_size, self.embed_dim], (hidden_states.kind(), hidden_states.device()), ); @@ -718,7 +720,7 @@ impl LongformerSelfAttention { let mut attention_output = attention_output .transpose(0, 1) - .reshape(&[sequence_length, batch_size, embed_dim]); + .reshape([sequence_length, batch_size, embed_dim]); let global_attention_probas = if is_global_attention { let (global_attention_output, global_attention_probas) = self diff --git a/src/longformer/embeddings.rs b/src/longformer/embeddings.rs index 5be27501..9ea17b82 100644 --- a/src/longformer/embeddings.rs +++ b/src/longformer/embeddings.rs @@ -93,7 +93,7 @@ impl LongformerEmbeddings { (Kind::Int64, inputs_embeds.device()), ) .unsqueeze(0) - .expand(&[batch_size, sequence_length], true) + .expand([batch_size, sequence_length], true) } pub fn forward_t( diff --git a/src/longformer/encoder.rs b/src/longformer/encoder.rs index bc26fb48..b0a26201 100644 --- a/src/longformer/encoder.rs +++ b/src/longformer/encoder.rs @@ -15,6 +15,7 @@ use crate::common::dropout::Dropout; use crate::longformer::attention::LongformerSelfAttention; use crate::longformer::LongformerConfig; use std::borrow::{Borrow, BorrowMut}; +use std::convert::TryFrom; use tch::nn::Module; use tch::{nn, Tensor}; @@ -293,7 +294,7 @@ impl LongformerEncoder { ) -> LongformerEncoderOutput { let is_index_masked = attention_mask.lt(0); let is_index_global_attention = attention_mask.gt(0); - let is_global_attention = bool::from(is_index_global_attention.any()); + let is_global_attention = bool::try_from(is_index_global_attention.any()).unwrap(); let mut all_hidden_states: Option> = if self.output_hidden_states { Some(vec![]) diff --git a/src/longformer/longformer_model.rs b/src/longformer/longformer_model.rs index 2cd25f0e..f0a9581a 100644 --- a/src/longformer/longformer_model.rs +++ b/src/longformer/longformer_model.rs @@ -393,7 +393,7 @@ impl LongformerModel { .map(|value| self.pad_with_nonzero_value(value, &[0, padding_length], pad_token_id)); let inputs_embeds = input_embeds.map(|value| { let input_ids_padding = Tensor::full( - &[batch_size, padding_length], + [batch_size, padding_length], pad_token_id, (Kind::Int64, value.device()), ); @@ -407,8 +407,7 @@ impl LongformerModel { let attention_mask = attention_mask.map(|value| self.pad_with_boolean(value, &[0, padding_length], false)); - let token_type_ids = - token_type_ids.map(|value| value.constant_pad_nd(&[0, padding_length])); + let token_type_ids = token_type_ids.map(|value| value.constant_pad_nd([0, padding_length])); Ok(PaddedInput { input_ids, position_ids, @@ -584,7 +583,7 @@ impl LongformerModel { 
let mut causal_mask = sequence_ids .unsqueeze(0) .unsqueeze(0) - .repeat(&[batch_size, sequence_length, 1]) + .repeat([batch_size, sequence_length, 1]) .le_tensor(&sequence_ids.unsqueeze(-1).unsqueeze(0)) .totype(Kind::Int); if causal_mask.size()[1] < padded_attention_mask.size()[1] { @@ -593,7 +592,7 @@ impl LongformerModel { causal_mask = Tensor::cat( &[ Tensor::ones( - &[batch_size, sequence_length, prefix_sequence_length], + [batch_size, sequence_length, prefix_sequence_length], (Kind::Int, device), ), causal_mask, @@ -975,7 +974,7 @@ impl LongformerForSequenceClassification { let (batch_size, sequence_length) = (input_shape[0], input_shape[1]); let global_attention_mask = - Tensor::zeros(&[batch_size, sequence_length], (Kind::Int, device)); + Tensor::zeros([batch_size, sequence_length], (Kind::Int, device)); let _ = global_attention_mask.select(1, 0).fill_(1); Some(global_attention_mask) } else { diff --git a/src/longt5/attention.rs b/src/longt5/attention.rs index efb2e2d2..e3cc116f 100644 --- a/src/longt5/attention.rs +++ b/src/longt5/attention.rs @@ -143,16 +143,16 @@ fn make_global_fixed_block_ids( global_block_ids .max_dim(-1, false) .0 - .repeat(&[num_globals, 1]) + .repeat([num_globals, 1]) .transpose(0, 1) } else { Tensor::zeros( - &[batch_size, 0], + [batch_size, 0], (global_block_ids.kind(), global_block_ids.device()), ) }; let global_segment_ids = Tensor::ones( - &[batch_size, num_globals], + [batch_size, num_globals], (attention_mask.kind(), attention_mask.device()), ) .cumsum(-1, attention_mask.kind()) @@ -190,7 +190,7 @@ fn create_global_aggregates( hidden_states, &one_hot_block_ids.to_kind(hidden_states.kind()), ], - None, + None::, ) } @@ -214,7 +214,7 @@ fn compute_bias( ); rp_bucket .apply(relative_attention_bias) - .permute(&[2, 0, 1]) + .permute([2, 0, 1]) .unsqueeze(0) .unsqueeze(0) } @@ -322,11 +322,15 @@ impl LongT5LocalAttention { let key_states = concatenate_3_blocks(&key_states, 1, 2, None); let value_states = concatenate_3_blocks(&value_states, 1, 2, None); - let mut scores = Tensor::einsum("...qhd,...khd->...hqk", &[query_states, key_states], None); + let mut scores = Tensor::einsum( + "...qhd,...khd->...hqk", + &[query_states, key_states], + None::, + ); let calc_position_bias = if position_bias.is_none() { let mut position_bias = if !self.has_relative_attention_bias { Tensor::zeros( - &[1, 1, self.n_heads, self.block_length, 3 * self.block_length], + [1, 1, self.n_heads, self.block_length, 3 * self.block_length], (scores.kind(), scores.device()), ) } else { @@ -356,7 +360,7 @@ impl LongT5LocalAttention { let attention_output = unshape(&Tensor::einsum( "...hqk,...khd->...qhd", &[&attention_weights, &value_states], - None, + None::, )) .narrow(1, 0, seq_length) .apply(&self.output); @@ -492,7 +496,7 @@ impl LongT5TransientGlobalAttention { ); let side_bias = side_relative_position_bucket .apply(self.global_relative_attention_bias.as_ref().unwrap()) - .permute(&[0, 3, 1, 2]); + .permute([0, 3, 1, 2]); attention_side_bias + side_bias } @@ -551,7 +555,11 @@ impl LongT5TransientGlobalAttention { let key_states = Tensor::cat(&[key_states, side_key_states], 2); let value_states = Tensor::cat(&[value_states, side_value_states], 2); - let mut scores = Tensor::einsum("...qhd,...khd->...hqk", &[query_states, key_states], None); + let mut scores = Tensor::einsum( + "...qhd,...khd->...hqk", + &[query_states, key_states], + None::, + ); let local_attention_mask = mask.map(|mask| { let local_attention_mask = get_local_attention_mask(mask, self.block_length); 
local_attention_mask @@ -562,7 +570,7 @@ impl LongT5TransientGlobalAttention { let calc_position_bias = if position_bias.is_none() { let mut position_bias = if !self.has_relative_attention_bias { Tensor::zeros( - &[1, 1, self.n_heads, self.block_length, 3 * self.block_length], + [1, 1, self.n_heads, self.block_length, 3 * self.block_length], (scores.kind(), scores.device()), ) } else { @@ -579,7 +587,7 @@ impl LongT5TransientGlobalAttention { } let calc_mask = if mask.is_none() { Some(Tensor::ones( - &[batch_size, seq_length], + [batch_size, seq_length], (global_segment_ids.kind(), global_segment_ids.device()), )) } else { @@ -610,7 +618,7 @@ impl LongT5TransientGlobalAttention { let attention_output = unshape(&Tensor::einsum( "...hqk,...khd->...qhd", &[&attention_weights, &value_states], - None, + None::, )) .narrow(1, 0, seq_length) .apply(&self.output); diff --git a/src/longt5/encoder.rs b/src/longt5/encoder.rs index 463b6f92..637e7e0a 100644 --- a/src/longt5/encoder.rs +++ b/src/longt5/encoder.rs @@ -306,7 +306,7 @@ impl LongT5Stack { let calculated_attention_mask = if attention_mask.is_none() { Some(Tensor::ones( - &[batch_size, mask_seq_length], + [batch_size, mask_seq_length], (Kind::Int64, input_embeddings.device()), )) } else { @@ -324,7 +324,7 @@ impl LongT5Stack { sequence_length, (input_embeddings.kind(), input_embeddings.device()), ); - let causal_mask = seq_ids.unsqueeze(0).unsqueeze(0).repeat(&[ + let causal_mask = seq_ids.unsqueeze(0).unsqueeze(0).repeat([ batch_size, sequence_length, 1, @@ -358,7 +358,7 @@ impl LongT5Stack { let new_shape = &encoder_hidden_states.as_ref().unwrap().size()[..2]; let calculated_encoder_attention_mask = if encoder_attention_mask.is_none() { Some(Tensor::ones( - &[batch_size, new_shape[1]], + [batch_size, new_shape[1]], (Kind::Int64, input_embeddings.device()), )) } else { diff --git a/src/m2m_100/embeddings.rs b/src/m2m_100/embeddings.rs index be46de69..23bfb477 100644 --- a/src/m2m_100/embeddings.rs +++ b/src/m2m_100/embeddings.rs @@ -82,7 +82,7 @@ impl SinusoidalPositionalEmbedding { sinusoidal_embedding = Tensor::cat( &[ sinusoidal_embedding, - Tensor::zeros(&[num_embeddings, 1], (Kind::Float, device)), + Tensor::zeros([num_embeddings, 1], (Kind::Float, device)), ], 1, ); diff --git a/src/mbart/mbart_model.rs b/src/mbart/mbart_model.rs index 6193c1c6..f5d51e3c 100644 --- a/src/mbart/mbart_model.rs +++ b/src/mbart/mbart_model.rs @@ -688,7 +688,7 @@ impl MBartForSequenceClassification { let reshape = eos_mask.sum_dim_intlist([1].as_slice(), true, Int64); let sentence_representation = base_model_output .decoder_output - .permute(&[2, 0, 1]) + .permute([2, 0, 1]) .masked_select(&eos_mask) .view((-1, reshape.size()[0] * reshape.int64_value(&[0, 0]))) .transpose(0, 1) diff --git a/src/mobilebert/embeddings.rs b/src/mobilebert/embeddings.rs index 16826ac1..92de72e9 100644 --- a/src/mobilebert/embeddings.rs +++ b/src/mobilebert/embeddings.rs @@ -114,7 +114,7 @@ impl MobileBertEmbeddings { let updated_input_embeddings = if self.trigram_input { let padding_tensor = Tensor::zeros( - &[input_shape[0], 1, self.embedding_size], + [input_shape[0], 1, self.embedding_size], (input_embeddings.kind(), input_embeddings.device()), ); let input_embeddings = Tensor::cat( diff --git a/src/mobilebert/mobilebert_model.rs b/src/mobilebert/mobilebert_model.rs index b34ee5b8..d1cca736 100644 --- a/src/mobilebert/mobilebert_model.rs +++ b/src/mobilebert/mobilebert_model.rs @@ -387,7 +387,7 @@ impl MobileBertModel { }; let position_ids = 
Tensor::arange(config.max_position_embeddings, (Kind::Int64, p.device())) - .expand(&[1, -1], true); + .expand([1, -1], true); MobileBertModel { embeddings, encoder, diff --git a/src/pegasus/embeddings.rs b/src/pegasus/embeddings.rs index b2e53a3f..2a285361 100644 --- a/src/pegasus/embeddings.rs +++ b/src/pegasus/embeddings.rs @@ -73,7 +73,7 @@ impl SinusoidalPositionalEmbedding { let sinusoidal_embeddings = Tensor::stack(&sinusoidal_embedding, 0).to_kind(Kind::Float); let reordered_sinusoidal_embeddings = - Tensor::empty(&[num_embeddings, embedding_dim], (Kind::Float, device)); + Tensor::empty([num_embeddings, embedding_dim], (Kind::Float, device)); reordered_sinusoidal_embeddings .slice(1, 0, sentinel, 1) diff --git a/src/pipelines/conversation.rs b/src/pipelines/conversation.rs index b8f46e5f..bd7e035a 100644 --- a/src/pipelines/conversation.rs +++ b/src/pipelines/conversation.rs @@ -1011,7 +1011,7 @@ impl ConversationModel { .unwrap(); let attention_mask = Tensor::ones( - &[inputs.len() as i64, max_len as i64], + [inputs.len() as i64, max_len as i64], (Kind::Int8, self.device), ); diff --git a/src/pipelines/generation_utils.rs b/src/pipelines/generation_utils.rs index 51c75ec1..936e4a90 100644 --- a/src/pipelines/generation_utils.rs +++ b/src/pipelines/generation_utils.rs @@ -229,6 +229,7 @@ pub enum Cache { pub(crate) mod private_generation_utils { use std::cmp::{max, min}; use std::collections::HashMap; + use std::convert::TryFrom; use std::mem; use rust_tokenizers::tokenizer::{truncate_sequences, TruncationStrategy}; @@ -653,7 +654,7 @@ pub(crate) mod private_generation_utils { bad_words_id_length_1: &[i64], ) -> Tensor { let mut static_bad_words_mask = - Tensor::zeros(&[scores.size()[1]], (Kind::Int8, scores.device())); + Tensor::zeros([scores.size()[1]], (Kind::Int8, scores.device())); let _ = static_bad_words_mask.index_fill_( 0, &Tensor::of_slice(bad_words_id_length_1).to_device(scores.device()), @@ -766,9 +767,9 @@ pub(crate) mod private_generation_utils { output_scores: bool, ) -> GeneratedOutputWithScores { let mut unfinished_sentences = - Tensor::ones(&[batch_size], (Kind::Int64, self.get_var_store().device())); + Tensor::ones([batch_size], (Kind::Int64, self.get_var_store().device())); let mut sentence_lengths: Tensor = - Tensor::ones(&[batch_size], (Kind::Int64, self.get_var_store().device())); + Tensor::ones([batch_size], (Kind::Int64, self.get_var_store().device())); let (bad_word_ids_length_1, bad_word_ids_length_greater_than_1) = self.split_bad_word_ids(gen_opt.bad_word_ids); let mut static_bad_words_mask: Option = None; @@ -902,7 +903,7 @@ pub(crate) mod private_generation_utils { prev_scores.push( next_token_logits .log_softmax(-1, next_token_logits.kind()) - .gather(1, &next_token.reshape(&[-1, 1]), true) + .gather(1, &next_token.reshape([-1, 1]), true) .squeeze() .masked_fill(&finished_mask, 0), ); @@ -931,7 +932,7 @@ pub(crate) mod private_generation_utils { ); unfinished_sentences = -unfinished_sentences * (sentence_with_eos - 1); } - if i64::from(unfinished_sentences.max()) == 0 { + if i64::try_from(unfinished_sentences.max()).unwrap() == 0 { break; } } @@ -940,7 +941,7 @@ pub(crate) mod private_generation_utils { &[ attention_mask.as_ref(), Tensor::ones( - &[*attention_mask.size().first().unwrap(), 1], + [*attention_mask.size().first().unwrap(), 1], (Kind::Int64, attention_mask.device()), ) .as_ref(), @@ -1022,20 +1023,20 @@ pub(crate) mod private_generation_utils { let vocab_size = self.get_vocab_size(); let beam_scores = Tensor::ones( - &[batch_size, 
gen_opt.num_beams], + [batch_size, gen_opt.num_beams], (Kind::Float, self.get_var_store().device()), ) * -1e9; let _ = beam_scores .slice(1, 0, *beam_scores.size().last().unwrap(), num_sub_beams) .fill_(0); - let mut beam_scores = beam_scores.view_(&[-1]); + let mut beam_scores = beam_scores.view_([-1]); let mut beam_tokens = Tensor::zeros( - &[batch_size * gen_opt.num_beams], + [batch_size * gen_opt.num_beams], (Kind::Int64, self.get_var_store().device()), ); let mut beam_indices = Tensor::zeros( - &[batch_size * gen_opt.num_beams], + [batch_size * gen_opt.num_beams], (Kind::Int64, self.get_var_store().device()), ); let mut saved_beam_scores: Option> = @@ -1052,7 +1053,7 @@ pub(crate) mod private_generation_utils { loop { if num_beam_groups > 1 { current_tokens = Tensor::zeros( - &[batch_size * gen_opt.num_beams], + [batch_size * gen_opt.num_beams], (input_ids.kind(), input_ids.device()), ); } @@ -1364,7 +1365,7 @@ pub(crate) mod private_generation_utils { &[ attention_mask.as_ref(), Tensor::ones( - &[*attention_mask.size().first().unwrap(), 1], + [*attention_mask.size().first().unwrap(), 1], (Kind::Int64, attention_mask.device()), ) .as_ref(), @@ -1391,7 +1392,7 @@ pub(crate) mod private_generation_utils { let beam_saved_token_scores = saved_beam_scores.as_mut().map(|saved_tokens| { mem::replace(&mut saved_tokens[effective_beam_id as usize], Tensor::new()) }); - let final_score = f64::from(beam_scores.get(effective_beam_id)); + let final_score = f64::try_from(beam_scores.get(effective_beam_id)).unwrap(); let final_tokens = input_ids.get(effective_beam_id); hypotheses[batch_index as usize].add( final_tokens, @@ -1411,7 +1412,7 @@ pub(crate) mod private_generation_utils { }; let mut sentence_lengths = - Tensor::zeros(&[output_batch_size], (Kind::Int64, input_ids.device())); + Tensor::zeros([output_batch_size], (Kind::Int64, input_ids.device())); let mut best_ids = vec![]; let mut scores_output = if output_scores { @@ -1457,14 +1458,21 @@ pub(crate) mod private_generation_utils { } let sentence_max_length = gen_opt .max_length - .map(|max_length| min(i64::from(sentence_lengths.max()) + 1, max_length)) - .unwrap_or(i64::from(sentence_lengths.max()) + 1); + .map(|max_length| { + min( + i64::try_from(sentence_lengths.max()).unwrap() + 1, + max_length, + ) + }) + .unwrap_or(i64::try_from(sentence_lengths.max()).unwrap() + 1); let mut decoded = input_ids.new_empty( - &[output_batch_size, sentence_max_length], + [output_batch_size, sentence_max_length], (Kind::Int64, input_ids.device()), ); - if i64::from(sentence_lengths.max()) != i64::from(sentence_lengths.min()) { + if i64::try_from(sentence_lengths.max()).unwrap() + != i64::try_from(sentence_lengths.min()).unwrap() + { let _ = decoded.fill_( gen_opt .pad_token_id @@ -1476,15 +1484,16 @@ pub(crate) mod private_generation_utils { 0, &Tensor::arange_start( 0, - i64::from(sentence_lengths.get(hypothesis_index as i64)), + i64::try_from(sentence_lengths.get(hypothesis_index as i64)).unwrap(), (Kind::Int64, input_ids.device()), ), best_id, ); - let sentence_length = i64::from(sentence_lengths.get(hypothesis_index as i64)); + let sentence_length = + i64::try_from(sentence_lengths.get(hypothesis_index as i64)).unwrap(); let sentence_length_max = gen_opt .max_length - .unwrap_or_else(|| i64::from(sentence_lengths.max())); + .unwrap_or_else(|| i64::try_from(sentence_lengths.max()).unwrap()); if sentence_length < sentence_length_max { let _ = decoded.get(hypothesis_index as i64).index_fill_( 0, @@ -1810,7 +1819,7 @@ pub trait LanguageGenerator: 
PrivateLanguageGenerator { } None => match self.get_bos_id() { Some(bos_id) => { - Tensor::ones(&[1, 1], (Int64, self.get_var_store().device())) * bos_id + Tensor::ones([1, 1], (Int64, self.get_var_store().device())) * bos_id } None => panic!( "A model with a BOS token must be used to start generation with an empty input" @@ -1916,13 +1925,13 @@ pub trait LanguageGenerator: PrivateLanguageGenerator { let mut input_ids_len = *input_id_size.last().unwrap(); if input_ids_len == 0 { input_ids = Tensor::ones( - &[*input_id_size.first().unwrap(), 1], + [*input_id_size.first().unwrap(), 1], (Int64, input_ids.device()), ) * self .get_bos_id() .expect("`bos_token_id` has to be defined when no `input_ids` are provided."); attention_mask = Some(Tensor::ones( - &[*input_id_size.first().unwrap(), 1], + [*input_id_size.first().unwrap(), 1], (Int64, input_ids.device()), )); input_ids_len += 1; @@ -1952,7 +1961,7 @@ pub trait LanguageGenerator: PrivateLanguageGenerator { let encoder_outputs = self.encode(&input_ids, Some(&attention_mask)).unwrap(); let expanded_batch_indices = Tensor::arange(batch_size, (Int64, input_ids.device())) .view((-1, 1)) - .repeat(&[1, num_beams * effective_batch_mult]) + .repeat([1, num_beams * effective_batch_mult]) .view(-1); Some(encoder_outputs.index_select(0, &expanded_batch_indices)) } else { @@ -1965,7 +1974,7 @@ pub trait LanguageGenerator: PrivateLanguageGenerator { input_ids .unsqueeze(1) .expand( - &[batch_size, effective_batch_mult * num_beams, cur_len], + [batch_size, effective_batch_mult * num_beams, cur_len], true, ) .contiguous() @@ -1973,7 +1982,7 @@ pub trait LanguageGenerator: PrivateLanguageGenerator { attention_mask .unsqueeze(1) .expand( - &[batch_size, effective_batch_mult * num_beams, cur_len], + [batch_size, effective_batch_mult * num_beams, cur_len], true, ) .contiguous() @@ -1988,7 +1997,7 @@ pub trait LanguageGenerator: PrivateLanguageGenerator { .expect("decoder start id must be specified for encoder decoders") }); let input_ids = Tensor::full( - &[effective_batch_size * num_beams, 1], + [effective_batch_size * num_beams, 1], decoder_start_token_id, (Int64, input_ids.device()), ); @@ -1996,7 +2005,7 @@ pub trait LanguageGenerator: PrivateLanguageGenerator { attention_mask .unsqueeze(1) .expand( - &[batch_size, effective_batch_mult * num_beams, input_ids_len], + [batch_size, effective_batch_mult * num_beams, input_ids_len], true, ) .contiguous() @@ -2228,7 +2237,7 @@ impl BeamHypotheses { 1, 0, Some(Tensor::zeros( - &[1], + [1], (scores_tensor.kind(), scores_tensor.device()), )), None, diff --git a/src/pipelines/keywords_extraction/scorer.rs b/src/pipelines/keywords_extraction/scorer.rs index 822dab08..fbdc0a77 100644 --- a/src/pipelines/keywords_extraction/scorer.rs +++ b/src/pipelines/keywords_extraction/scorer.rs @@ -22,6 +22,7 @@ /// SOFTWARE. 
use crate::pipelines::keywords_extraction::KeywordScorerType; use std::cmp::{max, min}; +use std::convert::TryFrom; use tch::{Kind, Tensor}; impl KeywordScorerType { @@ -96,7 +97,8 @@ fn maximal_margin_relevance_score( cosine_similarity(Some(&document_embedding), &word_embeddings).view([-1]); let word_similarities = cosine_similarity(None, &word_embeddings); - let mut keyword_indices = vec![i64::from(word_document_similarities.argmax(0, false))]; + let mut keyword_indices = + vec![i64::try_from(word_document_similarities.argmax(0, false)).unwrap()]; let mut candidate_indices = (0..word_document_similarities.size()[0]).collect::>(); let _ = candidate_indices.remove(keyword_indices[0] as usize); for _ in 0..min(num_keywords - 1, word_embeddings.size()[0] as usize) { @@ -112,7 +114,7 @@ fn maximal_margin_relevance_score( ) .max_dim(1, false); let mmr = candidate_similarities * (1.0 - diversity) - target_similarities * diversity; - let mmr_index = candidate_indices[i64::from(mmr.argmax(0, false)) as usize]; + let mmr_index = candidate_indices[i64::try_from(mmr.argmax(0, false)).unwrap() as usize]; keyword_indices.push(mmr_index); let candidate_mmr_index = candidate_indices .iter() @@ -149,12 +151,13 @@ fn max_sum_score( let (mut best_score, mut best_combination) = (None, None); for idx in 0..keyword_combinations.size()[0] { let combination = keyword_combinations.get(idx); - let combination_score = f64::from( + let combination_score = f64::try_from( word_similarities .index_select(0, &combination) .index_select(1, &combination) .sum(word_similarities.kind()), - ); + ) + .unwrap(); if let Some(current_best_score) = best_score { if combination_score < current_best_score { best_score = Some(combination_score); diff --git a/src/pipelines/masked_language.rs b/src/pipelines/masked_language.rs index eca8a373..32829ae1 100644 --- a/src/pipelines/masked_language.rs +++ b/src/pipelines/masked_language.rs @@ -61,6 +61,7 @@ use crate::{ use rust_tokenizers::tokenizer::TruncationStrategy; use rust_tokenizers::TokenizedInput; use std::borrow::Borrow; +use std::convert::TryFrom; use tch::nn::VarStore; use tch::{nn, no_grad, Device, Tensor}; @@ -580,7 +581,7 @@ impl MaskedLanguageModel { for input_id in 0..input.as_ref().len() as i64 { let mut sequence_tokens = vec![]; let sequence_mask = mask_token_mask.get(input_id); - if bool::from(sequence_mask.any()) { + if bool::try_from(sequence_mask.any())? { let mask_scores = output .get(input_id) .index_select(0, &sequence_mask.argwhere().squeeze_dim(1)); diff --git a/src/pipelines/sentence_embeddings/pipeline.rs b/src/pipelines/sentence_embeddings/pipeline.rs index 6005825c..b2104152 100644 --- a/src/pipelines/sentence_embeddings/pipeline.rs +++ b/src/pipelines/sentence_embeddings/pipeline.rs @@ -1,5 +1,5 @@ use std::borrow::Borrow; -use std::convert::TryInto; +use std::convert::{TryFrom, TryInto}; use rust_tokenizers::tokenizer::TruncationStrategy; use tch::{nn, Tensor}; @@ -365,7 +365,7 @@ impl SentenceEmbeddingsModel { }; let maybe_normalized = if self.normalize_embeddings { let norm = &maybe_linear - .norm_scalaropt_dim(2, &[1], true) + .norm_scalaropt_dim(2, [1], true) .clamp_min(1e-12) .expand_as(&maybe_linear); maybe_linear / norm @@ -385,7 +385,7 @@ impl SentenceEmbeddingsModel { S: AsRef + Sync, { let SentenceEmbeddingsModelOutput { embeddings, .. } = self.encode_as_tensor(inputs)?; - Ok(Vec::from(embeddings)) + Ok(Vec::try_from(embeddings)?) 
} fn nb_layers(&self) -> usize { @@ -433,7 +433,7 @@ impl SentenceEmbeddingsModel { all_attentions, } = self.encode_as_tensor(inputs)?; - let embeddings = Vec::from(embeddings); + let embeddings = Vec::try_from(embeddings)?; let all_attentions = all_attentions.ok_or_else(|| { RustBertError::InvalidConfigurationError("No attention outputted".into()) })?; @@ -448,7 +448,7 @@ impl SentenceEmbeddingsModel { .slice(0, i, i + 1, 1) .slice(1, head as i64, head as i64 + 1, 1) .squeeze(); - let attention_head = AttentionHead::from(attention_slice); + let attention_head = AttentionHead::try_from(attention_slice).unwrap(); attention_layer.push(attention_head); } attention_output.push(attention_layer); diff --git a/src/prophetnet/attention.rs b/src/prophetnet/attention.rs index 21d67bb7..d381129b 100644 --- a/src/prophetnet/attention.rs +++ b/src/prophetnet/attention.rs @@ -531,7 +531,7 @@ impl ProphetNetNgramAttention { let predict_attention_weights = Tensor::einsum( "nbtc,nbsc->nbts", &[predict_query_states, predict_key_states], - None, + None::, ); let predict_relative_pos_embeddings = self.get_predict_relative_pos_embeddings( @@ -555,7 +555,7 @@ impl ProphetNetNgramAttention { let predict_attention_output = Tensor::einsum( "nbts,nbsc->nbtc", &[&predict_attention_probas, &predict_value_states], - None, + None::, ) .transpose(1, 2) .contiguous() @@ -611,11 +611,11 @@ impl ProphetNetNgramAttention { ) .unsqueeze(0) .unsqueeze(0) - .repeat(&[batch_size, sequence_length, 1]); + .repeat([batch_size, sequence_length, 1]); let relative_positions = relative_positions - position_ids .unsqueeze(0) - .repeat(&[batch_size, sequence_length, 1]); + .repeat([batch_size, sequence_length, 1]); Some(compute_relative_buckets( self.num_buckets, self.relative_max_distance, @@ -637,11 +637,11 @@ impl ProphetNetNgramAttention { self.num_buckets, self.num_attention_heads, ]) - .permute(&[0, 3, 1, 2]) - .reshape(&[-1, self.num_buckets]); + .permute([0, 3, 1, 2]) + .reshape([-1, self.num_buckets]); let main_relative_position_buckets = main_relative_position_buckets - .repeat(&[1, self.num_attention_heads, 1]) + .repeat([1, self.num_attention_heads, 1]) .view([-1, *main_relative_position_buckets.size().last().unwrap()]); let mut new_shape = attention_weights @@ -672,11 +672,11 @@ impl ProphetNetNgramAttention { Tensor::arange(key_sequence_length, (Kind::Int64, hidden_states.device())) .unsqueeze(0) .unsqueeze(0) - .repeat(&[batch_size, sequence_length, 1]); + .repeat([batch_size, sequence_length, 1]); let relative_positions = relative_positions - position_ids .unsqueeze(0) - .repeat(&[batch_size, sequence_length, 1]); + .repeat([batch_size, sequence_length, 1]); Some(compute_relative_buckets( self.num_buckets, self.relative_max_distance, @@ -700,12 +700,12 @@ impl ProphetNetNgramAttention { self.num_buckets, self.num_attention_heads, ]) - .permute(&[0, 1, 4, 2, 3]) - .reshape(&[-1, self.num_buckets]); + .permute([0, 1, 4, 2, 3]) + .reshape([-1, self.num_buckets]); let predict_relative_position_buckets = predict_relative_position_buckets .unsqueeze(0) - .repeat(&[self.ngram, 1, self.num_attention_heads, 1]) + .repeat([self.ngram, 1, self.num_attention_heads, 1]) .view([ -1, *predict_relative_position_buckets.size().last().unwrap(), @@ -770,12 +770,12 @@ pub(crate) fn compute_all_stream_relative_buckets( let main_stream_relative_positions = position_ids .unsqueeze(1) - .repeat(&[1, *position_ids.size().last().unwrap(), 1]) + .repeat([1, *position_ids.size().last().unwrap(), 1]) - position_ids.unsqueeze(-1); let 
predicting_stream_relative_positions = Tensor::cat(&[&(position_ids - 1), position_ids], 1) .unsqueeze(1) - .repeat(&[1, *position_ids.size().last().unwrap(), 1]) + .repeat([1, *position_ids.size().last().unwrap(), 1]) - position_ids.unsqueeze(-1); let main_relative_position_buckets = compute_relative_buckets( diff --git a/src/prophetnet/decoder.rs b/src/prophetnet/decoder.rs index 5b647ebe..2a311ebc 100644 --- a/src/prophetnet/decoder.rs +++ b/src/prophetnet/decoder.rs @@ -25,7 +25,7 @@ use tch::nn::init::DEFAULT_KAIMING_UNIFORM; use tch::{nn, Device, Kind, Tensor}; fn ngram_attention_bias(sequence_length: i64, ngram: i64, device: Device, kind: Kind) -> Tensor { - let left_block = Tensor::ones(&[ngram, sequence_length, sequence_length], (kind, device)) + let left_block = Tensor::ones([ngram, sequence_length, sequence_length], (kind, device)) * get_min(kind).unwrap(); let right_block = left_block.copy(); for stream_idx in 0..ngram { @@ -302,7 +302,7 @@ impl ProphetNetDecoder { ngram_hidden_states.push( (&self.ngram_embeddings.get(ngram - 1) + &predicting_stream_pos_embed) .transpose(0, 1) - .repeat(&[1, batch_size, 1]), + .repeat([1, batch_size, 1]), ); } (ngram_hidden_states, None, None) @@ -328,7 +328,7 @@ impl ProphetNetDecoder { let extended_encoder_attention_mask = encoder_attention_mask.map(|encoder_attention_mask_value| { encoder_attention_mask_value.ones_like() - - encoder_attention_mask_value.unsqueeze(1).repeat(&[ + - encoder_attention_mask_value.unsqueeze(1).repeat([ self.num_attention_heads, 1, 1, @@ -471,7 +471,7 @@ impl ProphetNetDecoder { self.max_target_positions, (Kind::Int64, position_ids.device()), ) - .repeat(&[1, 1]); + .repeat([1, 1]); let (main_relative_buckets, predict_relative_buckets) = compute_all_stream_relative_buckets( self.num_buckets, @@ -482,7 +482,7 @@ impl ProphetNetDecoder { let main_relative_buckets = main_relative_buckets .slice(1, 0, sequence_length, 1) .slice(2, 0, sequence_length, 1) - .repeat(&[batch_size, 1, 1]); + .repeat([batch_size, 1, 1]); let predict_relative_buckets = Tensor::cat( &[ @@ -500,7 +500,7 @@ impl ProphetNetDecoder { ], 2, ) - .repeat(&[batch_size, 1, 1]); + .repeat([batch_size, 1, 1]); (main_relative_buckets, predict_relative_buckets) } @@ -514,7 +514,7 @@ impl ProphetNetDecoder { let (sequence_length, batch_size) = (input_size[0], input_size[1]); let causal_mask = Tensor::full( - &[sequence_length, sequence_length], + [sequence_length, sequence_length], get_min(hidden_states.kind()).unwrap(), (hidden_states.kind(), hidden_states.device()), ) @@ -522,7 +522,7 @@ impl ProphetNetDecoder { let extended_causal_mask = causal_mask .unsqueeze(0) - .expand(&[batch_size, sequence_length, sequence_length], true); + .expand([batch_size, sequence_length, sequence_length], true); let extended_attention_mask = if let Some(attention_mask_value) = attention_mask { let extended_attention_mask = @@ -533,7 +533,7 @@ impl ProphetNetDecoder { extended_causal_mask }; - extended_attention_mask.repeat(&[self.num_attention_heads, 1, 1]) + extended_attention_mask.repeat([self.num_attention_heads, 1, 1]) } fn prepare_predict_attention_mask( @@ -579,7 +579,7 @@ impl ProphetNetDecoder { - attention_mask_value.unsqueeze(0).unsqueeze(2)) * -10000.0; let extended_attention_mask = extended_attention_mask.expand( - &[self.ngram, batch_size, sequence_length, sequence_length], + [self.ngram, batch_size, sequence_length, sequence_length], true, ); let extended_attention_mask = Tensor::cat( @@ -594,7 +594,7 @@ impl ProphetNetDecoder { extended_predict_causal_mask }; 
- extended_attention_mask.repeat(&[1, self.num_attention_heads, 1, 1]) + extended_attention_mask.repeat([1, self.num_attention_heads, 1, 1]) } } diff --git a/src/prophetnet/embeddings.rs b/src/prophetnet/embeddings.rs index 42955431..6414b0fd 100644 --- a/src/prophetnet/embeddings.rs +++ b/src/prophetnet/embeddings.rs @@ -55,8 +55,7 @@ impl ProphetNetPositionalEmbeddings { if let Some(prev_num_input_ids_value) = prev_num_input_ids { let num_input_ids = input_shape[1] + prev_num_input_ids_value; - Tensor::ones(&[1, 1], (Kind::Int64, device)) - * (self.padding_idx + num_input_ids) + Tensor::ones([1, 1], (Kind::Int64, device)) * (self.padding_idx + num_input_ids) } else { let calc_attention_mask = if attention_mask.is_none() { Some(Tensor::ones(input_shape, (Kind::Int64, device))) diff --git a/src/prophetnet/encoder.rs b/src/prophetnet/encoder.rs index d6f0160a..4152e5c8 100644 --- a/src/prophetnet/encoder.rs +++ b/src/prophetnet/encoder.rs @@ -154,7 +154,7 @@ impl ProphetNetEncoder { let input_embeds = input_embeds.unwrap_or_else(|| calc_input_embeddings.as_ref().unwrap()); let extended_attention_mask = attention_mask.map(|mask| { - ((mask.ones_like() - mask.unsqueeze(1).repeat(&[self.num_attention_heads, 1, 1])) + ((mask.ones_like() - mask.unsqueeze(1).repeat([self.num_attention_heads, 1, 1])) * -10000.0) .to_kind(input_embeds.kind()) }); diff --git a/src/reformer/attention.rs b/src/reformer/attention.rs index b3ca9df4..e943d287 100644 --- a/src/reformer/attention.rs +++ b/src/reformer/attention.rs @@ -247,7 +247,7 @@ impl LSHSelfAttention { let per_head_query_key = self .query_key .ws - .reshape(&[ + .reshape([ self.num_attention_heads, self.attention_head_size, self.hidden_size, @@ -256,7 +256,7 @@ impl LSHSelfAttention { Tensor::einsum( "balh,ahr->balr", &[hidden_states, &per_head_query_key], - None, + None::, ) } @@ -264,13 +264,17 @@ impl LSHSelfAttention { let per_head_value = self .value .ws - .reshape(&[ + .reshape([ self.num_attention_heads, self.attention_head_size, self.hidden_size, ]) .transpose(-2, -1); - Tensor::einsum("balh,ahr->balr", &[hidden_states, &per_head_value], None) + Tensor::einsum( + "balh,ahr->balr", + &[hidden_states, &per_head_value], + None::, + ) } fn hash_vectors( @@ -307,9 +311,12 @@ impl LSHSelfAttention { num_hashes, rotation_size / 2, ]; - let random_rotations = Tensor::randn(&rotations_shape, (vectors.kind(), vectors.device())); - let rotated_vectors = - Tensor::einsum("bmtd,mdhr->bmhtr", &[vectors, random_rotations], None); + let random_rotations = Tensor::randn(rotations_shape, (vectors.kind(), vectors.device())); + let rotated_vectors = Tensor::einsum( + "bmtd,mdhr->bmhtr", + &[vectors, random_rotations], + None::, + ); let mut buckets = match &self.num_buckets { NumBuckets::Integer(_) => { @@ -320,7 +327,7 @@ impl LSHSelfAttention { } NumBuckets::Array(buckets_array) => { let (mut buckets, mut cur_sum, mut cur_product) = ( - Tensor::zeros(&[1], (rotated_vectors.kind(), rotated_vectors.device())), + Tensor::zeros([1], (rotated_vectors.kind(), rotated_vectors.device())), 0, 1, ); @@ -341,14 +348,14 @@ impl LSHSelfAttention { }; if let Some(attention_mask_value) = attention_mask { - if i64::from(attention_mask_value.sum(Kind::Int)) + if i64::try_from(attention_mask_value.sum(Kind::Int)).unwrap() < batch_size * *attention_mask_value.size().last().unwrap() { num_buckets += 1; let buckets_mask = attention_mask_value .unsqueeze(1) .unsqueeze(1) - .expand(&buckets.size(), true) + .expand(buckets.size(), true) .to_kind(Kind::Bool); buckets = 
buckets.where_self( &buckets_mask, @@ -432,7 +439,7 @@ impl LSHSelfAttention { query_shape[sorted_bucket_indices_per_hash.dim() - 1] = 1; let query_bucket_idx = sorted_bucket_indices_per_hash.new_full( query_shape.as_slice(), - i64::from(sorted_bucket_indices_per_hash.max()), + i64::try_from(sorted_bucket_indices_per_hash.max()).unwrap(), (Kind::Int64, sorted_bucket_indices_per_hash.device()), ); (query_bucket_idx, sorted_bucket_indices_per_hash) @@ -486,7 +493,7 @@ impl LSHSelfAttention { )?; } - let mut logits = query_key_dots.logsumexp(&[-1], true); + let mut logits = query_key_dots.logsumexp([-1], true); let attention_probs = (query_key_dots - &logits) .exp() .apply_t(&self.dropout, train); @@ -555,7 +562,8 @@ impl LSHSelfAttention { let hidden_states_shape = hidden_states.size(); let (batch_size, sequence_length) = (hidden_states_shape[0], hidden_states_shape[1]); let max_bucket = self.num_buckets.max_bucket(); - let increase_num_buckets = i64::from(past_buckets.max()) > num_hashes * max_bucket - 1; + let increase_num_buckets = + i64::try_from(past_buckets.max()).unwrap() > num_hashes * max_bucket - 1; let query_buckets = self.hash_vectors( query_vectors, @@ -597,14 +605,14 @@ impl LSHSelfAttention { &relevant_bucket_indices_chunk + bucket_indices_batch_offset; let relevant_hidden_states = hidden_states - .reshape(&[-1, self.hidden_size]) + .reshape([-1, self.hidden_size]) .index_select( 0, &relevant_bucket_indices_chunk_all_batch.to_kind(Kind::Int64), ) - .reshape(&[batch_size, self.num_attention_heads, -1, self.hidden_size]); + .reshape([batch_size, self.num_attention_heads, -1, self.hidden_size]); - let relevant_bucket_indices_chunk = relevant_bucket_indices_chunk.reshape(&[ + let relevant_bucket_indices_chunk = relevant_bucket_indices_chunk.reshape([ batch_size, self.num_attention_heads, num_hashes, @@ -633,18 +641,18 @@ impl LSHSelfAttention { let expanded_start_indices = start_indices_chunk .unsqueeze(-1) - .expand(&[indices.size()[0], total_chunk_size], true); + .expand([indices.size()[0], total_chunk_size], true); let chunk_sequence_indices = expanded_start_indices + Tensor::arange(total_chunk_size, (Kind::Int64, indices.device())) .unsqueeze(0) - .expand(&[indices.size()[0], total_chunk_size], true); + .expand([indices.size()[0], total_chunk_size], true); let chunk_sequence_indices = chunk_sequence_indices .flatten(0, 1) .remainder(sequence_length); let indices = indices .unsqueeze(1) - .expand(&[indices.size()[0], total_chunk_size, -1], true) + .expand([indices.size()[0], total_chunk_size, -1], true) .flatten(0, 1); indices.select(1, -1).copy_(&chunk_sequence_indices); @@ -668,9 +676,9 @@ impl LSHSelfAttention { fn gather_by_expansion(&self, vectors: &Tensor, indices: &Tensor, num_hashes: i64) -> Tensor { let expanded_indices = indices .unsqueeze(-1) - .expand(&[-1, -1, -1, self.attention_head_size], true); + .expand([-1, -1, -1, self.attention_head_size], true); vectors - .repeat(&[1, 1, num_hashes, 1]) + .repeat([1, 1, num_hashes, 1]) .gather(2, &expanded_indices, false) } @@ -742,7 +750,7 @@ impl LSHSelfAttention { Some(self.attention_head_size), )?; - query_vectors = query_vectors.unsqueeze(2).repeat(&[1, 1, num_hashes, 1, 1]); + query_vectors = query_vectors.unsqueeze(2).repeat([1, 1, num_hashes, 1, 1]); ( key_value_hidden_states, query_key_vectors, @@ -859,7 +867,7 @@ impl LSHSelfAttention { } else { ( Tensor::arange(sequence_length, (Kind::Int64, query_key_vectors.device())) - .repeat(&[batch_size, self.num_attention_heads, 1]), + .repeat([batch_size, 
self.num_attention_heads, 1]), None, ) }; @@ -1094,7 +1102,7 @@ impl LocalSelfAttention { .sqrt(); let indices = Tensor::arange(sequence_length, (Kind::Int64, query_vectors.device())) - .repeat(&[batch_size, self.num_attention_heads, 1]); + .repeat([batch_size, self.num_attention_heads, 1]); let do_standard_attention = sequence_length <= self.chunk_length; @@ -1158,7 +1166,7 @@ impl LocalSelfAttention { )?; } - let logits = query_key_dots.logsumexp(&[-1], true); + let logits = query_key_dots.logsumexp([-1], true); let attention_probs = (query_key_dots - logits) .exp() .apply_t(&self.dropout, train); diff --git a/src/reformer/attention_utils.rs b/src/reformer/attention_utils.rs index 836d3500..f5101b39 100644 --- a/src/reformer/attention_utils.rs +++ b/src/reformer/attention_utils.rs @@ -21,7 +21,7 @@ pub fn stable_argsort(input_tensor: &Tensor, dim: i64) -> Tensor { let scaling_dim = input_tensor.size()[dim as usize]; let scaled_offset = Tensor::arange(scaling_dim, (Kind::Int, input_tensor.device())) .view([1, 1, -1]) - .expand(&input_tensor.size(), true); + .expand(input_tensor.size(), true); let scaled_tensor = scaling_dim * input_tensor + (scaled_offset / scaling_dim); scaled_tensor.argsort(dim, false) } @@ -138,7 +138,7 @@ pub fn merge_hidden_size_dim( -1, num_attention_heads * attention_head_size, ]; - input.permute(&[0, 2, 1, 3]).reshape(&new_shape) + input.permute([0, 2, 1, 3]).reshape(new_shape) } pub fn split_seq_length_dim_to( diff --git a/src/reformer/embeddings.rs b/src/reformer/embeddings.rs index f8d165aa..0e42aa6e 100644 --- a/src/reformer/embeddings.rs +++ b/src/reformer/embeddings.rs @@ -16,6 +16,7 @@ use crate::common::embeddings::process_ids_embeddings_pair; use crate::reformer::ReformerConfig; use crate::RustBertError; use std::borrow::Borrow; +use std::convert::TryFrom; use tch::nn::Init; use tch::{nn, Kind, Tensor}; @@ -81,18 +82,18 @@ impl AxialPositionEmbeddings { .transpose(2, 1) .feature_dropout(self.dropout_prob, train) .transpose(2, 1) - .reshape(&[batch_size, sequence_length, -1]) + .reshape([batch_size, sequence_length, -1]) } else { Tensor::cat( &broadcasted_weights .iter() - .map(|tensor| tensor.reshape(&[batch_size, sequence_length, -1])) + .map(|tensor| tensor.reshape([batch_size, sequence_length, -1])) .collect::>(), -1, ) } } else { - let max_position_id = i64::from(position_ids.max()); + let max_position_id = i64::try_from(position_ids.max()).unwrap(); let required_pos_encodings_columns = (max_position_id + 1) / self.axial_pos_shape[1] + 1; let position_encodings = Tensor::cat( @@ -102,7 +103,7 @@ impl AxialPositionEmbeddings { .collect::>(), -1, ); - let position_encodings = position_encodings.reshape(&[ + let position_encodings = position_encodings.reshape([ batch_size, -1, *position_encodings.size().last().unwrap(), diff --git a/src/reformer/reformer_model.rs b/src/reformer/reformer_model.rs index 51e755af..130e28b2 100644 --- a/src/reformer/reformer_model.rs +++ b/src/reformer/reformer_model.rs @@ -428,14 +428,14 @@ impl ReformerModel { device: Device, ) -> Result { let input_ids_padding = Tensor::full( - &[input_shape[0], padding_length], + [input_shape[0], padding_length], self.pad_token_id, (Kind::Int64, device), ); let attention_mask = Some(if let Some(attention_mask) = attention_mask { let attention_mask_padding = Tensor::zeros( - &[input_shape[0], padding_length], + [input_shape[0], padding_length], (attention_mask.kind(), device), ); Tensor::cat(&[attention_mask, &attention_mask_padding], -1) @@ -443,7 +443,7 @@ impl ReformerModel { 
Tensor::cat( &[ Tensor::ones(input_shape, (Kind::Int8, device)), - Tensor::zeros(&[input_shape[0], padding_length], (Kind::Int8, device)), + Tensor::zeros([input_shape[0], padding_length], (Kind::Int8, device)), ], -1, ) @@ -461,7 +461,7 @@ impl ReformerModel { (Kind::Int64, device), ) .unsqueeze(0) - .expand(&[input_shape[0], padding_length], true); + .expand([input_shape[0], padding_length], true); Some(Tensor::cat(&[position_ids, &position_ids_padding], -1)) } else { None diff --git a/src/roberta/embeddings.rs b/src/roberta/embeddings.rs index eadb5d4e..08fe1657 100644 --- a/src/roberta/embeddings.rs +++ b/src/roberta/embeddings.rs @@ -194,7 +194,7 @@ impl BertEmbedding for RobertaEmbeddings { let calc_token_type_ids = if token_type_ids.is_none() { Some(Tensor::zeros( - &input_shape, + input_shape, (Kind::Int64, input_embeddings.device()), )) } else { diff --git a/src/t5/attention.rs b/src/t5/attention.rs index 1a1dea8c..2abae806 100644 --- a/src/t5/attention.rs +++ b/src/t5/attention.rs @@ -223,14 +223,14 @@ impl T5Attention { None }; - let mut scores = Tensor::einsum("bnqd,bnkd->bnqk", &[q, k], None); + let mut scores = Tensor::einsum("bnqd,bnkd->bnqk", &[q, k], None::); let calculated_position_bias = if position_bias.is_none() { let mut temp_value = if self.has_relative_attention_bias { self.compute_bias(real_seq_length, key_length, hidden_states.device()) } else { Tensor::zeros( - &[1, self.n_heads, real_seq_length, key_length], + [1, self.n_heads, real_seq_length, key_length], (scores.kind(), scores.device()), ) }; @@ -289,7 +289,7 @@ impl T5Attention { ); rp_bucket .apply(self.relative_attention_bias.as_ref().unwrap()) - .permute(&[2, 0, 1]) + .permute([2, 0, 1]) .unsqueeze(0) } } diff --git a/src/t5/encoder.rs b/src/t5/encoder.rs index c1287b26..b2e00ce1 100644 --- a/src/t5/encoder.rs +++ b/src/t5/encoder.rs @@ -20,6 +20,7 @@ use crate::t5::T5Config; use crate::Activation::{gelu_new, relu}; use crate::RustBertError; use std::borrow::{Borrow, BorrowMut}; +use std::convert::TryFrom; use tch::nn::LinearConfig; use tch::{nn, Kind, Scalar, Tensor}; @@ -227,7 +228,9 @@ impl T5Block { } pub(crate) fn clamp_hidden_states(hidden_states: Tensor) -> Tensor { - if (hidden_states.kind() != Kind::Float) & bool::from(hidden_states.isinf().any()) { + if (hidden_states.kind() != Kind::Float) + & bool::try_from(hidden_states.isinf().any()).unwrap() + { let clamp_value = match hidden_states.kind() { Kind::Half => half::f16::MAX.to_f64() - 1000., Kind::BFloat16 => half::bf16::MAX.to_f64() - 1000., @@ -398,7 +401,7 @@ impl T5Stack { let calculated_attention_mask = if attention_mask.is_none() { Some(Tensor::ones( - &[batch_size, mask_seq_length], + [batch_size, mask_seq_length], (Kind::Int64, input_embeddings.device()), )) } else { @@ -416,7 +419,7 @@ impl T5Stack { input_shape[1], (input_embeddings.kind(), input_embeddings.device()), ); - let causal_mask = seq_ids.unsqueeze(0).unsqueeze(0).repeat(&[ + let causal_mask = seq_ids.unsqueeze(0).unsqueeze(0).repeat([ input_shape[0], input_shape[1], 1, @@ -445,7 +448,7 @@ impl T5Stack { let encoder_mask = match encoder_attention_mask { Some(value) => value.copy(), None => Tensor::ones( - &[ + [ encoder_hidden_states_shape[0], encoder_hidden_states_shape[1], ], diff --git a/src/xlnet/attention.rs b/src/xlnet/attention.rs index 4492a58f..64dcdde1 100644 --- a/src/xlnet/attention.rs +++ b/src/xlnet/attention.rs @@ -150,9 +150,9 @@ impl XLNetRelativeAttention { fn rel_shift_bnij(&self, x: &Tensor, klen: i64) -> Tensor { let shape = x.size(); - 
x.reshape(&[shape[0], shape[1], shape[3], shape[2]]) + x.reshape([shape[0], shape[1], shape[3], shape[2]]) .narrow(2, 1, shape[3] - 1) - .reshape(&[shape[0], shape[1], shape[2], shape[3] - 1]) + .reshape([shape[0], shape[1], shape[2], shape[3] - 1]) .index_select(3, &Tensor::arange(klen, (Kind::Int64, x.device()))) } @@ -169,13 +169,13 @@ impl XLNetRelativeAttention { let ac = Tensor::einsum( "ibnd,jbnd->bnij", &[&(q_head + &self.r_w_bias), k_head_h], - None, + None::, ); let bd = self.rel_shift_bnij( &Tensor::einsum( "ibnd,jbnd->bnij", &[&(q_head + &self.r_r_bias), k_head_r], - None, + None::, ), ac.size()[3], ); @@ -185,30 +185,33 @@ impl XLNetRelativeAttention { let ef = Tensor::einsum( "ibnd,snd->ibns", &[&(q_head + &self.r_s_bias), &self.seg_embed], - None, + None::, ); - Tensor::einsum("ijbs,ibns->bnij", &[seg_mat, &ef], None) + Tensor::einsum("ijbs,ibns->bnij", &[seg_mat, &ef], None::) } - None => Tensor::zeros(&[1], (ac.kind(), ac.device())), + None => Tensor::zeros([1], (ac.kind(), ac.device())), }; let mut attention_score = (ac + bd + ef) * self.scale; if let Some(value) = attention_mask { let target_kind = attention_score.kind(); attention_score = - (attention_score - value.permute(&[2, 3, 0, 1]) * 1e30).to_kind(target_kind); + (attention_score - value.permute([2, 3, 0, 1]) * 1e30).to_kind(target_kind); }; let attention_probas = attention_score .softmax(3, attention_score.kind()) .apply_t(&self.dropout, train); - let attention_vector = - Tensor::einsum("bnij,jbnd->ibnd", &[&attention_probas, v_head_h], None); + let attention_vector = Tensor::einsum( + "bnij,jbnd->ibnd", + &[&attention_probas, v_head_h], + None::, + ); if self.output_attentions { ( attention_vector, - Some(attention_probas.permute(&[2, 3, 0, 1])), + Some(attention_probas.permute([2, 3, 0, 1])), ) } else { (attention_vector, None) @@ -222,9 +225,12 @@ impl XLNetRelativeAttention { residual: bool, train: bool, ) -> Tensor { - let mut attention_out = - Tensor::einsum("ibnd,hnd->ibh", &[attention_vector, &self.output], None) - .apply_t(&self.dropout, train); + let mut attention_out = Tensor::einsum( + "ibnd,hnd->ibh", + &[attention_vector, &self.output], + None::, + ) + .apply_t(&self.dropout, train); if residual { attention_out = attention_out + h; }; @@ -256,10 +262,10 @@ impl XLNetRelativeAttention { Some(value) => value, None => h, }; - let q_head_h = Tensor::einsum("ibh,hnd->ibnd", &[h, &self.query], None); - let k_head_h = Tensor::einsum("ibh,hnd->ibnd", &[cat, &self.key], None); - let v_head_h = Tensor::einsum("ibh,hnd->ibnd", &[cat, &self.value], None); - let k_head_r = Tensor::einsum("ibh,hnd->ibnd", &[r, &self.pos], None); + let q_head_h = Tensor::einsum("ibh,hnd->ibnd", &[h, &self.query], None::); + let k_head_h = Tensor::einsum("ibh,hnd->ibnd", &[cat, &self.key], None::); + let v_head_h = Tensor::einsum("ibh,hnd->ibnd", &[cat, &self.value], None::); + let k_head_r = Tensor::einsum("ibh,hnd->ibnd", &[r, &self.pos], None::); let (attention_vec_h, attention_probas_h) = self.rel_attention_core( &q_head_h, @@ -273,12 +279,12 @@ impl XLNetRelativeAttention { let output_h = self.post_attention(h, &attention_vec_h, true, train); let (output_g, attention_probas_g) = if let Some(g) = g { - let q_head_g = Tensor::einsum("ibh,hnd->ibnd", &[g, &self.query], None); + let q_head_g = Tensor::einsum("ibh,hnd->ibnd", &[g, &self.query], None::); let (attention_vec_g, attention_probas_g) = match target_mapping { Some(target_mapping) => { let q_head_g = - Tensor::einsum("mbnd,mlb->lbnd", &[&q_head_g, target_mapping], None); + 
Tensor::einsum("mbnd,mlb->lbnd", &[&q_head_g, target_mapping], None::); let (attention_vec_g, attention_probas_g) = self.rel_attention_core( &q_head_g, &k_head_h, @@ -288,8 +294,11 @@ impl XLNetRelativeAttention { attn_mask_g, train, ); - let attention_vec_g = - Tensor::einsum("lbnd,mlb->mbnd", &[&attention_vec_g, target_mapping], None); + let attention_vec_g = Tensor::einsum( + "lbnd,mlb->mbnd", + &[&attention_vec_g, target_mapping], + None::, + ); (attention_vec_g, attention_probas_g) } None => self.rel_attention_core( diff --git a/src/xlnet/xlnet_model.rs b/src/xlnet/xlnet_model.rs index b8b64b1f..28cff18f 100644 --- a/src/xlnet/xlnet_model.rs +++ b/src/xlnet/xlnet_model.rs @@ -251,8 +251,8 @@ impl XLNetModel { } fn create_mask(&self, q_len: i64, m_len: i64, device: Device) -> Tensor { - let attention_mask = Tensor::ones(&[q_len, q_len], (Kind::Int64, device)); - let attention_mask_pad = Tensor::zeros(&[q_len, m_len], (Kind::Int64, device)); + let attention_mask = Tensor::ones([q_len, q_len], (Kind::Int64, device)); + let attention_mask_pad = Tensor::zeros([q_len, m_len], (Kind::Int64, device)); let mask_up = attention_mask.triu(1); let mut output = Tensor::cat(&[&attention_mask_pad, &mask_up], 1); if self.same_length { @@ -307,12 +307,16 @@ impl XLNetModel { inverse_frequency: &Tensor, batch_size: Option, ) -> Tensor { - let sinusoid = Tensor::einsum("i,d->id", &[position_sequence, inverse_frequency], None); + let sinusoid = Tensor::einsum( + "i,d->id", + &[position_sequence, inverse_frequency], + None::, + ); let mut positional_embeddings = Tensor::cat(&[sinusoid.sin(), sinusoid.cos()], -1).unsqueeze(1); if let Some(bsz) = batch_size { - positional_embeddings = positional_embeddings.expand(&[-1, bsz, -1], true) + positional_embeddings = positional_embeddings.expand([-1, bsz, -1], true) }; positional_embeddings } @@ -466,13 +470,13 @@ impl XLNetModel { let perm_mask = perm_mask.map(|perm_mask| { perm_mask .to_kind(word_emb_k.kind()) - .permute(&[1, 2, 0]) + .permute([1, 2, 0]) .contiguous() }); let target_mapping = target_mapping.map(|target_mapping| { target_mapping .to_kind(word_emb_k.kind()) - .permute(&[1, 2, 0]) + .permute([1, 2, 0]) .contiguous() }); @@ -511,7 +515,7 @@ impl XLNetModel { if let Some(data_mask_value) = &data_mask { if m_len > 0 { let mems_mask = Tensor::zeros( - &[data_mask_value.size()[0], m_len, batch_size], + [data_mask_value.size()[0], m_len, batch_size], (Kind::Bool, data_mask_value.device()), ); data_mask = Some(Tensor::cat(&[&mems_mask, data_mask_value], 1)) @@ -528,7 +532,7 @@ impl XLNetModel { if m_len > 0 { non_tgt_mask = Tensor::cat( &[ - Tensor::zeros(&[q_len, m_len], (Kind::Int64, attn_mask_value.device())), + Tensor::zeros([q_len, m_len], (Kind::Int64, attn_mask_value.device())), non_tgt_mask, ], -1, @@ -542,14 +546,14 @@ impl XLNetModel { let mut output_h = word_emb_k.apply_t(&self.dropout, train); let mut output_g = target_mapping.as_ref().map(|target_mapping_value| { self.mask_emb - .expand(&[target_mapping_value.size()[0], batch_size, -1], true) + .expand([target_mapping_value.size()[0], batch_size, -1], true) .apply_t(&self.dropout, train) }); let seg_mat = if let Some(token_type_ids_value) = token_type_ids { let cat_ids = if m_len > 0 { let mem_pad = Tensor::zeros( - &[m_len, batch_size], + [m_len, batch_size], (Kind::Int64, token_type_ids_value.device()), ); Tensor::cat(&[mem_pad, token_type_ids_value.copy()], 0) @@ -636,7 +640,7 @@ impl XLNetModel { output_h } .apply_t(&self.dropout, train) - .permute(&[1, 0, 2]) + .permute([1, 0, 2]) 
.contiguous(); Ok(XLNetModelOutput { @@ -1673,10 +1677,8 @@ impl PrivateLanguageGenerator for XLNetGenerator { ) -> PreparedInput<'a> { let effective_batch_size = input_ids.size()[0]; let sequence_length = input_ids.size()[1]; - let dummy_token = Tensor::zeros( - &[effective_batch_size, 1], - (Kind::Int64, input_ids.device()), - ); + let dummy_token = + Tensor::zeros([effective_batch_size, 1], (Kind::Int64, input_ids.device())); let offset = 2i64; let input_ids = match &past { Cache::XLNetCache(past) => { @@ -1696,13 +1698,13 @@ impl PrivateLanguageGenerator for XLNetGenerator { }; let sequence_length = input_ids.size()[1]; let perm_mask = Tensor::zeros( - &[effective_batch_size, sequence_length, sequence_length], + [effective_batch_size, sequence_length, sequence_length], (Kind::Float, input_ids.device()), ); let _ = perm_mask.narrow(2, sequence_length - 1, 1).fill_(1.0); let target_mapping = Tensor::zeros( - &[effective_batch_size, 1, sequence_length], + [effective_batch_size, 1, sequence_length], (Kind::Float, input_ids.device()), ); let _ = target_mapping.narrow(2, sequence_length - 1, 1).fill_(1.0); diff --git a/tests/deberta.rs b/tests/deberta.rs index 1d4643e2..789fd661 100644 --- a/tests/deberta.rs +++ b/tests/deberta.rs @@ -99,10 +99,10 @@ fn deberta_masked_lm() -> anyhow::Result<()> { let deberta_model = DebertaForMaskedLM::new(vs.root(), &config); // Generate random input - let input_tensor = Tensor::randint(42, &[32, 128], (Kind::Int64, device)); - let attention_mask = Tensor::ones(&[32, 128], (Kind::Int64, device)); + let input_tensor = Tensor::randint(42, [32, 128], (Kind::Int64, device)); + let attention_mask = Tensor::ones([32, 128], (Kind::Int64, device)); let position_ids = Tensor::arange(128, (Kind::Int64, device)).unsqueeze(0); - let token_type_ids = Tensor::zeros(&[32, 128], (Kind::Int64, device)); + let token_type_ids = Tensor::zeros([32, 128], (Kind::Int64, device)); // Forward pass let model_output = no_grad(|| { diff --git a/tests/deberta_v2.rs b/tests/deberta_v2.rs index 09ea45ae..31a599e0 100644 --- a/tests/deberta_v2.rs +++ b/tests/deberta_v2.rs @@ -25,10 +25,10 @@ fn deberta_v2_masked_lm() -> anyhow::Result<()> { let deberta_model = DebertaV2ForMaskedLM::new(vs.root(), &config); // Generate random input - let input_tensor = Tensor::randint(42, &[32, 128], (Kind::Int64, device)); - let attention_mask = Tensor::ones(&[32, 128], (Kind::Int64, device)); + let input_tensor = Tensor::randint(42, [32, 128], (Kind::Int64, device)); + let attention_mask = Tensor::ones([32, 128], (Kind::Int64, device)); let position_ids = Tensor::arange(128, (Kind::Int64, device)).unsqueeze(0); - let token_type_ids = Tensor::zeros(&[32, 128], (Kind::Int64, device)); + let token_type_ids = Tensor::zeros([32, 128], (Kind::Int64, device)); // Forward pass let model_output = no_grad(|| { diff --git a/tests/fnet.rs b/tests/fnet.rs index 6dfe05c0..e05015b8 100644 --- a/tests/fnet.rs +++ b/tests/fnet.rs @@ -12,6 +12,7 @@ use rust_bert::Config; use rust_tokenizers::tokenizer::{FNetTokenizer, MultiThreadedTokenizer, TruncationStrategy}; use rust_tokenizers::vocab::Vocab; use std::collections::HashMap; +use std::convert::TryFrom; use tch::{nn, no_grad, Device, Tensor}; #[test] @@ -75,7 +76,9 @@ fn fnet_masked_lm() -> anyhow::Result<()> { assert_eq!("▁one", word_1); assert_eq!("▁the", word_2); - let value = (f64::from(model_output.prediction_scores.get(0).get(4).max()) - 13.1721).abs(); + let value = (f64::try_from(model_output.prediction_scores.get(0).get(4).max()).unwrap() + - 13.1721) + .abs(); 
dbg!(value); assert!(value < 1e-3); Ok(()) diff --git a/tests/longformer.rs b/tests/longformer.rs index 508aabad..cc458b3e 100644 --- a/tests/longformer.rs +++ b/tests/longformer.rs @@ -16,6 +16,7 @@ use rust_bert::Config; use rust_tokenizers::tokenizer::{MultiThreadedTokenizer, RobertaTokenizer, TruncationStrategy}; use rust_tokenizers::vocab::Vocab; use std::collections::HashMap; +use std::convert::TryFrom; use tch::{nn, no_grad, Device, Tensor}; #[test] @@ -116,12 +117,12 @@ fn longformer_masked_lm() -> anyhow::Result<()> { .prediction_scores .get(0) .get(4) - .double_value(&[i64::from(&index_1)]); + .double_value(&[i64::try_from(&index_1).unwrap()]); let score_2 = model_output .prediction_scores .get(1) .get(7) - .double_value(&[i64::from(&index_2)]); + .double_value(&[i64::try_from(&index_2).unwrap()]); assert_eq!("Ġeye", word_1); // Outputs "person" : "Looks like one [eye] is missing" assert_eq!("Ġsunny", word_2); // Outputs "pear" : "It was a nice and [sunny] day" diff --git a/tests/mobilebert.rs b/tests/mobilebert.rs index 1976ffdc..f0ac097a 100644 --- a/tests/mobilebert.rs +++ b/tests/mobilebert.rs @@ -10,6 +10,7 @@ use rust_bert::Config; use rust_tokenizers::tokenizer::{BertTokenizer, MultiThreadedTokenizer, TruncationStrategy}; use rust_tokenizers::vocab::Vocab; use std::collections::HashMap; +use std::convert::TryFrom; use tch::{nn, no_grad, Device, Tensor}; #[test] @@ -73,12 +74,12 @@ fn mobilebert_masked_model() -> anyhow::Result<()> { .logits .get(0) .get(4) - .double_value(&[i64::from(&index_1)]); + .double_value(&[i64::try_from(&index_1).unwrap()]); let score_2 = model_output .logits .get(1) .get(7) - .double_value(&[i64::from(&index_2)]); + .double_value(&[i64::try_from(&index_2).unwrap()]); assert_eq!("thing", word_1); // Outputs "person" : "Looks like one [person] is missing" assert_eq!("sunny", word_2); // Outputs "sunny" : "It was a very nice and [sunny] day" diff --git a/tests/xlnet.rs b/tests/xlnet.rs index ba0d6b5e..1ceddebf 100644 --- a/tests/xlnet.rs +++ b/tests/xlnet.rs @@ -59,10 +59,10 @@ fn xlnet_base_model() -> anyhow::Result<()> { let input_tensor = Tensor::stack(tokenized_input.as_slice(), 0).to(device); // Forward pass - let perm_mask = Tensor::zeros(&[1, 4, 4], (Kind::Float, device)); + let perm_mask = Tensor::zeros([1, 4, 4], (Kind::Float, device)); let _ = perm_mask.narrow(2, 3, 1).fill_(1.0); - let target_mapping = Tensor::zeros(&[1, 1, 4], (Kind::Float, device)); + let target_mapping = Tensor::zeros([1, 1, 4], (Kind::Float, device)); let _ = target_mapping.narrow(2, 3, 1).fill_(1.0); let model_output = no_grad(|| { xlnet_model @@ -164,10 +164,10 @@ fn xlnet_lm_model() -> anyhow::Result<()> { let input_tensor = Tensor::stack(tokenized_input.as_slice(), 0).to(device); // Forward pass - let perm_mask = Tensor::zeros(&[1, 4, 4], (Kind::Float, device)); + let perm_mask = Tensor::zeros([1, 4, 4], (Kind::Float, device)); let _ = perm_mask.narrow(2, 3, 1).fill_(1.0); - let target_mapping = Tensor::zeros(&[1, 1, 4], (Kind::Float, device)); + let target_mapping = Tensor::zeros([1, 1, 4], (Kind::Float, device)); let _ = target_mapping.narrow(2, 3, 1).fill_(1.0); let model_output = no_grad(|| { xlnet_model From 564ae85df0e16158cc3d0fdac9fc1a40bf3a69ed Mon Sep 17 00:00:00 2001 From: Guillaume Becquin Date: Sun, 14 May 2023 09:05:34 +0100 Subject: [PATCH 4/4] - Remove debugging print statement - Skip truncation of prompt for encoder-decoder models - Add right padding logic for encoder-decoder models --- src/pipelines/conversation.rs | 33 
+++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/src/pipelines/conversation.rs b/src/pipelines/conversation.rs index 9c943c66..6e026cc2 100644 --- a/src/pipelines/conversation.rs +++ b/src/pipelines/conversation.rs @@ -56,11 +56,11 @@ //! from the 3rd party utilization of the pretrained system. use crate::common::error::RustBertError; use crate::gpt2::GPT2Generator; -use crate::t5::T5Generator; use crate::pipelines::common::{ModelType, TokenizerOption}; use crate::pipelines::generation_utils::private_generation_utils::PrivateLanguageGenerator; use crate::pipelines::generation_utils::{GenerateConfig, LanguageGenerator}; use crate::resources::ResourceProvider; +use crate::t5::T5Generator; use std::collections::HashMap; use tch::{Device, Kind, Tensor}; use uuid::Uuid; @@ -737,9 +737,7 @@ impl ConversationOption { Self::GPT2(model_ref) => { Ok(*model_ref.get_eos_ids().as_ref().unwrap().first().unwrap()) } - Self::T5(model_ref) => { - Ok(*model_ref.get_eos_ids().as_ref().unwrap().first().unwrap()) - } + Self::T5(model_ref) => Ok(*model_ref.get_eos_ids().as_ref().unwrap().first().unwrap()), } } @@ -786,6 +784,14 @@ impl ConversationOption { .collect(), } } + + /// Interface method to get the model family (encoder-decoder or decoder) + fn is_encoder_decoder(&self) -> bool { + match *self { + Self::GPT2(ref generator) => generator.is_encoder_decoder(), + Self::T5(ref generator) => generator.is_encoder_decoder(), + } + } } /// # Conversation model @@ -925,8 +931,6 @@ impl ConversationModel { let mut output = HashMap::with_capacity(active_uuid.len()); - println!("generated: {:#?}, prompt_ids: {:#?}", &generated, &prompt_ids); - for ( ((conversation, (generated_sequence, conversation_promp_ids)), uuid), removed_padding, @@ -936,7 +940,11 @@ impl ConversationModel { .zip(active_uuid.into_iter()) .zip(removed_padding_quantities.into_iter()) { - let generated_response = &generated_sequence[input_length - removed_padding.0..]; + let generated_response = if self.model.is_encoder_decoder() { + generated_sequence.as_slice() + } else { + &generated_sequence[input_length - removed_padding.0..] + }; conversation .generated_responses .push( @@ -1044,9 +1052,14 @@ impl ConversationModel { .get(input_idx as i64) .slice(0, 0, (max_len - input.len()) as i64, 1) .fill_(0); - let mut padded_input = vec![pad_token; max_len - input.len()]; - padded_input.extend(input); - padded_input + let padding = vec![pad_token; max_len - input.len()]; + if self.model.is_encoder_decoder() { + // right padding assumed for encoder-decoders + [input, &padding].concat() + } else { + // left padding assumed for decoders + [&padding, input].concat() + } }) .map(|tokens| Tensor::of_slice(&tokens).to(self.device)) .collect::>();
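The padding and response-slicing changes in [PATCH 4/4] can be summarised with a small standalone sketch (illustration only, not part of the patch; the helper names are hypothetical and the removed-padding offset used by the real pipeline is omitted for brevity): decoder-only conversation models (the GPT2 variant) are left-padded and echo the prompt in their output, while encoder-decoder models (the T5 variant) are right-padded and emit only the response.

    // Standalone illustration of the padding and slicing behaviour above.
    // `pad_prompt` and `extract_response` are hypothetical helper names, not pipeline API.
    fn pad_prompt(input: &[i64], pad_token: i64, max_len: usize, is_encoder_decoder: bool) -> Vec<i64> {
        let padding = vec![pad_token; max_len - input.len()];
        if is_encoder_decoder {
            // Right padding: the encoder reads the prompt, padding trails it.
            [input, padding.as_slice()].concat()
        } else {
            // Left padding: the decoder continues the prompt, padding precedes it.
            [padding.as_slice(), input].concat()
        }
    }

    fn extract_response(generated: &[i64], prompt_len: usize, is_encoder_decoder: bool) -> Vec<i64> {
        if is_encoder_decoder {
            // The decoder output already contains only the response tokens.
            generated.to_vec()
        } else {
            // The decoder output repeats the prompt; drop it before storing the response.
            generated[prompt_len..].to_vec()
        }
    }

    fn main() {
        let prompt = [5i64, 6, 7];
        assert_eq!(pad_prompt(&prompt, 0, 5, false), vec![0, 0, 5, 6, 7]); // decoder-only
        assert_eq!(pad_prompt(&prompt, 0, 5, true), vec![5, 6, 7, 0, 0]); // encoder-decoder
        assert_eq!(extract_response(&[5, 6, 7, 8, 9], prompt.len(), false), vec![8, 9]);
        assert_eq!(extract_response(&[8, 9], prompt.len(), true), vec![8, 9]);
    }

In the actual pipeline the decoder-only slice additionally subtracts the amount of padding removed from each prompt, as shown in the hunk above; the sketch only captures the direction of padding and which part of the generated sequence is kept.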