diff --git a/.all-contributorsrc b/.all-contributorsrc
index 2b8e16bc..97d1589c 100644
--- a/.all-contributorsrc
+++ b/.all-contributorsrc
@@ -411,7 +411,18 @@
"avatar_url": "https://avatars1.githubusercontent.com/u/14850762?v=4",
"profile": "http://pablomarino.me",
"contributions": [
- "code"
+ "code",
+ "doc"
+ ]
+ },
+ {
+ "login": "strawberrypie",
+ "name": "Anton Kiselev",
+ "avatar_url": "https://avatars2.githubusercontent.com/u/29224443?v=4",
+ "profile": "http://linkedin.com/in/strawberrypie/",
+ "contributions": [
+ "code",
+ "doc"
]
}
],
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2f9d0d27..47804f4d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,11 +4,54 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
-## UNRELEASED
+## [0.47.0] - 2020-08-09
-- Removed blank string answer in Question Answering predictions
+### Added
+
+- Added support for testing models through a Streamlit app. Use the command `simple-viewer`. Currently supports:
+ - Classification (including multilabel)
+ - NER (design inspired by [displaCy Named Entity Visualizer](https://explosion.ai/demos/displacy-ent))
+ - QA
+
+
+## [0.46.5] - 2020-08-05
+
+### Changed
+
+- Python version requirement changed back to 3.6 for Colab support.
+- Miscellaneous bug fixes in 0.46.3 and 0.46.4
+
+## [0.46.2] - 2020-08-01
+
+### Fixed
+
+- Fixed unreachable condition in Electra language modeling.
+
+## [0.46.1] - 2020-08-01
+
+### Fixed
+
+- Bug in ConvAI models where cache_dir was not being created.
+
+## [0.46.0] - 2020-08-01
+
+### Changed
+
+- Uses PyTorch native AMP instead of Apex. [@strawberrypie](https://github.com/strawberrypie)
+
+## [0.45.5] - 2020-07-29
+
+### Fixed
+
+- Bug fixed in loading classification models with a `labels_map` where labels are ints.
+
+## [0.45.4] - 2020-07-28
+
+### Fixed
-## [0.45.2] - 2020-07-19
+- Bug fixed in lazy loading classification tasks where `lazy_text_column=0` caused an error.
+
+## [0.45.2] - 2020-07-25
### Added
@@ -1001,7 +1044,21 @@ Model checkpoint is now saved for all epochs again.
- This CHANGELOG file to hopefully serve as an evolving example of a standardized open source project CHANGELOG.
-[0.45.2]: https://github.com/ThilinaRajapakse/simpletransformers/compare/3e98361...HEAD
+[0.46.5]: https://github.com/ThilinaRajapakse/simpletransformers/compare/2cc77f7...HEAD
+
+[0.46.3]: https://github.com/ThilinaRajapakse/simpletransformers/compare/7f37cb7...2cc77f7
+
+[0.46.2]: https://github.com/ThilinaRajapakse/simpletransformers/compare/b64637c...7f37cb7
+
+[0.46.1]: https://github.com/ThilinaRajapakse/simpletransformers/compare/121cba4...b64637c
+
+[0.46.0]: https://github.com/ThilinaRajapakse/simpletransformers/compare/120d1e6...121cba4
+
+[0.45.5]: https://github.com/ThilinaRajapakse/simpletransformers/compare/0ac6b69...120d1e6
+
+[0.45.4]: https://github.com/ThilinaRajapakse/simpletransformers/compare/ac0f1a0...0ac6b69
+
+[0.45.2]: https://github.com/ThilinaRajapakse/simpletransformers/compare/3e98361...ac0f1a0
[0.45.0]: https://github.com/ThilinaRajapakse/simpletransformers/compare/fad190f...3e98361
diff --git a/README.md b/README.md
index 7ff915af..717fd6f1 100755
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) [![Downloads](https://pepy.tech/badge/simpletransformers)](https://pepy.tech/project/simpletransformers)
-[![All Contributors](https://img.shields.io/badge/all_contributors-44-orange.svg?style=flat-square)](#contributors-)
+[![All Contributors](https://img.shields.io/badge/all_contributors-45-orange.svg?style=flat-square)](#contributors-)
# Simple Transformers
@@ -210,10 +210,10 @@ Any feedback will be immensely helpful in improving the documentation! If you ha
- [Evaluating a Model](#evaluating-a-model)
- [Predicting from a trained Model](#predicting-from-a-trained-model)
- [Text Representation Generation](#text-representation-generation)
- - [Minimal example for generating word embeddings](#minimal-start-for-generating-word-embeddings)
- - [Minimal example for generating sentence embeddings](#minimal-start-for-generating-sentence-embeddings)
+ - [Minimal example for generating word embeddings](#minimal-example-for-generating-word-embeddings)
+ - [Minimal example for generating sentence embeddings](#minimal-example-for-generating-sentence-embeddings)
- [Regression](#regression)
- - [Minimal Start for Regression](#minimal-start-for-regression)
+ - [Minimal Start for Regression](#minimal-start-for-regression)
- [Visualization Support](#visualization-support)
- [Experimental Features](#experimental-features)
- [Sliding Window For Long Sequences](#sliding-window-for-long-sequences)
@@ -224,7 +224,6 @@ Any feedback will be immensely helpful in improving the documentation! If you ha
- [*cache_dir: str*](#cache_dir-str)
- [*best_model_dir: str*](#best_model_dir-str)
- [*fp16: bool*](#fp16-bool)
- - [*fp16_opt_level: str*](#fp16_opt_level-str)
- [*max_seq_length: int*](#max_seq_length-int)
- [*train_batch_size: int*](#train_batch_size-int)
- [*gradient_accumulation_steps: int*](#gradient_accumulation_steps-int)
@@ -263,6 +262,8 @@ Any feedback will be immensely helpful in improving the documentation! If you ha
- [*config*](#config)
- [Current Pretrained Models](#current-pretrained-models)
- [Acknowledgements](#acknowledgements)
+ - [How to Contribute](#how-to-contribute)
+ - [How to Update Docs](#how-to-update-docs)
- [Contributors ✨](#contributors-)
@@ -272,16 +273,14 @@ Any feedback will be immensely helpful in improving the documentation! If you ha
1. Install Anaconda or Miniconda Package Manager from [here](https://www.anaconda.com/distribution/)
2. Create a new virtual environment and install packages.
-`conda create -n transformers python pandas tqdm`
-`conda activate transformers`
+`conda create -n st python pandas tqdm`
+`conda activate st`
If using cuda:
- `conda install pytorch cudatoolkit=10.1 -c pytorch`
+ `conda install pytorch>=1.6 cudatoolkit=10.2 -c pytorch`
else:
`conda install pytorch cpuonly -c pytorch`
-3. Install Apex if you are using fp16 training. Please follow the instructions [here](https://github.com/NVIDIA/apex). (Installing Apex from pip has caused issues for several people.)
-
-4. Install simpletransformers.
+3. Install simpletransformers.
`pip install simpletransformers`
#### Optional
@@ -1479,7 +1478,7 @@ LanguageGenerationModel has a few additional attributes in its `args` dictionary
```python
"do_sample": True,
"prompt": "",
- "length": 20,
+ "max_length": 20,
"stop_token": None,
"temperature": 1.0,
"repetition_penalty": 1.0,
@@ -1646,6 +1645,14 @@ Note, you must set `evaluate_generated_text` to `True` to evaluate generated seq
import logging
import pandas as pd
+import sklearn
+
+from simpletransformers.classification import ClassificationModel
+from simpletransformers.classification.multi_modal_classification_model import \
+ MultiModalClassificationModel
+from simpletransformers.experimental.classification import ClassificationModel
+from simpletransformers.language_representation import RepresentationModel
+from simpletransformers.seq2seq import Seq2SeqModel
from simpletransformers.t5 import T5Model
logging.basicConfig(level=logging.INFO)
@@ -1954,10 +1961,7 @@ The prediction data should be a list of strings.
The `Seq2SeqModel` must be initialized with `encoder_decoder_type="bart"` and `encoder_decoder_name` set to a pre-trained model name or the path to a saved model directory.
```python
-import logging
-import pandas as pd
-from simpletransformers.seq2seq import Seq2SeqModel
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
@@ -2027,10 +2031,7 @@ The `Seq2SeqModel` must be initialized with `encoder_decoder_type="marian"` and
Everything else is identical to the Bart model usage.
```python
-import logging
-import pandas as pd
-from simpletransformers.seq2seq import Seq2SeqModel
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
@@ -2076,10 +2077,7 @@ for en, de in zip(src, predictions):
#### Generic Encoder-Decoder minimal start
```python
-import logging
-import pandas as pd
-from simpletransformers.seq2seq import Seq2SeqModel
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
@@ -2796,7 +2794,6 @@ If `label_list` is not given, `num_labels` is required and the labels should be
Create a `MultiModalClassificationModel`.
```python
-from simpletransformers.classification.multi_modal_classification_model import MultiModalClassificationModel
model = MultiModalClassificationModel("bert", "bert-base-uncased")
@@ -2924,15 +2921,14 @@ _[Back to Table of Contents](#table-of-contents)_
---
## [Text Representation Generation](#text-representation-generation)
-
-Use transformers language models to generate contextual word or sentence representations from text that you can then feed to any down-stream tasks of your preference.
-For more complete examples of how to use this component with downstream tasks refer to: https://github.com/ThilinaRajapakse/simpletransformers/tree/master/examples/language_representation
-
+
+Use transformers language models to generate contextual word or sentence representations from text that you can then feed to any down-stream tasks of your preference.
+For more complete examples of how to use this component with downstream tasks refer to: https://github.com/ThilinaRajapakse/simpletransformers/tree/master/examples/language_representation
+
### Minimal example for generating word embeddings
Generate a list of contextual word embeddings for every sentence in a list
```python
-from simpletransformers.language_representation import RepresentationModel
-
+
sentences = ["Example sentence 1", "Example sentence 2"]
model = RepresentationModel(
model_type="bert",
@@ -2942,11 +2938,10 @@ model = RepresentationModel(
word_vectors = model.encode_sentences(sentences, combine_strategy=None)
assert word_vectors.shape == (2, 5, 768) # token vector for every token in each sentence, bert based models add 2 tokens per sentence by default ([CLS] & [SEP])
```
-
-### Minimal example for generating sentence embeddings
+
+### Minimal example for generating sentence embeddings
Same code as for generating word embeddings, the only difference is that we pass the `combine_strategy="mean"` parameter
```python
-from simpletransformers.language_representation import RepresentationModel
sentences = ["Example sentence 1", "Example sentence 2"]
model = RepresentationModel(
model_type="bert",
@@ -2975,8 +2970,6 @@ Regression can be used with either single sentence or sentence pair tasks.
#### Minimal Start for Regression
```python
-from simpletransformers.classification import ClassificationModel
-import pandas as pd
train_data = [
@@ -3041,7 +3034,6 @@ _[Back to Table of Contents](#table-of-contents)_
To use experimental features, import from `simpletransformers.experimental.X`
```python
-from simpletransformers.experimental.classification import ClassificationModel
```
### Sliding Window For Long Sequences
@@ -3062,9 +3054,6 @@ Currently available on binary and multiclass classification models of the follow
Set `sliding_window` to `True` for the ClassificationModel to enable this feature.
```python
-from simpletransformers.classification import ClassificationModel
-import pandas as pd
-import sklearn
# Train and Evaluation data needs to be in a Pandas Dataframe of two columns. The first column is the text with type str, and the second column in the label with type int.
train_data = [['Example sentence belonging to class 1' * 50, 1], ['Example sentence belonging to class 0', 0], ['Example 2 sentence belonging to class 0', 0]] + [['Example sentence belonging to class 0', 0] for i in range(12)]
@@ -3132,7 +3121,6 @@ self.args = {
"best_model_dir": "outputs/best_model/",
"fp16": True,
- "fp16_opt_level": "O1",
"max_seq_length": 128,
"train_batch_size": 8,
"eval_batch_size": 8,
@@ -3192,10 +3180,7 @@ The directory where cached files will be saved.
The directory where the best model (model checkpoints) will be saved if evaluate_during_training is enabled and the training loop achieves a lowest evaluation loss calculated after every evaluate_during_training_steps, or an epoch.
#### *fp16: bool*
-Whether or not fp16 mode should be used. Requires NVidia Apex library.
-
-#### *fp16_opt_level: str*
-Can be '01', '02', '03'. See the [Apex docs](https://nvidia.github.io/apex/amp.html) for an explanation of the different optimization levels (opt_levels).
+Whether or not fp16 mode should be used.
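+
+With this change, fp16 uses PyTorch's native AMP (PyTorch 1.6+), so Apex is no longer needed. A minimal sketch of overriding the default (the model choice here is purely illustrative):
+
+```python
+from simpletransformers.classification import ClassificationModel
+
+# fp16 defaults to True; pass an args dict to turn it off
+model = ClassificationModel("roberta", "roberta-base", args={"fp16": False})
+```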
#### *max_seq_length: int*
Maximum sequence level the model will support.
@@ -3326,6 +3311,22 @@ None of this would have been possible without the hard work by the HuggingFace t
_
_
+## How to Contribute
+
+### How to Update Docs
+The latest version of the docs is hosted on [Github Pages](https://simpletransformers.ai/). If you want to help document Simple Transformers,
+follow the steps below to edit the docs.
+The docs are built with the [Jekyll](https://jekyllrb.com/) library; refer to the Jekyll website for a detailed explanation of how it works.
+1) **Install [Jekyll](https://jekyllrb.com/)**: Run the command `gem install bundler jekyll`
+2) **Visualize the docs on your local computer**:
+In your terminal, cd into the docs directory of this repo, e.g. `cd simpletransformers/docs`.
+From the docs directory, run this command to serve the Jekyll docs locally: `bundle exec jekyll serve`
+Browse to http://localhost:4000 (or whatever URL you see in the console) to view the docs.
+3) **Edit and visualize changes**:
+All the section pages of the docs are under the `docs/_docs` directory. You can edit any file, following the markdown format, and see the changes after refreshing the browser tab.
+
+**Note**: The docs in the README.md file will be deprecated and removed soon, so we don't recommend spending time on them.
+
## Contributors ✨
Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)):
@@ -3390,7 +3391,8 @@ Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/d
![](https://avatars1.githubusercontent.com/u/859916?v=4) taranais 💻 |
- ![](https://avatars1.githubusercontent.com/u/14850762?v=4) Pablo N. Marino 💻 |
+ ![](https://avatars1.githubusercontent.com/u/14850762?v=4) Pablo N. Marino 💻 📖 |
+ ![](https://avatars2.githubusercontent.com/u/29224443?v=4) Anton Kiselev 💻 📖 |
diff --git a/docs/_data/navigation.yml b/docs/_data/navigation.yml
index 414c6325..953d038e 100644
--- a/docs/_data/navigation.yml
+++ b/docs/_data/navigation.yml
@@ -107,6 +107,12 @@ docs:
url: /docs/seq2seq-data-formats/
- title: "Seq2Seq Minimal Start"
url: /docs/seq2seq-minimal-start/
+ - title: Text Representation Generation
+ children:
+ - title: "Text Representation Examples"
+ url: /docs/text-rep-examples/
+ - title: "Text Representation Model"
+ url: /docs/text-rep-model/
- title: Meta
children:
- title: "History"
diff --git a/docs/_docs/01-installation.md b/docs/_docs/01-installation.md
index 0af8635c..ab63b0f0 100644
--- a/docs/_docs/01-installation.md
+++ b/docs/_docs/01-installation.md
@@ -2,11 +2,11 @@
title: Installation
permalink: /docs/installation/
excerpt: "Instructions for installing the Simple Transformers library."
-last_modified_at: 2020-05-02 17:56:52
+last_modified_at: 2020/08/05 18:19:13
toc: true
---
-It's a good idea to always use virtual environments when working with Python packages.
+It's a good idea to always use virtual environments when working with Python packages.
Anaconda/Miniconda is a package manager that lets you create virtual environments and manage package installations smoothly.
Follow the instructions given below to install Simple Transformers using with Anaconda (or miniconda, a lighter version of anaconda).
@@ -16,27 +16,25 @@ Follow the instructions given below to install Simple Transformers using with An
1. Install Anaconda or Miniconda Package Manager from [here](https://www.anaconda.com/distribution/).
2. Create a new virtual environment and install packages.
```shell
- conda create -n transformers python pandas tqdm
- conda activate transformers
+ conda create -n st python pandas tqdm
+ conda activate st
```
-3. Using a CUDA capable GPU is recommended.
- To install Pytorch with CUDA support:
+3. Using a CUDA capable GPU is recommended.
+ To install Pytorch with CUDA support:
```shell
- conda install pytorch cudatoolkit=10.1 -c pytorch
+ conda install pytorch>=1.6 cudatoolkit=10.2 -c pytorch
```
- CPU only:
+ CPU only:
```shell
conda install pytorch cpuonly -c pytorch
```
-4. Install Apex to use FP16 training. Please follow the instructions [here](https://github.com/NVIDIA/apex). (Installing Apex from pip has caused issues for several people.)
-
-5. Install simpletransformers.
-`pip install simpletransformers`
+4. Install simpletransformers.
+`pip install simpletransformers`
## Optional
-1. Install Weights and Biases (wandb) for experiment tracking and visualizing training in a web browser.
+1. Install Weights and Biases (wandb) for experiment tracking and visualizing training in a web browser.
`pip install wandb`
---
diff --git a/docs/_docs/03-usage.md b/docs/_docs/03-usage.md
index 81c485e2..c78cf307 100644
--- a/docs/_docs/03-usage.md
+++ b/docs/_docs/03-usage.md
@@ -33,6 +33,8 @@ The currently implemented task-specific Simple Transformer models, along with th
| Question answering | `QuestionAnsweringModel` |
| Regression | `ClassificationModel` |
| Sentence-pair classification | `ClassificationModel` |
+| Text Representation Generation | `RepresentationModel` |
+
## Creating a Task-Specific Model
@@ -154,7 +156,6 @@ Configuration options in Simple Transformers are defined as either dataclasses o
| evaluate_during_training_steps | int | 2000 | Perform evaluation at every specified number of steps. A checkpoint model and the evaluation results will be saved. |
| evaluate_during_training_verbose | bool | False | Print results from evaluation during training. |
| fp16 | bool | True | Whether or not fp16 mode should be used. Requires NVidia Apex library. |
-| fp16_opt_level | str | O1 | Can be '01', '02', '03'. See the Apex docs for an explanation of the different optimization levels (opt_levels). |
| gradient_accumulation_steps | int | 1 | The number of training steps to execute before performing a optimizer.step(). Effectively increases the training batch size while sacrificing training time to lower memory consumption. |
| learning_rate | float | 4e-5 | The learning rate for training. |
| logging_steps | int | 50 | Log training loss and learning at every specified number of steps. |
diff --git a/docs/_docs/19-qa-minimal-start.md b/docs/_docs/19-qa-minimal-start.md
index 6f4e5585..bd9d1e9c 100644
--- a/docs/_docs/19-qa-minimal-start.md
+++ b/docs/_docs/19-qa-minimal-start.md
@@ -2,7 +2,7 @@
title: Question Answering Minimal Start
permalink: /docs/qa-minimal-start/
excerpt: "Minimal start for Question Answering tasks."
-last_modified_at: 2020-05-02 17:58:53
+last_modified_at: 2020/07/30 20:41:22
---
```python
@@ -143,7 +143,7 @@ to_predict = [
}
]
-answers = model.predict(to_predict)
+answers, probabilities = model.predict(to_predict)
print(answers)
diff --git a/docs/_docs/31-seq2seq-minimal-start.md b/docs/_docs/31-seq2seq-minimal-start.md
index eb2c0959..b080b826 100644
--- a/docs/_docs/31-seq2seq-minimal-start.md
+++ b/docs/_docs/31-seq2seq-minimal-start.md
@@ -2,7 +2,7 @@
title: Seq2Seq Minimal Start
permalink: /docs/seq2seq-minimal-start/
excerpt: "Minimal start for Seq2Seq."
-last_modified_at: 2020/07/23 23:54:29
+last_modified_at: 2020/08/05 21:34:49
---
## Generic Encoder-Decoder minimal start
@@ -220,4 +220,8 @@ for en, de in zip(src, predictions):
print(de)
print()
-```
\ No newline at end of file
+```
+
+## Guides
+
+- [BART for Paraphrasing with Simple Transformers](https://towardsdatascience.com/bart-for-paraphrasing-with-simple-transformers-7c9ea3dfdd8c?source=friends_link&sk=07420669325ac550f86b86bad362633c)
diff --git a/docs/_docs/32-text-rep-examples.md b/docs/_docs/32-text-rep-examples.md
new file mode 100644
index 00000000..7ce9d009
--- /dev/null
+++ b/docs/_docs/32-text-rep-examples.md
@@ -0,0 +1,36 @@
+---
+title: Text Representation Examples
+permalink: /docs/text-rep-examples/
+excerpt: "Text Representation Examples"
+last_modified_at: 2020/07/26 23:16:38
+toc: true
+---
+
+### Minimal example for generating word embeddings
+Generate a list of contextual word embeddings for every sentence in a list
+```python
+from simpletransformers.language_representation import RepresentationModel
+
+sentences = ["Example sentence 1", "Example sentence 2"]
+model = RepresentationModel(
+ model_type="bert",
+ model_name="bert-base-uncased",
+ use_cuda=False
+ )
+word_vectors = model.encode_sentences(sentences, combine_strategy=None)
+assert word_vectors.shape == (2, 5, 768) # token vector for every token in each sentence, bert based models add 2 tokens per sentence by default ([CLS] & [SEP])
+```
+
+### Minimal example for generating sentence embeddings
+Same code as for generating word embeddings, the only difference is that we pass the `combine_strategy="mean"` parameter.
+```python
+from simpletransformers.language_representation import RepresentationModel
+sentences = ["Example sentence 1", "Example sentence 2"]
+model = RepresentationModel(
+ model_type="bert",
+ model_name="bert-base-uncased",
+ use_cuda=False
+ )
+word_vectors = model.encode_sentences(sentences, combine_strategy="mean")
+assert word_vectors.shape == (2, 768) # one sentence embedding per sentence
+```
diff --git a/docs/_docs/33-text-rep-model.md b/docs/_docs/33-text-rep-model.md
new file mode 100644
index 00000000..503b706a
--- /dev/null
+++ b/docs/_docs/33-text-rep-model.md
@@ -0,0 +1,128 @@
+---
+title: Text Representation Model
+permalink: /docs/text-rep-model/
+excerpt: "Text Representation Model"
+last_modified_at: 2020/07/26 23:16:38
+toc: true
+---
+
+
+
+## `RepresentationModel`
+
+The `RepresentationModel` class is used for generating (contextual) word or sentence embeddings from a list of text sentences.
+You can then feed these vectors to any model or downstream task.
+
+To create a `RepresentationModel`, you must specify a `model_type` and a `model_name`.
+
+- `model_type` should be one of the currently supported model types: bert, roberta, gpt2
+- `model_name` specifies the exact architecture and trained weights to use. This may be a Hugging Face Transformers compatible pre-trained model, a community model, or the path to a directory containing model files.
+
+ **Note:** For a list of standard pre-trained models, see [here](https://huggingface.co/transformers/pretrained_models.html).
+ {: .notice--info}
+
+ **Note:** For a list of community models, see [here](https://huggingface.co/models).
+ {: .notice--info}
+
+ You may use any of these models provided the `model_type` is supported.
+
+```python
+from simpletransformers.language_representation import RepresentationModel
+
+model = RepresentationModel(
+ "roberta", "roberta-base"
+)
+```
+
+**Note:** For more information on working with Simple Transformers models, please refer to the [General Usage section](/docs/usage/#creating-a-task-specific-model).
+{: .notice--info}
+
+
+### Configuring a `RepresentationModel`
+
+
+```python
+from simpletransformers.language_representation import RepresentationModel
+from simpletransformers.config.model_args import ModelArgs
+
+model_args = ModelArgs(max_seq_length=156)
+
+model = RepresentationModel(
+ "roberta",
+ "roberta-base",
+ args=model_args,
+)
+```
+
+**Note:** For configuration options common to all Simple Transformers models, please refer to the [Configuring a Simple Transformers Model section](/docs/usage/#configuring-a-simple-transformers-model).
+{: .notice--info}
+
+
+## `Class RepresentationModel`
+
+> *simpletransformers.language_representation.RepresentationModel*{: .function-name}(self, model_type, model_name, args=None, use_cuda=True, cuda_device=-1, **kwargs,)
+
+Initializes a RepresentationModel model.
+{: .function-text}
+
+> Parameters
+{: .parameter-blockquote}
+
+* **model_type** *(`str`)* - The type of model to use. Currently supported: bert, roberta, gpt2
+
+* **model_name** *(`str`)* - The exact architecture and trained weights to use. This may be a Hugging Face Transformers compatible pre-trained model, a community model, or the path to a directory containing model files.
+
+* **args** *(`dict`, optional)* - [Default args](/docs/usage/#configuring-a-simple-transformers-model) will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
+
+* **use_cuda** *(`bool`, optional)* - Use GPU if available. Setting to False will force model to use CPU only. (See [here](/docs/usage/#to-cuda-or-not-to-cuda))
+
+* **cuda_device** *(`int`, optional)* - Specific GPU that should be used. Will use the first available GPU by default. (See [here](/docs/usage/#selecting-a-cuda-device))
+
+* **kwargs** *(optional)* - For providing proxies, force_download, resume_download, cache_dir and other options specific to the 'from_pretrained' implementation where this will be supplied. (See [here](/docs/usage/#options-for-downloading-pre-trained-models))
+{: .parameter-list}
+
+> Returns
+{: .returns}
+
+* `None`
+{: .return-list}
+
+
+**Note:** For configuration options common to all Simple Transformers models, please refer to the [Configuring a Simple Transformers Model section](/docs/usage/#configuring-a-simple-transformers-model).
+{: .notice--info}
+
+
+
+## Generating contextual word embeddings from text with a `RepresentationModel`
+
+The `encode_sentences()` method is used to create word embeddings with the model.
+
+```python
+sentence_list = ["Natural language processing (NLP) is a subfield of linguistics, computer science, information engineering, and artificial intelligence concerned with the interactions between computers and human (natural) languages", "hi", "I am happy"]
+word_embeddings = model.encode_sentences(sentence_list, combine_strategy="mean")
+```
+
+**Note:** The input **must** be a List even if there is only one sentence.
+{: .notice--info}
+
+
+> *simpletransformers.language_representation.RepresentationModel.encode_sentences*{: .function-name}(text_list, combine_strategy=None, batch_size=32)
+
+Generates a list of contextual word or sentence embeddings using the model passed to the class constructor.
+{: .function-text}
+
+> Parameters
+{: .parameter-blockquote}
+
+* **text_list** - list of text sentences.
+
+* **combine_strategy** - strategy for combining word vectors, supported values: None, "mean", "concat".
+
+* **batch_size** - size of the batches of sentences fed to the model.
+{: .parameter-list}
+
+> Returns
+{: .returns}
+
+* **answer_list** *(`list`)* - list of lists of word embeddings (if `combine_strategy=None`) OR list of sentence embeddings (if `combine_strategy` is not None).
+{: .return-list}
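+
+For reference, a minimal sketch of how the returned shapes differ with `combine_strategy` (mirroring the examples on the [Text Representation Examples](/docs/text-rep-examples/) page; the exact middle dimension depends on the tokenizer):
+
+```python
+from simpletransformers.language_representation import RepresentationModel
+
+sentences = ["Example sentence 1", "Example sentence 2"]
+model = RepresentationModel(
+    model_type="bert",
+    model_name="bert-base-uncased",
+    use_cuda=False,
+)
+
+# combine_strategy=None keeps one vector per token (including [CLS] and [SEP])
+word_vectors = model.encode_sentences(sentences, combine_strategy=None)
+print(word_vectors.shape)  # (2, 5, 768) for these example sentences
+
+# combine_strategy="mean" averages the token vectors into one vector per sentence
+sentence_vectors = model.encode_sentences(sentences, combine_strategy="mean")
+print(sentence_vectors.shape)  # (2, 768)
+```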
diff --git a/docs/_docs/51-docs-old.md b/docs/_docs/51-docs-old.md
index c6a9e8d1..f4028758 100644
--- a/docs/_docs/51-docs-old.md
+++ b/docs/_docs/51-docs-old.md
@@ -145,7 +145,6 @@ ELECTRA models can now be used with Language Model Training, Token Classificatio
- [*cache_dir: str*](#cachedir-str)
- [*best_model_dir: str*](#bestmodeldir-str)
- [*fp16: bool*](#fp16-bool)
- - [*fp16_opt_level: str*](#fp16optlevel-str)
- [*max_seq_length: int*](#maxseqlength-int)
- [*train_batch_size: int*](#trainbatchsize-int)
- [*gradient_accumulation_steps: int*](#gradientaccumulationsteps-int)
@@ -200,9 +199,7 @@ If using cuda:
else:
`conda install pytorch cpuonly -c pytorch`
-3. Install Apex if you are using fp16 training. Please follow the instructions [here](https://github.com/NVIDIA/apex). (Installing Apex from pip has caused issues for several people.)
-
-4. Install simpletransformers.
+3. Install simpletransformers.
`pip install simpletransformers`
#### Optional
@@ -2145,7 +2142,6 @@ self.args = {
"best_model_dir": "outputs/best_model/",
"fp16": True,
- "fp16_opt_level": "O1",
"max_seq_length": 128,
"train_batch_size": 8,
"eval_batch_size": 8,
@@ -2207,9 +2203,6 @@ The directory where the best model (model checkpoints) will be saved if evaluate
#### *fp16: bool*
Whether or not fp16 mode should be used. Requires NVidia Apex library.
-#### *fp16_opt_level: str*
-Can be '01', '02', '03'. See the [Apex docs](https://nvidia.github.io/apex/amp.html) for an explanation of the different optimization levels (opt_levels).
-
#### *max_seq_length: int*
Maximum sequence level the model will support.
diff --git a/docs/index.html b/docs/index.html
index 13b7adca..f8e3f4ea 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -19,6 +19,7 @@
Language Generation
Multi-Modal Classification
Conversational AI
+ Text Representation Generation
feature_row:
- title: "Simple but Powerful"
excerpt: "Get started with 3 lines of code, or configure every detail."
diff --git a/examples/hyperparameter tuning/extended-tuning/data_prep.py b/examples/hyperparameter tuning/extended-tuning/data_prep.py
index b829d8a1..cb561022 100644
--- a/examples/hyperparameter tuning/extended-tuning/data_prep.py
+++ b/examples/hyperparameter tuning/extended-tuning/data_prep.py
@@ -2,7 +2,6 @@
from utils import load_rte_data_file
-
# Preparing train data
train_df = load_rte_data_file("data/train.jsonl")
eval_df = load_rte_data_file("data/val.jsonl")
diff --git a/examples/hyperparameter tuning/extended-tuning/sweep_layerwise.py b/examples/hyperparameter tuning/extended-tuning/sweep_layerwise.py
index c5e7f02e..b0ef1403 100644
--- a/examples/hyperparameter tuning/extended-tuning/sweep_layerwise.py
+++ b/examples/hyperparameter tuning/extended-tuning/sweep_layerwise.py
@@ -2,9 +2,9 @@
from statistics import mean
import pandas as pd
+import wandb
from sklearn.metrics import accuracy_score
-import wandb
from simpletransformers.classification import ClassificationArgs, ClassificationModel
from utils import load_rte_data_file
diff --git a/examples/hyperparameter tuning/extended-tuning/sweep_vanilla.py b/examples/hyperparameter tuning/extended-tuning/sweep_vanilla.py
index a7af8524..9025fbce 100644
--- a/examples/hyperparameter tuning/extended-tuning/sweep_vanilla.py
+++ b/examples/hyperparameter tuning/extended-tuning/sweep_vanilla.py
@@ -2,10 +2,10 @@
from statistics import mean, mode
import pandas as pd
+import wandb
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
-import wandb
from simpletransformers.classification import ClassificationArgs, ClassificationModel
from utils import load_rte_data_file
diff --git a/examples/hyperparameter tuning/extended-tuning/train_default.py b/examples/hyperparameter tuning/extended-tuning/train_default.py
index be455887..b25c4d4c 100644
--- a/examples/hyperparameter tuning/extended-tuning/train_default.py
+++ b/examples/hyperparameter tuning/extended-tuning/train_default.py
@@ -2,13 +2,12 @@
from statistics import mean
import pandas as pd
+import wandb
from sklearn.metrics import accuracy_score
-import wandb
from simpletransformers.classification import ClassificationArgs, ClassificationModel
from utils import load_rte_data_file
-
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
diff --git a/examples/hyperparameter tuning/extended-tuning/train_layerwise.py b/examples/hyperparameter tuning/extended-tuning/train_layerwise.py
index 280b50e9..ead736ad 100644
--- a/examples/hyperparameter tuning/extended-tuning/train_layerwise.py
+++ b/examples/hyperparameter tuning/extended-tuning/train_layerwise.py
@@ -3,10 +3,10 @@
import pandas as pd
import prettyprinter
+import wandb
from prettyprinter import pprint
from sklearn.metrics import accuracy_score
-import wandb
from simpletransformers.classification import ClassificationArgs, ClassificationModel
from utils import load_rte_data_file
diff --git a/examples/hyperparameter tuning/extended-tuning/train_vanilla.py b/examples/hyperparameter tuning/extended-tuning/train_vanilla.py
index 667188bd..ddf36e9e 100644
--- a/examples/hyperparameter tuning/extended-tuning/train_vanilla.py
+++ b/examples/hyperparameter tuning/extended-tuning/train_vanilla.py
@@ -3,10 +3,10 @@
import pandas as pd
import prettyprinter
+import wandb
from prettyprinter import pprint
from sklearn.metrics import accuracy_score, f1_score
-import wandb
from simpletransformers.classification import ClassificationArgs, ClassificationModel
from utils import load_rte_data_file
diff --git a/examples/hyperparameter tuning/sweeps.py b/examples/hyperparameter tuning/sweeps.py
index 0eef6e1b..93b32c24 100644
--- a/examples/hyperparameter tuning/sweeps.py
+++ b/examples/hyperparameter tuning/sweeps.py
@@ -2,12 +2,9 @@
import pandas as pd
import sklearn
-
import wandb
-from simpletransformers.classification import (
- ClassificationArgs,
- ClassificationModel,
-)
+
+from simpletransformers.classification import ClassificationArgs, ClassificationModel
sweep_config = {
"method": "bayes", # grid, random
diff --git a/examples/language_generation/data_prep.py b/examples/language_generation/data_prep.py
index a55a7cbe..26fd9e6e 100644
--- a/examples/language_generation/data_prep.py
+++ b/examples/language_generation/data_prep.py
@@ -1,6 +1,5 @@
import pandas as pd
-
df = pd.read_csv("data/cs.AI.tsv", sep="\t")
abstracts = df["abstract"].tolist()
diff --git a/examples/language_generation/fine_tune.py b/examples/language_generation/fine_tune.py
index ad0f9dd1..e4a511d2 100644
--- a/examples/language_generation/fine_tune.py
+++ b/examples/language_generation/fine_tune.py
@@ -1,6 +1,6 @@
-from simpletransformers.language_modeling import LanguageModelingModel
import logging
+from simpletransformers.language_modeling import LanguageModelingModel
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
diff --git a/examples/language_generation/generate.py b/examples/language_generation/generate.py
index 0135f7cc..1fcb5800 100644
--- a/examples/language_generation/generate.py
+++ b/examples/language_generation/generate.py
@@ -1,6 +1,6 @@
import logging
-from simpletransformers.language_generation import LanguageGenerationModel
+from simpletransformers.language_generation import LanguageGenerationModel
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
diff --git a/examples/language_generation/train_new_lm.py b/examples/language_generation/train_new_lm.py
index 28daf0a3..5b624513 100644
--- a/examples/language_generation/train_new_lm.py
+++ b/examples/language_generation/train_new_lm.py
@@ -1,7 +1,7 @@
-from simpletransformers.language_modeling import LanguageModelingModel
-import logging
import argparse
+import logging
+from simpletransformers.language_modeling import LanguageModelingModel
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
diff --git a/examples/language_representation/binary_classification_dummy.py b/examples/language_representation/binary_classification_dummy.py
index 1ceba9fb..4b979c0f 100644
--- a/examples/language_representation/binary_classification_dummy.py
+++ b/examples/language_representation/binary_classification_dummy.py
@@ -1,6 +1,7 @@
import pandas as pd
-from sklearn.metrics import classification_report
from sklearn.linear_model import RidgeClassifier
+from sklearn.metrics import classification_report
+
from simpletransformers.language_representation import RepresentationModel
train_data = [["Example sentence belonging to class 1", 1], ["Example sentence belonging to class 0", 0]]
diff --git a/examples/language_representation/classification_yelp_polarity/classification_yelp.py b/examples/language_representation/classification_yelp_polarity/classification_yelp.py
index 6fc7f394..a64bc964 100644
--- a/examples/language_representation/classification_yelp_polarity/classification_yelp.py
+++ b/examples/language_representation/classification_yelp_polarity/classification_yelp.py
@@ -1,8 +1,10 @@
+from os.path import dirname, join
+
import pandas as pd
-from sklearn.metrics import classification_report
from sklearn.linear_model import RidgeClassifier
+from sklearn.metrics import classification_report
+
from simpletransformers.language_representation import RepresentationModel
-from os.path import dirname, join
project_root = dirname(dirname(dirname(dirname(__file__)))) # path to root of the project
diff --git a/examples/named_entity_recognition/named_entity_recognition.py b/examples/named_entity_recognition/named_entity_recognition.py
index 2fb31425..7ef88c3e 100644
--- a/examples/named_entity_recognition/named_entity_recognition.py
+++ b/examples/named_entity_recognition/named_entity_recognition.py
@@ -1,8 +1,9 @@
-import pandas as pd
-from simpletransformers.ner import NERModel
import numpy as np
+import pandas as pd
from scipy.special import softmax
+from simpletransformers.ner import NERModel
+
# Creating train_df and eval_df for demonstration
train_data = [
[0, "Simple", "B-MISC"],
diff --git a/examples/seq2seq/minimal_seq2seq.py b/examples/seq2seq/minimal_seq2seq.py
index cd7519f7..149df22f 100644
--- a/examples/seq2seq/minimal_seq2seq.py
+++ b/examples/seq2seq/minimal_seq2seq.py
@@ -1,6 +1,7 @@
import logging
import pandas as pd
+
from simpletransformers.seq2seq import Seq2SeqModel
logging.basicConfig(level=logging.INFO)
diff --git a/examples/seq2seq/paraphrasing/data_download.sh b/examples/seq2seq/paraphrasing/data_download.sh
new file mode 100755
index 00000000..45604e8d
--- /dev/null
+++ b/examples/seq2seq/paraphrasing/data_download.sh
@@ -0,0 +1,7 @@
+mkdir data
+wget https://storage.googleapis.com/paws/english/paws_wiki_labeled_final.tar.gz -P data
+tar -xvf data/paws_wiki_labeled_final.tar.gz -C data
+mv data/final/* data
+rm -r data/final
+
+wget http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv -P data
diff --git a/examples/seq2seq/paraphrasing/predict.py b/examples/seq2seq/paraphrasing/predict.py
new file mode 100644
index 00000000..f0511c55
--- /dev/null
+++ b/examples/seq2seq/paraphrasing/predict.py
@@ -0,0 +1,30 @@
+import logging
+
+from simpletransformers.seq2seq import Seq2SeqModel
+
+
+logging.basicConfig(level=logging.INFO)
+transformers_logger = logging.getLogger("transformers")
+transformers_logger.setLevel(logging.ERROR)
+
+model = Seq2SeqModel(
+ encoder_decoder_type="bart", encoder_decoder_name="outputs"
+)
+
+
+while True:
+ original = input("Enter text to paraphrase: ")
+ to_predict = [original]
+
+ preds = model.predict(to_predict)
+
+ print("---------------------------------------------------------")
+ print(original)
+
+ print()
+ print("Predictions >>>")
+ for pred in preds[0]:
+ print(pred)
+
+ print("---------------------------------------------------------")
+ print()
diff --git a/examples/seq2seq/paraphrasing/readme.md b/examples/seq2seq/paraphrasing/readme.md
new file mode 100644
index 00000000..17d501c4
--- /dev/null
+++ b/examples/seq2seq/paraphrasing/readme.md
@@ -0,0 +1 @@
+Code for the Medium Article [here](https://towardsdatascience.com/bart-for-paraphrasing-with-simple-transformers-7c9ea3dfdd8c?source=friends_link&sk=07420669325ac550f86b86bad362633c).
\ No newline at end of file
diff --git a/examples/seq2seq/paraphrasing/train.py b/examples/seq2seq/paraphrasing/train.py
new file mode 100644
index 00000000..1a6410a2
--- /dev/null
+++ b/examples/seq2seq/paraphrasing/train.py
@@ -0,0 +1,141 @@
+import os
+from datetime import datetime
+import logging
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from simpletransformers.seq2seq import Seq2SeqModel, Seq2SeqArgs
+
+from utils import load_data, clean_unnecessary_spaces
+
+
+logging.basicConfig(level=logging.INFO)
+transformers_logger = logging.getLogger("transformers")
+transformers_logger.setLevel(logging.ERROR)
+
+# Google Data
+train_df = pd.read_csv("data/train.tsv", sep="\t").astype(str)
+eval_df = pd.read_csv("data/dev.tsv", sep="\t").astype(str)
+
+train_df = train_df.loc[train_df["label"] == "1"]
+eval_df = eval_df.loc[eval_df["label"] == "1"]
+
+train_df = train_df.rename(
+ columns={"sentence1": "input_text", "sentence2": "target_text"}
+)
+eval_df = eval_df.rename(
+ columns={"sentence1": "input_text", "sentence2": "target_text"}
+)
+
+train_df = train_df[["input_text", "target_text"]]
+eval_df = eval_df[["input_text", "target_text"]]
+
+train_df["prefix"] = "paraphrase"
+eval_df["prefix"] = "paraphrase"
+
+# MSRP Data
+train_df = pd.concat(
+ [
+ train_df,
+ load_data("data/msr_paraphrase_train.txt", "#1 String", "#2 String", "Quality"),
+ ]
+)
+eval_df = pd.concat(
+ [
+ eval_df,
+ load_data("data/msr_paraphrase_test.txt", "#1 String", "#2 String", "Quality"),
+ ]
+)
+
+# Quora Data
+
+# The Quora Dataset is not separated into train/test, so we do it manually the first time.
+df = load_data(
+ "data/quora_duplicate_questions.tsv", "question1", "question2", "is_duplicate"
+)
+q_train, q_test = train_test_split(df)
+
+q_train.to_csv("data/quora_train.tsv", sep="\t")
+q_test.to_csv("data/quora_test.tsv", sep="\t")
+
+# The code block above only needs to be run once.
+# After that, the two lines below are sufficient to load the Quora dataset.
+
+# q_train = pd.read_csv("data/quora_train.tsv", sep="\t")
+# q_test = pd.read_csv("data/quora_test.tsv", sep="\t")
+
+train_df = pd.concat([train_df, q_train])
+eval_df = pd.concat([eval_df, q_test])
+
+train_df = train_df[["prefix", "input_text", "target_text"]]
+eval_df = eval_df[["prefix", "input_text", "target_text"]]
+
+train_df = train_df.dropna()
+eval_df = eval_df.dropna()
+
+train_df["input_text"] = train_df["input_text"].apply(clean_unnecessary_spaces)
+train_df["target_text"] = train_df["target_text"].apply(clean_unnecessary_spaces)
+
+eval_df["input_text"] = eval_df["input_text"].apply(clean_unnecessary_spaces)
+eval_df["target_text"] = eval_df["target_text"].apply(clean_unnecessary_spaces)
+
+print(train_df)
+
+model_args = Seq2SeqArgs()
+model_args.eval_batch_size = 64
+model_args.evaluate_during_training = True
+model_args.evaluate_during_training_steps = 2500
+model_args.evaluate_during_training_verbose = True
+model_args.fp16 = False
+model_args.learning_rate = 5e-5
+model_args.max_seq_length = 128
+model_args.num_train_epochs = 2
+model_args.overwrite_output_dir = True
+model_args.reprocess_input_data = True
+model_args.save_eval_checkpoints = False
+model_args.save_steps = -1
+model_args.train_batch_size = 8
+model_args.use_multiprocessing = False
+
+model_args.do_sample = True
+model_args.num_beams = None
+model_args.num_return_sequences = 3
+model_args.max_length = 128
+model_args.top_k = 50
+model_args.top_p = 0.95
+
+model_args.wandb_project = "Paraphrasing with BART"
+
+
+model = Seq2SeqModel(
+ encoder_decoder_type="bart",
+ encoder_decoder_name="facebook/bart-large",
+ args=model_args,
+)
+
+model.train_model(train_df, eval_data=eval_df)
+
+to_predict = [
+ prefix + ": " + str(input_text)
+ for prefix, input_text in zip(eval_df["prefix"].tolist(), eval_df["input_text"].tolist())
+]
+truth = eval_df["target_text"].tolist()
+
+preds = model.predict(to_predict)
+
+# Saving the predictions if needed
+os.makedirs("predictions", exist_ok=True)
+
+with open(f"predictions/predictions_{datetime.now()}.txt", "w") as f:
+ for i, text in enumerate(eval_df["input_text"].tolist()):
+ f.write(str(text) + "\n\n")
+
+ f.write("Truth:\n")
+ f.write(truth[i] + "\n\n")
+
+ f.write("Prediction:\n")
+ for pred in preds[i]:
+ f.write(str(pred) + "\n")
+ f.write(
+ "________________________________________________________________________________\n"
+ )
diff --git a/examples/seq2seq/paraphrasing/utils.py b/examples/seq2seq/paraphrasing/utils.py
new file mode 100644
index 00000000..37eba105
--- /dev/null
+++ b/examples/seq2seq/paraphrasing/utils.py
@@ -0,0 +1,36 @@
+import warnings
+
+import pandas as pd
+
+
+def load_data(
+ file_path, input_text_column, target_text_column, label_column, keep_label=1
+):
+ df = pd.read_csv(file_path, sep="\t", error_bad_lines=False)
+ df = df.loc[df[label_column] == keep_label]
+ df = df.rename(
+ columns={input_text_column: "input_text", target_text_column: "target_text"}
+ )
+ df = df[["input_text", "target_text"]]
+ df["prefix"] = "paraphrase"
+
+ return df
+
+
+def clean_unnecessary_spaces(out_string):
+ if not isinstance(out_string, str):
+ warnings.warn(f">>> {out_string} <<< is not a string.")
+ out_string = str(out_string)
+ out_string = (
+ out_string.replace(" .", ".")
+ .replace(" ?", "?")
+ .replace(" !", "!")
+ .replace(" ,", ",")
+ .replace(" ' ", "'")
+ .replace(" n't", "n't")
+ .replace(" 'm", "'m")
+ .replace(" 's", "'s")
+ .replace(" 've", "'ve")
+ .replace(" 're", "'re")
+ )
+ return out_string
diff --git a/examples/t5/mixed_tasks/test.py b/examples/t5/mixed_tasks/test.py
index a7bf06e7..d2808bec 100644
--- a/examples/t5/mixed_tasks/test.py
+++ b/examples/t5/mixed_tasks/test.py
@@ -6,10 +6,11 @@
import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr
-from simpletransformers.t5 import T5Model
from sklearn.metrics import accuracy_score, f1_score
from transformers.data.metrics.squad_metrics import compute_exact, compute_f1
+from simpletransformers.t5 import T5Model
+
def f1(truths, preds):
return mean([compute_f1(truth, pred) for truth, pred in zip(truths, preds)])
diff --git a/examples/t5/mixed_tasks/train.py b/examples/t5/mixed_tasks/train.py
index ed9170bb..be94f972 100644
--- a/examples/t5/mixed_tasks/train.py
+++ b/examples/t5/mixed_tasks/train.py
@@ -1,6 +1,6 @@
import pandas as pd
-from simpletransformers.t5 import T5Model
+from simpletransformers.t5 import T5Model
train_df = pd.read_csv("data/train.tsv", sep="\t").astype(str)
eval_df = pd.read_csv("data/eval.tsv", sep="\t").astype(str)
diff --git a/examples/t5/training_on_a_new_task/data_prep.py b/examples/t5/training_on_a_new_task/data_prep.py
index e64411ae..eaae0f4d 100644
--- a/examples/t5/training_on_a_new_task/data_prep.py
+++ b/examples/t5/training_on_a_new_task/data_prep.py
@@ -1,7 +1,8 @@
-import pandas as pd
import gzip
-from sklearn.model_selection import train_test_split
import os
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
diff --git a/examples/t5/training_on_a_new_task/predict.py b/examples/t5/training_on_a_new_task/predict.py
index fc3fec79..73b1a134 100644
--- a/examples/t5/training_on_a_new_task/predict.py
+++ b/examples/t5/training_on_a_new_task/predict.py
@@ -1,6 +1,5 @@
from simpletransformers.t5 import T5Model
-
model_args = {
"reprocess_input_data": True,
"overwrite_output_dir": True,
diff --git a/examples/t5/training_on_a_new_task/test.py b/examples/t5/training_on_a_new_task/test.py
index 12fa1b01..0631989c 100644
--- a/examples/t5/training_on_a_new_task/test.py
+++ b/examples/t5/training_on_a_new_task/test.py
@@ -1,7 +1,8 @@
-from simpletransformers.t5 import T5Model
-import pandas as pd
from pprint import pprint
+import pandas as pd
+
+from simpletransformers.t5 import T5Model
model_args = {
"reprocess_input_data": True,
diff --git a/examples/t5/training_on_a_new_task/train.py b/examples/t5/training_on_a_new_task/train.py
index 824382c4..8cd0656f 100644
--- a/examples/t5/training_on_a_new_task/train.py
+++ b/examples/t5/training_on_a_new_task/train.py
@@ -2,7 +2,6 @@
from simpletransformers.t5 import T5Model
-
train_df = pd.read_csv("data/train_df.tsv", sep="\t").astype(str)
eval_df = pd.read_csv("data/eval_df.tsv", sep="\t").astype(str)
diff --git a/examples/text_classification/binary_classification.py b/examples/text_classification/binary_classification.py
index b6d3e434..6dba5836 100644
--- a/examples/text_classification/binary_classification.py
+++ b/examples/text_classification/binary_classification.py
@@ -1,4 +1,5 @@
import pandas as pd
+
from simpletransformers.classification import ClassificationModel
# Train and Evaluation data needs to be in a Pandas Dataframe of two columns. The first column is the text with type str, and the second column is the label with type int.
diff --git a/examples/text_classification/lazy_loading_regression.py b/examples/text_classification/lazy_loading_regression.py
index b5ef1a4c..a295b2f3 100644
--- a/examples/text_classification/lazy_loading_regression.py
+++ b/examples/text_classification/lazy_loading_regression.py
@@ -1,8 +1,8 @@
import os
import pandas as pd
-from simpletransformers.classification import ClassificationModel
+from simpletransformers.classification import ClassificationModel
train_data = [
["Example sentence belonging to class 1", "Yep, this is 1", 0.8],
diff --git a/examples/text_classification/multiclass_classification.py b/examples/text_classification/multiclass_classification.py
index 04f44d68..4e068044 100644
--- a/examples/text_classification/multiclass_classification.py
+++ b/examples/text_classification/multiclass_classification.py
@@ -1,4 +1,5 @@
import pandas as pd
+
from simpletransformers.classification import ClassificationModel
# Train and Evaluation data needs to be in a Pandas Dataframe containing at least two columns. If the Dataframe has a header, it should contain a 'text' and a 'labels' column. If no header is present, the Dataframe should contain at least two columns, with the first column is the text with type str, and the second column in the label with type int.
diff --git a/examples/text_classification/multilabel_classification.py b/examples/text_classification/multilabel_classification.py
index ecd7a1fd..19df3c85 100644
--- a/examples/text_classification/multilabel_classification.py
+++ b/examples/text_classification/multilabel_classification.py
@@ -1,4 +1,5 @@
import pandas as pd
+
from simpletransformers.classification import MultiLabelClassificationModel
# Train and Evaluation data needs to be in a Pandas Dataframe containing at least two columns, a 'text' and a 'labels' column. The `labels` column should contain multi-hot encoded lists.
diff --git a/examples/text_classification/yelp_reviews_polarity/train.py b/examples/text_classification/yelp_reviews_polarity/train.py
index 2bc25f12..b258f8f1 100644
--- a/examples/text_classification/yelp_reviews_polarity/train.py
+++ b/examples/text_classification/yelp_reviews_polarity/train.py
@@ -1,8 +1,8 @@
import sys
import pandas as pd
-from simpletransformers.classification import ClassificationModel
+from simpletransformers.classification import ClassificationModel
prefix = "data/"
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 25078563..bda73495 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -6,4 +6,4 @@ pytest==5.1.3
# lint/format/types
black==19.10b0
flake8==3.7.8
-pytype==2019.7.11
\ No newline at end of file
+pytype==2019.7.11
diff --git a/setup.py b/setup.py
index 756167d6..429d9c3f 100755
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
setup(
name="simpletransformers",
- version="0.45.3",
+ version="0.47.0",
author="Thilina Rajapakse",
author_email="chaturangarajapakshe@gmail.com",
description="An easy-to-use wrapper library for the Transformers library.",
@@ -34,5 +34,6 @@
"pandas",
"tokenizers",
"wandb",
+ "streamlit",
],
)
diff --git a/simpletransformers/classification/__init__.py b/simpletransformers/classification/__init__.py
index f2a5c4f8..acb54ae3 100755
--- a/simpletransformers/classification/__init__.py
+++ b/simpletransformers/classification/__init__.py
@@ -1,6 +1,8 @@
from simpletransformers.classification.classification_model import ClassificationModel
from simpletransformers.classification.multi_label_classification_model import MultiLabelClassificationModel
from simpletransformers.classification.multi_modal_classification_model import MultiModalClassificationModel
-from simpletransformers.config.model_args import ClassificationArgs
-from simpletransformers.config.model_args import MultiLabelClassificationArgs
-from simpletransformers.config.model_args import MultiModalClassificationArgs
+from simpletransformers.config.model_args import (
+ ClassificationArgs,
+ MultiLabelClassificationArgs,
+ MultiModalClassificationArgs,
+)
diff --git a/simpletransformers/classification/classification_model.py b/simpletransformers/classification/classification_model.py
index 67758c4b..c36f6860 100755
--- a/simpletransformers/classification/classification_model.py
+++ b/simpletransformers/classification/classification_model.py
@@ -10,10 +10,12 @@
import os
import random
import warnings
-from multiprocessing import cpu_count
from dataclasses import asdict
+from multiprocessing import cpu_count
import numpy as np
+import pandas as pd
+import torch
from scipy.stats import mode, pearsonr
from sklearn.metrics import (
confusion_matrix,
@@ -21,28 +23,11 @@
matthews_corrcoef,
mean_squared_error,
)
-from tqdm.auto import tqdm, trange
-from tqdm.contrib import tenumerate
-
-import pandas as pd
-import torch
-from simpletransformers.classification.classification_utils import InputExample, convert_examples_to_features
-from simpletransformers.classification.transformer_models.albert_model import AlbertForSequenceClassification
-from simpletransformers.classification.transformer_models.bert_model import BertForSequenceClassification
-from simpletransformers.classification.transformer_models.camembert_model import CamembertForSequenceClassification
-from simpletransformers.classification.transformer_models.distilbert_model import DistilBertForSequenceClassification
-from simpletransformers.classification.transformer_models.flaubert_model import FlaubertForSequenceClassification
-from simpletransformers.classification.transformer_models.roberta_model import RobertaForSequenceClassification
-from simpletransformers.classification.transformer_models.xlm_model import XLMForSequenceClassification
-from simpletransformers.classification.transformer_models.xlm_roberta_model import XLMRobertaForSequenceClassification
-from simpletransformers.classification.transformer_models.xlnet_model import XLNetForSequenceClassification
-from simpletransformers.config.global_args import global_args
-from simpletransformers.config.model_args import ClassificationArgs
-from simpletransformers.classification.classification_utils import LazyClassificationDataset
-from simpletransformers.custom_models.models import ElectraForSequenceClassification
from tensorboardX import SummaryWriter
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
+from tqdm.auto import tqdm, trange
+from tqdm.contrib import tenumerate
from transformers import (
WEIGHTS_NAME,
AdamW,
@@ -62,8 +47,8 @@
LongformerForSequenceClassification,
LongformerTokenizer,
MobileBertConfig,
- MobileBertTokenizer,
MobileBertForSequenceClassification,
+ MobileBertTokenizer,
RobertaConfig,
RobertaTokenizer,
XLMConfig,
@@ -75,6 +60,24 @@
get_linear_schedule_with_warmup,
)
+from simpletransformers.classification.classification_utils import (
+ InputExample,
+ LazyClassificationDataset,
+ convert_examples_to_features,
+)
+from simpletransformers.classification.transformer_models.albert_model import AlbertForSequenceClassification
+from simpletransformers.classification.transformer_models.bert_model import BertForSequenceClassification
+from simpletransformers.classification.transformer_models.camembert_model import CamembertForSequenceClassification
+from simpletransformers.classification.transformer_models.distilbert_model import DistilBertForSequenceClassification
+from simpletransformers.classification.transformer_models.flaubert_model import FlaubertForSequenceClassification
+from simpletransformers.classification.transformer_models.roberta_model import RobertaForSequenceClassification
+from simpletransformers.classification.transformer_models.xlm_model import XLMForSequenceClassification
+from simpletransformers.classification.transformer_models.xlm_roberta_model import XLMRobertaForSequenceClassification
+from simpletransformers.classification.transformer_models.xlnet_model import XLNetForSequenceClassification
+from simpletransformers.config.global_args import global_args
+from simpletransformers.config.model_args import ClassificationArgs
+from simpletransformers.custom_models.models import ElectraForSequenceClassification
+
try:
import wandb
@@ -142,7 +145,11 @@ def __init__(
if num_labels:
assert num_labels == len(self.args.labels_list)
if self.args.labels_map:
- assert list(self.args.labels_map.keys()) == self.args.labels_list
+ try:
+ assert list(self.args.labels_map.keys()) == self.args.labels_list
+ except AssertionError:
+ assert [int(key) for key in list(self.args.labels_map.keys())] == self.args.labels_list
+ self.args.labels_map = {int(key): value for key, value in self.args.labels_map.items()}
else:
self.args.labels_map = {label: i for i, label in enumerate(self.args.labels_list)}
else:
@@ -393,14 +400,6 @@ def train(
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
)
- if args.fp16:
- try:
- from apex import amp
- except ImportError:
- raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-
- model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-
if args.n_gpu > 1:
model = torch.nn.DataParallel(model)
@@ -443,6 +442,11 @@ def train(
wandb.init(project=args.wandb_project, config={**asdict(args)}, **args.wandb_kwargs)
wandb.watch(self.model)
+ if args.fp16:
+ from torch.cuda import amp
+
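+ # Native torch.cuda.amp replaces Apex here: autocast() runs the forward pass in
+ # mixed precision, and GradScaler scales the loss to avoid fp16 gradient underflow.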
+ scaler = amp.GradScaler()
+
model.train()
for _ in train_iterator:
if epochs_trained > 0:
@@ -461,9 +465,15 @@ def train(
continue
inputs = self._get_inputs_dict(batch)
- outputs = model(**inputs)
- # model outputs are always tuple in pytorch-transformers (see doc)
- loss = outputs[0]
+ if args.fp16:
+ with amp.autocast():
+ outputs = model(**inputs)
+ # model outputs are always tuple in pytorch-transformers (see doc)
+ loss = outputs[0]
+ else:
+ outputs = model(**inputs)
+ # model outputs are always tuple in pytorch-transformers (see doc)
+ loss = outputs[0]
if args.n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu parallel training
@@ -479,25 +489,21 @@ def train(
loss = loss / args.gradient_accumulation_steps
if args.fp16:
- with amp.scale_loss(loss, optimizer) as scaled_loss:
- scaled_loss.backward()
- # torch.nn.utils.clip_grad_norm_(
- # amp.master_params(optimizer), args.max_grad_norm
- # )
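+ # scaling the loss before backward() keeps small fp16 gradients from underflowing to zero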
+ scaler.scale(loss).backward()
else:
loss.backward()
- # torch.nn.utils.clip_grad_norm_(
- # model.parameters(), args.max_grad_norm
- # )
tr_loss += loss.item()
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16:
- torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
- else:
- torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
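+ # unscale the gradients first so max_grad_norm is applied to the true gradient values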
+ scaler.unscale_(optimizer)
+ torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
- optimizer.step()
+ if args.fp16:
+ scaler.step(optimizer)
+ scaler.update()
+ else:
+ optimizer.step()
scheduler.step() # Update learning rate schedule
model.zero_grad()
global_step += 1
diff --git a/simpletransformers/classification/classification_utils.py b/simpletransformers/classification/classification_utils.py
index 3c2879a2..3c642373 100755
--- a/simpletransformers/classification/classification_utils.py
+++ b/simpletransformers/classification/classification_utils.py
@@ -18,12 +18,20 @@
import csv
import json
+import linecache
import os
import sys
from collections import Counter
from io import open
from multiprocessing import Pool, cpu_count
+import torch
+import torch.nn as nn
+from scipy.stats import pearsonr, spearmanr
+from sklearn.metrics import f1_score, matthews_corrcoef
+from torch.utils.data import Dataset
+from tqdm.auto import tqdm
+
try:
import torchvision
import torchvision.transforms as transforms
@@ -33,14 +41,6 @@
except ImportError:
torchvision_available = False
-import linecache
-from scipy.stats import pearsonr, spearmanr
-from sklearn.metrics import f1_score, matthews_corrcoef
-from tqdm.auto import tqdm
-
-import torch
-import torch.nn as nn
-from torch.utils.data import Dataset
csv.field_size_limit(2147483647)
@@ -566,6 +566,8 @@ def __init__(self, data_file, tokenizer, args):
self.text_column = None
else:
self.text_column = args.lazy_text_column
+ self.text_a_column = None
+ self.text_b_column = None
self.labels_column = args.lazy_labels_column
@staticmethod
@@ -579,7 +581,7 @@ def _get_n_lines(data_file, start_row):
def __getitem__(self, idx):
line = linecache.getline(self.data_file, idx + 1 + self.start_row).rstrip("\n").split(self.delimiter)
- if self.text_column:
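+ # text_column may legitimately be 0 (falsy), so branch on the pair columns instead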
+ if not self.text_a_column and not self.text_b_column:
text = line[self.text_column]
label = line[self.labels_column]
diff --git a/simpletransformers/classification/multi_label_classification_model.py b/simpletransformers/classification/multi_label_classification_model.py
index 984b3fe4..f7733a95 100755
--- a/simpletransformers/classification/multi_label_classification_model.py
+++ b/simpletransformers/classification/multi_label_classification_model.py
@@ -1,24 +1,10 @@
import logging
+import random
import warnings
from multiprocessing import cpu_count
-import random
-import numpy as np
+import numpy as np
import torch
-from simpletransformers.classification import ClassificationModel
-from simpletransformers.config.global_args import global_args
-from simpletransformers.config.model_args import MultiLabelClassificationArgs
-from simpletransformers.custom_models.models import (
- AlbertForMultiLabelSequenceClassification,
- BertForMultiLabelSequenceClassification,
- DistilBertForMultiLabelSequenceClassification,
- ElectraForMultiLabelSequenceClassification,
- FlaubertForMultiLabelSequenceClassification,
- RobertaForMultiLabelSequenceClassification,
- XLMForMultiLabelSequenceClassification,
- XLMRobertaForMultiLabelSequenceClassification,
- XLNetForMultiLabelSequenceClassification,
-)
from transformers import (
WEIGHTS_NAME,
AlbertConfig,
@@ -41,6 +27,21 @@
XLNetTokenizer,
)
+from simpletransformers.classification import ClassificationModel
+from simpletransformers.config.global_args import global_args
+from simpletransformers.config.model_args import MultiLabelClassificationArgs
+from simpletransformers.custom_models.models import (
+ AlbertForMultiLabelSequenceClassification,
+ BertForMultiLabelSequenceClassification,
+ DistilBertForMultiLabelSequenceClassification,
+ ElectraForMultiLabelSequenceClassification,
+ FlaubertForMultiLabelSequenceClassification,
+ RobertaForMultiLabelSequenceClassification,
+ XLMForMultiLabelSequenceClassification,
+ XLMRobertaForMultiLabelSequenceClassification,
+ XLNetForMultiLabelSequenceClassification,
+)
+
try:
import wandb
diff --git a/simpletransformers/classification/multi_modal_classification_model.py b/simpletransformers/classification/multi_modal_classification_model.py
index 68870a68..e24dfc3d 100644
--- a/simpletransformers/classification/multi_modal_classification_model.py
+++ b/simpletransformers/classification/multi_modal_classification_model.py
@@ -10,10 +10,12 @@
import os
import random
import warnings
-from multiprocessing import cpu_count
from dataclasses import asdict
+from multiprocessing import cpu_count
import numpy as np
+import pandas as pd
+import torch
from scipy.stats import mode, pearsonr
from sklearn.metrics import (
confusion_matrix,
@@ -21,35 +23,33 @@
matthews_corrcoef,
mean_squared_error,
)
-from tqdm.auto import tqdm, trange
-
-import pandas as pd
-import torch
-from simpletransformers.classification.classification_utils import (
- ImageEncoder,
- InputExample,
- JsonlDataset,
- collate_fn,
- convert_examples_to_features,
- get_image_transforms,
-)
-from simpletransformers.classification.transformer_models.mmbt_model import MMBTForClassification
-from simpletransformers.config.model_args import MultiModalClassificationArgs
-from simpletransformers.config.global_args import global_args
from tensorboardX import SummaryWriter
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
+from tqdm.auto import tqdm, trange
from transformers import (
+ BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
WEIGHTS_NAME,
AdamW,
BertConfig,
BertModel,
BertTokenizer,
get_linear_schedule_with_warmup,
- BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
)
from transformers.configuration_mmbt import MMBTConfig
+from simpletransformers.classification.classification_utils import (
+ ImageEncoder,
+ InputExample,
+ JsonlDataset,
+ collate_fn,
+ convert_examples_to_features,
+ get_image_transforms,
+)
+from simpletransformers.classification.transformer_models.mmbt_model import MMBTForClassification
+from simpletransformers.config.global_args import global_args
+from simpletransformers.config.model_args import MultiModalClassificationArgs
+
try:
import wandb
@@ -432,14 +432,6 @@ def train(
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
)
- if args.fp16:
- try:
- from apex import amp
- except ImportError:
- raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-
- model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-
if args.n_gpu > 1:
model = torch.nn.DataParallel(model)
@@ -458,6 +450,11 @@ def train(
wandb.init(project=args.wandb_project, config={**asdict(args)}, **args.wandb_kwargs)
wandb.watch(self.model)
+ if args.fp16:
+ from torch.cuda import amp
+
+ scaler = amp.GradScaler()
+
model.train()
for _ in train_iterator:
train_iterator.set_description(f"Epoch {epoch_number} of {args.num_train_epochs}")
@@ -473,10 +470,16 @@ def train(
labels = batch[5]
inputs = self._get_inputs_dict(batch)
- outputs = model(**inputs)
- # model outputs are always tuple in pytorch-transformers (see doc)
- logits = outputs[0] # Different from default behaviour
- loss = self.criterion(logits, labels)
+ if args.fp16:
+ with amp.autocast():
+ outputs = model(**inputs)
+ # model outputs are always tuple in pytorch-transformers (see doc)
+ logits = outputs[0] # Different from default behaviour
+ loss = self.criterion(logits, labels)
+ else:
+ outputs = model(**inputs)
+ # model outputs are always tuple in pytorch-transformers (see doc)
+ logits = outputs[0] # Different from default behaviour
+ loss = self.criterion(logits, labels)
if args.n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu parallel training
@@ -492,25 +495,21 @@ def train(
loss = loss / args.gradient_accumulation_steps
if args.fp16:
- with amp.scale_loss(loss, optimizer) as scaled_loss:
- scaled_loss.backward()
- # torch.nn.utils.clip_grad_norm_(
- # amp.master_params(optimizer), args.max_grad_norm
- # )
+ scaler.scale(loss).backward()
else:
loss.backward()
- # torch.nn.utils.clip_grad_norm_(
- # model.parameters(), args.max_grad_norm
- # )
tr_loss += loss.item()
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16:
- torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
- else:
- torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+ scaler.unscale_(optimizer)
+ torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
- optimizer.step()
+ if args.fp16:
+ scaler.step(optimizer)
+ scaler.update()
+ else:
+ optimizer.step()
scheduler.step() # Update learning rate schedule
model.zero_grad()
global_step += 1
diff --git a/simpletransformers/classification/transformer_models/camembert_model.py b/simpletransformers/classification/transformer_models/camembert_model.py
index 9e04c513..3e9ce2d9 100755
--- a/simpletransformers/classification/transformer_models/camembert_model.py
+++ b/simpletransformers/classification/transformer_models/camembert_model.py
@@ -1,7 +1,8 @@
-from simpletransformers.classification.transformer_models.roberta_model import RobertaForSequenceClassification
from transformers.configuration_camembert import CamembertConfig
from transformers.modeling_camembert import CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST
+from simpletransformers.classification.transformer_models.roberta_model import RobertaForSequenceClassification
+
class CamembertForSequenceClassification(RobertaForSequenceClassification):
r"""
diff --git a/simpletransformers/classification/transformer_models/xlm_roberta_model.py b/simpletransformers/classification/transformer_models/xlm_roberta_model.py
index a62107db..5c0b970b 100644
--- a/simpletransformers/classification/transformer_models/xlm_roberta_model.py
+++ b/simpletransformers/classification/transformer_models/xlm_roberta_model.py
@@ -1,7 +1,8 @@
-from simpletransformers.classification.transformer_models.roberta_model import RobertaForSequenceClassification
from transformers.configuration_xlm_roberta import XLMRobertaConfig
from transformers.modeling_xlm_roberta import XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST
+from simpletransformers.classification.transformer_models.roberta_model import RobertaForSequenceClassification
+
class XLMRobertaForSequenceClassification(RobertaForSequenceClassification):
config_class = XLMRobertaConfig
diff --git a/simpletransformers/config/global_args.py b/simpletransformers/config/global_args.py
index b69488e5..30077905 100644
--- a/simpletransformers/config/global_args.py
+++ b/simpletransformers/config/global_args.py
@@ -1,6 +1,5 @@
-from multiprocessing import cpu_count
import sys
-
+from multiprocessing import cpu_count
global_args = {
"adam_epsilon": 1e-8,
@@ -20,7 +19,6 @@
"evaluate_during_training_steps": 2000,
"evaluate_during_training_verbose": False,
"fp16": True,
- "fp16_opt_level": "O1",
"gradient_accumulation_steps": 1,
"learning_rate": 4e-5,
"local_rank": -1,
diff --git a/simpletransformers/config/model_args.py b/simpletransformers/config/model_args.py
index 7c07152e..1c114cc3 100644
--- a/simpletransformers/config/model_args.py
+++ b/simpletransformers/config/model_args.py
@@ -1,8 +1,8 @@
-from dataclasses import dataclass, field, fields, asdict
-from multiprocessing import cpu_count
import json
-import sys
import os
+import sys
+from dataclasses import asdict, dataclass, field, fields
+from multiprocessing import cpu_count
from torch.utils.data import Dataset
@@ -42,7 +42,6 @@ class ModelArgs:
evaluate_during_training_steps: int = 2000
evaluate_during_training_verbose: bool = False
fp16: bool = True
- fp16_opt_level: str = "O1"
gradient_accumulation_steps: int = 1
learning_rate: float = 4e-5
local_rank: int = -1
diff --git a/simpletransformers/conv_ai/__init__.py b/simpletransformers/conv_ai/__init__.py
index 2f78e922..775ca0ba 100755
--- a/simpletransformers/conv_ai/__init__.py
+++ b/simpletransformers/conv_ai/__init__.py
@@ -1,2 +1,2 @@
-from simpletransformers.conv_ai.conv_ai_model import ConvAIModel
from simpletransformers.config.model_args import ConvAIArgs
+from simpletransformers.conv_ai.conv_ai_model import ConvAIModel
diff --git a/simpletransformers/conv_ai/conv_ai_model.py b/simpletransformers/conv_ai/conv_ai_model.py
index d10215df..7507c8b8 100644
--- a/simpletransformers/conv_ai/conv_ai_model.py
+++ b/simpletransformers/conv_ai/conv_ai_model.py
@@ -12,11 +12,14 @@
import statistics
import warnings
from collections import defaultdict
+from dataclasses import asdict
from itertools import chain
from multiprocessing import cpu_count
-from dataclasses import asdict
import numpy as np
+import pandas as pd
+import torch
+import torch.nn.functional as F
from scipy.stats import mode, pearsonr
from sklearn.metrics import (
confusion_matrix,
@@ -25,18 +28,10 @@
matthews_corrcoef,
mean_squared_error,
)
-from tqdm.auto import tqdm, trange
-
-import pandas as pd
-import torch
-import torch.nn.functional as F
-from simpletransformers.classification.classification_utils import InputExample, convert_examples_to_features
-from simpletransformers.config.global_args import global_args
-from simpletransformers.config.model_args import ConvAIArgs
-from simpletransformers.conv_ai.conv_ai_utils import get_dataset
from tensorboardX import SummaryWriter
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
+from tqdm.auto import tqdm, trange
from transformers import (
WEIGHTS_NAME,
AdamW,
@@ -49,6 +44,11 @@
get_linear_schedule_with_warmup,
)
+from simpletransformers.classification.classification_utils import InputExample, convert_examples_to_features
+from simpletransformers.config.global_args import global_args
+from simpletransformers.config.model_args import ConvAIArgs
+from simpletransformers.conv_ai.conv_ai_utils import get_dataset
+
try:
import wandb
@@ -292,14 +292,6 @@ def train(
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
)
- if args.fp16:
- try:
- from apex import amp
- except ImportError:
- raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-
- model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-
if args.n_gpu > 1:
model = torch.nn.DataParallel(model)
@@ -318,6 +310,11 @@ def train(
wandb.init(project=args.wandb_project, config={**asdict(args)}, **args.wandb_kwargs)
wandb.watch(self.model)
+ if args.fp16:
+ from torch.cuda import amp
+
+ scaler = amp.GradScaler()
+
model.train()
for _ in train_iterator:
train_iterator.set_description(f"Epoch {epoch_number + 1} of {args.num_train_epochs}")
@@ -331,15 +328,21 @@ def train(
batch = tuple(t.to(device) for t in batch)
input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
- (lm_loss), (mc_loss), *_ = model(
- input_ids,
- token_type_ids=token_type_ids,
- mc_token_ids=mc_token_ids,
- mc_labels=mc_labels,
- lm_labels=lm_labels,
- )
- # model outputs are always tuple in pytorch-transformers (see doc)
- loss = lm_loss * args.lm_coef + mc_loss * args.mc_coef
+ if args.fp16:
+ with amp.autocast():
+ (lm_loss), (mc_loss), *_ = model(
+ input_ids,
+ token_type_ids=token_type_ids,
+ mc_token_ids=mc_token_ids,
+ mc_labels=mc_labels,
+ lm_labels=lm_labels,
+ )
+ # model outputs are always tuple in pytorch-transformers (see doc)
+ loss = lm_loss * args.lm_coef + mc_loss * args.mc_coef
+ else:
+ (lm_loss), (mc_loss), *_ = model(
+ input_ids,
+ token_type_ids=token_type_ids,
+ mc_token_ids=mc_token_ids,
+ mc_labels=mc_labels,
+ lm_labels=lm_labels,
+ )
+ # model outputs are always tuple in pytorch-transformers (see doc)
+ loss = lm_loss * args.lm_coef + mc_loss * args.mc_coef
if args.n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu parallel training
@@ -353,25 +356,21 @@ def train(
loss = loss / args.gradient_accumulation_steps
if args.fp16:
- with amp.scale_loss(loss, optimizer) as scaled_loss:
- scaled_loss.backward()
- # torch.nn.utils.clip_grad_norm_(
- # amp.master_params(optimizer), args.max_grad_norm
- # )
+ scaler.scale(loss).backward()
else:
loss.backward()
- # torch.nn.utils.clip_grad_norm_(
- # model.parameters(), args.max_grad_norm
- # )
tr_loss += loss.item()
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16:
- torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
- else:
- torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+ scaler.unscale_(optimizer)
+ torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
- optimizer.step()
+ if args.fp16:
+ scaler.step(optimizer)
+ scaler.update()
+ else:
+ optimizer.step()
scheduler.step() # Update learning rate schedule
model.zero_grad()
global_step += 1
@@ -615,8 +614,7 @@ def load_and_cache_examples(self, dataset_path=None, evaluate=False, no_cache=Fa
if not no_cache:
no_cache = args.no_cache
- if not no_cache:
- os.makedirs(self.args.cache_dir, exist_ok=True)
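+ # create cache_dir unconditionally; previously it was skipped when caching was disabled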
+ os.makedirs(self.args.cache_dir, exist_ok=True)
dataset_path = dataset_path if dataset_path else ""
diff --git a/simpletransformers/conv_ai/conv_ai_utils.py b/simpletransformers/conv_ai/conv_ai_utils.py
index 26017ec0..c1193363 100644
--- a/simpletransformers/conv_ai/conv_ai_utils.py
+++ b/simpletransformers/conv_ai/conv_ai_utils.py
@@ -10,9 +10,8 @@
from datetime import datetime
from multiprocessing import Pool
-from tqdm.auto import tqdm
-
import torch
+from tqdm.auto import tqdm
from transformers import cached_path
PERSONACHAT_URL = "https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json"
diff --git a/simpletransformers/custom_models/models.py b/simpletransformers/custom_models/models.py
index c31e4ada..35fd9edc 100755
--- a/simpletransformers/custom_models/models.py
+++ b/simpletransformers/custom_models/models.py
@@ -25,10 +25,13 @@
ElectraModel,
ElectraPreTrainedModel,
)
-from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, RobertaClassificationHead
+from transformers.modeling_roberta import (
+ ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
+ RobertaClassificationHead,
+ RobertaForQuestionAnswering,
+)
from transformers.modeling_utils import PreTrainedModel, SequenceSummary
from transformers.modeling_xlm_roberta import XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST
-from transformers.modeling_roberta import RobertaForQuestionAnswering
class BertForMultiLabelSequenceClassification(BertPreTrainedModel):
diff --git a/simpletransformers/experimental/classification/classification_model.py b/simpletransformers/experimental/classification/classification_model.py
index 7bd7b7db..26201bd6 100755
--- a/simpletransformers/experimental/classification/classification_model.py
+++ b/simpletransformers/experimental/classification/classification_model.py
@@ -12,6 +12,7 @@
from multiprocessing import cpu_count
import numpy as np
+import torch
from scipy.stats import pearsonr
from sklearn.metrics import (
confusion_matrix,
@@ -19,34 +20,11 @@
matthews_corrcoef,
mean_squared_error,
)
-from tqdm.auto import tqdm, trange
-
-import torch
-from simpletransformers.experimental.classification.classification_utils import (
- InputExample,
- convert_examples_to_features,
-)
-from simpletransformers.experimental.classification.transformer_models.albert_model import (
- AlbertForSequenceClassification,
-)
-from simpletransformers.experimental.classification.transformer_models.bert_model import BertForSequenceClassification
-from simpletransformers.experimental.classification.transformer_models.camembert_model import (
- CamembertForSequenceClassification,
-)
-from simpletransformers.experimental.classification.transformer_models.distilbert_model import (
- DistilBertForSequenceClassification,
-)
-from simpletransformers.experimental.classification.transformer_models.roberta_model import (
- RobertaForSequenceClassification,
-)
-from simpletransformers.experimental.classification.transformer_models.xlm_model import XLMForSequenceClassification
-from simpletransformers.experimental.classification.transformer_models.xlnet_model import (
- XLNetForSequenceClassification,
-)
from tensorboardX import SummaryWriter
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
+from tqdm.auto import tqdm, trange
from transformers import (
WEIGHTS_NAME,
AdamW,
@@ -67,6 +45,28 @@
get_linear_schedule_with_warmup,
)
+from simpletransformers.experimental.classification.classification_utils import (
+ InputExample,
+ convert_examples_to_features,
+)
+from simpletransformers.experimental.classification.transformer_models.albert_model import (
+ AlbertForSequenceClassification,
+)
+from simpletransformers.experimental.classification.transformer_models.bert_model import BertForSequenceClassification
+from simpletransformers.experimental.classification.transformer_models.camembert_model import (
+ CamembertForSequenceClassification,
+)
+from simpletransformers.experimental.classification.transformer_models.distilbert_model import (
+ DistilBertForSequenceClassification,
+)
+from simpletransformers.experimental.classification.transformer_models.roberta_model import (
+ RobertaForSequenceClassification,
+)
+from simpletransformers.experimental.classification.transformer_models.xlm_model import XLMForSequenceClassification
+from simpletransformers.experimental.classification.transformer_models.xlnet_model import (
+ XLNetForSequenceClassification,
+)
+
class ClassificationModel:
def __init__(
@@ -135,7 +135,6 @@ def __init__(
"output_dir": "outputs/",
"cache_dir": "cache_dir/",
"fp16": True,
- "fp16_opt_level": "O1",
"max_seq_length": 128,
"train_batch_size": 8,
"gradient_accumulation_steps": 1,
@@ -282,14 +281,6 @@ def train(self, train_dataset, output_dir, show_running_loss=True, eval_df=None)
optimizer, num_warmup_steps=args["warmup_steps"], num_training_steps=t_total
)
- if args["fp16"]:
- try:
- from apex import amp
- except ImportError:
- raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-
- model, optimizer = amp.initialize(model, optimizer, opt_level=args["fp16_opt_level"])
-
if args["n_gpu"] > 1:
model = torch.nn.DataParallel(model)
@@ -298,6 +289,11 @@ def train(self, train_dataset, output_dir, show_running_loss=True, eval_df=None)
model.zero_grad()
train_iterator = trange(int(args["num_train_epochs"]), desc="Epoch", disable=args["silent"])
+ if args["fp16"]:
+ from torch.cuda import amp
+
+ scaler = amp.GradScaler()
+
model.train()
for _ in train_iterator:
# epoch_iterator = tqdm(train_dataloader, desc="Iteration")
@@ -305,12 +301,20 @@ def train(self, train_dataset, output_dir, show_running_loss=True, eval_df=None)
batch = tuple(t.to(self.device) for t in batch)
inputs = self._get_inputs_dict(batch)
- if self.sliding_window:
- outputs = model(inputs)
+ if args["fp16"]:
+ with amp.autocast():
+ if self.sliding_window:
+ outputs = model(inputs)
+ else:
+ outputs = model(**inputs)
+ # model outputs are always tuple in pytorch-transformers (see doc)
else:
- outputs = model(**inputs)
- # model outputs are always tuple in pytorch-transformers (see doc)
- loss = outputs[0]
+ if self.sliding_window:
+ outputs = model(inputs)
+ else:
+ outputs = model(**inputs)
+ # model outputs are always tuple in pytorch-transformers (see doc)
+ loss = outputs[0]
if show_running_loss:
print("\rRunning loss: %f" % loss, end="")
@@ -320,16 +324,21 @@ def train(self, train_dataset, output_dir, show_running_loss=True, eval_df=None)
loss = loss / args["gradient_accumulation_steps"]
if args["fp16"]:
- with amp.scale_loss(loss, optimizer) as scaled_loss:
- scaled_loss.backward()
- torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args["max_grad_norm"])
+ scaler.scale(loss).backward()
else:
loss.backward()
- torch.nn.utils.clip_grad_norm_(model.parameters(), args["max_grad_norm"])
tr_loss += loss.item()
if (step + 1) % args["gradient_accumulation_steps"] == 0:
- optimizer.step()
+ if args["fp16"]:
+ scaler.unscale_(optimizer)
+ torch.nn.utils.clip_grad_norm_(model.parameters(), args["max_grad_norm"])
+
+ if args["fp16"]:
+ scaler.step(optimizer)
+ scaler.update()
+ else:
+ optimizer.step()
scheduler.step() # Update learning rate schedule
model.zero_grad()
global_step += 1
diff --git a/simpletransformers/experimental/classification/multi_label_classification_model.py b/simpletransformers/experimental/classification/multi_label_classification_model.py
index 72344a08..4ffee0fa 100755
--- a/simpletransformers/experimental/classification/multi_label_classification_model.py
+++ b/simpletransformers/experimental/classification/multi_label_classification_model.py
@@ -1,15 +1,6 @@
from multiprocessing import cpu_count
import torch
-from simpletransformers.classification import ClassificationModel
-from simpletransformers.custom_models.models import (
- AlbertForMultiLabelSequenceClassification,
- BertForMultiLabelSequenceClassification,
- DistilBertForMultiLabelSequenceClassification,
- RobertaForMultiLabelSequenceClassification,
- XLMForMultiLabelSequenceClassification,
- XLNetForMultiLabelSequenceClassification,
-)
from transformers import (
WEIGHTS_NAME,
AlbertConfig,
@@ -26,6 +17,16 @@
XLNetTokenizer,
)
+from simpletransformers.classification import ClassificationModel
+from simpletransformers.custom_models.models import (
+ AlbertForMultiLabelSequenceClassification,
+ BertForMultiLabelSequenceClassification,
+ DistilBertForMultiLabelSequenceClassification,
+ RobertaForMultiLabelSequenceClassification,
+ XLMForMultiLabelSequenceClassification,
+ XLNetForMultiLabelSequenceClassification,
+)
+
class MultiLabelClassificationModel(ClassificationModel):
def __init__(self, model_type, model_name, num_labels=None, pos_weight=None, args=None, use_cuda=True):
@@ -85,7 +86,6 @@ def __init__(self, model_type, model_name, num_labels=None, pos_weight=None, arg
"output_dir": "outputs/",
"cache_dir": "cache_dir/",
"fp16": False,
- "fp16_opt_level": "O1",
"max_seq_length": 128,
"train_batch_size": 8,
"gradient_accumulation_steps": 1,
diff --git a/simpletransformers/language_generation/__init__.py b/simpletransformers/language_generation/__init__.py
index 10d5d2b6..c16c6ceb 100755
--- a/simpletransformers/language_generation/__init__.py
+++ b/simpletransformers/language_generation/__init__.py
@@ -1,2 +1,2 @@
-from simpletransformers.language_generation.language_generation_model import LanguageGenerationModel
from simpletransformers.config.model_args import LanguageGenerationArgs
+from simpletransformers.language_generation.language_generation_model import LanguageGenerationModel
diff --git a/simpletransformers/language_generation/language_generation_model.py b/simpletransformers/language_generation/language_generation_model.py
index fa6979fc..33d1e776 100644
--- a/simpletransformers/language_generation/language_generation_model.py
+++ b/simpletransformers/language_generation/language_generation_model.py
@@ -1,15 +1,11 @@
import argparse
+import json
import logging
-import random
import os
-import json
+import random
import numpy as np
-
import torch
-from simpletransformers.config.global_args import global_args
-from simpletransformers.config.model_args import LanguageGenerationArgs
-from simpletransformers.language_generation.language_generation_utils import PREPROCESSING_FUNCTIONS
from transformers import (
CTRLConfig,
CTRLLMHeadModel,
@@ -31,6 +27,10 @@
XLNetTokenizer,
)
+from simpletransformers.config.global_args import global_args
+from simpletransformers.config.model_args import LanguageGenerationArgs
+from simpletransformers.language_generation.language_generation_utils import PREPROCESSING_FUNCTIONS
+
logger = logging.getLogger(__name__)
MAX_LENGTH = int(10000) # Hardcoded max length to avoid infinite loop
diff --git a/simpletransformers/language_modeling/__init__.py b/simpletransformers/language_modeling/__init__.py
index 55f28581..e978e202 100755
--- a/simpletransformers/language_modeling/__init__.py
+++ b/simpletransformers/language_modeling/__init__.py
@@ -1,2 +1,2 @@
-from simpletransformers.language_modeling.language_modeling_model import LanguageModelingModel
from simpletransformers.config.model_args import LanguageModelingArgs
+from simpletransformers.language_modeling.language_modeling_model import LanguageModelingModel
diff --git a/simpletransformers/language_modeling/language_modeling_model.py b/simpletransformers/language_modeling/language_modeling_model.py
index 5e6bf681..82e75351 100755
--- a/simpletransformers/language_modeling/language_modeling_model.py
+++ b/simpletransformers/language_modeling/language_modeling_model.py
@@ -10,40 +10,32 @@
import os
import random
import warnings
+from dataclasses import asdict
from multiprocessing import cpu_count
from typing import Dict, List
-from dataclasses import asdict
import numpy as np
+import pandas as pd
+import torch
from sklearn.metrics import (
confusion_matrix,
label_ranking_average_precision_score,
matthews_corrcoef,
mean_squared_error,
)
-from tqdm.auto import tqdm, trange
-
-import pandas as pd
-import torch
-from simpletransformers.config.global_args import global_args
-from simpletransformers.config.model_args import LanguageModelingArgs
-from simpletransformers.custom_models.models import ElectraForLanguageModelingModel
-from simpletransformers.language_modeling.language_modeling_utils import (
- SimpleDataset,
- mask_tokens,
-)
from tensorboardX import SummaryWriter
from tokenizers import BertWordPieceTokenizer, ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
+from tqdm.auto import tqdm, trange
from transformers import (
WEIGHTS_NAME,
AdamW,
AutoConfig,
- AutoTokenizer,
AutoModelWithLMHead,
+ AutoTokenizer,
BertConfig,
BertForMaskedLM,
BertTokenizer,
@@ -60,6 +52,9 @@
GPT2Config,
GPT2LMHeadModel,
GPT2Tokenizer,
+ LongformerConfig,
+ LongformerForMaskedLM,
+ LongformerTokenizer,
OpenAIGPTConfig,
OpenAIGPTLMHeadModel,
OpenAIGPTTokenizer,
@@ -68,13 +63,14 @@
RobertaConfig,
RobertaForMaskedLM,
RobertaTokenizer,
- LongformerConfig,
- LongformerForMaskedLM,
- LongformerTokenizer,
get_linear_schedule_with_warmup,
)
-from transformers.data.datasets.language_modeling import TextDataset, LineByLineTextDataset
+from transformers.data.datasets.language_modeling import LineByLineTextDataset, TextDataset
+from simpletransformers.config.global_args import global_args
+from simpletransformers.config.model_args import LanguageModelingArgs
+from simpletransformers.custom_models.models import ElectraForLanguageModelingModel
+from simpletransformers.language_modeling.language_modeling_utils import SimpleDataset, mask_tokens
try:
import wandb
@@ -488,18 +484,10 @@ def collate(examples: List[torch.Tensor]):
optimizer.load_state_dict(torch.load(os.path.join(args.model_name, "optimizer.pt")))
scheduler.load_state_dict(torch.load(os.path.join(args.model_name, "scheduler.pt")))
- if args.fp16:
- try:
- from apex import amp
- except ImportError:
- raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-
- model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-
if args.n_gpu > 1:
model = torch.nn.DataParallel(model)
- # Distributed training (should be after apex fp16 initialization)
+ # Distributed training
if args.local_rank != -1:
model = torch.nn.parallel.DistributedDataParallel(
model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True,
@@ -545,6 +533,11 @@ def collate(examples: List[torch.Tensor]):
wandb.init(project=args.wandb_project, config={**asdict(args)}, **args.wandb_kwargs)
wandb.watch(self.model)
+ if args.fp16:
+ from torch.cuda import amp
+
+ scaler = amp.GradScaler()
+
model.train()
for current_epoch in train_iterator:
if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler):
@@ -568,22 +561,28 @@ def collate(examples: List[torch.Tensor]):
inputs = inputs.to(self.device)
labels = labels.to(self.device)
- if args.model_type == "longformer":
- outputs = model(inputs, attention_mask=None, masked_lm_labels=labels)
- else:
- outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
- # model outputs are always tuple in pytorch-transformers (see doc)
- if args.model_type == "electra":
- g_loss = outputs[0]
- d_loss = outputs[1]
- loss = g_loss + args.discriminator_loss_weight * d_loss
+ if args.fp16:
+ with amp.autocast():
+ if args.model_type == "longformer":
+ outputs = model(inputs, attention_mask=None, masked_lm_labels=labels)
+ else:
+ outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
+ # model outputs are always tuple in pytorch-transformers (see doc)
+ if args.model_type == "electra":
+ g_loss = outputs[0]
+ d_loss = outputs[1]
+ loss = g_loss + args.discriminator_loss_weight * d_loss
+ else:
+ loss = outputs[0]
else:
- loss = outputs[0]
- # if loss.item() < 1:
- # masked = (labels[0] != -100).nonzero()
- # print(labels[0][masked])
- # preds = outputs[1][0, masked, :].clone().detach().cpu().numpy()
- # print(np.argmax(preds, axis=2))
+ if args.model_type == "longformer":
+ outputs = model(inputs, attention_mask=None, masked_lm_labels=labels)
+ else:
+ outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
+ # model outputs are always tuple in pytorch-transformers (see doc)
+ if args.model_type == "electra":
+ g_loss = outputs[0]
+ d_loss = outputs[1]
+ loss = g_loss + args.discriminator_loss_weight * d_loss
+ else:
+ loss = outputs[0]
+ # if loss.item() < 1:
+ # masked = (labels[0] != -100).nonzero()
+ # print(labels[0][masked])
+ # preds = outputs[1][0, masked, :].clone().detach().cpu().numpy()
+ # print(np.argmax(preds, axis=2))
if args.n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu parallel training
@@ -599,19 +598,21 @@ def collate(examples: List[torch.Tensor]):
loss = loss / args.gradient_accumulation_steps
if args.fp16:
- with amp.scale_loss(loss, optimizer) as scaled_loss:
- scaled_loss.backward()
+ scaler.scale(loss).backward()
else:
loss.backward()
tr_loss += loss.item()
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16:
- torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
- else:
- torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+ scaler.unscale_(optimizer)
+ torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
- optimizer.step()
+ if args.fp16:
+ scaler.step(optimizer)
+ scaler.update()
+ else:
+ optimizer.step()
scheduler.step() # Update learning rate schedule
model.zero_grad()
global_step += 1
diff --git a/simpletransformers/language_modeling/language_modeling_utils.py b/simpletransformers/language_modeling/language_modeling_utils.py
index 7794b9d1..dc43e009 100644
--- a/simpletransformers/language_modeling/language_modeling_utils.py
+++ b/simpletransformers/language_modeling/language_modeling_utils.py
@@ -4,14 +4,12 @@
from multiprocessing import Pool
from typing import Tuple
-from tqdm.auto import tqdm
-
import torch
from tokenizers.processors import BertProcessing
from torch.utils.data import Dataset
+from tqdm.auto import tqdm
from transformers import PreTrainedTokenizer
-
logger = logging.getLogger(__name__)
@@ -157,7 +155,7 @@ def mask_tokens(inputs: torch.Tensor, tokenizer: PreTrainedTokenizer, args) -> T
masked_indices = torch.bernoulli(probability_matrix).bool()
labels[~masked_indices] = -100 # We only compute loss on masked tokens
- if args.model_type == "electra" and False:
+ if args.model_type == "electra":
# For ELECTRA, we replace all masked input tokens with tokenizer.mask_token
inputs[masked_indices] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
else:
diff --git a/simpletransformers/language_representation/representation_model.py b/simpletransformers/language_representation/representation_model.py
index ab66e169..7452f6e5 100644
--- a/simpletransformers/language_representation/representation_model.py
+++ b/simpletransformers/language_representation/representation_model.py
@@ -10,13 +10,12 @@
import numpy as np
import torch
+from transformers import BertConfig, BertTokenizer, GPT2Config, GPT2Tokenizer, RobertaConfig, RobertaTokenizer
+
from simpletransformers.config.model_args import ModelArgs
from simpletransformers.language_representation.transformer_models.bert_model import BertForTextRepresentation
from simpletransformers.language_representation.transformer_models.gpt2_model import GPT2ForTextRepresentation
-
-from transformers import BertConfig, BertTokenizer, RobertaConfig, RobertaTokenizer, GPT2Config, GPT2Tokenizer
-
try:
import wandb
@@ -128,7 +127,14 @@ def _tokenize(self, text_list):
return torch.LongTensor(input_ids)
def encode_sentences(self, text_list, combine_strategy=None, batch_size=32):
- # supported values for combine_strategy: None, "mean", "concat"
+ """
+
+ Generates list of contextual word or sentence embeddings using the model passed to class constructor
+ :param text_list: list of text sentences
+ :param combine_strategy: strategy for combining word vectors, supported values: None, "mean", "concat"
+ :param batch_size
+ :return: list of lists of sentence embeddings(if `combine_strategy=None`) OR list of sentence embeddings(if `combine_strategy!=None`)
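+
+ A minimal usage sketch (illustrative; assumes a BERT-based RepresentationModel):
+ model = RepresentationModel("bert", "bert-base-uncased")
+ embeddings = model.encode_sentences(["an example sentence"], combine_strategy="mean")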
+ """
batches = batch_iterable(text_list, batch_size=batch_size)
embeddings = np.array([])
for batch in batches:
diff --git a/simpletransformers/ner/__init__.py b/simpletransformers/ner/__init__.py
index 2b8f1cf5..4f97e5d3 100755
--- a/simpletransformers/ner/__init__.py
+++ b/simpletransformers/ner/__init__.py
@@ -1,2 +1,2 @@
-from simpletransformers.ner.ner_model import NERModel
from simpletransformers.config.model_args import NERArgs
+from simpletransformers.ner.ner_model import NERModel
diff --git a/simpletransformers/ner/ner_model.py b/simpletransformers/ner/ner_model.py
index 0d0f8619..5a3cc378 100755
--- a/simpletransformers/ner/ner_model.py
+++ b/simpletransformers/ner/ner_model.py
@@ -6,35 +6,24 @@
import os
import random
import warnings
-from multiprocessing import cpu_count
from dataclasses import asdict
+from multiprocessing import cpu_count
import numpy as np
-from scipy.stats import pearsonr
-from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
-from tqdm.auto import tqdm, trange
-
import pandas as pd
import torch
-from simpletransformers.config.global_args import global_args
-from simpletransformers.config.model_args import NERArgs
-from simpletransformers.ner.ner_utils import (
- InputExample,
- convert_examples_to_features,
- get_examples_from_df,
- get_labels,
- read_examples_from_file,
- LazyNERDataset,
-)
+from scipy.stats import pearsonr
+from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
from tensorboardX import SummaryWriter
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
+from tqdm.auto import tqdm, trange
from transformers import (
WEIGHTS_NAME,
AdamW,
AutoConfig,
- AutoTokenizer,
AutoModelForTokenClassification,
+ AutoTokenizer,
BertConfig,
BertForTokenClassification,
BertTokenizer,
@@ -51,8 +40,8 @@
LongformerForTokenClassification,
LongformerTokenizer,
MobileBertConfig,
- MobileBertTokenizer,
MobileBertForTokenClassification,
+ MobileBertTokenizer,
RobertaConfig,
RobertaForTokenClassification,
RobertaTokenizer,
@@ -62,6 +51,17 @@
get_linear_schedule_with_warmup,
)
+from simpletransformers.config.global_args import global_args
+from simpletransformers.config.model_args import NERArgs
+from simpletransformers.ner.ner_utils import (
+ InputExample,
+ LazyNERDataset,
+ convert_examples_to_features,
+ get_examples_from_df,
+ get_labels,
+ read_examples_from_file,
+)
+
try:
import wandb
@@ -335,14 +335,6 @@ def train(self, train_dataset, output_dir, show_running_loss=True, eval_data=Non
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
)
- if args.fp16:
- try:
- from apex import amp
- except ImportError:
- raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-
- model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-
if args.n_gpu > 1:
model = torch.nn.DataParallel(model)
@@ -383,6 +375,11 @@ def train(self, train_dataset, output_dir, show_running_loss=True, eval_data=Non
wandb.init(project=args.wandb_project, config={**asdict(args)}, **args.wandb_kwargs)
wandb.watch(self.model)
+ if args.fp16:
+ from torch.cuda import amp
+
+ scaler = amp.GradScaler()
+
model.train()
for _ in train_iterator:
if epochs_trained > 0:
@@ -403,9 +400,15 @@ def train(self, train_dataset, output_dir, show_running_loss=True, eval_data=Non
inputs = self._get_inputs_dict(batch)
- outputs = model(**inputs)
- # model outputs are always tuple in pytorch-transformers (see doc)
- loss = outputs[0]
+ if args.fp16:
+ with amp.autocast():
+ outputs = model(**inputs)
+ # model outputs are always tuple in pytorch-transformers (see doc)
+ loss = outputs[0]
+ else:
+ outputs = model(**inputs)
+ # model outputs are always tuple in pytorch-transformers (see doc)
+ loss = outputs[0]
if args.n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu parallel training
@@ -421,24 +424,21 @@ def train(self, train_dataset, output_dir, show_running_loss=True, eval_data=Non
loss = loss / args.gradient_accumulation_steps
if args.fp16:
- with amp.scale_loss(loss, optimizer) as scaled_loss:
- scaled_loss.backward()
- # torch.nn.utils.clip_grad_norm_(
- # amp.master_params(optimizer), args.max_grad_norm
- # )
+ scaler.scale(loss).backward()
else:
loss.backward()
- # torch.nn.utils.clip_grad_norm_(
- # model.parameters(), args.max_grad_norm
- # )
tr_loss += loss.item()
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16:
- torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+ scaler.unscale_(optimizer)
+ torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
+ if args.fp16:
+ scaler.step(optimizer)
+ scaler.update()
else:
- torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
- optimizer.step()
+ optimizer.step()
scheduler.step() # Update learning rate schedule
model.zero_grad()
global_step += 1
diff --git a/simpletransformers/ner/ner_utils.py b/simpletransformers/ner/ner_utils.py
index e84fcf09..46e6fc78 100755
--- a/simpletransformers/ner/ner_utils.py
+++ b/simpletransformers/ner/ner_utils.py
@@ -17,18 +17,17 @@
from __future__ import absolute_import, division, print_function
+import linecache
import logging
import os
from io import open
from multiprocessing import Pool, cpu_count
-from torch.utils.data import Dataset
-from torch.nn import CrossEntropyLoss
-import torch
-import linecache
-
-from tqdm.auto import tqdm
import pandas as pd
+import torch
+from torch.nn import CrossEntropyLoss
+from torch.utils.data import Dataset
+from tqdm.auto import tqdm
class InputExample(object):
diff --git a/simpletransformers/question_answering/__init__.py b/simpletransformers/question_answering/__init__.py
index fa55467a..57c4db27 100755
--- a/simpletransformers/question_answering/__init__.py
+++ b/simpletransformers/question_answering/__init__.py
@@ -1,2 +1,2 @@
-from simpletransformers.question_answering.question_answering_model import QuestionAnsweringModel
from simpletransformers.config.model_args import QuestionAnsweringArgs
+from simpletransformers.question_answering.question_answering_model import QuestionAnsweringModel
diff --git a/simpletransformers/question_answering/question_answering_model.py b/simpletransformers/question_answering/question_answering_model.py
index ee794359..065256f8 100755
--- a/simpletransformers/question_answering/question_answering_model.py
+++ b/simpletransformers/question_answering/question_answering_model.py
@@ -6,10 +6,12 @@
import os
import random
import warnings
-from multiprocessing import cpu_count
from dataclasses import asdict
+from multiprocessing import cpu_count
import numpy as np
+import pandas as pd
+import torch
from scipy.stats import pearsonr
from sklearn.metrics import (
confusion_matrix,
@@ -17,39 +19,19 @@
matthews_corrcoef,
mean_squared_error,
)
-from tqdm.auto import tqdm, trange
-
-import pandas as pd
-import torch
-from simpletransformers.config.global_args import global_args
-from simpletransformers.config.model_args import QuestionAnsweringArgs
-from simpletransformers.custom_models.models import ElectraForQuestionAnswering, XLMRobertaForQuestionAnswering
-from simpletransformers.question_answering.question_answering_utils import (
- LazyQuestionAnsweringDataset,
- RawResult,
- RawResultExtended,
- build_examples,
- convert_examples_to_features,
- get_best_predictions,
- get_best_predictions_extended,
- get_examples,
- to_list,
- write_predictions,
- write_predictions_extended,
- squad_convert_examples_to_features,
-)
from tensorboardX import SummaryWriter
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
+from tqdm.auto import tqdm, trange
from transformers import (
WEIGHTS_NAME,
AdamW,
- AutoConfig,
- AutoTokenizer,
- AutoModelForQuestionAnswering,
AlbertConfig,
AlbertForQuestionAnswering,
AlbertTokenizer,
+ AutoConfig,
+ AutoModelForQuestionAnswering,
+ AutoTokenizer,
BartConfig,
BartForQuestionAnswering,
BartTokenizer,
@@ -62,25 +44,43 @@
ElectraConfig,
ElectraTokenizer,
LongformerConfig,
- LongformerTokenizer,
LongformerForQuestionAnswering,
+ LongformerTokenizer,
MobileBertConfig,
- MobileBertTokenizer,
MobileBertForQuestionAnswering,
+ MobileBertTokenizer,
RobertaConfig,
RobertaForQuestionAnswering,
RobertaTokenizer,
XLMConfig,
XLMForQuestionAnswering,
- XLMTokenizer,
XLMRobertaConfig,
XLMRobertaTokenizer,
+ XLMTokenizer,
XLNetConfig,
XLNetForQuestionAnswering,
XLNetTokenizer,
get_linear_schedule_with_warmup,
)
+from simpletransformers.config.global_args import global_args
+from simpletransformers.config.model_args import QuestionAnsweringArgs
+from simpletransformers.custom_models.models import ElectraForQuestionAnswering, XLMRobertaForQuestionAnswering
+from simpletransformers.question_answering.question_answering_utils import (
+ LazyQuestionAnsweringDataset,
+ RawResult,
+ RawResultExtended,
+ build_examples,
+ convert_examples_to_features,
+ get_best_predictions,
+ get_best_predictions_extended,
+ get_examples,
+ squad_convert_examples_to_features,
+ to_list,
+ write_predictions,
+ write_predictions_extended,
+)
+
try:
import wandb
@@ -396,14 +396,6 @@ def train(self, train_dataset, output_dir, show_running_loss=True, eval_data=Non
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
)
- if args.fp16:
- try:
- from apex import amp
- except ImportError:
- raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-
- model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-
if args.n_gpu > 1:
model = torch.nn.DataParallel(model)
@@ -445,6 +437,11 @@ def train(self, train_dataset, output_dir, show_running_loss=True, eval_data=Non
wandb.init(project=args.wandb_project, config={**asdict(args)}, **args.wandb_kwargs)
wandb.watch(self.model)
+ if args.fp16:
+ from torch.cuda import amp
+
+ scaler = amp.GradScaler()
+
model.train()
for _ in train_iterator:
if epochs_trained > 0:
@@ -464,10 +461,15 @@ def train(self, train_dataset, output_dir, show_running_loss=True, eval_data=Non
batch = tuple(t.to(device) for t in batch)
inputs = self._get_inputs_dict(batch)
-
- outputs = model(**inputs)
- # model outputs are always tuple in pytorch-transformers (see doc)
- loss = outputs[0]
+ if args.fp16:
+ with amp.autocast():
+ outputs = model(**inputs)
+ # model outputs are always tuple in pytorch-transformers (see doc)
+ loss = outputs[0]
+ else:
+ outputs = model(**inputs)
+ # model outputs are always tuple in pytorch-transformers (see doc)
+ loss = outputs[0]
if args.n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu parallel training
@@ -483,25 +485,21 @@ def train(self, train_dataset, output_dir, show_running_loss=True, eval_data=Non
loss = loss / args.gradient_accumulation_steps
if args.fp16:
- with amp.scale_loss(loss, optimizer) as scaled_loss:
- scaled_loss.backward()
- # torch.nn.utils.clip_grad_norm_(
- # amp.master_params(optimizer), args.max_grad_norm
- # )
+ scaler.scale(loss).backward()
else:
loss.backward()
- # torch.nn.utils.clip_grad_norm_(
- # model.parameters(), args.max_grad_norm
- # )
tr_loss += loss.item()
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16:
- torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
- else:
- torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+ scaler.unscale_(optimizer)
+ torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
- optimizer.step()
+ if args.fp16:
+ scaler.step(optimizer)
+ scaler.update()
+ else:
+ optimizer.step()
scheduler.step() # Update learning rate schedule
model.zero_grad()
global_step += 1
@@ -854,7 +852,8 @@ def predict(self, to_predict, n_best_size=None):
n_best_size (Optional): Number of predictions to return. args.n_best_size will be used if not specified.
Returns:
- preds: A python list containing the predicted answer, and id for each question in to_predict.
+ list: A python list of dicts containing the predicted answer(s) and the id for each question in to_predict.
+ list: A python list of dicts containing the corresponding probabilities and the id for each question in to_predict.
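+
+ For example (illustrative only; exact keys assumed): answers may look like
+ [{"id": "0", "answer": ["..."]}] and probabilities like [{"id": "0", "probability": [0.9]}].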
""" # noqa: ignore flake8"
tokenizer = self.tokenizer
device = self.device
diff --git a/simpletransformers/question_answering/question_answering_utils.py b/simpletransformers/question_answering/question_answering_utils.py
index dad61100..8f497141 100755
--- a/simpletransformers/question_answering/question_answering_utils.py
+++ b/simpletransformers/question_answering/question_answering_utils.py
@@ -2,31 +2,28 @@
import collections
import json
+import linecache
import logging
import math
import mmap
import os
import re
import string
+from functools import partial
from io import open
from multiprocessing import Pool, cpu_count
-from functools import partial
from pprint import pprint
-from torch.utils.data import Dataset
-
-from tqdm import tqdm, trange
-import linecache
import torch
from tensorboardX import SummaryWriter
-from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
-from transformers import AdamW, get_linear_schedule_with_warmup
-from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
-from transformers import XLMTokenizer, SquadExample
+from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler, TensorDataset
+from tqdm import tqdm, trange
+from transformers import AdamW, SquadExample, XLMTokenizer, get_linear_schedule_with_warmup
from transformers.data.processors.squad import (
- squad_convert_example_to_features_init,
squad_convert_example_to_features,
+ squad_convert_example_to_features_init,
)
+from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
logger = logging.getLogger(__name__)
diff --git a/simpletransformers/seq2seq/__init__.py b/simpletransformers/seq2seq/__init__.py
index e6e579ee..4e86a1a7 100755
--- a/simpletransformers/seq2seq/__init__.py
+++ b/simpletransformers/seq2seq/__init__.py
@@ -1,2 +1,2 @@
-from simpletransformers.seq2seq.seq2seq_model import Seq2SeqModel
from simpletransformers.config.model_args import Seq2SeqArgs
+from simpletransformers.seq2seq.seq2seq_model import Seq2SeqModel
diff --git a/simpletransformers/seq2seq/seq2seq_model.py b/simpletransformers/seq2seq/seq2seq_model.py
index 00b5a60f..7908603d 100644
--- a/simpletransformers/seq2seq/seq2seq_model.py
+++ b/simpletransformers/seq2seq/seq2seq_model.py
@@ -4,31 +4,30 @@
import os
import random
import warnings
-from multiprocessing import cpu_count, Pool
-from pathlib import Path
from dataclasses import asdict
+from multiprocessing import Pool, cpu_count
+from pathlib import Path
import numpy as np
-from tqdm.auto import tqdm, trange
-
import pandas as pd
import torch
-from simpletransformers.config.global_args import global_args
-from simpletransformers.config.model_args import Seq2SeqArgs
-from simpletransformers.seq2seq.seq2seq_utils import Seq2SeqDataset, SimpleSummarizationDataset
from tensorboardX import SummaryWriter
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
-from transformers import AdamW, EncoderDecoderModel, EncoderDecoderConfig, get_linear_schedule_with_warmup
+from tqdm.auto import tqdm, trange
from transformers import (
+ AdamW,
+ AutoConfig,
AutoModel,
AutoTokenizer,
- AutoConfig,
- BertTokenizer,
- BertModel,
- BertForMaskedLM,
+ BartConfig,
+ BartForConditionalGeneration,
+ BartTokenizer,
BertConfig,
+ BertForMaskedLM,
+ BertModel,
+ BertTokenizer,
CamembertConfig,
CamembertModel,
CamembertTokenizer,
@@ -38,9 +37,14 @@
ElectraConfig,
ElectraModel,
ElectraTokenizer,
+ EncoderDecoderConfig,
+ EncoderDecoderModel,
LongformerConfig,
LongformerModel,
LongformerTokenizer,
+ MarianConfig,
+ MarianMTModel,
+ MarianTokenizer,
MobileBertConfig,
MobileBertModel,
MobileBertTokenizer,
@@ -49,14 +53,13 @@
RobertaConfig,
RobertaModel,
RobertaTokenizer,
- BartForConditionalGeneration,
- BartTokenizer,
- BartConfig,
- MarianMTModel,
- MarianTokenizer,
- MarianConfig,
+ get_linear_schedule_with_warmup,
)
+from simpletransformers.config.global_args import global_args
+from simpletransformers.config.model_args import Seq2SeqArgs
+from simpletransformers.seq2seq.seq2seq_utils import Seq2SeqDataset, SimpleSummarizationDataset
+
try:
import wandb
@@ -393,14 +396,6 @@ def train(
optimizer.load_state_dict(torch.load(os.path.join(args.model_name, "optimizer.pt")))
scheduler.load_state_dict(torch.load(os.path.join(args.model_name, "scheduler.pt")))
- if args.fp16:
- try:
- from apex import amp
- except ImportError:
- raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-
- model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-
if args.n_gpu > 1:
model = torch.nn.DataParallel(model)
@@ -444,6 +439,11 @@ def train(
wandb.init(project=args.wandb_project, config={**asdict(args)}, **args.wandb_kwargs)
wandb.watch(self.model)
+ if args.fp16:
+ from torch.cuda import amp
+
+ scaler = amp.GradScaler()
+
model.train()
for current_epoch in train_iterator:
if epochs_trained > 0:
@@ -463,9 +463,15 @@ def train(
# batch = tuple(t.to(device) for t in batch)
inputs = self._get_inputs_dict(batch)
- outputs = model(**inputs)
- # model outputs are always tuple in pytorch-transformers (see doc)
- loss = outputs[0]
+ if args.fp16:
+ with amp.autocast():
+ outputs = model(**inputs)
+ # model outputs are always tuple in pytorch-transformers (see doc)
+ loss = outputs[0]
+ else:
+ outputs = model(**inputs)
+ # model outputs are always tuple in pytorch-transformers (see doc)
+ loss = outputs[0]
if args.n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu parallel training
@@ -481,25 +487,21 @@ def train(
loss = loss / args.gradient_accumulation_steps
if args.fp16:
- with amp.scale_loss(loss, optimizer) as scaled_loss:
- scaled_loss.backward()
- # torch.nn.utils.clip_grad_norm_(
- # amp.master_params(optimizer), args.max_grad_norm
- # )
+ scaler.scale(loss).backward()
else:
loss.backward()
- # torch.nn.utils.clip_grad_norm_(
- # model.parameters(), args.max_grad_norm
- # )
tr_loss += loss.item()
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16:
- torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
- else:
- torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+ scaler.unscale_(optimizer)
+ torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
- optimizer.step()
+ if args.fp16:
+ scaler.step(optimizer)
+ scaler.update()
+ else:
+ optimizer.step()
scheduler.step() # Update learning rate schedule
model.zero_grad()
global_step += 1
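
The hunks above swap the apex-based fp16 path for PyTorch's native `torch.cuda.amp`: the forward pass runs under `amp.autocast()`, the loss is scaled through a `GradScaler`, gradients are unscaled before clipping, and `scaler.step()`/`scaler.update()` replace the bare `optimizer.step()`. A minimal, self-contained sketch of that pattern follows — the toy model, data, and hyperparameters are placeholders, not the actual Seq2SeqModel trainer, and the real loop additionally folds in gradient accumulation:

```python
import torch
from torch.cuda import amp

# Sketch of the native-AMP step the diff adopts (hypothetical toy model/data).
device = "cuda" if torch.cuda.is_available() else "cpu"
model = torch.nn.Linear(16, 2).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
scaler = amp.GradScaler(enabled=device == "cuda")  # replaces apex.amp.initialize()
max_grad_norm = 1.0

for _ in range(10):  # stand-in for the epoch/batch loops
    inputs = torch.randn(8, 16, device=device)
    targets = torch.randint(0, 2, (8,), device=device)
    optimizer.zero_grad()
    with amp.autocast(enabled=device == "cuda"):  # mixed-precision forward pass
        loss = torch.nn.functional.cross_entropy(model(inputs), targets)
    scaler.scale(loss).backward()       # scaled backward pass
    scaler.unscale_(optimizer)          # unscale so clipping sees true gradients
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    scaler.step(optimizer)              # skips the update on inf/NaN gradients
    scaler.update()
```
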
diff --git a/simpletransformers/seq2seq/seq2seq_utils.py b/simpletransformers/seq2seq/seq2seq_utils.py
index 24390c25..d5cad829 100644
--- a/simpletransformers/seq2seq/seq2seq_utils.py
+++ b/simpletransformers/seq2seq/seq2seq_utils.py
@@ -4,13 +4,12 @@
from multiprocessing import Pool
from typing import Tuple
-from tqdm.auto import tqdm
-
import pandas as pd
import torch
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from torch.utils.data import Dataset
+from tqdm.auto import tqdm
from transformers import PreTrainedTokenizer
logger = logging.getLogger(__name__)
diff --git a/simpletransformers/streamlit/classification_view.py b/simpletransformers/streamlit/classification_view.py
index 71ac7447..e9345184 100644
--- a/simpletransformers/streamlit/classification_view.py
+++ b/simpletransformers/streamlit/classification_view.py
@@ -30,6 +30,7 @@ def get_states(model, session_state=None):
def classification_viewer(model, model_class):
st.subheader("Enter text: ")
input_text = st.text_area("")
+ st.sidebar.subheader("Parameters")
if model_class == "ClassificationModel":
try:
@@ -42,7 +43,6 @@ def classification_viewer(model, model_class):
)
session_state, model = get_states(model, session_state)
- st.subheader(f"Predictions")
model.args.max_seq_length = st.sidebar.slider(
"Max Seq Length", min_value=1, max_value=512, value=model.args.max_seq_length
)
@@ -59,6 +59,18 @@ def classification_viewer(model, model_class):
model.args.stride = st.sidebar.slider(
"Stride (Fraction of Max Seq Length)", min_value=0.0, max_value=1.0, value=model.args.stride
)
+ elif model_class == "MultiLabelClassificationModel":
+ try:
+ session_state, model = get_states(model)
+ except AttributeError:
+ session_state = get(
+ max_seq_length=model.args.max_seq_length,
+ )
+ session_state, model = get_states(model, session_state)
+
+ model.args.max_seq_length = st.sidebar.slider(
+ "Max Seq Length", min_value=1, max_value=512, value=model.args.max_seq_length
+ )
if input_text:
prediction, raw_values = model.predict([input_text])
diff --git a/simpletransformers/streamlit/qa_view.py b/simpletransformers/streamlit/qa_view.py
index cccce87b..d8b76484 100644
--- a/simpletransformers/streamlit/qa_view.py
+++ b/simpletransformers/streamlit/qa_view.py
@@ -26,6 +26,7 @@ def get_states(model, session_state=None):
def qa_viewer(model):
+ st.sidebar.subheader("Parameters")
try:
session_state, model = get_states(model)
except AttributeError:
@@ -36,7 +37,6 @@ def qa_viewer(model):
)
session_state, model = get_states(model, session_state)
- st.subheader(f"Predictions")
model.args.max_seq_length = st.sidebar.slider(
"Max Seq Length", min_value=1, max_value=512, value=model.args.max_seq_length
)
diff --git a/simpletransformers/streamlit/simple_view.py b/simpletransformers/streamlit/simple_view.py
index 1b84591b..76bbea71 100644
--- a/simpletransformers/streamlit/simple_view.py
+++ b/simpletransformers/streamlit/simple_view.py
@@ -111,54 +111,49 @@ def streamlit_runner(
cuda_device=-1,
**kwargs,
):
+ st.title("Simple Transformers Viewer")
+ st.markdown("---")
+ info_text = st.empty()
if not (model_class and model_type and model_name):
model_list = find_all_models(".", [])
selected_dir = st.sidebar.selectbox("Choose Model", model_list)
if selected_dir:
selected_dir = selected_dir.split(":- ")[-1]
else:
- st.subheader("No models found in current directory.")
- st.markdown(
+ info_text.markdown(
"""
- Simple Viewer looked everywhere in this directory and subdirectories but didn't find any Simple Transformers models. :(
-
- If you are trying to load models saved with an older Simple Transformers version, make sure the `model_args.json` file
- contains the `model_class`, `model_type`, and `model_name`.
-
- Or, you can write a Python script like the one below and save it to `view.py`.
-
- ```python
- from simpletransformers.streamlit.simple_view import streamlit_runner
+ ### No models found in current directory.
+ Simple Viewer looked everywhere in this directory and subdirectories but didn't find any Simple Transformers models. :worried:
- streamlit_runner(model_class="ClassificationModel", model_type="distilbert", model_name="outputs")
+ If you are trying to load models saved with an older Simple Transformers version, make sure the `model_args.json` file
+ contains the `model_class`, `model_type`, and `model_name`.
- ```
+ Or, you can specify the model paths manually through the "Specify model manually" option on the sidebar.
- You can execute this with `streamlit run view.py`.
+ ---
- The `streamlit_runner()` function accepts all the same arguments as the corresponding Simple Transformers model.
- """
+ """
)
manual_model = st.sidebar.checkbox("Specify model manually", value=False if selected_dir else True)
if manual_model:
st.sidebar.subheader("Model Details")
- st.write("Please fill the Model details on the sidebar or select a model from the Choose Model dropdown.")
+ fill_info = st.empty()
+ fill_info.markdown("Please fill the Model details on the sidebar.")
model_class = st.sidebar.selectbox("Model Class", list(model_class_map.keys()))
model_type = st.sidebar.text_input("Model type (e.g. bert, roberta, xlnet)")
model_name = st.sidebar.text_input("Model name (e.g. bert-base-cased, roberta-base)")
if manual_model_load(model_class, model_type, model_name):
selected_dir = None
-
+ info_text.markdown("")
+ fill_info.markdown("")
model, model_class = load_model(
selected_dir, model_class, model_type, model_name, num_labels, weight, args, use_cuda, cuda_device, **kwargs
)
model.args.use_multiprocessing = False
- st.title("Simple Transformers Viewer")
- st.markdown("---")
st.header(model_class_map[model_class])
if model_class in ["ClassificationModel", "MultiLabelClassificationModel"]:
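
The in-app instructions removed above included a programmatic way to launch the viewer, which still illustrates how `streamlit_runner()` is driven when a model is specified explicitly rather than discovered on disk. A minimal `view.py` sketch along those lines — the `model_type` and `model_name` values are placeholders for whatever model you have saved:

```python
# view.py -- minimal launcher sketch; model_type/model_name are placeholders.
from simpletransformers.streamlit.simple_view import streamlit_runner

# streamlit_runner() accepts the same arguments as the corresponding
# Simple Transformers model class.
streamlit_runner(
    model_class="ClassificationModel",
    model_type="distilbert",
    model_name="outputs",
)
```

Launch it with `streamlit run view.py`. Called with no arguments instead, `streamlit_runner()` falls back to scanning the current directory for saved models, or to the "Specify model manually" controls on the sidebar.
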
diff --git a/simpletransformers/t5/__init__.py b/simpletransformers/t5/__init__.py
index 485ca292..5dcc7d0a 100755
--- a/simpletransformers/t5/__init__.py
+++ b/simpletransformers/t5/__init__.py
@@ -1,2 +1,2 @@
-from simpletransformers.t5.t5_model import T5Model
from simpletransformers.config.model_args import T5Args
+from simpletransformers.t5.t5_model import T5Model
diff --git a/simpletransformers/t5/run_simple_transformers_streamlit_app.py b/simpletransformers/t5/run_simple_transformers_streamlit_app.py
new file mode 100644
index 00000000..a1723f1f
--- /dev/null
+++ b/simpletransformers/t5/run_simple_transformers_streamlit_app.py
@@ -0,0 +1,6 @@
+#!/usr/bin/env python
+from simpletransformers.streamlit.simple_view import streamlit_runner
+
+
+streamlit_runner()
+
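
The new `run_simple_transformers_streamlit_app.py` module added here follows the same launcher pattern as the `view.py` sketch above, but calls `streamlit_runner()` with no arguments, so the app discovers saved models in the working directory (or falls back to the manual model-selection controls in `simple_view.py`) rather than being pointed at a specific `model_class`/`model_type`/`model_name`.
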
diff --git a/simpletransformers/t5/t5_model.py b/simpletransformers/t5/t5_model.py
index 6f38f605..56c28e29 100644
--- a/simpletransformers/t5/t5_model.py
+++ b/simpletransformers/t5/t5_model.py
@@ -2,27 +2,27 @@
import logging
import math
import os
-from os import truncate
import random
import warnings
-from multiprocessing import cpu_count, Pool
-from pathlib import Path
from dataclasses import asdict
+from multiprocessing import Pool, cpu_count
+from os import truncate
+from pathlib import Path
import numpy as np
-from tqdm.auto import tqdm, trange
-
import pandas as pd
import torch
-from simpletransformers.config.global_args import global_args
-from simpletransformers.config.model_args import T5Args
-from simpletransformers.t5.t5_utils import T5Dataset
from tensorboardX import SummaryWriter
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
+from tqdm.auto import tqdm, trange
from transformers import AdamW, T5Config, T5ForConditionalGeneration, T5Tokenizer, get_linear_schedule_with_warmup
+from simpletransformers.config.global_args import global_args
+from simpletransformers.config.model_args import T5Args
+from simpletransformers.t5.t5_utils import T5Dataset
+
try:
import wandb
@@ -269,14 +269,6 @@ def train(
optimizer.load_state_dict(torch.load(os.path.join(args.model_name, "optimizer.pt")))
scheduler.load_state_dict(torch.load(os.path.join(args.model_name, "scheduler.pt")))
- if args.fp16:
- try:
- from apex import amp
- except ImportError:
- raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-
- model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-
if args.n_gpu > 1:
model = torch.nn.DataParallel(model)
@@ -320,6 +312,11 @@ def train(
wandb.init(project=args.wandb_project, config={**asdict(args)}, **args.wandb_kwargs)
wandb.watch(self.model)
+ if args.fp16:
+ from torch.cuda import amp
+
+ scaler = amp.GradScaler()
+
model.train()
for current_epoch in train_iterator:
if epochs_trained > 0:
@@ -339,9 +336,15 @@ def train(
batch = tuple(t.to(device) for t in batch)
inputs = self._get_inputs_dict(batch)
- outputs = model(**inputs)
- # model outputs are always tuple in pytorch-transformers (see doc)
- loss = outputs[0]
+ if args.fp16:
+ with amp.autocast():
+ outputs = model(**inputs)
+ # model outputs are always tuple in pytorch-transformers (see doc)
+ loss = outputs[0]
+ else:
+ outputs = model(**inputs)
+ # model outputs are always tuple in pytorch-transformers (see doc)
+ loss = outputs[0]
if args.n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu parallel training
@@ -357,25 +360,21 @@ def train(
loss = loss / args.gradient_accumulation_steps
if args.fp16:
- with amp.scale_loss(loss, optimizer) as scaled_loss:
- scaled_loss.backward()
- # torch.nn.utils.clip_grad_norm_(
- # amp.master_params(optimizer), args.max_grad_norm
- # )
+ scaler.scale(loss).backward()
else:
loss.backward()
- # torch.nn.utils.clip_grad_norm_(
- # model.parameters(), args.max_grad_norm
- # )
tr_loss += loss.item()
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16:
- torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
- else:
- torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+ scaler.unscale_(optimizer)
+ torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
- optimizer.step()
+ if args.fp16:
+ scaler.step(optimizer)
+ scaler.update()
+ else:
+ optimizer.step()
scheduler.step() # Update learning rate schedule
model.zero_grad()
global_step += 1
diff --git a/simpletransformers/t5/t5_utils.py b/simpletransformers/t5/t5_utils.py
index 79df195c..016796b7 100644
--- a/simpletransformers/t5/t5_utils.py
+++ b/simpletransformers/t5/t5_utils.py
@@ -1,17 +1,16 @@
import logging
import os
-from os import truncate
import pickle
from multiprocessing import Pool
+from os import truncate
from typing import Tuple
-from tqdm.auto import tqdm
-
import pandas as pd
import torch
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from torch.utils.data import Dataset
+from tqdm.auto import tqdm
from transformers import PreTrainedTokenizer
logger = logging.getLogger(__name__)
diff --git a/tests/test_classification.py b/tests/test_classification.py
index 329c47d7..8c4b77dc 100644
--- a/tests/test_classification.py
+++ b/tests/test_classification.py
@@ -1,5 +1,6 @@
import pandas as pd
import pytest
+
from simpletransformers.classification import ClassificationModel, MultiLabelClassificationModel
diff --git a/tests/test_language_modeling.py b/tests/test_language_modeling.py
index 6db3d4cc..827de7a0 100644
--- a/tests/test_language_modeling.py
+++ b/tests/test_language_modeling.py
@@ -1,7 +1,9 @@
+import os
+
import pandas as pd
import pytest
+
from simpletransformers.language_modeling import LanguageModelingModel
-import os
@pytest.mark.parametrize(
diff --git a/tests/test_language_representation.py b/tests/test_language_representation.py
index f578a1b0..1bc68aed 100644
--- a/tests/test_language_representation.py
+++ b/tests/test_language_representation.py
@@ -1,4 +1,5 @@
import pytest
+
from simpletransformers.language_representation import RepresentationModel
diff --git a/tests/test_named_entity_recognition.py b/tests/test_named_entity_recognition.py
index f421e60c..6ef78a57 100644
--- a/tests/test_named_entity_recognition.py
+++ b/tests/test_named_entity_recognition.py
@@ -1,5 +1,6 @@
import pandas as pd
import pytest
+
from simpletransformers.ner import NERModel
diff --git a/tests/test_question_answering.py b/tests/test_question_answering.py
index 8a129e83..6f6bb3de 100644
--- a/tests/test_question_answering.py
+++ b/tests/test_question_answering.py
@@ -1,8 +1,9 @@
import json
import logging
-import pytest
import os
+import pytest
+
from simpletransformers.question_answering import QuestionAnsweringModel
diff --git a/tests/test_seq2seq.py b/tests/test_seq2seq.py
index ffeece35..af41129d 100644
--- a/tests/test_seq2seq.py
+++ b/tests/test_seq2seq.py
@@ -1,7 +1,9 @@
+import os
+
import pandas as pd
import pytest
+
from simpletransformers.seq2seq import Seq2SeqArgs, Seq2SeqModel
-import os
@pytest.mark.parametrize(
diff --git a/tests/test_t5.py b/tests/test_t5.py
index d45f0d1e..72d49438 100644
--- a/tests/test_t5.py
+++ b/tests/test_t5.py
@@ -1,5 +1,6 @@
import pandas as pd
import pytest
+
from simpletransformers.t5 import T5Model