diff --git a/README.md b/README.md
index 7be6833..625aad3 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,16 @@ Please join Join us on Discord
 
 <a href=AI Coffeebreak explanation
 
+## Appreciation
+
+- Stability.ai for the generous sponsorship to work on cutting-edge artificial intelligence research
+
+- 🤗 Huggingface for their amazing transformers and accelerate libraries
+
+- Guillem for his ongoing contributions
+
+- You? If you are a great machine learning engineer and/or researcher, feel free to contribute to the frontier of open source generative AI
+
 ## Install
 
 ```bash
@@ -132,6 +142,74 @@ entire_video.shape # (1, 3, 17 + 14 + 14 = 45, 256, 256)
 
 That's it!
 
+## Token Critic
+
+A new paper, Token-Critic (Lezama et al.), suggests that instead of relying on the predicted probabilities of each token as a measure of confidence, one can train an extra critic to decide what to iteratively mask during sampling. You can optionally train this critic for potentially better generations, as shown below
+
+```python
+import torch
+from phenaki_pytorch import CViViT, MaskGit, TokenCritic, PhenakiCritic
+
+cvivit = CViViT(
+    dim = 512,
+    codebook_size = 5000,
+    image_size = (256, 128),
+    patch_size = 32,
+    temporal_patch_size = 2,
+    spatial_depth = 4,
+    temporal_depth = 4,
+    dim_head = 64,
+    heads = 8
+)
+
+maskgit = MaskGit(
+    num_tokens = 5000,
+    max_seq_len = 1024,
+    dim = 512,
+    dim_context = 768,
+    depth = 6,
+)
+
+critic = TokenCritic(
+    num_tokens = 5000,
+    max_seq_len = 1024,
+    dim = 512,
+    dim_context = 768,
+    depth = 6
+)
+
+critic_trainer = PhenakiCritic(
+    maskgit = maskgit,
+    critic = critic,
+    cvivit = cvivit
+).cuda()
+
+texts = [
+    'a whale breaching from afar',
+    'young girl blowing out candles on her birthday cake',
+    'fireworks with blue and green sparkles'
+]
+
+videos = torch.randn(3, 3, 3, 256, 128).cuda() # (batch, channels, frames, height, width)
+
+loss = critic_trainer(videos = videos, texts = texts)
+loss.backward()
+```
+
+Then just pass the critic to `Phenaki`
+
+```python
+
+phenaki = Phenaki(
+    cvivit = cvivit,
+    maskgit = maskgit,
+    critic = critic
+).cuda()
+
+```
+
+Now your generations should be greatly improved (but who knows, since this research is only a month old)
+
 ## Phenaki Trainer (wip)
 
 This repository will also endeavor to allow the researcher to train on text-to-image and then text-to-video. Similarly, for unconditional training, the researcher should be able to first train on images and then fine tune on video. Below is an example for text-to-video
@@ -209,12 +287,11 @@ trainer = PhenakiTrainer(
 trainer.train()
 ```
 
-Unconditional is as follows
-
-ex. unconditional images and video training
+Token critic training is similar
 
 ```python
 import torch
+from torch.utils.data import Dataset
 from phenaki_pytorch import CViViT, MaskGit, Phenaki, PhenakiTrainer
 
 cvivit = CViViT(
@@ -240,36 +317,70 @@ maskgit = MaskGit(
     unconditional = False
 )
 
-phenaki = Phenaki(
-    cvivit = cvivit,
-    maskgit = maskgit
+critic = TokenCritic(
+    num_tokens = 5000,
+    max_seq_len = 1024,
+    dim = 512,
+    dim_context = 768,
+    depth = 6
+)
+
+phenaki_critic = PhenakiCritic(
+    maskgit = maskgit,
+    critic = critic,
+    cvivit = cvivit
+).cuda()
 
-# pass in the folder to images or video
+# mock text video dataset
+# you will have to extend your own, and return the (
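+
+# a minimal sketch of what such a mock dataset might look like, assuming each
+# item is a (video tensor, caption) tuple as suggested by the trainer usage above;
+# the class name, shapes, and caption below are illustrative assumptions, not the
+# repository's actual code
+class MockTextVideoDataset(Dataset):
+    def __init__(self, length = 100, image_size = 256, num_frames = 17):
+        super().__init__()
+        self.length = length
+        self.image_size = image_size
+        self.num_frames = num_frames
+
+    def __len__(self):
+        return self.length
+
+    def __getitem__(self, idx):
+        # a random video in (channels, frames, height, width) format, matching
+        # the (batch, channels, frames, height, width) batches used earlier
+        video = torch.randn(3, self.num_frames, self.image_size, self.image_size)
+        caption = 'a placeholder caption'
+        return video, caption
+
+dataset = MockTextVideoDataset()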