diff --git a/README.md b/README.md
index 71ebefc..1e46c5f 100644
--- a/README.md
+++ b/README.md
@@ -450,3 +450,15 @@ trainer.train()
 year = {2021}
 }
 ```
+
+```bibtex
+@misc{https://doi.org/10.48550/arxiv.2302.01327,
+  doi = {10.48550/ARXIV.2302.01327},
+  url = {https://arxiv.org/abs/2302.01327},
+  author = {Kumar, Manoj and Dehghani, Mostafa and Houlsby, Neil},
+  title = {Dual PatchNorm},
+  publisher = {arXiv},
+  year = {2023},
+  copyright = {Creative Commons Attribution 4.0 International}
+}
+```
diff --git a/phenaki_pytorch/cvivit.py b/phenaki_pytorch/cvivit.py
index 0ecf70d..f45ca07 100644
--- a/phenaki_pytorch/cvivit.py
+++ b/phenaki_pytorch/cvivit.py
@@ -269,12 +269,16 @@ def __init__(
         self.to_patch_emb_first_frame = nn.Sequential(
             Rearrange('b c 1 (h p1) (w p2) -> b 1 h w (c p1 p2)', p1 = patch_height, p2 = patch_width),
-            nn.Linear(channels * patch_width * patch_height, dim)
+            nn.LayerNorm(channels * patch_width * patch_height),
+            nn.Linear(channels * patch_width * patch_height, dim),
+            nn.LayerNorm(dim)
         )
 
         self.to_patch_emb = nn.Sequential(
             Rearrange('b c (t pt) (h p1) (w p2) -> b t h w (c pt p1 p2)', p1 = patch_height, p2 = patch_width, pt = temporal_patch_size),
-            nn.Linear(channels * patch_width * patch_height * temporal_patch_size, dim)
+            nn.LayerNorm(channels * patch_width * patch_height * temporal_patch_size),
+            nn.Linear(channels * patch_width * patch_height * temporal_patch_size, dim),
+            nn.LayerNorm(dim)
         )
 
         transformer_kwargs = dict(
diff --git a/setup.py b/setup.py
index 2fd4491..04c4024 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 setup(
   name = 'phenaki-pytorch',
   packages = find_packages(exclude=[]),
-  version = '0.1.1',
+  version = '0.2.0',
   license='MIT',
   description = 'Phenaki - Pytorch',
   author = 'Phil Wang',
@@ -31,7 +31,7 @@
     'torchvision',
     'transformers>=4.20.1',
     'tqdm',
-    'vector-quantize-pytorch>=0.10.14'
+    'vector-quantize-pytorch>=0.10.15'
   ],
   classifiers=[
     'Development Status :: 4 - Beta',