Add Uformer architecture (#41)

chaiNNer-org · Nov 22, 2023 · 8de1158 · 8de1158
1 parent 49f4494
commit 8de1158
Show file tree

Hide file tree

Showing 6 changed files with 1,900 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -112,6 +112,7 @@ This has only been tested with the models that are linked here, and any unoffici
 #### Denoising
 
 - [SCUNet](https://github.com/cszn/SCUNet) | [GAN Model](https://github.com/cszn/KAIR/releases/download/v1.0/scunet_color_real_gan.pth) | [PSNR Model](https://github.com/cszn/KAIR/releases/download/v1.0/scunet_color_real_psnr.pth)
+- [Uformer](https://github.com/ZhendongWang6/Uformer) | [Denoise SIDD Model](https://mailustceducn-my.sharepoint.com/:u:/g/personal/zhendongwang_mail_ustc_edu_cn/Ea7hMP82A0xFlOKPlQnBJy0B9gVP-1MJL75mR4QKBMGc2w?e=iOz0zz) | [Deblur GoPro Model](https://mailustceducn-my.sharepoint.com/:u:/g/personal/zhendongwang_mail_ustc_edu_cn/EfCPoTSEKJRAshoE6EAC_3YB7oNkbLUX6AUgWSCwoJe0oA?e=jai90x)
 - [KBNet](https://github.com/zhangyi-3/KBNet) | [Models](https://mycuhk-my.sharepoint.com/personal/1155135732_link_cuhk_edu_hk/_layouts/15/onedrive.aspx?ga=1&id=%2Fpersonal%2F1155135732%5Flink%5Fcuhk%5Fedu%5Fhk%2FDocuments%2Fshare%2FKBNet%2FDenoising%2Fpretrained%5Fmodels)
 
 #### DeJPEG

diff --git a/src/spandrel/__helpers/main_registry.py b/src/spandrel/__helpers/main_registry.py
@@ -21,6 +21,7 @@
     SwiftSRGAN,
     Swin2SR,
     SwinIR,
+    Uformer,
 )
 from .model_descriptor import StateDict
 from .registry import ArchRegistry, ArchSupport
@@ -165,6 +166,19 @@ def _detect(state_dict: StateDict) -> bool:
         ),
         load=FBCNN.load,
     ),
+    ArchSupport(
+        id="Uformer",
+        detect=_has_keys(
+            "input_proj.proj.0.weight",
+            "output_proj.proj.0.weight",
+            "encoderlayer_0.blocks.0.norm1.weight",
+            "encoderlayer_2.blocks.0.norm1.weight",
+            "conv.blocks.0.norm1.weight",
+            "decoderlayer_0.blocks.0.norm1.weight",
+            "decoderlayer_2.blocks.0.norm1.weight",
+        ),
+        load=Uformer.load,
+    ),
     ArchSupport(
         id="DAT",
         detect=_has_keys("layers.0.blocks.2.attn.attn_mask_0", "conv_first.weight"),

diff --git a/src/spandrel/architectures/Uformer/__init__.py b/src/spandrel/architectures/Uformer/__init__.py
@@ -0,0 +1,120 @@
+import math
+
+from ...__helpers.model_descriptor import (
+    RestorationModelDescriptor,
+    SizeRequirements,
+    StateDict,
+)
+from ..__arch_helpers.state import get_seq_len
+from .arch.Uformer import Uformer
+
+
+def load(state_dict: StateDict) -> RestorationModelDescriptor[Uformer]:
+    img_size = 256  # cannot be deduced from state_dict
+    in_chans = 3
+    dd_in = 3
+    embed_dim = 32
+    depths = [2, 2, 2, 2, 2, 2, 2, 2, 2]
+    num_heads = [1, 2, 4, 8, 16, 16, 8, 4, 2]
+    win_size = 8
+    mlp_ratio = 4.0
+    qkv_bias = True
+    drop_rate = 0.0  # cannot be deduced from state_dict
+    attn_drop_rate = 0.0  # cannot be deduced from state_dict
+    drop_path_rate = 0.1  # cannot be deduced from state_dict
+    token_projection = "linear"
+    token_mlp = "leff"
+    shift_flag = True  # cannot be deduced from state_dict
+    modulator = False
+    cross_modulator = False
+
+    embed_dim = state_dict["input_proj.proj.0.weight"].shape[0]
+    dd_in = state_dict["input_proj.proj.0.weight"].shape[1]
+    in_chans = state_dict["output_proj.proj.0.weight"].shape[0]
+
+    depths[0] = get_seq_len(state_dict, "encoderlayer_0.blocks")
+    depths[1] = get_seq_len(state_dict, "encoderlayer_1.blocks")
+    depths[2] = get_seq_len(state_dict, "encoderlayer_2.blocks")
+    depths[3] = get_seq_len(state_dict, "encoderlayer_3.blocks")
+    depths[4] = get_seq_len(state_dict, "conv.blocks")
+    depths[5] = get_seq_len(state_dict, "decoderlayer_0.blocks")
+    depths[6] = get_seq_len(state_dict, "decoderlayer_1.blocks")
+    depths[7] = get_seq_len(state_dict, "decoderlayer_2.blocks")
+    depths[8] = get_seq_len(state_dict, "decoderlayer_3.blocks")
+
+    num_heads_suffix = "blocks.0.attn.relative_position_bias_table"
+    num_heads[0] = state_dict[f"encoderlayer_0.{num_heads_suffix}"].shape[1]
+    num_heads[1] = state_dict[f"encoderlayer_1.{num_heads_suffix}"].shape[1]
+    num_heads[2] = state_dict[f"encoderlayer_2.{num_heads_suffix}"].shape[1]
+    num_heads[3] = state_dict[f"encoderlayer_3.{num_heads_suffix}"].shape[1]
+    num_heads[4] = state_dict[f"conv.{num_heads_suffix}"].shape[1]
+    num_heads[5] = state_dict[f"decoderlayer_0.{num_heads_suffix}"].shape[1]
+    num_heads[6] = state_dict[f"decoderlayer_1.{num_heads_suffix}"].shape[1]
+    num_heads[7] = state_dict[f"decoderlayer_2.{num_heads_suffix}"].shape[1]
+    num_heads[8] = state_dict[f"decoderlayer_3.{num_heads_suffix}"].shape[1]
+
+    if "encoderlayer_0.blocks.0.attn.qkv.to_q.depthwise.weight" in state_dict:
+        token_projection = "conv"
+        qkv_bias = True  # cannot be deduced from state_dict
+    else:
+        token_projection = "linear"
+        qkv_bias = "encoderlayer_0.blocks.0.attn.qkv.to_q.bias" in state_dict
+
+    modulator = "decoderlayer_0.blocks.0.modulator.weight" in state_dict
+    cross_modulator = "decoderlayer_0.blocks.0.cross_modulator.weight" in state_dict
+
+    # size_temp = (2 * win_size - 1) ** 2
+    size_temp = state_dict[
+        "encoderlayer_0.blocks.0.attn.relative_position_bias_table"
+    ].shape[0]
+    win_size = (int(math.sqrt(size_temp)) + 1) // 2
+
+    if "encoderlayer_0.blocks.0.mlp.fc1.weight" in state_dict:
+        token_mlp = "mlp"  # or "ffn", doesn't matter
+        mlp_ratio = (
+            state_dict["encoderlayer_0.blocks.0.mlp.fc1.weight"].shape[0] / embed_dim
+        )
+    elif state_dict["encoderlayer_0.blocks.0.mlp.dwconv.0.weight"].shape[1] == 1:
+        token_mlp = "leff"
+        mlp_ratio = (
+            state_dict["encoderlayer_0.blocks.0.mlp.linear1.0.weight"].shape[0]
+            / embed_dim
+        )
+    else:
+        token_mlp = "fastleff"
+        mlp_ratio = (
+            state_dict["encoderlayer_0.blocks.0.mlp.linear1.0.weight"].shape[0]
+            / embed_dim
+        )
+
+    model = Uformer(
+        img_size=img_size,
+        in_chans=in_chans,
+        dd_in=dd_in,
+        embed_dim=embed_dim,
+        depths=depths,
+        num_heads=num_heads,
+        win_size=win_size,
+        mlp_ratio=mlp_ratio,
+        qkv_bias=qkv_bias,
+        drop_rate=drop_rate,
+        attn_drop_rate=attn_drop_rate,
+        drop_path_rate=drop_path_rate,
+        token_projection=token_projection,
+        token_mlp=token_mlp,
+        shift_flag=shift_flag,
+        modulator=modulator,
+        cross_modulator=cross_modulator,
+    )
+
+    return RestorationModelDescriptor(
+        model,
+        state_dict,
+        architecture="Uformer",
+        tags=[],
+        supports_half=False,  # Too much weirdness to support this at the moment
+        supports_bfloat16=True,
+        input_channels=dd_in,
+        output_channels=dd_in,
+        size_requirements=SizeRequirements(multiple_of=128, square=True),
+    )
diff --git a/src/spandrel/architectures/Uformer/arch/LICENSE b/src/spandrel/architectures/Uformer/arch/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 Zhendong Wang
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.