-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathmodel.py
executable file
·170 lines (127 loc) · 5.55 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
from __future__ import print_function
from __future__ import absolute_import
from __future__ import division
import torch
import torch.nn.functional as F
import torch.nn as nn
import math
from utils import *
class Adapter(nn.Module):
# Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
# while original implementation places the stride at the first 1x1 convolution(self.conv1)
# according to "Deep residual learning for image recognition" https://arxiv.org/abs/1512.03385.
# This variant is also known as ResNet V1.5 and improves accuracy according to
# https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
def __init__(
self,
c_in,
c_type,
width=16, # best so far 16
dtype=None
):
super().__init__()
self.c_in = c_in
self.c_type = c_type
# Round up to the nearest integer
size = int(math.ceil(math.sqrt(self.c_in)))
norm_layer = nn.LayerNorm
# Both self.conv2 and self.downsample layers downsample the input when stride != 1
self.conv1 = nn.Conv2d(1, width, kernel_size=1,
stride=1, bias=False, dtype=dtype)
self.bn1 = norm_layer([width, size, size], dtype=dtype)
self.conv2 = nn.Conv2d(width, width, kernel_size=3,
stride=1, padding=1, bias=False, dtype=dtype)
self.bn2 = norm_layer([width, size, size], dtype=dtype)
self.conv3 = nn.Conv2d(width, 1, kernel_size=1,
stride=1, bias=False, dtype=dtype)
self.bn3 = norm_layer([1, size, size], dtype=dtype)
self.relu = nn.ReLU(inplace=True)
def forward(self, x):
"""
size = int(math.sqrt(self.c_in))
x = x.view(-1, 1, size, size) #sqrt(768) is not a perfect integer for ViT-L/14
"""
size = int(math.ceil(math.sqrt(self.c_in))
) # Round up to the nearest integer
pad_size = size**2 - self.c_in # Compute the padding size
# Pad the input tensor with zeros if necessary
x = torch.nn.functional.pad(x, (0, pad_size))
x = x.view(-1, 1, size, size) # Reshape the tensor to a square shape
identity = x
out = self.conv1(x)
out = self.bn1(out)
if self.c_type == 'conv-3x':
out = self.conv2(out)
out = self.bn2(out)
out = self.conv3(out)
out = self.bn3(out)
out += identity
out = out.view(-1, 1, size*size)
out = out[:, :, :self.c_in].view(-1, self.c_in)
return out
class Adapter_FC(nn.Module):
def __init__(self, c_in, reduction=4, dtype=None):
super(Adapter_FC, self).__init__()
self.fc = nn.Sequential(
nn.Linear(c_in, c_in // reduction, bias=False, dtype=dtype),
nn.LayerNorm(c_in // reduction, dtype=dtype),
nn.Linear(c_in // reduction, c_in, bias=False, dtype=dtype),
nn.LayerNorm(c_in, dtype=dtype),
)
def forward(self, image_features):
x = self.fc(image_features)
ratio = 0.2 # to prevent overfitting
image_features = ratio * x + (1 - ratio) * image_features
return image_features
class Embedder(nn.Module):
def __init__(self, vocab_size, d_model, weights, dtype=None):
super().__init__()
self.embed = nn.Embedding(vocab_size, d_model, dtype=dtype)
self.embed.weight = nn.Parameter(weights.clone())
def forward(self, x):
return self.embed(x)
def attention(q, k, v, d_k, mask=None, dropout=None):
scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
if mask is not None:
mask = mask.unsqueeze(1)
scores = scores.masked_fill(mask == 0, -1e9)
scores = F.softmax(scores, dim=-1)
if dropout is not None:
scores = dropout(scores)
output = torch.matmul(scores, v)
return output
class MultiHeadAttention(nn.Module):
def __init__(self, heads, d_model, dropout=0.1, dtype=None):
super().__init__()
self.d_model = d_model
self.d_k = d_model // heads
self.h = heads
self.q_linear = nn.Linear(d_model, d_model, dtype=dtype)
self.v_linear = nn.Linear(d_model, d_model, dtype=dtype)
self.k_linear = nn.Linear(d_model, d_model, dtype=dtype)
self.dropout = nn.Dropout(dropout)
self.out = nn.Linear(d_model, d_model, dtype=dtype)
def forward(self, q, k, v, mask=None):
bs = q.size(0)
# perform linear operation and split into h heads
k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
v = self.v_linear(v).view(bs, -1, self.h, self.d_k)
# transpose to get dimensions bs * h * sl * d_model
k = k.transpose(1, 2)
q = q.transpose(1, 2)
v = v.transpose(1, 2)
# calculate attention using function we will define next
scores = attention(q, k, v, self.d_k, mask, self.dropout)
# concatenate heads and put through final linear layer
concat = scores.transpose(1, 2).contiguous().view(bs, -1, self.d_model)
output = self.out(concat)
return output
class Encoder(nn.Module):
def __init__(self, vocab_size, d_model, heads, weights, dtype=None):
super().__init__()
self.embed = Embedder(vocab_size, d_model, weights, dtype=dtype)
self.attn = MultiHeadAttention(heads, d_model, dtype=dtype)
def forward(self, src):
x1 = self.embed(src)
return self.attn(x1, x1, x1)