A beginner's CNN learning notes -- building a ViT (Vision Transformer) with PyTorch


Functions used
tensor.chunk(chunks, dim) -- splits a tensor into `chunks` pieces along dimension `dim` and returns them as a tuple.
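A minimal sketch of chunk(), splitting a small tensor into three pieces along the last dimension (the same pattern is used below to split the Q/K/V projection):

import torch

x = torch.arange(12).reshape(2, 6)
q, k, v = x.chunk(3, dim=-1)                # a tuple of three tensors, each of shape [2, 2]
print(q.shape, k.shape, v.shape)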

from einops import rearrange
from einops.layers.torch import Rearrange
Similar to reshape()/view() but more readable; rearrange is a function, while Rearrange is a layer class (so it can be placed inside nn.Sequential).
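A quick sketch of rearrange(), performing the same permutation that the Rearrange layer does in the patch embedding below:

import torch
from einops import rearrange

x = torch.randn(1, 768, 196)                # [b, patch_dim, num_patches]
y = rearrange(x, "b d n -> b n d")          # [1, 196, 768], equivalent to x.permute(0, 2, 1)
print(y.shape)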


from einops import repeat
Duplicates elements of a tensor to produce a new one; note that it behaves like a shallow copy.
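A minimal sketch of repeat(), mirroring how the cls token is broadcast across the batch dimension later (the batch size 4 is just an illustration):

import torch
from einops import repeat

cls = torch.randn(1, 1, 768)                      # [1, 1, dim]
cls_batch = repeat(cls, "() n d -> b n d", b=4)   # [4, 1, dim]
print(cls_batch.shape)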

einsum()
Einstein-summation notation; here it is used for the batched matrix products Q·Kᵀ and A·V in the attention computation.
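A small check (with made-up shapes) that the einsum pattern used in MSA below is just a batched matrix product Q @ Kᵀ:

import torch
from torch import einsum

Q = torch.randn(2, 3, 5, 8)                             # [b, heads, n, head_dim]
K = torch.randn(2, 3, 5, 8)
scores = einsum("b h i d, b h j d -> b h i j", Q, K)    # [2, 3, 5, 5]
print(torch.allclose(scores, Q @ K.transpose(-1, -2)))  # True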

patch + position embedding
On the position encoding in the Position Embedding and the cls token, see: http://t.csdn.cn/twonr
# patch + position embedding
# (imports for the whole script)
import torch
import torch.nn as nn
from torch import einsum
from einops import rearrange, repeat
from einops.layers.torch import Rearrange

class Patch_embed(nn.Module):
    def __init__(self, img_size, patch_size, dim, P):
        super(Patch_embed, self).__init__()
        # Unfold cuts the image into patches, Rearrange moves the patch axis forward,
        # and Linear projects each flattened patch to `dim`
        self.unfold = nn.Sequential(nn.Unfold(kernel_size=(patch_size, patch_size),
                                              dilation=(1, 1),
                                              padding=(0, 0),
                                              stride=(patch_size, patch_size)),
                                    Rearrange("b i j -> b j i"),
                                    nn.Linear(patch_size**2 * 3, dim)
                                    )
        self.dropout = nn.Dropout(p=P)
        n = (img_size // patch_size) ** 2
        # nn.Parameter() turns a tensor into a learnable parameter
        # the position encoding is learnable and is simply added to the tokens
        self.pos = nn.Parameter(torch.randn(1, n + 1, dim))   # [1, 197, dim]
        # the other rows all come from image patches; one extra learnable vector
        # (the cls token) is prepended and later used as the basis for the prediction
        self.cls = nn.Parameter(torch.randn(1, 1, dim))       # [1, 1, dim]

    def forward(self, x):   # input [1, 3, 224, 224]
        # patch_size = (16, 16), 224 / 16 = 14
        # [1, 16*16*3, 14*14] -> [1, 14*14, 16*16*3] -> [1, 14*14, dim]
        # [1, 768, 196]       -> [1, 196, 768]       -> [1, 196, dim]
        x = self.unfold(x)
        b, _, _ = x.shape
        cls_token = repeat(self.cls, "() n d -> b n d", b=b)   # -> [batch_size, 1, dim]
        # prepend the cls token as the first row
        x = torch.cat((cls_token, x), dim=-2)   # [b, 196, dim] -> [b, 197, dim]
        # position embedding
        x += self.pos
        x = self.dropout(x)
        return x
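A quick shape check of Patch_embed with the same settings as the test at the end (dim=1000 is the value used there):

x = torch.rand(1, 3, 224, 224)
pe = Patch_embed(img_size=224, patch_size=16, dim=1000, P=0.3)
print(pe(x).shape)   # torch.Size([1, 197, 1000]): 14*14 = 196 patches + 1 cls token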
Main components:
MSA (the Multi-head Self-Attention part)
Multi-head self-attention (at least I think that's roughly how it works)

class MSA(nn.Module):
    '''Multi-head self-attention; the input and output dimensions are the same'''
    def __init__(self, dim_in,    # dimension of each input vector (token)
                 heads,           # number of heads
                 head_dim,        # dimension of each head
                 P=0.
                 ):
        super().__init__()
        self.heads = heads
        dim_inner = heads * head_dim
        self.scale = head_dim ** (-0.5)   # 1 / sqrt(d_k)
        self.to_qkv = nn.Linear(dim_in, dim_inner * 3)   # produces Q, K and V, hence the factor of 3
        self.softmax = nn.Softmax(dim=-1)
        # project the output tokens back to the input dimension;
        # when there is a single head whose dimension equals the input, the projection is skipped
        project_out = not (heads == 1 and head_dim == dim_in)
        self.to_out = nn.Sequential(
            nn.Linear(dim_inner, dim_in),
            nn.Dropout(P)
        ) if project_out else nn.Identity()

    def forward(self, x):
        # x.shape: [b, n, l]
        # b samples, each containing n vectors (tokens) of length l
        b, n, _ = x.shape
        h = self.heads
        # a single linear layer produces Q, K and V
        QKV = self.to_qkv(x).chunk(3, dim=-1)   # chunk(3, dim=-1): split into three along the last dimension, returned as a tuple
        # map(func, iterable)
        Q, K, V = map(lambda t: rearrange(t, "b n (heads head_dim) -> b heads n head_dim", heads=h), QKV)
        # attention(q, k, v) = softmax((q @ k^T) / sqrt(d_k)) @ v
        # einsum() axis labels must be single letters a-z or A-Z, otherwise it raises an error
        # b heads n head_dim, b heads n head_dim -> b heads n n  (writing "n n" on the output would reuse a label and fail)
        QK = einsum("b h i d, b h j d -> b h i j", Q, K) * self.scale   # einsum('i j, k j -> i k', A, B): A @ B^T
        A = self.softmax(QK)
        # b heads n n, b heads n head_dim -> b heads n head_dim
        att = einsum("b h i j, b h j d -> b h i d", A, V)   # einsum('i j, j k -> i k', A, B)
        # b heads n head_dim -> b n heads*head_dim (i.e. dim_inner)
        out = rearrange(att, "b heads n head_dim -> b n (heads head_dim)")
        out = self.to_out(out)   # output tokens have the same dimension as the input
        return out
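A quick sanity check that MSA keeps the token shape unchanged (parameter values borrowed from the test at the end):

msa = MSA(dim_in=1000, heads=3, head_dim=2000, P=0.)
tokens = torch.randn(1, 197, 1000)
print(msa(tokens).shape)   # torch.Size([1, 197, 1000])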
MLP: multi-layer perceptron (i.e. fully connected layers)
class MLP(nn.Module):
    '''Two fully connected layers: the token dimension is expanded and then shrunk back before output'''
    def __init__(self, dim_in, mlp_dim, P=0.3):
        super(MLP, self).__init__()
        self.mlp = nn.Sequential(
            nn.Linear(dim_in, mlp_dim),
            nn.GELU(),
            nn.Dropout(p=P),
            nn.Linear(mlp_dim, dim_in),
            nn.Dropout(p=P)
        )

    def forward(self, x):
        x = self.mlp(x)
        return x
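A one-line check that MLP expands to mlp_dim internally but returns tokens of the input dimension (values borrowed from the test at the end):

mlp = MLP(dim_in=1000, mlp_dim=2000, P=0.3)
print(mlp(torch.randn(1, 197, 1000)).shape)   # torch.Size([1, 197, 1000])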
Transformer Encoder

class Transformer_encoder(nn.Module):
    def __init__(self, dim_in,    # dimension of the input vectors
                 heads,           # number of attention heads
                 head_dim,        # dimension of each head
                 P,               # dropout probability
                 mlp_dim          # number of hidden units in the MLP
                 ):
        super().__init__()
        # each residual branch gets its own LayerNorm
        self.Norm1 = nn.LayerNorm(dim_in)                              # Norm before MSA
        self.Norm2 = nn.LayerNorm(dim_in)                              # Norm before MLP
        self.MSA = MSA(dim_in, heads=heads, head_dim=head_dim, P=P)    # MSA
        self.MLP = MLP(dim_in=dim_in, mlp_dim=mlp_dim, P=P)            # MLP

    def forward(self, x):
        x1 = self.Norm1(x)
        x2 = self.MSA(x1)
        x3 = x + x2            # residual connection around MSA
        x4 = self.Norm2(x3)
        x5 = self.MLP(x4)
        out = x3 + x5          # residual connection around MLP
        return out
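The same kind of shape check for a single encoder block, which should also preserve the token shape (values matching the test below):

enc = Transformer_encoder(dim_in=1000, heads=3, head_dim=2000, P=0.3, mlp_dim=2000)
tokens = torch.randn(1, 197, 1000)
print(enc(tokens).shape)   # torch.Size([1, 197, 1000])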
ViT
class vit(nn.Module):
    def __init__(self, img_size,    # image size
                 depth,             # number of transformer encoder layers
                 dim_in,            # token dimension after patch embedding
                 patch_size,        # patch size
                 heads,             # number of attention heads
                 head_dim,          # dimension of each head
                 mlp_dim,           # number of hidden units in the MLP
                 P,                 # dropout probability
                 class_num
                 ):
        super(vit, self).__init__()
        self.patch_embed = Patch_embed(img_size, patch_size, dim_in, P)
        self.layers = nn.ModuleList()
        self.MLP_Head = nn.Sequential(nn.LayerNorm(dim_in),
                                      nn.Linear(dim_in, class_num),
                                      nn.Softmax(dim=-1)
                                      )
        for i in range(depth):
            self.layers.append(Transformer_encoder(dim_in, heads=heads, head_dim=head_dim, P=P, mlp_dim=mlp_dim))

    def forward(self, x):           # [1, 3, 224, 224]
        x = self.patch_embed(x)     # [1, 197, 1000]
        # pass through `depth` transformer encoder layers
        for transformer_encoder in self.layers:
            x = transformer_encoder(x)
        x = x[:, 0]                 # take the cls token (first row)
        x = self.MLP_Head(x)        # the MLP Head outputs the class scores
        return x

test
if __name__ == "__main__":
    x = torch.rand(1, 3, 224, 224)
    model = vit(img_size=224,      # image size
                depth=3,           # number of transformer encoder layers
                dim_in=1000,       # token dimension after patch_embed
                patch_size=16,     # patch size
                heads=3,           # number of attention heads
                head_dim=2000,     # dimension of each head
                mlp_dim=2000,      # number of hidden units in the MLP
                P=0.3,             # dropout probability, i.e. the fraction of units dropped during training (against overfitting)
                class_num=2        # dimension of the output vector (number of classes)
                )
    y = model(x)
    print(f"x.shape: {x.shape}")
    print(f"y.shape: {y.shape}")
    print(f"y: {y}")
    print(f"y.sum():{y.sum()}")
Output:

Not sure whether it's all correct, but at least it runs!
References:
https://zhuanlan.zhihu.com/p/445122996
https://blog.csdn.net/weixin_44966641/article/details/118733341