# YOLOV1_pytorch_(2)
## 3 Prediction
### 3.1 Prediction Pipeline
1. Image preprocessing
2. Prediction
3. Decoding
4. Drawing boxes
(1) Prediction
predict.py
```python
import torch
from resnet_yolo import resnet50
import torchvision.transforms as transforms
import cv2
import numpy as np
VOC_CLASSES = ( # always index 0
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor')
Color = [[0, 0, 0],[128, 0, 0],[0, 128, 0],[128, 128, 0],[0, 0, 128],
[128, 0, 128],[0, 128, 128],[128, 128, 128],[64, 0, 0],[192, 0, 0],
[64, 128, 0],[192, 128, 0],[64, 0, 128],[192, 0,128],[64, 128, 128],
[192, 128, 128],[0, 64, 0],[128, 64, 0],[0, 192, 0],[128, 192, 0],[0, 64, 128]]
def nms(bboxes,scores,threshold=0.5):
x1 = bboxes[:,0]
y1 = bboxes[:,1]
x2 = bboxes[:,2]
y2 = bboxes[:,3]
areas = (x2-x1)*(y2-y1)
_,order = scores.sort(0,descending=True)
keep = []
while order.numel() > 0:
        i = order[0].item() if order.numel() > 1 else order.item() # index of the highest-scoring remaining box
        keep.append(i)
if order.numel() == 1:
break
xx1 = x1[order[1:]].clamp(min=x1[i])
yy1 = y1[order[1:]].clamp(min=y1[i])
        xx2 = x2[order[1:]].clamp(max=x2[i])
        yy2 = y2[order[1:]].clamp(max=y2[i])
w = (xx2-xx1).clamp(min=0)
h = (yy2-yy1).clamp(min=0)
inter = w*h
ove = inter/(areas[i]+areas[order[1:]]-inter)
#ids = (ove <= threshold).nonzero().squeeze()
ids = torch.nonzero(ove <= threshold).squeeze()
if ids.numel() == 0:
break
order = order[ids+1]
return torch.LongTensor(keep)
def decoder(pred):
grid_num = 7
boxes = []
cls_indexs = []
probs = []
cell_size = 1./grid_num
pred = pred.data
pred = pred.squeeze(0) # 7x7x30
contain1 = pred[:,:,4].unsqueeze(2) # [7, 7, 1]
contain2 = pred[:,:,9].unsqueeze(2) # [7, 7, 1]
contain = torch.cat((contain1,contain2),2) # [7, 7, 2]
mask1 = contain > 0.1 # [7, 7, 2]
mask2 = (contain==contain.max()) # [7, 7, 2]
    mask = (mask1 | mask2) # [7, 7, 2] keep boxes above the threshold or the single most confident box
for i in range(grid_num):
for j in range(grid_num):
for b in range(2):
if mask[i,j,b] == 1:
box = pred[i,j,b*5:b*5+4]
                    contain_prob = pred[i,j,b*5+4].view(1) # objectness score of this box, shape (1,)
                    xy = torch.FloatTensor([j,i])*cell_size # upper-left corner of the cell (normalized)
                    box[:2] = box[:2]*cell_size + xy # box center (cx,cy) relative to the whole image
                    box_xy = torch.FloatTensor(box.size()) # convert [cx,cy,w,h] to [x1,y1,x2,y2]
box_xy[:2] = box[:2] - 0.5*box[2:]
box_xy[2:] = box[:2] + 0.5*box[2:]
max_prob,cls_index = torch.max(pred[i,j,10:],0)
if float((contain_prob*max_prob)[0]) > 0.1:
boxes.append(box_xy.view(1,4))
                        cls_indexs.append(cls_index.view(1)) # 0-dim LongTensor -> shape (1,)
probs.append(contain_prob*max_prob)
if len(boxes) == 0:
boxes = torch.zeros((1,4))
probs = torch.zeros(1)
cls_indexs = torch.zeros(1)
else:
boxes = torch.cat(boxes,0) #(n,4)
probs = torch.cat(probs,0) #(n,)
cls_indexs = torch.cat(cls_indexs,0) #(n,)
keep = nms(boxes,probs)
return boxes[keep],cls_indexs[keep],probs[keep]
def predict_gpu(model,image_name,root_path='/Users/ls/PycharmProjects/YOLOV1_LS/VOCdevkit/VOC2007/JPEGImages/'):
result = []
    image = cv2.imread(root_path+image_name)
    # 1 Image preprocessing
    h,w,_ = image.shape
    img = cv2.resize(image,(448,448)) # resize to the fixed network input size
    img = cv2.cvtColor(img,cv2.COLOR_BGR2RGB) # BGR -> RGB
    mean = (123,117,104) # RGB mean
    img = img - np.array(mean,dtype=np.float32) # subtract the per-channel mean
    transform = transforms.Compose([transforms.ToTensor(),])
    img = transform(img) # HWC -> CHW tensor
    img = img[None,:,:,:] # add the batch dimension
    # img = img.cuda()
    # 2 Prediction
    with torch.no_grad():
        pred = model(img) # 1x7x7x30
    pred = pred.cpu()
    # 3 Decoding
boxes,cls_indexs,probs = decoder(pred)
for i,box in enumerate(boxes):
x1 = int(box[0]*w)
x2 = int(box[2]*w)
y1 = int(box[1]*h)
y2 = int(box[3]*h)
cls_index = cls_indexs[i]
if cls_index.numel()==0:return
cls_index = int(cls_index) # convert LongTensor to int
prob = probs[i]
prob = float(prob)
result.append([(x1,y1),(x2,y2),VOC_CLASSES[cls_index],image_name,prob])
return result
if __name__ == '__main__':
model = resnet50()
print('load model...')
# model.load_state_dict(torch.load('best.pth'))
model.eval()
#model.cuda()
image_name = '000015.jpg'
    image = cv2.imread(image_name)
print('predicting...')
result = predict_gpu(model,image_name)
    # 4 Draw boxes
    for left_up,right_bottom,class_name, _ ,prob in result:
        color = Color[VOC_CLASSES.index(class_name)]
        cv2.rectangle(image,left_up,right_bottom,color,2)
        label = class_name+str(round(prob,2))
        text_size, baseline = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.4, 1)
        p1 = (left_up[0], left_up[1]- text_size[1])
        cv2.rectangle(image, (p1[0] - 2//2, p1[1] - 2 - baseline), (p1[0] + text_size[0], p1[1] + text_size[1]), color, -1)
        cv2.putText(image, label, (p1[0], p1[1] + baseline), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255,255,255), 1, 8)
    cv2.imwrite('result.jpg',image)
```
(2) NMS
NMS removes duplicate predicted boxes based on IoU. The idea is that boxes predicting the same object overlap heavily (high IoU), while boxes predicting different objects barely overlap (low IoU).
I. Compute the area of every predicted box.
II. Sort the boxes by confidence in descending order and keep the sorted indices. Take the highest-scoring box and compute the IoU between it and each remaining box.
III. Use the IoU threshold to decide which boxes survive, then repeat the same procedure on the survivors until only one box is left.
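As a quick illustration of the IoU used in step II, here is a minimal sketch with two hand-picked corner-form boxes (the values are made up and are not produced by the code in this post):
```python
import torch

# Two boxes in [x1, y1, x2, y2] form; box_b is box_a shifted by (1, 1).
box_a = torch.tensor([0., 0., 10., 10.])
box_b = torch.tensor([1., 1., 11., 11.])

# Intersection rectangle: max of the top-left corners, min of the bottom-right corners.
xx1 = torch.max(box_a[0], box_b[0])
yy1 = torch.max(box_a[1], box_b[1])
xx2 = torch.min(box_a[2], box_b[2])
yy2 = torch.min(box_a[3], box_b[3])
inter = (xx2 - xx1).clamp(min=0) * (yy2 - yy1).clamp(min=0)   # 9 * 9 = 81
area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])        # 100
area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])        # 100
iou = inter / (area_a + area_b - inter)                       # 81 / 119 ≈ 0.68
print(iou.item())
```
With a threshold of 0.5, two boxes that overlap this much are treated as duplicates. The commented nms implementation follows.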
```python
def nms(bboxes,scores,threshold=0.5):
x1 = bboxes[:,0]
y1 = bboxes[:,1]
x2 = bboxes[:,2]
y2 = bboxes[:,3]
    # 1 Compute the area of every predicted box
areas = (x2-x1)*(y2-y1)
    # 2 Sort the boxes by score in descending order
_,order = scores.sort(0,descending=True)
keep = []
while order.numel() > 0:
        # 3 Take the highest-scoring remaining box
        i = order[0].item() if order.numel() > 1 else order.item()
        keep.append(i)
if order.numel() == 1:
break
        # 4 IoU between the current box and all remaining boxes
        xx1 = x1[order[1:]].clamp(min=x1[i])
        yy1 = y1[order[1:]].clamp(min=y1[i])
        xx2 = x2[order[1:]].clamp(max=x2[i])
        yy2 = y2[order[1:]].clamp(max=y2[i])
w = (xx2-xx1).clamp(min=0)
h = (yy2-yy1).clamp(min=0)
inter = w*h
ove = inter/(areas[i]+areas[order[1:]]-inter)
#ids = (ove <= threshold).nonzero().squeeze()
        # 5 Drop boxes that overlap the current box too much (IoU above the threshold)
ids = torch.nonzero(ove <= threshold).squeeze()
if ids.numel() == 0:
break
        # 6 Boxes with little or no overlap become the candidates for the next round
order = order[ids+1]
return torch.LongTensor(keep)
```
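A small sanity check of nms with hypothetical boxes and scores (this assumes torch is imported and the nms defined above; box 0 and box 1 overlap with IoU ≈ 0.68, box 2 is disjoint):
```python
import torch

boxes = torch.tensor([[ 0.,  0., 10., 10.],
                      [ 1.,  1., 11., 11.],
                      [20., 20., 30., 30.]])
scores = torch.tensor([0.9, 0.8, 0.7])
print(nms(boxes, scores, threshold=0.5))  # tensor([0, 2]): box 1 is suppressed by box 0
```
Box 0 has the highest score, so it is kept; box 1 overlaps it above the threshold and is removed; box 2 does not overlap and survives to the next round, where it is kept as the last remaining box.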
(3) Decoding
I. Take the confidence scores from the prediction and use a confidence threshold for a first round of filtering.
II. Iterate over the rows and columns of the output feature map and over the two boxes of each grid cell, reading out the corresponding box and class probabilities. Compute the box center from the predicted offsets. The product of the class probability and the confidence is the final score, which is thresholded to filter the boxes once more.
III. Run non-maximum suppression on the surviving boxes and their scores, and output the predictions that pass.
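As a numeric sketch of the coordinate decoding in step II (the offsets below are made up, not real network outputs): a cell at row i, column j predicts a center offset (tx, ty) inside the cell plus a width and height relative to the whole image, and the decoder turns this into normalized corner coordinates.
```python
grid_num = 7
cell_size = 1.0 / grid_num
i, j = 3, 2                         # grid row (y) and column (x)
tx, ty, w, h = 0.4, 0.6, 0.2, 0.3   # hypothetical predictions for one box

# Center relative to the whole image: offset inside the cell plus the cell's upper-left corner.
cx = (j + tx) * cell_size
cy = (i + ty) * cell_size

# Convert [cx, cy, w, h] to corner form [x1, y1, x2, y2].
x1, y1 = cx - 0.5 * w, cy - 0.5 * h
x2, y2 = cx + 0.5 * w, cy + 0.5 * h
print([round(v, 3) for v in (x1, y1, x2, y2)])  # [0.243, 0.364, 0.443, 0.664]
```
The full decoder is below; it applies the same arithmetic to every cell and box that passes the confidence mask.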
```python
def decoder(pred):
grid_num = 7
boxes = []
cls_indexs = []
probs = []
cell_size = 1./grid_num
pred = pred.data
pred = pred.squeeze(0) # 7x7x30
contain1 = pred[:,:,4].unsqueeze(2) # [7, 7, 1]
contain2 = pred[:,:,9].unsqueeze(2) # [7, 7, 1]
    # 1 Pre-filter boxes by confidence
    contain = torch.cat((contain1,contain2),2) # [7, 7, 2]
    mask1 = contain > 0.1 # [7, 7, 2]
    mask2 = (contain==contain.max()) # [7, 7, 2] always keep the single most confident box
    mask = (mask1 | mask2) # [7, 7, 2]
for i in range(grid_num):
for j in range(grid_num):
for b in range(2):
if mask[i,j,b] == 1:
box = pred[i,j,b*5:b*5+4]
                    contain_prob = pred[i,j,b*5+4].view(1) # objectness score of this box, shape (1,)
                    xy = torch.FloatTensor([j,i])*cell_size # upper-left corner of the cell (normalized)
                    # 2 Decode box coordinates
                    box[:2] = box[:2]*cell_size + xy # box center (cx,cy) relative to the whole image
                    box_xy = torch.FloatTensor(box.size()) # convert [cx,cy,w,h] to [x1,y1,x2,y2]
                    box_xy[:2] = box[:2] - 0.5*box[2:]
                    box_xy[2:] = box[:2] + 0.5*box[2:]
max_prob,cls_index = torch.max(pred[i,j,10:],0)
                    # 3 Filter by the final score (class probability x confidence)
if float((contain_prob*max_prob)[0]) > 0.1:
boxes.append(box_xy.view(1,4))
                        cls_indexs.append(cls_index.view(1)) # 0-dim LongTensor -> shape (1,)
probs.append(contain_prob*max_prob)
if len(boxes) == 0:
boxes = torch.zeros((1,4))
probs = torch.zeros(1)
cls_indexs = torch.zeros(1)
else:
boxes = torch.cat(boxes,0) #(n,4)
probs = torch.cat(probs,0) #(n,)
cls_indexs = torch.cat(cls_indexs,0) #(n,)
    # 4 Non-maximum suppression
keep = nms(boxes,probs)
return boxes[keep],cls_indexs[keep],probs[keep]
```
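Finally, a synthetic end-to-end check of decoder (the tensor values are made up; this assumes torch is imported and the decoder and nms above are defined). One cell at row 3, column 2 claims an object with box-1 confidence 0.8 and class probability 0.9 for class index 6 ('car'):
```python
import torch

pred = torch.zeros(1, 7, 7, 30)
pred[0, 3, 2, 0:4] = torch.tensor([0.4, 0.6, 0.2, 0.3])  # tx, ty, w, h of box 1
pred[0, 3, 2, 4] = 0.8                                   # confidence of box 1
pred[0, 3, 2, 10 + 6] = 0.9                              # class probability for index 6 ('car')

boxes, cls_indexs, probs = decoder(pred)
print(boxes)       # ≈ tensor([[0.243, 0.364, 0.443, 0.664]]) normalized corners
print(cls_indexs)  # tensor([6]) -> VOC_CLASSES[6] == 'car'
print(probs)       # ≈ tensor([0.72]) = 0.8 * 0.9
```
Multiplying the normalized corners by the original image width and height, as predict_gpu does, gives the pixel coordinates that are drawn on the image.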