YOLO系列:实时目标检测的演进

🎙️ 语音朗读 当前: 晓晓 (温柔女声)

前言

YOLO(You Only Look Once)是实时目标检测领域的里程碑式算法。从2016年首次提出到如今的YOLOv5/YOLOX,该系列算法经历了多次重大革新。本文将系统梳理YOLO的发展历程和核心技术。

YOLO核心原理

YOLO将目标检测任务重新定义为单一的回归问题:

输入图像 → CNN → 输出(边界框+类别+置信度)

下面给出YOLO损失函数的简化实现:

import torch
import torch.nn as nn

class YOLOLoss(nn.Module):
    """Simplified sum-of-squares YOLO loss over a fixed prediction grid.

    The total is ``lambda_coord * box_loss + conf_loss + cls_loss``; inside
    the confidence term, cells without an object are scaled by
    ``lambda_noobj``.

    Args:
        num_classes: Number of class logits per predicted box.
        grid_size: Number of cells along each side of the grid.
        num_boxes: Number of boxes predicted per grid cell.
    """

    def __init__(self, num_classes=80, grid_size=13, num_boxes=5):
        super().__init__()
        self.num_classes = num_classes
        self.grid_size = grid_size
        self.num_boxes = num_boxes

        # Weight of the box term, and down-weight for cells with no object.
        self.lambda_coord = 5.0
        self.lambda_noobj = 0.5

    def forward(self, predictions, targets):
        """Compute the combined loss.

        Args:
            predictions: Model output, (batch, grid*grid*boxes, 5+num_classes)
                or anything that reshapes to
                (batch, grid, grid, boxes, 5+num_classes).
            targets: Ground truth indexed as ``[..., :4]`` box,
                ``[..., 4]`` objectness (> 0 means "has object") and
                ``[..., 5]`` class index. Its leading dimensions are expected
                to be (batch, grid, grid, boxes) so that the objectness mask
                can index the reshaped predictions.

        Returns:
            Scalar tensor with the summed (not averaged) loss.
        """
        batch_size = predictions.size(0)

        # reshape (not view) so non-contiguous model outputs are accepted too.
        predictions = predictions.reshape(
            batch_size, self.grid_size, self.grid_size,
            self.num_boxes, 5 + self.num_classes,
        )

        pred_box = predictions[..., :4]   # box coordinates
        pred_conf = predictions[..., 4]   # objectness
        pred_cls = predictions[..., 5:]   # class logits

        box_loss = self._compute_box_loss(pred_box, targets)
        conf_loss = self._compute_conf_loss(pred_conf, targets)
        cls_loss = self._compute_cls_loss(pred_cls, targets)

        return self.lambda_coord * box_loss + conf_loss + cls_loss

    @staticmethod
    def _masked_sse(pred, target, mask):
        """Sum of squared errors over ``mask``.

        Returns a zero scalar created from ``pred`` (same device and dtype)
        when the mask selects nothing, so the result can always be added to
        the other loss terms without a device mismatch.
        """
        if not mask.any():
            return pred.new_zeros(())
        return nn.functional.mse_loss(pred[mask], target[mask], reduction='sum')

    def _compute_box_loss(self, pred_box, targets):
        """Squared-error box loss over cells that contain an object."""
        mask = targets[..., 4] > 0
        return self._masked_sse(pred_box, targets[..., :4], mask)

    def _compute_conf_loss(self, pred_conf, targets):
        """Squared-error objectness loss; no-object cells are down-weighted."""
        target_conf = targets[..., 4]
        obj_mask = target_conf > 0
        noobj_mask = target_conf == 0

        obj_loss = self._masked_sse(pred_conf, target_conf, obj_mask)
        noobj_loss = self.lambda_noobj * self._masked_sse(
            pred_conf, target_conf, noobj_mask
        )
        return obj_loss + noobj_loss

    def _compute_cls_loss(self, pred_cls, targets):
        """Summed cross-entropy over cells that contain an object."""
        mask = targets[..., 4] > 0
        if not mask.any():
            return pred_cls.new_zeros(())

        return nn.functional.cross_entropy(
            pred_cls[mask].reshape(-1, self.num_classes),
            targets[mask][..., 5].long().reshape(-1),
            reduction='sum',
        )

YOLOv3:多尺度检测

YOLOv3引入FPN进行多尺度预测:

class _DarknetResidual(nn.Module):
    """Darknet residual unit: 1x1 bottleneck, 3x3 conv, added to the input."""

    def __init__(self, channels):
        super().__init__()
        hidden = channels // 2
        self.block = nn.Sequential(
            nn.Conv2d(channels, hidden, 1, 1, 0, bias=False),
            nn.BatchNorm2d(hidden),
            nn.LeakyReLU(0.1),
            nn.Conv2d(hidden, channels, 3, 1, 1, bias=False),
            nn.BatchNorm2d(channels),
            nn.LeakyReLU(0.1),
        )

    def forward(self, x):
        # Identity shortcut; the block keeps channel count and resolution.
        return x + self.block(x)


class YOLOv3Backbone(nn.Module):
    """YOLOv3 backbone (Darknet-53 convolutional layers).

    A stride-1 stem is followed by five stages. Each stage halves the
    resolution with a stride-2 3x3 conv and then applies 1, 2, 8, 8 and 4
    residual units respectively, for 52 convolutions in total.
    """

    def __init__(self):
        super().__init__()

        self.stem = self._make_conv(3, 32, 3, 1)

        # Darknet-53 stages: (in channels, out channels, residual units).
        self.layer1 = self._make_residual(32, 64, 1)
        self.layer2 = self._make_residual(64, 128, 2)
        self.layer3 = self._make_residual(128, 256, 8)
        self.layer4 = self._make_residual(256, 512, 8)
        self.layer5 = self._make_residual(512, 1024, 4)

    def _make_conv(self, in_ch, out_ch, kernel, stride):
        """Return a Conv-BatchNorm-LeakyReLU(0.1) unit with 'same' padding."""
        return nn.Sequential(
            nn.Conv2d(in_ch, out_ch, kernel, stride, kernel // 2, bias=False),
            nn.BatchNorm2d(out_ch),
            nn.LeakyReLU(0.1),
        )

    def _make_residual(self, in_ch, out_ch, num_blocks):
        """Return one stage: stride-2 downsample conv + ``num_blocks`` residual units."""
        layers = [self._make_conv(in_ch, out_ch, 3, 2)]
        layers.extend(_DarknetResidual(out_ch) for _ in range(num_blocks))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return the feature maps of the last three stages.

        Args:
            x: Image batch with 3 channels.

        Returns:
            Tuple ``(c1, c2, c3)`` with 256, 512 and 1024 channels at 1/8,
            1/16 and 1/32 of the input resolution.
        """
        c1 = self.layer3(self.layer2(self.layer1(self.stem(x))))
        c2 = self.layer4(c1)
        c3 = self.layer5(c2)

        return c1, c2, c3

class YOLOv3Neck(nn.Module):
    """Top-down FPN neck for YOLOv3.

    Consumes backbone maps with 256 / 512 / 1024 channels (fine to coarse)
    and produces fused maps with 128 / 256 / 512 channels, which are the
    input widths the detection layers of ``YOLOv3Head`` are built with.

    Channel bookkeeping for the two concatenations:
        upsampled 256 + c2 512 = 768  -> ``conv1``
        upsampled 128 + c1 256 = 384  -> ``conv2``
    """

    def __init__(self):
        super().__init__()

        # Compress the coarsest map; this is also the large-object output.
        self.reduce = nn.Conv2d(1024, 512, 1)

        # Upsampling branch, coarse -> middle scale.
        self.up1 = nn.Sequential(
            nn.Conv2d(512, 256, 1),
            nn.Upsample(scale_factor=2, mode='nearest'),
        )

        self.conv1 = nn.Sequential(
            nn.Conv2d(768, 256, 1),
            nn.Conv2d(256, 512, 3, padding=1),
            nn.Conv2d(512, 256, 1),
        )

        # Upsampling branch, middle -> fine scale.
        self.up2 = nn.Sequential(
            nn.Conv2d(256, 128, 1),
            nn.Upsample(scale_factor=2, mode='nearest'),
        )

        self.conv2 = nn.Sequential(
            nn.Conv2d(384, 128, 1),
            nn.Conv2d(128, 256, 3, padding=1),
            nn.Conv2d(256, 128, 1),
        )

    def forward(self, features):
        """Fuse three backbone scales top-down.

        Args:
            features: Sequence ``(c1, c2, c3)`` with 256, 512 and 1024
                channels, each map half the resolution of the previous one.

        Returns:
            List ``[p_small, p_mid, p_large]`` with 128, 256 and 512 channels
            at the resolutions of ``c1``, ``c2`` and ``c3`` respectively.
        """
        c1, c2, c3 = features

        p_large = self.reduce(c3)
        p_mid = self.conv1(torch.cat([self.up1(p_large), c2], dim=1))
        p_small = self.conv2(torch.cat([self.up2(p_mid), c1], dim=1))

        return [p_small, p_mid, p_large]

class YOLOv3Head(nn.Module):
    """YOLOv3 detection head with one prediction layer per scale.

    Args:
        num_classes: Number of object classes.
        anchors_per_scale: Number of anchors predicted at every location.

    Each scale outputs ``anchors_per_scale * (4 + 1 + num_classes)`` channels
    (255 with the defaults: 3 * (4 + 1 + 80)).
    """

    def __init__(self, num_classes=80, anchors_per_scale=3):
        super().__init__()
        self.num_classes = num_classes
        self.num_anchors = anchors_per_scale

        # One detection layer per scale; input widths 512 / 256 / 128.
        self.detect1 = self._make_detect_layer(512, num_classes)
        self.detect2 = self._make_detect_layer(256, num_classes)
        self.detect3 = self._make_detect_layer(128, num_classes)

    def _make_detect_layer(self, in_channels, num_classes):
        """Return a 3x3 conv followed by the 1x1 prediction conv."""
        # Derived from the constructor arguments instead of a fixed 255 so
        # that non-default class / anchor counts yield the right width.
        out_channels = self.num_anchors * (5 + num_classes)
        return nn.Sequential(
            nn.Conv2d(in_channels, 1024, 3, padding=1),
            nn.Conv2d(1024, out_channels, 1),
        )

    def forward(self, features):
        """Run the per-scale detection layers.

        Args:
            features: Sequence ``[f1, f2, f3]`` with 128, 256 and 512
                channels; ``features[2]`` feeds the large-object layer.

        Returns:
            List ``[out1, out2, out3]`` of raw prediction maps for large,
            medium and small objects.
        """
        out1 = self.detect1(features[2])  # large objects
        out2 = self.detect2(features[1])  # medium objects
        out3 = self.detect3(features[0])  # small objects

        return [out1, out2, out3]

YOLOv4:Bag of Freebies

YOLOv4引入了大量训练技巧:

class YOLOv4Loss:
    """Combination of the loss terms used for YOLOv4 training.

    Holds a CIoU box loss, a focal classification loss and a summed
    BCE-with-logits objectness loss, and adds them without extra weights.

    NOTE(review): ``FocalLoss`` is not defined in the visible part of this
    file — confirm it is provided elsewhere before instantiating.
    """

    def __init__(self):
        self.ciou_loss = CIoULoss()
        self.focal_loss = FocalLoss()
        self.bce_loss = nn.BCEWithLogitsLoss(reduction='sum')

    def compute(self, pred_boxes, pred_conf, pred_cls,
                target_boxes, target_conf, target_cls):
        """Return ``box_loss + cls_loss + conf_loss``.

        Args:
            pred_boxes: Predicted boxes, passed to the CIoU loss.
            pred_conf: Predicted objectness logits.
            pred_cls: Predicted class scores.
            target_boxes: Ground-truth boxes.
            target_conf: Ground-truth objectness.
            target_cls: Ground-truth classes.
        """
        # CIoU bounding-box loss.
        box_loss = self.ciou_loss(pred_boxes, target_boxes)

        # Focal loss to cope with class imbalance.
        # NOTE(review): pred_conf is passed as the second argument here; a
        # focal loss usually takes (prediction, target) only — verify against
        # the FocalLoss signature.
        cls_loss = self.focal_loss(pred_cls, pred_conf, target_cls)

        # Objectness loss.
        conf_loss = self.bce_loss(pred_conf, target_conf)

        return box_loss + cls_loss + conf_loss

class CIoULoss(nn.Module):
    """Complete IoU (CIoU) loss.

    ``CIoU = IoU - rho^2 / c^2 - alpha * v`` where ``rho`` is the distance
    between box centres, ``c`` the diagonal of the smallest enclosing box and
    ``v`` measures the aspect-ratio mismatch. The loss is ``mean(1 - CIoU)``.

    Boxes are read as ``(cx, cy, w, h)`` in the last dimension.
    NOTE(review): the box format is an assumption — confirm it matches the
    tensors fed to this loss.
    """

    _EPS = 1e-7  # guards every division against zero-sized boxes

    def __init__(self):
        super().__init__()

    def forward(self, pred, target):
        """Return the mean CIoU loss.

        Args:
            pred: Predicted boxes, shape (..., 4).
            target: Ground-truth boxes, same shape as ``pred``.

        Returns:
            Scalar tensor, ``mean(1 - CIoU)`` over all boxes.
        """
        import math  # local import: the file's import block has no math

        eps = self._EPS

        # IoU term.
        iou = self._calculate_iou(pred, target)

        # Squared distance between the two centres.
        pred_center = self._get_center(pred)
        target_center = self._get_center(target)
        center_distance = torch.sum((pred_center - target_center) ** 2, dim=-1)

        # Squared diagonal of the smallest box enclosing both.
        enclose = self._get_enclose(pred, target)
        enclose_diagonal = torch.sum(enclose ** 2, dim=-1)

        # Aspect-ratio consistency term; without it the result is only DIoU.
        v = (4 / math.pi ** 2) * (
            torch.atan(target[..., 2] / (target[..., 3] + eps))
            - torch.atan(pred[..., 2] / (pred[..., 3] + eps))
        ) ** 2
        # alpha is a trade-off weight and is treated as a constant for
        # back-propagation.
        with torch.no_grad():
            alpha = v / (1 - iou + v + eps)

        ciou = iou - center_distance / (enclose_diagonal + eps) - alpha * v

        return (1 - ciou).mean()

    @staticmethod
    def _corners(boxes):
        """Return ``(top_left, bottom_right)`` corner tensors of shape (..., 2)."""
        center = boxes[..., :2]
        half_size = boxes[..., 2:4] / 2
        return center - half_size, center + half_size

    def _calculate_iou(self, pred, target):
        """Element-wise IoU of two equally shaped box tensors."""
        pred_min, pred_max = self._corners(pred)
        target_min, target_max = self._corners(target)

        # Negative extents mean the boxes do not overlap along that axis.
        overlap = (torch.min(pred_max, target_max)
                   - torch.max(pred_min, target_min)).clamp(min=0)
        intersection = overlap[..., 0] * overlap[..., 1]

        pred_area = pred[..., 2] * pred[..., 3]
        target_area = target[..., 2] * target[..., 3]
        union = pred_area + target_area - intersection

        return intersection / (union + self._EPS)

    def _get_center(self, boxes):
        """Return the ``(cx, cy)`` part of the boxes."""
        return boxes[..., :2]

    def _get_enclose(self, pred, target):
        """Return ``(width, height)`` of the smallest box enclosing both inputs."""
        pred_min, pred_max = self._corners(pred)
        target_min, target_max = self._corners(target)
        return torch.max(pred_max, target_max) - torch.min(pred_min, target_min)

YOLOX:Anchor-Free时代

YOLOX去掉锚框,简化检测流程:

class YOLOXHead(nn.Module):
    """Anchor-free YOLOX-style detection head.

    Three independent branches (classification, box regression, objectness)
    are applied to the same input feature map. Each branch is a 3x3 conv,
    ReLU and a 1x1 prediction conv; no weights are shared between branches.

    Args:
        num_classes: Number of class channels produced by the class branch.
        in_channels: Channel count of the incoming feature map.
    """

    def __init__(self, num_classes=80, in_channels=256):
        super().__init__()
        self.num_classes = num_classes

        self.cls_conv = self._make_branch(in_channels, num_classes)
        self.reg_conv = self._make_branch(in_channels, 4)  # box predicted directly
        self.obj_conv = self._make_branch(in_channels, 1)  # objectness

    @staticmethod
    def _make_branch(in_channels, out_channels):
        """Return a 3x3 conv -> ReLU -> 1x1 conv prediction branch."""
        return nn.Sequential(
            nn.Conv2d(in_channels, in_channels, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels, out_channels, 1),
        )

    def forward(self, x):
        """Return ``(cls_output, bbox_output, obj_output)`` for feature map ``x``.

        The outputs have ``num_classes``, 4 and 1 channels respectively and
        the same spatial size as ``x``.
        """
        cls_output = self.cls_conv(x)
        bbox_output = self.reg_conv(x)
        obj_output = self.obj_conv(x)

        return cls_output, bbox_output, obj_output

class SimOTA:
    """SimOTA label assignment.

    NOTE(review): the helpers ``_calc_cls_cost``, ``_calc_reg_cost``,
    ``_calc_iou_cost`` and ``_dynamic_k_matching`` are called below but are
    not defined in the visible part of this file — confirm they exist.
    """

    def __call__(self, pred_cls, pred_box, gt_box, gt_label, fg_mask):
        """Build the matching matrix between predictions and ground truth.

        Args:
            pred_cls: Predicted class scores.
            pred_box: Predicted boxes.
            gt_box: Ground-truth boxes.
            gt_label: Ground-truth labels.
            fg_mask: Foreground mask; its ``sum()`` caps the top-k size.

        Returns:
            Whatever ``_dynamic_k_matching`` returns for the summed cost.
        """
        # Matching cost = classification + regression + IoU cost.
        cost = (self._calc_cls_cost(pred_cls, gt_label, fg_mask) +
                self._calc_reg_cost(pred_box, gt_box, fg_mask) +
                self._calc_iou_cost(pred_box, gt_box, fg_mask))

        # Top-k selection, at most 10 candidates. int() keeps k a plain
        # Python int; min(10, tensor) would otherwise yield an int or a
        # 0-dim tensor depending on which operand is smaller.
        k = min(10, int(fg_mask.sum()))
        matching_matrix = self._dynamic_k_matching(cost, gt_box, k)

        return matching_matrix

模型推理

def yolo_inference(model, image, conf_thresh=0.5, iou_thresh=0.5,
                   input_size=416):
    """Run the full YOLO inference pipeline on one image.

    Args:
        model: Detection model; it is switched to eval mode and left there.
        image: Raw input image, forwarded to ``preprocess_image``.
        conf_thresh: Minimum confidence for a box to be kept.
        iou_thresh: IoU threshold used by non-maximum suppression.
        input_size: Target size handed to ``preprocess_image``; defaults to
            the previously hard-coded 416.

    Returns:
        Tuple ``(boxes, scores, class_ids)`` as produced by ``postprocess``.

    NOTE(review): ``preprocess_image`` is not defined in the visible part of
    this file — confirm it is in scope.
    """
    model.eval()

    # Pre-processing.
    input_tensor = preprocess_image(image, target_size=input_size)

    # No autograd bookkeeping is needed for inference.
    with torch.no_grad():
        predictions = model(input_tensor)

    # Post-processing: confidence filtering + NMS.
    boxes, scores, class_ids = postprocess(
        predictions,
        conf_threshold=conf_thresh,
        iou_threshold=iou_thresh,
    )

    return boxes, scores, class_ids

def postprocess(predictions, conf_threshold=0.5, iou_threshold=0.5):
    """Filter raw predictions by confidence and apply NMS.

    Args:
        predictions: Tensor whose last dimension is laid out as
            ``[box(4), confidence, class scores...]``.
        conf_threshold: Boxes with confidence not above this are dropped.
        iou_threshold: IoU threshold passed to NMS.

    Returns:
        Tuple ``(boxes, scores, class_ids)`` of the surviving detections.
        All three are empty when nothing passes the confidence threshold.

    The boolean mask flattens every leading dimension, so detections from
    all images in a batch end up in one NMS call, and NMS is run without
    regard to class.

    NOTE(review): ``torchvision`` and ``xywh2xyxy`` are not imported or
    defined in the visible part of this file — confirm they are in scope.
    """
    # Drop low-confidence boxes. Boolean-mask indexing copies the data, so
    # select once and slice the result.
    mask = predictions[..., 4] > conf_threshold
    candidates = predictions[mask]

    boxes = candidates[..., :4]
    scores = candidates[..., 4]

    if candidates.shape[0] == 0:
        # Nothing to suppress; skip conversion and NMS entirely.
        return boxes, scores, scores.new_zeros((0,), dtype=torch.long)

    class_ids = candidates[..., 5:].argmax(dim=-1)

    # NMS expects corner-format boxes.
    boxes = xywh2xyxy(boxes)
    keep = torchvision.ops.nms(boxes, scores, iou_threshold)

    return boxes[keep], scores[keep], class_ids[keep]

YOLO各版本对比

| 版本 | mAP@50 | FPS | 参数量 | 主要创新 |
| --- | --- | --- | --- | --- |
| YOLOv3 | 57.9 | 35 | 62M | 多尺度、FPN |
| YOLOv4 | 62.8 | 65 | 27M | CSPDarknet、Mish |
| YOLOv5 | 66.9 | 140 | 7.2M | AutoAnchor、Data Aug |
| YOLOX | 67.2 | 68 | 9.0M | Anchor-Free、SimOTA |

总结

YOLO系列从最初的简单设计发展到如今的高性能实时检测器,经历了anchor-based到anchor-free的转变,引入了大量训练技巧和优化策略,在工业界得到广泛应用。

参考资源

© 2019-2026 ovo$^{mc^2}$ All Rights Reserved. | 站点总访问 28969 次 | 访客 19045
Theme by hiero