YOLO系列：实时目标检测的演进

Posted on 三月 25, 2021

🎙️ 语音朗读当前: 晓晓 (温柔女声)

前言

YOLO（You Only Look Once）是实时目标检测领域的里程碑式算法。从2016年首次提出到如今的YOLOv5/YOLOX，该系列算法经历了多次重大革新。本文将系统梳理YOLO的发展历程和核心技术。

YOLO核心原理

YOLO将目标检测任务重新定义为单一的回归问题：

输入图像 → CNN → 输出(边界框+类别+置信度)

import torch
import torch.nn as nn

class YOLOLoss(nn.Module):
    """YOLO loss: weighted sum of box, confidence and class terms.

    ``forward`` returns ``lambda_coord * box_loss + conf_loss + cls_loss``.
    Every term is sum-reduced.  The no-object part of the confidence term is
    scaled by ``lambda_noobj`` inside ``_compute_conf_loss``.

    Args:
        num_classes: Number of class scores predicted per box.
        grid_size: Side length of the (square) prediction grid.
        num_boxes: Number of boxes predicted per grid cell.
    """

    def __init__(self, num_classes=80, grid_size=13, num_boxes=5):
        super().__init__()
        self.num_classes = num_classes
        self.grid_size = grid_size
        self.num_boxes = num_boxes

        self.lambda_coord = 5.0
        self.lambda_noobj = 0.5

        # Parameter-free criteria: build them once instead of on every call.
        self._mse = nn.MSELoss(reduction='sum')
        self._ce = nn.CrossEntropyLoss(reduction='sum')

    @staticmethod
    def _zero(reference):
        """Return a scalar zero on the same device and dtype as ``reference``."""
        return reference.new_zeros(())

    def forward(self, predictions, targets):
        """Compute the total loss.

        Args:
            predictions: Model output of shape
                (batch, grid*grid*boxes, 5 + num_classes).  The last axis is
                read as 4 box values, 1 confidence, then the class scores.
            targets: Ground truth, read as ``targets[..., :4]`` (box),
                ``targets[..., 4]`` (confidence; ``> 0`` marks an object,
                ``== 0`` marks background) and ``targets[..., 5]`` (class
                index).  Expected shape is (batch, grid, grid, boxes, >=6) so
                that masks built from it can index the reshaped predictions.

        Returns:
            Scalar tensor with the combined loss.
        """
        batch_size = predictions.size(0)

        # Unflatten the cell/box axis so target-derived masks line up.
        predictions = predictions.reshape(
            batch_size, self.grid_size, self.grid_size,
            self.num_boxes, 5 + self.num_classes,
        )

        pred_box = predictions[..., :4]
        pred_conf = predictions[..., 4]
        pred_cls = predictions[..., 5:]

        box_loss = self._compute_box_loss(pred_box, targets)
        conf_loss = self._compute_conf_loss(pred_conf, targets)
        cls_loss = self._compute_cls_loss(pred_cls, targets)

        return self.lambda_coord * box_loss + conf_loss + cls_loss

    def _compute_box_loss(self, pred_box, targets):
        """Sum-of-squares box error over the positions that contain an object."""
        mask = targets[..., 4] > 0
        if not mask.any():
            return self._zero(pred_box)

        return self._mse(pred_box[mask], targets[mask][..., :4])

    def _compute_conf_loss(self, pred_conf, targets):
        """Confidence error; background positions are scaled by ``lambda_noobj``."""
        target_conf = targets[..., 4]
        obj_mask = target_conf > 0
        noobj_mask = target_conf == 0

        # Both fallbacks follow the prediction's device/dtype, matching the
        # other two helpers (the previous bare torch.tensor(0.0) did not).
        if obj_mask.any():
            obj_loss = self._mse(pred_conf[obj_mask], target_conf[obj_mask])
        else:
            obj_loss = self._zero(pred_conf)

        if noobj_mask.any():
            noobj_loss = self.lambda_noobj * self._mse(
                pred_conf[noobj_mask], target_conf[noobj_mask]
            )
        else:
            noobj_loss = self._zero(pred_conf)

        return obj_loss + noobj_loss

    def _compute_cls_loss(self, pred_cls, targets):
        """Cross-entropy between class scores and the target class index."""
        mask = targets[..., 4] > 0
        if not mask.any():
            return self._zero(pred_cls)

        class_index = targets[mask][..., 5].long().reshape(-1)
        return self._ce(
            pred_cls[mask].reshape(-1, self.num_classes),
            class_index,
        )

YOLOv3：多尺度检测

YOLOv3借鉴FPN（特征金字塔网络）的思想，在三个不同尺度的特征图上分别进行预测：

def _darknet_conv(in_ch, out_ch, kernel, stride):
    """Build Conv2d (no bias) -> BatchNorm2d -> LeakyReLU(0.1), padding kernel//2."""
    return nn.Sequential(
        nn.Conv2d(in_ch, out_ch, kernel, stride, kernel // 2, bias=False),
        nn.BatchNorm2d(out_ch),
        nn.LeakyReLU(0.1),
    )


class _DarknetResidual(nn.Module):
    """Darknet-53 residual unit: 1x1 bottleneck, 3x3 conv, identity shortcut."""

    def __init__(self, channels):
        super().__init__()
        self.reduce = _darknet_conv(channels, channels // 2, 1, 1)
        self.expand = _darknet_conv(channels // 2, channels, 3, 1)

    def forward(self, x):
        # The shortcut is what makes this a residual unit; shape is preserved.
        return x + self.expand(self.reduce(x))


class YOLOv3Backbone(nn.Module):
    """YOLOv3 backbone (Darknet-53).

    A stride-1 stem is followed by five stages.  Each stage halves the spatial
    size with a stride-2 3x3 convolution and then applies 1, 2, 8, 8 and 4
    residual units respectively (52 convolutions in total).

    ``forward`` returns the outputs of stages 3, 4 and 5, which have
    256, 512 and 1024 channels at strides 8, 16 and 32.
    """

    def __init__(self):
        super().__init__()

        self.stem = self._make_conv(3, 32, 3, 1)

        # Darknet-53 body
        self.layer1 = self._make_residual(32, 64, 1)
        self.layer2 = self._make_residual(64, 128, 2)
        self.layer3 = self._make_residual(128, 256, 8)
        self.layer4 = self._make_residual(256, 512, 8)
        self.layer5 = self._make_residual(512, 1024, 4)

    def _make_conv(self, in_ch, out_ch, kernel, stride):
        """Return a conv + batch-norm + LeakyReLU(0.1) unit."""
        return _darknet_conv(in_ch, out_ch, kernel, stride)

    def _make_residual(self, in_ch, out_ch, num_blocks):
        """Return one stage: stride-2 downsampling conv, then residual units.

        Args:
            in_ch: Channels entering the stage.
            out_ch: Channels produced by the stage.
            num_blocks: Number of residual units after the downsampling conv.
        """
        layers = [self._make_conv(in_ch, out_ch, 3, 2)]
        layers.extend(_DarknetResidual(out_ch) for _ in range(num_blocks))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return the (stride-8, stride-16, stride-32) feature maps for ``x``."""
        c1 = self.layer3(self.layer2(self.layer1(self.stem(x))))
        c2 = self.layer4(c1)
        c3 = self.layer5(c2)

        return c1, c2, c3

class YOLOv3Neck(nn.Module):
    """YOLOv3 top-down (FPN-style) neck.

    Takes backbone features with 256, 512 and 1024 channels (fine to coarse)
    and produces three maps with 128, 256 and 512 channels at the same
    resolutions.

    Channel bookkeeping for the two fusion points:
        * ``up1`` emits 256 channels; concatenated with the 512-channel
          backbone map this gives the 768 channels ``conv1`` consumes.
        * ``up2`` emits 128 channels; concatenated with the 256-channel
          backbone map this gives the 384 channels ``conv2`` consumes.

    NOTE(review): the convolutions are kept as plain ``nn.Conv2d`` layers
    without normalisation or activation, as in the original snippet.
    """

    def __init__(self):
        super().__init__()

        # Processing of the coarsest map before it is used and upsampled.
        self.conv0 = nn.Sequential(
            nn.Conv2d(1024, 512, 1),
            nn.Conv2d(512, 1024, 3, padding=1),
            nn.Conv2d(1024, 512, 1)
        )

        # Upsampling branch: coarsest -> middle scale
        self.up1 = nn.Sequential(
            nn.Conv2d(512, 256, 1),
            nn.Upsample(scale_factor=2, mode='nearest')
        )

        self.conv1 = nn.Sequential(
            nn.Conv2d(768, 256, 1),
            nn.Conv2d(256, 512, 3, padding=1),
            nn.Conv2d(512, 256, 1)
        )

        # Upsampling branch: middle -> finest scale
        self.up2 = nn.Sequential(
            nn.Conv2d(256, 128, 1),
            nn.Upsample(scale_factor=2, mode='nearest')
        )

        self.conv2 = nn.Sequential(
            nn.Conv2d(384, 128, 1),
            nn.Conv2d(128, 256, 3, padding=1),
            nn.Conv2d(256, 128, 1)
        )

    def forward(self, features):
        """Fuse backbone features top-down.

        Args:
            features: Sequence ``(c1, c2, c3)`` with 256, 512 and 1024
                channels, where each map has half the spatial size of the
                previous one.

        Returns:
            List ``[p1, p2, p3]`` with 128, 256 and 512 channels, ordered
            from the finest to the coarsest resolution.
        """
        c1, c2, c3 = features

        p3 = self.conv0(c3)
        p2 = self.conv1(torch.cat([self.up1(p3), c2], dim=1))
        p1 = self.conv2(torch.cat([self.up2(p2), c1], dim=1))

        return [p1, p2, p3]

class YOLOv3Head(nn.Module):
    """YOLOv3 detection head: one small conv stack per scale.

    Each scale outputs ``anchors_per_scale * (5 + num_classes)`` channels,
    i.e. 4 box values, 1 confidence and the class scores for every anchor.
    With the defaults this is 3 * (4 + 1 + 80) = 255.

    Args:
        num_classes: Number of object classes.
        anchors_per_scale: Number of anchors predicted at every location.
    """

    def __init__(self, num_classes=80, anchors_per_scale=3):
        super().__init__()
        self.num_classes = num_classes
        self.num_anchors = anchors_per_scale

        # One detector per scale; input widths are 512, 256 and 128 channels.
        self.detect1 = self._make_detect_layer(512, num_classes)
        self.detect2 = self._make_detect_layer(256, num_classes)
        self.detect3 = self._make_detect_layer(128, num_classes)

    def _make_detect_layer(self, in_channels, num_classes):
        """Return a 3x3 conv followed by the 1x1 prediction conv."""
        # Derived from the configuration instead of the former literal 255,
        # which silently ignored num_classes and anchors_per_scale.
        out_channels = self.num_anchors * (5 + num_classes)
        return nn.Sequential(
            nn.Conv2d(in_channels, 1024, 3, padding=1),
            nn.Conv2d(1024, out_channels, 1)
        )

    def forward(self, features):
        """Run the three detectors.

        Args:
            features: Indexable ``[f1, f2, f3]`` with 128, 256 and 512
                channels respectively.

        Returns:
            ``[out1, out2, out3]`` computed from ``features[2]`` (large
            objects), ``features[1]`` (medium) and ``features[0]`` (small).
        """
        out1 = self.detect1(features[2])  # large objects
        out2 = self.detect2(features[1])  # medium objects
        out3 = self.detect3(features[0])  # small objects

        return [out1, out2, out3]

YOLOv4：Bag of Freebies

YOLOv4引入了大量训练技巧：

class YOLOv4Loss:
    """Sum of the three YOLOv4 loss terms: box, class and confidence.

    Holds a ``CIoULoss`` for boxes, a ``FocalLoss`` for classes and a
    sum-reduced ``nn.BCEWithLogitsLoss`` for confidence.
    """

    def __init__(self):
        self.ciou_loss = CIoULoss()
        self.focal_loss = FocalLoss()
        self.bce_loss = nn.BCEWithLogitsLoss(reduction='sum')

    def compute(self, pred_boxes, pred_conf, pred_cls,
                target_boxes, target_conf, target_cls):
        """Return ``box + class + confidence`` loss for one batch.

        Args:
            pred_boxes, target_boxes: Inputs to the CIoU box term.
            pred_conf, target_conf: Logits and targets of the BCE term.
            pred_cls, target_cls: Class predictions and class targets.
        """
        # Box regression term (CIoU).
        box_term = self.ciou_loss(pred_boxes, target_boxes)

        # Class term (focal loss, meant to counter class imbalance).
        # NOTE(review): pred_conf is passed as the second argument; FocalLoss
        # is not visible here, so confirm that its signature expects it.
        cls_term = self.focal_loss(pred_cls, pred_conf, target_cls)

        # Confidence term.
        obj_term = self.bce_loss(pred_conf, target_conf)

        total = box_term + cls_term
        total = total + obj_term
        return total

class CIoULoss(nn.Module):
    """Complete IoU (CIoU) bounding-box loss.

    ``CIoU = IoU - rho^2 / c^2 - alpha * v`` where ``rho`` is the distance
    between box centres, ``c`` the diagonal of the smallest enclosing box and
    ``v`` measures the aspect-ratio mismatch.  The loss is the mean of
    ``1 - CIoU`` over all boxes.

    Args:
        box_format: ``'xywh'`` (centre x, centre y, width, height) or
            ``'xyxy'`` (x1, y1, x2, y2).  The default assumes centre format
            because ``postprocess`` in this file converts predictions with
            ``xywh2xyxy`` - confirm against the caller.
        eps: Small constant guarding the divisions.

    Raises:
        ValueError: If ``box_format`` is not one of the two supported values.
    """

    def __init__(self, box_format='xywh', eps=1e-7):
        super().__init__()
        if box_format not in ('xywh', 'xyxy'):
            raise ValueError(
                "box_format must be 'xywh' or 'xyxy', got %r" % (box_format,)
            )
        self.box_format = box_format
        self.eps = eps

    def forward(self, pred, target):
        """Return the mean CIoU loss for boxes of shape (..., 4).

        An empty input yields 0 instead of the NaN a mean over nothing gives.
        """
        import math  # function-scope import keeps this block self-contained

        iou = self._calculate_iou(pred, target)

        # Squared distance between the box centres.
        pred_center = self._get_center(pred)
        target_center = self._get_center(target)
        center_distance = torch.sum((pred_center - target_center) ** 2, dim=-1)

        # Squared diagonal of the smallest box enclosing both.
        enclose = self._get_enclose(pred, target)
        enclose_diagonal = torch.sum(enclose ** 2, dim=-1)

        # Aspect-ratio consistency term; without it the result is only DIoU.
        pred_w, pred_h = self._get_size(pred)
        target_w, target_h = self._get_size(target)
        v = (4 / math.pi ** 2) * (
            torch.atan(target_w / (target_h + self.eps))
            - torch.atan(pred_w / (pred_h + self.eps))
        ) ** 2
        # alpha is a trade-off weight and is treated as a constant.
        with torch.no_grad():
            alpha = v / (1 - iou + v + self.eps)

        ciou = (iou
                - center_distance / (enclose_diagonal + self.eps)
                - alpha * v)

        loss = 1 - ciou
        if loss.numel() == 0:
            return loss.sum()
        return loss.mean()

    def _corners(self, boxes):
        """Return ``(x1, y1, x2, y2)`` tensors for ``boxes``."""
        if self.box_format == 'xyxy':
            return boxes.unbind(-1)
        cx, cy, w, h = boxes.unbind(-1)
        return cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2

    def _get_size(self, boxes):
        """Return ``(width, height)`` tensors for ``boxes``."""
        x1, y1, x2, y2 = self._corners(boxes)
        return x2 - x1, y2 - y1

    def _get_center(self, boxes):
        """Return box centres as a tensor of shape (..., 2)."""
        x1, y1, x2, y2 = self._corners(boxes)
        return torch.stack(((x1 + x2) / 2, (y1 + y2) / 2), dim=-1)

    def _get_enclose(self, pred, target):
        """Return (width, height) of the smallest enclosing box, shape (..., 2)."""
        px1, py1, px2, py2 = self._corners(pred)
        tx1, ty1, tx2, ty2 = self._corners(target)
        width = torch.max(px2, tx2) - torch.min(px1, tx1)
        height = torch.max(py2, ty2) - torch.min(py1, ty1)
        return torch.stack((width, height), dim=-1)

    def _calculate_iou(self, pred, target):
        """Return the element-wise IoU of ``pred`` and ``target``."""
        px1, py1, px2, py2 = self._corners(pred)
        tx1, ty1, tx2, ty2 = self._corners(target)

        # Clamp so that disjoint boxes give zero overlap, not a negative one.
        inter_w = (torch.min(px2, tx2) - torch.max(px1, tx1)).clamp(min=0)
        inter_h = (torch.min(py2, ty2) - torch.max(py1, ty1)).clamp(min=0)
        inter = inter_w * inter_h

        union = (px2 - px1) * (py2 - py1) + (tx2 - tx1) * (ty2 - ty1) - inter
        return inter / (union + self.eps)

YOLOX：Anchor-Free时代

YOLOX去掉锚框，简化检测流程：

class YOLOXHead(nn.Module):
    """Anchor-free YOLOX detection head.

    Three branches with the same layout (3x3 conv -> ReLU -> 1x1 conv) each
    read the input feature map.  They predict ``num_classes`` class channels,
    4 box channels and 1 confidence channel respectively.

    Args:
        num_classes: Number of class channels.
        in_channels: Channel count of the incoming feature map.
    """

    def __init__(self, num_classes=80, in_channels=256):
        super().__init__()
        self.num_classes = num_classes

        # Construction order (cls, reg, obj) is kept so that parameter
        # initialisation consumes random numbers in the same sequence.
        self.cls_conv = self._branch(in_channels, num_classes)
        self.reg_conv = self._branch(in_channels, 4)  # boxes predicted directly
        self.obj_conv = self._branch(in_channels, 1)  # confidence

    @staticmethod
    def _branch(width, out_channels):
        """Return one prediction branch with ``out_channels`` outputs."""
        return nn.Sequential(
            nn.Conv2d(width, width, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(width, out_channels, 1),
        )

    def forward(self, x):
        """Return ``(cls_output, bbox_output, obj_output)`` for feature map ``x``."""
        class_map = self.cls_conv(x)   # class scores (needed by SimOTA)
        box_map = self.reg_conv(x)     # bounding boxes
        conf_map = self.obj_conv(x)    # confidence
        return class_map, box_map, conf_map

class SimOTA:
    """SimOTA label assignment.

    The cost helpers (``_calc_cls_cost``, ``_calc_reg_cost``,
    ``_calc_iou_cost``) and ``_dynamic_k_matching`` are not defined in this
    snippet; they are expected to be provided on the instance or a subclass.
    """

    def __call__(self, pred_cls, pred_box, gt_box, gt_label, fg_mask):
        """Return the matching matrix between predictions and ground truth.

        Args:
            pred_cls: Class predictions.
            pred_box: Box predictions.
            gt_box: Ground-truth boxes.
            gt_label: Ground-truth labels.
            fg_mask: Foreground-candidate mask; its ``sum()`` gives the
                number of candidates.

        Returns:
            Whatever ``_dynamic_k_matching(cost, gt_box, k)`` returns.
        """
        # Total matching cost = class + regression + IoU cost.
        cost = (self._calc_cls_cost(pred_cls, gt_label, fg_mask) +
                self._calc_reg_cost(pred_box, gt_box, fg_mask) +
                self._calc_iou_cost(pred_box, gt_box, fg_mask))

        # Top-k size: at most 10, never more than the candidate count.
        # int() makes k a plain Python int in every case; min(10, tensor)
        # returned a 0-dim tensor whenever fewer than 10 candidates existed.
        num_candidates = int(fg_mask.sum())
        k = min(10, num_candidates)

        return self._dynamic_k_matching(cost, gt_box, k)

模型推理

def yolo_inference(model, image, conf_thresh=0.5, iou_thresh=0.5,
                   input_size=416):
    """Run preprocessing, a gradient-free forward pass and post-processing.

    Args:
        model: Callable detector with an ``eval()`` method.  It is switched
            to evaluation mode and left in that mode.
        image: Input passed to ``preprocess_image``.
        conf_thresh: Confidence threshold forwarded to ``postprocess``.
        iou_thresh: IoU threshold forwarded to ``postprocess``.
        input_size: ``target_size`` given to ``preprocess_image``.  Defaults
            to 416, the value that used to be hard-coded.

    Returns:
        Tuple ``(boxes, scores, class_ids)`` as produced by ``postprocess``.
    """
    model.eval()

    # Pre-processing
    input_tensor = preprocess_image(image, target_size=input_size)

    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        predictions = model(input_tensor)

    # Post-processing
    boxes, scores, class_ids = postprocess(
        predictions,
        conf_threshold=conf_thresh,
        iou_threshold=iou_thresh,
    )

    return boxes, scores, class_ids

def postprocess(predictions, conf_threshold=0.5, iou_threshold=0.5):
    """Filter predictions by confidence, then apply NMS.

    Args:
        predictions: Tensor whose last axis is read as 4 box values,
            1 confidence, then per-class scores.
        conf_threshold: Predictions with confidence not above this are dropped.
        iou_threshold: IoU threshold passed to ``torchvision.ops.nms``.

    Returns:
        Tuple ``(boxes, scores, class_ids)`` of the kept detections; boxes
        are the output of ``xywh2xyxy``.

    NOTE(review): all leading axes are flattened by the mask, so a batch of
    several images is suppressed as one pool, and NMS ignores class ids.
    Presumably intended for a single image - confirm with callers.
    """
    # Function-scope import: the import block visible in this file only
    # brings in torch, so the torchvision reference had nothing to bind to.
    import torchvision

    # Drop low-confidence predictions; gather the survivors once.
    keep_mask = predictions[..., 4] > conf_threshold
    candidates = predictions[keep_mask]

    boxes = candidates[..., :4]
    scores = candidates[..., 4]
    class_ids = candidates[..., 5:].argmax(dim=-1)

    # NMS
    boxes = xywh2xyxy(boxes)
    keep = torchvision.ops.nms(boxes, scores, iou_threshold)

    return boxes[keep], scores[keep], class_ids[keep]

YOLO各版本对比

版本	mAP@50	FPS	参数量	主要创新
YOLOv3	57.9	35	62M	多尺度、FPN
YOLOv4	62.8	65	27M	CSPDarknet、Mish
YOLOv5	66.9	140	7.2M	AutoAnchor、Data Aug
YOLOX	67.2	68	9.0M	Anchor-Free、SimOTA

总结

YOLO系列从最初的简单设计发展到如今的高性能实时检测器，经历了anchor-based到anchor-free的转变，引入了大量训练技巧和优化策略，在工业界得到广泛应用。

ovo$^{mc^2}$