图像分类:从VGG到ResNet的演进

🎙️ 语音朗读 当前: 晓晓 (温柔女声)

图像分类:从VGG到ResNet的演进

图像分类是计算机视觉的基础任务,从VGG到ResNet,网络架构经历了重要演进。

VGGNet

VGGNet的核心思想是用多个小卷积核(3×3)的堆叠代替大卷积核——例如两层3×3卷积的感受野与一层5×5卷积相同,但参数更少、非线性更多:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import torch
import torch.nn as nn

class VGGBlock(nn.Module):
    """One VGG stage: ``num_convs`` (3x3 conv + ReLU) pairs, then a max-pool.

    Args:
        in_channels: Channel count of the incoming feature map.
        out_channels: Channel count produced by every convolution here.
        num_convs: How many conv + ReLU pairs are stacked before pooling.
    """

    def __init__(self, in_channels, out_channels, num_convs):
        super().__init__()
        layers = []
        # Only the first convolution consumes ``in_channels``; every later
        # one maps ``out_channels`` to ``out_channels``.
        channels = in_channels
        for _ in range(num_convs):
            # 3x3 kernel with padding=1 (default stride 1) keeps H and W.
            layers.append(nn.Conv2d(channels, out_channels, 3, padding=1))
            layers.append(nn.ReLU(inplace=True))
            channels = out_channels
        # Kernel size 2, stride 2.
        layers.append(nn.MaxPool2d(2, 2))
        self.block = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the conv stack and the trailing max-pool."""
        return self.block(x)

class VGG16(nn.Module):
    """VGG-16 image classifier: five ``VGGBlock`` stages and a 3-layer MLP head.

    Args:
        num_classes: Number of logits produced by the final linear layer.
    """

    def __init__(self, num_classes=1000):
        super().__init__()
        # 2 + 2 + 3 + 3 + 3 = 13 convolutions; with the three linear layers
        # below that gives the 16 weight layers of VGG-16.
        self.features = nn.Sequential(
            VGGBlock(3, 64, 2),
            VGGBlock(64, 128, 2),
            VGGBlock(128, 256, 3),
            VGGBlock(256, 512, 3),
            VGGBlock(512, 512, 3),
        )
        # The classifier expects exactly 512 * 7 * 7 features. Pooling to a
        # fixed 7x7 grid keeps that true for input sizes other than 224x224;
        # for 224x224 the feature map is already 7x7 and this is an identity.
        # The layer has no parameters, so state_dict keys are unchanged.
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        """Return class logits for a batch of 3-channel images ``x``."""
        x = self.features(x)
        x = self.avgpool(x)
        # Flatten everything except the batch dimension.
        x = torch.flatten(x, 1)
        return self.classifier(x)

VGG的关键贡献

  • 证明小卷积核堆叠可以替代大卷积核
  • 网络深度是提升性能的关键因素
  • 统一的架构设计简单有效

ResNet:残差学习

ResNet通过残差连接解决了深层网络的退化问题:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
class BasicBlock(nn.Module):
    """Residual block made of two 3x3 conv + batch-norm layers.

    Args:
        in_channels: Channel count of the block input.
        out_channels: Channel count produced by both convolutions.
        stride: Stride of the first convolution (the second always uses 1).
        downsample: Optional module applied to the block input to build the
            shortcut branch. When ``None`` the input itself is the shortcut.
    """

    # Multiplier from ``out_channels`` to the block's real output channels.
    expansion = 1

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super().__init__()
        # bias=False: each convolution is followed directly by a BatchNorm2d.
        self.conv1 = nn.Conv2d(in_channels, out_channels, 3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, 3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        # Registered once instead of building a new nn.ReLU on every forward
        # call. It holds no parameters, so state_dict keys are unchanged.
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample

    def forward(self, x):
        """Return ``relu(F(x) + shortcut(x))`` for the input batch ``x``."""
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        # Compare against None explicitly: container modules define __len__,
        # so a truthiness test would silently skip a zero-length one.
        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity  # residual connection
        return self.relu(out)

class ResNet(nn.Module):
    """ResNet backbone plus classifier, parameterised by its residual block.

    Args:
        block: Residual block class. It must expose an ``expansion``
            attribute and be constructible as
            ``block(in_channels, out_channels, stride, downsample)``.
        layers: Number of blocks in each of the four stages
            (indices 0 to 3 are read).
        num_classes: Number of logits produced by the final linear layer.
    """

    def __init__(self, block, layers, num_classes=1000):
        super().__init__()
        # Running input-channel count; updated by _make_layer per stage.
        self.in_channels = 64

        # Stem: 7x7 stride-2 convolution, batch norm, ReLU, 3x3 stride-2 pool.
        self.conv1 = nn.Conv2d(3, 64, 7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        # Registered once instead of building a new nn.ReLU on every forward
        # call. It holds no parameters, so state_dict keys are unchanged.
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(3, stride=2, padding=1)

        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        # Pool to 1x1 so the head does not depend on the input resolution.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, out_channels, blocks, stride=1):
        """Build one stage of ``blocks`` residual blocks as an nn.Sequential.

        Only the first block receives ``stride`` and the optional projection
        shortcut; the remaining blocks use the block's defaults. Updates
        ``self.in_channels`` to the stage's output channel count.
        """
        downsample = None
        # A 1x1 conv + batch-norm projection is needed whenever the shortcut
        # would not match the main branch in stride or in channel count.
        if stride != 1 or self.in_channels != out_channels * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_channels, out_channels * block.expansion,
                          1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels * block.expansion),
            )

        layers = [block(self.in_channels, out_channels, stride, downsample)]
        self.in_channels = out_channels * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.in_channels, out_channels))

        return nn.Sequential(*layers)

    def forward(self, x):
        """Return class logits for a batch of 3-channel images ``x``."""
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        # Flatten everything except the batch dimension.
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x

def resnet18(num_classes=1000):
    """Build a ResNet from ``BasicBlock`` with stage depths [2, 2, 2, 2].

    Args:
        num_classes: Number of logits produced by the final linear layer.
    """
    return ResNet(BasicBlock, [2, 2, 2, 2], num_classes)

def resnet34(num_classes=1000):
    """Build a ResNet from ``BasicBlock`` with stage depths [3, 4, 6, 3].

    Args:
        num_classes: Number of logits produced by the final linear layer.
    """
    return ResNet(BasicBlock, [3, 4, 6, 3], num_classes)

迁移学习实战

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import torchvision.models as models
import torchvision.transforms as transforms

# Load a ResNet-50 with ImageNet weights. ``pretrained=True`` is deprecated
# in torchvision; IMAGENET1K_V1 is the checkpoint that flag used to select.
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)

# Freeze every pretrained parameter so the backbone is not updated.
for param in model.parameters():
    param.requires_grad = False

# Replace the classification head. The new layers are created after the
# freeze loop, so their parameters keep the default requires_grad=True.
num_features = model.fc.in_features
model.fc = nn.Sequential(
    nn.Dropout(0.5),
    nn.Linear(num_features, 256),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(256, 10),  # 10 target classes
)

# Per-channel statistics shared by the train and test pipelines, so the two
# can never drift apart.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Training-time augmentation.
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
])

# Deterministic evaluation pipeline: resize, centre crop, normalise.
test_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
])

模型对比

| 模型 | 深度 | Top-5精度 | 参数量 |
| --- | --- | --- | --- |
| VGG-16 | 16 | 92.0% | 138M |
| ResNet-18 | 18 | 91.2% | 11.7M |
| ResNet-50 | 50 | 93.6% | 25.6M |
| ResNet-101 | 101 | 94.1% | 44.5M |

总结

从VGG到ResNet,图像分类架构的演进核心在于如何有效训练更深的网络。VGG证明了深度的重要性,ResNet通过残差连接解决了退化问题,使得训练上百层的网络成为可能。迁移学习使得预训练模型能够快速适应新任务,大幅降低了数据需求和训练成本。

© 2019-2026 ovo$^{mc^2}$ All Rights Reserved. | 站点总访问 28969 次 | 访客 19045
Theme by hiero