Model Quantization: Compressing and Accelerating Deep Learning Models

Introduction

Model quantization is an effective model compression technique: it converts floating-point weights and activations into low-precision representations (such as INT8 or INT4) to shrink model size and speed up inference. This article walks through the principles, methods, and implementation of quantization.

Quantization Fundamentals

Quantization maps continuous floating-point values to discrete integer values:

```
x_quantized = round(x_float / scale) + zero_point
x_float ≈ (x_quantized - zero_point) * scale
```
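
For example, with scale = 0.1 and zero_point = 128, the value x_float = 0.53 quantizes to round(0.53 / 0.1) + 128 = 133, which dequantizes back to (133 - 128) * 0.1 = 0.5; the gap of 0.03 is the rounding error inherent to quantization.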
A minimal implementation of the two basic schemes, symmetric and affine quantization:

```python
import torch
import torch.nn as nn

class Quantizer:
    """Base class for quantizers."""

    def __init__(self, num_bits=8):
        self.num_bits = num_bits
        self.num_levels = 2 ** num_bits

    def quantize(self, tensor):
        raise NotImplementedError

    def dequantize(self, quantized_tensor):
        raise NotImplementedError

class SymmetricQuantizer(Quantizer):
    """Symmetric quantization."""

    def __init__(self, num_bits=8):
        super().__init__(num_bits)
        self.scale = None
        self.zero_point = 0  # always 0 for symmetric quantization

    def quantize(self, tensor):
        # Compute the scale from the largest absolute value
        max_val = tensor.abs().max()
        self.scale = max_val / (self.num_levels / 2 - 1)

        # Quantize and clamp to the signed integer range
        quantized = torch.round(tensor / self.scale)
        quantized = torch.clamp(quantized,
                                -(self.num_levels / 2 - 1),
                                self.num_levels / 2 - 1)
        return quantized.to(torch.int8)

    def dequantize(self, quantized_tensor):
        return quantized_tensor.float() * self.scale

class AffineQuantizer(Quantizer):
    """Asymmetric quantization (affine / zero-point quantization)."""

    def __init__(self, num_bits=8):
        super().__init__(num_bits)
        self.scale = None
        self.zero_point = None

    def quantize(self, tensor):
        # Compute scale and zero_point from the full value range
        min_val = tensor.min()
        max_val = tensor.max()

        self.scale = (max_val - min_val) / (self.num_levels - 1)
        self.zero_point = torch.round(-min_val / self.scale)
        self.zero_point = torch.clamp(self.zero_point, 0, self.num_levels - 1)

        # Quantize and clamp to the unsigned integer range
        quantized = torch.round(tensor / self.scale) + self.zero_point
        quantized = torch.clamp(quantized, 0, self.num_levels - 1)

        # The affine range [0, 255] does not fit in int8, so use uint8
        return quantized.to(torch.uint8)

    def dequantize(self, quantized_tensor):
        return (quantized_tensor.float() - self.zero_point) * self.scale
```
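
As a quick sanity check, the SymmetricQuantizer defined above can be round-tripped on a random tensor to inspect the reconstruction error (a minimal sketch; the tensor values are arbitrary):

```python
# Round-trip a random tensor through the symmetric quantizer
# and inspect the worst-case reconstruction error.
w = torch.randn(4, 4)
q = SymmetricQuantizer(num_bits=8)
w_hat = q.dequantize(q.quantize(w))
print("max abs error:", (w - w_hat).abs().max().item())
```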

Quantization-Aware Training

Quantization-aware training (QAT) simulates the effect of quantization during training, so the network learns to tolerate the rounding error:

```python
class FakeQuantize(nn.Module):
    """Fake-quantization module: quantize then dequantize in float."""

    def __init__(self, num_bits=8, quantize_forward=True):
        super().__init__()
        self.num_bits = num_bits
        self.quantize_forward = quantize_forward

        # Learnable scale and zero_point
        self.scale = nn.Parameter(torch.ones(1))
        self.zero_point = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        if self.training and self.quantize_forward:
            # Forward pass: simulate quantization
            x = self._fake_quantize(x)
        return x

    def _fake_quantize(self, x):
        # Quantize
        x_quantized = torch.round(x / self.scale) + self.zero_point
        x_quantized = torch.clamp(x_quantized, 0, 2 ** self.num_bits - 1)

        # Dequantize
        x_dequantized = (x_quantized - self.zero_point) * self.scale

        # Straight-through estimator (STE): rounding is treated as identity
        # in the backward pass, so gradients flow through x unchanged.
        # Note that the detach also blocks gradients to scale and zero_point,
        # so they stay fixed unless updated externally.
        x_dequantized = x + (x_dequantized - x).detach()

        return x_dequantized

class QuantizedLinear(nn.Module):
    """Fully connected layer with quantization support."""

    def __init__(self, in_features, out_features, bias=True, num_bits=8):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features

        # Floating-point weights
        self.weight = nn.Parameter(torch.randn(out_features, in_features))
        self.bias = nn.Parameter(torch.randn(out_features)) if bias else None

        # Quantizer used at inference time
        self.weight_quantizer = SymmetricQuantizer(num_bits)

        # Fake quantization used during training
        self.fake_quantize = FakeQuantize(num_bits)

    def forward(self, x):
        if self.training:
            # Training: fake-quantize the weights
            weight = self.fake_quantize(self.weight)
        else:
            # Inference: quantize, then dequantize the weights
            weight = self.weight_quantizer.dequantize(
                self.weight_quantizer.quantize(self.weight)
            )

        return nn.functional.linear(x, weight, self.bias)
```
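
A small smoke test of QuantizedLinear in both modes (a sketch with arbitrary shapes, not part of a real training pipeline):

```python
layer = QuantizedLinear(16, 8, num_bits=8)
x = torch.randn(2, 16)

layer.train()
y_train = layer(x)   # uses fake quantization on the weights

layer.eval()
y_eval = layer(x)    # uses real quantize -> dequantize on the weights
print(y_train.shape, y_eval.shape)  # torch.Size([2, 8]) twice
```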

Post-Training Quantization

Post-training quantization (PTQ) quantizes a model after training has finished, using a small calibration set to estimate activation ranges:

```python
class PostTrainingQuantizer:
    """Post-training quantizer."""

    def __init__(self, model, dataloader, num_bits=8):
        self.model = model
        self.dataloader = dataloader
        self.num_bits = num_bits

    def calibrate(self):
        """Calibration: collect activation statistics."""
        self.model.eval()

        # Per-layer activation ranges
        activation_ranges = {}
        hooks = []

        def hook_fn(name):
            def hook(module, input, output):
                if isinstance(output, tuple):
                    output = output[0]
                # Track the running min/max across calibration batches
                batch_min = output.detach().min()
                batch_max = output.detach().max()
                if name in activation_ranges:
                    activation_ranges[name]['min'] = torch.min(
                        activation_ranges[name]['min'], batch_min)
                    activation_ranges[name]['max'] = torch.max(
                        activation_ranges[name]['max'], batch_max)
                else:
                    activation_ranges[name] = {'min': batch_min,
                                               'max': batch_max}
            return hook

        # Register hooks on the layers we want to observe
        for name, module in self.model.named_modules():
            if isinstance(module, (nn.Conv2d, nn.Linear)):
                hooks.append(module.register_forward_hook(hook_fn(name)))

        # Run the calibration data
        with torch.no_grad():
            for i, (inputs, _) in enumerate(self.dataloader):
                if i >= 100:  # calibrate on 100 batches
                    break
                self.model(inputs)

        # Remove the hooks
        for hook in hooks:
            hook.remove()

        return activation_ranges

    def quantize_model(self):
        """Quantize the model weights."""
        # 1. Calibrate to obtain activation ranges
        activation_ranges = self.calibrate()

        # 2. Quantize the weights
        quantized_state_dict = {}
        for name, param in self.model.state_dict().items():
            if not param.dtype.is_floating_point:
                continue  # skip integer buffers
            quantizer = SymmetricQuantizer(self.num_bits)
            quantized_param = quantizer.quantize(param)
            quantized_state_dict[name] = {
                'data': quantized_param,
                'scale': quantizer.scale
            }

        return quantized_state_dict

def static_quantize_model(model, calib_loader):
    """Static quantization with PyTorch's eager-mode API."""
    # Prepare the model
    model.eval()
    model.fuse_model()  # fuse Conv+BN+ReLU (the model must define fuse_model)

    # Quantization configuration
    model.qconfig = torch.quantization.get_default_qconfig('fbgemm')

    # Insert observers
    torch.quantization.prepare(model, inplace=True)

    # Calibrate
    with torch.no_grad():
        for inputs, _ in calib_loader:
            model(inputs)

    # Convert to a quantized model
    quantized_model = torch.quantization.convert(model, inplace=False)

    return quantized_model
```
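
To make the calibration flow concrete, here is a hedged end-to-end sketch driving PostTrainingQuantizer with a toy model and random calibration data (the model, dataset, and sizes are all illustrative placeholders):

```python
from torch.utils.data import DataLoader, TensorDataset

# Tiny stand-in model and random calibration data (illustrative only)
model = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 4))
data = TensorDataset(torch.randn(64, 8), torch.randint(0, 4, (64,)))
calib_loader = DataLoader(data, batch_size=16)

ptq = PostTrainingQuantizer(model, calib_loader, num_bits=8)
ranges = ptq.calibrate()
print(list(ranges.keys()))     # one min/max range per Linear layer

quantized_state = ptq.quantize_model()
name, entry = next(iter(quantized_state.items()))
print(name, entry['data'].dtype, entry['scale'])  # int8 data plus its scale
```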

Dynamic Quantization

Dynamic quantization quantizes only the weights ahead of time; activations are quantized on the fly during inference:

```python
def dynamic_quantize_lstm(model, dtype=torch.qint8):
    """Dynamic quantization for LSTM-based models."""
    quantized_model = torch.quantization.quantize_dynamic(
        model,                  # model to quantize
        {nn.LSTM, nn.Linear},   # layer types to quantize
        dtype=dtype             # quantized dtype
    )
    return quantized_model

class DynamicQuantizedLinear(nn.Module):
    """Fully connected layer with dynamically quantized weights."""

    def __init__(self, in_features, out_features, bias=True):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(out_features, in_features))
        self.bias = nn.Parameter(torch.randn(out_features)) if bias else None

        # Lazily pre-quantized weights
        self._quantized_weight = None

    def quantize(self):
        """Quantize the weights once, symmetric per-tensor."""
        scale = self.weight.abs().max() / 127
        self._quantized_weight = torch.quantize_per_tensor(
            self.weight.detach(), scale.item(), 0, torch.qint8
        )

    def forward(self, x):
        if self._quantized_weight is None:
            self.quantize()

        # Dequantize the stored int8 weights for the float matmul
        weight = self._quantized_weight.dequantize()

        return nn.functional.linear(x.float(), weight, self.bias)
```
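
A quick way to see the benefit is to compare the on-disk size of a model before and after quantize_dynamic (a sketch; the model shape and file names are arbitrary):

```python
import os

# Compare stored size before and after dynamic quantization
model = nn.Sequential(nn.Linear(256, 256), nn.ReLU(), nn.Linear(256, 10))
qmodel = torch.quantization.quantize_dynamic(
    model, {nn.Linear}, dtype=torch.qint8)

for m, path in [(model, 'fp32.pt'), (qmodel, 'int8.pt')]:
    torch.save(m.state_dict(), path)
    print(path, os.path.getsize(path), 'bytes')  # int8 is roughly 4x smaller
    os.remove(path)
```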

Quantization-Aware Fine-Tuning

Fine-tuning after quantization helps recover the accuracy lost to rounding:

```python
class QuantizationAwareTraining:
    """Quantization-aware training loop."""

    def __init__(self, model, train_loader, lr=1e-4):
        self.model = model
        self.train_loader = train_loader
        self.optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        self.criterion = nn.CrossEntropyLoss()

    def train_epoch(self):
        self.model.train()
        total_loss = 0

        for inputs, targets in self.train_loader:
            self.optimizer.zero_grad()

            outputs = self.model(inputs)
            loss = self.criterion(outputs, targets)
            loss.backward()

            self.optimizer.step()
            total_loss += loss.item()

        return total_loss / len(self.train_loader)

    def quantization_aware_train(self, epochs=10):
        """Run the QAT loop."""
        for epoch in range(epochs):
            loss = self.train_epoch()
            print(f"Epoch {epoch + 1}: Loss = {loss:.4f}")

            # Periodically refresh the quantization parameters
            if (epoch + 1) % 5 == 0:
                self.update_quantization_params()

    def update_quantization_params(self):
        """Update quantization parameters (placeholder)."""
        for module in self.model.modules():
            if hasattr(module, 'scale'):
                # Recompute the scale from current weight statistics here;
                # the exact rule is model-specific and left as a stub
                pass
```
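
Putting the pieces together: a hedged end-to-end sketch that trains a tiny network built from the QuantizedLinear layer above with the QAT loop (random data, purely illustrative; with the default FakeQuantize settings this mainly exercises the mechanics rather than producing a good model):

```python
from torch.utils.data import DataLoader, TensorDataset

# Toy model and random data, for demonstration only
model = nn.Sequential(QuantizedLinear(8, 32), nn.ReLU(),
                      QuantizedLinear(32, 4))
data = TensorDataset(torch.randn(128, 8), torch.randint(0, 4, (128,)))
loader = DataLoader(data, batch_size=32)

qat = QuantizationAwareTraining(model, loader, lr=1e-3)
qat.quantization_aware_train(epochs=2)

model.eval()  # inference now uses real quantize -> dequantize weights
```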

Quantization Method Comparison

| Method | Accuracy Loss | Speedup | Typical Use Case |
|--------|---------------|---------|------------------|
| FP32 | – | 1x | baseline |
| INT8 | 1-2% | 2-4x | general purpose |
| INT4 | 3-5% | 4-8x | extreme compression |
| Binary | 5-10% | 8-32x | specialized hardware |

Summary

Model quantization is a key technique for deploying deep learning models on edge devices. With a sensible quantization strategy, you can substantially reduce storage and speed up inference while largely preserving model accuracy.
