Model Quantization: Compressing and Accelerating Deep Learning Models

Introduction

Model quantization is an effective model compression technique: it converts floating-point weights and activations into low-precision representations (such as INT8 or INT4) to shrink model size and speed up inference. This article walks through the principles, methods, and implementation of quantization.

Quantization Fundamentals

Quantization maps continuous floating-point values to discrete integer values:

```
x_quantized = round(x_float / scale) + zero_point
x_float ≈ (x_quantized - zero_point) * scale
```
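
For example, with scale = 0.1 and zero_point = 128, the value x_float = 0.53 quantizes to round(0.53 / 0.1) + 128 = 133, which dequantizes back to (133 - 128) * 0.1 = 0.5; the gap of 0.03 is the rounding error inherent to quantization.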
A minimal implementation of the two basic schemes, symmetric and affine quantization:

```python
import torch
import torch.nn as nn

class Quantizer:
    """Base class for quantizers."""

    def __init__(self, num_bits=8):
        self.num_bits = num_bits
        self.num_levels = 2 ** num_bits

    def quantize(self, tensor):
        raise NotImplementedError

    def dequantize(self, quantized_tensor):
        raise NotImplementedError

class SymmetricQuantizer(Quantizer):
    """Symmetric quantization."""

    def __init__(self, num_bits=8):
        super().__init__(num_bits)
        self.scale = None
        self.zero_point = 0  # always 0 for symmetric quantization

    def quantize(self, tensor):
        # Compute the scale from the largest absolute value
        max_val = tensor.abs().max()
        self.scale = max_val / (self.num_levels / 2 - 1)

        # Quantize and clamp to the signed integer range
        quantized = torch.round(tensor / self.scale)
        quantized = torch.clamp(quantized,
                                -(self.num_levels / 2 - 1),
                                self.num_levels / 2 - 1)
        return quantized.to(torch.int8)

    def dequantize(self, quantized_tensor):
        return quantized_tensor.float() * self.scale

class AffineQuantizer(Quantizer):
    """Asymmetric quantization (affine / zero-point quantization)."""

    def __init__(self, num_bits=8):
        super().__init__(num_bits)
        self.scale = None
        self.zero_point = None

    def quantize(self, tensor):
        # Compute scale and zero_point from the full value range
        min_val = tensor.min()
        max_val = tensor.max()

        self.scale = (max_val - min_val) / (self.num_levels - 1)
        self.zero_point = torch.round(-min_val / self.scale)
        self.zero_point = torch.clamp(self.zero_point, 0, self.num_levels - 1)

        # Quantize and clamp to the unsigned integer range
        quantized = torch.round(tensor / self.scale) + self.zero_point
        quantized = torch.clamp(quantized, 0, self.num_levels - 1)

        # The affine range [0, 255] does not fit in int8, so use uint8
        return quantized.to(torch.uint8)

    def dequantize(self, quantized_tensor):
        return (quantized_tensor.float() - self.zero_point) * self.scale
```
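
As a quick sanity check, the SymmetricQuantizer defined above can be round-tripped on a random tensor to inspect the reconstruction error (a minimal sketch; the tensor values are arbitrary):

```python
# Round-trip a random tensor through the symmetric quantizer
# and inspect the worst-case reconstruction error.
w = torch.randn(4, 4)
q = SymmetricQuantizer(num_bits=8)
w_hat = q.dequantize(q.quantize(w))
print("max abs error:", (w - w_hat).abs().max().item())
```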

Quantization-Aware Training

Quantization-aware training (QAT) simulates the effect of quantization during training, so the network learns to tolerate the rounding error:

```python
class FakeQuantize(nn.Module):
    """Fake-quantization module: quantize then dequantize in float."""

    def __init__(self, num_bits=8, quantize_forward=True):
        super().__init__()
        self.num_bits = num_bits
        self.quantize_forward = quantize_forward

        # Learnable scale and zero_point
        self.scale = nn.Parameter(torch.ones(1))
        self.zero_point = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        if self.training and self.quantize_forward:
            # Forward pass: simulate quantization
            x = self._fake_quantize(x)
        return x

    def _fake_quantize(self, x):
        # Quantize
        x_quantized = torch.round(x / self.scale) + self.zero_point
        x_quantized = torch.clamp(x_quantized, 0, 2 ** self.num_bits - 1)

        # Dequantize
        x_dequantized = (x_quantized - self.zero_point) * self.scale

        # Straight-through estimator (STE): rounding is treated as identity
        # in the backward pass, so gradients flow through x unchanged.
        # Note that the detach also blocks gradients to scale and zero_point,
        # so they stay fixed unless updated externally.
        x_dequantized = x + (x_dequantized - x).detach()

        return x_dequantized

class QuantizedLinear(nn.Module):
    """Fully connected layer with quantization support."""

    def __init__(self, in_features, out_features, bias=True, num_bits=8):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features

        # Floating-point weights
        self.weight = nn.Parameter(torch.randn(out_features, in_features))
        self.bias = nn.Parameter(torch.randn(out_features)) if bias else None

        # Quantizer used at inference time
        self.weight_quantizer = SymmetricQuantizer(num_bits)

        # Fake quantization used during training
        self.fake_quantize = FakeQuantize(num_bits)

    def forward(self, x):
        if self.training:
            # Training: fake-quantize the weights
            weight = self.fake_quantize(self.weight)
        else:
            # Inference: quantize, then dequantize the weights
            weight = self.weight_quantizer.dequantize(
                self.weight_quantizer.quantize(self.weight)
            )

        return nn.functional.linear(x, weight, self.bias)
```
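
A small smoke test of QuantizedLinear in both modes (a sketch with arbitrary shapes, not part of a real training pipeline):

```python
layer = QuantizedLinear(16, 8, num_bits=8)
x = torch.randn(2, 16)

layer.train()
y_train = layer(x)   # uses fake quantization on the weights

layer.eval()
y_eval = layer(x)    # uses real quantize -> dequantize on the weights
print(y_train.shape, y_eval.shape)  # torch.Size([2, 8]) twice
```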

Post-Training Quantization

Post-training quantization (PTQ) quantizes a model after training has finished, using a small calibration set to estimate activation ranges:

```python
class PostTrainingQuantizer:
    """Post-training quantizer."""

    def __init__(self, model, dataloader, num_bits=8):
        self.model = model
        self.dataloader = dataloader
        self.num_bits = num_bits

    def calibrate(self):
        """Calibration: collect activation statistics."""
        self.model.eval()

        # Per-layer activation ranges
        activation_ranges = {}
        hooks = []

        def hook_fn(name):
            def hook(module, input, output):
                if isinstance(output, tuple):
                    output = output[0]
                # Track the running min/max across calibration batches
                batch_min = output.detach().min()
                batch_max = output.detach().max()
                if name in activation_ranges:
                    activation_ranges[name]['min'] = torch.min(
                        activation_ranges[name]['min'], batch_min)
                    activation_ranges[name]['max'] = torch.max(
                        activation_ranges[name]['max'], batch_max)
                else:
                    activation_ranges[name] = {'min': batch_min,
                                               'max': batch_max}
            return hook

        # Register hooks on the layers we want to observe
        for name, module in self.model.named_modules():
            if isinstance(module, (nn.Conv2d, nn.Linear)):
                hooks.append(module.register_forward_hook(hook_fn(name)))

        # Run the calibration data
        with torch.no_grad():
            for i, (inputs, _) in enumerate(self.dataloader):
                if i >= 100:  # calibrate on 100 batches
                    break
                self.model(inputs)

        # Remove the hooks
        for hook in hooks:
            hook.remove()

        return activation_ranges

    def quantize_model(self):
        """Quantize the model weights."""
        # 1. Calibrate to obtain activation ranges
        activation_ranges = self.calibrate()

        # 2. Quantize the weights
        quantized_state_dict = {}
        for name, param in self.model.state_dict().items():
            if not param.dtype.is_floating_point:
                continue  # skip integer buffers
            quantizer = SymmetricQuantizer(self.num_bits)
            quantized_param = quantizer.quantize(param)
            quantized_state_dict[name] = {
                'data': quantized_param,
                'scale': quantizer.scale
            }

        return quantized_state_dict

def static_quantize_model(model, calib_loader):
    """Static quantization with PyTorch's eager-mode API."""
    # Prepare the model
    model.eval()
    model.fuse_model()  # fuse Conv+BN+ReLU (the model must define fuse_model)

    # Quantization configuration
    model.qconfig = torch.quantization.get_default_qconfig('fbgemm')

    # Insert observers
    torch.quantization.prepare(model, inplace=True)

    # Calibrate
    with torch.no_grad():
        for inputs, _ in calib_loader:
            model(inputs)

    # Convert to a quantized model
    quantized_model = torch.quantization.convert(model, inplace=False)

    return quantized_model
```
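
To make the calibration flow concrete, here is a hedged end-to-end sketch driving PostTrainingQuantizer with a toy model and random calibration data (the model, dataset, and sizes are all illustrative placeholders):

```python
from torch.utils.data import DataLoader, TensorDataset

# Tiny stand-in model and random calibration data (illustrative only)
model = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 4))
data = TensorDataset(torch.randn(64, 8), torch.randint(0, 4, (64,)))
calib_loader = DataLoader(data, batch_size=16)

ptq = PostTrainingQuantizer(model, calib_loader, num_bits=8)
ranges = ptq.calibrate()
print(list(ranges.keys()))     # one min/max range per Linear layer

quantized_state = ptq.quantize_model()
name, entry = next(iter(quantized_state.items()))
print(name, entry['data'].dtype, entry['scale'])  # int8 data plus its scale
```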

Dynamic Quantization

Dynamic quantization quantizes only the weights ahead of time; activations are quantized on the fly during inference:

```python
def dynamic_quantize_lstm(model, dtype=torch.qint8):
    """Dynamic quantization for LSTM-based models."""
    quantized_model = torch.quantization.quantize_dynamic(
        model,                  # model to quantize
        {nn.LSTM, nn.Linear},   # layer types to quantize
        dtype=dtype             # quantized dtype
    )
    return quantized_model

class DynamicQuantizedLinear(nn.Module):
    """Fully connected layer with dynamically quantized weights."""

    def __init__(self, in_features, out_features, bias=True):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(out_features, in_features))
        self.bias = nn.Parameter(torch.randn(out_features)) if bias else None

        # Lazily pre-quantized weights
        self._quantized_weight = None

    def quantize(self):
        """Quantize the weights once, symmetric per-tensor."""
        scale = self.weight.abs().max() / 127
        self._quantized_weight = torch.quantize_per_tensor(
            self.weight.detach(), scale.item(), 0, torch.qint8
        )

    def forward(self, x):
        if self._quantized_weight is None:
            self.quantize()

        # Dequantize the stored int8 weights for the float matmul
        weight = self._quantized_weight.dequantize()

        return nn.functional.linear(x.float(), weight, self.bias)
```
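
A quick way to see the benefit is to compare the on-disk size of a model before and after quantize_dynamic (a sketch; the model shape and file names are arbitrary):

```python
import os

# Compare stored size before and after dynamic quantization
model = nn.Sequential(nn.Linear(256, 256), nn.ReLU(), nn.Linear(256, 10))
qmodel = torch.quantization.quantize_dynamic(
    model, {nn.Linear}, dtype=torch.qint8)

for m, path in [(model, 'fp32.pt'), (qmodel, 'int8.pt')]:
    torch.save(m.state_dict(), path)
    print(path, os.path.getsize(path), 'bytes')  # int8 is roughly 4x smaller
    os.remove(path)
```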

Quantization-Aware Fine-Tuning

Fine-tuning after quantization helps recover the accuracy lost to rounding:

```python
class QuantizationAwareTraining:
    """Quantization-aware training loop."""

    def __init__(self, model, train_loader, lr=1e-4):
        self.model = model
        self.train_loader = train_loader
        self.optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        self.criterion = nn.CrossEntropyLoss()

    def train_epoch(self):
        self.model.train()
        total_loss = 0

        for inputs, targets in self.train_loader:
            self.optimizer.zero_grad()

            outputs = self.model(inputs)
            loss = self.criterion(outputs, targets)
            loss.backward()

            self.optimizer.step()
            total_loss += loss.item()

        return total_loss / len(self.train_loader)

    def quantization_aware_train(self, epochs=10):
        """Run the QAT loop."""
        for epoch in range(epochs):
            loss = self.train_epoch()
            print(f"Epoch {epoch + 1}: Loss = {loss:.4f}")

            # Periodically refresh the quantization parameters
            if (epoch + 1) % 5 == 0:
                self.update_quantization_params()

    def update_quantization_params(self):
        """Update quantization parameters (placeholder)."""
        for module in self.model.modules():
            if hasattr(module, 'scale'):
                # Recompute the scale from current weight statistics here;
                # the exact rule is model-specific and left as a stub
                pass
```
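
Putting the pieces together: a hedged end-to-end sketch that trains a tiny network built from the QuantizedLinear layer above with the QAT loop (random data, purely illustrative; with the default FakeQuantize settings this mainly exercises the mechanics rather than producing a good model):

```python
from torch.utils.data import DataLoader, TensorDataset

# Toy model and random data, for demonstration only
model = nn.Sequential(QuantizedLinear(8, 32), nn.ReLU(),
                      QuantizedLinear(32, 4))
data = TensorDataset(torch.randn(128, 8), torch.randint(0, 4, (128,)))
loader = DataLoader(data, batch_size=32)

qat = QuantizationAwareTraining(model, loader, lr=1e-3)
qat.quantization_aware_train(epochs=2)

model.eval()  # inference now uses real quantize -> dequantize weights
```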

Quantization Method Comparison

| Method | Accuracy Loss | Speedup | Typical Use Case |
|--------|---------------|---------|------------------|
| FP32 | – | 1x | baseline |
| INT8 | 1-2% | 2-4x | general purpose |
| INT4 | 3-5% | 4-8x | extreme compression |
| Binary | 5-10% | 8-32x | specialized hardware |

Summary

Model quantization is a key technique for deploying deep learning models on edge devices. With a sensible quantization strategy, you can substantially reduce storage and speed up inference while largely preserving model accuracy.
