LLM Quantization: A Practical Guide to INT4 and INT8


Introduction

Large language models have enormous parameter counts, which makes deployment and inference challenging. Model quantization lowers the numerical precision of weights, sharply reducing memory usage and compute cost while largely preserving model quality. This article walks through the technical foundations of LLM quantization, covering INT8 and INT4 schemes as well as advanced algorithms such as GPTQ and AWQ.

Quantization Fundamentals

1. Quantization Principles

import torch
import numpy as np

class Quantizer:
    """
    Base quantizer: maps a floating-point tensor to low-bit integers
    using an affine (scale / zero-point) scheme.
    """
    def __init__(self, num_bits=8):
        self.num_bits = num_bits
        self.qmin = -(2 ** (num_bits - 1))
        self.qmax = 2 ** (num_bits - 1) - 1

    def quantize(self, x: torch.Tensor) -> tuple:
        """
        Quantize: tensor -> (quantized values, scale, zero_point)
        """
        # Derive scale and zero_point from the tensor's value range
        x_min = x.min()
        x_max = x.max()

        scale = (x_max - x_min) / (self.qmax - self.qmin)
        zero_point = self.qmin - x_min / scale

        # Quantize: rescale, shift, round, then clamp to the integer range
        x_quant = torch.round(x / scale + zero_point)
        x_quant = torch.clamp(x_quant, self.qmin, self.qmax)

        return x_quant.to(torch.int8), scale, zero_point

    def dequantize(self, x_quant, scale, zero_point):
        """
        Dequantize: quantized values -> floating-point tensor
        """
        return (x_quant.float() - zero_point) * scale


class DynamicQuantizer:
    """
    Dynamic quantization: weights are quantized ahead of time,
    activations are quantized on the fly during inference.
    """
    def __init__(self):
        self.weight_scale = None
        self.weight_zero_point = None

    def quantize_weights(self, weight: torch.Tensor):
        """Quantize weights (symmetric, per-tensor)."""
        scale = weight.abs().max() / 127.0
        weight_quant = torch.round(weight / scale)
        weight_quant = torch.clamp(weight_quant, -128, 127)

        self.weight_scale = scale
        return weight_quant.to(torch.int8)

    def quantize_activations(self, x: torch.Tensor):
        """Dynamically quantize activations."""
        scale = x.abs().max() / 127.0
        x_quant = torch.round(x / scale)
        x_quant = torch.clamp(x_quant, -128, 127)
        return x_quant.to(torch.int8), scale

    def matmul_int8(self, x_quant, x_scale, w_quant):
        """INT8 matrix multiply (simulated here by dequantizing both operands)."""
        x_float = x_quant.float() * x_scale
        w_float = w_quant.float() * self.weight_scale
        return torch.matmul(x_float, w_float.t())
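
To see the affine scheme in action, here is a minimal round-trip check of the Quantizer above; the tensor shape and values are arbitrary illustration inputs, not anything prescribed by the method itself.

# Quick sanity check: quantize -> dequantize round trip (illustrative data)
quantizer = Quantizer(num_bits=8)
x = torch.randn(4, 8)

x_q, scale, zero_point = quantizer.quantize(x)
x_rec = quantizer.dequantize(x_q, scale, zero_point)

# With 8 bits the reconstruction error should be tiny compared to the value range
print("max abs error:", (x - x_rec).abs().max().item())
print("value range  :", (x.max() - x.min()).item())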

2. The GPTQ Algorithm

class GPTQQuantizer:
    """
    GPTQ (Generative Pretrained Transformer Quantization)
    Layer-by-layer quantization guided by approximate second-order information.
    The per-group round-to-nearest below is a simplified stand-in for the full
    Hessian-based weight update used by the original algorithm.
    """
    def __init__(self, model, bit_width=4):
        self.model = model
        self.bit_width = bit_width
        self.quant_stats = {}

    def quantize_layer(self, layer, weight_name: str):
        """
        Quantize a single layer's weight matrix, group by group.
        """
        weight = layer.state_dict()[weight_name].clone()

        # Linear weights are stored as (out_features, in_features)
        out_features, in_features = weight.shape

        # Group size: 4-bit quantization uses one scale per group of 128 rows
        if self.bit_width == 4:
            group_size = 128
        else:
            group_size = out_features  # a single per-tensor scale

        num_groups = (out_features + group_size - 1) // group_size

        # Fake quantization: store the dequantized values back in place
        quantized_weights = torch.zeros_like(weight)

        for g in range(num_groups):
            start = g * group_size
            end = min((g + 1) * group_size, out_features)

            w_g = weight[start:end, :]

            # Symmetric scale for this group
            scale = w_g.abs().max() / (2 ** (self.bit_width - 1) - 1)

            # Round to the nearest level and clamp to the signed range
            w_q = torch.round(w_g / scale)
            w_q = torch.clamp(w_q, -(2 ** (self.bit_width - 1)), 2 ** (self.bit_width - 1) - 1)

            quantized_weights[start:end, :] = w_q * scale

        return quantized_weights

    def calibrate(self, calibration_data):
        """
        Calibration: run a small amount of data through the model so that
        the statistics needed for quantization can be collected.
        """
        self.model.eval()

        with torch.no_grad():
            for batch in calibration_data:
                self.model(batch)

    def quantize_model(self):
        """
        Quantize every Linear layer in the model.
        """
        quantized_state_dict = {}

        for name, module in self.model.named_modules():
            if isinstance(module, torch.nn.Linear):
                # Quantize the linear layer's weight
                quantized_weight = self.quantize_layer(module, 'weight')
                quantized_state_dict[name + '.weight'] = quantized_weight

        return quantized_state_dict
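
A short usage sketch of the simplified GPTQQuantizer: applying it to a toy model and inspecting the result. The toy model and its layer sizes are placeholders chosen only for illustration.

# Illustrative run on a toy model (layer sizes are arbitrary placeholders)
toy_model = torch.nn.Sequential(
    torch.nn.Linear(512, 1024),
    torch.nn.ReLU(),
    torch.nn.Linear(1024, 512),
)

gptq = GPTQQuantizer(toy_model, bit_width=4)
quantized_sd = gptq.quantize_model()

# Each quantized weight keeps its shape, but within a 128-row group the values
# collapse onto at most 2**4 quantization levels
for name, w in quantized_sd.items():
    print(name, tuple(w.shape), "levels in first group:", w[:128].unique().numel())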

3. The AWQ Algorithm

class AWQQuantizer:
    """
    AWQ (Activation-aware Weight Quantization)
    Weight quantization that takes the activation distribution into account.
    """
    def __init__(self, model, bit_width=4):
        self.model = model
        self.bit_width = bit_width

    def find_best_scale(self, weight, activations, n_grid=128):
        """
        Grid-search the quantization scale that minimizes the weight
        reconstruction error. (`activations` is accepted for activation-aware
        weighting but is not used in this simplified search.)
        """
        best_scale = None
        best_error = float('inf')

        qmin = -(2 ** (self.bit_width - 1))
        qmax = 2 ** (self.bit_width - 1) - 1

        # Maximum absolute weight value, used as the base of the clipping range
        weight_absmax = weight.abs().max()

        for scale_percent in np.linspace(0.5, 2.0, n_grid):
            scale = weight_absmax * scale_percent / qmax

            # Quantize, clamp, then dequantize
            w_q = torch.round(weight / scale)
            w_q = torch.clamp(w_q, qmin, qmax) * scale

            # Mean squared reconstruction error
            error = ((weight - w_q) ** 2).mean()

            if error < best_error:
                best_error = error
                best_scale = scale

        return best_scale

    def quantize_with_calibration(self, calibration_loader):
        """
        Collect per-layer activation statistics from calibration data.
        """
        act_scales = {}

        def hook_fn(module, input, output):
            if isinstance(input[0], torch.Tensor):
                key = id(module)
                if key not in act_scales:
                    act_scales[key] = []
                act_scales[key].append(input[0].detach().abs().max())

        # Register a forward hook on every Linear layer
        handles = []
        for name, module in self.model.named_modules():
            if isinstance(module, torch.nn.Linear):
                h = module.register_forward_hook(hook_fn)
                handles.append(h)

        # Run the calibration data through the model
        self.model.eval()
        with torch.no_grad():
            for batch in calibration_loader:
                self.model(batch)

        # Remove the hooks
        for h in handles:
            h.remove()

        # Average the observed per-batch activation maxima into one scale per layer
        for key in act_scales:
            act_scales[key] = torch.stack(act_scales[key]).mean()

        return act_scales
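
A quick illustration of the grid search in find_best_scale on a random weight matrix; the Linear layer below is a stand-in, and activations are omitted because this simplified search does not use them.

# Illustrative scale search on a randomly initialized layer (placeholder data)
dummy_layer = torch.nn.Linear(256, 256)
awq = AWQQuantizer(dummy_layer, bit_width=4)

best_scale = awq.find_best_scale(dummy_layer.weight.data, activations=None)
print("selected scale:", best_scale.item())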

Quantization in Practice with Transformers

1. BitsAndBytes Configuration

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

class LLMQuantizer:
    """
    Quantized model loading with HuggingFace BitsAndBytes.
    """
    def __init__(self, model_path: str):
        self.model_path = model_path

    def load_int8_model(self):
        """
        Load a model with INT8 quantization.
        """
        quant_config = BitsAndBytesConfig(
            load_in_8bit=True,
            llm_int8_threshold=6.0,              # outlier threshold
            llm_int8_has_fp16_weight=False,
            llm_int8_skip_modules=None,          # modules to keep unquantized
            llm_int8_enable_fp32_cpu_offload=False
        )

        model = AutoModelForCausalLM.from_pretrained(
            self.model_path,
            quantization_config=quant_config,
            device_map="auto",
            trust_remote_code=True
        )

        return model

    def load_int4_model(self):
        """
        Load a model with INT4 quantization (the QLoRA NF4 scheme).
        """
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",           # NormalFloat4
            bnb_4bit_use_double_quant=True,      # double quantization
            bnb_4bit_quant_storage=torch.uint8
        )

        model = AutoModelForCausalLM.from_pretrained(
            self.model_path,
            quantization_config=quant_config,
            device_map="auto",
            trust_remote_code=True
        )

        return model

    def get_memory_footprint(self, model):
        """
        Report the model's memory footprint.
        """
        param_size = 0
        buffer_size = 0

        for param in model.parameters():
            param_size += param.nelement() * param.element_size()

        for buffer in model.buffers():
            buffer_size += buffer.nelement() * buffer.element_size()

        total_size = param_size + buffer_size
        return {
            "total_mb": total_size / 1024 / 1024,
            "total_gb": total_size / 1024 / 1024 / 1024,
            "param_size_mb": param_size / 1024 / 1024
        }
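
A usage sketch of the loader above: load a checkpoint in 4-bit and report its footprint. The model path is only a placeholder; substitute whatever causal LM checkpoint you actually deploy, and note that 4-bit loading requires the bitsandbytes package and a CUDA GPU.

# Example: load a checkpoint in 4-bit and report its memory footprint
# (the model path is a placeholder -- replace it with your own checkpoint)
quantizer = LLMQuantizer("meta-llama/Llama-2-7b-hf")

model_4bit = quantizer.load_int4_model()
footprint = quantizer.get_memory_footprint(model_4bit)
print(f"4-bit model size: {footprint['total_gb']:.2f} GB")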

2. GGML/llama.cpp Quantization

# Quantize a model with llama.cpp
# 1. Convert the checkpoint to GGUF format
python3 convert.py models/llama-7b --outfile models/llama-7b/ggml-model-f16.gguf

# 2. Quantize the GGUF file to 4-bit
./quantize models/llama-7b/ggml-model-f16.gguf \
    models/llama-7b/ggml-model-q4_0.gguf q4_0
class GGMLQuantizer:
    """
    GGML/GGUF quantization configuration.
    """
    QUANT_TYPES = {
        "q4_0": {
            "bits": 4,
            "description": "4-bit quantization, legacy scheme",
            "memory_factor": 4.5
        },
        "q4_1": {
            "bits": 4,
            "description": "4-bit quantization with an extra scale",
            "memory_factor": 4.7
        },
        "q5_0": {
            "bits": 5,
            "description": "5-bit quantization",
            "memory_factor": 5.5
        },
        "q5_1": {
            "bits": 5,
            "description": "5-bit quantization with an extra scale",
            "memory_factor": 5.7
        },
        "q8_0": {
            "bits": 8,
            "description": "8-bit quantization, near lossless",
            "memory_factor": 8.5
        },
        "f16": {
            "bits": 16,
            "description": "half precision",
            "memory_factor": 2.0
        }
    }

    def quantize_model(self, input_path: str, output_path: str,
                       quant_type: str = "q4_0"):
        """
        Shell out to the llama.cpp quantize binary.
        """
        import subprocess

        cmd = [
            "./llama.cpp/quantize",
            input_path,
            output_path,
            quant_type
        ]

        result = subprocess.run(cmd, capture_output=True, text=True)
        return result.returncode == 0
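
As a usage example, the wrapper can be pointed at the GGUF file produced by the conversion step above; the paths mirror the earlier shell commands and are placeholders.

# Example: produce a q5_1 variant of the converted GGUF file (paths are placeholders)
ggml = GGMLQuantizer()
ok = ggml.quantize_model(
    "models/llama-7b/ggml-model-f16.gguf",
    "models/llama-7b/ggml-model-q5_1.gguf",
    quant_type="q5_1",
)
print("quantization succeeded:", ok)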

Quantization Performance Comparison

Memory and Performance Comparison

| Method | Precision | 7B model size | Relative accuracy | Inference speed |
|--------|-----------|---------------|-------------------|-----------------|
| FP16   | 16-bit    | ~14 GB        | 100%              | 1x              |
| INT8   | 8-bit     | ~7 GB         | ~98%              | 1.5x            |
| Q4_0   | 4-bit     | ~3.5 GB       | ~96%              | 2.2x            |
| Q4_K_S | 4-bit     | ~3.9 GB       | ~97%              | 2.0x            |
| Q5_K_M | 5-bit     | ~4.5 GB       | ~98%              | 1.8x            |
| Q8_0   | 8-bit     | ~6.5 GB       | ~99%              | 1.6x            |
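
The sizes in the table follow almost directly from bits per weight: a model needs roughly parameters x bits / 8 bytes, plus a small overhead for group scales, embeddings, and buffers. A rough back-of-the-envelope check:

def estimate_model_size_gb(num_params: float, bits_per_weight: float) -> float:
    """Rough size estimate: parameters x bits / 8 bytes, ignoring scale overhead."""
    return num_params * bits_per_weight / 8 / 1e9

# 7B parameters at different precisions (decimal GB, close to the table above)
for name, bits in [("FP16", 16), ("INT8", 8), ("INT4", 4)]:
    print(f"{name}: {estimate_model_size_gb(7e9, bits):.1f} GB")
# FP16: 14.0 GB, INT8: 7.0 GB, INT4: 3.5 GB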

Implementing Quantization Calibration

class CalibrationDataset:
    """
    Calibration dataset for quantization.
    """
    def __init__(self, tokenizer, max_samples=512):
        self.tokenizer = tokenizer
        self.max_samples = max_samples
        self.data = []

    def add_samples(self, texts: list):
        """Add calibration samples."""
        for text in texts:
            encoded = self.tokenizer(
                text,
                truncation=True,
                max_length=2048,
                return_tensors="pt"
            )
            self.data.append(encoded)

    def __iter__(self):
        """Yield calibration batches, capped at max_samples."""
        for sample in self.data[:self.max_samples]:
            yield sample['input_ids']

    def __len__(self):
        return min(len(self.data), self.max_samples)


def evaluate_quantized_model(model, quantizer, test_data):
    """
    Evaluate a quantized model: task accuracy plus memory footprint.
    """
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in test_data:
            outputs = model(batch)
            # Task-specific accuracy computation goes here, updating
            # `correct` and `total`
            # ...

    accuracy = correct / total if total > 0 else 0.0

    # Memory footprint of the quantized model
    memory = quantizer.get_memory_footprint(model)

    return {
        "accuracy": accuracy,
        "memory_gb": memory["total_gb"]
    }
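
To tie the pieces together, here is a brief sketch of feeding the calibration dataset into the GPTQ calibration pass defined earlier. The tokenizer name and sample texts are placeholders, and `model` stands for whichever causal LM is being quantized.

from transformers import AutoTokenizer

# Placeholder tokenizer and texts -- in practice use the target model's tokenizer
# and a few hundred samples drawn from its deployment domain
tokenizer = AutoTokenizer.from_pretrained("gpt2")
calib = CalibrationDataset(tokenizer, max_samples=128)
calib.add_samples([
    "Quantization reduces the precision of model weights.",
    "Calibration data should resemble the deployment distribution.",
])

# `model` is assumed to be the causal LM being quantized
gptq = GPTQQuantizer(model, bit_width=4)
gptq.calibrate(calib)      # forwards each input_ids batch through the model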

Summary

Model quantization is a key technology for LLM deployment. INT8 and INT4 quantization shrink a model to roughly one half and one quarter of its FP16 size (a 4-8x reduction compared with FP32) while retaining most of its quality. Advanced algorithms such as GPTQ and AWQ stay accurate even at very low bit widths. This article has covered the underlying theory along with working code, so that developers can choose the right trade-off when deploying large models.
