LLM Quantization: A Practical Guide to INT4 and INT8


Introduction

Large language models have enormous parameter counts, which makes deployment and inference challenging. Model quantization lowers the numerical precision of weights, sharply reducing memory usage and compute cost while largely preserving model quality. This article walks through the technical foundations of LLM quantization, covering INT8 and INT4 schemes as well as advanced algorithms such as GPTQ and AWQ.

Quantization Fundamentals

1. Quantization Principles

import torch
import numpy as np

class Quantizer:
    """
    Base quantizer: maps a floating-point tensor to low-bit integers
    using an affine (scale / zero-point) scheme.
    """
    def __init__(self, num_bits=8):
        self.num_bits = num_bits
        self.qmin = -(2 ** (num_bits - 1))
        self.qmax = 2 ** (num_bits - 1) - 1

    def quantize(self, x: torch.Tensor) -> tuple:
        """
        Quantize: tensor -> (quantized values, scale, zero_point)
        """
        # Derive scale and zero_point from the tensor's value range
        x_min = x.min()
        x_max = x.max()

        scale = (x_max - x_min) / (self.qmax - self.qmin)
        zero_point = self.qmin - x_min / scale

        # Quantize: rescale, shift, round, then clamp to the integer range
        x_quant = torch.round(x / scale + zero_point)
        x_quant = torch.clamp(x_quant, self.qmin, self.qmax)

        return x_quant.to(torch.int8), scale, zero_point

    def dequantize(self, x_quant, scale, zero_point):
        """
        Dequantize: quantized values -> floating-point tensor
        """
        return (x_quant.float() - zero_point) * scale


class DynamicQuantizer:
    """
    Dynamic quantization: weights are quantized ahead of time,
    activations are quantized on the fly during inference.
    """
    def __init__(self):
        self.weight_scale = None
        self.weight_zero_point = None

    def quantize_weights(self, weight: torch.Tensor):
        """Quantize weights (symmetric, per-tensor)."""
        scale = weight.abs().max() / 127.0
        weight_quant = torch.round(weight / scale)
        weight_quant = torch.clamp(weight_quant, -128, 127)

        self.weight_scale = scale
        return weight_quant.to(torch.int8)

    def quantize_activations(self, x: torch.Tensor):
        """Dynamically quantize activations."""
        scale = x.abs().max() / 127.0
        x_quant = torch.round(x / scale)
        x_quant = torch.clamp(x_quant, -128, 127)
        return x_quant.to(torch.int8), scale

    def matmul_int8(self, x_quant, x_scale, w_quant):
        """INT8 matrix multiply (simulated here by dequantizing both operands)."""
        x_float = x_quant.float() * x_scale
        w_float = w_quant.float() * self.weight_scale
        return torch.matmul(x_float, w_float.t())
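
To see the affine scheme in action, here is a minimal round-trip check of the Quantizer above; the tensor shape and values are arbitrary illustration inputs, not anything prescribed by the method itself.

# Quick sanity check: quantize -> dequantize round trip (illustrative data)
quantizer = Quantizer(num_bits=8)
x = torch.randn(4, 8)

x_q, scale, zero_point = quantizer.quantize(x)
x_rec = quantizer.dequantize(x_q, scale, zero_point)

# With 8 bits the reconstruction error should be tiny compared to the value range
print("max abs error:", (x - x_rec).abs().max().item())
print("value range  :", (x.max() - x.min()).item())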

2. The GPTQ Algorithm

class GPTQQuantizer:
    """
    GPTQ (Generative Pretrained Transformer Quantization)
    Layer-by-layer quantization guided by approximate second-order information.
    The per-group round-to-nearest below is a simplified stand-in for the full
    Hessian-based weight update used by the original algorithm.
    """
    def __init__(self, model, bit_width=4):
        self.model = model
        self.bit_width = bit_width
        self.quant_stats = {}

    def quantize_layer(self, layer, weight_name: str):
        """
        Quantize a single layer's weight matrix, group by group.
        """
        weight = layer.state_dict()[weight_name].clone()

        # Linear weights are stored as (out_features, in_features)
        out_features, in_features = weight.shape

        # Group size: 4-bit quantization uses one scale per group of 128 rows
        if self.bit_width == 4:
            group_size = 128
        else:
            group_size = out_features  # a single per-tensor scale

        num_groups = (out_features + group_size - 1) // group_size

        # Fake quantization: store the dequantized values back in place
        quantized_weights = torch.zeros_like(weight)

        for g in range(num_groups):
            start = g * group_size
            end = min((g + 1) * group_size, out_features)

            w_g = weight[start:end, :]

            # Symmetric scale for this group
            scale = w_g.abs().max() / (2 ** (self.bit_width - 1) - 1)

            # Round to the nearest level and clamp to the signed range
            w_q = torch.round(w_g / scale)
            w_q = torch.clamp(w_q, -(2 ** (self.bit_width - 1)), 2 ** (self.bit_width - 1) - 1)

            quantized_weights[start:end, :] = w_q * scale

        return quantized_weights

    def calibrate(self, calibration_data):
        """
        Calibration: run a small amount of data through the model so that
        the statistics needed for quantization can be collected.
        """
        self.model.eval()

        with torch.no_grad():
            for batch in calibration_data:
                self.model(batch)

    def quantize_model(self):
        """
        Quantize every Linear layer in the model.
        """
        quantized_state_dict = {}

        for name, module in self.model.named_modules():
            if isinstance(module, torch.nn.Linear):
                # Quantize the linear layer's weight
                quantized_weight = self.quantize_layer(module, 'weight')
                quantized_state_dict[name + '.weight'] = quantized_weight

        return quantized_state_dict
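
A short usage sketch of the simplified GPTQQuantizer: applying it to a toy model and inspecting the result. The toy model and its layer sizes are placeholders chosen only for illustration.

# Illustrative run on a toy model (layer sizes are arbitrary placeholders)
toy_model = torch.nn.Sequential(
    torch.nn.Linear(512, 1024),
    torch.nn.ReLU(),
    torch.nn.Linear(1024, 512),
)

gptq = GPTQQuantizer(toy_model, bit_width=4)
quantized_sd = gptq.quantize_model()

# Each quantized weight keeps its shape, but within a 128-row group the values
# collapse onto at most 2**4 quantization levels
for name, w in quantized_sd.items():
    print(name, tuple(w.shape), "levels in first group:", w[:128].unique().numel())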

3. The AWQ Algorithm

class AWQQuantizer:
    """
    AWQ (Activation-aware Weight Quantization)
    Weight quantization that takes the activation distribution into account.
    """
    def __init__(self, model, bit_width=4):
        self.model = model
        self.bit_width = bit_width

    def find_best_scale(self, weight, activations, n_grid=128):
        """
        Grid-search the quantization scale that minimizes the weight
        reconstruction error. (`activations` is accepted for activation-aware
        weighting but is not used in this simplified search.)
        """
        best_scale = None
        best_error = float('inf')

        qmin = -(2 ** (self.bit_width - 1))
        qmax = 2 ** (self.bit_width - 1) - 1

        # Maximum absolute weight value, used as the base of the clipping range
        weight_absmax = weight.abs().max()

        for scale_percent in np.linspace(0.5, 2.0, n_grid):
            scale = weight_absmax * scale_percent / qmax

            # Quantize, clamp, then dequantize
            w_q = torch.round(weight / scale)
            w_q = torch.clamp(w_q, qmin, qmax) * scale

            # Mean squared reconstruction error
            error = ((weight - w_q) ** 2).mean()

            if error < best_error:
                best_error = error
                best_scale = scale

        return best_scale

    def quantize_with_calibration(self, calibration_loader):
        """
        Collect per-layer activation statistics from calibration data.
        """
        act_scales = {}

        def hook_fn(module, input, output):
            if isinstance(input[0], torch.Tensor):
                key = id(module)
                if key not in act_scales:
                    act_scales[key] = []
                act_scales[key].append(input[0].detach().abs().max())

        # Register a forward hook on every Linear layer
        handles = []
        for name, module in self.model.named_modules():
            if isinstance(module, torch.nn.Linear):
                h = module.register_forward_hook(hook_fn)
                handles.append(h)

        # Run the calibration data through the model
        self.model.eval()
        with torch.no_grad():
            for batch in calibration_loader:
                self.model(batch)

        # Remove the hooks
        for h in handles:
            h.remove()

        # Average the observed per-batch activation maxima into one scale per layer
        for key in act_scales:
            act_scales[key] = torch.stack(act_scales[key]).mean()

        return act_scales
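
A quick illustration of the grid search in find_best_scale on a random weight matrix; the Linear layer below is a stand-in, and activations are omitted because this simplified search does not use them.

# Illustrative scale search on a randomly initialized layer (placeholder data)
dummy_layer = torch.nn.Linear(256, 256)
awq = AWQQuantizer(dummy_layer, bit_width=4)

best_scale = awq.find_best_scale(dummy_layer.weight.data, activations=None)
print("selected scale:", best_scale.item())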

Quantization in Practice with Transformers

1. BitsAndBytes Configuration

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

class LLMQuantizer:
    """
    Quantized model loading with HuggingFace BitsAndBytes.
    """
    def __init__(self, model_path: str):
        self.model_path = model_path

    def load_int8_model(self):
        """
        Load a model with INT8 quantization.
        """
        quant_config = BitsAndBytesConfig(
            load_in_8bit=True,
            llm_int8_threshold=6.0,              # outlier threshold
            llm_int8_has_fp16_weight=False,
            llm_int8_skip_modules=None,          # modules to keep unquantized
            llm_int8_enable_fp32_cpu_offload=False
        )

        model = AutoModelForCausalLM.from_pretrained(
            self.model_path,
            quantization_config=quant_config,
            device_map="auto",
            trust_remote_code=True
        )

        return model

    def load_int4_model(self):
        """
        Load a model with INT4 quantization (the QLoRA NF4 scheme).
        """
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",           # NormalFloat4
            bnb_4bit_use_double_quant=True,      # double quantization
            bnb_4bit_quant_storage=torch.uint8
        )

        model = AutoModelForCausalLM.from_pretrained(
            self.model_path,
            quantization_config=quant_config,
            device_map="auto",
            trust_remote_code=True
        )

        return model

    def get_memory_footprint(self, model):
        """
        Report the model's memory footprint.
        """
        param_size = 0
        buffer_size = 0

        for param in model.parameters():
            param_size += param.nelement() * param.element_size()

        for buffer in model.buffers():
            buffer_size += buffer.nelement() * buffer.element_size()

        total_size = param_size + buffer_size
        return {
            "total_mb": total_size / 1024 / 1024,
            "total_gb": total_size / 1024 / 1024 / 1024,
            "param_size_mb": param_size / 1024 / 1024
        }
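
A usage sketch of the loader above: load a checkpoint in 4-bit and report its footprint. The model path is only a placeholder; substitute whatever causal LM checkpoint you actually deploy, and note that 4-bit loading requires the bitsandbytes package and a CUDA GPU.

# Example: load a checkpoint in 4-bit and report its memory footprint
# (the model path is a placeholder -- replace it with your own checkpoint)
quantizer = LLMQuantizer("meta-llama/Llama-2-7b-hf")

model_4bit = quantizer.load_int4_model()
footprint = quantizer.get_memory_footprint(model_4bit)
print(f"4-bit model size: {footprint['total_gb']:.2f} GB")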

2. GGML/llama.cpp Quantization

# Quantize a model with llama.cpp
# 1. Convert the checkpoint to GGUF format
python3 convert.py models/llama-7b --outfile models/llama-7b/ggml-model-f16.gguf

# 2. Quantize the GGUF file to 4-bit
./quantize models/llama-7b/ggml-model-f16.gguf \
    models/llama-7b/ggml-model-q4_0.gguf q4_0
class GGMLQuantizer:
    """
    GGML/GGUF quantization configuration.
    """
    QUANT_TYPES = {
        "q4_0": {
            "bits": 4,
            "description": "4-bit quantization, legacy scheme",
            "memory_factor": 4.5
        },
        "q4_1": {
            "bits": 4,
            "description": "4-bit quantization with an extra scale",
            "memory_factor": 4.7
        },
        "q5_0": {
            "bits": 5,
            "description": "5-bit quantization",
            "memory_factor": 5.5
        },
        "q5_1": {
            "bits": 5,
            "description": "5-bit quantization with an extra scale",
            "memory_factor": 5.7
        },
        "q8_0": {
            "bits": 8,
            "description": "8-bit quantization, near lossless",
            "memory_factor": 8.5
        },
        "f16": {
            "bits": 16,
            "description": "half precision",
            "memory_factor": 2.0
        }
    }

    def quantize_model(self, input_path: str, output_path: str,
                       quant_type: str = "q4_0"):
        """
        Shell out to the llama.cpp quantize binary.
        """
        import subprocess

        cmd = [
            "./llama.cpp/quantize",
            input_path,
            output_path,
            quant_type
        ]

        result = subprocess.run(cmd, capture_output=True, text=True)
        return result.returncode == 0
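
As a usage example, the wrapper can be pointed at the GGUF file produced by the conversion step above; the paths mirror the earlier shell commands and are placeholders.

# Example: produce a q5_1 variant of the converted GGUF file (paths are placeholders)
ggml = GGMLQuantizer()
ok = ggml.quantize_model(
    "models/llama-7b/ggml-model-f16.gguf",
    "models/llama-7b/ggml-model-q5_1.gguf",
    quant_type="q5_1",
)
print("quantization succeeded:", ok)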

Quantization Performance Comparison

Memory and Performance Comparison

| Method | Precision | 7B model size | Relative accuracy | Inference speed |
|--------|-----------|---------------|-------------------|-----------------|
| FP16   | 16-bit    | ~14 GB        | 100%              | 1x              |
| INT8   | 8-bit     | ~7 GB         | ~98%              | 1.5x            |
| Q4_0   | 4-bit     | ~3.5 GB       | ~96%              | 2.2x            |
| Q4_K_S | 4-bit     | ~3.9 GB       | ~97%              | 2.0x            |
| Q5_K_M | 5-bit     | ~4.5 GB       | ~98%              | 1.8x            |
| Q8_0   | 8-bit     | ~6.5 GB       | ~99%              | 1.6x            |
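
The sizes in the table follow almost directly from bits per weight: a model needs roughly parameters x bits / 8 bytes, plus a small overhead for group scales, embeddings, and buffers. A rough back-of-the-envelope check:

def estimate_model_size_gb(num_params: float, bits_per_weight: float) -> float:
    """Rough size estimate: parameters x bits / 8 bytes, ignoring scale overhead."""
    return num_params * bits_per_weight / 8 / 1e9

# 7B parameters at different precisions (decimal GB, close to the table above)
for name, bits in [("FP16", 16), ("INT8", 8), ("INT4", 4)]:
    print(f"{name}: {estimate_model_size_gb(7e9, bits):.1f} GB")
# FP16: 14.0 GB, INT8: 7.0 GB, INT4: 3.5 GB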

Implementing Quantization Calibration

class CalibrationDataset:
    """
    Calibration dataset for quantization.
    """
    def __init__(self, tokenizer, max_samples=512):
        self.tokenizer = tokenizer
        self.max_samples = max_samples
        self.data = []

    def add_samples(self, texts: list):
        """Add calibration samples."""
        for text in texts:
            encoded = self.tokenizer(
                text,
                truncation=True,
                max_length=2048,
                return_tensors="pt"
            )
            self.data.append(encoded)

    def __iter__(self):
        """Yield calibration batches, capped at max_samples."""
        for sample in self.data[:self.max_samples]:
            yield sample['input_ids']

    def __len__(self):
        return min(len(self.data), self.max_samples)


def evaluate_quantized_model(model, quantizer, test_data):
    """
    Evaluate a quantized model: task accuracy plus memory footprint.
    """
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in test_data:
            outputs = model(batch)
            # Task-specific accuracy computation goes here, updating
            # `correct` and `total`
            # ...

    accuracy = correct / total if total > 0 else 0.0

    # Memory footprint of the quantized model
    memory = quantizer.get_memory_footprint(model)

    return {
        "accuracy": accuracy,
        "memory_gb": memory["total_gb"]
    }
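
To tie the pieces together, here is a brief sketch of feeding the calibration dataset into the GPTQ calibration pass defined earlier. The tokenizer name and sample texts are placeholders, and `model` stands for whichever causal LM is being quantized.

from transformers import AutoTokenizer

# Placeholder tokenizer and texts -- in practice use the target model's tokenizer
# and a few hundred samples drawn from its deployment domain
tokenizer = AutoTokenizer.from_pretrained("gpt2")
calib = CalibrationDataset(tokenizer, max_samples=128)
calib.add_samples([
    "Quantization reduces the precision of model weights.",
    "Calibration data should resemble the deployment distribution.",
])

# `model` is assumed to be the causal LM being quantized
gptq = GPTQQuantizer(model, bit_width=4)
gptq.calibrate(calib)      # forwards each input_ids batch through the model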

Summary

Model quantization is a key technology for LLM deployment. INT8 and INT4 quantization shrink a model to roughly one half and one quarter of its FP16 size (a 4-8x reduction compared with FP32) while retaining most of its quality. Advanced algorithms such as GPTQ and AWQ stay accurate even at very low bit widths. This article has covered the underlying theory along with working code, so that developers can choose the right trade-off when deploying large models.
