ChatGLM中文大模型:技术原理与产业应用

🎙️ 语音朗读 当前: 晓晓 (温柔女声)

引言

ChatGLM是由清华大学KEG实验室和智谱AI联合开发的中英双语大语言模型。作为国产开源大模型的代表,ChatGLM在中文理解与生成方面表现出色,并已广泛应用于工业界。本文将深入分析ChatGLM的技术架构、训练方法和部署实践。

ChatGLM技术架构

1. GLM核心设计

ChatGLM基于通用语言模型(General Language Model, GLM)架构,这是一种结合了自编码和自回归优势的混合目标函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import torch
import torch.nn as nn
from transformers import PretrainedConfig

class GLMAttention(nn.Module):
    """
    GLM self-attention with rotary position embeddings (RoPE).

    A fused linear layer produces Q/K/V; RoPE is applied to Q and K using
    the supplied position ids, and the attention output is projected back
    to the hidden size.
    """
    def __init__(self, config):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads

        # Rotary position embedding, applied per attention head.
        self.rotary_emb = RotaryEmbedding(self.head_dim)

        # Fused Q/K/V projection followed by the output projection.
        self.query_key_value = nn.Linear(
            config.hidden_size,
            3 * config.hidden_size
        )
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)

    def forward(self, hidden_states, position_ids, attention_mask=None):
        """
        Args:
            hidden_states: (B, L, hidden_size) activations.
            position_ids: positions consumed by the rotary embedding.
            attention_mask: optional additive mask broadcast onto the
                attention scores.

        Returns:
            (B, L, hidden_size) attention output.
        """
        B, L, _ = hidden_states.shape

        # Project to Q/K/V: (B, L, 3, H, D), then split along dim 2.
        qkv = self.query_key_value(hidden_states)
        qkv = qkv.reshape(B, L, 3, self.num_heads, self.head_dim)
        q, k, v = qkv.unbind(2)

        # Apply RoPE to queries and keys.
        q, k = self.rotary_emb(q, k, position_ids)

        # BUG FIX: move the head axis next to batch — (B, H, L, D) — before
        # scoring. The original left tensors as (B, L, H, D), so the matmul
        # attended over the *head* axis (scores of shape (B, L, H, H))
        # instead of the sequence axis.
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)

        attn_output = self._attn(q, k, v, attention_mask)

        # BUG FIX: merge heads back to (B, L, hidden_size) so the output
        # projection sees the full hidden dimension.
        attn_output = attn_output.transpose(1, 2).reshape(B, L, self.hidden_size)

        return self.dense(attn_output)

    def _attn(self, q, k, v, attention_mask):
        """Scaled dot-product attention over (B, H, L, D) tensors."""
        scale = 1.0 / (self.head_dim ** 0.5)
        attn_weights = torch.matmul(q, k.transpose(-2, -1)) * scale

        if attention_mask is not None:
            # Additive mask (e.g. large negatives at disallowed positions).
            attn_weights = attn_weights + attention_mask

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
        return torch.matmul(attn_weights, v)

class GLMBlock(nn.Module):
    """
    One GLM transformer layer: a self-attention sub-layer and an MLP
    sub-layer, each wrapped with a residual connection. The MLP input is
    layer-normalized; the attention input is not.
    """
    def __init__(self, config):
        super().__init__()
        self.attention = GLMAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config.hidden_size, config.intermediate_size * 4),
            nn.GELU(),
            nn.Linear(config.intermediate_size * 4, config.hidden_size),
        )
        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size)

    def forward(self, hidden_states, position_ids, attention_mask=None):
        # Attention sub-layer with residual connection.
        hidden_states = hidden_states + self.attention(
            hidden_states, position_ids, attention_mask
        )

        # Feed-forward sub-layer: normalize, transform, add back.
        normed = self.post_attention_layernorm(hidden_states)
        return hidden_states + self.mlp(normed)

2. Prefix LM训练范式

ChatGLM采用Prefix LM训练范式,支持不同注意力模式:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
class PrefixLMConfig:
    """Hyper-parameters for the Prefix-LM variant of ChatGLM."""

    def __init__(self):
        # Model size, prefix length (P-Tuning style), and parallelism knobs.
        defaults = {
            "num_layers": 28,
            "hidden_size": 4096,
            "num_attention_heads": 32,
            "pre_seq_len": 128,  # length of the learned prefix sequence
            "virtual_pipeline_parallel_size": 1,
        }
        for name, value in defaults.items():
            setattr(self, name, value)

class PrefixLMModel(nn.Module):
    """
    Prefix LM model: supports a mix of bidirectional and unidirectional
    attention (prefix tokens attend bidirectionally, the rest causally).
    """
    def __init__(self, config):
        super().__init__()
        self.num_layers = config.num_layers
        self.prefix_tokens = config.pre_seq_len

        # Token embedding table (vocab_size x hidden_size).
        self.embedding = nn.Embedding(config.vocab_size, config.hidden_size)

        # Prefix encoder: MLP producing num_layers * 2 * hidden_size values
        # per position — presumably per-layer key/value prefixes (P-Tuning v2
        # style); verify against how the output is consumed (it is not used
        # in this forward()).
        self.prefix_encoder = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.Tanh(),
            nn.Linear(config.hidden_size, config.num_layers * 2 * config.hidden_size)
        )

        # Stacked transformer blocks.
        self.layers = nn.ModuleList([
            GLMBlock(config) for _ in range(config.num_layers)
        ])

    def get_position_ids(self, input_ids, seq_len, past_key_values_length=0):
        """
        Generate position ids (`seq_len` is accepted but never read).

        NOTE(review): this method looks broken as written —
        `type_as(mask)` casts the cumulative counts back to bool, and
        `Tensor.where(condition)` without a second tensor raises a
        TypeError in standard PyTorch. Confirm the intended GLM position
        scheme before relying on it.
        """
        mask = input_ids == 150_004  # [MASK] token — presumably ChatGLM's vocab id; confirm against tokenizer
        positions = torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length - 1
        positions = positions - positions.where(~mask).max(-1, keepdim=True)[0] + past_key_values_length
        return positions

    def forward(self, input_ids, attention_mask=None):
        """
        Embed tokens, build the prefix attention mask and position ids,
        then apply each transformer layer in sequence.

        NOTE(review): the `attention_mask` argument is ignored — the
        prefix mask from get_prefix_attention is passed instead.
        """
        B, L = input_ids.shape

        # Token embeddings.
        hidden_states = self.embedding(input_ids)

        # Prefix attention mask (see get_prefix_attention).
        prefix_attention = self.get_prefix_attention(input_ids)

        # Position ids consumed by the attention layers.
        position_ids = self.get_position_ids(input_ids, L)

        # Run the transformer stack.
        for layer in self.layers:
            hidden_states = layer(
                hidden_states,
                position_ids,
                attention_mask=prefix_attention
            )

        return hidden_states

    def get_prefix_attention(self, input_ids):
        """
        Build a mask marking [MASK]-token positions, shaped (B, 1, 1, L)
        for broadcasting over attention scores.

        NOTE(review): the returned tensor is boolean and is *added* to
        attention logits downstream, adding +1 at marked positions rather
        than blocking attention — confirm this is intentional.
        """
        B, L = input_ids.shape
        # Prefix tokens attend bidirectionally; the rest is unidirectional.
        prefix_mask = (input_ids == 150_004).unsqueeze(1).unsqueeze(2)
        return prefix_mask

ChatGLM训练流程

1. 预训练阶段

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
class ChatGLMPretraining:
    """
    Pre-training setup for ChatGLM: optimizer and training
    hyper-parameters plus the token-level language-modeling loss.
    """
    def __init__(self):
        # Optimizer settings (AdamW-style betas, warmup schedule).
        self.optimizer_config = {
            "learning_rate": 1e-4,
            "weight_decay": 0.1,
            "beta1": 0.9,
            "beta2": 0.999,
            "warmup_steps": 2000
        }
        # Batch / sequence / schedule settings.
        self.training_config = {
            "batch_size": 8,
            "sequence_length": 2048,
            "gradient_accumulation_steps": 16,
            "max_steps": 100_000
        }

    def compute_loss(self, model, batch):
        """
        Compute the causal language-modeling loss for one batch.

        Logits and labels are shifted by one position so each token is
        predicted from the tokens before it, then cross-entropy is taken
        over the flattened vocabulary dimension.
        """
        logits = model(batch['input_ids']).logits

        # Drop the last logit and the first label to align predictions
        # with their next-token targets.
        pred = logits[..., :-1, :].contiguous()
        target = batch['labels'][..., 1:].contiguous()

        criterion = nn.CrossEntropyLoss()
        return criterion(pred.view(-1, pred.size(-1)), target.view(-1))

2. SFT监督微调

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
class ChatGLMSFT:
    """
    Supervised fine-tuning (SFT) for ChatGLM.

    Wraps prompt formatting, batch collation, and a Hugging Face
    Trainer-based training loop.
    """
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def format_prompt(self, conversation):
        """
        Render a multi-turn conversation into ChatGLM's dialogue format.

        Each user turn becomes "[Round N]\\n问:...\\n"; every other turn
        becomes "答:...\\n". A trailing "答:" cues the model to generate
        the next assistant reply.
        """
        prompt = ""
        for turn in conversation:
            role = turn['role']
            content = turn['content']
            if role == 'user':
                prompt += f"[Round {turn['round']}]\n问:{content}\n"
            else:
                prompt += f"答:{content}\n"
        prompt += "答:"
        return prompt

    def sft_collate_fn(self, batch):
        """
        Tokenize and collate a batch of conversations into model inputs.

        Returns input_ids, attention_mask and labels. BUG FIX: padding
        positions are set to -100 in the labels so that
        nn.CrossEntropyLoss ignores them — previously labels were a raw
        clone of input_ids and the model was trained to predict pad
        tokens.
        """
        prompts = [self.format_prompt(conv) for conv in batch['conversation']]

        encoded = self.tokenizer(
            prompts,
            padding=True,
            truncation=True,
            max_length=2048,
            return_tensors='pt'
        )

        labels = encoded['input_ids'].clone()
        # -100 is the default ignore_index of nn.CrossEntropyLoss.
        labels[encoded['attention_mask'] == 0] = -100

        return {
            'input_ids': encoded['input_ids'],
            'attention_mask': encoded['attention_mask'],
            'labels': labels
        }

    def train(self, train_dataset, output_dir):
        """Run a standard Hugging Face Trainer fine-tuning loop."""
        from transformers import Trainer, TrainingArguments

        training_args = TrainingArguments(
            output_dir=output_dir,
            per_device_train_batch_size=4,
            gradient_accumulation_steps=4,
            learning_rate=1e-5,
            num_train_epochs=3,
            logging_steps=10,
            save_steps=1000,
            save_total_limit=3,
            fp16=True,
            dataloader_num_workers=4
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            data_collator=self.sft_collate_fn
        )

        trainer.train()

3. PPO强化学习训练

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
class ChatGLMPPOTrainer:
    """
    RLHF (PPO) training loop scaffold for ChatGLM.

    Holds a frozen reference model (for the KL penalty), a reward model,
    and the policy model being optimized.
    """
    def __init__(self, ref_model, reward_model, ppo_config, model=None):
        # BUG FIX: compute_kl_penalty referenced self.model, but the policy
        # model was never stored — every call raised AttributeError. It is
        # now accepted as an optional trailing parameter so existing
        # 3-argument callers keep working.
        self.ref_model = ref_model
        self.reward_model = reward_model
        self.ppo_config = ppo_config
        self.model = model

    def compute_reward(self, response, prompt):
        """
        Score the concatenated prompt+response with the reward model.
        """
        full_text = prompt + response
        reward_score = self.reward_model(full_text)
        return reward_score

    def compute_kl_penalty(self, response, prompt):
        """
        KL-divergence penalty against the reference model, used to keep
        the policy from drifting too far (over-optimization).
        """
        with torch.no_grad():
            ref_log_probs = self.ref_model(prompt + response).log_probs
            new_log_probs = self.model(prompt + response).log_probs
            return ref_log_probs - new_log_probs

    def ppo_step(self, prompts, responses):
        """
        One PPO update step: build KL-penalized rewards for each
        prompt/response pair. The policy-gradient update itself is not
        implemented here.
        """
        rewards = []
        kl_penalties = []

        for prompt, response in zip(prompts, responses):
            reward = self.compute_reward(response, prompt)
            kl = self.compute_kl_penalty(response, prompt)
            rewards.append(reward - self.ppo_config.kl_coef * kl)
            kl_penalties.append(kl)

        # PPO policy update (clipped surrogate objective) would follow here.

ChatGLM-6B部署实践

1. 量化部署

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import torch

from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig

class ChatGLMDeployer:
    """
    Loads a ChatGLM checkpoint with optional INT4/INT8 quantization and
    exposes simple chat / streaming-chat helpers.
    """
    def __init__(self, model_path, quantize_mode='int4'):
        """
        Args:
            model_path: Hugging Face model id or local checkpoint path.
            quantize_mode: 'int4', 'int8', or anything else for full
                precision.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

        load_kwargs = {
            "trust_remote_code": True,
            "device_map": "auto"
        }

        # BUG FIX: BitsAndBytesConfig was used here without ever being
        # imported (NameError at load time); it is now imported from
        # transformers at the top of the file.
        if quantize_mode == 'int4':
            # INT4 quantization: largest VRAM reduction.
            load_kwargs["quantization_config"] = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16
            )
        elif quantize_mode == 'int8':
            load_kwargs["quantization_config"] = BitsAndBytesConfig(
                load_in_8bit=True
            )

        self.model = AutoModel.from_pretrained(model_path, **load_kwargs)

    def chat(self, query, history=None, max_length=2048):
        """
        Single-turn chat. Returns (response, updated_history); a None
        history is treated as an empty conversation.
        """
        response, history = self.model.chat(
            self.tokenizer,
            query,
            history=history or [],
            max_length=max_length
        )
        return response, history

    def stream_chat(self, query, history=None):
        """
        Streaming chat: yields the partial response text produced at each
        generation step.
        """
        for response, history in self.model.stream_chat(
            self.tokenizer,
            query,
            history=history or []
        ):
            yield response

2. OpenAI兼容API

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
from fastapi import FastAPI
from pydantic import BaseModel
import uvicorn

# Single FastAPI application exposing an OpenAI-style chat endpoint.
app = FastAPI(title="ChatGLM API")

class ChatRequest(BaseModel):
    """Request body for /v1/chat/completions."""
    # The user's prompt text.
    prompt: str
    # Prior conversation turns — presumably the (query, response) pair
    # format ChatGLM's chat() expects; verify against the deployer.
    history: list = []
    # Maximum generated length and sampling temperature.
    max_length: int = 2048
    temperature: float = 0.7

@app.post("/v1/chat/completions")
async def chat_completions(request: ChatRequest):
    """
    Minimal OpenAI-compatible chat completion endpoint.

    NOTE: `chatglm` is a module-level deployer instance — presumably
    created at startup; it is not defined in this file. Token usage is
    approximated by character counts, not tokenizer counts.
    """
    answer, _ = chatglm.chat(
        request.prompt,
        request.history,
        request.max_length
    )

    message = {
        "role": "assistant",
        "content": answer
    }
    usage = {
        "prompt_tokens": len(request.prompt),
        "completion_tokens": len(answer),
        "total_tokens": len(request.prompt) + len(answer)
    }
    return {
        "choices": [{
            "message": message,
            "finish_reason": "stop",
            "index": 0
        }],
        "usage": usage
    }

@app.get("/health")
async def health():
    """Liveness probe: reports the service as up."""
    payload = {"status": "ok"}
    return payload

# Launch the API server when run as a script.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)

ChatGLM产业应用

智能客服系统

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
class ChatGLMCustomerService:
    """
    Retrieval-augmented customer-service bot backed by ChatGLM: fetches
    relevant knowledge-base documents and answers with them in context.
    """
    def __init__(self, model_path):
        self.llm = ChatGLMDeployer(model_path)
        # NOTE(review): `vector_db` is a module-level knowledge-base handle
        # that is not defined in this file — confirm where it is created.
        self.context_db = vector_db

    def retrieve_context(self, query):
        """
        Return the top-3 most similar knowledge-base documents as a
        single newline-joined string.
        """
        hits = self.context_db.similarity_search(query, k=3)
        return "\n".join(doc.page_content for doc in hits)

    def generate_response(self, query, user_id):
        """
        Answer `query` grounded in retrieved knowledge.
        (`user_id` is accepted but currently unused.)
        """
        knowledge = self.retrieve_context(query)

        # Knowledge-grounded prompt template.
        prompt = f"""基于以下知识回答用户问题:

知识库内容:
{knowledge}

用户问题:{query}

回答:"""

        answer, _ = self.llm.chat(prompt)
        return answer

总结

ChatGLM作为国产大模型的优秀代表,在中文理解与生成方面展现了强大的能力。通过GLM架构创新、Prefix LM训练范式和完整的RLHF流程,ChatGLM实现了卓越的性能。开源的ChatGLM-6B更是降低了中文大模型的应用门槛,推动了国内AI生态的快速发展。

参考资源

© 2019-2026 ovo$^{mc^2}$ All Rights Reserved. | 站点总访问 28969 次 | 访客 19045
Theme by hiero