Qwen: Alibaba's Tongyi LLM Ecosystem and Practice


Introduction

In 2023, Alibaba Cloud released the Tongyi Qianwen (Qwen) family of large language models, a major open-source push by one of China's tech giants. Thanks to strong Chinese-language capability and an open ecosystem, Qwen quickly became a popular choice in the open-source community. This article takes a close look at Qwen's technical architecture, training methodology, and deployment practice.

Qwen Technical Architecture

1. Model Architecture Design

Qwen adopts a LLaMA-style decoder-only Transformer architecture with several optimizations:

import torch
import torch.nn as nn
import math

class QwenAttention(nn.Module):
    """
    Qwen attention module.
    Uses Flash Attention (scaled_dot_product_attention) when available.
    """
    def __init__(self, config):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.max_position_embeddings = config.max_position_embeddings

        # Rotary position embedding (RoPE)
        self.rotary_emb = QwenRotaryEmbedding(
            dim=self.head_dim,
            max_position_embeddings=self.max_position_embeddings
        )

        # Fused QKV projection
        self.qkv_proj = nn.Linear(
            self.hidden_size,
            3 * self.hidden_size,
            bias=False
        )
        self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)

    def forward(self, x, attention_mask=None, position_ids=None):
        B, T, C = x.shape

        # QKV projection
        qkv = self.qkv_proj(x)
        qkv = qkv.reshape(B, T, 3, self.num_heads, self.head_dim)
        qkv = qkv.permute(2, 0, 3, 1, 4)  # [3, B, H, T, D]
        q, k, v = qkv[0], qkv[1], qkv[2]

        # Apply rotary position embedding
        q, k = self.rotary_emb(q, k, position_ids)

        # Flash Attention path (PyTorch >= 2.0)
        if hasattr(torch.nn.functional, 'scaled_dot_product_attention'):
            attn_output = torch.nn.functional.scaled_dot_product_attention(
                q, k, v,
                attn_mask=attention_mask,
                dropout_p=0.1 if self.training else 0.0
            )
        else:
            # Fallback implementation
            attn_output = self._fallback_attention(q, k, v, attention_mask)

        # Merge heads: [B, H, T, D] -> [B, T, C]
        attn_output = attn_output.transpose(1, 2).reshape(B, T, C)
        return self.o_proj(attn_output)

    def _fallback_attention(self, q, k, v, mask):
        scale = 1.0 / math.sqrt(self.head_dim)
        attn = torch.matmul(q, k.transpose(-2, -1)) * scale

        if mask is not None:
            attn = attn + mask

        attn = nn.functional.softmax(attn, dim=-1)
        return torch.matmul(attn, v)

class QwenRotaryEmbedding(nn.Module):
    """
    Rotary position embedding (RoPE) as used by Qwen.
    """
    def __init__(self, dim, max_position_embeddings=2048, base=10000):
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq)

    def forward(self, q, k, position_ids=None):
        if position_ids is None:
            T = q.shape[2]
            position_ids = torch.arange(T, device=q.device).unsqueeze(0)

        # Per-position rotation angles (assumes positions are shared across the batch)
        position_ids = position_ids.float()
        freqs = torch.einsum("i,j->ij", position_ids[0], self.inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)  # [T, D]

        # cos/sin tables, broadcast over batch and head dimensions
        cos = emb.cos()[None, None, :, :]
        sin = emb.sin()[None, None, :, :]

        # Rotate Q and K
        q_embed = self._apply_rotary(q, cos, sin)
        k_embed = self._apply_rotary(k, cos, sin)

        return q_embed, k_embed

    def _apply_rotary(self, x, cos, sin):
        # rotate_half: (x1, x2) -> (-x2, x1) on the two halves of each head vector
        x1, x2 = x[..., :x.shape[-1] // 2], x[..., x.shape[-1] // 2:]
        rotated = torch.cat((-x2, x1), dim=-1)
        return x * cos + rotated * sin
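
For reference, the rotation that `_apply_rotary` implements is the standard RoPE identity:

$$\hat{q}_m = q_m \odot \cos(m\theta) + \operatorname{rotate\_half}(q_m) \odot \sin(m\theta), \qquad \theta_j = 10000^{-2j/d},$$

where $m$ is the token position, $d$ the head dimension, and $\operatorname{rotate\_half}$ maps $(x_1, x_2)$ to $(-x_2, x_1)$ on the two halves of each head vector. Because the rotation depends only on position differences, attention scores become a function of relative position.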

2. SwiGLU Activation and Gating

class QwenMLP(nn.Module):
    """
    Qwen feed-forward network with the SwiGLU activation.
    """
    def __init__(self, config):
        super().__init__()
        hidden_size = config.hidden_size
        intermediate_size = config.intermediate_size

        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)

    def forward(self, x):
        # SwiGLU: gate the up-projection with SiLU, then project back down
        return self.down_proj(nn.functional.silu(self.gate_proj(x)) * self.up_proj(x))
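
In formula form, this feed-forward block computes

$$\mathrm{FFN}(x) = \big(\mathrm{SiLU}(x W_{\text{gate}}) \odot x W_{\text{up}}\big)\, W_{\text{down}}, \qquad \mathrm{SiLU}(z) = z \cdot \sigma(z),$$

a gated variant of the usual two-layer MLP: the SiLU-activated gate modulates each intermediate channel multiplicatively before the down projection.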

3. QwenBlock Implementation

class QwenBlock(nn.Module):
    """
    Qwen Transformer block (pre-norm, RMSNorm).
    """
    def __init__(self, config):
        super().__init__()
        self.attention = QwenAttention(config)
        self.mlp = QwenMLP(config)

        # nn.RMSNorm is available in PyTorch >= 2.4
        self.input_layernorm = nn.RMSNorm(
            config.hidden_size,
            eps=config.layer_norm_epsilon
        )
        self.post_attention_layernorm = nn.RMSNorm(
            config.hidden_size,
            eps=config.layer_norm_epsilon
        )

    def forward(self, x, attention_mask=None, position_ids=None):
        # Self-attention + residual connection
        residual = x
        x = self.input_layernorm(x)
        x = self.attention(x, attention_mask, position_ids)
        x = residual + x

        # Feed-forward network + residual connection
        residual = x
        x = self.post_attention_layernorm(x)
        x = residual + self.mlp(x)

        return x
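
To make the structure concrete, here is a minimal sketch of how these blocks compose into a full decoder-only LM. The `QwenConfig` values are illustrative defaults in the spirit of a 7B-class model, and `QwenModel` is our own name for the sketch, not the official implementation:

import torch.nn as nn
from dataclasses import dataclass

@dataclass
class QwenConfig:
    # Illustrative values roughly matching a 7B-class model
    vocab_size: int = 151936
    hidden_size: int = 4096
    num_attention_heads: int = 32
    num_hidden_layers: int = 32
    intermediate_size: int = 11008
    max_position_embeddings: int = 8192
    layer_norm_epsilon: float = 1e-6

class QwenModel(nn.Module):
    """Minimal decoder-only LM assembled from QwenBlock."""
    def __init__(self, config):
        super().__init__()
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
        self.layers = nn.ModuleList(
            QwenBlock(config) for _ in range(config.num_hidden_layers)
        )
        self.norm = nn.RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

    def forward(self, input_ids, attention_mask=None, position_ids=None):
        x = self.embed_tokens(input_ids)
        for layer in self.layers:
            x = layer(x, attention_mask, position_ids)
        return self.lm_head(self.norm(x))  # logits: [B, T, vocab_size]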

The Qwen Model Family

| Model     | Parameters | Context window | Highlights              |
|-----------|------------|----------------|-------------------------|
| Qwen-1.8B | 1.8B       | 2K             | Lightweight deployment  |
| Qwen-7B   | 7B         | 2K/8K          | Mainstream version      |
| Qwen-14B  | 14B        | 2K/8K          | High performance        |
| Qwen-72B  | 72B        | 2K/8K          | Flagship version        |
| Qwen-Chat | -          | 2K             | Dialogue-tuned variants |
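
All of these checkpoints are published under the `Qwen` organization on Hugging Face (and on ModelScope), and every size loads through the same interface; the repo ID below is one of the public Hugging Face names:

from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen-7B-Chat"  # or Qwen-1_8B, Qwen-14B, Qwen-72B, ...
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    device_map="auto"  # spread layers across available GPUs
)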

Qwen Training Techniques

1. Pre-training Data Processing

from typing import List
from datasets import Dataset

class QwenDataProcessor:
    """
    Pre-training / SFT data processing for Qwen.
    """
    def __init__(self, vocab_file):
        from transformers import AutoTokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            vocab_file,
            trust_remote_code=True
        )

    def prepare_training_data(self, text_files: List[str]) -> Dataset:
        """
        Prepare pre-training data.
        """
        all_texts = []

        for file_path in text_files:
            with open(file_path, 'r', encoding='utf-8') as f:
                texts = f.readlines()

            # Tokenize, skipping near-empty lines
            for text in texts:
                if len(text.strip()) < 10:
                    continue

                tokens = self.tokenizer(
                    text,
                    max_length=2048,
                    truncation=True,
                    padding='max_length'
                )

                all_texts.append({
                    'input_ids': tokens['input_ids'],
                    'attention_mask': tokens['attention_mask']
                })

        return Dataset.from_list(all_texts)

    def prepare_sft_data(self, conversations: List[dict]) -> Dataset:
        """
        Prepare SFT data. Note: for simplicity the labels copy the full
        input; production SFT pipelines usually mask prompt tokens with
        -100 so loss is only computed on the assistant replies.
        """
        processed = []

        for conv in conversations:
            prompt = self._format_conversation(conv)
            tokens = self.tokenizer(
                prompt,
                max_length=2048,
                truncation=True
            )

            processed.append({
                'input_ids': tokens['input_ids'],
                'labels': tokens['input_ids'].copy()
            })

        return Dataset.from_list(processed)

    def _format_conversation(self, conv: dict) -> str:
        """
        Format a multi-turn conversation into a flat training string.
        """
        lines = []
        for msg in conv['messages']:
            if msg['role'] == 'system':
                lines.append(f"System: {msg['content']}")
            elif msg['role'] == 'user':
                lines.append(f"User: {msg['content']}")
            elif msg['role'] == 'assistant':
                lines.append(f"Assistant: {msg['content']}")

        return "\n".join(lines)

2. RLHF Training Pipeline

class QwenRLHFTrainer:
    """
    Simplified RLHF (PPO-style) training loop for Qwen.
    Assumes each model call returns an object exposing per-token
    `.log_probs`; a real implementation computes these from the logits.
    """
    def __init__(self, model, ref_model, reward_model):
        self.model = model          # policy being trained
        self.ref_model = ref_model  # frozen reference policy
        self.reward_model = reward_model

    def compute_reward(self, response_ids, prompt_ids):
        """
        Score the full (prompt + response) sequence with the reward model.
        """
        with torch.no_grad():
            reward = self.reward_model(
                torch.cat([prompt_ids, response_ids], dim=1)
            )
        return reward

    def compute_kl_divergence(self, response_ids, prompt_ids, gamma=0.1):
        """
        KL penalty between the current policy and the frozen reference,
        restricted to the response tokens.
        """
        full_ids = torch.cat([prompt_ids, response_ids], dim=1)

        with torch.no_grad():
            ref_log_probs = self.ref_model(full_ids).log_probs
            ref_log_probs = ref_log_probs[:, prompt_ids.shape[1]:]

        # Gradient flows through the current policy only
        new_log_probs = self.model(full_ids).log_probs[:, prompt_ids.shape[1]:]

        return (new_log_probs - ref_log_probs).mean() * gamma

    def ppo_update(self, batch):
        """
        One (heavily simplified) update step. A production PPO trainer
        would use the clipped surrogate objective with advantages,
        multiple epochs per batch, and an optimizer step.
        """
        prompt_ids = batch['prompt_ids']
        response_ids = batch['response_ids']

        # Scalar reward (no gradient)
        reward = self.compute_reward(response_ids, prompt_ids)

        # KL penalty (carries the gradient)
        kl = self.compute_kl_divergence(response_ids, prompt_ids)

        # Maximize reward, penalize drift from the reference policy
        loss = -reward.mean() + kl
        loss.backward()
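
The quantity this simplified loop optimizes is the usual KL-penalized RLHF objective,

$$\max_\theta \; \mathbb{E}_{x \sim \mathcal{D},\; y \sim \pi_\theta(\cdot \mid x)} \Big[\, r_\phi(x, y) \;-\; \gamma\, \mathrm{KL}\big(\pi_\theta(\cdot \mid x) \,\|\, \pi_{\mathrm{ref}}(\cdot \mid x)\big) \Big],$$

where $r_\phi$ is the reward model, $\pi_{\mathrm{ref}}$ is the frozen SFT policy, and $\gamma$ (0.1 in the code) controls how far the policy may drift from the reference.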

Qwen Deployment in Practice

1. Deployment with Transformers

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

class QwenDeployer:
    def __init__(self, model_path, device="cuda"):
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            trust_remote_code=True
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            torch_dtype=torch.float16,
            device_map="auto"  # placement handled by accelerate
        )

    def chat(self, query, history=None, max_new_tokens=2048):
        """
        Single-turn chat with optional multi-turn history.
        """
        if history is None:
            history = []

        # Build the prompt from history + current query
        prompt = self._build_prompt(query, history)

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        # Generate
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                repetition_penalty=1.1
            )

        # Decode only the newly generated tokens
        response = self.tokenizer.decode(
            outputs[0][inputs['input_ids'].shape[1]:],
            skip_special_tokens=True
        )

        return response.strip()

    def _build_prompt(self, query, history):
        """
        Concatenate (query, answer) history into a plain-text prompt.
        """
        prompt = ""
        for q, a in history:
            prompt += f"User: {q}\nAssistant: {a}\n"
        prompt += f"User: {query}\nAssistant: "
        return prompt

    def batch_chat(self, queries):
        """
        Naive batch interface (sequential; see the vLLM section below
        for true batched throughput).
        """
        return [self.chat(q) for q in queries]
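
Usage is a few lines; the model ID below assumes the public Hugging Face checkpoint:

deployer = QwenDeployer("Qwen/Qwen-7B-Chat")

answer = deployer.chat("Explain rotary position embeddings in one paragraph.")
print(answer)

# Multi-turn: accumulate (query, answer) pairs and pass them back in
follow_up = deployer.chat(
    "Now give a one-line summary.",
    history=[("Explain rotary position embeddings in one paragraph.", answer)]
)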

2. High-Performance Deployment with vLLM

# vLLM deployment script
"""
High-throughput Qwen inference with vLLM.

Install: pip install vllm
"""

from vllm import LLM, SamplingParams

class QwenvLLMDeployer:
    def __init__(self, model_path, tensor_parallel_size=1):
        self.llm = LLM(
            model=model_path,
            trust_remote_code=True,
            tensor_parallel_size=tensor_parallel_size,
            dtype='float16'
        )

    def generate(self, prompts, max_tokens=256, temperature=0.7):
        """
        Batched high-throughput generation.
        """
        sampling_params = SamplingParams(
            temperature=temperature,
            top_p=0.95,
            max_tokens=max_tokens
        )

        outputs = self.llm.generate(prompts, sampling_params)

        return [output.outputs[0].text for output in outputs]

    def chat(self, query, history=None):
        """
        Chat interface.
        """
        prompt = self._build_prompt(query, history)
        outputs = self.generate([prompt])
        return outputs[0].strip()

    def _build_prompt(self, query, history):
        # Same plain-text template as the Transformers deployer above
        prompt = ""
        for q, a in (history or []):
            prompt += f"User: {q}\nAssistant: {a}\n"
        prompt += f"User: {query}\nAssistant: "
        return prompt
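
The throughput gain shows up when you pass many prompts at once, since vLLM schedules them with continuous batching; `tensor_parallel_size` shards the weights across GPUs for the larger checkpoints:

deployer = QwenvLLMDeployer("Qwen/Qwen-7B-Chat", tensor_parallel_size=1)

prompts = [f"Write a one-sentence summary of topic {i}." for i in range(32)]
answers = deployer.generate(prompts, max_tokens=128)
for a in answers[:3]:
    print(a)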

3. OpenAI-Compatible API

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import uuid
import uvicorn

app = FastAPI(title="Qwen API")

# Reuse the QwenDeployer class from the Transformers section above
qwen_deployer = QwenDeployer("Qwen/Qwen-7B-Chat")

def generate_id() -> str:
    return uuid.uuid4().hex

class ChatRequest(BaseModel):
    messages: list
    temperature: float = 0.7
    max_tokens: int = 2048

@app.post("/v1/chat/completions")
async def chat_completions(request: ChatRequest):
    if not request.messages:
        raise HTTPException(status_code=400, detail="messages must not be empty")

    # Fold the message list into (user, assistant) pairs
    history = []
    for msg in request.messages:
        if msg['role'] == 'user':
            history.append((msg['content'], None))
        elif msg['role'] == 'assistant':
            if history:
                history[-1] = (history[-1][0], msg['content'])

    # The last user message is the current query
    query = request.messages[-1]['content']

    # Generate a reply, passing only completed pairs as history
    response = qwen_deployer.chat(query, [
        (q, a) for q, a in history if a is not None
    ])

    return {
        "id": "chatcmpl-" + generate_id(),
        "choices": [{
            "index": 0,
            "message": {
                "role": "assistant",
                "content": response
            },
            "finish_reason": "stop"
        }],
        # Rough character-based estimate; use the tokenizer for exact counts
        "usage": {
            "prompt_tokens": len(query) // 4,
            "completion_tokens": len(response) // 4,
            "total_tokens": (len(query) + len(response)) // 4
        }
    }

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
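
Any OpenAI-style client can then talk to this endpoint. For example, with the official `openai` Python package pointed at the local server (this minimal server ignores the `model` field):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="unused")

resp = client.chat.completions.create(
    model="qwen",
    messages=[{"role": "user", "content": "Give me three facts about the Qwen model family."}],
)
print(resp.choices[0].message.content)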

Qwen Ecosystem Tools

1. LangChain Integration

from langchain.chains import LLMChain
from langchain.llms import HuggingFacePipeline

class QwenLangChain:
    """
    LangChain integration via the HuggingFace pipeline backend.
    """
    def __init__(self, model_path):
        self.llm = HuggingFacePipeline.from_model_id(
            model_id=model_path,
            task="text-generation",
            model_kwargs={
                "temperature": 0.7,
                "max_length": 2048
            }
        )

    def create_chain(self, prompt_template):
        """
        Create an LLMChain bound to the Qwen pipeline.
        """
        return LLMChain(
            llm=self.llm,
            prompt=prompt_template
        )
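
A quick usage sketch with LangChain's standard `PromptTemplate` (this uses the classic `LLMChain` interface from the LangChain 0.0.x line):

from langchain.prompts import PromptTemplate

qwen_lc = QwenLangChain("Qwen/Qwen-7B-Chat")

template = PromptTemplate(
    input_variables=["topic"],
    template="Write a three-sentence introduction to {topic}."
)
chain = qwen_lc.create_chain(template)
print(chain.run(topic="rotary position embeddings"))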

Summary

As Alibaba Cloud's flagship effort in open-source large models, Qwen gives developers and enterprises strong AI capabilities, backed by excellent Chinese-language performance and an open ecosystem. Through sustained technical innovation and ecosystem building, Qwen has become one of the most influential open-source model families in China. Going forward, it continues to push into multimodality and longer contexts to deliver an even more capable AI experience.
