GPT-3: A Breakthrough in Large-Scale Language Models


Introduction

GPT-3 is the 175-billion-parameter language model OpenAI released in 2020. Its striking few-shot and zero-shot abilities drew broad attention to large language models. This post takes a close look at the core techniques behind GPT-3.

The Scale of GPT-3

GPT-3 is far larger than its predecessors:

| Metric | GPT-1 | GPT-2 | GPT-3 |
| --- | --- | --- | --- |
| Parameters | 117M | 1.5B | 175B |
| Training data | ~4.5 GB | ~40 GB | ~45 TB raw (≈570 GB after filtering) |
| Context length | 512 | 1024 | 2048 |
| Layers | 12 | 48 | 96 |
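
As a sanity check, the 175B figure can be roughly reproduced from these hyperparameters. The sketch below assumes the standard GPT-style breakdown of about 12·d_model² weights per Transformer block (attention projections plus the 4× MLP) and ignores biases and LayerNorm parameters:

```python
# Rough parameter-count estimate from the GPT-3 hyperparameters
n_layer, d_model, n_vocab, n_ctx = 96, 12288, 50257, 2048

per_block = 12 * d_model ** 2               # attention (4*d^2) + MLP (8*d^2) weights
blocks = n_layer * per_block                # ~173.9B
embeddings = (n_vocab + n_ctx) * d_model    # token + position embeddings

print(f"~{(blocks + embeddings) / 1e9:.1f}B parameters")   # ~174.6B, i.e. the reported "175B"
```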
```python
import math

import torch
import torch.nn as nn
import torch.nn.functional as F


# GPT-3 architecture configuration
class GPT3Config:
    """Configuration of the 175B-parameter GPT-3 model."""

    def __init__(self):
        # Model scale
        self.n_vocab = 50257      # BPE vocabulary size
        self.n_ctx = 2048         # context length
        self.n_positions = 2048
        self.n_embd = 12288       # embedding dimension
        self.n_layer = 96         # number of layers
        self.n_head = 96          # number of attention heads
        self.n_head_size = 128    # dimension per head

        # Training configuration
        self.learning_rate = 6e-5             # peak learning rate reported for the 175B model
        self.batch_tokens = 3_200_000         # ~3.2M tokens per batch
        self.train_tokens = 300_000_000_000   # ~300B training tokens

        # Other hyperparameters
        self.afn = "gelu"
        self.resid_dropout = 0.1
        self.embd_dropout = 0.1
        self.attn_dropout = 0.1


class GPT3SparseAttention(nn.Module):
    """Locally banded sparse attention of the kind used in GPT-3's sparse layers.

    Each position attends to a local causal window plus a set of strided
    earlier positions, instead of the full quadratic attention pattern.
    """

    def __init__(self, n_embd, n_head, n_ctx, window_size=256, stride=64, dropout=0.1):
        super().__init__()
        assert n_embd % n_head == 0
        self.n_embd = n_embd
        self.n_head = n_head
        self.n_ctx = n_ctx
        self.d_k = n_embd // n_head

        # Fused QKV projection
        self.qkv = nn.Linear(n_embd, n_embd * 3)

        # Local (causal) attention window
        self.window_size = window_size

        # Stride of the additional long-range connections
        self.stride = stride

        self.out_proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, attention_mask=None):
        B, T, C = x.shape

        # QKV projection -> (3, B, n_head, T, d_k)
        qkv = self.qkv(x).reshape(B, T, 3, self.n_head, self.d_k)
        qkv = qkv.permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]

        # Sparse attention mask: local window + strided long-range positions
        sparse_mask = self._create_sparse_mask(T, x.device)            # (T, T), bool

        # Scaled dot-product attention restricted to the sparse pattern
        scores = (q @ k.transpose(-2, -1)) / math.sqrt(self.d_k)       # (B, h, T, T)
        scores = scores.masked_fill(~sparse_mask, float("-inf"))
        if attention_mask is not None:
            # attention_mask is expected to broadcast to (B, 1, T, T)
            scores = scores.masked_fill(attention_mask == 0, float("-inf"))
        attn = F.softmax(scores, dim=-1)

        attn_output = attn @ v                                          # (B, h, T, d_k)
        attn_output = attn_output.transpose(1, 2).reshape(B, T, C)

        # Output projection
        output = self.out_proj(attn_output)

        return self.dropout(output)

    def _create_sparse_mask(self, seq_len, device):
        """Build the causal sparse attention mask (True = may attend)."""
        mask = torch.zeros(seq_len, seq_len, dtype=torch.bool, device=device)

        # Local causal window: position i attends to [i - window_size, i]
        for i in range(seq_len):
            start = max(0, i - self.window_size)
            mask[i, start:i + 1] = True

        # Strided long-range connections: every position may also attend to
        # earlier positions that fall on stride boundaries
        for i in range(seq_len):
            mask[i, torch.arange(0, i + 1, self.stride, device=device)] = True

        return mask
```
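
A quick smoke test of the module above, using toy dimensions rather than the real GPT-3 sizes:

```python
# Minimal smoke test with toy dimensions (not the real GPT-3 sizes)
attn = GPT3SparseAttention(n_embd=256, n_head=8, n_ctx=512, window_size=64, stride=16)
x = torch.randn(2, 512, 256)     # (batch, seq_len, n_embd)
y = attn(x)
print(y.shape)                   # torch.Size([2, 512, 256])
```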

Few-Shot Learning

GPT-3's defining capability is learning from prompts alone, without any gradient updates:

```python
class FewShotPrompt:
    """Few-shot prompt construction and generation."""

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def generate(self, task_description, examples, query):
        """
        Build a few-shot prompt and generate a completion.

        Args:
            task_description: natural-language description of the task
            examples: [{"input": "...", "output": "..."}]
            query: the query input
        """
        # Build the prompt
        prompt_parts = [task_description]

        for ex in examples:
            prompt_parts.append(f"Input: {ex['input']}")
            prompt_parts.append(f"Output: {ex['output']}")

        prompt_parts.append(f"Input: {query}")
        prompt_parts.append("Output:")

        prompt = "\n".join(prompt_parts)

        # Generate
        input_ids = self.tokenizer.encode(prompt, return_tensors='pt')
        output_ids = self.model.generate(
            input_ids,
            max_new_tokens=50,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            num_return_sequences=1
        )

        return self.tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Example: sentiment classification
task_desc = "Classify the sentiment of the text as Positive, Negative, or Neutral."
examples = [
    {"input": "This movie is amazing!", "output": "Positive"},
    {"input": "Terrible experience, would not recommend.", "output": "Negative"},
    {"input": "The food was okay, nothing special.", "output": "Neutral"}
]
query = "I absolutely love this product!"

prompt_generator = FewShotPrompt(model, tokenizer)
result = prompt_generator.generate(task_desc, examples, query)
```
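
The paper's zero-shot, one-shot, and few-shot settings differ only in how many demonstrations the prompt contains, so the same builder covers all three (a usage sketch, assuming `model` and `tokenizer` are a HuggingFace-style causal LM and its tokenizer):

```python
# Zero-shot, one-shot and few-shot differ only in the number of demonstrations
zero_shot = prompt_generator.generate(task_desc, [], query)
one_shot = prompt_generator.generate(task_desc, examples[:1], query)
few_shot = prompt_generator.generate(task_desc, examples, query)
```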

The In-Context Learning Mechanism

```python
import torch
import torch.nn.functional as F


class InContextLearning:
    """In-context learning: scoring completions with the frozen language model."""

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def compute_loss(self, context, target):
        """Compute the language-model loss of `target` given `context`."""
        # Tokenize context and target together
        full_text = context + target
        context_len = len(self.tokenizer.encode(context))

        input_ids = self.tokenizer.encode(full_text, return_tensors='pt')

        with torch.no_grad():
            outputs = self.model(input_ids)
            # Logits at position i predict token i+1, so shift by one
            logits = outputs.logits[:, context_len - 1:-1, :]
            targets = input_ids[:, context_len:]

            loss = F.cross_entropy(
                logits.reshape(-1, logits.size(-1)),
                targets.reshape(-1)
            )

        return loss.item()

    def analyze_pattern(self, examples):
        """Collect simple statistics about the few-shot demonstrations."""
        patterns = []

        for ex in examples:
            input_text = ex['input']
            output_text = ex['output']

            # Basic length statistics of each demonstration
            pattern = {
                'input_length': len(input_text),
                'output_length': len(output_text),
                'input_tokens': len(self.tokenizer.encode(input_text)),
                'output_tokens': len(self.tokenizer.encode(output_text))
            }
            patterns.append(pattern)

        return patterns
```
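
A common way to use `compute_loss` in practice is to score each candidate completion and pick the one the model finds most likely, which turns the frozen LM into a classifier. A minimal sketch; the `classify_by_loss` helper is illustrative, and `model`/`tokenizer` are again assumed to be a HuggingFace-style causal LM and tokenizer:

```python
def classify_by_loss(icl, context, candidates):
    """Pick the candidate completion with the lowest language-model loss."""
    losses = {c: icl.compute_loss(context, " " + c) for c in candidates}
    return min(losses, key=losses.get), losses

# Example: score the label strings for a sentiment query
icl = InContextLearning(model, tokenizer)
context = "Input: I absolutely love this product!\nOutput:"
label, losses = classify_by_loss(icl, context, ["Positive", "Negative", "Neutral"])
```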

Practical Applications of GPT-3

1. Text Generation

```python
def generate_text(prompt, max_length=200, temperature=0.8):
    """Open-ended text generation."""
    input_ids = tokenizer.encode(prompt, return_tensors='pt')

    output_ids = model.generate(
        input_ids,
        max_length=max_length,
        temperature=temperature,
        top_p=0.92,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Usage example
prompt = "Once upon a time in a distant galaxy,"
story = generate_text(prompt, max_length=500, temperature=0.9)
```

2. Code Generation

```python
def generate_code(description, language='python'):
    """Generate code from a natural-language description."""
    prompt = f"# {language} code for: {description}\n\n"

    input_ids = tokenizer.encode(prompt, return_tensors='pt')

    output_ids = model.generate(
        input_ids,
        max_new_tokens=200,
        temperature=0.3,   # lower temperature for more deterministic code
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Example
code = generate_code("quicksort algorithm in python")
```
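
Hugging Face's `generate` has no `stop_sequence` argument, so truncating at a blank line has to be handled separately, for example with a custom stopping criterion. A sketch assuming the `transformers` library; the `StopOnText` class is illustrative glue code, not a library API:

```python
from transformers import StoppingCriteria, StoppingCriteriaList

class StopOnText(StoppingCriteria):
    """Stop generation once the newly generated text contains a given substring."""

    def __init__(self, tokenizer, stop_text, prompt_len):
        self.tokenizer = tokenizer
        self.stop_text = stop_text
        self.prompt_len = prompt_len

    def __call__(self, input_ids, scores, **kwargs):
        new_text = self.tokenizer.decode(input_ids[0, self.prompt_len:])
        return self.stop_text in new_text

# Usage: stop once the model emits a blank line after the code prompt
prompt = "# python code for: quicksort algorithm in python\n\n"
input_ids = tokenizer.encode(prompt, return_tensors='pt')
stopping = StoppingCriteriaList([StopOnText(tokenizer, "\n\n", input_ids.shape[1])])
output_ids = model.generate(input_ids, max_new_tokens=200, stopping_criteria=stopping)
```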

3. Question Answering

```python
def answer_question(question, context):
    """Answer a question based on the given context."""
    prompt = f"""Answer the question based on the context.

Context: {context}

Question: {question}
Answer:"""

    input_ids = tokenizer.encode(prompt, return_tensors='pt')

    output_ids = model.generate(
        input_ids,
        max_new_tokens=100,
        do_sample=False   # greedy decoding for factual answers
    )

    return tokenizer.decode(output_ids[0], skip_special_tokens=True).split("Answer:")[-1].strip()
```

Evaluating GPT-3's Capabilities

Representative results for the 175B model from the GPT-3 paper:

| Task | Zero-Shot | One-Shot | Few-Shot |
| --- | --- | --- | --- |
| LAMBADA (accuracy) | 76.2% | 72.5% | 86.4% |
| TriviaQA | 64.3% | 68.0% | 71.2% |

The consistent gains from simply adding demonstrations to the prompt, with no fine-tuning, are the headline result of the paper.

Limitations

  • High compute cost: inference requires substantial GPU resources
  • Potential bias: biases in the training data can be amplified
  • Hallucination: it can generate plausible-sounding but incorrect content
  • Limited interpretability: its decision process is opaque

Summary

GPT-3 demonstrated the striking capabilities of large-scale language models, and the few-shot learning paradigm opened a new path for building AI applications. Its successors (GPT-3.5, GPT-4) have continued to push large language model technology forward.

