Pre-training and Fine-tuning: The Paradigm of Modern NLP


Introduction

The pre-training & fine-tuning paradigm is the key to the success of modern NLP. This article walks through the paradigm's principles, implementation, and best practices.

The Pre-training & Fine-tuning Paradigm

```mermaid
graph LR
    A[Large-scale unlabeled corpus] --> B[Pre-trained language model]
    B --> C[Downstream task data]
    C --> D[Fine-tuning]
    D --> E[Task-specific model]
```
```python
import torch
import torch.nn as nn
from torch.optim import AdamW  # transformers' AdamW re-export is deprecated; use torch's
from torch.utils.data import DataLoader, Dataset
from transformers import BertModel, BertTokenizer

class PretrainFineTunePipeline:
    """Complete pre-training + fine-tuning pipeline."""

    def __init__(self, model_name='bert-base-chinese'):
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)
        # BertModel outputs hidden states, not vocabulary logits, so the MLM
        # objective needs an explicit projection to vocabulary size
        self.mlm_head = nn.Linear(
            self.model.config.hidden_size, self.model.config.vocab_size
        ).to(self.device)

    def pretrain(self, corpus, epochs=4, batch_size=32, max_length=512):
        """Self-supervised pre-training with masked language modeling (MLM)."""
        print("=== Stage 1: Pre-training ===")

        # Build the pre-training dataset (see the PretrainDataset sketch below)
        dataset = PretrainDataset(corpus, self.tokenizer, max_length)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        optimizer = AdamW(
            list(self.model.parameters()) + list(self.mlm_head.parameters()),
            lr=2e-5
        )

        self.model.train()
        for epoch in range(epochs):
            total_loss = 0
            for batch in dataloader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)

                # MLM objective: mask 15% of tokens and predict them back
                masked_ids, labels = self._mask_tokens(input_ids)
                outputs = self.model(masked_ids, attention_mask=attention_mask)
                logits = self.mlm_head(outputs.last_hidden_state)

                # Only masked positions (labels != -100) contribute to the loss
                loss = nn.CrossEntropyLoss(ignore_index=-100)(
                    logits.view(-1, logits.size(-1)), labels.view(-1)
                )

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                total_loss += loss.item()

            print(f"Epoch {epoch+1}: Loss = {total_loss/len(dataloader):.4f}")

    def _mask_tokens(self, input_ids, mlm_prob=0.15):
        """Randomly replace 15% of non-special tokens with [MASK]."""
        labels = input_ids.clone()
        prob = torch.full(labels.shape, mlm_prob, device=labels.device)
        special = torch.tensor(
            [self.tokenizer.get_special_tokens_mask(ids, already_has_special_tokens=True)
             for ids in labels.tolist()],
            dtype=torch.bool, device=labels.device
        )
        prob.masked_fill_(special, 0.0)  # never mask [CLS]/[SEP]/[PAD]
        masked = torch.bernoulli(prob).bool()
        labels[~masked] = -100  # unmasked positions are ignored in the loss
        masked_ids = input_ids.clone()
        masked_ids[masked] = self.tokenizer.mask_token_id
        return masked_ids, labels

    def finetune(self, train_data, val_data, task='classification', epochs=3):
        """Fine-tuning on a downstream task."""
        print(f"=== Stage 2: {task} fine-tuning ===")

        # Attach a task-specific head
        if task == 'classification':
            self.task_head = ClassificationHead(self.model.config.hidden_size, num_labels=2)
        elif task == 'ner':
            self.task_head = NERHead(self.model.config.hidden_size, num_labels=10)
        elif task == 'qa':
            self.task_head = QAHead(self.model.config.hidden_size)
        self.task_head.to(self.device)

        # Freeze the lower layers (optional)
        self._freeze_layers(freeze_ratio=0.6)

        # Training loop (_train_epoch and _evaluate are task-specific helpers,
        # omitted here for brevity)
        optimizer = AdamW(
            list(self.model.parameters()) + list(self.task_head.parameters()),
            lr=2e-5
        )

        for epoch in range(epochs):
            self.model.train()
            train_loss = self._train_epoch(train_data, optimizer)

            val_metrics = self._evaluate(val_data)
            print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f}, "
                  f"Val Acc={val_metrics['accuracy']:.4f}")

    def _freeze_layers(self, freeze_ratio=0.6):
        """Freeze the bottom encoder layers."""
        total_layers = self.model.config.num_hidden_layers
        freeze_layers = int(total_layers * freeze_ratio)

        # self.model is already a BertModel, so the encoder is accessed
        # directly (not through a .bert attribute)
        for i, layer in enumerate(self.model.encoder.layer):
            if i < freeze_layers:
                for param in layer.parameters():
                    param.requires_grad = False
```
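
The pipeline above references a `PretrainDataset` that is not defined in this post. A minimal sketch, assuming `corpus` is a list of raw text strings:

```python
class PretrainDataset(Dataset):
    """Minimal dataset sketch: tokenizes raw text strings for MLM pre-training."""

    def __init__(self, corpus, tokenizer, max_length=512):
        self.corpus = corpus
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.corpus)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.corpus[idx], max_length=self.max_length,
            padding='max_length', truncation=True, return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }
```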

Fine-tuning for Classification

```python
class ClassificationHead(nn.Module):
    """Text classification head."""

    def __init__(self, hidden_size, num_labels=2, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, sequence_output):
        pooled = sequence_output[:, 0]  # [CLS] token
        pooled = self.dropout(pooled)
        logits = self.classifier(pooled)
        return logits

class ClassificationDataset(Dataset):
    """Wraps tokenizer encodings and labels for the DataLoader."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

class TextClassificationFineTuner:
    """Text classification fine-tuner."""

    def __init__(self, model_name='bert-base-chinese', num_labels=2):
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        self.num_labels = num_labels
        self.classifier = ClassificationHead(
            self.model.config.hidden_size, num_labels
        )
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)
        self.classifier.to(self.device)

    def train(self, train_texts, train_labels, val_texts, val_labels,
              epochs=3, batch_size=16, lr=2e-5):
        """Training loop."""

        # Encode the data
        train_encodings = self.tokenizer(
            train_texts, truncation=True, padding=True, return_tensors='pt'
        )

        # Build the Dataset
        train_dataset = ClassificationDataset(train_encodings, train_labels)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        # Optimize both the encoder and the classification head
        optimizer = AdamW(
            list(self.model.parameters()) + list(self.classifier.parameters()),
            lr=lr
        )

        # Training
        for epoch in range(epochs):
            self.model.train()
            self.classifier.train()
            total_loss = 0

            for batch in train_loader:
                optimizer.zero_grad()

                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels'].to(self.device)

                outputs = self.model(input_ids, attention_mask=attention_mask)
                logits = self.classifier(outputs.last_hidden_state)

                loss = nn.CrossEntropyLoss()(logits, labels)
                loss.backward()
                optimizer.step()

                total_loss += loss.item()

            val_acc = self.evaluate(val_texts, val_labels)
            print(f"Epoch {epoch+1}: Loss={total_loss/len(train_loader):.4f}, "
                  f"Val Acc={val_acc:.4f}")

    def evaluate(self, texts, labels):
        """Accuracy on a labeled set."""
        preds = self.predict(texts)
        return sum(int(p == l) for p, l in zip(preds, labels)) / len(labels)

    def predict(self, texts):
        """Inference."""
        self.model.eval()
        self.classifier.eval()

        encodings = self.tokenizer(texts, truncation=True, padding=True, return_tensors='pt')

        with torch.no_grad():
            input_ids = encodings['input_ids'].to(self.device)
            attention_mask = encodings['attention_mask'].to(self.device)

            outputs = self.model(input_ids, attention_mask=attention_mask)
            logits = self.classifier(outputs.last_hidden_state)
            predictions = torch.argmax(logits, dim=-1)

        return predictions.cpu().numpy()
```
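
A minimal usage sketch of the fine-tuner above (the texts and labels are hypothetical placeholders for illustration):

```python
# Hypothetical toy data: 1 = positive, 0 = negative
train_texts = ["这部电影很好看", "质量太差了"]
train_labels = [1, 0]
val_texts = ["服务很贴心"]
val_labels = [1]

tuner = TextClassificationFineTuner(num_labels=2)
tuner.train(train_texts, train_labels, val_texts, val_labels, epochs=3)
print(tuner.predict(["强烈推荐", "再也不买了"]))  # array of predicted label ids
```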

Fine-tuning for NER

```python
class NERHead(nn.Module):
    """Named entity recognition head."""

    def __init__(self, hidden_size, num_labels):
        super().__init__()
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, sequence_output):
        # One label distribution per token
        logits = self.classifier(sequence_output)
        return logits

class NERDataset(Dataset):
    """NER dataset."""

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label2id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-LOC': 3, 'I-LOC': 4}

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text, max_length=self.max_length, padding='max_length',
            truncation=True, return_tensors='pt'
        )

        # Pad/truncate the tag sequence to max_length, using -100 so padded
        # positions are ignored by CrossEntropyLoss(ignore_index=-100).
        # Chinese BERT tokenizes character by character, which keeps tokens
        # roughly aligned with per-character tags; word-level languages would
        # additionally need subword-to-word alignment.
        label_ids = [self.label2id[l] for l in label][:self.max_length]
        label_ids += [-100] * (self.max_length - len(label_ids))

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label_ids)
        }
```
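
The token-level loss pairs with the -100 padding above. A sketch of one training step, assuming `model` is the BertModel and `ner_head` an `NERHead` as defined in this post:

```python
def ner_train_step(model, ner_head, batch, optimizer, device):
    """One gradient step of token-level NER fine-tuning."""
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    outputs = model(input_ids, attention_mask=attention_mask)
    logits = ner_head(outputs.last_hidden_state)  # (batch, seq_len, num_labels)

    # Flatten to (batch*seq_len, num_labels); -100 positions are ignored
    loss = nn.CrossEntropyLoss(ignore_index=-100)(
        logits.view(-1, logits.size(-1)), labels.view(-1)
    )

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
```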

Fine-tuning for Question Answering

```python
class QAHead(nn.Module):
    """Question answering head (predicts start and end positions)."""

    def __init__(self, hidden_size):
        super().__init__()
        self.qa_outputs = nn.Linear(hidden_size, 2)  # start and end

    def forward(self, sequence_output):
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)
        return start_logits, end_logits

def qa_loss(start_logits, end_logits, start_positions, end_positions):
    """Question answering loss: average of start and end cross-entropy."""
    loss_fn = nn.CrossEntropyLoss()

    start_loss = loss_fn(start_logits, start_positions)
    end_loss = loss_fn(end_logits, end_positions)

    return (start_loss + end_loss) / 2
```
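
At inference time, the answer span is read off the two logit vectors. A minimal extraction sketch for a single example (greedy argmax; production systems usually search the top-k start/end pairs under the constraint start <= end):

```python
def extract_answer(start_logits, end_logits, input_ids, tokenizer):
    """Decode the predicted answer span for one example (greedy argmax)."""
    start = torch.argmax(start_logits, dim=-1).item()
    end = torch.argmax(end_logits, dim=-1).item()
    if end < start:  # fall back to an empty answer on an invalid span
        return ""
    answer_ids = input_ids[start:end + 1]
    return tokenizer.decode(answer_ids, skip_special_tokens=True)
```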

Transfer Learning Strategies

| Strategy | Description | When to use |
| --- | --- | --- |
| Full Fine-tune | Update all parameters | Plenty of labeled data |
| Feature Freeze | Freeze lower layers, update only the upper layers | Limited labeled data |
| Adapter | Insert small adapter layers | Multi-task learning |
| LoRA | Low-rank adaptation | Parameter-efficient fine-tuning |
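
The table mentions Adapters alongside LoRA (shown below). A minimal sketch of a Houlsby-style bottleneck adapter, with a down-projection, non-linearity, up-projection, and residual connection (a sketch only, not a full integration into BERT's encoder layers):

```python
class Adapter(nn.Module):
    """Bottleneck adapter: down-project, non-linearity, up-project, residual."""

    def __init__(self, hidden_size, bottleneck=64):
        super().__init__()
        self.down = nn.Linear(hidden_size, bottleneck)
        self.up = nn.Linear(bottleneck, hidden_size)
        self.act = nn.GELU()
        # Initialize the up-projection at zero so the adapter starts
        # as (almost) an identity mapping
        nn.init.zeros_(self.up.weight)
        nn.init.zeros_(self.up.bias)

    def forward(self, hidden_states):
        return hidden_states + self.up(self.act(self.down(hidden_states)))
```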
```python
class LoRALayer(nn.Module):
    """Low-Rank Adaptation."""

    def __init__(self, in_features, out_features, rank=4, alpha=1.0):
        super().__init__()
        self.rank = rank
        self.alpha = alpha

        # Low-rank factors A and B; B starts at zero so the initial update
        # is zero and training begins exactly from the pre-trained weights
        self.lora_A = nn.Parameter(torch.randn(in_features, rank))
        self.lora_B = nn.Parameter(torch.zeros(rank, out_features))

    def forward(self, x):
        # Returns only the low-rank update x·AB·(alpha/rank); the effective
        # weight is W = W0 + AB·(alpha/rank), with W0 kept frozen
        return x @ (self.lora_A @ self.lora_B) * (self.alpha / self.rank)
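```

To make the frozen base weight W0 explicit, a small wrapper (a sketch; the class name is mine) can combine an existing nn.Linear with the LoRA update:

```python
class LinearWithLoRA(nn.Module):
    """Wrap a pre-trained nn.Linear: frozen W0 plus a trainable LoRA update."""

    def __init__(self, linear, rank=4, alpha=1.0):
        super().__init__()
        self.linear = linear
        for param in self.linear.parameters():
            param.requires_grad = False  # freeze W0
        self.lora = LoRALayer(linear.in_features, linear.out_features, rank, alpha)

    def forward(self, x):
        return self.linear(x) + self.lora(x)

# Example: replace the query projection of one BERT attention layer
# layer.attention.self.query = LinearWithLoRA(layer.attention.self.query, rank=8)
```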

Summary

The pre-training & fine-tuning paradigm lets NLP models learn general-purpose language representations from large amounts of unlabeled data, then adapt quickly to specific tasks with only a small amount of labeled data, greatly lowering the barrier to building NLP applications.

