Decision Trees and Random Forests: From Information Gain to Ensemble Learning

Decision trees are among the most intuitive machine learning algorithms, and random forests significantly improve on them by ensembling many trees.

Decision Tree Basics

A decision tree partitions the data through a sequence of rules: each internal node tests an attribute, each branch corresponds to one outcome of that test, and each leaf node holds a class label (or a value, for regression).

Information Gain

Information gain uses information entropy to choose the best splitting attribute:

$$H(D) = -\sum_{k=1}^{K} p_k \log_2 p_k$$

$$Gain(D, a) = H(D) - \sum_{v=1}^{V} \frac{|D^v|}{|D|} H(D^v)$$

```python
import numpy as np

def entropy(y):
    """Compute the information entropy of a label array."""
    _, counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)
    # The small epsilon is a defensive guard against log2(0)
    return -np.sum(probabilities * np.log2(probabilities + 1e-10))

def information_gain(y, y_left, y_right):
    """Compute the information gain of a binary split."""
    p = len(y_left) / len(y)
    return entropy(y) - p * entropy(y_left) - (1 - p) * entropy(y_right)
```
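
As a quick sanity check (this snippet is mine, not from the original post): a perfectly balanced binary label set has entropy 1 bit, and a split that separates the two classes completely recovers all of it.

```python
y = np.array([0, 0, 1, 1])
print(entropy(y))  # ~1.0 bit for a 50/50 class mix
print(information_gain(y, np.array([0, 0]), np.array([1, 1])))  # ~1.0
```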

Gini Index

CART decision trees use the Gini index instead:

$$Gini(D) = 1 - \sum_{k=1}^{K} p_k^2$$

```python
def gini(y):
    """Compute the Gini index of a label array."""
    _, counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)
    return 1 - np.sum(probabilities ** 2)
```
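
The post pairs `entropy` with `information_gain` but gives no Gini counterpart; one follows the same weighted-average pattern. A minimal sketch (the `gini_gain` helper is my addition, not from the post):

```python
def gini_gain(y, y_left, y_right):
    """Reduction in Gini impurity from a binary split; same form as
    information_gain with entropy swapped for gini (my addition)."""
    p = len(y_left) / len(y)
    return gini(y) - p * gini(y_left) - (1 - p) * gini(y_right)
```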

Implementing a Decision Tree by Hand

```python
class DecisionTreeNode:
    def __init__(self):
        self.feature = None    # feature index tested at this node
        self.threshold = None  # split threshold for that feature
        self.left = None       # subtree for samples with feature <= threshold
        self.right = None      # subtree for the remaining samples
        self.value = None      # majority class (set on leaf nodes only)

class DecisionTree:
    def __init__(self, max_depth=10, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def _best_split(self, X, y):
        # Exhaustively search every (feature, threshold) pair for the
        # split with the highest information gain
        best_gain = -1
        best_feature = None
        best_threshold = None
        n_features = X.shape[1]

        for feature in range(n_features):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                left_mask = X[:, feature] <= threshold
                right_mask = ~left_mask

                if len(y[left_mask]) == 0 or len(y[right_mask]) == 0:
                    continue

                gain = information_gain(y, y[left_mask], y[right_mask])
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def _build_tree(self, X, y, depth=0):
        node = DecisionTreeNode()

        # Stop when the tree is deep enough or the node is too small
        if depth >= self.max_depth or len(y) < self.min_samples_split:
            node.value = np.argmax(np.bincount(y.astype(int)))
            return node

        feature, threshold = self._best_split(X, y)

        # No valid split found: turn this node into a majority-class leaf
        if feature is None:
            node.value = np.argmax(np.bincount(y.astype(int)))
            return node

        left_mask = X[:, feature] <= threshold
        node.feature = feature
        node.threshold = threshold
        node.left = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        node.right = self._build_tree(X[~left_mask], y[~left_mask], depth + 1)

        return node

    def fit(self, X, y):
        self.root = self._build_tree(X, y)
        return self

    def _predict_one(self, x, node):
        # Walk down the tree until a leaf (a node with a value) is reached
        if node.value is not None:
            return node.value
        if x[node.feature] <= node.threshold:
            return self._predict_one(x, node.left)
        return self._predict_one(x, node.right)

    def predict(self, X):
        return np.array([self._predict_one(x, self.root) for x in X])
```
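
A quick smoke test on a toy dataset (my own snippet, not from the post): two well-separated Gaussian blobs that a shallow tree should classify almost perfectly.

```python
rng = np.random.default_rng(0)
X_demo = np.vstack([rng.normal(0, 1, (50, 2)), rng.normal(4, 1, (50, 2))])
y_demo = np.array([0] * 50 + [1] * 50)

tree = DecisionTree(max_depth=3).fit(X_demo, y_demo)
print((tree.predict(X_demo) == y_demo).mean())  # training accuracy, ~1.0
```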

Random Forests

A random forest builds many decision trees by combining bagging (bootstrap aggregating) with random feature selection:

```python
class RandomForest:
    def __init__(self, n_trees=10, max_depth=10, min_samples_split=2,
                 max_features='sqrt'):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.trees = []
        self.feature_indices = []

    def _bootstrap_sample(self, X, y):
        # Bagging: sample n rows with replacement
        n_samples = X.shape[0]
        indices = np.random.choice(n_samples, n_samples, replace=True)
        return X[indices], y[indices]

    def fit(self, X, y):
        self.trees = []
        self.feature_indices = []
        n_features = X.shape[1]
        k = (max(1, int(np.sqrt(n_features)))
             if self.max_features == 'sqrt' else n_features)
        for _ in range(self.n_trees):
            # Random feature selection, drawn once per tree here for
            # simplicity (canonical random forests re-draw the feature
            # subset at every split)
            feat_idx = np.random.choice(n_features, k, replace=False)
            X_sample, y_sample = self._bootstrap_sample(X, y)
            tree = DecisionTree(max_depth=self.max_depth,
                                min_samples_split=self.min_samples_split)
            tree.fit(X_sample[:, feat_idx], y_sample)
            self.trees.append(tree)
            self.feature_indices.append(feat_idx)
        return self

    def predict(self, X):
        tree_preds = np.array([tree.predict(X[:, idx])
                               for tree, idx in zip(self.trees,
                                                    self.feature_indices)])
        # Majority vote across trees, one column per sample
        majority_votes = [np.bincount(tree_preds[:, i].astype(int)).argmax()
                          for i in range(X.shape[0])]
        return np.array(majority_votes)
```
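
Exercising the forest on toy data (again my own snippet, under the same assumptions as the tree demo above):

```python
rng = np.random.default_rng(1)
X_demo = np.vstack([rng.normal(0, 1, (100, 4)), rng.normal(3, 1, (100, 4))])
y_demo = np.array([0] * 100 + [1] * 100)

forest = RandomForest(n_trees=15, max_depth=5).fit(X_demo, y_demo)
print((forest.predict(X_demo) == y_demo).mean())  # ~1.0 on training data
```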

Using scikit-learn

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

data = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.2, random_state=42)

rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))
```
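
Because each tree sees only a bootstrap sample, the rows it never saw (its "out-of-bag" samples) provide a free validation estimate. scikit-learn exposes this through the `oob_score` parameter; a short example:

```python
rf_oob = RandomForestClassifier(n_estimators=100, max_depth=10,
                                oob_score=True, random_state=42)
rf_oob.fit(X_train, y_train)
print(f"OOB score: {rf_oob.oob_score_:.4f}")  # usually close to test accuracy
```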

Feature Importance

A random forest can also estimate how important each feature is:

```python
import matplotlib.pyplot as plt

importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(10, 6))
plt.title('Feature Importances')
plt.bar(range(X_train.shape[1]), importances[indices])
plt.xticks(range(X_train.shape[1]), data.feature_names[indices], rotation=90)
plt.tight_layout()
plt.show()
```
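
Impurity-based importances can be biased toward features with many distinct values; permutation importance, which measures the score drop when one feature is shuffled, is a common cross-check. A sketch using scikit-learn's `permutation_importance` (this snippet is my addition):

```python
from sklearn.inspection import permutation_importance

result = permutation_importance(rf, X_test, y_test,
                                n_repeats=10, random_state=42)
# Print the five features whose shuffling hurts accuracy the most
for i in result.importances_mean.argsort()[::-1][:5]:
    print(f"{data.feature_names[i]}: {result.importances_mean[i]:.4f}")
```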

Summary

Decision trees are intuitive and easy to interpret, but they overfit easily. By ensembling many trees and injecting randomness, random forests effectively reduce variance and improve generalization. Feature importance estimates are an added benefit that helps with understanding the data. In practice, a random forest is often one of the first baseline models worth trying.
