数据预处理技术大全

🎙️ 语音朗读 当前: 晓晓 (温柔女声)

数据预处理技术大全

数据预处理是机器学习 pipeline 中最关键的环节,好的预处理能显著提升模型性能。

缺失值处理

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer

# Toy frame: every column carries a couple of NaN holes to fill.
df = pd.DataFrame({
    'age': [25, 30, np.nan, 35, 28, np.nan, 40],
    'income': [5000, 8000, 6000, np.nan, 5500, 7000, np.nan],
    'score': [85, 90, 78, 92, np.nan, 88, 95],
})

# Strategy 1: replace NaN with the column mean.
imputer_mean = SimpleImputer(strategy='mean')
df_mean = pd.DataFrame(imputer_mean.fit_transform(df), columns=df.columns)

# Strategy 2: replace NaN with the column median (robust to skew).
imputer_median = SimpleImputer(strategy='median')
df_median = pd.DataFrame(imputer_median.fit_transform(df), columns=df.columns)

# Strategy 3: replace NaN with the most frequent value (also works for categoricals).
imputer_mode = SimpleImputer(strategy='most_frequent')
df_mode = pd.DataFrame(imputer_mode.fit_transform(df), columns=df.columns)

# Strategy 4: estimate each hole from the 3 nearest complete rows.
imputer_knn = KNNImputer(n_neighbors=3)
df_knn = pd.DataFrame(imputer_knn.fit_transform(df), columns=df.columns)

特征缩放

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# Two columns on very different scales: 1..5 vs 100..500.
data = np.column_stack([np.arange(1, 6), np.arange(100, 501, 100)])

# Z-score standardization: each column ends up with mean 0 and std 1.
scaler = StandardScaler()
data_standard = scaler.fit_transform(data)
print(f"标准化 - 均值: {data_standard.mean(axis=0)}")
print(f"标准化 - 标准差: {data_standard.std(axis=0)}")

# Min-max normalization: squashes every column into [0, 1].
scaler_mm = MinMaxScaler()
data_minmax = scaler_mm.fit_transform(data)
print(f"归一化 - 范围: [{data_minmax.min()}, {data_minmax.max()}]")

# Robust scaling: centers on the median and scales by IQR, so outliers matter less.
scaler_robust = RobustScaler()
data_robust = scaler_robust.fit_transform(data)

选择建议

  • 大多数算法:StandardScaler
  • 需要有界特征(如神经网络输入):MinMaxScaler
  • 存在异常值:RobustScaler

类别特征编码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder

# Small frame mixing nominal ('color'), ordinal ('size') and numeric ('price') columns.
df_cat = pd.DataFrame({
    'color': ['red', 'blue', 'green', 'red', 'blue'],
    'size': ['S', 'M', 'L', 'M', 'S'],
    'price': [10, 20, 30, 25, 15],
})

# Label encoding: one arbitrary integer per category (the order carries no meaning).
le = LabelEncoder()
df_cat['color_encoded'] = le.fit_transform(df_cat['color'])

# One-hot encoding via pandas; drop_first avoids the redundant dummy column.
df_onehot = pd.get_dummies(df_cat, columns=['color'], drop_first=True)

# Ordinal encoding for a genuinely ordered category: S < M < L maps to 0 < 1 < 2.
oe = OrdinalEncoder(categories=[['S', 'M', 'L']])
df_cat['size_encoded'] = oe.fit_transform(df_cat[['size']])

# Target Encoding(目标编码)
def target_encode(df, column, target):
    """Replace each category in *column* with the mean of *target* inside it.

    Returns a Series aligned with *df*; *df* itself is not mutated.
    NOTE(review): the means are computed over the whole frame, so applying
    this to training data leaks the target — in practice, fit the mapping
    on out-of-fold data.
    """
    # Mean target value per category, then broadcast back onto the rows.
    means = df.groupby(column)[target].mean()
    return df[column].map(means)

# Apply target encoding: each color is replaced by the mean price of that color.
df_cat['color_target'] = target_encode(df_cat, 'color', 'price')

特征变换

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
from sklearn.preprocessing import PowerTransformer, QuantileTransformer

# Log transform (tames right-skewed distributions); log1p is safe at zero.
df['log_income'] = np.log1p(df['income'])

# Box-Cox transform.
# NOTE(review): Box-Cox requires strictly positive input values — confirm
# df['income'] contains no zeros/negatives when this snippet is run.
pt = PowerTransformer(method='box-cox')
df['boxcox_income'] = pt.fit_transform(df[['income']])

# Yeo-Johnson transform (like Box-Cox, but also accepts zero and negative values).
pt_yj = PowerTransformer(method='yeo-johnson')
df['yeojohnson_income'] = pt_yj.fit_transform(df[['income']])

# Quantile transform: maps the empirical distribution onto a normal one.
qt = QuantileTransformer(output_distribution='normal')
df['quantile_income'] = qt.fit_transform(df[['income']])

异常值检测与处理

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# IQR method: flag rows outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
def detect_outliers_iqr(df, column):
    """Return the rows of *df* whose *column* value is an IQR outlier."""
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    # Tukey fences: the conventional 1.5*IQR whiskers.
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    return df[(df[column] < lower) | (df[column] > upper)]

# Z-score method: flag rows more than *threshold* standard deviations from the mean.
def detect_outliers_zscore(df, column, threshold=3):
    """Return rows whose |z-score| in *column* exceeds *threshold*.

    Uses pandas' sample std (ddof=1), so very small frames can never
    reach large thresholds.
    """
    z_scores = np.abs((df[column] - df[column].mean()) / df[column].std())
    return df[z_scores > threshold]

# Handle outliers by clipping to the IQR fences or dropping the rows.
def handle_outliers(df, column, method='clip'):
    """Clip ('clip') or drop ('remove') IQR outliers in *column*.

    'clip' writes the winsorized column back into *df* in place (and
    returns it); 'remove' returns a filtered frame without touching the
    original. Any other *method* value returns *df* unchanged.
    """
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    if method == 'clip':
        df[column] = df[column].clip(lower, upper)
    elif method == 'remove':
        df = df[(df[column] >= lower) & (df[column] <= upper)]
    return df

特征构造

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Combine numeric features into a ratio feature.
df['income_per_age'] = df['income'] / df['age']

# Extract calendar parts from a timestamp column.
# NOTE(review): the demo frame built earlier has no 'date' column — this
# snippet assumes df carries one; add it before running.
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['day_of_week'] = df['date'].dt.dayofweek
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)  # dayofweek: Mon=0 ... Sun=6

# Pairwise interaction terms: products of feature pairs, no squares, no bias column.
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly = poly.fit_transform(df[['age', 'income']])

数据 pipeline

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Column groups, each handled by its own preprocessing branch.
numeric_features = ['age', 'income', 'score']
categorical_features = ['color', 'size']

# Numeric branch: median-impute the holes, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: mode-impute, then one-hot (unseen categories are ignored).
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore')),
])

# Route each column group through the matching branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# End-to-end estimator: preprocessing and classifier share one fit/predict,
# which keeps the transformers fitted on training folds only (no leakage).
from sklearn.linear_model import LogisticRegression
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

总结

数据预处理是机器学习中不可忽视的环节。缺失值处理、特征缩放、类别编码、异常值处理和特征构造是最常用的预处理技术。使用sklearn的Pipeline和ColumnTransformer可以构建可复用的预处理流程,避免数据泄露,提高代码质量。

© 2019-2026 ovo$^{mc^2}$ All Rights Reserved. | 站点总访问 28969 次 | 访客 19045
Theme by hiero