Logistic Regression and Binary Classification


Although "regression" appears in its name, logistic regression is in fact a classification algorithm, widely used for binary classification problems.

From Linear Regression to Logistic Regression

Linear regression outputs a continuous value, whereas classification calls for a probability. Logistic regression maps the linear output into the (0, 1) interval with the sigmoid function:

$$P(y=1|\mathbf{x}) = \sigma(\mathbf{w}^T\mathbf{x} + b) = \frac{1}{1 + e^{-(\mathbf{w}^T\mathbf{x} + b)}}$$
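
As a quick numerical check (a minimal sketch of my own, separate from the implementation below), evaluating the sigmoid at a few points shows how it squashes the whole real line into (0, 1):

```python
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# Large negative inputs approach 0, large positive inputs approach 1,
# and z = 0 maps to exactly 0.5
for z in [-5.0, -1.0, 0.0, 1.0, 5.0]:
    print(f"sigmoid({z:+.1f}) = {sigmoid(z):.4f}")
```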

Implementing Logistic Regression by Hand

```python
import numpy as np

class LogisticRegression:
    def __init__(self, lr=0.01, n_iters=1000):
        self.lr = lr                # learning rate
        self.n_iters = n_iters      # number of gradient descent steps
        self.weights = None
        self.bias = None

    def sigmoid(self, z):
        # Clip z to avoid overflow in np.exp for extreme inputs
        return 1 / (1 + np.exp(-np.clip(z, -250, 250)))

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        for _ in range(self.n_iters):
            # Forward pass: linear score, then sigmoid
            z = np.dot(X, self.weights) + self.bias
            y_pred = self.sigmoid(z)

            # Gradients of the cross-entropy loss (derived below)
            dw = (1 / n_samples) * np.dot(X.T, (y_pred - y))
            db = (1 / n_samples) * np.sum(y_pred - y)

            # Gradient descent update
            self.weights -= self.lr * dw
            self.bias -= self.lr * db

        return self

    def predict_proba(self, X):
        z = np.dot(X, self.weights) + self.bias
        return self.sigmoid(z)

    def predict(self, X, threshold=0.5):
        return (self.predict_proba(X) >= threshold).astype(int)
```
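
Before moving on, a quick sanity check on a trivially separable toy set (my own example, not part of the original post) confirms the implementation learns the obvious rule:

```python
# Toy data: one feature, class 1 iff the feature is positive
X_toy = np.array([[-2.0], [-1.5], [-0.5], [0.5], [1.5], [2.0]])
y_toy = np.array([0, 0, 0, 1, 1, 1])

clf = LogisticRegression(lr=0.5, n_iters=2000).fit(X_toy, y_toy)
print(clf.predict(X_toy))            # expected: [0 0 0 1 1 1]
print(clf.predict_proba(X_toy).round(3))
```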

Loss Function: Cross-Entropy

Logistic regression uses cross-entropy as its loss function:

$$L = -\frac{1}{n}\sum_{i=1}^{n}[y_i\log(\hat{y}_i) + (1-y_i)\log(1-\hat{y}_i)]$$

```python
def cross_entropy_loss(y_true, y_pred, eps=1e-15):
    # Clip predictions away from 0 and 1 so the logs stay finite
    y_pred = np.clip(y_pred, eps, 1 - eps)
    return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
```
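
This is also where the gradients in the `fit` method above come from: differentiating the loss and using $\sigma'(z) = \sigma(z)\bigl(1 - \sigma(z)\bigr)$, the sigmoid and the logarithm cancel conveniently, leaving

$$\frac{\partial L}{\partial \mathbf{w}} = \frac{1}{n}\sum_{i=1}^{n}(\hat{y}_i - y_i)\,\mathbf{x}_i, \qquad \frac{\partial L}{\partial b} = \frac{1}{n}\sum_{i=1}^{n}(\hat{y}_i - y_i)$$

which, in vectorized form, is exactly the `dw` and `db` computed in the training loop.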

Decision Boundary

The decision boundary of logistic regression is linear: with the default 0.5 threshold, the prediction flips exactly where $\sigma(\mathbf{w}^T\mathbf{x} + b) = 0.5$, i.e. on the hyperplane $\mathbf{w}^T\mathbf{x} + b = 0$. For two features, we can plot it directly:

```python
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=2,
                           n_redundant=0, random_state=42)

model = LogisticRegression(lr=0.1, n_iters=1000)
model.fit(X, y)

# Plot the decision boundary by classifying a dense grid of points
plt.figure(figsize=(8, 6))
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                     np.arange(y_min, y_max, 0.02))
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=0.3)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap='bwr', alpha=0.7)
plt.title('Logistic Regression Decision Boundary')
plt.show()
```

Regularized Logistic Regression

scikit-learn's `LogisticRegression` supports L1, L2, and elastic-net penalties; note that `C` is the *inverse* regularization strength, so smaller values regularize more strongly:

```python
from sklearn.linear_model import LogisticRegression as SklearnLR

# L2 regularization (the default)
model_l2 = SklearnLR(penalty='l2', C=1.0)

# L1 regularization (needs a solver that supports it, e.g. liblinear)
model_l1 = SklearnLR(penalty='l1', C=1.0, solver='liblinear')

# Elastic net (only saga supports it; l1_ratio mixes L1 and L2)
model_en = SklearnLR(penalty='elasticnet', C=1.0, solver='saga', l1_ratio=0.5)
```
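
To illustrate the practical difference between the penalties, here is a small sketch (my own example, not from the original post) fitting the L1 and L2 models above on synthetic data where only a few features carry signal; L1 tends to zero out the rest, while L2 merely shrinks them:

```python
from sklearn.datasets import make_classification

# 20 features, only 5 of which are informative
X_reg, y_reg = make_classification(n_samples=500, n_features=20,
                                   n_informative=5, n_redundant=0,
                                   random_state=0)

model_l1.fit(X_reg, y_reg)
model_l2.fit(X_reg, y_reg)

# L1 drives irrelevant coefficients exactly to zero; L2 does not
print("zero coefficients (L1):", np.sum(model_l1.coef_ == 0))
print("zero coefficients (L2):", np.sum(model_l2.coef_ == 0))
```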

Extension to Multi-class Classification

Logistic regression extends to multi-class problems either via OvR (One-vs-Rest), which trains one binary classifier per class, or via softmax regression, which generalizes the sigmoid to a distribution over all classes:

```python
class SoftmaxRegression:
    def __init__(self, lr=0.01, n_iters=1000):
        self.lr = lr
        self.n_iters = n_iters

    def softmax(self, z):
        # Subtract the row-wise max for numerical stability
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    def fit(self, X, y):
        n_samples, n_features = X.shape
        n_classes = len(np.unique(y))
        self.weights = np.zeros((n_features, n_classes))
        self.bias = np.zeros(n_classes)

        # One-hot encode the labels
        y_onehot = np.eye(n_classes)[y]

        for _ in range(self.n_iters):
            z = np.dot(X, self.weights) + self.bias
            y_pred = self.softmax(z)

            # Gradient of the multi-class cross-entropy
            dw = (1 / n_samples) * np.dot(X.T, (y_pred - y_onehot))
            db = (1 / n_samples) * np.sum(y_pred - y_onehot, axis=0)

            self.weights -= self.lr * dw
            self.bias -= self.lr * db
        return self

    def predict(self, X):
        z = np.dot(X, self.weights) + self.bias
        y_pred = self.softmax(z)
        return np.argmax(y_pred, axis=1)
```
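
As an assumed usage example (my addition, not from the original post), the class can be sanity-checked on scikit-learn's iris data, whose three classes are labeled 0-2 as the one-hot encoding above expects:

```python
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler

iris = load_iris()
# Standardize features so one learning rate suits all of them
X_iris = StandardScaler().fit_transform(iris.data)

softmax_model = SoftmaxRegression(lr=0.1, n_iters=2000).fit(X_iris, iris.target)
preds = softmax_model.predict(X_iris)
print("training accuracy:", np.mean(preds == iris.target))
```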

Model Evaluation Metrics

Accuracy alone can be misleading on imbalanced data, so precision, recall, and F1 are reported alongside it. The snippet below adds a train/test split so the metrics are computed on held-out data:

```python
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score,
                             recall_score, f1_score,
                             confusion_matrix, classification_report)

# Hold out a test set and refit, so the metrics reflect unseen data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=42)
model = LogisticRegression(lr=0.1, n_iters=1000).fit(X_train, y_train)

y_pred = model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
```

ROC Curve and AUC

The ROC curve sweeps the decision threshold and plots the true positive rate against the false positive rate; the area under it (AUC) summarizes ranking quality in a single number:

```python
from sklearn.metrics import roc_curve, auc

# predict_proba of our implementation returns P(y=1), which roc_curve expects
y_prob = model.predict_proba(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()
```
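
The `thresholds` array returned by `roc_curve` can also guide the choice of the decision cutoff. One common heuristic, Youden's J statistic (my addition, not part of the original post), picks the threshold maximizing TPR - FPR:

```python
# Youden's J: the threshold where TPR - FPR is largest
best_idx = np.argmax(tpr - fpr)
best_threshold = thresholds[best_idx]
print(f"best threshold by Youden's J: {best_threshold:.3f}")

# Use it in place of the default 0.5 cutoff
y_pred_tuned = model.predict(X_test, threshold=best_threshold)
```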

Summary

Logistic regression is the go-to baseline for binary classification, producing probability outputs through the sigmoid function. It is simple, efficient, and highly interpretable, which makes it ubiquitous in real-world applications. Regularization guards against overfitting, and OvR or softmax extends it to multi-class problems. Understanding logistic regression lays the groundwork for more complex classification algorithms.
