Logistic Regression Classifier

Referring to the linear SVM classifier implementation, implement a logistic regression classifier and visualize its decision boundary.

Classifier Implementation

# -*- coding: utf-8 -*-

# @Time : 19-7-16 10:24 AM
# @Author : zj


import numpy as np


class LogisticClassifier(object):

    def __init__(self):
        self.W = None
        self.b = None

        self.lr = None
        self.reg = None

    def train(self, X, y, learning_rate=1e-3, reg=1e-5, num_iters=100, batch_size=200, verbose=False):
        """
        Inputs:
        - X: A numpy array of shape (N, D) containing training data; there are N
          training samples each of dimension D.
        - y: A numpy array of shape (N,) containing training labels; y[i] is 0 or 1
          since this is a binary classifier.
        - learning_rate: (float) learning rate for optimization.
        - reg: (float) regularization strength.
        - num_iters: (integer) number of steps to take when optimizing.
        - batch_size: (integer) number of training examples to use at each step.
        - verbose: (boolean) If true, print progress during optimization.

        Outputs:
        A list containing the value of the loss function at each training iteration.
        """
        self.lr = learning_rate
        self.reg = reg

        num_train, dim = X.shape
        if self.W is None:
            # lazily initialize W and b
            self.W = 0.001 * np.random.randn(dim)
            self.b = np.zeros(1)

        # Run stochastic gradient descent to optimize W and b
        loss_history = []
        for it in range(num_iters):
            # sample a minibatch (with replacement)
            indices = np.random.choice(num_train, batch_size)
            X_batch = X[indices]
            y_batch = y[indices]

            # evaluate loss and gradients
            loss, dW, db = self.loss(X_batch, y_batch, reg)
            loss_history.append(loss)

            # parameter update
            self.W -= learning_rate * dW
            self.b -= learning_rate * db

            if verbose and it % 100 == 0:
                print('iteration %d / %d: loss %f' % (it, num_iters, loss))

        return loss_history

    def predict(self, X):
        """
        Use the trained weights of this linear classifier to predict labels for
        data points.

        Inputs:
        - X: A numpy array of shape (N, D) containing data; there are N
          samples each of dimension D.

        Returns:
        - y_pred: Predicted labels for the data in X. y_pred is a 1-dimensional
          array of length N, and each element is an integer (0 or 1) giving the
          predicted class.
        """
        scores = self.logistic_regression(X)
        y_pred = (scores > 0.5).astype(np.uint8)

        return y_pred

    def loss(self, X_batch, y_batch, reg):
        """
        Compute the cross-entropy loss and its derivatives.

        Inputs:
        - X_batch: A numpy array of shape (N, D) containing a minibatch of N
          data points; each point has dimension D.
        - y_batch: A numpy array of shape (N,) containing labels for the minibatch.
        - reg: (float) regularization strength.

        Returns: A tuple containing:
        - loss as a single float
        - gradient with respect to self.W; an array of the same shape as W
        - gradient with respect to self.b; a single float
        """
        epsilon = 1e-5
        num_train = X_batch.shape[0]

        scores = self.logistic_regression(X_batch)
        # binary cross-entropy loss; clamp the log arguments to avoid log(0)
        data_loss = -1.0 / num_train * \
            np.sum(y_batch * np.log(np.maximum(scores, epsilon))
                   + (1 - y_batch) * np.log(np.maximum(1 - scores, epsilon)))
        reg_loss = 0.5 * reg * np.sum(self.W ** 2)

        loss = data_loss + reg_loss

        # gradients with respect to the scores, weights and bias
        dscores = scores - y_batch
        dscores /= num_train
        dW = X_batch.T.dot(dscores) + reg * self.W
        db = np.sum(dscores)

        return loss, dW, db

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-1 * x))

    def logistic_regression(self, x):
        """
        :param x: A numpy array of shape (N, D)
        :return: A numpy array of shape (N) containing sigmoid(x.dot(W) + b)
        """
        z = x.dot(self.W) + self.b
        return self.sigmoid(z)
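The only non-obvious step in loss() is the compact gradient dscores = scores - y_batch. For reference, writing \(a = \sigma(z)\) for the sigmoid output of one sample with label \(y\), the per-sample cross-entropy loss and its gradient with respect to the linear score \(z\) are

\[ L = -\big(y\log a + (1-y)\log(1-a)\big) \Rightarrow \frac{\partial L}{\partial z} = a - y \]

using \(\frac{da}{dz} = a(1-a)\). Averaging over the \(N\) samples of the minibatch and applying the chain rule through \(z = Xw + b\), plus the \(0.5\cdot reg\cdot\left \| W \right \|^{2}\) term, gives \(dW = \frac{1}{N}X^{T}(a - y) + reg\cdot W\) and \(db = \frac{1}{N}\sum_{i}(a_{i} - y_{i})\), which is exactly what the code above computes.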

Decision Boundary

The forward pass of logistic regression is a linear mapping followed by the sigmoid function; in two dimensions its decision boundary is the line

\[ x_{1}w_{1} + x_{2}w_{2} + b = 0 \Rightarrow x_{2} = -\frac{x_{1}w_{1} + b}{w_{2}} \]
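This boundary follows from predict(), which assigns class 1 whenever the sigmoid output exceeds 0.5:

\[ \sigma(z) = \frac{1}{1 + e^{-z}} > 0.5 \Leftrightarrow z = x_{1}w_{1} + x_{2}w_{2} + b > 0 \]

so the boundary is exactly the line \(z = 0\) rearranged above; plot_v2() in the training script below draws it from the learned W and b.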

Test Data

Generate a two-dimensional, linearly separable dataset; the training set has shape \(200\times 2\) and the test set has shape \(40\times 2\).

def two_cate_linear():
    # class 0: 120 points sampled from the square [20, 40] x [20, 40]
    x1 = np.linspace(20, 40, num=200)[np.random.choice(200, 120)]
    y1 = np.linspace(20, 40, num=200)[np.random.choice(200, 120)]

    # class 1: 120 points sampled from the square [-10, 10] x [-10, 10]
    x2 = np.linspace(-10, 10, num=200)[np.random.choice(200, 120)]
    y2 = np.linspace(-10, 10, num=200)[np.random.choice(200, 120)]

    x = np.vstack((np.concatenate((x1, x2)), np.concatenate((y1, y2)))).T
    y = np.concatenate((np.zeros(120), np.ones(120)))

    # shuffle x and y with the same seed so samples and labels stay aligned
    np.random.seed(120)
    np.random.shuffle(x)
    np.random.seed(120)
    np.random.shuffle(y)

    # first 200 samples for training, remaining 40 for testing
    return x[:200], x[200:], y[:200], y[200:]
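To sanity-check the generated data before training, a quick scatter plot of the two clusters helps. This is a minimal sketch, not part of the original scripts, that reuses two_cate_linear() from above:

import matplotlib.pyplot as plt

x_train, x_test, y_train, y_test = two_cate_linear()

# color the training samples by their binary label (0 or 1)
plt.scatter(x_train[:, 0], x_train[:, 1], c=y_train, cmap=plt.cm.coolwarm)
plt.xlabel('x1')
plt.ylabel('x2')
plt.title('two_cate_linear training data')
plt.show()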

Model Training

Cross-validate over the learning rate and the regularization strength to search for the best combination of the two hyperparameters.

# -*- coding: utf-8 -*-

# @Time : 19-7-16 10:38 AM
# @Author : zj


from lr_classifier import LogisticClassifier
import numpy as np
import math
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")


def two_cate_linear():
    x1 = np.linspace(20, 40, num=200)[np.random.choice(200, 120)]
    y1 = np.linspace(20, 40, num=200)[np.random.choice(200, 120)]

    x2 = np.linspace(-10, 10, num=200)[np.random.choice(200, 120)]
    y2 = np.linspace(-10, 10, num=200)[np.random.choice(200, 120)]

    x = np.vstack((np.concatenate((x1, x2)), np.concatenate((y1, y2)))).T
    y = np.concatenate((np.zeros(120), np.ones(120)))

    np.random.seed(120)
    np.random.shuffle(x)
    np.random.seed(120)
    np.random.shuffle(y)

    return x[:200], x[200:], y[:200], y[200:]


def cross_validation(x_train, y_train, x_val, y_val, lr_choices, reg_choices, classifier=LogisticClassifier):
    results = {}
    best_val = -1  # The highest validation accuracy that we have seen so far.
    best_clf = None  # The classifier object that achieved the highest validation accuracy.

    for lr in lr_choices:
        for reg in reg_choices:
            clf = classifier()

            clf.train(x_train, y_train, learning_rate=lr, reg=reg, num_iters=2000, batch_size=30, verbose=True)
            y_train_pred = clf.predict(x_train)
            y_val_pred = clf.predict(x_val)

            train_acc = np.mean(y_train_pred == y_train)
            val_acc = np.mean(y_val_pred == y_val)

            results[(lr, reg)] = (train_acc, val_acc)
            if best_val < val_acc:
                best_val = val_acc
                best_clf = clf

    return results, best_clf, best_val


def compute_accuracy(y, y_pred):
    num = y.shape[0]
    num_correct = np.sum(y_pred == y)
    acc = float(num_correct) / num
    return acc


def plot(results):
    # Visualize the cross-validation results
    x_scatter = [math.log10(x[0]) for x in results]
    y_scatter = [math.log10(x[1]) for x in results]

    # plot training accuracy
    marker_size = 100
    colors = [results[x][0] for x in results]
    plt.subplot(2, 1, 1)
    plt.scatter(x_scatter, y_scatter, marker_size, c=colors, cmap=plt.cm.coolwarm)
    plt.colorbar()
    plt.xlabel('log learning rate')
    plt.ylabel('log regularization strength')
    plt.title('training accuracy')

    # plot validation accuracy
    colors = [results[x][1] for x in results]  # default size of markers is 20
    plt.subplot(2, 1, 2)
    plt.scatter(x_scatter, y_scatter, marker_size, c=colors, cmap=plt.cm.coolwarm)
    plt.colorbar()
    plt.xlabel('log learning rate')
    plt.ylabel('log regularization strength')
    plt.title('validation accuracy')
    plt.show()


def plot_v2(x, w, b):
    # scatter the data points and overlay the learned decision boundary
    plt.scatter(x[:, 0], x[:, 1])

    x = np.linspace(-10, 40, num=200)
    y = (-x * w[0] - b) / w[1]
    plt.plot(x, y)

    plt.show()


if __name__ == '__main__':
    x_train, x_test, y_train, y_test = two_cate_linear()

    lr_choices = [1e-3, 2.5e-3, 5e-3, 7.5e-3, 1e-2, 2.5e-2]
    reg_choices = [8e-6, 1e-5, 2.5e-5, 5e-5, 7.5e-5, 1e-4]
    results, best_classifier, best_val = cross_validation(x_train, y_train, x_test, y_test, lr_choices, reg_choices)

    plot(results)
    plot_v2(x_test, best_classifier.W, best_classifier.b)

    for k in results.keys():
        lr, reg = k
        train_acc, val_acc = results[k]
        print('lr = %f, reg = %f, train_acc = %f, val_acc = %f' % (lr, reg, train_acc, val_acc))

    print('Best setting: lr = %f, reg = %f' % (best_classifier.lr, best_classifier.reg))
    print('Best test accuracy: %f' % best_val)

The training output is shown below; the model classifies the test data with 100% accuracy.

Best setting: lr = 0.005000, reg = 0.000008
Best test accuracy: 1.000000
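With the best hyperparameters found, the trained classifier can be applied directly to new points. A minimal sketch that could be appended to the __main__ block above; the two query coordinates are made up for illustration:

# hypothetical query points: one inside each cluster
new_points = np.array([[30.0, 30.0],
                       [0.0, 0.0]])

# predict() thresholds the sigmoid score at 0.5 and returns 0 or 1,
# so a well-trained model should output [0 1] here
print(best_classifier.predict(new_points))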
