Logistic Regression Classifier

Referring to the linear SVM classifier implementation, implement a logistic regression classifier and visualize its decision boundary.

Classifier Implementation

# -*- coding: utf-8 -*-

# @Time : 19-7-16 10:24 AM
# @Author : zj


import numpy as np


class LogisticClassifier(object):

    def __init__(self):
        self.W = None
        self.b = None

        self.lr = None
        self.reg = None

    def train(self, X, y, learning_rate=1e-3, reg=1e-5, num_iters=100, batch_size=200, verbose=False):
        """
        Inputs:
        - X: A numpy array of shape (N, D) containing training data; there are N
          training samples each of dimension D.
        - y: A numpy array of shape (N,) containing training labels; y[i] is 0 or 1
          since this is a binary classifier.
        - learning_rate: (float) learning rate for optimization.
        - reg: (float) regularization strength.
        - num_iters: (integer) number of steps to take when optimizing.
        - batch_size: (integer) number of training examples to use at each step.
        - verbose: (boolean) If true, print progress during optimization.

        Outputs:
        A list containing the value of the loss function at each training iteration.
        """
        self.lr = learning_rate
        self.reg = reg

        num_train, dim = X.shape
        if self.W is None:
            # lazily initialize W and b
            self.W = 0.001 * np.random.randn(dim)
            self.b = np.zeros(1)

        # Run stochastic gradient descent to optimize W and b
        loss_history = []
        for it in range(num_iters):
            # sample a minibatch (with replacement)
            indices = np.random.choice(num_train, batch_size)
            X_batch = X[indices]
            y_batch = y[indices]

            # evaluate loss and gradients
            loss, dW, db = self.loss(X_batch, y_batch, reg)
            loss_history.append(loss)

            # parameter update
            self.W -= learning_rate * dW
            self.b -= learning_rate * db

            if verbose and it % 100 == 0:
                print('iteration %d / %d: loss %f' % (it, num_iters, loss))

        return loss_history

    def predict(self, X):
        """
        Use the trained weights of this linear classifier to predict labels for
        data points.

        Inputs:
        - X: A numpy array of shape (N, D) containing data; there are N
          samples each of dimension D.

        Returns:
        - y_pred: Predicted labels for the data in X. y_pred is a 1-dimensional
          array of length N, and each element is an integer (0 or 1) giving the
          predicted class.
        """
        scores = self.logistic_regression(X)
        y_pred = (scores > 0.5).astype(np.uint8)

        return y_pred

    def loss(self, X_batch, y_batch, reg):
        """
        Compute the cross-entropy loss and its derivatives.

        Inputs:
        - X_batch: A numpy array of shape (N, D) containing a minibatch of N
          data points; each point has dimension D.
        - y_batch: A numpy array of shape (N,) containing labels for the minibatch.
        - reg: (float) regularization strength.

        Returns: A tuple containing:
        - loss as a single float
        - gradient with respect to self.W; an array of the same shape as W
        - gradient with respect to self.b; a single float
        """
        epsilon = 1e-5
        num_train = X_batch.shape[0]

        scores = self.logistic_regression(X_batch)
        # binary cross-entropy loss; clamp the log arguments to avoid log(0)
        data_loss = -1.0 / num_train * \
            np.sum(y_batch * np.log(np.maximum(scores, epsilon))
                   + (1 - y_batch) * np.log(np.maximum(1 - scores, epsilon)))
        reg_loss = 0.5 * reg * np.sum(self.W ** 2)

        loss = data_loss + reg_loss

        # gradients with respect to the scores, weights and bias
        dscores = scores - y_batch
        dscores /= num_train
        dW = X_batch.T.dot(dscores) + reg * self.W
        db = np.sum(dscores)

        return loss, dW, db

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-1 * x))

    def logistic_regression(self, x):
        """
        :param x: A numpy array of shape (N, D)
        :return: A numpy array of shape (N) containing sigmoid(x.dot(W) + b)
        """
        z = x.dot(self.W) + self.b
        return self.sigmoid(z)
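The only non-obvious step in loss() is the compact gradient dscores = scores - y_batch. For reference, writing \(a = \sigma(z)\) for the sigmoid output of one sample with label \(y\), the per-sample cross-entropy loss and its gradient with respect to the linear score \(z\) are

\[ L = -\big(y\log a + (1-y)\log(1-a)\big) \Rightarrow \frac{\partial L}{\partial z} = a - y \]

using \(\frac{da}{dz} = a(1-a)\). Averaging over the \(N\) samples of the minibatch and applying the chain rule through \(z = Xw + b\), plus the \(0.5\cdot reg\cdot\left \| W \right \|^{2}\) term, gives \(dW = \frac{1}{N}X^{T}(a - y) + reg\cdot W\) and \(db = \frac{1}{N}\sum_{i}(a_{i} - y_{i})\), which is exactly what the code above computes.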

Decision Boundary

The forward pass of logistic regression is a linear mapping followed by the sigmoid function; in two dimensions its decision boundary is the line

\[ x_{1}w_{1} + x_{2}w_{2} + b = 0 \Rightarrow x_{2} = -\frac{x_{1}w_{1} + b}{w_{2}} \]
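This boundary follows from predict(), which assigns class 1 whenever the sigmoid output exceeds 0.5:

\[ \sigma(z) = \frac{1}{1 + e^{-z}} > 0.5 \Leftrightarrow z = x_{1}w_{1} + x_{2}w_{2} + b > 0 \]

so the boundary is exactly the line \(z = 0\) rearranged above; plot_v2() in the training script below draws it from the learned W and b.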

Test Data

Generate a two-dimensional, linearly separable dataset; the training set has shape \(200\times 2\) and the test set has shape \(40\times 2\).

def two_cate_linear():
    # class 0: 120 points sampled from the square [20, 40] x [20, 40]
    x1 = np.linspace(20, 40, num=200)[np.random.choice(200, 120)]
    y1 = np.linspace(20, 40, num=200)[np.random.choice(200, 120)]

    # class 1: 120 points sampled from the square [-10, 10] x [-10, 10]
    x2 = np.linspace(-10, 10, num=200)[np.random.choice(200, 120)]
    y2 = np.linspace(-10, 10, num=200)[np.random.choice(200, 120)]

    x = np.vstack((np.concatenate((x1, x2)), np.concatenate((y1, y2)))).T
    y = np.concatenate((np.zeros(120), np.ones(120)))

    # shuffle x and y with the same seed so samples and labels stay aligned
    np.random.seed(120)
    np.random.shuffle(x)
    np.random.seed(120)
    np.random.shuffle(y)

    # first 200 samples for training, remaining 40 for testing
    return x[:200], x[200:], y[:200], y[200:]
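To sanity-check the generated data before training, a quick scatter plot of the two clusters helps. This is a minimal sketch, not part of the original scripts, that reuses two_cate_linear() from above:

import matplotlib.pyplot as plt

x_train, x_test, y_train, y_test = two_cate_linear()

# color the training samples by their binary label (0 or 1)
plt.scatter(x_train[:, 0], x_train[:, 1], c=y_train, cmap=plt.cm.coolwarm)
plt.xlabel('x1')
plt.ylabel('x2')
plt.title('two_cate_linear training data')
plt.show()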

Model Training

Cross-validate over the learning rate and the regularization strength to search for the best combination of the two hyperparameters.

# -*- coding: utf-8 -*-

# @Time : 19-7-16 10:38 AM
# @Author : zj


from lr_classifier import LogisticClassifier
import numpy as np
import math
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")


def two_cate_linear():
    x1 = np.linspace(20, 40, num=200)[np.random.choice(200, 120)]
    y1 = np.linspace(20, 40, num=200)[np.random.choice(200, 120)]

    x2 = np.linspace(-10, 10, num=200)[np.random.choice(200, 120)]
    y2 = np.linspace(-10, 10, num=200)[np.random.choice(200, 120)]

    x = np.vstack((np.concatenate((x1, x2)), np.concatenate((y1, y2)))).T
    y = np.concatenate((np.zeros(120), np.ones(120)))

    np.random.seed(120)
    np.random.shuffle(x)
    np.random.seed(120)
    np.random.shuffle(y)

    return x[:200], x[200:], y[:200], y[200:]


def cross_validation(x_train, y_train, x_val, y_val, lr_choices, reg_choices, classifier=LogisticClassifier):
    results = {}
    best_val = -1  # The highest validation accuracy that we have seen so far.
    best_clf = None  # The classifier object that achieved the highest validation accuracy.

    for lr in lr_choices:
        for reg in reg_choices:
            clf = classifier()

            clf.train(x_train, y_train, learning_rate=lr, reg=reg, num_iters=2000, batch_size=30, verbose=True)
            y_train_pred = clf.predict(x_train)
            y_val_pred = clf.predict(x_val)

            train_acc = np.mean(y_train_pred == y_train)
            val_acc = np.mean(y_val_pred == y_val)

            results[(lr, reg)] = (train_acc, val_acc)
            if best_val < val_acc:
                best_val = val_acc
                best_clf = clf

    return results, best_clf, best_val


def compute_accuracy(y, y_pred):
    num = y.shape[0]
    num_correct = np.sum(y_pred == y)
    acc = float(num_correct) / num
    return acc


def plot(results):
    # Visualize the cross-validation results
    x_scatter = [math.log10(x[0]) for x in results]
    y_scatter = [math.log10(x[1]) for x in results]

    # plot training accuracy
    marker_size = 100
    colors = [results[x][0] for x in results]
    plt.subplot(2, 1, 1)
    plt.scatter(x_scatter, y_scatter, marker_size, c=colors, cmap=plt.cm.coolwarm)
    plt.colorbar()
    plt.xlabel('log learning rate')
    plt.ylabel('log regularization strength')
    plt.title('training accuracy')

    # plot validation accuracy
    colors = [results[x][1] for x in results]  # default size of markers is 20
    plt.subplot(2, 1, 2)
    plt.scatter(x_scatter, y_scatter, marker_size, c=colors, cmap=plt.cm.coolwarm)
    plt.colorbar()
    plt.xlabel('log learning rate')
    plt.ylabel('log regularization strength')
    plt.title('validation accuracy')
    plt.show()


def plot_v2(x, w, b):
    # scatter the data points and overlay the learned decision boundary
    plt.scatter(x[:, 0], x[:, 1])

    x = np.linspace(-10, 40, num=200)
    y = (-x * w[0] - b) / w[1]
    plt.plot(x, y)

    plt.show()


if __name__ == '__main__':
    x_train, x_test, y_train, y_test = two_cate_linear()

    lr_choices = [1e-3, 2.5e-3, 5e-3, 7.5e-3, 1e-2, 2.5e-2]
    reg_choices = [8e-6, 1e-5, 2.5e-5, 5e-5, 7.5e-5, 1e-4]
    results, best_classifier, best_val = cross_validation(x_train, y_train, x_test, y_test, lr_choices, reg_choices)

    plot(results)
    plot_v2(x_test, best_classifier.W, best_classifier.b)

    for k in results.keys():
        lr, reg = k
        train_acc, val_acc = results[k]
        print('lr = %f, reg = %f, train_acc = %f, val_acc = %f' % (lr, reg, train_acc, val_acc))

    print('Best setting: lr = %f, reg = %f' % (best_classifier.lr, best_classifier.reg))
    print('Best test accuracy: %f' % best_val)

The training output is shown below; the model classifies the test data with 100% accuracy.

Best setting: lr = 0.005000, reg = 0.000008
Best test accuracy: 1.000000
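With the best hyperparameters found, the trained classifier can be applied directly to new points. A minimal sketch that could be appended to the __main__ block above; the two query coordinates are made up for illustration:

# hypothetical query points: one inside each cluster
new_points = np.array([[30.0, 30.0],
                       [0.0, 0.0]])

# predict() thresholds the sigmoid score at 0.5 and returns 0 or 1,
# so a well-trained model should output [0 1] here
print(best_classifier.predict(new_points))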
