# -*- coding: utf-8 -*-
import numpy as np

# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets, drawn from a standard normal.
x, y = np.random.randn(N, D_in), np.random.randn(N, D_out)

# Both weight matrices start as standard-normal noise; the net has no biases.
w1, w2 = np.random.randn(D_in, H), np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squares loss over every element of the batch.
    loss = np.sum(np.square(y_pred - y))
    print(t, loss)

    # Backward pass: apply the chain rule layer by layer.
    grad_y_pred = (y_pred - y) * 2.0
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU blocks the gradient wherever its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Plain gradient-descent step, updating the weights in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

第一步：定义网络参数

批量数据大小 $N = 64$
输入层神经元个数 $D_{in} = 1000$
隐藏层神经元个数 $H = 100$
输出层神经元个数 $D_{out} = 10$

# Network sizes used throughout the walkthrough.
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

第二步：初始化数据、权重（该网络没有偏置向量）以及学习率

输入数据 $x \in R^{N \times D_{in}}$
输出数据 $y \in R^{N \times D_{out}}$
隐藏层权重矩阵 $w1 \in R^{D_{in} \times H}$
输出层权重矩阵 $w2 \in R^{H \times D_{out}}$

# Random inputs and regression targets, drawn from a standard normal.
x, y = np.random.randn(N, D_in), np.random.randn(N, D_out)

# Both weight matrices start as standard-normal noise (no bias terms).
w1, w2 = np.random.randn(D_in, H), np.random.randn(H, D_out)

# Step size for the gradient-descent update.
learning_rate = 1e-6

第三步：迭代计算，输入批量数据到神经网络，进行前向传播

$h = x \cdot w1$

$h_{relu} = \max(0, h)$

$y_{pred} = h_{relu} \cdot w2$

# Forward pass: hidden pre-activation, ReLU, then the linear output layer.
h = np.dot(x, w1)
h_relu = np.maximum(h, 0)
y_pred = np.dot(h_relu, w2)

第四步：迭代计算，计算损失函数（误差平方和，即 L2 范数（Frobenius 范数）的平方）

$\mathit{loss} = \| y_{pred} - y \|^{2}$

# Compute and print loss: the sum of squared errors over the whole batch.
loss = np.square(y_pred - y).sum()
print(t, loss)

第五步：迭代计算，反向传播，计算输出层输入向量梯度

设 $y_{pred} - y = X$ ，$X$ 大小为 $N \times D_{out}$ ，则

$\mathit{loss} = \| X \|^{2} = \mathrm{vec}(X)^{T} \cdot \mathrm{vec}(X)$

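对损失函数 $\mathit{loss}(y_{pred})$ 求输出层输入向量的微分

$$
\begin{aligned}
d\,\mathit{loss} &= d(\mathrm{tr}(\mathit{loss})) = \mathrm{tr}(d\,\mathit{loss}) = \mathrm{tr}\big(d(\mathrm{vec}(X)^{T} \cdot \mathrm{vec}(X))\big) \\
&= \mathrm{tr}\big(d(\mathrm{vec}(X)^{T}) \cdot \mathrm{vec}(X) + \mathrm{vec}(X)^{T} \cdot d\,\mathrm{vec}(X)\big) \\
&= \mathrm{tr}\big(d(\mathrm{vec}(X)^{T}) \cdot \mathrm{vec}(X)\big) + \mathrm{tr}\big(\mathrm{vec}(X)^{T} \cdot d\,\mathrm{vec}(X)\big) \\
&= \mathrm{tr}\big((d\,\mathrm{vec}(X))^{T} \cdot \mathrm{vec}(X)\big) + \mathrm{tr}\big(\mathrm{vec}(X)^{T} \cdot d\,\mathrm{vec}(X)\big) \\
&= \mathrm{tr}\big(\mathrm{vec}(X)^{T} \cdot d\,\mathrm{vec}(X)\big) + \mathrm{tr}\big(\mathrm{vec}(X)^{T} \cdot d\,\mathrm{vec}(X)\big) \\
&= \mathrm{tr}\big(2\,\mathrm{vec}(X)^{T} \cdot d\,\mathrm{vec}(X)\big) \\
&= \mathrm{tr}(2 X^{T} \cdot dX)
\end{aligned}
$$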

所以 Jacobian 矩阵为 $D_{X} f(X) = 2 X^{T}$ ，梯度矩阵为 $\nabla_{X} f(X) = 2 X = 2 (y_{pred} - y)$

# Gradient of the squared-error loss with respect to the network output.
grad_y_pred = 2.0 * (y_pred - y)

第六步：迭代计算，反向传播，计算输出层权重向量以及隐藏层输出向量梯度

$y_{pred} = h_{relu} \cdot w2 \Rightarrow d y_{pred} = d h_{relu} \cdot w2 + h_{relu} \cdot d w2$

$$
\begin{aligned}
d\,\mathit{loss} &= \mathrm{tr}(2 X^{T} \cdot dX) = \mathrm{tr}\big(2 (y_{pred} - y)^{T} \cdot d(y_{pred} - y)\big) = \mathrm{tr}\big(2 (y_{pred} - y)^{T} \cdot d y_{pred}\big) \\
&= \mathrm{tr}\big(2 (y_{pred} - y)^{T} \cdot (d h_{relu} \cdot w2 + h_{relu} \cdot d w2)\big) \\
&= \mathrm{tr}\big(2 (y_{pred} - y)^{T} \cdot d h_{relu} \cdot w2\big) + \mathrm{tr}\big(2 (y_{pred} - y)^{T} \cdot h_{relu} \cdot d w2\big) \\
&= \mathrm{tr}\big(w2 \cdot 2 (y_{pred} - y)^{T} \cdot d h_{relu}\big) + \mathrm{tr}\big(2 (y_{pred} - y)^{T} \cdot h_{relu} \cdot d w2\big)
\end{aligned}
$$

输出层权重向量的 Jacobian 矩阵为 $2 (y_{pred} - y)^{T} \cdot h_{relu}$ ，梯度矩阵为 $(h_{relu})^{T} \cdot 2 (y_{pred} - y)$

隐藏层输出向量的 Jacobian 矩阵为 $w2 \cdot 2 (y_{pred} - y)^{T}$ ，梯度矩阵为 $2 (y_{pred} - y) \cdot (w2)^{T}$

# Gradient with respect to the output-layer weights.
grad_w2 = h_relu.T.dot(grad_y_pred)
# Gradient with respect to the hidden-layer output (the ReLU activations).
grad_h_relu = grad_y_pred.dot(w2.T)

第七步：迭代计算，反向传播，计算隐藏层输入向量梯度

$h_{relu} = \max(0, h) \Rightarrow d h_{relu} = \begin{cases} dh & h \geq 0 \\ 0 & h < 0 \end{cases} = 1(h \geq 0) * dh$

激活函数是逐个元素操作，所以使用Hadamard积

$$
\begin{aligned}
d\,\mathit{loss} &= \mathrm{tr}\big(w2 \cdot 2 (y_{pred} - y)^{T} \cdot d h_{relu}\big) \\
&= \mathrm{tr}\big(w2 \cdot 2 (y_{pred} - y)^{T} \cdot (1(h \geq 0) * dh)\big) \\
&= \mathrm{tr}\big((2 (y_{pred} - y) \cdot (w2)^{T})^{T} \cdot (1(h \geq 0) * dh)\big) \\
&= \mathrm{tr}\big(((2 (y_{pred} - y) \cdot (w2)^{T})^{T} * 1(h \geq 0)^{T}) \cdot dh\big)
\end{aligned}
$$

所以 Jacobian 矩阵为 $(2 (y_{pred} - y) \cdot (w2)^{T})^{T} * 1(h \geq 0)^{T}$ ，梯度矩阵为

$\nabla_{h} f(h) = 1(h \geq 0) * \big(2 (y_{pred} - y) \cdot (w2)^{T}\big)$

# Copy first so that grad_h_relu itself is left untouched.
grad_h = grad_h_relu.copy()
# ReLU blocks the gradient wherever its input was negative.
grad_h[h < 0] = 0

第八步：迭代计算，反向传播，计算隐藏层权重向量梯度

$h = x \cdot w1 \Rightarrow dh = x \cdot d w1$

$$
\begin{aligned}
d\,\mathit{loss} &= \mathrm{tr}\big(((2 (y_{pred} - y) \cdot (w2)^{T})^{T} * 1(h \geq 0)^{T}) \cdot dh\big) \\
&= \mathrm{tr}\big(((2 (y_{pred} - y) \cdot (w2)^{T})^{T} * 1(h \geq 0)^{T}) \cdot x \cdot d w1\big)
\end{aligned}
$$

所以 Jacobian 矩阵为 $\big((2 (y_{pred} - y) \cdot (w2)^{T})^{T} * 1(h \geq 0)^{T}\big) \cdot x$ ，梯度矩阵为

$\nabla_{w1} f(w1) = \Big(\big((2 (y_{pred} - y) \cdot (w2)^{T})^{T} * 1(h \geq 0)^{T}\big) \cdot x\Big)^{T} = x^{T} \cdot \big(1(h \geq 0) * (2 (y_{pred} - y) \cdot (w2)^{T})\big)$

# Gradient with respect to the hidden-layer weights.
grad_w1 = x.T.dot(grad_h)

第九步：迭代计算，反向传播，更新权重矩阵

# Update weights: one in-place gradient-descent step per weight matrix.
w1 -= learning_rate * grad_w1
w2 -= learning_rate * grad_w2

推导二

cs231n课程Putting it together: Minimal Neural Network Case Study中实现了一个2层神经网络

# Toy three-class "spiral" dataset plus a two-layer (ReLU + softmax) classifier
# trained with full-batch gradient descent.  Ported to Python 3 syntax:
# `xrange` -> `range`, and the `print` statement -> the `print()` function.
N = 100  # number of points per class
D = 2  # dimensionality
K = 3  # number of classes
X = np.zeros((N * K, D))  # data matrix (each row = single example)
y = np.zeros(N * K, dtype='uint8')  # class labels
for j in range(K):
    ix = np.arange(N * j, N * (j + 1))  # row indices belonging to class j
    r = np.linspace(0.0, 1, N)  # radius
    t = np.linspace(j * 4, (j + 1) * 4, N) + np.random.randn(N) * 0.2  # theta
    X[ix] = np.c_[r * np.sin(t), r * np.cos(t)]
    y[ix] = j

# initialize parameters randomly
h = 100  # size of hidden layer
W = 0.01 * np.random.randn(D, h)
b = np.zeros((1, h))
W2 = 0.01 * np.random.randn(h, K)
b2 = np.zeros((1, K))

# some hyperparameters
step_size = 1e-0
reg = 1e-3  # regularization strength

# gradient descent loop
num_examples = X.shape[0]
example_rows = np.arange(num_examples)  # loop-invariant row index for label lookup
for i in range(10000):

    # evaluate class scores, [N x K]
    hidden_layer = np.maximum(0, np.dot(X, W) + b)  # note, ReLU activation
    scores = np.dot(hidden_layer, W2) + b2

    # compute the class probabilities (row-wise softmax)
    exp_scores = np.exp(scores)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)  # [N x K]

    # compute the loss: average cross-entropy loss and regularization
    correct_logprobs = -np.log(probs[example_rows, y])
    data_loss = np.sum(correct_logprobs) / num_examples
    reg_loss = 0.5 * reg * np.sum(W * W) + 0.5 * reg * np.sum(W2 * W2)
    loss = data_loss + reg_loss
    if i % 1000 == 0:
        print("iteration %d: loss %f" % (i, loss))

    # compute the gradient on scores
    # NOTE: dscores aliases probs, so probs is overwritten in place below.
    dscores = probs
    dscores[example_rows, y] -= 1
    dscores /= num_examples

    # backpropagate the gradient to the parameters
    # first backprop into parameters W2 and b2
    dW2 = np.dot(hidden_layer.T, dscores)
    db2 = np.sum(dscores, axis=0, keepdims=True)
    # next backprop into hidden layer
    dhidden = np.dot(dscores, W2.T)
    # backprop the ReLU non-linearity
    dhidden[hidden_layer <= 0] = 0
    # finally into W,b
    dW = np.dot(X.T, dhidden)
    db = np.sum(dhidden, axis=0, keepdims=True)

    # add regularization gradient contribution
    dW2 += reg * W2
    dW += reg * W

    # perform a parameter update
    W += -step_size * dW
    b += -step_size * db
    W2 += -step_size * dW2
    b2 += -step_size * db2

第一步：设置批量输入数据和输出数据

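每类数据点个数 $N = 100$（样本总数为 $N \times K = 300$，即代码中的 num_examples；后文推导中 $\frac{1}{N}$ 里的 $N$ 指样本总数）
数据维数 $D = 2$
类别数 $K = 3$
输入数据 $X \in R^{(N \cdot K) \times D}$
输出数据（类别标签） $y \in \{0, \ldots, K - 1\}^{N \cdot K}$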

# Build the toy spiral dataset: K classes, N points each, D features per point.
# (`xrange` is Python 2 only; `range` is used here.)
N = 100  # number of points per class
D = 2  # dimensionality
K = 3  # number of classes
X = np.zeros((N * K, D))  # data matrix (each row = single example)
y = np.zeros(N * K, dtype='uint8')  # class labels
for j in range(K):
    ix = np.arange(N * j, N * (j + 1))  # row indices belonging to class j
    r = np.linspace(0.0, 1, N)  # radius
    t = np.linspace(j * 4, (j + 1) * 4, N) + np.random.randn(N) * 0.2  # theta
    X[ix] = np.c_[r * np.sin(t), r * np.cos(t)]
    y[ix] = j

第二步：初始化权重参数

隐藏层神经元个数 $h = 100$
隐藏层权重矩阵 $W \in R^{D \times h}$
隐藏层偏置向量 $b \in R^{1 \times h}$
输出层权重矩阵 $W2 \in R^{h \times K}$
输出层偏置向量 $b2 \in R^{1 \times K}$

# Parameter initialisation: small Gaussian weights, zero biases.
h = 100  # number of hidden units
W = np.random.randn(D, h) * 0.01
b = np.zeros((1, h))
W2 = np.random.randn(h, K) * 0.01
b2 = np.zeros((1, K))

第三步：设置学习率和正则化强度

# some hyperparameters
step_size = 1e-0  # gradient-descent step size
reg = 1e-3  # regularization strength

第四步：迭代计算，输入批量数据到神经网络，进行前向传播

$\mathit{hiddenLayer} = \max(X \cdot W + b, 0)$

$\mathit{scores} = \mathit{hiddenLayer} \cdot W2 + b2$

# evaluate class scores, [N x K]
hidden_layer = np.maximum(0, np.dot(X, W) + b)  # note, ReLU activation
scores = np.dot(hidden_layer, W2) + b2

第四步：迭代计算，计算损失值

$\mathit{expScores} = \exp(\mathit{scores})$

$\mathit{probs} = \frac{\mathit{expScores}}{\mathit{expScores} \cdot 1}$

$\mathit{correctLogProbs} = - \ln \mathit{probs}_{y} \in R^{N \times 1}$

$\mathit{dataLoss} = \frac{1}{N} 1^{T} \cdot \mathit{correctLogProbs}$

$\mathit{regLoss} = 0.5 \cdot \mathit{reg} \cdot \| W \|^{2} + 0.5 \cdot \mathit{reg} \cdot \| W2 \|^{2}$

$\mathit{loss} = \mathit{dataLoss} + \mathit{regLoss}$

$1$ 表示求和向量： $[1, 1, \ldots]^{T}$

$\mathit{probs}_{y}$ 表示每行正确类别的概率

# compute the class probabilities (row-wise softmax)
exp_scores = np.exp(scores)
probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True) # [N x K]

# compute the loss: average cross-entropy loss and regularization
correct_logprobs = -np.log(probs[range(num_examples),y])
data_loss = np.sum(correct_logprobs)/num_examples
reg_loss = 0.5*reg*np.sum(W*W) + 0.5*reg*np.sum(W2*W2)
loss = data_loss + reg_loss
if i % 1000 == 0:
    # print() as a function call works on Python 3; the statement form does not.
    print("iteration %d: loss %f" % (i, loss))

第五步：迭代计算，反向传播，计算输出层输入向量梯度

$\mathit{scores}_{y} = (\mathit{scores} * Y) \cdot 1$

$\mathit{expscores}_{y} = \exp((\mathit{scores} * Y) \cdot 1)$

$\mathit{expscores} = \exp(\mathit{scores})$

$\mathit{expscores}_{sum} = \exp(\mathit{scores}) \cdot 1$

$\mathit{probs}_{y} = \frac{\mathit{expscores}_{y}}{\mathit{expscores}_{sum}}$

$\mathit{probs} = \frac{\mathit{expscores}}{\mathit{expscores}_{sum}}$

$$
\begin{aligned}
\mathit{dataloss} &= - \frac{1}{N} 1^{T} \cdot \ln(\mathit{probs}_{y}) = - \frac{1}{N} 1^{T} \cdot \ln \frac{\mathit{expscores}_{y}}{\mathit{expscores}_{sum}} \\
&= - \frac{1}{N} 1^{T} \cdot (\ln \mathit{expscores}_{y} - \ln \mathit{expscores}_{sum}) \\
&= - \frac{1}{N} 1^{T} \cdot ((\mathit{scores} * Y) \cdot 1 - \ln \mathit{expscores}_{sum})
\end{aligned}
$$

$$
\begin{aligned}
d(\mathit{dataloss}) &= \mathrm{tr}\Big(d\big(- \frac{1}{N} (1^{T} \cdot (\mathit{scores} * Y) \cdot 1 - 1^{T} \cdot \ln \mathit{expscores}_{sum})\big)\Big) \\
&= \mathrm{tr}\Big(d\big(- \frac{1}{N} (1^{T} \cdot (\mathit{scores} * Y) \cdot 1)\big)\Big) - \mathrm{tr}\Big(d\big(- \frac{1}{N} (1^{T} \cdot \ln \mathit{expscores}_{sum})\big)\Big)
\end{aligned}
$$

$$
\begin{aligned}
\mathrm{tr}\Big(d\big(- \frac{1}{N} (1^{T} \cdot (\mathit{scores} * Y) \cdot 1)\big)\Big) &= \mathrm{tr}\Big(- \frac{1}{N} (1^{T} \cdot (d\,\mathit{scores} * Y) \cdot 1)\Big) \\
&= \mathrm{tr}\Big(- \frac{1}{N} (d\,\mathit{scores}^{T} \cdot Y)\Big) \\
&= \mathrm{tr}\Big(- \frac{1}{N} Y^{T} \cdot d\,\mathit{scores}\Big)
\end{aligned}
$$

$$
\begin{aligned}
\mathrm{tr}\Big(d\big(- \frac{1}{N} (1^{T} \cdot \ln \mathit{expscores}_{sum})\big)\Big) &= \mathrm{tr}\Big(- \frac{1}{N} (1^{T} \cdot \mathit{expscores}_{sum}^{-1} \cdot d\,\mathit{expscores}_{sum})\Big) \\
&= \mathrm{tr}\Big(- \frac{1}{N} \frac{1^{T} \cdot d\,\mathit{expscores}_{sum}}{\mathit{expscores}_{sum}}\Big) \\
&= \mathrm{tr}\Big(- \frac{1}{N} \frac{1^{T} \cdot (\exp(\mathit{scores}) * d\,\mathit{scores}) \cdot 1}{\mathit{expscores}_{sum}}\Big) \\
&= \mathrm{tr}\Big(- \frac{1}{N} \frac{\exp(\mathit{scores})^{T} \cdot d\,\mathit{scores}}{\mathit{expscores}_{sum}}\Big) \\
&= \mathrm{tr}\Big(- \frac{1}{N} \big(\frac{\exp(\mathit{scores})}{\mathit{expscores}_{sum}}\big)^{T} \cdot d\,\mathit{scores}\Big) \\
&= \mathrm{tr}\Big(- \frac{1}{N} \mathit{probs}^{T} \cdot d\,\mathit{scores}\Big)
\end{aligned}
$$

$\Rightarrow d(\mathit{dataloss}) = \mathrm{tr}\big(- \frac{1}{N} Y^{T} \cdot d\,\mathit{scores}\big) - \mathrm{tr}\big(- \frac{1}{N} \mathit{probs}^{T} \cdot d\,\mathit{scores}\big) = \mathrm{tr}\big(\frac{1}{N} (\mathit{probs}^{T} - Y^{T}) \cdot d\,\mathit{scores}\big)$

所以 Jacobian 矩阵为 $D_{\mathit{scores}} f(\mathit{scores}) = \frac{1}{N} (\mathit{probs}^{T} - Y^{T})$ ，梯度矩阵为 $\nabla_{\mathit{scores}} f(\mathit{scores}) = \frac{1}{N} (\mathit{probs} - Y)$

$Y$ 大小为 $N \times K$ ，每行仅正确类别位置为1，其余为0
$1$ 是求和向量， $[1, 1, \ldots]^{T}$

计算softmax分类的交叉熵损失关于输出层输入向量梯度，这一部分想了好久，主要问题是关于矩阵除法和逐元素除法（标量除法）的分别，感觉还是先对单个数据进行求梯度再泛化比较方便

# compute the gradient on scores
# NOTE: `dscores` is an alias of `probs`, not a copy, so the in-place
# updates below also overwrite `probs`.
dscores = probs
# Subtract 1 in each row at the column given by that row's label in y.
dscores[range(num_examples),y] -= 1
# Divide every entry in place by num_examples.
dscores /= num_examples

第六步：迭代计算，反向传播，计算输出层权重矩阵、偏置向量以及隐藏层输出向量梯度

$\mathit{scores} = \mathit{hiddenLayer} \cdot W2 + b2$

$d\,\mathit{scores} = d\,\mathit{hiddenLayer} \cdot W2 + \mathit{hiddenLayer} \cdot d W2 + d b2$

$$
\begin{aligned}
d(\mathit{dataloss}) &= \mathrm{tr}\big(\frac{1}{N} (\mathit{probs}^{T} - Y^{T}) \cdot d\,\mathit{scores}\big) \\
&= \mathrm{tr}\big(\frac{1}{N} (\mathit{probs}^{T} - Y^{T}) \cdot (d\,\mathit{hiddenLayer} \cdot W2 + \mathit{hiddenLayer} \cdot d W2 + d b2)\big) \\
&= \mathrm{tr}\big(\frac{1}{N} (\mathit{probs}^{T} - Y^{T}) \cdot d\,\mathit{hiddenLayer} \cdot W2\big) + \mathrm{tr}\big(\frac{1}{N} (\mathit{probs}^{T} - Y^{T}) \cdot \mathit{hiddenLayer} \cdot d W2\big) + \mathrm{tr}\big(\frac{1}{N} (\mathit{probs}^{T} - Y^{T}) \cdot d b2\big)
\end{aligned}
$$

求输出层权重矩阵梯度

$d(\mathit{dataloss}) = \mathrm{tr}\big(\frac{1}{N} (\mathit{probs}^{T} - Y^{T}) \cdot \mathit{hiddenLayer} \cdot d W2\big)$

$D_{W2} f(W2) = \frac{1}{N} (\mathit{probs}^{T} - Y^{T}) \cdot \mathit{hiddenLayer}$

$\nabla_{W2} f(W2) = \frac{1}{N} \mathit{hiddenLayer}^{T} \cdot (\mathit{probs} - Y)$

求输出层偏置向量梯度

$d(\mathit{dataloss}) = \mathrm{tr}\big(\frac{1}{N} \sum_{i = 1}^{N} (\mathit{probs}_{i}^{T} - Y_{i}^{T}) \cdot d b2\big)$

$D_{b2} f(b2) = \frac{1}{N} \sum_{i = 1}^{N} (\mathit{probs}_{i}^{T} - Y_{i}^{T})$

$\nabla_{b2} f(b2) = \frac{1}{N} \sum_{i = 1}^{N} (\mathit{probs}_{i} - Y_{i})$

对偏置向量还需要注意维数，求和批量数据的偏置向量梯度

求隐藏层输出向量梯度

$d(\mathit{dataloss}) = \mathrm{tr}\big(\frac{1}{N} (\mathit{probs}^{T} - Y^{T}) \cdot d\,\mathit{hiddenLayer} \cdot W2\big) = \mathrm{tr}\big(\frac{1}{N} W2 \cdot (\mathit{probs}^{T} - Y^{T}) \cdot d\,\mathit{hiddenLayer}\big)$

$D_{\mathit{hiddenLayer}} f(\mathit{hiddenLayer}) = \frac{1}{N} W2 \cdot (\mathit{probs}^{T} - Y^{T})$

$\nabla_{\mathit{hiddenLayer}} f(\mathit{hiddenLayer}) = \frac{1}{N} (\mathit{probs} - Y) \cdot (W2)^{T}$

# Backpropagate the score gradient through the output layer.
# Output-layer weight gradient, then the bias gradient summed over the batch.
dW2 = hidden_layer.T.dot(dscores)
db2 = dscores.sum(axis=0, keepdims=True)
# Gradient flowing back into the hidden-layer activations.
dhidden = dscores.dot(W2.T)

第七步：迭代计算，反向传播，计算隐藏层输入向量梯度

$\mathit{hiddenLayer}_{in} = X \cdot W + b$

$\mathit{hiddenLayer} = \max(0, \mathit{hiddenLayer}_{in})$

$d\,\mathit{hiddenLayer} = 1(\mathit{hiddenLayer}_{in} \geq 0) * d\,\mathit{hiddenLayer}_{in}$

$$
\begin{aligned}
d(\mathit{dataloss}) &= \mathrm{tr}\big(\frac{1}{N} W2 \cdot (\mathit{probs}^{T} - Y^{T}) \cdot d\,\mathit{hiddenLayer}\big) \\
&= \mathrm{tr}\big(\frac{1}{N} W2 \cdot (\mathit{probs}^{T} - Y^{T}) \cdot (1(\mathit{hiddenLayer}_{in} \geq 0) * d\,\mathit{hiddenLayer}_{in})\big) \\
&= \mathrm{tr}\big(\frac{1}{N} ((W2 \cdot (\mathit{probs}^{T} - Y^{T})) * 1(\mathit{hiddenLayer}_{in} \geq 0)^{T}) \cdot d\,\mathit{hiddenLayer}_{in}\big)
\end{aligned}
$$

$D_{\mathit{hiddenLayer}_{in}} f(\mathit{hiddenLayer}_{in}) = \frac{1}{N} (W2 \cdot (\mathit{probs}^{T} - Y^{T})) * 1(\mathit{hiddenLayer}_{in} \geq 0)^{T}$

$\nabla_{\mathit{hiddenLayer}_{in}} f(\mathit{hiddenLayer}_{in}) = \frac{1}{N} ((\mathit{probs} - Y) \cdot (W2)^{T}) * 1(\mathit{hiddenLayer}_{in} \geq 0)$

# backprop the ReLU non-linearity: no gradient where the activation is not positive
dhidden[hidden_layer <= 0] = 0

第七步：迭代计算，反向传播，计算隐藏层权重向量和偏置向量梯度

$\mathit{hiddenLayer}_{in} = X \cdot W + b$

$d\,\mathit{hiddenLayer}_{in} = X \cdot dW + db$

$$
\begin{aligned}
d(\mathit{dataloss}) &= \mathrm{tr}\big(\frac{1}{N} ((W2 \cdot (\mathit{probs}^{T} - Y^{T})) * 1(\mathit{hiddenLayer}_{in} \geq 0)^{T}) \cdot d\,\mathit{hiddenLayer}_{in}\big) \\
&= \mathrm{tr}\big(\frac{1}{N} ((W2 \cdot (\mathit{probs}^{T} - Y^{T})) * 1(\mathit{hiddenLayer}_{in} \geq 0)^{T}) \cdot (X \cdot dW + db)\big)
\end{aligned}
$$

求隐藏层权重向量梯度

$d(\mathit{dataloss}) = \mathrm{tr}\big(\frac{1}{N} ((W2 \cdot (\mathit{probs}^{T} - Y^{T})) * 1(\mathit{hiddenLayer}_{in} \geq 0)^{T}) \cdot X \cdot dW\big)$

$D_{W} f(W) = \frac{1}{N} ((W2 \cdot (\mathit{probs}^{T} - Y^{T})) * 1(\mathit{hiddenLayer}_{in} \geq 0)^{T}) \cdot X$

$\nabla_{W} f(W) = \frac{1}{N} X^{T} \cdot \big(((\mathit{probs} - Y) \cdot (W2)^{T}) * 1(\mathit{hiddenLayer}_{in} \geq 0)\big)$

求隐藏层偏置向量梯度

$d(\mathit{dataloss}) = \mathrm{tr}\big(\frac{1}{N} \sum_{i = 1}^{N} ((W2 \cdot (\mathit{probs}_{i}^{T} - Y_{i}^{T})) * 1(\mathit{hiddenLayer}_{in, i} \geq 0)^{T}) \cdot db\big)$

$D_{b} f(b) = \frac{1}{N} \sum_{i = 1}^{N} (W2 \cdot (\mathit{probs}_{i}^{T} - Y_{i}^{T})) * 1(\mathit{hiddenLayer}_{in, i} \geq 0)^{T}$

$\nabla_{b} f(b) = \frac{1}{N} \sum_{i = 1}^{N} ((\mathit{probs}_{i} - Y_{i}) \cdot (W2)^{T}) * 1(\mathit{hiddenLayer}_{in, i} \geq 0)$

对偏置向量还需要注意维数，求和批量数据的偏置向量梯度

# finally into W,b: hidden-layer weight gradient, then the bias gradient
# summed over the batch (axis 0) and kept as a row vector.
dW = np.dot(X.T, dhidden)
db = np.sum(dhidden, axis=0, keepdims=True)

第八步：迭代计算，反向传播，计算正则化梯度

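$\mathit{regLoss} = 0.5 \cdot \mathit{reg} \cdot \| W \|^{2} + 0.5 \cdot \mathit{reg} \cdot \| W2 \|^{2}$

$d(\mathit{regLoss}) = \mathrm{tr}(\mathit{reg} \cdot W^{T} \cdot dW) + \mathrm{tr}(\mathit{reg} \cdot (W2)^{T} \cdot d W2)$

所以正则化项关于 $W$ 和 $W2$ 的梯度分别为 $\mathit{reg} \cdot W$ 和 $\mathit{reg} \cdot W2$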

# add regularization gradient contribution (reg * weights) to each weight gradient
dW2 += reg * W2
dW += reg * W

第九步：迭代计算，反向传播，更新权重矩阵和偏置向量

# Gradient-descent step: move every parameter against its gradient, in place.
W -= step_size * dW
b -= step_size * db
W2 -= step_size * dW2
b2 -= step_size * db2

大海

神经网络推导-矩阵计算

推导一

推导二

相关资料