
Backpropagation

Backprop from scratch
The key insight of backpropagation is that it efficiently computes the gradient of the loss function with respect to each weight by using the chain rule of calculus. Starting from the output layer and moving backward, we accumulate the product of gradients.
In this report we are going to implement both the forward and the backward functions for the affine layer and the categorical cross-entropy layer. Then we fit a logistic regression model to the Pen-Based Recognition of Handwritten Digits dataset (https://archive.ics.uci.edu/dataset/81/pen+based+recognition+of+handwritten+digits).
Let's start with the affine layer (the derivation of the backward pass for both the affine and the categorical cross-entropy layers can be found in this document: https://www.overleaf.com/read/mhxwchzhtcqt#93ca31).
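To make the chain-rule picture concrete, here is a tiny numerical check (our own illustration, not part of the original implementation): for the composition y = sin(x²), the product of the local derivatives matches a finite-difference estimate of dy/dx.

import numpy as np

# Composite function y = f(g(x)) with g(x) = x**2 and f(u) = sin(u).
# The chain rule gives dy/dx = cos(x**2) * 2*x, i.e. the product of the
# local derivatives collected while moving backward through the graph.
x = 1.3
analytic = np.cos(x**2) * 2 * x

# Central-difference estimate of the same derivative
eps = 1e-6
numeric = (np.sin((x + eps)**2) - np.sin((x - eps)**2)) / (2 * eps)

print(analytic, numeric)  # the two values agree to ~6 decimal places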

Affine

import numpy as np


class Affine:
    def forward(self, inputs, weight, bias):
        """Forward pass of an affine (fully connected) layer.

        Args:
            inputs: input matrix, shape (N, D)
            weight: weight matrix, shape (D, H)
            bias: bias vector, shape (H)

        Returns:
            out: output matrix, shape (N, H)
        """
        self.cache = (inputs, weight, bias)

        out = inputs @ weight + bias
        assert out.shape[0] == inputs.shape[0]
        assert out.shape[1] == weight.shape[1] == bias.shape[0]
        return out

    def backward(self, d_out):
        """Backward pass of an affine (fully connected) layer.

        Args:
            d_out: incoming derivatives, shape (N, H)

        Returns:
            d_inputs: gradient w.r.t. the inputs, shape (N, D)
            d_weight: gradient w.r.t. the weight, shape (D, H)
            d_bias: gradient w.r.t. the bias, shape (H)
        """
        inputs, weight, bias = self.cache

        d_inputs = d_out @ weight.T
        d_weight = inputs.T @ d_out
        d_bias = np.sum(d_out, axis=0)

        assert np.all(d_inputs.shape == inputs.shape)
        assert np.all(d_weight.shape == weight.shape)
        assert np.all(d_bias.shape == bias.shape)
        return d_inputs, d_weight, d_bias
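Before moving on, it is worth checking the analytic gradients against numerical ones. The snippet below is our own sanity check (not part of the original report): it compares d_weight from Affine.backward with a central-difference estimate for the scalar surrogate loss np.sum(out * d_out).

import numpy as np

def numerical_grad(f, x, eps=1e-6):
    # Central-difference estimate of the gradient of the scalar f() w.r.t. x,
    # perturbing x in place one entry at a time.
    grad = np.zeros_like(x)
    for idx in np.ndindex(x.shape):
        orig = x[idx]
        x[idx] = orig + eps
        f_plus = f()
        x[idx] = orig - eps
        f_minus = f()
        x[idx] = orig  # restore the original value
        grad[idx] = (f_plus - f_minus) / (2 * eps)
    return grad

np.random.seed(0)
inputs = np.random.randn(4, 3)
weight = np.random.randn(3, 2)
bias = np.random.randn(2)
d_out = np.random.randn(4, 2)  # pretend upstream gradient

layer = Affine()
layer.forward(inputs, weight, bias)
_, d_weight, _ = layer.backward(d_out)

# For the scalar np.sum(out * d_out), the gradient w.r.t. out is exactly d_out,
# so the analytic d_weight should match the numerical estimate below.
loss_fn = lambda: np.sum(layer.forward(inputs, weight, bias) * d_out)
d_weight_num = numerical_grad(loss_fn, weight)

print(np.max(np.abs(d_weight - d_weight_num)))  # should be tiny (~1e-8 or smaller)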


CategoricalCrossEntropy

from scipy.special import softmax


class CategoricalCrossEntropy:
    def forward(self, logits, labels):
        """Compute the categorical cross-entropy loss.

        Args:
            logits: class logits, shape (N, K)
            labels: target labels in one-hot format, shape (N, K)

        Returns:
            loss: loss value, float (a single number)
        """
        N = logits.shape[0]

        # Shift the logits for numerical stability (log-sum-exp trick)
        logits_shifted = logits - logits.max(axis=1, keepdims=True)
        log_sum_exp = np.log(np.sum(np.exp(logits_shifted), axis=1, keepdims=True))
        log_probs = logits_shifted - log_sum_exp
        probs = softmax(logits_shifted, axis=-1)
        loss = -np.sum(labels * log_probs) / N

        # probs is the (N, K) matrix of class probabilities
        self.cache = (probs, labels)
        assert isinstance(loss, float)
        return loss

    def backward(self, d_out=1.0):
        """Backward pass of the cross-entropy loss.

        Args:
            d_out: incoming derivatives. We set this value to 1.0 by default,
                since this is the terminal node of our computational graph.

        Returns:
            d_logits: gradient w.r.t. the logits, shape (N, K)
            d_labels: gradient w.r.t. the labels.
                We don't need d_labels for our models, so we don't compute it
                and set it to None. It's only included in the function
                definition for consistency with the other layers.
        """
        probs, labels = self.cache

        N = probs.shape[0]
        d_logits = d_out * (probs - labels) / N

        d_labels = None
        assert np.all(d_logits.shape == probs.shape == labels.shape)
        return d_logits, d_labels
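To see the (probs - labels) / N gradient in action, here is a small check of our own (not from the original report) on a two-sample batch, comparing one entry of d_logits against a central-difference estimate:

import numpy as np

logits = np.array([[2.0, 0.5, -1.0],
                   [0.1, 0.2, 0.3]])
labels = np.array([[1.0, 0.0, 0.0],
                   [0.0, 0.0, 1.0]])

loss_layer = CategoricalCrossEntropy()
loss = loss_layer.forward(logits, labels)
d_logits, _ = loss_layer.backward()

# Central-difference estimate of d loss / d logits[0, 1]
eps = 1e-6
logits_p = logits.copy(); logits_p[0, 1] += eps
logits_m = logits.copy(); logits_m[0, 1] -= eps
numeric = (CategoricalCrossEntropy().forward(logits_p, labels)
           - CategoricalCrossEntropy().forward(logits_m, labels)) / (2 * eps)

print(d_logits[0, 1], numeric)  # the two values should closely agree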


Logistic Regression

class LogisticRegression:
    def __init__(self, num_features, num_classes, learning_rate=1e-2):
        """Logistic regression model.

        Gradients are computed with backpropagation.

        The model consists of the following sequence of operations:
        input -> affine -> softmax
        """
        self.learning_rate = learning_rate

        # Initialize the model parameters
        self.params = {
            'W': np.zeros([num_features, num_classes]),
            'b': np.zeros([num_classes])
        }

        # Define layers
        self.affine = Affine()
        self.cross_entropy = CategoricalCrossEntropy()

    def predict(self, X):
        """Generate predictions for one minibatch.

        Args:
            X: data matrix, shape (N, D)

        Returns:
            Y_pred: predicted class probabilities, shape (N, K)
                Y_pred[n, k] = probability that sample n belongs to class k
        """
        logits = self.affine.forward(X, self.params['W'], self.params['b'])
        Y_pred = softmax(logits, axis=1)
        return Y_pred

    def step(self, X, Y):
        """Perform one step of gradient descent on the minibatch of data.

        1. Compute the cross-entropy loss for given (X, Y).
        2. Compute the gradients of the loss w.r.t. the model parameters.
        3. Update the model parameters using the gradients.

        Args:
            X: data matrix, shape (N, D)
            Y: target labels in one-hot format, shape (N, K)

        Returns:
            loss: loss for (X, Y), float (a single number)
        """
        # Forward pass - compute the loss on the training data
        logits = self.affine.forward(X, self.params['W'], self.params['b'])
        loss = self.cross_entropy.forward(logits, Y)

        # Backward pass - compute the gradients of the loss w.r.t. all model parameters
        grads = {}
        d_logits, _ = self.cross_entropy.backward()
        _, grads['W'], grads['b'] = self.affine.backward(d_logits)

        # Apply the gradients
        for p in self.params:
            self.params[p] = self.params[p] - self.learning_rate * grads[p]
        return loss


Main training loop
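The loop below assumes that log_reg, X_train, Y_train, max_epochs, and report_frequency already exist. The original report does not show that setup; a minimal sketch could look like the following, where sklearn's bundled digits data is used purely as a stand-in for the UCI pen-based digits download and the split/encoding choices are our own assumptions.

import numpy as np
from sklearn.datasets import load_digits              # stand-in dataset (assumption)
from sklearn.model_selection import train_test_split

# Load a digits dataset and one-hot encode the labels.
X, y = load_digits(return_X_y=True)
X = X / X.max()                                        # simple feature scaling (assumption)
num_classes = 10
Y = np.eye(num_classes)[y]                             # one-hot targets, shape (N, 10)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

log_reg = LogisticRegression(num_features=X_train.shape[1],
                             num_classes=num_classes,
                             learning_rate=1e-2)
max_epochs = 500
report_frequency = 50                                  # hypothetical reporting interval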

for epoch in range(max_epochs):
    loss = log_reg.step(X_train, Y_train)
    if epoch % report_frequency == 0:
        print(f'Epoch {epoch:4d}, loss = {loss:.4f}')


Results

After training the model for 500 epochs we obtain approximately 95% accuracy.
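The accuracy is obtained by comparing the most probable class for each sample with its true label on held-out data. A sketch of that evaluation (using the hypothetical X_test / Y_test split from the setup above) could be:

Y_pred = log_reg.predict(X_test)                  # class probabilities, shape (N, K)
pred_classes = np.argmax(Y_pred, axis=1)
true_classes = np.argmax(Y_test, axis=1)
accuracy = np.mean(pred_classes == true_classes)
print(f'Test accuracy: {accuracy:.2%}')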
