- Prev: 08_pytorch_backprop
- Next: 10_
- Related:
`Value` object data structure, computation graph direction terminology
# imports, `Value` class, reset_graph() to init nn, graphviz: trace() & draw_dot(), and PyTorch-style `Neuron`, `Layer`, `MLP` classes
import math
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import random
# extend `Value` class with the constituent methods listed above:
class Value:
def __init__(self, data, _children=(), _op='', label=''):
self.data = data
self.grad = 0.0
self._backward = lambda: None
self._prev = set(_children)
self._op = _op
self.label = label
def __repr__(self):
return f"Value(data={self.data})"
def __add__(self, other):
# pre-process `other`. If it is non-`Value`, assume `int`/`float` and wrap in `Value()`
other = other if isinstance(other, Value) else Value(other)
out = Value(self.data + other.data, (self, other), '+')
def _backward():
self.grad += out.grad * 1.0
other.grad += out.grad * 1.0
out._backward = _backward
return out
def __mul__(self, other):
# pre-process `other`. If it is non-`Value`, assume int/float and wrap in `Value()`
other = other if isinstance(other, Value) else Value(other)
out = Value(self.data * other.data, (self, other), '*')
def _backward():
self.grad += other.data * out.grad
other.grad += self.data * out.grad
out._backward = _backward
return out
def __radd__(self, other): # fallback for swapped operands: i.e. other + self
return self + other # route to `__add__`
def __rmul__(self, other): # fallback for swapped operands: i.e. other * self
return self * other # route to `__mul__`
# ensure `other` is NEVER a `Value` object. Only int/float allowed
def __pow__(self, other):
assert isinstance(other, (int, float)), "only supporting int/float powers for now"
out = Value(self.data**other, (self,), f'**{other}')
# recall downstream grad = local grad * upstream grad
# local gradient for x^k: d(x^k)/dx = kx^(k-1)
def _backward():
self.grad += other * (self.data ** (other - 1)) * out.grad
out._backward = _backward
return out
def __truediv__(self, other): # i.e. self / other but...
return self * other**-1 # use previously defined __mul__() and __pow__(), instead of implementing `/` operation and its own `_backward()`
def __neg__(self): # -self
return self * -1 # use previously defined __mul__() to evaluate this `Value` * `int` expression
def __sub__(self, other): # self - other
return self + (-other) # use previously defined __add__(), instead of implementing `-` operation and its own `_backward()`
def tanh(self):
x = self.data
t = (math.exp(2*x) - 1)/(math.exp(2*x) + 1)
out = Value(t, (self, ), 'tanh')
def _backward():
self.grad += (1 - t**2) * out.grad
out._backward = _backward
return out
# define exponentiation method
def exp(self):
x = self.data # input data value
out = Value(math.exp(x), (self, ), 'exp') # output data value: use builtin math.exp(x)
# recall downstream grad = local grad * upstream grad
# local gradient for exp: d(e^x)/dx = e^x (i.e. out.data, just calculated!)
def _backward():
self.grad += out.data * out.grad
out._backward = _backward
return out
# define full backward-pass method (topological sort, then chain rule from the output node)
def backward(self):
topo = []
visited = set()
def build_topo(v):
if v not in visited:
visited.add(v)
for child in v._prev:
build_topo(child)
topo.append(v)
build_topo(self)
self.grad = 1.0
for node in reversed(topo):
node._backward()
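# (sanity check, my addition, not in the original notebook) verify the chained
# __truediv__/__pow__/__mul__ gradients against the analytic derivatives:
# d(a/b)/da = 1/b and d(a/b)/db = -a/b**2
va, vb = Value(4.0), Value(2.0)
vc = va / vb
vc.backward()
print(va.grad, vb.grad)   # 0.5 and -1.0, i.e. 1/2 and -4/(2**2)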
# helper: re-initialise graph
def reset_graph(reset_level):
# declare the example graph's nodes as globals (so they can be reset or rebuilt)
global x1, x2, w1, w2, x1w1, x2w2, x1w1x2w2, b, n, o
if reset_level == 'gradients':
x1.grad = x2.grad = w1.grad = w2.grad = x1w1.grad = x2w2.grad = x1w1x2w2.grad = b.grad = n.grad = o.grad = 0
print("reset_graph(): All gradients have been reset to 0")
# reset all variables
elif reset_level == 'graph':
# redefine inputs (x1,x2), weights (w1,w2), and then the graph (n = x1*w1 + x2*w2 + b)
x1 = Value(2.0, label='x1'); x2 = Value(0.0, label='x2')
w1 = Value(-3.0, label='w1'); w2 = Value(1.0, label='w2')
x1w1 = x1 * w1; x1w1.label = 'x1*w1'; x2w2 = x2 * w2; x2w2.label = 'x2*w2'
x1w1x2w2 = x1w1 + x2w2; x1w1x2w2.label = 'x1*w1 + x2*w2'
# manually change bias to make number nice for education: (b=8 to see tanh squishing!, b=6.8813735870195432 so deriv = 1)
b = Value(6.8813735870195432, label='b');
n = x1w1x2w2 + b; n.label = 'n'
# try re-run the activation function on n (the raw cell body) and draw the output node o
o = n.tanh(); o.label = 'o'
print("reset_graph(): All vars, initial and intermediate, have been reset. All gradients now 0")
else: print("reset_graph(): please specify the level of reset desired 'gradients' or 'graph'")
# graphviz
from graphviz import Digraph
def trace(root):
# recursively builds a set of all nodes and edges in a graph
nodes, edges = set(), set()
def build(v):
if v not in nodes:
nodes.add(v)
for child in v._prev:
edges.add((child, v))
build(child)
build(root)
return nodes, edges
def draw_dot(root):
dot = Digraph(format='svg', graph_attr={'rankdir': 'LR'}) # LR = left to right
nodes, edges = trace(root)
for n in nodes:
uid = str(id(n))
# for any value in the graph, create a rectangular ('record') node for it
dot.node(name = uid, label = "{ %s | data %.4f | grad %.4f }" % (n.label, n.data, n.grad), shape='record')
if n._op:
# if this value is a result of some operation, create an op node for it
dot.node(name = uid + n._op, label = n._op)
# and connect this node to it
dot.edge(uid + n._op, uid)
for n1, n2 in edges:
# connect n1 to the op node of n2
dot.edge(str(id(n1)), str(id(n2)) + n2._op)
return dot
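# (my addition, not a cell from the original notebook) possible usage of the helpers above:
# rebuild the small example neuron as module-level globals, then render its computation graph
reset_graph('graph')
draw_dot(o)   # in a notebook cell, this displays the graph left-to-right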
PyTorch-style `Neuron`, `Layer`, `MLP` class definitions, unchanged from the previous notebook:
# PyTorch-style Neuron, Layer, MLP classes
class Neuron:
# nin: number of inputs to the neuron
def __init__(self, nin):
self.w = [Value(random.uniform(-1,1)) for _ in range(nin)] # create 1 weight per input
self.b = Value(random.uniform(-1,1)) # create 1 bias for the neuron
# function call [e.g. n(x)] returns forward pass of this neuron (its post-activation value)
def __call__(self, x):
# pre-activation weighted input: act = (w ⋅ x) + b. NB: w * x is their dot product
act = sum((wi*xi for wi, xi in zip(self.w, x)), self.b)
out = act.tanh() # output: after applying the activation function (non-linearity)
return out
# for a neuron: list of incoming weights concat w/ bias at that neuron (single element list)
def parameters(self):
return self.w + [self.b]
class Layer:
# nout: how many (independently evaluated) neurons in this layer
def __init__(self, nin, nout):
# define 1 layer as a list of `nout` `Neuron` objects, EACH being a `nin`-dim Neuron
self.neurons = [Neuron(nin) for _ in range(nout)]
# function call [e.g. l(x)] independently evaluates the `nout` neurons in this layer
def __call__(self, x):
outs = [n(x) for n in self.neurons]
return outs[0] if len(outs) == 1 else outs # strip list if single element return
# for a layer: list containing:
# `neuron.parameters`: all params (incoming weights, bias) for each neuron
# `self.neurons`: for all neurons in that layer
def parameters(self):
return [p for neuron in self.neurons for p in neuron.parameters()]
class MLP:
# nouts: list defining the desired sizes of each layer in the MLP
def __init__(self, nin, nouts):
sz = [nin] + nouts
# define each `Layer` object by iterating consecutive pairs of sizes (i and i+1)
self.layers = [Layer(sz[i], sz[i+1]) for i in range(len(nouts))]
# function call passes the input through each `Layer` sequentially
def __call__(self, x):
for layer in self.layers:
x = layer(x)
return x
# for full MLP:
# `layer.parameters`: all params for each layer (i.e. ALL incoming weights and biases for ALL neurons of that layer)
# `self.layers`: for all layers in the MLP network
def parameters(self):
return [p for layer in self.layers for p in layer.parameters()]
# helper function to inspect 1st layer, 1st neuron: data and grad of bias and 1st weight
def inspect_params_l1_n1_w_b(network):
# prints (4x)
print('1st layer, 1st neuron\'s bias, data:', network.layers[0].neurons[0].b.data)
print('1st layer, 1st neuron\'s bias, grad:', network.layers[0].neurons[0].b.grad)
print('1st layer, 1st neuron, 1st weight, data:', network.layers[0].neurons[0].w[0].data)
print('1st layer, 1st neuron, 1st weight, grad:', network.layers[0].neurons[0].w[0].grad)
Example binary classifier
Perform backpropagation with our `Value` class's `.backward()` method (mirroring PyTorch's API).
# o - define binary classifier NN
xs = [ # dataset with 4 examples
[2.0, 3.0, -1.0], # when nn is fed ex. 1 -> desired output: 1.0
[3.0, -1.0, 0.5], # when nn is fed ex. 2 -> desired output: -1.0
[0.5, 1.0, 1.0], # when nn is fed ex. 3 -> desired output: -1.0
[1.0, 1.0, -1.0], # when nn is fed ex. 4 -> desired output: 1.0
]
ys = [1.0, -1.0, -1.0, 1.0] # desired targets
# initialise neural network (MLP)
nn = MLP(3, [4, 4, 1])
# what nn currently outputs for these 4 examples
ypred = [nn(x) for x in xs]
print("For the 4 examples `xs`, we...:")
print("have actual nn outputs `ypred` -> want desired outputs `ys`")
for y_pred, y in zip(ypred, ys):
print(f"{y_pred.data} -> {y}")
For the 4 examples `xs`, we...:
have actual nn outputs `ypred` -> want desired outputs `ys`
-0.030669685297522595 -> 1.0
0.618279041630556 -> -1.0
0.3365212611340581 -> -1.0
0.7533370832196005 -> 1.0
Inspect above output
Implement MSE (L2) loss function
- Predicted outputs `ypred` (`yout`) are quite different from desired outputs `ys` (`ygt`).
- The `loss` is a single number that measures the total performance of the neural network.
- We use the mean squared error (MSE) loss function:

$$\mathrm{MSE} = \frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2$$

- where
  - $\mathrm{MSE}$: mean squared error (loss function)
  - $n$: number of data points
  - $y_i$: observed values (`ys`, or ground truth: `ygt`)
  - $\hat{y}_i$: predicted values (`ypred`, or output: `yout`)

If $y_i = \hat{y}_i$ (i.e. `ygt = yout`), then that squared-error term is $0$. Otherwise it is positive, and the larger the difference, the greater the loss. (Note: the code below sums the squared errors over the examples rather than averaging them; minimising the sum or the mean is equivalent up to the constant factor $1/n$.)
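As a quick hand-check (my own arithmetic, using the first prediction printed above and its target of 1.0), the first squared-error component can be reproduced directly:

# hand-check of the first squared-error component, from the first prediction above
err0 = (-0.030669685297522595 - 1.0)**2
print(err0)   # ≈ 1.0623, matching the first individual loss component printed below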
Other loss functions exist
2 examples:
- Mean Absolute Error (L1) Loss
- Cross Entropy Loss for next token prediction
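A hedged sketch of the first alternative (an L1-style loss) using only operations our `Value` class already supports; writing |e| as (e²)^0.5 is my workaround since `Value` has no `abs`, and cross-entropy would need log/softmax ops not implemented here:

# L1 (absolute error) loss, expressed with existing Value ops: |e| = (e**2)**0.5
l1_loss = sum(((yout - ygt)**2)**0.5 for ygt, yout in zip(ys, ypred))
print('total L1 loss:', l1_loss.data)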
# o - calculate individual loss (squared error) components for each data point:
print('individual loss (squared error) component for each data point (bigger error = worse!):')
print(*[err.data for err in [(yout - ygt)**2 for ygt, yout in zip(ys, ypred)]], sep='\n')
print('\ncompare individual loss components with previous block\'s outputs \n(see size of difference between each element of `ypred` and corresponding element of `ys`)\n\n---')
loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred))
print('\ntotal loss (MSE):', loss.data)
individual loss (squared error) component for each data point (bigger error = worse!):
1.0622800001912942
2.6188270565807112
1.7862890814633734
0.0608425945146143
compare individual loss components with previous block's outputs
(see size of difference between each element of `ypred` and corresponding element of `ys`)
---
total loss (MSE): 5.528238732749994
Inspect above output
Inspecting network parameters
In gradient descent, a parameter's grad vector (a scalar, in this example) points in the direction of steepest ascent of the loss.
Gradient Descent: Improving the loss function
After performing the `loss.backward()` pass, the `grad` of all parameter nodes (weights, biases) will change from 0.0 → some other value:

- If `grad < 0`, that parameter negatively influences the loss function
  - To improve (minimise) the loss function, increase that parameter's `data` value
- If `grad > 0`, that parameter positively influences the loss function
  - To improve (minimise) the loss function, decrease that parameter's `data` value

Note:

- `grad` values for the input data `xs` are not useful, since we cannot change the input data.
- gradients on all the `w` and `b` values are useful: they tell us how to update their `data` values
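A minimal sketch of this sign logic on a made-up one-parameter example (my addition; the variable `w` and the target 3.0 are invented for illustration and are not part of the notebook):

# toy_loss = (w - 3)^2, so d(toy_loss)/dw = 2*(w - 3)
w = Value(5.0)
toy_loss = (w - 3.0)**2
toy_loss.backward()
print(w.grad)              # 4.0 > 0: w positively influences toy_loss...
w.data += -0.1 * w.grad    # ...so decrease w.data (5.0 -> 4.6) to reduce it
print((w.data - 3.0)**2)   # ≈ 2.56 < 4.0: the toy loss has decreased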
# o - inspect a single weight and bias (data value and gradient) before backward pass
print('before backward pass: `loss.backward()`')
inspect_params_l1_n1_w_b(nn)
before backward pass: `loss.backward()`
1st layer, 1st neuron's bias, data: -0.2527039348623852
1st layer, 1st neuron's bias, grad: 0.0
1st layer, 1st neuron, 1st weight, data: 0.5858915688216553
1st layer, 1st neuron, 1st weight, grad: 0.0
# o - perform backward pass on loss "output" node -> re-inspect the same weight and bias AFTER this:
loss.backward()
print('after backward pass: `loss.backward()`')
inspect_params_l1_n1_w_b(nn)
after backward pass: `loss.backward()`
1st layer, 1st neuron's bias, data: -0.2527039348623852
1st layer, 1st neuron's bias, grad: -0.6425469767350062
1st layer, 1st neuron, 1st weight, data: 0.5858915688216553
1st layer, 1st neuron, 1st weight, grad: -0.1083991069576411
Inspect both above outputs
# i - HUGE output graph. worth inspecting root node (`loss`) and leaf nodes (input data: `xs`, bias `b`)
# draw_dot(loss)
Inspecting ALL parameters
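For reference (a quick count, not from the notebook): `MLP(3, [4, 4, 1])` has 4×(3+1) = 16 parameters in its first layer, 4×(4+1) = 20 in its second, and 1×(4+1) = 5 in its output layer, i.e. 16 + 20 + 5 = 41, which the cell below confirms.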
# i - inspect all parameters in the MLP network
print('no. of parameters:',len(nn.parameters()))
print('\nparameter values:', *nn.parameters(), sep='\n')
no. of parameters: 41
parameter values:
Value(data=0.5858915688216553)
Value(data=0.24204639586837384)
Value(data=0.5540617455238359)
Value(data=-0.2527039348623852)
Value(data=-0.0024192363061177335)
Value(data=-0.32365360415300093)
Value(data=-0.12238259702033294)
Value(data=-0.0009319366871884949)
Value(data=0.853482765777527)
Value(data=0.9518014994361543)
Value(data=0.4694005041885234)
Value(data=-0.7151731529707315)
Value(data=-0.4132421662812258)
Value(data=0.8742470275180891)
Value(data=-0.4770288161590237)
Value(data=-0.512392518490292)
Value(data=-0.7214804589607708)
Value(data=0.15524433510895141)
Value(data=-0.606947212375128)
Value(data=0.8057384636851816)
Value(data=0.34423675024790956)
Value(data=0.8911643253773633)
Value(data=-0.3074846830517708)
Value(data=0.674843391644385)
Value(data=0.9803638159760659)
Value(data=-0.9379344688828088)
Value(data=0.026729242517386398)
Value(data=0.5592713428822869)
Value(data=0.36755329819874083)
Value(data=-0.7500473600441895)
Value(data=-0.03331756079480508)
Value(data=0.6999906180870976)
Value(data=0.3953800862639454)
Value(data=0.6509811930800722)
Value(data=0.9173726035761776)
Value(data=0.805925459887916)
Value(data=0.569870544614919)
Value(data=-0.6917547546903493)
Value(data=0.07826376203371344)
Value(data=-0.21450799987280855)
Value(data=0.9705860522133329)
Gradient descent
Minimise the loss function
- For each parameter (incoming weight, or bias) in the neural network: `p in nn.parameters()`
- Change that parameter's data value `p.data` slightly, in a direction OPPOSITE to its gradient `p.grad`
  - i.e. `p.data += -0.1 * p.grad`
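As a sanity check (my arithmetic, reusing the bias `data` and `grad` values printed after `loss.backward()` above), one such nudge should move the first neuron's bias as follows:

# expected bias value after one gradient-descent step, from the values printed above
b_data, b_grad = -0.2527039348623852, -0.6425469767350062
print(b_data + -0.1 * b_grad)   # ≈ -0.1884, matching the post-update value printed below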
# i - perform 1 iteration of gradient descent (on all parameters in network)
for p in nn.parameters():
p.data += -0.1 * p.grad
# o - re-inspect the same weight and bias from before, AFTER performing gradient descent:
print('after gradient descent (iteration: 1)')
inspect_params_l1_n1_w_b(nn)
after gradient descent (iteration: 1)
1st layer, 1st neuron's bias, data: -0.18844923718888457
1st layer, 1st neuron's bias, grad: -0.6425469767350062
1st layer, 1st neuron, 1st weight, data: 0.5967314795174193
1st layer, 1st neuron, 1st weight, grad: -0.1083991069576411
Manually loop: 3 iterations
Iteration 2
# i - forward pass, inspect loss function value
ypred = [nn(x) for x in xs] # calculate new network values
loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred)) # recalculate loss
print('\nforward pass (iter 2): total loss (MSE):', loss.data)
forward pass (iter 2): total loss (MSE): 1.029708597714175
# i - re-initialise all parameter gradients
for p in nn.parameters():
p.grad = 0.0
# i - backward pass: update gradients on all parameters via backprop
loss.backward()
# i - gradient descent: nudge (0.1) all params in direction of steepest descent
for p in nn.parameters():
p.data += -0.1 * p.grad # nudge is OPPOSITE to grad direction (grad: steepest ascent)
# o - inspect parameters and predicted values
print('after gradient descent (iteration: 2)\nparam values:')
inspect_params_l1_n1_w_b(nn)
print('\npredicted outputs ypred/yout (vs desired outputs ys/ygt)')
for y_pred, y in zip(ypred, ys):
print(f"{y_pred.data} -> {y}")after gradient descent (iteration: 2)
param values:
1st layer, 1st neuron's bias, data: -0.2103971111257778
1st layer, 1st neuron's bias, grad: 0.21947873936893247
1st layer, 1st neuron, 1st weight, data: 0.5318873931345732
1st layer, 1st neuron, 1st weight, grad: 0.6484408638284616
predicted outputs ypred/yout (vs desired outputs ys/ygt)
0.38199162402373854 -> 1.0
-0.6145309735746732 -> -1.0
-0.34199409598487684 -> -1.0
0.7426750985906923 -> 1.0
Iteration 3
# o - forward pass, inspect loss function value
ypred = [nn(x) for x in xs]
loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred))
print('\nforward pass (iter 3): total loss (MSE):', loss.data)
forward pass (iter 3): total loss (MSE): 0.36594039334941736
# re-initialise all parameter gradients
for p in nn.parameters():
p.grad = 0.0
# backward pass: update gradients on all parameters via backprop
loss.backward()
# gradient descent: nudge (0.1) all params in direction of steepest descent
for p in nn.parameters():
p.data += -0.1 * p.grad # nudge is OPPOSITE to grad direction (grad: steepest ascent)
# o - inspect parameters and predicted values
print('after gradient descent (iteration: 3)')
inspect_params_l1_n1_w_b(nn)
print('\npredicted outputs ypred/yout (vs desired outputs ys/ygt)')
for y_pred, y in zip(ypred, ys):
print(f"{y_pred.data} -> {y}")after gradient descent (iteration: 3)
1st layer, 1st neuron's bias, data: -0.2169528627439334
1st layer, 1st neuron's bias, grad: 0.0655575161815562
1st layer, 1st neuron, 1st weight, data: 0.49980160109216426
1st layer, 1st neuron, 1st weight, grad: 0.32085792042408956
predicted outputs ypred/yout (vs desired outputs ys/ygt)
0.6674766302017127 -> 1.0
-0.7760980798078445 -> -1.0
-0.5706257617365379 -> -1.0
0.8555206051461104 -> 1.0
Iteration 4
# o - forward pass, inspect loss function value
ypred = [nn(x) for x in xs]
loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred))
print('\nforward pass (iter 4): total loss (MSE):', loss.data)
forward pass (iter 4): total loss (MSE): 0.18176975167171786
# re-initialise all parameter gradients
for p in nn.parameters():
p.grad = 0.0
# backward pass: update gradients on all parameters via backprop
loss.backward()
# gradient descent: nudge (0.1) all params in direction of steepest descent
for p in nn.parameters():
p.data += -0.1 * p.grad # nudge is OPPOSITE to grad direction (grad: steepest ascent)
# o - inspect parameters and predicted values
print('after gradient descent (iteration: 4)')
inspect_params_l1_n1_w_b(nn)
print('\npredicted outputs ypred/yout (vs desired outputs ys/ygt)')
for y_pred, y in zip(ypred, ys):
print(f"{y_pred.data} -> {y}")after gradient descent (iteration: 4)
1st layer, 1st neuron's bias, data: -0.22526972514243898
1st layer, 1st neuron's bias, grad: 0.0831686239850558
1st layer, 1st neuron, 1st weight, data: 0.4749438078763519
1st layer, 1st neuron, 1st weight, grad: 0.2485779321581236
predicted outputs ypred/yout (vs desired outputs ys/ygt)
0.7425864308571539 -> 1.0
-0.8457322647516171 -> -1.0
-0.7213795164363924 -> -1.0
0.8813404108679985 -> 1.0
Automate loop: 20 iterations
# i - perform 20 more gradient descent iterations (for all 41 network parameters)
for k in range(20):
# forward pass, with mean square error (MSE) loss
ypred = [nn(x) for x in xs]
loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred))
# re-initialise all parameter gradients
for p in nn.parameters():
p.grad = 0.0
# backward pass: update gradients on all parameters via backprop
loss.backward()
# gradient descent: nudge (0.1) all params in direction of steepest descent
for p in nn.parameters():
p.data += -0.1 * p.grad # nudge is OPPOSITE to grad direction (grad: steepest ascent)
print(k, loss.data)
0 0.11852673045381416
1 0.0880052700087189
2 0.06987862296212292
3 0.057878823177831724
4 0.04935460171584655
5 0.0429905228649674
6 0.03805999662401936
7 0.03412907715440516
8 0.03092268619290014
9 0.02825805578160131
10 0.026009033488036827
11 0.024085754486472596
12 0.022422480881888758
13 0.020970021409476274
14 0.01969083836718791
15 0.01855579283222802
16 0.01754192231835979
17 0.016630888154989535
18 0.01580786849680682
19 0.015060754626918213
# o - inspect network's predictions: ypred
print('predicted outputs ypred/yout (vs desired outputs ys/ygt)')
for y_pred, y in zip(ypred, ys):
print(f"{y_pred.data} -> {y}")predicted outputs ypred/yout (vs desired outputs ys/ygt)
0.9327166974173132 -> 1.0
-0.9544386424494977 -> -1.0
-0.9190363251214165 -> -1.0
0.9563793871111931 -> 1.0
Inspect above outputs
Takeaways
Neural network, forward pass, loss function, backward pass, gradient descent
- Neural network: A mathematical expression (graph) with the following structure:
- input data (e.g. input layer neurons, each with a value),
- input parameters (full network’s weights, biases)
- hidden and output layer neurons (each with activation values)
- Forward pass: Mathematical expression computing the value at each neuron
- based on the input data, and parameters on that iteration
- Loss function: Measures the accuracy of the network's predictions (the final layer's outputs).
  - Goal: minimise the loss function
  - Low `loss` means the predictions `ypred` better match the targets `ys`
- Backward pass: Use backpropagation to compute each parameter's gradient
- Gradient descent: Tune all parameters to iteratively decrease the loss locally (minimise loss function)
- Repeat for N iterations
Common gotchas (learning rate, `zero_grad()`)

- The learning rate (in our example `lr = 0.1`) should be set dynamically
  - Too low: too long to converge to a local minimum; too high: unstable overshooting behaviour
  - Hard-coding `0.1` can lead to overshooting the loss function's minima (i.e. `loss` increases, predictions worsen)
  - 2 examples of dynamic `lr` decay, updating `lr` on each iteration…
    - proportional to the gradient
      - As the gradient flattens (approaches 0.0) → `lr` decays → take smaller next steps
    - inversely proportional to the number of iterations
      - As the iteration count rises → `lr` decays → take smaller next steps (see the sketch after this list)
- Gradients must be reinitialised (e.g. PyTorch: `.zero_grad()`) before `.backward()`
  - Otherwise, gradients aggressively accumulate.
  - Recall each `_backward()` has a `self.grad += ... * out.grad` step; `+=` captures multivariate (accumulating) influence between nodes within a single backpropagation pass.
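A hedged sketch of the second decay idea (learning rate inversely proportional to the iteration count); the `0.1 / (1 + k)` schedule is an illustrative choice, not something the notebook prescribes:

# training loop with an assumed decaying learning rate and an explicit grad reset
for k in range(20):
    ypred = [nn(x) for x in xs]
    loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred))   # forward pass + loss
    for p in nn.parameters():
        p.grad = 0.0              # "zero_grad": stop gradients accumulating across iterations
    loss.backward()               # backward pass
    lr = 0.1 / (1 + k)            # lr decays as the iteration count rises
    for p in nn.parameters():
        p.data += -lr * p.grad    # step against the gradient (steepest descent direction)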
Sources
- YouTube: The spelled-out intro to neural networks and backpropagation: building micrograd
- karpathy/micrograd on GitHub
- Jupyter notebooks from this chapter
- Google Colab exercises